PR middle-end/85090
[official-gcc.git] / gcc / config / i386 / i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2018 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #define IN_TARGET_CODE 1
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "memmodel.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "cgraph.h"
41 #include "diagnostic.h"
42 #include "cfgbuild.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "attribs.h"
46 #include "calls.h"
47 #include "stor-layout.h"
48 #include "varasm.h"
49 #include "output.h"
50 #include "insn-attr.h"
51 #include "flags.h"
52 #include "except.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "cfgrtl.h"
56 #include "common/common-target.h"
57 #include "langhooks.h"
58 #include "reload.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "tm-constrs.h"
62 #include "params.h"
63 #include "cselib.h"
64 #include "sched-int.h"
65 #include "opts.h"
66 #include "tree-pass.h"
67 #include "context.h"
68 #include "pass_manager.h"
69 #include "target-globals.h"
70 #include "gimple-iterator.h"
71 #include "tree-vectorizer.h"
72 #include "shrink-wrap.h"
73 #include "builtins.h"
74 #include "rtl-iter.h"
75 #include "tree-iterator.h"
76 #include "tree-chkp.h"
77 #include "rtl-chkp.h"
78 #include "dbgcnt.h"
79 #include "case-cfn-macros.h"
80 #include "regrename.h"
81 #include "dojump.h"
82 #include "fold-const-call.h"
83 #include "tree-vrp.h"
84 #include "tree-ssanames.h"
85 #include "selftest.h"
86 #include "selftest-rtl.h"
87 #include "print-rtl.h"
88 #include "intl.h"
89 #include "ifcvt.h"
90 #include "symbol-summary.h"
91 #include "ipa-prop.h"
92 #include "ipa-fnsummary.h"
93 #include "wide-int-bitmask.h"
95 /* This file should be included last. */
96 #include "target-def.h"
98 #include "x86-tune-costs.h"
100 static rtx legitimize_dllimport_symbol (rtx, bool);
101 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
102 static rtx legitimize_pe_coff_symbol (rtx, bool);
103 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
104 static bool ix86_save_reg (unsigned int, bool, bool);
105 static bool ix86_function_naked (const_tree);
106 static bool ix86_notrack_prefixed_insn_p (rtx);
107 static void ix86_emit_restore_reg_using_pop (rtx);
110 #ifndef CHECK_STACK_LIMIT
111 #define CHECK_STACK_LIMIT (-1)
112 #endif
114 /* Return index of given mode in mult and division cost tables. */
115 #define MODE_INDEX(mode) \
116 ((mode) == QImode ? 0 \
117 : (mode) == HImode ? 1 \
118 : (mode) == SImode ? 2 \
119 : (mode) == DImode ? 3 \
120 : 4)
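/* For example, MODE_INDEX (SImode) evaluates to 2, selecting the SImode entry
   of the multiply and division cost tables; any mode other than
   QI/HI/SI/DImode falls through to the catch-all index 4.  */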
123 /* Set by -mtune. */
124 const struct processor_costs *ix86_tune_cost = NULL;
126 /* Set by -mtune or -Os. */
127 const struct processor_costs *ix86_cost = NULL;
129 /* Processor feature/optimization bitmasks. */
130 #define m_386 (HOST_WIDE_INT_1U<<PROCESSOR_I386)
131 #define m_486 (HOST_WIDE_INT_1U<<PROCESSOR_I486)
132 #define m_PENT (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM)
133 #define m_LAKEMONT (HOST_WIDE_INT_1U<<PROCESSOR_LAKEMONT)
134 #define m_PPRO (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUMPRO)
135 #define m_PENT4 (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM4)
136 #define m_NOCONA (HOST_WIDE_INT_1U<<PROCESSOR_NOCONA)
137 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
138 #define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2)
139 #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM)
140 #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE)
141 #define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL)
142 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
143 #define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL)
144 #define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT)
145 #define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL)
146 #define m_KNM (HOST_WIDE_INT_1U<<PROCESSOR_KNM)
147 #define m_SKYLAKE_AVX512 (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512)
148 #define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE)
149 #define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT)
150 #define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)
151 #define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL)
153 #define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE)
154 #define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6)
155 #define m_K6_GEODE (m_K6 | m_GEODE)
156 #define m_K8 (HOST_WIDE_INT_1U<<PROCESSOR_K8)
157 #define m_ATHLON (HOST_WIDE_INT_1U<<PROCESSOR_ATHLON)
158 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
159 #define m_AMDFAM10 (HOST_WIDE_INT_1U<<PROCESSOR_AMDFAM10)
160 #define m_BDVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER1)
161 #define m_BDVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER2)
162 #define m_BDVER3 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER3)
163 #define m_BDVER4 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER4)
164 #define m_ZNVER1 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER1)
165 #define m_BTVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER1)
166 #define m_BTVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER2)
167 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
168 #define m_BTVER (m_BTVER1 | m_BTVER2)
169 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
170 | m_ZNVER1)
172 #define m_GENERIC (HOST_WIDE_INT_1U<<PROCESSOR_GENERIC)
174 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
175 #undef DEF_TUNE
176 #define DEF_TUNE(tune, name, selector) name,
177 #include "x86-tune.def"
178 #undef DEF_TUNE
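/* Each DEF_TUNE entry in x86-tune.def supplies the X86_TUNE_* enumerator, its
   printable name and a selector mask built from the m_* bits above; an
   illustrative (not literal) entry would look like:
     DEF_TUNE (X86_TUNE_EXAMPLE, "example_feature", m_CORE_ALL | m_GENERIC)  */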
181 /* Feature tests against the various tunings. */
182 unsigned char ix86_tune_features[X86_TUNE_LAST];
184 /* Feature tests against the various tunings used to create ix86_tune_features
185 based on the processor mask. */
186 static unsigned HOST_WIDE_INT initial_ix86_tune_features[X86_TUNE_LAST] = {
187 #undef DEF_TUNE
188 #define DEF_TUNE(tune, name, selector) selector,
189 #include "x86-tune.def"
190 #undef DEF_TUNE
193 /* Feature tests against the various architecture variations. */
194 unsigned char ix86_arch_features[X86_ARCH_LAST];
196 /* Feature tests against the various architecture variations, used to create
197 ix86_arch_features based on the processor mask. */
198 static unsigned HOST_WIDE_INT initial_ix86_arch_features[X86_ARCH_LAST] = {
199 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
200 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
202 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
203 ~m_386,
205 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
206 ~(m_386 | m_486),
208 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
209 ~m_386,
211 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
212 ~m_386,
215 /* In case the average insn count for a single function invocation is
216 lower than this constant, emit fast (but longer) prologue and
217 epilogue code. */
218 #define FAST_PROLOGUE_INSN_COUNT 20
220 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
221 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
222 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
223 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
225 /* Array of the smallest class containing reg number REGNO, indexed by
226 REGNO. Used by REGNO_REG_CLASS in i386.h. */
228 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
230 /* ax, dx, cx, bx */
231 AREG, DREG, CREG, BREG,
232 /* si, di, bp, sp */
233 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
234 /* FP registers */
235 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
236 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
237 /* arg pointer */
238 NON_Q_REGS,
239 /* flags, fpsr, fpcr, frame */
240 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
241 /* SSE registers */
242 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
243 SSE_REGS, SSE_REGS,
244 /* MMX registers */
245 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
246 MMX_REGS, MMX_REGS,
247 /* REX registers */
248 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
250 /* SSE REX registers */
251 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
252 SSE_REGS, SSE_REGS,
253 /* AVX-512 SSE registers */
254 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
255 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
256 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
257 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
258 /* Mask registers. */
259 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
260 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
261 /* MPX bound registers */
262 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
265 /* The "default" register map used in 32bit mode. */
267 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
269 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
270 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
271 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
272 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
273 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
274 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
275 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
276 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
277 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
278 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
279 101, 102, 103, 104, /* bound registers */
282 /* The "default" register map used in 64bit mode. */
284 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
286 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
287 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
288 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
289 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
290 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
291 8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
292 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
293 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
294 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
295 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
296 126, 127, 128, 129, /* bound registers */
299 /* Define the register numbers to be used in Dwarf debugging information.
300 The SVR4 reference port C compiler uses the following register numbers
301 in its Dwarf output code:
302 0 for %eax (gcc regno = 0)
303 1 for %ecx (gcc regno = 2)
304 2 for %edx (gcc regno = 1)
305 3 for %ebx (gcc regno = 3)
306 4 for %esp (gcc regno = 7)
307 5 for %ebp (gcc regno = 6)
308 6 for %esi (gcc regno = 4)
309 7 for %edi (gcc regno = 5)
310 The following three DWARF register numbers are never generated by
311 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
312 believes these numbers have these meanings.
313 8 for %eip (no gcc equivalent)
314 9 for %eflags (gcc regno = 17)
315 10 for %trapno (no gcc equivalent)
316 It is not at all clear how we should number the FP stack registers
317 for the x86 architecture. If the version of SDB on x86/svr4 were
318 a bit less brain dead with respect to floating-point then we would
319 have a precedent to follow with respect to DWARF register numbers
320 for x86 FP registers, but the SDB on x86/svr4 was so completely
321 broken with respect to FP registers that it is hardly worth thinking
322 of it as something to strive for compatibility with.
323 The version of x86/svr4 SDB I had does (partially)
324 seem to believe that DWARF register number 11 is associated with
325 the x86 register %st(0), but that's about all. Higher DWARF
326 register numbers don't seem to be associated with anything in
327 particular, and even for DWARF regno 11, SDB only seemed to under-
328 stand that it should say that a variable lives in %st(0) (when
329 asked via an `=' command) if we said it was in DWARF regno 11,
330 but SDB still printed garbage when asked for the value of the
331 variable in question (via a `/' command).
332 (Also note that the labels SDB printed for various FP stack regs
333 when doing an `x' command were all wrong.)
334 Note that these problems generally don't affect the native SVR4
335 C compiler because it doesn't allow the use of -O with -g and
336 because when it is *not* optimizing, it allocates a memory
337 location for each floating-point variable, and the memory
338 location is what gets described in the DWARF AT_location
339 attribute for the variable in question.
340 Regardless of the severe mental illness of the x86/svr4 SDB, we
341 do something sensible here and we use the following DWARF
342 register numbers. Note that these are all stack-top-relative
343 numbers.
344 11 for %st(0) (gcc regno = 8)
345 12 for %st(1) (gcc regno = 9)
346 13 for %st(2) (gcc regno = 10)
347 14 for %st(3) (gcc regno = 11)
348 15 for %st(4) (gcc regno = 12)
349 16 for %st(5) (gcc regno = 13)
350 17 for %st(6) (gcc regno = 14)
351 18 for %st(7) (gcc regno = 15)
353 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
355 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
356 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
357 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
358 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
359 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
360 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
361 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
362 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
363 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
364 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
365 101, 102, 103, 104, /* bound registers */
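/* So, per the table above, a value living in %ecx (gcc regno 2) is described
   as DWARF register 1 in the debug info, and %st(0) (gcc regno 8) as DWARF
   register 11.  */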
368 /* Define parameter passing and return registers. */
370 static int const x86_64_int_parameter_registers[6] =
372 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
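/* I.e. the SysV AMD64 ABI passes the first six integer arguments in
   %rdi, %rsi, %rdx, %rcx, %r8 and %r9, in that order.  */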
375 static int const x86_64_ms_abi_int_parameter_registers[4] =
377 CX_REG, DX_REG, R8_REG, R9_REG
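/* The MS x64 ABI instead uses %rcx, %rdx, %r8 and %r9 for the first four
   integer arguments.  */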
380 static int const x86_64_int_return_registers[4] =
382 AX_REG, DX_REG, DI_REG, SI_REG
385 /* Additional registers that are clobbered by SYSV calls. */
387 #define NUM_X86_64_MS_CLOBBERED_REGS 12
388 static int const x86_64_ms_sysv_extra_clobbered_registers
389 [NUM_X86_64_MS_CLOBBERED_REGS] =
391 SI_REG, DI_REG,
392 XMM6_REG, XMM7_REG,
393 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
394 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
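/* These registers are call-clobbered in the SysV ABI but callee-saved in the
   MS ABI, so MS-ABI code calling a SysV function must preserve them; the
   out-of-line ms2sysv save/restore stubs below handle exactly that.  */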
397 enum xlogue_stub {
398 XLOGUE_STUB_SAVE,
399 XLOGUE_STUB_RESTORE,
400 XLOGUE_STUB_RESTORE_TAIL,
401 XLOGUE_STUB_SAVE_HFP,
402 XLOGUE_STUB_RESTORE_HFP,
403 XLOGUE_STUB_RESTORE_HFP_TAIL,
405 XLOGUE_STUB_COUNT
408 enum xlogue_stub_sets {
409 XLOGUE_SET_ALIGNED,
410 XLOGUE_SET_ALIGNED_PLUS_8,
411 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
412 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
414 XLOGUE_SET_COUNT
417 /* Register save/restore layout used by out-of-line stubs. */
418 class xlogue_layout {
419 public:
420 struct reginfo
422 unsigned regno;
423 HOST_WIDE_INT offset; /* Offset from the stub's base pointer (rax or
424 rsi) to where this register is stored. */
427 unsigned get_nregs () const {return m_nregs;}
428 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
430 const reginfo &get_reginfo (unsigned reg) const
432 gcc_assert (reg < m_nregs);
433 return m_regs[reg];
436 static const char *get_stub_name (enum xlogue_stub stub,
437 unsigned n_extra_args);
439 /* Returns an rtx for the stub's symbol based upon
440 1.) the specified stub (save, restore or restore_ret) and
441 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
442 3.) whether or not stack alignment is being performed. */
443 static rtx get_stub_rtx (enum xlogue_stub stub);
445 /* Returns the amount of stack space (including padding) that the stub
446 needs to store registers based upon data in the machine_function. */
447 HOST_WIDE_INT get_stack_space_used () const
449 const struct machine_function *m = cfun->machine;
450 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
452 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
453 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
456 /* Returns the offset for the base pointer used by the stub. */
457 HOST_WIDE_INT get_stub_ptr_offset () const
459 return STUB_INDEX_OFFSET + m_stack_align_off_in;
462 static const struct xlogue_layout &get_instance ();
463 static unsigned count_stub_managed_regs ();
464 static bool is_stub_managed_reg (unsigned regno, unsigned count);
466 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
467 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
468 static const unsigned MAX_REGS = 18;
469 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
470 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
471 static const unsigned STUB_NAME_MAX_LEN = 20;
472 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
473 static const unsigned REG_ORDER[MAX_REGS];
474 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
476 private:
477 xlogue_layout ();
478 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
479 xlogue_layout (const xlogue_layout &);
481 /* True if hard frame pointer is used. */
482 bool m_hfp;
484 /* Maximum number of registers this layout manages. */
485 unsigned m_nregs;
487 /* Incoming offset from 16-byte alignment. */
488 HOST_WIDE_INT m_stack_align_off_in;
490 /* Register order and offsets. */
491 struct reginfo m_regs[MAX_REGS];
493 /* Lazy-inited cache of symbol names for stubs. */
494 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
495 [STUB_NAME_MAX_LEN];
497 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
500 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
501 "savms64",
502 "resms64",
503 "resms64x",
504 "savms64f",
505 "resms64f",
506 "resms64fx"
509 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
510 /* The offset values below are where each register is stored, for each layout,
511 relative to the incoming stack pointer. The value of each m_regs[].offset will
512 be relative to the incoming base pointer (rax or rsi) used by the stub.
514 s_instances: 0 1 2 3
515 Offset: realigned or aligned + 8
516 Register aligned aligned + 8 aligned w/HFP w/HFP */
517 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
518 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
519 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
520 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
521 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
522 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
523 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
524 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
525 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
526 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
527 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
528 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
529 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
530 BP_REG, /* 0xc0 0xc8 N/A N/A */
531 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
532 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
533 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
534 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
537 /* Instantiate static const values. */
538 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
539 const unsigned xlogue_layout::MIN_REGS;
540 const unsigned xlogue_layout::MAX_REGS;
541 const unsigned xlogue_layout::MAX_EXTRA_REGS;
542 const unsigned xlogue_layout::VARIANT_COUNT;
543 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
545 /* Initialize xlogue_layout::s_stub_names to zero. */
546 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
547 [STUB_NAME_MAX_LEN];
549 /* Instantiates all xlogue_layout instances. */
550 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
551 xlogue_layout (0, false),
552 xlogue_layout (8, false),
553 xlogue_layout (0, true),
554 xlogue_layout (8, true)
557 /* Return an appropriate const instance of xlogue_layout based upon values
558 in cfun->machine and crtl. */
559 const struct xlogue_layout &
560 xlogue_layout::get_instance ()
562 enum xlogue_stub_sets stub_set;
563 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
565 if (stack_realign_fp)
566 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
567 else if (frame_pointer_needed)
568 stub_set = aligned_plus_8
569 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
570 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
571 else
572 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
574 return s_instances[stub_set];
577 /* Determine how many clobbered registers can be saved by the stub.
578 Returns the count of registers the stub will save and restore. */
579 unsigned
580 xlogue_layout::count_stub_managed_regs ()
582 bool hfp = frame_pointer_needed || stack_realign_fp;
583 unsigned i, count;
584 unsigned regno;
586 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
588 regno = REG_ORDER[i];
589 if (regno == BP_REG && hfp)
590 continue;
591 if (!ix86_save_reg (regno, false, false))
592 break;
593 ++count;
595 return count;
598 /* Determine if register REGNO is a stub managed register given the
599 total COUNT of stub managed registers. */
600 bool
601 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
603 bool hfp = frame_pointer_needed || stack_realign_fp;
604 unsigned i;
606 for (i = 0; i < count; ++i)
608 gcc_assert (i < MAX_REGS);
609 if (REG_ORDER[i] == BP_REG && hfp)
610 ++count;
611 else if (REG_ORDER[i] == regno)
612 return true;
614 return false;
617 /* Constructor for xlogue_layout. */
618 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
619 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
620 m_stack_align_off_in (stack_align_off_in)
622 HOST_WIDE_INT offset = stack_align_off_in;
623 unsigned i, j;
625 for (i = j = 0; i < MAX_REGS; ++i)
627 unsigned regno = REG_ORDER[i];
629 if (regno == BP_REG && hfp)
630 continue;
631 if (SSE_REGNO_P (regno))
633 offset += 16;
634 /* Verify that SSE regs are always aligned. */
635 gcc_assert (!((stack_align_off_in + offset) & 15));
637 else
638 offset += 8;
640 m_regs[j].regno = regno;
641 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
643 gcc_assert (j == m_nregs);
646 const char *
647 xlogue_layout::get_stub_name (enum xlogue_stub stub,
648 unsigned n_extra_regs)
650 const int have_avx = TARGET_AVX;
651 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
653 /* Lazy init */
654 if (!*name)
656 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
657 (have_avx ? "avx" : "sse"),
658 STUB_BASE_NAMES[stub],
659 MIN_REGS + n_extra_regs);
660 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
663 return name;
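/* For example, the plain save stub for the minimal 12-register set is named
   "__avx_savms64_12" when AVX is enabled and "__sse_savms64_12" otherwise;
   with four extra registers the tail-call restore stub becomes
   "__sse_resms64x_16".  */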
666 /* Return rtx of a symbol ref for the entry point (based upon
667 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
669 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
671 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
672 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
673 gcc_assert (stub < XLOGUE_STUB_COUNT);
674 gcc_assert (crtl->stack_realign_finalized);
676 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
679 /* Define the structure for the machine field in struct function. */
681 struct GTY(()) stack_local_entry {
682 unsigned short mode;
683 unsigned short n;
684 rtx rtl;
685 struct stack_local_entry *next;
688 /* Which cpu are we scheduling for. */
689 enum attr_cpu ix86_schedule;
691 /* Which cpu are we optimizing for. */
692 enum processor_type ix86_tune;
694 /* Which instruction set architecture to use. */
695 enum processor_type ix86_arch;
697 /* True if processor has SSE prefetch instruction. */
698 unsigned char x86_prefetch_sse;
700 /* -mstackrealign option */
701 static const char ix86_force_align_arg_pointer_string[]
702 = "force_align_arg_pointer";
704 static rtx (*ix86_gen_leave) (void);
705 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
706 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
707 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
708 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
709 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
710 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
711 static rtx (*ix86_gen_clzero) (rtx);
712 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
713 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
714 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
715 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
716 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
717 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
719 /* Preferred alignment for stack boundary in bits. */
720 unsigned int ix86_preferred_stack_boundary;
722 /* Alignment for incoming stack boundary in bits specified at
723 command line. */
724 static unsigned int ix86_user_incoming_stack_boundary;
726 /* Default alignment for incoming stack boundary in bits. */
727 static unsigned int ix86_default_incoming_stack_boundary;
729 /* Alignment for incoming stack boundary in bits. */
730 unsigned int ix86_incoming_stack_boundary;
732 /* Calling abi specific va_list type nodes. */
733 static GTY(()) tree sysv_va_list_type_node;
734 static GTY(()) tree ms_va_list_type_node;
736 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
737 char internal_label_prefix[16];
738 int internal_label_prefix_len;
740 /* Fence to use after loop using movnt. */
741 tree x86_mfence;
743 /* Register class used for passing a given 64bit part of the argument.
744 These represent classes as documented by the PS ABI, with the exception
745 of the SSESF and SSEDF classes, which are basically the SSE class; GCC will
746 just use an SF or DFmode move instead of DImode to avoid reformatting penalties.
748 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
749 whenever possible (the upper half does contain padding). */
750 enum x86_64_reg_class
752 X86_64_NO_CLASS,
753 X86_64_INTEGER_CLASS,
754 X86_64_INTEGERSI_CLASS,
755 X86_64_SSE_CLASS,
756 X86_64_SSESF_CLASS,
757 X86_64_SSEDF_CLASS,
758 X86_64_SSEUP_CLASS,
759 X86_64_X87_CLASS,
760 X86_64_X87UP_CLASS,
761 X86_64_COMPLEX_X87_CLASS,
762 X86_64_MEMORY_CLASS
765 #define MAX_CLASSES 8
767 /* Table of constants used by fldpi, fldln2, etc.... */
768 static REAL_VALUE_TYPE ext_80387_constants_table [5];
769 static bool ext_80387_constants_init;
772 static struct machine_function * ix86_init_machine_status (void);
773 static rtx ix86_function_value (const_tree, const_tree, bool);
774 static bool ix86_function_value_regno_p (const unsigned int);
775 static unsigned int ix86_function_arg_boundary (machine_mode,
776 const_tree);
777 static rtx ix86_static_chain (const_tree, bool);
778 static int ix86_function_regparm (const_tree, const_tree);
779 static void ix86_compute_frame_layout (void);
780 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
781 rtx, rtx, int);
782 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
783 static tree ix86_canonical_va_list_type (tree);
784 static void predict_jump (int);
785 static unsigned int split_stack_prologue_scratch_regno (void);
786 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
788 enum ix86_function_specific_strings
790 IX86_FUNCTION_SPECIFIC_ARCH,
791 IX86_FUNCTION_SPECIFIC_TUNE,
792 IX86_FUNCTION_SPECIFIC_MAX
795 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
796 const char *, const char *, enum fpmath_unit,
797 bool);
798 static void ix86_function_specific_save (struct cl_target_option *,
799 struct gcc_options *opts);
800 static void ix86_function_specific_restore (struct gcc_options *opts,
801 struct cl_target_option *);
802 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
803 static void ix86_function_specific_print (FILE *, int,
804 struct cl_target_option *);
805 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
806 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
807 struct gcc_options *,
808 struct gcc_options *,
809 struct gcc_options *);
810 static bool ix86_can_inline_p (tree, tree);
811 static void ix86_set_current_function (tree);
812 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
814 static enum calling_abi ix86_function_abi (const_tree);
817 #ifndef SUBTARGET32_DEFAULT_CPU
818 #define SUBTARGET32_DEFAULT_CPU "i386"
819 #endif
821 /* Whether -mtune= or -march= were specified */
822 static int ix86_tune_defaulted;
823 static int ix86_arch_specified;
825 /* Vectorization library interface and handlers. */
826 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
828 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
829 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
831 /* Processor target table, indexed by processor number */
832 struct ptt
834 const char *const name; /* processor name */
835 const struct processor_costs *cost; /* Processor costs */
836 const int align_loop; /* Default alignments. */
837 const int align_loop_max_skip;
838 const int align_jump;
839 const int align_jump_max_skip;
840 const int align_func;
843 /* This table must be in sync with enum processor_type in i386.h. */
844 static const struct ptt processor_target_table[PROCESSOR_max] =
846 {"generic", &generic_cost, 16, 10, 16, 10, 16},
847 {"i386", &i386_cost, 4, 3, 4, 3, 4},
848 {"i486", &i486_cost, 16, 15, 16, 15, 16},
849 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
850 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
851 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
852 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
853 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
854 {"core2", &core_cost, 16, 10, 16, 10, 16},
855 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
856 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
857 {"haswell", &core_cost, 16, 10, 16, 10, 16},
858 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
859 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
860 {"knl", &slm_cost, 16, 15, 16, 7, 16},
861 {"knm", &slm_cost, 16, 15, 16, 7, 16},
862 {"skylake-avx512", &skylake_cost, 16, 10, 16, 10, 16},
863 {"cannonlake", &skylake_cost, 16, 10, 16, 10, 16},
864 {"icelake-client", &skylake_cost, 16, 10, 16, 10, 16},
865 {"icelake-server", &skylake_cost, 16, 10, 16, 10, 16},
866 {"intel", &intel_cost, 16, 15, 16, 7, 16},
867 {"geode", &geode_cost, 0, 0, 0, 0, 0},
868 {"k6", &k6_cost, 32, 7, 32, 7, 32},
869 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
870 {"k8", &k8_cost, 16, 7, 16, 7, 16},
871 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
872 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
873 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
874 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
875 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
876 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
877 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
878 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
881 static unsigned int
882 rest_of_handle_insert_vzeroupper (void)
884 int i;
886 /* vzeroupper instructions are inserted immediately after reload to
887 account for possible spills from 256bit or 512bit registers. The pass
888 reuses the mode switching infrastructure by re-running the mode insertion
889 pass, so disable the entities that have already been processed. */
890 for (i = 0; i < MAX_386_ENTITIES; i++)
891 ix86_optimize_mode_switching[i] = 0;
893 ix86_optimize_mode_switching[AVX_U128] = 1;
895 /* Call optimize_mode_switching. */
896 g->get_passes ()->execute_pass_mode_switching ();
897 return 0;
900 /* Return 1 if INSN uses or defines a hard register.
901 Hard register uses in a memory address are ignored.
902 Clobbers and flags definitions are ignored. */
904 static bool
905 has_non_address_hard_reg (rtx_insn *insn)
907 df_ref ref;
908 FOR_EACH_INSN_DEF (ref, insn)
909 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
910 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
911 && DF_REF_REGNO (ref) != FLAGS_REG)
912 return true;
914 FOR_EACH_INSN_USE (ref, insn)
915 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
916 return true;
918 return false;
921 /* Check if comparison INSN may be transformed
922 into a vector comparison. Currently we transform
923 only zero checks, which look like:
925 (set (reg:CCZ 17 flags)
926 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
927 (subreg:SI (reg:DI x) 0))
928 (const_int 0 [0]))) */
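/* At the source level this is the zero test of a 64-bit value on a 32-bit
   target, e.g. (illustrative):

     unsigned long long x;
     if (x == 0)
       ...

   where the two SImode halves of X are IORed together and compared against
   zero.  */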
930 static bool
931 convertible_comparison_p (rtx_insn *insn)
933 if (!TARGET_SSE4_1)
934 return false;
936 rtx def_set = single_set (insn);
938 gcc_assert (def_set);
940 rtx src = SET_SRC (def_set);
941 rtx dst = SET_DEST (def_set);
943 gcc_assert (GET_CODE (src) == COMPARE);
945 if (GET_CODE (dst) != REG
946 || REGNO (dst) != FLAGS_REG
947 || GET_MODE (dst) != CCZmode)
948 return false;
950 rtx op1 = XEXP (src, 0);
951 rtx op2 = XEXP (src, 1);
953 if (op2 != CONST0_RTX (GET_MODE (op2)))
954 return false;
956 if (GET_CODE (op1) != IOR)
957 return false;
959 op2 = XEXP (op1, 1);
960 op1 = XEXP (op1, 0);
962 if (!SUBREG_P (op1)
963 || !SUBREG_P (op2)
964 || GET_MODE (op1) != SImode
965 || GET_MODE (op2) != SImode
966 || ((SUBREG_BYTE (op1) != 0
967 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
968 && (SUBREG_BYTE (op2) != 0
969 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
970 return false;
972 op1 = SUBREG_REG (op1);
973 op2 = SUBREG_REG (op2);
975 if (op1 != op2
976 || !REG_P (op1)
977 || GET_MODE (op1) != DImode)
978 return false;
980 return true;
983 /* The DImode version of scalar_to_vector_candidate_p. */
985 static bool
986 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
988 rtx def_set = single_set (insn);
990 if (!def_set)
991 return false;
993 if (has_non_address_hard_reg (insn))
994 return false;
996 rtx src = SET_SRC (def_set);
997 rtx dst = SET_DEST (def_set);
999 if (GET_CODE (src) == COMPARE)
1000 return convertible_comparison_p (insn);
1002 /* We are interested in DImode promotion only. */
1003 if ((GET_MODE (src) != DImode
1004 && !CONST_INT_P (src))
1005 || GET_MODE (dst) != DImode)
1006 return false;
1008 if (!REG_P (dst) && !MEM_P (dst))
1009 return false;
1011 switch (GET_CODE (src))
1013 case ASHIFTRT:
1014 if (!TARGET_AVX512VL)
1015 return false;
1016 /* FALLTHRU */
1018 case ASHIFT:
1019 case LSHIFTRT:
1020 if (!REG_P (XEXP (src, 1))
1021 && (!SUBREG_P (XEXP (src, 1))
1022 || SUBREG_BYTE (XEXP (src, 1)) != 0
1023 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1024 && (!CONST_INT_P (XEXP (src, 1))
1025 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1026 return false;
1028 if (GET_MODE (XEXP (src, 1)) != QImode
1029 && !CONST_INT_P (XEXP (src, 1)))
1030 return false;
1031 break;
1033 case PLUS:
1034 case MINUS:
1035 case IOR:
1036 case XOR:
1037 case AND:
1038 if (!REG_P (XEXP (src, 1))
1039 && !MEM_P (XEXP (src, 1))
1040 && !CONST_INT_P (XEXP (src, 1)))
1041 return false;
1043 if (GET_MODE (XEXP (src, 1)) != DImode
1044 && !CONST_INT_P (XEXP (src, 1)))
1045 return false;
1046 break;
1048 case NEG:
1049 case NOT:
1050 break;
1052 case REG:
1053 return true;
1055 case MEM:
1056 case CONST_INT:
1057 return REG_P (dst);
1059 default:
1060 return false;
1063 if (!REG_P (XEXP (src, 0))
1064 && !MEM_P (XEXP (src, 0))
1065 && !CONST_INT_P (XEXP (src, 0))
1066 /* Check for andnot case. */
1067 && (GET_CODE (src) != AND
1068 || GET_CODE (XEXP (src, 0)) != NOT
1069 || !REG_P (XEXP (XEXP (src, 0), 0))))
1070 return false;
1072 if (GET_MODE (XEXP (src, 0)) != DImode
1073 && !CONST_INT_P (XEXP (src, 0)))
1074 return false;
1076 return true;
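/* As an illustrative example, a 64-bit bitwise operation compiled for a
   32-bit target, e.g.

     unsigned long long a, b;
     a &= b;

   is a candidate here: the DImode AND can be performed in an SSE register as
   a V2DImode operation instead of being split into a pair of SImode
   instructions.  */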
1079 /* The TImode version of scalar_to_vector_candidate_p. */
1081 static bool
1082 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1084 rtx def_set = single_set (insn);
1086 if (!def_set)
1087 return false;
1089 if (has_non_address_hard_reg (insn))
1090 return false;
1092 rtx src = SET_SRC (def_set);
1093 rtx dst = SET_DEST (def_set);
1095 /* Only TImode loads and stores are allowed. */
1096 if (GET_MODE (dst) != TImode)
1097 return false;
1099 if (MEM_P (dst))
1101 /* Check for a store. The memory must be aligned, or an unaligned
1102 store must be optimal. Only support stores from a register, a standard
1103 SSE constant or a CONST_WIDE_INT generated from a piecewise store.
1105 ??? Verify the performance impact before enabling CONST_INT for
1106 __int128 stores. */
1107 if (misaligned_operand (dst, TImode)
1108 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1109 return false;
1111 switch (GET_CODE (src))
1113 default:
1114 return false;
1116 case REG:
1117 case CONST_WIDE_INT:
1118 return true;
1120 case CONST_INT:
1121 return standard_sse_constant_p (src, TImode);
1124 else if (MEM_P (src))
1126 /* Check for a load. The memory must be aligned, or an unaligned load
1127 must be optimal. */
1128 return (REG_P (dst)
1129 && (!misaligned_operand (src, TImode)
1130 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1133 return false;
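/* As a sketch, copying an aligned __int128 object

     __int128 a, b;
     a = b;

   qualifies here: the TImode load and store can each be done with a single
   V1TImode (SSE) move instead of a sequence of DImode moves.  */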
1136 /* Return 1 if INSN may be converted into a vector
1137 instruction. */
1139 static bool
1140 scalar_to_vector_candidate_p (rtx_insn *insn)
1142 if (TARGET_64BIT)
1143 return timode_scalar_to_vector_candidate_p (insn);
1144 else
1145 return dimode_scalar_to_vector_candidate_p (insn);
1148 /* The DImode version of remove_non_convertible_regs. */
1150 static void
1151 dimode_remove_non_convertible_regs (bitmap candidates)
1153 bitmap_iterator bi;
1154 unsigned id;
1155 bitmap regs = BITMAP_ALLOC (NULL);
1157 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1159 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1160 rtx reg = SET_DEST (def_set);
1162 if (!REG_P (reg)
1163 || bitmap_bit_p (regs, REGNO (reg))
1164 || HARD_REGISTER_P (reg))
1165 continue;
1167 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1168 def;
1169 def = DF_REF_NEXT_REG (def))
1171 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1173 if (dump_file)
1174 fprintf (dump_file,
1175 "r%d has non convertible definition in insn %d\n",
1176 REGNO (reg), DF_REF_INSN_UID (def));
1178 bitmap_set_bit (regs, REGNO (reg));
1179 break;
1184 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1186 for (df_ref def = DF_REG_DEF_CHAIN (id);
1187 def;
1188 def = DF_REF_NEXT_REG (def))
1189 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1191 if (dump_file)
1192 fprintf (dump_file, "Removing insn %d from candidates list\n",
1193 DF_REF_INSN_UID (def));
1195 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1199 BITMAP_FREE (regs);
1202 /* For a register REGNO, scan instructions for its defs and uses.
1203 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1205 static void
1206 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1207 unsigned int regno)
1209 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1210 def;
1211 def = DF_REF_NEXT_REG (def))
1213 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1215 if (dump_file)
1216 fprintf (dump_file,
1217 "r%d has non convertible def in insn %d\n",
1218 regno, DF_REF_INSN_UID (def));
1220 bitmap_set_bit (regs, regno);
1221 break;
1225 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1226 ref;
1227 ref = DF_REF_NEXT_REG (ref))
1229 /* Debug instructions are skipped. */
1230 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1231 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1233 if (dump_file)
1234 fprintf (dump_file,
1235 "r%d has non convertible use in insn %d\n",
1236 regno, DF_REF_INSN_UID (ref));
1238 bitmap_set_bit (regs, regno);
1239 break;
1244 /* The TImode version of remove_non_convertible_regs. */
1246 static void
1247 timode_remove_non_convertible_regs (bitmap candidates)
1249 bitmap_iterator bi;
1250 unsigned id;
1251 bitmap regs = BITMAP_ALLOC (NULL);
1253 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1255 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1256 rtx dest = SET_DEST (def_set);
1257 rtx src = SET_SRC (def_set);
1259 if ((!REG_P (dest)
1260 || bitmap_bit_p (regs, REGNO (dest))
1261 || HARD_REGISTER_P (dest))
1262 && (!REG_P (src)
1263 || bitmap_bit_p (regs, REGNO (src))
1264 || HARD_REGISTER_P (src)))
1265 continue;
1267 if (REG_P (dest))
1268 timode_check_non_convertible_regs (candidates, regs,
1269 REGNO (dest));
1271 if (REG_P (src))
1272 timode_check_non_convertible_regs (candidates, regs,
1273 REGNO (src));
1276 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1278 for (df_ref def = DF_REG_DEF_CHAIN (id);
1279 def;
1280 def = DF_REF_NEXT_REG (def))
1281 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1283 if (dump_file)
1284 fprintf (dump_file, "Removing insn %d from candidates list\n",
1285 DF_REF_INSN_UID (def));
1287 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1290 for (df_ref ref = DF_REG_USE_CHAIN (id);
1291 ref;
1292 ref = DF_REF_NEXT_REG (ref))
1293 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1295 if (dump_file)
1296 fprintf (dump_file, "Removing insn %d from candidates list\n",
1297 DF_REF_INSN_UID (ref));
1299 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1303 BITMAP_FREE (regs);
1306 /* For a given bitmap of insn UIDs, scan all instructions and
1307 remove an insn from CANDIDATES if it has both convertible
1308 and non-convertible definitions.
1310 All insns in the bitmap are conversion candidates according to
1311 scalar_to_vector_candidate_p. Currently this implies all insns
1312 are single_set. */
1314 static void
1315 remove_non_convertible_regs (bitmap candidates)
1317 if (TARGET_64BIT)
1318 timode_remove_non_convertible_regs (candidates);
1319 else
1320 dimode_remove_non_convertible_regs (candidates);
1323 class scalar_chain
1325 public:
1326 scalar_chain ();
1327 virtual ~scalar_chain ();
1329 static unsigned max_id;
1331 /* ID of a chain. */
1332 unsigned int chain_id;
1333 /* A queue of instructions to be included into a chain. */
1334 bitmap queue;
1335 /* Instructions included into a chain. */
1336 bitmap insns;
1337 /* All registers defined by a chain. */
1338 bitmap defs;
1339 /* Registers used in both vector and scalar modes. */
1340 bitmap defs_conv;
1342 void build (bitmap candidates, unsigned insn_uid);
1343 virtual int compute_convert_gain () = 0;
1344 int convert ();
1346 protected:
1347 void add_to_queue (unsigned insn_uid);
1348 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1350 private:
1351 void add_insn (bitmap candidates, unsigned insn_uid);
1352 void analyze_register_chain (bitmap candidates, df_ref ref);
1353 virtual void mark_dual_mode_def (df_ref def) = 0;
1354 virtual void convert_insn (rtx_insn *insn) = 0;
1355 virtual void convert_registers () = 0;
1358 class dimode_scalar_chain : public scalar_chain
1360 public:
1361 int compute_convert_gain ();
1362 private:
1363 void mark_dual_mode_def (df_ref def);
1364 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1365 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1366 void convert_insn (rtx_insn *insn);
1367 void convert_op (rtx *op, rtx_insn *insn);
1368 void convert_reg (unsigned regno);
1369 void make_vector_copies (unsigned regno);
1370 void convert_registers ();
1371 int vector_const_cost (rtx exp);
1374 class timode_scalar_chain : public scalar_chain
1376 public:
1377 /* Converting from TImode to V1TImode is always faster. */
1378 int compute_convert_gain () { return 1; }
1380 private:
1381 void mark_dual_mode_def (df_ref def);
1382 void fix_debug_reg_uses (rtx reg);
1383 void convert_insn (rtx_insn *insn);
1384 /* We don't convert registers to a different size. */
1385 void convert_registers () {}
1388 unsigned scalar_chain::max_id = 0;
1390 /* Initialize new chain. */
1392 scalar_chain::scalar_chain ()
1394 chain_id = ++max_id;
1396 if (dump_file)
1397 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1399 bitmap_obstack_initialize (NULL);
1400 insns = BITMAP_ALLOC (NULL);
1401 defs = BITMAP_ALLOC (NULL);
1402 defs_conv = BITMAP_ALLOC (NULL);
1403 queue = NULL;
1406 /* Free chain's data. */
1408 scalar_chain::~scalar_chain ()
1410 BITMAP_FREE (insns);
1411 BITMAP_FREE (defs);
1412 BITMAP_FREE (defs_conv);
1413 bitmap_obstack_release (NULL);
1416 /* Add instruction into chains' queue. */
1418 void
1419 scalar_chain::add_to_queue (unsigned insn_uid)
1421 if (bitmap_bit_p (insns, insn_uid)
1422 || bitmap_bit_p (queue, insn_uid))
1423 return;
1425 if (dump_file)
1426 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1427 insn_uid, chain_id);
1428 bitmap_set_bit (queue, insn_uid);
1431 /* For DImode conversion, mark register defined by DEF as requiring
1432 conversion. */
1434 void
1435 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1437 gcc_assert (DF_REF_REG_DEF_P (def));
1439 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1440 return;
1442 if (dump_file)
1443 fprintf (dump_file,
1444 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1445 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1447 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1450 /* For TImode conversion, it is unused. */
1452 void
1453 timode_scalar_chain::mark_dual_mode_def (df_ref)
1455 gcc_unreachable ();
1458 /* Check REF's chain to add new insns into a queue
1459 and find registers requiring conversion. */
1461 void
1462 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1464 df_link *chain;
1466 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1467 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1468 add_to_queue (DF_REF_INSN_UID (ref));
1470 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1472 unsigned uid = DF_REF_INSN_UID (chain->ref);
1474 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1475 continue;
1477 if (!DF_REF_REG_MEM_P (chain->ref))
1479 if (bitmap_bit_p (insns, uid))
1480 continue;
1482 if (bitmap_bit_p (candidates, uid))
1484 add_to_queue (uid);
1485 continue;
1489 if (DF_REF_REG_DEF_P (chain->ref))
1491 if (dump_file)
1492 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1493 DF_REF_REGNO (chain->ref), uid);
1494 mark_dual_mode_def (chain->ref);
1496 else
1498 if (dump_file)
1499 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1500 DF_REF_REGNO (chain->ref), uid);
1501 mark_dual_mode_def (ref);
1506 /* Add instruction into a chain. */
1508 void
1509 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1511 if (bitmap_bit_p (insns, insn_uid))
1512 return;
1514 if (dump_file)
1515 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1517 bitmap_set_bit (insns, insn_uid);
1519 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1520 rtx def_set = single_set (insn);
1521 if (def_set && REG_P (SET_DEST (def_set))
1522 && !HARD_REGISTER_P (SET_DEST (def_set)))
1523 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1525 df_ref ref;
1526 df_ref def;
1527 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1528 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1529 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1530 def;
1531 def = DF_REF_NEXT_REG (def))
1532 analyze_register_chain (candidates, def);
1533 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1534 if (!DF_REF_REG_MEM_P (ref))
1535 analyze_register_chain (candidates, ref);
1538 /* Build a new chain starting from insn INSN_UID, recursively
1539 adding all dependent uses and definitions. */
1541 void
1542 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1544 queue = BITMAP_ALLOC (NULL);
1545 bitmap_set_bit (queue, insn_uid);
1547 if (dump_file)
1548 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1550 while (!bitmap_empty_p (queue))
1552 insn_uid = bitmap_first_set_bit (queue);
1553 bitmap_clear_bit (queue, insn_uid);
1554 bitmap_clear_bit (candidates, insn_uid);
1555 add_insn (candidates, insn_uid);
1558 if (dump_file)
1560 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1561 fprintf (dump_file, " insns: ");
1562 dump_bitmap (dump_file, insns);
1563 if (!bitmap_empty_p (defs_conv))
1565 bitmap_iterator bi;
1566 unsigned id;
1567 const char *comma = "";
1568 fprintf (dump_file, " defs to convert: ");
1569 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1571 fprintf (dump_file, "%sr%d", comma, id);
1572 comma = ", ";
1574 fprintf (dump_file, "\n");
1578 BITMAP_FREE (queue);
1581 /* Return the cost of building a vector constant
1582 instead of using a scalar one. */
1585 dimode_scalar_chain::vector_const_cost (rtx exp)
1587 gcc_assert (CONST_INT_P (exp));
1589 if (standard_sse_constant_p (exp, V2DImode))
1590 return COSTS_N_INSNS (1);
1591 return ix86_cost->sse_load[1];
1594 /* Compute a gain for chain conversion. */
1597 dimode_scalar_chain::compute_convert_gain ()
1599 bitmap_iterator bi;
1600 unsigned insn_uid;
1601 int gain = 0;
1602 int cost = 0;
1604 if (dump_file)
1605 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1607 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1609 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1610 rtx def_set = single_set (insn);
1611 rtx src = SET_SRC (def_set);
1612 rtx dst = SET_DEST (def_set);
1614 if (REG_P (src) && REG_P (dst))
1615 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1616 else if (REG_P (src) && MEM_P (dst))
1617 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1618 else if (MEM_P (src) && REG_P (dst))
1619 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1620 else if (GET_CODE (src) == ASHIFT
1621 || GET_CODE (src) == ASHIFTRT
1622 || GET_CODE (src) == LSHIFTRT)
1624 if (CONST_INT_P (XEXP (src, 0)))
1625 gain -= vector_const_cost (XEXP (src, 0));
1626 if (CONST_INT_P (XEXP (src, 1)))
1628 gain += ix86_cost->shift_const;
1629 if (INTVAL (XEXP (src, 1)) >= 32)
1630 gain -= COSTS_N_INSNS (1);
1632 else
1633 /* Additional gain for omitting two CMOVs. */
1634 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1636 else if (GET_CODE (src) == PLUS
1637 || GET_CODE (src) == MINUS
1638 || GET_CODE (src) == IOR
1639 || GET_CODE (src) == XOR
1640 || GET_CODE (src) == AND)
1642 gain += ix86_cost->add;
1643 /* Additional gain for andnot for targets without BMI. */
1644 if (GET_CODE (XEXP (src, 0)) == NOT
1645 && !TARGET_BMI)
1646 gain += 2 * ix86_cost->add;
1648 if (CONST_INT_P (XEXP (src, 0)))
1649 gain -= vector_const_cost (XEXP (src, 0));
1650 if (CONST_INT_P (XEXP (src, 1)))
1651 gain -= vector_const_cost (XEXP (src, 1));
1653 else if (GET_CODE (src) == NEG
1654 || GET_CODE (src) == NOT)
1655 gain += ix86_cost->add - COSTS_N_INSNS (1);
1656 else if (GET_CODE (src) == COMPARE)
1658 /* Assume comparison cost is the same. */
1660 else if (CONST_INT_P (src))
1662 if (REG_P (dst))
1663 gain += COSTS_N_INSNS (2);
1664 else if (MEM_P (dst))
1665 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1666 gain -= vector_const_cost (src);
1668 else
1669 gcc_unreachable ();
1672 if (dump_file)
1673 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1675 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1676 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1678 if (dump_file)
1679 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1681 gain -= cost;
1683 if (dump_file)
1684 fprintf (dump_file, " Total gain: %d\n", gain);
1686 return gain;
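/* For instance, per the costs above a DImode register-to-register move
   costs two SImode moves (COSTS_N_INSNS (2)) in scalar form but only one
   xmm_move after conversion, while every register that must also stay
   live in scalar mode charges mmxsse_to_integer per definition against
   the gain.  */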
1689 /* Replace REG in X with a V2DI subreg of NEW_REG. */
1692 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1694 if (x == reg)
1695 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1697 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1698 int i, j;
1699 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1701 if (fmt[i] == 'e')
1702 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1703 else if (fmt[i] == 'E')
1704 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1705 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1706 reg, new_reg);
1709 return x;
1712 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1714 void
1715 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1716 rtx reg, rtx new_reg)
1718 replace_with_subreg (single_set (insn), reg, new_reg);
1721 /* Insert generated conversion instruction sequence INSNS
1722 after instruction AFTER. A new BB may be required in case the
1723 instruction has an EH region attached. */
1725 void
1726 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1728 if (!control_flow_insn_p (after))
1730 emit_insn_after (insns, after);
1731 return;
1734 basic_block bb = BLOCK_FOR_INSN (after);
1735 edge e = find_fallthru_edge (bb->succs);
1736 gcc_assert (e);
1738 basic_block new_bb = split_edge (e);
1739 emit_insn_after (insns, BB_HEAD (new_bb));
1742 /* Make vector copies for all register REGNO definitions
1743 and replace its uses within the chain. */
1745 void
1746 dimode_scalar_chain::make_vector_copies (unsigned regno)
1748 rtx reg = regno_reg_rtx[regno];
1749 rtx vreg = gen_reg_rtx (DImode);
1750 bool count_reg = false;
1751 df_ref ref;
1753 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1754 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1756 df_ref use;
1758 /* Detect the count register of a shift instruction. */
1759 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1760 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1762 rtx_insn *insn = DF_REF_INSN (use);
1763 rtx def_set = single_set (insn);
1765 gcc_assert (def_set);
1767 rtx src = SET_SRC (def_set);
1769 if ((GET_CODE (src) == ASHIFT
1770 || GET_CODE (src) == ASHIFTRT
1771 || GET_CODE (src) == LSHIFTRT)
1772 && !CONST_INT_P (XEXP (src, 1))
1773 && reg_or_subregno (XEXP (src, 1)) == regno)
1774 count_reg = true;
1777 start_sequence ();
1778 if (count_reg)
1780 rtx qreg = gen_lowpart (QImode, reg);
1781 rtx tmp = gen_reg_rtx (SImode);
1783 if (TARGET_ZERO_EXTEND_WITH_AND
1784 && optimize_function_for_speed_p (cfun))
1786 emit_move_insn (tmp, const0_rtx);
1787 emit_insn (gen_movstrictqi
1788 (gen_lowpart (QImode, tmp), qreg));
1790 else
1791 emit_insn (gen_rtx_SET
1792 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1794 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1796 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1797 emit_move_insn (slot, tmp);
1798 tmp = copy_rtx (slot);
1801 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1803 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1805 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1806 emit_move_insn (adjust_address (tmp, SImode, 0),
1807 gen_rtx_SUBREG (SImode, reg, 0));
1808 emit_move_insn (adjust_address (tmp, SImode, 4),
1809 gen_rtx_SUBREG (SImode, reg, 4));
1810 emit_move_insn (vreg, tmp);
1812 else if (TARGET_SSE4_1)
1814 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1815 CONST0_RTX (V4SImode),
1816 gen_rtx_SUBREG (SImode, reg, 0)));
1817 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1818 gen_rtx_SUBREG (V4SImode, vreg, 0),
1819 gen_rtx_SUBREG (SImode, reg, 4),
1820 GEN_INT (2)));
1822 else
1824 rtx tmp = gen_reg_rtx (DImode);
1825 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1826 CONST0_RTX (V4SImode),
1827 gen_rtx_SUBREG (SImode, reg, 0)));
1828 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1829 CONST0_RTX (V4SImode),
1830 gen_rtx_SUBREG (SImode, reg, 4)));
1831 emit_insn (gen_vec_interleave_lowv4si
1832 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1833 gen_rtx_SUBREG (V4SImode, vreg, 0),
1834 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1836 rtx_insn *seq = get_insns ();
1837 end_sequence ();
1838 rtx_insn *insn = DF_REF_INSN (ref);
1839 emit_conversion_insns (seq, insn);
1841 if (dump_file)
1842 fprintf (dump_file,
1843 " Copied r%d to a vector register r%d for insn %d\n",
1844 regno, REGNO (vreg), INSN_UID (insn));
1847 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1848 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1850 rtx_insn *insn = DF_REF_INSN (ref);
1851 if (count_reg)
1853 rtx def_set = single_set (insn);
1854 gcc_assert (def_set);
1856 rtx src = SET_SRC (def_set);
1858 if ((GET_CODE (src) == ASHIFT
1859 || GET_CODE (src) == ASHIFTRT
1860 || GET_CODE (src) == LSHIFTRT)
1861 && !CONST_INT_P (XEXP (src, 1))
1862 && reg_or_subregno (XEXP (src, 1)) == regno)
1863 XEXP (src, 1) = vreg;
1865 else
1866 replace_with_subreg_in_insn (insn, reg, vreg);
1868 if (dump_file)
1869 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1870 regno, REGNO (vreg), INSN_UID (insn));
1874 /* Convert all definitions of register REGNO
1875 and fix its uses. Scalar copies may be created
1876 in case the register is used in a non-convertible insn. */
1878 void
1879 dimode_scalar_chain::convert_reg (unsigned regno)
1881 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1882 rtx reg = regno_reg_rtx[regno];
1883 rtx scopy = NULL_RTX;
1884 df_ref ref;
1885 bitmap conv;
1887 conv = BITMAP_ALLOC (NULL);
1888 bitmap_copy (conv, insns);
1890 if (scalar_copy)
1891 scopy = gen_reg_rtx (DImode);
1893 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1895 rtx_insn *insn = DF_REF_INSN (ref);
1896 rtx def_set = single_set (insn);
1897 rtx src = SET_SRC (def_set);
1898 rtx reg = DF_REF_REG (ref);
1900 if (!MEM_P (src))
1902 replace_with_subreg_in_insn (insn, reg, reg);
1903 bitmap_clear_bit (conv, INSN_UID (insn));
1906 if (scalar_copy)
1908 start_sequence ();
1909 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1911 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1912 emit_move_insn (tmp, reg);
1913 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1914 adjust_address (tmp, SImode, 0));
1915 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1916 adjust_address (tmp, SImode, 4));
1918 else if (TARGET_SSE4_1)
1920 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1921 emit_insn
1922 (gen_rtx_SET
1923 (gen_rtx_SUBREG (SImode, scopy, 0),
1924 gen_rtx_VEC_SELECT (SImode,
1925 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1927 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1928 emit_insn
1929 (gen_rtx_SET
1930 (gen_rtx_SUBREG (SImode, scopy, 4),
1931 gen_rtx_VEC_SELECT (SImode,
1932 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1934 else
1936 rtx vcopy = gen_reg_rtx (V2DImode);
1937 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1938 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1939 gen_rtx_SUBREG (SImode, vcopy, 0));
1940 emit_move_insn (vcopy,
1941 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1942 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1943 gen_rtx_SUBREG (SImode, vcopy, 0));
1945 rtx_insn *seq = get_insns ();
1946 end_sequence ();
1947 emit_conversion_insns (seq, insn);
1949 if (dump_file)
1950 fprintf (dump_file,
1951 " Copied r%d to a scalar register r%d for insn %d\n",
1952 regno, REGNO (scopy), INSN_UID (insn));
1956 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1957 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1959 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1961 rtx_insn *insn = DF_REF_INSN (ref);
1963 rtx def_set = single_set (insn);
1964 gcc_assert (def_set);
1966 rtx src = SET_SRC (def_set);
1967 rtx dst = SET_DEST (def_set);
1969 if ((GET_CODE (src) == ASHIFT
1970 || GET_CODE (src) == ASHIFTRT
1971 || GET_CODE (src) == LSHIFTRT)
1972 && !CONST_INT_P (XEXP (src, 1))
1973 && reg_or_subregno (XEXP (src, 1)) == regno)
1975 rtx tmp2 = gen_reg_rtx (V2DImode);
1977 start_sequence ();
1979 if (TARGET_SSE4_1)
1980 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1981 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1982 else
1984 rtx vec_cst
1985 = gen_rtx_CONST_VECTOR (V2DImode,
1986 gen_rtvec (2, GEN_INT (0xff),
1987 const0_rtx));
1988 vec_cst
1989 = validize_mem (force_const_mem (V2DImode, vec_cst));
1991 emit_insn (gen_rtx_SET
1992 (tmp2,
1993 gen_rtx_AND (V2DImode,
1994 gen_rtx_SUBREG (V2DImode, reg, 0),
1995 vec_cst)));
1997 rtx_insn *seq = get_insns ();
1998 end_sequence ();
2000 emit_insn_before (seq, insn);
2002 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
2004 else if (!MEM_P (dst) || !REG_P (src))
2005 replace_with_subreg_in_insn (insn, reg, reg);
2007 bitmap_clear_bit (conv, INSN_UID (insn));
2010 /* Skip debug insns and uninitialized uses. */
2011 else if (DF_REF_CHAIN (ref)
2012 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2014 gcc_assert (scopy);
2015 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2016 df_insn_rescan (DF_REF_INSN (ref));
2019 BITMAP_FREE (conv);
2022 /* Convert operand OP in INSN. We should handle
2023 memory operands and uninitialized registers.
2024 All other register uses are converted during
2025 register conversion. */
2027 void
2028 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2030 *op = copy_rtx_if_shared (*op);
2032 if (GET_CODE (*op) == NOT)
2034 convert_op (&XEXP (*op, 0), insn);
2035 PUT_MODE (*op, V2DImode);
2037 else if (MEM_P (*op))
2039 rtx tmp = gen_reg_rtx (DImode);
2041 emit_insn_before (gen_move_insn (tmp, *op), insn);
2042 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2044 if (dump_file)
2045 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2046 INSN_UID (insn), REGNO (tmp));
2048 else if (REG_P (*op))
2050 /* We may not have converted the register use in case
2051 this register has no definition. Otherwise it
2052 should have been converted in convert_reg. */
2053 df_ref ref;
2054 FOR_EACH_INSN_USE (ref, insn)
2055 if (DF_REF_REGNO (ref) == REGNO (*op))
2057 gcc_assert (!DF_REF_CHAIN (ref));
2058 break;
2060 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
2062 else if (CONST_INT_P (*op))
2064 rtx vec_cst;
2065 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2067 /* Prefer all ones vector in case of -1. */
2068 if (constm1_operand (*op, GET_MODE (*op)))
2069 vec_cst = CONSTM1_RTX (V2DImode);
2070 else
2071 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2072 gen_rtvec (2, *op, const0_rtx));
2074 if (!standard_sse_constant_p (vec_cst, V2DImode))
2076 start_sequence ();
2077 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2078 rtx_insn *seq = get_insns ();
2079 end_sequence ();
2080 emit_insn_before (seq, insn);
2083 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2084 *op = tmp;
2086 else
2088 gcc_assert (SUBREG_P (*op));
2089 gcc_assert (GET_MODE (*op) == V2DImode);
2093 /* Convert INSN to vector mode. */
2095 void
2096 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2098 rtx def_set = single_set (insn);
2099 rtx src = SET_SRC (def_set);
2100 rtx dst = SET_DEST (def_set);
2101 rtx subreg;
2103 if (MEM_P (dst) && !REG_P (src))
2105 /* There are no scalar integer instructions and therefore
2106 temporary register usage is required. */
2107 rtx tmp = gen_reg_rtx (DImode);
2108 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2109 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2112 switch (GET_CODE (src))
2114 case ASHIFT:
2115 case ASHIFTRT:
2116 case LSHIFTRT:
2117 convert_op (&XEXP (src, 0), insn);
2118 PUT_MODE (src, V2DImode);
2119 break;
2121 case PLUS:
2122 case MINUS:
2123 case IOR:
2124 case XOR:
2125 case AND:
2126 convert_op (&XEXP (src, 0), insn);
2127 convert_op (&XEXP (src, 1), insn);
2128 PUT_MODE (src, V2DImode);
2129 break;
2131 case NEG:
2132 src = XEXP (src, 0);
2133 convert_op (&src, insn);
2134 subreg = gen_reg_rtx (V2DImode);
2135 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2136 src = gen_rtx_MINUS (V2DImode, subreg, src);
2137 break;
2139 case NOT:
2140 src = XEXP (src, 0);
2141 convert_op (&src, insn);
2142 subreg = gen_reg_rtx (V2DImode);
2143 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2144 src = gen_rtx_XOR (V2DImode, src, subreg);
2145 break;
2147 case MEM:
2148 if (!REG_P (dst))
2149 convert_op (&src, insn);
2150 break;
2152 case REG:
2153 if (!MEM_P (dst))
2154 convert_op (&src, insn);
2155 break;
2157 case SUBREG:
2158 gcc_assert (GET_MODE (src) == V2DImode);
2159 break;
2161 case COMPARE:
2162 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2164 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2165 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2167 if (REG_P (src))
2168 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2169 else
2170 subreg = copy_rtx_if_shared (src);
2171 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2172 copy_rtx_if_shared (subreg),
2173 copy_rtx_if_shared (subreg)),
2174 insn);
2175 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2176 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2177 copy_rtx_if_shared (src)),
2178 UNSPEC_PTEST);
2179 break;
2181 case CONST_INT:
2182 convert_op (&src, insn);
2183 break;
2185 default:
2186 gcc_unreachable ();
2189 SET_SRC (def_set) = src;
2190 SET_DEST (def_set) = dst;
2192 /* Drop possible dead definitions. */
2193 PATTERN (insn) = def_set;
2195 INSN_CODE (insn) = -1;
2196 recog_memoized (insn);
2197 df_insn_rescan (insn);
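/* Editorial note on the COMPARE case above: the 64-bit value is first
   duplicated into both lanes of the vector register (vec_interleave_lowv2di
   of the register with itself), and the flags-setting comparison is then
   re-expressed as a ptest of the value against itself (UNSPEC_PTEST writing
   FLAGS_REG in CCmode).  */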
2200 /* Fix uses of converted REG in debug insns. */
2202 void
2203 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2205 if (!flag_var_tracking)
2206 return;
2208 df_ref ref, next;
2209 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2211 rtx_insn *insn = DF_REF_INSN (ref);
2212 /* Make sure the next ref is for a different instruction,
2213 so that we're not affected by the rescan. */
2214 next = DF_REF_NEXT_REG (ref);
2215 while (next && DF_REF_INSN (next) == insn)
2216 next = DF_REF_NEXT_REG (next);
2218 if (DEBUG_INSN_P (insn))
2220 /* It may be a debug insn with a TImode variable in
2221 register. */
2222 bool changed = false;
2223 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2225 rtx *loc = DF_REF_LOC (ref);
2226 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2228 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2229 changed = true;
2232 if (changed)
2233 df_insn_rescan (insn);
2238 /* Convert INSN from TImode to V1TImode. */
2240 void
2241 timode_scalar_chain::convert_insn (rtx_insn *insn)
2243 rtx def_set = single_set (insn);
2244 rtx src = SET_SRC (def_set);
2245 rtx dst = SET_DEST (def_set);
2247 switch (GET_CODE (dst))
2249 case REG:
2251 rtx tmp = find_reg_equal_equiv_note (insn);
2252 if (tmp)
2253 PUT_MODE (XEXP (tmp, 0), V1TImode);
2254 PUT_MODE (dst, V1TImode);
2255 fix_debug_reg_uses (dst);
2257 break;
2258 case MEM:
2259 PUT_MODE (dst, V1TImode);
2260 break;
2262 default:
2263 gcc_unreachable ();
2266 switch (GET_CODE (src))
2268 case REG:
2269 PUT_MODE (src, V1TImode);
2270 /* Call fix_debug_reg_uses only if SRC is never defined. */
2271 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2272 fix_debug_reg_uses (src);
2273 break;
2275 case MEM:
2276 PUT_MODE (src, V1TImode);
2277 break;
2279 case CONST_WIDE_INT:
2280 if (NONDEBUG_INSN_P (insn))
2282 /* Since there are no instructions to store a 128-bit constant,
2283 a temporary register is required. */
2284 rtx tmp = gen_reg_rtx (V1TImode);
2285 start_sequence ();
2286 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2287 src = validize_mem (force_const_mem (V1TImode, src));
2288 rtx_insn *seq = get_insns ();
2289 end_sequence ();
2290 if (seq)
2291 emit_insn_before (seq, insn);
2292 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2293 dst = tmp;
2295 break;
2297 case CONST_INT:
2298 switch (standard_sse_constant_p (src, TImode))
2300 case 1:
2301 src = CONST0_RTX (GET_MODE (dst));
2302 break;
2303 case 2:
2304 src = CONSTM1_RTX (GET_MODE (dst));
2305 break;
2306 default:
2307 gcc_unreachable ();
2309 if (NONDEBUG_INSN_P (insn))
2311 rtx tmp = gen_reg_rtx (V1TImode);
2312 /* Since there are no instructions to store a standard SSE
2313 constant, a temporary register is required. */
2314 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2315 dst = tmp;
2317 break;
2319 default:
2320 gcc_unreachable ();
2323 SET_SRC (def_set) = src;
2324 SET_DEST (def_set) = dst;
2326 /* Drop possible dead definitions. */
2327 PATTERN (insn) = def_set;
2329 INSN_CODE (insn) = -1;
2330 recog_memoized (insn);
2331 df_insn_rescan (insn);
2334 void
2335 dimode_scalar_chain::convert_registers ()
2337 bitmap_iterator bi;
2338 unsigned id;
2340 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2341 convert_reg (id);
2343 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2344 make_vector_copies (id);
2347 /* Convert the whole chain, creating the required register
2348 conversions and copies. */
2350 int
2351 scalar_chain::convert ()
2353 bitmap_iterator bi;
2354 unsigned id;
2355 int converted_insns = 0;
2357 if (!dbg_cnt (stv_conversion))
2358 return 0;
2360 if (dump_file)
2361 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2363 convert_registers ();
2365 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2367 convert_insn (DF_INSN_UID_GET (id)->insn);
2368 converted_insns++;
2371 return converted_insns;
2374 /* Main STV pass function. Find and convert scalar
2375 instructions into vector mode when profitable. */
2377 static unsigned int
2378 convert_scalars_to_vector ()
2380 basic_block bb;
2381 bitmap candidates;
2382 int converted_insns = 0;
2384 bitmap_obstack_initialize (NULL);
2385 candidates = BITMAP_ALLOC (NULL);
2387 calculate_dominance_info (CDI_DOMINATORS);
2388 df_set_flags (DF_DEFER_INSN_RESCAN);
2389 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2390 df_md_add_problem ();
2391 df_analyze ();
2393 /* Find all instructions we want to convert into vector mode. */
2394 if (dump_file)
2395 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2397 FOR_EACH_BB_FN (bb, cfun)
2399 rtx_insn *insn;
2400 FOR_BB_INSNS (bb, insn)
2401 if (scalar_to_vector_candidate_p (insn))
2403 if (dump_file)
2404 fprintf (dump_file, " insn %d is marked as a candidate\n",
2405 INSN_UID (insn));
2407 bitmap_set_bit (candidates, INSN_UID (insn));
2411 remove_non_convertible_regs (candidates);
2413 if (bitmap_empty_p (candidates))
2414 if (dump_file)
2415 fprintf (dump_file, "There are no candidates for optimization.\n");
2417 while (!bitmap_empty_p (candidates))
2419 unsigned uid = bitmap_first_set_bit (candidates);
2420 scalar_chain *chain;
2422 if (TARGET_64BIT)
2423 chain = new timode_scalar_chain;
2424 else
2425 chain = new dimode_scalar_chain;
2427 /* Find the instruction chain we want to convert to vector mode.
2428 Check all uses and definitions to estimate all required
2429 conversions. */
2430 chain->build (candidates, uid);
2432 if (chain->compute_convert_gain () > 0)
2433 converted_insns += chain->convert ();
2434 else
2435 if (dump_file)
2436 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2437 chain->chain_id);
2439 delete chain;
2442 if (dump_file)
2443 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2445 BITMAP_FREE (candidates);
2446 bitmap_obstack_release (NULL);
2447 df_process_deferred_rescans ();
2449 /* Conversion means we may have 128-bit register spills/fills,
2450 which require an aligned stack. */
2451 if (converted_insns)
2453 if (crtl->stack_alignment_needed < 128)
2454 crtl->stack_alignment_needed = 128;
2455 if (crtl->stack_alignment_estimated < 128)
2456 crtl->stack_alignment_estimated = 128;
2457 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2458 if (TARGET_64BIT)
2459 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2460 parm; parm = DECL_CHAIN (parm))
2462 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2463 continue;
2464 if (DECL_RTL_SET_P (parm)
2465 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2467 rtx r = DECL_RTL (parm);
2468 if (REG_P (r))
2469 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2471 if (DECL_INCOMING_RTL (parm)
2472 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2474 rtx r = DECL_INCOMING_RTL (parm);
2475 if (REG_P (r))
2476 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2481 return 0;
2484 namespace {
2486 const pass_data pass_data_insert_vzeroupper =
2488 RTL_PASS, /* type */
2489 "vzeroupper", /* name */
2490 OPTGROUP_NONE, /* optinfo_flags */
2491 TV_MACH_DEP, /* tv_id */
2492 0, /* properties_required */
2493 0, /* properties_provided */
2494 0, /* properties_destroyed */
2495 0, /* todo_flags_start */
2496 TODO_df_finish, /* todo_flags_finish */
2499 class pass_insert_vzeroupper : public rtl_opt_pass
2501 public:
2502 pass_insert_vzeroupper(gcc::context *ctxt)
2503 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2506 /* opt_pass methods: */
2507 virtual bool gate (function *)
2509 return TARGET_AVX
2510 && TARGET_VZEROUPPER && flag_expensive_optimizations
2511 && !optimize_size;
2514 virtual unsigned int execute (function *)
2516 return rest_of_handle_insert_vzeroupper ();
2519 }; // class pass_insert_vzeroupper
2521 const pass_data pass_data_stv =
2523 RTL_PASS, /* type */
2524 "stv", /* name */
2525 OPTGROUP_NONE, /* optinfo_flags */
2526 TV_MACH_DEP, /* tv_id */
2527 0, /* properties_required */
2528 0, /* properties_provided */
2529 0, /* properties_destroyed */
2530 0, /* todo_flags_start */
2531 TODO_df_finish, /* todo_flags_finish */
2534 class pass_stv : public rtl_opt_pass
2536 public:
2537 pass_stv (gcc::context *ctxt)
2538 : rtl_opt_pass (pass_data_stv, ctxt),
2539 timode_p (false)
2542 /* opt_pass methods: */
2543 virtual bool gate (function *)
2545 return (timode_p == !!TARGET_64BIT
2546 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2549 virtual unsigned int execute (function *)
2551 return convert_scalars_to_vector ();
2554 opt_pass *clone ()
2556 return new pass_stv (m_ctxt);
2559 void set_pass_param (unsigned int n, bool param)
2561 gcc_assert (n == 0);
2562 timode_p = param;
2565 private:
2566 bool timode_p;
2567 }; // class pass_stv
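/* Editorial note: pass_stv is cloned and parameterized (see clone and
   set_pass_param above) so that it can be scheduled twice with different
   TIMODE_P values.  The gate requires timode_p == !!TARGET_64BIT, which
   matches convert_scalars_to_vector: a timode_scalar_chain is built on
   64-bit targets and a dimode_scalar_chain otherwise.  */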
2569 } // anon namespace
2571 rtl_opt_pass *
2572 make_pass_insert_vzeroupper (gcc::context *ctxt)
2574 return new pass_insert_vzeroupper (ctxt);
2577 rtl_opt_pass *
2578 make_pass_stv (gcc::context *ctxt)
2580 return new pass_stv (ctxt);
2583 /* Inserting ENDBRANCH instructions. */
2585 static unsigned int
2586 rest_of_insert_endbranch (void)
2588 timevar_push (TV_MACH_DEP);
2590 rtx cet_eb;
2591 rtx_insn *insn;
2592 basic_block bb;
2594 /* Currently emit an ENDBR if this is a tracked function, i.e. 'nocf_check'
2595 is absent from the function attributes. Later an optimization will be
2596 introduced to analyze whether the address of a static function is
2597 taken. A static function whose address is not taken will get a
2598 nocf_check attribute. This will allow reducing the number of ENDBRs. */
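/* For reference (editorial, not from this file): gen_nop_endbr is expected
   to emit the CET marker instruction, ENDBR64 (f3 0f 1e fa) in 64-bit code
   or ENDBR32 (f3 0f 1e fb) in 32-bit code, which executes as a multi-byte
   NOP on processors without CET.  */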
2600 if (!lookup_attribute ("nocf_check",
2601 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2602 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2604 cet_eb = gen_nop_endbr ();
2606 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2607 insn = BB_HEAD (bb);
2608 emit_insn_before (cet_eb, insn);
2611 bb = 0;
2612 FOR_EACH_BB_FN (bb, cfun)
2614 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2615 insn = NEXT_INSN (insn))
2617 if (CALL_P (insn))
2619 if (find_reg_note (insn, REG_SETJMP, NULL) == NULL)
2620 continue;
2621 /* Generate ENDBRANCH after a CALL that can return more than
2622 once, i.e. setjmp-like functions. */
2624 cet_eb = gen_nop_endbr ();
2625 emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn));
2626 continue;
2629 if (JUMP_P (insn) && flag_cet_switch)
2631 rtx target = JUMP_LABEL (insn);
2632 if (target == NULL_RTX || ANY_RETURN_P (target))
2633 continue;
2635 /* Check whether the jump is a switch-table jump. */
2636 rtx_insn *label = as_a<rtx_insn *> (target);
2637 rtx_insn *table = next_insn (label);
2638 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2639 continue;
2641 /* For the indirect jump, find all places it can jump to and insert
2642 an ENDBRANCH there. This is done under a special flag to
2643 control ENDBRANCH generation for switch statements. */
2644 edge_iterator ei;
2645 edge e;
2646 basic_block dest_blk;
2648 FOR_EACH_EDGE (e, ei, bb->succs)
2650 rtx_insn *insn;
2652 dest_blk = e->dest;
2653 insn = BB_HEAD (dest_blk);
2654 gcc_assert (LABEL_P (insn));
2655 cet_eb = gen_nop_endbr ();
2656 emit_insn_after (cet_eb, insn);
2658 continue;
2661 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2662 || (NOTE_P (insn)
2663 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2664 /* TODO. Check /s bit also. */
2666 cet_eb = gen_nop_endbr ();
2667 emit_insn_after (cet_eb, insn);
2668 continue;
2673 timevar_pop (TV_MACH_DEP);
2674 return 0;
2677 namespace {
2679 const pass_data pass_data_insert_endbranch =
2681 RTL_PASS, /* type. */
2682 "cet", /* name. */
2683 OPTGROUP_NONE, /* optinfo_flags. */
2684 TV_MACH_DEP, /* tv_id. */
2685 0, /* properties_required. */
2686 0, /* properties_provided. */
2687 0, /* properties_destroyed. */
2688 0, /* todo_flags_start. */
2689 0, /* todo_flags_finish. */
2692 class pass_insert_endbranch : public rtl_opt_pass
2694 public:
2695 pass_insert_endbranch (gcc::context *ctxt)
2696 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2699 /* opt_pass methods: */
2700 virtual bool gate (function *)
2702 return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
2705 virtual unsigned int execute (function *)
2707 return rest_of_insert_endbranch ();
2710 }; // class pass_insert_endbranch
2712 } // anon namespace
2714 rtl_opt_pass *
2715 make_pass_insert_endbranch (gcc::context *ctxt)
2717 return new pass_insert_endbranch (ctxt);
2720 /* Return true if a red zone is in use. We can't use the red zone when
2721 there are local indirect jumps, like "indirect_jump" or "tablejump",
2722 which jump to another place in the function, since the "call" in the
2723 indirect thunk pushes the return address onto the stack, destroying
2724 the red zone.
2726 TODO: If we can reserve the first 2 WORDs of the red zone, one for PUSH
2727 and another for CALL, we can allow local indirect jumps with the
2728 indirect thunk. */
2730 bool
2731 ix86_using_red_zone (void)
2733 return (TARGET_RED_ZONE
2734 && !TARGET_64BIT_MS_ABI
2735 && (!cfun->machine->has_local_indirect_jump
2736 || cfun->machine->indirect_branch_type == indirect_branch_keep));
2739 /* Return a string that documents the current -m options. The caller is
2740 responsible for freeing the string. */
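/* Editorial note: as the code below shows, options are emitted in the order
   -march=, -mtune=, the ABI switch (-m32/-m64/-mx32), isa2 options, isa
   options, flag options, flag2 options and finally -mfpmath=, with
   "(other ...)" entries appended for any leftover bits when ADD_NL_P.  */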
2742 static char *
2743 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2744 int flags, int flags2,
2745 const char *arch, const char *tune,
2746 enum fpmath_unit fpmath, bool add_nl_p)
2748 struct ix86_target_opts
2750 const char *option; /* option string */
2751 HOST_WIDE_INT mask; /* isa mask options */
2754 /* This table is ordered so that options like -msse4.2 that imply other
2755 ISAs come first. Target string will be displayed in the same order. */
2756 static struct ix86_target_opts isa2_opts[] =
2758 { "-mcx16", OPTION_MASK_ISA_CX16 },
2759 { "-mmpx", OPTION_MASK_ISA_MPX },
2760 { "-mvaes", OPTION_MASK_ISA_VAES },
2761 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2762 { "-mpconfig", OPTION_MASK_ISA_PCONFIG },
2763 { "-mwbnoinvd", OPTION_MASK_ISA_WBNOINVD },
2764 { "-msgx", OPTION_MASK_ISA_SGX },
2765 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2766 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2767 { "-mibt", OPTION_MASK_ISA_IBT },
2768 { "-mhle", OPTION_MASK_ISA_HLE },
2769 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2770 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2771 { "-mmwaitx", OPTION_MASK_ISA_MWAITX }
2773 static struct ix86_target_opts isa_opts[] =
2775 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2776 { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG },
2777 { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ },
2778 { "-mgfni", OPTION_MASK_ISA_GFNI },
2779 { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI },
2780 { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
2781 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2782 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2783 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2784 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2785 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2786 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2787 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2788 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2789 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2790 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2791 { "-mfma", OPTION_MASK_ISA_FMA },
2792 { "-mxop", OPTION_MASK_ISA_XOP },
2793 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2794 { "-mf16c", OPTION_MASK_ISA_F16C },
2795 { "-mavx", OPTION_MASK_ISA_AVX },
2796 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2797 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2798 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2799 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2800 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2801 { "-msse3", OPTION_MASK_ISA_SSE3 },
2802 { "-maes", OPTION_MASK_ISA_AES },
2803 { "-msha", OPTION_MASK_ISA_SHA },
2804 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2805 { "-msse2", OPTION_MASK_ISA_SSE2 },
2806 { "-msse", OPTION_MASK_ISA_SSE },
2807 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2808 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2809 { "-mmmx", OPTION_MASK_ISA_MMX },
2810 { "-mrtm", OPTION_MASK_ISA_RTM },
2811 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2812 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2813 { "-madx", OPTION_MASK_ISA_ADX },
2814 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2815 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2816 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2817 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2818 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2819 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2820 { "-mabm", OPTION_MASK_ISA_ABM },
2821 { "-mbmi", OPTION_MASK_ISA_BMI },
2822 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2823 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2824 { "-mtbm", OPTION_MASK_ISA_TBM },
2825 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2826 { "-msahf", OPTION_MASK_ISA_SAHF },
2827 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2828 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2829 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2830 { "-mpku", OPTION_MASK_ISA_PKU },
2831 { "-mlwp", OPTION_MASK_ISA_LWP },
2832 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2833 { "-mclwb", OPTION_MASK_ISA_CLWB },
2834 { "-mshstk", OPTION_MASK_ISA_SHSTK }
2837 /* Flag options. */
2838 static struct ix86_target_opts flag_opts[] =
2840 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2841 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2842 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2843 { "-m80387", MASK_80387 },
2844 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2845 { "-malign-double", MASK_ALIGN_DOUBLE },
2846 { "-mcld", MASK_CLD },
2847 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2848 { "-mieee-fp", MASK_IEEE_FP },
2849 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2850 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2851 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2852 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2853 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2854 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2855 { "-mno-red-zone", MASK_NO_RED_ZONE },
2856 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2857 { "-mrecip", MASK_RECIP },
2858 { "-mrtd", MASK_RTD },
2859 { "-msseregparm", MASK_SSEREGPARM },
2860 { "-mstack-arg-probe", MASK_STACK_PROBE },
2861 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2862 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2863 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2864 { "-mvzeroupper", MASK_VZEROUPPER },
2865 { "-mstv", MASK_STV },
2866 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2867 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2868 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2871 /* Additional flag options. */
2872 static struct ix86_target_opts flag2_opts[] =
2874 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
2877 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2878 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2880 char isa_other[40];
2881 char isa2_other[40];
2882 char flags_other[40];
2883 char flags2_other[40];
2884 unsigned num = 0;
2885 unsigned i, j;
2886 char *ret;
2887 char *ptr;
2888 size_t len;
2889 size_t line_len;
2890 size_t sep_len;
2891 const char *abi;
2893 memset (opts, '\0', sizeof (opts));
2895 /* Add -march= option. */
2896 if (arch)
2898 opts[num][0] = "-march=";
2899 opts[num++][1] = arch;
2902 /* Add -mtune= option. */
2903 if (tune)
2905 opts[num][0] = "-mtune=";
2906 opts[num++][1] = tune;
2909 /* Add -m32/-m64/-mx32. */
2910 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2912 if ((isa & OPTION_MASK_ABI_64) != 0)
2913 abi = "-m64";
2914 else
2915 abi = "-mx32";
2916 isa &= ~ (OPTION_MASK_ISA_64BIT
2917 | OPTION_MASK_ABI_64
2918 | OPTION_MASK_ABI_X32);
2920 else
2921 abi = "-m32";
2922 opts[num++][0] = abi;
2924 /* Pick out the options in isa2 options. */
2925 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2927 if ((isa2 & isa2_opts[i].mask) != 0)
2929 opts[num++][0] = isa2_opts[i].option;
2930 isa2 &= ~ isa2_opts[i].mask;
2934 if (isa2 && add_nl_p)
2936 opts[num++][0] = isa2_other;
2937 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2940 /* Pick out the options in isa options. */
2941 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2943 if ((isa & isa_opts[i].mask) != 0)
2945 opts[num++][0] = isa_opts[i].option;
2946 isa &= ~ isa_opts[i].mask;
2950 if (isa && add_nl_p)
2952 opts[num++][0] = isa_other;
2953 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2956 /* Add flag options. */
2957 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2959 if ((flags & flag_opts[i].mask) != 0)
2961 opts[num++][0] = flag_opts[i].option;
2962 flags &= ~ flag_opts[i].mask;
2966 if (flags && add_nl_p)
2968 opts[num++][0] = flags_other;
2969 sprintf (flags_other, "(other flags: %#x)", flags);
2972 /* Add additional flag options. */
2973 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2975 if ((flags2 & flag2_opts[i].mask) != 0)
2977 opts[num++][0] = flag2_opts[i].option;
2978 flags2 &= ~ flag2_opts[i].mask;
2982 if (flags2 && add_nl_p)
2984 opts[num++][0] = flags2_other;
2985 sprintf (flags2_other, "(other flags2: %#x)", flags2);
2988 /* Add -fpmath= option. */
2989 if (fpmath)
2991 opts[num][0] = "-mfpmath=";
2992 switch ((int) fpmath)
2994 case FPMATH_387:
2995 opts[num++][1] = "387";
2996 break;
2998 case FPMATH_SSE:
2999 opts[num++][1] = "sse";
3000 break;
3002 case FPMATH_387 | FPMATH_SSE:
3003 opts[num++][1] = "sse+387";
3004 break;
3006 default:
3007 gcc_unreachable ();
3011 /* Any options? */
3012 if (num == 0)
3013 return NULL;
3015 gcc_assert (num < ARRAY_SIZE (opts));
3017 /* Size the string. */
3018 len = 0;
3019 sep_len = (add_nl_p) ? 3 : 1;
3020 for (i = 0; i < num; i++)
3022 len += sep_len;
3023 for (j = 0; j < 2; j++)
3024 if (opts[i][j])
3025 len += strlen (opts[i][j]);
3028 /* Build the string. */
3029 ret = ptr = (char *) xmalloc (len);
3030 line_len = 0;
3032 for (i = 0; i < num; i++)
3034 size_t len2[2];
3036 for (j = 0; j < 2; j++)
3037 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3039 if (i != 0)
3041 *ptr++ = ' ';
3042 line_len++;
3044 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3046 *ptr++ = '\\';
3047 *ptr++ = '\n';
3048 line_len = 0;
3052 for (j = 0; j < 2; j++)
3053 if (opts[i][j])
3055 memcpy (ptr, opts[i][j], len2[j]);
3056 ptr += len2[j];
3057 line_len += len2[j];
3061 *ptr = '\0';
3062 gcc_assert (ret + len >= ptr);
3064 return ret;
3067 /* Return true if profiling code should be emitted before the
3068 prologue, otherwise return false.
3069 Note: for x86 with "hotfix" it is sorried (a "sorry" diagnostic is issued). */
3070 static bool
3071 ix86_profile_before_prologue (void)
3073 return flag_fentry != 0;
3076 /* Function that is callable from the debugger to print the current
3077 options. */
3078 void ATTRIBUTE_UNUSED
3079 ix86_debug_options (void)
3081 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3082 target_flags, ix86_target_flags,
3083 ix86_arch_string, ix86_tune_string,
3084 ix86_fpmath, true);
3086 if (opts)
3088 fprintf (stderr, "%s\n\n", opts);
3089 free (opts);
3091 else
3092 fputs ("<no options>\n\n", stderr);
3094 return;
3097 /* Return true if T is one of the bytes we should avoid with
3098 -mmitigate-rop. */
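/* These byte values are the x86 return opcodes: 0xc3 (ret), 0xc2
   (ret imm16), 0xcb (retf) and 0xca (retf imm16), which make attractive
   gadget endings for return-oriented programming.  */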
3100 static bool
3101 ix86_rop_should_change_byte_p (int t)
3103 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3106 static const char *stringop_alg_names[] = {
3107 #define DEF_ENUM
3108 #define DEF_ALG(alg, name) #name,
3109 #include "stringop.def"
3110 #undef DEF_ENUM
3111 #undef DEF_ALG
3114 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3115 The string is of the following form (or comma separated list of it):
3117 strategy_alg:max_size:[align|noalign]
3119 where the full size range for the strategy is either [0, max_size] or
3120 [min_size, max_size], in which min_size is the max_size + 1 of the
3121 preceding range. The last size range must have max_size == -1.
3123 Examples:
3126 -mmemcpy-strategy=libcall:-1:noalign
3128 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3132 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3134 This is to tell the compiler to use the following strategy for memset
3135 1) when the expected size is between [1, 16], use rep_8byte strategy;
3136 2) when the size is between [17, 2048], use vector_loop;
3137 3) when the size is > 2048, use libcall. */
3139 struct stringop_size_range
3141 int max;
3142 stringop_alg alg;
3143 bool noalign;
3146 static void
3147 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3149 const struct stringop_algs *default_algs;
3150 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3151 char *curr_range_str, *next_range_str;
3152 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
3153 int i = 0, n = 0;
3155 if (is_memset)
3156 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3157 else
3158 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3160 curr_range_str = strategy_str;
3164 int maxs;
3165 char alg_name[128];
3166 char align[16];
3167 next_range_str = strchr (curr_range_str, ',');
3168 if (next_range_str)
3169 *next_range_str++ = '\0';
3171 if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs,
3172 align) != 3)
3174 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3175 return;
3178 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3180 error ("size ranges of option %qs should be increasing", opt);
3181 return;
3184 for (i = 0; i < last_alg; i++)
3185 if (!strcmp (alg_name, stringop_alg_names[i]))
3186 break;
3188 if (i == last_alg)
3190 error ("wrong strategy name %qs specified for option %qs",
3191 alg_name, opt);
3193 auto_vec <const char *> candidates;
3194 for (i = 0; i < last_alg; i++)
3195 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3196 candidates.safe_push (stringop_alg_names[i]);
3198 char *s;
3199 const char *hint
3200 = candidates_list_and_hint (alg_name, s, candidates);
3201 if (hint)
3202 inform (input_location,
3203 "valid arguments to %qs are: %s; did you mean %qs?",
3204 opt, s, hint);
3205 else
3206 inform (input_location, "valid arguments to %qs are: %s",
3207 opt, s);
3208 XDELETEVEC (s);
3209 return;
3212 if ((stringop_alg) i == rep_prefix_8_byte
3213 && !TARGET_64BIT)
3215 /* rep; movq isn't available in 32-bit code. */
3216 error ("strategy name %qs specified for option %qs "
3217 "not supported for 32-bit code", alg_name, opt);
3218 return;
3221 input_ranges[n].max = maxs;
3222 input_ranges[n].alg = (stringop_alg) i;
3223 if (!strcmp (align, "align"))
3224 input_ranges[n].noalign = false;
3225 else if (!strcmp (align, "noalign"))
3226 input_ranges[n].noalign = true;
3227 else
3229 error ("unknown alignment %qs specified for option %qs", align, opt);
3230 return;
3232 n++;
3233 curr_range_str = next_range_str;
3235 while (curr_range_str);
3237 if (input_ranges[n - 1].max != -1)
3239 error ("the max value for the last size range should be -1"
3240 " for option %qs", opt);
3241 return;
3244 if (n > MAX_STRINGOP_ALGS)
3246 error ("too many size ranges specified in option %qs", opt);
3247 return;
3250 /* Now override the default algs array. */
3251 for (i = 0; i < n; i++)
3253 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3254 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3255 = input_ranges[i].alg;
3256 *const_cast<int *>(&default_algs->size[i].noalign)
3257 = input_ranges[i].noalign;
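/* Illustrative sketch (editorial, not part of GCC): a minimal standalone
   program showing how one "alg:max_size:align" triplet of the strategy
   string documented above can be split with the same sscanf pattern the
   parser uses.  The strategy name and values are hypothetical.  */
#if 0
#include <stdio.h>

int
main (void)
{
  char alg_name[128];
  char align[16];
  int maxs;
  const char *range = "vector_loop:2048:align";

  /* "%20[^:]" reads the strategy name up to the first ':', "%d" the
     maximum size, "%10s" the align/noalign keyword.  */
  if (sscanf (range, "%20[^:]:%d:%10s", alg_name, &maxs, align) == 3)
    printf ("alg=%s max=%d %s\n", alg_name, maxs, align);
  return 0;
}
#endif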
3262 /* Parse the -mtune-ctrl= option. When DUMP is true,
3263 print the features that are explicitly set. */
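/* The accepted syntax, as implemented below, is a comma-separated list of
   feature names from ix86_tune_feature_names, each optionally prefixed
   with '^' to clear the feature instead of setting it, e.g.
   -mtune-ctrl=feature_a,^feature_b (feature names here are hypothetical).  */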
3265 static void
3266 parse_mtune_ctrl_str (bool dump)
3268 if (!ix86_tune_ctrl_string)
3269 return;
3271 char *next_feature_string = NULL;
3272 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3273 char *orig = curr_feature_string;
3274 int i;
3277 bool clear = false;
3279 next_feature_string = strchr (curr_feature_string, ',');
3280 if (next_feature_string)
3281 *next_feature_string++ = '\0';
3282 if (*curr_feature_string == '^')
3284 curr_feature_string++;
3285 clear = true;
3287 for (i = 0; i < X86_TUNE_LAST; i++)
3289 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3291 ix86_tune_features[i] = !clear;
3292 if (dump)
3293 fprintf (stderr, "Explicitly %s feature %s\n",
3294 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3295 break;
3298 if (i == X86_TUNE_LAST)
3299 error ("unknown parameter to option -mtune-ctrl: %s",
3300 clear ? curr_feature_string - 1 : curr_feature_string);
3301 curr_feature_string = next_feature_string;
3303 while (curr_feature_string);
3304 free (orig);
3307 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3308 processor type. */
3310 static void
3311 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3313 unsigned HOST_WIDE_INT ix86_tune_mask = HOST_WIDE_INT_1U << ix86_tune;
3314 int i;
3316 for (i = 0; i < X86_TUNE_LAST; ++i)
3318 if (ix86_tune_no_default)
3319 ix86_tune_features[i] = 0;
3320 else
3321 ix86_tune_features[i]
3322 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3325 if (dump)
3327 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3328 for (i = 0; i < X86_TUNE_LAST; i++)
3329 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3330 ix86_tune_features[i] ? "on" : "off");
3333 parse_mtune_ctrl_str (dump);
3337 /* Default align_* from the processor table. */
3339 static void
3340 ix86_default_align (struct gcc_options *opts)
3342 if (opts->x_align_loops == 0)
3344 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3345 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3347 if (opts->x_align_jumps == 0)
3349 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3350 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3352 if (opts->x_align_functions == 0)
3354 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3358 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3360 static void
3361 ix86_override_options_after_change (void)
3363 ix86_default_align (&global_options);
3366 /* Override various settings based on options. If MAIN_ARGS_P, the
3367 options are from the command line, otherwise they are from
3368 attributes. Return true if there's an error related to march
3369 option. */
3371 static bool
3372 ix86_option_override_internal (bool main_args_p,
3373 struct gcc_options *opts,
3374 struct gcc_options *opts_set)
3376 int i;
3377 unsigned HOST_WIDE_INT ix86_arch_mask;
3378 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3380 const wide_int_bitmask PTA_3DNOW (HOST_WIDE_INT_1U << 0);
3381 const wide_int_bitmask PTA_3DNOW_A (HOST_WIDE_INT_1U << 1);
3382 const wide_int_bitmask PTA_64BIT (HOST_WIDE_INT_1U << 2);
3383 const wide_int_bitmask PTA_ABM (HOST_WIDE_INT_1U << 3);
3384 const wide_int_bitmask PTA_AES (HOST_WIDE_INT_1U << 4);
3385 const wide_int_bitmask PTA_AVX (HOST_WIDE_INT_1U << 5);
3386 const wide_int_bitmask PTA_BMI (HOST_WIDE_INT_1U << 6);
3387 const wide_int_bitmask PTA_CX16 (HOST_WIDE_INT_1U << 7);
3388 const wide_int_bitmask PTA_F16C (HOST_WIDE_INT_1U << 8);
3389 const wide_int_bitmask PTA_FMA (HOST_WIDE_INT_1U << 9);
3390 const wide_int_bitmask PTA_FMA4 (HOST_WIDE_INT_1U << 10);
3391 const wide_int_bitmask PTA_FSGSBASE (HOST_WIDE_INT_1U << 11);
3392 const wide_int_bitmask PTA_LWP (HOST_WIDE_INT_1U << 12);
3393 const wide_int_bitmask PTA_LZCNT (HOST_WIDE_INT_1U << 13);
3394 const wide_int_bitmask PTA_MMX (HOST_WIDE_INT_1U << 14);
3395 const wide_int_bitmask PTA_MOVBE (HOST_WIDE_INT_1U << 15);
3396 const wide_int_bitmask PTA_NO_SAHF (HOST_WIDE_INT_1U << 16);
3397 const wide_int_bitmask PTA_PCLMUL (HOST_WIDE_INT_1U << 17);
3398 const wide_int_bitmask PTA_POPCNT (HOST_WIDE_INT_1U << 18);
3399 const wide_int_bitmask PTA_PREFETCH_SSE (HOST_WIDE_INT_1U << 19);
3400 const wide_int_bitmask PTA_RDRND (HOST_WIDE_INT_1U << 20);
3401 const wide_int_bitmask PTA_SSE (HOST_WIDE_INT_1U << 21);
3402 const wide_int_bitmask PTA_SSE2 (HOST_WIDE_INT_1U << 22);
3403 const wide_int_bitmask PTA_SSE3 (HOST_WIDE_INT_1U << 23);
3404 const wide_int_bitmask PTA_SSE4_1 (HOST_WIDE_INT_1U << 24);
3405 const wide_int_bitmask PTA_SSE4_2 (HOST_WIDE_INT_1U << 25);
3406 const wide_int_bitmask PTA_SSE4A (HOST_WIDE_INT_1U << 26);
3407 const wide_int_bitmask PTA_SSSE3 (HOST_WIDE_INT_1U << 27);
3408 const wide_int_bitmask PTA_TBM (HOST_WIDE_INT_1U << 28);
3409 const wide_int_bitmask PTA_XOP (HOST_WIDE_INT_1U << 29);
3410 const wide_int_bitmask PTA_AVX2 (HOST_WIDE_INT_1U << 30);
3411 const wide_int_bitmask PTA_BMI2 (HOST_WIDE_INT_1U << 31);
3412 const wide_int_bitmask PTA_RTM (HOST_WIDE_INT_1U << 32);
3413 const wide_int_bitmask PTA_HLE (HOST_WIDE_INT_1U << 33);
3414 const wide_int_bitmask PTA_PRFCHW (HOST_WIDE_INT_1U << 34);
3415 const wide_int_bitmask PTA_RDSEED (HOST_WIDE_INT_1U << 35);
3416 const wide_int_bitmask PTA_ADX (HOST_WIDE_INT_1U << 36);
3417 const wide_int_bitmask PTA_FXSR (HOST_WIDE_INT_1U << 37);
3418 const wide_int_bitmask PTA_XSAVE (HOST_WIDE_INT_1U << 38);
3419 const wide_int_bitmask PTA_XSAVEOPT (HOST_WIDE_INT_1U << 39);
3420 const wide_int_bitmask PTA_AVX512F (HOST_WIDE_INT_1U << 40);
3421 const wide_int_bitmask PTA_AVX512ER (HOST_WIDE_INT_1U << 41);
3422 const wide_int_bitmask PTA_AVX512PF (HOST_WIDE_INT_1U << 42);
3423 const wide_int_bitmask PTA_AVX512CD (HOST_WIDE_INT_1U << 43);
3424 const wide_int_bitmask PTA_MPX (HOST_WIDE_INT_1U << 44);
3425 const wide_int_bitmask PTA_SHA (HOST_WIDE_INT_1U << 45);
3426 const wide_int_bitmask PTA_PREFETCHWT1 (HOST_WIDE_INT_1U << 46);
3427 const wide_int_bitmask PTA_CLFLUSHOPT (HOST_WIDE_INT_1U << 47);
3428 const wide_int_bitmask PTA_XSAVEC (HOST_WIDE_INT_1U << 48);
3429 const wide_int_bitmask PTA_XSAVES (HOST_WIDE_INT_1U << 49);
3430 const wide_int_bitmask PTA_AVX512DQ (HOST_WIDE_INT_1U << 50);
3431 const wide_int_bitmask PTA_AVX512BW (HOST_WIDE_INT_1U << 51);
3432 const wide_int_bitmask PTA_AVX512VL (HOST_WIDE_INT_1U << 52);
3433 const wide_int_bitmask PTA_AVX512IFMA (HOST_WIDE_INT_1U << 53);
3434 const wide_int_bitmask PTA_AVX512VBMI (HOST_WIDE_INT_1U << 54);
3435 const wide_int_bitmask PTA_CLWB (HOST_WIDE_INT_1U << 55);
3436 const wide_int_bitmask PTA_MWAITX (HOST_WIDE_INT_1U << 56);
3437 const wide_int_bitmask PTA_CLZERO (HOST_WIDE_INT_1U << 57);
3438 const wide_int_bitmask PTA_NO_80387 (HOST_WIDE_INT_1U << 58);
3439 const wide_int_bitmask PTA_PKU (HOST_WIDE_INT_1U << 59);
3440 const wide_int_bitmask PTA_AVX5124VNNIW (HOST_WIDE_INT_1U << 60);
3441 const wide_int_bitmask PTA_AVX5124FMAPS (HOST_WIDE_INT_1U << 61);
3442 const wide_int_bitmask PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1U << 62);
3443 const wide_int_bitmask PTA_SGX (HOST_WIDE_INT_1U << 63);
3444 const wide_int_bitmask PTA_AVX512VNNI (0, HOST_WIDE_INT_1U);
3445 const wide_int_bitmask PTA_GFNI (0, HOST_WIDE_INT_1U << 1);
3446 const wide_int_bitmask PTA_VAES (0, HOST_WIDE_INT_1U << 2);
3447 const wide_int_bitmask PTA_AVX512VBMI2 (0, HOST_WIDE_INT_1U << 3);
3448 const wide_int_bitmask PTA_VPCLMULQDQ (0, HOST_WIDE_INT_1U << 4);
3449 const wide_int_bitmask PTA_AVX512BITALG (0, HOST_WIDE_INT_1U << 5);
3450 const wide_int_bitmask PTA_RDPID (0, HOST_WIDE_INT_1U << 6);
3451 const wide_int_bitmask PTA_PCONFIG (0, HOST_WIDE_INT_1U << 7);
3452 const wide_int_bitmask PTA_WBNOINVD (0, HOST_WIDE_INT_1U << 8);
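/* ISA bits 0-63 occupy the first 64-bit word of the wide_int_bitmask;
   features beyond that (PTA_AVX512VNNI onwards) use the two-argument form
   above, which places their bit in the second word.  */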
3454 const wide_int_bitmask PTA_CORE2 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
3455 | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR;
3456 const wide_int_bitmask PTA_NEHALEM = PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2
3457 | PTA_POPCNT;
3458 const wide_int_bitmask PTA_WESTMERE = PTA_NEHALEM | PTA_AES | PTA_PCLMUL;
3459 const wide_int_bitmask PTA_SANDYBRIDGE = PTA_WESTMERE | PTA_AVX | PTA_XSAVE
3460 | PTA_XSAVEOPT;
3461 const wide_int_bitmask PTA_IVYBRIDGE = PTA_SANDYBRIDGE | PTA_FSGSBASE
3462 | PTA_RDRND | PTA_F16C;
3463 const wide_int_bitmask PTA_HASWELL = PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI
3464 | PTA_BMI2 | PTA_LZCNT | PTA_FMA | PTA_MOVBE | PTA_HLE;
3465 const wide_int_bitmask PTA_BROADWELL = PTA_HASWELL | PTA_ADX | PTA_PRFCHW
3466 | PTA_RDSEED;
3467 const wide_int_bitmask PTA_SKYLAKE = PTA_BROADWELL | PTA_CLFLUSHOPT
3468 | PTA_XSAVEC | PTA_XSAVES | PTA_SGX;
3469 const wide_int_bitmask PTA_SKYLAKE_AVX512 = PTA_SKYLAKE | PTA_AVX512F
3470 | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU
3471 | PTA_CLWB;
3472 const wide_int_bitmask PTA_CANNONLAKE = PTA_SKYLAKE | PTA_AVX512F
3473 | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU
3474 | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_SHA;
3475 const wide_int_bitmask PTA_ICELAKE_CLIENT = PTA_CANNONLAKE | PTA_AVX512VNNI
3476 | PTA_GFNI | PTA_VAES | PTA_AVX512VBMI2 | PTA_VPCLMULQDQ | PTA_AVX512BITALG
3477 | PTA_RDPID | PTA_CLWB;
3478 const wide_int_bitmask PTA_ICELAKE_SERVER = PTA_ICELAKE_CLIENT | PTA_PCONFIG
3479 | PTA_WBNOINVD;
3480 const wide_int_bitmask PTA_KNL = PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER
3481 | PTA_AVX512F | PTA_AVX512CD;
3482 const wide_int_bitmask PTA_BONNELL = PTA_CORE2 | PTA_MOVBE;
3483 const wide_int_bitmask PTA_SILVERMONT = PTA_WESTMERE | PTA_MOVBE | PTA_RDRND;
3484 const wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW
3485 | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ;
3487 static struct pta
3489 const char *const name; /* processor name or nickname. */
3490 const enum processor_type processor;
3491 const enum attr_cpu schedule;
3492 const wide_int_bitmask flags;
3494 const processor_alias_table[] =
3496 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3497 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3498 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3499 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3500 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3501 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3502 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3503 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3504 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3505 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3506 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3507 PTA_MMX | PTA_SSE | PTA_FXSR},
3508 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3509 PTA_MMX | PTA_SSE | PTA_FXSR},
3510 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3511 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3512 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3513 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3514 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3515 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3516 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3517 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3518 PTA_MMX | PTA_SSE | PTA_FXSR},
3519 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3520 PTA_MMX | PTA_SSE | PTA_FXSR},
3521 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3522 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3523 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3524 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3525 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3526 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3527 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3528 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3529 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3530 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3531 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3532 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3533 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3534 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3535 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3536 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3537 PTA_SANDYBRIDGE},
3538 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3539 PTA_SANDYBRIDGE},
3540 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3541 PTA_IVYBRIDGE},
3542 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3543 PTA_IVYBRIDGE},
3544 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3545 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3546 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3547 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
3548 {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3549 PTA_SKYLAKE_AVX512},
3550 {"cannonlake", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL, PTA_CANNONLAKE},
3551 {"icelake-client", PROCESSOR_ICELAKE_CLIENT, CPU_HASWELL,
3552 PTA_ICELAKE_CLIENT},
3553 {"icelake-server", PROCESSOR_ICELAKE_SERVER, CPU_HASWELL,
3554 PTA_ICELAKE_SERVER},
3555 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3556 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3557 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3558 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3559 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3560 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3561 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3562 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3563 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3564 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3565 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3566 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3567 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3568 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3569 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3570 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3571 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3572 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3573 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3574 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3575 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3576 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3577 {"x86-64", PROCESSOR_K8, CPU_K8,
3578 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3579 {"eden-x2", PROCESSOR_K8, CPU_K8,
3580 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3581 {"nano", PROCESSOR_K8, CPU_K8,
3582 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3583 | PTA_SSSE3 | PTA_FXSR},
3584 {"nano-1000", PROCESSOR_K8, CPU_K8,
3585 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3586 | PTA_SSSE3 | PTA_FXSR},
3587 {"nano-2000", PROCESSOR_K8, CPU_K8,
3588 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3589 | PTA_SSSE3 | PTA_FXSR},
3590 {"nano-3000", PROCESSOR_K8, CPU_K8,
3591 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3592 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3593 {"nano-x2", PROCESSOR_K8, CPU_K8,
3594 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3595 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3596 {"eden-x4", PROCESSOR_K8, CPU_K8,
3597 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3598 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3599 {"nano-x4", PROCESSOR_K8, CPU_K8,
3600 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3601 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3602 {"k8", PROCESSOR_K8, CPU_K8,
3603 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3604 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3605 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3606 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3607 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3608 {"opteron", PROCESSOR_K8, CPU_K8,
3609 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3610 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3611 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3612 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3613 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3614 {"athlon64", PROCESSOR_K8, CPU_K8,
3615 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3616 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3617 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3618 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3619 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3620 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3621 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3622 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3623 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3624 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3625 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3626 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3627 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3628 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3629 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3630 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3631 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3632 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3633 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3634 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3635 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3636 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3637 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3638 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3639 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3640 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3641 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3642 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3643 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3644 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3645 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3646 | PTA_XSAVEOPT | PTA_FSGSBASE},
3647 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3648 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3649 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3650 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3651 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3652 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3653 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3654 | PTA_MOVBE | PTA_MWAITX},
3655 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3656 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3657 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3658 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3659 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3660 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3661 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3662 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3663 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3664 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3665 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3666 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3667 | PTA_FXSR | PTA_XSAVE},
3668 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3669 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3670 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3671 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3672 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3673 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3675 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3676 PTA_64BIT
3677 | PTA_HLE /* flags are only used for -march switch. */ },
3680 /* -mrecip options. */
3681 static struct
3683 const char *string; /* option name */
3684 unsigned int mask; /* mask bits to set */
3686 const recip_options[] =
3688 { "all", RECIP_MASK_ALL },
3689 { "none", RECIP_MASK_NONE },
3690 { "div", RECIP_MASK_DIV },
3691 { "sqrt", RECIP_MASK_SQRT },
3692 { "vec-div", RECIP_MASK_VEC_DIV },
3693 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3696 int const pta_size = ARRAY_SIZE (processor_alias_table);
3698 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3699 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3700 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3701 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3702 #ifdef TARGET_BI_ARCH
3703 else
3705 #if TARGET_BI_ARCH == 1
3706 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3707 is on and OPTION_MASK_ABI_X32 is off. We turn off
3708 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3709 -mx32. */
3710 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3711 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3712 #else
3713 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3714 on and OPTION_MASK_ABI_64 is off. We turn off
3715 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3716 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3717 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3718 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3719 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3720 #endif
3721 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3722 && TARGET_IAMCU_P (opts->x_target_flags))
3723 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3724 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3726 #endif
3728 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3730 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3731 OPTION_MASK_ABI_64 for TARGET_X32. */
3732 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3733 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3735 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3736 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3737 | OPTION_MASK_ABI_X32
3738 | OPTION_MASK_ABI_64);
3739 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3741 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3742 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3743 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3744 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3747 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3748 SUBTARGET_OVERRIDE_OPTIONS;
3749 #endif
3751 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3752 SUBSUBTARGET_OVERRIDE_OPTIONS;
3753 #endif
3755 /* -fPIC is the default for 64-bit Mach-O (x86_64 Darwin). */
3756 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3757 opts->x_flag_pic = 2;
3759 /* Need to check -mtune=generic first. */
3760 if (opts->x_ix86_tune_string)
3762 /* As special support for cross compilers we read -mtune=native
3763 as -mtune=generic. With native compilers we won't see the
3764 -mtune=native, as it was changed by the driver. */
3765 if (!strcmp (opts->x_ix86_tune_string, "native"))
3767 opts->x_ix86_tune_string = "generic";
3769 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3770 warning (OPT_Wdeprecated,
3771 main_args_p
3772 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3773 "or %<-mtune=generic%> instead as appropriate")
3774 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3775 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3776 " instead as appropriate"));
3778 else
3780 if (opts->x_ix86_arch_string)
3781 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3782 if (!opts->x_ix86_tune_string)
3784 opts->x_ix86_tune_string
3785 = processor_target_table[TARGET_CPU_DEFAULT].name;
3786 ix86_tune_defaulted = 1;
3789 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3790 or defaulted. We need to use a sensible tune option. */
3791 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3793 opts->x_ix86_tune_string = "generic";
3797 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3798 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3800 /* rep; movq isn't available in 32-bit code. */
3801 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3802 opts->x_ix86_stringop_alg = no_stringop;
3805 if (!opts->x_ix86_arch_string)
3806 opts->x_ix86_arch_string
3807 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3808 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3809 else
3810 ix86_arch_specified = 1;
3812 if (opts_set->x_ix86_pmode)
3814 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3815 && opts->x_ix86_pmode == PMODE_SI)
3816 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3817 && opts->x_ix86_pmode == PMODE_DI))
3818 error ("address mode %qs not supported in the %s bit mode",
3819 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3820 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3822 else
3823 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3824 ? PMODE_DI : PMODE_SI;
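/* That is, Pmode defaults to DImode for LP64 targets and to SImode for
   32-bit and x32 code when the address mode was not given explicitly.  */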
3826 if (!opts_set->x_ix86_abi)
3827 opts->x_ix86_abi = DEFAULT_ABI;
3829 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3830 error ("-mabi=ms not supported with X32 ABI");
3831 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3833 /* For targets using the MS ABI enable ms-extensions, if not
3834 explicitly turned off. For non-MS ABI we turn off this
3835 option. */
3836 if (!opts_set->x_flag_ms_extensions)
3837 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3839 if (opts_set->x_ix86_cmodel)
3841 switch (opts->x_ix86_cmodel)
3843 case CM_SMALL:
3844 case CM_SMALL_PIC:
3845 if (opts->x_flag_pic)
3846 opts->x_ix86_cmodel = CM_SMALL_PIC;
3847 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3848 error ("code model %qs not supported in the %s bit mode",
3849 "small", "32");
3850 break;
3852 case CM_MEDIUM:
3853 case CM_MEDIUM_PIC:
3854 if (opts->x_flag_pic)
3855 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3856 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3857 error ("code model %qs not supported in the %s bit mode",
3858 "medium", "32");
3859 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3860 error ("code model %qs not supported in x32 mode",
3861 "medium");
3862 break;
3864 case CM_LARGE:
3865 case CM_LARGE_PIC:
3866 if (opts->x_flag_pic)
3867 opts->x_ix86_cmodel = CM_LARGE_PIC;
3868 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3869 error ("code model %qs not supported in the %s bit mode",
3870 "large", "32");
3871 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3872 error ("code model %qs not supported in x32 mode",
3873 "large");
3874 break;
3876 case CM_32:
3877 if (opts->x_flag_pic)
3878 error ("code model %s does not support PIC mode", "32");
3879 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3880 error ("code model %qs not supported in the %s bit mode",
3881 "32", "64");
3882 break;
3884 case CM_KERNEL:
3885 if (opts->x_flag_pic)
3887 error ("code model %s does not support PIC mode", "kernel");
3888 opts->x_ix86_cmodel = CM_32;
3890 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3891 error ("code model %qs not supported in the %s bit mode",
3892 "kernel", "32");
3893 break;
3895 default:
3896 gcc_unreachable ();
3899 else
3901 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3902 use of rip-relative addressing. This eliminates fixups that
3903 would otherwise be needed if this object is to be placed in a
3904 DLL, and is essentially just as efficient as direct addressing. */
3905 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3906 && (TARGET_RDOS || TARGET_PECOFF))
3907 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3908 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3909 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3910 else
3911 opts->x_ix86_cmodel = CM_32;
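/* So a plain -m64 compilation defaults to the small code model (or small
   PIC when -fpic/-fPIC is in effect), while 32-bit code always uses CM_32
   when no code model was requested.  */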
3913 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3915 error ("-masm=intel not supported in this configuration");
3916 opts->x_ix86_asm_dialect = ASM_ATT;
3918 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3919 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3920 sorry ("%i-bit mode not compiled in",
3921 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3923 for (i = 0; i < pta_size; i++)
3924 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3926 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3928 error (main_args_p
3929 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3930 "switch")
3931 : G_("%<generic%> CPU can be used only for "
3932 "%<target(\"tune=\")%> attribute"));
3933 return false;
3935 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3937 error (main_args_p
3938 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3939 "switch")
3940 : G_("%<intel%> CPU can be used only for "
3941 "%<target(\"tune=\")%> attribute"));
3942 return false;
3945 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3946 && !((processor_alias_table[i].flags & PTA_64BIT) != 0))
3948 error ("CPU you selected does not support x86-64 "
3949 "instruction set");
3950 return false;
3953 ix86_schedule = processor_alias_table[i].schedule;
3954 ix86_arch = processor_alias_table[i].processor;
3955 /* Default cpu tuning to the architecture. */
3956 ix86_tune = ix86_arch;
3958 if (((processor_alias_table[i].flags & PTA_MMX) != 0)
3959 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3960 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3961 if (((processor_alias_table[i].flags & PTA_3DNOW) != 0)
3962 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3963 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3964 if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0)
3965 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3966 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3967 if (((processor_alias_table[i].flags & PTA_SSE) != 0)
3968 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3969 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3970 if (((processor_alias_table[i].flags & PTA_SSE2) != 0)
3971 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3972 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3973 if (((processor_alias_table[i].flags & PTA_SSE3) != 0)
3974 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3975 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3976 if (((processor_alias_table[i].flags & PTA_SSSE3) != 0)
3977 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3978 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3979 if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0)
3980 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3981 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3982 if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0)
3983 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3984 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3985 if (((processor_alias_table[i].flags & PTA_AVX) != 0)
3986 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3987 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3988 if (((processor_alias_table[i].flags & PTA_AVX2) != 0)
3989 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3990 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3991 if (((processor_alias_table[i].flags & PTA_FMA) != 0)
3992 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3993 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3994 if (((processor_alias_table[i].flags & PTA_SSE4A) != 0)
3995 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3996 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3997 if (((processor_alias_table[i].flags & PTA_FMA4) != 0)
3998 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3999 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
4000 if (((processor_alias_table[i].flags & PTA_XOP) != 0)
4001 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
4002 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
4003 if (((processor_alias_table[i].flags & PTA_LWP) != 0)
4004 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
4005 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
4006 if (((processor_alias_table[i].flags & PTA_ABM) != 0)
4007 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
4008 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
4009 if (((processor_alias_table[i].flags & PTA_BMI) != 0)
4010 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
4011 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
4012 if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0)
4013 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
4014 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
4015 if (((processor_alias_table[i].flags & PTA_TBM) != 0)
4016 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
4017 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
4018 if (((processor_alias_table[i].flags & PTA_BMI2) != 0)
4019 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
4020 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
4021 if (((processor_alias_table[i].flags & PTA_CX16) != 0)
4022 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16))
4023 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16;
4024 if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0)
4025 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
4026 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
4027 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
4028 && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0))
4029 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
4030 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
4031 if (((processor_alias_table[i].flags & PTA_MOVBE) != 0)
4032 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE))
4033 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE;
4034 if (((processor_alias_table[i].flags & PTA_AES) != 0)
4035 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
4036 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
4037 if (((processor_alias_table[i].flags & PTA_SHA) != 0)
4038 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4039 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4040 if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0)
4041 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4042 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4043 if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0)
4044 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4045 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4046 if (((processor_alias_table[i].flags & PTA_RDRND) != 0)
4047 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4048 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4049 if (((processor_alias_table[i].flags & PTA_F16C) != 0)
4050 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4051 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4052 if (((processor_alias_table[i].flags & PTA_RTM) != 0)
4053 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4054 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4055 if (((processor_alias_table[i].flags & PTA_HLE) != 0)
4056 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE))
4057 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE;
4058 if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0)
4059 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4060 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4061 if (((processor_alias_table[i].flags & PTA_RDSEED) != 0)
4062 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4063 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4064 if (((processor_alias_table[i].flags & PTA_ADX) != 0)
4065 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4066 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4067 if (((processor_alias_table[i].flags & PTA_FXSR) != 0)
4068 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4069 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4070 if (((processor_alias_table[i].flags & PTA_XSAVE) != 0)
4071 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4072 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4073 if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0)
4074 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4075 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4076 if (((processor_alias_table[i].flags & PTA_AVX512F) != 0)
4077 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4078 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4079 if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0)
4080 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4081 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4082 if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0)
4083 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4084 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4085 if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0)
4086 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4087 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4088 if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0)
4089 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4090 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4091 if (((processor_alias_table[i].flags & PTA_CLWB) != 0)
4092 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4093 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4094 if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0)
4095 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4096 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4097 if (((processor_alias_table[i].flags & PTA_CLZERO) != 0)
4098 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO))
4099 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO;
4100 if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0)
4101 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4102 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4103 if (((processor_alias_table[i].flags & PTA_XSAVES) != 0)
4104 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4105 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4106 if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0)
4107 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4108 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4109 if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0)
4110 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4111 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4112 if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0)
4113 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4114 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4115 if (((processor_alias_table[i].flags & PTA_MPX) != 0)
4116 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MPX))
4117 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MPX;
4118 if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0)
4119 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4120 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4121 if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0)
4122 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4123 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4124 if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0)
4125 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI))
4126 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI;
4127 if (((processor_alias_table[i].flags & PTA_GFNI) != 0)
4128 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI))
4129 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI;
4130 if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0)
4131 && !(opts->x_ix86_isa_flags_explicit
4132 & OPTION_MASK_ISA_AVX512VBMI2))
4133 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2;
4134 if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0)
4135 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ))
4136 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ;
4137 if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0)
4138 && !(opts->x_ix86_isa_flags_explicit
4139 & OPTION_MASK_ISA_AVX512BITALG))
4140 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG;
4142 if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0)
4143 && !(opts->x_ix86_isa_flags2_explicit
4144 & OPTION_MASK_ISA_AVX5124VNNIW))
4145 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4146 if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0)
4147 && !(opts->x_ix86_isa_flags2_explicit
4148 & OPTION_MASK_ISA_AVX5124FMAPS))
4149 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4150 if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0)
4151 && !(opts->x_ix86_isa_flags_explicit
4152 & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4153 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4154 if (((processor_alias_table[i].flags & PTA_SGX) != 0)
4155 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4156 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4157 if (((processor_alias_table[i].flags & PTA_VAES) != 0)
4158 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES))
4159 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES;
4160 if (((processor_alias_table[i].flags & PTA_RDPID) != 0)
4161 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID))
4162 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID;
4163 if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0)
4164 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PCONFIG))
4165 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PCONFIG;
4166 if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0)
4167 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_WBNOINVD))
4168 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_WBNOINVD;
4170 if ((processor_alias_table[i].flags
4171 & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)
4172 x86_prefetch_sse = true;
4173 if (((processor_alias_table[i].flags & PTA_MWAITX) != 0)
4174 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX))
4175 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX;
4176 if (((processor_alias_table[i].flags & PTA_PKU) != 0)
4177 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4178 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4180 /* Don't enable x87 instructions if only
4181 general registers are allowed. */
4182 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4183 && !(opts_set->x_target_flags & MASK_80387))
4185 if (((processor_alias_table[i].flags & PTA_NO_80387) != 0))
4186 opts->x_target_flags &= ~MASK_80387;
4187 else
4188 opts->x_target_flags |= MASK_80387;
4190 break;
4193 if (TARGET_X32 && (opts->x_ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4194 error ("Intel MPX does not support x32");
4199 if (i == pta_size)
4201 error (main_args_p
4202 ? G_("bad value (%qs) for %<-march=%> switch")
4203 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4204 opts->x_ix86_arch_string);
4206 auto_vec <const char *> candidates;
4207 for (i = 0; i < pta_size; i++)
4208 if (strcmp (processor_alias_table[i].name, "generic")
4209 && strcmp (processor_alias_table[i].name, "intel")
4210 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4211 || ((processor_alias_table[i].flags & PTA_64BIT) != 0)))
4212 candidates.safe_push (processor_alias_table[i].name);
4214 #ifdef HAVE_LOCAL_CPU_DETECT
4215 /* Add also "native" as possible value. */
4216 candidates.safe_push ("native");
4217 #endif
4219 char *s;
4220 const char *hint
4221 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4222 if (hint)
4223 inform (input_location,
4224 main_args_p
4225 ? G_("valid arguments to %<-march=%> switch are: "
4226 "%s; did you mean %qs?")
4227 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4228 "%s; did you mean %qs?"), s, hint);
4229 else
4230 inform (input_location,
4231 main_args_p
4232 ? G_("valid arguments to %<-march=%> switch are: %s")
4233 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4234 "are: %s"), s);
4235 XDELETEVEC (s);
4238 ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch;
4239 for (i = 0; i < X86_ARCH_LAST; ++i)
4240 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4242 for (i = 0; i < pta_size; i++)
4243 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4245 ix86_schedule = processor_alias_table[i].schedule;
4246 ix86_tune = processor_alias_table[i].processor;
4247 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4249 if (!((processor_alias_table[i].flags & PTA_64BIT) != 0))
4251 if (ix86_tune_defaulted)
4253 opts->x_ix86_tune_string = "x86-64";
4254 for (i = 0; i < pta_size; i++)
4255 if (! strcmp (opts->x_ix86_tune_string,
4256 processor_alias_table[i].name))
4257 break;
4258 ix86_schedule = processor_alias_table[i].schedule;
4259 ix86_tune = processor_alias_table[i].processor;
4261 else
4262 error ("CPU you selected does not support x86-64 "
4263 "instruction set");
4266 /* Intel CPUs have always interpreted SSE prefetch instructions as
4267 NOPs; so, we can enable SSE prefetch instructions even when
4268 -mtune (rather than -march) points us to a processor that has them.
4269 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4270 higher processors. */
4271 if (TARGET_CMOV
4272 && ((processor_alias_table[i].flags
4273 & (PTA_PREFETCH_SSE | PTA_SSE)) != 0))
4274 x86_prefetch_sse = true;
4275 break;
4278 if (ix86_tune_specified && i == pta_size)
4280 error (main_args_p
4281 ? G_("bad value (%qs) for %<-mtune=%> switch")
4282 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4283 opts->x_ix86_tune_string);
4285 auto_vec <const char *> candidates;
4286 for (i = 0; i < pta_size; i++)
4287 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4288 || ((processor_alias_table[i].flags & PTA_64BIT) != 0))
4289 candidates.safe_push (processor_alias_table[i].name);
4291 #ifdef HAVE_LOCAL_CPU_DETECT
4292 /* Add also "native" as possible value. */
4293 candidates.safe_push ("native");
4294 #endif
4296 char *s;
4297 const char *hint
4298 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4299 if (hint)
4300 inform (input_location,
4301 main_args_p
4302 ? G_("valid arguments to %<-mtune=%> switch are: "
4303 "%s; did you mean %qs?")
4304 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4305 "%s; did you mean %qs?"), s, hint);
4306 else
4307 inform (input_location,
4308 main_args_p
4309 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4310 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4311 "are: %s"), s);
4312 XDELETEVEC (s);
4315 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4317 #ifndef USE_IX86_FRAME_POINTER
4318 #define USE_IX86_FRAME_POINTER 0
4319 #endif
4321 #ifndef USE_X86_64_FRAME_POINTER
4322 #define USE_X86_64_FRAME_POINTER 0
4323 #endif
4325 /* Set the default values for switches whose default depends on TARGET_64BIT
4326 in case they weren't overwritten by command line options. */
4327 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4329 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4330 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4331 if (opts->x_flag_asynchronous_unwind_tables
4332 && !opts_set->x_flag_unwind_tables
4333 && TARGET_64BIT_MS_ABI)
4334 opts->x_flag_unwind_tables = 1;
4335 if (opts->x_flag_asynchronous_unwind_tables == 2)
4336 opts->x_flag_unwind_tables
4337 = opts->x_flag_asynchronous_unwind_tables = 1;
4338 if (opts->x_flag_pcc_struct_return == 2)
4339 opts->x_flag_pcc_struct_return = 0;
4341 else
4343 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4344 opts->x_flag_omit_frame_pointer
4345 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4346 if (opts->x_flag_asynchronous_unwind_tables == 2)
4347 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4348 if (opts->x_flag_pcc_struct_return == 2)
4350 /* Intel MCU psABI specifies that -freg-struct-return should
4351 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4352 we check -miamcu so that -freg-struct-return is always
4353 turned on if -miamcu is used. */
4354 if (TARGET_IAMCU_P (opts->x_target_flags))
4355 opts->x_flag_pcc_struct_return = 0;
4356 else
4357 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4361 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4362 /* TODO: ix86_cost should be chosen at instruction or function granularity
4363 so that for cold code we can use size_cost even in !optimize_size compilations. */
4364 if (opts->x_optimize_size)
4365 ix86_cost = &ix86_size_cost;
4366 else
4367 ix86_cost = ix86_tune_cost;
4369 /* Arrange to set up i386_stack_locals for all functions. */
4370 init_machine_status = ix86_init_machine_status;
4372 /* Validate -mregparm= value. */
4373 if (opts_set->x_ix86_regparm)
4375 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4376 warning (0, "-mregparm is ignored in 64-bit mode");
4377 else if (TARGET_IAMCU_P (opts->x_target_flags))
4378 warning (0, "-mregparm is ignored for Intel MCU psABI");
4379 if (opts->x_ix86_regparm > REGPARM_MAX)
4381 error ("-mregparm=%d is not between 0 and %d",
4382 opts->x_ix86_regparm, REGPARM_MAX);
4383 opts->x_ix86_regparm = 0;
4386 if (TARGET_IAMCU_P (opts->x_target_flags)
4387 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4388 opts->x_ix86_regparm = REGPARM_MAX;
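/* For reference, -mregparm=3 asks for up to three integer arguments to be
   passed in registers (typically EAX, EDX and ECX) in 32-bit code;
   REGPARM_MAX bounds the value accepted above.  */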
4390 /* Default align_* from the processor table. */
4391 ix86_default_align (opts);
4393 /* Provide default for -mbranch-cost= value. */
4394 if (!opts_set->x_ix86_branch_cost)
4395 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4397 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4399 opts->x_target_flags
4400 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4402 /* Enable by default the SSE and MMX builtins. Do allow the user to
4403 explicitly disable any of these. In particular, disabling SSE and
4404 MMX for kernel code is extremely useful. */
4405 if (!ix86_arch_specified)
4406 opts->x_ix86_isa_flags
4407 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4408 | TARGET_SUBTARGET64_ISA_DEFAULT)
4409 & ~opts->x_ix86_isa_flags_explicit);
4411 if (TARGET_RTD_P (opts->x_target_flags))
4412 warning (0,
4413 main_args_p
4414 ? G_("%<-mrtd%> is ignored in 64bit mode")
4415 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4417 else
4419 opts->x_target_flags
4420 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4422 if (!ix86_arch_specified)
4423 opts->x_ix86_isa_flags
4424 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4426 /* The i386 ABI does not specify a red zone. It still makes sense to use it
4427 when the programmer takes care to keep the stack from being clobbered. */
4428 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4429 opts->x_target_flags |= MASK_NO_RED_ZONE;
4432 /* Keep nonleaf frame pointers. */
4433 if (opts->x_flag_omit_frame_pointer)
4434 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4435 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4436 opts->x_flag_omit_frame_pointer = 1;
4438 /* If we're doing fast math, we don't care about comparison order
4439 wrt NaNs. This lets us use a shorter comparison sequence. */
4440 if (opts->x_flag_finite_math_only)
4441 opts->x_target_flags &= ~MASK_IEEE_FP;
4443 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4444 since the insns won't need emulation. */
4445 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4446 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4448 /* Likewise, if the target doesn't have a 387, or we've specified
4449 software floating point, don't use 387 inline intrinsics. */
4450 if (!TARGET_80387_P (opts->x_target_flags))
4451 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4453 /* Turn on MMX builtins for -msse. */
4454 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4455 opts->x_ix86_isa_flags
4456 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4458 /* Enable SSE prefetch. */
4459 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4460 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4461 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4462 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4463 x86_prefetch_sse = true;
4465 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4466 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4467 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4468 opts->x_ix86_isa_flags
4469 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4471 /* Enable lzcnt instruction for -mabm. */
4472 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
4473 opts->x_ix86_isa_flags
4474 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4476 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4477 if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
4478 opts->x_ix86_isa_flags
4479 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4480 & ~opts->x_ix86_isa_flags_explicit);
4482 /* Validate -mpreferred-stack-boundary= value or default it to
4483 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4484 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
4485 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4487 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
4488 int max = TARGET_SEH ? 4 : 12;
4490 if (opts->x_ix86_preferred_stack_boundary_arg < min
4491 || opts->x_ix86_preferred_stack_boundary_arg > max)
4493 if (min == max)
4494 error ("-mpreferred-stack-boundary is not supported "
4495 "for this target");
4496 else
4497 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4498 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4500 else
4501 ix86_preferred_stack_boundary
4502 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
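/* The option argument is log2 of the boundary in bytes, so e.g.
   -mpreferred-stack-boundary=4 requests a 16-byte (128-bit) boundary.  */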
4505 /* Set the default value for -mstackrealign. */
4506 if (!opts_set->x_ix86_force_align_arg_pointer)
4507 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4509 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4511 /* Validate -mincoming-stack-boundary= value or default it to
4512 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4513 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4514 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4516 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4518 if (opts->x_ix86_incoming_stack_boundary_arg < min
4519 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4520 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4521 opts->x_ix86_incoming_stack_boundary_arg, min);
4522 else
4524 ix86_user_incoming_stack_boundary
4525 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4526 ix86_incoming_stack_boundary
4527 = ix86_user_incoming_stack_boundary;
4531 #ifndef NO_PROFILE_COUNTERS
4532 if (flag_nop_mcount)
4533 error ("-mnop-mcount is not compatible with this target");
4534 #endif
4535 if (flag_nop_mcount && flag_pic)
4536 error ("-mnop-mcount is not implemented for -fPIC");
4538 /* Accept -msseregparm only if at least SSE support is enabled. */
4539 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4540 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4541 error (main_args_p
4542 ? G_("%<-msseregparm%> used without SSE enabled")
4543 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4545 if (opts_set->x_ix86_fpmath)
4547 if (opts->x_ix86_fpmath & FPMATH_SSE)
4549 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4551 if (TARGET_80387_P (opts->x_target_flags))
4553 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4554 opts->x_ix86_fpmath = FPMATH_387;
4557 else if ((opts->x_ix86_fpmath & FPMATH_387)
4558 && !TARGET_80387_P (opts->x_target_flags))
4560 warning (0, "387 instruction set disabled, using SSE arithmetics");
4561 opts->x_ix86_fpmath = FPMATH_SSE;
4565 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4566 -mfpmath=387. The latter is nevertheless the default on many targets, since
4567 the extra 80-bit precision of temporaries is considered to be part of the ABI.
4568 Override the default at least for -ffast-math.
4569 TODO: -mfpmath=both seems to produce similarly performing code with slightly
4570 smaller binaries. It is however not clear whether register allocation is
4571 ready for this setting.
4572 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
4573 codegen. We may switch to 387 with -ffast-math for size-optimized
4574 functions. */
4575 else if (fast_math_flags_set_p (&global_options)
4576 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4577 opts->x_ix86_fpmath = FPMATH_SSE;
4578 else
4579 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
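/* E.g. compiling with -ffast-math and an SSE2-capable -march= ends up with
   FPMATH_SSE here even on 32-bit targets where 387 would otherwise
   typically be the default.  */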
4581 /* Use external vectorized library in vectorizing intrinsics. */
4582 if (opts_set->x_ix86_veclibabi_type)
4583 switch (opts->x_ix86_veclibabi_type)
4585 case ix86_veclibabi_type_svml:
4586 ix86_veclib_handler = ix86_veclibabi_svml;
4587 break;
4589 case ix86_veclibabi_type_acml:
4590 ix86_veclib_handler = ix86_veclibabi_acml;
4591 break;
4593 default:
4594 gcc_unreachable ();
4597 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4598 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4599 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4601 /* If stack probes are required, the space used for large function
4602 arguments on the stack must also be probed, so enable
4603 -maccumulate-outgoing-args so this happens in the prologue. */
4604 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4605 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4607 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4608 warning (0,
4609 main_args_p
4610 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4611 "for correctness")
4612 : G_("stack probing requires "
4613 "%<target(\"accumulate-outgoing-args\")%> for "
4614 "correctness"));
4615 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4618 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4619 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4620 if (fixed_regs[BP_REG]
4621 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4623 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4624 warning (0,
4625 main_args_p
4626 ? G_("fixed ebp register requires "
4627 "%<-maccumulate-outgoing-args%>")
4628 : G_("fixed ebp register requires "
4629 "%<target(\"accumulate-outgoing-args\")%>"));
4630 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4633 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4635 char *p;
4636 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4637 p = strchr (internal_label_prefix, 'X');
4638 internal_label_prefix_len = p - internal_label_prefix;
4639 *p = '\0';
4642 /* When a scheduling description is not available, disable the scheduler pass
4643 so it won't slow down compilation or make x87 code slower. */
4644 if (!TARGET_SCHEDULE)
4645 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4647 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4648 ix86_tune_cost->simultaneous_prefetches,
4649 opts->x_param_values,
4650 opts_set->x_param_values);
4651 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4652 ix86_tune_cost->prefetch_block,
4653 opts->x_param_values,
4654 opts_set->x_param_values);
4655 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4656 ix86_tune_cost->l1_cache_size,
4657 opts->x_param_values,
4658 opts_set->x_param_values);
4659 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4660 ix86_tune_cost->l2_cache_size,
4661 opts->x_param_values,
4662 opts_set->x_param_values);
4664 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4665 if (opts->x_flag_prefetch_loop_arrays < 0
4666 && HAVE_prefetch
4667 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4668 && !opts->x_optimize_size
4669 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4670 opts->x_flag_prefetch_loop_arrays = 1;
4672 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4673 can be optimized to ap = __builtin_next_arg (0). */
4674 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4675 targetm.expand_builtin_va_start = NULL;
4677 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4679 ix86_gen_leave = gen_leave_rex64;
4680 if (Pmode == DImode)
4682 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4683 ix86_gen_tls_local_dynamic_base_64
4684 = gen_tls_local_dynamic_base_64_di;
4686 else
4688 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4689 ix86_gen_tls_local_dynamic_base_64
4690 = gen_tls_local_dynamic_base_64_si;
4693 else
4694 ix86_gen_leave = gen_leave;
4696 if (Pmode == DImode)
4698 ix86_gen_add3 = gen_adddi3;
4699 ix86_gen_sub3 = gen_subdi3;
4700 ix86_gen_sub3_carry = gen_subdi3_carry;
4701 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4702 ix86_gen_andsp = gen_anddi3;
4703 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4704 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4705 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4706 ix86_gen_monitor = gen_sse3_monitor_di;
4707 ix86_gen_monitorx = gen_monitorx_di;
4708 ix86_gen_clzero = gen_clzero_di;
4710 else
4712 ix86_gen_add3 = gen_addsi3;
4713 ix86_gen_sub3 = gen_subsi3;
4714 ix86_gen_sub3_carry = gen_subsi3_carry;
4715 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4716 ix86_gen_andsp = gen_andsi3;
4717 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4718 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4719 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4720 ix86_gen_monitor = gen_sse3_monitor_si;
4721 ix86_gen_monitorx = gen_monitorx_si;
4722 ix86_gen_clzero = gen_clzero_si;
4725 #ifdef USE_IX86_CLD
4726 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4727 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4728 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4729 #endif
4731 /* Set the default value for -mfentry. */
4732 if (!opts_set->x_flag_fentry)
4733 opts->x_flag_fentry = TARGET_SEH;
4734 else
4736 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4737 && opts->x_flag_fentry)
4738 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4739 "with -fpic");
4740 else if (TARGET_SEH && !opts->x_flag_fentry)
4741 sorry ("-mno-fentry isn%'t compatible with SEH");
4744 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4745 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4747 if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4748 && TARGET_EMIT_VZEROUPPER)
4749 opts->x_target_flags |= MASK_VZEROUPPER;
4750 if (!(opts_set->x_target_flags & MASK_STV))
4751 opts->x_target_flags |= MASK_STV;
4752 /* Disable STV if -mpreferred-stack-boundary={2,3} or
4753 -mincoming-stack-boundary={2,3} or -mstackrealign is in effect - the
4754 needed stack realignment is an extra cost the pass doesn't take into
4755 account, and the pass can't realign the stack. */
4756 if (ix86_preferred_stack_boundary < 128
4757 || ix86_incoming_stack_boundary < 128
4758 || opts->x_ix86_force_align_arg_pointer)
4759 opts->x_target_flags &= ~MASK_STV;
4760 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4761 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4762 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4763 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4764 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4765 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4767 /* Enable 128-bit AVX instruction generation
4768 for the auto-vectorizer. */
4769 if (TARGET_AVX128_OPTIMAL
4770 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4771 opts->x_prefer_vector_width_type = PVW_AVX128;
4773 /* Use 256-bit AVX instruction generation
4774 in the auto-vectorizer. */
4775 if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4776 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4777 opts->x_prefer_vector_width_type = PVW_AVX256;
4779 if (opts->x_ix86_recip_name)
4781 char *p = ASTRDUP (opts->x_ix86_recip_name);
4782 char *q;
4783 unsigned int mask, i;
4784 bool invert;
4786 while ((q = strtok (p, ",")) != NULL)
4788 p = NULL;
4789 if (*q == '!')
4791 invert = true;
4792 q++;
4794 else
4795 invert = false;
4797 if (!strcmp (q, "default"))
4798 mask = RECIP_MASK_ALL;
4799 else
4801 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4802 if (!strcmp (q, recip_options[i].string))
4804 mask = recip_options[i].mask;
4805 break;
4808 if (i == ARRAY_SIZE (recip_options))
4810 error ("unknown option for -mrecip=%s", q);
4811 invert = false;
4812 mask = RECIP_MASK_NONE;
4816 opts->x_recip_mask_explicit |= mask;
4817 if (invert)
4818 opts->x_recip_mask &= ~mask;
4819 else
4820 opts->x_recip_mask |= mask;
4824 if (TARGET_RECIP_P (opts->x_target_flags))
4825 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4826 else if (opts_set->x_target_flags & MASK_RECIP)
4827 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
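/* In other words, a bare -mrecip (or -mno-recip) only affects the
   approximations that were not configured explicitly through -mrecip=.  */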
4829 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4830 for 64-bit Bionic. Also default long double to 64-bit for Intel
4831 MCU psABI. */
4832 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4833 && !(opts_set->x_target_flags
4834 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4835 opts->x_target_flags |= (TARGET_64BIT
4836 ? MASK_LONG_DOUBLE_128
4837 : MASK_LONG_DOUBLE_64);
4839 /* Only one of them can be active. */
4840 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4841 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4843 /* Handle stack protector */
4844 if (!opts_set->x_ix86_stack_protector_guard)
4845 opts->x_ix86_stack_protector_guard
4846 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4848 #ifdef TARGET_THREAD_SSP_OFFSET
4849 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4850 #endif
4852 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4854 char *endp;
4855 const char *str = ix86_stack_protector_guard_offset_str;
4857 errno = 0;
4858 int64_t offset;
4860 #if defined(INT64_T_IS_LONG)
4861 offset = strtol (str, &endp, 0);
4862 #else
4863 offset = strtoll (str, &endp, 0);
4864 #endif
4866 if (!*str || *endp || errno)
4867 error ("%qs is not a valid number "
4868 "in -mstack-protector-guard-offset=", str);
4870 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4871 HOST_WIDE_INT_C (0x7fffffff)))
4872 error ("%qs is not a valid offset "
4873 "in -mstack-protector-guard-offset=", str);
4875 ix86_stack_protector_guard_offset = offset;
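/* The offset string is parsed with base 0, so decimal, octal and hex forms
   (e.g. a hypothetical -mstack-protector-guard-offset=0x28) are accepted,
   as long as the value fits in the signed 32-bit range checked above.  */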
4878 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4880 /* The kernel uses a different segment register for performance
4881 reasons; a system call would not have to trash the userspace
4882 segment register, which would be expensive. */
4883 if (ix86_cmodel == CM_KERNEL)
4884 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
4886 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4888 const char *str = ix86_stack_protector_guard_reg_str;
4889 addr_space_t seg = ADDR_SPACE_GENERIC;
4891 /* Discard optional register prefix. */
4892 if (str[0] == '%')
4893 str++;
4895 if (strlen (str) == 2 && str[1] == 's')
4897 if (str[0] == 'f')
4898 seg = ADDR_SPACE_SEG_FS;
4899 else if (str[0] == 'g')
4900 seg = ADDR_SPACE_SEG_GS;
4903 if (seg == ADDR_SPACE_GENERIC)
4904 error ("%qs is not a valid base register "
4905 "in -mstack-protector-guard-reg=",
4906 ix86_stack_protector_guard_reg_str);
4908 ix86_stack_protector_guard_reg = seg;
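/* E.g. -mstack-protector-guard-reg=fs selects ADDR_SPACE_SEG_FS; a leading
   "%" as in "%gs" is also accepted because it is stripped above.  */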
4911 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4912 if (opts->x_ix86_tune_memcpy_strategy)
4914 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4915 ix86_parse_stringop_strategy_string (str, false);
4916 free (str);
4919 if (opts->x_ix86_tune_memset_strategy)
4921 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4922 ix86_parse_stringop_strategy_string (str, true);
4923 free (str);
4926 /* Save the initial options in case the user uses function-specific
4927 options. */
4928 if (main_args_p)
4929 target_option_default_node = target_option_current_node
4930 = build_target_option_node (opts);
4932 /* Do not support control flow instrumentation if CET is not enabled. */
4933 cf_protection_level cf_protection
4934 = (cf_protection_level) (opts->x_flag_cf_protection & ~CF_SET);
4935 if (cf_protection != CF_NONE)
4937 switch (cf_protection)
4939 case CF_BRANCH:
4940 if (! TARGET_IBT_P (opts->x_ix86_isa_flags2))
4942 error ("%<-fcf-protection=branch%> requires Intel CET "
4943 "support. Use -mcet or -mibt option to enable CET");
4944 flag_cf_protection = CF_NONE;
4945 return false;
4947 break;
4948 case CF_RETURN:
4949 if (! TARGET_SHSTK_P (opts->x_ix86_isa_flags))
4951 error ("%<-fcf-protection=return%> requires Intel CET "
4952 "support. Use -mcet or -mshstk option to enable CET");
4953 flag_cf_protection = CF_NONE;
4954 return false;
4956 break;
4957 case CF_FULL:
4958 if ( ! TARGET_IBT_P (opts->x_ix86_isa_flags2)
4959 || ! TARGET_SHSTK_P (opts->x_ix86_isa_flags))
4961 error ("%<-fcf-protection=full%> requires Intel CET "
4962 "support. Use -mcet or both of -mibt and "
4963 "-mshstk options to enable CET");
4964 flag_cf_protection = CF_NONE;
4965 return false;
4967 break;
4968 default:
4969 gcc_unreachable ();
4972 opts->x_flag_cf_protection =
4973 (cf_protection_level) (cf_protection | CF_SET);
4976 if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS])
4977 maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128,
4978 opts->x_param_values,
4979 opts_set->x_param_values);
4981 return true;
4984 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4986 static void
4987 ix86_option_override (void)
4989 ix86_option_override_internal (true, &global_options, &global_options_set);
4992 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4993 static char *
4994 ix86_offload_options (void)
4996 if (TARGET_LP64)
4997 return xstrdup ("-foffload-abi=lp64");
4998 return xstrdup ("-foffload-abi=ilp32");
5001 /* Update register usage after having seen the compiler flags. */
5003 static void
5004 ix86_conditional_register_usage (void)
5006 int i, c_mask;
5008 /* If there are no caller-saved registers, preserve all registers
5009 except fixed_regs and registers used for the function return value,
5010 since aggregate_value_p checks call_used_regs[regno] on the return
5011 value. */
5012 if (cfun && cfun->machine->no_caller_saved_registers)
5013 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5014 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
5015 call_used_regs[i] = 0;
5017 /* For 32-bit targets, squash the REX registers. */
5018 if (! TARGET_64BIT)
5020 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
5021 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5022 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
5023 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5024 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
5025 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5028 /* See the definition of CALL_USED_REGISTERS in i386.h. */
5029 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
5031 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
5033 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5035 /* Set/reset conditionally defined registers from
5036 CALL_USED_REGISTERS initializer. */
5037 if (call_used_regs[i] > 1)
5038 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
5040 /* Calculate registers of CLOBBERED_REGS register set
5041 as call used registers from GENERAL_REGS register set. */
5042 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
5043 && call_used_regs[i])
5044 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
5047 /* If MMX is disabled, squash the registers. */
5048 if (! TARGET_MMX)
5049 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5050 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
5051 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5053 /* If SSE is disabled, squash the registers. */
5054 if (! TARGET_SSE)
5055 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5056 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
5057 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5059 /* If the FPU is disabled, squash the registers. */
5060 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
5061 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5062 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
5063 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5065 /* If AVX512F is disabled, squash the registers. */
5066 if (! TARGET_AVX512F)
5068 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
5069 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5071 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
5072 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5075 /* If MPX is disabled, squash the registers. */
5076 if (! TARGET_MPX)
5077 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
5078 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5081 /* Canonicalize a comparison from one we don't have to one we do have. */
5083 static void
5084 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
5085 bool op0_preserve_value)
5087 /* The order of operands in the x87 ficom compare is forced by combine in
5088 the simplify_comparison () function. The FLOAT operator is treated as RTX_OBJ
5089 with precedence over other operators and is always put in the first
5090 place. Swap the condition and operands to match the ficom instruction. */
5091 if (!op0_preserve_value
5092 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
5094 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
5096 /* We are called only for compares that are split to SAHF instruction.
5097 Ensure that we have setcc/jcc insn for the swapped condition. */
5098 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
5100 std::swap (*op0, *op1);
5101 *code = (int) scode;
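/* For instance, (lt (float (mem)) (reg)) becomes (gt (reg) (float (mem)))
   here, matching the operand order the ficom instruction expects.  */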
5106 /* Save the current options */
5108 static void
5109 ix86_function_specific_save (struct cl_target_option *ptr,
5110 struct gcc_options *opts)
5112 ptr->arch = ix86_arch;
5113 ptr->schedule = ix86_schedule;
5114 ptr->prefetch_sse = x86_prefetch_sse;
5115 ptr->tune = ix86_tune;
5116 ptr->branch_cost = ix86_branch_cost;
5117 ptr->tune_defaulted = ix86_tune_defaulted;
5118 ptr->arch_specified = ix86_arch_specified;
5119 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5120 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5121 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5122 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5123 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5124 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5125 ptr->x_ix86_abi = opts->x_ix86_abi;
5126 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5127 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5128 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5129 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5130 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5131 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5132 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5133 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5134 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5135 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5136 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5137 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5138 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5139 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5140 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5141 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5142 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5143 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5144 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5145 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5147 /* The fields are char but the variables are not; make sure the
5148 values fit in the fields. */
5149 gcc_assert (ptr->arch == ix86_arch);
5150 gcc_assert (ptr->schedule == ix86_schedule);
5151 gcc_assert (ptr->tune == ix86_tune);
5152 gcc_assert (ptr->branch_cost == ix86_branch_cost);
5155 /* Restore the current options */
5157 static void
5158 ix86_function_specific_restore (struct gcc_options *opts,
5159 struct cl_target_option *ptr)
5161 enum processor_type old_tune = ix86_tune;
5162 enum processor_type old_arch = ix86_arch;
5163 unsigned HOST_WIDE_INT ix86_arch_mask;
5164 int i;
5166 /* We don't change -fPIC. */
5167 opts->x_flag_pic = flag_pic;
5169 ix86_arch = (enum processor_type) ptr->arch;
5170 ix86_schedule = (enum attr_cpu) ptr->schedule;
5171 ix86_tune = (enum processor_type) ptr->tune;
5172 x86_prefetch_sse = ptr->prefetch_sse;
5173 opts->x_ix86_branch_cost = ptr->branch_cost;
5174 ix86_tune_defaulted = ptr->tune_defaulted;
5175 ix86_arch_specified = ptr->arch_specified;
5176 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5177 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5178 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5179 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5180 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5181 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5182 opts->x_ix86_abi = ptr->x_ix86_abi;
5183 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5184 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5185 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5186 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5187 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5188 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5189 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5190 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5191 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5192 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5193 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5194 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5195 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5196 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5197 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5198 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5199 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5200 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5201 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5202 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5203 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5204 /* TODO: ix86_cost should be chosen at instruction or function granularity
5205 so that for cold code we use size_cost even in !optimize_size compilation. */
5206 if (opts->x_optimize_size)
5207 ix86_cost = &ix86_size_cost;
5208 else
5209 ix86_cost = ix86_tune_cost;
5211 /* Recreate the arch feature tests if the arch changed */
5212 if (old_arch != ix86_arch)
5214 ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch;
5215 for (i = 0; i < X86_ARCH_LAST; ++i)
5216 ix86_arch_features[i]
5217 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5220 /* Recreate the tune optimization tests */
5221 if (old_tune != ix86_tune)
5222 set_ix86_tune_features (ix86_tune, false);
5225 /* Adjust target options after streaming them in. This is mainly about
5226 reconciling them with global options. */
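/* For example, when an LTO object compiled with -mcmodel=small is streamed
   into a -fpic compilation, the streamed-in CM_SMALL is rewritten to
   CM_SMALL_PIC below, while CM_KERNEL in the same situation is rejected
   with an error.  */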
5228 static void
5229 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5231 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
5232 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
5233 for PIC, or error out. */
5234 if (flag_pic)
5235 switch (ptr->x_ix86_cmodel)
5237 case CM_SMALL:
5238 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5239 break;
5241 case CM_MEDIUM:
5242 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5243 break;
5245 case CM_LARGE:
5246 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5247 break;
5249 case CM_KERNEL:
5250 error ("code model %s does not support PIC mode", "kernel");
5251 break;
5253 default:
5254 break;
5256 else
5257 switch (ptr->x_ix86_cmodel)
5259 case CM_SMALL_PIC:
5260 ptr->x_ix86_cmodel = CM_SMALL;
5261 break;
5263 case CM_MEDIUM_PIC:
5264 ptr->x_ix86_cmodel = CM_MEDIUM;
5265 break;
5267 case CM_LARGE_PIC:
5268 ptr->x_ix86_cmodel = CM_LARGE;
5269 break;
5271 default:
5272 break;
5276 /* Print the current options */
5278 static void
5279 ix86_function_specific_print (FILE *file, int indent,
5280 struct cl_target_option *ptr)
5282 char *target_string
5283 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5284 ptr->x_target_flags, ptr->x_ix86_target_flags,
5285 NULL, NULL, ptr->x_ix86_fpmath, false);
5287 gcc_assert (ptr->arch < PROCESSOR_max);
5288 fprintf (file, "%*sarch = %d (%s)\n",
5289 indent, "",
5290 ptr->arch, processor_target_table[ptr->arch].name);
5292 gcc_assert (ptr->tune < PROCESSOR_max);
5293 fprintf (file, "%*stune = %d (%s)\n",
5294 indent, "",
5295 ptr->tune, processor_target_table[ptr->tune].name);
5297 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5299 if (target_string)
5301 fprintf (file, "%*s%s\n", indent, "", target_string);
5302 free (target_string);
5307 /* Inner function to process the attribute((target(...))), take an argument and
5308 set the current options from the argument. If we have a list, recursively go
5309 over the list. */
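/* Illustrative example (hypothetical declaration): something like

     int foo (int) __attribute__((target("arch=haswell,no-avx2,fpmath=sse")));

   reaches this function with ARGS holding the STRING_CST
   "arch=haswell,no-avx2,fpmath=sse".  The string is split at commas; a
   "no-" prefix negates the option, "arch=" and "tune=" are handled as
   string options, "fpmath=" as an enum option, and the remaining names
   toggle individual ISA or flag options from the table below.  */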
5311 static bool
5312 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5313 struct gcc_options *opts,
5314 struct gcc_options *opts_set,
5315 struct gcc_options *enum_opts_set)
5317 char *next_optstr;
5318 bool ret = true;
5320 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5321 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5322 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5323 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5324 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
5326 enum ix86_opt_type
5328 ix86_opt_unknown,
5329 ix86_opt_yes,
5330 ix86_opt_no,
5331 ix86_opt_str,
5332 ix86_opt_enum,
5333 ix86_opt_isa
5336 static const struct
5338 const char *string;
5339 size_t len;
5340 enum ix86_opt_type type;
5341 int opt;
5342 int mask;
5343 } attrs[] = {
5344 /* isa options */
5345 IX86_ATTR_ISA ("pconfig", OPT_mpconfig),
5346 IX86_ATTR_ISA ("wbnoinvd", OPT_mwbnoinvd),
5347 IX86_ATTR_ISA ("sgx", OPT_msgx),
5348 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5349 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5350 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5351 IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5352 IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
5353 IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg),
5355 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5356 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5357 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5358 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5359 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5360 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5361 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5362 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5363 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5364 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5365 IX86_ATTR_ISA ("fma", OPT_mfma),
5366 IX86_ATTR_ISA ("xop", OPT_mxop),
5367 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5368 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5369 IX86_ATTR_ISA ("avx", OPT_mavx),
5370 IX86_ATTR_ISA ("sse4", OPT_msse4),
5371 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5372 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5373 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5374 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5375 IX86_ATTR_ISA ("sse3", OPT_msse3),
5376 IX86_ATTR_ISA ("aes", OPT_maes),
5377 IX86_ATTR_ISA ("sha", OPT_msha),
5378 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5379 IX86_ATTR_ISA ("sse2", OPT_msse2),
5380 IX86_ATTR_ISA ("sse", OPT_msse),
5381 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5382 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5383 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5384 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5385 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5386 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5387 IX86_ATTR_ISA ("adx", OPT_madx),
5388 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5389 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5390 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5391 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5392 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5393 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5394 IX86_ATTR_ISA ("abm", OPT_mabm),
5395 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5396 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5397 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5398 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5399 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5400 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5401 IX86_ATTR_ISA ("sahf", OPT_msahf),
5402 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5403 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5404 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5405 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5406 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5407 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5408 IX86_ATTR_ISA ("pku", OPT_mpku),
5409 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5410 IX86_ATTR_ISA ("hle", OPT_mhle),
5411 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5412 IX86_ATTR_ISA ("mpx", OPT_mmpx),
5413 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5414 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5415 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5416 IX86_ATTR_ISA ("ibt", OPT_mibt),
5417 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5418 IX86_ATTR_ISA ("vaes", OPT_mvaes),
5419 IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq),
5421 /* enum options */
5422 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5424 /* string options */
5425 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5426 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5428 /* flag options */
5429 IX86_ATTR_YES ("cld",
5430 OPT_mcld,
5431 MASK_CLD),
5433 IX86_ATTR_NO ("fancy-math-387",
5434 OPT_mfancy_math_387,
5435 MASK_NO_FANCY_MATH_387),
5437 IX86_ATTR_YES ("ieee-fp",
5438 OPT_mieee_fp,
5439 MASK_IEEE_FP),
5441 IX86_ATTR_YES ("inline-all-stringops",
5442 OPT_minline_all_stringops,
5443 MASK_INLINE_ALL_STRINGOPS),
5445 IX86_ATTR_YES ("inline-stringops-dynamically",
5446 OPT_minline_stringops_dynamically,
5447 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5449 IX86_ATTR_NO ("align-stringops",
5450 OPT_mno_align_stringops,
5451 MASK_NO_ALIGN_STRINGOPS),
5453 IX86_ATTR_YES ("recip",
5454 OPT_mrecip,
5455 MASK_RECIP),
5459 /* If this is a list, recurse to get the options. */
5460 if (TREE_CODE (args) == TREE_LIST)
5462 bool ret = true;
5464 for (; args; args = TREE_CHAIN (args))
5465 if (TREE_VALUE (args)
5466 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5467 p_strings, opts, opts_set,
5468 enum_opts_set))
5469 ret = false;
5471 return ret;
5474 else if (TREE_CODE (args) != STRING_CST)
5476 error ("attribute %<target%> argument not a string");
5477 return false;
5480 /* Handle multiple arguments separated by commas. */
5481 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5483 while (next_optstr && *next_optstr != '\0')
5485 char *p = next_optstr;
5486 char *orig_p = p;
5487 char *comma = strchr (next_optstr, ',');
5488 const char *opt_string;
5489 size_t len, opt_len;
5490 int opt;
5491 bool opt_set_p;
5492 char ch;
5493 unsigned i;
5494 enum ix86_opt_type type = ix86_opt_unknown;
5495 int mask = 0;
5497 if (comma)
5499 *comma = '\0';
5500 len = comma - next_optstr;
5501 next_optstr = comma + 1;
5503 else
5505 len = strlen (p);
5506 next_optstr = NULL;
5509 /* Recognize no-xxx. */
5510 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5512 opt_set_p = false;
5513 p += 3;
5514 len -= 3;
5516 else
5517 opt_set_p = true;
5519 /* Find the option. */
5520 ch = *p;
5521 opt = N_OPTS;
5522 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5524 type = attrs[i].type;
5525 opt_len = attrs[i].len;
5526 if (ch == attrs[i].string[0]
5527 && ((type != ix86_opt_str && type != ix86_opt_enum)
5528 ? len == opt_len
5529 : len > opt_len)
5530 && memcmp (p, attrs[i].string, opt_len) == 0)
5532 opt = attrs[i].opt;
5533 mask = attrs[i].mask;
5534 opt_string = attrs[i].string;
5535 break;
5539 /* Process the option. */
5540 if (opt == N_OPTS)
5542 error ("attribute(target(\"%s\")) is unknown", orig_p);
5543 ret = false;
5546 else if (type == ix86_opt_isa)
5548 struct cl_decoded_option decoded;
5550 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5551 ix86_handle_option (opts, opts_set,
5552 &decoded, input_location);
5555 else if (type == ix86_opt_yes || type == ix86_opt_no)
5557 if (type == ix86_opt_no)
5558 opt_set_p = !opt_set_p;
5560 if (opt_set_p)
5561 opts->x_target_flags |= mask;
5562 else
5563 opts->x_target_flags &= ~mask;
5566 else if (type == ix86_opt_str)
5568 if (p_strings[opt])
5570 error ("option(\"%s\") was already specified", opt_string);
5571 ret = false;
5573 else
5574 p_strings[opt] = xstrdup (p + opt_len);
5577 else if (type == ix86_opt_enum)
5579 bool arg_ok;
5580 int value;
5582 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5583 if (arg_ok)
5584 set_option (opts, enum_opts_set, opt, value,
5585 p + opt_len, DK_UNSPECIFIED, input_location,
5586 global_dc);
5587 else
5589 error ("attribute(target(\"%s\")) is unknown", orig_p);
5590 ret = false;
5594 else
5595 gcc_unreachable ();
5598 return ret;
5601 /* Release allocated strings. */
5602 static void
5603 release_options_strings (char **option_strings)
5605 /* Free up memory allocated to hold the strings */
5606 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5607 free (option_strings[i]);
5610 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5612 tree
5613 ix86_valid_target_attribute_tree (tree args,
5614 struct gcc_options *opts,
5615 struct gcc_options *opts_set)
5617 const char *orig_arch_string = opts->x_ix86_arch_string;
5618 const char *orig_tune_string = opts->x_ix86_tune_string;
5619 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5620 int orig_tune_defaulted = ix86_tune_defaulted;
5621 int orig_arch_specified = ix86_arch_specified;
5622 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5623 tree t = NULL_TREE;
5624 struct cl_target_option *def
5625 = TREE_TARGET_OPTION (target_option_default_node);
5626 struct gcc_options enum_opts_set;
5628 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5630 /* Process each of the options on the chain. */
5631 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5632 opts_set, &enum_opts_set))
5633 return error_mark_node;
5635 /* If the changed options are different from the default, rerun
5636 ix86_option_override_internal, and then save the options away.
5637 The string options are attribute options, and will be undone
5638 when we copy the save structure. */
5639 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5640 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5641 || opts->x_target_flags != def->x_target_flags
5642 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5643 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5644 || enum_opts_set.x_ix86_fpmath)
5646 /* If we are using the default tune= or arch=, undo the string assigned,
5647 and use the default. */
5648 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5650 opts->x_ix86_arch_string
5651 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5653 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5654 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5655 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5656 | OPTION_MASK_ABI_64
5657 | OPTION_MASK_ABI_X32
5658 | OPTION_MASK_CODE16);
5659 opts->x_ix86_isa_flags2 = 0;
5661 else if (!orig_arch_specified)
5662 opts->x_ix86_arch_string = NULL;
5664 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5665 opts->x_ix86_tune_string
5666 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5667 else if (orig_tune_defaulted)
5668 opts->x_ix86_tune_string = NULL;
5670 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5671 if (enum_opts_set.x_ix86_fpmath)
5672 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5674 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5675 bool r = ix86_option_override_internal (false, opts, opts_set);
5676 if (!r)
5678 release_options_strings (option_strings);
5679 return error_mark_node;
5682 /* Add any builtin functions with the new isa if any. */
5683 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5685 /* Save the current options unless we are validating options for
5686 #pragma. */
5687 t = build_target_option_node (opts);
5689 opts->x_ix86_arch_string = orig_arch_string;
5690 opts->x_ix86_tune_string = orig_tune_string;
5691 opts_set->x_ix86_fpmath = orig_fpmath_set;
5693 release_options_strings (option_strings);
5696 return t;
5699 /* Hook to validate attribute((target("string"))). */
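/* Illustrative source-level usage (hypothetical function name):

     __attribute__((target("default"))) int dispatchee (void);
     __attribute__((target("avx2")))    int dispatchee (void);

   The "default" variant used by function multi-versioning is accepted
   below without modifying any options; any other string is parsed and
   turned into a DECL_FUNCTION_SPECIFIC_TARGET node for the function.  */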
5701 static bool
5702 ix86_valid_target_attribute_p (tree fndecl,
5703 tree ARG_UNUSED (name),
5704 tree args,
5705 int ARG_UNUSED (flags))
5707 struct gcc_options func_options;
5708 tree new_target, new_optimize;
5709 bool ret = true;
5711 /* attribute((target("default"))) does nothing, beyond
5712 affecting multi-versioning. */
5713 if (TREE_VALUE (args)
5714 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5715 && TREE_CHAIN (args) == NULL_TREE
5716 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5717 return true;
5719 tree old_optimize = build_optimization_node (&global_options);
5721 /* Get the optimization options of the current function. */
5722 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5724 if (!func_optimize)
5725 func_optimize = old_optimize;
5727 /* Init func_options. */
5728 memset (&func_options, 0, sizeof (func_options));
5729 init_options_struct (&func_options, NULL);
5730 lang_hooks.init_options_struct (&func_options);
5732 cl_optimization_restore (&func_options,
5733 TREE_OPTIMIZATION (func_optimize));
5735 /* Initialize func_options to the default before its target options can
5736 be set. */
5737 cl_target_option_restore (&func_options,
5738 TREE_TARGET_OPTION (target_option_default_node));
5740 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5741 &global_options_set);
5743 new_optimize = build_optimization_node (&func_options);
5745 if (new_target == error_mark_node)
5746 ret = false;
5748 else if (fndecl && new_target)
5750 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5752 if (old_optimize != new_optimize)
5753 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5756 finalize_options_struct (&func_options);
5758 return ret;
5762 /* Hook to determine if one function can safely inline another. */
5764 static bool
5765 ix86_can_inline_p (tree caller, tree callee)
5767 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5768 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5769 if (!callee_tree)
5770 callee_tree = target_option_default_node;
5771 if (!caller_tree)
5772 caller_tree = target_option_default_node;
5773 if (callee_tree == caller_tree)
5774 return true;
5776 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5777 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5778 bool ret = false;
5780 /* Callee's isa options should be a subset of the caller's, i.e. a SSE4
5781 function can inline a SSE2 function but a SSE2 function can't inline
5782 a SSE4 function. */
5783 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5784 != callee_opts->x_ix86_isa_flags)
5785 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5786 != callee_opts->x_ix86_isa_flags2))
5787 ret = false;
5789 /* See if we have the same non-isa options. */
5790 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
5791 ret = false;
5793 /* See if arch, tune, etc. are the same. */
5794 else if (caller_opts->arch != callee_opts->arch)
5795 ret = false;
5797 else if (caller_opts->tune != callee_opts->tune)
5798 ret = false;
5800 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5801 /* If the callee doesn't use FP expressions, differences in
5802 ix86_fpmath can be ignored. We are called from FEs
5803 for multi-versioning call optimization, so beware that
5804 ipa_fn_summaries may not be available. */
5805 && (! ipa_fn_summaries
5806 || ipa_fn_summaries->get
5807 (cgraph_node::get (callee))->fp_expressions))
5808 ret = false;
5810 else if (caller_opts->branch_cost != callee_opts->branch_cost)
5811 ret = false;
5813 else
5814 ret = true;
5816 return ret;
5820 /* Remember the last target of ix86_set_current_function. */
5821 static GTY(()) tree ix86_previous_fndecl;
5823 /* Set targets globals to the default (or current #pragma GCC target
5824 if active). Invalidate ix86_previous_fndecl cache. */
5826 void
5827 ix86_reset_previous_fndecl (void)
5829 tree new_tree = target_option_current_node;
5830 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5831 if (TREE_TARGET_GLOBALS (new_tree))
5832 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5833 else if (new_tree == target_option_default_node)
5834 restore_target_globals (&default_target_globals);
5835 else
5836 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5837 ix86_previous_fndecl = NULL_TREE;
5840 /* Set the func_type field from the function FNDECL. */
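/* Illustrative usage; uword_t and struct interrupt_frame come from the
   documented x86 interrupt-handler interface, the function names are
   hypothetical:

     void __attribute__((interrupt))
     my_isr (struct interrupt_frame *frame);

     void __attribute__((interrupt))
     my_fault (struct interrupt_frame *frame, uword_t error_code);

   The one-argument form is classified as TYPE_INTERRUPT and the
   two-argument form (with an error code) as TYPE_EXCEPTION below.  */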
5842 static void
5843 ix86_set_func_type (tree fndecl)
5845 if (cfun->machine->func_type == TYPE_UNKNOWN)
5847 if (lookup_attribute ("interrupt",
5848 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5850 if (ix86_function_naked (fndecl))
5851 error_at (DECL_SOURCE_LOCATION (fndecl),
5852 "interrupt and naked attributes are not compatible");
5854 int nargs = 0;
5855 for (tree arg = DECL_ARGUMENTS (fndecl);
5856 arg;
5857 arg = TREE_CHAIN (arg))
5858 nargs++;
5859 cfun->machine->no_caller_saved_registers = true;
5860 cfun->machine->func_type
5861 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5863 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5865 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5866 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5867 sorry ("Only DWARF debug format is supported for interrupt "
5868 "service routine.");
5870 else
5872 cfun->machine->func_type = TYPE_NORMAL;
5873 if (lookup_attribute ("no_caller_saved_registers",
5874 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5875 cfun->machine->no_caller_saved_registers = true;
5880 /* Set the indirect_branch_type field from the function FNDECL. */
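/* Illustrative per-function overrides of -mindirect-branch= and
   -mfunction-return= (hypothetical declarations):

     void __attribute__((indirect_branch("thunk-extern"))) f (void);
     void __attribute__((function_return("keep"))) g (void);

   Accepted values for both attributes are "keep", "thunk", "thunk-inline"
   and "thunk-extern", as parsed below.  */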
5882 static void
5883 ix86_set_indirect_branch_type (tree fndecl)
5885 if (cfun->machine->indirect_branch_type == indirect_branch_unset)
5887 tree attr = lookup_attribute ("indirect_branch",
5888 DECL_ATTRIBUTES (fndecl));
5889 if (attr != NULL)
5891 tree args = TREE_VALUE (attr);
5892 if (args == NULL)
5893 gcc_unreachable ();
5894 tree cst = TREE_VALUE (args);
5895 if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5896 cfun->machine->indirect_branch_type = indirect_branch_keep;
5897 else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5898 cfun->machine->indirect_branch_type = indirect_branch_thunk;
5899 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5900 cfun->machine->indirect_branch_type = indirect_branch_thunk_inline;
5901 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5902 cfun->machine->indirect_branch_type = indirect_branch_thunk_extern;
5903 else
5904 gcc_unreachable ();
5906 else
5907 cfun->machine->indirect_branch_type = ix86_indirect_branch;
5909 /* -mcmodel=large is not compatible with -mindirect-branch=thunk
5910 nor -mindirect-branch=thunk-extern. */
5911 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5912 && ((cfun->machine->indirect_branch_type
5913 == indirect_branch_thunk_extern)
5914 || (cfun->machine->indirect_branch_type
5915 == indirect_branch_thunk)))
5916 error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not "
5917 "compatible",
5918 ((cfun->machine->indirect_branch_type
5919 == indirect_branch_thunk_extern)
5920 ? "thunk-extern" : "thunk"));
5922 /* -mindirect-branch=thunk-extern, -fcf-protection=branch and
5923 -fcheck-pointer-bounds are not compatible. */
5924 if ((cfun->machine->indirect_branch_type
5925 == indirect_branch_thunk_extern)
5926 && flag_check_pointer_bounds
5927 && (flag_cf_protection & CF_BRANCH) != 0)
5928 error ("%<-mindirect-branch=thunk-extern%>, "
5929 "%<-fcf-protection=branch%> and "
5930 "%<-fcheck-pointer-bounds%> are not compatible");
5933 if (cfun->machine->function_return_type == indirect_branch_unset)
5935 tree attr = lookup_attribute ("function_return",
5936 DECL_ATTRIBUTES (fndecl));
5937 if (attr != NULL)
5939 tree args = TREE_VALUE (attr);
5940 if (args == NULL)
5941 gcc_unreachable ();
5942 tree cst = TREE_VALUE (args);
5943 if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5944 cfun->machine->function_return_type = indirect_branch_keep;
5945 else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5946 cfun->machine->function_return_type = indirect_branch_thunk;
5947 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5948 cfun->machine->function_return_type = indirect_branch_thunk_inline;
5949 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5950 cfun->machine->function_return_type = indirect_branch_thunk_extern;
5951 else
5952 gcc_unreachable ();
5954 else
5955 cfun->machine->function_return_type = ix86_function_return;
5957 /* -mcmodel=large is not compatible with -mfunction-return=thunk
5958 nor -mfunction-return=thunk-extern. */
5959 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5960 && ((cfun->machine->function_return_type
5961 == indirect_branch_thunk_extern)
5962 || (cfun->machine->function_return_type
5963 == indirect_branch_thunk)))
5964 error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not "
5965 "compatible",
5966 ((cfun->machine->function_return_type
5967 == indirect_branch_thunk_extern)
5968 ? "thunk-extern" : "thunk"));
5972 /* Establish appropriate back-end context for processing the function
5973 FNDECL. The argument might be NULL to indicate processing at top
5974 level, outside of any function scope. */
5975 static void
5976 ix86_set_current_function (tree fndecl)
5978 /* Only change the context if the function changes. This hook is called
5979 several times in the course of compiling a function, and we don't want to
5980 slow things down too much or call target_reinit when it isn't safe. */
5981 if (fndecl == ix86_previous_fndecl)
5983 /* There may be 2 function bodies for the same function FNDECL,
5984 one is extern inline and one isn't. Call ix86_set_func_type
5985 to set the func_type field. */
5986 if (fndecl != NULL_TREE)
5988 ix86_set_func_type (fndecl);
5989 ix86_set_indirect_branch_type (fndecl);
5991 return;
5994 tree old_tree;
5995 if (ix86_previous_fndecl == NULL_TREE)
5996 old_tree = target_option_current_node;
5997 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5998 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5999 else
6000 old_tree = target_option_default_node;
6002 if (fndecl == NULL_TREE)
6004 if (old_tree != target_option_current_node)
6005 ix86_reset_previous_fndecl ();
6006 return;
6009 ix86_set_func_type (fndecl);
6010 ix86_set_indirect_branch_type (fndecl);
6012 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
6013 if (new_tree == NULL_TREE)
6014 new_tree = target_option_default_node;
6016 if (old_tree != new_tree)
6018 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
6019 if (TREE_TARGET_GLOBALS (new_tree))
6020 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
6021 else if (new_tree == target_option_default_node)
6022 restore_target_globals (&default_target_globals);
6023 else
6024 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
6026 ix86_previous_fndecl = fndecl;
6028 static bool prev_no_caller_saved_registers;
6030 /* 64-bit MS and SYSV ABI have different set of call used registers.
6031 Avoid expensive re-initialization of init_regs each time we switch
6032 function context. */
6033 if (TARGET_64BIT
6034 && (call_used_regs[SI_REG]
6035 == (cfun->machine->call_abi == MS_ABI)))
6036 reinit_regs ();
6037 /* Need to re-initialize init_regs if caller-saved registers are
6038 changed. */
6039 else if (prev_no_caller_saved_registers
6040 != cfun->machine->no_caller_saved_registers)
6041 reinit_regs ();
6043 if (cfun->machine->func_type != TYPE_NORMAL
6044 || cfun->machine->no_caller_saved_registers)
6046 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
6047 may change processor state. */
6048 const char *isa;
6049 if (TARGET_MPX)
6050 isa = "MPX";
6051 else if (TARGET_SSE)
6052 isa = "SSE";
6053 else if (TARGET_MMX)
6054 isa = "MMX/3Dnow";
6055 else if (TARGET_80387)
6056 isa = "80387";
6057 else
6058 isa = NULL;
6059 if (isa != NULL)
6061 if (cfun->machine->func_type != TYPE_NORMAL)
6062 sorry ("%s instructions aren't allowed in %s service routine",
6063 isa, (cfun->machine->func_type == TYPE_EXCEPTION
6064 ? "exception" : "interrupt"));
6065 else
6066 sorry ("%s instructions aren't allowed in function with "
6067 "no_caller_saved_registers attribute", isa);
6068 /* Don't issue the same error twice. */
6069 cfun->machine->func_type = TYPE_NORMAL;
6070 cfun->machine->no_caller_saved_registers = false;
6074 prev_no_caller_saved_registers
6075 = cfun->machine->no_caller_saved_registers;
6079 /* Return true if this goes in large data/bss. */
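/* For example, with -mcmodel=medium a global object larger than
   -mlarge-data-threshold (ix86_section_threshold) is treated as large data
   and placed in .ldata/.lbss, while smaller objects stay in the regular
   .data/.bss sections.  */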
6081 static bool
6082 ix86_in_large_data_p (tree exp)
6084 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
6085 return false;
6087 if (exp == NULL_TREE)
6088 return false;
6090 /* Functions are never large data. */
6091 if (TREE_CODE (exp) == FUNCTION_DECL)
6092 return false;
6094 /* Automatic variables are never large data. */
6095 if (VAR_P (exp) && !is_global_var (exp))
6096 return false;
6098 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
6100 const char *section = DECL_SECTION_NAME (exp);
6101 if (strcmp (section, ".ldata") == 0
6102 || strcmp (section, ".lbss") == 0)
6103 return true;
6104 return false;
6106 else
6108 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
6110 /* If this is an incomplete type with size 0, then we can't put it
6111 in data because it might be too big when completed. Also,
6112 int_size_in_bytes returns -1 if the size can vary or is larger than
6113 an integer, in which case it is also safer to assume that it goes in
6114 large data. */
6115 if (size <= 0 || size > ix86_section_threshold)
6116 return true;
6119 return false;
6122 /* i386-specific section flag to mark large sections. */
6123 #define SECTION_LARGE SECTION_MACH_DEP
6125 /* Switch to the appropriate section for output of DECL.
6126 DECL is either a `VAR_DECL' node or a constant of some sort.
6127 RELOC indicates whether forming the initial value of DECL requires
6128 link-time relocations. */
6130 ATTRIBUTE_UNUSED static section *
6131 x86_64_elf_select_section (tree decl, int reloc,
6132 unsigned HOST_WIDE_INT align)
6134 if (ix86_in_large_data_p (decl))
6136 const char *sname = NULL;
6137 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
6138 switch (categorize_decl_for_section (decl, reloc))
6140 case SECCAT_DATA:
6141 sname = ".ldata";
6142 break;
6143 case SECCAT_DATA_REL:
6144 sname = ".ldata.rel";
6145 break;
6146 case SECCAT_DATA_REL_LOCAL:
6147 sname = ".ldata.rel.local";
6148 break;
6149 case SECCAT_DATA_REL_RO:
6150 sname = ".ldata.rel.ro";
6151 break;
6152 case SECCAT_DATA_REL_RO_LOCAL:
6153 sname = ".ldata.rel.ro.local";
6154 break;
6155 case SECCAT_BSS:
6156 sname = ".lbss";
6157 flags |= SECTION_BSS;
6158 break;
6159 case SECCAT_RODATA:
6160 case SECCAT_RODATA_MERGE_STR:
6161 case SECCAT_RODATA_MERGE_STR_INIT:
6162 case SECCAT_RODATA_MERGE_CONST:
6163 sname = ".lrodata";
6164 flags &= ~SECTION_WRITE;
6165 break;
6166 case SECCAT_SRODATA:
6167 case SECCAT_SDATA:
6168 case SECCAT_SBSS:
6169 gcc_unreachable ();
6170 case SECCAT_TEXT:
6171 case SECCAT_TDATA:
6172 case SECCAT_TBSS:
6173 /* We don't split these for the medium model. Place them into
6174 default sections and hope for the best. */
6175 break;
6177 if (sname)
6179 /* We might get called with string constants, but get_named_section
6180 doesn't like them as they are not DECLs. Also, we need to set
6181 flags in that case. */
6182 if (!DECL_P (decl))
6183 return get_section (sname, flags, NULL);
6184 return get_named_section (decl, sname, reloc);
6187 return default_elf_select_section (decl, reloc, align);
6190 /* Select a set of attributes for section NAME based on the properties
6191 of DECL and whether or not RELOC indicates that DECL's initializer
6192 might contain runtime relocations. */
6194 static unsigned int ATTRIBUTE_UNUSED
6195 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6197 unsigned int flags = default_section_type_flags (decl, name, reloc);
6199 if (ix86_in_large_data_p (decl))
6200 flags |= SECTION_LARGE;
6202 if (decl == NULL_TREE
6203 && (strcmp (name, ".ldata.rel.ro") == 0
6204 || strcmp (name, ".ldata.rel.ro.local") == 0))
6205 flags |= SECTION_RELRO;
6207 if (strcmp (name, ".lbss") == 0
6208 || strncmp (name, ".lbss.", 6) == 0
6209 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
6210 flags |= SECTION_BSS;
6212 return flags;
6215 /* Build up a unique section name, expressed as a
6216 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6217 RELOC indicates whether the initial value of EXP requires
6218 link-time relocations. */
6220 static void ATTRIBUTE_UNUSED
6221 x86_64_elf_unique_section (tree decl, int reloc)
6223 if (ix86_in_large_data_p (decl))
6225 const char *prefix = NULL;
6226 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6227 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6229 switch (categorize_decl_for_section (decl, reloc))
6231 case SECCAT_DATA:
6232 case SECCAT_DATA_REL:
6233 case SECCAT_DATA_REL_LOCAL:
6234 case SECCAT_DATA_REL_RO:
6235 case SECCAT_DATA_REL_RO_LOCAL:
6236 prefix = one_only ? ".ld" : ".ldata";
6237 break;
6238 case SECCAT_BSS:
6239 prefix = one_only ? ".lb" : ".lbss";
6240 break;
6241 case SECCAT_RODATA:
6242 case SECCAT_RODATA_MERGE_STR:
6243 case SECCAT_RODATA_MERGE_STR_INIT:
6244 case SECCAT_RODATA_MERGE_CONST:
6245 prefix = one_only ? ".lr" : ".lrodata";
6246 break;
6247 case SECCAT_SRODATA:
6248 case SECCAT_SDATA:
6249 case SECCAT_SBSS:
6250 gcc_unreachable ();
6251 case SECCAT_TEXT:
6252 case SECCAT_TDATA:
6253 case SECCAT_TBSS:
6254 /* We don't split these for the medium model. Place them into
6255 default sections and hope for the best. */
6256 break;
6258 if (prefix)
6260 const char *name, *linkonce;
6261 char *string;
6263 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6264 name = targetm.strip_name_encoding (name);
6266 /* If we're using one_only, then there needs to be a .gnu.linkonce
6267 prefix to the section name. */
6268 linkonce = one_only ? ".gnu.linkonce" : "";
6270 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6272 set_decl_section_name (decl, string);
6273 return;
6276 default_unique_section (decl, reloc);
6279 #ifdef COMMON_ASM_OP
6281 #ifndef LARGECOMM_SECTION_ASM_OP
6282 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6283 #endif
6285 /* This says how to output assembler code to declare an
6286 uninitialized external linkage data object.
6288 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive
6289 for large objects. */
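/* Illustrative output (symbol name, size and alignment are hypothetical):
   for a large common symbol under -mcmodel=medium this emits

     .largecomm	big_buf,1048576,32

   whereas small objects use the ordinary .comm directive.  */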
6290 void
6291 x86_elf_aligned_decl_common (FILE *file, tree decl,
6292 const char *name, unsigned HOST_WIDE_INT size,
6293 int align)
6295 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6296 && size > (unsigned int)ix86_section_threshold)
6298 switch_to_section (get_named_section (decl, ".lbss", 0));
6299 fputs (LARGECOMM_SECTION_ASM_OP, file);
6301 else
6302 fputs (COMMON_ASM_OP, file);
6303 assemble_name (file, name);
6304 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6305 size, align / BITS_PER_UNIT);
6307 #endif
6309 /* Utility function for targets to use in implementing
6310 ASM_OUTPUT_ALIGNED_BSS. */
6312 void
6313 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6314 unsigned HOST_WIDE_INT size, int align)
6316 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6317 && size > (unsigned int)ix86_section_threshold)
6318 switch_to_section (get_named_section (decl, ".lbss", 0));
6319 else
6320 switch_to_section (bss_section);
6321 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6322 #ifdef ASM_DECLARE_OBJECT_NAME
6323 last_assemble_variable_decl = decl;
6324 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6325 #else
6326 /* Standard thing is just output label for the object. */
6327 ASM_OUTPUT_LABEL (file, name);
6328 #endif /* ASM_DECLARE_OBJECT_NAME */
6329 ASM_OUTPUT_SKIP (file, size ? size : 1);
6332 /* Decide whether we must probe the stack before any space allocation
6333 on this target. It's essentially TARGET_STACK_PROBE except when
6334 -fstack-check causes the stack to be already probed differently. */
6336 bool
6337 ix86_target_stack_probe (void)
6339 /* Do not probe the stack twice if static stack checking is enabled. */
6340 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6341 return false;
6343 return TARGET_STACK_PROBE;
6346 /* Decide whether we can make a sibling call to a function. DECL is the
6347 declaration of the function being targeted by the call and EXP is the
6348 CALL_EXPR representing the call. */
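/* For example, in 32-bit PIC code with -fplt a tail call to a global
   function is not turned into a sibcall, because the call through the PLT
   needs %ebx to hold the GOT pointer; likewise an MS-ABI caller never
   sibcalls a SYSV-ABI callee, since SYSV clobbers more registers.  */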
6350 static bool
6351 ix86_function_ok_for_sibcall (tree decl, tree exp)
6353 tree type, decl_or_type;
6354 rtx a, b;
6355 bool bind_global = decl && !targetm.binds_local_p (decl);
6357 if (ix86_function_naked (current_function_decl))
6358 return false;
6360 /* Sibling call isn't OK if there are no caller-saved registers
6361 since all registers must be preserved before return. */
6362 if (cfun->machine->no_caller_saved_registers)
6363 return false;
6365 /* If we are generating position-independent code, we cannot sibcall
6366 optimize direct calls to global functions, as the PLT requires
6367 %ebx be live. (Darwin does not have a PLT.) */
6368 if (!TARGET_MACHO
6369 && !TARGET_64BIT
6370 && flag_pic
6371 && flag_plt
6372 && bind_global)
6373 return false;
6375 /* If we need to align the outgoing stack, then sibcalling would
6376 unalign the stack, which may break the called function. */
6377 if (ix86_minimum_incoming_stack_boundary (true)
6378 < PREFERRED_STACK_BOUNDARY)
6379 return false;
6381 if (decl)
6383 decl_or_type = decl;
6384 type = TREE_TYPE (decl);
6386 else
6388 /* We're looking at the CALL_EXPR, we need the type of the function. */
6389 type = CALL_EXPR_FN (exp); /* pointer expression */
6390 type = TREE_TYPE (type); /* pointer type */
6391 type = TREE_TYPE (type); /* function type */
6392 decl_or_type = type;
6395 /* Check that the return value locations are the same. Like
6396 if we are returning floats on the 80387 register stack, we cannot
6397 make a sibcall from a function that doesn't return a float to a
6398 function that does or, conversely, from a function that does return
6399 a float to a function that doesn't; the necessary stack adjustment
6400 would not be executed. This is also the place we notice
6401 differences in the return value ABI. Note that it is ok for one
6402 of the functions to have void return type as long as the return
6403 value of the other is passed in a register. */
6404 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6405 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6406 cfun->decl, false);
6407 if (STACK_REG_P (a) || STACK_REG_P (b))
6409 if (!rtx_equal_p (a, b))
6410 return false;
6412 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6414 else if (!rtx_equal_p (a, b))
6415 return false;
6417 if (TARGET_64BIT)
6419 /* The SYSV ABI has more call-clobbered registers;
6420 disallow sibcalls from MS to SYSV. */
6421 if (cfun->machine->call_abi == MS_ABI
6422 && ix86_function_type_abi (type) == SYSV_ABI)
6423 return false;
6425 else
6427 /* If this call is indirect, we'll need to be able to use a
6428 call-clobbered register for the address of the target function.
6429 Make sure that all such registers are not used for passing
6430 parameters. Note that DLLIMPORT functions and calls to global
6431 functions via the GOT slot are indirect. */
6432 if (!decl
6433 || (bind_global && flag_pic && !flag_plt)
6434 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl))
6435 || flag_force_indirect_call)
6437 /* Check if regparm >= 3 since arg_reg_available is set to
6438 false if regparm == 0. If regparm is 1 or 2, there is
6439 always a call-clobbered register available.
6441 ??? The symbol indirect call doesn't need a call-clobbered
6442 register. But we don't know if this is a symbol indirect
6443 call or not here. */
6444 if (ix86_function_regparm (type, decl) >= 3
6445 && !cfun->machine->arg_reg_available)
6446 return false;
6450 /* Otherwise okay. That also includes certain types of indirect calls. */
6451 return true;
6454 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6455 and "sseregparm" calling convention attributes;
6456 arguments as in struct attribute_spec.handler. */
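/* Illustrative 32-bit declarations handled here (hypothetical names):

     int __attribute__((regparm(3))) f (int, int, int);
     int __attribute__((fastcall))   g (int, int);
     int __attribute__((stdcall))    h (int);

   Incompatible combinations, e.g. fastcall together with regparm on the
   same type, are diagnosed below.  */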
6458 static tree
6459 ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
6460 bool *no_add_attrs)
6462 if (TREE_CODE (*node) != FUNCTION_TYPE
6463 && TREE_CODE (*node) != METHOD_TYPE
6464 && TREE_CODE (*node) != FIELD_DECL
6465 && TREE_CODE (*node) != TYPE_DECL)
6467 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6468 name);
6469 *no_add_attrs = true;
6470 return NULL_TREE;
6473 /* Can combine regparm with all attributes but fastcall, and thiscall. */
6474 if (is_attribute_p ("regparm", name))
6476 tree cst;
6478 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6480 error ("fastcall and regparm attributes are not compatible");
6483 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6485 error ("regparm and thiscall attributes are not compatible");
6488 cst = TREE_VALUE (args);
6489 if (TREE_CODE (cst) != INTEGER_CST)
6491 warning (OPT_Wattributes,
6492 "%qE attribute requires an integer constant argument",
6493 name);
6494 *no_add_attrs = true;
6496 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6498 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6499 name, REGPARM_MAX);
6500 *no_add_attrs = true;
6503 return NULL_TREE;
6506 if (TARGET_64BIT)
6508 /* Do not warn when emulating the MS ABI. */
6509 if ((TREE_CODE (*node) != FUNCTION_TYPE
6510 && TREE_CODE (*node) != METHOD_TYPE)
6511 || ix86_function_type_abi (*node) != MS_ABI)
6512 warning (OPT_Wattributes, "%qE attribute ignored",
6513 name);
6514 *no_add_attrs = true;
6515 return NULL_TREE;
6518 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6519 if (is_attribute_p ("fastcall", name))
6521 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6523 error ("fastcall and cdecl attributes are not compatible");
6525 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6527 error ("fastcall and stdcall attributes are not compatible");
6529 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6531 error ("fastcall and regparm attributes are not compatible");
6533 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6535 error ("fastcall and thiscall attributes are not compatible");
6539 /* Can combine stdcall with fastcall (redundant), regparm and
6540 sseregparm. */
6541 else if (is_attribute_p ("stdcall", name))
6543 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6545 error ("stdcall and cdecl attributes are not compatible");
6547 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6549 error ("stdcall and fastcall attributes are not compatible");
6551 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6553 error ("stdcall and thiscall attributes are not compatible");
6557 /* Can combine cdecl with regparm and sseregparm. */
6558 else if (is_attribute_p ("cdecl", name))
6560 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6562 error ("stdcall and cdecl attributes are not compatible");
6564 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6566 error ("fastcall and cdecl attributes are not compatible");
6568 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6570 error ("cdecl and thiscall attributes are not compatible");
6573 else if (is_attribute_p ("thiscall", name))
6575 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6576 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6577 name);
6578 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6580 error ("stdcall and thiscall attributes are not compatible");
6582 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6584 error ("fastcall and thiscall attributes are not compatible");
6586 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6588 error ("cdecl and thiscall attributes are not compatible");
6592 /* Can combine sseregparm with all attributes. */
6594 return NULL_TREE;
6597 /* The transactional memory builtins are implicitly regparm or fastcall
6598 depending on the ABI. Override the generic do-nothing attribute that
6599 these builtins were declared with, and replace it with one of the two
6600 attributes that we expect elsewhere. */
6602 static tree
6603 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6604 int flags, bool *no_add_attrs)
6606 tree alt;
6608 /* In no case do we want to add the placeholder attribute. */
6609 *no_add_attrs = true;
6611 /* The 64-bit ABI is unchanged for transactional memory. */
6612 if (TARGET_64BIT)
6613 return NULL_TREE;
6615 /* ??? Is there a better way to validate 32-bit windows? We have
6616 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6617 if (CHECK_STACK_LIMIT > 0)
6618 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6619 else
6621 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6622 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6624 decl_attributes (node, alt, flags);
6626 return NULL_TREE;
6629 /* This function determines from TYPE the calling-convention. */
6631 unsigned int
6632 ix86_get_callcvt (const_tree type)
6634 unsigned int ret = 0;
6635 bool is_stdarg;
6636 tree attrs;
6638 if (TARGET_64BIT)
6639 return IX86_CALLCVT_CDECL;
6641 attrs = TYPE_ATTRIBUTES (type);
6642 if (attrs != NULL_TREE)
6644 if (lookup_attribute ("cdecl", attrs))
6645 ret |= IX86_CALLCVT_CDECL;
6646 else if (lookup_attribute ("stdcall", attrs))
6647 ret |= IX86_CALLCVT_STDCALL;
6648 else if (lookup_attribute ("fastcall", attrs))
6649 ret |= IX86_CALLCVT_FASTCALL;
6650 else if (lookup_attribute ("thiscall", attrs))
6651 ret |= IX86_CALLCVT_THISCALL;
6653 /* Regparm isn't allowed for thiscall and fastcall. */
6654 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6656 if (lookup_attribute ("regparm", attrs))
6657 ret |= IX86_CALLCVT_REGPARM;
6658 if (lookup_attribute ("sseregparm", attrs))
6659 ret |= IX86_CALLCVT_SSEREGPARM;
6662 if (IX86_BASE_CALLCVT(ret) != 0)
6663 return ret;
6666 is_stdarg = stdarg_p (type);
6667 if (TARGET_RTD && !is_stdarg)
6668 return IX86_CALLCVT_STDCALL | ret;
6670 if (ret != 0
6671 || is_stdarg
6672 || TREE_CODE (type) != METHOD_TYPE
6673 || ix86_function_type_abi (type) != MS_ABI)
6674 return IX86_CALLCVT_CDECL | ret;
6676 return IX86_CALLCVT_THISCALL;
6679 /* Return 0 if the attributes for two types are incompatible, 1 if they
6680 are compatible, and 2 if they are nearly compatible (which causes a
6681 warning to be generated). */
6683 static int
6684 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6686 unsigned int ccvt1, ccvt2;
6688 if (TREE_CODE (type1) != FUNCTION_TYPE
6689 && TREE_CODE (type1) != METHOD_TYPE)
6690 return 1;
6692 ccvt1 = ix86_get_callcvt (type1);
6693 ccvt2 = ix86_get_callcvt (type2);
6694 if (ccvt1 != ccvt2)
6695 return 0;
6696 if (ix86_function_regparm (type1, NULL)
6697 != ix86_function_regparm (type2, NULL))
6698 return 0;
6700 return 1;
6703 /* Return the regparm value for a function with the indicated TYPE and DECL.
6704 DECL may be NULL when calling function indirectly
6705 or considering a libcall. */
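/* For example, on 32-bit targets a function that is local to the
   compilation unit, whose signature may be changed and which is optimized,
   can be promoted below to pass up to three integer arguments in %eax,
   %edx and %ecx even without an explicit __attribute__((regparm(3))).  */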
6707 static int
6708 ix86_function_regparm (const_tree type, const_tree decl)
6710 tree attr;
6711 int regparm;
6712 unsigned int ccvt;
6714 if (TARGET_64BIT)
6715 return (ix86_function_type_abi (type) == SYSV_ABI
6716 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6717 ccvt = ix86_get_callcvt (type);
6718 regparm = ix86_regparm;
6720 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6722 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6723 if (attr)
6725 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6726 return regparm;
6729 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6730 return 2;
6731 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6732 return 1;
6734 /* Use register calling convention for local functions when possible. */
6735 if (decl
6736 && TREE_CODE (decl) == FUNCTION_DECL)
6738 cgraph_node *target = cgraph_node::get (decl);
6739 if (target)
6740 target = target->function_symbol ();
6742 /* Caller and callee must agree on the calling convention, so
6743 checking just the current function's optimize setting here would mean
6744 that with __attribute__((optimize (...))) the caller could use the
6745 regparm convention and the callee not, or vice versa. Instead look at
6746 whether the callee is optimized or not. */
6747 if (target && opt_for_fn (target->decl, optimize)
6748 && !(profile_flag && !flag_fentry))
6750 cgraph_local_info *i = &target->local;
6751 if (i && i->local && i->can_change_signature)
6753 int local_regparm, globals = 0, regno;
6755 /* Make sure no regparm register is taken by a
6756 fixed register variable. */
6757 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6758 local_regparm++)
6759 if (fixed_regs[local_regparm])
6760 break;
6762 /* We don't want to use regparm(3) for nested functions as
6763 these use a static chain pointer in the third argument. */
6764 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6765 local_regparm = 2;
6767 /* Save a register for the split stack. */
6768 if (flag_split_stack)
6770 if (local_regparm == 3)
6771 local_regparm = 2;
6772 else if (local_regparm == 2
6773 && DECL_STATIC_CHAIN (target->decl))
6774 local_regparm = 1;
6777 /* Each fixed register usage increases register pressure,
6778 so fewer registers should be used for argument passing.
6779 This functionality can be overridden by an explicit
6780 regparm value. */
6781 for (regno = AX_REG; regno <= DI_REG; regno++)
6782 if (fixed_regs[regno])
6783 globals++;
6785 local_regparm
6786 = globals < local_regparm ? local_regparm - globals : 0;
6788 if (local_regparm > regparm)
6789 regparm = local_regparm;
6794 return regparm;
6797 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6798 DFmode (2) arguments in SSE registers for a function with the
6799 indicated TYPE and DECL. DECL may be NULL when calling the function
6800 indirectly or considering a libcall. Return -1 if any FP parameter
6801 should be rejected by an error. This is used in situations where we
6802 imply the SSE calling convention but the function is called from
6803 another function with SSE disabled. Otherwise return 0. */
6805 static int
6806 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6808 gcc_assert (!TARGET_64BIT);
6810 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6811 by the sseregparm attribute. */
6812 if (TARGET_SSEREGPARM
6813 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6815 if (!TARGET_SSE)
6817 if (warn)
6819 if (decl)
6820 error ("calling %qD with attribute sseregparm without "
6821 "SSE/SSE2 enabled", decl);
6822 else
6823 error ("calling %qT with attribute sseregparm without "
6824 "SSE/SSE2 enabled", type);
6826 return 0;
6829 return 2;
6832 if (!decl)
6833 return 0;
6835 cgraph_node *target = cgraph_node::get (decl);
6836 if (target)
6837 target = target->function_symbol ();
6839 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6840 (and DFmode for SSE2) arguments in SSE registers. */
6841 if (target
6842 /* TARGET_SSE_MATH */
6843 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6844 && opt_for_fn (target->decl, optimize)
6845 && !(profile_flag && !flag_fentry))
6847 cgraph_local_info *i = &target->local;
6848 if (i && i->local && i->can_change_signature)
6850 /* Refuse to produce wrong code when a local function with SSE enabled
6851 is called from an SSE-disabled function.
6852 FIXME: We need a way to detect these cases across ltrans partitions
6853 and avoid using SSE calling conventions on local functions called
6854 from functions with SSE disabled. For now at least delay the
6855 warning until we know we are going to produce wrong code.
6856 See PR66047. */
6857 if (!TARGET_SSE && warn)
6858 return -1;
6859 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6860 ->x_ix86_isa_flags) ? 2 : 1;
6864 return 0;
6867 /* Return true if EAX is live at the start of the function. Used by
6868 ix86_expand_prologue to determine if we need special help before
6869 calling allocate_stack_worker. */
6871 static bool
6872 ix86_eax_live_at_start_p (void)
6874 /* Cheat. Don't bother working forward from ix86_function_regparm
6875 to the function type to whether an actual argument is located in
6876 eax. Instead just look at cfg info, which is still close enough
6877 to correct at this point. This gives false positives for broken
6878 functions that might use uninitialized data that happens to be
6879 allocated in eax, but who cares? */
6880 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6883 static bool
6884 ix86_keep_aggregate_return_pointer (tree fntype)
6886 tree attr;
6888 if (!TARGET_64BIT)
6890 attr = lookup_attribute ("callee_pop_aggregate_return",
6891 TYPE_ATTRIBUTES (fntype));
6892 if (attr)
6893 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6895 /* For 32-bit MS-ABI the default is to keep aggregate
6896 return pointer. */
6897 if (ix86_function_type_abi (fntype) == MS_ABI)
6898 return true;
6900 return KEEP_AGGREGATE_RETURN_POINTER != 0;
6903 /* Value is the number of bytes of arguments automatically
6904 popped when returning from a subroutine call.
6905 FUNDECL is the declaration node of the function (as a tree),
6906 FUNTYPE is the data type of the function (as a tree),
6907 or for a library call it is an identifier node for the subroutine name.
6908 SIZE is the number of bytes of arguments passed on the stack.
6910 On the 80386, the RTD insn may be used to pop them if the number
6911 of args is fixed, but if the number is variable then the caller
6912 must pop them all. RTD can't be used for library calls now
6913 because the library is compiled with the Unix compiler.
6914 Use of RTD is a selectable option, since it is incompatible with
6915 standard Unix calling sequences. If the option is not selected,
6916 the caller must always pop the args.
6918 The attribute stdcall is equivalent to RTD on a per module basis. */
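/* For example, a 32-bit function declared as (hypothetical name)

     int __attribute__((stdcall)) f (int a, int b);

   pops its own 8 bytes of arguments with "ret $8", so this hook returns
   SIZE for it; a variadic or plain cdecl function returns 0 and the
   caller pops the arguments instead.  */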
6920 static poly_int64
6921 ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size)
6923 unsigned int ccvt;
6925 /* None of the 64-bit ABIs pop arguments. */
6926 if (TARGET_64BIT)
6927 return 0;
6929 ccvt = ix86_get_callcvt (funtype);
6931 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6932 | IX86_CALLCVT_THISCALL)) != 0
6933 && ! stdarg_p (funtype))
6934 return size;
6936 /* Lose any fake structure return argument if it is passed on the stack. */
6937 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6938 && !ix86_keep_aggregate_return_pointer (funtype))
6940 int nregs = ix86_function_regparm (funtype, fundecl);
6941 if (nregs == 0)
6942 return GET_MODE_SIZE (Pmode);
6945 return 0;
6948 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6950 static bool
6951 ix86_legitimate_combined_insn (rtx_insn *insn)
6953 int i;
6955 /* Check operand constraints in case hard registers were propagated
6956 into insn pattern. This check prevents combine pass from
6957 generating insn patterns with invalid hard register operands.
6958 These invalid insns can eventually confuse reload to error out
6959 with a spill failure. See also PRs 46829 and 46843. */
6961 gcc_assert (INSN_CODE (insn) >= 0);
6963 extract_insn (insn);
6964 preprocess_constraints (insn);
6966 int n_operands = recog_data.n_operands;
6967 int n_alternatives = recog_data.n_alternatives;
6968 for (i = 0; i < n_operands; i++)
6970 rtx op = recog_data.operand[i];
6971 machine_mode mode = GET_MODE (op);
6972 const operand_alternative *op_alt;
6973 int offset = 0;
6974 bool win;
6975 int j;
6977 /* A unary operator may be accepted by the predicate, but it
6978 is irrelevant for matching constraints. */
6979 if (UNARY_P (op))
6980 op = XEXP (op, 0);
6982 if (SUBREG_P (op))
6984 if (REG_P (SUBREG_REG (op))
6985 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6986 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6987 GET_MODE (SUBREG_REG (op)),
6988 SUBREG_BYTE (op),
6989 GET_MODE (op));
6990 op = SUBREG_REG (op);
6993 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6994 continue;
6996 op_alt = recog_op_alt;
6998 /* Operand has no constraints, anything is OK. */
6999 win = !n_alternatives;
7001 alternative_mask preferred = get_preferred_alternatives (insn);
7002 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
7004 if (!TEST_BIT (preferred, j))
7005 continue;
7006 if (op_alt[i].anything_ok
7007 || (op_alt[i].matches != -1
7008 && operands_match_p
7009 (recog_data.operand[i],
7010 recog_data.operand[op_alt[i].matches]))
7011 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
7013 win = true;
7014 break;
7018 if (!win)
7019 return false;
7022 return true;
7025 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
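/* Illustrative mapping used by AddressSanitizer with the default scale:
   shadow = (addr >> 3) + offset, so e.g. on x86-64 Linux (LP64) the offset
   returned below is 0x7fff8000.  */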
7027 static unsigned HOST_WIDE_INT
7028 ix86_asan_shadow_offset (void)
7030 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
7031 : HOST_WIDE_INT_C (0x7fff8000))
7032 : (HOST_WIDE_INT_1 << 29);
7035 /* Argument support functions. */
7037 /* Return true when register may be used to pass function parameters. */
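/* For example, in the 64-bit SYSV ABI the integer parameter registers are
   RDI, RSI, RDX, RCX, R8 and R9, while the 64-bit MS ABI uses only RCX,
   RDX, R8 and R9; RAX is additionally accepted for SYSV because it carries
   the hidden vector-register count passed to variadic functions.  */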
7038 bool
7039 ix86_function_arg_regno_p (int regno)
7041 int i;
7042 enum calling_abi call_abi;
7043 const int *parm_regs;
7045 if (TARGET_MPX && BND_REGNO_P (regno))
7046 return true;
7048 if (!TARGET_64BIT)
7050 if (TARGET_MACHO)
7051 return (regno < REGPARM_MAX
7052 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
7053 else
7054 return (regno < REGPARM_MAX
7055 || (TARGET_MMX && MMX_REGNO_P (regno)
7056 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
7057 || (TARGET_SSE && SSE_REGNO_P (regno)
7058 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
7061 if (TARGET_SSE && SSE_REGNO_P (regno)
7062 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
7063 return true;
7065 /* TODO: The function should depend on current function ABI but
7066 builtins.c would need updating then. Therefore we use the
7067 default ABI. */
7068 call_abi = ix86_cfun_abi ();
7070 /* RAX is used as hidden argument to va_arg functions. */
7071 if (call_abi == SYSV_ABI && regno == AX_REG)
7072 return true;
7074 if (call_abi == MS_ABI)
7075 parm_regs = x86_64_ms_abi_int_parameter_registers;
7076 else
7077 parm_regs = x86_64_int_parameter_registers;
7079 for (i = 0; i < (call_abi == MS_ABI
7080 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
7081 if (regno == parm_regs[i])
7082 return true;
7083 return false;
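/* For reference, x86_64_int_parameter_registers holds RDI, RSI, RDX,
   RCX, R8 and R9 (the SysV order), while
   x86_64_ms_abi_int_parameter_registers holds RCX, RDX, R8 and R9, so
   the loop above just checks REGNO against whichever table the
   selected ABI uses.  */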
7086 /* Return true if we do not know how to pass TYPE solely in registers. */
7088 static bool
7089 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
7091 if (must_pass_in_stack_var_size_or_pad (mode, type))
7092 return true;
7094 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
7095 The layout_type routine is crafty and tries to trick us into passing
7096 currently unsupported vector types on the stack by using TImode. */
7097 return (!TARGET_64BIT && mode == TImode
7098 && type && TREE_CODE (type) != VECTOR_TYPE);
7101 /* Return the size, in bytes, of the area reserved for arguments passed
7102 in registers for the function represented by FNDECL, depending on the
7103 ABI format used. */
7104 int
7105 ix86_reg_parm_stack_space (const_tree fndecl)
7107 enum calling_abi call_abi = SYSV_ABI;
7108 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
7109 call_abi = ix86_function_abi (fndecl);
7110 else
7111 call_abi = ix86_function_type_abi (fndecl);
7112 if (TARGET_64BIT && call_abi == MS_ABI)
7113 return 32;
7114 return 0;
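/* The 32 bytes returned for the 64-bit MS ABI are the "home" (shadow)
   area the caller must reserve above the return address: 8 bytes for
   each of the four register parameters RCX, RDX, R8 and R9, which the
   callee may use to spill them.  */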
7117 /* We add this as a workaround in order to use libc_has_function
7118 hook in i386.md. */
7119 bool
7120 ix86_libc_has_function (enum function_class fn_class)
7122 return targetm.libc_has_function (fn_class);
7125 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE,
7126 specifying the calling ABI used. */
7127 enum calling_abi
7128 ix86_function_type_abi (const_tree fntype)
7130 enum calling_abi abi = ix86_abi;
7132 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
7133 return abi;
7135 if (abi == SYSV_ABI
7136 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
7138 static int warned;
7139 if (TARGET_X32 && !warned)
7141 error ("X32 does not support ms_abi attribute");
7142 warned = 1;
7145 abi = MS_ABI;
7147 else if (abi == MS_ABI
7148 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
7149 abi = SYSV_ABI;
7151 return abi;
7154 static enum calling_abi
7155 ix86_function_abi (const_tree fndecl)
7157 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
7160 /* Return SYSV_ABI or MS_ABI, depending on cfun,
7161 specifying the calling ABI used. */
7162 enum calling_abi
7163 ix86_cfun_abi (void)
7165 return cfun ? cfun->machine->call_abi : ix86_abi;
7168 static bool
7169 ix86_function_ms_hook_prologue (const_tree fn)
7171 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
7173 if (decl_function_context (fn) != NULL_TREE)
7174 error_at (DECL_SOURCE_LOCATION (fn),
7175 "ms_hook_prologue is not compatible with nested function");
7176 else
7177 return true;
7179 return false;
7182 static bool
7183 ix86_function_naked (const_tree fn)
7185 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
7186 return true;
7188 return false;
7191 /* Write the extra assembler code needed to declare a function properly. */
7193 void
7194 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7195 tree decl)
7197 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7199 if (is_ms_hook)
7201 int i, filler_count = (TARGET_64BIT ? 32 : 16);
7202 unsigned int filler_cc = 0xcccccccc;
7204 for (i = 0; i < filler_count; i += 4)
7205 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7208 #ifdef SUBTARGET_ASM_UNWIND_INIT
7209 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7210 #endif
7212 ASM_OUTPUT_LABEL (asm_out_file, fname);
7214 /* Output magic byte marker, if hot-patch attribute is set. */
7215 if (is_ms_hook)
7217 if (TARGET_64BIT)
7219 /* leaq [%rsp + 0], %rsp */
7220 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7221 asm_out_file);
7223 else
7225 /* movl.s %edi, %edi
7226 push %ebp
7227 movl.s %esp, %ebp */
7228 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
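/* Both byte sequences are effectively no-ops that leave room for
   run-time hot patching: 0x8b 0xff is "mov %edi, %edi", a two-byte NOP
   that can later be overwritten with a short backward jump into the
   0xcc (int3) filler emitted before the label, and the eight-byte
   "lea 0(%rsp), %rsp" with a 32-bit displacement plays the same role
   in 64-bit code.  */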
7233 /* Implementation of the call ABI switching target hook. The call
7234 register sets specific to FNDECL are selected. See also
7235 ix86_conditional_register_usage for more details. */
7236 void
7237 ix86_call_abi_override (const_tree fndecl)
7239 cfun->machine->call_abi = ix86_function_abi (fndecl);
7242 /* Return true if a pseudo register should be created and used to hold
7243 the GOT address for PIC code. */
7244 bool
7245 ix86_use_pseudo_pic_reg (void)
7247 if ((TARGET_64BIT
7248 && (ix86_cmodel == CM_SMALL_PIC
7249 || TARGET_PECOFF))
7250 || !flag_pic)
7251 return false;
7252 return true;
7255 /* Initialize large model PIC register. */
7257 static void
7258 ix86_init_large_pic_reg (unsigned int tmp_regno)
7260 rtx_code_label *label;
7261 rtx tmp_reg;
7263 gcc_assert (Pmode == DImode);
7264 label = gen_label_rtx ();
7265 emit_label (label);
7266 LABEL_PRESERVE_P (label) = 1;
7267 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7268 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7269 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7270 label));
7271 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7272 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7273 pic_offset_table_rtx, tmp_reg));
7274 const char *name = LABEL_NAME (label);
7275 PUT_CODE (label, NOTE);
7276 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7277 NOTE_DELETED_LABEL_NAME (label) = name;
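/* The sequence emitted above corresponds roughly to

       label:
         lea     label(%rip), %PIC                    # set_rip_rex64
         movabs  $_GLOBAL_OFFSET_TABLE_-label, %TMP   # set_got_offset_rex64
         add     %TMP, %PIC                           # ix86_gen_add3

   where %PIC stands for the (pseudo) PIC register and %TMP for
   TMP_REGNO (R11 when called from ix86_init_pic_reg); the label is
   then demoted to a deleted-label note so only its name survives for
   the assembler.  */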
7280 /* Create and initialize PIC register if required. */
7281 static void
7282 ix86_init_pic_reg (void)
7284 edge entry_edge;
7285 rtx_insn *seq;
7287 if (!ix86_use_pseudo_pic_reg ())
7288 return;
7290 start_sequence ();
7292 if (TARGET_64BIT)
7294 if (ix86_cmodel == CM_LARGE_PIC)
7295 ix86_init_large_pic_reg (R11_REG);
7296 else
7297 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7299 else
7301 /* If there is a future mcount call in the function, it is more profitable
7302 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
7303 rtx reg = crtl->profile
7304 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7305 : pic_offset_table_rtx;
7306 rtx_insn *insn = emit_insn (gen_set_got (reg));
7307 RTX_FRAME_RELATED_P (insn) = 1;
7308 if (crtl->profile)
7309 emit_move_insn (pic_offset_table_rtx, reg);
7310 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7313 seq = get_insns ();
7314 end_sequence ();
7316 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7317 insert_insn_on_edge (seq, entry_edge);
7318 commit_one_edge_insertion (entry_edge);
7321 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7322 for a call to a function whose data type is FNTYPE.
7323 For a library call, FNTYPE is 0. */
7325 void
7326 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7327 tree fntype, /* tree ptr for function type */
7328 rtx libname, /* SYMBOL_REF of library name or 0 */
7329 tree fndecl,
7330 int caller)
7332 struct cgraph_local_info *i = NULL;
7333 struct cgraph_node *target = NULL;
7335 memset (cum, 0, sizeof (*cum));
7337 if (fndecl)
7339 target = cgraph_node::get (fndecl);
7340 if (target)
7342 target = target->function_symbol ();
7343 i = cgraph_node::local_info (target->decl);
7344 cum->call_abi = ix86_function_abi (target->decl);
7346 else
7347 cum->call_abi = ix86_function_abi (fndecl);
7349 else
7350 cum->call_abi = ix86_function_type_abi (fntype);
7352 cum->caller = caller;
7354 /* Set up the number of registers to use for passing arguments. */
7355 cum->nregs = ix86_regparm;
7356 if (TARGET_64BIT)
7358 cum->nregs = (cum->call_abi == SYSV_ABI
7359 ? X86_64_REGPARM_MAX
7360 : X86_64_MS_REGPARM_MAX);
7362 if (TARGET_SSE)
7364 cum->sse_nregs = SSE_REGPARM_MAX;
7365 if (TARGET_64BIT)
7367 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7368 ? X86_64_SSE_REGPARM_MAX
7369 : X86_64_MS_SSE_REGPARM_MAX);
7372 if (TARGET_MMX)
7373 cum->mmx_nregs = MMX_REGPARM_MAX;
7374 cum->warn_avx512f = true;
7375 cum->warn_avx = true;
7376 cum->warn_sse = true;
7377 cum->warn_mmx = true;
7379 /* Because types might mismatch between caller and callee, we need to
7380 use the actual type of the function for local calls.
7381 FIXME: cgraph_analyze can be told to actually record whether a function
7382 uses va_start, so for local functions maybe_vaarg can be made more
7383 aggressive, helping K&R code.
7384 FIXME: once the type system is fixed, we won't need this code anymore. */
7385 if (i && i->local && i->can_change_signature)
7386 fntype = TREE_TYPE (target->decl);
7387 cum->stdarg = stdarg_p (fntype);
7388 cum->maybe_vaarg = (fntype
7389 ? (!prototype_p (fntype) || stdarg_p (fntype))
7390 : !libname);
7392 cum->bnd_regno = FIRST_BND_REG;
7393 cum->bnds_in_bt = 0;
7394 cum->force_bnd_pass = 0;
7395 cum->decl = fndecl;
7397 cum->warn_empty = !warn_abi || cum->stdarg;
7398 if (!cum->warn_empty && fntype)
7400 function_args_iterator iter;
7401 tree argtype;
7402 bool seen_empty_type = false;
7403 FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7405 if (argtype == error_mark_node || VOID_TYPE_P (argtype))
7406 break;
7407 if (TYPE_EMPTY_P (argtype))
7408 seen_empty_type = true;
7409 else if (seen_empty_type)
7411 cum->warn_empty = true;
7412 break;
7417 if (!TARGET_64BIT)
7419 /* If there are variable arguments, then we won't pass anything
7420 in registers in 32-bit mode. */
7421 if (stdarg_p (fntype))
7423 cum->nregs = 0;
7424 /* Since in 32-bit mode variable arguments are always passed on
7425 the stack, there is a scratch register available for an indirect
7426 sibcall. */
7427 cfun->machine->arg_reg_available = true;
7428 cum->sse_nregs = 0;
7429 cum->mmx_nregs = 0;
7430 cum->warn_avx512f = false;
7431 cum->warn_avx = false;
7432 cum->warn_sse = false;
7433 cum->warn_mmx = false;
7434 return;
7437 /* Use ecx and edx registers if function has fastcall attribute,
7438 else look for regparm information. */
7439 if (fntype)
7441 unsigned int ccvt = ix86_get_callcvt (fntype);
7442 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7444 cum->nregs = 1;
7445 cum->fastcall = 1; /* Same first register as in fastcall. */
7447 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7449 cum->nregs = 2;
7450 cum->fastcall = 1;
7452 else
7453 cum->nregs = ix86_function_regparm (fntype, fndecl);
7456 /* Set up the number of SSE registers used for passing SFmode
7457 and DFmode arguments. Warn for mismatching ABI. */
7458 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7461 cfun->machine->arg_reg_available = (cum->nregs > 0);
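/* For example, for a 32-bit function declared with
   __attribute__ ((fastcall)) the code above leaves cum->nregs == 2 and
   cum->fastcall == 1, so function_arg_32 will place the first two
   DWORD-or-smaller scalar arguments in ECX and EDX and everything else
   on the stack.  */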
7464 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7465 But in the case of vector types, it is some vector mode.
7467 When we have only some of our vector isa extensions enabled, then there
7468 are some modes for which vector_mode_supported_p is false. For these
7469 modes, the generic vector support in gcc will choose some non-vector mode
7470 in order to implement the type. By computing the natural mode, we'll
7471 select the proper ABI location for the operand and not depend on whatever
7472 the middle-end decides to do with these vector types.
7474 The middle-end can't deal with vector types larger than 16 bytes. In this
7475 case, we return the original mode and warn about the ABI change if CUM isn't
7476 NULL.
7478 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
7479 available for the function return value. */
7481 static machine_mode
7482 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7483 bool in_return)
7485 machine_mode mode = TYPE_MODE (type);
7487 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7489 HOST_WIDE_INT size = int_size_in_bytes (type);
7490 if ((size == 8 || size == 16 || size == 32 || size == 64)
7491 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7492 && TYPE_VECTOR_SUBPARTS (type) > 1)
7494 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7496 /* There are no XFmode vector modes. */
7497 if (innermode == XFmode)
7498 return mode;
7500 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7501 mode = MIN_MODE_VECTOR_FLOAT;
7502 else
7503 mode = MIN_MODE_VECTOR_INT;
7505 /* Get the mode which has this inner mode and number of units. */
7506 FOR_EACH_MODE_FROM (mode, mode)
7507 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7508 && GET_MODE_INNER (mode) == innermode)
7510 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7512 static bool warnedavx512f;
7513 static bool warnedavx512f_ret;
7515 if (cum && cum->warn_avx512f && !warnedavx512f)
7517 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7518 "without AVX512F enabled changes the ABI"))
7519 warnedavx512f = true;
7521 else if (in_return && !warnedavx512f_ret)
7523 if (warning (OPT_Wpsabi, "AVX512F vector return "
7524 "without AVX512F enabled changes the ABI"))
7525 warnedavx512f_ret = true;
7528 return TYPE_MODE (type);
7530 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7532 static bool warnedavx;
7533 static bool warnedavx_ret;
7535 if (cum && cum->warn_avx && !warnedavx)
7537 if (warning (OPT_Wpsabi, "AVX vector argument "
7538 "without AVX enabled changes the ABI"))
7539 warnedavx = true;
7541 else if (in_return && !warnedavx_ret)
7543 if (warning (OPT_Wpsabi, "AVX vector return "
7544 "without AVX enabled changes the ABI"))
7545 warnedavx_ret = true;
7548 return TYPE_MODE (type);
7550 else if (((size == 8 && TARGET_64BIT) || size == 16)
7551 && !TARGET_SSE
7552 && !TARGET_IAMCU)
7554 static bool warnedsse;
7555 static bool warnedsse_ret;
7557 if (cum && cum->warn_sse && !warnedsse)
7559 if (warning (OPT_Wpsabi, "SSE vector argument "
7560 "without SSE enabled changes the ABI"))
7561 warnedsse = true;
7563 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7565 if (warning (OPT_Wpsabi, "SSE vector return "
7566 "without SSE enabled changes the ABI"))
7567 warnedsse_ret = true;
7570 else if ((size == 8 && !TARGET_64BIT)
7571 && (!cfun
7572 || cfun->machine->func_type == TYPE_NORMAL)
7573 && !TARGET_MMX
7574 && !TARGET_IAMCU)
7576 static bool warnedmmx;
7577 static bool warnedmmx_ret;
7579 if (cum && cum->warn_mmx && !warnedmmx)
7581 if (warning (OPT_Wpsabi, "MMX vector argument "
7582 "without MMX enabled changes the ABI"))
7583 warnedmmx = true;
7585 else if (in_return && !warnedmmx_ret)
7587 if (warning (OPT_Wpsabi, "MMX vector return "
7588 "without MMX enabled changes the ABI"))
7589 warnedmmx_ret = true;
7592 return mode;
7595 gcc_unreachable ();
7599 return mode;
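/* For instance, a generic vector type such as

       typedef int v4si __attribute__ ((vector_size (16)));

   may be laid out in a non-vector mode when SSE is disabled, yet the
   loop above still maps it to V4SImode, so the ABI slot chosen for the
   argument does not depend on which ISA extensions happen to be
   enabled; only a -Wpsabi warning is emitted.  */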
7602 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7603 this may not agree with the mode that the type system has chosen for the
7604 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7605 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7607 static rtx
7608 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7609 unsigned int regno)
7611 rtx tmp;
7613 if (orig_mode != BLKmode)
7614 tmp = gen_rtx_REG (orig_mode, regno);
7615 else
7617 tmp = gen_rtx_REG (mode, regno);
7618 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7619 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7622 return tmp;
7625 /* x86-64 register passing implementation. See the x86-64 psABI for details.
7626 The goal of this code is to classify each eightbyte of the incoming argument
7627 by register class and assign registers accordingly. */
7629 /* Return the union class of CLASS1 and CLASS2.
7630 See the x86-64 PS ABI for details. */
7632 static enum x86_64_reg_class
7633 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7635 /* Rule #1: If both classes are equal, this is the resulting class. */
7636 if (class1 == class2)
7637 return class1;
7639 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7640 the other class. */
7641 if (class1 == X86_64_NO_CLASS)
7642 return class2;
7643 if (class2 == X86_64_NO_CLASS)
7644 return class1;
7646 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7647 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7648 return X86_64_MEMORY_CLASS;
7650 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7651 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7652 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7653 return X86_64_INTEGERSI_CLASS;
7654 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7655 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7656 return X86_64_INTEGER_CLASS;
7658 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7659 MEMORY is used. */
7660 if (class1 == X86_64_X87_CLASS
7661 || class1 == X86_64_X87UP_CLASS
7662 || class1 == X86_64_COMPLEX_X87_CLASS
7663 || class2 == X86_64_X87_CLASS
7664 || class2 == X86_64_X87UP_CLASS
7665 || class2 == X86_64_COMPLEX_X87_CLASS)
7666 return X86_64_MEMORY_CLASS;
7668 /* Rule #6: Otherwise class SSE is used. */
7669 return X86_64_SSE_CLASS;
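/* For example, merging X86_64_INTEGERSI_CLASS with X86_64_SSEDF_CLASS
   yields X86_64_INTEGER_CLASS by rule #4, while merging
   X86_64_X87_CLASS with X86_64_SSE_CLASS yields X86_64_MEMORY_CLASS by
   rule #5, which is why e.g. a union of long double and double ends up
   in memory.  */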
7672 /* Classify the argument of type TYPE and mode MODE.
7673 CLASSES will be filled by the register class used to pass each word
7674 of the operand. The number of words is returned. In case the parameter
7675 should be passed in memory, 0 is returned. As a special case for zero
7676 sized containers, classes[0] will be NO_CLASS and 1 is returned.
7678 BIT_OFFSET is used internally for handling records and specifies the
7679 offset in bits modulo 512 to avoid overflow cases.
7681 See the x86-64 PS ABI for details.
7684 static int
7685 classify_argument (machine_mode mode, const_tree type,
7686 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7688 HOST_WIDE_INT bytes =
7689 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7690 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7692 /* Variable sized entities are always passed/returned in memory. */
7693 if (bytes < 0)
7694 return 0;
7696 if (mode != VOIDmode
7697 && targetm.calls.must_pass_in_stack (mode, type))
7698 return 0;
7700 if (type && AGGREGATE_TYPE_P (type))
7702 int i;
7703 tree field;
7704 enum x86_64_reg_class subclasses[MAX_CLASSES];
7706 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7707 if (bytes > 64)
7708 return 0;
7710 for (i = 0; i < words; i++)
7711 classes[i] = X86_64_NO_CLASS;
7713 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
7714 signal the memory class, so handle it as a special case. */
7715 if (!words)
7717 classes[0] = X86_64_NO_CLASS;
7718 return 1;
7721 /* Classify each field of record and merge classes. */
7722 switch (TREE_CODE (type))
7724 case RECORD_TYPE:
7725 /* And now merge the fields of structure. */
7726 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7728 if (TREE_CODE (field) == FIELD_DECL)
7730 int num;
7732 if (TREE_TYPE (field) == error_mark_node)
7733 continue;
7735 /* Bitfields are always classified as integer. Handle them
7736 early, since later code would consider them to be
7737 misaligned integers. */
7738 if (DECL_BIT_FIELD (field))
7740 for (i = (int_bit_position (field)
7741 + (bit_offset % 64)) / 8 / 8;
7742 i < ((int_bit_position (field) + (bit_offset % 64))
7743 + tree_to_shwi (DECL_SIZE (field))
7744 + 63) / 8 / 8; i++)
7745 classes[i] =
7746 merge_classes (X86_64_INTEGER_CLASS,
7747 classes[i]);
7749 else
7751 int pos;
7753 type = TREE_TYPE (field);
7755 /* Flexible array member is ignored. */
7756 if (TYPE_MODE (type) == BLKmode
7757 && TREE_CODE (type) == ARRAY_TYPE
7758 && TYPE_SIZE (type) == NULL_TREE
7759 && TYPE_DOMAIN (type) != NULL_TREE
7760 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7761 == NULL_TREE))
7763 static bool warned;
7765 if (!warned && warn_psabi)
7767 warned = true;
7768 inform (input_location,
7769 "the ABI of passing struct with"
7770 " a flexible array member has"
7771 " changed in GCC 4.4");
7773 continue;
7775 num = classify_argument (TYPE_MODE (type), type,
7776 subclasses,
7777 (int_bit_position (field)
7778 + bit_offset) % 512);
7779 if (!num)
7780 return 0;
7781 pos = (int_bit_position (field)
7782 + (bit_offset % 64)) / 8 / 8;
7783 for (i = 0; i < num && (i + pos) < words; i++)
7784 classes[i + pos] =
7785 merge_classes (subclasses[i], classes[i + pos]);
7789 break;
7791 case ARRAY_TYPE:
7792 /* Arrays are handled as small records. */
7794 int num;
7795 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7796 TREE_TYPE (type), subclasses, bit_offset);
7797 if (!num)
7798 return 0;
7800 /* The partial classes are now full classes. */
7801 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7802 subclasses[0] = X86_64_SSE_CLASS;
7803 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7804 && !((bit_offset % 64) == 0 && bytes == 4))
7805 subclasses[0] = X86_64_INTEGER_CLASS;
7807 for (i = 0; i < words; i++)
7808 classes[i] = subclasses[i % num];
7810 break;
7812 case UNION_TYPE:
7813 case QUAL_UNION_TYPE:
7814 /* Unions are similar to RECORD_TYPE but offset is always 0.
7816 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7818 if (TREE_CODE (field) == FIELD_DECL)
7820 int num;
7822 if (TREE_TYPE (field) == error_mark_node)
7823 continue;
7825 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7826 TREE_TYPE (field), subclasses,
7827 bit_offset);
7828 if (!num)
7829 return 0;
7830 for (i = 0; i < num && i < words; i++)
7831 classes[i] = merge_classes (subclasses[i], classes[i]);
7834 break;
7836 default:
7837 gcc_unreachable ();
7840 if (words > 2)
7842 /* When the size exceeds 16 bytes, if the first eightbyte isn't
7843 X86_64_SSE_CLASS or any of the remaining ones isn't
7844 X86_64_SSEUP_CLASS, everything should be passed in
7845 memory. */
7846 if (classes[0] != X86_64_SSE_CLASS)
7847 return 0;
7849 for (i = 1; i < words; i++)
7850 if (classes[i] != X86_64_SSEUP_CLASS)
7851 return 0;
7854 /* Final merger cleanup. */
7855 for (i = 0; i < words; i++)
7857 /* If one class is MEMORY, everything should be passed in
7858 memory. */
7859 if (classes[i] == X86_64_MEMORY_CLASS)
7860 return 0;
7862 /* The X86_64_SSEUP_CLASS should be always preceded by
7863 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
7864 if (classes[i] == X86_64_SSEUP_CLASS
7865 && classes[i - 1] != X86_64_SSE_CLASS
7866 && classes[i - 1] != X86_64_SSEUP_CLASS)
7868 /* The first one should never be X86_64_SSEUP_CLASS. */
7869 gcc_assert (i != 0);
7870 classes[i] = X86_64_SSE_CLASS;
7873 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7874 everything should be passed in memory. */
7875 if (classes[i] == X86_64_X87UP_CLASS
7876 && (classes[i - 1] != X86_64_X87_CLASS))
7878 static bool warned;
7880 /* The first one should never be X86_64_X87UP_CLASS. */
7881 gcc_assert (i != 0);
7882 if (!warned && warn_psabi)
7884 warned = true;
7885 inform (input_location,
7886 "the ABI of passing union with long double"
7887 " has changed in GCC 4.4");
7889 return 0;
7892 return words;
7895 /* Compute the alignment needed. We align all types to their natural
7896 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
7897 if (mode != VOIDmode && mode != BLKmode)
7899 int mode_alignment = GET_MODE_BITSIZE (mode);
7901 if (mode == XFmode)
7902 mode_alignment = 128;
7903 else if (mode == XCmode)
7904 mode_alignment = 256;
7905 if (COMPLEX_MODE_P (mode))
7906 mode_alignment /= 2;
7907 /* Misaligned fields are always returned in memory. */
7908 if (bit_offset % mode_alignment)
7909 return 0;
7912 /* for V1xx modes, just use the base mode */
7913 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7914 && GET_MODE_UNIT_SIZE (mode) == bytes)
7915 mode = GET_MODE_INNER (mode);
7917 /* Classification of atomic types. */
7918 switch (mode)
7920 case E_SDmode:
7921 case E_DDmode:
7922 classes[0] = X86_64_SSE_CLASS;
7923 return 1;
7924 case E_TDmode:
7925 classes[0] = X86_64_SSE_CLASS;
7926 classes[1] = X86_64_SSEUP_CLASS;
7927 return 2;
7928 case E_DImode:
7929 case E_SImode:
7930 case E_HImode:
7931 case E_QImode:
7932 case E_CSImode:
7933 case E_CHImode:
7934 case E_CQImode:
7936 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7938 /* Analyze last 128 bits only. */
7939 size = (size - 1) & 0x7f;
7941 if (size < 32)
7943 classes[0] = X86_64_INTEGERSI_CLASS;
7944 return 1;
7946 else if (size < 64)
7948 classes[0] = X86_64_INTEGER_CLASS;
7949 return 1;
7951 else if (size < 64+32)
7953 classes[0] = X86_64_INTEGER_CLASS;
7954 classes[1] = X86_64_INTEGERSI_CLASS;
7955 return 2;
7957 else if (size < 64+64)
7959 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7960 return 2;
7962 else
7963 gcc_unreachable ();
7965 case E_CDImode:
7966 case E_TImode:
7967 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7968 return 2;
7969 case E_COImode:
7970 case E_OImode:
7971 /* OImode shouldn't be used directly. */
7972 gcc_unreachable ();
7973 case E_CTImode:
7974 return 0;
7975 case E_SFmode:
7976 if (!(bit_offset % 64))
7977 classes[0] = X86_64_SSESF_CLASS;
7978 else
7979 classes[0] = X86_64_SSE_CLASS;
7980 return 1;
7981 case E_DFmode:
7982 classes[0] = X86_64_SSEDF_CLASS;
7983 return 1;
7984 case E_XFmode:
7985 classes[0] = X86_64_X87_CLASS;
7986 classes[1] = X86_64_X87UP_CLASS;
7987 return 2;
7988 case E_TFmode:
7989 classes[0] = X86_64_SSE_CLASS;
7990 classes[1] = X86_64_SSEUP_CLASS;
7991 return 2;
7992 case E_SCmode:
7993 classes[0] = X86_64_SSE_CLASS;
7994 if (!(bit_offset % 64))
7995 return 1;
7996 else
7998 static bool warned;
8000 if (!warned && warn_psabi)
8002 warned = true;
8003 inform (input_location,
8004 "the ABI of passing structure with complex float"
8005 " member has changed in GCC 4.4");
8007 classes[1] = X86_64_SSESF_CLASS;
8008 return 2;
8010 case E_DCmode:
8011 classes[0] = X86_64_SSEDF_CLASS;
8012 classes[1] = X86_64_SSEDF_CLASS;
8013 return 2;
8014 case E_XCmode:
8015 classes[0] = X86_64_COMPLEX_X87_CLASS;
8016 return 1;
8017 case E_TCmode:
8018 /* This mode is larger than 16 bytes. */
8019 return 0;
8020 case E_V8SFmode:
8021 case E_V8SImode:
8022 case E_V32QImode:
8023 case E_V16HImode:
8024 case E_V4DFmode:
8025 case E_V4DImode:
8026 classes[0] = X86_64_SSE_CLASS;
8027 classes[1] = X86_64_SSEUP_CLASS;
8028 classes[2] = X86_64_SSEUP_CLASS;
8029 classes[3] = X86_64_SSEUP_CLASS;
8030 return 4;
8031 case E_V8DFmode:
8032 case E_V16SFmode:
8033 case E_V8DImode:
8034 case E_V16SImode:
8035 case E_V32HImode:
8036 case E_V64QImode:
8037 classes[0] = X86_64_SSE_CLASS;
8038 classes[1] = X86_64_SSEUP_CLASS;
8039 classes[2] = X86_64_SSEUP_CLASS;
8040 classes[3] = X86_64_SSEUP_CLASS;
8041 classes[4] = X86_64_SSEUP_CLASS;
8042 classes[5] = X86_64_SSEUP_CLASS;
8043 classes[6] = X86_64_SSEUP_CLASS;
8044 classes[7] = X86_64_SSEUP_CLASS;
8045 return 8;
8046 case E_V4SFmode:
8047 case E_V4SImode:
8048 case E_V16QImode:
8049 case E_V8HImode:
8050 case E_V2DFmode:
8051 case E_V2DImode:
8052 classes[0] = X86_64_SSE_CLASS;
8053 classes[1] = X86_64_SSEUP_CLASS;
8054 return 2;
8055 case E_V1TImode:
8056 case E_V1DImode:
8057 case E_V2SFmode:
8058 case E_V2SImode:
8059 case E_V4HImode:
8060 case E_V8QImode:
8061 classes[0] = X86_64_SSE_CLASS;
8062 return 1;
8063 case E_BLKmode:
8064 case E_VOIDmode:
8065 return 0;
8066 default:
8067 gcc_assert (VECTOR_MODE_P (mode));
8069 if (bytes > 16)
8070 return 0;
8072 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
8074 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
8075 classes[0] = X86_64_INTEGERSI_CLASS;
8076 else
8077 classes[0] = X86_64_INTEGER_CLASS;
8078 classes[1] = X86_64_INTEGER_CLASS;
8079 return 1 + (bytes > 8);
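/* A worked example of the classification above:

       struct s { double d; int i; };      // 16 bytes, two eightbytes

   gets an SSE class for its first eightbyte and an INTEGER class for
   its second, so D is passed in an SSE register and I in a
   general-purpose register, whereas

       struct t { long double ld; int i; };

   contains X87/X87UP eightbytes and yields 0 here, i.e. it is passed
   entirely in memory.  */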
8083 /* Examine the argument and set the number of registers required in each
8084 class. Return true iff the parameter should be passed in memory. */
8086 static bool
8087 examine_argument (machine_mode mode, const_tree type, int in_return,
8088 int *int_nregs, int *sse_nregs)
8090 enum x86_64_reg_class regclass[MAX_CLASSES];
8091 int n = classify_argument (mode, type, regclass, 0);
8093 *int_nregs = 0;
8094 *sse_nregs = 0;
8096 if (!n)
8097 return true;
8098 for (n--; n >= 0; n--)
8099 switch (regclass[n])
8101 case X86_64_INTEGER_CLASS:
8102 case X86_64_INTEGERSI_CLASS:
8103 (*int_nregs)++;
8104 break;
8105 case X86_64_SSE_CLASS:
8106 case X86_64_SSESF_CLASS:
8107 case X86_64_SSEDF_CLASS:
8108 (*sse_nregs)++;
8109 break;
8110 case X86_64_NO_CLASS:
8111 case X86_64_SSEUP_CLASS:
8112 break;
8113 case X86_64_X87_CLASS:
8114 case X86_64_X87UP_CLASS:
8115 case X86_64_COMPLEX_X87_CLASS:
8116 if (!in_return)
8117 return true;
8118 break;
8119 case X86_64_MEMORY_CLASS:
8120 gcc_unreachable ();
8123 return false;
8126 /* Construct container for the argument used by GCC interface. See
8127 FUNCTION_ARG for the detailed description. */
8129 static rtx
8130 construct_container (machine_mode mode, machine_mode orig_mode,
8131 const_tree type, int in_return, int nintregs, int nsseregs,
8132 const int *intreg, int sse_regno)
8134 /* The following variables hold the static issued_error state. */
8135 static bool issued_sse_arg_error;
8136 static bool issued_sse_ret_error;
8137 static bool issued_x87_ret_error;
8139 machine_mode tmpmode;
8140 int bytes =
8141 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8142 enum x86_64_reg_class regclass[MAX_CLASSES];
8143 int n;
8144 int i;
8145 int nexps = 0;
8146 int needed_sseregs, needed_intregs;
8147 rtx exp[MAX_CLASSES];
8148 rtx ret;
8150 n = classify_argument (mode, type, regclass, 0);
8151 if (!n)
8152 return NULL;
8153 if (examine_argument (mode, type, in_return, &needed_intregs,
8154 &needed_sseregs))
8155 return NULL;
8156 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
8157 return NULL;
8159 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
8160 some less clueful developer tries to use floating-point anyway. */
8161 if (needed_sseregs && !TARGET_SSE)
8163 if (in_return)
8165 if (!issued_sse_ret_error)
8167 error ("SSE register return with SSE disabled");
8168 issued_sse_ret_error = true;
8171 else if (!issued_sse_arg_error)
8173 error ("SSE register argument with SSE disabled");
8174 issued_sse_arg_error = true;
8176 return NULL;
8179 /* Likewise, error if the ABI requires us to return values in the
8180 x87 registers and the user specified -mno-80387. */
8181 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
8182 for (i = 0; i < n; i++)
8183 if (regclass[i] == X86_64_X87_CLASS
8184 || regclass[i] == X86_64_X87UP_CLASS
8185 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
8187 if (!issued_x87_ret_error)
8189 error ("x87 register return with x87 disabled");
8190 issued_x87_ret_error = true;
8192 return NULL;
8195 /* First construct the simple cases. Avoid SCmode, since we want to use
8196 a single register to pass this type. */
8197 if (n == 1 && mode != SCmode)
8198 switch (regclass[0])
8200 case X86_64_INTEGER_CLASS:
8201 case X86_64_INTEGERSI_CLASS:
8202 return gen_rtx_REG (mode, intreg[0]);
8203 case X86_64_SSE_CLASS:
8204 case X86_64_SSESF_CLASS:
8205 case X86_64_SSEDF_CLASS:
8206 if (mode != BLKmode)
8207 return gen_reg_or_parallel (mode, orig_mode,
8208 SSE_REGNO (sse_regno));
8209 break;
8210 case X86_64_X87_CLASS:
8211 case X86_64_COMPLEX_X87_CLASS:
8212 return gen_rtx_REG (mode, FIRST_STACK_REG);
8213 case X86_64_NO_CLASS:
8214 /* Zero sized array, struct or class. */
8215 return NULL;
8216 default:
8217 gcc_unreachable ();
8219 if (n == 2
8220 && regclass[0] == X86_64_SSE_CLASS
8221 && regclass[1] == X86_64_SSEUP_CLASS
8222 && mode != BLKmode)
8223 return gen_reg_or_parallel (mode, orig_mode,
8224 SSE_REGNO (sse_regno));
8225 if (n == 4
8226 && regclass[0] == X86_64_SSE_CLASS
8227 && regclass[1] == X86_64_SSEUP_CLASS
8228 && regclass[2] == X86_64_SSEUP_CLASS
8229 && regclass[3] == X86_64_SSEUP_CLASS
8230 && mode != BLKmode)
8231 return gen_reg_or_parallel (mode, orig_mode,
8232 SSE_REGNO (sse_regno));
8233 if (n == 8
8234 && regclass[0] == X86_64_SSE_CLASS
8235 && regclass[1] == X86_64_SSEUP_CLASS
8236 && regclass[2] == X86_64_SSEUP_CLASS
8237 && regclass[3] == X86_64_SSEUP_CLASS
8238 && regclass[4] == X86_64_SSEUP_CLASS
8239 && regclass[5] == X86_64_SSEUP_CLASS
8240 && regclass[6] == X86_64_SSEUP_CLASS
8241 && regclass[7] == X86_64_SSEUP_CLASS
8242 && mode != BLKmode)
8243 return gen_reg_or_parallel (mode, orig_mode,
8244 SSE_REGNO (sse_regno));
8245 if (n == 2
8246 && regclass[0] == X86_64_X87_CLASS
8247 && regclass[1] == X86_64_X87UP_CLASS)
8248 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8250 if (n == 2
8251 && regclass[0] == X86_64_INTEGER_CLASS
8252 && regclass[1] == X86_64_INTEGER_CLASS
8253 && (mode == CDImode || mode == TImode)
8254 && intreg[0] + 1 == intreg[1])
8255 return gen_rtx_REG (mode, intreg[0]);
8257 /* Otherwise figure out the entries of the PARALLEL. */
8258 for (i = 0; i < n; i++)
8260 int pos;
8262 switch (regclass[i])
8264 case X86_64_NO_CLASS:
8265 break;
8266 case X86_64_INTEGER_CLASS:
8267 case X86_64_INTEGERSI_CLASS:
8268 /* Merge TImodes on aligned occasions here too. */
8269 if (i * 8 + 8 > bytes)
8271 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8272 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8273 /* There is no integer mode for the requested
8274 size (e.g. 24 bits); use DImode. */
8275 tmpmode = DImode;
8277 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8278 tmpmode = SImode;
8279 else
8280 tmpmode = DImode;
8281 exp [nexps++]
8282 = gen_rtx_EXPR_LIST (VOIDmode,
8283 gen_rtx_REG (tmpmode, *intreg),
8284 GEN_INT (i*8));
8285 intreg++;
8286 break;
8287 case X86_64_SSESF_CLASS:
8288 exp [nexps++]
8289 = gen_rtx_EXPR_LIST (VOIDmode,
8290 gen_rtx_REG (SFmode,
8291 SSE_REGNO (sse_regno)),
8292 GEN_INT (i*8));
8293 sse_regno++;
8294 break;
8295 case X86_64_SSEDF_CLASS:
8296 exp [nexps++]
8297 = gen_rtx_EXPR_LIST (VOIDmode,
8298 gen_rtx_REG (DFmode,
8299 SSE_REGNO (sse_regno)),
8300 GEN_INT (i*8));
8301 sse_regno++;
8302 break;
8303 case X86_64_SSE_CLASS:
8304 pos = i;
8305 switch (n)
8307 case 1:
8308 tmpmode = DImode;
8309 break;
8310 case 2:
8311 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8313 tmpmode = TImode;
8314 i++;
8316 else
8317 tmpmode = DImode;
8318 break;
8319 case 4:
8320 gcc_assert (i == 0
8321 && regclass[1] == X86_64_SSEUP_CLASS
8322 && regclass[2] == X86_64_SSEUP_CLASS
8323 && regclass[3] == X86_64_SSEUP_CLASS);
8324 tmpmode = OImode;
8325 i += 3;
8326 break;
8327 case 8:
8328 gcc_assert (i == 0
8329 && regclass[1] == X86_64_SSEUP_CLASS
8330 && regclass[2] == X86_64_SSEUP_CLASS
8331 && regclass[3] == X86_64_SSEUP_CLASS
8332 && regclass[4] == X86_64_SSEUP_CLASS
8333 && regclass[5] == X86_64_SSEUP_CLASS
8334 && regclass[6] == X86_64_SSEUP_CLASS
8335 && regclass[7] == X86_64_SSEUP_CLASS);
8336 tmpmode = XImode;
8337 i += 7;
8338 break;
8339 default:
8340 gcc_unreachable ();
8342 exp [nexps++]
8343 = gen_rtx_EXPR_LIST (VOIDmode,
8344 gen_rtx_REG (tmpmode,
8345 SSE_REGNO (sse_regno)),
8346 GEN_INT (pos*8));
8347 sse_regno++;
8348 break;
8349 default:
8350 gcc_unreachable ();
8354 /* Empty aligned struct, union or class. */
8355 if (nexps == 0)
8356 return NULL;
8358 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8359 for (i = 0; i < nexps; i++)
8360 XVECEXP (ret, 0, i) = exp [i];
8361 return ret;
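/* Continuing the struct { double d; int i; } example and assuming it is
   the first argument of a SysV function, the PARALLEL built here looks
   roughly like

       (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                  (expr_list (reg:DI di)   (const_int 8))])

   i.e. the first eightbyte travels in %xmm0 and the second in %rdi,
   each annotated with its byte offset within the object.  */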
8364 /* Update the data in CUM to advance over an argument of mode MODE
8365 and data type TYPE. (TYPE is null for libcalls where that information
8366 may not be available.)
8368 Return the number of integer registers advanced over. */
8370 static int
8371 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8372 const_tree type, HOST_WIDE_INT bytes,
8373 HOST_WIDE_INT words)
8375 int res = 0;
8376 bool error_p = false;
8378 if (TARGET_IAMCU)
8380 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8381 bytes in registers. */
8382 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8383 goto pass_in_reg;
8384 return res;
8387 switch (mode)
8389 default:
8390 break;
8392 case E_BLKmode:
8393 if (bytes < 0)
8394 break;
8395 /* FALLTHRU */
8397 case E_DImode:
8398 case E_SImode:
8399 case E_HImode:
8400 case E_QImode:
8401 pass_in_reg:
8402 cum->words += words;
8403 cum->nregs -= words;
8404 cum->regno += words;
8405 if (cum->nregs >= 0)
8406 res = words;
8407 if (cum->nregs <= 0)
8409 cum->nregs = 0;
8410 cfun->machine->arg_reg_available = false;
8411 cum->regno = 0;
8413 break;
8415 case E_OImode:
8416 /* OImode shouldn't be used directly. */
8417 gcc_unreachable ();
8419 case E_DFmode:
8420 if (cum->float_in_sse == -1)
8421 error_p = true;
8422 if (cum->float_in_sse < 2)
8423 break;
8424 /* FALLTHRU */
8425 case E_SFmode:
8426 if (cum->float_in_sse == -1)
8427 error_p = true;
8428 if (cum->float_in_sse < 1)
8429 break;
8430 /* FALLTHRU */
8432 case E_V8SFmode:
8433 case E_V8SImode:
8434 case E_V64QImode:
8435 case E_V32HImode:
8436 case E_V16SImode:
8437 case E_V8DImode:
8438 case E_V16SFmode:
8439 case E_V8DFmode:
8440 case E_V32QImode:
8441 case E_V16HImode:
8442 case E_V4DFmode:
8443 case E_V4DImode:
8444 case E_TImode:
8445 case E_V16QImode:
8446 case E_V8HImode:
8447 case E_V4SImode:
8448 case E_V2DImode:
8449 case E_V4SFmode:
8450 case E_V2DFmode:
8451 if (!type || !AGGREGATE_TYPE_P (type))
8453 cum->sse_words += words;
8454 cum->sse_nregs -= 1;
8455 cum->sse_regno += 1;
8456 if (cum->sse_nregs <= 0)
8458 cum->sse_nregs = 0;
8459 cum->sse_regno = 0;
8462 break;
8464 case E_V8QImode:
8465 case E_V4HImode:
8466 case E_V2SImode:
8467 case E_V2SFmode:
8468 case E_V1TImode:
8469 case E_V1DImode:
8470 if (!type || !AGGREGATE_TYPE_P (type))
8472 cum->mmx_words += words;
8473 cum->mmx_nregs -= 1;
8474 cum->mmx_regno += 1;
8475 if (cum->mmx_nregs <= 0)
8477 cum->mmx_nregs = 0;
8478 cum->mmx_regno = 0;
8481 break;
8483 if (error_p)
8485 cum->float_in_sse = 0;
8486 error ("calling %qD with SSE calling convention without "
8487 "SSE/SSE2 enabled", cum->decl);
8488 sorry ("this is a GCC bug that can be worked around by adding "
8489 "attribute used to function called");
8492 return res;
8495 static int
8496 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8497 const_tree type, HOST_WIDE_INT words, bool named)
8499 int int_nregs, sse_nregs;
8501 /* Unnamed 512 bit and 256 bit vector mode parameters are passed on the stack. */
8502 if (!named && (VALID_AVX512F_REG_MODE (mode)
8503 || VALID_AVX256_REG_MODE (mode)))
8504 return 0;
8506 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8507 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8509 cum->nregs -= int_nregs;
8510 cum->sse_nregs -= sse_nregs;
8511 cum->regno += int_nregs;
8512 cum->sse_regno += sse_nregs;
8513 return int_nregs;
8515 else
8517 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8518 cum->words = ROUND_UP (cum->words, align);
8519 cum->words += words;
8520 return 0;
8524 static int
8525 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8526 HOST_WIDE_INT words)
8528 /* Otherwise, this should be passed indirect. */
8529 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8531 cum->words += words;
8532 if (cum->nregs > 0)
8534 cum->nregs -= 1;
8535 cum->regno += 1;
8536 return 1;
8538 return 0;
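/* Note that the MS ABI assigns argument registers by position rather
   than by type: every argument consumes exactly one of the four slots,
   so the third argument always lands in R8 or XMM2 no matter what the
   first two were.  That is why a single counter is advanced here
   instead of the separate integer/SSE counters used on the SysV
   path.  */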
8541 /* Update the data in CUM to advance over an argument of mode MODE and
8542 data type TYPE. (TYPE is null for libcalls where that information
8543 may not be available.) */
8545 static void
8546 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8547 const_tree type, bool named)
8549 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8550 HOST_WIDE_INT bytes, words;
8551 int nregs;
8553 /* The argument of an interrupt handler is a special case and is
8554 handled in ix86_function_arg. */
8555 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8556 return;
8558 if (mode == BLKmode)
8559 bytes = int_size_in_bytes (type);
8560 else
8561 bytes = GET_MODE_SIZE (mode);
8562 words = CEIL (bytes, UNITS_PER_WORD);
8564 if (type)
8565 mode = type_natural_mode (type, NULL, false);
8567 if ((type && POINTER_BOUNDS_TYPE_P (type))
8568 || POINTER_BOUNDS_MODE_P (mode))
8570 /* If we pass bounds in the Bounds Table, just update the remaining bounds count. */
8571 if (cum->bnds_in_bt)
8573 cum->bnds_in_bt--;
8574 return;
8578 /* Update the remaining number of bounds to force. */
8578 if (cum->force_bnd_pass)
8579 cum->force_bnd_pass--;
8581 cum->bnd_regno++;
8583 return;
8586 /* The first arg not going to Bounds Tables resets this counter. */
8587 cum->bnds_in_bt = 0;
8588 /* For unnamed args we always pass bounds to avoid a bounds mismatch when
8589 the passed and received types do not match. If bounds do not follow an
8590 unnamed arg, still pretend the required number of bounds were passed. */
8591 if (cum->force_bnd_pass)
8593 cum->bnd_regno += cum->force_bnd_pass;
8594 cum->force_bnd_pass = 0;
8597 if (TARGET_64BIT)
8599 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8601 if (call_abi == MS_ABI)
8602 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8603 else
8604 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8606 else
8607 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8609 /* For stdarg we expect bounds to be passed for each value passed
8610 in a register. */
8611 if (cum->stdarg)
8612 cum->force_bnd_pass = nregs;
8613 /* For pointers passed in memory we expect bounds to be passed in the
8614 Bounds Table. */
8615 if (!nregs)
8617 /* Track if there are outgoing arguments on stack. */
8618 if (cum->caller)
8619 cfun->machine->outgoing_args_on_stack = true;
8621 if (flag_check_pointer_bounds)
8622 cum->bnds_in_bt = chkp_type_bounds_count (type);
8626 /* Define where to put the arguments to a function.
8627 Value is zero to push the argument on the stack,
8628 or a hard register in which to store the argument.
8630 MODE is the argument's machine mode.
8631 TYPE is the data type of the argument (as a tree).
8632 This is null for libcalls where that information may
8633 not be available.
8634 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8635 the preceding args and about the function being called.
8636 NAMED is nonzero if this argument is a named parameter
8637 (otherwise it is an extra parameter matching an ellipsis). */
8639 static rtx
8640 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8641 machine_mode orig_mode, const_tree type,
8642 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8644 bool error_p = false;
8646 /* Avoid the AL settings for the Unix64 ABI. */
8647 if (mode == VOIDmode)
8648 return constm1_rtx;
8650 if (TARGET_IAMCU)
8652 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8653 bytes in registers. */
8654 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8655 goto pass_in_reg;
8656 return NULL_RTX;
8659 switch (mode)
8661 default:
8662 break;
8664 case E_BLKmode:
8665 if (bytes < 0)
8666 break;
8667 /* FALLTHRU */
8668 case E_DImode:
8669 case E_SImode:
8670 case E_HImode:
8671 case E_QImode:
8672 pass_in_reg:
8673 if (words <= cum->nregs)
8675 int regno = cum->regno;
8677 /* Fastcall allocates the first two DWORD (SImode) or
8678 smaller arguments to ECX and EDX if they aren't
8679 aggregate types. */
8680 if (cum->fastcall)
8682 if (mode == BLKmode
8683 || mode == DImode
8684 || (type && AGGREGATE_TYPE_P (type)))
8685 break;
8687 /* ECX, not EAX, is the first allocated register. */
8688 if (regno == AX_REG)
8689 regno = CX_REG;
8691 return gen_rtx_REG (mode, regno);
8693 break;
8695 case E_DFmode:
8696 if (cum->float_in_sse == -1)
8697 error_p = true;
8698 if (cum->float_in_sse < 2)
8699 break;
8700 /* FALLTHRU */
8701 case E_SFmode:
8702 if (cum->float_in_sse == -1)
8703 error_p = true;
8704 if (cum->float_in_sse < 1)
8705 break;
8706 /* FALLTHRU */
8707 case E_TImode:
8708 /* In 32bit, we pass TImode in xmm registers. */
8709 case E_V16QImode:
8710 case E_V8HImode:
8711 case E_V4SImode:
8712 case E_V2DImode:
8713 case E_V4SFmode:
8714 case E_V2DFmode:
8715 if (!type || !AGGREGATE_TYPE_P (type))
8717 if (cum->sse_nregs)
8718 return gen_reg_or_parallel (mode, orig_mode,
8719 cum->sse_regno + FIRST_SSE_REG);
8721 break;
8723 case E_OImode:
8724 case E_XImode:
8725 /* OImode and XImode shouldn't be used directly. */
8726 gcc_unreachable ();
8728 case E_V64QImode:
8729 case E_V32HImode:
8730 case E_V16SImode:
8731 case E_V8DImode:
8732 case E_V16SFmode:
8733 case E_V8DFmode:
8734 case E_V8SFmode:
8735 case E_V8SImode:
8736 case E_V32QImode:
8737 case E_V16HImode:
8738 case E_V4DFmode:
8739 case E_V4DImode:
8740 if (!type || !AGGREGATE_TYPE_P (type))
8742 if (cum->sse_nregs)
8743 return gen_reg_or_parallel (mode, orig_mode,
8744 cum->sse_regno + FIRST_SSE_REG);
8746 break;
8748 case E_V8QImode:
8749 case E_V4HImode:
8750 case E_V2SImode:
8751 case E_V2SFmode:
8752 case E_V1TImode:
8753 case E_V1DImode:
8754 if (!type || !AGGREGATE_TYPE_P (type))
8756 if (cum->mmx_nregs)
8757 return gen_reg_or_parallel (mode, orig_mode,
8758 cum->mmx_regno + FIRST_MMX_REG);
8760 break;
8762 if (error_p)
8764 cum->float_in_sse = 0;
8765 error ("calling %qD with SSE calling convention without "
8766 "SSE/SSE2 enabled", cum->decl);
8767 sorry ("this is a GCC bug that can be worked around by adding "
8768 "attribute used to function called");
8771 return NULL_RTX;
8774 static rtx
8775 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8776 machine_mode orig_mode, const_tree type, bool named)
8778 /* Handle a hidden AL argument containing the number of SSE registers
8779 used for varargs x86-64 functions. */
8780 if (mode == VOIDmode)
8781 return GEN_INT (cum->maybe_vaarg
8782 ? (cum->sse_nregs < 0
8783 ? X86_64_SSE_REGPARM_MAX
8784 : cum->sse_regno)
8785 : -1);
8787 switch (mode)
8789 default:
8790 break;
8792 case E_V8SFmode:
8793 case E_V8SImode:
8794 case E_V32QImode:
8795 case E_V16HImode:
8796 case E_V4DFmode:
8797 case E_V4DImode:
8798 case E_V16SFmode:
8799 case E_V16SImode:
8800 case E_V64QImode:
8801 case E_V32HImode:
8802 case E_V8DFmode:
8803 case E_V8DImode:
8804 /* Unnamed 256 bit and 512 bit vector mode parameters are passed on the stack. */
8805 if (!named)
8806 return NULL;
8807 break;
8810 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8811 cum->sse_nregs,
8812 &x86_64_int_parameter_registers [cum->regno],
8813 cum->sse_regno);
8816 static rtx
8817 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8818 machine_mode orig_mode, bool named,
8819 HOST_WIDE_INT bytes)
8821 unsigned int regno;
8823 /* We need to add a clobber for MS_ABI -> SYSV ABI calls in expand_call.
8824 We use the value -2 to specify that the current function call uses the MS ABI. */
8825 if (mode == VOIDmode)
8826 return GEN_INT (-2);
8828 /* If we've run out of registers, it goes on the stack. */
8829 if (cum->nregs == 0)
8830 return NULL_RTX;
8832 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8834 /* Only floating point modes are passed in anything but integer regs. */
8835 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8837 if (named)
8838 regno = cum->regno + FIRST_SSE_REG;
8839 else
8841 rtx t1, t2;
8843 /* Unnamed floating parameters are passed in both the
8844 SSE and integer registers. */
8845 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8846 t2 = gen_rtx_REG (mode, regno);
8847 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8848 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8849 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8852 /* Handle aggregate types passed in registers. */
8853 if (orig_mode == BLKmode)
8855 if (bytes > 0 && bytes <= 8)
8856 mode = (bytes > 4 ? DImode : SImode);
8857 if (mode == BLKmode)
8858 mode = DImode;
8861 return gen_reg_or_parallel (mode, orig_mode, regno);
8864 /* Return where to put the arguments to a function.
8865 Return zero to push the argument on the stack, or a hard register in which to store the argument.
8867 MODE is the argument's machine mode. TYPE is the data type of the
8868 argument. It is null for libcalls where that information may not be
8869 available. CUM gives information about the preceding args and about
8870 the function being called. NAMED is nonzero if this argument is a
8871 named parameter (otherwise it is an extra parameter matching an
8872 ellipsis). */
8874 static rtx
8875 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8876 const_tree type, bool named)
8878 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8879 machine_mode mode = omode;
8880 HOST_WIDE_INT bytes, words;
8881 rtx arg;
8883 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8885 gcc_assert (type != NULL_TREE);
8886 if (POINTER_TYPE_P (type))
8888 /* This is the pointer argument. */
8889 gcc_assert (TYPE_MODE (type) == Pmode);
8890 /* It is at -WORD(AP) in the current frame in interrupt and
8891 exception handlers. */
8892 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8894 else
8896 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8897 && TREE_CODE (type) == INTEGER_TYPE
8898 && TYPE_MODE (type) == word_mode);
8899 /* The error code is the word-mode integer argument at
8900 -2 * WORD(AP) in the current frame of the exception
8901 handler. */
8902 arg = gen_rtx_MEM (word_mode,
8903 plus_constant (Pmode,
8904 arg_pointer_rtx,
8905 -2 * UNITS_PER_WORD));
8907 return arg;
8910 /* All pointer bounds arguments are handled separately here. */
8911 if ((type && POINTER_BOUNDS_TYPE_P (type))
8912 || POINTER_BOUNDS_MODE_P (mode))
8914 /* Return NULL if bounds are forced to go in Bounds Table. */
8915 if (cum->bnds_in_bt)
8916 arg = NULL;
8917 /* Return the next available bound reg if any. */
8918 else if (cum->bnd_regno <= LAST_BND_REG)
8919 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8920 /* Return the next special slot number otherwise. */
8921 else
8922 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8924 return arg;
8927 if (mode == BLKmode)
8928 bytes = int_size_in_bytes (type);
8929 else
8930 bytes = GET_MODE_SIZE (mode);
8931 words = CEIL (bytes, UNITS_PER_WORD);
8933 /* To simplify the code below, represent vector types with a vector mode
8934 even if MMX/SSE are not active. */
8935 if (type && TREE_CODE (type) == VECTOR_TYPE)
8936 mode = type_natural_mode (type, cum, false);
8938 if (TARGET_64BIT)
8940 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8942 if (call_abi == MS_ABI)
8943 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8944 else
8945 arg = function_arg_64 (cum, mode, omode, type, named);
8947 else
8948 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8950 /* Track if there are outgoing arguments on stack. */
8951 if (arg == NULL_RTX && cum->caller)
8952 cfun->machine->outgoing_args_on_stack = true;
8954 return arg;
8957 /* A C expression that indicates when an argument must be passed by
8958 reference. If nonzero for an argument, a copy of that argument is
8959 made in memory and a pointer to the argument is passed instead of
8960 the argument itself. The pointer is passed in whatever way is
8961 appropriate for passing a pointer to that type. */
8963 static bool
8964 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8965 const_tree type, bool)
8967 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8969 /* Bounds are never passed by reference. */
8970 if ((type && POINTER_BOUNDS_TYPE_P (type))
8971 || POINTER_BOUNDS_MODE_P (mode))
8972 return false;
8974 if (TARGET_64BIT)
8976 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8978 /* See Windows x64 Software Convention. */
8979 if (call_abi == MS_ABI)
8981 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8983 if (type)
8985 /* Arrays are passed by reference. */
8986 if (TREE_CODE (type) == ARRAY_TYPE)
8987 return true;
8989 if (RECORD_OR_UNION_TYPE_P (type))
8991 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8992 are passed by reference. */
8993 msize = int_size_in_bytes (type);
8997 /* __m128 is passed by reference. */
8998 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
9000 else if (type && int_size_in_bytes (type) == -1)
9001 return true;
9004 return false;
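/* For example, under the 64-bit MS ABI a 16-byte struct or an __m128
   argument is passed by reference because its size is not 1, 2, 4 or 8
   bytes, whereas the SysV ABI never passes fixed-size aggregates by
   reference and instead lets classify_argument choose between
   registers and the stack.  */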
9007 /* Return true when TYPE should be 128bit aligned for 32bit argument
9008 passing ABI. XXX: This function is obsolete and is only used for
9009 checking psABI compatibility with previous versions of GCC. */
9011 static bool
9012 ix86_compat_aligned_value_p (const_tree type)
9014 machine_mode mode = TYPE_MODE (type);
9015 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
9016 || mode == TDmode
9017 || mode == TFmode
9018 || mode == TCmode)
9019 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
9020 return true;
9021 if (TYPE_ALIGN (type) < 128)
9022 return false;
9024 if (AGGREGATE_TYPE_P (type))
9026 /* Walk the aggregates recursively. */
9027 switch (TREE_CODE (type))
9029 case RECORD_TYPE:
9030 case UNION_TYPE:
9031 case QUAL_UNION_TYPE:
9033 tree field;
9035 /* Walk all the structure fields. */
9036 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9038 if (TREE_CODE (field) == FIELD_DECL
9039 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
9040 return true;
9042 break;
9045 case ARRAY_TYPE:
9046 /* Just in case some languages pass arrays by value. */
9047 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
9048 return true;
9049 break;
9051 default:
9052 gcc_unreachable ();
9055 return false;
9058 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
9059 XXX: This function is obsolete and is only used for checking psABI
9060 compatibility with previous versions of GCC. */
9062 static unsigned int
9063 ix86_compat_function_arg_boundary (machine_mode mode,
9064 const_tree type, unsigned int align)
9066 /* In 32bit, only _Decimal128 and __float128 are aligned to their
9067 natural boundaries. */
9068 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
9070 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
9071 make an exception for SSE modes since these require 128bit
9072 alignment.
9074 The handling here differs from field_alignment. ICC aligns MMX
9075 arguments to 4 byte boundaries, while structure fields are aligned
9076 to 8 byte boundaries. */
9077 if (!type)
9079 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
9080 align = PARM_BOUNDARY;
9082 else
9084 if (!ix86_compat_aligned_value_p (type))
9085 align = PARM_BOUNDARY;
9088 if (align > BIGGEST_ALIGNMENT)
9089 align = BIGGEST_ALIGNMENT;
9090 return align;
9093 /* Return true when TYPE should be 128bit aligned for 32bit argument
9094 passing ABI. */
9096 static bool
9097 ix86_contains_aligned_value_p (const_tree type)
9099 machine_mode mode = TYPE_MODE (type);
9101 if (mode == XFmode || mode == XCmode)
9102 return false;
9104 if (TYPE_ALIGN (type) < 128)
9105 return false;
9107 if (AGGREGATE_TYPE_P (type))
9109 /* Walk the aggregates recursively. */
9110 switch (TREE_CODE (type))
9112 case RECORD_TYPE:
9113 case UNION_TYPE:
9114 case QUAL_UNION_TYPE:
9116 tree field;
9118 /* Walk all the structure fields. */
9119 for (field = TYPE_FIELDS (type);
9120 field;
9121 field = DECL_CHAIN (field))
9123 if (TREE_CODE (field) == FIELD_DECL
9124 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
9125 return true;
9127 break;
9130 case ARRAY_TYPE:
9131 /* Just in case some languages pass arrays by value. */
9132 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
9133 return true;
9134 break;
9136 default:
9137 gcc_unreachable ();
9140 else
9141 return TYPE_ALIGN (type) >= 128;
9143 return false;
9146 /* Gives the alignment boundary, in bits, of an argument with the
9147 specified mode and type. */
9149 static unsigned int
9150 ix86_function_arg_boundary (machine_mode mode, const_tree type)
9152 unsigned int align;
9153 if (type)
9155 /* Since the main variant type is used for the call, convert TYPE
9156 to its main variant. */
9157 type = TYPE_MAIN_VARIANT (type);
9158 align = TYPE_ALIGN (type);
9159 if (TYPE_EMPTY_P (type))
9160 return PARM_BOUNDARY;
9162 else
9163 align = GET_MODE_ALIGNMENT (mode);
9164 if (align < PARM_BOUNDARY)
9165 align = PARM_BOUNDARY;
9166 else
9168 static bool warned;
9169 unsigned int saved_align = align;
9171 if (!TARGET_64BIT)
9173 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
9174 if (!type)
9176 if (mode == XFmode || mode == XCmode)
9177 align = PARM_BOUNDARY;
9179 else if (!ix86_contains_aligned_value_p (type))
9180 align = PARM_BOUNDARY;
9182 if (align < 128)
9183 align = PARM_BOUNDARY;
9186 if (warn_psabi
9187 && !warned
9188 && align != ix86_compat_function_arg_boundary (mode, type,
9189 saved_align))
9191 warned = true;
9192 inform (input_location,
9193 "The ABI for passing parameters with %d-byte"
9194 " alignment has changed in GCC 4.6",
9195 align / BITS_PER_UNIT);
9199 return align;
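/* As a rough illustration: on x86-64 a 32-byte aligned type such as an
   AVX vector keeps its 256-bit boundary here, while in 32-bit code most
   arguments are demoted to PARM_BOUNDARY (32 bits) and only types that
   really contain 128-bit aligned data keep the larger alignment; the
   -Wpsabi note above flags arguments whose boundary differs from what
   GCC releases before 4.6 used.  */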
9202 /* Return true if N is a possible register number of function value. */
9204 static bool
9205 ix86_function_value_regno_p (const unsigned int regno)
9207 switch (regno)
9209 case AX_REG:
9210 return true;
9211 case DX_REG:
9212 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9213 case DI_REG:
9214 case SI_REG:
9215 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9217 case BND0_REG:
9218 case BND1_REG:
9219 return chkp_function_instrumented_p (current_function_decl);
9221 /* Complex values are returned in %st(0)/%st(1) pair. */
9222 case ST0_REG:
9223 case ST1_REG:
9224 /* TODO: The function should depend on current function ABI but
9225 builtins.c would need updating then. Therefore we use the
9226 default ABI. */
9227 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9228 return false;
9229 return TARGET_FLOAT_RETURNS_IN_80387;
9231 /* Complex values are returned in %xmm0/%xmm1 pair. */
9232 case XMM0_REG:
9233 case XMM1_REG:
9234 return TARGET_SSE;
9236 case MM0_REG:
9237 if (TARGET_MACHO || TARGET_64BIT)
9238 return false;
9239 return TARGET_MMX;
9242 return false;
9245 /* Define how to find the value returned by a function.
9246 VALTYPE is the data type of the value (as a tree).
9247 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9248 otherwise, FUNC is 0. */
9250 static rtx
9251 function_value_32 (machine_mode orig_mode, machine_mode mode,
9252 const_tree fntype, const_tree fn)
9254 unsigned int regno;
9256 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9257 we normally prevent this case when mmx is not available. However
9258 some ABIs may require the result to be returned like DImode. */
9259 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9260 regno = FIRST_MMX_REG;
9262 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9263 we prevent this case when sse is not available. However some ABIs
9264 may require the result to be returned like integer TImode. */
9265 else if (mode == TImode
9266 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9267 regno = FIRST_SSE_REG;
9269 /* 32-byte vector modes in %ymm0. */
9270 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9271 regno = FIRST_SSE_REG;
9273 /* 64-byte vector modes in %zmm0. */
9274 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9275 regno = FIRST_SSE_REG;
9277 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9278 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9279 regno = FIRST_FLOAT_REG;
9280 else
9281 /* Most things go in %eax. */
9282 regno = AX_REG;
9284 /* Override FP return register with %xmm0 for local functions when
9285 SSE math is enabled or for functions with sseregparm attribute. */
9286 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9288 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9289 if (sse_level == -1)
9291 error ("calling %qD with SSE calling convention without "
9292 "SSE/SSE2 enabled", fn);
9293 sorry ("this is a GCC bug that can be worked around by adding "
9294 "attribute used to the called function");
9296 else if ((sse_level >= 1 && mode == SFmode)
9297 || (sse_level == 2 && mode == DFmode))
9298 regno = FIRST_SSE_REG;
9301 /* OImode shouldn't be used directly. */
9302 gcc_assert (mode != OImode);
9304 return gen_rtx_REG (orig_mode, regno);
9307 static rtx
9308 function_value_64 (machine_mode orig_mode, machine_mode mode,
9309 const_tree valtype)
9311 rtx ret;
9313 /* Handle libcalls, which don't provide a type node. */
9314 if (valtype == NULL)
9316 unsigned int regno;
9318 switch (mode)
9320 case E_SFmode:
9321 case E_SCmode:
9322 case E_DFmode:
9323 case E_DCmode:
9324 case E_TFmode:
9325 case E_SDmode:
9326 case E_DDmode:
9327 case E_TDmode:
9328 regno = FIRST_SSE_REG;
9329 break;
9330 case E_XFmode:
9331 case E_XCmode:
9332 regno = FIRST_FLOAT_REG;
9333 break;
9334 case E_TCmode:
9335 return NULL;
9336 default:
9337 regno = AX_REG;
9340 return gen_rtx_REG (mode, regno);
9342 else if (POINTER_TYPE_P (valtype))
9344 /* Pointers are always returned in word_mode. */
9345 mode = word_mode;
9348 ret = construct_container (mode, orig_mode, valtype, 1,
9349 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9350 x86_64_int_return_registers, 0);
9352 /* For zero sized structures, construct_container returns NULL, but we
9353 need to keep the rest of the compiler happy by returning a meaningful value. */
9354 if (!ret)
9355 ret = gen_rtx_REG (orig_mode, AX_REG);
9357 return ret;
9360 static rtx
9361 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9362 const_tree valtype)
9364 unsigned int regno = AX_REG;
9366 if (TARGET_SSE)
9368 switch (GET_MODE_SIZE (mode))
9370 case 16:
9371 if (valtype != NULL_TREE
9372 && !VECTOR_INTEGER_TYPE_P (valtype)
9374 && !INTEGRAL_TYPE_P (valtype)
9375 && !VECTOR_FLOAT_TYPE_P (valtype))
9376 break;
9377 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9378 && !COMPLEX_MODE_P (mode))
9379 regno = FIRST_SSE_REG;
9380 break;
9381 case 8:
9382 case 4:
9383 if (mode == SFmode || mode == DFmode)
9384 regno = FIRST_SSE_REG;
9385 break;
9386 default:
9387 break;
9390 return gen_rtx_REG (orig_mode, regno);
9393 static rtx
9394 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9395 machine_mode orig_mode, machine_mode mode)
9397 const_tree fn, fntype;
9399 fn = NULL_TREE;
9400 if (fntype_or_decl && DECL_P (fntype_or_decl))
9401 fn = fntype_or_decl;
9402 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9404 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9405 || POINTER_BOUNDS_MODE_P (mode))
9406 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9407 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9408 return function_value_ms_64 (orig_mode, mode, valtype);
9409 else if (TARGET_64BIT)
9410 return function_value_64 (orig_mode, mode, valtype);
9411 else
9412 return function_value_32 (orig_mode, mode, fntype, fn);
9415 static rtx
9416 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9418 machine_mode mode, orig_mode;
9420 orig_mode = TYPE_MODE (valtype);
9421 mode = type_natural_mode (valtype, NULL, true);
9422 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9425 /* Return an RTX representing a place where a function returns
9426 or receives pointer bounds or NULL if no bounds are returned.
9428 VALTYPE is a data type of a value returned by the function.
9430 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9431 or FUNCTION_TYPE of the function.
9433 If OUTGOING is false, return a place in which the caller will
9434 see the return value. Otherwise, return a place where a
9435 function returns a value. */
9437 static rtx
9438 ix86_function_value_bounds (const_tree valtype,
9439 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9440 bool outgoing ATTRIBUTE_UNUSED)
9442 rtx res = NULL_RTX;
9444 if (BOUNDED_TYPE_P (valtype))
9445 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9446 else if (chkp_type_has_pointer (valtype))
9448 bitmap slots;
9449 rtx bounds[2];
9450 bitmap_iterator bi;
9451 unsigned i, bnd_no = 0;
9453 bitmap_obstack_initialize (NULL);
9454 slots = BITMAP_ALLOC (NULL);
9455 chkp_find_bound_slots (valtype, slots);
9457 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9459 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9460 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9461 gcc_assert (bnd_no < 2);
9462 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9465 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9467 BITMAP_FREE (slots);
9468 bitmap_obstack_release (NULL);
9470 else
9471 res = NULL_RTX;
9473 return res;
9476 /* Pointer function arguments and return values are promoted to
9477 word_mode for normal functions. */
9479 static machine_mode
9480 ix86_promote_function_mode (const_tree type, machine_mode mode,
9481 int *punsignedp, const_tree fntype,
9482 int for_return)
9484 if (cfun->machine->func_type == TYPE_NORMAL
9485 && type != NULL_TREE
9486 && POINTER_TYPE_P (type))
9488 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9489 return word_mode;
9491 return default_promote_function_mode (type, mode, punsignedp, fntype,
9492 for_return);
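/* Rough example of the promotion above: with -m64, a 'char *' argument or
   return value of a TYPE_NORMAL function is widened to DImode (word_mode)
   with unsigned pointer extension; interrupt and exception handlers fall
   through to the default promotion.  */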
9495 /* Return true if a structure, union or array with MODE containing FIELD
9496 should be accessed using BLKmode. */
9498 static bool
9499 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9501 /* Union with XFmode must be in BLKmode. */
9502 return (mode == XFmode
9503 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9504 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9508 ix86_libcall_value (machine_mode mode)
9510 return ix86_function_value_1 (NULL, NULL, mode, mode);
9513 /* Return true iff type is returned in memory. */
9515 static bool
9516 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9518 #ifdef SUBTARGET_RETURN_IN_MEMORY
9519 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9520 #else
9521 const machine_mode mode = type_natural_mode (type, NULL, true);
9522 HOST_WIDE_INT size;
9524 if (POINTER_BOUNDS_TYPE_P (type))
9525 return false;
9527 if (TARGET_64BIT)
9529 if (ix86_function_type_abi (fntype) == MS_ABI)
9531 size = int_size_in_bytes (type);
9533 /* __m128 is returned in xmm0. */
9534 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9535 || INTEGRAL_TYPE_P (type)
9536 || VECTOR_FLOAT_TYPE_P (type))
9537 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9538 && !COMPLEX_MODE_P (mode)
9539 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9540 return false;
9542 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
9543 return size != 1 && size != 2 && size != 4 && size != 8;
9545 else
9547 int needed_intregs, needed_sseregs;
9549 return examine_argument (mode, type, 1,
9550 &needed_intregs, &needed_sseregs);
9553 else
9555 size = int_size_in_bytes (type);
9557 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9558 bytes in registers. */
9559 if (TARGET_IAMCU)
9560 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9562 if (mode == BLKmode)
9563 return true;
9565 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9566 return false;
9568 if (VECTOR_MODE_P (mode) || mode == TImode)
9570 /* User-created vectors small enough to fit in EAX. */
9571 if (size < 8)
9572 return false;
9574 /* Unless the ABI prescribes otherwise,
9575 MMX/3dNow values are returned in MM0 if available. */
9577 if (size == 8)
9578 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9580 /* SSE values are returned in XMM0 if available. */
9581 if (size == 16)
9582 return !TARGET_SSE;
9584 /* AVX values are returned in YMM0 if available. */
9585 if (size == 32)
9586 return !TARGET_AVX;
9588 /* AVX512F values are returned in ZMM0 if available. */
9589 if (size == 64)
9590 return !TARGET_AVX512F;
9593 if (mode == XFmode)
9594 return false;
9596 if (size > 12)
9597 return true;
9599 /* OImode shouldn't be used directly. */
9600 gcc_assert (mode != OImode);
9602 return false;
9604 #endif
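/* A few examples of the rules above, for orientation: in 32-bit mode a
   16-byte BLKmode struct is returned in memory, while a 16-byte vector
   (__m128) is returned in %xmm0 when SSE is enabled; under the 64-bit
   MS ABI only values of exactly 1, 2, 4 or 8 bytes (plus the 16-byte
   vector case handled above) are returned in registers.  */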
9608 /* Create the va_list data type. */
9610 static tree
9611 ix86_build_builtin_va_list_64 (void)
9613 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9615 record = lang_hooks.types.make_type (RECORD_TYPE);
9616 type_decl = build_decl (BUILTINS_LOCATION,
9617 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9619 f_gpr = build_decl (BUILTINS_LOCATION,
9620 FIELD_DECL, get_identifier ("gp_offset"),
9621 unsigned_type_node);
9622 f_fpr = build_decl (BUILTINS_LOCATION,
9623 FIELD_DECL, get_identifier ("fp_offset"),
9624 unsigned_type_node);
9625 f_ovf = build_decl (BUILTINS_LOCATION,
9626 FIELD_DECL, get_identifier ("overflow_arg_area"),
9627 ptr_type_node);
9628 f_sav = build_decl (BUILTINS_LOCATION,
9629 FIELD_DECL, get_identifier ("reg_save_area"),
9630 ptr_type_node);
9632 va_list_gpr_counter_field = f_gpr;
9633 va_list_fpr_counter_field = f_fpr;
9635 DECL_FIELD_CONTEXT (f_gpr) = record;
9636 DECL_FIELD_CONTEXT (f_fpr) = record;
9637 DECL_FIELD_CONTEXT (f_ovf) = record;
9638 DECL_FIELD_CONTEXT (f_sav) = record;
9640 TYPE_STUB_DECL (record) = type_decl;
9641 TYPE_NAME (record) = type_decl;
9642 TYPE_FIELDS (record) = f_gpr;
9643 DECL_CHAIN (f_gpr) = f_fpr;
9644 DECL_CHAIN (f_fpr) = f_ovf;
9645 DECL_CHAIN (f_ovf) = f_sav;
9647 layout_type (record);
9649 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9650 NULL_TREE, TYPE_ATTRIBUTES (record));
9652 /* The correct type is an array type of one element. */
9653 return build_array_type (record, build_index_type (size_zero_node));
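/* For reference, the record built above corresponds roughly to the
   C-level SysV x86-64 va_list declaration:

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];  */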
9656 /* Setup the builtin va_list data type and for 64-bit the additional
9657 calling convention specific va_list data types. */
9659 static tree
9660 ix86_build_builtin_va_list (void)
9662 if (TARGET_64BIT)
9664 /* Initialize ABI specific va_list builtin types.
9666 In lto1, we can encounter two va_list types:
9667 - one as a result of the type-merge across TUs, and
9668 - the one constructed here.
9669 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9670 a type identity check in canonical_va_list_type based on
9671 TYPE_MAIN_VARIANT (which we used to have) will not work.
9672 Instead, we tag each va_list_type_node with its unique attribute, and
9673 look for the attribute in the type identity check in
9674 canonical_va_list_type.
9676 Tagging sysv_va_list_type_node directly with the attribute is
9677 problematic since it's an array of one record, which will degrade into a
9678 pointer to record when used as parameter (see build_va_arg comments for
9679 an example), dropping the attribute in the process. So we tag the
9680 record instead. */
9682 /* For SYSV_ABI we use an array of one record. */
9683 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9685 /* For MS_ABI we use plain pointer to argument area. */
9686 tree char_ptr_type = build_pointer_type (char_type_node);
9687 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9688 TYPE_ATTRIBUTES (char_ptr_type));
9689 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9691 return ((ix86_abi == MS_ABI)
9692 ? ms_va_list_type_node
9693 : sysv_va_list_type_node);
9695 else
9697 /* For i386 we use plain pointer to argument area. */
9698 return build_pointer_type (char_type_node);
9702 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
9704 static void
9705 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9707 rtx save_area, mem;
9708 alias_set_type set;
9709 int i, max;
9711 /* GPR size of varargs save area. */
9712 if (cfun->va_list_gpr_size)
9713 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9714 else
9715 ix86_varargs_gpr_size = 0;
9717 /* FPR size of varargs save area. We don't need it if we don't pass
9718 anything in SSE registers. */
9719 if (TARGET_SSE && cfun->va_list_fpr_size)
9720 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9721 else
9722 ix86_varargs_fpr_size = 0;
9724 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9725 return;
9727 save_area = frame_pointer_rtx;
9728 set = get_varargs_alias_set ();
9730 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9731 if (max > X86_64_REGPARM_MAX)
9732 max = X86_64_REGPARM_MAX;
9734 for (i = cum->regno; i < max; i++)
9736 mem = gen_rtx_MEM (word_mode,
9737 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9738 MEM_NOTRAP_P (mem) = 1;
9739 set_mem_alias_set (mem, set);
9740 emit_move_insn (mem,
9741 gen_rtx_REG (word_mode,
9742 x86_64_int_parameter_registers[i]));
9745 if (ix86_varargs_fpr_size)
9747 machine_mode smode;
9748 rtx_code_label *label;
9749 rtx test;
9751 /* Now emit code to save SSE registers. The AX parameter contains number
9752 of SSE parameter registers used to call this function, though all we
9753 actually check here is the zero/non-zero status. */
9755 label = gen_label_rtx ();
9756 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9757 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9758 label));
9760 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9761 we used movdqa (i.e. TImode) instead? Perhaps even better would
9762 be if we could determine the real mode of the data, via a hook
9763 into pass_stdarg. Ignore all that for now. */
9764 smode = V4SFmode;
9765 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9766 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9768 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9769 if (max > X86_64_SSE_REGPARM_MAX)
9770 max = X86_64_SSE_REGPARM_MAX;
9772 for (i = cum->sse_regno; i < max; ++i)
9774 mem = plus_constant (Pmode, save_area,
9775 i * 16 + ix86_varargs_gpr_size);
9776 mem = gen_rtx_MEM (smode, mem);
9777 MEM_NOTRAP_P (mem) = 1;
9778 set_mem_alias_set (mem, set);
9779 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9781 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9784 emit_label (label);
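/* Sketch of the register save area laid out above (assuming the full GPR
   area is allocated; X86_64_REGPARM_MAX is 6, X86_64_SSE_REGPARM_MAX is 8):

     offset   0 ..  47   six 8-byte GPR slots (rdi, rsi, rdx, rcx, r8, r9)
     offset  48 .. 175   eight 16-byte SSE slots (xmm0 .. xmm7)

   The gp_offset and fp_offset fields of the va_list index into this
   block.  */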
9788 static void
9789 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9791 alias_set_type set = get_varargs_alias_set ();
9792 int i;
9794 /* Reset to zero, as there might be a sysv vaarg used
9795 before. */
9796 ix86_varargs_gpr_size = 0;
9797 ix86_varargs_fpr_size = 0;
9799 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9801 rtx reg, mem;
9803 mem = gen_rtx_MEM (Pmode,
9804 plus_constant (Pmode, virtual_incoming_args_rtx,
9805 i * UNITS_PER_WORD));
9806 MEM_NOTRAP_P (mem) = 1;
9807 set_mem_alias_set (mem, set);
9809 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9810 emit_move_insn (mem, reg);
9814 static void
9815 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9816 tree type, int *, int no_rtl)
9818 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9819 CUMULATIVE_ARGS next_cum;
9820 tree fntype;
9822 /* This argument doesn't appear to be used anymore. Which is good,
9823 because the old code here didn't suppress rtl generation. */
9824 gcc_assert (!no_rtl);
9826 if (!TARGET_64BIT)
9827 return;
9829 fntype = TREE_TYPE (current_function_decl);
9831 /* For varargs, we do not want to skip the dummy va_dcl argument.
9832 For stdargs, we do want to skip the last named argument. */
9833 next_cum = *cum;
9834 if (stdarg_p (fntype))
9835 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9836 true);
9838 if (cum->call_abi == MS_ABI)
9839 setup_incoming_varargs_ms_64 (&next_cum);
9840 else
9841 setup_incoming_varargs_64 (&next_cum);
9844 static void
9845 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9846 machine_mode mode,
9847 tree type,
9848 int *pretend_size ATTRIBUTE_UNUSED,
9849 int no_rtl)
9851 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9852 CUMULATIVE_ARGS next_cum;
9853 tree fntype;
9854 rtx save_area;
9855 int bnd_reg, i, max;
9857 gcc_assert (!no_rtl);
9859 /* Do nothing if we use plain pointer to argument area. */
9860 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9861 return;
9863 fntype = TREE_TYPE (current_function_decl);
9865 /* For varargs, we do not want to skip the dummy va_dcl argument.
9866 For stdargs, we do want to skip the last named argument. */
9867 next_cum = *cum;
9868 if (stdarg_p (fntype))
9869 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9870 true);
9871 save_area = frame_pointer_rtx;
9873 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9874 if (max > X86_64_REGPARM_MAX)
9875 max = X86_64_REGPARM_MAX;
9877 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9878 if (chkp_function_instrumented_p (current_function_decl))
9879 for (i = cum->regno; i < max; i++)
9881 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9882 rtx ptr = gen_rtx_REG (Pmode,
9883 x86_64_int_parameter_registers[i]);
9884 rtx bounds;
9886 if (bnd_reg <= LAST_BND_REG)
9887 bounds = gen_rtx_REG (BNDmode, bnd_reg);
9888 else
9890 rtx ldx_addr =
9891 plus_constant (Pmode, arg_pointer_rtx,
9892 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9893 bounds = gen_reg_rtx (BNDmode);
9894 emit_insn (BNDmode == BND64mode
9895 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9896 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9899 emit_insn (BNDmode == BND64mode
9900 ? gen_bnd64_stx (addr, ptr, bounds)
9901 : gen_bnd32_stx (addr, ptr, bounds));
9903 bnd_reg++;
9908 /* Checks if TYPE is of kind va_list char *. */
9910 static bool
9911 is_va_list_char_pointer (tree type)
9913 tree canonic;
9915 /* For 32-bit it is always true. */
9916 if (!TARGET_64BIT)
9917 return true;
9918 canonic = ix86_canonical_va_list_type (type);
9919 return (canonic == ms_va_list_type_node
9920 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9923 /* Implement va_start. */
9925 static void
9926 ix86_va_start (tree valist, rtx nextarg)
9928 HOST_WIDE_INT words, n_gpr, n_fpr;
9929 tree f_gpr, f_fpr, f_ovf, f_sav;
9930 tree gpr, fpr, ovf, sav, t;
9931 tree type;
9932 rtx ovf_rtx;
9934 if (flag_split_stack
9935 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9937 unsigned int scratch_regno;
9939 /* When we are splitting the stack, we can't refer to the stack
9940 arguments using internal_arg_pointer, because they may be on
9941 the old stack. The split stack prologue will arrange to
9942 leave a pointer to the old stack arguments in a scratch
9943 register, which we here copy to a pseudo-register. The split
9944 stack prologue can't set the pseudo-register directly because
9945 it (the prologue) runs before any registers have been saved. */
9947 scratch_regno = split_stack_prologue_scratch_regno ();
9948 if (scratch_regno != INVALID_REGNUM)
9950 rtx reg;
9951 rtx_insn *seq;
9953 reg = gen_reg_rtx (Pmode);
9954 cfun->machine->split_stack_varargs_pointer = reg;
9956 start_sequence ();
9957 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9958 seq = get_insns ();
9959 end_sequence ();
9961 push_topmost_sequence ();
9962 emit_insn_after (seq, entry_of_function ());
9963 pop_topmost_sequence ();
9967 /* Only 64bit target needs something special. */
9968 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9970 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9971 std_expand_builtin_va_start (valist, nextarg);
9972 else
9974 rtx va_r, next;
9976 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9977 next = expand_binop (ptr_mode, add_optab,
9978 cfun->machine->split_stack_varargs_pointer,
9979 crtl->args.arg_offset_rtx,
9980 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9981 convert_move (va_r, next, 0);
9983 /* Store zero bounds for va_list. */
9984 if (chkp_function_instrumented_p (current_function_decl))
9985 chkp_expand_bounds_reset_for_mem (valist,
9986 make_tree (TREE_TYPE (valist),
9987 next));
9990 return;
9993 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9994 f_fpr = DECL_CHAIN (f_gpr);
9995 f_ovf = DECL_CHAIN (f_fpr);
9996 f_sav = DECL_CHAIN (f_ovf);
9998 valist = build_simple_mem_ref (valist);
9999 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
10000 /* The following should be folded into the MEM_REF offset. */
10001 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
10002 f_gpr, NULL_TREE);
10003 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
10004 f_fpr, NULL_TREE);
10005 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
10006 f_ovf, NULL_TREE);
10007 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
10008 f_sav, NULL_TREE);
10010 /* Count number of gp and fp argument registers used. */
10011 words = crtl->args.info.words;
10012 n_gpr = crtl->args.info.regno;
10013 n_fpr = crtl->args.info.sse_regno;
10015 if (cfun->va_list_gpr_size)
10017 type = TREE_TYPE (gpr);
10018 t = build2 (MODIFY_EXPR, type,
10019 gpr, build_int_cst (type, n_gpr * 8));
10020 TREE_SIDE_EFFECTS (t) = 1;
10021 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10024 if (TARGET_SSE && cfun->va_list_fpr_size)
10026 type = TREE_TYPE (fpr);
10027 t = build2 (MODIFY_EXPR, type, fpr,
10028 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
10029 TREE_SIDE_EFFECTS (t) = 1;
10030 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10033 /* Find the overflow area. */
10034 type = TREE_TYPE (ovf);
10035 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
10036 ovf_rtx = crtl->args.internal_arg_pointer;
10037 else
10038 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
10039 t = make_tree (type, ovf_rtx);
10040 if (words != 0)
10041 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
10043 /* Store zero bounds for overflow area pointer. */
10044 if (chkp_function_instrumented_p (current_function_decl))
10045 chkp_expand_bounds_reset_for_mem (ovf, t);
10047 t = build2 (MODIFY_EXPR, type, ovf, t);
10048 TREE_SIDE_EFFECTS (t) = 1;
10049 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10051 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
10053 /* Find the register save area.
10054 The function prologue saves it right above the stack frame. */
10055 type = TREE_TYPE (sav);
10056 t = make_tree (type, frame_pointer_rtx);
10057 if (!ix86_varargs_gpr_size)
10058 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
10060 /* Store zero bounds for save area pointer. */
10061 if (chkp_function_instrumented_p (current_function_decl))
10062 chkp_expand_bounds_reset_for_mem (sav, t);
10064 t = build2 (MODIFY_EXPR, type, sav, t);
10065 TREE_SIDE_EFFECTS (t) = 1;
10066 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
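/* Rough illustration of the initialization above: for int f (int a, ...)
   one named integer argument is consumed, so gp_offset starts at 8,
   fp_offset starts at 8 * X86_64_REGPARM_MAX (48), overflow_arg_area
   points just past any named stack arguments, and reg_save_area points
   at the block saved by setup_incoming_varargs_64.  */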
10070 /* Implement va_arg. */
10072 static tree
10073 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
10074 gimple_seq *post_p)
10076 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
10077 tree f_gpr, f_fpr, f_ovf, f_sav;
10078 tree gpr, fpr, ovf, sav, t;
10079 int size, rsize;
10080 tree lab_false, lab_over = NULL_TREE;
10081 tree addr, t2;
10082 rtx container;
10083 int indirect_p = 0;
10084 tree ptrtype;
10085 machine_mode nat_mode;
10086 unsigned int arg_boundary;
10088 /* Only 64bit target needs something special. */
10089 if (is_va_list_char_pointer (TREE_TYPE (valist)))
10090 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
10092 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
10093 f_fpr = DECL_CHAIN (f_gpr);
10094 f_ovf = DECL_CHAIN (f_fpr);
10095 f_sav = DECL_CHAIN (f_ovf);
10097 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
10098 valist, f_gpr, NULL_TREE);
10100 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
10101 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
10102 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
10104 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10105 if (indirect_p)
10106 type = build_pointer_type (type);
10107 size = arg_int_size_in_bytes (type);
10108 rsize = CEIL (size, UNITS_PER_WORD);
10110 nat_mode = type_natural_mode (type, NULL, false);
10111 switch (nat_mode)
10113 case E_V8SFmode:
10114 case E_V8SImode:
10115 case E_V32QImode:
10116 case E_V16HImode:
10117 case E_V4DFmode:
10118 case E_V4DImode:
10119 case E_V16SFmode:
10120 case E_V16SImode:
10121 case E_V64QImode:
10122 case E_V32HImode:
10123 case E_V8DFmode:
10124 case E_V8DImode:
10125 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
10126 if (!TARGET_64BIT_MS_ABI)
10128 container = NULL;
10129 break;
10131 /* FALLTHRU */
10133 default:
10134 container = construct_container (nat_mode, TYPE_MODE (type),
10135 type, 0, X86_64_REGPARM_MAX,
10136 X86_64_SSE_REGPARM_MAX, intreg,
10138 break;
10141 /* Pull the value out of the saved registers. */
10143 addr = create_tmp_var (ptr_type_node, "addr");
10145 if (container)
10147 int needed_intregs, needed_sseregs;
10148 bool need_temp;
10149 tree int_addr, sse_addr;
10151 lab_false = create_artificial_label (UNKNOWN_LOCATION);
10152 lab_over = create_artificial_label (UNKNOWN_LOCATION);
10154 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
10156 need_temp = (!REG_P (container)
10157 && ((needed_intregs && TYPE_ALIGN (type) > 64)
10158 || TYPE_ALIGN (type) > 128));
10160 /* In case we are passing a structure, verify that it is a consecutive
10161 block on the register save area. If not, we need to do moves. */
10162 if (!need_temp && !REG_P (container))
10164 /* Verify that all registers are strictly consecutive */
10165 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
10167 int i;
10169 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10171 rtx slot = XVECEXP (container, 0, i);
10172 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
10173 || INTVAL (XEXP (slot, 1)) != i * 16)
10174 need_temp = true;
10177 else
10179 int i;
10181 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10183 rtx slot = XVECEXP (container, 0, i);
10184 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
10185 || INTVAL (XEXP (slot, 1)) != i * 8)
10186 need_temp = true;
10190 if (!need_temp)
10192 int_addr = addr;
10193 sse_addr = addr;
10195 else
10197 int_addr = create_tmp_var (ptr_type_node, "int_addr");
10198 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10201 /* First ensure that we fit completely in registers. */
10202 if (needed_intregs)
10204 t = build_int_cst (TREE_TYPE (gpr),
10205 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10206 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10207 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10208 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10209 gimplify_and_add (t, pre_p);
10211 if (needed_sseregs)
10213 t = build_int_cst (TREE_TYPE (fpr),
10214 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10215 + X86_64_REGPARM_MAX * 8);
10216 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10217 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10218 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10219 gimplify_and_add (t, pre_p);
10222 /* Compute index to start of area used for integer regs. */
10223 if (needed_intregs)
10225 /* int_addr = gpr + sav; */
10226 t = fold_build_pointer_plus (sav, gpr);
10227 gimplify_assign (int_addr, t, pre_p);
10229 if (needed_sseregs)
10231 /* sse_addr = fpr + sav; */
10232 t = fold_build_pointer_plus (sav, fpr);
10233 gimplify_assign (sse_addr, t, pre_p);
10235 if (need_temp)
10237 int i, prev_size = 0;
10238 tree temp = create_tmp_var (type, "va_arg_tmp");
10240 /* addr = &temp; */
10241 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10242 gimplify_assign (addr, t, pre_p);
10244 for (i = 0; i < XVECLEN (container, 0); i++)
10246 rtx slot = XVECEXP (container, 0, i);
10247 rtx reg = XEXP (slot, 0);
10248 machine_mode mode = GET_MODE (reg);
10249 tree piece_type;
10250 tree addr_type;
10251 tree daddr_type;
10252 tree src_addr, src;
10253 int src_offset;
10254 tree dest_addr, dest;
10255 int cur_size = GET_MODE_SIZE (mode);
10257 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10258 prev_size = INTVAL (XEXP (slot, 1));
10259 if (prev_size + cur_size > size)
10261 cur_size = size - prev_size;
10262 unsigned int nbits = cur_size * BITS_PER_UNIT;
10263 if (!int_mode_for_size (nbits, 1).exists (&mode))
10264 mode = QImode;
10266 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10267 if (mode == GET_MODE (reg))
10268 addr_type = build_pointer_type (piece_type);
10269 else
10270 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10271 true);
10272 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10273 true);
10275 if (SSE_REGNO_P (REGNO (reg)))
10277 src_addr = sse_addr;
10278 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10280 else
10282 src_addr = int_addr;
10283 src_offset = REGNO (reg) * 8;
10285 src_addr = fold_convert (addr_type, src_addr);
10286 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10288 dest_addr = fold_convert (daddr_type, addr);
10289 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10290 if (cur_size == GET_MODE_SIZE (mode))
10292 src = build_va_arg_indirect_ref (src_addr);
10293 dest = build_va_arg_indirect_ref (dest_addr);
10295 gimplify_assign (dest, src, pre_p);
10297 else
10299 tree copy
10300 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10301 3, dest_addr, src_addr,
10302 size_int (cur_size));
10303 gimplify_and_add (copy, pre_p);
10305 prev_size += cur_size;
10309 if (needed_intregs)
10311 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10312 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10313 gimplify_assign (gpr, t, pre_p);
10316 if (needed_sseregs)
10318 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10319 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10320 gimplify_assign (unshare_expr (fpr), t, pre_p);
10323 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10325 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10328 /* ... otherwise out of the overflow area. */
10330 /* When we align a parameter on the stack for the caller, if the
10331 parameter alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will
10332 be aligned to MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
10333 with the caller. */
10334 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10335 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10336 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10338 /* Care for on-stack alignment if needed. */
10339 if (arg_boundary <= 64 || size == 0)
10340 t = ovf;
10341 else
10343 HOST_WIDE_INT align = arg_boundary / 8;
10344 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10345 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10346 build_int_cst (TREE_TYPE (t), -align));
10349 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10350 gimplify_assign (addr, t, pre_p);
10352 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10353 gimplify_assign (unshare_expr (ovf), t, pre_p);
10355 if (container)
10356 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10358 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10359 addr = fold_convert (ptrtype, addr);
10361 if (indirect_p)
10362 addr = build_va_arg_indirect_ref (addr);
10363 return build_va_arg_indirect_ref (addr);
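/* For orientation only, the sequence gimplified above for a plain
   va_arg (ap, int) is roughly:

     if (ap.gp_offset >= 48) goto lab_false;
     addr = ap.reg_save_area + ap.gp_offset;
     ap.gp_offset += 8;
     goto lab_over;
   lab_false:
     addr = ap.overflow_arg_area;   /+ aligned first if the type needs it +/
     ap.overflow_arg_area += 8;
   lab_over:
     result = *(int *) addr;  */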
10366 /* Return true if OPNUM's MEM should be matched
10367 in movabs* patterns. */
10369 bool
10370 ix86_check_movabs (rtx insn, int opnum)
10372 rtx set, mem;
10374 set = PATTERN (insn);
10375 if (GET_CODE (set) == PARALLEL)
10376 set = XVECEXP (set, 0, 0);
10377 gcc_assert (GET_CODE (set) == SET);
10378 mem = XEXP (set, opnum);
10379 while (SUBREG_P (mem))
10380 mem = SUBREG_REG (mem);
10381 gcc_assert (MEM_P (mem));
10382 return volatile_ok || !MEM_VOLATILE_P (mem);
10385 /* Return false if INSN contains a MEM with a non-default address space. */
10386 bool
10387 ix86_check_no_addr_space (rtx insn)
10389 subrtx_var_iterator::array_type array;
10390 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10392 rtx x = *iter;
10393 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10394 return false;
10396 return true;
10399 /* Initialize the table of extra 80387 mathematical constants. */
10401 static void
10402 init_ext_80387_constants (void)
10404 static const char * cst[5] =
10406 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10407 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10408 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10409 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10410 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10412 int i;
10414 for (i = 0; i < 5; i++)
10416 real_from_string (&ext_80387_constants_table[i], cst[i]);
10417 /* Ensure each constant is rounded to XFmode precision. */
10418 real_convert (&ext_80387_constants_table[i],
10419 XFmode, &ext_80387_constants_table[i]);
10422 ext_80387_constants_init = 1;
10425 /* Return non-zero if the constant is something that
10426 can be loaded with a special instruction. */
10429 standard_80387_constant_p (rtx x)
10431 machine_mode mode = GET_MODE (x);
10433 const REAL_VALUE_TYPE *r;
10435 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10436 return -1;
10438 if (x == CONST0_RTX (mode))
10439 return 1;
10440 if (x == CONST1_RTX (mode))
10441 return 2;
10443 r = CONST_DOUBLE_REAL_VALUE (x);
10445 /* For XFmode constants, try to find a special 80387 instruction when
10446 optimizing for size or on those CPUs that benefit from them. */
10447 if (mode == XFmode
10448 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10450 int i;
10452 if (! ext_80387_constants_init)
10453 init_ext_80387_constants ();
10455 for (i = 0; i < 5; i++)
10456 if (real_identical (r, &ext_80387_constants_table[i]))
10457 return i + 3;
10460 /* Load of the constant -0.0 or -1.0 will be split as
10461 fldz;fchs or fld1;fchs sequence. */
10462 if (real_isnegzero (r))
10463 return 8;
10464 if (real_identical (r, &dconstm1))
10465 return 9;
10467 return 0;
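/* Summary of the encoding used above: 0.0 -> 1 (fldz), 1.0 -> 2 (fld1),
   the five ext_80387_constants_table entries -> 3..7, -0.0 -> 8 and
   -1.0 -> 9 (both split into a load plus fchs); the corresponding
   mnemonics are produced by standard_80387_constant_opcode below.  */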
10470 /* Return the opcode of the special instruction to be used to load
10471 the constant X. */
10473 const char *
10474 standard_80387_constant_opcode (rtx x)
10476 switch (standard_80387_constant_p (x))
10478 case 1:
10479 return "fldz";
10480 case 2:
10481 return "fld1";
10482 case 3:
10483 return "fldlg2";
10484 case 4:
10485 return "fldln2";
10486 case 5:
10487 return "fldl2e";
10488 case 6:
10489 return "fldl2t";
10490 case 7:
10491 return "fldpi";
10492 case 8:
10493 case 9:
10494 return "#";
10495 default:
10496 gcc_unreachable ();
10500 /* Return the CONST_DOUBLE representing the 80387 constant that is
10501 loaded by the specified special instruction. The argument IDX
10502 matches the return value from standard_80387_constant_p. */
10505 standard_80387_constant_rtx (int idx)
10507 int i;
10509 if (! ext_80387_constants_init)
10510 init_ext_80387_constants ();
10512 switch (idx)
10514 case 3:
10515 case 4:
10516 case 5:
10517 case 6:
10518 case 7:
10519 i = idx - 3;
10520 break;
10522 default:
10523 gcc_unreachable ();
10526 return const_double_from_real_value (ext_80387_constants_table[i],
10527 XFmode);
10530 /* Return 1 if X is all-zero bits and 2 if X is all-one bits
10531 in a supported SSE/AVX vector mode. */
10534 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10536 machine_mode mode;
10538 if (!TARGET_SSE)
10539 return 0;
10541 mode = GET_MODE (x);
10543 if (x == const0_rtx || const0_operand (x, mode))
10544 return 1;
10546 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10548 /* VOIDmode integer constant, get mode from the predicate. */
10549 if (mode == VOIDmode)
10550 mode = pred_mode;
10552 switch (GET_MODE_SIZE (mode))
10554 case 64:
10555 if (TARGET_AVX512F)
10556 return 2;
10557 break;
10558 case 32:
10559 if (TARGET_AVX2)
10560 return 2;
10561 break;
10562 case 16:
10563 if (TARGET_SSE2)
10564 return 2;
10565 break;
10566 case 0:
10567 /* VOIDmode */
10568 gcc_unreachable ();
10569 default:
10570 break;
10574 return 0;
10577 /* Return the opcode of the special instruction to be used to load
10578 the constant operands[1] into operands[0]. */
10580 const char *
10581 standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
10583 machine_mode mode;
10584 rtx x = operands[1];
10586 gcc_assert (TARGET_SSE);
10588 mode = GET_MODE (x);
10590 if (x == const0_rtx || const0_operand (x, mode))
10592 switch (get_attr_mode (insn))
10594 case MODE_TI:
10595 if (!EXT_REX_SSE_REG_P (operands[0]))
10596 return "%vpxor\t%0, %d0";
10597 /* FALLTHRU */
10598 case MODE_XI:
10599 case MODE_OI:
10600 if (EXT_REX_SSE_REG_P (operands[0]))
10601 return (TARGET_AVX512VL
10602 ? "vpxord\t%x0, %x0, %x0"
10603 : "vpxord\t%g0, %g0, %g0");
10604 return "vpxor\t%x0, %x0, %x0";
10606 case MODE_V2DF:
10607 if (!EXT_REX_SSE_REG_P (operands[0]))
10608 return "%vxorpd\t%0, %d0";
10609 /* FALLTHRU */
10610 case MODE_V8DF:
10611 case MODE_V4DF:
10612 if (!EXT_REX_SSE_REG_P (operands[0]))
10613 return "vxorpd\t%x0, %x0, %x0";
10614 else if (TARGET_AVX512DQ)
10615 return (TARGET_AVX512VL
10616 ? "vxorpd\t%x0, %x0, %x0"
10617 : "vxorpd\t%g0, %g0, %g0");
10618 else
10619 return (TARGET_AVX512VL
10620 ? "vpxorq\t%x0, %x0, %x0"
10621 : "vpxorq\t%g0, %g0, %g0");
10623 case MODE_V4SF:
10624 if (!EXT_REX_SSE_REG_P (operands[0]))
10625 return "%vxorps\t%0, %d0";
10626 /* FALLTHRU */
10627 case MODE_V16SF:
10628 case MODE_V8SF:
10629 if (!EXT_REX_SSE_REG_P (operands[0]))
10630 return "vxorps\t%x0, %x0, %x0";
10631 else if (TARGET_AVX512DQ)
10632 return (TARGET_AVX512VL
10633 ? "vxorps\t%x0, %x0, %x0"
10634 : "vxorps\t%g0, %g0, %g0");
10635 else
10636 return (TARGET_AVX512VL
10637 ? "vpxord\t%x0, %x0, %x0"
10638 : "vpxord\t%g0, %g0, %g0");
10640 default:
10641 gcc_unreachable ();
10644 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10646 enum attr_mode insn_mode = get_attr_mode (insn);
10648 switch (insn_mode)
10650 case MODE_XI:
10651 case MODE_V8DF:
10652 case MODE_V16SF:
10653 gcc_assert (TARGET_AVX512F);
10654 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10656 case MODE_OI:
10657 case MODE_V4DF:
10658 case MODE_V8SF:
10659 gcc_assert (TARGET_AVX2);
10660 /* FALLTHRU */
10661 case MODE_TI:
10662 case MODE_V2DF:
10663 case MODE_V4SF:
10664 gcc_assert (TARGET_SSE2);
10665 if (!EXT_REX_SSE_REG_P (operands[0]))
10666 return (TARGET_AVX
10667 ? "vpcmpeqd\t%0, %0, %0"
10668 : "pcmpeqd\t%0, %0");
10669 else if (TARGET_AVX512VL)
10670 return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
10671 else
10672 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10674 default:
10675 gcc_unreachable ();
10679 gcc_unreachable ();
10682 /* Returns true if INSN can be transformed from a memory load
10683 to a supported FP constant load. */
10685 bool
10686 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10688 rtx src = find_constant_src (insn);
10690 gcc_assert (REG_P (dst));
10692 if (src == NULL
10693 || (SSE_REGNO_P (REGNO (dst))
10694 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10695 || (STACK_REGNO_P (REGNO (dst))
10696 && standard_80387_constant_p (src) < 1))
10697 return false;
10699 return true;
10702 /* Returns true if OP contains a symbol reference */
10704 bool
10705 symbolic_reference_mentioned_p (rtx op)
10707 const char *fmt;
10708 int i;
10710 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10711 return true;
10713 fmt = GET_RTX_FORMAT (GET_CODE (op));
10714 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10716 if (fmt[i] == 'E')
10718 int j;
10720 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10721 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10722 return true;
10725 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10726 return true;
10729 return false;
10732 /* Return true if it is appropriate to emit `ret' instructions in the
10733 body of a function. Do this only if the epilogue is simple, needing a
10734 couple of insns. Prior to reloading, we can't tell how many registers
10735 must be saved, so return false then. Return false if there is no frame
10736 marker to de-allocate. */
10738 bool
10739 ix86_can_use_return_insn_p (void)
10741 if (ix86_function_naked (current_function_decl))
10742 return false;
10744 /* Don't use `ret' instruction in interrupt handler. */
10745 if (! reload_completed
10746 || frame_pointer_needed
10747 || cfun->machine->func_type != TYPE_NORMAL)
10748 return 0;
10750 /* Don't allow more than 32k pop, since that's all we can do
10751 with one instruction. */
10752 if (crtl->args.pops_args && crtl->args.size >= 32768)
10753 return 0;
10755 struct ix86_frame &frame = cfun->machine->frame;
10756 return (frame.stack_pointer_offset == UNITS_PER_WORD
10757 && (frame.nregs + frame.nsseregs) == 0);
10760 /* Value should be nonzero if functions must have frame pointers.
10761 Zero means the frame pointer need not be set up (and parms may
10762 be accessed via the stack pointer) in functions that seem suitable. */
10764 static bool
10765 ix86_frame_pointer_required (void)
10767 /* If we accessed previous frames, then the generated code expects
10768 to be able to access the saved ebp value in our frame. */
10769 if (cfun->machine->accesses_prev_frame)
10770 return true;
10772 /* Several x86 os'es need a frame pointer for other reasons,
10773 usually pertaining to setjmp. */
10774 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10775 return true;
10777 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
10778 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10779 return true;
10781 /* With Win64 SEH, very large frames need a frame pointer, as the maximum
10782 stack allocation is 4GB. */
10783 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10784 return true;
10786 /* SSE saves require frame-pointer when stack is misaligned. */
10787 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10788 return true;
10790 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10791 turns off the frame pointer by default. Turn it back on now if
10792 we've not got a leaf function. */
10793 if (TARGET_OMIT_LEAF_FRAME_POINTER
10794 && (!crtl->is_leaf
10795 || ix86_current_function_calls_tls_descriptor))
10796 return true;
10798 if (crtl->profile && !flag_fentry)
10799 return true;
10801 return false;
10804 /* Record that the current function accesses previous call frames. */
10806 void
10807 ix86_setup_frame_addresses (void)
10809 cfun->machine->accesses_prev_frame = 1;
10812 #ifndef USE_HIDDEN_LINKONCE
10813 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10814 # define USE_HIDDEN_LINKONCE 1
10815 # else
10816 # define USE_HIDDEN_LINKONCE 0
10817 # endif
10818 #endif
10820 /* Label count for call and return thunks. It is used to make unique
10821 labels in call and return thunks. */
10822 static int indirectlabelno;
10824 /* True if call thunk function is needed. */
10825 static bool indirect_thunk_needed = false;
10826 /* True if call thunk function with the BND prefix is needed. */
10827 static bool indirect_thunk_bnd_needed = false;
10829 /* Bit masks of integer registers, which contain branch target, used
10830 by call thunk functions. */
10831 static int indirect_thunks_used;
10832 /* Bit masks of integer registers, which contain branch target, used
10833 by call thunk functions with the BND prefix. */
10834 static int indirect_thunks_bnd_used;
10836 /* True if return thunk function is needed. */
10837 static bool indirect_return_needed = false;
10838 /* True if return thunk function with the BND prefix is needed. */
10839 static bool indirect_return_bnd_needed = false;
10841 /* True if return thunk function via CX is needed. */
10842 static bool indirect_return_via_cx;
10843 /* True if return thunk function via CX with the BND prefix is
10844 needed. */
10845 static bool indirect_return_via_cx_bnd;
10847 #ifndef INDIRECT_LABEL
10848 # define INDIRECT_LABEL "LIND"
10849 #endif
10851 /* Indicate what prefix is needed for an indirect branch. */
10852 enum indirect_thunk_prefix
10854 indirect_thunk_prefix_none,
10855 indirect_thunk_prefix_bnd,
10856 indirect_thunk_prefix_nt
10859 /* Return the prefix needed for an indirect branch INSN. */
10861 enum indirect_thunk_prefix
10862 indirect_thunk_need_prefix (rtx_insn *insn)
10864 enum indirect_thunk_prefix need_prefix;
10865 if (ix86_bnd_prefixed_insn_p (insn))
10866 need_prefix = indirect_thunk_prefix_bnd;
10867 else if ((cfun->machine->indirect_branch_type
10868 == indirect_branch_thunk_extern)
10869 && ix86_notrack_prefixed_insn_p (insn))
10871 /* NOTRACK prefix is only used with external thunk so that it
10872 can be properly updated to support CET at run-time. */
10873 need_prefix = indirect_thunk_prefix_nt;
10875 else
10876 need_prefix = indirect_thunk_prefix_none;
10877 return need_prefix;
10880 /* Fills in the label name that should be used for the indirect thunk. */
10882 static void
10883 indirect_thunk_name (char name[32], unsigned int regno,
10884 enum indirect_thunk_prefix need_prefix,
10885 bool ret_p)
10887 if (regno != INVALID_REGNUM && regno != CX_REG && ret_p)
10888 gcc_unreachable ();
10890 if (USE_HIDDEN_LINKONCE)
10892 const char *prefix;
10894 if (need_prefix == indirect_thunk_prefix_bnd)
10895 prefix = "_bnd";
10896 else if (need_prefix == indirect_thunk_prefix_nt
10897 && regno != INVALID_REGNUM)
10899 /* NOTRACK prefix is only used with external thunk via
10900 register so that NOTRACK prefix can be added to indirect
10901 branch via register to support CET at run-time. */
10902 prefix = "_nt";
10904 else
10905 prefix = "";
10907 const char *ret = ret_p ? "return" : "indirect";
10909 if (regno != INVALID_REGNUM)
10911 const char *reg_prefix;
10912 if (LEGACY_INT_REGNO_P (regno))
10913 reg_prefix = TARGET_64BIT ? "r" : "e";
10914 else
10915 reg_prefix = "";
10916 sprintf (name, "__x86_%s_thunk%s_%s%s",
10917 ret, prefix, reg_prefix, reg_names[regno]);
10919 else
10920 sprintf (name, "__x86_%s_thunk%s", ret, prefix);
10922 else
10924 if (regno != INVALID_REGNUM)
10926 if (need_prefix == indirect_thunk_prefix_bnd)
10927 ASM_GENERATE_INTERNAL_LABEL (name, "LITBR", regno);
10928 else
10929 ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno);
10931 else
10933 if (ret_p)
10935 if (need_prefix == indirect_thunk_prefix_bnd)
10936 ASM_GENERATE_INTERNAL_LABEL (name, "LRTB", 0);
10937 else
10938 ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0);
10940 else
10942 if (need_prefix == indirect_thunk_prefix_bnd)
10943 ASM_GENERATE_INTERNAL_LABEL (name, "LITB", 0);
10944 else
10945 ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0);
10951 /* Output a call and return thunk for indirect branch.  If NEED_PREFIX is
10952 indirect_thunk_prefix_bnd, the BND prefix is needed.  If REGNO is not
10953 INVALID_REGNUM, the function address is in REGNO and the thunk looks like:
10955 call L2
10956 L1:
10957 pause
10958 lfence
10959 jmp L1
10960 L2:
10961 mov %REG, (%sp)
10962 ret
10964 Otherwise, the function address is on top of the stack and the
10965 call and return thunk looks like:
10967 call L2
10968 L1:
10969 pause
10970 lfence
10971 jmp L1
10972 L2:
10973 lea WORD_SIZE(%sp), %sp
10974 ret
10975 */
10977 static void
10978 output_indirect_thunk (enum indirect_thunk_prefix need_prefix,
10979 unsigned int regno)
10981 char indirectlabel1[32];
10982 char indirectlabel2[32];
10984 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL,
10985 indirectlabelno++);
10986 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL,
10987 indirectlabelno++);
10989 /* Call */
10990 if (need_prefix == indirect_thunk_prefix_bnd)
10991 fputs ("\tbnd call\t", asm_out_file);
10992 else
10993 fputs ("\tcall\t", asm_out_file);
10994 assemble_name_raw (asm_out_file, indirectlabel2);
10995 fputc ('\n', asm_out_file);
10997 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
10999 /* AMD and Intel CPUs each prefer a different instruction as a loop filler.
11000 Using both pause + lfence is a compromise solution. */
11001 fprintf (asm_out_file, "\tpause\n\tlfence\n");
11003 /* Jump. */
11004 fputs ("\tjmp\t", asm_out_file);
11005 assemble_name_raw (asm_out_file, indirectlabel1);
11006 fputc ('\n', asm_out_file);
11008 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
11010 if (regno != INVALID_REGNUM)
11012 /* MOV. */
11013 rtx xops[2];
11014 xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx);
11015 xops[1] = gen_rtx_REG (word_mode, regno);
11016 output_asm_insn ("mov\t{%1, %0|%0, %1}", xops);
11018 else
11020 /* LEA. */
11021 rtx xops[2];
11022 xops[0] = stack_pointer_rtx;
11023 xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
11024 output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops);
11027 if (need_prefix == indirect_thunk_prefix_bnd)
11028 fputs ("\tbnd ret\n", asm_out_file);
11029 else
11030 fputs ("\tret\n", asm_out_file);
11033 /* Output a function with a call and return thunk for indirect branch.
11034 If NEED_PREFIX is indirect_thunk_prefix_bnd, the BND prefix is needed.
11035 If REGNO != INVALID_REGNUM, the function address is in REGNO. Otherwise,
11036 the function address is on top of the stack. The thunk is used for a
11037 function return if RET_P is true. */
11039 static void
11040 output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix,
11041 unsigned int regno, bool ret_p)
11043 char name[32];
11044 tree decl;
11046 /* Create __x86_indirect_thunk/__x86_indirect_thunk_bnd. */
11047 indirect_thunk_name (name, regno, need_prefix, ret_p);
11048 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11049 get_identifier (name),
11050 build_function_type_list (void_type_node, NULL_TREE));
11051 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11052 NULL_TREE, void_type_node);
11053 TREE_PUBLIC (decl) = 1;
11054 TREE_STATIC (decl) = 1;
11055 DECL_IGNORED_P (decl) = 1;
11057 #if TARGET_MACHO
11058 if (TARGET_MACHO)
11060 switch_to_section (darwin_sections[picbase_thunk_section]);
11061 fputs ("\t.weak_definition\t", asm_out_file);
11062 assemble_name (asm_out_file, name);
11063 fputs ("\n\t.private_extern\t", asm_out_file);
11064 assemble_name (asm_out_file, name);
11065 putc ('\n', asm_out_file);
11066 ASM_OUTPUT_LABEL (asm_out_file, name);
11067 DECL_WEAK (decl) = 1;
11069 else
11070 #endif
11071 if (USE_HIDDEN_LINKONCE)
11073 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11075 targetm.asm_out.unique_section (decl, 0);
11076 switch_to_section (get_named_section (decl, NULL, 0));
11078 targetm.asm_out.globalize_label (asm_out_file, name);
11079 fputs ("\t.hidden\t", asm_out_file);
11080 assemble_name (asm_out_file, name);
11081 putc ('\n', asm_out_file);
11082 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11084 else
11086 switch_to_section (text_section);
11087 ASM_OUTPUT_LABEL (asm_out_file, name);
11090 DECL_INITIAL (decl) = make_node (BLOCK);
11091 current_function_decl = decl;
11092 allocate_struct_function (decl, false);
11093 init_function_start (decl);
11094 /* We're about to hide the function body from callees of final_* by
11095 emitting it directly; tell them we're a thunk, if they care. */
11096 cfun->is_thunk = true;
11097 first_function_block_is_cold = false;
11098 /* Make sure unwind info is emitted for the thunk if needed. */
11099 final_start_function (emit_barrier (), asm_out_file, 1);
11101 output_indirect_thunk (need_prefix, regno);
11103 final_end_function ();
11104 init_insn_lengths ();
11105 free_after_compilation (cfun);
11106 set_cfun (NULL);
11107 current_function_decl = NULL;
11110 static int pic_labels_used;
11112 /* Fills in the label name that should be used for a pc thunk for
11113 the given register. */
11115 static void
11116 get_pc_thunk_name (char name[32], unsigned int regno)
11118 gcc_assert (!TARGET_64BIT);
11120 if (USE_HIDDEN_LINKONCE)
11121 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11122 else
11123 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
11127 /* This function generates code for -fpic that loads %ebx with
11128 the return address of the caller and then returns. */
11130 static void
11131 ix86_code_end (void)
11133 rtx xops[2];
11134 unsigned int regno;
11136 if (indirect_return_needed)
11137 output_indirect_thunk_function (indirect_thunk_prefix_none,
11138 INVALID_REGNUM, true);
11139 if (indirect_return_bnd_needed)
11140 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11141 INVALID_REGNUM, true);
11143 if (indirect_return_via_cx)
11144 output_indirect_thunk_function (indirect_thunk_prefix_none,
11145 CX_REG, true);
11146 if (indirect_return_via_cx_bnd)
11147 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11148 CX_REG, true);
11150 if (indirect_thunk_needed)
11151 output_indirect_thunk_function (indirect_thunk_prefix_none,
11152 INVALID_REGNUM, false);
11153 if (indirect_thunk_bnd_needed)
11154 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11155 INVALID_REGNUM, false);
11157 for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++)
11159 unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1;
11160 if ((indirect_thunks_used & (1 << i)))
11161 output_indirect_thunk_function (indirect_thunk_prefix_none,
11162 regno, false);
11164 if ((indirect_thunks_bnd_used & (1 << i)))
11165 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11166 regno, false);
11169 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
11171 char name[32];
11172 tree decl;
11174 if ((indirect_thunks_used & (1 << regno)))
11175 output_indirect_thunk_function (indirect_thunk_prefix_none,
11176 regno, false);
11178 if ((indirect_thunks_bnd_used & (1 << regno)))
11179 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11180 regno, false);
11182 if (!(pic_labels_used & (1 << regno)))
11183 continue;
11185 get_pc_thunk_name (name, regno);
11187 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11188 get_identifier (name),
11189 build_function_type_list (void_type_node, NULL_TREE));
11190 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11191 NULL_TREE, void_type_node);
11192 TREE_PUBLIC (decl) = 1;
11193 TREE_STATIC (decl) = 1;
11194 DECL_IGNORED_P (decl) = 1;
11196 #if TARGET_MACHO
11197 if (TARGET_MACHO)
11199 switch_to_section (darwin_sections[picbase_thunk_section]);
11200 fputs ("\t.weak_definition\t", asm_out_file);
11201 assemble_name (asm_out_file, name);
11202 fputs ("\n\t.private_extern\t", asm_out_file);
11203 assemble_name (asm_out_file, name);
11204 putc ('\n', asm_out_file);
11205 ASM_OUTPUT_LABEL (asm_out_file, name);
11206 DECL_WEAK (decl) = 1;
11208 else
11209 #endif
11210 if (USE_HIDDEN_LINKONCE)
11212 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11214 targetm.asm_out.unique_section (decl, 0);
11215 switch_to_section (get_named_section (decl, NULL, 0));
11217 targetm.asm_out.globalize_label (asm_out_file, name);
11218 fputs ("\t.hidden\t", asm_out_file);
11219 assemble_name (asm_out_file, name);
11220 putc ('\n', asm_out_file);
11221 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11223 else
11225 switch_to_section (text_section);
11226 ASM_OUTPUT_LABEL (asm_out_file, name);
11229 DECL_INITIAL (decl) = make_node (BLOCK);
11230 current_function_decl = decl;
11231 allocate_struct_function (decl, false);
11232 init_function_start (decl);
11233 /* We're about to hide the function body from callees of final_* by
11234 emitting it directly; tell them we're a thunk, if they care. */
11235 cfun->is_thunk = true;
11236 first_function_block_is_cold = false;
11237 /* Make sure unwind info is emitted for the thunk if needed. */
11238 final_start_function (emit_barrier (), asm_out_file, 1);
11240 /* Pad stack IP move with 4 instructions (two NOPs count
11241 as one instruction). */
11242 if (TARGET_PAD_SHORT_FUNCTION)
11244 int i = 8;
11246 while (i--)
11247 fputs ("\tnop\n", asm_out_file);
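/* The thunk body: load the return address left on the stack by the call
   into the destination register and return. */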
11250 xops[0] = gen_rtx_REG (Pmode, regno);
11251 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
11252 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
11253 output_asm_insn ("%!ret", NULL);
11254 final_end_function ();
11255 init_insn_lengths ();
11256 free_after_compilation (cfun);
11257 set_cfun (NULL);
11258 current_function_decl = NULL;
11261 if (flag_split_stack)
11262 file_end_indicate_split_stack ();
11265 /* Emit code for the SET_GOT patterns. */
11267 const char *
11268 output_set_got (rtx dest, rtx label)
11270 rtx xops[3];
11272 xops[0] = dest;
11274 if (TARGET_VXWORKS_RTP && flag_pic)
11276 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
11277 xops[2] = gen_rtx_MEM (Pmode,
11278 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
11279 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
11281 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
11282 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
11283 an unadorned address. */
11284 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
11285 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
11286 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
11287 return "";
11290 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
11292 if (flag_pic)
11294 char name[32];
11295 get_pc_thunk_name (name, REGNO (dest));
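/* Record that the thunk for this register is needed; ix86_code_end
   will emit its body. */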
11296 pic_labels_used |= 1 << REGNO (dest);
11298 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
11299 xops[2] = gen_rtx_MEM (QImode, xops[2]);
11300 output_asm_insn ("%!call\t%X2", xops);
11302 #if TARGET_MACHO
11303 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
11304 This is what will be referenced by the Mach-O PIC subsystem. */
11305 if (machopic_should_output_picbase_label () || !label)
11306 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
11308 /* When we are restoring the pic base at the site of a nonlocal label,
11309 and we decided to emit the pic base above, we will still output a
11310 local label used for calculating the correction offset (even though
11311 the offset will be 0 in that case). */
11312 if (label)
11313 targetm.asm_out.internal_label (asm_out_file, "L",
11314 CODE_LABEL_NUMBER (label));
11315 #endif
11317 else
11319 if (TARGET_MACHO)
11320 /* We don't need a pic base, we're not producing pic. */
11321 gcc_unreachable ();
11323 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
11324 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
11325 targetm.asm_out.internal_label (asm_out_file, "L",
11326 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
11329 if (!TARGET_MACHO)
11330 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
11332 return "";
11335 /* Generate a "push" pattern for input ARG. */
11337 static rtx
11338 gen_push (rtx arg)
11340 struct machine_function *m = cfun->machine;
11342 if (m->fs.cfa_reg == stack_pointer_rtx)
11343 m->fs.cfa_offset += UNITS_PER_WORD;
11344 m->fs.sp_offset += UNITS_PER_WORD;
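/* The push pattern below always operates in word_mode; recreate a
   narrower hard register reference in word_mode so the modes match. */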
11346 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11347 arg = gen_rtx_REG (word_mode, REGNO (arg));
11349 return gen_rtx_SET (gen_rtx_MEM (word_mode,
11350 gen_rtx_PRE_DEC (Pmode,
11351 stack_pointer_rtx)),
11352 arg);
11355 /* Generate a "pop" pattern for input ARG. */
11357 static rtx
11358 gen_pop (rtx arg)
11360 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11361 arg = gen_rtx_REG (word_mode, REGNO (arg));
11363 return gen_rtx_SET (arg,
11364 gen_rtx_MEM (word_mode,
11365 gen_rtx_POST_INC (Pmode,
11366 stack_pointer_rtx)));
11369 /* Return the number of an unused call-clobbered register available
11370 for the entire function, or INVALID_REGNUM if there is none. */
11372 static unsigned int
11373 ix86_select_alt_pic_regnum (void)
11375 if (ix86_use_pseudo_pic_reg ())
11376 return INVALID_REGNUM;
11378 if (crtl->is_leaf
11379 && !crtl->profile
11380 && !ix86_current_function_calls_tls_descriptor)
11382 int i, drap;
11383 /* Can't use the same register for both PIC and DRAP. */
11384 if (crtl->drap_reg)
11385 drap = REGNO (crtl->drap_reg);
11386 else
11387 drap = -1;
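/* Scan CX, DX and AX (hard registers 2, 1 and 0), the call-clobbered
   integer registers, for one that is never live in this function. */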
11388 for (i = 2; i >= 0; --i)
11389 if (i != drap && !df_regs_ever_live_p (i))
11390 return i;
11393 return INVALID_REGNUM;
11396 /* Return true if REGNO is used by the epilogue. */
11398 bool
11399 ix86_epilogue_uses (int regno)
11401 /* If there are no caller-saved registers, we preserve all registers,
11402 except for MMX and x87 registers which aren't supported when saving
11403 and restoring registers. Don't explicitly save SP register since
11404 it is always preserved. */
11405 return (epilogue_completed
11406 && cfun->machine->no_caller_saved_registers
11407 && !fixed_regs[regno]
11408 && !STACK_REGNO_P (regno)
11409 && !MMX_REGNO_P (regno));
11412 /* Return nonzero if register REGNO can be used as a scratch register
11413 in peephole2. */
11415 static bool
11416 ix86_hard_regno_scratch_ok (unsigned int regno)
11418 /* If there are no caller-saved registers, we can't use any register
11419 as a scratch register after epilogue and use REGNO as scratch
11420 register only if it has been used before to avoid saving and
11421 restoring it. */
11422 return (!cfun->machine->no_caller_saved_registers
11423 || (!epilogue_completed
11424 && df_regs_ever_live_p (regno)));
11427 /* Return true if register class CL should be an additional allocno
11428 class. */
11430 static bool
11431 ix86_additional_allocno_class_p (reg_class_t cl)
11433 return cl == MOD4_SSE_REGS;
11436 /* Return TRUE if we need to save REGNO. */
11438 static bool
11439 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
11441 /* If there are no caller-saved registers, we preserve all registers,
11442 except for MMX and x87 registers which aren't supported when saving
11443 and restoring registers. Don't explicitly save SP register since
11444 it is always preserved. */
11445 if (cfun->machine->no_caller_saved_registers)
11447 /* Don't preserve registers used for function return value. */
11448 rtx reg = crtl->return_rtx;
11449 if (reg)
11451 unsigned int i = REGNO (reg);
11452 unsigned int nregs = REG_NREGS (reg);
11453 while (nregs-- > 0)
11454 if ((i + nregs) == regno)
11455 return false;
11457 reg = crtl->return_bnd;
11458 if (reg)
11460 i = REGNO (reg);
11461 nregs = REG_NREGS (reg);
11462 while (nregs-- > 0)
11463 if ((i + nregs) == regno)
11464 return false;
11468 return (df_regs_ever_live_p (regno)
11469 && !fixed_regs[regno]
11470 && !STACK_REGNO_P (regno)
11471 && !MMX_REGNO_P (regno)
11472 && (regno != HARD_FRAME_POINTER_REGNUM
11473 || !frame_pointer_needed));
11476 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
11477 && pic_offset_table_rtx)
11479 if (ix86_use_pseudo_pic_reg ())
11481 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
11482 _mcount in prologue. */
11483 if (!TARGET_64BIT && flag_pic && crtl->profile)
11484 return true;
11486 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
11487 || crtl->profile
11488 || crtl->calls_eh_return
11489 || crtl->uses_const_pool
11490 || cfun->has_nonlocal_label)
11491 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
11494 if (crtl->calls_eh_return && maybe_eh_return)
11496 unsigned i;
11497 for (i = 0; ; i++)
11499 unsigned test = EH_RETURN_DATA_REGNO (i);
11500 if (test == INVALID_REGNUM)
11501 break;
11502 if (test == regno)
11503 return true;
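/* Registers saved and restored by the ms2sysv out-of-line stubs do not
   need to be handled by the normal prologue and epilogue. */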
11507 if (ignore_outlined && cfun->machine->call_ms2sysv)
11509 unsigned count = cfun->machine->call_ms2sysv_extra_regs
11510 + xlogue_layout::MIN_REGS;
11511 if (xlogue_layout::is_stub_managed_reg (regno, count))
11512 return false;
11515 if (crtl->drap_reg
11516 && regno == REGNO (crtl->drap_reg)
11517 && !cfun->machine->no_drap_save_restore)
11518 return true;
11520 return (df_regs_ever_live_p (regno)
11521 && !call_used_regs[regno]
11522 && !fixed_regs[regno]
11523 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
11526 /* Return number of saved general purpose registers. */
11528 static int
11529 ix86_nsaved_regs (void)
11531 int nregs = 0;
11532 int regno;
11534 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11535 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11536 nregs ++;
11537 return nregs;
11540 /* Return number of saved SSE registers. */
11542 static int
11543 ix86_nsaved_sseregs (void)
11545 int nregs = 0;
11546 int regno;
11548 if (!TARGET_64BIT_MS_ABI)
11549 return 0;
11550 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11551 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11552 nregs ++;
11553 return nregs;
11556 /* Given FROM and TO register numbers, say whether this elimination is
11557 allowed. If stack alignment is needed, we can only replace argument
11558 pointer with hard frame pointer, or replace frame pointer with stack
11559 pointer. Otherwise, frame pointer elimination is automatically
11560 handled and all other eliminations are valid. */
11562 static bool
11563 ix86_can_eliminate (const int from, const int to)
11565 if (stack_realign_fp)
11566 return ((from == ARG_POINTER_REGNUM
11567 && to == HARD_FRAME_POINTER_REGNUM)
11568 || (from == FRAME_POINTER_REGNUM
11569 && to == STACK_POINTER_REGNUM));
11570 else
11571 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
11574 /* Return the offset between two registers, one to be eliminated, and the other
11575 its replacement, at the start of a routine. */
11577 HOST_WIDE_INT
11578 ix86_initial_elimination_offset (int from, int to)
11580 struct ix86_frame &frame = cfun->machine->frame;
11582 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11583 return frame.hard_frame_pointer_offset;
11584 else if (from == FRAME_POINTER_REGNUM
11585 && to == HARD_FRAME_POINTER_REGNUM)
11586 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11587 else
11589 gcc_assert (to == STACK_POINTER_REGNUM);
11591 if (from == ARG_POINTER_REGNUM)
11592 return frame.stack_pointer_offset;
11594 gcc_assert (from == FRAME_POINTER_REGNUM);
11595 return frame.stack_pointer_offset - frame.frame_pointer_offset;
11599 /* In a dynamically-aligned function, we can't know the offset from
11600 stack pointer to frame pointer, so we must ensure that setjmp
11601 eliminates fp against the hard fp (%ebp) rather than trying to
11602 index from %esp up to the top of the frame across a gap that is
11603 of unknown (at compile-time) size. */
11604 static rtx
11605 ix86_builtin_setjmp_frame_value (void)
11607 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11610 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
11611 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11613 static bool warned_once = false;
11614 if (!warned_once)
11616 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11617 feature);
11618 warned_once = true;
11622 /* Return the probing interval for -fstack-clash-protection. */
11624 static HOST_WIDE_INT
11625 get_probe_interval (void)
11627 if (flag_stack_clash_protection)
11628 return (HOST_WIDE_INT_1U
11629 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
11630 else
11631 return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
11634 /* When using -fsplit-stack, the allocation routines set a field in
11635 the TCB to the bottom of the stack plus this much space, measured
11636 in bytes. */
11638 #define SPLIT_STACK_AVAILABLE 256
11640 /* Fill the ix86_frame structure describing the frame of the current function. */
11642 static void
11643 ix86_compute_frame_layout (void)
11645 struct ix86_frame *frame = &cfun->machine->frame;
11646 struct machine_function *m = cfun->machine;
11647 unsigned HOST_WIDE_INT stack_alignment_needed;
11648 HOST_WIDE_INT offset;
11649 unsigned HOST_WIDE_INT preferred_alignment;
11650 HOST_WIDE_INT size = get_frame_size ();
11651 HOST_WIDE_INT to_allocate;
11653 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11654 * ms_abi functions that call a sysv function. We now need to prune away
11655 * cases where it should be disabled. */
11656 if (TARGET_64BIT && m->call_ms2sysv)
11658 gcc_assert (TARGET_64BIT_MS_ABI);
11659 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11660 gcc_assert (!TARGET_SEH);
11661 gcc_assert (TARGET_SSE);
11662 gcc_assert (!ix86_using_red_zone ());
11664 if (crtl->calls_eh_return)
11666 gcc_assert (!reload_completed);
11667 m->call_ms2sysv = false;
11668 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11671 else if (ix86_static_chain_on_stack)
11673 gcc_assert (!reload_completed);
11674 m->call_ms2sysv = false;
11675 warn_once_call_ms2sysv_xlogues ("static call chains");
11678 /* Finally, compute which registers the stub will manage. */
11679 else
11681 unsigned count = xlogue_layout::count_stub_managed_regs ();
11682 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11683 m->call_ms2sysv_pad_in = 0;
11687 frame->nregs = ix86_nsaved_regs ();
11688 frame->nsseregs = ix86_nsaved_sseregs ();
11690 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
11691 except for function prologues, leaf functions and when the default
11692 incoming stack boundary is overridden at the command line or via the
11693 force_align_arg_pointer attribute. */
11694 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11695 && (!crtl->is_leaf || cfun->calls_alloca != 0
11696 || ix86_current_function_calls_tls_descriptor
11697 || ix86_incoming_stack_boundary < 128))
11699 crtl->preferred_stack_boundary = 128;
11700 crtl->stack_alignment_needed = 128;
11703 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11704 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11706 gcc_assert (!size || stack_alignment_needed);
11707 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11708 gcc_assert (preferred_alignment <= stack_alignment_needed);
11710 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11711 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11712 if (TARGET_64BIT && m->call_ms2sysv)
11714 gcc_assert (stack_alignment_needed >= 16);
11715 gcc_assert (!frame->nsseregs);
11718 /* For SEH we have to limit the amount of code movement into the prologue.
11719 At present we do this via a BLOCKAGE, at which point there's very little
11720 scheduling that can be done, which means that there's very little point
11721 in doing anything except PUSHs. */
11722 if (TARGET_SEH)
11723 m->use_fast_prologue_epilogue = false;
11724 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11726 int count = frame->nregs;
11727 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11729 /* The fast prologue uses move instead of push to save registers. This
11730 is significantly longer, but also executes faster as modern hardware
11731 can execute the moves in parallel, but can't do that for push/pop.
11733 Be careful about choosing which prologue to emit: when the function takes
11734 many instructions to execute, we may use the slow version, as we also do
11735 when the function is known to be outside a hot spot (this is known with
11736 feedback only). Weight the size of the function by the number of registers
11737 to save as it is cheap to use one or two push instructions but very
11738 slow to use many of them. */
11739 if (count)
11740 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11741 if (node->frequency < NODE_FREQUENCY_NORMAL
11742 || (flag_branch_probabilities
11743 && node->frequency < NODE_FREQUENCY_HOT))
11744 m->use_fast_prologue_epilogue = false;
11745 else
11746 m->use_fast_prologue_epilogue
11747 = !expensive_function_p (count);
11750 frame->save_regs_using_mov
11751 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11752 /* If static stack checking is enabled and done with probes,
11753 the registers need to be saved before allocating the frame. */
11754 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11756 /* Skip return address and error code in exception handler. */
11757 offset = INCOMING_FRAME_SP_OFFSET;
11759 /* Skip pushed static chain. */
11760 if (ix86_static_chain_on_stack)
11761 offset += UNITS_PER_WORD;
11763 /* Skip saved base pointer. */
11764 if (frame_pointer_needed)
11765 offset += UNITS_PER_WORD;
11766 frame->hfp_save_offset = offset;
11768 /* The traditional frame pointer location is at the top of the frame. */
11769 frame->hard_frame_pointer_offset = offset;
11771 /* Register save area */
11772 offset += frame->nregs * UNITS_PER_WORD;
11773 frame->reg_save_offset = offset;
11775 /* On SEH target, registers are pushed just before the frame pointer
11776 location. */
11777 if (TARGET_SEH)
11778 frame->hard_frame_pointer_offset = offset;
11780 /* Calculate the size of the va-arg area (not including padding, if any). */
11781 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11783 /* Also adjust stack_realign_offset for the largest alignment of
11784 stack slot actually used. */
11785 if (stack_realign_fp
11786 || (cfun->machine->max_used_stack_alignment != 0
11787 && (offset % cfun->machine->max_used_stack_alignment) != 0))
11789 /* We may need a 16-byte aligned stack for the remainder of the
11790 register save area, but the stack frame for the local function
11791 may require a greater alignment if using AVX, AVX2 or AVX-512. In order
11792 to avoid wasting space, we first calculate the space needed for
11793 the rest of the register saves, add that to the stack pointer,
11794 and then realign the stack to the boundary of the start of the
11795 frame for the local function. */
11796 HOST_WIDE_INT space_needed = 0;
11797 HOST_WIDE_INT sse_reg_space_needed = 0;
11799 if (TARGET_64BIT)
11801 if (m->call_ms2sysv)
11803 m->call_ms2sysv_pad_in = 0;
11804 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11807 else if (frame->nsseregs)
11808 /* The only ABI that has saved SSE registers (Win64) also has a
11809 16-byte aligned default stack. However, many programs violate
11810 the ABI, and Wine64 forces stack realignment to compensate. */
11811 space_needed = frame->nsseregs * 16;
11813 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11815 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11816 we round anyway to be pedantic. */
11817 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11819 else
11820 space_needed = frame->va_arg_size;
11822 /* Record the allocation size required prior to the realignment AND. */
11823 frame->stack_realign_allocate = space_needed;
11825 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11826 before this point are not directly comparable with values below
11827 this point. Use sp_valid_at to determine if the stack pointer is
11828 valid for a given offset, fp_valid_at for the frame pointer, or
11829 choose_baseaddr to have a base register chosen for you.
11831 Note that the result of (frame->stack_realign_offset
11832 & (stack_alignment_needed - 1)) may not equal zero. */
11833 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11834 frame->stack_realign_offset = offset - space_needed;
11835 frame->sse_reg_save_offset = frame->stack_realign_offset
11836 + sse_reg_space_needed;
11838 else
11840 frame->stack_realign_offset = offset;
11842 if (TARGET_64BIT && m->call_ms2sysv)
11844 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11845 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11848 /* Align and set SSE register save area. */
11849 else if (frame->nsseregs)
11851 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11852 required and the DRAP re-alignment boundary is at least 16 bytes,
11853 then we want the SSE register save area properly aligned. */
11854 if (ix86_incoming_stack_boundary >= 128
11855 || (stack_realign_drap && stack_alignment_needed >= 16))
11856 offset = ROUND_UP (offset, 16);
11857 offset += frame->nsseregs * 16;
11859 frame->sse_reg_save_offset = offset;
11860 offset += frame->va_arg_size;
11863 /* Align start of frame for local function. When a function call
11864 is removed, it may become a leaf function. But if arguments may
11865 be passed on the stack, we need to align the stack when there is no
11866 tail call. */
11867 if (m->call_ms2sysv
11868 || frame->va_arg_size != 0
11869 || size != 0
11870 || !crtl->is_leaf
11871 || (!crtl->tail_call_emit
11872 && cfun->machine->outgoing_args_on_stack)
11873 || cfun->calls_alloca
11874 || ix86_current_function_calls_tls_descriptor)
11875 offset = ROUND_UP (offset, stack_alignment_needed);
11877 /* Frame pointer points here. */
11878 frame->frame_pointer_offset = offset;
11880 offset += size;
11882 /* Add outgoing arguments area. Can be skipped if we eliminated
11883 all the function calls as dead code.
11884 Skipping is however impossible when function calls alloca. Alloca
11885 expander assumes that last crtl->outgoing_args_size
11886 of stack frame are unused. */
11887 if (ACCUMULATE_OUTGOING_ARGS
11888 && (!crtl->is_leaf || cfun->calls_alloca
11889 || ix86_current_function_calls_tls_descriptor))
11891 offset += crtl->outgoing_args_size;
11892 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11894 else
11895 frame->outgoing_arguments_size = 0;
11897 /* Align stack boundary. Only needed if we're calling another function
11898 or using alloca. */
11899 if (!crtl->is_leaf || cfun->calls_alloca
11900 || ix86_current_function_calls_tls_descriptor)
11901 offset = ROUND_UP (offset, preferred_alignment);
11903 /* We've reached end of stack frame. */
11904 frame->stack_pointer_offset = offset;
11906 /* Size prologue needs to allocate. */
11907 to_allocate = offset - frame->sse_reg_save_offset;
11909 if ((!to_allocate && frame->nregs <= 1)
11910 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
11911 /* If stack clash probing needs a loop, then it needs a
11912 scratch register. But the returned register is only guaranteed
11913 to be safe to use after register saves are complete. So if
11914 stack clash protections are enabled and the allocated frame is
11915 larger than the probe interval, then use pushes to save
11916 callee saved registers. */
11917 || (flag_stack_clash_protection && to_allocate > get_probe_interval ()))
11918 frame->save_regs_using_mov = false;
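/* A leaf function that never adjusts the stack pointer can keep its data
   in the red zone below the stack pointer instead of allocating it; the
   red-zone bytes are subtracted from stack_pointer_offset below. */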
11920 if (ix86_using_red_zone ()
11921 && crtl->sp_is_unchanging
11922 && crtl->is_leaf
11923 && !ix86_pc_thunk_call_expanded
11924 && !ix86_current_function_calls_tls_descriptor)
11926 frame->red_zone_size = to_allocate;
11927 if (frame->save_regs_using_mov)
11928 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11929 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11930 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11932 else
11933 frame->red_zone_size = 0;
11934 frame->stack_pointer_offset -= frame->red_zone_size;
11936 /* The SEH frame pointer location is near the bottom of the frame.
11937 This is enforced by the fact that the difference between the
11938 stack pointer and the frame pointer is limited to 240 bytes in
11939 the unwind data structure. */
11940 if (TARGET_SEH)
11942 HOST_WIDE_INT diff;
11944 /* If we can leave the frame pointer where it is, do so; this also gives
11945 the establisher frame for __builtin_frame_address (0). */
11946 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11947 if (diff <= SEH_MAX_FRAME_SIZE
11948 && (diff > 240 || (diff & 15) != 0)
11949 && !crtl->accesses_prior_frames)
11951 /* Ideally we'd determine what portion of the local stack frame
11952 (within the constraint of the lowest 240) is most heavily used.
11953 But without that complication, simply bias the frame pointer
11954 by 128 bytes so as to maximize the amount of the local stack
11955 frame that is addressable with 8-bit offsets. */
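/* With that bias, signed 8-bit displacements from the frame pointer cover
   the byte range [SP, SP + 255] of the local frame. */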
11956 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
11961 /* This is semi-inlined memory_address_length, but simplified
11962 since we know that we're always dealing with reg+offset, and
11963 to avoid having to create and discard all that rtl. */
11965 static inline int
11966 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11968 int len = 4;
11970 if (offset == 0)
11972 /* EBP and R13 cannot be encoded without an offset. */
11973 len = (regno == BP_REG || regno == R13_REG);
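/* That is, EBP and R13 still need a one-byte displacement of zero, so the
   length is 1 for them and 0 for every other register. */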
11975 else if (IN_RANGE (offset, -128, 127))
11976 len = 1;
11978 /* ESP and R12 must be encoded with a SIB byte. */
11979 if (regno == SP_REG || regno == R12_REG)
11980 len++;
11982 return len;
11985 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11986 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11988 static bool
11989 sp_valid_at (HOST_WIDE_INT cfa_offset)
11991 const struct machine_frame_state &fs = cfun->machine->fs;
11992 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11994 /* Validate that the cfa_offset isn't in a "no-man's land". */
11995 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11996 return false;
11998 return fs.sp_valid;
12001 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
12002 the frame save area. The register is saved at CFA - CFA_OFFSET. */
12004 static inline bool
12005 fp_valid_at (HOST_WIDE_INT cfa_offset)
12007 const struct machine_frame_state &fs = cfun->machine->fs;
12008 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
12010 /* Validate that the cfa_offset isn't in a "no-man's land". */
12011 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
12012 return false;
12014 return fs.fp_valid;
12017 /* Choose a base register based upon alignment requested, speed and/or
12018 size. */
12020 static void
12021 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
12022 HOST_WIDE_INT &base_offset,
12023 unsigned int align_requested, unsigned int *align)
12025 const struct machine_function *m = cfun->machine;
12026 unsigned int hfp_align;
12027 unsigned int drap_align;
12028 unsigned int sp_align;
12029 bool hfp_ok = fp_valid_at (cfa_offset);
12030 bool drap_ok = m->fs.drap_valid;
12031 bool sp_ok = sp_valid_at (cfa_offset);
12033 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
12035 /* Filter out any registers that don't meet the requested alignment
12036 criteria. */
12037 if (align_requested)
12039 if (m->fs.realigned)
12040 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
12041 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
12042 notes (which we would need to use a realigned stack pointer),
12043 so disable on SEH targets. */
12044 else if (m->fs.sp_realigned)
12045 sp_align = crtl->stack_alignment_needed;
12047 hfp_ok = hfp_ok && hfp_align >= align_requested;
12048 drap_ok = drap_ok && drap_align >= align_requested;
12049 sp_ok = sp_ok && sp_align >= align_requested;
12052 if (m->use_fast_prologue_epilogue)
12054 /* Choose the base register most likely to allow the most scheduling
12055 opportunities. Generally FP is valid throughout the function,
12056 while DRAP must be reloaded within the epilogue. But choose either
12057 over the SP due to increased encoding size. */
12059 if (hfp_ok)
12061 base_reg = hard_frame_pointer_rtx;
12062 base_offset = m->fs.fp_offset - cfa_offset;
12064 else if (drap_ok)
12066 base_reg = crtl->drap_reg;
12067 base_offset = 0 - cfa_offset;
12069 else if (sp_ok)
12071 base_reg = stack_pointer_rtx;
12072 base_offset = m->fs.sp_offset - cfa_offset;
12075 else
12077 HOST_WIDE_INT toffset;
12078 int len = 16, tlen;
12080 /* Choose the base register with the smallest address encoding.
12081 With a tie, choose FP > DRAP > SP. */
12082 if (sp_ok)
12084 base_reg = stack_pointer_rtx;
12085 base_offset = m->fs.sp_offset - cfa_offset;
12086 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
12088 if (drap_ok)
12090 toffset = 0 - cfa_offset;
12091 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
12092 if (tlen <= len)
12094 base_reg = crtl->drap_reg;
12095 base_offset = toffset;
12096 len = tlen;
12099 if (hfp_ok)
12101 toffset = m->fs.fp_offset - cfa_offset;
12102 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
12103 if (tlen <= len)
12105 base_reg = hard_frame_pointer_rtx;
12106 base_offset = toffset;
12107 len = tlen;
12112 /* Set the align return value. */
12113 if (align)
12115 if (base_reg == stack_pointer_rtx)
12116 *align = sp_align;
12117 else if (base_reg == crtl->drap_reg)
12118 *align = drap_align;
12119 else if (base_reg == hard_frame_pointer_rtx)
12120 *align = hfp_align;
12124 /* Return an RTX that points to CFA_OFFSET within the stack frame and
12125 the alignment of address. If ALIGN is non-null, it should point to
12126 an alignment value (in bits) that is preferred or zero and will
12127 receive the alignment of the base register that was selected,
12128 irrespective of whether or not CFA_OFFSET is a multiple of that
12129 alignment value. If it is possible for the base register offset to be
12130 non-immediate then SCRATCH_REGNO should specify a scratch register to
12131 use.
12133 The valid base registers are taken from CFUN->MACHINE->FS. */
12135 static rtx
12136 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
12137 unsigned int scratch_regno = INVALID_REGNUM)
12139 rtx base_reg = NULL;
12140 HOST_WIDE_INT base_offset = 0;
12142 /* If a specific alignment is requested, try to get a base register
12143 with that alignment first. */
12144 if (align && *align)
12145 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
12147 if (!base_reg)
12148 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
12150 gcc_assert (base_reg != NULL);
12152 rtx base_offset_rtx = GEN_INT (base_offset);
12154 if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
12156 gcc_assert (scratch_regno != INVALID_REGNUM);
12158 rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
12159 emit_move_insn (scratch_reg, base_offset_rtx);
12161 return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
12164 return plus_constant (Pmode, base_reg, base_offset);
12167 /* Emit code to save registers in the prologue. */
12169 static void
12170 ix86_emit_save_regs (void)
12172 unsigned int regno;
12173 rtx_insn *insn;
12175 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12176 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12178 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12179 RTX_FRAME_RELATED_P (insn) = 1;
12183 /* Emit a single register save at CFA - CFA_OFFSET. */
12185 static void
12186 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12187 HOST_WIDE_INT cfa_offset)
12189 struct machine_function *m = cfun->machine;
12190 rtx reg = gen_rtx_REG (mode, regno);
12191 rtx mem, addr, base, insn;
12192 unsigned int align = GET_MODE_ALIGNMENT (mode);
12194 addr = choose_baseaddr (cfa_offset, &align);
12195 mem = gen_frame_mem (mode, addr);
12197 /* The location alignment depends upon the base register. */
12198 align = MIN (GET_MODE_ALIGNMENT (mode), align);
12199 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
12200 set_mem_align (mem, align);
12202 insn = emit_insn (gen_rtx_SET (mem, reg));
12203 RTX_FRAME_RELATED_P (insn) = 1;
12205 base = addr;
12206 if (GET_CODE (base) == PLUS)
12207 base = XEXP (base, 0);
12208 gcc_checking_assert (REG_P (base));
12210 /* When saving registers into a re-aligned local stack frame, avoid
12211 any tricky guessing by dwarf2out. */
12212 if (m->fs.realigned)
12214 gcc_checking_assert (stack_realign_drap);
12216 if (regno == REGNO (crtl->drap_reg))
12218 /* A bit of a hack. We force the DRAP register to be saved in
12219 the re-aligned stack frame, which provides us with a copy
12220 of the CFA that will last past the prologue. Install it. */
12221 gcc_checking_assert (cfun->machine->fs.fp_valid);
12222 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12223 cfun->machine->fs.fp_offset - cfa_offset);
12224 mem = gen_rtx_MEM (mode, addr);
12225 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12227 else
12229 /* The frame pointer is a stable reference within the
12230 aligned frame. Use it. */
12231 gcc_checking_assert (cfun->machine->fs.fp_valid);
12232 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12233 cfun->machine->fs.fp_offset - cfa_offset);
12234 mem = gen_rtx_MEM (mode, addr);
12235 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12239 else if (base == stack_pointer_rtx && m->fs.sp_realigned
12240 && cfa_offset >= m->fs.sp_realigned_offset)
12242 gcc_checking_assert (stack_realign_fp);
12243 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12246 /* The memory may not be relative to the current CFA register,
12247 which means that we may need to generate a new pattern for
12248 use by the unwind info. */
12249 else if (base != m->fs.cfa_reg)
12251 addr = plus_constant (Pmode, m->fs.cfa_reg,
12252 m->fs.cfa_offset - cfa_offset);
12253 mem = gen_rtx_MEM (mode, addr);
12254 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12258 /* Emit code to save registers using MOV insns.
12259 First register is stored at CFA - CFA_OFFSET. */
12260 static void
12261 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12263 unsigned int regno;
12265 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12266 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12268 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12269 cfa_offset -= UNITS_PER_WORD;
12273 /* Emit code to save SSE registers using MOV insns.
12274 First register is stored at CFA - CFA_OFFSET. */
12275 static void
12276 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12278 unsigned int regno;
12280 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12281 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12283 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12284 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12288 static GTY(()) rtx queued_cfa_restores;
12290 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
12291 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
12292 Don't add the note if the previously saved value will be left untouched
12293 within the stack red zone until return, as unwinders can find the same value
12294 in the register and on the stack. */
12296 static void
12297 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12299 if (!crtl->shrink_wrapped
12300 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12301 return;
12303 if (insn)
12305 add_reg_note (insn, REG_CFA_RESTORE, reg);
12306 RTX_FRAME_RELATED_P (insn) = 1;
12308 else
12309 queued_cfa_restores
12310 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12313 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12315 static void
12316 ix86_add_queued_cfa_restore_notes (rtx insn)
12318 rtx last;
12319 if (!queued_cfa_restores)
12320 return;
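/* Walk to the tail of the queued-restore list so that INSN's existing
   notes can be spliced in after it. */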
12321 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12323 XEXP (last, 1) = REG_NOTES (insn);
12324 REG_NOTES (insn) = queued_cfa_restores;
12325 queued_cfa_restores = NULL_RTX;
12326 RTX_FRAME_RELATED_P (insn) = 1;
12329 /* Expand prologue or epilogue stack adjustment.
12330 The pattern exists to put a dependency on all ebp-based memory accesses.
12331 STYLE should be negative if instructions should be marked as frame related,
12332 zero if %r11 register is live and cannot be freely used and positive
12333 otherwise. */
12335 static rtx
12336 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12337 int style, bool set_cfa)
12339 struct machine_function *m = cfun->machine;
12340 rtx insn;
12341 bool add_frame_related_expr = false;
12343 if (Pmode == SImode)
12344 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12345 else if (x86_64_immediate_operand (offset, DImode))
12346 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12347 else
12349 rtx tmp;
12350 /* r11 is used by indirect sibcall return as well, set before the
12351 epilogue and used after the epilogue. */
12352 if (style)
12353 tmp = gen_rtx_REG (DImode, R11_REG);
12354 else
12356 gcc_assert (src != hard_frame_pointer_rtx
12357 && dest != hard_frame_pointer_rtx);
12358 tmp = hard_frame_pointer_rtx;
12360 insn = emit_insn (gen_rtx_SET (tmp, offset));
12361 if (style < 0)
12362 add_frame_related_expr = true;
12364 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12367 insn = emit_insn (insn);
12368 if (style >= 0)
12369 ix86_add_queued_cfa_restore_notes (insn);
12371 if (set_cfa)
12373 rtx r;
12375 gcc_assert (m->fs.cfa_reg == src);
12376 m->fs.cfa_offset += INTVAL (offset);
12377 m->fs.cfa_reg = dest;
12379 r = gen_rtx_PLUS (Pmode, src, offset);
12380 r = gen_rtx_SET (dest, r);
12381 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12382 RTX_FRAME_RELATED_P (insn) = 1;
12384 else if (style < 0)
12386 RTX_FRAME_RELATED_P (insn) = 1;
12387 if (add_frame_related_expr)
12389 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12390 r = gen_rtx_SET (dest, r);
12391 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12395 if (dest == stack_pointer_rtx)
12397 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12398 bool valid = m->fs.sp_valid;
12399 bool realigned = m->fs.sp_realigned;
12401 if (src == hard_frame_pointer_rtx)
12403 valid = m->fs.fp_valid;
12404 realigned = false;
12405 ooffset = m->fs.fp_offset;
12407 else if (src == crtl->drap_reg)
12409 valid = m->fs.drap_valid;
12410 realigned = false;
12411 ooffset = 0;
12413 else
12415 /* Else there are two possibilities: SP itself, which we set
12416 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
12417 taken care of by hand along the eh_return path. */
12418 gcc_checking_assert (src == stack_pointer_rtx
12419 || offset == const0_rtx);
12422 m->fs.sp_offset = ooffset - INTVAL (offset);
12423 m->fs.sp_valid = valid;
12424 m->fs.sp_realigned = realigned;
12426 return insn;
12429 /* Find an available register to be used as dynamic realign argument
12430 pointer register. Such a register will be written in the prologue and
12431 used at the beginning of the body, so it must not be
12432 1. parameter passing register.
12433 2. GOT pointer.
12434 We reuse static-chain register if it is available. Otherwise, we
12435 use DI for i386 and R13 for x86-64. We chose R13 since it has
12436 shorter encoding.
12438 Return: the regno of chosen register. */
12440 static unsigned int
12441 find_drap_reg (void)
12443 tree decl = cfun->decl;
12445 /* Always use callee-saved register if there are no caller-saved
12446 registers. */
12447 if (TARGET_64BIT)
12449 /* Use R13 for a nested function or a function that needs a static chain.
12450 Since a function with a tail call may use any caller-saved
12451 registers in the epilogue, DRAP must not use a caller-saved
12452 register in such a case. */
12453 if (DECL_STATIC_CHAIN (decl)
12454 || cfun->machine->no_caller_saved_registers
12455 || crtl->tail_call_emit)
12456 return R13_REG;
12458 return R10_REG;
12460 else
12462 /* Use DI for a nested function or a function that needs a static chain.
12463 Since a function with a tail call may use any caller-saved
12464 registers in the epilogue, DRAP must not use a caller-saved
12465 register in such a case. */
12466 if (DECL_STATIC_CHAIN (decl)
12467 || cfun->machine->no_caller_saved_registers
12468 || crtl->tail_call_emit)
12469 return DI_REG;
12471 /* Reuse static chain register if it isn't used for parameter
12472 passing. */
12473 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
12475 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
12476 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
12477 return CX_REG;
12479 return DI_REG;
12483 /* Handle a "force_align_arg_pointer" attribute. */
12485 static tree
12486 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
12487 tree, int, bool *no_add_attrs)
12489 if (TREE_CODE (*node) != FUNCTION_TYPE
12490 && TREE_CODE (*node) != METHOD_TYPE
12491 && TREE_CODE (*node) != FIELD_DECL
12492 && TREE_CODE (*node) != TYPE_DECL)
12494 warning (OPT_Wattributes, "%qE attribute only applies to functions",
12495 name);
12496 *no_add_attrs = true;
12499 return NULL_TREE;
12502 /* Return minimum incoming stack alignment. */
12504 static unsigned int
12505 ix86_minimum_incoming_stack_boundary (bool sibcall)
12507 unsigned int incoming_stack_boundary;
12509 /* The stack of an interrupt handler is aligned to 128 bits in 64-bit mode. */
12510 if (cfun->machine->func_type != TYPE_NORMAL)
12511 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
12512 /* Prefer the one specified at command line. */
12513 else if (ix86_user_incoming_stack_boundary)
12514 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
12515 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
12516 when -mstackrealign is used, this isn't a sibcall check, and the
12517 estimated stack alignment is 128 bits. */
12518 else if (!sibcall
12519 && ix86_force_align_arg_pointer
12520 && crtl->stack_alignment_estimated == 128)
12521 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12522 else
12523 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
12525 /* Incoming stack alignment can be changed on individual functions
12526 via force_align_arg_pointer attribute. We use the smallest
12527 incoming stack boundary. */
12528 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
12529 && lookup_attribute (ix86_force_align_arg_pointer_string,
12530 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
12531 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12533 /* The incoming stack frame has to be aligned at least at
12534 parm_stack_boundary. */
12535 if (incoming_stack_boundary < crtl->parm_stack_boundary)
12536 incoming_stack_boundary = crtl->parm_stack_boundary;
12538 /* Stack at entrance of main is aligned by runtime. We use the
12539 smallest incoming stack boundary. */
12540 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12541 && DECL_NAME (current_function_decl)
12542 && MAIN_NAME_P (DECL_NAME (current_function_decl))
12543 && DECL_FILE_SCOPE_P (current_function_decl))
12544 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12546 return incoming_stack_boundary;
12549 /* Update incoming stack boundary and estimated stack alignment. */
12551 static void
12552 ix86_update_stack_boundary (void)
12554 ix86_incoming_stack_boundary
12555 = ix86_minimum_incoming_stack_boundary (false);
12557 /* x86_64 vararg needs 16-byte stack alignment for the register save
12558 area. */
12559 if (TARGET_64BIT
12560 && cfun->stdarg
12561 && crtl->stack_alignment_estimated < 128)
12562 crtl->stack_alignment_estimated = 128;
12564 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12565 if (ix86_tls_descriptor_calls_expanded_in_cfun
12566 && crtl->preferred_stack_boundary < 128)
12567 crtl->preferred_stack_boundary = 128;
12570 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12571 needed or an rtx for DRAP otherwise. */
12573 static rtx
12574 ix86_get_drap_rtx (void)
12576 /* We must use DRAP if there are outgoing arguments on stack and
12577 ACCUMULATE_OUTGOING_ARGS is false. */
12578 if (ix86_force_drap
12579 || (cfun->machine->outgoing_args_on_stack
12580 && !ACCUMULATE_OUTGOING_ARGS))
12581 crtl->need_drap = true;
12583 if (stack_realign_drap)
12585 /* Assign DRAP to vDRAP and return vDRAP. */
12586 unsigned int regno = find_drap_reg ();
12587 rtx drap_vreg;
12588 rtx arg_ptr;
12589 rtx_insn *seq, *insn;
12591 arg_ptr = gen_rtx_REG (Pmode, regno);
12592 crtl->drap_reg = arg_ptr;
12594 start_sequence ();
12595 drap_vreg = copy_to_reg (arg_ptr);
12596 seq = get_insns ();
12597 end_sequence ();
12599 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12600 if (!optimize)
12602 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12603 RTX_FRAME_RELATED_P (insn) = 1;
12605 return drap_vreg;
12607 else
12608 return NULL;
12611 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12613 static rtx
12614 ix86_internal_arg_pointer (void)
12616 return virtual_incoming_args_rtx;
12619 struct scratch_reg {
12620 rtx reg;
12621 bool saved;
12624 /* Return a short-lived scratch register for use on function entry.
12625 In 32-bit mode, it is valid only after the registers are saved
12626 in the prologue. This register must be released by means of
12627 release_scratch_register_on_entry once it is dead. */
12629 static void
12630 get_scratch_register_on_entry (struct scratch_reg *sr)
12632 int regno;
12634 sr->saved = false;
12636 if (TARGET_64BIT)
12638 /* We always use R11 in 64-bit mode. */
12639 regno = R11_REG;
12641 else
12643 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12644 bool fastcall_p
12645 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12646 bool thiscall_p
12647 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12648 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12649 int regparm = ix86_function_regparm (fntype, decl);
12650 int drap_regno
12651 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12653 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12654 for the static chain register. */
12655 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12656 && drap_regno != AX_REG)
12657 regno = AX_REG;
12658 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12659 for the static chain register. */
12660 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12661 regno = AX_REG;
12662 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12663 regno = DX_REG;
12664 /* ecx is the static chain register. */
12665 else if (regparm < 3 && !fastcall_p && !thiscall_p
12666 && !static_chain_p
12667 && drap_regno != CX_REG)
12668 regno = CX_REG;
12669 else if (ix86_save_reg (BX_REG, true, false))
12670 regno = BX_REG;
12671 /* esi is the static chain register. */
12672 else if (!(regparm == 3 && static_chain_p)
12673 && ix86_save_reg (SI_REG, true, false))
12674 regno = SI_REG;
12675 else if (ix86_save_reg (DI_REG, true, false))
12676 regno = DI_REG;
12677 else
12679 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12680 sr->saved = true;
12684 sr->reg = gen_rtx_REG (Pmode, regno);
12685 if (sr->saved)
12687 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12688 RTX_FRAME_RELATED_P (insn) = 1;
12692 /* Release a scratch register obtained from the preceding function.
12694 If RELEASE_VIA_POP is true, we just pop the register off the stack
12695 to release it. This is what non-Linux systems use with -fstack-check.
12697 Otherwise we use OFFSET to locate the saved register and the
12698 allocated stack space becomes part of the local frame and is
12699 deallocated by the epilogue. */
12701 static void
12702 release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset,
12703 bool release_via_pop)
12705 if (sr->saved)
12707 if (release_via_pop)
12709 struct machine_function *m = cfun->machine;
12710 rtx x, insn = emit_insn (gen_pop (sr->reg));
12712 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12713 RTX_FRAME_RELATED_P (insn) = 1;
12714 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12715 x = gen_rtx_SET (stack_pointer_rtx, x);
12716 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12717 m->fs.sp_offset -= UNITS_PER_WORD;
12719 else
12721 rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset));
12722 x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x));
12723 emit_insn (x);
12728 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12730 This differs from the next routine in that it tries hard to prevent
12731 attacks that jump the stack guard. Thus it is never allowed to allocate
12732 more than PROBE_INTERVAL bytes of stack space without a suitable
12733 probe.
12735 INT_REGISTERS_SAVED is true if integer registers have already been
12736 pushed on the stack. */
12738 static void
12739 ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
12740 const bool int_registers_saved)
12742 struct machine_function *m = cfun->machine;
12744 /* If this function does not statically allocate stack space, then
12745 no probes are needed. */
12746 if (!size)
12748 /* However, the allocation of space via pushes for register
12749 saves could be viewed as allocating space, but without the
12750 need to probe. */
12751 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12752 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12753 else
12754 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12755 return;
12758 /* If we are a noreturn function, then we have to consider the
12759 possibility that we're called via a jump rather than a call.
12761 Thus we don't have the implicit probe generated by saving the
12762 return address into the stack at the call. Thus, the stack
12763 pointer could be anywhere in the guard page. The safe thing
12764 to do is emit a probe now.
12766 The probe can be avoided if we have already emitted any callee
12767 register saves into the stack or have a frame pointer (which will
12768 have been saved as well). Those saves will function as implicit
12769 probes.
12771 ?!? This should be revamped to work like aarch64 and s390 where
12772 we track the offset from the most recent probe. Normally that
12773 offset would be zero. For a noreturn function we would reset
12774 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12775 we just probe when we cross PROBE_INTERVAL. */
12776 if (TREE_THIS_VOLATILE (cfun->decl)
12777 && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed))
12779 /* We can safely use any register here since we're just going to push
12780 its value and immediately pop it back. But we do try and avoid
12781 argument passing registers so as not to introduce dependencies in
12782 the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
12783 rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12784 rtx_insn *insn_push = emit_insn (gen_push (dummy_reg));
12785 rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg));
12786 m->fs.sp_offset -= UNITS_PER_WORD;
12787 if (m->fs.cfa_reg == stack_pointer_rtx)
12789 m->fs.cfa_offset -= UNITS_PER_WORD;
12790 rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
12791 x = gen_rtx_SET (stack_pointer_rtx, x);
12792 add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x);
12793 RTX_FRAME_RELATED_P (insn_push) = 1;
12794 x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
12795 x = gen_rtx_SET (stack_pointer_rtx, x);
12796 add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x);
12797 RTX_FRAME_RELATED_P (insn_pop) = 1;
12799 emit_insn (gen_blockage ());
12802 /* If we allocate less than the size of the guard statically,
12803 then no probing is necessary, but we do need to allocate
12804 the stack. */
12805 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12807 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12808 GEN_INT (-size), -1,
12809 m->fs.cfa_reg == stack_pointer_rtx);
12810 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12811 return;
12814 /* We're allocating a large enough stack frame that we need to
12815 emit probes. Either emit them inline or in a loop depending
12816 on the size. */
12817 HOST_WIDE_INT probe_interval = get_probe_interval ();
12818 if (size <= 4 * probe_interval)
12820 HOST_WIDE_INT i;
12821 for (i = probe_interval; i <= size; i += probe_interval)
12823 /* Allocate PROBE_INTERVAL bytes. */
12824 rtx insn
12825 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12826 GEN_INT (-probe_interval), -1,
12827 m->fs.cfa_reg == stack_pointer_rtx);
12828 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12830 /* And probe at *sp. */
12831 emit_stack_probe (stack_pointer_rtx);
12832 emit_insn (gen_blockage ());
12835 /* We need to allocate space for the residual, but we do not need
12836 to probe the residual. */
12837 HOST_WIDE_INT residual = (i - probe_interval - size);
12838 if (residual)
12839 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12840 GEN_INT (residual), -1,
12841 m->fs.cfa_reg == stack_pointer_rtx);
12842 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12844 else
12846 /* We expect the GP registers to be saved when probes are used
12847 as the probing sequences might need a scratch register and
12848 the routine to allocate one assumes the integer registers
12849 have already been saved. */
12850 gcc_assert (int_registers_saved);
12852 struct scratch_reg sr;
12853 get_scratch_register_on_entry (&sr);
12855 /* If we needed to save a register, then account for any space
12856 that was pushed (we are not going to pop the register when
12857 we do the restore). */
12858 if (sr.saved)
12859 size -= UNITS_PER_WORD;
12861 /* Step 1: round SIZE down to a multiple of the interval. */
12862 HOST_WIDE_INT rounded_size = size & -probe_interval;
12864 /* Step 2: compute final value of the loop counter. Use lea if
12865 possible. */
12866 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12867 rtx insn;
12868 if (address_no_seg_operand (addr, Pmode))
12869 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12870 else
12872 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12873 insn = emit_insn (gen_rtx_SET (sr.reg,
12874 gen_rtx_PLUS (Pmode, sr.reg,
12875 stack_pointer_rtx)));
12877 if (m->fs.cfa_reg == stack_pointer_rtx)
12879 add_reg_note (insn, REG_CFA_DEF_CFA,
12880 plus_constant (Pmode, sr.reg,
12881 m->fs.cfa_offset + rounded_size));
12882 RTX_FRAME_RELATED_P (insn) = 1;
12885 /* Step 3: the loop. */
12886 rtx size_rtx = GEN_INT (rounded_size);
12887 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12888 size_rtx));
12889 if (m->fs.cfa_reg == stack_pointer_rtx)
12891 m->fs.cfa_offset += rounded_size;
12892 add_reg_note (insn, REG_CFA_DEF_CFA,
12893 plus_constant (Pmode, stack_pointer_rtx,
12894 m->fs.cfa_offset));
12895 RTX_FRAME_RELATED_P (insn) = 1;
12897 m->fs.sp_offset += rounded_size;
12898 emit_insn (gen_blockage ());
12900 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12901 is equal to ROUNDED_SIZE. */
12903 if (size != rounded_size)
12904 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12905 GEN_INT (rounded_size - size), -1,
12906 m->fs.cfa_reg == stack_pointer_rtx);
12907 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12909 /* This does not deallocate the space reserved for the scratch
12910 register. That will be deallocated in the epilogue. */
12911 release_scratch_register_on_entry (&sr, size, false);
12914 /* Make sure nothing is scheduled before we are done. */
12915 emit_insn (gen_blockage ());
12918 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12920 INT_REGISTERS_SAVED is true if integer registers have already been
12921 pushed on the stack. */
12923 static void
12924 ix86_adjust_stack_and_probe (HOST_WIDE_INT size,
12925 const bool int_registers_saved)
12927 /* We skip the probe for the first interval + a small dope of 4 words and
12928 probe that many bytes past the specified size to maintain a protection
12929 area at the bottom of the stack. */
12930 const int dope = 4 * UNITS_PER_WORD;
12931 rtx size_rtx = GEN_INT (size), last;
12933 /* See if we have a constant small number of probes to generate. If so,
12934 that's the easy case. The run-time loop is made up of 9 insns in the
12935 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12936 for n # of intervals. */
12937 if (size <= 4 * get_probe_interval ())
12939 HOST_WIDE_INT i, adjust;
12940 bool first_probe = true;
12942 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12943 values of N from 1 until it exceeds SIZE. If only one probe is
12944 needed, this will not generate any code. Then adjust and probe
12945 to PROBE_INTERVAL + SIZE. */
12946 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12948 if (first_probe)
12950 adjust = 2 * get_probe_interval () + dope;
12951 first_probe = false;
12953 else
12954 adjust = get_probe_interval ();
12956 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12957 plus_constant (Pmode, stack_pointer_rtx,
12958 -adjust)));
12959 emit_stack_probe (stack_pointer_rtx);
12962 if (first_probe)
12963 adjust = size + get_probe_interval () + dope;
12964 else
12965 adjust = size + get_probe_interval () - i;
12967 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12968 plus_constant (Pmode, stack_pointer_rtx,
12969 -adjust)));
12970 emit_stack_probe (stack_pointer_rtx);
12972 /* Adjust back to account for the additional first interval. */
12973 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12974 plus_constant (Pmode, stack_pointer_rtx,
12975 (get_probe_interval ()
12976 + dope))));
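      /* Worked example (illustrative only, assuming a 4 KiB probe interval
	 and a 64-bit target, so dope = 4 * 8 = 32): for SIZE = 8192 the loop
	 above runs once with adjust = 2*4096 + 32 = 8224, the tail
	 adjustment is 8192 + 4096 - 8192 = 4096, and the "adjust back" step
	 adds 4096 + 32 back, so the net change to SP is exactly -8192 while
	 both probes land past the allocated area, as intended.  */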
12979 /* Otherwise, do the same as above, but in a loop. Note that we must be
12980 extra careful with variables wrapping around because we might be at
12981 the very top (or the very bottom) of the address space and we have
12982 to be able to handle this case properly; in particular, we use an
12983 equality test for the loop condition. */
12984 else
12986 /* We expect the GP registers to be saved when probes are used
12987 as the probing sequences might need a scratch register and
12988 the routine to allocate one assumes the integer registers
12989 have already been saved. */
12990 gcc_assert (int_registers_saved);
12992 HOST_WIDE_INT rounded_size;
12993 struct scratch_reg sr;
12995 get_scratch_register_on_entry (&sr);
12997 /* If we needed to save a register, then account for any space
12998 that was pushed (we are not going to pop the register when
12999 we do the restore). */
13000 if (sr.saved)
13001 size -= UNITS_PER_WORD;
13003 /* Step 1: round SIZE to the previous multiple of the interval. */
13005 rounded_size = ROUND_DOWN (size, get_probe_interval ());
13008 /* Step 2: compute initial and final value of the loop counter. */
13010 /* SP = SP_0 + PROBE_INTERVAL. */
13011 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13012 plus_constant (Pmode, stack_pointer_rtx,
13013 - (get_probe_interval () + dope))));
13015 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13016 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13017 emit_insn (gen_rtx_SET (sr.reg,
13018 plus_constant (Pmode, stack_pointer_rtx,
13019 -rounded_size)));
13020 else
13022 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13023 emit_insn (gen_rtx_SET (sr.reg,
13024 gen_rtx_PLUS (Pmode, sr.reg,
13025 stack_pointer_rtx)));
13029 /* Step 3: the loop
13033 SP = SP + PROBE_INTERVAL
13034 probe at SP
13036 while (SP != LAST_ADDR)
13038 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13039 values of N from 1 until it is equal to ROUNDED_SIZE. */
13041 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13044 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13045 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13047 if (size != rounded_size)
13049 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13050 plus_constant (Pmode, stack_pointer_rtx,
13051 rounded_size - size)));
13052 emit_stack_probe (stack_pointer_rtx);
13055 /* Adjust back to account for the additional first interval. */
13056 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13057 plus_constant (Pmode, stack_pointer_rtx,
13058 (get_probe_interval ()
13059 + dope))));
13061 /* This does not deallocate the space reserved for the scratch
13062 register. That will be deallocated in the epilogue. */
13063 release_scratch_register_on_entry (&sr, size, false);
13066 /* Even if the stack pointer isn't the CFA register, we need to correctly
13067 describe the adjustments made to it, in particular differentiate the
13068 frame-related ones from the frame-unrelated ones. */
13069 if (size > 0)
13071 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13072 XVECEXP (expr, 0, 0)
13073 = gen_rtx_SET (stack_pointer_rtx,
13074 plus_constant (Pmode, stack_pointer_rtx, -size));
13075 XVECEXP (expr, 0, 1)
13076 = gen_rtx_SET (stack_pointer_rtx,
13077 plus_constant (Pmode, stack_pointer_rtx,
13078 get_probe_interval () + dope + size));
13079 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13080 RTX_FRAME_RELATED_P (last) = 1;
13082 cfun->machine->fs.sp_offset += size;
13085 /* Make sure nothing is scheduled before we are done. */
13086 emit_insn (gen_blockage ());
13089 /* Adjust the stack pointer up to REG while probing it. */
13091 const char *
13092 output_adjust_stack_and_probe (rtx reg)
13094 static int labelno = 0;
13095 char loop_lab[32];
13096 rtx xops[2];
13098 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13100 /* Loop. */
13101 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13103 /* SP = SP + PROBE_INTERVAL. */
13104 xops[0] = stack_pointer_rtx;
13105 xops[1] = GEN_INT (get_probe_interval ());
13106 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13108 /* Probe at SP. */
13109 xops[1] = const0_rtx;
13110 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13112 /* Test if SP == LAST_ADDR. */
13113 xops[0] = stack_pointer_rtx;
13114 xops[1] = reg;
13115 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13117 /* Branch. */
13118 fputs ("\tjne\t", asm_out_file);
13119 assemble_name_raw (asm_out_file, loop_lab);
13120 fputc ('\n', asm_out_file);
13122 return "";
13125 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13126 inclusive. These are offsets from the current stack pointer.
13128 INT_REGISTERS_SAVED is true if integer registers have already been
13129 pushed on the stack. */
13131 static void
13132 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
13133 const bool int_registers_saved)
13135 /* See if we have a constant small number of probes to generate. If so,
13136 that's the easy case. The run-time loop is made up of 6 insns in the
13137 generic case while the compile-time loop is made up of n insns for n #
13138 of intervals. */
13139 if (size <= 6 * get_probe_interval ())
13141 HOST_WIDE_INT i;
13143 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13144 it exceeds SIZE. If only one probe is needed, this will not
13145 generate any code. Then probe at FIRST + SIZE. */
13146 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
13147 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13148 -(first + i)));
13150 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13151 -(first + size)));
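      /* Illustrative example (assuming a 4 KiB probe interval): with
	 FIRST = 4096 and SIZE = 12000, the loop probes at SP - 8192 and
	 SP - 12288, and the trailing probe just above hits
	 SP - 16096 = SP - (FIRST + SIZE).  */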
13154 /* Otherwise, do the same as above, but in a loop. Note that we must be
13155 extra careful with variables wrapping around because we might be at
13156 the very top (or the very bottom) of the address space and we have
13157 to be able to handle this case properly; in particular, we use an
13158 equality test for the loop condition. */
13159 else
13161 /* We expect the GP registers to be saved when probes are used
13162 as the probing sequences might need a scratch register and
13163 the routine to allocate one assumes the integer registers
13164 have already been saved. */
13165 gcc_assert (int_registers_saved);
13167 HOST_WIDE_INT rounded_size, last;
13168 struct scratch_reg sr;
13170 get_scratch_register_on_entry (&sr);
13173 /* Step 1: round SIZE to the previous multiple of the interval. */
13175 rounded_size = ROUND_DOWN (size, get_probe_interval ());
13178 /* Step 2: compute initial and final value of the loop counter. */
13180 /* TEST_OFFSET = FIRST. */
13181 emit_move_insn (sr.reg, GEN_INT (-first));
13183 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13184 last = first + rounded_size;
13187 /* Step 3: the loop
13191 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13192 probe at TEST_ADDR
13194 while (TEST_ADDR != LAST_ADDR)
13196 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13197 until it is equal to ROUNDED_SIZE. */
13199 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13202 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13203 that SIZE is equal to ROUNDED_SIZE. */
13205 if (size != rounded_size)
13206 emit_stack_probe (plus_constant (Pmode,
13207 gen_rtx_PLUS (Pmode,
13208 stack_pointer_rtx,
13209 sr.reg),
13210 rounded_size - size));
13212 release_scratch_register_on_entry (&sr, size, true);
13215 /* Make sure nothing is scheduled before we are done. */
13216 emit_insn (gen_blockage ());
13219 /* Probe a range of stack addresses from REG to END, inclusive. These are
13220 offsets from the current stack pointer. */
13222 const char *
13223 output_probe_stack_range (rtx reg, rtx end)
13225 static int labelno = 0;
13226 char loop_lab[32];
13227 rtx xops[3];
13229 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13231 /* Loop. */
13232 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13234 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13235 xops[0] = reg;
13236 xops[1] = GEN_INT (get_probe_interval ());
13237 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13239 /* Probe at TEST_ADDR. */
13240 xops[0] = stack_pointer_rtx;
13241 xops[1] = reg;
13242 xops[2] = const0_rtx;
13243 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13245 /* Test if TEST_ADDR == LAST_ADDR. */
13246 xops[0] = reg;
13247 xops[1] = end;
13248 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13250 /* Branch. */
13251 fputs ("\tjne\t", asm_out_file);
13252 assemble_name_raw (asm_out_file, loop_lab);
13253 fputc ('\n', asm_out_file);
13255 return "";
13258 /* Return true if stack frame is required. Update STACK_ALIGNMENT
13259 to the largest alignment, in bits, of stack slot used if stack
13260 frame is required and CHECK_STACK_SLOT is true. */
13262 static bool
13263 ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
13264 bool check_stack_slot)
13266 HARD_REG_SET set_up_by_prologue, prologue_used;
13267 basic_block bb;
13269 CLEAR_HARD_REG_SET (prologue_used);
13270 CLEAR_HARD_REG_SET (set_up_by_prologue);
13271 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13272 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13273 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13274 HARD_FRAME_POINTER_REGNUM);
13276 /* The preferred stack alignment is the minimum stack alignment. */
13277 if (stack_alignment > crtl->preferred_stack_boundary)
13278 stack_alignment = crtl->preferred_stack_boundary;
13280 bool require_stack_frame = false;
13282 FOR_EACH_BB_FN (bb, cfun)
13284 rtx_insn *insn;
13285 FOR_BB_INSNS (bb, insn)
13286 if (NONDEBUG_INSN_P (insn)
13287 && requires_stack_frame_p (insn, prologue_used,
13288 set_up_by_prologue))
13290 require_stack_frame = true;
13292 if (check_stack_slot)
13294 /* Find the maximum stack alignment. */
13295 subrtx_iterator::array_type array;
13296 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
13297 if (MEM_P (*iter)
13298 && (reg_mentioned_p (stack_pointer_rtx,
13299 *iter)
13300 || reg_mentioned_p (frame_pointer_rtx,
13301 *iter)))
13303 unsigned int alignment = MEM_ALIGN (*iter);
13304 if (alignment > stack_alignment)
13305 stack_alignment = alignment;
13311 return require_stack_frame;
13314 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
13315 will guide prologue/epilogue to be generated in correct form. */
13317 static void
13318 ix86_finalize_stack_frame_flags (void)
13320 /* Check if stack realign is really needed after reload, and
 13321      store the result in cfun.  */
13322 unsigned int incoming_stack_boundary
13323 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13324 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13325 unsigned int stack_alignment
13326 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13327 ? crtl->max_used_stack_slot_alignment
13328 : crtl->stack_alignment_needed);
13329 unsigned int stack_realign
13330 = (incoming_stack_boundary < stack_alignment);
13331 bool recompute_frame_layout_p = false;
13333 if (crtl->stack_realign_finalized)
 13335       /* After stack_realign_needed is finalized, we can no longer
13336 change it. */
13337 gcc_assert (crtl->stack_realign_needed == stack_realign);
13338 return;
13341 /* If the only reason for frame_pointer_needed is that we conservatively
13342 assumed stack realignment might be needed or -fno-omit-frame-pointer
 13343      is used, but in the end nothing that needed the stack alignment was
 13344      spilled and there is no stack access, clear frame_pointer_needed and say we
13345 don't need stack realignment. */
13346 if ((stack_realign || !flag_omit_frame_pointer)
13347 && frame_pointer_needed
13348 && crtl->is_leaf
13349 && crtl->sp_is_unchanging
13350 && !ix86_current_function_calls_tls_descriptor
13351 && !crtl->accesses_prior_frames
13352 && !cfun->calls_alloca
13353 && !crtl->calls_eh_return
13354 /* See ira_setup_eliminable_regset for the rationale. */
13355 && !(STACK_CHECK_MOVING_SP
13356 && flag_stack_check
13357 && flag_exceptions
13358 && cfun->can_throw_non_call_exceptions)
13359 && !ix86_frame_pointer_required ()
13360 && get_frame_size () == 0
13361 && ix86_nsaved_sseregs () == 0
13362 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13364 if (ix86_find_max_used_stack_alignment (stack_alignment,
13365 stack_realign))
13367 /* Stack frame is required. If stack alignment needed is less
13368 than incoming stack boundary, don't realign stack. */
13369 stack_realign = incoming_stack_boundary < stack_alignment;
13370 if (!stack_realign)
13372 crtl->max_used_stack_slot_alignment
13373 = incoming_stack_boundary;
13374 crtl->stack_alignment_needed
13375 = incoming_stack_boundary;
13376 /* Also update preferred_stack_boundary for leaf
13377 functions. */
13378 crtl->preferred_stack_boundary
13379 = incoming_stack_boundary;
13382 else
13384 /* If drap has been set, but it actually isn't live at the
13385 start of the function, there is no reason to set it up. */
13386 if (crtl->drap_reg)
13388 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13389 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
13390 REGNO (crtl->drap_reg)))
13392 crtl->drap_reg = NULL_RTX;
13393 crtl->need_drap = false;
13396 else
13397 cfun->machine->no_drap_save_restore = true;
13399 frame_pointer_needed = false;
13400 stack_realign = false;
13401 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13402 crtl->stack_alignment_needed = incoming_stack_boundary;
13403 crtl->stack_alignment_estimated = incoming_stack_boundary;
13404 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13405 crtl->preferred_stack_boundary = incoming_stack_boundary;
13406 df_finish_pass (true);
13407 df_scan_alloc (NULL);
13408 df_scan_blocks ();
13409 df_compute_regs_ever_live (true);
13410 df_analyze ();
13412 if (flag_var_tracking)
13414 /* Since frame pointer is no longer available, replace it with
13415 stack pointer - UNITS_PER_WORD in debug insns. */
13416 df_ref ref, next;
13417 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
13418 ref; ref = next)
13420 next = DF_REF_NEXT_REG (ref);
13421 if (!DF_REF_INSN_INFO (ref))
13422 continue;
13424 /* Make sure the next ref is for a different instruction,
13425 so that we're not affected by the rescan. */
13426 rtx_insn *insn = DF_REF_INSN (ref);
13427 while (next && DF_REF_INSN (next) == insn)
13428 next = DF_REF_NEXT_REG (next);
13430 if (DEBUG_INSN_P (insn))
13432 bool changed = false;
13433 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
13435 rtx *loc = DF_REF_LOC (ref);
13436 if (*loc == hard_frame_pointer_rtx)
13438 *loc = plus_constant (Pmode,
13439 stack_pointer_rtx,
13440 -UNITS_PER_WORD);
13441 changed = true;
13444 if (changed)
13445 df_insn_rescan (insn);
13450 recompute_frame_layout_p = true;
13453 else if (crtl->max_used_stack_slot_alignment
13454 > crtl->preferred_stack_boundary)
13456 /* We don't need to realign stack. But we still need to keep
13457 stack frame properly aligned to satisfy the largest alignment
13458 of stack slots. */
13459 if (ix86_find_max_used_stack_alignment (stack_alignment, true))
13460 cfun->machine->max_used_stack_alignment
13461 = stack_alignment / BITS_PER_UNIT;
13464 if (crtl->stack_realign_needed != stack_realign)
13465 recompute_frame_layout_p = true;
13466 crtl->stack_realign_needed = stack_realign;
13467 crtl->stack_realign_finalized = true;
13468 if (recompute_frame_layout_p)
13469 ix86_compute_frame_layout ();
13472 /* Delete SET_GOT right after entry block if it is allocated to reg. */
13474 static void
13475 ix86_elim_entry_set_got (rtx reg)
13477 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13478 rtx_insn *c_insn = BB_HEAD (bb);
13479 if (!NONDEBUG_INSN_P (c_insn))
13480 c_insn = next_nonnote_nondebug_insn (c_insn);
13481 if (c_insn && NONJUMP_INSN_P (c_insn))
13483 rtx pat = PATTERN (c_insn);
13484 if (GET_CODE (pat) == PARALLEL)
13486 rtx vec = XVECEXP (pat, 0, 0);
13487 if (GET_CODE (vec) == SET
13488 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13489 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13490 delete_insn (c_insn);
13495 static rtx
13496 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
13498 rtx addr, mem;
13500 if (offset)
13501 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
13502 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
13503 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
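/* For example (schematic RTL, 64-bit word_mode assumed), a call such as
   gen_frame_store (r15, rax, -16) produces

     (set (mem:DI (plus:DI (reg:DI ax) (const_int -16))) (reg:DI r15))

   while gen_frame_load simply swaps the two arms of the SET.  */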
13506 static inline rtx
13507 gen_frame_load (rtx reg, rtx frame_reg, int offset)
13509 return gen_frame_set (reg, frame_reg, offset, false);
13512 static inline rtx
13513 gen_frame_store (rtx reg, rtx frame_reg, int offset)
13515 return gen_frame_set (reg, frame_reg, offset, true);
13518 static void
13519 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
13521 struct machine_function *m = cfun->machine;
13522 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13523 + m->call_ms2sysv_extra_regs;
13524 rtvec v = rtvec_alloc (ncregs + 1);
13525 unsigned int align, i, vi = 0;
13526 rtx_insn *insn;
13527 rtx sym, addr;
13528 rtx rax = gen_rtx_REG (word_mode, AX_REG);
13529 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13531 /* AL should only be live with sysv_abi. */
13532 gcc_assert (!ix86_eax_live_at_start_p ());
13533 gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
 13535   /* Setup RAX as the stub's base pointer.  We use stack_realign_offset
 13536      whether or not we've actually realigned the stack.  */
13537 align = GET_MODE_ALIGNMENT (V4SFmode);
13538 addr = choose_baseaddr (frame.stack_realign_offset
13539 + xlogue.get_stub_ptr_offset (), &align, AX_REG);
13540 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13542 emit_insn (gen_rtx_SET (rax, addr));
13544 /* Get the stub symbol. */
13545 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
13546 : XLOGUE_STUB_SAVE);
13547 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13549 for (i = 0; i < ncregs; ++i)
13551 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13552 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
13553 r.regno);
13554 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
13557 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
13559 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
13560 RTX_FRAME_RELATED_P (insn) = true;
13563 /* Expand the prologue into a bunch of separate insns. */
13565 void
13566 ix86_expand_prologue (void)
13568 struct machine_function *m = cfun->machine;
13569 rtx insn, t;
13570 HOST_WIDE_INT allocate;
13571 bool int_registers_saved;
13572 bool sse_registers_saved;
13573 bool save_stub_call_needed;
13574 rtx static_chain = NULL_RTX;
13576 if (ix86_function_naked (current_function_decl))
13577 return;
13579 ix86_finalize_stack_frame_flags ();
13581 /* DRAP should not coexist with stack_realign_fp */
13582 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13584 memset (&m->fs, 0, sizeof (m->fs));
13586 /* Initialize CFA state for before the prologue. */
13587 m->fs.cfa_reg = stack_pointer_rtx;
13588 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13590 /* Track SP offset to the CFA. We continue tracking this after we've
13591 swapped the CFA register away from SP. In the case of re-alignment
 13592      this is fudged; we're interested in offsets within the local frame.  */
13593 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13594 m->fs.sp_valid = true;
13595 m->fs.sp_realigned = false;
13597 const struct ix86_frame &frame = cfun->machine->frame;
13599 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13601 /* We should have already generated an error for any use of
13602 ms_hook on a nested function. */
13603 gcc_checking_assert (!ix86_static_chain_on_stack);
 13605       /* Check if profiling is active and we shall use the profiling-before-
 13606 	 prologue variant.  If so, sorry.  */
13607 if (crtl->profile && flag_fentry != 0)
13608 sorry ("ms_hook_prologue attribute isn%'t compatible "
13609 "with -mfentry for 32-bit");
13611 /* In ix86_asm_output_function_label we emitted:
13612 8b ff movl.s %edi,%edi
13613 55 push %ebp
13614 8b ec movl.s %esp,%ebp
13616 This matches the hookable function prologue in Win32 API
13617 functions in Microsoft Windows XP Service Pack 2 and newer.
13618 Wine uses this to enable Windows apps to hook the Win32 API
13619 functions provided by Wine.
13621 What that means is that we've already set up the frame pointer. */
13623 if (frame_pointer_needed
13624 && !(crtl->drap_reg && crtl->stack_realign_needed))
13626 rtx push, mov;
13628 /* We've decided to use the frame pointer already set up.
13629 Describe this to the unwinder by pretending that both
13630 push and mov insns happen right here.
13632 Putting the unwind info here at the end of the ms_hook
13633 is done so that we can make absolutely certain we get
13634 the required byte sequence at the start of the function,
13635 rather than relying on an assembler that can produce
13636 the exact encoding required.
13638 However it does mean (in the unpatched case) that we have
13639 a 1 insn window where the asynchronous unwind info is
13640 incorrect. However, if we placed the unwind info at
13641 its correct location we would have incorrect unwind info
13642 in the patched case. Which is probably all moot since
13643 I don't expect Wine generates dwarf2 unwind info for the
13644 system libraries that use this feature. */
13646 insn = emit_insn (gen_blockage ());
13648 push = gen_push (hard_frame_pointer_rtx);
13649 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13650 stack_pointer_rtx);
13651 RTX_FRAME_RELATED_P (push) = 1;
13652 RTX_FRAME_RELATED_P (mov) = 1;
13654 RTX_FRAME_RELATED_P (insn) = 1;
13655 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13656 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13658 /* Note that gen_push incremented m->fs.cfa_offset, even
13659 though we didn't emit the push insn here. */
13660 m->fs.cfa_reg = hard_frame_pointer_rtx;
13661 m->fs.fp_offset = m->fs.cfa_offset;
13662 m->fs.fp_valid = true;
13664 else
13666 /* The frame pointer is not needed so pop %ebp again.
13667 This leaves us with a pristine state. */
13668 emit_insn (gen_pop (hard_frame_pointer_rtx));
13672 /* The first insn of a function that accepts its static chain on the
13673 stack is to push the register that would be filled in by a direct
13674 call. This insn will be skipped by the trampoline. */
13675 else if (ix86_static_chain_on_stack)
13677 static_chain = ix86_static_chain (cfun->decl, false);
13678 insn = emit_insn (gen_push (static_chain));
13679 emit_insn (gen_blockage ());
13681 /* We don't want to interpret this push insn as a register save,
13682 only as a stack adjustment. The real copy of the register as
13683 a save will be done later, if needed. */
13684 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13685 t = gen_rtx_SET (stack_pointer_rtx, t);
13686 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13687 RTX_FRAME_RELATED_P (insn) = 1;
13690 /* Emit prologue code to adjust stack alignment and setup DRAP, in case
 13691      DRAP is needed and stack realignment is really needed after reload.  */
13692 if (stack_realign_drap)
13694 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13696 /* Can't use DRAP in interrupt function. */
13697 if (cfun->machine->func_type != TYPE_NORMAL)
13698 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13699 "in interrupt service routine. This may be worked "
13700 "around by avoiding functions with aggregate return.");
13702 /* Only need to push parameter pointer reg if it is caller saved. */
13703 if (!call_used_regs[REGNO (crtl->drap_reg)])
13705 /* Push arg pointer reg */
13706 insn = emit_insn (gen_push (crtl->drap_reg));
13707 RTX_FRAME_RELATED_P (insn) = 1;
13710 /* Grab the argument pointer. */
13711 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13712 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13713 RTX_FRAME_RELATED_P (insn) = 1;
13714 m->fs.cfa_reg = crtl->drap_reg;
13715 m->fs.cfa_offset = 0;
13717 /* Align the stack. */
13718 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13719 stack_pointer_rtx,
13720 GEN_INT (-align_bytes)));
13721 RTX_FRAME_RELATED_P (insn) = 1;
13723 /* Replicate the return address on the stack so that return
13724 address can be reached via (argp - 1) slot. This is needed
13725 to implement macro RETURN_ADDR_RTX and intrinsic function
13726 expand_builtin_return_addr etc. */
13727 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13728 t = gen_frame_mem (word_mode, t);
13729 insn = emit_insn (gen_push (t));
13730 RTX_FRAME_RELATED_P (insn) = 1;
13732 /* For the purposes of frame and register save area addressing,
13733 we've started over with a new frame. */
13734 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13735 m->fs.realigned = true;
13737 if (static_chain)
13739 /* Replicate static chain on the stack so that static chain
13740 can be reached via (argp - 2) slot. This is needed for
13741 nested function with stack realignment. */
13742 insn = emit_insn (gen_push (static_chain));
13743 RTX_FRAME_RELATED_P (insn) = 1;
13747 int_registers_saved = (frame.nregs == 0);
13748 sse_registers_saved = (frame.nsseregs == 0);
13749 save_stub_call_needed = (m->call_ms2sysv);
13750 gcc_assert (sse_registers_saved || !save_stub_call_needed);
13752 if (frame_pointer_needed && !m->fs.fp_valid)
13754 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13755 slower on all targets. Also sdb didn't like it. */
13756 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13757 RTX_FRAME_RELATED_P (insn) = 1;
13759 /* Push registers now, before setting the frame pointer
13760 on SEH target. */
13761 if (!int_registers_saved
13762 && TARGET_SEH
13763 && !frame.save_regs_using_mov)
13765 ix86_emit_save_regs ();
13766 int_registers_saved = true;
13767 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13770 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13772 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13773 RTX_FRAME_RELATED_P (insn) = 1;
13775 if (m->fs.cfa_reg == stack_pointer_rtx)
13776 m->fs.cfa_reg = hard_frame_pointer_rtx;
13777 m->fs.fp_offset = m->fs.sp_offset;
13778 m->fs.fp_valid = true;
13782 if (!int_registers_saved)
13784 /* If saving registers via PUSH, do so now. */
13785 if (!frame.save_regs_using_mov)
13787 ix86_emit_save_regs ();
13788 int_registers_saved = true;
13789 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13792 /* When using red zone we may start register saving before allocating
 13793      the stack frame, saving one cycle of the prologue.  However, avoid
13794 doing this if we have to probe the stack; at least on x86_64 the
13795 stack probe can turn into a call that clobbers a red zone location. */
13796 else if (ix86_using_red_zone ()
13797 && (! TARGET_STACK_PROBE
13798 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13800 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13801 int_registers_saved = true;
13805 if (stack_realign_fp)
13807 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13808 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13810 /* Record last valid frame pointer offset. */
13811 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13813 /* The computation of the size of the re-aligned stack frame means
13814 that we must allocate the size of the register save area before
13815 performing the actual alignment. Otherwise we cannot guarantee
13816 that there's enough storage above the realignment point. */
13817 allocate = frame.reg_save_offset - m->fs.sp_offset
13818 + frame.stack_realign_allocate;
13819 if (allocate)
13820 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13821 GEN_INT (-allocate), -1, false);
13823 /* Align the stack. */
13824 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13825 stack_pointer_rtx,
13826 GEN_INT (-align_bytes)));
13827 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13828 m->fs.sp_realigned_offset = m->fs.sp_offset
13829 - frame.stack_realign_allocate;
13830 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13831 Beyond this point, stack access should be done via choose_baseaddr or
13832 by using sp_valid_at and fp_valid_at to determine the correct base
13833 register. Henceforth, any CFA offset should be thought of as logical
13834 and not physical. */
13835 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13836 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13837 m->fs.sp_realigned = true;
13839 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13840 is needed to describe where a register is saved using a realigned
13841 stack pointer, so we need to invalidate the stack pointer for that
13842 target. */
13843 if (TARGET_SEH)
13844 m->fs.sp_valid = false;
13846 /* If SP offset is non-immediate after allocation of the stack frame,
13847 then emit SSE saves or stub call prior to allocating the rest of the
13848 stack frame. This is less efficient for the out-of-line stub because
13849 we can't combine allocations across the call barrier, but it's better
13850 than using a scratch register. */
13851 else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13852 - m->fs.sp_realigned_offset),
13853 Pmode))
13855 if (!sse_registers_saved)
13857 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13858 sse_registers_saved = true;
13860 else if (save_stub_call_needed)
13862 ix86_emit_outlined_ms2sysv_save (frame);
13863 save_stub_call_needed = false;
13868 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13870 if (flag_stack_usage_info)
13872 /* We start to count from ARG_POINTER. */
13873 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13875 /* If it was realigned, take into account the fake frame. */
13876 if (stack_realign_drap)
13878 if (ix86_static_chain_on_stack)
13879 stack_size += UNITS_PER_WORD;
13881 if (!call_used_regs[REGNO (crtl->drap_reg)])
13882 stack_size += UNITS_PER_WORD;
13884 /* This over-estimates by 1 minimal-stack-alignment-unit but
13885 mitigates that by counting in the new return address slot. */
13886 current_function_dynamic_stack_size
13887 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13890 current_function_static_stack_size = stack_size;
13893 /* On SEH target with very large frame size, allocate an area to save
13894 SSE registers (as the very large allocation won't be described). */
13895 if (TARGET_SEH
13896 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13897 && !sse_registers_saved)
13899 HOST_WIDE_INT sse_size =
13900 frame.sse_reg_save_offset - frame.reg_save_offset;
13902 gcc_assert (int_registers_saved);
13904 /* No need to do stack checking as the area will be immediately
13905 written. */
13906 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13907 GEN_INT (-sse_size), -1,
13908 m->fs.cfa_reg == stack_pointer_rtx);
13909 allocate -= sse_size;
13910 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13911 sse_registers_saved = true;
13914 /* The stack has already been decremented by the instruction calling us
13915 so probe if the size is non-negative to preserve the protection area. */
13916 if (allocate >= 0
13917 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13918 || flag_stack_clash_protection))
13920 if (flag_stack_clash_protection)
13922 ix86_adjust_stack_and_probe_stack_clash (allocate,
13923 int_registers_saved);
13924 allocate = 0;
13926 else if (STACK_CHECK_MOVING_SP)
13928 if (!(crtl->is_leaf && !cfun->calls_alloca
13929 && allocate <= get_probe_interval ()))
13931 ix86_adjust_stack_and_probe (allocate, int_registers_saved);
13932 allocate = 0;
13935 else
13937 HOST_WIDE_INT size = allocate;
13939 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13940 size = 0x80000000 - get_stack_check_protect () - 1;
13942 if (TARGET_STACK_PROBE)
13944 if (crtl->is_leaf && !cfun->calls_alloca)
13946 if (size > get_probe_interval ())
13947 ix86_emit_probe_stack_range (0, size, int_registers_saved);
13949 else
13950 ix86_emit_probe_stack_range (0,
13951 size + get_stack_check_protect (),
13952 int_registers_saved);
13954 else
13956 if (crtl->is_leaf && !cfun->calls_alloca)
13958 if (size > get_probe_interval ()
13959 && size > get_stack_check_protect ())
13960 ix86_emit_probe_stack_range (get_stack_check_protect (),
13961 (size
13962 - get_stack_check_protect ()),
13963 int_registers_saved);
13965 else
13966 ix86_emit_probe_stack_range (get_stack_check_protect (), size,
13967 int_registers_saved);
13972 if (allocate == 0)
13974 else if (!ix86_target_stack_probe ()
13975 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13977 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13978 GEN_INT (-allocate), -1,
13979 m->fs.cfa_reg == stack_pointer_rtx);
13981 else
13983 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13984 rtx r10 = NULL;
13985 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13986 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13987 bool eax_live = ix86_eax_live_at_start_p ();
13988 bool r10_live = false;
13990 if (TARGET_64BIT)
13991 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13993 if (eax_live)
13995 insn = emit_insn (gen_push (eax));
13996 allocate -= UNITS_PER_WORD;
13997 /* Note that SEH directives need to continue tracking the stack
13998 pointer even after the frame pointer has been set up. */
13999 if (sp_is_cfa_reg || TARGET_SEH)
14001 if (sp_is_cfa_reg)
14002 m->fs.cfa_offset += UNITS_PER_WORD;
14003 RTX_FRAME_RELATED_P (insn) = 1;
14004 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14005 gen_rtx_SET (stack_pointer_rtx,
14006 plus_constant (Pmode, stack_pointer_rtx,
14007 -UNITS_PER_WORD)));
14011 if (r10_live)
14013 r10 = gen_rtx_REG (Pmode, R10_REG);
14014 insn = emit_insn (gen_push (r10));
14015 allocate -= UNITS_PER_WORD;
14016 if (sp_is_cfa_reg || TARGET_SEH)
14018 if (sp_is_cfa_reg)
14019 m->fs.cfa_offset += UNITS_PER_WORD;
14020 RTX_FRAME_RELATED_P (insn) = 1;
14021 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14022 gen_rtx_SET (stack_pointer_rtx,
14023 plus_constant (Pmode, stack_pointer_rtx,
14024 -UNITS_PER_WORD)));
14028 emit_move_insn (eax, GEN_INT (allocate));
14029 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
14031 /* Use the fact that AX still contains ALLOCATE. */
14032 adjust_stack_insn = (Pmode == DImode
14033 ? gen_pro_epilogue_adjust_stack_di_sub
14034 : gen_pro_epilogue_adjust_stack_si_sub);
14036 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14037 stack_pointer_rtx, eax));
14039 if (sp_is_cfa_reg || TARGET_SEH)
14041 if (sp_is_cfa_reg)
14042 m->fs.cfa_offset += allocate;
14043 RTX_FRAME_RELATED_P (insn) = 1;
14044 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14045 gen_rtx_SET (stack_pointer_rtx,
14046 plus_constant (Pmode, stack_pointer_rtx,
14047 -allocate)));
14049 m->fs.sp_offset += allocate;
14051 /* Use stack_pointer_rtx for relative addressing so that code
14052 works for realigned stack, too. */
14053 if (r10_live && eax_live)
14055 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14056 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14057 gen_frame_mem (word_mode, t));
14058 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14059 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14060 gen_frame_mem (word_mode, t));
14062 else if (eax_live || r10_live)
14064 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14065 emit_move_insn (gen_rtx_REG (word_mode,
14066 (eax_live ? AX_REG : R10_REG)),
14067 gen_frame_mem (word_mode, t));
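      /* Schematic illustration (not the literal expansion): on a 64-bit
	 Windows target this path amounts to

		mov	$ALLOCATE, %rax
		call	___chkstk_ms
		sub	%rax, %rsp

	 with the probing done inside the ___chkstk_ms helper, followed by
	 the reloads of %r10/%rax above when those registers were live at
	 entry.  The helper name and exact sequence are target-dependent.  */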
14070 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
 14072   /* If we haven't already set up the frame pointer, do so now.  */
14073 if (frame_pointer_needed && !m->fs.fp_valid)
14075 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14076 GEN_INT (frame.stack_pointer_offset
14077 - frame.hard_frame_pointer_offset));
14078 insn = emit_insn (insn);
14079 RTX_FRAME_RELATED_P (insn) = 1;
14080 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14082 if (m->fs.cfa_reg == stack_pointer_rtx)
14083 m->fs.cfa_reg = hard_frame_pointer_rtx;
14084 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14085 m->fs.fp_valid = true;
14088 if (!int_registers_saved)
14089 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14090 if (!sse_registers_saved)
14091 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14092 else if (save_stub_call_needed)
14093 ix86_emit_outlined_ms2sysv_save (frame);
14095 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
14096 in PROLOGUE. */
14097 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14099 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14100 insn = emit_insn (gen_set_got (pic));
14101 RTX_FRAME_RELATED_P (insn) = 1;
14102 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14103 emit_insn (gen_prologue_use (pic));
 14104       /* Delete an already emitted SET_GOT if it exists and is allocated to
14105 REAL_PIC_OFFSET_TABLE_REGNUM. */
14106 ix86_elim_entry_set_got (pic);
14109 if (crtl->drap_reg && !crtl->stack_realign_needed)
 14111       /* vDRAP is set up, but after reload it turns out stack realignment
 14112 	 isn't necessary; emit the prologue to set up DRAP
 14113 	 without the stack realignment adjustment.  */
14114 t = choose_baseaddr (0, NULL);
14115 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14118 /* Prevent instructions from being scheduled into register save push
14119 sequence when access to the redzone area is done through frame pointer.
14120 The offset between the frame pointer and the stack pointer is calculated
14121 relative to the value of the stack pointer at the end of the function
14122 prologue, and moving instructions that access redzone area via frame
14123 pointer inside push sequence violates this assumption. */
14124 if (frame_pointer_needed && frame.red_zone_size)
14125 emit_insn (gen_memory_blockage ());
14127 /* SEH requires that the prologue end within 256 bytes of the start of
14128 the function. Prevent instruction schedules that would extend that.
14129 Further, prevent alloca modifications to the stack pointer from being
14130 combined with prologue modifications. */
14131 if (TARGET_SEH)
14132 emit_insn (gen_prologue_use (stack_pointer_rtx));
14135 /* Emit code to restore REG using a POP insn. */
14137 static void
14138 ix86_emit_restore_reg_using_pop (rtx reg)
14140 struct machine_function *m = cfun->machine;
14141 rtx_insn *insn = emit_insn (gen_pop (reg));
14143 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14144 m->fs.sp_offset -= UNITS_PER_WORD;
14146 if (m->fs.cfa_reg == crtl->drap_reg
14147 && REGNO (reg) == REGNO (crtl->drap_reg))
14149 /* Previously we'd represented the CFA as an expression
14150 like *(%ebp - 8). We've just popped that value from
14151 the stack, which means we need to reset the CFA to
14152 the drap register. This will remain until we restore
14153 the stack pointer. */
14154 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14155 RTX_FRAME_RELATED_P (insn) = 1;
14157 /* This means that the DRAP register is valid for addressing too. */
14158 m->fs.drap_valid = true;
14159 return;
14162 if (m->fs.cfa_reg == stack_pointer_rtx)
14164 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14165 x = gen_rtx_SET (stack_pointer_rtx, x);
14166 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14167 RTX_FRAME_RELATED_P (insn) = 1;
14169 m->fs.cfa_offset -= UNITS_PER_WORD;
14172 /* When the frame pointer is the CFA, and we pop it, we are
14173 swapping back to the stack pointer as the CFA. This happens
14174 for stack frames that don't allocate other data, so we assume
14175 the stack pointer is now pointing at the return address, i.e.
14176 the function entry state, which makes the offset be 1 word. */
14177 if (reg == hard_frame_pointer_rtx)
14179 m->fs.fp_valid = false;
14180 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14182 m->fs.cfa_reg = stack_pointer_rtx;
14183 m->fs.cfa_offset -= UNITS_PER_WORD;
14185 add_reg_note (insn, REG_CFA_DEF_CFA,
14186 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14187 GEN_INT (m->fs.cfa_offset)));
14188 RTX_FRAME_RELATED_P (insn) = 1;
14193 /* Emit code to restore saved registers using POP insns. */
14195 static void
14196 ix86_emit_restore_regs_using_pop (void)
14198 unsigned int regno;
14200 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14201 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
14202 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14205 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
14206 omits the emit and only attaches the notes. */
14208 static void
14209 ix86_emit_leave (rtx_insn *insn)
14211 struct machine_function *m = cfun->machine;
14212 if (!insn)
14213 insn = emit_insn (ix86_gen_leave ());
14215 ix86_add_queued_cfa_restore_notes (insn);
14217 gcc_assert (m->fs.fp_valid);
14218 m->fs.sp_valid = true;
14219 m->fs.sp_realigned = false;
14220 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14221 m->fs.fp_valid = false;
14223 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14225 m->fs.cfa_reg = stack_pointer_rtx;
14226 m->fs.cfa_offset = m->fs.sp_offset;
14228 add_reg_note (insn, REG_CFA_DEF_CFA,
14229 plus_constant (Pmode, stack_pointer_rtx,
14230 m->fs.sp_offset));
14231 RTX_FRAME_RELATED_P (insn) = 1;
14233 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14234 m->fs.fp_offset);
14237 /* Emit code to restore saved registers using MOV insns.
14238 First register is restored from CFA - CFA_OFFSET. */
14239 static void
14240 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14241 bool maybe_eh_return)
14243 struct machine_function *m = cfun->machine;
14244 unsigned int regno;
14246 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14247 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14249 rtx reg = gen_rtx_REG (word_mode, regno);
14250 rtx mem;
14251 rtx_insn *insn;
14253 mem = choose_baseaddr (cfa_offset, NULL);
14254 mem = gen_frame_mem (word_mode, mem);
14255 insn = emit_move_insn (reg, mem);
14257 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14259 /* Previously we'd represented the CFA as an expression
14260 like *(%ebp - 8). We've just popped that value from
14261 the stack, which means we need to reset the CFA to
14262 the drap register. This will remain until we restore
14263 the stack pointer. */
14264 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14265 RTX_FRAME_RELATED_P (insn) = 1;
14267 /* This means that the DRAP register is valid for addressing. */
14268 m->fs.drap_valid = true;
14270 else
14271 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14273 cfa_offset -= UNITS_PER_WORD;
14277 /* Emit code to restore saved registers using MOV insns.
14278 First register is restored from CFA - CFA_OFFSET. */
14279 static void
14280 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14281 bool maybe_eh_return)
14283 unsigned int regno;
14285 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14286 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14288 rtx reg = gen_rtx_REG (V4SFmode, regno);
14289 rtx mem;
14290 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
14292 mem = choose_baseaddr (cfa_offset, &align);
14293 mem = gen_rtx_MEM (V4SFmode, mem);
 14296       /* The location alignment depends upon the base register.  */
14296 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
14297 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
14298 set_mem_align (mem, align);
14299 emit_insn (gen_rtx_SET (reg, mem));
14301 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14303 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14307 static void
14308 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
14309 bool use_call, int style)
14311 struct machine_function *m = cfun->machine;
14312 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14313 + m->call_ms2sysv_extra_regs;
14314 rtvec v;
14315 unsigned int elems_needed, align, i, vi = 0;
14316 rtx_insn *insn;
14317 rtx sym, tmp;
14318 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
14319 rtx r10 = NULL_RTX;
14320 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14321 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
14322 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
14323 rtx rsi_frame_load = NULL_RTX;
14324 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
14325 enum xlogue_stub stub;
14327 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
14329 /* If using a realigned stack, we should never start with padding. */
14330 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
14332 /* Setup RSI as the stub's base pointer. */
14333 align = GET_MODE_ALIGNMENT (V4SFmode);
14334 tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
14335 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14337 emit_insn (gen_rtx_SET (rsi, tmp));
14339 /* Get a symbol for the stub. */
14340 if (frame_pointer_needed)
14341 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
14342 : XLOGUE_STUB_RESTORE_HFP_TAIL;
14343 else
14344 stub = use_call ? XLOGUE_STUB_RESTORE
14345 : XLOGUE_STUB_RESTORE_TAIL;
14346 sym = xlogue.get_stub_rtx (stub);
14348 elems_needed = ncregs;
14349 if (use_call)
14350 elems_needed += 1;
14351 else
14352 elems_needed += frame_pointer_needed ? 5 : 3;
14353 v = rtvec_alloc (elems_needed);
14355 /* We call the epilogue stub when we need to pop incoming args or we are
14356 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
14357 epilogue stub and it is the tail-call. */
14358 if (use_call)
14359 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14360 else
14362 RTVEC_ELT (v, vi++) = ret_rtx;
14363 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14364 if (frame_pointer_needed)
14366 rtx rbp = gen_rtx_REG (DImode, BP_REG);
14367 gcc_assert (m->fs.fp_valid);
14368 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
14370 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
14371 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
14372 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
14373 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
14374 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
14376 else
14378 /* If no hard frame pointer, we set R10 to the SP restore value. */
14379 gcc_assert (!m->fs.fp_valid);
14380 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14381 gcc_assert (m->fs.sp_valid);
14383 r10 = gen_rtx_REG (DImode, R10_REG);
14384 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
14385 emit_insn (gen_rtx_SET (r10, tmp));
14387 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
14391 /* Generate frame load insns and restore notes. */
14392 for (i = 0; i < ncregs; ++i)
14394 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14395 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
14396 rtx reg, frame_load;
14398 reg = gen_rtx_REG (mode, r.regno);
14399 frame_load = gen_frame_load (reg, rsi, r.offset);
14401 /* Save RSI frame load insn & note to add last. */
14402 if (r.regno == SI_REG)
14404 gcc_assert (!rsi_frame_load);
14405 rsi_frame_load = frame_load;
14406 rsi_restore_offset = r.offset;
14408 else
14410 RTVEC_ELT (v, vi++) = frame_load;
14411 ix86_add_cfa_restore_note (NULL, reg, r.offset);
14415 /* Add RSI frame load & restore note at the end. */
14416 gcc_assert (rsi_frame_load);
14417 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
14418 RTVEC_ELT (v, vi++) = rsi_frame_load;
14419 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
14420 rsi_restore_offset);
14422 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
14423 if (!use_call && !frame_pointer_needed)
14425 gcc_assert (m->fs.sp_valid);
14426 gcc_assert (!m->fs.sp_realigned);
14428 /* At this point, R10 should point to frame.stack_realign_offset. */
14429 if (m->fs.cfa_reg == stack_pointer_rtx)
14430 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
14431 m->fs.sp_offset = frame.stack_realign_offset;
14434 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
14435 tmp = gen_rtx_PARALLEL (VOIDmode, v);
14436 if (use_call)
14437 insn = emit_insn (tmp);
14438 else
14440 insn = emit_jump_insn (tmp);
14441 JUMP_LABEL (insn) = ret_rtx;
14443 if (frame_pointer_needed)
14444 ix86_emit_leave (insn);
14445 else
14447 /* Need CFA adjust note. */
14448 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
14449 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
14453 RTX_FRAME_RELATED_P (insn) = true;
14454 ix86_add_queued_cfa_restore_notes (insn);
14456 /* If we're not doing a tail-call, we need to adjust the stack. */
14457 if (use_call && m->fs.sp_valid)
14459 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
14460 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14461 GEN_INT (dealloc), style,
14462 m->fs.cfa_reg == stack_pointer_rtx);
14466 /* Restore function stack, frame, and registers. */
14468 void
14469 ix86_expand_epilogue (int style)
14471 struct machine_function *m = cfun->machine;
14472 struct machine_frame_state frame_state_save = m->fs;
14473 bool restore_regs_via_mov;
14474 bool using_drap;
14475 bool restore_stub_is_tail = false;
14477 if (ix86_function_naked (current_function_decl))
14479 /* The program should not reach this point. */
14480 emit_insn (gen_ud2 ());
14481 return;
14484 ix86_finalize_stack_frame_flags ();
14485 const struct ix86_frame &frame = cfun->machine->frame;
14487 m->fs.sp_realigned = stack_realign_fp;
14488 m->fs.sp_valid = stack_realign_fp
14489 || !frame_pointer_needed
14490 || crtl->sp_is_unchanging;
14491 gcc_assert (!m->fs.sp_valid
14492 || m->fs.sp_offset == frame.stack_pointer_offset);
14494 /* The FP must be valid if the frame pointer is present. */
14495 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14496 gcc_assert (!m->fs.fp_valid
14497 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14499 /* We must have *some* valid pointer to the stack frame. */
14500 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14502 /* The DRAP is never valid at this point. */
14503 gcc_assert (!m->fs.drap_valid);
14505 /* See the comment about red zone and frame
14506 pointer usage in ix86_expand_prologue. */
14507 if (frame_pointer_needed && frame.red_zone_size)
14508 emit_insn (gen_memory_blockage ());
14510 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14511 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14513 /* Determine the CFA offset of the end of the red-zone. */
14514 m->fs.red_zone_offset = 0;
14515 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
 14517       /* The red-zone begins below the return address and, in an exception
 14518 	 handler, below the error code.  */
14519 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
14521 /* When the register save area is in the aligned portion of
14522 the stack, determine the maximum runtime displacement that
14523 matches up with the aligned frame. */
14524 if (stack_realign_drap)
14525 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14526 + UNITS_PER_WORD);
14529 HOST_WIDE_INT reg_save_offset = frame.reg_save_offset;
14531 /* Special care must be taken for the normal return case of a function
14532 using eh_return: the eax and edx registers are marked as saved, but
14533 not restored along this path. Adjust the save location to match. */
14534 if (crtl->calls_eh_return && style != 2)
14535 reg_save_offset -= 2 * UNITS_PER_WORD;
14537 /* EH_RETURN requires the use of moves to function properly. */
14538 if (crtl->calls_eh_return)
14539 restore_regs_via_mov = true;
14540 /* SEH requires the use of pops to identify the epilogue. */
14541 else if (TARGET_SEH)
14542 restore_regs_via_mov = false;
 14543   /* If we're only restoring one register and sp cannot be used, then
 14544      use a move instruction to restore the register, since it's
 14545      less work than reloading sp and popping the register.  */
14546 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
14547 restore_regs_via_mov = true;
14548 else if (TARGET_EPILOGUE_USING_MOVE
14549 && cfun->machine->use_fast_prologue_epilogue
14550 && (frame.nregs > 1
14551 || m->fs.sp_offset != reg_save_offset))
14552 restore_regs_via_mov = true;
14553 else if (frame_pointer_needed
14554 && !frame.nregs
14555 && m->fs.sp_offset != reg_save_offset)
14556 restore_regs_via_mov = true;
14557 else if (frame_pointer_needed
14558 && TARGET_USE_LEAVE
14559 && cfun->machine->use_fast_prologue_epilogue
14560 && frame.nregs == 1)
14561 restore_regs_via_mov = true;
14562 else
14563 restore_regs_via_mov = false;
14565 if (restore_regs_via_mov || frame.nsseregs)
14567 /* Ensure that the entire register save area is addressable via
14568 the stack pointer, if we will restore SSE regs via sp. */
14569 if (TARGET_64BIT
14570 && m->fs.sp_offset > 0x7fffffff
14571 && sp_valid_at (frame.stack_realign_offset + 1)
14572 && (frame.nsseregs + frame.nregs) != 0)
14574 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14575 GEN_INT (m->fs.sp_offset
14576 - frame.sse_reg_save_offset),
14577 style,
14578 m->fs.cfa_reg == stack_pointer_rtx);
14582 /* If there are any SSE registers to restore, then we have to do it
14583 via moves, since there's obviously no pop for SSE regs. */
14584 if (frame.nsseregs)
14585 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14586 style == 2);
14588 if (m->call_ms2sysv)
14590 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
14592 /* We cannot use a tail-call for the stub if:
14593 1. We have to pop incoming args,
14594 2. We have additional int regs to restore, or
14595 3. A sibling call will be the tail-call, or
14596 4. We are emitting an eh_return_internal epilogue.
 14598 	 TODO: Item 4 has not yet been tested!
14600 If any of the above are true, we will call the stub rather than
14601 jump to it. */
14602 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
14603 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
 14606   /* If using an out-of-line stub that is a tail-call, then... */
14607 if (m->call_ms2sysv && restore_stub_is_tail)
 14609       /* TODO: paranoid tests. (remove eventually) */
14610 gcc_assert (m->fs.sp_valid);
14611 gcc_assert (!m->fs.sp_realigned);
14612 gcc_assert (!m->fs.fp_valid);
14613 gcc_assert (!m->fs.realigned);
14614 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
14615 gcc_assert (!crtl->drap_reg);
14616 gcc_assert (!frame.nregs);
14618 else if (restore_regs_via_mov)
14620 rtx t;
14622 if (frame.nregs)
14623 ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2);
14625 /* eh_return epilogues need %ecx added to the stack pointer. */
14626 if (style == 2)
14628 rtx sa = EH_RETURN_STACKADJ_RTX;
14629 rtx_insn *insn;
14631 /* %ecx can't be used for both DRAP register and eh_return. */
14632 if (crtl->drap_reg)
14633 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14635 /* regparm nested functions don't work with eh_return. */
14636 gcc_assert (!ix86_static_chain_on_stack);
14638 if (frame_pointer_needed)
14640 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14641 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14642 emit_insn (gen_rtx_SET (sa, t));
14644 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14645 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14647 /* Note that we use SA as a temporary CFA, as the return
14648 address is at the proper place relative to it. We
14649 pretend this happens at the FP restore insn because
14650 prior to this insn the FP would be stored at the wrong
14651 offset relative to SA, and after this insn we have no
14652 other reasonable register to use for the CFA. We don't
14653 bother resetting the CFA to the SP for the duration of
14654 the return insn, unless the control flow instrumentation
14655 is done. In this case the SP is used later and we have
14656 to reset CFA to SP. */
14657 add_reg_note (insn, REG_CFA_DEF_CFA,
14658 plus_constant (Pmode, sa, UNITS_PER_WORD));
14659 ix86_add_queued_cfa_restore_notes (insn);
14660 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14661 RTX_FRAME_RELATED_P (insn) = 1;
14663 m->fs.cfa_reg = sa;
14664 m->fs.cfa_offset = UNITS_PER_WORD;
14665 m->fs.fp_valid = false;
14667 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14668 const0_rtx, style,
14669 flag_cf_protection);
14671 else
14673 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14674 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14675 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14676 ix86_add_queued_cfa_restore_notes (insn);
14678 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14679 if (m->fs.cfa_offset != UNITS_PER_WORD)
14681 m->fs.cfa_offset = UNITS_PER_WORD;
14682 add_reg_note (insn, REG_CFA_DEF_CFA,
14683 plus_constant (Pmode, stack_pointer_rtx,
14684 UNITS_PER_WORD));
14685 RTX_FRAME_RELATED_P (insn) = 1;
14688 m->fs.sp_offset = UNITS_PER_WORD;
14689 m->fs.sp_valid = true;
14690 m->fs.sp_realigned = false;
14693 else
14695 /* SEH requires that the function end with (1) a stack adjustment
14696 if necessary, (2) a sequence of pops, and (3) a return or
14697 jump instruction. Prevent insns from the function body from
14698 being scheduled into this sequence. */
14699 if (TARGET_SEH)
14701 /* Prevent a catch region from being adjacent to the standard
14702 epilogue sequence. Unfortunately neither crtl->uses_eh_lsda
14703 nor several other flags that would be interesting to test are
14704 set up yet. */
14705 if (flag_non_call_exceptions)
14706 emit_insn (gen_nops (const1_rtx));
14707 else
14708 emit_insn (gen_blockage ());
14711 /* First step is to deallocate the stack frame so that we can
14712 pop the registers. If the stack pointer was realigned, it needs
14713 to be restored now. Also do it on SEH targets for a very large
14714 frame, as the emitted instructions aren't allowed by the ABI
14715 in epilogues. */
14716 if (!m->fs.sp_valid || m->fs.sp_realigned
14717 || (TARGET_SEH
14718 && (m->fs.sp_offset - reg_save_offset
14719 >= SEH_MAX_FRAME_SIZE)))
14721 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14722 GEN_INT (m->fs.fp_offset
14723 - reg_save_offset),
14724 style, false);
14726 else if (m->fs.sp_offset != reg_save_offset)
14728 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14729 GEN_INT (m->fs.sp_offset
14730 - reg_save_offset),
14731 style,
14732 m->fs.cfa_reg == stack_pointer_rtx);
14735 ix86_emit_restore_regs_using_pop ();
14738 /* If we used a frame pointer and haven't already got rid of it,
14739 then do so now. */
14740 if (m->fs.fp_valid)
14742 /* If the stack pointer is valid and pointing at the frame
14743 pointer store address, then we only need a pop. */
14744 if (sp_valid_at (frame.hfp_save_offset)
14745 && m->fs.sp_offset == frame.hfp_save_offset)
14746 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14747 /* Leave results in shorter dependency chains on CPUs that are
14748 able to grok it fast. */
14749 else if (TARGET_USE_LEAVE
14750 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14751 || !cfun->machine->use_fast_prologue_epilogue)
14752 ix86_emit_leave (NULL);
14753 else
14755 pro_epilogue_adjust_stack (stack_pointer_rtx,
14756 hard_frame_pointer_rtx,
14757 const0_rtx, style, !using_drap);
14758 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14762 if (using_drap)
14764 int param_ptr_offset = UNITS_PER_WORD;
14765 rtx_insn *insn;
14767 gcc_assert (stack_realign_drap);
14769 if (ix86_static_chain_on_stack)
14770 param_ptr_offset += UNITS_PER_WORD;
14771 if (!call_used_regs[REGNO (crtl->drap_reg)])
14772 param_ptr_offset += UNITS_PER_WORD;
14774 insn = emit_insn (gen_rtx_SET
14775 (stack_pointer_rtx,
14776 gen_rtx_PLUS (Pmode,
14777 crtl->drap_reg,
14778 GEN_INT (-param_ptr_offset))));
14779 m->fs.cfa_reg = stack_pointer_rtx;
14780 m->fs.cfa_offset = param_ptr_offset;
14781 m->fs.sp_offset = param_ptr_offset;
14782 m->fs.realigned = false;
14784 add_reg_note (insn, REG_CFA_DEF_CFA,
14785 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14786 GEN_INT (param_ptr_offset)));
14787 RTX_FRAME_RELATED_P (insn) = 1;
14789 if (!call_used_regs[REGNO (crtl->drap_reg)])
14790 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14793 /* At this point the stack pointer must be valid, and we must have
14794 restored all of the registers. We may not have deallocated the
14795 entire stack frame. We've delayed this until now because it may
14796 be possible to merge the local stack deallocation with the
14797 deallocation forced by ix86_static_chain_on_stack. */
14798 gcc_assert (m->fs.sp_valid);
14799 gcc_assert (!m->fs.sp_realigned);
14800 gcc_assert (!m->fs.fp_valid);
14801 gcc_assert (!m->fs.realigned);
14802 if (m->fs.sp_offset != UNITS_PER_WORD)
14804 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14805 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14806 style, true);
14808 else
14809 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14811 /* Sibcall epilogues don't want a return instruction. */
14812 if (style == 0)
14814 m->fs = frame_state_save;
14815 return;
14818 if (cfun->machine->func_type != TYPE_NORMAL)
14819 emit_jump_insn (gen_interrupt_return ());
14820 else if (crtl->args.pops_args && crtl->args.size)
14822 rtx popc = GEN_INT (crtl->args.pops_args);
14824 /* i386 can only pop 64K bytes. If asked to pop more, pop return
14825 address, do explicit add, and jump indirectly to the caller. */
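/* Illustrative sketch (not the exact emitted RTL): the >= 64K path below
   amounts to "popl %ecx; addl $N, %esp; jmp *%ecx", i.e. pop the return
   address, release the argument area with an explicit add, and return via
   an indirect jump.  */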
14827 if (crtl->args.pops_args >= 65536)
14829 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14830 rtx_insn *insn;
14832 /* There is no "pascal" calling convention in any 64bit ABI. */
14833 gcc_assert (!TARGET_64BIT);
14835 insn = emit_insn (gen_pop (ecx));
14836 m->fs.cfa_offset -= UNITS_PER_WORD;
14837 m->fs.sp_offset -= UNITS_PER_WORD;
14839 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14840 x = gen_rtx_SET (stack_pointer_rtx, x);
14841 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14842 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14843 RTX_FRAME_RELATED_P (insn) = 1;
14845 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14846 popc, -1, true);
14847 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14849 else
14850 emit_jump_insn (gen_simple_return_pop_internal (popc));
14852 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14854 /* In case of return from EH a simple return cannot be used
14855 as a return address will be compared with a shadow stack
14856 return address. Use indirect jump instead. */
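/* Roughly (illustrative only): "pop %ecx; jmp *%ecx" (%rcx in 64-bit mode),
   so the restored return address is consumed with an indirect jump rather
   than checked against the shadow stack by a ret.  */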
14857 if (style == 2 && flag_cf_protection)
14859 /* Register used in indirect jump must be in word_mode. But
14860 Pmode may not be the same as word_mode for x32. */
14861 rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14862 rtx_insn *insn;
14864 insn = emit_insn (gen_pop (ecx));
14865 m->fs.cfa_offset -= UNITS_PER_WORD;
14866 m->fs.sp_offset -= UNITS_PER_WORD;
14868 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14869 x = gen_rtx_SET (stack_pointer_rtx, x);
14870 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14871 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14872 RTX_FRAME_RELATED_P (insn) = 1;
14874 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14876 else
14877 emit_jump_insn (gen_simple_return_internal ());
14880 /* Restore the state back to the state from the prologue,
14881 so that it's correct for the next epilogue. */
14882 m->fs = frame_state_save;
14885 /* Reset from the function's potential modifications. */
14887 static void
14888 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14890 if (pic_offset_table_rtx
14891 && !ix86_use_pseudo_pic_reg ())
14892 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14894 if (TARGET_MACHO)
14896 rtx_insn *insn = get_last_insn ();
14897 rtx_insn *deleted_debug_label = NULL;
14899 /* Mach-O doesn't support labels at the end of objects, so if
14900 it looks like we might want one, take special action.
14901 First, collect any sequence of deleted debug labels. */
14902 while (insn
14903 && NOTE_P (insn)
14904 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14906 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14907 notes only, instead set their CODE_LABEL_NUMBER to -1,
14908 otherwise there would be code generation differences
14909 between -g and -g0. */
14910 if (NOTE_P (insn) && NOTE_KIND (insn)
14911 == NOTE_INSN_DELETED_DEBUG_LABEL)
14912 deleted_debug_label = insn;
14913 insn = PREV_INSN (insn);
14916 /* If we have:
14917 label:
14918 barrier
14919 then this needs to be detected, so skip past the barrier. */
14921 if (insn && BARRIER_P (insn))
14922 insn = PREV_INSN (insn);
14924 /* Up to now we've only seen notes or barriers. */
14925 if (insn)
14927 if (LABEL_P (insn)
14928 || (NOTE_P (insn)
14929 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14930 /* Trailing label. */
14931 fputs ("\tnop\n", file);
14932 else if (cfun && ! cfun->is_thunk)
14934 /* See if we have a completely empty function body, skipping
14935 the special case of the picbase thunk emitted as asm. */
14936 while (insn && ! INSN_P (insn))
14937 insn = PREV_INSN (insn);
14938 /* If we don't find any insns, we've got an empty function body;
14939 i.e. completely empty - without a return or branch. This is
14940 taken as the case where a function body has been removed
14941 because it contains an inline __builtin_unreachable(). GCC
14942 declares that reaching __builtin_unreachable() means UB so
14943 we're not obliged to do anything special; however, we want
14944 non-zero-sized function bodies. To meet this, and help the
14945 user out, let's trap the case. */
14946 if (insn == NULL)
14947 fputs ("\tud2\n", file);
14950 else if (deleted_debug_label)
14951 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14952 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14953 CODE_LABEL_NUMBER (insn) = -1;
14957 /* Return a scratch register to use in the split stack prologue. The
14958 split stack prologue is used for -fsplit-stack. It comprises the first
14959 instructions in the function, even before the regular prologue.
14960 The scratch register can be any caller-saved register which is not
14961 used for parameters or for the static chain. */
14963 static unsigned int
14964 split_stack_prologue_scratch_regno (void)
14966 if (TARGET_64BIT)
14967 return R11_REG;
14968 else
14970 bool is_fastcall, is_thiscall;
14971 int regparm;
14973 is_fastcall = (lookup_attribute ("fastcall",
14974 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14975 != NULL);
14976 is_thiscall = (lookup_attribute ("thiscall",
14977 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14978 != NULL);
14979 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14981 if (is_fastcall)
14983 if (DECL_STATIC_CHAIN (cfun->decl))
14985 sorry ("-fsplit-stack does not support fastcall with "
14986 "nested function");
14987 return INVALID_REGNUM;
14989 return AX_REG;
14991 else if (is_thiscall)
14993 if (!DECL_STATIC_CHAIN (cfun->decl))
14994 return DX_REG;
14995 return AX_REG;
14997 else if (regparm < 3)
14999 if (!DECL_STATIC_CHAIN (cfun->decl))
15000 return CX_REG;
15001 else
15003 if (regparm >= 2)
15005 sorry ("-fsplit-stack does not support 2 register "
15006 "parameters for a nested function");
15007 return INVALID_REGNUM;
15009 return DX_REG;
15012 else
15014 /* FIXME: We could make this work by pushing a register
15015 around the addition and comparison. */
15016 sorry ("-fsplit-stack does not support 3 register parameters");
15017 return INVALID_REGNUM;
15022 /* A SYMBOL_REF for the function which allocates new stack space for
15023 -fsplit-stack. */
15025 static GTY(()) rtx split_stack_fn;
15027 /* A SYMBOL_REF for the more stack function when using the large
15028 model. */
15030 static GTY(()) rtx split_stack_fn_large;
15032 /* Return location of the stack guard value in the TLS block. */
15035 ix86_split_stack_guard (void)
15037 int offset;
15038 addr_space_t as = DEFAULT_TLS_SEG_REG;
15039 rtx r;
15041 gcc_assert (flag_split_stack);
15043 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15044 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15045 #else
15046 gcc_unreachable ();
15047 #endif
15049 r = GEN_INT (offset);
15050 r = gen_const_mem (Pmode, r);
15051 set_mem_addr_space (r, as);
15053 return r;
15056 /* Handle -fsplit-stack. These are the first instructions in the
15057 function, even before the regular prologue. */
15059 void
15060 ix86_expand_split_stack_prologue (void)
15062 HOST_WIDE_INT allocate;
15063 unsigned HOST_WIDE_INT args_size;
15064 rtx_code_label *label;
15065 rtx limit, current, allocate_rtx, call_insn, call_fusage;
15066 rtx scratch_reg = NULL_RTX;
15067 rtx_code_label *varargs_label = NULL;
15068 rtx fn;
15070 gcc_assert (flag_split_stack && reload_completed);
15072 ix86_finalize_stack_frame_flags ();
15073 struct ix86_frame &frame = cfun->machine->frame;
15074 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
15076 /* This is the label we will branch to if we have enough stack
15077 space. We expect the basic block reordering pass to reverse this
15078 branch if optimizing, so that we branch in the unlikely case. */
15079 label = gen_label_rtx ();
15081 /* We need to compare the stack pointer minus the frame size with
15082 the stack boundary in the TCB. The stack boundary always gives
15083 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
15084 can compare directly. Otherwise we need to do an addition. */
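/* Illustrative 64-bit sketch of the check emitted below (not verbatim):
   "lea -ALLOCATE(%rsp), %r11; cmp %fs:OFFSET, %r11; jae 1f;
    <call __morestack>; 1:".  For frames smaller than SPLIT_STACK_AVAILABLE,
   %rsp itself is compared instead of the lea result.  */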
15086 limit = ix86_split_stack_guard ();
15088 if (allocate < SPLIT_STACK_AVAILABLE)
15089 current = stack_pointer_rtx;
15090 else
15092 unsigned int scratch_regno;
15093 rtx offset;
15095 /* We need a scratch register to hold the stack pointer minus
15096 the required frame size. Since this is the very start of the
15097 function, the scratch register can be any caller-saved
15098 register which is not used for parameters. */
15099 offset = GEN_INT (- allocate);
15100 scratch_regno = split_stack_prologue_scratch_regno ();
15101 if (scratch_regno == INVALID_REGNUM)
15102 return;
15103 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15104 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
15106 /* We don't use ix86_gen_add3 in this case because it will
15107 want to split to lea, but when not optimizing the insn
15108 will not be split after this point. */
15109 emit_insn (gen_rtx_SET (scratch_reg,
15110 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15111 offset)));
15113 else
15115 emit_move_insn (scratch_reg, offset);
15116 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
15117 stack_pointer_rtx));
15119 current = scratch_reg;
15122 ix86_expand_branch (GEU, current, limit, label);
15123 rtx_insn *jump_insn = get_last_insn ();
15124 JUMP_LABEL (jump_insn) = label;
15126 /* Mark the jump as very likely to be taken. */
15127 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
15129 if (split_stack_fn == NULL_RTX)
15131 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
15132 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
15134 fn = split_stack_fn;
15136 /* Get more stack space. We pass in the desired stack space and the
15137 size of the arguments to copy to the new stack. In 32-bit mode
15138 we push the parameters; __morestack will return on a new stack
15139 anyhow. In 64-bit mode we pass the parameters in r10 and
15140 r11. */
15141 allocate_rtx = GEN_INT (allocate);
15142 args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0;
15143 call_fusage = NULL_RTX;
15144 rtx pop = NULL_RTX;
15145 if (TARGET_64BIT)
15147 rtx reg10, reg11;
15149 reg10 = gen_rtx_REG (Pmode, R10_REG);
15150 reg11 = gen_rtx_REG (Pmode, R11_REG);
15152 /* If this function uses a static chain, it will be in %r10.
15153 Preserve it across the call to __morestack. */
15154 if (DECL_STATIC_CHAIN (cfun->decl))
15156 rtx rax;
15158 rax = gen_rtx_REG (word_mode, AX_REG);
15159 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
15160 use_reg (&call_fusage, rax);
15163 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
15164 && !TARGET_PECOFF)
15166 HOST_WIDE_INT argval;
15168 gcc_assert (Pmode == DImode);
15169 /* When using the large model we need to load the address
15170 into a register, and we've run out of registers. So we
15171 switch to a different calling convention, and we call a
15172 different function: __morestack_large_model. We pass the
15173 argument size in the upper 32 bits of r10 and pass the
15174 frame size in the lower 32 bits. */
15175 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15176 gcc_assert ((args_size & 0xffffffff) == args_size);
15178 if (split_stack_fn_large == NULL_RTX)
15180 split_stack_fn_large =
15181 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15182 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15184 if (ix86_cmodel == CM_LARGE_PIC)
15186 rtx_code_label *label;
15187 rtx x;
15189 label = gen_label_rtx ();
15190 emit_label (label);
15191 LABEL_PRESERVE_P (label) = 1;
15192 emit_insn (gen_set_rip_rex64 (reg10, label));
15193 emit_insn (gen_set_got_offset_rex64 (reg11, label));
15194 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15195 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15196 UNSPEC_GOT);
15197 x = gen_rtx_CONST (Pmode, x);
15198 emit_move_insn (reg11, x);
15199 x = gen_rtx_PLUS (Pmode, reg10, reg11);
15200 x = gen_const_mem (Pmode, x);
15201 emit_move_insn (reg11, x);
15203 else
15204 emit_move_insn (reg11, split_stack_fn_large);
15206 fn = reg11;
15208 argval = ((args_size << 16) << 16) + allocate;
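/* For example (illustrative values): args_size == 0x18 and allocate == 0x400
   give argval == 0x0000001800000400, i.e. the argument size lands in the
   upper 32 bits of %r10 and the frame size in the lower 32 bits.  */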
15209 emit_move_insn (reg10, GEN_INT (argval));
15211 else
15213 emit_move_insn (reg10, allocate_rtx);
15214 emit_move_insn (reg11, GEN_INT (args_size));
15215 use_reg (&call_fusage, reg11);
15218 use_reg (&call_fusage, reg10);
15220 else
15222 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15223 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15224 insn = emit_insn (gen_push (allocate_rtx));
15225 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15226 pop = GEN_INT (2 * UNITS_PER_WORD);
15228 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15229 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15230 pop, false);
15231 add_function_usage_to (call_insn, call_fusage);
15232 if (!TARGET_64BIT)
15233 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15234 /* Indicate that this function can't jump to non-local gotos. */
15235 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
15237 /* In order to make call/return prediction work right, we now need
15238 to execute a return instruction. See
15239 libgcc/config/i386/morestack.S for the details on how this works.
15241 For flow purposes gcc must not see this as a return
15242 instruction--we need control flow to continue at the subsequent
15243 label. Therefore, we use an unspec. */
15244 gcc_assert (crtl->args.pops_args < 65536);
15245 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15247 /* If we are in 64-bit mode and this function uses a static chain,
15248 we saved %r10 in %rax before calling __morestack. */
15249 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15250 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15251 gen_rtx_REG (word_mode, AX_REG));
15253 /* If this function calls va_start, we need to store a pointer to
15254 the arguments on the old stack, because they may not have been
15255 all copied to the new stack. At this point the old stack can be
15256 found at the frame pointer value used by __morestack, because
15257 __morestack has set that up before calling back to us. Here we
15258 store that pointer in a scratch register, and in
15259 ix86_expand_prologue we store the scratch register in a stack
15260 slot. */
15261 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15263 unsigned int scratch_regno;
15264 rtx frame_reg;
15265 int words;
15267 scratch_regno = split_stack_prologue_scratch_regno ();
15268 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15269 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15271 /* 64-bit:
15272 fp -> old fp value
15273 return address within this function
15274 return address of caller of this function
15275 stack arguments
15276 So we add three words to get to the stack arguments.
15278 32-bit:
15279 fp -> old fp value
15280 return address within this function
15281 first argument to __morestack
15282 second argument to __morestack
15283 return address of caller of this function
15284 stack arguments
15285 So we add five words to get to the stack arguments.
15287 words = TARGET_64BIT ? 3 : 5;
15288 emit_insn (gen_rtx_SET (scratch_reg,
15289 gen_rtx_PLUS (Pmode, frame_reg,
15290 GEN_INT (words * UNITS_PER_WORD))));
15292 varargs_label = gen_label_rtx ();
15293 emit_jump_insn (gen_jump (varargs_label));
15294 JUMP_LABEL (get_last_insn ()) = varargs_label;
15296 emit_barrier ();
15299 emit_label (label);
15300 LABEL_NUSES (label) = 1;
15302 /* If this function calls va_start, we now have to set the scratch
15303 register for the case where we do not call __morestack. In this
15304 case we need to set it based on the stack pointer. */
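/* At this point the stack pointer still points at the return address pushed
   by our caller (we did not call __morestack on this path), so the stack
   arguments start UNITS_PER_WORD above it; that is the value computed into
   the scratch register below.  */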
15305 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15307 emit_insn (gen_rtx_SET (scratch_reg,
15308 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15309 GEN_INT (UNITS_PER_WORD))));
15311 emit_label (varargs_label);
15312 LABEL_NUSES (varargs_label) = 1;
15316 /* We may have to tell the dataflow pass that the split stack prologue
15317 is initializing a scratch register. */
15319 static void
15320 ix86_live_on_entry (bitmap regs)
15322 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15324 gcc_assert (flag_split_stack);
15325 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
15329 /* Extract the parts of an RTL expression that is a valid memory address
15330 for an instruction. Return 0 if the structure of the address is
15331 grossly off. Return -1 if the address contains ASHIFT, so it is not
15332 strictly valid, but still used for computing the length of a lea instruction. */
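/* For instance (illustrative), the canonical address
   (plus:DI (plus:DI (mult:DI (reg:DI %rsi) (const_int 4))
                     (reg:DI %rbx))
            (const_int 12))
   decomposes into base = %rbx, index = %rsi, scale = 4 and disp = 12,
   with seg = ADDR_SPACE_GENERIC.  */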
15335 ix86_decompose_address (rtx addr, struct ix86_address *out)
15337 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
15338 rtx base_reg, index_reg;
15339 HOST_WIDE_INT scale = 1;
15340 rtx scale_rtx = NULL_RTX;
15341 rtx tmp;
15342 int retval = 1;
15343 addr_space_t seg = ADDR_SPACE_GENERIC;
15345 /* Allow zero-extended SImode addresses;
15346 they will be emitted with the addr32 prefix. */
15347 if (TARGET_64BIT && GET_MODE (addr) == DImode)
15349 if (GET_CODE (addr) == ZERO_EXTEND
15350 && GET_MODE (XEXP (addr, 0)) == SImode)
15352 addr = XEXP (addr, 0);
15353 if (CONST_INT_P (addr))
15354 return 0;
15356 else if (GET_CODE (addr) == AND
15357 && const_32bit_mask (XEXP (addr, 1), DImode))
15359 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
15360 if (addr == NULL_RTX)
15361 return 0;
15363 if (CONST_INT_P (addr))
15364 return 0;
15368 /* Allow SImode subregs of DImode addresses;
15369 they will be emitted with the addr32 prefix. */
15370 if (TARGET_64BIT && GET_MODE (addr) == SImode)
15372 if (SUBREG_P (addr)
15373 && GET_MODE (SUBREG_REG (addr)) == DImode)
15375 addr = SUBREG_REG (addr);
15376 if (CONST_INT_P (addr))
15377 return 0;
15381 if (REG_P (addr))
15382 base = addr;
15383 else if (SUBREG_P (addr))
15385 if (REG_P (SUBREG_REG (addr)))
15386 base = addr;
15387 else
15388 return 0;
15390 else if (GET_CODE (addr) == PLUS)
15392 rtx addends[4], op;
15393 int n = 0, i;
15395 op = addr;
15398 if (n >= 4)
15399 return 0;
15400 addends[n++] = XEXP (op, 1);
15401 op = XEXP (op, 0);
15403 while (GET_CODE (op) == PLUS);
15404 if (n >= 4)
15405 return 0;
15406 addends[n] = op;
15408 for (i = n; i >= 0; --i)
15410 op = addends[i];
15411 switch (GET_CODE (op))
15413 case MULT:
15414 if (index)
15415 return 0;
15416 index = XEXP (op, 0);
15417 scale_rtx = XEXP (op, 1);
15418 break;
15420 case ASHIFT:
15421 if (index)
15422 return 0;
15423 index = XEXP (op, 0);
15424 tmp = XEXP (op, 1);
15425 if (!CONST_INT_P (tmp))
15426 return 0;
15427 scale = INTVAL (tmp);
15428 if ((unsigned HOST_WIDE_INT) scale > 3)
15429 return 0;
15430 scale = 1 << scale;
15431 break;
15433 case ZERO_EXTEND:
15434 op = XEXP (op, 0);
15435 if (GET_CODE (op) != UNSPEC)
15436 return 0;
15437 /* FALLTHRU */
15439 case UNSPEC:
15440 if (XINT (op, 1) == UNSPEC_TP
15441 && TARGET_TLS_DIRECT_SEG_REFS
15442 && seg == ADDR_SPACE_GENERIC)
15443 seg = DEFAULT_TLS_SEG_REG;
15444 else
15445 return 0;
15446 break;
15448 case SUBREG:
15449 if (!REG_P (SUBREG_REG (op)))
15450 return 0;
15451 /* FALLTHRU */
15453 case REG:
15454 if (!base)
15455 base = op;
15456 else if (!index)
15457 index = op;
15458 else
15459 return 0;
15460 break;
15462 case CONST:
15463 case CONST_INT:
15464 case SYMBOL_REF:
15465 case LABEL_REF:
15466 if (disp)
15467 return 0;
15468 disp = op;
15469 break;
15471 default:
15472 return 0;
15476 else if (GET_CODE (addr) == MULT)
15478 index = XEXP (addr, 0); /* index*scale */
15479 scale_rtx = XEXP (addr, 1);
15481 else if (GET_CODE (addr) == ASHIFT)
15483 /* We're called for lea too, which implements ashift on occasion. */
15484 index = XEXP (addr, 0);
15485 tmp = XEXP (addr, 1);
15486 if (!CONST_INT_P (tmp))
15487 return 0;
15488 scale = INTVAL (tmp);
15489 if ((unsigned HOST_WIDE_INT) scale > 3)
15490 return 0;
15491 scale = 1 << scale;
15492 retval = -1;
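/* E.g. (ashift (reg) (const_int 3)) denotes index*8 here; as noted in the
   function comment, such an address is not strictly valid, so retval is -1
   and the result is only used for lea length computation.  */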
15494 else
15495 disp = addr; /* displacement */
15497 if (index)
15499 if (REG_P (index))
15501 else if (SUBREG_P (index)
15502 && REG_P (SUBREG_REG (index)))
15504 else
15505 return 0;
15508 /* Extract the integral value of scale. */
15509 if (scale_rtx)
15511 if (!CONST_INT_P (scale_rtx))
15512 return 0;
15513 scale = INTVAL (scale_rtx);
15516 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15517 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15519 /* Avoid useless 0 displacement. */
15520 if (disp == const0_rtx && (base || index))
15521 disp = NULL_RTX;
15523 /* Allow arg pointer and stack pointer as index if there is no scaling. */
15524 if (base_reg && index_reg && scale == 1
15525 && (REGNO (index_reg) == ARG_POINTER_REGNUM
15526 || REGNO (index_reg) == FRAME_POINTER_REGNUM
15527 || REGNO (index_reg) == SP_REG))
15529 std::swap (base, index);
15530 std::swap (base_reg, index_reg);
15533 /* Special case: %ebp cannot be encoded as a base without a displacement.
15534 Similarly %r13. */
15535 if (!disp && base_reg
15536 && (REGNO (base_reg) == ARG_POINTER_REGNUM
15537 || REGNO (base_reg) == FRAME_POINTER_REGNUM
15538 || REGNO (base_reg) == BP_REG
15539 || REGNO (base_reg) == R13_REG))
15540 disp = const0_rtx;
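/* In the ModR/M/SIB encoding, mod == 00 with a base field of 101 means
   "no base, disp32 follows" (or %rip-relative in 64-bit mode), so these
   registers can only be encoded as a base together with an explicit
   displacement; hence the zero displacement added here.  */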
15542 /* Special case: on K6, [%esi] makes the instruction vector decoded.
15543 Avoid this by transforming to [%esi+0].
15544 Reload calls address legitimization without cfun defined, so we need
15545 to test cfun for being non-NULL. */
15546 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15547 && base_reg && !index_reg && !disp
15548 && REGNO (base_reg) == SI_REG)
15549 disp = const0_rtx;
15551 /* Special case: encode reg+reg instead of reg*2. */
15552 if (!base && index && scale == 2)
15553 base = index, base_reg = index_reg, scale = 1;
15555 /* Special case: scaling cannot be encoded without base or displacement. */
15556 if (!base && !disp && index && scale != 1)
15557 disp = const0_rtx;
15559 out->base = base;
15560 out->index = index;
15561 out->disp = disp;
15562 out->scale = scale;
15563 out->seg = seg;
15565 return retval;
15568 /* Return cost of the memory address x.
15569 For i386, it is better to use a complex address than let gcc copy
15570 the address into a reg and make a new pseudo. But not if the address
15571 requires two regs - that would mean more pseudos with longer
15572 lifetimes. */
15573 static int
15574 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15576 struct ix86_address parts;
15577 int cost = 1;
15578 int ok = ix86_decompose_address (x, &parts);
15580 gcc_assert (ok);
15582 if (parts.base && SUBREG_P (parts.base))
15583 parts.base = SUBREG_REG (parts.base);
15584 if (parts.index && SUBREG_P (parts.index))
15585 parts.index = SUBREG_REG (parts.index);
15587 /* Attempt to minimize the number of registers in the address by increasing
15588 address cost for each used register. We don't increase address cost
15589 for "pic_offset_table_rtx". When a memop with "pic_offset_table_rtx"
15590 is not invariant itself it most likely means that base or index is not
15591 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15592 which is not profitable for x86. */
15593 if (parts.base
15594 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15595 && (current_pass->type == GIMPLE_PASS
15596 || !pic_offset_table_rtx
15597 || !REG_P (parts.base)
15598 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15599 cost++;
15601 if (parts.index
15602 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15603 && (current_pass->type == GIMPLE_PASS
15604 || !pic_offset_table_rtx
15605 || !REG_P (parts.index)
15606 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15607 cost++;
15609 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15610 since its predecode logic can't detect the length of instructions
15611 and it degenerates to vector decoding. Increase the cost of such
15612 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
15613 to split such addresses or even refuse such addresses at all.
15615 Following addressing modes are affected:
15616 [base+scale*index]
15617 [scale*index+disp]
15618 [base+index]
15620 The first and last cases may be avoidable by explicitly coding the zero in
15621 the memory address, but I don't have an AMD-K6 machine handy to check this
15622 theory. */
15624 if (TARGET_K6
15625 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15626 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15627 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15628 cost += 10;
15630 return cost;
15633 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
15634 this is used to form addresses to local data when -fPIC is in
15635 use. */
15637 static bool
15638 darwin_local_data_pic (rtx disp)
15640 return (GET_CODE (disp) == UNSPEC
15641 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15644 /* True if operand X should be loaded from GOT. */
15646 bool
15647 ix86_force_load_from_GOT_p (rtx x)
15649 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15650 && !TARGET_PECOFF && !TARGET_MACHO
15651 && !flag_plt && !flag_pic
15652 && ix86_cmodel != CM_LARGE
15653 && GET_CODE (x) == SYMBOL_REF
15654 && SYMBOL_REF_FUNCTION_P (x)
15655 && !SYMBOL_REF_LOCAL_P (x));
15658 /* Determine if a given RTX is a valid constant. We already know this
15659 satisfies CONSTANT_P. */
15661 static bool
15662 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15664 /* Pointer bounds constants are not valid. */
15665 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15666 return false;
15668 switch (GET_CODE (x))
15670 case CONST:
15671 x = XEXP (x, 0);
15673 if (GET_CODE (x) == PLUS)
15675 if (!CONST_INT_P (XEXP (x, 1)))
15676 return false;
15677 x = XEXP (x, 0);
15680 if (TARGET_MACHO && darwin_local_data_pic (x))
15681 return true;
15683 /* Only some unspecs are valid as "constants". */
15684 if (GET_CODE (x) == UNSPEC)
15685 switch (XINT (x, 1))
15687 case UNSPEC_GOT:
15688 case UNSPEC_GOTOFF:
15689 case UNSPEC_PLTOFF:
15690 return TARGET_64BIT;
15691 case UNSPEC_TPOFF:
15692 case UNSPEC_NTPOFF:
15693 x = XVECEXP (x, 0, 0);
15694 return (GET_CODE (x) == SYMBOL_REF
15695 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15696 case UNSPEC_DTPOFF:
15697 x = XVECEXP (x, 0, 0);
15698 return (GET_CODE (x) == SYMBOL_REF
15699 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15700 default:
15701 return false;
15704 /* We must have drilled down to a symbol. */
15705 if (GET_CODE (x) == LABEL_REF)
15706 return true;
15707 if (GET_CODE (x) != SYMBOL_REF)
15708 return false;
15709 /* FALLTHRU */
15711 case SYMBOL_REF:
15712 /* TLS symbols are never valid. */
15713 if (SYMBOL_REF_TLS_MODEL (x))
15714 return false;
15716 /* DLLIMPORT symbols are never valid. */
15717 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15718 && SYMBOL_REF_DLLIMPORT_P (x))
15719 return false;
15721 #if TARGET_MACHO
15722 /* mdynamic-no-pic */
15723 if (MACHO_DYNAMIC_NO_PIC_P)
15724 return machopic_symbol_defined_p (x);
15725 #endif
15727 /* External function address should be loaded
15728 via the GOT slot to avoid PLT. */
15729 if (ix86_force_load_from_GOT_p (x))
15730 return false;
15732 break;
15734 CASE_CONST_SCALAR_INT:
15735 switch (mode)
15737 case E_TImode:
15738 if (TARGET_64BIT)
15739 return true;
15740 /* FALLTHRU */
15741 case E_OImode:
15742 case E_XImode:
15743 if (!standard_sse_constant_p (x, mode))
15744 return false;
15745 default:
15746 break;
15748 break;
15750 case CONST_VECTOR:
15751 if (!standard_sse_constant_p (x, mode))
15752 return false;
15754 default:
15755 break;
15758 /* Otherwise we handle everything else in the move patterns. */
15759 return true;
15762 /* Determine if it's legal to put X into the constant pool. This
15763 is not possible for the address of thread-local symbols, which
15764 is checked above. */
15766 static bool
15767 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15769 /* We can put any immediate constant in memory. */
15770 switch (GET_CODE (x))
15772 CASE_CONST_ANY:
15773 return false;
15775 default:
15776 break;
15779 return !ix86_legitimate_constant_p (mode, x);
15782 /* Nonzero if the symbol is marked as dllimport, or as stub-variable,
15783 otherwise zero. */
15785 static bool
15786 is_imported_p (rtx x)
15788 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15789 || GET_CODE (x) != SYMBOL_REF)
15790 return false;
15792 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15796 /* Nonzero if the constant value X is a legitimate general operand
15797 when generating PIC code. It is given that flag_pic is on and
15798 that X satisfies CONSTANT_P. */
15800 bool
15801 legitimate_pic_operand_p (rtx x)
15803 rtx inner;
15805 switch (GET_CODE (x))
15807 case CONST:
15808 inner = XEXP (x, 0);
15809 if (GET_CODE (inner) == PLUS
15810 && CONST_INT_P (XEXP (inner, 1)))
15811 inner = XEXP (inner, 0);
15813 /* Only some unspecs are valid as "constants". */
15814 if (GET_CODE (inner) == UNSPEC)
15815 switch (XINT (inner, 1))
15817 case UNSPEC_GOT:
15818 case UNSPEC_GOTOFF:
15819 case UNSPEC_PLTOFF:
15820 return TARGET_64BIT;
15821 case UNSPEC_TPOFF:
15822 x = XVECEXP (inner, 0, 0);
15823 return (GET_CODE (x) == SYMBOL_REF
15824 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15825 case UNSPEC_MACHOPIC_OFFSET:
15826 return legitimate_pic_address_disp_p (x);
15827 default:
15828 return false;
15830 /* FALLTHRU */
15832 case SYMBOL_REF:
15833 case LABEL_REF:
15834 return legitimate_pic_address_disp_p (x);
15836 default:
15837 return true;
15841 /* Determine if a given CONST RTX is a valid memory displacement
15842 in PIC mode. */
15844 bool
15845 legitimate_pic_address_disp_p (rtx disp)
15847 bool saw_plus;
15849 /* In 64bit mode we can allow direct addresses of symbols and labels
15850 when they are not dynamic symbols. */
15851 if (TARGET_64BIT)
15853 rtx op0 = disp, op1;
15855 switch (GET_CODE (disp))
15857 case LABEL_REF:
15858 return true;
15860 case CONST:
15861 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15862 break;
15863 op0 = XEXP (XEXP (disp, 0), 0);
15864 op1 = XEXP (XEXP (disp, 0), 1);
15865 if (!CONST_INT_P (op1))
15866 break;
15867 if (GET_CODE (op0) == UNSPEC
15868 && (XINT (op0, 1) == UNSPEC_DTPOFF
15869 || XINT (op0, 1) == UNSPEC_NTPOFF)
15870 && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15871 return true;
15872 if (INTVAL (op1) >= 16*1024*1024
15873 || INTVAL (op1) < -16*1024*1024)
15874 break;
15875 if (GET_CODE (op0) == LABEL_REF)
15876 return true;
15877 if (GET_CODE (op0) == CONST
15878 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15879 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15880 return true;
15881 if (GET_CODE (op0) == UNSPEC
15882 && XINT (op0, 1) == UNSPEC_PCREL)
15883 return true;
15884 if (GET_CODE (op0) != SYMBOL_REF)
15885 break;
15886 /* FALLTHRU */
15888 case SYMBOL_REF:
15889 /* TLS references should always be enclosed in UNSPEC.
15890 A dllimported symbol always needs to be resolved. */
15891 if (SYMBOL_REF_TLS_MODEL (op0)
15892 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15893 return false;
15895 if (TARGET_PECOFF)
15897 if (is_imported_p (op0))
15898 return true;
15900 if (SYMBOL_REF_FAR_ADDR_P (op0)
15901 || !SYMBOL_REF_LOCAL_P (op0))
15902 break;
15904 /* Function symbols need to be resolved only for
15905 the large model.
15906 For the small model we don't need to resolve anything
15907 here. */
15908 if ((ix86_cmodel != CM_LARGE_PIC
15909 && SYMBOL_REF_FUNCTION_P (op0))
15910 || ix86_cmodel == CM_SMALL_PIC)
15911 return true;
15912 /* Non-external symbols don't need to be resolved for
15913 the large and medium models. */
15914 if ((ix86_cmodel == CM_LARGE_PIC
15915 || ix86_cmodel == CM_MEDIUM_PIC)
15916 && !SYMBOL_REF_EXTERNAL_P (op0))
15917 return true;
15919 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15920 && (SYMBOL_REF_LOCAL_P (op0)
15921 || (HAVE_LD_PIE_COPYRELOC
15922 && flag_pie
15923 && !SYMBOL_REF_WEAK (op0)
15924 && !SYMBOL_REF_FUNCTION_P (op0)))
15925 && ix86_cmodel != CM_LARGE_PIC)
15926 return true;
15927 break;
15929 default:
15930 break;
15933 if (GET_CODE (disp) != CONST)
15934 return false;
15935 disp = XEXP (disp, 0);
15937 if (TARGET_64BIT)
15939 /* It is unsafe to allow PLUS expressions. This limits the allowed distance
15940 of GOT tables. We should not need these anyway. */
15941 if (GET_CODE (disp) != UNSPEC
15942 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15943 && XINT (disp, 1) != UNSPEC_GOTOFF
15944 && XINT (disp, 1) != UNSPEC_PCREL
15945 && XINT (disp, 1) != UNSPEC_PLTOFF))
15946 return false;
15948 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15949 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15950 return false;
15951 return true;
15954 saw_plus = false;
15955 if (GET_CODE (disp) == PLUS)
15957 if (!CONST_INT_P (XEXP (disp, 1)))
15958 return false;
15959 disp = XEXP (disp, 0);
15960 saw_plus = true;
15963 if (TARGET_MACHO && darwin_local_data_pic (disp))
15964 return true;
15966 if (GET_CODE (disp) != UNSPEC)
15967 return false;
15969 switch (XINT (disp, 1))
15971 case UNSPEC_GOT:
15972 if (saw_plus)
15973 return false;
15974 /* We need to check for both symbols and labels because VxWorks loads
15975 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15976 details. */
15977 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15978 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15979 case UNSPEC_GOTOFF:
15980 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15981 While the ABI also specifies a 32bit relocation, we don't produce it in
15982 the small PIC model at all. */
15983 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15984 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15985 && !TARGET_64BIT)
15986 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15987 return false;
15988 case UNSPEC_GOTTPOFF:
15989 case UNSPEC_GOTNTPOFF:
15990 case UNSPEC_INDNTPOFF:
15991 if (saw_plus)
15992 return false;
15993 disp = XVECEXP (disp, 0, 0);
15994 return (GET_CODE (disp) == SYMBOL_REF
15995 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15996 case UNSPEC_NTPOFF:
15997 disp = XVECEXP (disp, 0, 0);
15998 return (GET_CODE (disp) == SYMBOL_REF
15999 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
16000 case UNSPEC_DTPOFF:
16001 disp = XVECEXP (disp, 0, 0);
16002 return (GET_CODE (disp) == SYMBOL_REF
16003 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
16006 return false;
16009 /* Determine if op is a suitable RTX for an address register.
16010 Return naked register if a register or a register subreg is
16011 found, otherwise return NULL_RTX. */
16013 static rtx
16014 ix86_validate_address_register (rtx op)
16016 machine_mode mode = GET_MODE (op);
16018 /* Only SImode or DImode registers can form the address. */
16019 if (mode != SImode && mode != DImode)
16020 return NULL_RTX;
16022 if (REG_P (op))
16023 return op;
16024 else if (SUBREG_P (op))
16026 rtx reg = SUBREG_REG (op);
16028 if (!REG_P (reg))
16029 return NULL_RTX;
16031 mode = GET_MODE (reg);
16033 /* Don't allow SUBREGs that span more than a word. It can
16034 lead to spill failures when the register is one word out
16035 of a two word structure. */
16036 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
16037 return NULL_RTX;
16039 /* Allow only SUBREGs of non-eliminable hard registers. */
16040 if (register_no_elim_operand (reg, mode))
16041 return reg;
16044 /* Op is not a register. */
16045 return NULL_RTX;
16048 /* Recognizes RTL expressions that are valid memory addresses for an
16049 instruction. The MODE argument is the machine mode for the MEM
16050 expression that wants to use this address.
16052 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
16053 convert common non-canonical forms to canonical form so that they will
16054 be recognized. */
16056 static bool
16057 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
16059 struct ix86_address parts;
16060 rtx base, index, disp;
16061 HOST_WIDE_INT scale;
16062 addr_space_t seg;
16064 if (ix86_decompose_address (addr, &parts) <= 0)
16065 /* Decomposition failed. */
16066 return false;
16068 base = parts.base;
16069 index = parts.index;
16070 disp = parts.disp;
16071 scale = parts.scale;
16072 seg = parts.seg;
16074 /* Validate base register. */
16075 if (base)
16077 rtx reg = ix86_validate_address_register (base);
16079 if (reg == NULL_RTX)
16080 return false;
16082 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
16083 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
16084 /* Base is not valid. */
16085 return false;
16088 /* Validate index register. */
16089 if (index)
16091 rtx reg = ix86_validate_address_register (index);
16093 if (reg == NULL_RTX)
16094 return false;
16096 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
16097 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
16098 /* Index is not valid. */
16099 return false;
16102 /* Index and base should have the same mode. */
16103 if (base && index
16104 && GET_MODE (base) != GET_MODE (index))
16105 return false;
16107 /* Address override works only on the (%reg) part of %fs:(%reg). */
16108 if (seg != ADDR_SPACE_GENERIC
16109 && ((base && GET_MODE (base) != word_mode)
16110 || (index && GET_MODE (index) != word_mode)))
16111 return false;
16113 /* Validate scale factor. */
16114 if (scale != 1)
16116 if (!index)
16117 /* Scale without index. */
16118 return false;
16120 if (scale != 2 && scale != 4 && scale != 8)
16121 /* Scale is not a valid multiplier. */
16122 return false;
16125 /* Validate displacement. */
16126 if (disp)
16128 if (GET_CODE (disp) == CONST
16129 && GET_CODE (XEXP (disp, 0)) == UNSPEC
16130 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
16131 switch (XINT (XEXP (disp, 0), 1))
16133 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
16134 when used. While the ABI also specifies 32bit relocations, we
16135 don't produce them at all and use IP-relative addressing instead.
16136 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
16137 should be loaded via the GOT. */
16138 case UNSPEC_GOT:
16139 if (!TARGET_64BIT
16140 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16141 goto is_legitimate_pic;
16142 /* FALLTHRU */
16143 case UNSPEC_GOTOFF:
16144 gcc_assert (flag_pic);
16145 if (!TARGET_64BIT)
16146 goto is_legitimate_pic;
16148 /* 64bit address unspec. */
16149 return false;
16151 case UNSPEC_GOTPCREL:
16152 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16153 goto is_legitimate_pic;
16154 /* FALLTHRU */
16155 case UNSPEC_PCREL:
16156 gcc_assert (flag_pic);
16157 goto is_legitimate_pic;
16159 case UNSPEC_GOTTPOFF:
16160 case UNSPEC_GOTNTPOFF:
16161 case UNSPEC_INDNTPOFF:
16162 case UNSPEC_NTPOFF:
16163 case UNSPEC_DTPOFF:
16164 break;
16166 default:
16167 /* Invalid address unspec. */
16168 return false;
16171 else if (SYMBOLIC_CONST (disp)
16172 && (flag_pic
16173 || (TARGET_MACHO
16174 #if TARGET_MACHO
16175 && MACHOPIC_INDIRECT
16176 && !machopic_operand_p (disp)
16177 #endif
16181 is_legitimate_pic:
16182 if (TARGET_64BIT && (index || base))
16184 /* foo@dtpoff(%rX) is ok. */
16185 if (GET_CODE (disp) != CONST
16186 || GET_CODE (XEXP (disp, 0)) != PLUS
16187 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16188 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16189 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16190 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16191 /* Non-constant pic memory reference. */
16192 return false;
16194 else if ((!TARGET_MACHO || flag_pic)
16195 && ! legitimate_pic_address_disp_p (disp))
16196 /* Displacement is an invalid pic construct. */
16197 return false;
16198 #if TARGET_MACHO
16199 else if (MACHO_DYNAMIC_NO_PIC_P
16200 && !ix86_legitimate_constant_p (Pmode, disp))
16201 /* Displacement must be referenced via non_lazy_pointer. */
16202 return false;
16203 #endif
16205 /* This code used to verify that a symbolic pic displacement
16206 includes the pic_offset_table_rtx register.
16208 While this is a good idea, unfortunately these constructs may
16209 be created by the "adds using lea" optimization for incorrect
16210 code like:
16212 int a;
16213 int foo(int i)
16214 {
16215 return *(&a+i);
16216 }
16218 This code is nonsensical, but results in addressing the
16219 GOT table with a pic_offset_table_rtx base. We can't
16220 just refuse it easily, since it gets matched by the
16221 "addsi3" pattern, which later gets split to lea in the
16222 case where the output register differs from the input. While this
16223 could be handled by a separate addsi pattern for this case
16224 that never results in lea, disabling this test seems to be
16225 the easier and correct fix for the crash. */
16227 else if (GET_CODE (disp) != LABEL_REF
16228 && !CONST_INT_P (disp)
16229 && (GET_CODE (disp) != CONST
16230 || !ix86_legitimate_constant_p (Pmode, disp))
16231 && (GET_CODE (disp) != SYMBOL_REF
16232 || !ix86_legitimate_constant_p (Pmode, disp)))
16233 /* Displacement is not constant. */
16234 return false;
16235 else if (TARGET_64BIT
16236 && !x86_64_immediate_operand (disp, VOIDmode))
16237 /* Displacement is out of range. */
16238 return false;
16239 /* In x32 mode, constant addresses are sign extended to 64bit, so
16240 we have to prevent addresses from 0x80000000 to 0xffffffff. */
16241 else if (TARGET_X32 && !(index || base)
16242 && CONST_INT_P (disp)
16243 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16244 return false;
16247 /* Everything looks valid. */
16248 return true;
16251 /* Determine if a given RTX is a valid constant address. */
16253 bool
16254 constant_address_p (rtx x)
16256 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16259 /* Return a unique alias set for the GOT. */
16261 static alias_set_type
16262 ix86_GOT_alias_set (void)
16264 static alias_set_type set = -1;
16265 if (set == -1)
16266 set = new_alias_set ();
16267 return set;
16270 /* Return a legitimate reference for ORIG (an address) using the
16271 register REG. If REG is 0, a new pseudo is generated.
16273 There are two types of references that must be handled:
16275 1. Global data references must load the address from the GOT, via
16276 the PIC reg. An insn is emitted to do this load, and the reg is
16277 returned.
16279 2. Static data references, constant pool addresses, and code labels
16280 compute the address as an offset from the GOT, whose base is in
16281 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16282 differentiate them from global data objects. The returned
16283 address is the PIC reg + an unspec constant.
16285 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16286 reg also appears in the address. */
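/* Illustrative sketch of the main shapes produced below (32-bit case):
   global data:  (mem (plus pic_reg (const (unspec [sym] UNSPEC_GOT))))
   local data:   (plus pic_reg (const (unspec [sym] UNSPEC_GOTOFF)))
   while the 64-bit small model instead uses %rip-relative
   UNSPEC_GOTPCREL loads.  */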
16288 static rtx
16289 legitimize_pic_address (rtx orig, rtx reg)
16291 rtx addr = orig;
16292 rtx new_rtx = orig;
16294 #if TARGET_MACHO
16295 if (TARGET_MACHO && !TARGET_64BIT)
16297 if (reg == 0)
16298 reg = gen_reg_rtx (Pmode);
16299 /* Use the generic Mach-O PIC machinery. */
16300 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16302 #endif
16304 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16306 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16307 if (tmp)
16308 return tmp;
16311 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16312 new_rtx = addr;
16313 else if ((!TARGET_64BIT
16314 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16315 && !TARGET_PECOFF
16316 && gotoff_operand (addr, Pmode))
16318 /* This symbol may be referenced via a displacement
16319 from the PIC base address (@GOTOFF). */
16320 if (GET_CODE (addr) == CONST)
16321 addr = XEXP (addr, 0);
16323 if (GET_CODE (addr) == PLUS)
16325 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16326 UNSPEC_GOTOFF);
16327 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16329 else
16330 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
16332 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16334 if (TARGET_64BIT)
16335 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16337 if (reg != 0)
16339 gcc_assert (REG_P (reg));
16340 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
16341 new_rtx, reg, 1, OPTAB_DIRECT);
16343 else
16344 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16346 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
16347 /* We can't use @GOTOFF for text labels
16348 on VxWorks, see gotoff_operand. */
16349 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
16351 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16352 if (tmp)
16353 return tmp;
16355 /* For x64 PE-COFF there is no GOT table,
16356 so we use the address directly. */
16357 if (TARGET_64BIT && TARGET_PECOFF)
16359 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
16360 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16362 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
16364 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
16365 UNSPEC_GOTPCREL);
16366 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16367 new_rtx = gen_const_mem (Pmode, new_rtx);
16368 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16370 else
16372 /* This symbol must be referenced via a load
16373 from the Global Offset Table (@GOT). */
16374 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
16375 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16376 if (TARGET_64BIT)
16377 new_rtx = force_reg (Pmode, new_rtx);
16378 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16379 new_rtx = gen_const_mem (Pmode, new_rtx);
16380 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16383 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16385 else
16387 if (CONST_INT_P (addr)
16388 && !x86_64_immediate_operand (addr, VOIDmode))
16389 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16390 else if (GET_CODE (addr) == CONST)
16392 addr = XEXP (addr, 0);
16394 /* We must match stuff we generate before. Assume the only
16395 unspecs that can get here are ours. Not that we could do
16396 anything with them anyway.... */
16397 if (GET_CODE (addr) == UNSPEC
16398 || (GET_CODE (addr) == PLUS
16399 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16400 return orig;
16401 gcc_assert (GET_CODE (addr) == PLUS);
16404 if (GET_CODE (addr) == PLUS)
16406 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16408 /* Check first to see if this is a constant
16409 offset from a @GOTOFF symbol reference. */
16410 if (!TARGET_PECOFF
16411 && gotoff_operand (op0, Pmode)
16412 && CONST_INT_P (op1))
16414 if (!TARGET_64BIT)
16416 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16417 UNSPEC_GOTOFF);
16418 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16419 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16421 if (reg != 0)
16423 gcc_assert (REG_P (reg));
16424 new_rtx = expand_simple_binop (Pmode, PLUS,
16425 pic_offset_table_rtx,
16426 new_rtx, reg, 1,
16427 OPTAB_DIRECT);
16429 else
16430 new_rtx
16431 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16433 else
16435 if (INTVAL (op1) < -16*1024*1024
16436 || INTVAL (op1) >= 16*1024*1024)
16438 if (!x86_64_immediate_operand (op1, Pmode))
16439 op1 = force_reg (Pmode, op1);
16441 new_rtx
16442 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16446 else
16448 rtx base = legitimize_pic_address (op0, reg);
16449 machine_mode mode = GET_MODE (base);
16450 new_rtx
16451 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16453 if (CONST_INT_P (new_rtx))
16455 if (INTVAL (new_rtx) < -16*1024*1024
16456 || INTVAL (new_rtx) >= 16*1024*1024)
16458 if (!x86_64_immediate_operand (new_rtx, mode))
16459 new_rtx = force_reg (mode, new_rtx);
16461 new_rtx
16462 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16464 else
16465 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16467 else
16469 /* For %rip addressing, we have to use
16470 just disp32, not base nor index. */
16471 if (TARGET_64BIT
16472 && (GET_CODE (base) == SYMBOL_REF
16473 || GET_CODE (base) == LABEL_REF))
16474 base = force_reg (mode, base);
16475 if (GET_CODE (new_rtx) == PLUS
16476 && CONSTANT_P (XEXP (new_rtx, 1)))
16478 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16479 new_rtx = XEXP (new_rtx, 1);
16481 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16486 return new_rtx;
16489 /* Load the thread pointer. If TO_REG is true, force it into a register. */
16491 static rtx
16492 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16494 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16496 if (GET_MODE (tp) != tp_mode)
16498 gcc_assert (GET_MODE (tp) == SImode);
16499 gcc_assert (tp_mode == DImode);
16501 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16504 if (to_reg)
16505 tp = copy_to_mode_reg (tp_mode, tp);
16507 return tp;
16510 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16512 static GTY(()) rtx ix86_tls_symbol;
16514 static rtx
16515 ix86_tls_get_addr (void)
16517 if (!ix86_tls_symbol)
16519 const char *sym
16520 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16521 ? "___tls_get_addr" : "__tls_get_addr");
16523 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16526 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16528 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16529 UNSPEC_PLTOFF);
16530 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16531 gen_rtx_CONST (Pmode, unspec));
16534 return ix86_tls_symbol;
16537 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16539 static GTY(()) rtx ix86_tls_module_base_symbol;
16542 ix86_tls_module_base (void)
16544 if (!ix86_tls_module_base_symbol)
16546 ix86_tls_module_base_symbol
16547 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16549 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16550 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16553 return ix86_tls_module_base_symbol;
16556 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16557 false if we expect this to be used for a memory address and true if
16558 we expect to load the address into a register. */
16560 static rtx
16561 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16563 rtx dest, base, off;
16564 rtx pic = NULL_RTX, tp = NULL_RTX;
16565 machine_mode tp_mode = Pmode;
16566 int type;
16568 /* Fall back to global dynamic model if tool chain cannot support local
16569 dynamic. */
16570 if (TARGET_SUN_TLS && !TARGET_64BIT
16571 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16572 && model == TLS_MODEL_LOCAL_DYNAMIC)
16573 model = TLS_MODEL_GLOBAL_DYNAMIC;
16575 switch (model)
16577 case TLS_MODEL_GLOBAL_DYNAMIC:
16578 dest = gen_reg_rtx (Pmode);
16580 if (!TARGET_64BIT)
16582 if (flag_pic && !TARGET_PECOFF)
16583 pic = pic_offset_table_rtx;
16584 else
16586 pic = gen_reg_rtx (Pmode);
16587 emit_insn (gen_set_got (pic));
16591 if (TARGET_GNU2_TLS)
16593 if (TARGET_64BIT)
16594 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16595 else
16596 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16598 tp = get_thread_pointer (Pmode, true);
16599 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16601 if (GET_MODE (x) != Pmode)
16602 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16604 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16606 else
16608 rtx caddr = ix86_tls_get_addr ();
16610 if (TARGET_64BIT)
16612 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16613 rtx_insn *insns;
16615 start_sequence ();
16616 emit_call_insn
16617 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16618 insns = get_insns ();
16619 end_sequence ();
16621 if (GET_MODE (x) != Pmode)
16622 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16624 RTL_CONST_CALL_P (insns) = 1;
16625 emit_libcall_block (insns, dest, rax, x);
16627 else
16628 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16630 break;
16632 case TLS_MODEL_LOCAL_DYNAMIC:
16633 base = gen_reg_rtx (Pmode);
16635 if (!TARGET_64BIT)
16637 if (flag_pic)
16638 pic = pic_offset_table_rtx;
16639 else
16641 pic = gen_reg_rtx (Pmode);
16642 emit_insn (gen_set_got (pic));
16646 if (TARGET_GNU2_TLS)
16648 rtx tmp = ix86_tls_module_base ();
16650 if (TARGET_64BIT)
16651 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16652 else
16653 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16655 tp = get_thread_pointer (Pmode, true);
16656 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16657 gen_rtx_MINUS (Pmode, tmp, tp));
16659 else
16661 rtx caddr = ix86_tls_get_addr ();
16663 if (TARGET_64BIT)
16665 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16666 rtx_insn *insns;
16667 rtx eqv;
16669 start_sequence ();
16670 emit_call_insn
16671 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16672 insns = get_insns ();
16673 end_sequence ();
16675 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16676 share the LD_BASE result with other LD model accesses. */
16677 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16678 UNSPEC_TLS_LD_BASE);
16680 RTL_CONST_CALL_P (insns) = 1;
16681 emit_libcall_block (insns, base, rax, eqv);
16683 else
16684 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16687 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16688 off = gen_rtx_CONST (Pmode, off);
16690 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16692 if (TARGET_GNU2_TLS)
16694 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16696 if (GET_MODE (x) != Pmode)
16697 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16699 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16701 break;
16703 case TLS_MODEL_INITIAL_EXEC:
16704 if (TARGET_64BIT)
16706 if (TARGET_SUN_TLS && !TARGET_X32)
16708 /* The Sun linker took the AMD64 TLS spec literally
16709 and can only handle %rax as the destination of the
16710 initial-exec code sequence. */
16712 dest = gen_reg_rtx (DImode);
16713 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16714 return dest;
16717 /* Generate DImode references to avoid %fs:(%reg32)
16718 problems and the linker IE->LE relaxation bug. */
16719 tp_mode = DImode;
16720 pic = NULL;
16721 type = UNSPEC_GOTNTPOFF;
16723 else if (flag_pic)
16725 pic = pic_offset_table_rtx;
16726 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16728 else if (!TARGET_ANY_GNU_TLS)
16730 pic = gen_reg_rtx (Pmode);
16731 emit_insn (gen_set_got (pic));
16732 type = UNSPEC_GOTTPOFF;
16734 else
16736 pic = NULL;
16737 type = UNSPEC_INDNTPOFF;
16740 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16741 off = gen_rtx_CONST (tp_mode, off);
16742 if (pic)
16743 off = gen_rtx_PLUS (tp_mode, pic, off);
16744 off = gen_const_mem (tp_mode, off);
16745 set_mem_alias_set (off, ix86_GOT_alias_set ());
16747 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16749 base = get_thread_pointer (tp_mode,
16750 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16751 off = force_reg (tp_mode, off);
16752 dest = gen_rtx_PLUS (tp_mode, base, off);
16753 if (tp_mode != Pmode)
16754 dest = convert_to_mode (Pmode, dest, 1);
16756 else
16758 base = get_thread_pointer (Pmode, true);
16759 dest = gen_reg_rtx (Pmode);
16760 emit_insn (ix86_gen_sub3 (dest, base, off));
16762 break;
16764 case TLS_MODEL_LOCAL_EXEC:
16765 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16766 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16767 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16768 off = gen_rtx_CONST (Pmode, off);
16770 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16772 base = get_thread_pointer (Pmode,
16773 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16774 return gen_rtx_PLUS (Pmode, base, off);
16776 else
16778 base = get_thread_pointer (Pmode, true);
16779 dest = gen_reg_rtx (Pmode);
16780 emit_insn (ix86_gen_sub3 (dest, base, off));
16782 break;
16784 default:
16785 gcc_unreachable ();
16788 return dest;
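/* For reference, a read of "__thread int x;" expands to roughly the
   following sequences on a 64-bit GNU/Linux-style target (illustrative
   only; options and linker relaxations can change the exact code):

     global-dynamic:  leaq x@tlsgd(%rip), %rdi
                      call __tls_get_addr@PLT
                      movl (%rax), %eax
     local-dynamic:   leaq x@tlsld(%rip), %rdi
                      call __tls_get_addr@PLT
                      movl x@dtpoff(%rax), %eax
     initial-exec:    movq x@gottpoff(%rip), %rax
                      movl %fs:(%rax), %eax
     local-exec:      movl %fs:x@tpoff, %eax  */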
16791 /* Return true if OP refers to a TLS address. */
16792 bool
16793 ix86_tls_address_pattern_p (rtx op)
16795 subrtx_var_iterator::array_type array;
16796 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16798 rtx op = *iter;
16799 if (MEM_P (op))
16801 rtx *x = &XEXP (op, 0);
16802 while (GET_CODE (*x) == PLUS)
16804 int i;
16805 for (i = 0; i < 2; i++)
16807 rtx u = XEXP (*x, i);
16808 if (GET_CODE (u) == ZERO_EXTEND)
16809 u = XEXP (u, 0);
16810 if (GET_CODE (u) == UNSPEC
16811 && XINT (u, 1) == UNSPEC_TP)
16812 return true;
16814 x = &XEXP (*x, 0);
16817 iter.skip_subrtxes ();
16821 return false;
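/* For example, this matches a MEM whose address has the shape
     (plus (unspec [(const_int 0)] UNSPEC_TP) (reg ...))
   possibly with the UNSPEC wrapped in a ZERO_EXTEND, i.e. a TLS access
   that still adds the thread pointer explicitly instead of using the
   %fs/%gs address space.  */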
16824 /* Rewrite *LOC so that it refers to a default TLS address space. */
16825 void
16826 ix86_rewrite_tls_address_1 (rtx *loc)
16828 subrtx_ptr_iterator::array_type array;
16829 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16831 rtx *loc = *iter;
16832 if (MEM_P (*loc))
16834 rtx addr = XEXP (*loc, 0);
16835 rtx *x = &addr;
16836 while (GET_CODE (*x) == PLUS)
16838 int i;
16839 for (i = 0; i < 2; i++)
16841 rtx u = XEXP (*x, i);
16842 if (GET_CODE (u) == ZERO_EXTEND)
16843 u = XEXP (u, 0);
16844 if (GET_CODE (u) == UNSPEC
16845 && XINT (u, 1) == UNSPEC_TP)
16847 addr_space_t as = DEFAULT_TLS_SEG_REG;
16849 *x = XEXP (*x, 1 - i);
16851 *loc = replace_equiv_address_nv (*loc, addr, true);
16852 set_mem_addr_space (*loc, as);
16853 return;
16856 x = &XEXP (*x, 0);
16859 iter.skip_subrtxes ();
16864 /* Rewrite an instruction pattern involving a TLS address
16865 so that it refers to the default TLS address space. */
16867 ix86_rewrite_tls_address (rtx pattern)
16869 pattern = copy_insn (pattern);
16870 ix86_rewrite_tls_address_1 (&pattern);
16871 return pattern;
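/* For example, a MEM whose address is
     (plus (unspec [(const_int 0)] UNSPEC_TP) (reg R))
   is rewritten into a MEM of just (reg R) in the DEFAULT_TLS_SEG_REG
   address space, so it is printed as %fs:(R) or %gs:(R) rather than
   materializing the thread pointer in a register first.  */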
16874 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16875 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16876 unique refptr-DECL symbol corresponding to symbol DECL. */
16878 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16880 static inline hashval_t hash (tree_map *m) { return m->hash; }
16881 static inline bool
16882 equal (tree_map *a, tree_map *b)
16884 return a->base.from == b->base.from;
16887 static int
16888 keep_cache_entry (tree_map *&m)
16890 return ggc_marked_p (m->base.from);
16894 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16896 static tree
16897 get_dllimport_decl (tree decl, bool beimport)
16899 struct tree_map *h, in;
16900 const char *name;
16901 const char *prefix;
16902 size_t namelen, prefixlen;
16903 char *imp_name;
16904 tree to;
16905 rtx rtl;
16907 if (!dllimport_map)
16908 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16910 in.hash = htab_hash_pointer (decl);
16911 in.base.from = decl;
16912 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16913 h = *loc;
16914 if (h)
16915 return h->to;
16917 *loc = h = ggc_alloc<tree_map> ();
16918 h->hash = in.hash;
16919 h->base.from = decl;
16920 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16921 VAR_DECL, NULL, ptr_type_node);
16922 DECL_ARTIFICIAL (to) = 1;
16923 DECL_IGNORED_P (to) = 1;
16924 DECL_EXTERNAL (to) = 1;
16925 TREE_READONLY (to) = 1;
16927 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16928 name = targetm.strip_name_encoding (name);
16929 if (beimport)
16930 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16931 ? "*__imp_" : "*__imp__";
16932 else
16933 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16934 namelen = strlen (name);
16935 prefixlen = strlen (prefix);
16936 imp_name = (char *) alloca (namelen + prefixlen + 1);
16937 memcpy (imp_name, prefix, prefixlen);
16938 memcpy (imp_name + prefixlen, name, namelen + 1);
16940 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16941 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16942 SET_SYMBOL_REF_DECL (rtl, to);
16943 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16944 if (!beimport)
16946 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16947 #ifdef SUB_TARGET_RECORD_STUB
16948 SUB_TARGET_RECORD_STUB (name);
16949 #endif
16952 rtl = gen_const_mem (Pmode, rtl);
16953 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16955 SET_DECL_RTL (to, rtl);
16956 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16958 return to;
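/* For instance, a dllimported `foo' gets an artificial read-only
   declaration named "__imp_foo" (or "__imp__foo" when a user label
   prefix is in use) whose DECL_RTL is a MEM; references to `foo' then
   go through that extra indirection, matching the import-table slot
   the linker fills in.  The "refptr." spelling is the analogous stub
   created when BEIMPORT is false.  */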
16961 /* Expand SYMBOL into its corresponding far-address symbol.
16962 WANT_REG is true if we require the result be a register. */
16964 static rtx
16965 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16967 tree imp_decl;
16968 rtx x;
16970 gcc_assert (SYMBOL_REF_DECL (symbol));
16971 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16973 x = DECL_RTL (imp_decl);
16974 if (want_reg)
16975 x = force_reg (Pmode, x);
16976 return x;
16979 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16980 true if we require the result be a register. */
16982 static rtx
16983 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16985 tree imp_decl;
16986 rtx x;
16988 gcc_assert (SYMBOL_REF_DECL (symbol));
16989 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16991 x = DECL_RTL (imp_decl);
16992 if (want_reg)
16993 x = force_reg (Pmode, x);
16994 return x;
16997 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
16998 is true if we require the result be a register. */
17000 static rtx
17001 legitimize_pe_coff_symbol (rtx addr, bool inreg)
17003 if (!TARGET_PECOFF)
17004 return NULL_RTX;
17006 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17008 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
17009 return legitimize_dllimport_symbol (addr, inreg);
17010 if (GET_CODE (addr) == CONST
17011 && GET_CODE (XEXP (addr, 0)) == PLUS
17012 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17013 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
17015 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
17016 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17020 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
17021 return NULL_RTX;
17022 if (GET_CODE (addr) == SYMBOL_REF
17023 && !is_imported_p (addr)
17024 && SYMBOL_REF_EXTERNAL_P (addr)
17025 && SYMBOL_REF_DECL (addr))
17026 return legitimize_pe_coff_extern_decl (addr, inreg);
17028 if (GET_CODE (addr) == CONST
17029 && GET_CODE (XEXP (addr, 0)) == PLUS
17030 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17031 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
17032 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
17033 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
17035 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
17036 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17038 return NULL_RTX;
17041 /* Try machine-dependent ways of modifying an illegitimate address
17042 to be legitimate. If we find one, return the new, valid address.
17043 This macro is used in only one place: `memory_address' in explow.c.
17045 OLDX is the address as it was before break_out_memory_refs was called.
17046 In some cases it is useful to look at this to decide what needs to be done.
17048 It is always safe for this macro to do nothing. It exists to recognize
17049 opportunities to optimize the output.
17051 For the 80386, we handle X+REG by loading X into a register R and
17052 using R+REG. R will go in a general reg and indexing will be used.
17053 However, if REG is a broken-out memory address or multiplication,
17054 nothing needs to be done because REG can certainly go in a general reg.
17056 When -fpic is used, special handling is needed for symbolic references.
17057 See comments by legitimize_pic_address in i386.c for details. */
17059 static rtx
17060 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
17062 bool changed = false;
17063 unsigned log;
17065 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
17066 if (log)
17067 return legitimize_tls_address (x, (enum tls_model) log, false);
17068 if (GET_CODE (x) == CONST
17069 && GET_CODE (XEXP (x, 0)) == PLUS
17070 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
17071 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
17073 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
17074 (enum tls_model) log, false);
17075 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
17078 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17080 rtx tmp = legitimize_pe_coff_symbol (x, true);
17081 if (tmp)
17082 return tmp;
17085 if (flag_pic && SYMBOLIC_CONST (x))
17086 return legitimize_pic_address (x, 0);
17088 #if TARGET_MACHO
17089 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
17090 return machopic_indirect_data_reference (x, 0);
17091 #endif
17093 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
17094 if (GET_CODE (x) == ASHIFT
17095 && CONST_INT_P (XEXP (x, 1))
17096 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
17098 changed = true;
17099 log = INTVAL (XEXP (x, 1));
17100 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
17101 GEN_INT (1 << log));
17104 if (GET_CODE (x) == PLUS)
17106 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17108 if (GET_CODE (XEXP (x, 0)) == ASHIFT
17109 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17110 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
17112 changed = true;
17113 log = INTVAL (XEXP (XEXP (x, 0), 1));
17114 XEXP (x, 0) = gen_rtx_MULT (Pmode,
17115 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
17116 GEN_INT (1 << log));
17119 if (GET_CODE (XEXP (x, 1)) == ASHIFT
17120 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
17121 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
17123 changed = true;
17124 log = INTVAL (XEXP (XEXP (x, 1), 1));
17125 XEXP (x, 1) = gen_rtx_MULT (Pmode,
17126 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
17127 GEN_INT (1 << log));
17130 /* Put multiply first if it isn't already. */
17131 if (GET_CODE (XEXP (x, 1)) == MULT)
17133 std::swap (XEXP (x, 0), XEXP (x, 1));
17134 changed = true;
17137 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
17138 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
17139 created by virtual register instantiation, register elimination, and
17140 similar optimizations. */
17141 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
17143 changed = true;
17144 x = gen_rtx_PLUS (Pmode,
17145 gen_rtx_PLUS (Pmode, XEXP (x, 0),
17146 XEXP (XEXP (x, 1), 0)),
17147 XEXP (XEXP (x, 1), 1));
17150 /* Canonicalize
17151 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
17152 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
17153 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
17154 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
17155 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
17156 && CONSTANT_P (XEXP (x, 1)))
17158 rtx constant;
17159 rtx other = NULL_RTX;
17161 if (CONST_INT_P (XEXP (x, 1)))
17163 constant = XEXP (x, 1);
17164 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17166 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17168 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17169 other = XEXP (x, 1);
17171 else
17172 constant = 0;
17174 if (constant)
17176 changed = true;
17177 x = gen_rtx_PLUS (Pmode,
17178 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17179 XEXP (XEXP (XEXP (x, 0), 1), 0)),
17180 plus_constant (Pmode, other,
17181 INTVAL (constant)));
17185 if (changed && ix86_legitimate_address_p (mode, x, false))
17186 return x;
17188 if (GET_CODE (XEXP (x, 0)) == MULT)
17190 changed = true;
17191 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17194 if (GET_CODE (XEXP (x, 1)) == MULT)
17196 changed = true;
17197 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17200 if (changed
17201 && REG_P (XEXP (x, 1))
17202 && REG_P (XEXP (x, 0)))
17203 return x;
17205 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17207 changed = true;
17208 x = legitimize_pic_address (x, 0);
17211 if (changed && ix86_legitimate_address_p (mode, x, false))
17212 return x;
17214 if (REG_P (XEXP (x, 0)))
17216 rtx temp = gen_reg_rtx (Pmode);
17217 rtx val = force_operand (XEXP (x, 1), temp);
17218 if (val != temp)
17220 val = convert_to_mode (Pmode, val, 1);
17221 emit_move_insn (temp, val);
17224 XEXP (x, 1) = temp;
17225 return x;
17228 else if (REG_P (XEXP (x, 1)))
17230 rtx temp = gen_reg_rtx (Pmode);
17231 rtx val = force_operand (XEXP (x, 0), temp);
17232 if (val != temp)
17234 val = convert_to_mode (Pmode, val, 1);
17235 emit_move_insn (temp, val);
17238 XEXP (x, 0) = temp;
17239 return x;
17243 return x;
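/* A worked example of the reassociation above: the address
     (plus (mult (reg A) (const_int 4)) (plus (reg B) (const_int 16)))
   becomes
     (plus (plus (mult (reg A) (const_int 4)) (reg B)) (const_int 16))
   which matches the machine's base + index*scale + disp form, so
   ix86_legitimate_address_p accepts it without further copies.  */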
17246 /* Print an integer constant expression in assembler syntax. Addition
17247 and subtraction are the only arithmetic that may appear in these
17248 expressions. FILE is the stdio stream to write to, X is the rtx, and
17249 CODE is the operand print code from the output string. */
17251 static void
17252 output_pic_addr_const (FILE *file, rtx x, int code)
17254 char buf[256];
17256 switch (GET_CODE (x))
17258 case PC:
17259 gcc_assert (flag_pic);
17260 putc ('.', file);
17261 break;
17263 case SYMBOL_REF:
17264 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17265 output_addr_const (file, x);
17266 else
17268 const char *name = XSTR (x, 0);
17270 /* Mark the decl as referenced so that cgraph will
17271 output the function. */
17272 if (SYMBOL_REF_DECL (x))
17273 mark_decl_referenced (SYMBOL_REF_DECL (x));
17275 #if TARGET_MACHO
17276 if (MACHOPIC_INDIRECT
17277 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17278 name = machopic_indirection_name (x, /*stub_p=*/true);
17279 #endif
17280 assemble_name (file, name);
17282 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17283 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17284 fputs ("@PLT", file);
17285 break;
17287 case LABEL_REF:
17288 x = XEXP (x, 0);
17289 /* FALLTHRU */
17290 case CODE_LABEL:
17291 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17292 assemble_name (asm_out_file, buf);
17293 break;
17295 case CONST_INT:
17296 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17297 break;
17299 case CONST:
17300 /* This used to output parentheses around the expression,
17301 but that does not work on the 386 (either ATT or BSD assembler). */
17302 output_pic_addr_const (file, XEXP (x, 0), code);
17303 break;
17305 case CONST_DOUBLE:
17306 /* We can't handle floating point constants;
17307 TARGET_PRINT_OPERAND must handle them. */
17308 output_operand_lossage ("floating constant misused");
17309 break;
17311 case PLUS:
17312 /* Some assemblers need integer constants to appear first. */
17313 if (CONST_INT_P (XEXP (x, 0)))
17315 output_pic_addr_const (file, XEXP (x, 0), code);
17316 putc ('+', file);
17317 output_pic_addr_const (file, XEXP (x, 1), code);
17319 else
17321 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17322 output_pic_addr_const (file, XEXP (x, 1), code);
17323 putc ('+', file);
17324 output_pic_addr_const (file, XEXP (x, 0), code);
17326 break;
17328 case MINUS:
17329 if (!TARGET_MACHO)
17330 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17331 output_pic_addr_const (file, XEXP (x, 0), code);
17332 putc ('-', file);
17333 output_pic_addr_const (file, XEXP (x, 1), code);
17334 if (!TARGET_MACHO)
17335 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17336 break;
17338 case UNSPEC:
17339 gcc_assert (XVECLEN (x, 0) == 1);
17340 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17341 switch (XINT (x, 1))
17343 case UNSPEC_GOT:
17344 fputs ("@GOT", file);
17345 break;
17346 case UNSPEC_GOTOFF:
17347 fputs ("@GOTOFF", file);
17348 break;
17349 case UNSPEC_PLTOFF:
17350 fputs ("@PLTOFF", file);
17351 break;
17352 case UNSPEC_PCREL:
17353 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17354 "(%rip)" : "[rip]", file);
17355 break;
17356 case UNSPEC_GOTPCREL:
17357 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17358 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17359 break;
17360 case UNSPEC_GOTTPOFF:
17361 /* FIXME: This might be @TPOFF in Sun ld too. */
17362 fputs ("@gottpoff", file);
17363 break;
17364 case UNSPEC_TPOFF:
17365 fputs ("@tpoff", file);
17366 break;
17367 case UNSPEC_NTPOFF:
17368 if (TARGET_64BIT)
17369 fputs ("@tpoff", file);
17370 else
17371 fputs ("@ntpoff", file);
17372 break;
17373 case UNSPEC_DTPOFF:
17374 fputs ("@dtpoff", file);
17375 break;
17376 case UNSPEC_GOTNTPOFF:
17377 if (TARGET_64BIT)
17378 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17379 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17380 else
17381 fputs ("@gotntpoff", file);
17382 break;
17383 case UNSPEC_INDNTPOFF:
17384 fputs ("@indntpoff", file);
17385 break;
17386 #if TARGET_MACHO
17387 case UNSPEC_MACHOPIC_OFFSET:
17388 putc ('-', file);
17389 machopic_output_function_base_name (file);
17390 break;
17391 #endif
17392 default:
17393 output_operand_lossage ("invalid UNSPEC as operand");
17394 break;
17396 break;
17398 default:
17399 output_operand_lossage ("invalid expression as operand");
17403 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17404 We need to emit DTP-relative relocations. */
17406 static void ATTRIBUTE_UNUSED
17407 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
17409 fputs (ASM_LONG, file);
17410 output_addr_const (file, x);
17411 fputs ("@dtpoff", file);
17412 switch (size)
17414 case 4:
17415 break;
17416 case 8:
17417 fputs (", 0", file);
17418 break;
17419 default:
17420 gcc_unreachable ();
17424 /* Return true if X is a representation of the PIC register. This copes
17425 with calls from ix86_find_base_term, where the register might have
17426 been replaced by a cselib value. */
17428 static bool
17429 ix86_pic_register_p (rtx x)
17431 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
17432 return (pic_offset_table_rtx
17433 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
17434 else if (!REG_P (x))
17435 return false;
17436 else if (pic_offset_table_rtx)
17438 if (REGNO (x) == REGNO (pic_offset_table_rtx))
17439 return true;
17440 if (HARD_REGISTER_P (x)
17441 && !HARD_REGISTER_P (pic_offset_table_rtx)
17442 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
17443 return true;
17444 return false;
17446 else
17447 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
17450 /* Helper function for ix86_delegitimize_address.
17451 Attempt to delegitimize TLS local-exec accesses. */
17453 static rtx
17454 ix86_delegitimize_tls_address (rtx orig_x)
17456 rtx x = orig_x, unspec;
17457 struct ix86_address addr;
17459 if (!TARGET_TLS_DIRECT_SEG_REFS)
17460 return orig_x;
17461 if (MEM_P (x))
17462 x = XEXP (x, 0);
17463 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17464 return orig_x;
17465 if (ix86_decompose_address (x, &addr) == 0
17466 || addr.seg != DEFAULT_TLS_SEG_REG
17467 || addr.disp == NULL_RTX
17468 || GET_CODE (addr.disp) != CONST)
17469 return orig_x;
17470 unspec = XEXP (addr.disp, 0);
17471 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17472 unspec = XEXP (unspec, 0);
17473 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17474 return orig_x;
17475 x = XVECEXP (unspec, 0, 0);
17476 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17477 if (unspec != XEXP (addr.disp, 0))
17478 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17479 if (addr.index)
17481 rtx idx = addr.index;
17482 if (addr.scale != 1)
17483 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17484 x = gen_rtx_PLUS (Pmode, idx, x);
17486 if (addr.base)
17487 x = gen_rtx_PLUS (Pmode, addr.base, x);
17488 if (MEM_P (orig_x))
17489 x = replace_equiv_address_nv (orig_x, x);
17490 return x;
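/* E.g. a local-exec access already rewritten into the %fs/%gs address
   space, whose displacement is (const (unspec [x] UNSPEC_NTPOFF)), is
   turned back into a reference to the symbol `x' (plus any base/index
   terms), which is the form the debug and alias machinery expect.  */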
17493 /* In the name of slightly smaller debug output, and to cater to
17494 general assembler lossage, recognize PIC+GOTOFF and turn it back
17495 into a direct symbol reference.
17497 On Darwin, this is necessary to avoid a crash, because Darwin
17498 has a different PIC label for each routine but the DWARF debugging
17499 information is not associated with any particular routine, so it's
17500 necessary to remove references to the PIC label from RTL stored by
17501 the DWARF output code.
17503 This helper is used in the normal ix86_delegitimize_address
17504 entrypoint (e.g. used in the target delegitimization hook) and
17505 in ix86_find_base_term. As a compile-time memory optimization, we
17506 avoid allocating rtxes that will not change the outcome for
17507 the callers (find_base_value and find_base_term). */
17509 static inline rtx
17510 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
17512 rtx orig_x = delegitimize_mem_from_attrs (x);
17513 /* addend is NULL or some rtx if x is something+GOTOFF where
17514 something doesn't include the PIC register. */
17515 rtx addend = NULL_RTX;
17516 /* reg_addend is NULL or a multiple of some register. */
17517 rtx reg_addend = NULL_RTX;
17518 /* const_addend is NULL or a const_int. */
17519 rtx const_addend = NULL_RTX;
17520 /* This is the result, or NULL. */
17521 rtx result = NULL_RTX;
17523 x = orig_x;
17525 if (MEM_P (x))
17526 x = XEXP (x, 0);
17528 if (TARGET_64BIT)
17530 if (GET_CODE (x) == CONST
17531 && GET_CODE (XEXP (x, 0)) == PLUS
17532 && GET_MODE (XEXP (x, 0)) == Pmode
17533 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17534 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17535 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17537 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
17538 base. A CONST can't be arg_pointer_rtx based. */
17539 if (base_term_p && MEM_P (orig_x))
17540 return orig_x;
17541 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17542 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17543 if (MEM_P (orig_x))
17544 x = replace_equiv_address_nv (orig_x, x);
17545 return x;
17548 if (GET_CODE (x) == CONST
17549 && GET_CODE (XEXP (x, 0)) == UNSPEC
17550 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17551 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17552 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17554 x = XVECEXP (XEXP (x, 0), 0, 0);
17555 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17557 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17558 if (x == NULL_RTX)
17559 return orig_x;
17561 return x;
17564 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17565 return ix86_delegitimize_tls_address (orig_x);
17567 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17568 and -mcmodel=medium -fpic. */
17571 if (GET_CODE (x) != PLUS
17572 || GET_CODE (XEXP (x, 1)) != CONST)
17573 return ix86_delegitimize_tls_address (orig_x);
17575 if (ix86_pic_register_p (XEXP (x, 0)))
17576 /* %ebx + GOT/GOTOFF */
17578 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17580 /* %ebx + %reg * scale + GOT/GOTOFF */
17581 reg_addend = XEXP (x, 0);
17582 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17583 reg_addend = XEXP (reg_addend, 1);
17584 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17585 reg_addend = XEXP (reg_addend, 0);
17586 else
17588 reg_addend = NULL_RTX;
17589 addend = XEXP (x, 0);
17592 else
17593 addend = XEXP (x, 0);
17595 x = XEXP (XEXP (x, 1), 0);
17596 if (GET_CODE (x) == PLUS
17597 && CONST_INT_P (XEXP (x, 1)))
17599 const_addend = XEXP (x, 1);
17600 x = XEXP (x, 0);
17603 if (GET_CODE (x) == UNSPEC
17604 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17605 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17606 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17607 && !MEM_P (orig_x) && !addend)))
17608 result = XVECEXP (x, 0, 0);
17610 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17611 && !MEM_P (orig_x))
17612 result = XVECEXP (x, 0, 0);
17614 if (! result)
17615 return ix86_delegitimize_tls_address (orig_x);
17617 /* For (PLUS something CONST_INT) both find_base_{value,term} just
17618 recurse on the first operand. */
17619 if (const_addend && !base_term_p)
17620 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17621 if (reg_addend)
17622 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17623 if (addend)
17625 /* If the rest of original X doesn't involve the PIC register, add
17626 addend and subtract pic_offset_table_rtx. This can happen e.g.
17627 for code like:
17628 leal (%ebx, %ecx, 4), %ecx
17630 movl foo@GOTOFF(%ecx), %edx
17631 in which case we return (%ecx - %ebx) + foo
17632 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17633 and reload has completed. Don't do the latter for debug,
17634 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
17635 if (pic_offset_table_rtx
17636 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17637 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17638 pic_offset_table_rtx),
17639 result);
17640 else if (base_term_p
17641 && pic_offset_table_rtx
17642 && !TARGET_MACHO
17643 && !TARGET_VXWORKS_RTP)
17645 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17646 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17647 result = gen_rtx_PLUS (Pmode, tmp, result);
17649 else
17650 return orig_x;
17652 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17654 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17655 if (result == NULL_RTX)
17656 return orig_x;
17658 return result;
17661 /* The normal instantiation of the above template. */
17663 static rtx
17664 ix86_delegitimize_address (rtx x)
17666 return ix86_delegitimize_address_1 (x, false);
17669 /* If X is a machine specific address (i.e. a symbol or label being
17670 referenced as a displacement from the GOT implemented using an
17671 UNSPEC), then return the base term. Otherwise return X. */
17674 ix86_find_base_term (rtx x)
17676 rtx term;
17678 if (TARGET_64BIT)
17680 if (GET_CODE (x) != CONST)
17681 return x;
17682 term = XEXP (x, 0);
17683 if (GET_CODE (term) == PLUS
17684 && CONST_INT_P (XEXP (term, 1)))
17685 term = XEXP (term, 0);
17686 if (GET_CODE (term) != UNSPEC
17687 || (XINT (term, 1) != UNSPEC_GOTPCREL
17688 && XINT (term, 1) != UNSPEC_PCREL))
17689 return x;
17691 return XVECEXP (term, 0, 0);
17694 return ix86_delegitimize_address_1 (x, true);
17697 /* Return true if X shouldn't be emitted into the debug info.
17698 Disallow UNSPECs other than @gotoff - we can't easily emit the
17699 _GLOBAL_OFFSET_TABLE_ symbol into the .debug_info section, so
17700 instead of delegitimizing such references we assemble them as @gotoff.
17701 Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17702 assembles that as a _GLOBAL_OFFSET_TABLE_-. expression. */
17704 static bool
17705 ix86_const_not_ok_for_debug_p (rtx x)
17707 if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17708 return true;
17710 if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17711 return true;
17713 return false;
17716 static void
17717 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17718 bool fp, FILE *file)
17720 const char *suffix;
17722 if (mode == CCFPmode)
17724 code = ix86_fp_compare_code_to_integer (code);
17725 mode = CCmode;
17727 if (reverse)
17728 code = reverse_condition (code);
17730 switch (code)
17732 case EQ:
17733 gcc_assert (mode != CCGZmode);
17734 switch (mode)
17736 case E_CCAmode:
17737 suffix = "a";
17738 break;
17739 case E_CCCmode:
17740 suffix = "c";
17741 break;
17742 case E_CCOmode:
17743 suffix = "o";
17744 break;
17745 case E_CCPmode:
17746 suffix = "p";
17747 break;
17748 case E_CCSmode:
17749 suffix = "s";
17750 break;
17751 default:
17752 suffix = "e";
17753 break;
17755 break;
17756 case NE:
17757 gcc_assert (mode != CCGZmode);
17758 switch (mode)
17760 case E_CCAmode:
17761 suffix = "na";
17762 break;
17763 case E_CCCmode:
17764 suffix = "nc";
17765 break;
17766 case E_CCOmode:
17767 suffix = "no";
17768 break;
17769 case E_CCPmode:
17770 suffix = "np";
17771 break;
17772 case E_CCSmode:
17773 suffix = "ns";
17774 break;
17775 default:
17776 suffix = "ne";
17777 break;
17779 break;
17780 case GT:
17781 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17782 suffix = "g";
17783 break;
17784 case GTU:
17785 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17786 Those same assemblers have the same but opposite lossage on cmov. */
17787 if (mode == CCmode)
17788 suffix = fp ? "nbe" : "a";
17789 else
17790 gcc_unreachable ();
17791 break;
17792 case LT:
17793 switch (mode)
17795 case E_CCNOmode:
17796 case E_CCGOCmode:
17797 suffix = "s";
17798 break;
17800 case E_CCmode:
17801 case E_CCGCmode:
17802 case E_CCGZmode:
17803 suffix = "l";
17804 break;
17806 default:
17807 gcc_unreachable ();
17809 break;
17810 case LTU:
17811 if (mode == CCmode || mode == CCGZmode)
17812 suffix = "b";
17813 else if (mode == CCCmode)
17814 suffix = fp ? "b" : "c";
17815 else
17816 gcc_unreachable ();
17817 break;
17818 case GE:
17819 switch (mode)
17821 case E_CCNOmode:
17822 case E_CCGOCmode:
17823 suffix = "ns";
17824 break;
17826 case E_CCmode:
17827 case E_CCGCmode:
17828 case E_CCGZmode:
17829 suffix = "ge";
17830 break;
17832 default:
17833 gcc_unreachable ();
17835 break;
17836 case GEU:
17837 if (mode == CCmode || mode == CCGZmode)
17838 suffix = "nb";
17839 else if (mode == CCCmode)
17840 suffix = fp ? "nb" : "nc";
17841 else
17842 gcc_unreachable ();
17843 break;
17844 case LE:
17845 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17846 suffix = "le";
17847 break;
17848 case LEU:
17849 if (mode == CCmode)
17850 suffix = "be";
17851 else
17852 gcc_unreachable ();
17853 break;
17854 case UNORDERED:
17855 suffix = fp ? "u" : "p";
17856 break;
17857 case ORDERED:
17858 suffix = fp ? "nu" : "np";
17859 break;
17860 default:
17861 gcc_unreachable ();
17863 fputs (suffix, file);
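/* For example, (EQ, CCZmode) prints "e" and (GTU, CCmode) prints "a",
   so an output template such as "set%C0\t%b0" expands to "sete %al" or
   "seta %al"; with REVERSE set the same comparisons print "ne" and
   "be" respectively.  */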
17866 /* Print the name of register X to FILE based on its machine mode and number.
17867 If CODE is 'w', pretend the mode is HImode.
17868 If CODE is 'b', pretend the mode is QImode.
17869 If CODE is 'k', pretend the mode is SImode.
17870 If CODE is 'q', pretend the mode is DImode.
17871 If CODE is 'x', pretend the mode is V4SFmode.
17872 If CODE is 't', pretend the mode is V8SFmode.
17873 If CODE is 'g', pretend the mode is V16SFmode.
17874 If CODE is 'h', pretend the reg is the 'high' byte register.
17875 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
17876 If CODE is 'd', duplicate the operand for AVX instruction.
17877 If CODE is 'V', print naked full integer register name without %.
17880 void
17881 print_reg (rtx x, int code, FILE *file)
17883 const char *reg;
17884 int msize;
17885 unsigned int regno;
17886 bool duplicated;
17888 if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V')
17889 putc ('%', file);
17891 if (x == pc_rtx)
17893 gcc_assert (TARGET_64BIT);
17894 fputs ("rip", file);
17895 return;
17898 if (code == 'y' && STACK_TOP_P (x))
17900 fputs ("st(0)", file);
17901 return;
17904 if (code == 'w')
17905 msize = 2;
17906 else if (code == 'b')
17907 msize = 1;
17908 else if (code == 'k')
17909 msize = 4;
17910 else if (code == 'q')
17911 msize = 8;
17912 else if (code == 'h')
17913 msize = 0;
17914 else if (code == 'x')
17915 msize = 16;
17916 else if (code == 't')
17917 msize = 32;
17918 else if (code == 'g')
17919 msize = 64;
17920 else
17921 msize = GET_MODE_SIZE (GET_MODE (x));
17923 regno = REGNO (x);
17925 if (regno == ARG_POINTER_REGNUM
17926 || regno == FRAME_POINTER_REGNUM
17927 || regno == FPSR_REG
17928 || regno == FPCR_REG)
17930 output_operand_lossage
17931 ("invalid use of register '%s'", reg_names[regno]);
17932 return;
17934 else if (regno == FLAGS_REG)
17936 output_operand_lossage ("invalid use of asm flag output");
17937 return;
17940 if (code == 'V')
17942 if (GENERAL_REGNO_P (regno))
17943 msize = GET_MODE_SIZE (word_mode);
17944 else
17945 error ("'V' modifier on non-integer register");
17948 duplicated = code == 'd' && TARGET_AVX;
17950 switch (msize)
17952 case 16:
17953 case 12:
17954 case 8:
17955 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17956 warning (0, "unsupported size for integer register");
17957 /* FALLTHRU */
17958 case 4:
17959 if (LEGACY_INT_REGNO_P (regno))
17960 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17961 /* FALLTHRU */
17962 case 2:
17963 normal:
17964 reg = hi_reg_name[regno];
17965 break;
17966 case 1:
17967 if (regno >= ARRAY_SIZE (qi_reg_name))
17968 goto normal;
17969 if (!ANY_QI_REGNO_P (regno))
17970 error ("unsupported size for integer register");
17971 reg = qi_reg_name[regno];
17972 break;
17973 case 0:
17974 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17975 goto normal;
17976 reg = qi_high_reg_name[regno];
17977 break;
17978 case 32:
17979 case 64:
17980 if (SSE_REGNO_P (regno))
17982 gcc_assert (!duplicated);
17983 putc (msize == 32 ? 'y' : 'z', file);
17984 reg = hi_reg_name[regno] + 1;
17985 break;
17987 goto normal;
17988 default:
17989 gcc_unreachable ();
17992 fputs (reg, file);
17994 /* Irritatingly, AMD extended registers use
17995 a different naming convention: "r%d[bwd]". */
17996 if (REX_INT_REGNO_P (regno))
17998 gcc_assert (TARGET_64BIT);
17999 switch (msize)
18001 case 0:
18002 error ("extended registers have no high halves");
18003 break;
18004 case 1:
18005 putc ('b', file);
18006 break;
18007 case 2:
18008 putc ('w', file);
18009 break;
18010 case 4:
18011 putc ('d', file);
18012 break;
18013 case 8:
18014 /* no suffix */
18015 break;
18016 default:
18017 error ("unsupported operand size for extended register");
18018 break;
18020 return;
18023 if (duplicated)
18025 if (ASSEMBLER_DIALECT == ASM_ATT)
18026 fprintf (file, ", %%%s", reg);
18027 else
18028 fprintf (file, ", %s", reg);
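/* For instance, if operands[0] is integer register 0, "%q0" prints
   %rax, "%k0" prints %eax, "%w0" prints %ax, "%b0" prints %al and
   "%h0" prints %ah; for an SSE register, the 'x', 't' and 'g' codes
   select the %xmmN, %ymmN and %zmmN spellings respectively.  */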
18032 /* Meaning of CODE:
18033 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
18034 C -- print opcode suffix for set/cmov insn.
18035 c -- like C, but print reversed condition
18036 F,f -- likewise, but for floating-point.
18037 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
18038 otherwise nothing
18039 R -- print embedded rounding and sae.
18040 r -- print only sae.
18041 z -- print the opcode suffix for the size of the current operand.
18042 Z -- likewise, with special suffixes for x87 instructions.
18043 * -- print a star (in certain assembler syntax)
18044 A -- print an absolute memory reference.
18045 E -- print address with DImode register names if TARGET_64BIT.
18046 w -- print the operand as if it's a "word" (HImode) even if it isn't.
18047 s -- print a shift double count, followed by the assembler's argument
18048 delimiter.
18049 b -- print the QImode name of the register for the indicated operand.
18050 %b0 would print %al if operands[0] is reg 0.
18051 w -- likewise, print the HImode name of the register.
18052 k -- likewise, print the SImode name of the register.
18053 q -- likewise, print the DImode name of the register.
18054 x -- likewise, print the V4SFmode name of the register.
18055 t -- likewise, print the V8SFmode name of the register.
18056 g -- likewise, print the V16SFmode name of the register.
18057 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
18058 y -- print "st(0)" instead of "st" as a register.
18059 d -- print duplicated register operand for AVX instruction.
18060 D -- print condition for SSE cmp instruction.
18061 P -- if PIC, print an @PLT suffix.
18062 p -- print raw symbol name.
18063 X -- don't print any sort of PIC '@' suffix for a symbol.
18064 & -- print some in-use local-dynamic symbol name.
18065 H -- print a memory address offset by 8; used for SSE high-parts.
18066 Y -- print condition for XOP pcom* instruction.
18067 V -- print naked full integer register name without %.
18068 + -- print a branch hint as 'cs' or 'ds' prefix
18069 ; -- print a semicolon (after prefixes, due to a bug in older gas).
18070 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
18071 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
18072 ! -- print MPX prefix for jxx/call/ret instructions if required.
18075 void
18076 ix86_print_operand (FILE *file, rtx x, int code)
18078 if (code)
18080 switch (code)
18082 case 'A':
18083 switch (ASSEMBLER_DIALECT)
18085 case ASM_ATT:
18086 putc ('*', file);
18087 break;
18089 case ASM_INTEL:
18090 /* Intel syntax. For absolute addresses, registers should not
18091 be surrounded by brackets. */
18092 if (!REG_P (x))
18094 putc ('[', file);
18095 ix86_print_operand (file, x, 0);
18096 putc (']', file);
18097 return;
18099 break;
18101 default:
18102 gcc_unreachable ();
18105 ix86_print_operand (file, x, 0);
18106 return;
18108 case 'E':
18109 /* Wrap address in an UNSPEC to declare special handling. */
18110 if (TARGET_64BIT)
18111 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
18113 output_address (VOIDmode, x);
18114 return;
18116 case 'L':
18117 if (ASSEMBLER_DIALECT == ASM_ATT)
18118 putc ('l', file);
18119 return;
18121 case 'W':
18122 if (ASSEMBLER_DIALECT == ASM_ATT)
18123 putc ('w', file);
18124 return;
18126 case 'B':
18127 if (ASSEMBLER_DIALECT == ASM_ATT)
18128 putc ('b', file);
18129 return;
18131 case 'Q':
18132 if (ASSEMBLER_DIALECT == ASM_ATT)
18133 putc ('l', file);
18134 return;
18136 case 'S':
18137 if (ASSEMBLER_DIALECT == ASM_ATT)
18138 putc ('s', file);
18139 return;
18141 case 'T':
18142 if (ASSEMBLER_DIALECT == ASM_ATT)
18143 putc ('t', file);
18144 return;
18146 case 'O':
18147 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18148 if (ASSEMBLER_DIALECT != ASM_ATT)
18149 return;
18151 switch (GET_MODE_SIZE (GET_MODE (x)))
18153 case 2:
18154 putc ('w', file);
18155 break;
18157 case 4:
18158 putc ('l', file);
18159 break;
18161 case 8:
18162 putc ('q', file);
18163 break;
18165 default:
18166 output_operand_lossage ("invalid operand size for operand "
18167 "code 'O'");
18168 return;
18171 putc ('.', file);
18172 #endif
18173 return;
18175 case 'z':
18176 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18178 /* Opcodes don't get size suffixes when using Intel syntax. */
18179 if (ASSEMBLER_DIALECT == ASM_INTEL)
18180 return;
18182 switch (GET_MODE_SIZE (GET_MODE (x)))
18184 case 1:
18185 putc ('b', file);
18186 return;
18188 case 2:
18189 putc ('w', file);
18190 return;
18192 case 4:
18193 putc ('l', file);
18194 return;
18196 case 8:
18197 putc ('q', file);
18198 return;
18200 default:
18201 output_operand_lossage ("invalid operand size for operand "
18202 "code 'z'");
18203 return;
18207 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18208 warning (0, "non-integer operand used with operand code 'z'");
18209 /* FALLTHRU */
18211 case 'Z':
18212 /* 387 opcodes don't get size suffixes when using Intel syntax. */
18213 if (ASSEMBLER_DIALECT == ASM_INTEL)
18214 return;
18216 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18218 switch (GET_MODE_SIZE (GET_MODE (x)))
18220 case 2:
18221 #ifdef HAVE_AS_IX86_FILDS
18222 putc ('s', file);
18223 #endif
18224 return;
18226 case 4:
18227 putc ('l', file);
18228 return;
18230 case 8:
18231 #ifdef HAVE_AS_IX86_FILDQ
18232 putc ('q', file);
18233 #else
18234 fputs ("ll", file);
18235 #endif
18236 return;
18238 default:
18239 break;
18242 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18244 /* 387 opcodes don't get size suffixes
18245 if the operands are registers. */
18246 if (STACK_REG_P (x))
18247 return;
18249 switch (GET_MODE_SIZE (GET_MODE (x)))
18251 case 4:
18252 putc ('s', file);
18253 return;
18255 case 8:
18256 putc ('l', file);
18257 return;
18259 case 12:
18260 case 16:
18261 putc ('t', file);
18262 return;
18264 default:
18265 break;
18268 else
18270 output_operand_lossage ("invalid operand type used with "
18271 "operand code 'Z'");
18272 return;
18275 output_operand_lossage ("invalid operand size for operand code 'Z'");
18276 return;
18278 case 'd':
18279 case 'b':
18280 case 'w':
18281 case 'k':
18282 case 'q':
18283 case 'h':
18284 case 't':
18285 case 'g':
18286 case 'y':
18287 case 'x':
18288 case 'X':
18289 case 'P':
18290 case 'p':
18291 case 'V':
18292 break;
18294 case 's':
18295 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
18297 ix86_print_operand (file, x, 0);
18298 fputs (", ", file);
18300 return;
18302 case 'Y':
18303 switch (GET_CODE (x))
18305 case NE:
18306 fputs ("neq", file);
18307 break;
18308 case EQ:
18309 fputs ("eq", file);
18310 break;
18311 case GE:
18312 case GEU:
18313 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
18314 break;
18315 case GT:
18316 case GTU:
18317 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18318 break;
18319 case LE:
18320 case LEU:
18321 fputs ("le", file);
18322 break;
18323 case LT:
18324 case LTU:
18325 fputs ("lt", file);
18326 break;
18327 case UNORDERED:
18328 fputs ("unord", file);
18329 break;
18330 case ORDERED:
18331 fputs ("ord", file);
18332 break;
18333 case UNEQ:
18334 fputs ("ueq", file);
18335 break;
18336 case UNGE:
18337 fputs ("nlt", file);
18338 break;
18339 case UNGT:
18340 fputs ("nle", file);
18341 break;
18342 case UNLE:
18343 fputs ("ule", file);
18344 break;
18345 case UNLT:
18346 fputs ("ult", file);
18347 break;
18348 case LTGT:
18349 fputs ("une", file);
18350 break;
18351 default:
18352 output_operand_lossage ("operand is not a condition code, "
18353 "invalid operand code 'Y'");
18354 return;
18356 return;
18358 case 'D':
18359 /* A bit of braindamage here: the SSE compare instructions
18360 use completely different names for the comparisons than the
18361 fp conditional moves do. */
18362 switch (GET_CODE (x))
18364 case UNEQ:
18365 if (TARGET_AVX)
18367 fputs ("eq_us", file);
18368 break;
18370 /* FALLTHRU */
18371 case EQ:
18372 fputs ("eq", file);
18373 break;
18374 case UNLT:
18375 if (TARGET_AVX)
18377 fputs ("nge", file);
18378 break;
18380 /* FALLTHRU */
18381 case LT:
18382 fputs ("lt", file);
18383 break;
18384 case UNLE:
18385 if (TARGET_AVX)
18387 fputs ("ngt", file);
18388 break;
18390 /* FALLTHRU */
18391 case LE:
18392 fputs ("le", file);
18393 break;
18394 case UNORDERED:
18395 fputs ("unord", file);
18396 break;
18397 case LTGT:
18398 if (TARGET_AVX)
18400 fputs ("neq_oq", file);
18401 break;
18403 /* FALLTHRU */
18404 case NE:
18405 fputs ("neq", file);
18406 break;
18407 case GE:
18408 if (TARGET_AVX)
18410 fputs ("ge", file);
18411 break;
18413 /* FALLTHRU */
18414 case UNGE:
18415 fputs ("nlt", file);
18416 break;
18417 case GT:
18418 if (TARGET_AVX)
18420 fputs ("gt", file);
18421 break;
18423 /* FALLTHRU */
18424 case UNGT:
18425 fputs ("nle", file);
18426 break;
18427 case ORDERED:
18428 fputs ("ord", file);
18429 break;
18430 default:
18431 output_operand_lossage ("operand is not a condition code, "
18432 "invalid operand code 'D'");
18433 return;
18435 return;
18437 case 'F':
18438 case 'f':
18439 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18440 if (ASSEMBLER_DIALECT == ASM_ATT)
18441 putc ('.', file);
18442 gcc_fallthrough ();
18443 #endif
18445 case 'C':
18446 case 'c':
18447 if (!COMPARISON_P (x))
18449 output_operand_lossage ("operand is not a condition code, "
18450 "invalid operand code '%c'", code);
18451 return;
18453 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
18454 code == 'c' || code == 'f',
18455 code == 'F' || code == 'f',
18456 file);
18457 return;
18459 case 'H':
18460 if (!offsettable_memref_p (x))
18462 output_operand_lossage ("operand is not an offsettable memory "
18463 "reference, invalid operand code 'H'");
18464 return;
18466 /* It doesn't actually matter what mode we use here, as we're
18467 only going to use this for printing. */
18468 x = adjust_address_nv (x, DImode, 8);
18469 /* Output 'qword ptr' for intel assembler dialect. */
18470 if (ASSEMBLER_DIALECT == ASM_INTEL)
18471 code = 'q';
18472 break;
18474 case 'K':
18475 if (!CONST_INT_P (x))
18477 output_operand_lossage ("operand is not an integer, invalid "
18478 "operand code 'K'");
18479 return;
18482 if (INTVAL (x) & IX86_HLE_ACQUIRE)
18483 #ifdef HAVE_AS_IX86_HLE
18484 fputs ("xacquire ", file);
18485 #else
18486 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
18487 #endif
18488 else if (INTVAL (x) & IX86_HLE_RELEASE)
18489 #ifdef HAVE_AS_IX86_HLE
18490 fputs ("xrelease ", file);
18491 #else
18492 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
18493 #endif
18494 /* We do not want to print the value of the operand. */
18495 return;
18497 case 'N':
18498 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
18499 fputs ("{z}", file);
18500 return;
18502 case 'r':
18503 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
18505 output_operand_lossage ("operand is not a specific integer, "
18506 "invalid operand code 'r'");
18507 return;
18510 if (ASSEMBLER_DIALECT == ASM_INTEL)
18511 fputs (", ", file);
18513 fputs ("{sae}", file);
18515 if (ASSEMBLER_DIALECT == ASM_ATT)
18516 fputs (", ", file);
18518 return;
18520 case 'R':
18521 if (!CONST_INT_P (x))
18523 output_operand_lossage ("operand is not an integer, invalid "
18524 "operand code 'R'");
18525 return;
18528 if (ASSEMBLER_DIALECT == ASM_INTEL)
18529 fputs (", ", file);
18531 switch (INTVAL (x))
18533 case ROUND_NEAREST_INT | ROUND_SAE:
18534 fputs ("{rn-sae}", file);
18535 break;
18536 case ROUND_NEG_INF | ROUND_SAE:
18537 fputs ("{rd-sae}", file);
18538 break;
18539 case ROUND_POS_INF | ROUND_SAE:
18540 fputs ("{ru-sae}", file);
18541 break;
18542 case ROUND_ZERO | ROUND_SAE:
18543 fputs ("{rz-sae}", file);
18544 break;
18545 default:
18546 output_operand_lossage ("operand is not a specific integer, "
18547 "invalid operand code 'R'");
18550 if (ASSEMBLER_DIALECT == ASM_ATT)
18551 fputs (", ", file);
18553 return;
18555 case '*':
18556 if (ASSEMBLER_DIALECT == ASM_ATT)
18557 putc ('*', file);
18558 return;
18560 case '&':
18562 const char *name = get_some_local_dynamic_name ();
18563 if (name == NULL)
18564 output_operand_lossage ("'%%&' used without any "
18565 "local dynamic TLS references");
18566 else
18567 assemble_name (file, name);
18568 return;
18571 case '+':
18573 rtx x;
18575 if (!optimize
18576 || optimize_function_for_size_p (cfun)
18577 || !TARGET_BRANCH_PREDICTION_HINTS)
18578 return;
18580 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18581 if (x)
18583 int pred_val = profile_probability::from_reg_br_prob_note
18584 (XINT (x, 0)).to_reg_br_prob_base ();
18586 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18587 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18589 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18590 bool cputaken
18591 = final_forward_branch_p (current_output_insn) == 0;
18593 /* Emit hints only in the case where the default branch prediction
18594 heuristics would fail. */
18595 if (taken != cputaken)
18597 /* We use 3e (DS) prefix for taken branches and
18598 2e (CS) prefix for not taken branches. */
18599 if (taken)
18600 fputs ("ds ; ", file);
18601 else
18602 fputs ("cs ; ", file);
18606 return;
18609 case ';':
18610 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18611 putc (';', file);
18612 #endif
18613 return;
18615 case '~':
18616 putc (TARGET_AVX2 ? 'i' : 'f', file);
18617 return;
18619 case '^':
18620 if (TARGET_64BIT && Pmode != word_mode)
18621 fputs ("addr32 ", file);
18622 return;
18624 case '!':
18625 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18626 fputs ("bnd ", file);
18627 if (ix86_notrack_prefixed_insn_p (current_output_insn))
18628 fputs ("notrack ", file);
18629 return;
18631 default:
18632 output_operand_lossage ("invalid operand code '%c'", code);
18636 if (REG_P (x))
18637 print_reg (x, code, file);
18639 else if (MEM_P (x))
18641 rtx addr = XEXP (x, 0);
18643 /* No `byte ptr' prefix for call instructions ... */
18644 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18646 machine_mode mode = GET_MODE (x);
18647 const char *size;
18649 /* Check for explicit size override codes. */
18650 if (code == 'b')
18651 size = "BYTE";
18652 else if (code == 'w')
18653 size = "WORD";
18654 else if (code == 'k')
18655 size = "DWORD";
18656 else if (code == 'q')
18657 size = "QWORD";
18658 else if (code == 'x')
18659 size = "XMMWORD";
18660 else if (code == 't')
18661 size = "YMMWORD";
18662 else if (code == 'g')
18663 size = "ZMMWORD";
18664 else if (mode == BLKmode)
18665 /* ... or BLKmode operands, when not overridden. */
18666 size = NULL;
18667 else
18668 switch (GET_MODE_SIZE (mode))
18670 case 1: size = "BYTE"; break;
18671 case 2: size = "WORD"; break;
18672 case 4: size = "DWORD"; break;
18673 case 8: size = "QWORD"; break;
18674 case 12: size = "TBYTE"; break;
18675 case 16:
18676 if (mode == XFmode)
18677 size = "TBYTE";
18678 else
18679 size = "XMMWORD";
18680 break;
18681 case 32: size = "YMMWORD"; break;
18682 case 64: size = "ZMMWORD"; break;
18683 default:
18684 gcc_unreachable ();
18686 if (size)
18688 fputs (size, file);
18689 fputs (" PTR ", file);
18693 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18694 output_operand_lossage ("invalid constraints for operand");
18695 else
18696 ix86_print_operand_address_as
18697 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18700 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18702 long l;
18704 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18706 if (ASSEMBLER_DIALECT == ASM_ATT)
18707 putc ('$', file);
18708 /* Sign-extend the 32-bit SFmode immediate to 8 bytes. */
18709 if (code == 'q')
18710 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18711 (unsigned long long) (int) l);
18712 else
18713 fprintf (file, "0x%08x", (unsigned int) l);
18716 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18718 long l[2];
18720 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18722 if (ASSEMBLER_DIALECT == ASM_ATT)
18723 putc ('$', file);
18724 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18727 /* These float cases don't actually occur as immediate operands. */
18728 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18730 char dstr[30];
18732 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18733 fputs (dstr, file);
18736 else
18738 /* We have patterns that allow zero sets of memory, for instance.
18739 In 64-bit mode, we should probably support all 8-byte vectors,
18740 since we can in fact encode that into an immediate. */
18741 if (GET_CODE (x) == CONST_VECTOR)
18743 if (x != CONST0_RTX (GET_MODE (x)))
18744 output_operand_lossage ("invalid vector immediate");
18745 x = const0_rtx;
18748 if (code != 'P' && code != 'p')
18750 if (CONST_INT_P (x))
18752 if (ASSEMBLER_DIALECT == ASM_ATT)
18753 putc ('$', file);
18755 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18756 || GET_CODE (x) == LABEL_REF)
18758 if (ASSEMBLER_DIALECT == ASM_ATT)
18759 putc ('$', file);
18760 else
18761 fputs ("OFFSET FLAT:", file);
18764 if (CONST_INT_P (x))
18765 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18766 else if (flag_pic || MACHOPIC_INDIRECT)
18767 output_pic_addr_const (file, x, code);
18768 else
18769 output_addr_const (file, x);
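/* The same modifiers are available to extended asm, e.g. (a minimal
   user-level sketch, not part of the compiler itself):

     int
     swap_low_bytes (int x)
     {
       // %b0 is the low byte (%al) and %h0 the high byte (%ah) of the
       // register holding X; the "Q" constraint restricts X to the
       // a/b/c/d registers so those byte names exist.
       __asm__ ("xchgb\t%h0, %b0" : "+Q" (x));
       return x;
     }  */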
18773 static bool
18774 ix86_print_operand_punct_valid_p (unsigned char code)
18776 return (code == '*' || code == '+' || code == '&' || code == ';'
18777 || code == '~' || code == '^' || code == '!');
18780 /* Print a memory operand whose address is ADDR. */
18782 static void
18783 ix86_print_operand_address_as (FILE *file, rtx addr,
18784 addr_space_t as, bool no_rip)
18786 struct ix86_address parts;
18787 rtx base, index, disp;
18788 int scale;
18789 int ok;
18790 bool vsib = false;
18791 int code = 0;
18793 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18795 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18796 gcc_assert (parts.index == NULL_RTX);
18797 parts.index = XVECEXP (addr, 0, 1);
18798 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18799 addr = XVECEXP (addr, 0, 0);
18800 vsib = true;
18802 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18804 gcc_assert (TARGET_64BIT);
18805 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18806 code = 'q';
18808 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18810 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18811 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18812 if (parts.base != NULL_RTX)
18814 parts.index = parts.base;
18815 parts.scale = 1;
18817 parts.base = XVECEXP (addr, 0, 0);
18818 addr = XVECEXP (addr, 0, 0);
18820 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18822 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18823 gcc_assert (parts.index == NULL_RTX);
18824 parts.index = XVECEXP (addr, 0, 1);
18825 addr = XVECEXP (addr, 0, 0);
18827 else
18828 ok = ix86_decompose_address (addr, &parts);
18830 gcc_assert (ok);
18832 base = parts.base;
18833 index = parts.index;
18834 disp = parts.disp;
18835 scale = parts.scale;
18837 if (ADDR_SPACE_GENERIC_P (as))
18838 as = parts.seg;
18839 else
18840 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18842 if (!ADDR_SPACE_GENERIC_P (as))
18844 const char *string;
18846 if (as == ADDR_SPACE_SEG_FS)
18847 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18848 else if (as == ADDR_SPACE_SEG_GS)
18849 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18850 else
18851 gcc_unreachable ();
18852 fputs (string, file);
18855 /* Use one byte shorter RIP relative addressing for 64bit mode. */
18856 if (TARGET_64BIT && !base && !index && !no_rip)
18858 rtx symbol = disp;
18860 if (GET_CODE (disp) == CONST
18861 && GET_CODE (XEXP (disp, 0)) == PLUS
18862 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18863 symbol = XEXP (XEXP (disp, 0), 0);
18865 if (GET_CODE (symbol) == LABEL_REF
18866 || (GET_CODE (symbol) == SYMBOL_REF
18867 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18868 base = pc_rtx;
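/* For illustration (symbol name assumed): with BASE set to pc_rtx the
   operand is printed roughly as "foo(%rip)" in AT&T syntax (or "foo[rip]"
   in Intel syntax) instead of a 32-bit absolute address, which is both
   one byte shorter to encode and position-independent.  */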
18871 if (!base && !index)
18873 /* Displacement-only addresses require special attention. */
18874 if (CONST_INT_P (disp))
18876 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18877 fputs ("ds:", file);
18878 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18880 /* Load the external function address via the GOT slot to avoid PLT. */
18881 else if (GET_CODE (disp) == CONST
18882 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18883 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18884 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18885 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18886 output_pic_addr_const (file, disp, 0);
18887 else if (flag_pic)
18888 output_pic_addr_const (file, disp, 0);
18889 else
18890 output_addr_const (file, disp);
18892 else
18894 /* Print SImode register names to force addr32 prefix. */
18895 if (SImode_address_operand (addr, VOIDmode))
18897 if (flag_checking)
18899 gcc_assert (TARGET_64BIT);
18900 switch (GET_CODE (addr))
18902 case SUBREG:
18903 gcc_assert (GET_MODE (addr) == SImode);
18904 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18905 break;
18906 case ZERO_EXTEND:
18907 case AND:
18908 gcc_assert (GET_MODE (addr) == DImode);
18909 break;
18910 default:
18911 gcc_unreachable ();
18914 gcc_assert (!code);
18915 code = 'k';
18917 else if (code == 0
18918 && TARGET_X32
18919 && disp
18920 && CONST_INT_P (disp)
18921 && INTVAL (disp) < -16*1024*1024)
18923 /* X32 runs in 64-bit mode, where displacement, DISP, in
18924 address DISP(%r64), is encoded as 32-bit immediate sign-
18925 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18926 address is %r64 + 0xffffffffbffffd00. When %r64 <
18927 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18928 which is invalid for x32. The correct address is %r64
18929 - 0x40000300 == 0xf7ffdd64. To properly encode
18930 -0x40000300(%r64) for x32, we zero-extend negative
18931 displacement by forcing addr32 prefix which truncates
18932 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18933 zero-extend all negative displacements, including -1(%rsp).
18934 However, for small negative displacements, sign-extension
18935 won't cause overflow. We only zero-extend negative
18936 displacements if they are < -16*1024*1024, the same bound that is
18937 used to check legitimate address displacements for PIC. */
18938 code = 'k';
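/* For illustration (register and displacement assumed): with CODE set to
   'k' the base register is printed under its 32-bit name, e.g.
   -0x40000300(%eax) rather than -0x40000300(%rax), so the assembler adds
   the 0x67 addr32 prefix and the displacement is zero-extended as
   described above.  */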
18941 /* Since the upper 32 bits of RSP are always zero for x32,
18942 we can encode %esp as %rsp to avoid 0x67 prefix if
18943 there is no index register. */
18944 if (TARGET_X32 && Pmode == SImode
18945 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18946 code = 'q';
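/* For illustration: (%esp) is then printed as (%rsp), which accesses the
   same address under x32 (the upper 32 bits of RSP are zero) but does not
   need the 0x67 address-size prefix.  */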
18948 if (ASSEMBLER_DIALECT == ASM_ATT)
18950 if (disp)
18952 if (flag_pic)
18953 output_pic_addr_const (file, disp, 0);
18954 else if (GET_CODE (disp) == LABEL_REF)
18955 output_asm_label (disp);
18956 else
18957 output_addr_const (file, disp);
18960 putc ('(', file);
18961 if (base)
18962 print_reg (base, code, file);
18963 if (index)
18965 putc (',', file);
18966 print_reg (index, vsib ? 0 : code, file);
18967 if (scale != 1 || vsib)
18968 fprintf (file, ",%d", scale);
18970 putc (')', file);
18972 else
18974 rtx offset = NULL_RTX;
18976 if (disp)
18978 /* Pull out the offset of a symbol; print any symbol itself. */
18979 if (GET_CODE (disp) == CONST
18980 && GET_CODE (XEXP (disp, 0)) == PLUS
18981 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18983 offset = XEXP (XEXP (disp, 0), 1);
18984 disp = gen_rtx_CONST (VOIDmode,
18985 XEXP (XEXP (disp, 0), 0));
18988 if (flag_pic)
18989 output_pic_addr_const (file, disp, 0);
18990 else if (GET_CODE (disp) == LABEL_REF)
18991 output_asm_label (disp);
18992 else if (CONST_INT_P (disp))
18993 offset = disp;
18994 else
18995 output_addr_const (file, disp);
18998 putc ('[', file);
18999 if (base)
19001 print_reg (base, code, file);
19002 if (offset)
19004 if (INTVAL (offset) >= 0)
19005 putc ('+', file);
19006 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19009 else if (offset)
19010 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19011 else
19012 putc ('0', file);
19014 if (index)
19016 putc ('+', file);
19017 print_reg (index, vsib ? 0 : code, file);
19018 if (scale != 1 || vsib)
19019 fprintf (file, "*%d", scale);
19021 putc (']', file);
19026 static void
19027 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
19029 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
19032 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
19034 static bool
19035 i386_asm_output_addr_const_extra (FILE *file, rtx x)
19037 rtx op;
19039 if (GET_CODE (x) != UNSPEC)
19040 return false;
19042 op = XVECEXP (x, 0, 0);
19043 switch (XINT (x, 1))
19045 case UNSPEC_GOTOFF:
19046 output_addr_const (file, op);
19047 fputs ("@gotoff", file);
19048 break;
19049 case UNSPEC_GOTTPOFF:
19050 output_addr_const (file, op);
19051 /* FIXME: This might be @TPOFF in Sun ld. */
19052 fputs ("@gottpoff", file);
19053 break;
19054 case UNSPEC_TPOFF:
19055 output_addr_const (file, op);
19056 fputs ("@tpoff", file);
19057 break;
19058 case UNSPEC_NTPOFF:
19059 output_addr_const (file, op);
19060 if (TARGET_64BIT)
19061 fputs ("@tpoff", file);
19062 else
19063 fputs ("@ntpoff", file);
19064 break;
19065 case UNSPEC_DTPOFF:
19066 output_addr_const (file, op);
19067 fputs ("@dtpoff", file);
19068 break;
19069 case UNSPEC_GOTNTPOFF:
19070 output_addr_const (file, op);
19071 if (TARGET_64BIT)
19072 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
19073 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
19074 else
19075 fputs ("@gotntpoff", file);
19076 break;
19077 case UNSPEC_INDNTPOFF:
19078 output_addr_const (file, op);
19079 fputs ("@indntpoff", file);
19080 break;
19081 #if TARGET_MACHO
19082 case UNSPEC_MACHOPIC_OFFSET:
19083 output_addr_const (file, op);
19084 putc ('-', file);
19085 machopic_output_function_base_name (file);
19086 break;
19087 #endif
19089 default:
19090 return false;
19093 return true;
19096 /* Split one or more double-mode RTL references into pairs of half-mode
19097 references. The RTL can be REG, offsettable MEM, integer constant, or
19098 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
19099 split and "num" is its length. lo_half and hi_half are output arrays
19100 that parallel "operands". */
19102 void
19103 split_double_mode (machine_mode mode, rtx operands[],
19104 int num, rtx lo_half[], rtx hi_half[])
19106 machine_mode half_mode;
19107 unsigned int byte;
19109 switch (mode)
19111 case E_TImode:
19112 half_mode = DImode;
19113 break;
19114 case E_DImode:
19115 half_mode = SImode;
19116 break;
19117 default:
19118 gcc_unreachable ();
19121 byte = GET_MODE_SIZE (half_mode);
19123 while (num--)
19125 rtx op = operands[num];
19127 /* simplify_subreg refuses to split volatile memory references,
19128 but we still have to handle them. */
19129 if (MEM_P (op))
19131 lo_half[num] = adjust_address (op, half_mode, 0);
19132 hi_half[num] = adjust_address (op, half_mode, byte);
19134 else
19136 lo_half[num] = simplify_gen_subreg (half_mode, op,
19137 GET_MODE (op) == VOIDmode
19138 ? mode : GET_MODE (op), 0);
19139 hi_half[num] = simplify_gen_subreg (half_mode, op,
19140 GET_MODE (op) == VOIDmode
19141 ? mode : GET_MODE (op), byte);
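/* A minimal example of the splitting (offsets assume the little-endian
   x86 layout): a DImode MEM at address A yields an SImode low half at A
   and an SImode high half at A + 4; a CONST_INT such as 0x100000002
   yields the halves 2 (lo) and 1 (hi).  */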
19146 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
19147 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
19148 is the expression of the binary operation. The output may either be
19149 emitted here, or returned to the caller, like all output_* functions.
19151 There is no guarantee that the operands are the same mode, as they
19152 might be within FLOAT or FLOAT_EXTEND expressions. */
19154 #ifndef SYSV386_COMPAT
19155 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
19156 wants to fix the assemblers because that causes incompatibility
19157 with gcc. No-one wants to fix gcc because that causes
19158 incompatibility with assemblers... You can use the option of
19159 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19160 #define SYSV386_COMPAT 1
19161 #endif
19163 const char *
19164 output_387_binary_op (rtx_insn *insn, rtx *operands)
19166 static char buf[40];
19167 const char *p;
19168 bool is_sse
19169 = (SSE_REG_P (operands[0])
19170 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
19172 if (is_sse)
19173 p = "%v";
19174 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19175 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19176 p = "fi";
19177 else
19178 p = "f";
19180 strcpy (buf, p);
19182 switch (GET_CODE (operands[3]))
19184 case PLUS:
19185 p = "add"; break;
19186 case MINUS:
19187 p = "sub"; break;
19188 case MULT:
19189 p = "mul"; break;
19190 case DIV:
19191 p = "div"; break;
19192 default:
19193 gcc_unreachable ();
19196 strcat (buf, p);
19198 if (is_sse)
19200 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
19201 strcat (buf, p);
19203 if (TARGET_AVX)
19204 p = "\t{%2, %1, %0|%0, %1, %2}";
19205 else
19206 p = "\t{%2, %0|%0, %2}";
19208 strcat (buf, p);
19209 return buf;
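/* For example (operands assumed): an SFmode addition on SSE registers
   builds the template "%vaddss\t{%2, %1, %0|%0, %1, %2}", which prints
   e.g. "vaddss %xmm2, %xmm1, %xmm0" in AT&T syntax when AVX is enabled.  */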
19212 /* Even if we do not want to check the inputs, this documents the input
19213 constraints, which helps in understanding the following code. */
19214 if (flag_checking)
19216 if (STACK_REG_P (operands[0])
19217 && ((REG_P (operands[1])
19218 && REGNO (operands[0]) == REGNO (operands[1])
19219 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19220 || (REG_P (operands[2])
19221 && REGNO (operands[0]) == REGNO (operands[2])
19222 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19223 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19224 ; /* ok */
19225 else
19226 gcc_unreachable ();
19229 switch (GET_CODE (operands[3]))
19231 case MULT:
19232 case PLUS:
19233 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19234 std::swap (operands[1], operands[2]);
19236 /* We know operands[0] == operands[1] here. */
19238 if (MEM_P (operands[2]))
19240 p = "%Z2\t%2";
19241 break;
19244 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19246 if (STACK_TOP_P (operands[0]))
19247 /* How is it that we are storing to a dead operand[2]?
19248 Well, presumably operands[1] is dead too. We can't
19249 store the result to st(0) as st(0) gets popped on this
19250 instruction. Instead store to operands[2] (which I
19251 think has to be st(1)). st(1) will be popped later.
19252 gcc <= 2.8.1 didn't have this check and generated
19253 assembly code that the Unixware assembler rejected. */
19254 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19255 else
19256 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19257 break;
19260 if (STACK_TOP_P (operands[0]))
19261 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19262 else
19263 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19264 break;
19266 case MINUS:
19267 case DIV:
19268 if (MEM_P (operands[1]))
19270 p = "r%Z1\t%1";
19271 break;
19274 if (MEM_P (operands[2]))
19276 p = "%Z2\t%2";
19277 break;
19280 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19282 #if SYSV386_COMPAT
19283 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19284 derived assemblers, confusingly reverse the direction of
19285 the operation for fsub{r} and fdiv{r} when the
19286 destination register is not st(0). The Intel assembler
19287 doesn't have this brain damage. Read !SYSV386_COMPAT to
19288 figure out what the hardware really does. */
19289 if (STACK_TOP_P (operands[0]))
19290 p = "{p\t%0, %2|rp\t%2, %0}";
19291 else
19292 p = "{rp\t%2, %0|p\t%0, %2}";
19293 #else
19294 if (STACK_TOP_P (operands[0]))
19295 /* As above for fmul/fadd, we can't store to st(0). */
19296 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19297 else
19298 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19299 #endif
19300 break;
19303 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19305 #if SYSV386_COMPAT
19306 if (STACK_TOP_P (operands[0]))
19307 p = "{rp\t%0, %1|p\t%1, %0}";
19308 else
19309 p = "{p\t%1, %0|rp\t%0, %1}";
19310 #else
19311 if (STACK_TOP_P (operands[0]))
19312 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
19313 else
19314 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
19315 #endif
19316 break;
19319 if (STACK_TOP_P (operands[0]))
19321 if (STACK_TOP_P (operands[1]))
19322 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19323 else
19324 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
19325 break;
19327 else if (STACK_TOP_P (operands[1]))
19329 #if SYSV386_COMPAT
19330 p = "{\t%1, %0|r\t%0, %1}";
19331 #else
19332 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
19333 #endif
19335 else
19337 #if SYSV386_COMPAT
19338 p = "{r\t%2, %0|\t%0, %2}";
19339 #else
19340 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19341 #endif
19343 break;
19345 default:
19346 gcc_unreachable ();
19349 strcat (buf, p);
19350 return buf;
19353 /* Return needed mode for entity in optimize_mode_switching pass. */
19355 static int
19356 ix86_dirflag_mode_needed (rtx_insn *insn)
19358 if (CALL_P (insn))
19360 if (cfun->machine->func_type == TYPE_NORMAL)
19361 return X86_DIRFLAG_ANY;
19362 else
19363 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
19364 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19367 if (recog_memoized (insn) < 0)
19368 return X86_DIRFLAG_ANY;
19370 if (get_attr_type (insn) == TYPE_STR)
19372 /* Emit cld instruction if stringops are used in the function. */
19373 if (cfun->machine->func_type == TYPE_NORMAL)
19374 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19375 else
19376 return X86_DIRFLAG_RESET;
19379 return X86_DIRFLAG_ANY;
19382 /* Check if a 256bit or 512bit AVX register is referenced inside EXP. */
19384 static bool
19385 ix86_check_avx_upper_register (const_rtx exp)
19387 if (SUBREG_P (exp))
19388 exp = SUBREG_REG (exp);
19390 return (REG_P (exp)
19391 && (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
19392 || VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
19395 /* Return needed mode for entity in optimize_mode_switching pass. */
19397 static int
19398 ix86_avx_u128_mode_needed (rtx_insn *insn)
19400 if (CALL_P (insn))
19402 rtx link;
19404 /* Needed mode is set to AVX_U128_CLEAN if there are
19405 no 256bit or 512bit modes used in function arguments. */
19406 for (link = CALL_INSN_FUNCTION_USAGE (insn);
19407 link;
19408 link = XEXP (link, 1))
19410 if (GET_CODE (XEXP (link, 0)) == USE)
19412 rtx arg = XEXP (XEXP (link, 0), 0);
19414 if (ix86_check_avx_upper_register (arg))
19415 return AVX_U128_DIRTY;
19419 return AVX_U128_CLEAN;
19422 /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
19423 Hardware changes state only when a 256bit register is written to,
19424 but we need to prevent the compiler from moving the optimal insertion
19425 point above an eventual read from a 256bit or 512bit register. */
19426 subrtx_iterator::array_type array;
19427 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
19428 if (ix86_check_avx_upper_register (*iter))
19429 return AVX_U128_DIRTY;
19431 return AVX_U128_ANY;
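/* For illustration (insn assumed): a load such as
   (set (reg:V8SF ymm1) (mem:V8SF ...)) makes the needed mode
   AVX_U128_DIRTY, so the mode-switching pass will arrange for a
   vzeroupper before a later point that needs AVX_U128_CLEAN
   (e.g. a call that takes no 256bit arguments).  */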
19434 /* Return mode that i387 must be switched into
19435 prior to the execution of insn. */
19437 static int
19438 ix86_i387_mode_needed (int entity, rtx_insn *insn)
19440 enum attr_i387_cw mode;
19442 /* The mode UNINITIALIZED is used to store the control word after a
19443 function call or ASM pattern. The mode ANY specifies that the function
19444 has no requirements on the control word and makes no changes to the
19445 bits we are interested in.
19447 if (CALL_P (insn)
19448 || (NONJUMP_INSN_P (insn)
19449 && (asm_noperands (PATTERN (insn)) >= 0
19450 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
19451 return I387_CW_UNINITIALIZED;
19453 if (recog_memoized (insn) < 0)
19454 return I387_CW_ANY;
19456 mode = get_attr_i387_cw (insn);
19458 switch (entity)
19460 case I387_TRUNC:
19461 if (mode == I387_CW_TRUNC)
19462 return mode;
19463 break;
19465 case I387_FLOOR:
19466 if (mode == I387_CW_FLOOR)
19467 return mode;
19468 break;
19470 case I387_CEIL:
19471 if (mode == I387_CW_CEIL)
19472 return mode;
19473 break;
19475 case I387_MASK_PM:
19476 if (mode == I387_CW_MASK_PM)
19477 return mode;
19478 break;
19480 default:
19481 gcc_unreachable ();
19484 return I387_CW_ANY;
19487 /* Return mode that entity must be switched into
19488 prior to the execution of insn. */
19490 static int
19491 ix86_mode_needed (int entity, rtx_insn *insn)
19493 switch (entity)
19495 case X86_DIRFLAG:
19496 return ix86_dirflag_mode_needed (insn);
19497 case AVX_U128:
19498 return ix86_avx_u128_mode_needed (insn);
19499 case I387_TRUNC:
19500 case I387_FLOOR:
19501 case I387_CEIL:
19502 case I387_MASK_PM:
19503 return ix86_i387_mode_needed (entity, insn);
19504 default:
19505 gcc_unreachable ();
19507 return 0;
19510 /* Check if a 256bit or 512bit AVX register is referenced in stores. */
19512 static void
19513 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
19515 if (ix86_check_avx_upper_register (dest))
19517 bool *used = (bool *) data;
19518 *used = true;
19522 /* Calculate mode of upper 128bit AVX registers after the insn. */
19524 static int
19525 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19527 rtx pat = PATTERN (insn);
19529 if (vzeroupper_operation (pat, VOIDmode)
19530 || vzeroall_operation (pat, VOIDmode))
19531 return AVX_U128_CLEAN;
19533 /* We know that state is clean after CALL insn if there are no
19534 256bit or 512bit registers used in the function return register. */
19535 if (CALL_P (insn))
19537 bool avx_upper_reg_found = false;
19538 note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
19540 return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19543 /* Otherwise, return the current mode. Remember that if the insn
19544 references AVX 256bit or 512bit registers, the mode was already
19545 changed to DIRTY from MODE_NEEDED. */
19546 return mode;
19549 /* Return the mode that an insn results in. */
19551 static int
19552 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19554 switch (entity)
19556 case X86_DIRFLAG:
19557 return mode;
19558 case AVX_U128:
19559 return ix86_avx_u128_mode_after (mode, insn);
19560 case I387_TRUNC:
19561 case I387_FLOOR:
19562 case I387_CEIL:
19563 case I387_MASK_PM:
19564 return mode;
19565 default:
19566 gcc_unreachable ();
19570 static int
19571 ix86_dirflag_mode_entry (void)
19573 /* For TARGET_CLD or in the interrupt handler we can't assume
19574 direction flag state at function entry. */
19575 if (TARGET_CLD
19576 || cfun->machine->func_type != TYPE_NORMAL)
19577 return X86_DIRFLAG_ANY;
19579 return X86_DIRFLAG_RESET;
19582 static int
19583 ix86_avx_u128_mode_entry (void)
19585 tree arg;
19587 /* Entry mode is set to AVX_U128_DIRTY if there are
19588 256bit or 512bit modes used in function arguments. */
19589 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19590 arg = TREE_CHAIN (arg))
19592 rtx incoming = DECL_INCOMING_RTL (arg);
19594 if (incoming && ix86_check_avx_upper_register (incoming))
19595 return AVX_U128_DIRTY;
19598 return AVX_U128_CLEAN;
19601 /* Return a mode that ENTITY is assumed to be
19602 switched to at function entry. */
19604 static int
19605 ix86_mode_entry (int entity)
19607 switch (entity)
19609 case X86_DIRFLAG:
19610 return ix86_dirflag_mode_entry ();
19611 case AVX_U128:
19612 return ix86_avx_u128_mode_entry ();
19613 case I387_TRUNC:
19614 case I387_FLOOR:
19615 case I387_CEIL:
19616 case I387_MASK_PM:
19617 return I387_CW_ANY;
19618 default:
19619 gcc_unreachable ();
19623 static int
19624 ix86_avx_u128_mode_exit (void)
19626 rtx reg = crtl->return_rtx;
19628 /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
19629 or 512bit modes used in the function return register. */
19630 if (reg && ix86_check_avx_upper_register (reg))
19631 return AVX_U128_DIRTY;
19633 return AVX_U128_CLEAN;
19636 /* Return a mode that ENTITY is assumed to be
19637 switched to at function exit. */
19639 static int
19640 ix86_mode_exit (int entity)
19642 switch (entity)
19644 case X86_DIRFLAG:
19645 return X86_DIRFLAG_ANY;
19646 case AVX_U128:
19647 return ix86_avx_u128_mode_exit ();
19648 case I387_TRUNC:
19649 case I387_FLOOR:
19650 case I387_CEIL:
19651 case I387_MASK_PM:
19652 return I387_CW_ANY;
19653 default:
19654 gcc_unreachable ();
19658 static int
19659 ix86_mode_priority (int, int n)
19661 return n;
19664 /* Output code to initialize control word copies used by trunc?f?i and
19665 rounding patterns. CURRENT_MODE is set to current control word,
19666 while NEW_MODE is set to new control word. */
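/* As a reference for the constants used below (x87 control word layout
   assumed from the architecture manuals): bits 10-11 are the rounding
   control (00 = to nearest, 01 = down, 10 = up, 11 = toward zero) and
   bit 5 masks the precision exception; hence the 0x0c00, 0x0400, 0x0800
   and 0x0020 masks used here.  */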
19668 static void
19669 emit_i387_cw_initialization (int mode)
19671 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19672 rtx new_mode;
19674 enum ix86_stack_slot slot;
19676 rtx reg = gen_reg_rtx (HImode);
19678 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19679 emit_move_insn (reg, copy_rtx (stored_mode));
19681 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19682 || optimize_insn_for_size_p ())
19684 switch (mode)
19686 case I387_CW_TRUNC:
19687 /* round toward zero (truncate) */
19688 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19689 slot = SLOT_CW_TRUNC;
19690 break;
19692 case I387_CW_FLOOR:
19693 /* round down toward -oo */
19694 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19695 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19696 slot = SLOT_CW_FLOOR;
19697 break;
19699 case I387_CW_CEIL:
19700 /* round up toward +oo */
19701 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19702 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19703 slot = SLOT_CW_CEIL;
19704 break;
19706 case I387_CW_MASK_PM:
19707 /* mask precision exception for nearbyint() */
19708 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19709 slot = SLOT_CW_MASK_PM;
19710 break;
19712 default:
19713 gcc_unreachable ();
19716 else
19718 switch (mode)
19720 case I387_CW_TRUNC:
19721 /* round toward zero (truncate) */
19722 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19723 slot = SLOT_CW_TRUNC;
19724 break;
19726 case I387_CW_FLOOR:
19727 /* round down toward -oo */
19728 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19729 slot = SLOT_CW_FLOOR;
19730 break;
19732 case I387_CW_CEIL:
19733 /* round up toward +oo */
19734 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19735 slot = SLOT_CW_CEIL;
19736 break;
19738 case I387_CW_MASK_PM:
19739 /* mask precision exception for nearbyint() */
19740 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19741 slot = SLOT_CW_MASK_PM;
19742 break;
19744 default:
19745 gcc_unreachable ();
19749 gcc_assert (slot < MAX_386_STACK_LOCALS);
19751 new_mode = assign_386_stack_local (HImode, slot);
19752 emit_move_insn (new_mode, reg);
19755 /* Emit vzeroupper. */
19757 void
19758 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19760 int i;
19762 /* Cancel automatic vzeroupper insertion if there are
19763 live call-saved SSE registers at the insertion point. */
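/* (vzeroupper zeroes the upper 128 bits of every YMM register, so the
   insertion is skipped when any call-saved SSE register - possible e.g.
   with the 64-bit MS ABI, where xmm6-xmm15 are callee-saved - is still
   live at this point.)  */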
19765 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19766 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19767 return;
19769 if (TARGET_64BIT)
19770 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19771 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19772 return;
19774 emit_insn (gen_avx_vzeroupper ());
19779 /* Generate one or more insns to set ENTITY to MODE. REGS_LIVE
19780 is the set of hard registers live at the point where the insn(s)
19781 are to be inserted. */
19783 static void
19784 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19785 HARD_REG_SET regs_live)
19787 switch (entity)
19789 case X86_DIRFLAG:
19790 if (mode == X86_DIRFLAG_RESET)
19791 emit_insn (gen_cld ());
19792 break;
19793 case AVX_U128:
19794 if (mode == AVX_U128_CLEAN)
19795 ix86_avx_emit_vzeroupper (regs_live);
19796 break;
19797 case I387_TRUNC:
19798 case I387_FLOOR:
19799 case I387_CEIL:
19800 case I387_MASK_PM:
19801 if (mode != I387_CW_ANY
19802 && mode != I387_CW_UNINITIALIZED)
19803 emit_i387_cw_initialization (mode);
19804 break;
19805 default:
19806 gcc_unreachable ();
19810 /* Output code for INSN to convert a float to a signed int. OPERANDS
19811 are the insn operands. The output may be [HSD]Imode and the input
19812 operand may be [SDX]Fmode. */
19814 const char *
19815 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19817 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19818 bool dimode_p = GET_MODE (operands[0]) == DImode;
19819 int round_mode = get_attr_i387_cw (insn);
19821 static char buf[40];
19822 const char *p;
19824 /* Jump through a hoop or two for DImode, since the hardware has no
19825 non-popping instruction. We used to do this a different way, but
19826 that was somewhat fragile and broke with post-reload splitters. */
19827 if ((dimode_p || fisttp) && !stack_top_dies)
19828 output_asm_insn ("fld\t%y1", operands);
19830 gcc_assert (STACK_TOP_P (operands[1]));
19831 gcc_assert (MEM_P (operands[0]));
19832 gcc_assert (GET_MODE (operands[1]) != TFmode);
19834 if (fisttp)
19835 return "fisttp%Z0\t%0";
19837 strcpy (buf, "fist");
19839 if (round_mode != I387_CW_ANY)
19840 output_asm_insn ("fldcw\t%3", operands);
19842 p = "p%Z0\t%0";
19843 strcat (buf, p + !(stack_top_dies || dimode_p));
19845 output_asm_insn (buf, operands);
19847 if (round_mode != I387_CW_ANY)
19848 output_asm_insn ("fldcw\t%2", operands);
19850 return "";
19853 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19854 have the values zero or one, indicates the ffreep insn's operand
19855 from the OPERANDS array. */
19857 static const char *
19858 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19860 if (TARGET_USE_FFREEP)
19861 #ifdef HAVE_AS_IX86_FFREEP
19862 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19863 #else
19865 static char retval[32];
19866 int regno = REGNO (operands[opno]);
19868 gcc_assert (STACK_REGNO_P (regno));
19870 regno -= FIRST_STACK_REG;
19872 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19873 return retval;
19875 #endif
19877 return opno ? "fstp\t%y1" : "fstp\t%y0";
19881 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19882 should be used. UNORDERED_P is true when fucom should be used. */
19884 const char *
19885 output_fp_compare (rtx_insn *insn, rtx *operands,
19886 bool eflags_p, bool unordered_p)
19888 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19889 bool stack_top_dies;
19891 static char buf[40];
19892 const char *p;
19894 gcc_assert (STACK_TOP_P (xops[0]));
19896 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19898 if (eflags_p)
19900 p = unordered_p ? "fucomi" : "fcomi";
19901 strcpy (buf, p);
19903 p = "p\t{%y1, %0|%0, %y1}";
19904 strcat (buf, p + !stack_top_dies);
19906 return buf;
19909 if (STACK_REG_P (xops[1])
19910 && stack_top_dies
19911 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19913 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19915 /* If both the top of the 387 stack and the other operand (also a
19916 stack register) die, then this must be a
19917 `fcompp' float compare. */
19918 p = unordered_p ? "fucompp" : "fcompp";
19919 strcpy (buf, p);
19921 else if (const0_operand (xops[1], VOIDmode))
19923 gcc_assert (!unordered_p);
19924 strcpy (buf, "ftst");
19926 else
19928 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19930 gcc_assert (!unordered_p);
19931 p = "ficom";
19933 else
19934 p = unordered_p ? "fucom" : "fcom";
19936 strcpy (buf, p);
19938 p = "p%Z2\t%y2";
19939 strcat (buf, p + !stack_top_dies);
19942 output_asm_insn (buf, operands);
19943 return "fnstsw\t%0";
19946 void
19947 ix86_output_addr_vec_elt (FILE *file, int value)
19949 const char *directive = ASM_LONG;
19951 #ifdef ASM_QUAD
19952 if (TARGET_LP64)
19953 directive = ASM_QUAD;
19954 #else
19955 gcc_assert (!TARGET_64BIT);
19956 #endif
19958 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19961 void
19962 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19964 const char *directive = ASM_LONG;
19966 #ifdef ASM_QUAD
19967 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19968 directive = ASM_QUAD;
19969 #else
19970 gcc_assert (!TARGET_64BIT);
19971 #endif
19972 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19973 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19974 fprintf (file, "%s%s%d-%s%d\n",
19975 directive, LPREFIX, value, LPREFIX, rel);
19976 else if (HAVE_AS_GOTOFF_IN_DATA)
19977 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19978 #if TARGET_MACHO
19979 else if (TARGET_MACHO)
19981 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19982 machopic_output_function_base_name (file);
19983 putc ('\n', file);
19985 #endif
19986 else
19987 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19988 GOT_SYMBOL_NAME, LPREFIX, value);
19991 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19992 for the target. */
19994 void
19995 ix86_expand_clear (rtx dest)
19997 rtx tmp;
19999 /* We play register width games, which are only valid after reload. */
20000 gcc_assert (reload_completed);
20002 /* Avoid HImode and its attendant prefix byte. */
20003 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
20004 dest = gen_rtx_REG (SImode, REGNO (dest));
20005 tmp = gen_rtx_SET (dest, const0_rtx);
20007 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
20009 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20010 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
20013 emit_insn (tmp);
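/* For illustration: "xorl %eax, %eax" is two bytes and clobbers the flags
   (hence the CLOBBER of FLAGS_REG above), while "movl $0, %eax" is five
   bytes but leaves the flags untouched, which is why the plain move is
   used only when TARGET_USE_MOV0 asks for it and size is not the
   priority.  */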
20016 void
20017 ix86_expand_move (machine_mode mode, rtx operands[])
20019 rtx op0, op1;
20020 rtx tmp, addend = NULL_RTX;
20021 enum tls_model model;
20023 op0 = operands[0];
20024 op1 = operands[1];
20026 switch (GET_CODE (op1))
20028 case CONST:
20029 tmp = XEXP (op1, 0);
20031 if (GET_CODE (tmp) != PLUS
20032 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
20033 break;
20035 op1 = XEXP (tmp, 0);
20036 addend = XEXP (tmp, 1);
20037 /* FALLTHRU */
20039 case SYMBOL_REF:
20040 model = SYMBOL_REF_TLS_MODEL (op1);
20042 if (model)
20043 op1 = legitimize_tls_address (op1, model, true);
20044 else if (ix86_force_load_from_GOT_p (op1))
20046 /* Load the external function address via GOT slot to avoid PLT. */
20047 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
20048 (TARGET_64BIT
20049 ? UNSPEC_GOTPCREL
20050 : UNSPEC_GOT));
20051 op1 = gen_rtx_CONST (Pmode, op1);
20052 op1 = gen_const_mem (Pmode, op1);
20053 set_mem_alias_set (op1, ix86_GOT_alias_set ());
20055 else
20057 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
20058 if (tmp)
20060 op1 = tmp;
20061 if (!addend)
20062 break;
20064 else
20066 op1 = operands[1];
20067 break;
20071 if (addend)
20073 op1 = force_operand (op1, NULL_RTX);
20074 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
20075 op0, 1, OPTAB_DIRECT);
20077 else
20078 op1 = force_operand (op1, op0);
20080 if (op1 == op0)
20081 return;
20083 op1 = convert_to_mode (mode, op1, 1);
20085 default:
20086 break;
20089 if ((flag_pic || MACHOPIC_INDIRECT)
20090 && symbolic_operand (op1, mode))
20092 if (TARGET_MACHO && !TARGET_64BIT)
20094 #if TARGET_MACHO
20095 /* dynamic-no-pic */
20096 if (MACHOPIC_INDIRECT)
20098 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
20099 ? op0 : gen_reg_rtx (Pmode);
20100 op1 = machopic_indirect_data_reference (op1, temp);
20101 if (MACHOPIC_PURE)
20102 op1 = machopic_legitimize_pic_address (op1, mode,
20103 temp == op1 ? 0 : temp);
20105 if (op0 != op1 && GET_CODE (op0) != MEM)
20107 rtx insn = gen_rtx_SET (op0, op1);
20108 emit_insn (insn);
20109 return;
20111 if (GET_CODE (op0) == MEM)
20112 op1 = force_reg (Pmode, op1);
20113 else
20115 rtx temp = op0;
20116 if (GET_CODE (temp) != REG)
20117 temp = gen_reg_rtx (Pmode);
20118 temp = legitimize_pic_address (op1, temp);
20119 if (temp == op0)
20120 return;
20121 op1 = temp;
20123 /* dynamic-no-pic */
20124 #endif
20126 else
20128 if (MEM_P (op0))
20129 op1 = force_reg (mode, op1);
20130 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
20132 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
20133 op1 = legitimize_pic_address (op1, reg);
20134 if (op0 == op1)
20135 return;
20136 op1 = convert_to_mode (mode, op1, 1);
20140 else
20142 if (MEM_P (op0)
20143 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
20144 || !push_operand (op0, mode))
20145 && MEM_P (op1))
20146 op1 = force_reg (mode, op1);
20148 if (push_operand (op0, mode)
20149 && ! general_no_elim_operand (op1, mode))
20150 op1 = copy_to_mode_reg (mode, op1);
20152 /* Force large constants in 64bit compilation into register
20153 to get them CSEed. */
20154 if (can_create_pseudo_p ()
20155 && (mode == DImode) && TARGET_64BIT
20156 && immediate_operand (op1, mode)
20157 && !x86_64_zext_immediate_operand (op1, VOIDmode)
20158 && !register_operand (op0, mode)
20159 && optimize)
20160 op1 = copy_to_mode_reg (mode, op1);
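/* For illustration (constant assumed): a 64-bit immediate such as
   0x123456789a cannot be stored to memory directly, since only movabs to
   a register accepts a full 64-bit immediate; loading it into a pseudo
   also lets CSE share a single movabs among several uses.  */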
20162 if (can_create_pseudo_p ()
20163 && CONST_DOUBLE_P (op1))
20165 /* If we are loading a floating point constant to a register,
20166 force the value to memory now, since we'll get better code
20167 out the back end. */
20169 op1 = validize_mem (force_const_mem (mode, op1));
20170 if (!register_operand (op0, mode))
20172 rtx temp = gen_reg_rtx (mode);
20173 emit_insn (gen_rtx_SET (temp, op1));
20174 emit_move_insn (op0, temp);
20175 return;
20180 emit_insn (gen_rtx_SET (op0, op1));
20183 void
20184 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20186 rtx op0 = operands[0], op1 = operands[1];
20187 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
20188 psABI, since the biggest alignment for the IA MCU psABI is 4 bytes. */
20189 unsigned int align = (TARGET_IAMCU
20190 ? GET_MODE_BITSIZE (mode)
20191 : GET_MODE_ALIGNMENT (mode));
20193 if (push_operand (op0, VOIDmode))
20194 op0 = emit_move_resolve_push (mode, op0);
20196 /* Force constants other than zero into memory. We do not know how
20197 the instructions used to build constants modify the upper 64 bits
20198 of the register; once we have that information we may be able
20199 to handle some of them more efficiently. */
20200 if (can_create_pseudo_p ()
20201 && (CONSTANT_P (op1)
20202 || (SUBREG_P (op1)
20203 && CONSTANT_P (SUBREG_REG (op1))))
20204 && ((register_operand (op0, mode)
20205 && !standard_sse_constant_p (op1, mode))
20206 /* ix86_expand_vector_move_misalign() does not like constants. */
20207 || (SSE_REG_MODE_P (mode)
20208 && MEM_P (op0)
20209 && MEM_ALIGN (op0) < align)))
20211 if (SUBREG_P (op1))
20213 machine_mode imode = GET_MODE (SUBREG_REG (op1));
20214 rtx r = force_const_mem (imode, SUBREG_REG (op1));
20215 if (r)
20216 r = validize_mem (r);
20217 else
20218 r = force_reg (imode, SUBREG_REG (op1));
20219 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20221 else
20222 op1 = validize_mem (force_const_mem (mode, op1));
20225 /* We need to check memory alignment for SSE mode since an attribute
20226 can make operands unaligned. */
20227 if (can_create_pseudo_p ()
20228 && SSE_REG_MODE_P (mode)
20229 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20230 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20232 rtx tmp[2];
20234 /* ix86_expand_vector_move_misalign() does not like both
20235 arguments in memory. */
20236 if (!register_operand (op0, mode)
20237 && !register_operand (op1, mode))
20238 op1 = force_reg (mode, op1);
20240 tmp[0] = op0; tmp[1] = op1;
20241 ix86_expand_vector_move_misalign (mode, tmp);
20242 return;
20245 /* Make operand1 a register if it isn't already. */
20246 if (can_create_pseudo_p ()
20247 && !register_operand (op0, mode)
20248 && !register_operand (op1, mode))
20250 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20251 return;
20254 emit_insn (gen_rtx_SET (op0, op1));
20257 /* Split 32-byte AVX unaligned load and store if needed. */
20259 static void
20260 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20262 rtx m;
20263 rtx (*extract) (rtx, rtx, rtx);
20264 machine_mode mode;
20266 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20267 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20269 emit_insn (gen_rtx_SET (op0, op1));
20270 return;
20273 rtx orig_op0 = NULL_RTX;
20274 mode = GET_MODE (op0);
20275 switch (GET_MODE_CLASS (mode))
20277 case MODE_VECTOR_INT:
20278 case MODE_INT:
20279 if (mode != V32QImode)
20281 if (!MEM_P (op0))
20283 orig_op0 = op0;
20284 op0 = gen_reg_rtx (V32QImode);
20286 else
20287 op0 = gen_lowpart (V32QImode, op0);
20288 op1 = gen_lowpart (V32QImode, op1);
20289 mode = V32QImode;
20291 break;
20292 case MODE_VECTOR_FLOAT:
20293 break;
20294 default:
20295 gcc_unreachable ();
20298 switch (mode)
20300 default:
20301 gcc_unreachable ();
20302 case E_V32QImode:
20303 extract = gen_avx_vextractf128v32qi;
20304 mode = V16QImode;
20305 break;
20306 case E_V8SFmode:
20307 extract = gen_avx_vextractf128v8sf;
20308 mode = V4SFmode;
20309 break;
20310 case E_V4DFmode:
20311 extract = gen_avx_vextractf128v4df;
20312 mode = V2DFmode;
20313 break;
20316 if (MEM_P (op1))
20318 rtx r = gen_reg_rtx (mode);
20319 m = adjust_address (op1, mode, 0);
20320 emit_move_insn (r, m);
20321 m = adjust_address (op1, mode, 16);
20322 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20323 emit_move_insn (op0, r);
20325 else if (MEM_P (op0))
20327 m = adjust_address (op0, mode, 0);
20328 emit_insn (extract (m, op1, const0_rtx));
20329 m = adjust_address (op0, mode, 16);
20330 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20332 else
20333 gcc_unreachable ();
20335 if (orig_op0)
20336 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
20339 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
20340 straight to ix86_expand_vector_move. */
20341 /* Code generation for scalar reg-reg moves of single and double precision data:
20342 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
20343 movaps reg, reg
20344 else
20345 movss reg, reg
20346 if (x86_sse_partial_reg_dependency == true)
20347 movapd reg, reg
20348 else
20349 movsd reg, reg
20351 Code generation for scalar loads of double precision data:
20352 if (x86_sse_split_regs == true)
20353 movlpd mem, reg (gas syntax)
20354 else
20355 movsd mem, reg
20357 Code generation for unaligned packed loads of single precision data
20358 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
20359 if (x86_sse_unaligned_move_optimal)
20360 movups mem, reg
20362 if (x86_sse_partial_reg_dependency == true)
20364 xorps reg, reg
20365 movlps mem, reg
20366 movhps mem+8, reg
20368 else
20370 movlps mem, reg
20371 movhps mem+8, reg
20374 Code generation for unaligned packed loads of double precision data
20375 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
20376 if (x86_sse_unaligned_move_optimal)
20377 movupd mem, reg
20379 if (x86_sse_split_regs == true)
20381 movlpd mem, reg
20382 movhpd mem+8, reg
20384 else
20386 movsd mem, reg
20387 movhpd mem+8, reg
20391 void
20392 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
20394 rtx op0, op1, m;
20396 op0 = operands[0];
20397 op1 = operands[1];
20399 /* Use unaligned load/store for AVX512 or when optimizing for size. */
20400 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
20402 emit_insn (gen_rtx_SET (op0, op1));
20403 return;
20406 if (TARGET_AVX)
20408 if (GET_MODE_SIZE (mode) == 32)
20409 ix86_avx256_split_vector_move_misalign (op0, op1);
20410 else
20411 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
20412 emit_insn (gen_rtx_SET (op0, op1));
20413 return;
20416 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
20417 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
20419 emit_insn (gen_rtx_SET (op0, op1));
20420 return;
20423 /* ??? If we have typed data, then it would appear that using
20424 movdqu is the only way to get unaligned data loaded with
20425 integer type. */
20426 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
20428 emit_insn (gen_rtx_SET (op0, op1));
20429 return;
20432 if (MEM_P (op1))
20434 if (TARGET_SSE2 && mode == V2DFmode)
20436 rtx zero;
20438 /* When SSE registers are split into halves, we can avoid
20439 writing to the top half twice. */
20440 if (TARGET_SSE_SPLIT_REGS)
20442 emit_clobber (op0);
20443 zero = op0;
20445 else
20447 /* ??? Not sure about the best option for the Intel chips.
20448 The following would seem to satisfy; the register is
20449 entirely cleared, breaking the dependency chain. We
20450 then store to the upper half, with a dependency depth
20451 of one. A rumor has it that Intel recommends two movsd
20452 followed by an unpacklpd, but this is unconfirmed. And
20453 given that the dependency depth of the unpacklpd would
20454 still be one, I'm not sure why this would be better. */
20455 zero = CONST0_RTX (V2DFmode);
20458 m = adjust_address (op1, DFmode, 0);
20459 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20460 m = adjust_address (op1, DFmode, 8);
20461 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20463 else
20465 rtx t;
20467 if (mode != V4SFmode)
20468 t = gen_reg_rtx (V4SFmode);
20469 else
20470 t = op0;
20472 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20473 emit_move_insn (t, CONST0_RTX (V4SFmode));
20474 else
20475 emit_clobber (t);
20477 m = adjust_address (op1, V2SFmode, 0);
20478 emit_insn (gen_sse_loadlps (t, t, m));
20479 m = adjust_address (op1, V2SFmode, 8);
20480 emit_insn (gen_sse_loadhps (t, t, m));
20481 if (mode != V4SFmode)
20482 emit_move_insn (op0, gen_lowpart (mode, t));
20485 else if (MEM_P (op0))
20487 if (TARGET_SSE2 && mode == V2DFmode)
20489 m = adjust_address (op0, DFmode, 0);
20490 emit_insn (gen_sse2_storelpd (m, op1));
20491 m = adjust_address (op0, DFmode, 8);
20492 emit_insn (gen_sse2_storehpd (m, op1));
20494 else
20496 if (mode != V4SFmode)
20497 op1 = gen_lowpart (V4SFmode, op1);
20499 m = adjust_address (op0, V2SFmode, 0);
20500 emit_insn (gen_sse_storelps (m, op1));
20501 m = adjust_address (op0, V2SFmode, 8);
20502 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20505 else
20506 gcc_unreachable ();
20509 /* Helper function of ix86_fixup_binary_operands to canonicalize
20510 operand order. Returns true if the operands should be swapped. */
20512 static bool
20513 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20514 rtx operands[])
20516 rtx dst = operands[0];
20517 rtx src1 = operands[1];
20518 rtx src2 = operands[2];
20520 /* If the operation is not commutative, we can't do anything. */
20521 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
20522 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
20523 return false;
20525 /* Highest priority is that src1 should match dst. */
20526 if (rtx_equal_p (dst, src1))
20527 return false;
20528 if (rtx_equal_p (dst, src2))
20529 return true;
20531 /* Next highest priority is that immediate constants come second. */
20532 if (immediate_operand (src2, mode))
20533 return false;
20534 if (immediate_operand (src1, mode))
20535 return true;
20537 /* Lowest priority is that memory references should come second. */
20538 if (MEM_P (src2))
20539 return false;
20540 if (MEM_P (src1))
20541 return true;
20543 return false;
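/* A short example of the canonicalization (operands assumed): for
   dst = (reg:SI ax), src1 = (const_int 8), src2 = (reg:SI ax) the
   function returns true; after the swap src1 matches dst and the
   immediate comes second, matching the two-operand form
   "addl $8, %eax".  */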
20547 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20548 destination to use for the operation. If different from the true
20549 destination in operands[0], a copy operation will be required. */
20552 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20553 rtx operands[])
20555 rtx dst = operands[0];
20556 rtx src1 = operands[1];
20557 rtx src2 = operands[2];
20559 /* Canonicalize operand order. */
20560 if (ix86_swap_binary_operands_p (code, mode, operands))
20562 /* It is invalid to swap operands of different modes. */
20563 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20565 std::swap (src1, src2);
20568 /* Both source operands cannot be in memory. */
20569 if (MEM_P (src1) && MEM_P (src2))
20571 /* Optimization: Only read from memory once. */
20572 if (rtx_equal_p (src1, src2))
20574 src2 = force_reg (mode, src2);
20575 src1 = src2;
20577 else if (rtx_equal_p (dst, src1))
20578 src2 = force_reg (mode, src2);
20579 else
20580 src1 = force_reg (mode, src1);
20583 /* If the destination is memory, and we do not have matching source
20584 operands, do things in registers. */
20585 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20586 dst = gen_reg_rtx (mode);
20588 /* Source 1 cannot be a constant. */
20589 if (CONSTANT_P (src1))
20590 src1 = force_reg (mode, src1);
20592 /* Source 1 cannot be a non-matching memory. */
20593 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20594 src1 = force_reg (mode, src1);
20596 /* Improve address combine. */
20597 if (code == PLUS
20598 && GET_MODE_CLASS (mode) == MODE_INT
20599 && MEM_P (src2))
20600 src2 = force_reg (mode, src2);
20602 operands[1] = src1;
20603 operands[2] = src2;
20604 return dst;
20607 /* Similarly, but assume that the destination has already been
20608 set up properly. */
20610 void
20611 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20612 machine_mode mode, rtx operands[])
20614 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20615 gcc_assert (dst == operands[0]);
20618 /* Attempt to expand a binary operator. Make the expansion closer to the
20619 actual machine than just general_operand, which will allow 3 separate
20620 memory references (one output, two input) in a single insn. */
20622 void
20623 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20624 rtx operands[])
20626 rtx src1, src2, dst, op, clob;
20628 dst = ix86_fixup_binary_operands (code, mode, operands);
20629 src1 = operands[1];
20630 src2 = operands[2];
20632 /* Emit the instruction. */
20634 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20636 if (reload_completed
20637 && code == PLUS
20638 && !rtx_equal_p (dst, src1))
20640 /* This is going to be an LEA; avoid splitting it later. */
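/* For illustration: after reload a three-operand addition like
   (set (reg r1) (plus (reg r2) (reg r3))) with r1 != r2 matches the lea
   pattern, e.g. "leal (%rdx,%rcx), %eax", and lea does not touch the
   flags, so no FLAGS_REG clobber is attached in this case.  */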
20641 emit_insn (op);
20643 else
20645 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20646 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20649 /* Fix up the destination if needed. */
20650 if (dst != operands[0])
20651 emit_move_insn (operands[0], dst);
20654 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20655 the given OPERANDS. */
20657 void
20658 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20659 rtx operands[])
20661 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20662 if (SUBREG_P (operands[1]))
20664 op1 = operands[1];
20665 op2 = operands[2];
20667 else if (SUBREG_P (operands[2]))
20669 op1 = operands[2];
20670 op2 = operands[1];
20672 /* Optimize (__m128i) d | (__m128i) e and similar code
20673 when d and e are float vectors into float vector logical
20674 insn. In C/C++ without using intrinsics there is no other way
20675 to express vector logical operation on float vectors than
20676 to cast them temporarily to integer vectors. */
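/* For example (intrinsic-free user code assumed):
     __m128 a, b;  ...  (__m128) ((__m128i) a & (__m128i) b)
   can then be emitted as a single "andps" on the float side instead of a
   "pand" plus the implied domain crossing, which some micro-architectures
   penalize.  */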
20677 if (op1
20678 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20679 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20680 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20681 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20682 && SUBREG_BYTE (op1) == 0
20683 && (GET_CODE (op2) == CONST_VECTOR
20684 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20685 && SUBREG_BYTE (op2) == 0))
20686 && can_create_pseudo_p ())
20688 rtx dst;
20689 switch (GET_MODE (SUBREG_REG (op1)))
20691 case E_V4SFmode:
20692 case E_V8SFmode:
20693 case E_V16SFmode:
20694 case E_V2DFmode:
20695 case E_V4DFmode:
20696 case E_V8DFmode:
20697 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20698 if (GET_CODE (op2) == CONST_VECTOR)
20700 op2 = gen_lowpart (GET_MODE (dst), op2);
20701 op2 = force_reg (GET_MODE (dst), op2);
20703 else
20705 op1 = operands[1];
20706 op2 = SUBREG_REG (operands[2]);
20707 if (!vector_operand (op2, GET_MODE (dst)))
20708 op2 = force_reg (GET_MODE (dst), op2);
20710 op1 = SUBREG_REG (op1);
20711 if (!vector_operand (op1, GET_MODE (dst)))
20712 op1 = force_reg (GET_MODE (dst), op1);
20713 emit_insn (gen_rtx_SET (dst,
20714 gen_rtx_fmt_ee (code, GET_MODE (dst),
20715 op1, op2)));
20716 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20717 return;
20718 default:
20719 break;
20722 if (!vector_operand (operands[1], mode))
20723 operands[1] = force_reg (mode, operands[1]);
20724 if (!vector_operand (operands[2], mode))
20725 operands[2] = force_reg (mode, operands[2]);
20726 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20727 emit_insn (gen_rtx_SET (operands[0],
20728 gen_rtx_fmt_ee (code, mode, operands[1],
20729 operands[2])));
20732 /* Return TRUE or FALSE depending on whether the binary operator meets the
20733 appropriate constraints. */
20735 bool
20736 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20737 rtx operands[3])
20739 rtx dst = operands[0];
20740 rtx src1 = operands[1];
20741 rtx src2 = operands[2];
20743 /* Both source operands cannot be in memory. */
20744 if (MEM_P (src1) && MEM_P (src2))
20745 return false;
20747 /* Canonicalize operand order for commutative operators. */
20748 if (ix86_swap_binary_operands_p (code, mode, operands))
20749 std::swap (src1, src2);
20751 /* If the destination is memory, we must have a matching source operand. */
20752 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20753 return false;
20755 /* Source 1 cannot be a constant. */
20756 if (CONSTANT_P (src1))
20757 return false;
20759 /* Source 1 cannot be a non-matching memory. */
20760 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20761 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20762 return (code == AND
20763 && (mode == HImode
20764 || mode == SImode
20765 || (TARGET_64BIT && mode == DImode))
20766 && satisfies_constraint_L (src2));
20768 return true;
20771 /* Attempt to expand a unary operator. Make the expansion closer to the
20772 actual machine than just general_operand, which will allow 2 separate
20773 memory references (one output, one input) in a single insn. */
20775 void
20776 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20777 rtx operands[])
20779 bool matching_memory = false;
20780 rtx src, dst, op, clob;
20782 dst = operands[0];
20783 src = operands[1];
20785 /* If the destination is memory, and we do not have matching source
20786 operands, do things in registers. */
20787 if (MEM_P (dst))
20789 if (rtx_equal_p (dst, src))
20790 matching_memory = true;
20791 else
20792 dst = gen_reg_rtx (mode);
20795 /* When source operand is memory, destination must match. */
20796 if (MEM_P (src) && !matching_memory)
20797 src = force_reg (mode, src);
20799 /* Emit the instruction. */
20801 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20803 if (code == NOT)
20804 emit_insn (op);
20805 else
20807 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20808 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20811 /* Fix up the destination if needed. */
20812 if (dst != operands[0])
20813 emit_move_insn (operands[0], dst);
20816 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20817 divisor are within the range [0-255]. */
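/* A rough sketch of the emitted sequence (register names and labels
   assumed, SImode case):
       orl     %divisor, %tmp        # tmp = dividend | divisor
       testl   $0xffffff00, %tmp
       je      .Lqimode              # both values fit in 8 bits
       ...full 32/64-bit div/idiv...
       jmp     .Lend
   .Lqimode:
       divb    ...                   # cheaper 8-bit divide
   .Lend:                                                            */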
20819 void
20820 ix86_split_idivmod (machine_mode mode, rtx operands[],
20821 bool signed_p)
20823 rtx_code_label *end_label, *qimode_label;
20824 rtx div, mod;
20825 rtx_insn *insn;
20826 rtx scratch, tmp0, tmp1, tmp2;
20827 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20828 rtx (*gen_zero_extend) (rtx, rtx);
20829 rtx (*gen_test_ccno_1) (rtx, rtx);
20831 switch (mode)
20833 case E_SImode:
20834 if (GET_MODE (operands[0]) == SImode)
20836 if (GET_MODE (operands[1]) == SImode)
20837 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20838 else
20839 gen_divmod4_1
20840 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20841 gen_zero_extend = gen_zero_extendqisi2;
20843 else
20845 gen_divmod4_1
20846 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20847 gen_zero_extend = gen_zero_extendqidi2;
20849 gen_test_ccno_1 = gen_testsi_ccno_1;
20850 break;
20851 case E_DImode:
20852 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20853 gen_test_ccno_1 = gen_testdi_ccno_1;
20854 gen_zero_extend = gen_zero_extendqidi2;
20855 break;
20856 default:
20857 gcc_unreachable ();
20860 end_label = gen_label_rtx ();
20861 qimode_label = gen_label_rtx ();
20863 scratch = gen_reg_rtx (mode);
20865 /* Use 8bit unsigned divmod if dividend and divisor are within
20866 the range [0-255]. */
20867 emit_move_insn (scratch, operands[2]);
20868 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20869 scratch, 1, OPTAB_DIRECT);
20870 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20871 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20872 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20873 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20874 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20875 pc_rtx);
20876 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20877 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20878 JUMP_LABEL (insn) = qimode_label;
20880 /* Generate the original signed/unsigned divmod. */
20881 div = gen_divmod4_1 (operands[0], operands[1],
20882 operands[2], operands[3]);
20883 emit_insn (div);
20885 /* Branch to the end. */
20886 emit_jump_insn (gen_jump (end_label));
20887 emit_barrier ();
20889 /* Generate 8bit unsigned divide. */
20890 emit_label (qimode_label);
20891 /* Don't use operands[0] for result of 8bit divide since not all
20892 registers support QImode ZERO_EXTRACT. */
20893 tmp0 = lowpart_subreg (HImode, scratch, mode);
20894 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20895 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20896 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20898 if (signed_p)
20900 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20901 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20903 else
20905 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20906 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20908 if (mode == SImode)
20910 if (GET_MODE (operands[0]) != SImode)
20911 div = gen_rtx_ZERO_EXTEND (DImode, div);
20912 if (GET_MODE (operands[1]) != SImode)
20913 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20916 /* Extract remainder from AH. */
20917 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20918 tmp0, GEN_INT (8), GEN_INT (8));
20919 if (REG_P (operands[1]))
20920 insn = emit_move_insn (operands[1], tmp1);
20921 else
20923 /* Need a new scratch register since the old one has result
20924 of 8bit divide. */
20925 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20926 emit_move_insn (scratch, tmp1);
20927 insn = emit_move_insn (operands[1], scratch);
20929 set_unique_reg_note (insn, REG_EQUAL, mod);
20931 /* Zero extend quotient from AL. */
20932 tmp1 = gen_lowpart (QImode, tmp0);
20933 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20934 set_unique_reg_note (insn, REG_EQUAL, div);
20936 emit_label (end_label);
20939 #define LEA_MAX_STALL (3)
20940 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20942 /* Increase given DISTANCE in half-cycles according to
20943 dependencies between PREV and NEXT instructions.
20944 Add 1 half-cycle if there is no dependency and
20945 go to the next cycle if there is a dependency. */
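/* Worked example of the arithmetic below: with no dependency the distance
   simply advances by one half-cycle; with a dependency
   "distance + (distance & 1) + 2" first rounds an odd distance up to a
   full cycle and then adds one more cycle, e.g. 3 -> 6 and 4 -> 6.  */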
20947 static unsigned int
20948 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20950 df_ref def, use;
20952 if (!prev || !next)
20953 return distance + (distance & 1) + 2;
20955 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20956 return distance + 1;
20958 FOR_EACH_INSN_USE (use, next)
20959 FOR_EACH_INSN_DEF (def, prev)
20960 if (!DF_REF_IS_ARTIFICIAL (def)
20961 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20962 return distance + (distance & 1) + 2;
20964 return distance + 1;
20967 /* Return true if instruction INSN defines register number
20968 REGNO1 or REGNO2. */
20970 static bool
20971 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20972 rtx_insn *insn)
20974 df_ref def;
20976 FOR_EACH_INSN_DEF (def, insn)
20977 if (DF_REF_REG_DEF_P (def)
20978 && !DF_REF_IS_ARTIFICIAL (def)
20979 && (regno1 == DF_REF_REGNO (def)
20980 || regno2 == DF_REF_REGNO (def)))
20981 return true;
20983 return false;
20986 /* Return true if instruction INSN uses register number
20987 REGNO as a part of an address expression. */
20989 static bool
20990 insn_uses_reg_mem (unsigned int regno, rtx insn)
20992 df_ref use;
20994 FOR_EACH_INSN_USE (use, insn)
20995 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20996 return true;
20998 return false;
21001 /* Search backward for non-agu definition of register number REGNO1
21002 or register number REGNO2 in basic block starting from instruction
21003 START up to head of basic block or instruction INSN.
21005 Put true into *FOUND if a definition was found
21006 and false otherwise.
21008 Distance in half-cycles between START and found instruction or head
21009 of BB is added to DISTANCE and returned. */
21011 static int
21012 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
21013 rtx_insn *insn, int distance,
21014 rtx_insn *start, bool *found)
21016 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
21017 rtx_insn *prev = start;
21018 rtx_insn *next = NULL;
21020 *found = false;
21022 while (prev
21023 && prev != insn
21024 && distance < LEA_SEARCH_THRESHOLD)
21026 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
21028 distance = increase_distance (prev, next, distance);
21029 if (insn_defines_reg (regno1, regno2, prev))
21031 if (recog_memoized (prev) < 0
21032 || get_attr_type (prev) != TYPE_LEA)
21034 *found = true;
21035 return distance;
21039 next = prev;
21041 if (prev == BB_HEAD (bb))
21042 break;
21044 prev = PREV_INSN (prev);
21047 return distance;
21050 /* Search backward for non-agu definition of register number REGNO1
21051 or register number REGNO2 in INSN's basic block until
21052 1. Pass LEA_SEARCH_THRESHOLD instructions, or
21053 2. Reach neighbor BBs boundary, or
21054 3. Reach agu definition.
21055 Returns the distance between the non-agu definition point and INSN.
21056 If no definition point, returns -1. */
21058 static int
21059 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
21060 rtx_insn *insn)
21062 basic_block bb = BLOCK_FOR_INSN (insn);
21063 int distance = 0;
21064 bool found = false;
21066 if (insn != BB_HEAD (bb))
21067 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
21068 distance, PREV_INSN (insn),
21069 &found);
21071 if (!found && distance < LEA_SEARCH_THRESHOLD)
21073 edge e;
21074 edge_iterator ei;
21075 bool simple_loop = false;
21077 FOR_EACH_EDGE (e, ei, bb->preds)
21078 if (e->src == bb)
21080 simple_loop = true;
21081 break;
21084 if (simple_loop)
21085 distance = distance_non_agu_define_in_bb (regno1, regno2,
21086 insn, distance,
21087 BB_END (bb), &found);
21088 else
21090 int shortest_dist = -1;
21091 bool found_in_bb = false;
21093 FOR_EACH_EDGE (e, ei, bb->preds)
21095 int bb_dist
21096 = distance_non_agu_define_in_bb (regno1, regno2,
21097 insn, distance,
21098 BB_END (e->src),
21099 &found_in_bb);
21100 if (found_in_bb)
21102 if (shortest_dist < 0)
21103 shortest_dist = bb_dist;
21104 else if (bb_dist > 0)
21105 shortest_dist = MIN (bb_dist, shortest_dist);
21107 found = true;
21111 distance = shortest_dist;
21115 /* get_attr_type may modify recog data. We want to make sure
21116 that recog data is valid for instruction INSN, on which
21117 distance_non_agu_define is called. INSN is unchanged here. */
21118 extract_insn_cached (insn);
21120 if (!found)
21121 return -1;
21123 return distance >> 1;
21126 /* Return the distance in half-cycles between INSN and the next
21127 insn that uses register number REGNO in a memory address, added
21128 to DISTANCE. Return -1 if REGNO is set.
21130 Put true into *FOUND if a register usage was found
21131 and false otherwise.
21132 Put true into *REDEFINED if a register redefinition was
21133 found and false otherwise. */
21135 static int
21136 distance_agu_use_in_bb (unsigned int regno,
21137 rtx_insn *insn, int distance, rtx_insn *start,
21138 bool *found, bool *redefined)
21140 basic_block bb = NULL;
21141 rtx_insn *next = start;
21142 rtx_insn *prev = NULL;
21144 *found = false;
21145 *redefined = false;
21147 if (start != NULL_RTX)
21149 bb = BLOCK_FOR_INSN (start);
21150 if (start != BB_HEAD (bb))
21151 /* If insn and start belong to the same bb, set prev to insn,
21152 so the call to increase_distance will increase the distance
21153 between insns by 1. */
21154 prev = insn;
21157 while (next
21158 && next != insn
21159 && distance < LEA_SEARCH_THRESHOLD)
21161 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
21163 distance = increase_distance (prev, next, distance);
21164 if (insn_uses_reg_mem (regno, next))
21166 /* Return DISTANCE if OP0 is used in memory
21167 address in NEXT. */
21168 *found = true;
21169 return distance;
21172 if (insn_defines_reg (regno, INVALID_REGNUM, next))
21174 /* Return -1 if OP0 is set in NEXT. */
21175 *redefined = true;
21176 return -1;
21179 prev = next;
21182 if (next == BB_END (bb))
21183 break;
21185 next = NEXT_INSN (next);
21188 return distance;
21191 /* Return the distance between INSN and the next insn that uses
21192 register number REGNO0 in a memory address. Return -1 if no
21193 such use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
21195 static int
21196 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21198 basic_block bb = BLOCK_FOR_INSN (insn);
21199 int distance = 0;
21200 bool found = false;
21201 bool redefined = false;
21203 if (insn != BB_END (bb))
21204 distance = distance_agu_use_in_bb (regno0, insn, distance,
21205 NEXT_INSN (insn),
21206 &found, &redefined);
21208 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21210 edge e;
21211 edge_iterator ei;
21212 bool simple_loop = false;
21214 FOR_EACH_EDGE (e, ei, bb->succs)
21215 if (e->dest == bb)
21217 simple_loop = true;
21218 break;
21221 if (simple_loop)
21222 distance = distance_agu_use_in_bb (regno0, insn,
21223 distance, BB_HEAD (bb),
21224 &found, &redefined);
21225 else
21227 int shortest_dist = -1;
21228 bool found_in_bb = false;
21229 bool redefined_in_bb = false;
21231 FOR_EACH_EDGE (e, ei, bb->succs)
21233 int bb_dist
21234 = distance_agu_use_in_bb (regno0, insn,
21235 distance, BB_HEAD (e->dest),
21236 &found_in_bb, &redefined_in_bb);
21237 if (found_in_bb)
21239 if (shortest_dist < 0)
21240 shortest_dist = bb_dist;
21241 else if (bb_dist > 0)
21242 shortest_dist = MIN (bb_dist, shortest_dist);
21244 found = true;
21248 distance = shortest_dist;
21252 if (!found || redefined)
21253 return -1;
21255 return distance >> 1;
21258 /* Define this macro to tune LEA priority vs ADD; it takes effect when
21259 there is a dilemma of choosing LEA or ADD.
21260 Negative value: ADD is preferred over LEA
21261 Zero: Neutral
21262 Positive value: LEA is preferred over ADD. */
21263 #define IX86_LEA_PRIORITY 0
21265 /* Return true if use of lea INSN has a performance advantage
21266 over a sequence of instructions. The instruction sequence has
21267 SPLIT_COST cycles higher latency than the lea latency. */
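/* An illustrative reading of the heuristic below (not a hardware
guarantee): an AGU stall is only expected when a lea consumes a value
produced by a nearby ALU instruction, so if the closest non-AGU
definition of the lea's sources is LEA_MAX_STALL or more cycles back
(or absent), the lea is normally kept.  */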
21269 static bool
21270 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21271 unsigned int regno2, int split_cost, bool has_scale)
21273 int dist_define, dist_use;
21275 /* For Silvermont, if a 2-source or 3-source LEA is used for
21276 a non-destructive destination, or because the ability to
21277 use SCALE is wanted, the use of LEA is justified. */
21278 if (TARGET_SILVERMONT || TARGET_INTEL)
21280 if (has_scale)
21281 return true;
21282 if (split_cost < 1)
21283 return false;
21284 if (regno0 == regno1 || regno0 == regno2)
21285 return false;
21286 return true;
21289 dist_define = distance_non_agu_define (regno1, regno2, insn);
21290 dist_use = distance_agu_use (regno0, insn);
21292 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21294 /* If there is no non-AGU operand definition, no AGU
21295 operand usage and the split cost is 0, then both the lea
21296 and non-lea variants have the same priority. Currently
21297 we prefer lea for 64-bit code and non-lea for 32-bit
21298 code. */
21299 if (dist_use < 0 && split_cost == 0)
21300 return TARGET_64BIT || IX86_LEA_PRIORITY;
21301 else
21302 return true;
21305 /* With a longer definition distance, lea is preferable.
21306 Here we change it to take into account the splitting cost and
21307 lea priority. */
21308 dist_define += split_cost + IX86_LEA_PRIORITY;
21310 /* If there is no use in a memory address then we just check
21311 that the split cost exceeds the AGU stall. */
21312 if (dist_use < 0)
21313 return dist_define > LEA_MAX_STALL;
21315 /* If this insn has both backward non-agu dependence and forward
21316 agu dependence, the one with the shorter distance takes effect. */
21317 return dist_define >= dist_use;
21320 /* Return true if it is legal to clobber flags by INSN and
21321 false otherwise. */
21323 static bool
21324 ix86_ok_to_clobber_flags (rtx_insn *insn)
21326 basic_block bb = BLOCK_FOR_INSN (insn);
21327 df_ref use;
21328 bitmap live;
21330 while (insn)
21332 if (NONDEBUG_INSN_P (insn))
21334 FOR_EACH_INSN_USE (use, insn)
21335 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21336 return false;
21338 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21339 return true;
21342 if (insn == BB_END (bb))
21343 break;
21345 insn = NEXT_INSN (insn);
21348 live = df_get_live_out (bb);
21349 return !REGNO_REG_SET_P (live, FLAGS_REG);
21352 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21353 move and add to avoid AGU stalls. */
21355 bool
21356 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21358 unsigned int regno0, regno1, regno2;
21360 /* Check if we need to optimize. */
21361 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21362 return false;
21364 /* Check it is correct to split here. */
21365 if (!ix86_ok_to_clobber_flags (insn))
21366 return false;
21368 regno0 = true_regnum (operands[0]);
21369 regno1 = true_regnum (operands[1]);
21370 regno2 = true_regnum (operands[2]);
21372 /* We need to split only adds with a non-destructive
21373 destination operand. */
21374 if (regno0 == regno1 || regno0 == regno2)
21375 return false;
21376 else
21377 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
21380 /* Return true if we should emit lea instruction instead of mov
21381 instruction. */
21383 bool
21384 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
21386 unsigned int regno0, regno1;
21388 /* Check if we need to optimize. */
21389 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21390 return false;
21392 /* Use lea for reg to reg moves only. */
21393 if (!REG_P (operands[0]) || !REG_P (operands[1]))
21394 return false;
21396 regno0 = true_regnum (operands[0]);
21397 regno1 = true_regnum (operands[1]);
21399 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
21402 /* Return true if we need to split lea into a sequence of
21403 instructions to avoid AGU stalls. */
21405 bool
21406 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
21408 unsigned int regno0, regno1, regno2;
21409 int split_cost;
21410 struct ix86_address parts;
21411 int ok;
21413 /* Check if we need to optimize. */
21414 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
21415 return false;
21417 /* The "at least two components" test below might not catch simple
21418 move or zero extension insns if parts.base is non-NULL and parts.disp
21419 is const0_rtx as the only components in the address, e.g. if the
21420 register is %rbp or %r13. As this test is much cheaper and moves or
21421 zero extensions are the common case, do this check first. */
21422 if (REG_P (operands[1])
21423 || (SImode_address_operand (operands[1], VOIDmode)
21424 && REG_P (XEXP (operands[1], 0))))
21425 return false;
21427 /* Check if it is OK to split here. */
21428 if (!ix86_ok_to_clobber_flags (insn))
21429 return false;
21431 ok = ix86_decompose_address (operands[1], &parts);
21432 gcc_assert (ok);
21434 /* There should be at least two components in the address. */
21435 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
21436 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
21437 return false;
21439 /* We should not split into add if a non-legitimate PIC
21440 operand is used as the displacement. */
21441 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
21442 return false;
21444 regno0 = true_regnum (operands[0]);
21445 regno1 = INVALID_REGNUM;
21446 regno2 = INVALID_REGNUM;
21448 if (parts.base)
21449 regno1 = true_regnum (parts.base);
21450 if (parts.index)
21451 regno2 = true_regnum (parts.index);
21453 split_cost = 0;
21455 /* Compute how many cycles we will add to execution time
21456 if we split the lea into a sequence of instructions. */
21457 if (parts.base || parts.index)
21459 /* Have to use a mov instruction if the non-destructive
21460 destination form is used. */
21461 if (regno1 != regno0 && regno2 != regno0)
21462 split_cost += 1;
21464 /* Have to add index to base if both exist. */
21465 if (parts.base && parts.index)
21466 split_cost += 1;
21468 /* Have to use shift and adds if scale is 2 or greater. */
21469 if (parts.scale > 1)
21471 if (regno0 != regno1)
21472 split_cost += 1;
21473 else if (regno2 == regno0)
21474 split_cost += 4;
21475 else
21476 split_cost += parts.scale;
21479 /* Have to use an add instruction with an immediate if
21480 disp is nonzero. */
21481 if (parts.disp && parts.disp != const0_rtx)
21482 split_cost += 1;
21484 /* Subtract the price of lea. */
21485 split_cost -= 1;
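/* Worked example of the accounting above (illustrative only): for
lea 0x8(%rbx,%rcx,4), %rdx the split needs a mov of the index, a shift
by 2, an add of the base and an add of the displacement, i.e.
1 + 1 + 1 + 1 - 1 == 3 extra cycles compared with the single lea.  */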
21488 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21489 parts.scale > 1);
21492 /* Emit x86 binary operand CODE in mode MODE, where the first operand
21493 matches destination. RTX includes clobber of FLAGS_REG. */
21495 static void
21496 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21497 rtx dst, rtx src)
21499 rtx op, clob;
21501 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21502 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21504 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21507 /* Return true if regno1 def is nearest to the insn. */
21509 static bool
21510 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21512 rtx_insn *prev = insn;
21513 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21515 if (insn == start)
21516 return false;
21517 while (prev && prev != start)
21519 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21521 prev = PREV_INSN (prev);
21522 continue;
21524 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21525 return true;
21526 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21527 return false;
21528 prev = PREV_INSN (prev);
21531 /* None of the regs is defined in the bb. */
21532 return false;
21535 /* Split lea instructions into a sequence of instructions
21536 which are executed on ALU to avoid AGU stalls.
21537 It is assumed that it is allowed to clobber flags register
21538 at lea position. */
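/* For illustration (a rough sketch; the actual insns depend on the mode
and on register allocation): lea 0x4(%rbx,%rcx,2), %rax may be replaced by
mov %rcx,%rax ; shl $1,%rax ; add %rbx,%rax ; add $4,%rax
while the r1 = r1 + C * r2 shape handled first below expands the scale as
repeated adds of the index instead of a shift.  */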
21540 void
21541 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21543 unsigned int regno0, regno1, regno2;
21544 struct ix86_address parts;
21545 rtx target, tmp;
21546 int ok, adds;
21548 ok = ix86_decompose_address (operands[1], &parts);
21549 gcc_assert (ok);
21551 target = gen_lowpart (mode, operands[0]);
21553 regno0 = true_regnum (target);
21554 regno1 = INVALID_REGNUM;
21555 regno2 = INVALID_REGNUM;
21557 if (parts.base)
21559 parts.base = gen_lowpart (mode, parts.base);
21560 regno1 = true_regnum (parts.base);
21563 if (parts.index)
21565 parts.index = gen_lowpart (mode, parts.index);
21566 regno2 = true_regnum (parts.index);
21569 if (parts.disp)
21570 parts.disp = gen_lowpart (mode, parts.disp);
21572 if (parts.scale > 1)
21574 /* Case r1 = r1 + ... */
21575 if (regno1 == regno0)
21577 /* If we have a case r1 = r1 + C * r2 then we
21578 would have to use multiplication, which is very
21579 expensive. Assume the cost model is wrong if we
21580 have such a case here. */
21581 gcc_assert (regno2 != regno0);
21583 for (adds = parts.scale; adds > 0; adds--)
21584 ix86_emit_binop (PLUS, mode, target, parts.index);
21586 else
21588 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21589 if (regno0 != regno2)
21590 emit_insn (gen_rtx_SET (target, parts.index));
21592 /* Use shift for scaling. */
21593 ix86_emit_binop (ASHIFT, mode, target,
21594 GEN_INT (exact_log2 (parts.scale)));
21596 if (parts.base)
21597 ix86_emit_binop (PLUS, mode, target, parts.base);
21599 if (parts.disp && parts.disp != const0_rtx)
21600 ix86_emit_binop (PLUS, mode, target, parts.disp);
21603 else if (!parts.base && !parts.index)
21605 gcc_assert (parts.disp);
21606 emit_insn (gen_rtx_SET (target, parts.disp));
21608 else
21610 if (!parts.base)
21612 if (regno0 != regno2)
21613 emit_insn (gen_rtx_SET (target, parts.index));
21615 else if (!parts.index)
21617 if (regno0 != regno1)
21618 emit_insn (gen_rtx_SET (target, parts.base));
21620 else
21622 if (regno0 == regno1)
21623 tmp = parts.index;
21624 else if (regno0 == regno2)
21625 tmp = parts.base;
21626 else
21628 rtx tmp1;
21630 /* Find better operand for SET instruction, depending
21631 on which definition is farther from the insn. */
21632 if (find_nearest_reg_def (insn, regno1, regno2))
21633 tmp = parts.index, tmp1 = parts.base;
21634 else
21635 tmp = parts.base, tmp1 = parts.index;
21637 emit_insn (gen_rtx_SET (target, tmp));
21639 if (parts.disp && parts.disp != const0_rtx)
21640 ix86_emit_binop (PLUS, mode, target, parts.disp);
21642 ix86_emit_binop (PLUS, mode, target, tmp1);
21643 return;
21646 ix86_emit_binop (PLUS, mode, target, tmp);
21649 if (parts.disp && parts.disp != const0_rtx)
21650 ix86_emit_binop (PLUS, mode, target, parts.disp);
21654 /* Return true if it is ok to optimize an ADD operation to a LEA
21655 operation to avoid flag register consumption. For most processors,
21656 ADD is faster than LEA. For processors like BONNELL, if the
21657 destination register of the LEA holds an actual address which will be
21658 used soon, LEA is better; otherwise ADD is better. */
21660 bool
21661 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21663 unsigned int regno0 = true_regnum (operands[0]);
21664 unsigned int regno1 = true_regnum (operands[1]);
21665 unsigned int regno2 = true_regnum (operands[2]);
21667 /* If a = b + c, (a!=b && a!=c), we must use the lea form. */
21668 if (regno0 != regno1 && regno0 != regno2)
21669 return true;
21671 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21672 return false;
21674 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21677 /* Return true if destination reg of SET_BODY is shift count of
21678 USE_BODY. */
21680 static bool
21681 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21683 rtx set_dest;
21684 rtx shift_rtx;
21685 int i;
21687 /* Retrieve destination of SET_BODY. */
21688 switch (GET_CODE (set_body))
21690 case SET:
21691 set_dest = SET_DEST (set_body);
21692 if (!set_dest || !REG_P (set_dest))
21693 return false;
21694 break;
21695 case PARALLEL:
21696 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21697 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21698 use_body))
21699 return true;
21700 /* FALLTHROUGH */
21701 default:
21702 return false;
21705 /* Retrieve shift count of USE_BODY. */
21706 switch (GET_CODE (use_body))
21708 case SET:
21709 shift_rtx = XEXP (use_body, 1);
21710 break;
21711 case PARALLEL:
21712 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21713 if (ix86_dep_by_shift_count_body (set_body,
21714 XVECEXP (use_body, 0, i)))
21715 return true;
21716 /* FALLTHROUGH */
21717 default:
21718 return false;
21721 if (shift_rtx
21722 && (GET_CODE (shift_rtx) == ASHIFT
21723 || GET_CODE (shift_rtx) == LSHIFTRT
21724 || GET_CODE (shift_rtx) == ASHIFTRT
21725 || GET_CODE (shift_rtx) == ROTATE
21726 || GET_CODE (shift_rtx) == ROTATERT))
21728 rtx shift_count = XEXP (shift_rtx, 1);
21730 /* Return true if shift count is dest of SET_BODY. */
21731 if (REG_P (shift_count))
21733 /* Add a check since it can be invoked before register
21734 allocation by the pre-reload scheduler. */
21735 if (reload_completed
21736 && true_regnum (set_dest) == true_regnum (shift_count))
21737 return true;
21738 else if (REGNO (set_dest) == REGNO (shift_count))
21739 return true;
21743 return false;
21746 /* Return true if destination reg of SET_INSN is shift count of
21747 USE_INSN. */
21749 bool
21750 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21752 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21753 PATTERN (use_insn));
21756 /* Return TRUE or FALSE depending on whether the unary operator meets the
21757 appropriate constraints. */
21759 bool
21760 ix86_unary_operator_ok (enum rtx_code,
21761 machine_mode,
21762 rtx operands[2])
21764 /* If one of operands is memory, source and destination must match. */
21765 if ((MEM_P (operands[0])
21766 || MEM_P (operands[1]))
21767 && ! rtx_equal_p (operands[0], operands[1]))
21768 return false;
21769 return true;
21772 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21773 are ok, keeping in mind the possible movddup alternative. */
21775 bool
21776 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21778 if (MEM_P (operands[0]))
21779 return rtx_equal_p (operands[0], operands[1 + high]);
21780 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21781 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21782 return true;
21785 /* Post-reload splitter for converting an SF or DFmode value in an
21786 SSE register into an unsigned SImode. */
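/* For illustration of the expansion below: e.g. for an input of 3e9 the
code subtracts 0x1.0p31 (giving 852516352.0), does the signed truncating
conversion, and xors 0x80000000 back in, yielding 3000000000; inputs
below 0x1.0p31 are left unchanged and convert directly.  */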
21788 void
21789 ix86_split_convert_uns_si_sse (rtx operands[])
21791 machine_mode vecmode;
21792 rtx value, large, zero_or_two31, input, two31, x;
21794 large = operands[1];
21795 zero_or_two31 = operands[2];
21796 input = operands[3];
21797 two31 = operands[4];
21798 vecmode = GET_MODE (large);
21799 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21801 /* Load up the value into the low element. We must ensure that the other
21802 elements are valid floats -- zero is the easiest such value. */
21803 if (MEM_P (input))
21805 if (vecmode == V4SFmode)
21806 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21807 else
21808 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21810 else
21812 input = gen_rtx_REG (vecmode, REGNO (input));
21813 emit_move_insn (value, CONST0_RTX (vecmode));
21814 if (vecmode == V4SFmode)
21815 emit_insn (gen_sse_movss (value, value, input));
21816 else
21817 emit_insn (gen_sse2_movsd (value, value, input));
21820 emit_move_insn (large, two31);
21821 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21823 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21824 emit_insn (gen_rtx_SET (large, x));
21826 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21827 emit_insn (gen_rtx_SET (zero_or_two31, x));
21829 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21830 emit_insn (gen_rtx_SET (value, x));
21832 large = gen_rtx_REG (V4SImode, REGNO (large));
21833 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21835 x = gen_rtx_REG (V4SImode, REGNO (value));
21836 if (vecmode == V4SFmode)
21837 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21838 else
21839 emit_insn (gen_sse2_cvttpd2dq (x, value));
21840 value = x;
21842 emit_insn (gen_xorv4si3 (value, value, large));
21845 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21846 Expects the 64-bit DImode to be supplied in a pair of integral
21847 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21848 -mfpmath=sse, !optimize_size only. */
21850 void
21851 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21853 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21854 rtx int_xmm, fp_xmm;
21855 rtx biases, exponents;
21856 rtx x;
21858 int_xmm = gen_reg_rtx (V4SImode);
21859 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21860 emit_insn (gen_movdi_to_sse (int_xmm, input));
21861 else if (TARGET_SSE_SPLIT_REGS)
21863 emit_clobber (int_xmm);
21864 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21866 else
21868 x = gen_reg_rtx (V2DImode);
21869 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21870 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21873 x = gen_rtx_CONST_VECTOR (V4SImode,
21874 gen_rtvec (4, GEN_INT (0x43300000UL),
21875 GEN_INT (0x45300000UL),
21876 const0_rtx, const0_rtx));
21877 exponents = validize_mem (force_const_mem (V4SImode, x));
21879 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21880 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21882 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21883 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21884 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21885 (0x1.0p84 + double(fp_value_hi_xmm)).
21886 Note these exponents differ by 32. */
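/* Worked example (for illustration): with the low input word equal to 3
and the high word equal to 2, the two doubles are (0x1.0p52 + 3) and
(0x1.0p84 + 2 * 0x1.0p32); after the bias subtraction below and the
final add, the result is 2 * 0x1.0p32 + 3 == 8589934595.0.  */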
21888 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21890 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21891 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21892 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21893 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21894 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21895 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21896 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21897 biases = validize_mem (force_const_mem (V2DFmode, biases));
21898 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21900 /* Add the upper and lower DFmode values together. */
21901 if (TARGET_SSE3)
21902 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21903 else
21905 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21906 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21907 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21910 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21913 /* Not used, but eases macroization of patterns. */
21914 void
21915 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21917 gcc_unreachable ();
21920 /* Convert an unsigned SImode value into a DFmode. Only currently used
21921 for SSE, but applicable anywhere. */
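/* Worked example (illustrative): for input 0xffffffff the PLUS below
wraps to the signed value 0x7fffffff == 2147483647; converting that to
DFmode and adding 0x1.0p31 gives 4294967295.0, the original unsigned
value.  */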
21923 void
21924 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21926 REAL_VALUE_TYPE TWO31r;
21927 rtx x, fp;
21929 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21930 NULL, 1, OPTAB_DIRECT);
21932 fp = gen_reg_rtx (DFmode);
21933 emit_insn (gen_floatsidf2 (fp, x));
21935 real_ldexp (&TWO31r, &dconst1, 31);
21936 x = const_double_from_real_value (TWO31r, DFmode);
21938 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21939 if (x != target)
21940 emit_move_insn (target, x);
21943 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21944 32-bit mode; otherwise we have a direct convert instruction. */
21946 void
21947 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21949 REAL_VALUE_TYPE TWO32r;
21950 rtx fp_lo, fp_hi, x;
21952 fp_lo = gen_reg_rtx (DFmode);
21953 fp_hi = gen_reg_rtx (DFmode);
21955 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21957 real_ldexp (&TWO32r, &dconst1, 32);
21958 x = const_double_from_real_value (TWO32r, DFmode);
21959 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21961 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21963 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21964 0, OPTAB_DIRECT);
21965 if (x != target)
21966 emit_move_insn (target, x);
21969 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21970 For x86_32, -mfpmath=sse, !optimize_size only. */
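/* For illustration: 0x12345678 is split into hi == 0x1234 and
lo == 0x5678; both halves are exactly representable in SFmode, so the
result is computed as float (hi) * 0x1.0p16 + float (lo) without any
signedness trouble.  */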
21971 void
21972 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21974 REAL_VALUE_TYPE ONE16r;
21975 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21977 real_ldexp (&ONE16r, &dconst1, 16);
21978 x = const_double_from_real_value (ONE16r, SFmode);
21979 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
21980 NULL, 0, OPTAB_DIRECT);
21981 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
21982 NULL, 0, OPTAB_DIRECT);
21983 fp_hi = gen_reg_rtx (SFmode);
21984 fp_lo = gen_reg_rtx (SFmode);
21985 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21986 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21987 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21988 0, OPTAB_DIRECT);
21989 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21990 0, OPTAB_DIRECT);
21991 if (!rtx_equal_p (target, fp_hi))
21992 emit_move_insn (target, fp_hi);
21995 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21996 a vector of unsigned ints VAL to vector of floats TARGET. */
21998 void
21999 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
22001 rtx tmp[8];
22002 REAL_VALUE_TYPE TWO16r;
22003 machine_mode intmode = GET_MODE (val);
22004 machine_mode fltmode = GET_MODE (target);
22005 rtx (*cvt) (rtx, rtx);
22007 if (intmode == V4SImode)
22008 cvt = gen_floatv4siv4sf2;
22009 else
22010 cvt = gen_floatv8siv8sf2;
22011 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
22012 tmp[0] = force_reg (intmode, tmp[0]);
22013 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
22014 OPTAB_DIRECT);
22015 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
22016 NULL_RTX, 1, OPTAB_DIRECT);
22017 tmp[3] = gen_reg_rtx (fltmode);
22018 emit_insn (cvt (tmp[3], tmp[1]));
22019 tmp[4] = gen_reg_rtx (fltmode);
22020 emit_insn (cvt (tmp[4], tmp[2]));
22021 real_ldexp (&TWO16r, &dconst1, 16);
22022 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
22023 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
22024 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
22025 OPTAB_DIRECT);
22026 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
22027 OPTAB_DIRECT);
22028 if (tmp[7] != target)
22029 emit_move_insn (target, tmp[7]);
22032 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
22033 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
22034 This is done by doing just signed conversion if < 0x1p31, and otherwise by
22035 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
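/* Worked example (illustrative): for a lane holding 3.5e9 the compare
against 0x1.0p31 selects the adjusted path, 3.5e9 - 0x1.0p31 truncates
to 1352516352, and the caller's later xor with 0x80000000 restores
3500000000.  */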
22038 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
22040 REAL_VALUE_TYPE TWO31r;
22041 rtx two31r, tmp[4];
22042 machine_mode mode = GET_MODE (val);
22043 machine_mode scalarmode = GET_MODE_INNER (mode);
22044 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
22045 rtx (*cmp) (rtx, rtx, rtx, rtx);
22046 int i;
22048 for (i = 0; i < 3; i++)
22049 tmp[i] = gen_reg_rtx (mode);
22050 real_ldexp (&TWO31r, &dconst1, 31);
22051 two31r = const_double_from_real_value (TWO31r, scalarmode);
22052 two31r = ix86_build_const_vector (mode, 1, two31r);
22053 two31r = force_reg (mode, two31r);
22054 switch (mode)
22056 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
22057 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
22058 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
22059 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
22060 default: gcc_unreachable ();
22062 tmp[3] = gen_rtx_LE (mode, two31r, val);
22063 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
22064 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
22065 0, OPTAB_DIRECT);
22066 if (intmode == V4SImode || TARGET_AVX2)
22067 *xorp = expand_simple_binop (intmode, ASHIFT,
22068 gen_lowpart (intmode, tmp[0]),
22069 GEN_INT (31), NULL_RTX, 0,
22070 OPTAB_DIRECT);
22071 else
22073 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
22074 two31 = ix86_build_const_vector (intmode, 1, two31);
22075 *xorp = expand_simple_binop (intmode, AND,
22076 gen_lowpart (intmode, tmp[0]),
22077 two31, NULL_RTX, 0,
22078 OPTAB_DIRECT);
22080 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
22081 0, OPTAB_DIRECT);
22084 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
22085 then replicate the value for all elements of the vector
22086 register. */
22089 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
22091 int i, n_elt;
22092 rtvec v;
22093 machine_mode scalar_mode;
22095 switch (mode)
22097 case E_V64QImode:
22098 case E_V32QImode:
22099 case E_V16QImode:
22100 case E_V32HImode:
22101 case E_V16HImode:
22102 case E_V8HImode:
22103 case E_V16SImode:
22104 case E_V8SImode:
22105 case E_V4SImode:
22106 case E_V8DImode:
22107 case E_V4DImode:
22108 case E_V2DImode:
22109 gcc_assert (vect);
22110 /* FALLTHRU */
22111 case E_V16SFmode:
22112 case E_V8SFmode:
22113 case E_V4SFmode:
22114 case E_V8DFmode:
22115 case E_V4DFmode:
22116 case E_V2DFmode:
22117 n_elt = GET_MODE_NUNITS (mode);
22118 v = rtvec_alloc (n_elt);
22119 scalar_mode = GET_MODE_INNER (mode);
22121 RTVEC_ELT (v, 0) = value;
22123 for (i = 1; i < n_elt; ++i)
22124 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
22126 return gen_rtx_CONST_VECTOR (mode, v);
22128 default:
22129 gcc_unreachable ();
22133 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
22134 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
22135 for an SSE register. If VECT is true, then replicate the mask for
22136 all elements of the vector register. If INVERT is true, then create
22137 a mask excluding the sign bit. */
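/* For illustration: for V4SFmode with VECT true this builds the vector
{ -0.0f, -0.0f, -0.0f, -0.0f } (only bit 31 set in each element); with
INVERT true each element instead has the 0x7fffffff pattern, which
clears the sign bit when ANDed with a value.  */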
22140 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
22142 machine_mode vec_mode, imode;
22143 wide_int w;
22144 rtx mask, v;
22146 switch (mode)
22148 case E_V16SImode:
22149 case E_V16SFmode:
22150 case E_V8SImode:
22151 case E_V4SImode:
22152 case E_V8SFmode:
22153 case E_V4SFmode:
22154 vec_mode = mode;
22155 imode = SImode;
22156 break;
22158 case E_V8DImode:
22159 case E_V4DImode:
22160 case E_V2DImode:
22161 case E_V8DFmode:
22162 case E_V4DFmode:
22163 case E_V2DFmode:
22164 vec_mode = mode;
22165 imode = DImode;
22166 break;
22168 case E_TImode:
22169 case E_TFmode:
22170 vec_mode = VOIDmode;
22171 imode = TImode;
22172 break;
22174 default:
22175 gcc_unreachable ();
22178 machine_mode inner_mode = GET_MODE_INNER (mode);
22179 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
22180 GET_MODE_BITSIZE (inner_mode));
22181 if (invert)
22182 w = wi::bit_not (w);
22184 /* Force this value into the low part of a fp vector constant. */
22185 mask = immed_wide_int_const (w, imode);
22186 mask = gen_lowpart (inner_mode, mask);
22188 if (vec_mode == VOIDmode)
22189 return force_reg (inner_mode, mask);
22191 v = ix86_build_const_vector (vec_mode, vect, mask);
22192 return force_reg (vec_mode, v);
22195 /* Generate code for floating point ABS or NEG. */
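/* Roughly, for SSE math this amounts to NEG == xor with the sign-bit
mask and ABS == and with the inverted mask; the mask built below is
attached to the pattern as a USE, and the machine description later
turns it into the actual logic instruction.  */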
22197 void
22198 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22199 rtx operands[])
22201 rtx mask, set, dst, src;
22202 bool use_sse = false;
22203 bool vector_mode = VECTOR_MODE_P (mode);
22204 machine_mode vmode = mode;
22206 if (vector_mode)
22207 use_sse = true;
22208 else if (mode == TFmode)
22209 use_sse = true;
22210 else if (TARGET_SSE_MATH)
22212 use_sse = SSE_FLOAT_MODE_P (mode);
22213 if (mode == SFmode)
22214 vmode = V4SFmode;
22215 else if (mode == DFmode)
22216 vmode = V2DFmode;
22219 /* NEG and ABS performed with SSE use bitwise mask operations.
22220 Create the appropriate mask now. */
22221 if (use_sse)
22222 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22223 else
22224 mask = NULL_RTX;
22226 dst = operands[0];
22227 src = operands[1];
22229 set = gen_rtx_fmt_e (code, mode, src);
22230 set = gen_rtx_SET (dst, set);
22232 if (mask)
22234 rtx use, clob;
22235 rtvec par;
22237 use = gen_rtx_USE (VOIDmode, mask);
22238 if (vector_mode)
22239 par = gen_rtvec (2, set, use);
22240 else
22242 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22243 par = gen_rtvec (3, set, use, clob);
22245 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22247 else
22248 emit_insn (set);
22251 /* Expand a copysign operation. Special case operand 0 being a constant. */
22253 void
22254 ix86_expand_copysign (rtx operands[])
22256 machine_mode mode, vmode;
22257 rtx dest, op0, op1, mask, nmask;
22259 dest = operands[0];
22260 op0 = operands[1];
22261 op1 = operands[2];
22263 mode = GET_MODE (dest);
22265 if (mode == SFmode)
22266 vmode = V4SFmode;
22267 else if (mode == DFmode)
22268 vmode = V2DFmode;
22269 else
22270 vmode = mode;
22272 if (CONST_DOUBLE_P (op0))
22274 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22276 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22277 op0 = simplify_unary_operation (ABS, mode, op0, mode);
22279 if (mode == SFmode || mode == DFmode)
22281 if (op0 == CONST0_RTX (mode))
22282 op0 = CONST0_RTX (vmode);
22283 else
22285 rtx v = ix86_build_const_vector (vmode, false, op0);
22287 op0 = force_reg (vmode, v);
22290 else if (op0 != CONST0_RTX (mode))
22291 op0 = force_reg (mode, op0);
22293 mask = ix86_build_signbit_mask (vmode, 0, 0);
22295 if (mode == SFmode)
22296 copysign_insn = gen_copysignsf3_const;
22297 else if (mode == DFmode)
22298 copysign_insn = gen_copysigndf3_const;
22299 else
22300 copysign_insn = gen_copysigntf3_const;
22302 emit_insn (copysign_insn (dest, op0, op1, mask));
22304 else
22306 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22308 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22309 mask = ix86_build_signbit_mask (vmode, 0, 0);
22311 if (mode == SFmode)
22312 copysign_insn = gen_copysignsf3_var;
22313 else if (mode == DFmode)
22314 copysign_insn = gen_copysigndf3_var;
22315 else
22316 copysign_insn = gen_copysigntf3_var;
22318 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22322 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22323 be a constant, and so has already been expanded into a vector constant. */
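/* Worked example (illustrative): copysign (3.0, y) in DFmode computes
(y & 0x8000000000000000) | 0x4008000000000000, i.e. an AND with the
sign-bit mask followed by an IOR of the magnitude of the constant.  */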
22325 void
22326 ix86_split_copysign_const (rtx operands[])
22328 machine_mode mode, vmode;
22329 rtx dest, op0, mask, x;
22331 dest = operands[0];
22332 op0 = operands[1];
22333 mask = operands[3];
22335 mode = GET_MODE (dest);
22336 vmode = GET_MODE (mask);
22338 dest = lowpart_subreg (vmode, dest, mode);
22339 x = gen_rtx_AND (vmode, dest, mask);
22340 emit_insn (gen_rtx_SET (dest, x));
22342 if (op0 != CONST0_RTX (vmode))
22344 x = gen_rtx_IOR (vmode, dest, op0);
22345 emit_insn (gen_rtx_SET (dest, x));
22349 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22350 so we have to do two masks. */
22352 void
22353 ix86_split_copysign_var (rtx operands[])
22355 machine_mode mode, vmode;
22356 rtx dest, scratch, op0, op1, mask, nmask, x;
22358 dest = operands[0];
22359 scratch = operands[1];
22360 op0 = operands[2];
22361 op1 = operands[3];
22362 nmask = operands[4];
22363 mask = operands[5];
22365 mode = GET_MODE (dest);
22366 vmode = GET_MODE (mask);
22368 if (rtx_equal_p (op0, op1))
22370 /* Shouldn't happen often (it's useless, obviously), but when it does
22371 we'd generate incorrect code if we continue below. */
22372 emit_move_insn (dest, op0);
22373 return;
22376 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
22378 gcc_assert (REGNO (op1) == REGNO (scratch));
22380 x = gen_rtx_AND (vmode, scratch, mask);
22381 emit_insn (gen_rtx_SET (scratch, x));
22383 dest = mask;
22384 op0 = lowpart_subreg (vmode, op0, mode);
22385 x = gen_rtx_NOT (vmode, dest);
22386 x = gen_rtx_AND (vmode, x, op0);
22387 emit_insn (gen_rtx_SET (dest, x));
22389 else
22391 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
22393 x = gen_rtx_AND (vmode, scratch, mask);
22395 else /* alternative 2,4 */
22397 gcc_assert (REGNO (mask) == REGNO (scratch));
22398 op1 = lowpart_subreg (vmode, op1, mode);
22399 x = gen_rtx_AND (vmode, scratch, op1);
22401 emit_insn (gen_rtx_SET (scratch, x));
22403 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
22405 dest = lowpart_subreg (vmode, op0, mode);
22406 x = gen_rtx_AND (vmode, dest, nmask);
22408 else /* alternative 3,4 */
22410 gcc_assert (REGNO (nmask) == REGNO (dest));
22411 dest = nmask;
22412 op0 = lowpart_subreg (vmode, op0, mode);
22413 x = gen_rtx_AND (vmode, dest, op0);
22415 emit_insn (gen_rtx_SET (dest, x));
22418 x = gen_rtx_IOR (vmode, dest, scratch);
22419 emit_insn (gen_rtx_SET (dest, x));
22422 /* Return TRUE or FALSE depending on whether the first SET in INSN
22423 has source and destination with matching CC modes, and whether the
22424 CC mode is at least as constrained as REQ_MODE. */
22426 bool
22427 ix86_match_ccmode (rtx insn, machine_mode req_mode)
22429 rtx set;
22430 machine_mode set_mode;
22432 set = PATTERN (insn);
22433 if (GET_CODE (set) == PARALLEL)
22434 set = XVECEXP (set, 0, 0);
22435 gcc_assert (GET_CODE (set) == SET);
22436 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
22438 set_mode = GET_MODE (SET_DEST (set));
22439 switch (set_mode)
22441 case E_CCNOmode:
22442 if (req_mode != CCNOmode
22443 && (req_mode != CCmode
22444 || XEXP (SET_SRC (set), 1) != const0_rtx))
22445 return false;
22446 break;
22447 case E_CCmode:
22448 if (req_mode == CCGCmode)
22449 return false;
22450 /* FALLTHRU */
22451 case E_CCGCmode:
22452 if (req_mode == CCGOCmode || req_mode == CCNOmode)
22453 return false;
22454 /* FALLTHRU */
22455 case E_CCGOCmode:
22456 if (req_mode == CCZmode)
22457 return false;
22458 /* FALLTHRU */
22459 case E_CCZmode:
22460 break;
22462 case E_CCGZmode:
22464 case E_CCAmode:
22465 case E_CCCmode:
22466 case E_CCOmode:
22467 case E_CCPmode:
22468 case E_CCSmode:
22469 if (set_mode != req_mode)
22470 return false;
22471 break;
22473 default:
22474 gcc_unreachable ();
22477 return GET_MODE (SET_SRC (set)) == set_mode;
22480 /* Generate insn patterns to do an integer compare of OPERANDS. */
22482 static rtx
22483 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22485 machine_mode cmpmode;
22486 rtx tmp, flags;
22488 cmpmode = SELECT_CC_MODE (code, op0, op1);
22489 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22491 /* This is very simple, but making the interface the same as in the
22492 FP case makes the rest of the code easier. */
22493 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22494 emit_insn (gen_rtx_SET (flags, tmp));
22496 /* Return the test that should be put into the flags user, i.e.
22497 the bcc, scc, or cmov instruction. */
22498 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22501 /* Figure out whether to use unordered fp comparisons. */
22503 static bool
22504 ix86_unordered_fp_compare (enum rtx_code code)
22506 if (!TARGET_IEEE_FP)
22507 return false;
22509 switch (code)
22511 case GT:
22512 case GE:
22513 case LT:
22514 case LE:
22515 return false;
22517 case EQ:
22518 case NE:
22520 case LTGT:
22521 case UNORDERED:
22522 case ORDERED:
22523 case UNLT:
22524 case UNLE:
22525 case UNGT:
22526 case UNGE:
22527 case UNEQ:
22528 return true;
22530 default:
22531 gcc_unreachable ();
22535 machine_mode
22536 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22538 machine_mode mode = GET_MODE (op0);
22540 if (SCALAR_FLOAT_MODE_P (mode))
22542 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22543 return CCFPmode;
22546 switch (code)
22548 /* Only zero flag is needed. */
22549 case EQ: /* ZF=0 */
22550 case NE: /* ZF!=0 */
22551 return CCZmode;
22552 /* Codes needing carry flag. */
22553 case GEU: /* CF=0 */
22554 case LTU: /* CF=1 */
22555 /* Detect overflow checks. They need just the carry flag. */
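/* For illustration: the canonical unsigned overflow idiom
if ((x + y) < x) ... compares a PLUS against one of its own operands;
only the carry flag is meaningful for it, hence CCCmode.  */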
22556 if (GET_CODE (op0) == PLUS
22557 && (rtx_equal_p (op1, XEXP (op0, 0))
22558 || rtx_equal_p (op1, XEXP (op0, 1))))
22559 return CCCmode;
22560 else
22561 return CCmode;
22562 case GTU: /* CF=0 & ZF=0 */
22563 case LEU: /* CF=1 | ZF=1 */
22564 return CCmode;
22565 /* Codes possibly doable only with sign flag when
22566 comparing against zero. */
22567 case GE: /* SF=OF or SF=0 */
22568 case LT: /* SF<>OF or SF=1 */
22569 if (op1 == const0_rtx)
22570 return CCGOCmode;
22571 else
22572 /* For other cases the carry flag is not required. */
22573 return CCGCmode;
22574 /* Codes doable only with the sign flag when comparing
22575 against zero, but we miss the jump instruction for it,
22576 so we need to use relational tests against overflow,
22577 which thus needs to be zero. */
22578 case GT: /* ZF=0 & SF=OF */
22579 case LE: /* ZF=1 | SF<>OF */
22580 if (op1 == const0_rtx)
22581 return CCNOmode;
22582 else
22583 return CCGCmode;
22584 /* The strcmp pattern does (use flags) and combine may ask us for the
22585 proper mode. */
22586 case USE:
22587 return CCmode;
22588 default:
22589 gcc_unreachable ();
22593 /* Return the fixed registers used for condition codes. */
22595 static bool
22596 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22598 *p1 = FLAGS_REG;
22599 *p2 = FPSR_REG;
22600 return true;
22603 /* If two condition code modes are compatible, return a condition code
22604 mode which is compatible with both. Otherwise, return
22605 VOIDmode. */
22607 static machine_mode
22608 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22610 if (m1 == m2)
22611 return m1;
22613 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22614 return VOIDmode;
22616 if ((m1 == CCGCmode && m2 == CCGOCmode)
22617 || (m1 == CCGOCmode && m2 == CCGCmode))
22618 return CCGCmode;
22620 if ((m1 == CCNOmode && m2 == CCGOCmode)
22621 || (m1 == CCGOCmode && m2 == CCNOmode))
22622 return CCNOmode;
22624 if (m1 == CCZmode
22625 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
22626 return m2;
22627 else if (m2 == CCZmode
22628 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
22629 return m1;
22631 switch (m1)
22633 default:
22634 gcc_unreachable ();
22636 case E_CCmode:
22637 case E_CCGCmode:
22638 case E_CCGOCmode:
22639 case E_CCNOmode:
22640 case E_CCAmode:
22641 case E_CCCmode:
22642 case E_CCOmode:
22643 case E_CCPmode:
22644 case E_CCSmode:
22645 case E_CCZmode:
22646 switch (m2)
22648 default:
22649 return VOIDmode;
22651 case E_CCmode:
22652 case E_CCGCmode:
22653 case E_CCGOCmode:
22654 case E_CCNOmode:
22655 case E_CCAmode:
22656 case E_CCCmode:
22657 case E_CCOmode:
22658 case E_CCPmode:
22659 case E_CCSmode:
22660 case E_CCZmode:
22661 return CCmode;
22664 case E_CCFPmode:
22665 /* These are only compatible with themselves, which we already
22666 checked above. */
22667 return VOIDmode;
22672 /* Return a comparison we can do that is equivalent to
22673 swap_condition (code), apart possibly from orderedness.
22674 But never change orderedness if TARGET_IEEE_FP, returning
22675 UNKNOWN in that case if necessary. */
22677 static enum rtx_code
22678 ix86_fp_swap_condition (enum rtx_code code)
22680 switch (code)
22682 case GT: /* GTU - CF=0 & ZF=0 */
22683 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22684 case GE: /* GEU - CF=0 */
22685 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22686 case UNLT: /* LTU - CF=1 */
22687 return TARGET_IEEE_FP ? UNKNOWN : GT;
22688 case UNLE: /* LEU - CF=1 | ZF=1 */
22689 return TARGET_IEEE_FP ? UNKNOWN : GE;
22690 default:
22691 return swap_condition (code);
22695 /* Return the cost of comparison CODE using the best strategy for performance.
22696 All following functions use the number of instructions as the cost metric.
22697 In the future this should be tweaked to compute bytes for optimize_size and
22698 take into account the performance of various instructions on various CPUs. */
22700 static int
22701 ix86_fp_comparison_cost (enum rtx_code code)
22703 int arith_cost;
22705 /* The cost of code using bit-twiddling on %ah. */
22706 switch (code)
22708 case UNLE:
22709 case UNLT:
22710 case LTGT:
22711 case GT:
22712 case GE:
22713 case UNORDERED:
22714 case ORDERED:
22715 case UNEQ:
22716 arith_cost = 4;
22717 break;
22718 case LT:
22719 case NE:
22720 case EQ:
22721 case UNGE:
22722 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22723 break;
22724 case LE:
22725 case UNGT:
22726 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22727 break;
22728 default:
22729 gcc_unreachable ();
22732 switch (ix86_fp_comparison_strategy (code))
22734 case IX86_FPCMP_COMI:
22735 return arith_cost > 4 ? 3 : 2;
22736 case IX86_FPCMP_SAHF:
22737 return arith_cost > 4 ? 4 : 3;
22738 default:
22739 return arith_cost;
22743 /* Return the strategy to use for floating-point. We assume that fcomi is always
22744 preferable where available, since that is also true when looking at size
22745 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22747 enum ix86_fpcmp_strategy
22748 ix86_fp_comparison_strategy (enum rtx_code)
22750 /* Do fcomi/sahf based test when profitable. */
22752 if (TARGET_CMOVE)
22753 return IX86_FPCMP_COMI;
22755 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22756 return IX86_FPCMP_SAHF;
22758 return IX86_FPCMP_ARITH;
22761 /* Swap, force into registers, or otherwise massage the two operands
22762 to a fp comparison. The operands are updated in place; the new
22763 comparison code is returned. */
22765 static enum rtx_code
22766 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22768 bool unordered_compare = ix86_unordered_fp_compare (code);
22769 rtx op0 = *pop0, op1 = *pop1;
22770 machine_mode op_mode = GET_MODE (op0);
22771 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22773 /* All of the unordered compare instructions only work on registers.
22774 The same is true of the fcomi compare instructions. The XFmode
22775 compare instructions require registers except when comparing
22776 against zero or when converting operand 1 from fixed point to
22777 floating point. */
22779 if (!is_sse
22780 && (unordered_compare
22781 || (op_mode == XFmode
22782 && ! (standard_80387_constant_p (op0) == 1
22783 || standard_80387_constant_p (op1) == 1)
22784 && GET_CODE (op1) != FLOAT)
22785 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22787 op0 = force_reg (op_mode, op0);
22788 op1 = force_reg (op_mode, op1);
22790 else
22792 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22793 things around if they appear profitable, otherwise force op0
22794 into a register. */
22796 if (standard_80387_constant_p (op0) == 0
22797 || (MEM_P (op0)
22798 && ! (standard_80387_constant_p (op1) == 0
22799 || MEM_P (op1))))
22801 enum rtx_code new_code = ix86_fp_swap_condition (code);
22802 if (new_code != UNKNOWN)
22804 std::swap (op0, op1);
22805 code = new_code;
22809 if (!REG_P (op0))
22810 op0 = force_reg (op_mode, op0);
22812 if (CONSTANT_P (op1))
22814 int tmp = standard_80387_constant_p (op1);
22815 if (tmp == 0)
22816 op1 = validize_mem (force_const_mem (op_mode, op1));
22817 else if (tmp == 1)
22819 if (TARGET_CMOVE)
22820 op1 = force_reg (op_mode, op1);
22822 else
22823 op1 = force_reg (op_mode, op1);
22827 /* Try to rearrange the comparison to make it cheaper. */
22828 if (ix86_fp_comparison_cost (code)
22829 > ix86_fp_comparison_cost (swap_condition (code))
22830 && (REG_P (op1) || can_create_pseudo_p ()))
22832 std::swap (op0, op1);
22833 code = swap_condition (code);
22834 if (!REG_P (op0))
22835 op0 = force_reg (op_mode, op0);
22838 *pop0 = op0;
22839 *pop1 = op1;
22840 return code;
22843 /* Convert comparison codes we use to represent FP comparison to integer
22844 code that will result in a proper branch. Return UNKNOWN if no such code
22845 is available. */
22847 enum rtx_code
22848 ix86_fp_compare_code_to_integer (enum rtx_code code)
22850 switch (code)
22852 case GT:
22853 return GTU;
22854 case GE:
22855 return GEU;
22856 case ORDERED:
22857 case UNORDERED:
22858 return code;
22859 case UNEQ:
22860 return EQ;
22861 case UNLT:
22862 return LTU;
22863 case UNLE:
22864 return LEU;
22865 case LTGT:
22866 return NE;
22867 default:
22868 return UNKNOWN;
22872 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22874 static rtx
22875 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22877 bool unordered_compare = ix86_unordered_fp_compare (code);
22878 machine_mode intcmp_mode;
22879 rtx tmp, tmp2;
22881 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22883 /* Do fcomi/sahf based test when profitable. */
22884 switch (ix86_fp_comparison_strategy (code))
22886 case IX86_FPCMP_COMI:
22887 intcmp_mode = CCFPmode;
22888 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22889 if (unordered_compare)
22890 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22891 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22892 break;
22894 case IX86_FPCMP_SAHF:
22895 intcmp_mode = CCFPmode;
22896 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22897 if (unordered_compare)
22898 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22899 tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22900 if (!scratch)
22901 scratch = gen_reg_rtx (HImode);
22902 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22903 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22904 break;
22906 case IX86_FPCMP_ARITH:
22907 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22908 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22909 if (unordered_compare)
22910 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22911 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22912 if (!scratch)
22913 scratch = gen_reg_rtx (HImode);
22914 emit_insn (gen_rtx_SET (scratch, tmp));
22916 /* In the unordered case, we have to check C2 for NaN's, which
22917 doesn't happen to work out to anything nice combination-wise.
22918 So do some bit twiddling on the value we've got in AH to come
22919 up with an appropriate set of condition codes. */
22921 intcmp_mode = CCNOmode;
22922 switch (code)
22924 case GT:
22925 case UNGT:
22926 if (code == GT || !TARGET_IEEE_FP)
22928 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22929 code = EQ;
22931 else
22933 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22934 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22935 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22936 intcmp_mode = CCmode;
22937 code = GEU;
22939 break;
22940 case LT:
22941 case UNLT:
22942 if (code == LT && TARGET_IEEE_FP)
22944 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22945 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22946 intcmp_mode = CCmode;
22947 code = EQ;
22949 else
22951 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22952 code = NE;
22954 break;
22955 case GE:
22956 case UNGE:
22957 if (code == GE || !TARGET_IEEE_FP)
22959 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22960 code = EQ;
22962 else
22964 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22965 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22966 code = NE;
22968 break;
22969 case LE:
22970 case UNLE:
22971 if (code == LE && TARGET_IEEE_FP)
22973 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22974 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22975 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22976 intcmp_mode = CCmode;
22977 code = LTU;
22979 else
22981 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22982 code = NE;
22984 break;
22985 case EQ:
22986 case UNEQ:
22987 if (code == EQ && TARGET_IEEE_FP)
22989 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22990 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22991 intcmp_mode = CCmode;
22992 code = EQ;
22994 else
22996 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22997 code = NE;
22999 break;
23000 case NE:
23001 case LTGT:
23002 if (code == NE && TARGET_IEEE_FP)
23004 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23005 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
23006 GEN_INT (0x40)));
23007 code = NE;
23009 else
23011 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23012 code = EQ;
23014 break;
23016 case UNORDERED:
23017 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23018 code = NE;
23019 break;
23020 case ORDERED:
23021 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23022 code = EQ;
23023 break;
23025 default:
23026 gcc_unreachable ();
23028 break;
23030 default:
23031 gcc_unreachable ();
23034 /* Return the test that should be put into the flags user, i.e.
23035 the bcc, scc, or cmov instruction. */
23036 return gen_rtx_fmt_ee (code, VOIDmode,
23037 gen_rtx_REG (intcmp_mode, FLAGS_REG),
23038 const0_rtx);
23041 static rtx
23042 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
23044 rtx ret;
23046 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
23047 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
23049 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
23051 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
23052 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23054 else
23055 ret = ix86_expand_int_compare (code, op0, op1);
23057 return ret;
23060 void
23061 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
23063 machine_mode mode = GET_MODE (op0);
23064 rtx tmp;
23066 /* Handle the special case of a vector comparison with a boolean result;
23067 transform it using the ptest instruction. */
23068 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
23070 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
23071 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
23073 gcc_assert (code == EQ || code == NE);
23074 /* Generate XOR since we can't check that one operand is zero vector. */
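/* PTEST with both operands equal to TMP sets ZF exactly when TMP is
   the all-zero vector, i.e. when op0 and op1 are bitwise equal.  */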
23075 tmp = gen_reg_rtx (mode);
23076 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
23077 tmp = gen_lowpart (p_mode, tmp);
23078 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
23079 gen_rtx_UNSPEC (CCmode,
23080 gen_rtvec (2, tmp, tmp),
23081 UNSPEC_PTEST)));
23082 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
23083 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23084 gen_rtx_LABEL_REF (VOIDmode, label),
23085 pc_rtx);
23086 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23087 return;
23090 switch (mode)
23092 case E_SFmode:
23093 case E_DFmode:
23094 case E_XFmode:
23095 case E_QImode:
23096 case E_HImode:
23097 case E_SImode:
23098 simple:
23099 tmp = ix86_expand_compare (code, op0, op1);
23100 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23101 gen_rtx_LABEL_REF (VOIDmode, label),
23102 pc_rtx);
23103 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23104 return;
23106 case E_DImode:
23107 if (TARGET_64BIT)
23108 goto simple;
23109 /* For 32-bit targets a DImode comparison may be performed in
23110 SSE registers. To allow this we should avoid splitting
23111 to SImode, which is achieved by doing the xor in DImode
23112 and then comparing with zero (which is recognized by the
23113 STV pass). We don't compare using xor when optimizing
23114 for size. */
23115 if (!optimize_insn_for_size_p ()
23116 && TARGET_STV
23117 && (code == EQ || code == NE))
23119 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
23120 op1 = const0_rtx;
23122 /* FALLTHRU */
23123 case E_TImode:
23124 /* Expand DImode branch into multiple compare+branch. */
23126 rtx lo[2], hi[2];
23127 rtx_code_label *label2;
23128 enum rtx_code code1, code2, code3;
23129 machine_mode submode;
23131 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
23133 std::swap (op0, op1);
23134 code = swap_condition (code);
23137 split_double_mode (mode, &op0, 1, lo+0, hi+0);
23138 split_double_mode (mode, &op1, 1, lo+1, hi+1);
23140 submode = mode == DImode ? SImode : DImode;
23142 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
23143 avoid two branches. This costs one extra insn, so disable when
23144 optimizing for size. */
23146 if ((code == EQ || code == NE)
23147 && (!optimize_insn_for_size_p ()
23148 || hi[1] == const0_rtx || lo[1] == const0_rtx))
23150 rtx xor0, xor1;
23152 xor1 = hi[0];
23153 if (hi[1] != const0_rtx)
23154 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
23155 NULL_RTX, 0, OPTAB_WIDEN);
23157 xor0 = lo[0];
23158 if (lo[1] != const0_rtx)
23159 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
23160 NULL_RTX, 0, OPTAB_WIDEN);
23162 tmp = expand_binop (submode, ior_optab, xor1, xor0,
23163 NULL_RTX, 0, OPTAB_WIDEN);
23165 ix86_expand_branch (code, tmp, const0_rtx, label);
23166 return;
23169 /* Otherwise, if we are doing less-than or greater-or-equal-than,
23170 op1 is a constant and the low word is zero, then we can just
23171 examine the high word. Similarly for low word -1 and
23172 less-or-equal-than or greater-than. */
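/* For instance, if op1 == 0x500000000 (low word zero), then a < op1
   holds exactly when hi(a) < 5, so only the high words need to be
   compared; the low-word -1 cases below work analogously.  */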
23174 if (CONST_INT_P (hi[1]))
23175 switch (code)
23177 case LT: case LTU: case GE: case GEU:
23178 if (lo[1] == const0_rtx)
23180 ix86_expand_branch (code, hi[0], hi[1], label);
23181 return;
23183 break;
23184 case LE: case LEU: case GT: case GTU:
23185 if (lo[1] == constm1_rtx)
23187 ix86_expand_branch (code, hi[0], hi[1], label);
23188 return;
23190 break;
23191 default:
23192 break;
23195 /* Emulate comparisons that do not depend on Zero flag with
23196 double-word subtraction. Note that only Overflow, Sign
23197 and Carry flags are valid, so swap arguments and condition
23198 of comparisons that would otherwise test Zero flag. */
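/* The cmp of the low words followed by sbb of the high words computes
   the double-word difference op0 - op1, so the resulting Sign,
   Overflow and Carry flags describe the full-width ordering, while the
   Zero flag only reflects the high half -- which is why LE/GT style
   comparisons are swapped first.  */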
23200 switch (code)
23202 case LE: case LEU: case GT: case GTU:
23203 std::swap (lo[0], lo[1]);
23204 std::swap (hi[0], hi[1]);
23205 code = swap_condition (code);
23206 /* FALLTHRU */
23208 case LT: case LTU: case GE: case GEU:
23210 rtx (*cmp_insn) (rtx, rtx);
23211 rtx (*sbb_insn) (rtx, rtx, rtx);
23212 bool uns = (code == LTU || code == GEU);
23214 if (TARGET_64BIT)
23216 cmp_insn = gen_cmpdi_1;
23217 sbb_insn
23218 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
23220 else
23222 cmp_insn = gen_cmpsi_1;
23223 sbb_insn
23224 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
23227 if (!nonimmediate_operand (lo[0], submode))
23228 lo[0] = force_reg (submode, lo[0]);
23229 if (!x86_64_general_operand (lo[1], submode))
23230 lo[1] = force_reg (submode, lo[1]);
23232 if (!register_operand (hi[0], submode))
23233 hi[0] = force_reg (submode, hi[0]);
23234 if ((uns && !nonimmediate_operand (hi[1], submode))
23235 || (!uns && !x86_64_general_operand (hi[1], submode)))
23236 hi[1] = force_reg (submode, hi[1]);
23238 emit_insn (cmp_insn (lo[0], lo[1]));
23239 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
23241 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
23243 ix86_expand_branch (code, tmp, const0_rtx, label);
23244 return;
23247 default:
23248 break;
23251 /* Otherwise, we need two or three jumps. */
23253 label2 = gen_label_rtx ();
23255 code1 = code;
23256 code2 = swap_condition (code);
23257 code3 = unsigned_condition (code);
23259 switch (code)
23261 case LT: case GT: case LTU: case GTU:
23262 break;
23264 case LE: code1 = LT; code2 = GT; break;
23265 case GE: code1 = GT; code2 = LT; break;
23266 case LEU: code1 = LTU; code2 = GTU; break;
23267 case GEU: code1 = GTU; code2 = LTU; break;
23269 case EQ: code1 = UNKNOWN; code2 = NE; break;
23270 case NE: code2 = UNKNOWN; break;
23272 default:
23273 gcc_unreachable ();
23277 * a < b =>
23278 * if (hi(a) < hi(b)) goto true;
23279 * if (hi(a) > hi(b)) goto false;
23280 * if (lo(a) < lo(b)) goto true;
23281 * false:
23284 if (code1 != UNKNOWN)
23285 ix86_expand_branch (code1, hi[0], hi[1], label);
23286 if (code2 != UNKNOWN)
23287 ix86_expand_branch (code2, hi[0], hi[1], label2);
23289 ix86_expand_branch (code3, lo[0], lo[1], label);
23291 if (code2 != UNKNOWN)
23292 emit_label (label2);
23293 return;
23296 default:
23297 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23298 goto simple;
23302 void
23303 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23305 rtx ret;
23307 gcc_assert (GET_MODE (dest) == QImode);
23309 ret = ix86_expand_compare (code, op0, op1);
23310 PUT_MODE (ret, QImode);
23311 emit_insn (gen_rtx_SET (dest, ret));
23314 /* Expand comparison setting or clearing carry flag. Return true when
23315 successful and set pop for the operation. */
23316 static bool
23317 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23319 machine_mode mode =
23320 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23322 /* Do not handle double-mode compares that go through special path. */
23323 if (mode == (TARGET_64BIT ? TImode : DImode))
23324 return false;
23326 if (SCALAR_FLOAT_MODE_P (mode))
23328 rtx compare_op;
23329 rtx_insn *compare_seq;
23331 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23333 /* Shortcut: following common codes never translate
23334 into carry flag compares. */
23335 if (code == EQ || code == NE || code == UNEQ || code == LTGT
23336 || code == ORDERED || code == UNORDERED)
23337 return false;
23339 /* These comparisons require the zero flag; swap operands so they don't. */
23340 if ((code == GT || code == UNLE || code == LE || code == UNGT)
23341 && !TARGET_IEEE_FP)
23343 std::swap (op0, op1);
23344 code = swap_condition (code);
23347 /* Try to expand the comparison and verify that we end up with
23348 a carry-flag-based comparison. This fails only when we decide
23349 to expand the comparison using arithmetic, which is not a very
23350 common scenario. */
23351 start_sequence ();
23352 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23353 compare_seq = get_insns ();
23354 end_sequence ();
23356 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
23357 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23358 else
23359 code = GET_CODE (compare_op);
23361 if (code != LTU && code != GEU)
23362 return false;
23364 emit_insn (compare_seq);
23365 *pop = compare_op;
23366 return true;
23369 if (!INTEGRAL_MODE_P (mode))
23370 return false;
23372 switch (code)
23374 case LTU:
23375 case GEU:
23376 break;
23378 /* Convert a==0 into (unsigned)a<1. */
23379 case EQ:
23380 case NE:
23381 if (op1 != const0_rtx)
23382 return false;
23383 op1 = const1_rtx;
23384 code = (code == EQ ? LTU : GEU);
23385 break;
23387 /* Convert a>b into b<a or a>=b+1. */
23388 case GTU:
23389 case LEU:
23390 if (CONST_INT_P (op1))
23392 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23393 /* Bail out on overflow. We still can swap operands but that
23394 would force loading of the constant into register. */
23395 if (op1 == const0_rtx
23396 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23397 return false;
23398 code = (code == GTU ? GEU : LTU);
23400 else
23402 std::swap (op0, op1);
23403 code = (code == GTU ? LTU : GEU);
23405 break;
23407 /* Convert a>=0 into (unsigned)a<0x80000000. */
23408 case LT:
23409 case GE:
23410 if (mode == DImode || op1 != const0_rtx)
23411 return false;
23412 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23413 code = (code == LT ? GEU : LTU);
23414 break;
23415 case LE:
23416 case GT:
23417 if (mode == DImode || op1 != constm1_rtx)
23418 return false;
23419 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23420 code = (code == LE ? GEU : LTU);
23421 break;
23423 default:
23424 return false;
23426 /* Swapping operands may cause constant to appear as first operand. */
23427 if (!nonimmediate_operand (op0, VOIDmode))
23429 if (!can_create_pseudo_p ())
23430 return false;
23431 op0 = force_reg (mode, op0);
23433 *pop = ix86_expand_compare (code, op0, op1);
23434 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
23435 return true;
23438 bool
23439 ix86_expand_int_movcc (rtx operands[])
23441 enum rtx_code code = GET_CODE (operands[1]), compare_code;
23442 rtx_insn *compare_seq;
23443 rtx compare_op;
23444 machine_mode mode = GET_MODE (operands[0]);
23445 bool sign_bit_compare_p = false;
23446 rtx op0 = XEXP (operands[1], 0);
23447 rtx op1 = XEXP (operands[1], 1);
23449 if (GET_MODE (op0) == TImode
23450 || (GET_MODE (op0) == DImode
23451 && !TARGET_64BIT))
23452 return false;
23454 start_sequence ();
23455 compare_op = ix86_expand_compare (code, op0, op1);
23456 compare_seq = get_insns ();
23457 end_sequence ();
23459 compare_code = GET_CODE (compare_op);
23461 if ((op1 == const0_rtx && (code == GE || code == LT))
23462 || (op1 == constm1_rtx && (code == GT || code == LE)))
23463 sign_bit_compare_p = true;
23465 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
23466 HImode insns, we'd be swallowed in word prefix ops. */
23468 if ((mode != HImode || TARGET_FAST_PREFIX)
23469 && (mode != (TARGET_64BIT ? TImode : DImode))
23470 && CONST_INT_P (operands[2])
23471 && CONST_INT_P (operands[3]))
23473 rtx out = operands[0];
23474 HOST_WIDE_INT ct = INTVAL (operands[2]);
23475 HOST_WIDE_INT cf = INTVAL (operands[3]);
23476 HOST_WIDE_INT diff;
23478 diff = ct - cf;
23479 /* Sign bit compares are better done using shifts than we do by using
23480 sbb. */
23481 if (sign_bit_compare_p
23482 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
23484 /* Detect overlap between destination and compare sources. */
23485 rtx tmp = out;
23487 if (!sign_bit_compare_p)
23489 rtx flags;
23490 bool fpcmp = false;
23492 compare_code = GET_CODE (compare_op);
23494 flags = XEXP (compare_op, 0);
23496 if (GET_MODE (flags) == CCFPmode)
23498 fpcmp = true;
23499 compare_code
23500 = ix86_fp_compare_code_to_integer (compare_code);
23503 /* To simplify the rest of the code, restrict to the GEU case. */
23504 if (compare_code == LTU)
23506 std::swap (ct, cf);
23507 compare_code = reverse_condition (compare_code);
23508 code = reverse_condition (code);
23510 else
23512 if (fpcmp)
23513 PUT_CODE (compare_op,
23514 reverse_condition_maybe_unordered
23515 (GET_CODE (compare_op)));
23516 else
23517 PUT_CODE (compare_op,
23518 reverse_condition (GET_CODE (compare_op)));
23520 diff = ct - cf;
23522 if (reg_overlap_mentioned_p (out, op0)
23523 || reg_overlap_mentioned_p (out, op1))
23524 tmp = gen_reg_rtx (mode);
23526 if (mode == DImode)
23527 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23528 else
23529 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
23530 flags, compare_op));
23532 else
23534 if (code == GT || code == GE)
23535 code = reverse_condition (code);
23536 else
23538 std::swap (ct, cf);
23539 diff = ct - cf;
23541 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23544 if (diff == 1)
23547 * cmpl op0,op1
23548 * sbbl dest,dest
23549 * [addl dest, ct]
23551 * Size 5 - 8.
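/* "sbbl dest,dest" leaves dest = -1 when the carry flag is set and 0
   otherwise; since ct - cf == 1 here, adding ct afterwards turns that
   mask into cf or ct without a branch.  */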
23553 if (ct)
23554 tmp = expand_simple_binop (mode, PLUS,
23555 tmp, GEN_INT (ct),
23556 copy_rtx (tmp), 1, OPTAB_DIRECT);
23558 else if (cf == -1)
23561 * cmpl op0,op1
23562 * sbbl dest,dest
23563 * orl $ct, dest
23565 * Size 8.
23567 tmp = expand_simple_binop (mode, IOR,
23568 tmp, GEN_INT (ct),
23569 copy_rtx (tmp), 1, OPTAB_DIRECT);
23571 else if (diff == -1 && ct)
23574 * cmpl op0,op1
23575 * sbbl dest,dest
23576 * notl dest
23577 * [addl dest, cf]
23579 * Size 8 - 11.
23581 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23582 if (cf)
23583 tmp = expand_simple_binop (mode, PLUS,
23584 copy_rtx (tmp), GEN_INT (cf),
23585 copy_rtx (tmp), 1, OPTAB_DIRECT);
23587 else
23590 * cmpl op0,op1
23591 * sbbl dest,dest
23592 * [notl dest]
23593 * andl cf - ct, dest
23594 * [addl dest, ct]
23596 * Size 8 - 11.
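/* At this point the value in TMP is an all-ones or all-zeros mask, so
   ANDing it with (cf - ct) and then adding ct yields cf when the mask
   is set and ct when it is clear.  */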
23599 if (cf == 0)
23601 cf = ct;
23602 ct = 0;
23603 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23606 tmp = expand_simple_binop (mode, AND,
23607 copy_rtx (tmp),
23608 gen_int_mode (cf - ct, mode),
23609 copy_rtx (tmp), 1, OPTAB_DIRECT);
23610 if (ct)
23611 tmp = expand_simple_binop (mode, PLUS,
23612 copy_rtx (tmp), GEN_INT (ct),
23613 copy_rtx (tmp), 1, OPTAB_DIRECT);
23616 if (!rtx_equal_p (tmp, out))
23617 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23619 return true;
23622 if (diff < 0)
23624 machine_mode cmp_mode = GET_MODE (op0);
23625 enum rtx_code new_code;
23627 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23629 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23631 /* We may be reversing an unordered compare to a normal compare,
23632 which is not valid in general (we may convert a non-trapping
23633 condition to a trapping one); however, on i386 we currently
23634 emit all comparisons unordered. */
23635 new_code = reverse_condition_maybe_unordered (code);
23637 else
23638 new_code = ix86_reverse_condition (code, cmp_mode);
23639 if (new_code != UNKNOWN)
23641 std::swap (ct, cf);
23642 diff = -diff;
23643 code = new_code;
23647 compare_code = UNKNOWN;
23648 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23649 && CONST_INT_P (op1))
23651 if (op1 == const0_rtx
23652 && (code == LT || code == GE))
23653 compare_code = code;
23654 else if (op1 == constm1_rtx)
23656 if (code == LE)
23657 compare_code = LT;
23658 else if (code == GT)
23659 compare_code = GE;
23663 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23664 if (compare_code != UNKNOWN
23665 && GET_MODE (op0) == GET_MODE (out)
23666 && (cf == -1 || ct == -1))
23668 /* If lea code below could be used, only optimize
23669 if it results in a 2 insn sequence. */
23671 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23672 || diff == 3 || diff == 5 || diff == 9)
23673 || (compare_code == LT && ct == -1)
23674 || (compare_code == GE && cf == -1))
23677 * notl op1 (if necessary)
23678 * sarl $31, op1
23679 * orl cf, op1
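/* emit_store_flag below produces -1 when the condition holds and 0
   otherwise (sarl $31 broadcasts the sign bit, with a preceding notl
   if necessary as noted above); ORing in cf then gives the required
   -1 or cf.  */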
23681 if (ct != -1)
23683 cf = ct;
23684 ct = -1;
23685 code = reverse_condition (code);
23688 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23690 out = expand_simple_binop (mode, IOR,
23691 out, GEN_INT (cf),
23692 out, 1, OPTAB_DIRECT);
23693 if (out != operands[0])
23694 emit_move_insn (operands[0], out);
23696 return true;
23701 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23702 || diff == 3 || diff == 5 || diff == 9)
23703 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23704 && (mode != DImode
23705 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23708 * xorl dest,dest
23709 * cmpl op1,op2
23710 * setcc dest
23711 * lea cf(dest*(ct-cf)),dest
23713 * Size 14.
23715 * This also catches the degenerate setcc-only case.
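/* For example, with ct = 5 and cf = 2 (diff = 3) the 0/1 setcc result
   x is mapped through 2 + x*3, which fits a single
   "lea 2(x,x,2), dest".  */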
23718 rtx tmp;
23719 int nops;
23721 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23723 nops = 0;
23724 /* On x86_64 the lea instruction operates on Pmode, so we need
23725 to do the arithmetic in the proper mode to match. */
23726 if (diff == 1)
23727 tmp = copy_rtx (out);
23728 else
23730 rtx out1;
23731 out1 = copy_rtx (out);
23732 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23733 nops++;
23734 if (diff & 1)
23736 tmp = gen_rtx_PLUS (mode, tmp, out1);
23737 nops++;
23740 if (cf != 0)
23742 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23743 nops++;
23745 if (!rtx_equal_p (tmp, out))
23747 if (nops == 1)
23748 out = force_operand (tmp, copy_rtx (out));
23749 else
23750 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23752 if (!rtx_equal_p (out, operands[0]))
23753 emit_move_insn (operands[0], copy_rtx (out));
23755 return true;
23759 * General case: Jumpful:
23760 * xorl dest,dest cmpl op1, op2
23761 * cmpl op1, op2 movl ct, dest
23762 * setcc dest jcc 1f
23763 * decl dest movl cf, dest
23764 * andl (cf-ct),dest 1:
23765 * addl ct,dest
23767 * Size 20. Size 14.
23769 * This is reasonably steep, but branch mispredict costs are
23770 * high on modern cpus, so consider failing only if optimizing
23771 * for space.
23774 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23775 && BRANCH_COST (optimize_insn_for_speed_p (),
23776 false) >= 2)
23778 if (cf == 0)
23780 machine_mode cmp_mode = GET_MODE (op0);
23781 enum rtx_code new_code;
23783 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23785 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23787 /* We may be reversing an unordered compare to a normal compare,
23788 which is not valid in general (we may convert a non-trapping
23789 condition to a trapping one); however, on i386 we currently
23790 emit all comparisons unordered. */
23791 new_code = reverse_condition_maybe_unordered (code);
23793 else
23795 new_code = ix86_reverse_condition (code, cmp_mode);
23796 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23797 compare_code = reverse_condition (compare_code);
23800 if (new_code != UNKNOWN)
23802 cf = ct;
23803 ct = 0;
23804 code = new_code;
23808 if (compare_code != UNKNOWN)
23810 /* notl op1 (if needed)
23811 sarl $31, op1
23812 andl (cf-ct), op1
23813 addl ct, op1
23815 For x < 0 (resp. x <= -1) there will be no notl,
23816 so if possible swap the constants to get rid of the
23817 complement.
23818 True/false will be -1/0 while code below (store flag
23819 followed by decrement) is 0/-1, so the constants need
23820 to be exchanged once more. */
23822 if (compare_code == GE || !cf)
23824 code = reverse_condition (code);
23825 compare_code = LT;
23827 else
23828 std::swap (ct, cf);
23830 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23832 else
23834 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23836 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23837 constm1_rtx,
23838 copy_rtx (out), 1, OPTAB_DIRECT);
23841 out = expand_simple_binop (mode, AND, copy_rtx (out),
23842 gen_int_mode (cf - ct, mode),
23843 copy_rtx (out), 1, OPTAB_DIRECT);
23844 if (ct)
23845 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23846 copy_rtx (out), 1, OPTAB_DIRECT);
23847 if (!rtx_equal_p (out, operands[0]))
23848 emit_move_insn (operands[0], copy_rtx (out));
23850 return true;
23854 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23856 /* Try a few things more with specific constants and a variable. */
23858 optab op;
23859 rtx var, orig_out, out, tmp;
23861 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23862 return false;
23864 /* If one of the two operands is an interesting constant, load a
23865 constant with the above and mask it in with a logical operation. */
23867 if (CONST_INT_P (operands[2]))
23869 var = operands[3];
23870 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23871 operands[3] = constm1_rtx, op = and_optab;
23872 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23873 operands[3] = const0_rtx, op = ior_optab;
23874 else
23875 return false;
23877 else if (CONST_INT_P (operands[3]))
23879 var = operands[2];
23880 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23881 operands[2] = constm1_rtx, op = and_optab;
23882 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
23883 operands[2] = const0_rtx, op = ior_optab;
23884 else
23885 return false;
23887 else
23888 return false;
23890 orig_out = operands[0];
23891 tmp = gen_reg_rtx (mode);
23892 operands[0] = tmp;
23894 /* Recurse to get the constant loaded. */
23895 if (!ix86_expand_int_movcc (operands))
23896 return false;
23898 /* Mask in the interesting variable. */
23899 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23900 OPTAB_WIDEN);
23901 if (!rtx_equal_p (out, orig_out))
23902 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23904 return true;
23908 * For comparison with above,
23910 * movl cf,dest
23911 * movl ct,tmp
23912 * cmpl op1,op2
23913 * cmovcc tmp,dest
23915 * Size 15.
23918 if (! nonimmediate_operand (operands[2], mode))
23919 operands[2] = force_reg (mode, operands[2]);
23920 if (! nonimmediate_operand (operands[3], mode))
23921 operands[3] = force_reg (mode, operands[3]);
23923 if (! register_operand (operands[2], VOIDmode)
23924 && (mode == QImode
23925 || ! register_operand (operands[3], VOIDmode)))
23926 operands[2] = force_reg (mode, operands[2]);
23928 if (mode == QImode
23929 && ! register_operand (operands[3], VOIDmode))
23930 operands[3] = force_reg (mode, operands[3]);
23932 emit_insn (compare_seq);
23933 emit_insn (gen_rtx_SET (operands[0],
23934 gen_rtx_IF_THEN_ELSE (mode,
23935 compare_op, operands[2],
23936 operands[3])));
23937 return true;
23940 /* Swap, force into registers, or otherwise massage the two operands
23941 to an sse comparison with a mask result. Thus we differ a bit from
23942 ix86_prepare_fp_compare_args which expects to produce a flags result.
23944 The DEST operand exists to help determine whether to commute commutative
23945 operators. The POP0/POP1 operands are updated in place. The new
23946 comparison code is returned, or UNKNOWN if not implementable. */
23948 static enum rtx_code
23949 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23950 rtx *pop0, rtx *pop1)
23952 switch (code)
23954 case LTGT:
23955 case UNEQ:
23956 /* AVX supports all the needed comparisons. */
23957 if (TARGET_AVX)
23958 break;
23959 /* We have no LTGT as an operator. We could implement it with
23960 NE & ORDERED, but this requires an extra temporary. It's
23961 not clear that it's worth it. */
23962 return UNKNOWN;
23964 case LT:
23965 case LE:
23966 case UNGT:
23967 case UNGE:
23968 /* These are supported directly. */
23969 break;
23971 case EQ:
23972 case NE:
23973 case UNORDERED:
23974 case ORDERED:
23975 /* AVX has 3 operand comparisons, no need to swap anything. */
23976 if (TARGET_AVX)
23977 break;
23978 /* For commutative operators, try to canonicalize the destination
23979 operand to be first in the comparison - this helps reload to
23980 avoid extra moves. */
23981 if (!dest || !rtx_equal_p (dest, *pop1))
23982 break;
23983 /* FALLTHRU */
23985 case GE:
23986 case GT:
23987 case UNLE:
23988 case UNLT:
23989 /* These are not supported directly before AVX, and furthermore
23990 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23991 comparison operands to transform into something that is
23992 supported. */
23993 std::swap (*pop0, *pop1);
23994 code = swap_condition (code);
23995 break;
23997 default:
23998 gcc_unreachable ();
24001 return code;
24004 /* Detect conditional moves that exactly match min/max operational
24005 semantics. Note that this is IEEE safe, as long as we don't
24006 interchange the operands.
24008 Returns FALSE if this conditional move doesn't match a MIN/MAX,
24009 and TRUE if the operation is successful and instructions are emitted. */
24011 static bool
24012 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
24013 rtx cmp_op1, rtx if_true, rtx if_false)
24015 machine_mode mode;
24016 bool is_min;
24017 rtx tmp;
24019 if (code == LT)
24021 else if (code == UNGE)
24022 std::swap (if_true, if_false);
24023 else
24024 return false;
24026 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
24027 is_min = true;
24028 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
24029 is_min = false;
24030 else
24031 return false;
24033 mode = GET_MODE (dest);
24035 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
24036 but MODE may be a vector mode and thus not appropriate. */
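/* The SSE scalar/vector min and max instructions are not commutative
   when NaNs or signed zeros are involved (they return the second
   operand in those cases), so an UNSPEC is used here to keep the
   operand order fixed instead of the commutative SMIN/SMAX codes.  */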
24037 if (!flag_finite_math_only || flag_signed_zeros)
24039 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
24040 rtvec v;
24042 if_true = force_reg (mode, if_true);
24043 v = gen_rtvec (2, if_true, if_false);
24044 tmp = gen_rtx_UNSPEC (mode, v, u);
24046 else
24048 code = is_min ? SMIN : SMAX;
24049 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
24052 emit_insn (gen_rtx_SET (dest, tmp));
24053 return true;
24056 /* Expand an sse vector comparison. Return the register with the result. */
24058 static rtx
24059 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
24060 rtx op_true, rtx op_false)
24062 machine_mode mode = GET_MODE (dest);
24063 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
24065 /* In the general case the result of a comparison can differ from the operands' type. */
24066 machine_mode cmp_mode;
24068 /* In AVX512F the result of comparison is an integer mask. */
24069 bool maskcmp = false;
24070 rtx x;
24072 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
24074 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
24075 cmp_mode = int_mode_for_size (nbits, 0).require ();
24076 maskcmp = true;
24078 else
24079 cmp_mode = cmp_ops_mode;
24082 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
24083 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
24084 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
24086 if (optimize
24087 || (maskcmp && cmp_mode != mode)
24088 || (op_true && reg_overlap_mentioned_p (dest, op_true))
24089 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
24090 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
24092 /* Compare patterns for int modes are unspec in AVX512F only. */
24093 if (maskcmp && (code == GT || code == EQ))
24095 rtx (*gen)(rtx, rtx, rtx);
24097 switch (cmp_ops_mode)
24099 case E_V64QImode:
24100 gcc_assert (TARGET_AVX512BW);
24101 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
24102 break;
24103 case E_V32HImode:
24104 gcc_assert (TARGET_AVX512BW);
24105 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
24106 break;
24107 case E_V16SImode:
24108 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
24109 break;
24110 case E_V8DImode:
24111 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
24112 break;
24113 default:
24114 gen = NULL;
24117 if (gen)
24119 emit_insn (gen (dest, cmp_op0, cmp_op1));
24120 return dest;
24123 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
24125 if (cmp_mode != mode && !maskcmp)
24127 x = force_reg (cmp_ops_mode, x);
24128 convert_move (dest, x, false);
24130 else
24131 emit_insn (gen_rtx_SET (dest, x));
24133 return dest;
24136 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
24137 operations. This is used for both scalar and vector conditional moves. */
24139 void
24140 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
24142 machine_mode mode = GET_MODE (dest);
24143 machine_mode cmpmode = GET_MODE (cmp);
24145 /* In AVX512F the result of comparison is an integer mask. */
24146 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
24148 rtx t2, t3, x;
24150 /* If we have an integer mask and FP value then we need
24151 to cast mask to FP mode. */
24152 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
24154 cmp = force_reg (cmpmode, cmp);
24155 cmp = gen_rtx_SUBREG (mode, cmp, 0);
24158 if (vector_all_ones_operand (op_true, mode)
24159 && rtx_equal_p (op_false, CONST0_RTX (mode))
24160 && !maskcmp)
24162 emit_insn (gen_rtx_SET (dest, cmp));
24164 else if (op_false == CONST0_RTX (mode)
24165 && !maskcmp)
24167 op_true = force_reg (mode, op_true);
24168 x = gen_rtx_AND (mode, cmp, op_true);
24169 emit_insn (gen_rtx_SET (dest, x));
24171 else if (op_true == CONST0_RTX (mode)
24172 && !maskcmp)
24174 op_false = force_reg (mode, op_false);
24175 x = gen_rtx_NOT (mode, cmp);
24176 x = gen_rtx_AND (mode, x, op_false);
24177 emit_insn (gen_rtx_SET (dest, x));
24179 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
24180 && !maskcmp)
24182 op_false = force_reg (mode, op_false);
24183 x = gen_rtx_IOR (mode, cmp, op_false);
24184 emit_insn (gen_rtx_SET (dest, x));
24186 else if (TARGET_XOP
24187 && !maskcmp)
24189 op_true = force_reg (mode, op_true);
24191 if (!nonimmediate_operand (op_false, mode))
24192 op_false = force_reg (mode, op_false);
24194 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24195 op_true,
24196 op_false)));
24198 else
24200 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24201 rtx d = dest;
24203 if (!nonimmediate_operand (op_true, mode))
24204 op_true = force_reg (mode, op_true);
24206 op_false = force_reg (mode, op_false);
24208 switch (mode)
24210 case E_V4SFmode:
24211 if (TARGET_SSE4_1)
24212 gen = gen_sse4_1_blendvps;
24213 break;
24214 case E_V2DFmode:
24215 if (TARGET_SSE4_1)
24216 gen = gen_sse4_1_blendvpd;
24217 break;
24218 case E_V16QImode:
24219 case E_V8HImode:
24220 case E_V4SImode:
24221 case E_V2DImode:
24222 if (TARGET_SSE4_1)
24224 gen = gen_sse4_1_pblendvb;
24225 if (mode != V16QImode)
24226 d = gen_reg_rtx (V16QImode);
24227 op_false = gen_lowpart (V16QImode, op_false);
24228 op_true = gen_lowpart (V16QImode, op_true);
24229 cmp = gen_lowpart (V16QImode, cmp);
24231 break;
24232 case E_V8SFmode:
24233 if (TARGET_AVX)
24234 gen = gen_avx_blendvps256;
24235 break;
24236 case E_V4DFmode:
24237 if (TARGET_AVX)
24238 gen = gen_avx_blendvpd256;
24239 break;
24240 case E_V32QImode:
24241 case E_V16HImode:
24242 case E_V8SImode:
24243 case E_V4DImode:
24244 if (TARGET_AVX2)
24246 gen = gen_avx2_pblendvb;
24247 if (mode != V32QImode)
24248 d = gen_reg_rtx (V32QImode);
24249 op_false = gen_lowpart (V32QImode, op_false);
24250 op_true = gen_lowpart (V32QImode, op_true);
24251 cmp = gen_lowpart (V32QImode, cmp);
24253 break;
24255 case E_V64QImode:
24256 gen = gen_avx512bw_blendmv64qi;
24257 break;
24258 case E_V32HImode:
24259 gen = gen_avx512bw_blendmv32hi;
24260 break;
24261 case E_V16SImode:
24262 gen = gen_avx512f_blendmv16si;
24263 break;
24264 case E_V8DImode:
24265 gen = gen_avx512f_blendmv8di;
24266 break;
24267 case E_V8DFmode:
24268 gen = gen_avx512f_blendmv8df;
24269 break;
24270 case E_V16SFmode:
24271 gen = gen_avx512f_blendmv16sf;
24272 break;
24274 default:
24275 break;
24278 if (gen != NULL)
24280 emit_insn (gen (d, op_false, op_true, cmp));
24281 if (d != dest)
24282 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
24284 else
24286 op_true = force_reg (mode, op_true);
24288 t2 = gen_reg_rtx (mode);
24289 if (optimize)
24290 t3 = gen_reg_rtx (mode);
24291 else
24292 t3 = dest;
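/* Fallback without a blend instruction: compute
   dest = (op_true & cmp) | (op_false & ~cmp) with three logic ops.  */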
24294 x = gen_rtx_AND (mode, op_true, cmp);
24295 emit_insn (gen_rtx_SET (t2, x));
24297 x = gen_rtx_NOT (mode, cmp);
24298 x = gen_rtx_AND (mode, x, op_false);
24299 emit_insn (gen_rtx_SET (t3, x));
24301 x = gen_rtx_IOR (mode, t3, t2);
24302 emit_insn (gen_rtx_SET (dest, x));
24307 /* Expand a floating-point conditional move. Return true if successful. */
24309 bool
24310 ix86_expand_fp_movcc (rtx operands[])
24312 machine_mode mode = GET_MODE (operands[0]);
24313 enum rtx_code code = GET_CODE (operands[1]);
24314 rtx tmp, compare_op;
24315 rtx op0 = XEXP (operands[1], 0);
24316 rtx op1 = XEXP (operands[1], 1);
24318 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24320 machine_mode cmode;
24322 /* Since we've no cmove for sse registers, don't force bad register
24323 allocation just to gain access to it. Deny movcc when the
24324 comparison mode doesn't match the move mode. */
24325 cmode = GET_MODE (op0);
24326 if (cmode == VOIDmode)
24327 cmode = GET_MODE (op1);
24328 if (cmode != mode)
24329 return false;
24331 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24332 if (code == UNKNOWN)
24333 return false;
24335 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24336 operands[2], operands[3]))
24337 return true;
24339 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24340 operands[2], operands[3]);
24341 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24342 return true;
24345 if (GET_MODE (op0) == TImode
24346 || (GET_MODE (op0) == DImode
24347 && !TARGET_64BIT))
24348 return false;
24350 /* The floating point conditional move instructions don't directly
24351 support conditions resulting from a signed integer comparison. */
24353 compare_op = ix86_expand_compare (code, op0, op1);
24354 if (!fcmov_comparison_operator (compare_op, VOIDmode))
24356 tmp = gen_reg_rtx (QImode);
24357 ix86_expand_setcc (tmp, code, op0, op1);
24359 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24362 emit_insn (gen_rtx_SET (operands[0],
24363 gen_rtx_IF_THEN_ELSE (mode, compare_op,
24364 operands[2], operands[3])));
24366 return true;
24369 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24371 static int
24372 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24374 switch (code)
24376 case EQ:
24377 return 0;
24378 case LT:
24379 case LTU:
24380 return 1;
24381 case LE:
24382 case LEU:
24383 return 2;
24384 case NE:
24385 return 4;
24386 case GE:
24387 case GEU:
24388 return 5;
24389 case GT:
24390 case GTU:
24391 return 6;
24392 default:
24393 gcc_unreachable ();
24397 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
24399 static int
24400 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
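/* The values below follow the VCMPPS/VCMPPD predicate encoding used
   with UNSPEC_PCMP, e.g. 0x00 = EQ_OQ, 0x01 = LT_OS, 0x0e = GT_OS,
   0x18 = EQ_US.  */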
24402 switch (code)
24404 case EQ:
24405 return 0x00;
24406 case NE:
24407 return 0x04;
24408 case GT:
24409 return 0x0e;
24410 case LE:
24411 return 0x02;
24412 case GE:
24413 return 0x0d;
24414 case LT:
24415 return 0x01;
24416 case UNLE:
24417 return 0x0a;
24418 case UNLT:
24419 return 0x09;
24420 case UNGE:
24421 return 0x05;
24422 case UNGT:
24423 return 0x06;
24424 case UNEQ:
24425 return 0x18;
24426 case LTGT:
24427 return 0x0c;
24428 case ORDERED:
24429 return 0x07;
24430 case UNORDERED:
24431 return 0x03;
24432 default:
24433 gcc_unreachable ();
24437 /* Return immediate value to be used in UNSPEC_PCMP
24438 for comparison CODE in MODE. */
24440 static int
24441 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
24443 if (FLOAT_MODE_P (mode))
24444 return ix86_fp_cmp_code_to_pcmp_immediate (code);
24445 return ix86_int_cmp_code_to_pcmp_immediate (code);
24448 /* Expand AVX-512 vector comparison. */
24450 bool
24451 ix86_expand_mask_vec_cmp (rtx operands[])
24453 machine_mode mask_mode = GET_MODE (operands[0]);
24454 machine_mode cmp_mode = GET_MODE (operands[2]);
24455 enum rtx_code code = GET_CODE (operands[1]);
24456 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
24457 int unspec_code;
24458 rtx unspec;
24460 switch (code)
24462 case LEU:
24463 case GTU:
24464 case GEU:
24465 case LTU:
24466 unspec_code = UNSPEC_UNSIGNED_PCMP;
24467 break;
24469 default:
24470 unspec_code = UNSPEC_PCMP;
24473 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
24474 operands[3], imm),
24475 unspec_code);
24476 emit_insn (gen_rtx_SET (operands[0], unspec));
24478 return true;
24481 /* Expand fp vector comparison. */
24483 bool
24484 ix86_expand_fp_vec_cmp (rtx operands[])
24486 enum rtx_code code = GET_CODE (operands[1]);
24487 rtx cmp;
24489 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24490 &operands[2], &operands[3]);
24491 if (code == UNKNOWN)
24493 rtx temp;
24494 switch (GET_CODE (operands[1]))
24496 case LTGT:
24497 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
24498 operands[3], NULL, NULL);
24499 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
24500 operands[3], NULL, NULL);
24501 code = AND;
24502 break;
24503 case UNEQ:
24504 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
24505 operands[3], NULL, NULL);
24506 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
24507 operands[3], NULL, NULL);
24508 code = IOR;
24509 break;
24510 default:
24511 gcc_unreachable ();
24513 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24514 OPTAB_DIRECT);
24516 else
24517 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
24518 operands[1], operands[2]);
24520 if (operands[0] != cmp)
24521 emit_move_insn (operands[0], cmp);
24523 return true;
24526 static rtx
24527 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24528 rtx op_true, rtx op_false, bool *negate)
24530 machine_mode data_mode = GET_MODE (dest);
24531 machine_mode mode = GET_MODE (cop0);
24532 rtx x;
24534 *negate = false;
24536 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24537 if (TARGET_XOP
24538 && (mode == V16QImode || mode == V8HImode
24539 || mode == V4SImode || mode == V2DImode))
24541 else
24543 /* Canonicalize the comparison to EQ, GT, GTU. */
24544 switch (code)
24546 case EQ:
24547 case GT:
24548 case GTU:
24549 break;
24551 case NE:
24552 case LE:
24553 case LEU:
24554 code = reverse_condition (code);
24555 *negate = true;
24556 break;
24558 case GE:
24559 case GEU:
24560 code = reverse_condition (code);
24561 *negate = true;
24562 /* FALLTHRU */
24564 case LT:
24565 case LTU:
24566 std::swap (cop0, cop1);
24567 code = swap_condition (code);
24568 break;
24570 default:
24571 gcc_unreachable ();
24574 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24575 if (mode == V2DImode)
24577 switch (code)
24579 case EQ:
24580 /* SSE4.1 supports EQ. */
24581 if (!TARGET_SSE4_1)
24582 return NULL;
24583 break;
24585 case GT:
24586 case GTU:
24587 /* SSE4.2 supports GT/GTU. */
24588 if (!TARGET_SSE4_2)
24589 return NULL;
24590 break;
24592 default:
24593 gcc_unreachable ();
24597 /* Unsigned parallel compare is not supported by the hardware.
24598 Play some tricks to turn this into a signed comparison
24599 against 0. */
24600 if (code == GTU)
24602 cop0 = force_reg (mode, cop0);
24604 switch (mode)
24606 case E_V16SImode:
24607 case E_V8DImode:
24608 case E_V8SImode:
24609 case E_V4DImode:
24610 case E_V4SImode:
24611 case E_V2DImode:
24613 rtx t1, t2, mask;
24614 rtx (*gen_sub3) (rtx, rtx, rtx);
24616 switch (mode)
24618 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
24619 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
24620 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
24621 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
24622 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
24623 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
24624 default:
24625 gcc_unreachable ();
24627 /* Subtract (-(INT MAX) - 1) from both operands to make
24628 them signed. */
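/* Subtracting the sign-bit mask (INT_MIN per element) flips each
   element's sign bit, so unsigned order becomes signed order:
   a >u b  <==>  (a - 0x80..0) >s (b - 0x80..0).  */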
24629 mask = ix86_build_signbit_mask (mode, true, false);
24630 t1 = gen_reg_rtx (mode);
24631 emit_insn (gen_sub3 (t1, cop0, mask));
24633 t2 = gen_reg_rtx (mode);
24634 emit_insn (gen_sub3 (t2, cop1, mask));
24636 cop0 = t1;
24637 cop1 = t2;
24638 code = GT;
24640 break;
24642 case E_V64QImode:
24643 case E_V32HImode:
24644 case E_V32QImode:
24645 case E_V16HImode:
24646 case E_V16QImode:
24647 case E_V8HImode:
24648 /* Perform a parallel unsigned saturating subtraction. */
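/* Unsigned saturating subtraction clamps at zero, so (a -us b) is
   nonzero exactly when a >u b; the EQ test against zero together with
   the *negate flip below therefore implements GTU.  */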
24649 x = gen_reg_rtx (mode);
24650 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24651 cop1)));
24653 cop0 = x;
24654 cop1 = CONST0_RTX (mode);
24655 code = EQ;
24656 *negate = !*negate;
24657 break;
24659 default:
24660 gcc_unreachable ();
24665 if (*negate)
24666 std::swap (op_true, op_false);
24668 /* Allow the comparison to be done in one mode, but the movcc to
24669 happen in another mode. */
24670 if (data_mode == mode)
24672 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24673 op_true, op_false);
24675 else
24677 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24678 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24679 op_true, op_false);
24680 if (GET_MODE (x) == mode)
24681 x = gen_lowpart (data_mode, x);
24684 return x;
24687 /* Expand integer vector comparison. */
24689 bool
24690 ix86_expand_int_vec_cmp (rtx operands[])
24692 rtx_code code = GET_CODE (operands[1]);
24693 bool negate = false;
24694 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24695 operands[3], NULL, NULL, &negate);
24697 if (!cmp)
24698 return false;
24700 if (negate)
24701 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24702 CONST0_RTX (GET_MODE (cmp)),
24703 NULL, NULL, &negate);
24705 gcc_assert (!negate);
24707 if (operands[0] != cmp)
24708 emit_move_insn (operands[0], cmp);
24710 return true;
24713 /* Expand a floating-point vector conditional move; a vcond operation
24714 rather than a movcc operation. */
24716 bool
24717 ix86_expand_fp_vcond (rtx operands[])
24719 enum rtx_code code = GET_CODE (operands[3]);
24720 rtx cmp;
24722 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24723 &operands[4], &operands[5]);
24724 if (code == UNKNOWN)
24726 rtx temp;
24727 switch (GET_CODE (operands[3]))
24729 case LTGT:
24730 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24731 operands[5], operands[0], operands[0]);
24732 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24733 operands[5], operands[1], operands[2]);
24734 code = AND;
24735 break;
24736 case UNEQ:
24737 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24738 operands[5], operands[0], operands[0]);
24739 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24740 operands[5], operands[1], operands[2]);
24741 code = IOR;
24742 break;
24743 default:
24744 gcc_unreachable ();
24746 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24747 OPTAB_DIRECT);
24748 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24749 return true;
24752 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24753 operands[5], operands[1], operands[2]))
24754 return true;
24756 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24757 operands[1], operands[2]);
24758 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24759 return true;
24762 /* Expand a signed/unsigned integral vector conditional move. */
24764 bool
24765 ix86_expand_int_vcond (rtx operands[])
24767 machine_mode data_mode = GET_MODE (operands[0]);
24768 machine_mode mode = GET_MODE (operands[4]);
24769 enum rtx_code code = GET_CODE (operands[3]);
24770 bool negate = false;
24771 rtx x, cop0, cop1;
24773 cop0 = operands[4];
24774 cop1 = operands[5];
24776 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24777 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
24778 if ((code == LT || code == GE)
24779 && data_mode == mode
24780 && cop1 == CONST0_RTX (mode)
24781 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24782 && GET_MODE_UNIT_SIZE (data_mode) > 1
24783 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24784 && (GET_MODE_SIZE (data_mode) == 16
24785 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24787 rtx negop = operands[2 - (code == LT)];
24788 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24789 if (negop == CONST1_RTX (data_mode))
24791 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24792 operands[0], 1, OPTAB_DIRECT);
24793 if (res != operands[0])
24794 emit_move_insn (operands[0], res);
24795 return true;
24797 else if (GET_MODE_INNER (data_mode) != DImode
24798 && vector_all_ones_operand (negop, data_mode))
24800 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24801 operands[0], 0, OPTAB_DIRECT);
24802 if (res != operands[0])
24803 emit_move_insn (operands[0], res);
24804 return true;
24808 if (!nonimmediate_operand (cop1, mode))
24809 cop1 = force_reg (mode, cop1);
24810 if (!general_operand (operands[1], data_mode))
24811 operands[1] = force_reg (data_mode, operands[1]);
24812 if (!general_operand (operands[2], data_mode))
24813 operands[2] = force_reg (data_mode, operands[2]);
24815 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24816 operands[1], operands[2], &negate);
24818 if (!x)
24819 return false;
24821 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24822 operands[2-negate]);
24823 return true;
24826 /* AVX512F does support 64-byte integer vector operations,
24827 thus the longest vector we are faced with is V64QImode. */
24828 #define MAX_VECT_LEN 64
24830 struct expand_vec_perm_d
24832 rtx target, op0, op1;
24833 unsigned char perm[MAX_VECT_LEN];
24834 machine_mode vmode;
24835 unsigned char nelt;
24836 bool one_operand_p;
24837 bool testing_p;
24840 static bool
24841 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24842 struct expand_vec_perm_d *d)
24844 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24845 expander, so args are either in d, or in op0, op1 etc. */
24846 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24847 machine_mode maskmode = mode;
24848 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24850 switch (mode)
24852 case E_V8HImode:
24853 if (TARGET_AVX512VL && TARGET_AVX512BW)
24854 gen = gen_avx512vl_vpermt2varv8hi3;
24855 break;
24856 case E_V16HImode:
24857 if (TARGET_AVX512VL && TARGET_AVX512BW)
24858 gen = gen_avx512vl_vpermt2varv16hi3;
24859 break;
24860 case E_V64QImode:
24861 if (TARGET_AVX512VBMI)
24862 gen = gen_avx512bw_vpermt2varv64qi3;
24863 break;
24864 case E_V32HImode:
24865 if (TARGET_AVX512BW)
24866 gen = gen_avx512bw_vpermt2varv32hi3;
24867 break;
24868 case E_V4SImode:
24869 if (TARGET_AVX512VL)
24870 gen = gen_avx512vl_vpermt2varv4si3;
24871 break;
24872 case E_V8SImode:
24873 if (TARGET_AVX512VL)
24874 gen = gen_avx512vl_vpermt2varv8si3;
24875 break;
24876 case E_V16SImode:
24877 if (TARGET_AVX512F)
24878 gen = gen_avx512f_vpermt2varv16si3;
24879 break;
24880 case E_V4SFmode:
24881 if (TARGET_AVX512VL)
24883 gen = gen_avx512vl_vpermt2varv4sf3;
24884 maskmode = V4SImode;
24886 break;
24887 case E_V8SFmode:
24888 if (TARGET_AVX512VL)
24890 gen = gen_avx512vl_vpermt2varv8sf3;
24891 maskmode = V8SImode;
24893 break;
24894 case E_V16SFmode:
24895 if (TARGET_AVX512F)
24897 gen = gen_avx512f_vpermt2varv16sf3;
24898 maskmode = V16SImode;
24900 break;
24901 case E_V2DImode:
24902 if (TARGET_AVX512VL)
24903 gen = gen_avx512vl_vpermt2varv2di3;
24904 break;
24905 case E_V4DImode:
24906 if (TARGET_AVX512VL)
24907 gen = gen_avx512vl_vpermt2varv4di3;
24908 break;
24909 case E_V8DImode:
24910 if (TARGET_AVX512F)
24911 gen = gen_avx512f_vpermt2varv8di3;
24912 break;
24913 case E_V2DFmode:
24914 if (TARGET_AVX512VL)
24916 gen = gen_avx512vl_vpermt2varv2df3;
24917 maskmode = V2DImode;
24919 break;
24920 case E_V4DFmode:
24921 if (TARGET_AVX512VL)
24923 gen = gen_avx512vl_vpermt2varv4df3;
24924 maskmode = V4DImode;
24926 break;
24927 case E_V8DFmode:
24928 if (TARGET_AVX512F)
24930 gen = gen_avx512f_vpermt2varv8df3;
24931 maskmode = V8DImode;
24933 break;
24934 default:
24935 break;
24938 if (gen == NULL)
24939 return false;
24941 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24942 expander, so args are either in d, or in op0, op1 etc. */
24943 if (d)
24945 rtx vec[64];
24946 target = d->target;
24947 op0 = d->op0;
24948 op1 = d->op1;
24949 for (int i = 0; i < d->nelt; ++i)
24950 vec[i] = GEN_INT (d->perm[i]);
24951 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24954 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24955 return true;
24958 /* Expand a variable vector permutation. */
24960 void
24961 ix86_expand_vec_perm (rtx operands[])
24963 rtx target = operands[0];
24964 rtx op0 = operands[1];
24965 rtx op1 = operands[2];
24966 rtx mask = operands[3];
24967 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24968 machine_mode mode = GET_MODE (op0);
24969 machine_mode maskmode = GET_MODE (mask);
24970 int w, e, i;
24971 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24973 /* Number of elements in the vector. */
24974 w = GET_MODE_NUNITS (mode);
24975 e = GET_MODE_UNIT_SIZE (mode);
24976 gcc_assert (w <= 64);
24978 if (TARGET_AVX512F && one_operand_shuffle)
24980 rtx (*gen) (rtx, rtx, rtx) = NULL;
24981 switch (mode)
24983 case E_V16SImode:
24984 gen = gen_avx512f_permvarv16si;
24985 break;
24986 case E_V16SFmode:
24987 gen = gen_avx512f_permvarv16sf;
24988 break;
24989 case E_V8DImode:
24990 gen = gen_avx512f_permvarv8di;
24991 break;
24992 case E_V8DFmode:
24993 gen = gen_avx512f_permvarv8df;
24994 break;
24995 default:
24996 break;
24998 if (gen != NULL)
25000 emit_insn (gen (target, op0, mask));
25001 return;
25005 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
25006 return;
25008 if (TARGET_AVX2)
25010 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
25012 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
25013 a constant shuffle operand. With a tiny bit of effort we can
25014 use VPERMD instead. A re-interpretation stall for V4DFmode is
25015 unfortunate but there's no avoiding it.
25016 Similarly, for V16HImode we don't have instructions for variable
25017 shuffling, while for V32QImode we can, after preparing suitable
25018 masks, use vpshufb; vpshufb; vpermq; vpor. */
25020 if (mode == V16HImode)
25022 maskmode = mode = V32QImode;
25023 w = 32;
25024 e = 1;
25026 else
25028 maskmode = mode = V8SImode;
25029 w = 8;
25030 e = 4;
25032 t1 = gen_reg_rtx (maskmode);
25034 /* Replicate the low bits of the V4DImode mask into V8SImode:
25035 mask = { A B C D }
25036 t1 = { A A B B C C D D }. */
25037 for (i = 0; i < w / 2; ++i)
25038 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
25039 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25040 vt = force_reg (maskmode, vt);
25041 mask = gen_lowpart (maskmode, mask);
25042 if (maskmode == V8SImode)
25043 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
25044 else
25045 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
25047 /* Multiply the shuffle indices by two. */
25048 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
25049 OPTAB_DIRECT);
25051 /* Add one to the odd shuffle indices:
25052 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
25053 for (i = 0; i < w / 2; ++i)
25055 vec[i * 2] = const0_rtx;
25056 vec[i * 2 + 1] = const1_rtx;
25058 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25059 vt = validize_mem (force_const_mem (maskmode, vt));
25060 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
25061 OPTAB_DIRECT);
25063 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
25064 operands[3] = mask = t1;
25065 target = gen_reg_rtx (mode);
25066 op0 = gen_lowpart (mode, op0);
25067 op1 = gen_lowpart (mode, op1);
25070 switch (mode)
25072 case E_V8SImode:
25073 /* The VPERMD and VPERMPS instructions already properly ignore
25074 the high bits of the shuffle elements. No need for us to
25075 perform an AND ourselves. */
25076 if (one_operand_shuffle)
25078 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
25079 if (target != operands[0])
25080 emit_move_insn (operands[0],
25081 gen_lowpart (GET_MODE (operands[0]), target));
25083 else
25085 t1 = gen_reg_rtx (V8SImode);
25086 t2 = gen_reg_rtx (V8SImode);
25087 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
25088 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
25089 goto merge_two;
25091 return;
25093 case E_V8SFmode:
25094 mask = gen_lowpart (V8SImode, mask);
25095 if (one_operand_shuffle)
25096 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
25097 else
25099 t1 = gen_reg_rtx (V8SFmode);
25100 t2 = gen_reg_rtx (V8SFmode);
25101 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
25102 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
25103 goto merge_two;
25105 return;
25107 case E_V4SImode:
25108 /* By combining the two 128-bit input vectors into one 256-bit
25109 input vector, we can use VPERMD and VPERMPS for the full
25110 two-operand shuffle. */
25111 t1 = gen_reg_rtx (V8SImode);
25112 t2 = gen_reg_rtx (V8SImode);
25113 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
25114 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25115 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
25116 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
25117 return;
25119 case E_V4SFmode:
25120 t1 = gen_reg_rtx (V8SFmode);
25121 t2 = gen_reg_rtx (V8SImode);
25122 mask = gen_lowpart (V4SImode, mask);
25123 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
25124 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25125 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
25126 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
25127 return;
25129 case E_V32QImode:
25130 t1 = gen_reg_rtx (V32QImode);
25131 t2 = gen_reg_rtx (V32QImode);
25132 t3 = gen_reg_rtx (V32QImode);
25133 vt2 = GEN_INT (-128);
25134 vt = gen_const_vec_duplicate (V32QImode, vt2);
25135 vt = force_reg (V32QImode, vt);
25136 for (i = 0; i < 32; i++)
25137 vec[i] = i < 16 ? vt2 : const0_rtx;
25138 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25139 vt2 = force_reg (V32QImode, vt2);
25140 /* From mask create two adjusted masks, which contain the same
25141 bits as mask in the low 7 bits of each vector element.
25142 The first mask will have the most significant bit clear
25143 if it requests element from the same 128-bit lane
25144 and MSB set if it requests element from the other 128-bit lane.
25145 The second mask will have the opposite values of the MSB,
25146 and additionally will have its 128-bit lanes swapped.
25147 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
25148 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
25149 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
25150 stands for the other 12 bytes. */
25151 /* The bit whether element is from the same lane or the other
25152 lane is bit 4, so shift it up by 3 to the MSB position. */
25153 t5 = gen_reg_rtx (V4DImode);
25154 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
25155 GEN_INT (3)));
25156 /* Clear MSB bits from the mask just in case it had them set. */
25157 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
25158 /* After this t1 will have MSB set for elements from other lane. */
25159 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
25160 /* Clear bits other than MSB. */
25161 emit_insn (gen_andv32qi3 (t1, t1, vt));
25162 /* Or in the lower bits from mask into t3. */
25163 emit_insn (gen_iorv32qi3 (t3, t1, t2));
25164 /* And invert MSB bits in t1, so MSB is set for elements from the same
25165 lane. */
25166 emit_insn (gen_xorv32qi3 (t1, t1, vt));
25167 /* Swap 128-bit lanes in t3. */
25168 t6 = gen_reg_rtx (V4DImode);
25169 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
25170 const2_rtx, GEN_INT (3),
25171 const0_rtx, const1_rtx));
25172 /* And or in the lower bits from mask into t1. */
25173 emit_insn (gen_iorv32qi3 (t1, t1, t2));
25174 if (one_operand_shuffle)
25176 /* Each of these shuffles will put 0s in places where
25177 element from the other 128-bit lane is needed, otherwise
25178 will shuffle in the requested value. */
25179 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
25180 gen_lowpart (V32QImode, t6)));
25181 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
25182 /* For t3 the 128-bit lanes are swapped again. */
25183 t7 = gen_reg_rtx (V4DImode);
25184 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
25185 const2_rtx, GEN_INT (3),
25186 const0_rtx, const1_rtx));
25187 /* And oring both together leads to the result. */
25188 emit_insn (gen_iorv32qi3 (target, t1,
25189 gen_lowpart (V32QImode, t7)));
25190 if (target != operands[0])
25191 emit_move_insn (operands[0],
25192 gen_lowpart (GET_MODE (operands[0]), target));
25193 return;
25196 t4 = gen_reg_rtx (V32QImode);
25197 /* Similar to the one_operand_shuffle code above, just
25198 repeated twice, once for each operand. The merge_two:
25199 code will merge the two results together. */
25200 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25201 gen_lowpart (V32QImode, t6)));
25202 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25203 gen_lowpart (V32QImode, t6)));
25204 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25205 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25206 t7 = gen_reg_rtx (V4DImode);
25207 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25208 const2_rtx, GEN_INT (3),
25209 const0_rtx, const1_rtx));
25210 t8 = gen_reg_rtx (V4DImode);
25211 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25212 const2_rtx, GEN_INT (3),
25213 const0_rtx, const1_rtx));
25214 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25215 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25216 t1 = t4;
25217 t2 = t3;
25218 goto merge_two;
25220 default:
25221 gcc_assert (GET_MODE_SIZE (mode) <= 16);
25222 break;
25226 if (TARGET_XOP)
25228 /* The XOP VPPERM insn supports three inputs. By ignoring the
25229 one_operand_shuffle special case, we avoid creating another
25230 set of constant vectors in memory. */
25231 one_operand_shuffle = false;
25233 /* mask = mask & {2*w-1, ...} */
25234 vt = GEN_INT (2*w - 1);
25236 else
25238 /* mask = mask & {w-1, ...} */
25239 vt = GEN_INT (w - 1);
25242 vt = gen_const_vec_duplicate (maskmode, vt);
25243 mask = expand_simple_binop (maskmode, AND, mask, vt,
25244 NULL_RTX, 0, OPTAB_DIRECT);
25246 /* For non-QImode operations, convert the word permutation control
25247 into a byte permutation control. */
25248 if (mode != V16QImode)
25250 mask = expand_simple_binop (maskmode, ASHIFT, mask,
25251 GEN_INT (exact_log2 (e)),
25252 NULL_RTX, 0, OPTAB_DIRECT);
25254 /* Convert mask to vector of chars. */
25255 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25257 /* Replicate each of the input bytes into byte positions:
25258 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25259 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25260 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25261 for (i = 0; i < 16; ++i)
25262 vec[i] = GEN_INT (i/e * e);
25263 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25264 vt = validize_mem (force_const_mem (V16QImode, vt));
25265 if (TARGET_XOP)
25266 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25267 else
25268 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25270 /* Convert it into the byte positions by doing
25271 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
25272 for (i = 0; i < 16; ++i)
25273 vec[i] = GEN_INT (i % e);
25274 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25275 vt = validize_mem (force_const_mem (V16QImode, vt));
25276 emit_insn (gen_addv16qi3 (mask, mask, vt));
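/* Worked example (illustration only, assuming V4SImode, so e == 4): a word
   index of 2 has already been shifted to 2 << 2 == 8; the pshufb/pperm above
   replicates that low byte across its element, giving { 8, 8, 8, 8 }, and
   adding { 0, 1, 2, 3 } turns it into the byte control { 8, 9, 10, 11 },
   i.e. the four bytes of source word 2.  */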
25279 /* The actual shuffle operations all operate on V16QImode. */
25280 op0 = gen_lowpart (V16QImode, op0);
25281 op1 = gen_lowpart (V16QImode, op1);
25283 if (TARGET_XOP)
25285 if (GET_MODE (target) != V16QImode)
25286 target = gen_reg_rtx (V16QImode);
25287 emit_insn (gen_xop_pperm (target, op0, op1, mask));
25288 if (target != operands[0])
25289 emit_move_insn (operands[0],
25290 gen_lowpart (GET_MODE (operands[0]), target));
25292 else if (one_operand_shuffle)
25294 if (GET_MODE (target) != V16QImode)
25295 target = gen_reg_rtx (V16QImode);
25296 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25297 if (target != operands[0])
25298 emit_move_insn (operands[0],
25299 gen_lowpart (GET_MODE (operands[0]), target));
25301 else
25303 rtx xops[6];
25304 bool ok;
25306 /* Shuffle the two input vectors independently. */
25307 t1 = gen_reg_rtx (V16QImode);
25308 t2 = gen_reg_rtx (V16QImode);
25309 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25310 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25312 merge_two:
25313 /* Then merge them together. The key is whether any given control
25314 element contained a bit set that indicates the second word. */
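/* (Illustration: with W elements per input, the merged result below is
   effectively target[i] = (operands[3][i] & W) ? t2[i] : t1[i].)  */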
25315 mask = operands[3];
25316 vt = GEN_INT (w);
25317 if (maskmode == V2DImode && !TARGET_SSE4_1)
25319 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
25320 more shuffle to convert the V2DI input mask into a V4SI
25321 input mask. At that point the masking that expand_int_vcond
25322 performs will work as desired. */
25323 rtx t3 = gen_reg_rtx (V4SImode);
25324 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25325 const0_rtx, const0_rtx,
25326 const2_rtx, const2_rtx));
25327 mask = t3;
25328 maskmode = V4SImode;
25329 e = w = 4;
25332 vt = gen_const_vec_duplicate (maskmode, vt);
25333 vt = force_reg (maskmode, vt);
25334 mask = expand_simple_binop (maskmode, AND, mask, vt,
25335 NULL_RTX, 0, OPTAB_DIRECT);
25337 if (GET_MODE (target) != mode)
25338 target = gen_reg_rtx (mode);
25339 xops[0] = target;
25340 xops[1] = gen_lowpart (mode, t2);
25341 xops[2] = gen_lowpart (mode, t1);
25342 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25343 xops[4] = mask;
25344 xops[5] = vt;
25345 ok = ix86_expand_int_vcond (xops);
25346 gcc_assert (ok);
25347 if (target != operands[0])
25348 emit_move_insn (operands[0],
25349 gen_lowpart (GET_MODE (operands[0]), target));
25353 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
25354 true if we should do zero extension, else sign extension. HIGH_P is
25355 true if we want the N/2 high elements, else the low elements. */
25357 void
25358 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25360 machine_mode imode = GET_MODE (src);
25361 rtx tmp;
25363 if (TARGET_SSE4_1)
25365 rtx (*unpack)(rtx, rtx);
25366 rtx (*extract)(rtx, rtx) = NULL;
25367 machine_mode halfmode = BLKmode;
25369 switch (imode)
25371 case E_V64QImode:
25372 if (unsigned_p)
25373 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25374 else
25375 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25376 halfmode = V32QImode;
25377 extract
25378 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25379 break;
25380 case E_V32QImode:
25381 if (unsigned_p)
25382 unpack = gen_avx2_zero_extendv16qiv16hi2;
25383 else
25384 unpack = gen_avx2_sign_extendv16qiv16hi2;
25385 halfmode = V16QImode;
25386 extract
25387 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25388 break;
25389 case E_V32HImode:
25390 if (unsigned_p)
25391 unpack = gen_avx512f_zero_extendv16hiv16si2;
25392 else
25393 unpack = gen_avx512f_sign_extendv16hiv16si2;
25394 halfmode = V16HImode;
25395 extract
25396 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25397 break;
25398 case E_V16HImode:
25399 if (unsigned_p)
25400 unpack = gen_avx2_zero_extendv8hiv8si2;
25401 else
25402 unpack = gen_avx2_sign_extendv8hiv8si2;
25403 halfmode = V8HImode;
25404 extract
25405 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25406 break;
25407 case E_V16SImode:
25408 if (unsigned_p)
25409 unpack = gen_avx512f_zero_extendv8siv8di2;
25410 else
25411 unpack = gen_avx512f_sign_extendv8siv8di2;
25412 halfmode = V8SImode;
25413 extract
25414 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
25415 break;
25416 case E_V8SImode:
25417 if (unsigned_p)
25418 unpack = gen_avx2_zero_extendv4siv4di2;
25419 else
25420 unpack = gen_avx2_sign_extendv4siv4di2;
25421 halfmode = V4SImode;
25422 extract
25423 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
25424 break;
25425 case E_V16QImode:
25426 if (unsigned_p)
25427 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
25428 else
25429 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
25430 break;
25431 case E_V8HImode:
25432 if (unsigned_p)
25433 unpack = gen_sse4_1_zero_extendv4hiv4si2;
25434 else
25435 unpack = gen_sse4_1_sign_extendv4hiv4si2;
25436 break;
25437 case E_V4SImode:
25438 if (unsigned_p)
25439 unpack = gen_sse4_1_zero_extendv2siv2di2;
25440 else
25441 unpack = gen_sse4_1_sign_extendv2siv2di2;
25442 break;
25443 default:
25444 gcc_unreachable ();
25447 if (GET_MODE_SIZE (imode) >= 32)
25449 tmp = gen_reg_rtx (halfmode);
25450 emit_insn (extract (tmp, src));
25452 else if (high_p)
25454 /* Shift higher 8 bytes to lower 8 bytes. */
25455 tmp = gen_reg_rtx (V1TImode);
25456 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
25457 GEN_INT (64)));
25458 tmp = gen_lowpart (imode, tmp);
25460 else
25461 tmp = src;
25463 emit_insn (unpack (dest, tmp));
25465 else
25467 rtx (*unpack)(rtx, rtx, rtx);
25469 switch (imode)
25471 case E_V16QImode:
25472 if (high_p)
25473 unpack = gen_vec_interleave_highv16qi;
25474 else
25475 unpack = gen_vec_interleave_lowv16qi;
25476 break;
25477 case E_V8HImode:
25478 if (high_p)
25479 unpack = gen_vec_interleave_highv8hi;
25480 else
25481 unpack = gen_vec_interleave_lowv8hi;
25482 break;
25483 case E_V4SImode:
25484 if (high_p)
25485 unpack = gen_vec_interleave_highv4si;
25486 else
25487 unpack = gen_vec_interleave_lowv4si;
25488 break;
25489 default:
25490 gcc_unreachable ();
25493 if (unsigned_p)
25494 tmp = force_reg (imode, CONST0_RTX (imode));
25495 else
25496 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
25497 src, pc_rtx, pc_rtx);
25499 rtx tmp2 = gen_reg_rtx (imode);
25500 emit_insn (unpack (tmp2, src, tmp));
25501 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
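/* Example of the transformation above (for illustration; the exact insn
   choice depends on the enabled ISA): unpacking the high half of a V8HImode
   SRC { a0, ..., a7 } with UNSIGNED_P and HIGH_P set yields a V4SImode DEST
   { (u32) a4, ..., (u32) a7 }.  With SSE4.1 this is a 64-bit logical right
   shift of the 128-bit value followed by pmovzxwd; without it, punpckhwd
   against a zero vector (or against a pcmpgtw sign mask for the signed case)
   produces the same result.  */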
25505 /* Expand conditional increment or decrement using adc/sbb instructions.
25506 The default case using setcc followed by the conditional move can be
25507 done by generic code. */
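/* For example (illustrative source and register assignment, not taken from
   the testsuite): with unsigned A in %eax, B in %ebx and C in %ecx,
       if (a < b) c++;
   can be expanded as
       cmpl  %ebx, %eax    # CF = (a < b)
       adcl  $0, %ecx      # c += CF
   instead of a setcc + cmov sequence.  */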
25508 bool
25509 ix86_expand_int_addcc (rtx operands[])
25511 enum rtx_code code = GET_CODE (operands[1]);
25512 rtx flags;
25513 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
25514 rtx compare_op;
25515 rtx val = const0_rtx;
25516 bool fpcmp = false;
25517 machine_mode mode;
25518 rtx op0 = XEXP (operands[1], 0);
25519 rtx op1 = XEXP (operands[1], 1);
25521 if (operands[3] != const1_rtx
25522 && operands[3] != constm1_rtx)
25523 return false;
25524 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25525 return false;
25526 code = GET_CODE (compare_op);
25528 flags = XEXP (compare_op, 0);
25530 if (GET_MODE (flags) == CCFPmode)
25532 fpcmp = true;
25533 code = ix86_fp_compare_code_to_integer (code);
25536 if (code != LTU)
25538 val = constm1_rtx;
25539 if (fpcmp)
25540 PUT_CODE (compare_op,
25541 reverse_condition_maybe_unordered
25542 (GET_CODE (compare_op)));
25543 else
25544 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25547 mode = GET_MODE (operands[0]);
25549 /* Construct either adc or sbb insn. */
25550 if ((code == LTU) == (operands[3] == constm1_rtx))
25552 switch (mode)
25554 case E_QImode:
25555 insn = gen_subqi3_carry;
25556 break;
25557 case E_HImode:
25558 insn = gen_subhi3_carry;
25559 break;
25560 case E_SImode:
25561 insn = gen_subsi3_carry;
25562 break;
25563 case E_DImode:
25564 insn = gen_subdi3_carry;
25565 break;
25566 default:
25567 gcc_unreachable ();
25570 else
25572 switch (mode)
25574 case E_QImode:
25575 insn = gen_addqi3_carry;
25576 break;
25577 case E_HImode:
25578 insn = gen_addhi3_carry;
25579 break;
25580 case E_SImode:
25581 insn = gen_addsi3_carry;
25582 break;
25583 case E_DImode:
25584 insn = gen_adddi3_carry;
25585 break;
25586 default:
25587 gcc_unreachable ();
25590 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25592 return true;
25596 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
25597 but works for floating point parameters and non-offsettable memories.
25598 For pushes, it returns just stack offsets; the values will be saved
25599 in the right order. At most four parts are generated. */
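/* For instance, on a 32-bit target a DFmode operand is split into two
   SImode parts, an XFmode operand into three and a TFmode operand into
   four; on a 64-bit target XFmode and TFmode are split into two parts,
   the second one being SImode for XFmode and DImode for TFmode.  */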
25601 static int
25602 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25604 int size;
25606 if (!TARGET_64BIT)
25607 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25608 else
25609 size = (GET_MODE_SIZE (mode) + 4) / 8;
25611 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25612 gcc_assert (size >= 2 && size <= 4);
25614 /* Optimize constant pool reference to immediates. This is used by fp
25615 moves, that force all constants to memory to allow combining. */
25616 if (MEM_P (operand) && MEM_READONLY_P (operand))
25617 operand = avoid_constant_pool_reference (operand);
25619 if (MEM_P (operand) && !offsettable_memref_p (operand))
25621 /* The only non-offsettable memories we handle are pushes. */
25622 int ok = push_operand (operand, VOIDmode);
25624 gcc_assert (ok);
25626 operand = copy_rtx (operand);
25627 PUT_MODE (operand, word_mode);
25628 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25629 return size;
25632 if (GET_CODE (operand) == CONST_VECTOR)
25634 scalar_int_mode imode = int_mode_for_mode (mode).require ();
25635 /* Caution: if we looked through a constant pool memory above,
25636 the operand may actually have a different mode now. That's
25637 ok, since we want to pun this all the way back to an integer. */
25638 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25639 gcc_assert (operand != NULL);
25640 mode = imode;
25643 if (!TARGET_64BIT)
25645 if (mode == DImode)
25646 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25647 else
25649 int i;
25651 if (REG_P (operand))
25653 gcc_assert (reload_completed);
25654 for (i = 0; i < size; i++)
25655 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25657 else if (offsettable_memref_p (operand))
25659 operand = adjust_address (operand, SImode, 0);
25660 parts[0] = operand;
25661 for (i = 1; i < size; i++)
25662 parts[i] = adjust_address (operand, SImode, 4 * i);
25664 else if (CONST_DOUBLE_P (operand))
25666 const REAL_VALUE_TYPE *r;
25667 long l[4];
25669 r = CONST_DOUBLE_REAL_VALUE (operand);
25670 switch (mode)
25672 case E_TFmode:
25673 real_to_target (l, r, mode);
25674 parts[3] = gen_int_mode (l[3], SImode);
25675 parts[2] = gen_int_mode (l[2], SImode);
25676 break;
25677 case E_XFmode:
25678 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25679 long double may not be 80-bit. */
25680 real_to_target (l, r, mode);
25681 parts[2] = gen_int_mode (l[2], SImode);
25682 break;
25683 case E_DFmode:
25684 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25685 break;
25686 default:
25687 gcc_unreachable ();
25689 parts[1] = gen_int_mode (l[1], SImode);
25690 parts[0] = gen_int_mode (l[0], SImode);
25692 else
25693 gcc_unreachable ();
25696 else
25698 if (mode == TImode)
25699 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25700 if (mode == XFmode || mode == TFmode)
25702 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25703 if (REG_P (operand))
25705 gcc_assert (reload_completed);
25706 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25707 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25709 else if (offsettable_memref_p (operand))
25711 operand = adjust_address (operand, DImode, 0);
25712 parts[0] = operand;
25713 parts[1] = adjust_address (operand, upper_mode, 8);
25715 else if (CONST_DOUBLE_P (operand))
25717 long l[4];
25719 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25721 /* real_to_target puts 32-bit pieces in each long. */
25722 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25723 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25724 << 32), DImode);
25726 if (upper_mode == SImode)
25727 parts[1] = gen_int_mode (l[2], SImode);
25728 else
25729 parts[1]
25730 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25731 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25732 << 32), DImode);
25734 else
25735 gcc_unreachable ();
25739 return size;
25742 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25743 All the required insns are emitted by this function itself; the caller
25744 never has to fall back to normal moves. Operands 2-5 receive the
25745 destination parts in the correct order; operands 6-9 the source parts. */
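/* A concrete illustration (hypothetical registers): a 32-bit DImode load
   into %edx:%eax from a memory operand whose address uses %eax must emit
   the %edx half first, which is what the collision handling below arranges;
   otherwise the address would be clobbered by the first move.  */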
25747 void
25748 ix86_split_long_move (rtx operands[])
25750 rtx part[2][4];
25751 int nparts, i, j;
25752 int push = 0;
25753 int collisions = 0;
25754 machine_mode mode = GET_MODE (operands[0]);
25755 bool collisionparts[4];
25757 /* The DFmode expanders may ask us to move a double.
25758 For a 64-bit target this is a single move. By hiding that fact
25759 here we simplify the i386.md splitters. */
25760 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25762 /* Optimize constant pool reference to immediates. This is used by
25763 fp moves, that force all constants to memory to allow combining. */
25765 if (MEM_P (operands[1])
25766 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25767 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25768 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25769 if (push_operand (operands[0], VOIDmode))
25771 operands[0] = copy_rtx (operands[0]);
25772 PUT_MODE (operands[0], word_mode);
25774 else
25775 operands[0] = gen_lowpart (DImode, operands[0]);
25776 operands[1] = gen_lowpart (DImode, operands[1]);
25777 emit_move_insn (operands[0], operands[1]);
25778 return;
25781 /* The only non-offsettable memory we handle is push. */
25782 if (push_operand (operands[0], VOIDmode))
25783 push = 1;
25784 else
25785 gcc_assert (!MEM_P (operands[0])
25786 || offsettable_memref_p (operands[0]));
25788 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25789 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25791 /* When emitting push, take care for source operands on the stack. */
25792 if (push && MEM_P (operands[1])
25793 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25795 rtx src_base = XEXP (part[1][nparts - 1], 0);
25797 /* Compensate for the stack decrement by 4. */
25798 if (!TARGET_64BIT && nparts == 3
25799 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25800 src_base = plus_constant (Pmode, src_base, 4);
25802 /* src_base refers to the stack pointer and is
25803 automatically decreased by emitted push. */
25804 for (i = 0; i < nparts; i++)
25805 part[1][i] = change_address (part[1][i],
25806 GET_MODE (part[1][i]), src_base);
25809 /* We need to do copy in the right order in case an address register
25810 of the source overlaps the destination. */
25811 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25813 rtx tmp;
25815 for (i = 0; i < nparts; i++)
25817 collisionparts[i]
25818 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25819 if (collisionparts[i])
25820 collisions++;
25823 /* Collision in the middle part can be handled by reordering. */
25824 if (collisions == 1 && nparts == 3 && collisionparts [1])
25826 std::swap (part[0][1], part[0][2]);
25827 std::swap (part[1][1], part[1][2]);
25829 else if (collisions == 1
25830 && nparts == 4
25831 && (collisionparts [1] || collisionparts [2]))
25833 if (collisionparts [1])
25835 std::swap (part[0][1], part[0][2]);
25836 std::swap (part[1][1], part[1][2]);
25838 else
25840 std::swap (part[0][2], part[0][3]);
25841 std::swap (part[1][2], part[1][3]);
25845 /* If there are more collisions, we can't handle it by reordering.
25846 Do an lea to the last part and use only one colliding move. */
25847 else if (collisions > 1)
25849 rtx base, addr;
25851 collisions = 1;
25853 base = part[0][nparts - 1];
25855 /* Handle the case when the last part isn't valid for lea.
25856 Happens in 64-bit mode storing the 12-byte XFmode. */
25857 if (GET_MODE (base) != Pmode)
25858 base = gen_rtx_REG (Pmode, REGNO (base));
25860 addr = XEXP (part[1][0], 0);
25861 if (TARGET_TLS_DIRECT_SEG_REFS)
25863 struct ix86_address parts;
25864 int ok = ix86_decompose_address (addr, &parts);
25865 gcc_assert (ok);
25866 /* It is not valid to use %gs: or %fs: in lea. */
25867 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25869 emit_insn (gen_rtx_SET (base, addr));
25870 part[1][0] = replace_equiv_address (part[1][0], base);
25871 for (i = 1; i < nparts; i++)
25873 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25874 part[1][i] = replace_equiv_address (part[1][i], tmp);
25879 if (push)
25881 if (!TARGET_64BIT)
25883 if (nparts == 3)
25885 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25886 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25887 stack_pointer_rtx, GEN_INT (-4)));
25888 emit_move_insn (part[0][2], part[1][2]);
25890 else if (nparts == 4)
25892 emit_move_insn (part[0][3], part[1][3]);
25893 emit_move_insn (part[0][2], part[1][2]);
25896 else
25898 /* In 64-bit mode we don't have a 32-bit push available. If this is a
25899 register, that is OK - we just use the larger counterpart. We also
25900 retype memory - this comes from an attempt to avoid a REX prefix on
25901 moves of the second half of a TFmode value. */
25902 if (GET_MODE (part[1][1]) == SImode)
25904 switch (GET_CODE (part[1][1]))
25906 case MEM:
25907 part[1][1] = adjust_address (part[1][1], DImode, 0);
25908 break;
25910 case REG:
25911 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25912 break;
25914 default:
25915 gcc_unreachable ();
25918 if (GET_MODE (part[1][0]) == SImode)
25919 part[1][0] = part[1][1];
25922 emit_move_insn (part[0][1], part[1][1]);
25923 emit_move_insn (part[0][0], part[1][0]);
25924 return;
25927 /* Choose correct order to not overwrite the source before it is copied. */
25928 if ((REG_P (part[0][0])
25929 && REG_P (part[1][1])
25930 && (REGNO (part[0][0]) == REGNO (part[1][1])
25931 || (nparts == 3
25932 && REGNO (part[0][0]) == REGNO (part[1][2]))
25933 || (nparts == 4
25934 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25935 || (collisions > 0
25936 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25938 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25940 operands[2 + i] = part[0][j];
25941 operands[6 + i] = part[1][j];
25944 else
25946 for (i = 0; i < nparts; i++)
25948 operands[2 + i] = part[0][i];
25949 operands[6 + i] = part[1][i];
25953 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25954 if (optimize_insn_for_size_p ())
25956 for (j = 0; j < nparts - 1; j++)
25957 if (CONST_INT_P (operands[6 + j])
25958 && operands[6 + j] != const0_rtx
25959 && REG_P (operands[2 + j]))
25960 for (i = j; i < nparts - 1; i++)
25961 if (CONST_INT_P (operands[7 + i])
25962 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25963 operands[7 + i] = operands[2 + j];
25966 for (i = 0; i < nparts; i++)
25967 emit_move_insn (operands[2 + i], operands[6 + i]);
25969 return;
25972 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25973 left shift by a constant, either using a single shift or
25974 a sequence of add instructions. */
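/* E.g. a left shift by 1 becomes a single "addl %reg, %reg"; a shift by 2
   is expanded as two such adds only when 2 * add cost <= shift-by-constant
   cost for the selected tuning (and we are not optimizing for size),
   otherwise a single "shll $2, %reg" is used.  */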
25976 static void
25977 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25979 rtx (*insn)(rtx, rtx, rtx);
25981 if (count == 1
25982 || (count * ix86_cost->add <= ix86_cost->shift_const
25983 && !optimize_insn_for_size_p ()))
25985 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25986 while (count-- > 0)
25987 emit_insn (insn (operand, operand, operand));
25989 else
25991 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25992 emit_insn (insn (operand, operand, GEN_INT (count)));
25996 void
25997 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25999 rtx (*gen_ashl3)(rtx, rtx, rtx);
26000 rtx (*gen_shld)(rtx, rtx, rtx);
26001 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26003 rtx low[2], high[2];
26004 int count;
26006 if (CONST_INT_P (operands[2]))
26008 split_double_mode (mode, operands, 2, low, high);
26009 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26011 if (count >= half_width)
26013 emit_move_insn (high[0], low[1]);
26014 emit_move_insn (low[0], const0_rtx);
26016 if (count > half_width)
26017 ix86_expand_ashl_const (high[0], count - half_width, mode);
26019 else
26021 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26023 if (!rtx_equal_p (operands[0], operands[1]))
26024 emit_move_insn (operands[0], operands[1]);
26026 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
26027 ix86_expand_ashl_const (low[0], count, mode);
26029 return;
26032 split_double_mode (mode, operands, 1, low, high);
26034 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26036 if (operands[1] == const1_rtx)
26038 /* Assuming we've chosen QImode-capable registers, 1 << N
26039 can be done with two 32/64-bit shifts, no branches, no cmoves. */
26040 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
26042 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
26044 ix86_expand_clear (low[0]);
26045 ix86_expand_clear (high[0]);
26046 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
26048 d = gen_lowpart (QImode, low[0]);
26049 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26050 s = gen_rtx_EQ (QImode, flags, const0_rtx);
26051 emit_insn (gen_rtx_SET (d, s));
26053 d = gen_lowpart (QImode, high[0]);
26054 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26055 s = gen_rtx_NE (QImode, flags, const0_rtx);
26056 emit_insn (gen_rtx_SET (d, s));
26059 /* Otherwise, we can get the same results by manually performing
26060 a bit extract operation on bit 5/6, and then performing the two
26061 shifts. The two methods of getting 0/1 into low/high are exactly
26062 the same size. Avoiding the shift in the bit extract case helps
26063 pentium4 a bit; no one else seems to care much either way. */
26064 else
26066 machine_mode half_mode;
26067 rtx (*gen_lshr3)(rtx, rtx, rtx);
26068 rtx (*gen_and3)(rtx, rtx, rtx);
26069 rtx (*gen_xor3)(rtx, rtx, rtx);
26070 HOST_WIDE_INT bits;
26071 rtx x;
26073 if (mode == DImode)
26075 half_mode = SImode;
26076 gen_lshr3 = gen_lshrsi3;
26077 gen_and3 = gen_andsi3;
26078 gen_xor3 = gen_xorsi3;
26079 bits = 5;
26081 else
26083 half_mode = DImode;
26084 gen_lshr3 = gen_lshrdi3;
26085 gen_and3 = gen_anddi3;
26086 gen_xor3 = gen_xordi3;
26087 bits = 6;
26090 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
26091 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
26092 else
26093 x = gen_lowpart (half_mode, operands[2]);
26094 emit_insn (gen_rtx_SET (high[0], x));
26096 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
26097 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
26098 emit_move_insn (low[0], high[0]);
26099 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
26102 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26103 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
26104 return;
26107 if (operands[1] == constm1_rtx)
26109 /* For -1 << N, we can avoid the shld instruction, because we
26110 know that we're shifting 0...31/63 ones into a -1. */
26111 emit_move_insn (low[0], constm1_rtx);
26112 if (optimize_insn_for_size_p ())
26113 emit_move_insn (high[0], low[0]);
26114 else
26115 emit_move_insn (high[0], constm1_rtx);
26117 else
26119 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26121 if (!rtx_equal_p (operands[0], operands[1]))
26122 emit_move_insn (operands[0], operands[1]);
26124 split_double_mode (mode, operands, 1, low, high);
26125 emit_insn (gen_shld (high[0], low[0], operands[2]));
26128 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26130 if (TARGET_CMOVE && scratch)
26132 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26133 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26135 ix86_expand_clear (scratch);
26136 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
26138 else
26140 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26141 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26143 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
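/* Putting the variable-count path of ix86_split_ashl above together, a
   DImode shift left on a 32-bit target comes out roughly as (illustrative,
   with the value in %edx:%eax and the count in %ecx):
       shldl %cl, %eax, %edx
       sall  %cl, %eax
   followed by the adj_1/adj_2 fixup that, when bit 5 of the count is set,
   moves the low word into the high word and clears the low word, using
   cmov when available and a short branch otherwise.  */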
26147 void
26148 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
26150 rtx (*gen_ashr3)(rtx, rtx, rtx)
26151 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
26152 rtx (*gen_shrd)(rtx, rtx, rtx);
26153 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26155 rtx low[2], high[2];
26156 int count;
26158 if (CONST_INT_P (operands[2]))
26160 split_double_mode (mode, operands, 2, low, high);
26161 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26163 if (count == GET_MODE_BITSIZE (mode) - 1)
26165 emit_move_insn (high[0], high[1]);
26166 emit_insn (gen_ashr3 (high[0], high[0],
26167 GEN_INT (half_width - 1)));
26168 emit_move_insn (low[0], high[0]);
26171 else if (count >= half_width)
26173 emit_move_insn (low[0], high[1]);
26174 emit_move_insn (high[0], low[0]);
26175 emit_insn (gen_ashr3 (high[0], high[0],
26176 GEN_INT (half_width - 1)));
26178 if (count > half_width)
26179 emit_insn (gen_ashr3 (low[0], low[0],
26180 GEN_INT (count - half_width)));
26182 else
26184 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26186 if (!rtx_equal_p (operands[0], operands[1]))
26187 emit_move_insn (operands[0], operands[1]);
26189 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26190 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
26193 else
26195 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26197 if (!rtx_equal_p (operands[0], operands[1]))
26198 emit_move_insn (operands[0], operands[1]);
26200 split_double_mode (mode, operands, 1, low, high);
26202 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26203 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
26205 if (TARGET_CMOVE && scratch)
26207 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26208 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26210 emit_move_insn (scratch, high[0]);
26211 emit_insn (gen_ashr3 (scratch, scratch,
26212 GEN_INT (half_width - 1)));
26213 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26214 scratch));
26216 else
26218 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26219 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26221 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
26226 void
26227 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26229 rtx (*gen_lshr3)(rtx, rtx, rtx)
26230 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26231 rtx (*gen_shrd)(rtx, rtx, rtx);
26232 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26234 rtx low[2], high[2];
26235 int count;
26237 if (CONST_INT_P (operands[2]))
26239 split_double_mode (mode, operands, 2, low, high);
26240 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26242 if (count >= half_width)
26244 emit_move_insn (low[0], high[1]);
26245 ix86_expand_clear (high[0]);
26247 if (count > half_width)
26248 emit_insn (gen_lshr3 (low[0], low[0],
26249 GEN_INT (count - half_width)));
26251 else
26253 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26255 if (!rtx_equal_p (operands[0], operands[1]))
26256 emit_move_insn (operands[0], operands[1]);
26258 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26259 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26262 else
26264 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26266 if (!rtx_equal_p (operands[0], operands[1]))
26267 emit_move_insn (operands[0], operands[1]);
26269 split_double_mode (mode, operands, 1, low, high);
26271 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26272 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26274 if (TARGET_CMOVE && scratch)
26276 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26277 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26279 ix86_expand_clear (scratch);
26280 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26281 scratch));
26283 else
26285 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26286 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26288 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26293 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
26294 static void
26295 predict_jump (int prob)
26297 rtx_insn *insn = get_last_insn ();
26298 gcc_assert (JUMP_P (insn));
26299 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
26302 /* Helper function for the string operations below. Test whether VARIABLE
26303 is aligned to VALUE bytes. If so, jump to the returned label. */
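/* E.g. ix86_expand_aligntest (count, 4, true) emits RTL that typically
   assembles to
       testl $4, %reg      # %reg holding COUNT; illustrative only
       je    .Lskip
   so the caller can emit a 4-byte copy that is skipped whenever that bit of
   COUNT is clear, and then emit the returned label.  */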
26304 static rtx_code_label *
26305 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26307 rtx_code_label *label = gen_label_rtx ();
26308 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26309 if (GET_MODE (variable) == DImode)
26310 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26311 else
26312 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26313 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26314 1, label);
26315 if (epilogue)
26316 predict_jump (REG_BR_PROB_BASE * 50 / 100);
26317 else
26318 predict_jump (REG_BR_PROB_BASE * 90 / 100);
26319 return label;
26322 /* Adjust COUNTER by the VALUE. */
26323 static void
26324 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26326 rtx (*gen_add)(rtx, rtx, rtx)
26327 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26329 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26332 /* Zero extend possibly SImode EXP to Pmode register. */
26334 ix86_zero_extend_to_Pmode (rtx exp)
26336 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26339 /* Divide COUNTREG by SCALE. */
26340 static rtx
26341 scale_counter (rtx countreg, int scale)
26343 rtx sc;
26345 if (scale == 1)
26346 return countreg;
26347 if (CONST_INT_P (countreg))
26348 return GEN_INT (INTVAL (countreg) / scale);
26349 gcc_assert (REG_P (countreg));
26351 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26352 GEN_INT (exact_log2 (scale)),
26353 NULL, 1, OPTAB_DIRECT);
26354 return sc;
26357 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26358 DImode for constant loop counts. */
26360 static machine_mode
26361 counter_mode (rtx count_exp)
26363 if (GET_MODE (count_exp) != VOIDmode)
26364 return GET_MODE (count_exp);
26365 if (!CONST_INT_P (count_exp))
26366 return Pmode;
26367 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26368 return DImode;
26369 return SImode;
26372 /* Copy the address to a Pmode register. This is used for x32 to
26373 truncate DImode TLS address to a SImode register. */
26375 static rtx
26376 ix86_copy_addr_to_reg (rtx addr)
26378 rtx reg;
26379 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
26381 reg = copy_addr_to_reg (addr);
26382 REG_POINTER (reg) = 1;
26383 return reg;
26385 else
26387 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
26388 reg = copy_to_mode_reg (DImode, addr);
26389 REG_POINTER (reg) = 1;
26390 return gen_rtx_SUBREG (SImode, reg, 0);
26394 /* When ISSETMEM is FALSE, output a simple loop to copy memory from SRCPTR
26395 to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size is
26396 COUNT bytes. When ISSETMEM is TRUE, output the equivalent loop to set
26397 the memory to VALUE (which is supposed to be in MODE).
26399 The size is rounded down to a whole number of chunks moved at once.
26400 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
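/* In pseudocode the emitted loop is (illustration only; PIECE stands for
   GET_MODE_SIZE (MODE) * UNROLL):

     size = count & -PIECE;
     if (size == 0) goto out;   // guard emitted only for byte-sized pieces
     iter = 0;
   top:
     copy (or set) PIECE bytes at dest + iter [and src + iter];
     iter += PIECE;
     if (iter < size) goto top;
   out:
     dest += iter;  [src += iter;]  */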
26403 static void
26404 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
26405 rtx destptr, rtx srcptr, rtx value,
26406 rtx count, machine_mode mode, int unroll,
26407 int expected_size, bool issetmem)
26409 rtx_code_label *out_label, *top_label;
26410 rtx iter, tmp;
26411 machine_mode iter_mode = counter_mode (count);
26412 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
26413 rtx piece_size = GEN_INT (piece_size_n);
26414 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
26415 rtx size;
26416 int i;
26418 top_label = gen_label_rtx ();
26419 out_label = gen_label_rtx ();
26420 iter = gen_reg_rtx (iter_mode);
26422 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
26423 NULL, 1, OPTAB_DIRECT);
26424 /* Those two should combine. */
26425 if (piece_size == const1_rtx)
26427 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
26428 true, out_label);
26429 predict_jump (REG_BR_PROB_BASE * 10 / 100);
26431 emit_move_insn (iter, const0_rtx);
26433 emit_label (top_label);
26435 tmp = convert_modes (Pmode, iter_mode, iter, true);
26437 /* This assert could be relaxed - in that case we'd need to compute
26438 the smallest power of two containing PIECE_SIZE_N and pass it to
26439 offset_address. */
26440 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
26441 destmem = offset_address (destmem, tmp, piece_size_n);
26442 destmem = adjust_address (destmem, mode, 0);
26444 if (!issetmem)
26446 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
26447 srcmem = adjust_address (srcmem, mode, 0);
26449 /* When unrolling for chips that reorder memory reads and writes,
26450 we can save registers by using single temporary.
26451 Also using 4 temporaries is overkill in 32bit mode. */
26452 if (!TARGET_64BIT && 0)
26454 for (i = 0; i < unroll; i++)
26456 if (i)
26458 destmem =
26459 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26460 srcmem =
26461 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26463 emit_move_insn (destmem, srcmem);
26466 else
26468 rtx tmpreg[4];
26469 gcc_assert (unroll <= 4);
26470 for (i = 0; i < unroll; i++)
26472 tmpreg[i] = gen_reg_rtx (mode);
26473 if (i)
26475 srcmem =
26476 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26478 emit_move_insn (tmpreg[i], srcmem);
26480 for (i = 0; i < unroll; i++)
26482 if (i)
26484 destmem =
26485 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26487 emit_move_insn (destmem, tmpreg[i]);
26491 else
26492 for (i = 0; i < unroll; i++)
26494 if (i)
26495 destmem =
26496 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26497 emit_move_insn (destmem, value);
26500 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26501 true, OPTAB_LIB_WIDEN);
26502 if (tmp != iter)
26503 emit_move_insn (iter, tmp);
26505 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26506 true, top_label);
26507 if (expected_size != -1)
26509 expected_size /= GET_MODE_SIZE (mode) * unroll;
26510 if (expected_size == 0)
26511 predict_jump (0);
26512 else if (expected_size > REG_BR_PROB_BASE)
26513 predict_jump (REG_BR_PROB_BASE - 1);
26514 else
26515 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26517 else
26518 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26519 iter = ix86_zero_extend_to_Pmode (iter);
26520 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26521 true, OPTAB_LIB_WIDEN);
26522 if (tmp != destptr)
26523 emit_move_insn (destptr, tmp);
26524 if (!issetmem)
26526 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26527 true, OPTAB_LIB_WIDEN);
26528 if (tmp != srcptr)
26529 emit_move_insn (srcptr, tmp);
26531 emit_label (out_label);
26534 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
26535 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26536 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26537 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26538 ORIG_VALUE is the original value passed to memset to fill the memory with.
26539 Other arguments have same meaning as for previous function. */
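/* For instance (illustrative), copying a constant COUNT bytes in DImode
   chunks on x86-64 becomes
       movq  $COUNT/8, %rcx    # scale_counter
       rep movsq
   and a memset with a zero ORIG_VALUE can use plain "rep stosb"; DESTEXP
   and SRCEXP merely describe, for the RTL pattern, where %rdi and %rsi end
   up after the operation.  */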
26541 static void
26542 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26543 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26544 rtx count,
26545 machine_mode mode, bool issetmem)
26547 rtx destexp;
26548 rtx srcexp;
26549 rtx countreg;
26550 HOST_WIDE_INT rounded_count;
26552 /* If possible, it is shorter to use rep movs.
26553 TODO: Maybe it is better to move this logic to decide_alg. */
26554 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26555 && (!issetmem || orig_value == const0_rtx))
26556 mode = SImode;
26558 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26559 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26561 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26562 GET_MODE_SIZE (mode)));
26563 if (mode != QImode)
26565 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26566 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26567 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26569 else
26570 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26571 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26573 rounded_count
26574 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26575 destmem = shallow_copy_rtx (destmem);
26576 set_mem_size (destmem, rounded_count);
26578 else if (MEM_SIZE_KNOWN_P (destmem))
26579 clear_mem_size (destmem);
26581 if (issetmem)
26583 value = force_reg (mode, gen_lowpart (mode, value));
26584 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26586 else
26588 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26589 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26590 if (mode != QImode)
26592 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26593 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26594 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26596 else
26597 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26598 if (CONST_INT_P (count))
26600 rounded_count
26601 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26602 srcmem = shallow_copy_rtx (srcmem);
26603 set_mem_size (srcmem, rounded_count);
26605 else
26607 if (MEM_SIZE_KNOWN_P (srcmem))
26608 clear_mem_size (srcmem);
26610 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26611 destexp, srcexp));
26615 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26616 DESTMEM.
26617 SRCMEM is passed by pointer so it can be updated on return.
26618 The return value is the updated DST. */
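/* For example (illustrative, assuming x86-64 with SSE enabled), a
   SIZE_TO_MOVE of 16 is handled by a single V2DImode load and store through
   a temporary register, after which both pointers have been advanced by 16;
   when no 16-byte move pattern is available the piece size is halved until
   a supported mode is found.  */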
26619 static rtx
26620 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26621 HOST_WIDE_INT size_to_move)
26623 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26624 enum insn_code code;
26625 machine_mode move_mode;
26626 int piece_size, i;
26628 /* Find the widest mode in which we could perform moves.
26629 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
26630 it until move of such size is supported. */
26631 piece_size = 1 << floor_log2 (size_to_move);
26632 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
26633 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26635 gcc_assert (piece_size > 1);
26636 piece_size >>= 1;
26639 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26640 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26641 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26643 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26644 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
26645 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26647 move_mode = word_mode;
26648 piece_size = GET_MODE_SIZE (move_mode);
26649 code = optab_handler (mov_optab, move_mode);
26652 gcc_assert (code != CODE_FOR_nothing);
26654 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26655 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26657 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26658 gcc_assert (size_to_move % piece_size == 0);
26659 adjust = GEN_INT (piece_size);
26660 for (i = 0; i < size_to_move; i += piece_size)
26662 /* We move from memory to memory, so we'll need to do it via
26663 a temporary register. */
26664 tempreg = gen_reg_rtx (move_mode);
26665 emit_insn (GEN_FCN (code) (tempreg, src));
26666 emit_insn (GEN_FCN (code) (dst, tempreg));
26668 emit_move_insn (destptr,
26669 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26670 emit_move_insn (srcptr,
26671 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26673 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26674 piece_size);
26675 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26676 piece_size);
26679 /* Update DST and SRC rtx. */
26680 *srcmem = src;
26681 return dst;
26684 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
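/* Example: if COUNT is a compile-time constant whose remainder modulo
   MAX_SIZE is 11, the code below emits an 8-byte, a 2-byte and a 1-byte
   move (11 == 8 + 2 + 1); for a non-constant COUNT the residue is handled
   by the loop or by the aligntest-guarded moves further down.  */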
26685 static void
26686 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26687 rtx destptr, rtx srcptr, rtx count, int max_size)
26689 rtx src, dest;
26690 if (CONST_INT_P (count))
26692 HOST_WIDE_INT countval = INTVAL (count);
26693 HOST_WIDE_INT epilogue_size = countval % max_size;
26694 int i;
26696 /* For now MAX_SIZE should be a power of 2. This assert could be
26697 relaxed, but it'll require a bit more complicated epilogue
26698 expanding. */
26699 gcc_assert ((max_size & (max_size - 1)) == 0);
26700 for (i = max_size; i >= 1; i >>= 1)
26702 if (epilogue_size & i)
26703 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26705 return;
26707 if (max_size > 8)
26709 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26710 count, 1, OPTAB_DIRECT);
26711 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26712 count, QImode, 1, 4, false);
26713 return;
26716 /* When there are stringops, we can cheaply increase dest and src pointers.
26717 Otherwise we save code size by maintaining offset (zero is readily
26718 available from the preceding rep operation) and using x86 addressing modes. */
26720 if (TARGET_SINGLE_STRINGOP)
26722 if (max_size > 4)
26724 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26725 src = change_address (srcmem, SImode, srcptr);
26726 dest = change_address (destmem, SImode, destptr);
26727 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26728 emit_label (label);
26729 LABEL_NUSES (label) = 1;
26731 if (max_size > 2)
26733 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26734 src = change_address (srcmem, HImode, srcptr);
26735 dest = change_address (destmem, HImode, destptr);
26736 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26737 emit_label (label);
26738 LABEL_NUSES (label) = 1;
26740 if (max_size > 1)
26742 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26743 src = change_address (srcmem, QImode, srcptr);
26744 dest = change_address (destmem, QImode, destptr);
26745 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26746 emit_label (label);
26747 LABEL_NUSES (label) = 1;
26750 else
26752 rtx offset = force_reg (Pmode, const0_rtx);
26753 rtx tmp;
26755 if (max_size > 4)
26757 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26758 src = change_address (srcmem, SImode, srcptr);
26759 dest = change_address (destmem, SImode, destptr);
26760 emit_move_insn (dest, src);
26761 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26762 true, OPTAB_LIB_WIDEN);
26763 if (tmp != offset)
26764 emit_move_insn (offset, tmp);
26765 emit_label (label);
26766 LABEL_NUSES (label) = 1;
26768 if (max_size > 2)
26770 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26771 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26772 src = change_address (srcmem, HImode, tmp);
26773 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26774 dest = change_address (destmem, HImode, tmp);
26775 emit_move_insn (dest, src);
26776 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26777 true, OPTAB_LIB_WIDEN);
26778 if (tmp != offset)
26779 emit_move_insn (offset, tmp);
26780 emit_label (label);
26781 LABEL_NUSES (label) = 1;
26783 if (max_size > 1)
26785 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26786 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26787 src = change_address (srcmem, QImode, tmp);
26788 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26789 dest = change_address (destmem, QImode, tmp);
26790 emit_move_insn (dest, src);
26791 emit_label (label);
26792 LABEL_NUSES (label) = 1;
26797 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26798 with value PROMOTED_VAL.
26800 The return value is the updated DST. */
26801 static rtx
26802 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26803 HOST_WIDE_INT size_to_move)
26805 rtx dst = destmem, adjust;
26806 enum insn_code code;
26807 machine_mode move_mode;
26808 int piece_size, i;
26810 /* Find the widest mode in which we could perform moves.
26811 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
26812 it until move of such size is supported. */
26813 move_mode = GET_MODE (promoted_val);
26814 if (move_mode == VOIDmode)
26815 move_mode = QImode;
26816 if (size_to_move < GET_MODE_SIZE (move_mode))
26818 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26819 move_mode = int_mode_for_size (move_bits, 0).require ();
26820 promoted_val = gen_lowpart (move_mode, promoted_val);
26822 piece_size = GET_MODE_SIZE (move_mode);
26823 code = optab_handler (mov_optab, move_mode);
26824 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26826 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26828 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26829 gcc_assert (size_to_move % piece_size == 0);
26830 adjust = GEN_INT (piece_size);
26831 for (i = 0; i < size_to_move; i += piece_size)
26833 if (piece_size <= GET_MODE_SIZE (word_mode))
26835 emit_insn (gen_strset (destptr, dst, promoted_val));
26836 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26837 piece_size);
26838 continue;
26841 emit_insn (GEN_FCN (code) (dst, promoted_val));
26843 emit_move_insn (destptr,
26844 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26846 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26847 piece_size);
26850 /* Update DST rtx. */
26851 return dst;
26853 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26854 static void
26855 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26856 rtx count, int max_size)
26858 count =
26859 expand_simple_binop (counter_mode (count), AND, count,
26860 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26861 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26862 gen_lowpart (QImode, value), count, QImode,
26863 1, max_size / 2, true);
26866 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26867 static void
26868 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26869 rtx count, int max_size)
26871 rtx dest;
26873 if (CONST_INT_P (count))
26875 HOST_WIDE_INT countval = INTVAL (count);
26876 HOST_WIDE_INT epilogue_size = countval % max_size;
26877 int i;
26879 /* For now MAX_SIZE should be a power of 2. This assert could be
26880 relaxed, but it'll require a bit more complicated epilogue
26881 expanding. */
26882 gcc_assert ((max_size & (max_size - 1)) == 0);
26883 for (i = max_size; i >= 1; i >>= 1)
26885 if (epilogue_size & i)
26887 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26888 destmem = emit_memset (destmem, destptr, vec_value, i);
26889 else
26890 destmem = emit_memset (destmem, destptr, value, i);
26893 return;
26895 if (max_size > 32)
26897 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26898 return;
26900 if (max_size > 16)
26902 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26903 if (TARGET_64BIT)
26905 dest = change_address (destmem, DImode, destptr);
26906 emit_insn (gen_strset (destptr, dest, value));
26907 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26908 emit_insn (gen_strset (destptr, dest, value));
26910 else
26912 dest = change_address (destmem, SImode, destptr);
26913 emit_insn (gen_strset (destptr, dest, value));
26914 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26915 emit_insn (gen_strset (destptr, dest, value));
26916 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26917 emit_insn (gen_strset (destptr, dest, value));
26918 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26919 emit_insn (gen_strset (destptr, dest, value));
26921 emit_label (label);
26922 LABEL_NUSES (label) = 1;
26924 if (max_size > 8)
26926 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26927 if (TARGET_64BIT)
26929 dest = change_address (destmem, DImode, destptr);
26930 emit_insn (gen_strset (destptr, dest, value));
26932 else
26934 dest = change_address (destmem, SImode, destptr);
26935 emit_insn (gen_strset (destptr, dest, value));
26936 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26937 emit_insn (gen_strset (destptr, dest, value));
26939 emit_label (label);
26940 LABEL_NUSES (label) = 1;
26942 if (max_size > 4)
26944 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26945 dest = change_address (destmem, SImode, destptr);
26946 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26947 emit_label (label);
26948 LABEL_NUSES (label) = 1;
26950 if (max_size > 2)
26952 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26953 dest = change_address (destmem, HImode, destptr);
26954 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26955 emit_label (label);
26956 LABEL_NUSES (label) = 1;
26958 if (max_size > 1)
26960 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26961 dest = change_address (destmem, QImode, destptr);
26962 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26963 emit_label (label);
26964 LABEL_NUSES (label) = 1;
26968 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
26969 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26970 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26971 ignored.
26972 Return value is updated DESTMEM. */
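/* Example: with ALIGN == 1 and DESIRED_ALIGNMENT == 16 the loop below emits
   four guarded steps that test bits 0, 1, 2 and 3 of DESTPTR and copy (or
   set) 1, 2, 4 and 8 bytes respectively, after which DESTPTR is 16-byte
   aligned and COUNT has been decreased by the number of bytes consumed.  */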
26973 static rtx
26974 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26975 rtx destptr, rtx srcptr, rtx value,
26976 rtx vec_value, rtx count, int align,
26977 int desired_alignment, bool issetmem)
26979 int i;
26980 for (i = 1; i < desired_alignment; i <<= 1)
26982 if (align <= i)
26984 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26985 if (issetmem)
26987 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26988 destmem = emit_memset (destmem, destptr, vec_value, i);
26989 else
26990 destmem = emit_memset (destmem, destptr, value, i);
26992 else
26993 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26994 ix86_adjust_counter (count, i);
26995 emit_label (label);
26996 LABEL_NUSES (label) = 1;
26997 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
27000 return destmem;
27003 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
27004 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
27005 and jump to DONE_LABEL. */
27006 static void
27007 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
27008 rtx destptr, rtx srcptr,
27009 rtx value, rtx vec_value,
27010 rtx count, int size,
27011 rtx done_label, bool issetmem)
27013 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
27014 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
27015 rtx modesize;
27016 int n;
27018 /* If we do not have vector value to copy, we must reduce size. */
27019 if (issetmem)
27021 if (!vec_value)
27023 if (GET_MODE (value) == VOIDmode && size > 8)
27024 mode = Pmode;
27025 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
27026 mode = GET_MODE (value);
27028 else
27029 mode = GET_MODE (vec_value), value = vec_value;
27031 else
27033 /* Choose appropriate vector mode. */
27034 if (size >= 32)
27035 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
27036 else if (size >= 16)
27037 mode = TARGET_SSE ? V16QImode : DImode;
27038 srcmem = change_address (srcmem, mode, srcptr);
27040 destmem = change_address (destmem, mode, destptr);
27041 modesize = GEN_INT (GET_MODE_SIZE (mode));
27042 gcc_assert (GET_MODE_SIZE (mode) <= size);
27043 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27045 if (issetmem)
27046 emit_move_insn (destmem, gen_lowpart (mode, value));
27047 else
27049 emit_move_insn (destmem, srcmem);
27050 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27052 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27055 destmem = offset_address (destmem, count, 1);
27056 destmem = offset_address (destmem, GEN_INT (-2 * size),
27057 GET_MODE_SIZE (mode));
27058 if (!issetmem)
27060 srcmem = offset_address (srcmem, count, 1);
27061 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
27062 GET_MODE_SIZE (mode));
27064 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27066 if (issetmem)
27067 emit_move_insn (destmem, gen_lowpart (mode, value));
27068 else
27070 emit_move_insn (destmem, srcmem);
27071 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27073 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27075 emit_jump_insn (gen_jump (done_label));
27076 emit_barrier ();
27078 emit_label (label);
27079 LABEL_NUSES (label) = 1;
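/* The trick above in plain C (a hypothetical sketch, not used by the
   expander): for SIZE <= N < 2*SIZE, one SIZE-byte move from the start and
   one SIZE-byte move ending at the last byte cover the whole block; the two
   destination ranges may overlap in the middle, which is harmless.  */
#if 0
#include <stddef.h>
#include <string.h>

static void
copy_small_block_sketch (unsigned char *dst, const unsigned char *src,
			 size_t n, size_t size)
{
  /* Caller guarantees size <= n && n < 2 * size and that SRC and DST
     do not overlap.  */
  memcpy (dst, src, size);
  memcpy (dst + n - size, src + n - size, size);
}
#endif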
27082 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
27083 and get ready for the main memcpy loop by copying initial DESIRED_ALIGN-ALIGN
27084 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so we can
27085 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
27086 DONE_LABEL is a label after the whole copying sequence. The label is created
27087 on demand if *DONE_LABEL is NULL.
27088 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
27089 bounds after the initial copies.
27091 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
27092 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
27093 we will dispatch to a library call for large blocks.
27095 In pseudocode we do:
27097 if (COUNT < SIZE)
27099 Assume that SIZE is 4. Bigger sizes are handled analogously
27100 if (COUNT & 4)
27102 copy 4 bytes from SRCPTR to DESTPTR
27103 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
27104 goto done_label
27106 if (!COUNT)
27107 goto done_label;
27108 copy 1 byte from SRCPTR to DESTPTR
27109 if (COUNT & 2)
27111 copy 2 bytes from SRCPTR to DESTPTR
27112 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
27115 else
27117 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
27118 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
27120 OLD_DESTPTR = DESTPTR;
27121 Align DESTPTR up to DESIRED_ALIGN
27122 SRCPTR += DESTPTR - OLD_DESTPTR
27123 COUNT -= DESTPTR - OLD_DESTPTR
27124 if (DYNAMIC_CHECK)
27125 Round COUNT down to multiple of SIZE
27126 << optional caller supplied zero size guard is here >>
27127 << optional caller supplied dynamic check is here >>
27128 << caller supplied main copy loop is here >>
27130 done_label:
27132 static void
27133 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
27134 rtx *destptr, rtx *srcptr,
27135 machine_mode mode,
27136 rtx value, rtx vec_value,
27137 rtx *count,
27138 rtx_code_label **done_label,
27139 int size,
27140 int desired_align,
27141 int align,
27142 unsigned HOST_WIDE_INT *min_size,
27143 bool dynamic_check,
27144 bool issetmem)
27146 rtx_code_label *loop_label = NULL, *label;
27147 int n;
27148 rtx modesize;
27149 int prolog_size = 0;
27150 rtx mode_value;
27152 /* Choose the proper value to copy. */
27153 if (issetmem && VECTOR_MODE_P (mode))
27154 mode_value = vec_value;
27155 else
27156 mode_value = value;
27157 gcc_assert (GET_MODE_SIZE (mode) <= size);
27159 /* See if block is big or small, handle small blocks. */
27160 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
27162 int size2 = size;
27163 loop_label = gen_label_rtx ();
27165 if (!*done_label)
27166 *done_label = gen_label_rtx ();
27168 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
27169 1, loop_label);
27170 size2 >>= 1;
27172 /* Handle sizes > 3. */
27173 for (;size2 > 2; size2 >>= 1)
27174 expand_small_movmem_or_setmem (destmem, srcmem,
27175 *destptr, *srcptr,
27176 value, vec_value,
27177 *count,
27178 size2, *done_label, issetmem);
27179 /* Nothing to copy? Jump to DONE_LABEL if so */
27180 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
27181 1, *done_label);
27183 /* Do a byte copy. */
27184 destmem = change_address (destmem, QImode, *destptr);
27185 if (issetmem)
27186 emit_move_insn (destmem, gen_lowpart (QImode, value));
27187 else
27189 srcmem = change_address (srcmem, QImode, *srcptr);
27190 emit_move_insn (destmem, srcmem);
27193 /* Handle sizes 2 and 3. */
27194 label = ix86_expand_aligntest (*count, 2, false);
27195 destmem = change_address (destmem, HImode, *destptr);
27196 destmem = offset_address (destmem, *count, 1);
27197 destmem = offset_address (destmem, GEN_INT (-2), 2);
27198 if (issetmem)
27199 emit_move_insn (destmem, gen_lowpart (HImode, value));
27200 else
27202 srcmem = change_address (srcmem, HImode, *srcptr);
27203 srcmem = offset_address (srcmem, *count, 1);
27204 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27205 emit_move_insn (destmem, srcmem);
27208 emit_label (label);
27209 LABEL_NUSES (label) = 1;
27210 emit_jump_insn (gen_jump (*done_label));
27211 emit_barrier ();
27213 else
27214 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27215 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27217 /* Start memcpy for COUNT >= SIZE. */
27218 if (loop_label)
27220 emit_label (loop_label);
27221 LABEL_NUSES (loop_label) = 1;
27224 /* Copy first desired_align bytes. */
27225 if (!issetmem)
27226 srcmem = change_address (srcmem, mode, *srcptr);
27227 destmem = change_address (destmem, mode, *destptr);
27228 modesize = GEN_INT (GET_MODE_SIZE (mode));
27229 for (n = 0; prolog_size < desired_align - align; n++)
27231 if (issetmem)
27232 emit_move_insn (destmem, mode_value);
27233 else
27235 emit_move_insn (destmem, srcmem);
27236 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27238 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27239 prolog_size += GET_MODE_SIZE (mode);
27243 /* Copy last SIZE bytes. */
27244 destmem = offset_address (destmem, *count, 1);
27245 destmem = offset_address (destmem,
27246 GEN_INT (-size - prolog_size),
27248 if (issetmem)
27249 emit_move_insn (destmem, mode_value);
27250 else
27252 srcmem = offset_address (srcmem, *count, 1);
27253 srcmem = offset_address (srcmem,
27254 GEN_INT (-size - prolog_size),
27256 emit_move_insn (destmem, srcmem);
27258 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27260 destmem = offset_address (destmem, modesize, 1);
27261 if (issetmem)
27262 emit_move_insn (destmem, mode_value);
27263 else
27265 srcmem = offset_address (srcmem, modesize, 1);
27266 emit_move_insn (destmem, srcmem);
27270 /* Align destination. */
27271 if (desired_align > 1 && desired_align > align)
27273 rtx saveddest = *destptr;
27275 gcc_assert (desired_align <= size);
27276 /* Align destptr up, place it to new register. */
27277 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27278 GEN_INT (prolog_size),
27279 NULL_RTX, 1, OPTAB_DIRECT);
27280 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27281 REG_POINTER (*destptr) = 1;
27282 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27283 GEN_INT (-desired_align),
27284 *destptr, 1, OPTAB_DIRECT);
27285 /* See how many bytes we skipped. */
27286 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27287 *destptr,
27288 saveddest, 1, OPTAB_DIRECT);
27289 /* Adjust srcptr and count. */
27290 if (!issetmem)
27291 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27292 saveddest, *srcptr, 1, OPTAB_DIRECT);
27293 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27294 saveddest, *count, 1, OPTAB_DIRECT);
27295 /* We copied at most size + prolog_size. */
27296 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27297 *min_size
27298 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27299 else
27300 *min_size = 0;
27302 /* Our loops always round down the block size, but for dispatch to the
27303 library we need the precise value. */
27304 if (dynamic_check)
27305 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
27306 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27308 else
27310 gcc_assert (prolog_size == 0);
27311 /* Decrease count, so we won't end up copying last word twice. */
27312 if (!CONST_INT_P (*count))
27313 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27314 constm1_rtx, *count, 1, OPTAB_DIRECT);
27315 else
27316 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27317 (unsigned HOST_WIDE_INT)size));
27318 if (*min_size)
27319 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
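/* The pointer/count adjustment above, shown as a standalone sketch
   (a hypothetical helper, not part of the expander): after the first
   PROLOG_SIZE bytes have been written, DST is rounded up to DESIRED_ALIGN
   and SRC and COUNT are shifted by the number of bytes the prologue
   already covered.  */
#if 0
#include <stdint.h>
#include <stddef.h>

static void
realign_after_prologue_sketch (unsigned char **dst, const unsigned char **src,
			       size_t *count, size_t prolog_size,
			       size_t desired_align)
{
  /* Assumes DESIRED_ALIGN is a power of two, as guaranteed by the caller.  */
  unsigned char *old_dst = *dst;
  uintptr_t p = (uintptr_t) *dst + prolog_size;
  *dst = (unsigned char *) (p & ~((uintptr_t) desired_align - 1));
  size_t skipped = (size_t) (*dst - old_dst);	/* bytes handled so far */
  *src += skipped;
  *count -= skipped;
}
#endif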
27324 /* This function is like the previous one, except here we know how many bytes
27325 need to be copied. That allows us to update alignment not only of DST, which
27326 is returned, but also of SRC, which is passed as a pointer for that
27327 reason. */
27328 static rtx
27329 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27330 rtx srcreg, rtx value, rtx vec_value,
27331 int desired_align, int align_bytes,
27332 bool issetmem)
27334 rtx src = NULL;
27335 rtx orig_dst = dst;
27336 rtx orig_src = NULL;
27337 int piece_size = 1;
27338 int copied_bytes = 0;
27340 if (!issetmem)
27342 gcc_assert (srcp != NULL);
27343 src = *srcp;
27344 orig_src = src;
27347 for (piece_size = 1;
27348 piece_size <= desired_align && copied_bytes < align_bytes;
27349 piece_size <<= 1)
27351 if (align_bytes & piece_size)
27353 if (issetmem)
27355 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27356 dst = emit_memset (dst, destreg, vec_value, piece_size);
27357 else
27358 dst = emit_memset (dst, destreg, value, piece_size);
27360 else
27361 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27362 copied_bytes += piece_size;
27365 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27366 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27367 if (MEM_SIZE_KNOWN_P (orig_dst))
27368 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27370 if (!issetmem)
27372 int src_align_bytes = get_mem_align_offset (src, desired_align
27373 * BITS_PER_UNIT);
27374 if (src_align_bytes >= 0)
27375 src_align_bytes = desired_align - src_align_bytes;
27376 if (src_align_bytes >= 0)
27378 unsigned int src_align;
27379 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
27381 if ((src_align_bytes & (src_align - 1))
27382 == (align_bytes & (src_align - 1)))
27383 break;
27385 if (src_align > (unsigned int) desired_align)
27386 src_align = desired_align;
27387 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
27388 set_mem_align (src, src_align * BITS_PER_UNIT);
27390 if (MEM_SIZE_KNOWN_P (orig_src))
27391 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
27392 *srcp = src;
27395 return dst;
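/* For illustration only (a hypothetical sketch, not used by the expander):
   with ALIGN_BYTES known at compile time, the head of the block is covered
   by one piece per set bit of ALIGN_BYTES, from the lowest bit up, which is
   what the PIECE_SIZE loop above emits.  */
#if 0
#include <stddef.h>
#include <string.h>

static void
constant_prologue_sketch (unsigned char *dst, const unsigned char *src,
			  size_t align_bytes)
{
  size_t done = 0;
  for (size_t piece = 1; done < align_bytes; piece <<= 1)
    if (align_bytes & piece)
      {
	memcpy (dst + done, src + done, piece);
	done += piece;
      }
}
#endif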
27398 /* Return true if ALG can be used in current context.
27399 Assume we expand memset if MEMSET is true. */
27400 static bool
27401 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
27403 if (alg == no_stringop)
27404 return false;
27405 if (alg == vector_loop)
27406 return TARGET_SSE || TARGET_AVX;
27407 /* Algorithms using the rep prefix want at least edi and ecx;
27408 additionally, memset wants eax and memcpy wants esi. Don't
27409 consider such algorithms if the user has appropriated those
27410 registers for their own purposes, or if we have a non-default
27411 address space, since some string insns cannot override the segment. */
27412 if (alg == rep_prefix_1_byte
27413 || alg == rep_prefix_4_byte
27414 || alg == rep_prefix_8_byte)
27416 if (have_as)
27417 return false;
27418 if (fixed_regs[CX_REG]
27419 || fixed_regs[DI_REG]
27420 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
27421 return false;
27423 return true;
27426 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
27427 static enum stringop_alg
27428 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
27429 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
27430 bool memset, bool zero_memset, bool have_as,
27431 int *dynamic_check, bool *noalign, bool recur)
27433 const struct stringop_algs *algs;
27434 bool optimize_for_speed;
27435 int max = 0;
27436 const struct processor_costs *cost;
27437 int i;
27438 bool any_alg_usable_p = false;
27440 *noalign = false;
27441 *dynamic_check = -1;
27443 /* Even if the string operation call is cold, we still might spend a lot
27444 of time processing large blocks. */
27445 if (optimize_function_for_size_p (cfun)
27446 || (optimize_insn_for_size_p ()
27447 && (max_size < 256
27448 || (expected_size != -1 && expected_size < 256))))
27449 optimize_for_speed = false;
27450 else
27451 optimize_for_speed = true;
27453 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
27454 if (memset)
27455 algs = &cost->memset[TARGET_64BIT != 0];
27456 else
27457 algs = &cost->memcpy[TARGET_64BIT != 0];
27459 /* See maximal size for user defined algorithm. */
27460 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27462 enum stringop_alg candidate = algs->size[i].alg;
27463 bool usable = alg_usable_p (candidate, memset, have_as);
27464 any_alg_usable_p |= usable;
27466 if (candidate != libcall && candidate && usable)
27467 max = algs->size[i].max;
27470 /* If the expected size is not known but the max size is small enough
27471 that the inline version is a win, set the expected size into
27472 the range. */
27473 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
27474 && expected_size == -1)
27475 expected_size = min_size / 2 + max_size / 2;
27477 /* If user specified the algorithm, honor it if possible. */
27478 if (ix86_stringop_alg != no_stringop
27479 && alg_usable_p (ix86_stringop_alg, memset, have_as))
27480 return ix86_stringop_alg;
27481 /* rep; movq or rep; movl is the smallest variant. */
27482 else if (!optimize_for_speed)
27484 *noalign = true;
27485 if (!count || (count & 3) || (memset && !zero_memset))
27486 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27487 ? rep_prefix_1_byte : loop_1_byte;
27488 else
27489 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27490 ? rep_prefix_4_byte : loop;
27492 /* Very tiny blocks are best handled via the loop, since REP is expensive to
27493 set up. */
27494 else if (expected_size != -1 && expected_size < 4)
27495 return loop_1_byte;
27496 else if (expected_size != -1)
27498 enum stringop_alg alg = libcall;
27499 bool alg_noalign = false;
27500 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27502 /* We get here if the algorithms that were not libcall-based
27503 were rep-prefix based and we are unable to use rep prefixes
27504 based on global register usage. Break out of the loop and
27505 use the heuristic below. */
27506 if (algs->size[i].max == 0)
27507 break;
27508 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27510 enum stringop_alg candidate = algs->size[i].alg;
27512 if (candidate != libcall
27513 && alg_usable_p (candidate, memset, have_as))
27515 alg = candidate;
27516 alg_noalign = algs->size[i].noalign;
27518 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
27519 last non-libcall inline algorithm. */
27520 if (TARGET_INLINE_ALL_STRINGOPS)
27522 /* When the current size is best to be copied by a libcall,
27523 but we are still forced to inline, run the heuristic below
27524 that will pick code for medium sized blocks. */
27525 if (alg != libcall)
27527 *noalign = alg_noalign;
27528 return alg;
27530 else if (!any_alg_usable_p)
27531 break;
27533 else if (alg_usable_p (candidate, memset, have_as))
27535 *noalign = algs->size[i].noalign;
27536 return candidate;
27541 /* When asked to inline the call anyway, try to pick a meaningful choice.
27542 We look for the maximal size of block that is faster to copy by hand and
27543 take blocks of at most that size, guessing that the average size will
27544 be roughly half of the maximum.
27546 If this turns out to be bad, we might simply specify the preferred
27547 choice in ix86_costs. */
27548 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27549 && (algs->unknown_size == libcall
27550 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27552 enum stringop_alg alg;
27553 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27555 /* If there aren't any usable algorithms or if recursing already,
27556 then recursing on smaller sizes or same size isn't going to
27557 find anything. Just return the simple byte-at-a-time copy loop. */
27558 if (!any_alg_usable_p || recur)
27560 /* Pick something reasonable. */
27561 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27562 *dynamic_check = 128;
27563 return loop_1_byte;
27565 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27566 zero_memset, have_as, dynamic_check, noalign, true);
27567 gcc_assert (*dynamic_check == -1);
27568 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27569 *dynamic_check = max;
27570 else
27571 gcc_assert (alg != libcall);
27572 return alg;
27574 return (alg_usable_p (algs->unknown_size, memset, have_as)
27575 ? algs->unknown_size : libcall);
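/* The shape of the decision above, reduced to a standalone sketch (the enum
   and struct below are illustrative stand-ins, not the real stringop_algs
   layout): each table entry names the largest block an algorithm should
   handle, -1 meaning unbounded, and the first usable entry covering the
   expected size wins.  */
#if 0
enum alg_sketch { ALG_LOOP, ALG_UNROLLED, ALG_REP_BYTE, ALG_REP_WORD,
		  ALG_LIBCALL };

struct alg_entry_sketch { long max_size; enum alg_sketch alg; };

static enum alg_sketch
pick_alg_sketch (const struct alg_entry_sketch *table, int n,
		 long expected_size)
{
  for (int i = 0; i < n; i++)
    if (table[i].max_size == -1 || table[i].max_size >= expected_size)
      return table[i].alg;
  return ALG_LIBCALL;		/* no inline variant covers this size */
}
#endif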
27578 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27579 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
27580 static int
27581 decide_alignment (int align,
27582 enum stringop_alg alg,
27583 int expected_size,
27584 machine_mode move_mode)
27586 int desired_align = 0;
27588 gcc_assert (alg != no_stringop);
27590 if (alg == libcall)
27591 return 0;
27592 if (move_mode == VOIDmode)
27593 return 0;
27595 desired_align = GET_MODE_SIZE (move_mode);
27596 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
27597 copying a whole cache line at once. */
27598 if (TARGET_PENTIUMPRO
27599 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27600 desired_align = 8;
27602 if (optimize_size)
27603 desired_align = 1;
27604 if (desired_align < align)
27605 desired_align = align;
27606 if (expected_size != -1 && expected_size < 4)
27607 desired_align = align;
27609 return desired_align;
27613 /* Helper function for memset. For QImode value 0xXY produce
27614 0xXYXYXYXY of the width specified by MODE. This is essentially
27615 a * 0x01010101, but we can do slightly better than
27616 synth_mult by unwinding the sequence by hand on CPUs with
27617 slow multiply. */
27618 static rtx
27619 promote_duplicated_reg (machine_mode mode, rtx val)
27621 machine_mode valmode = GET_MODE (val);
27622 rtx tmp;
27623 int nops = mode == DImode ? 3 : 2;
27625 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27626 if (val == const0_rtx)
27627 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27628 if (CONST_INT_P (val))
27630 HOST_WIDE_INT v = INTVAL (val) & 255;
27632 v |= v << 8;
27633 v |= v << 16;
27634 if (mode == DImode)
27635 v |= (v << 16) << 16;
27636 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27639 if (valmode == VOIDmode)
27640 valmode = QImode;
27641 if (valmode != QImode)
27642 val = gen_lowpart (QImode, val);
27643 if (mode == QImode)
27644 return val;
27645 if (!TARGET_PARTIAL_REG_STALL)
27646 nops--;
27647 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27648 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27649 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27650 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27652 rtx reg = convert_modes (mode, QImode, val, true);
27653 tmp = promote_duplicated_reg (mode, const1_rtx);
27654 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27655 OPTAB_DIRECT);
27657 else
27659 rtx reg = convert_modes (mode, QImode, val, true);
27661 if (!TARGET_PARTIAL_REG_STALL)
27662 if (mode == SImode)
27663 emit_insn (gen_insvsi_1 (reg, reg));
27664 else
27665 emit_insn (gen_insvdi_1 (reg, reg));
27666 else
27668 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27669 NULL, 1, OPTAB_DIRECT);
27670 reg =
27671 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27673 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27674 NULL, 1, OPTAB_DIRECT);
27675 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27676 if (mode == SImode)
27677 return reg;
27678 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27679 NULL, 1, OPTAB_DIRECT);
27680 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27681 return reg;
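/* Both strategies above in plain C (a hypothetical sketch): broadcasting a
   byte either by one multiply with 0x0101...01 or by a short chain of
   shift-and-or steps, whichever the cost tables rate cheaper.  */
#if 0
#include <stdint.h>

static uint64_t
broadcast_byte_mul (uint8_t b)
{
  return (uint64_t) b * 0x0101010101010101ull;	/* 0xXY -> 0xXYXY...XY */
}

static uint64_t
broadcast_byte_shift (uint8_t b)
{
  uint64_t v = b;
  v |= v << 8;		/* 0x00XY -> 0xXYXY */
  v |= v << 16;		/* 0xXYXY -> 0xXYXYXYXY */
  v |= v << 32;		/* and finally the full 64-bit pattern */
  return v;
}
#endif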
27685 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
27686 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
27687 alignment from ALIGN to DESIRED_ALIGN. */
27688 static rtx
27689 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27690 int align)
27692 rtx promoted_val;
27694 if (TARGET_64BIT
27695 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27696 promoted_val = promote_duplicated_reg (DImode, val);
27697 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27698 promoted_val = promote_duplicated_reg (SImode, val);
27699 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27700 promoted_val = promote_duplicated_reg (HImode, val);
27701 else
27702 promoted_val = val;
27704 return promoted_val;
27707 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27708 operations when profitable. The code depends upon architecture, block size
27709 and alignment, but always has one of the following overall structures:
27711 Aligned move sequence:
27713 1) Prologue guard: Conditional that jumps up to epilogues for small
27714 blocks that can be handled by epilogue alone. This is faster
27715 but also needed for correctness, since the prologue assumes the block
27716 is larger than the desired alignment.
27718 An optional dynamic check for size and a libcall for large
27719 blocks are emitted here too, with -minline-stringops-dynamically.
27721 2) Prologue: copy first few bytes in order to get destination
27722 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27723 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27724 copied. We emit either a jump tree on power of two sized
27725 blocks, or a byte loop.
27727 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27728 with specified algorithm.
27730 4) Epilogue: code copying tail of the block that is too small to be
27731 handled by main body (or up to size guarded by prologue guard).
27733 Misaligned move sequence
27735 1) Misaligned move prologue/epilogue containing:
27736 a) Prologue handling small memory blocks and jumping to done_label
27737 (skipped if blocks are known to be large enough)
27738 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
27739 is needed, done by a single possibly misaligned move
27740 (skipped if alignment is not needed)
27741 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27743 2) Zero size guard dispatching to done_label, if needed
27745 3) Dispatch to a library call, if needed
27747 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27748 with specified algorithm. */
27749 bool
27750 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27751 rtx align_exp, rtx expected_align_exp,
27752 rtx expected_size_exp, rtx min_size_exp,
27753 rtx max_size_exp, rtx probable_max_size_exp,
27754 bool issetmem)
27756 rtx destreg;
27757 rtx srcreg = NULL;
27758 rtx_code_label *label = NULL;
27759 rtx tmp;
27760 rtx_code_label *jump_around_label = NULL;
27761 HOST_WIDE_INT align = 1;
27762 unsigned HOST_WIDE_INT count = 0;
27763 HOST_WIDE_INT expected_size = -1;
27764 int size_needed = 0, epilogue_size_needed;
27765 int desired_align = 0, align_bytes = 0;
27766 enum stringop_alg alg;
27767 rtx promoted_val = NULL;
27768 rtx vec_promoted_val = NULL;
27769 bool force_loopy_epilogue = false;
27770 int dynamic_check;
27771 bool need_zero_guard = false;
27772 bool noalign;
27773 machine_mode move_mode = VOIDmode;
27774 machine_mode wider_mode;
27775 int unroll_factor = 1;
27776 /* TODO: Once value ranges are available, fill in proper data. */
27777 unsigned HOST_WIDE_INT min_size = 0;
27778 unsigned HOST_WIDE_INT max_size = -1;
27779 unsigned HOST_WIDE_INT probable_max_size = -1;
27780 bool misaligned_prologue_used = false;
27781 bool have_as;
27783 if (CONST_INT_P (align_exp))
27784 align = INTVAL (align_exp);
27785 /* i386 can do misaligned access at a reasonable increase in cost. */
27786 if (CONST_INT_P (expected_align_exp)
27787 && INTVAL (expected_align_exp) > align)
27788 align = INTVAL (expected_align_exp);
27789 /* ALIGN is the minimum of destination and source alignment, but we care here
27790 just about destination alignment. */
27791 else if (!issetmem
27792 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27793 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27795 if (CONST_INT_P (count_exp))
27797 min_size = max_size = probable_max_size = count = expected_size
27798 = INTVAL (count_exp);
27799 /* When COUNT is 0, there is nothing to do. */
27800 if (!count)
27801 return true;
27803 else
27805 if (min_size_exp)
27806 min_size = INTVAL (min_size_exp);
27807 if (max_size_exp)
27808 max_size = INTVAL (max_size_exp);
27809 if (probable_max_size_exp)
27810 probable_max_size = INTVAL (probable_max_size_exp);
27811 if (CONST_INT_P (expected_size_exp))
27812 expected_size = INTVAL (expected_size_exp);
27815 /* Make sure we don't need to care about overflow later on. */
27816 if (count > (HOST_WIDE_INT_1U << 30))
27817 return false;
27819 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27820 if (!issetmem)
27821 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27823 /* Step 0: Decide on preferred algorithm, desired alignment and
27824 size of chunks to be copied by main loop. */
27825 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27826 issetmem,
27827 issetmem && val_exp == const0_rtx, have_as,
27828 &dynamic_check, &noalign, false);
27829 if (alg == libcall)
27830 return false;
27831 gcc_assert (alg != no_stringop);
27833 /* For now the vector version of memset is generated only for memory zeroing, as
27834 creating the promoted vector value is very cheap in this case. */
27835 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27836 alg = unrolled_loop;
27838 if (!count)
27839 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27840 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27841 if (!issetmem)
27842 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27844 unroll_factor = 1;
27845 move_mode = word_mode;
27846 switch (alg)
27848 case libcall:
27849 case no_stringop:
27850 case last_alg:
27851 gcc_unreachable ();
27852 case loop_1_byte:
27853 need_zero_guard = true;
27854 move_mode = QImode;
27855 break;
27856 case loop:
27857 need_zero_guard = true;
27858 break;
27859 case unrolled_loop:
27860 need_zero_guard = true;
27861 unroll_factor = (TARGET_64BIT ? 4 : 2);
27862 break;
27863 case vector_loop:
27864 need_zero_guard = true;
27865 unroll_factor = 4;
27866 /* Find the widest supported mode. */
27867 move_mode = word_mode;
27868 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27869 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27870 move_mode = wider_mode;
27872 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27873 move_mode = TImode;
27875 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27876 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27877 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27879 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27880 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27881 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27882 move_mode = word_mode;
27884 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27885 break;
27886 case rep_prefix_8_byte:
27887 move_mode = DImode;
27888 break;
27889 case rep_prefix_4_byte:
27890 move_mode = SImode;
27891 break;
27892 case rep_prefix_1_byte:
27893 move_mode = QImode;
27894 break;
27896 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27897 epilogue_size_needed = size_needed;
27899 /* If we are going to emit any library calls conditionally, make sure any
27900 pending stack adjustment happens before the first conditional branch,
27901 otherwise it will be emitted before the library call only and won't
27902 happen from the other branches. */
27903 if (dynamic_check != -1)
27904 do_pending_stack_adjust ();
27906 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27907 if (!TARGET_ALIGN_STRINGOPS || noalign)
27908 align = desired_align;
27910 /* Step 1: Prologue guard. */
27912 /* Alignment code needs count to be in register. */
27913 if (CONST_INT_P (count_exp) && desired_align > align)
27915 if (INTVAL (count_exp) > desired_align
27916 && INTVAL (count_exp) > size_needed)
27918 align_bytes
27919 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27920 if (align_bytes <= 0)
27921 align_bytes = 0;
27922 else
27923 align_bytes = desired_align - align_bytes;
27925 if (align_bytes == 0)
27926 count_exp = force_reg (counter_mode (count_exp), count_exp);
27928 gcc_assert (desired_align >= 1 && align >= 1);
27930 /* Misaligned move sequences handle both prologue and epilogue at once.
27931 Default code generation results in smaller code for large alignments
27932 and also avoids redundant work when sizes are known precisely. */
27933 misaligned_prologue_used
27934 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27935 && MAX (desired_align, epilogue_size_needed) <= 32
27936 && desired_align <= epilogue_size_needed
27937 && ((desired_align > align && !align_bytes)
27938 || (!count && epilogue_size_needed > 1)));
27940 /* Do the cheap promotion to allow better CSE across the
27941 main loop and epilogue (i.e. one load of the big constant in
27942 front of all the code).
27943 For now the misaligned move sequences do not have a fast path
27944 without broadcasting. */
27945 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27947 if (alg == vector_loop)
27949 gcc_assert (val_exp == const0_rtx);
27950 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27951 promoted_val = promote_duplicated_reg_to_size (val_exp,
27952 GET_MODE_SIZE (word_mode),
27953 desired_align, align);
27955 else
27957 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27958 desired_align, align);
27961 /* Misaligned move sequences handle both prologues and epilogues at once.
27962 Default code generation results in smaller code for large alignments and
27963 also avoids redundant work when sizes are known precisely. */
27964 if (misaligned_prologue_used)
27966 /* The misaligned move prologue handles small blocks by itself. */
27967 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27968 (dst, src, &destreg, &srcreg,
27969 move_mode, promoted_val, vec_promoted_val,
27970 &count_exp,
27971 &jump_around_label,
27972 desired_align < align
27973 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27974 desired_align, align, &min_size, dynamic_check, issetmem);
27975 if (!issetmem)
27976 src = change_address (src, BLKmode, srcreg);
27977 dst = change_address (dst, BLKmode, destreg);
27978 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27979 epilogue_size_needed = 0;
27980 if (need_zero_guard
27981 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27983 /* It is possible that we copied enough so the main loop will not
27984 execute. */
27985 gcc_assert (size_needed > 1);
27986 if (jump_around_label == NULL_RTX)
27987 jump_around_label = gen_label_rtx ();
27988 emit_cmp_and_jump_insns (count_exp,
27989 GEN_INT (size_needed),
27990 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27991 if (expected_size == -1
27992 || expected_size < (desired_align - align) / 2 + size_needed)
27993 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27994 else
27995 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27998 /* Ensure that alignment prologue won't copy past end of block. */
27999 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
28001 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
28002 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
28003 Make sure it is power of 2. */
28004 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
28006 /* To improve performance of small blocks, we jump around the VAL
28007 promoting code. This means that if the promoted VAL is not constant,
28008 we might not use it in the epilogue and have to use the byte
28009 loop variant. */
28010 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
28011 force_loopy_epilogue = true;
28012 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28013 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28015 /* If main algorithm works on QImode, no epilogue is needed.
28016 For small sizes just don't align anything. */
28017 if (size_needed == 1)
28018 desired_align = align;
28019 else
28020 goto epilogue;
28022 else if (!count
28023 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28025 label = gen_label_rtx ();
28026 emit_cmp_and_jump_insns (count_exp,
28027 GEN_INT (epilogue_size_needed),
28028 LTU, 0, counter_mode (count_exp), 1, label);
28029 if (expected_size == -1 || expected_size < epilogue_size_needed)
28030 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28031 else
28032 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28036 /* Emit code to decide on runtime whether library call or inline should be
28037 used. */
28038 if (dynamic_check != -1)
28040 if (!issetmem && CONST_INT_P (count_exp))
28042 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
28044 emit_block_copy_via_libcall (dst, src, count_exp);
28045 count_exp = const0_rtx;
28046 goto epilogue;
28049 else
28051 rtx_code_label *hot_label = gen_label_rtx ();
28052 if (jump_around_label == NULL_RTX)
28053 jump_around_label = gen_label_rtx ();
28054 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
28055 LEU, 0, counter_mode (count_exp),
28056 1, hot_label);
28057 predict_jump (REG_BR_PROB_BASE * 90 / 100);
28058 if (issetmem)
28059 set_storage_via_libcall (dst, count_exp, val_exp);
28060 else
28061 emit_block_copy_via_libcall (dst, src, count_exp);
28062 emit_jump (jump_around_label);
28063 emit_label (hot_label);
28067 /* Step 2: Alignment prologue. */
28068 /* Do the expensive promotion once we branched off the small blocks. */
28069 if (issetmem && !promoted_val)
28070 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28071 desired_align, align);
28073 if (desired_align > align && !misaligned_prologue_used)
28075 if (align_bytes == 0)
28077 /* Except for the first move in the prologue, we no longer know
28078 the constant offset in aliasing info. It doesn't seem worth
28079 the pain to maintain it for the first move, so throw away
28080 the info early. */
28081 dst = change_address (dst, BLKmode, destreg);
28082 if (!issetmem)
28083 src = change_address (src, BLKmode, srcreg);
28084 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
28085 promoted_val, vec_promoted_val,
28086 count_exp, align, desired_align,
28087 issetmem);
28088 /* At most desired_align - align bytes are copied. */
28089 if (min_size < (unsigned)(desired_align - align))
28090 min_size = 0;
28091 else
28092 min_size -= desired_align - align;
28094 else
28096 /* If we know how many bytes need to be stored before dst is
28097 sufficiently aligned, maintain aliasing info accurately. */
28098 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
28099 srcreg,
28100 promoted_val,
28101 vec_promoted_val,
28102 desired_align,
28103 align_bytes,
28104 issetmem);
28106 count_exp = plus_constant (counter_mode (count_exp),
28107 count_exp, -align_bytes);
28108 count -= align_bytes;
28109 min_size -= align_bytes;
28110 max_size -= align_bytes;
28112 if (need_zero_guard
28113 && min_size < (unsigned HOST_WIDE_INT) size_needed
28114 && (count < (unsigned HOST_WIDE_INT) size_needed
28115 || (align_bytes == 0
28116 && count < ((unsigned HOST_WIDE_INT) size_needed
28117 + desired_align - align))))
28119 /* It is possible that we copied enough so the main loop will not
28120 execute. */
28121 gcc_assert (size_needed > 1);
28122 if (label == NULL_RTX)
28123 label = gen_label_rtx ();
28124 emit_cmp_and_jump_insns (count_exp,
28125 GEN_INT (size_needed),
28126 LTU, 0, counter_mode (count_exp), 1, label);
28127 if (expected_size == -1
28128 || expected_size < (desired_align - align) / 2 + size_needed)
28129 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28130 else
28131 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28134 if (label && size_needed == 1)
28136 emit_label (label);
28137 LABEL_NUSES (label) = 1;
28138 label = NULL;
28139 epilogue_size_needed = 1;
28140 if (issetmem)
28141 promoted_val = val_exp;
28143 else if (label == NULL_RTX && !misaligned_prologue_used)
28144 epilogue_size_needed = size_needed;
28146 /* Step 3: Main loop. */
28148 switch (alg)
28150 case libcall:
28151 case no_stringop:
28152 case last_alg:
28153 gcc_unreachable ();
28154 case loop_1_byte:
28155 case loop:
28156 case unrolled_loop:
28157 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
28158 count_exp, move_mode, unroll_factor,
28159 expected_size, issetmem);
28160 break;
28161 case vector_loop:
28162 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
28163 vec_promoted_val, count_exp, move_mode,
28164 unroll_factor, expected_size, issetmem);
28165 break;
28166 case rep_prefix_8_byte:
28167 case rep_prefix_4_byte:
28168 case rep_prefix_1_byte:
28169 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
28170 val_exp, count_exp, move_mode, issetmem);
28171 break;
28173 /* Adjust properly the offset of src and dest memory for aliasing. */
28174 if (CONST_INT_P (count_exp))
28176 if (!issetmem)
28177 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
28178 (count / size_needed) * size_needed);
28179 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
28180 (count / size_needed) * size_needed);
28182 else
28184 if (!issetmem)
28185 src = change_address (src, BLKmode, srcreg);
28186 dst = change_address (dst, BLKmode, destreg);
28189 /* Step 4: Epilogue to copy the remaining bytes. */
28190 epilogue:
28191 if (label)
28193 /* When the main loop is done, COUNT_EXP might hold original count,
28194 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
28195 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
28196 bytes. Compensate if needed. */
28198 if (size_needed < epilogue_size_needed)
28200 tmp =
28201 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28202 GEN_INT (size_needed - 1), count_exp, 1,
28203 OPTAB_DIRECT);
28204 if (tmp != count_exp)
28205 emit_move_insn (count_exp, tmp);
28207 emit_label (label);
28208 LABEL_NUSES (label) = 1;
28211 if (count_exp != const0_rtx && epilogue_size_needed > 1)
28213 if (force_loopy_epilogue)
28214 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28215 epilogue_size_needed);
28216 else
28218 if (issetmem)
28219 expand_setmem_epilogue (dst, destreg, promoted_val,
28220 vec_promoted_val, count_exp,
28221 epilogue_size_needed);
28222 else
28223 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28224 epilogue_size_needed);
28227 if (jump_around_label)
28228 emit_label (jump_around_label);
28229 return true;
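/* The aligned move sequence described in the comment before this function,
   condensed into a plain C sketch (CHUNK and DESIRED_ALIGN are illustrative
   stand-ins for size_needed and desired_align; the real expander emits a
   jump tree rather than variable-length memcpy calls):
   1) prologue guard, 2) alignment prologue, 3) main loop, 4) epilogue.  */
#if 0
#include <stdint.h>
#include <stddef.h>
#include <string.h>

static void
expanded_memcpy_shape_sketch (unsigned char *dst, const unsigned char *src,
			      size_t n)
{
  enum { CHUNK = 16, DESIRED_ALIGN = 16 };

  if (n >= CHUNK)				/* 1) prologue guard */
    {
      size_t skip = -(uintptr_t) dst & (DESIRED_ALIGN - 1);
      memcpy (dst, src, skip);			/* 2) alignment prologue */
      dst += skip, src += skip, n -= skip;

      for (; n >= CHUNK; dst += CHUNK, src += CHUNK, n -= CHUNK)
	memcpy (dst, src, CHUNK);		/* 3) main loop */
    }
  memcpy (dst, src, n);				/* 4) epilogue */
}
#endif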
28233 /* Expand the appropriate insns for doing strlen if not just doing
28234 repnz; scasb
28236 out = result, initialized with the start address
28237 align_rtx = alignment of the address.
28238 scratch = scratch register, initialized with the start address when
28239 not aligned, otherwise undefined
28241 This is just the body. It needs the initializations mentioned above and
28242 some address computing at the end. These things are done in i386.md. */
28244 static void
28245 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
28247 int align;
28248 rtx tmp;
28249 rtx_code_label *align_2_label = NULL;
28250 rtx_code_label *align_3_label = NULL;
28251 rtx_code_label *align_4_label = gen_label_rtx ();
28252 rtx_code_label *end_0_label = gen_label_rtx ();
28253 rtx mem;
28254 rtx tmpreg = gen_reg_rtx (SImode);
28255 rtx scratch = gen_reg_rtx (SImode);
28256 rtx cmp;
28258 align = 0;
28259 if (CONST_INT_P (align_rtx))
28260 align = INTVAL (align_rtx);
28262 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
28264 /* Is there a known alignment and is it less than 4? */
28265 if (align < 4)
28267 rtx scratch1 = gen_reg_rtx (Pmode);
28268 emit_move_insn (scratch1, out);
28269 /* Is there a known alignment and is it not 2? */
28270 if (align != 2)
28272 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28273 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28275 /* Leave just the 3 lower bits. */
28276 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28277 NULL_RTX, 0, OPTAB_WIDEN);
28279 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28280 Pmode, 1, align_4_label);
28281 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28282 Pmode, 1, align_2_label);
28283 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28284 Pmode, 1, align_3_label);
28286 else
28288 /* Since the alignment is 2, we have to check 2 or 0 bytes;
28289 check if it is aligned to a 4-byte boundary. */
28291 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28292 NULL_RTX, 0, OPTAB_WIDEN);
28294 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28295 Pmode, 1, align_4_label);
28298 mem = change_address (src, QImode, out);
28300 /* Now compare the bytes. */
28302 /* Compare the first n unaligned byte on a byte per byte basis. */
28303 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28304 QImode, 1, end_0_label);
28306 /* Increment the address. */
28307 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28309 /* Not needed with an alignment of 2 */
28310 if (align != 2)
28312 emit_label (align_2_label);
28314 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28315 end_0_label);
28317 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28319 emit_label (align_3_label);
28322 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28323 end_0_label);
28325 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28328 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
28329 align this loop; doing so only bloats the code and does not help
28330 speed. */
28331 emit_label (align_4_label);
28333 mem = change_address (src, SImode, out);
28334 emit_move_insn (scratch, mem);
28335 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28337 /* This formula yields a nonzero result iff one of the bytes is zero.
28338 This saves three branches inside the loop and many cycles. */
28340 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28341 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28342 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28343 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28344 gen_int_mode (0x80808080, SImode)));
28345 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28346 align_4_label);
28348 if (TARGET_CMOVE)
28350 rtx reg = gen_reg_rtx (SImode);
28351 rtx reg2 = gen_reg_rtx (Pmode);
28352 emit_move_insn (reg, tmpreg);
28353 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28355 /* If zero is not in the first two bytes, move two bytes forward. */
28356 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28357 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28358 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28359 emit_insn (gen_rtx_SET (tmpreg,
28360 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28361 reg,
28362 tmpreg)));
28363 /* Emit lea manually to avoid clobbering of flags. */
28364 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28366 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28367 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28368 emit_insn (gen_rtx_SET (out,
28369 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28370 reg2,
28371 out)));
28373 else
28375 rtx_code_label *end_2_label = gen_label_rtx ();
28376 /* Is zero in the first two bytes? */
28378 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28379 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28380 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
28381 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28382 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
28383 pc_rtx);
28384 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
28385 JUMP_LABEL (tmp) = end_2_label;
28387 /* Not in the first two. Move two bytes forward. */
28388 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
28389 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
28391 emit_label (end_2_label);
28395 /* Avoid branch in fixing the byte. */
28396 tmpreg = gen_lowpart (QImode, tmpreg);
28397 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
28398 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
28399 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
28400 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
28402 emit_label (end_0_label);
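/* The zero-byte test used in the 4-byte loop above, as a standalone sketch
   (a hypothetical helper): for a 32-bit word X,
   (X - 0x01010101) & ~X & 0x80808080 is nonzero iff some byte of X is zero,
   so one test and branch replaces four byte compares per iteration.  */
#if 0
#include <stdint.h>

static int
word_has_zero_byte (uint32_t x)
{
  return ((x - 0x01010101u) & ~x & 0x80808080u) != 0;
}

/* E.g. x = 0x61006263: subtracting 0x01010101 wraps the zero byte to 0xff,
   so its sign bit survives both the ~x mask and 0x80808080, giving a
   nonzero result; for x = 0x61626364 every byte clears its own 0x80 bit.  */
#endif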
28405 /* Expand strlen. */
28407 bool
28408 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
28410 rtx addr, scratch1, scratch2, scratch3, scratch4;
28412 /* The generic case of the strlen expander is long. Avoid
28413 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
28415 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28416 && !TARGET_INLINE_ALL_STRINGOPS
28417 && !optimize_insn_for_size_p ()
28418 && (!CONST_INT_P (align) || INTVAL (align) < 4))
28419 return false;
28421 addr = force_reg (Pmode, XEXP (src, 0));
28422 scratch1 = gen_reg_rtx (Pmode);
28424 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28425 && !optimize_insn_for_size_p ())
28427 /* Well, it seems that some optimizer does not combine a call like
28428 foo(strlen(bar), strlen(bar));
28429 when the move and the subtraction are done here. It does calculate
28430 the length just once when these instructions are done inside
28431 output_strlen_unroll(). But since &bar[strlen(bar)] is
28432 often used and this uses one fewer register for the lifetime of
28433 output_strlen_unroll(), this is better. */
28435 emit_move_insn (out, addr);
28437 ix86_expand_strlensi_unroll_1 (out, src, align);
28439 /* strlensi_unroll_1 returns the address of the zero at the end of
28440 the string, like memchr(), so compute the length by subtracting
28441 the start address. */
28442 emit_insn (ix86_gen_sub3 (out, out, addr));
28444 else
28446 rtx unspec;
28448 /* Can't use this if the user has appropriated eax, ecx, or edi. */
28449 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
28450 return false;
28451 /* Can't use this for non-default address spaces. */
28452 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
28453 return false;
28455 scratch2 = gen_reg_rtx (Pmode);
28456 scratch3 = gen_reg_rtx (Pmode);
28457 scratch4 = force_reg (Pmode, constm1_rtx);
28459 emit_move_insn (scratch3, addr);
28460 eoschar = force_reg (QImode, eoschar);
28462 src = replace_equiv_address_nv (src, scratch3);
28464 /* If .md starts supporting :P, this can be done in .md. */
28465 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
28466 scratch4), UNSPEC_SCAS);
28467 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
28468 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
28469 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
28471 return true;
28474 /* For a given symbol (function), construct code to compute the address of its
28475 PLT entry in the large x86-64 PIC model. */
28476 static rtx
28477 construct_plt_address (rtx symbol)
28479 rtx tmp, unspec;
28481 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
28482 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
28483 gcc_assert (Pmode == DImode);
28485 tmp = gen_reg_rtx (Pmode);
28486 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28488 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28489 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28490 return tmp;
28494 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28495 rtx callarg2,
28496 rtx pop, bool sibcall)
28498 rtx vec[3];
28499 rtx use = NULL, call;
28500 unsigned int vec_len = 0;
28501 tree fndecl;
28503 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28505 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28506 if (fndecl
28507 && (lookup_attribute ("interrupt",
28508 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28509 error ("interrupt service routine can't be called directly");
28511 else
28512 fndecl = NULL_TREE;
28514 if (pop == const0_rtx)
28515 pop = NULL;
28516 gcc_assert (!TARGET_64BIT || !pop);
28518 if (TARGET_MACHO && !TARGET_64BIT)
28520 #if TARGET_MACHO
28521 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28522 fnaddr = machopic_indirect_call_target (fnaddr);
28523 #endif
28525 else
28527 /* Static functions and indirect calls don't need the PIC register. Also,
28528 check if PLT was explicitly avoided via no-plt or the "noplt" attribute, making
28529 it an indirect call. */
28530 rtx addr = XEXP (fnaddr, 0);
28531 if (flag_pic
28532 && GET_CODE (addr) == SYMBOL_REF
28533 && !SYMBOL_REF_LOCAL_P (addr))
28535 if (flag_plt
28536 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28537 || !lookup_attribute ("noplt",
28538 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28540 if (!TARGET_64BIT
28541 || (ix86_cmodel == CM_LARGE_PIC
28542 && DEFAULT_ABI != MS_ABI))
28544 use_reg (&use, gen_rtx_REG (Pmode,
28545 REAL_PIC_OFFSET_TABLE_REGNUM));
28546 if (ix86_use_pseudo_pic_reg ())
28547 emit_move_insn (gen_rtx_REG (Pmode,
28548 REAL_PIC_OFFSET_TABLE_REGNUM),
28549 pic_offset_table_rtx);
28552 else if (!TARGET_PECOFF && !TARGET_MACHO)
28554 if (TARGET_64BIT)
28556 fnaddr = gen_rtx_UNSPEC (Pmode,
28557 gen_rtvec (1, addr),
28558 UNSPEC_GOTPCREL);
28559 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28561 else
28563 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28564 UNSPEC_GOT);
28565 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28566 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28567 fnaddr);
28569 fnaddr = gen_const_mem (Pmode, fnaddr);
28570 /* Pmode may not be the same as word_mode for x32, which
28571 doesn't support indirect branch via 32-bit memory slot.
28572 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28573 indirect branch via x32 GOT slot is OK. */
28574 if (GET_MODE (fnaddr) != word_mode)
28575 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28576 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28581 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28582 parameters passed in vector registers. */
28583 if (TARGET_64BIT
28584 && (INTVAL (callarg2) > 0
28585 || (INTVAL (callarg2) == 0
28586 && (TARGET_SSE || !flag_skip_rax_setup))))
28588 rtx al = gen_rtx_REG (QImode, AX_REG);
28589 emit_move_insn (al, callarg2);
28590 use_reg (&use, al);
28593 if (ix86_cmodel == CM_LARGE_PIC
28594 && !TARGET_PECOFF
28595 && MEM_P (fnaddr)
28596 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28597 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28598 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28599 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28600 branch via x32 GOT slot is OK. */
28601 else if (!(TARGET_X32
28602 && MEM_P (fnaddr)
28603 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28604 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28605 && (sibcall
28606 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28607 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28609 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28610 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28613 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28615 if (retval)
28617 /* We should add bounds as a destination register in case
28618 a pointer with bounds may be returned. */
28619 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28621 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28622 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28623 if (GET_CODE (retval) == PARALLEL)
28625 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28626 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28627 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28628 retval = chkp_join_splitted_slot (retval, par);
28630 else
28632 retval = gen_rtx_PARALLEL (VOIDmode,
28633 gen_rtvec (3, retval, b0, b1));
28634 chkp_put_regs_to_expr_list (retval);
28638 call = gen_rtx_SET (retval, call);
28640 vec[vec_len++] = call;
28642 if (pop)
28644 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28645 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28646 vec[vec_len++] = pop;
28649 if (cfun->machine->no_caller_saved_registers
28650 && (!fndecl
28651 || (!TREE_THIS_VOLATILE (fndecl)
28652 && !lookup_attribute ("no_caller_saved_registers",
28653 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28655 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28656 bool is_64bit_ms_abi = (TARGET_64BIT
28657 && ix86_function_abi (fndecl) == MS_ABI);
28658 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28660 /* If there are no caller-saved registers, add all registers
28661 that are clobbered by the call which returns. */
28662 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28663 if (!fixed_regs[i]
28664 && (ix86_call_used_regs[i] == 1
28665 || (ix86_call_used_regs[i] & c_mask))
28666 && !STACK_REGNO_P (i)
28667 && !MMX_REGNO_P (i))
28668 clobber_reg (&use,
28669 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28671 else if (TARGET_64BIT_MS_ABI
28672 && (!callarg2 || INTVAL (callarg2) != -2))
28674 unsigned i;
28676 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28678 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28679 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28681 clobber_reg (&use, gen_rtx_REG (mode, regno));
28684 /* Set here, but it may get cleared later. */
28685 if (TARGET_CALL_MS2SYSV_XLOGUES)
28687 if (!TARGET_SSE)
28690 /* Don't break hot-patched functions. */
28691 else if (ix86_function_ms_hook_prologue (current_function_decl))
28694 /* TODO: Cases not yet examined. */
28695 else if (flag_split_stack)
28696 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28698 else
28700 gcc_assert (!reload_completed);
28701 cfun->machine->call_ms2sysv = true;
28706 if (vec_len > 1)
28707 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28708 call = emit_call_insn (call);
28709 if (use)
28710 CALL_INSN_FUNCTION_USAGE (call) = use;
28712 return call;
28715 /* Return true if the function being called was marked with attribute
28716 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28717 to handle the non-PIC case in the backend because there is no easy
28718 interface for the front-end to force non-PLT calls to use the GOT.
28719 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28720 to call the function marked "noplt" indirectly. */
28722 static bool
28723 ix86_nopic_noplt_attribute_p (rtx call_op)
28725 if (flag_pic || ix86_cmodel == CM_LARGE
28726 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28727 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28728 || SYMBOL_REF_LOCAL_P (call_op))
28729 return false;
28731 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28733 if (!flag_plt
28734 || (symbol_decl != NULL_TREE
28735 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28736 return true;
28738 return false;
28741 /* Output indirect branch via a call and return thunk. CALL_OP is a
28742 register which contains the branch target. XASM is the assembly
28743 template for CALL_OP. Branch is a tail call if SIBCALL_P is true.
28744 A normal call is converted to:
28746 call __x86_indirect_thunk_reg
28748 and a tail call is converted to:
28750 jmp __x86_indirect_thunk_reg
28753 static void
28754 ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p)
28756 char thunk_name_buf[32];
28757 char *thunk_name;
28758 enum indirect_thunk_prefix need_prefix
28759 = indirect_thunk_need_prefix (current_output_insn);
28760 int regno = REGNO (call_op);
28762 if (cfun->machine->indirect_branch_type
28763 != indirect_branch_thunk_inline)
28765 if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28767 int i = regno;
28768 if (i >= FIRST_REX_INT_REG)
28769 i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1);
28770 if (need_prefix == indirect_thunk_prefix_bnd)
28771 indirect_thunks_bnd_used |= 1 << i;
28772 else
28773 indirect_thunks_used |= 1 << i;
28775 indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
28776 thunk_name = thunk_name_buf;
28778 else
28779 thunk_name = NULL;
28781 if (sibcall_p)
28783 if (thunk_name != NULL)
28785 if (need_prefix == indirect_thunk_prefix_bnd)
28786 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28787 else
28788 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28790 else
28791 output_indirect_thunk (need_prefix, regno);
28793 else
28795 if (thunk_name != NULL)
28797 if (need_prefix == indirect_thunk_prefix_bnd)
28798 fprintf (asm_out_file, "\tbnd call\t%s\n", thunk_name);
28799 else
28800 fprintf (asm_out_file, "\tcall\t%s\n", thunk_name);
28801 return;
28804 char indirectlabel1[32];
28805 char indirectlabel2[32];
28807 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28808 INDIRECT_LABEL,
28809 indirectlabelno++);
28810 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28811 INDIRECT_LABEL,
28812 indirectlabelno++);
28814 /* Jump. */
28815 if (need_prefix == indirect_thunk_prefix_bnd)
28816 fputs ("\tbnd jmp\t", asm_out_file);
28817 else
28818 fputs ("\tjmp\t", asm_out_file);
28819 assemble_name_raw (asm_out_file, indirectlabel2);
28820 fputc ('\n', asm_out_file);
28822 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28824 if (thunk_name != NULL)
28826 if (need_prefix == indirect_thunk_prefix_bnd)
28827 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28828 else
28829 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28831 else
28832 output_indirect_thunk (need_prefix, regno);
28834 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28836 /* Call. */
28837 if (need_prefix == indirect_thunk_prefix_bnd)
28838 fputs ("\tbnd call\t", asm_out_file);
28839 else
28840 fputs ("\tcall\t", asm_out_file);
28841 assemble_name_raw (asm_out_file, indirectlabel1);
28842 fputc ('\n', asm_out_file);
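/* Editorial illustration, not part of the original source: for a normal
   (non-sibcall) call through %rax with -mindirect-branch=thunk, the path
   above emits roughly

	jmp	.LIND2
   .LIND1:
	jmp	__x86_indirect_thunk_rax
   .LIND2:
	call	.LIND1

   The call pushes a proper return address for the callee while the thunk
   performs the actual transfer to the target held in %rax; with
   -mindirect-branch=thunk-inline the thunk body is emitted in place of the
   "jmp __x86_indirect_thunk_rax".  Label names are illustrative.  */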
28846 /* Output indirect branch via a call and return thunk. CALL_OP is
28847 the branch target. XASM is the assembly template for CALL_OP.
28848 Branch is a tail call if SIBCALL_P is true. A normal call is
28849 converted to:
28851 jmp L2
28853 push CALL_OP
28854 jmp __x86_indirect_thunk
28856 call L1
28858 and a tail call is converted to:
28860 push CALL_OP
28861 jmp __x86_indirect_thunk
28864 static void
28865 ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm,
28866 bool sibcall_p)
28868 char thunk_name_buf[32];
28869 char *thunk_name;
28870 char push_buf[64];
28871 enum indirect_thunk_prefix need_prefix
28872 = indirect_thunk_need_prefix (current_output_insn);
28873 int regno = -1;
28875 if (cfun->machine->indirect_branch_type
28876 != indirect_branch_thunk_inline)
28878 if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28880 if (need_prefix == indirect_thunk_prefix_bnd)
28881 indirect_thunk_bnd_needed = true;
28882 else
28883 indirect_thunk_needed = true;
28885 indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
28886 thunk_name = thunk_name_buf;
28888 else
28889 thunk_name = NULL;
28891 snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s",
28892 TARGET_64BIT ? 'q' : 'l', xasm);
28894 if (sibcall_p)
28896 output_asm_insn (push_buf, &call_op);
28897 if (thunk_name != NULL)
28899 if (need_prefix == indirect_thunk_prefix_bnd)
28900 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28901 else
28902 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28904 else
28905 output_indirect_thunk (need_prefix, regno);
28907 else
28909 char indirectlabel1[32];
28910 char indirectlabel2[32];
28912 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28913 INDIRECT_LABEL,
28914 indirectlabelno++);
28915 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28916 INDIRECT_LABEL,
28917 indirectlabelno++);
28919 /* Jump. */
28920 if (need_prefix == indirect_thunk_prefix_bnd)
28921 fputs ("\tbnd jmp\t", asm_out_file);
28922 else
28923 fputs ("\tjmp\t", asm_out_file);
28924 assemble_name_raw (asm_out_file, indirectlabel2);
28925 fputc ('\n', asm_out_file);
28927 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28929 /* An external function may be called via GOT, instead of PLT. */
28930 if (MEM_P (call_op))
28932 struct ix86_address parts;
28933 rtx addr = XEXP (call_op, 0);
28934 if (ix86_decompose_address (addr, &parts)
28935 && parts.base == stack_pointer_rtx)
28937 /* Since call will adjust stack by -UNITS_PER_WORD,
28938 we must convert "disp(stack, index, scale)" to
28939 "disp+UNITS_PER_WORD(stack, index, scale)". */
28940 if (parts.index)
28942 addr = gen_rtx_MULT (Pmode, parts.index,
28943 GEN_INT (parts.scale));
28944 addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx,
28945 addr);
28947 else
28948 addr = stack_pointer_rtx;
28950 rtx disp;
28951 if (parts.disp != NULL_RTX)
28952 disp = plus_constant (Pmode, parts.disp,
28953 UNITS_PER_WORD);
28954 else
28955 disp = GEN_INT (UNITS_PER_WORD);
28957 addr = gen_rtx_PLUS (Pmode, addr, disp);
28958 call_op = gen_rtx_MEM (GET_MODE (call_op), addr);
28962 output_asm_insn (push_buf, &call_op);
28964 if (thunk_name != NULL)
28966 if (need_prefix == indirect_thunk_prefix_bnd)
28967 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28968 else
28969 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28971 else
28972 output_indirect_thunk (need_prefix, regno);
28974 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28976 /* Call. */
28977 if (need_prefix == indirect_thunk_prefix_bnd)
28978 fputs ("\tbnd call\t", asm_out_file);
28979 else
28980 fputs ("\tcall\t", asm_out_file);
28981 assemble_name_raw (asm_out_file, indirectlabel1);
28982 fputc ('\n', asm_out_file);
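/* Editorial illustration, not part of the original source: in the
   non-sibcall path above, the final "call" pushes a return address and so
   moves the stack pointer down by UNITS_PER_WORD before the "push CALL_OP"
   executes.  A stack-relative call operand such as 4(%esp) therefore has
   to be pushed as 8(%esp) on ia32 (the adjustment is UNITS_PER_WORD, so +8
   on x86-64); that is what the ix86_decompose_address fix-up above does.
   The concrete offsets are illustrative.  */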
28986 /* Output indirect branch via a call and return thunk. CALL_OP is
28987 the branch target. XASM is the assembly template for CALL_OP.
28988 Branch is a tail call if SIBCALL_P is true. */
28990 static void
28991 ix86_output_indirect_branch (rtx call_op, const char *xasm,
28992 bool sibcall_p)
28994 if (REG_P (call_op))
28995 ix86_output_indirect_branch_via_reg (call_op, sibcall_p);
28996 else
28997 ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p);
29000 /* Output indirect jump. CALL_OP is the jump target. */
29002 const char *
29003 ix86_output_indirect_jmp (rtx call_op)
29005 if (cfun->machine->indirect_branch_type != indirect_branch_keep)
29007 /* We can't have a red zone, since the "call" in the indirect thunk
29008 pushes the return address onto the stack, destroying the red zone. */
29009 if (ix86_red_zone_size != 0)
29010 gcc_unreachable ();
29012 ix86_output_indirect_branch (call_op, "%0", true);
29013 return "";
29015 else
29016 return "%!jmp\t%A0";
29019 /* Output the function return. Add a REP
29020 prefix to RET if LONG_P is true and the function return is kept. */
29022 const char *
29023 ix86_output_function_return (bool long_p)
29025 if (cfun->machine->function_return_type != indirect_branch_keep)
29027 char thunk_name[32];
29028 enum indirect_thunk_prefix need_prefix
29029 = indirect_thunk_need_prefix (current_output_insn);
29031 if (cfun->machine->function_return_type
29032 != indirect_branch_thunk_inline)
29034 bool need_thunk = (cfun->machine->function_return_type
29035 == indirect_branch_thunk);
29036 indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix,
29037 true);
29038 if (need_prefix == indirect_thunk_prefix_bnd)
29040 indirect_return_bnd_needed |= need_thunk;
29041 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
29043 else
29045 indirect_return_needed |= need_thunk;
29046 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
29049 else
29050 output_indirect_thunk (need_prefix, INVALID_REGNUM);
29052 return "";
29055 if (!long_p || ix86_bnd_prefixed_insn_p (current_output_insn))
29056 return "%!ret";
29058 return "rep%; ret";
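/* Editorial illustration, not part of the original source: with
   -mfunction-return=thunk the plain "ret" above becomes

	jmp	__x86_return_thunk

   (with a "bnd" prefix under MPX), while -mfunction-return=thunk-inline
   emits the thunk body in place; without either option a "long" return is
   emitted as "rep ret", a form historically used to avoid a branch
   prediction penalty on some AMD processors.  */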
29061 /* Output indirect function return. RET_OP is the function return
29062 target. */
29064 const char *
29065 ix86_output_indirect_function_return (rtx ret_op)
29067 if (cfun->machine->function_return_type != indirect_branch_keep)
29069 char thunk_name[32];
29070 enum indirect_thunk_prefix need_prefix
29071 = indirect_thunk_need_prefix (current_output_insn);
29072 unsigned int regno = REGNO (ret_op);
29073 gcc_assert (regno == CX_REG);
29075 if (cfun->machine->function_return_type
29076 != indirect_branch_thunk_inline)
29078 bool need_thunk = (cfun->machine->function_return_type
29079 == indirect_branch_thunk);
29080 indirect_thunk_name (thunk_name, regno, need_prefix, true);
29081 if (need_prefix == indirect_thunk_prefix_bnd)
29083 if (need_thunk)
29085 indirect_return_via_cx_bnd = true;
29086 indirect_thunks_bnd_used |= 1 << CX_REG;
29088 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
29090 else
29092 if (need_thunk)
29094 indirect_return_via_cx = true;
29095 indirect_thunks_used |= 1 << CX_REG;
29097 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
29100 else
29101 output_indirect_thunk (need_prefix, regno);
29103 return "";
29105 else
29106 return "%!jmp\t%A0";
29109 /* Split a simple return that pops POPC bytes from the stack into an
29110 indirect branch with a stack adjustment. */
29112 void
29113 ix86_split_simple_return_pop_internal (rtx popc)
29115 struct machine_function *m = cfun->machine;
29116 rtx ecx = gen_rtx_REG (SImode, CX_REG);
29117 rtx_insn *insn;
29119 /* There is no "pascal" calling convention in any 64bit ABI. */
29120 gcc_assert (!TARGET_64BIT);
29122 insn = emit_insn (gen_pop (ecx));
29123 m->fs.cfa_offset -= UNITS_PER_WORD;
29124 m->fs.sp_offset -= UNITS_PER_WORD;
29126 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
29127 x = gen_rtx_SET (stack_pointer_rtx, x);
29128 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
29129 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
29130 RTX_FRAME_RELATED_P (insn) = 1;
29132 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
29133 x = gen_rtx_SET (stack_pointer_rtx, x);
29134 insn = emit_insn (x);
29135 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
29136 RTX_FRAME_RELATED_P (insn) = 1;
29138 /* Now return address is in ECX. */
29139 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
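/* Editorial illustration, not part of the original source: for a 32-bit
   stdcall-style epilogue the routine above splits "ret $8" into roughly

	popl	%ecx		# return address -> %ecx
	addl	$8, %esp	# pop the 8 bytes of arguments
	jmp	*%ecx		# emitted via simple_return_indirect_internal

   so that ix86_output_indirect_function_return can then turn the final
   indirect jump into a return thunk when one was requested.  The $8 is
   just an example value of POPC.  */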
29142 /* Output the assembly for a call instruction. */
29144 const char *
29145 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
29147 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
29148 bool output_indirect_p
29149 = (!TARGET_SEH
29150 && cfun->machine->indirect_branch_type != indirect_branch_keep);
29151 bool seh_nop_p = false;
29152 const char *xasm;
29154 if (SIBLING_CALL_P (insn))
29156 if (direct_p)
29158 if (ix86_nopic_noplt_attribute_p (call_op))
29160 direct_p = false;
29161 if (TARGET_64BIT)
29163 if (output_indirect_p)
29164 xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29165 else
29166 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29168 else
29170 if (output_indirect_p)
29171 xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
29172 else
29173 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29176 else
29177 xasm = "%!jmp\t%P0";
29179 /* SEH epilogue detection requires the indirect branch case
29180 to include REX.W. */
29181 else if (TARGET_SEH)
29182 xasm = "%!rex.W jmp\t%A0";
29183 else
29185 if (output_indirect_p)
29186 xasm = "%0";
29187 else
29188 xasm = "%!jmp\t%A0";
29191 if (output_indirect_p && !direct_p)
29192 ix86_output_indirect_branch (call_op, xasm, true);
29193 else
29194 output_asm_insn (xasm, &call_op);
29195 return "";
29198 /* SEH unwinding can require an extra nop to be emitted in several
29199 circumstances. Determine if we have one of those. */
29200 if (TARGET_SEH)
29202 rtx_insn *i;
29204 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29206 /* Prevent a catch region from being adjacent to a jump that would
29207 be interpreted as an epilogue sequence by the unwinder. */
29208 if (JUMP_P(i) && CROSSING_JUMP_P (i))
29210 seh_nop_p = true;
29211 break;
29214 /* If we get to another real insn, we don't need the nop. */
29215 if (INSN_P (i))
29216 break;
29218 /* If we get to the epilogue note, prevent a catch region from
29219 being adjacent to the standard epilogue sequence. If non-
29220 call-exceptions, we'll have done this during epilogue emission. */
29221 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29222 && !flag_non_call_exceptions
29223 && !can_throw_internal (insn))
29225 seh_nop_p = true;
29226 break;
29230 /* If we didn't find a real insn following the call, prevent the
29231 unwinder from looking into the next function. */
29232 if (i == NULL)
29233 seh_nop_p = true;
29236 if (direct_p)
29238 if (ix86_nopic_noplt_attribute_p (call_op))
29240 direct_p = false;
29241 if (TARGET_64BIT)
29243 if (output_indirect_p)
29244 xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29245 else
29246 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29248 else
29250 if (output_indirect_p)
29251 xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
29252 else
29253 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29256 else
29257 xasm = "%!call\t%P0";
29259 else
29261 if (output_indirect_p)
29262 xasm = "%0";
29263 else
29264 xasm = "%!call\t%A0";
29267 if (output_indirect_p && !direct_p)
29268 ix86_output_indirect_branch (call_op, xasm, false);
29269 else
29270 output_asm_insn (xasm, &call_op);
29272 if (seh_nop_p)
29273 return "nop";
29275 return "";
29278 /* Clear stack slot assignments remembered from previous functions.
29279 This is called from INIT_EXPANDERS once before RTL is emitted for each
29280 function. */
29282 static struct machine_function *
29283 ix86_init_machine_status (void)
29285 struct machine_function *f;
29287 f = ggc_cleared_alloc<machine_function> ();
29288 f->call_abi = ix86_abi;
29290 return f;
29293 /* Return a MEM corresponding to a stack slot with mode MODE.
29294 Allocate a new slot if necessary.
29296 The RTL for a function can have several slots available: N is
29297 which slot to use. */
29299 rtx
29300 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29302 struct stack_local_entry *s;
29304 gcc_assert (n < MAX_386_STACK_LOCALS);
29306 for (s = ix86_stack_locals; s; s = s->next)
29307 if (s->mode == mode && s->n == n)
29308 return validize_mem (copy_rtx (s->rtl));
29310 s = ggc_alloc<stack_local_entry> ();
29311 s->n = n;
29312 s->mode = mode;
29313 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29315 s->next = ix86_stack_locals;
29316 ix86_stack_locals = s;
29317 return validize_mem (copy_rtx (s->rtl));
29320 static void
29321 ix86_instantiate_decls (void)
29323 struct stack_local_entry *s;
29325 for (s = ix86_stack_locals; s; s = s->next)
29326 if (s->rtl != NULL_RTX)
29327 instantiate_decl_rtl (s->rtl);
29330 /* Return the number used for encoding REG, in the range 0..7. */
29332 static int
29333 reg_encoded_number (rtx reg)
29335 unsigned regno = REGNO (reg);
29336 switch (regno)
29338 case AX_REG:
29339 return 0;
29340 case CX_REG:
29341 return 1;
29342 case DX_REG:
29343 return 2;
29344 case BX_REG:
29345 return 3;
29346 case SP_REG:
29347 return 4;
29348 case BP_REG:
29349 return 5;
29350 case SI_REG:
29351 return 6;
29352 case DI_REG:
29353 return 7;
29354 default:
29355 break;
29357 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29358 return regno - FIRST_STACK_REG;
29359 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29360 return regno - FIRST_SSE_REG;
29361 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29362 return regno - FIRST_MMX_REG;
29363 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29364 return regno - FIRST_REX_SSE_REG;
29365 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29366 return regno - FIRST_REX_INT_REG;
29367 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29368 return regno - FIRST_MASK_REG;
29369 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29370 return regno - FIRST_BND_REG;
29371 return -1;
29374 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
29375 in its encoding if it could be relevant for ROP mitigation, otherwise
29376 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29377 used for calculating it into them. */
29379 static int
29380 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29381 int *popno0 = 0, int *popno1 = 0)
29383 if (asm_noperands (PATTERN (insn)) >= 0)
29384 return -1;
29385 int has_modrm = get_attr_modrm (insn);
29386 if (!has_modrm)
29387 return -1;
29388 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29389 rtx op0, op1;
29390 switch (cls)
29392 case MODRM_CLASS_OP02:
29393 gcc_assert (noperands >= 3);
29394 if (popno0)
29396 *popno0 = 0;
29397 *popno1 = 2;
29399 op0 = operands[0];
29400 op1 = operands[2];
29401 break;
29402 case MODRM_CLASS_OP01:
29403 gcc_assert (noperands >= 2);
29404 if (popno0)
29406 *popno0 = 0;
29407 *popno1 = 1;
29409 op0 = operands[0];
29410 op1 = operands[1];
29411 break;
29412 default:
29413 return -1;
29415 if (REG_P (op0) && REG_P (op1))
29417 int enc0 = reg_encoded_number (op0);
29418 int enc1 = reg_encoded_number (op1);
29419 return 0xc0 + (enc1 << 3) + enc0;
29421 return -1;
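/* Editorial illustration, not part of the original source: for a
   register-to-register insn the byte computed above has mod = 11, reg =
   the second operand's encoding, r/m = the first's.  E.g. with op0 = %eax
   (0) and op1 = %ebx (3) the result is 0xc0 + (3 << 3) + 0 = 0xd8, a value
   the -mmitigate-rop machinery can then inspect for unwanted byte
   patterns.  */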
29424 /* Check whether x86 address PARTS is a pc-relative address. */
29426 bool
29427 ix86_rip_relative_addr_p (struct ix86_address *parts)
29429 rtx base, index, disp;
29431 base = parts->base;
29432 index = parts->index;
29433 disp = parts->disp;
29435 if (disp && !base && !index)
29437 if (TARGET_64BIT)
29439 rtx symbol = disp;
29441 if (GET_CODE (disp) == CONST)
29442 symbol = XEXP (disp, 0);
29443 if (GET_CODE (symbol) == PLUS
29444 && CONST_INT_P (XEXP (symbol, 1)))
29445 symbol = XEXP (symbol, 0);
29447 if (GET_CODE (symbol) == LABEL_REF
29448 || (GET_CODE (symbol) == SYMBOL_REF
29449 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29450 || (GET_CODE (symbol) == UNSPEC
29451 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29452 || XINT (symbol, 1) == UNSPEC_PCREL
29453 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29454 return true;
29457 return false;
29460 /* Calculate the length of the memory address in the instruction encoding.
29461 Includes the addr32 prefix, but does not include the one-byte modrm,
29462 opcode, or other prefixes. We never generate the addr32 prefix for LEA insns. */
29464 int
29465 memory_address_length (rtx addr, bool lea)
29467 struct ix86_address parts;
29468 rtx base, index, disp;
29469 int len;
29470 int ok;
29472 if (GET_CODE (addr) == PRE_DEC
29473 || GET_CODE (addr) == POST_INC
29474 || GET_CODE (addr) == PRE_MODIFY
29475 || GET_CODE (addr) == POST_MODIFY)
29476 return 0;
29478 ok = ix86_decompose_address (addr, &parts);
29479 gcc_assert (ok);
29481 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29483 /* If this is not LEA instruction, add the length of addr32 prefix. */
29484 if (TARGET_64BIT && !lea
29485 && (SImode_address_operand (addr, VOIDmode)
29486 || (parts.base && GET_MODE (parts.base) == SImode)
29487 || (parts.index && GET_MODE (parts.index) == SImode)))
29488 len++;
29490 base = parts.base;
29491 index = parts.index;
29492 disp = parts.disp;
29494 if (base && SUBREG_P (base))
29495 base = SUBREG_REG (base);
29496 if (index && SUBREG_P (index))
29497 index = SUBREG_REG (index);
29499 gcc_assert (base == NULL_RTX || REG_P (base));
29500 gcc_assert (index == NULL_RTX || REG_P (index));
29502 /* Rule of thumb:
29503 - esp as the base always wants an index,
29504 - ebp as the base always wants a displacement,
29505 - r12 as the base always wants an index,
29506 - r13 as the base always wants a displacement. */
29508 /* Register Indirect. */
29509 if (base && !index && !disp)
29511 /* esp (for its index) and ebp (for its displacement) need
29512 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29513 code. */
29514 if (base == arg_pointer_rtx
29515 || base == frame_pointer_rtx
29516 || REGNO (base) == SP_REG
29517 || REGNO (base) == BP_REG
29518 || REGNO (base) == R12_REG
29519 || REGNO (base) == R13_REG)
29520 len++;
29523 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29524 is not disp32, but disp32(%rip), so for disp32
29525 SIB byte is needed, unless print_operand_address
29526 optimizes it into disp32(%rip) or (%rip) is implied
29527 by UNSPEC. */
29528 else if (disp && !base && !index)
29530 len += 4;
29531 if (!ix86_rip_relative_addr_p (&parts))
29532 len++;
29534 else
29536 /* Find the length of the displacement constant. */
29537 if (disp)
29539 if (base && satisfies_constraint_K (disp))
29540 len += 1;
29541 else
29542 len += 4;
29544 /* ebp always wants a displacement. Similarly r13. */
29545 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29546 len++;
29548 /* An index requires the two-byte modrm form.... */
29549 if (index
29550 /* ...like esp (or r12), which always wants an index. */
29551 || base == arg_pointer_rtx
29552 || base == frame_pointer_rtx
29553 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29554 len++;
29557 return len;
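/* Editorial illustration, not part of the original source, of the rules
   above (the count excludes the modrm and opcode bytes handled by the
   caller):

	(%eax)		-> 0	# plain register indirect
	(%ebp)		-> 1	# forced disp8 of zero
	(%esp)		-> 1	# forced SIB byte
	4(%esp)		-> 2	# SIB byte + disp8
	8(%eax,%ebx,4)	-> 2	# SIB byte + disp8
	foo(%rip)	-> 4	# disp32, pc-relative, no SIB

   "foo" is an illustrative symbol.  */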
29560 /* Compute the default value for the "length_immediate" attribute. When
29561 SHORTFORM is set, expect that the insn has an 8-bit immediate alternative. */
29562 int
29563 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29565 int len = 0;
29566 int i;
29567 extract_insn_cached (insn);
29568 for (i = recog_data.n_operands - 1; i >= 0; --i)
29569 if (CONSTANT_P (recog_data.operand[i]))
29571 enum attr_mode mode = get_attr_mode (insn);
29573 gcc_assert (!len);
29574 if (shortform && CONST_INT_P (recog_data.operand[i]))
29576 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29577 switch (mode)
29579 case MODE_QI:
29580 len = 1;
29581 continue;
29582 case MODE_HI:
29583 ival = trunc_int_for_mode (ival, HImode);
29584 break;
29585 case MODE_SI:
29586 ival = trunc_int_for_mode (ival, SImode);
29587 break;
29588 default:
29589 break;
29591 if (IN_RANGE (ival, -128, 127))
29593 len = 1;
29594 continue;
29597 switch (mode)
29599 case MODE_QI:
29600 len = 1;
29601 break;
29602 case MODE_HI:
29603 len = 2;
29604 break;
29605 case MODE_SI:
29606 len = 4;
29607 break;
29608 /* Immediates for DImode instructions are encoded
29609 as 32-bit sign-extended values. */
29610 case MODE_DI:
29611 len = 4;
29612 break;
29613 default:
29614 fatal_insn ("unknown insn mode", insn);
29617 return len;
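/* Editorial illustration, not part of the original source: for an SImode
   insn with an 8-bit immediate alternative (SHORTFORM), the logic above
   gives

	add $5, %eax	-> 1	# fits in a sign-extended imm8
	add $300, %eax	-> 4	# needs a full imm32

   and DImode immediates also count as 4 because they are encoded as
   32-bit sign-extended values.  */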
29620 /* Compute default value for "length_address" attribute. */
29621 int
29622 ix86_attr_length_address_default (rtx_insn *insn)
29624 int i;
29626 if (get_attr_type (insn) == TYPE_LEA)
29628 rtx set = PATTERN (insn), addr;
29630 if (GET_CODE (set) == PARALLEL)
29631 set = XVECEXP (set, 0, 0);
29633 gcc_assert (GET_CODE (set) == SET);
29635 addr = SET_SRC (set);
29637 return memory_address_length (addr, true);
29640 extract_insn_cached (insn);
29641 for (i = recog_data.n_operands - 1; i >= 0; --i)
29643 rtx op = recog_data.operand[i];
29644 if (MEM_P (op))
29646 constrain_operands_cached (insn, reload_completed);
29647 if (which_alternative != -1)
29649 const char *constraints = recog_data.constraints[i];
29650 int alt = which_alternative;
29652 while (*constraints == '=' || *constraints == '+')
29653 constraints++;
29654 while (alt-- > 0)
29655 while (*constraints++ != ',')
29657 /* Skip ignored operands. */
29658 if (*constraints == 'X')
29659 continue;
29662 int len = memory_address_length (XEXP (op, 0), false);
29664 /* Account for segment prefix for non-default addr spaces. */
29665 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29666 len++;
29668 return len;
29671 return 0;
29674 /* Compute the default value for the "length_vex" attribute. It includes
29675 the 2- or 3-byte VEX prefix and 1 opcode byte. */
29677 int
29678 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29679 bool has_vex_w)
29681 int i;
29683 /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX.W bit
29684 requires the 3-byte VEX prefix. */
29685 if (!has_0f_opcode || has_vex_w)
29686 return 3 + 1;
29688 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
29689 if (!TARGET_64BIT)
29690 return 2 + 1;
29692 extract_insn_cached (insn);
29694 for (i = recog_data.n_operands - 1; i >= 0; --i)
29695 if (REG_P (recog_data.operand[i]))
29697 /* REX.W bit uses 3 byte VEX prefix. */
29698 if (GET_MODE (recog_data.operand[i]) == DImode
29699 && GENERAL_REG_P (recog_data.operand[i]))
29700 return 3 + 1;
29702 else
29704 /* REX.X or REX.B bits use 3 byte VEX prefix. */
29705 if (MEM_P (recog_data.operand[i])
29706 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
29707 return 3 + 1;
29710 return 2 + 1;
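/* Editorial illustration, not part of the original source: a VEX insn in
   the 0f opcode map that needs neither VEX.W nor any REX.X/REX.B
   information (no extended registers in a memory operand, no DImode
   general register) can use the 2-byte prefix, so the attribute is
   2 + 1 = 3; anything else forces the 3-byte prefix, giving 3 + 1 = 4.  */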
29714 static bool
29715 ix86_class_likely_spilled_p (reg_class_t);
29717 /* Return true if the lhs of INSN is a HW function argument register; set
29718 IS_SPILLED to true if it is a likely-spilled HW register. */
29719 static bool
29720 insn_is_function_arg (rtx insn, bool* is_spilled)
29722 rtx dst;
29724 if (!NONDEBUG_INSN_P (insn))
29725 return false;
29726 /* Call instructions are not movable; ignore them. */
29727 if (CALL_P (insn))
29728 return false;
29729 insn = PATTERN (insn);
29730 if (GET_CODE (insn) == PARALLEL)
29731 insn = XVECEXP (insn, 0, 0);
29732 if (GET_CODE (insn) != SET)
29733 return false;
29734 dst = SET_DEST (insn);
29735 if (REG_P (dst) && HARD_REGISTER_P (dst)
29736 && ix86_function_arg_regno_p (REGNO (dst)))
29738 /* Is it likely spilled HW register? */
29739 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29740 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29741 *is_spilled = true;
29742 return true;
29744 return false;
29747 /* Add output dependencies for the chain of adjacent function arguments, but
29748 only if there is a move to a likely-spilled HW register. Return the first
29749 argument if at least one dependence was added, or NULL otherwise. */
29750 static rtx_insn *
29751 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29753 rtx_insn *insn;
29754 rtx_insn *last = call;
29755 rtx_insn *first_arg = NULL;
29756 bool is_spilled = false;
29758 head = PREV_INSN (head);
29760 /* Find nearest to call argument passing instruction. */
29761 while (true)
29763 last = PREV_INSN (last);
29764 if (last == head)
29765 return NULL;
29766 if (!NONDEBUG_INSN_P (last))
29767 continue;
29768 if (insn_is_function_arg (last, &is_spilled))
29769 break;
29770 return NULL;
29773 first_arg = last;
29774 while (true)
29776 insn = PREV_INSN (last);
29777 if (!INSN_P (insn))
29778 break;
29779 if (insn == head)
29780 break;
29781 if (!NONDEBUG_INSN_P (insn))
29783 last = insn;
29784 continue;
29786 if (insn_is_function_arg (insn, &is_spilled))
29788 /* Add an output dependence between two function arguments if the chain
29789 of output arguments contains likely-spilled HW registers. */
29790 if (is_spilled)
29791 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29792 first_arg = last = insn;
29794 else
29795 break;
29797 if (!is_spilled)
29798 return NULL;
29799 return first_arg;
29802 /* Add output or anti dependency from insn to first_arg to restrict its code
29803 motion. */
29804 static void
29805 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29807 rtx set;
29808 rtx tmp;
29810 /* Add anti dependencies for bounds stores. */
29811 if (INSN_P (insn)
29812 && GET_CODE (PATTERN (insn)) == PARALLEL
29813 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29814 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29816 add_dependence (first_arg, insn, REG_DEP_ANTI);
29817 return;
29820 set = single_set (insn);
29821 if (!set)
29822 return;
29823 tmp = SET_DEST (set);
29824 if (REG_P (tmp))
29826 /* Add output dependency to the first function argument. */
29827 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29828 return;
29830 /* Add anti dependency. */
29831 add_dependence (first_arg, insn, REG_DEP_ANTI);
29834 /* Avoid cross-block motion of a function argument by adding a dependency
29835 from the first non-jump instruction in BB. */
29836 static void
29837 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29839 rtx_insn *insn = BB_END (bb);
29841 while (insn)
29843 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29845 rtx set = single_set (insn);
29846 if (set)
29848 avoid_func_arg_motion (arg, insn);
29849 return;
29852 if (insn == BB_HEAD (bb))
29853 return;
29854 insn = PREV_INSN (insn);
29858 /* Hook for pre-reload schedule - avoid motion of function arguments
29859 passed in likely spilled HW registers. */
29860 static void
29861 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29863 rtx_insn *insn;
29864 rtx_insn *first_arg = NULL;
29865 if (reload_completed)
29866 return;
29867 while (head != tail && DEBUG_INSN_P (head))
29868 head = NEXT_INSN (head);
29869 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29870 if (INSN_P (insn) && CALL_P (insn))
29872 first_arg = add_parameter_dependencies (insn, head);
29873 if (first_arg)
29875 /* Add a dependee for the first argument to predecessors, but only
29876 if the region contains more than one block. */
29877 basic_block bb = BLOCK_FOR_INSN (insn);
29878 int rgn = CONTAINING_RGN (bb->index);
29879 int nr_blks = RGN_NR_BLOCKS (rgn);
29880 /* Skip trivial regions and region head blocks that can have
29881 predecessors outside of region. */
29882 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29884 edge e;
29885 edge_iterator ei;
29887 /* Regions are SCCs with the exception of selective
29888 scheduling with pipelining of outer blocks enabled.
29889 So also check that immediate predecessors of a non-head
29890 block are in the same region. */
29891 FOR_EACH_EDGE (e, ei, bb->preds)
29893 /* Avoid creating loop-carried dependencies by using the
29894 topological ordering in the region. */
29895 if (rgn == CONTAINING_RGN (e->src->index)
29896 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29897 add_dependee_for_func_arg (first_arg, e->src);
29900 insn = first_arg;
29901 if (insn == head)
29902 break;
29905 else if (first_arg)
29906 avoid_func_arg_motion (first_arg, insn);
29909 /* Hook for pre-reload schedule - set priority of moves from likely spilled
29910 HW registers to the maximum, to schedule them as soon as possible. These are
29911 moves from function argument registers at the top of the function entry
29912 and moves from function return value registers after a call. */
29913 static int
29914 ix86_adjust_priority (rtx_insn *insn, int priority)
29916 rtx set;
29918 if (reload_completed)
29919 return priority;
29921 if (!NONDEBUG_INSN_P (insn))
29922 return priority;
29924 set = single_set (insn);
29925 if (set)
29927 rtx tmp = SET_SRC (set);
29928 if (REG_P (tmp)
29929 && HARD_REGISTER_P (tmp)
29930 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29931 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29932 return current_sched_info->sched_max_insns_priority;
29935 return priority;
29938 /* Prepare for scheduling pass. */
29939 static void
29940 ix86_sched_init_global (FILE *, int, int)
29942 /* Install scheduling hooks for current CPU. Some of these hooks are used
29943 in time-critical parts of the scheduler, so we only set them up when
29944 they are actually used. */
29945 switch (ix86_tune)
29947 case PROCESSOR_CORE2:
29948 case PROCESSOR_NEHALEM:
29949 case PROCESSOR_SANDYBRIDGE:
29950 case PROCESSOR_HASWELL:
29951 case PROCESSOR_GENERIC:
29952 /* Do not perform multipass scheduling for pre-reload schedule
29953 to save compile time. */
29954 if (reload_completed)
29956 ix86_core2i7_init_hooks ();
29957 break;
29959 /* Fall through. */
29960 default:
29961 targetm.sched.dfa_post_advance_cycle = NULL;
29962 targetm.sched.first_cycle_multipass_init = NULL;
29963 targetm.sched.first_cycle_multipass_begin = NULL;
29964 targetm.sched.first_cycle_multipass_issue = NULL;
29965 targetm.sched.first_cycle_multipass_backtrack = NULL;
29966 targetm.sched.first_cycle_multipass_end = NULL;
29967 targetm.sched.first_cycle_multipass_fini = NULL;
29968 break;
29973 /* Implement TARGET_STATIC_RTX_ALIGNMENT. */
29975 static HOST_WIDE_INT
29976 ix86_static_rtx_alignment (machine_mode mode)
29978 if (mode == DFmode)
29979 return 64;
29980 if (ALIGN_MODE_128 (mode))
29981 return MAX (128, GET_MODE_ALIGNMENT (mode));
29982 return GET_MODE_ALIGNMENT (mode);
29985 /* Implement TARGET_CONSTANT_ALIGNMENT. */
29987 static HOST_WIDE_INT
29988 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
29990 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
29991 || TREE_CODE (exp) == INTEGER_CST)
29993 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
29994 HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
29995 return MAX (mode_align, align);
29997 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
29998 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
29999 return BITS_PER_WORD;
30001 return align;
30004 /* Implement TARGET_EMPTY_RECORD_P. */
30006 static bool
30007 ix86_is_empty_record (const_tree type)
30009 if (!TARGET_64BIT)
30010 return false;
30011 return default_is_empty_record (type);
30014 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
30016 static void
30017 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
30019 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
30021 if (!cum->warn_empty)
30022 return;
30024 if (!TYPE_EMPTY_P (type))
30025 return;
30027 const_tree ctx = get_ultimate_context (cum->decl);
30028 if (ctx != NULL_TREE
30029 && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
30030 return;
30032 /* If the actual size of the type is zero, then there is no change
30033 in how objects of this size are passed. */
30034 if (int_size_in_bytes (type) == 0)
30035 return;
30037 warning (OPT_Wabi, "empty class %qT parameter passing ABI "
30038 "changes in -fabi-version=12 (GCC 8)", type);
30040 /* Only warn once. */
30041 cum->warn_empty = false;
30044 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
30045 the data type, and ALIGN is the alignment that the object would
30046 ordinarily have. */
30048 static int
30049 iamcu_alignment (tree type, int align)
30051 machine_mode mode;
30053 if (align < 32 || TYPE_USER_ALIGN (type))
30054 return align;
30056 /* The Intel MCU psABI specifies that scalar types > 4 bytes are aligned
30057 to 4 bytes. */
30058 mode = TYPE_MODE (strip_array_types (type));
30059 switch (GET_MODE_CLASS (mode))
30061 case MODE_INT:
30062 case MODE_COMPLEX_INT:
30063 case MODE_COMPLEX_FLOAT:
30064 case MODE_FLOAT:
30065 case MODE_DECIMAL_FLOAT:
30066 return 32;
30067 default:
30068 return align;
30072 /* Compute the alignment for a static variable.
30073 TYPE is the data type, and ALIGN is the alignment that
30074 the object would ordinarily have. The value of this function is used
30075 instead of that alignment to align the object. */
30077 int
30078 ix86_data_alignment (tree type, int align, bool opt)
30080 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
30081 for symbols from other compilation units or symbols that don't need
30082 to bind locally. In order to preserve some ABI compatibility with
30083 those compilers, ensure we don't decrease alignment from what we
30084 used to assume. */
30086 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
30088 /* A data structure equal to or greater than the size of a cache line
30089 (64 bytes on the Pentium 4 and other recent Intel processors, including
30090 processors based on the Intel Core microarchitecture) should be aligned
30091 so that its base address is a multiple of the cache line size. */
30093 int max_align
30094 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
30096 if (max_align < BITS_PER_WORD)
30097 max_align = BITS_PER_WORD;
30099 switch (ix86_align_data_type)
30101 case ix86_align_data_type_abi: opt = false; break;
30102 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
30103 case ix86_align_data_type_cacheline: break;
30106 if (TARGET_IAMCU)
30107 align = iamcu_alignment (type, align);
30109 if (opt
30110 && AGGREGATE_TYPE_P (type)
30111 && TYPE_SIZE (type)
30112 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
30114 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
30115 && align < max_align_compat)
30116 align = max_align_compat;
30117 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
30118 && align < max_align)
30119 align = max_align;
30122 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
30123 to a 16-byte boundary. */
30124 if (TARGET_64BIT)
30126 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
30127 && TYPE_SIZE (type)
30128 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30129 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
30130 && align < 128)
30131 return 128;
30134 if (!opt)
30135 return align;
30137 if (TREE_CODE (type) == ARRAY_TYPE)
30139 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30140 return 64;
30141 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30142 return 128;
30144 else if (TREE_CODE (type) == COMPLEX_TYPE)
30147 if (TYPE_MODE (type) == DCmode && align < 64)
30148 return 64;
30149 if ((TYPE_MODE (type) == XCmode
30150 || TYPE_MODE (type) == TCmode) && align < 128)
30151 return 128;
30153 else if ((TREE_CODE (type) == RECORD_TYPE
30154 || TREE_CODE (type) == UNION_TYPE
30155 || TREE_CODE (type) == QUAL_UNION_TYPE)
30156 && TYPE_FIELDS (type))
30158 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30159 return 64;
30160 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30161 return 128;
30163 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30164 || TREE_CODE (type) == INTEGER_TYPE)
30166 if (TYPE_MODE (type) == DFmode && align < 64)
30167 return 64;
30168 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30169 return 128;
30172 return align;
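/* Editorial illustration, not part of the original source: for a static
   "int buf[100]" (400 bytes) on x86-64 with default options, the size is
   at least max_align_compat (256 bits) and usually at least the
   cache-line-derived max_align as well, so the routine above raises the
   alignment to a cache line (64 bytes with the common prefetch_block of
   64 bytes).  A small scalar keeps the alignment it came in with.  The
   exact result depends on the tuning's prefetch_block and on
   -malign-data.  */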
30175 /* Compute the alignment for a local variable or a stack slot. EXP is
30176 the data type or decl itself, MODE is the widest mode available and
30177 ALIGN is the alignment that the object would ordinarily have. The
30178 value of this macro is used instead of that alignment to align the
30179 object. */
30181 unsigned int
30182 ix86_local_alignment (tree exp, machine_mode mode,
30183 unsigned int align)
30185 tree type, decl;
30187 if (exp && DECL_P (exp))
30189 type = TREE_TYPE (exp);
30190 decl = exp;
30192 else
30194 type = exp;
30195 decl = NULL;
30198 /* Don't do dynamic stack realignment for long long objects with
30199 -mpreferred-stack-boundary=2. */
30200 if (!TARGET_64BIT
30201 && align == 64
30202 && ix86_preferred_stack_boundary < 64
30203 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
30204 && (!type || !TYPE_USER_ALIGN (type))
30205 && (!decl || !DECL_USER_ALIGN (decl)))
30206 align = 32;
30208 /* If TYPE is NULL, we are allocating a stack slot for caller-save
30209 register in MODE. We will return the largest alignment of XF
30210 and DF. */
30211 if (!type)
30213 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
30214 align = GET_MODE_ALIGNMENT (DFmode);
30215 return align;
30218 /* Don't increase alignment for Intel MCU psABI. */
30219 if (TARGET_IAMCU)
30220 return align;
30222 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
30223 to a 16-byte boundary. The exact wording is:
30225 An array uses the same alignment as its elements, except that a local or
30226 global array variable of length at least 16 bytes or
30227 a C99 variable-length array variable always has alignment of at least 16 bytes.
30229 This was added to allow the use of aligned SSE instructions on arrays. The
30230 rule is meant for static storage (where the compiler cannot do the analysis
30231 by itself). We follow it for automatic variables only when convenient.
30232 We fully control everything in the function being compiled, and functions
30233 from other units cannot rely on the alignment.
30235 Exclude the va_list type. It is the common case of a local array where
30236 we cannot benefit from the alignment.
30238 TODO: Probably one should optimize for size only when var is not escaping. */
30239 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30240 && TARGET_SSE)
30242 if (AGGREGATE_TYPE_P (type)
30243 && (va_list_type_node == NULL_TREE
30244 || (TYPE_MAIN_VARIANT (type)
30245 != TYPE_MAIN_VARIANT (va_list_type_node)))
30246 && TYPE_SIZE (type)
30247 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30248 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
30249 && align < 128)
30250 return 128;
30252 if (TREE_CODE (type) == ARRAY_TYPE)
30254 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30255 return 64;
30256 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30257 return 128;
30259 else if (TREE_CODE (type) == COMPLEX_TYPE)
30261 if (TYPE_MODE (type) == DCmode && align < 64)
30262 return 64;
30263 if ((TYPE_MODE (type) == XCmode
30264 || TYPE_MODE (type) == TCmode) && align < 128)
30265 return 128;
30267 else if ((TREE_CODE (type) == RECORD_TYPE
30268 || TREE_CODE (type) == UNION_TYPE
30269 || TREE_CODE (type) == QUAL_UNION_TYPE)
30270 && TYPE_FIELDS (type))
30272 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30273 return 64;
30274 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30275 return 128;
30277 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30278 || TREE_CODE (type) == INTEGER_TYPE)
30281 if (TYPE_MODE (type) == DFmode && align < 64)
30282 return 64;
30283 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30284 return 128;
30286 return align;
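/* Editorial illustration, not part of the original source: on x86-64 with
   SSE enabled and when optimizing for speed, a local "char buf[32]" is
   raised by the code above to 16-byte alignment (its 256-bit size is at
   least 128 bits), which lets vectorized accesses use aligned SSE loads;
   the same array in a function optimized for size, or a va_list object,
   keeps its natural alignment.  */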
30289 /* Compute the minimum required alignment for dynamic stack realignment
30290 purposes for a local variable, parameter or a stack slot. EXP is
30291 the data type or decl itself, MODE is its mode and ALIGN is the
30292 alignment that the object would ordinarily have. */
30294 unsigned int
30295 ix86_minimum_alignment (tree exp, machine_mode mode,
30296 unsigned int align)
30298 tree type, decl;
30300 if (exp && DECL_P (exp))
30302 type = TREE_TYPE (exp);
30303 decl = exp;
30305 else
30307 type = exp;
30308 decl = NULL;
30311 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30312 return align;
30314 /* Don't do dynamic stack realignment for long long objects with
30315 -mpreferred-stack-boundary=2. */
30316 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30317 && (!type || !TYPE_USER_ALIGN (type))
30318 && (!decl || !DECL_USER_ALIGN (decl)))
30320 gcc_checking_assert (!TARGET_STV);
30321 return 32;
30324 return align;
30327 /* Find a location for the static chain incoming to a nested function.
30328 This is a register, unless all free registers are used by arguments. */
30330 static rtx
30331 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30333 unsigned regno;
30335 if (TARGET_64BIT)
30337 /* We always use R10 in 64-bit mode. */
30338 regno = R10_REG;
30340 else
30342 const_tree fntype, fndecl;
30343 unsigned int ccvt;
30345 /* By default in 32-bit mode we use ECX to pass the static chain. */
30346 regno = CX_REG;
30348 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30350 fntype = TREE_TYPE (fndecl_or_type);
30351 fndecl = fndecl_or_type;
30353 else
30355 fntype = fndecl_or_type;
30356 fndecl = NULL;
30359 ccvt = ix86_get_callcvt (fntype);
30360 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30362 /* Fastcall functions use ecx/edx for arguments, which leaves
30363 us with EAX for the static chain.
30364 Thiscall functions use ecx for arguments, which also
30365 leaves us with EAX for the static chain. */
30366 regno = AX_REG;
30368 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30370 /* Thiscall functions use ecx for arguments, which leaves
30371 us with EAX and EDX for the static chain.
30372 We use EAX for ABI compatibility. */
30373 regno = AX_REG;
30375 else if (ix86_function_regparm (fntype, fndecl) == 3)
30377 /* For regparm 3, we have no free call-clobbered registers in
30378 which to store the static chain. In order to implement this,
30379 we have the trampoline push the static chain to the stack.
30380 However, we can't push a value below the return address when
30381 we call the nested function directly, so we have to use an
30382 alternate entry point. For this we use ESI, and have the
30383 alternate entry point push ESI, so that things appear the
30384 same once we're executing the nested function. */
30385 if (incoming_p)
30387 if (fndecl == current_function_decl
30388 && !ix86_static_chain_on_stack)
30390 gcc_assert (!reload_completed);
30391 ix86_static_chain_on_stack = true;
30393 return gen_frame_mem (SImode,
30394 plus_constant (Pmode,
30395 arg_pointer_rtx, -8));
30397 regno = SI_REG;
30401 return gen_rtx_REG (Pmode, regno);
30404 /* Emit RTL insns to initialize the variable parts of a trampoline.
30405 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30406 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30407 to be passed to the target function. */
30409 static void
30410 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30412 rtx mem, fnaddr;
30413 int opcode;
30414 int offset = 0;
30415 bool need_endbr = (flag_cf_protection & CF_BRANCH) && TARGET_IBT;
30417 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30419 if (TARGET_64BIT)
30421 int size;
30423 if (need_endbr)
30425 /* Insert ENDBR64. */
30426 mem = adjust_address (m_tramp, SImode, offset);
30427 emit_move_insn (mem, gen_int_mode (0xfa1e0ff3, SImode));
30428 offset += 4;
30431 /* Load the function address to r11. Try to load address using
30432 the shorter movl instead of movabs. We may want to support
30433 movq for kernel mode, but the kernel does not use trampolines at
30434 the moment. FNADDR is a 32-bit address and may not be in
30435 DImode when ptr_mode == SImode. Always use movl in this
30436 case. */
30437 if (ptr_mode == SImode
30438 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30440 fnaddr = copy_addr_to_reg (fnaddr);
30442 mem = adjust_address (m_tramp, HImode, offset);
30443 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30445 mem = adjust_address (m_tramp, SImode, offset + 2);
30446 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30447 offset += 6;
30449 else
30451 mem = adjust_address (m_tramp, HImode, offset);
30452 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30454 mem = adjust_address (m_tramp, DImode, offset + 2);
30455 emit_move_insn (mem, fnaddr);
30456 offset += 10;
30459 /* Load static chain using movabs to r10. Use the shorter movl
30460 instead of movabs when ptr_mode == SImode. */
30461 if (ptr_mode == SImode)
30463 opcode = 0xba41;
30464 size = 6;
30466 else
30468 opcode = 0xba49;
30469 size = 10;
30472 mem = adjust_address (m_tramp, HImode, offset);
30473 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30475 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30476 emit_move_insn (mem, chain_value);
30477 offset += size;
30479 /* Jump to r11; the last (unused) byte is a nop, only there to
30480 pad the write out to a single 32-bit store. */
30481 mem = adjust_address (m_tramp, SImode, offset);
30482 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30483 offset += 4;
30485 else
30487 rtx disp, chain;
30489 /* Depending on the static chain location, either load a register
30490 with a constant, or push the constant to the stack. All of the
30491 instructions are the same size. */
30492 chain = ix86_static_chain (fndecl, true);
30493 if (REG_P (chain))
30495 switch (REGNO (chain))
30497 case AX_REG:
30498 opcode = 0xb8; break;
30499 case CX_REG:
30500 opcode = 0xb9; break;
30501 default:
30502 gcc_unreachable ();
30505 else
30506 opcode = 0x68;
30508 if (need_endbr)
30510 /* Insert ENDBR32. */
30511 mem = adjust_address (m_tramp, SImode, offset);
30512 emit_move_insn (mem, gen_int_mode (0xfb1e0ff3, SImode));
30513 offset += 4;
30516 mem = adjust_address (m_tramp, QImode, offset);
30517 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30519 mem = adjust_address (m_tramp, SImode, offset + 1);
30520 emit_move_insn (mem, chain_value);
30521 offset += 5;
30523 mem = adjust_address (m_tramp, QImode, offset);
30524 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30526 mem = adjust_address (m_tramp, SImode, offset + 1);
30528 /* Compute offset from the end of the jmp to the target function.
30529 In the case in which the trampoline stores the static chain on
30530 the stack, we need to skip the first insn which pushes the
30531 (call-saved) register static chain; this push is 1 byte. */
30532 offset += 5;
30533 disp = expand_binop (SImode, sub_optab, fnaddr,
30534 plus_constant (Pmode, XEXP (m_tramp, 0),
30535 offset - (MEM_P (chain) ? 1 : 0)),
30536 NULL_RTX, 1, OPTAB_DIRECT);
30537 emit_move_insn (mem, disp);
30540 gcc_assert (offset <= TRAMPOLINE_SIZE);
30542 #ifdef HAVE_ENABLE_EXECUTE_STACK
30543 #ifdef CHECK_EXECUTE_STACK_ENABLED
30544 if (CHECK_EXECUTE_STACK_ENABLED)
30545 #endif
30546 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30547 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
30548 #endif
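/* Editorial illustration, not part of the original source: the 64-bit
   trampoline written above is, byte for byte,

	49 bb <8-byte fnaddr>	# movabs $fnaddr, %r11
	49 ba <8-byte chain>	# movabs $chain_value, %r10
	49 ff e3		# jmp *%r11
	90			# nop padding the final 32-bit store

   with a 4-byte endbr64 (f3 0f 1e fa) prepended when -fcf-protection=branch
   and IBT are in effect, and with the shorter movl forms (41 bb / 41 ba)
   used when the values fit in 32 bits.  */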
30551 static bool
30552 ix86_allocate_stack_slots_for_args (void)
30554 /* Naked functions should not allocate stack slots for arguments. */
30555 return !ix86_function_naked (current_function_decl);
30558 static bool
30559 ix86_warn_func_return (tree decl)
30561 /* Naked functions are implemented entirely in assembly, including the
30562 return sequence, so suppress warnings about this. */
30563 return !ix86_function_naked (decl);
30566 /* The following file contains several enumerations and data structures
30567 built from the definitions in i386-builtin-types.def. */
30569 #include "i386-builtin-types.inc"
30571 /* Table for the ix86 builtin non-function types. */
30572 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30574 /* Retrieve an element from the above table, building some of
30575 the types lazily. */
30577 static tree
30578 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30580 unsigned int index;
30581 tree type, itype;
30583 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30585 type = ix86_builtin_type_tab[(int) tcode];
30586 if (type != NULL)
30587 return type;
30589 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30590 if (tcode <= IX86_BT_LAST_VECT)
30592 machine_mode mode;
30594 index = tcode - IX86_BT_LAST_PRIM - 1;
30595 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30596 mode = ix86_builtin_type_vect_mode[index];
30598 type = build_vector_type_for_mode (itype, mode);
30600 else
30602 int quals;
30604 index = tcode - IX86_BT_LAST_VECT - 1;
30605 if (tcode <= IX86_BT_LAST_PTR)
30606 quals = TYPE_UNQUALIFIED;
30607 else
30608 quals = TYPE_QUAL_CONST;
30610 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30611 if (quals != TYPE_UNQUALIFIED)
30612 itype = build_qualified_type (itype, quals);
30614 type = build_pointer_type (itype);
30617 ix86_builtin_type_tab[(int) tcode] = type;
30618 return type;
30621 /* Table for the ix86 builtin function types. */
30622 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30624 /* Retrieve an element from the above table, building some of
30625 the types lazily. */
30627 static tree
30628 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30630 tree type;
30632 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30634 type = ix86_builtin_func_type_tab[(int) tcode];
30635 if (type != NULL)
30636 return type;
30638 if (tcode <= IX86_BT_LAST_FUNC)
30640 unsigned start = ix86_builtin_func_start[(int) tcode];
30641 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30642 tree rtype, atype, args = void_list_node;
30643 unsigned i;
30645 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30646 for (i = after - 1; i > start; --i)
30648 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30649 args = tree_cons (NULL, atype, args);
30652 type = build_function_type (rtype, args);
30654 else
30656 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30657 enum ix86_builtin_func_type icode;
30659 icode = ix86_builtin_func_alias_base[index];
30660 type = ix86_get_builtin_func_type (icode);
30663 ix86_builtin_func_type_tab[(int) tcode] = type;
30664 return type;
30668 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30669 bdesc_* arrays below should come first, then builtins for each bdesc_*
30670 array in ascending order, so that we can use direct array accesses. */
30671 enum ix86_builtins
30673 IX86_BUILTIN_MASKMOVQ,
30674 IX86_BUILTIN_LDMXCSR,
30675 IX86_BUILTIN_STMXCSR,
30676 IX86_BUILTIN_MASKMOVDQU,
30677 IX86_BUILTIN_PSLLDQ128,
30678 IX86_BUILTIN_CLFLUSH,
30679 IX86_BUILTIN_MONITOR,
30680 IX86_BUILTIN_MWAIT,
30681 IX86_BUILTIN_CLZERO,
30682 IX86_BUILTIN_VEC_INIT_V2SI,
30683 IX86_BUILTIN_VEC_INIT_V4HI,
30684 IX86_BUILTIN_VEC_INIT_V8QI,
30685 IX86_BUILTIN_VEC_EXT_V2DF,
30686 IX86_BUILTIN_VEC_EXT_V2DI,
30687 IX86_BUILTIN_VEC_EXT_V4SF,
30688 IX86_BUILTIN_VEC_EXT_V4SI,
30689 IX86_BUILTIN_VEC_EXT_V8HI,
30690 IX86_BUILTIN_VEC_EXT_V2SI,
30691 IX86_BUILTIN_VEC_EXT_V4HI,
30692 IX86_BUILTIN_VEC_EXT_V16QI,
30693 IX86_BUILTIN_VEC_SET_V2DI,
30694 IX86_BUILTIN_VEC_SET_V4SF,
30695 IX86_BUILTIN_VEC_SET_V4SI,
30696 IX86_BUILTIN_VEC_SET_V8HI,
30697 IX86_BUILTIN_VEC_SET_V4HI,
30698 IX86_BUILTIN_VEC_SET_V16QI,
30699 IX86_BUILTIN_GATHERSIV2DF,
30700 IX86_BUILTIN_GATHERSIV4DF,
30701 IX86_BUILTIN_GATHERDIV2DF,
30702 IX86_BUILTIN_GATHERDIV4DF,
30703 IX86_BUILTIN_GATHERSIV4SF,
30704 IX86_BUILTIN_GATHERSIV8SF,
30705 IX86_BUILTIN_GATHERDIV4SF,
30706 IX86_BUILTIN_GATHERDIV8SF,
30707 IX86_BUILTIN_GATHERSIV2DI,
30708 IX86_BUILTIN_GATHERSIV4DI,
30709 IX86_BUILTIN_GATHERDIV2DI,
30710 IX86_BUILTIN_GATHERDIV4DI,
30711 IX86_BUILTIN_GATHERSIV4SI,
30712 IX86_BUILTIN_GATHERSIV8SI,
30713 IX86_BUILTIN_GATHERDIV4SI,
30714 IX86_BUILTIN_GATHERDIV8SI,
30715 IX86_BUILTIN_VFMSUBSD3_MASK3,
30716 IX86_BUILTIN_VFMSUBSS3_MASK3,
30717 IX86_BUILTIN_GATHER3SIV8SF,
30718 IX86_BUILTIN_GATHER3SIV4SF,
30719 IX86_BUILTIN_GATHER3SIV4DF,
30720 IX86_BUILTIN_GATHER3SIV2DF,
30721 IX86_BUILTIN_GATHER3DIV8SF,
30722 IX86_BUILTIN_GATHER3DIV4SF,
30723 IX86_BUILTIN_GATHER3DIV4DF,
30724 IX86_BUILTIN_GATHER3DIV2DF,
30725 IX86_BUILTIN_GATHER3SIV8SI,
30726 IX86_BUILTIN_GATHER3SIV4SI,
30727 IX86_BUILTIN_GATHER3SIV4DI,
30728 IX86_BUILTIN_GATHER3SIV2DI,
30729 IX86_BUILTIN_GATHER3DIV8SI,
30730 IX86_BUILTIN_GATHER3DIV4SI,
30731 IX86_BUILTIN_GATHER3DIV4DI,
30732 IX86_BUILTIN_GATHER3DIV2DI,
30733 IX86_BUILTIN_SCATTERSIV8SF,
30734 IX86_BUILTIN_SCATTERSIV4SF,
30735 IX86_BUILTIN_SCATTERSIV4DF,
30736 IX86_BUILTIN_SCATTERSIV2DF,
30737 IX86_BUILTIN_SCATTERDIV8SF,
30738 IX86_BUILTIN_SCATTERDIV4SF,
30739 IX86_BUILTIN_SCATTERDIV4DF,
30740 IX86_BUILTIN_SCATTERDIV2DF,
30741 IX86_BUILTIN_SCATTERSIV8SI,
30742 IX86_BUILTIN_SCATTERSIV4SI,
30743 IX86_BUILTIN_SCATTERSIV4DI,
30744 IX86_BUILTIN_SCATTERSIV2DI,
30745 IX86_BUILTIN_SCATTERDIV8SI,
30746 IX86_BUILTIN_SCATTERDIV4SI,
30747 IX86_BUILTIN_SCATTERDIV4DI,
30748 IX86_BUILTIN_SCATTERDIV2DI,
30749 /* Alternate 4 and 8 element gather/scatter for the vectorizer
30750 where all operands are 32-byte or 64-byte wide respectively. */
30751 IX86_BUILTIN_GATHERALTSIV4DF,
30752 IX86_BUILTIN_GATHERALTDIV8SF,
30753 IX86_BUILTIN_GATHERALTSIV4DI,
30754 IX86_BUILTIN_GATHERALTDIV8SI,
30755 IX86_BUILTIN_GATHER3ALTDIV16SF,
30756 IX86_BUILTIN_GATHER3ALTDIV16SI,
30757 IX86_BUILTIN_GATHER3ALTSIV4DF,
30758 IX86_BUILTIN_GATHER3ALTDIV8SF,
30759 IX86_BUILTIN_GATHER3ALTSIV4DI,
30760 IX86_BUILTIN_GATHER3ALTDIV8SI,
30761 IX86_BUILTIN_GATHER3ALTSIV8DF,
30762 IX86_BUILTIN_GATHER3ALTSIV8DI,
30763 IX86_BUILTIN_GATHER3DIV16SF,
30764 IX86_BUILTIN_GATHER3DIV16SI,
30765 IX86_BUILTIN_GATHER3DIV8DF,
30766 IX86_BUILTIN_GATHER3DIV8DI,
30767 IX86_BUILTIN_GATHER3SIV16SF,
30768 IX86_BUILTIN_GATHER3SIV16SI,
30769 IX86_BUILTIN_GATHER3SIV8DF,
30770 IX86_BUILTIN_GATHER3SIV8DI,
30771 IX86_BUILTIN_SCATTERALTSIV8DF,
30772 IX86_BUILTIN_SCATTERALTDIV16SF,
30773 IX86_BUILTIN_SCATTERALTSIV8DI,
30774 IX86_BUILTIN_SCATTERALTDIV16SI,
30775 IX86_BUILTIN_SCATTERDIV16SF,
30776 IX86_BUILTIN_SCATTERDIV16SI,
30777 IX86_BUILTIN_SCATTERDIV8DF,
30778 IX86_BUILTIN_SCATTERDIV8DI,
30779 IX86_BUILTIN_SCATTERSIV16SF,
30780 IX86_BUILTIN_SCATTERSIV16SI,
30781 IX86_BUILTIN_SCATTERSIV8DF,
30782 IX86_BUILTIN_SCATTERSIV8DI,
30783 IX86_BUILTIN_GATHERPFQPD,
30784 IX86_BUILTIN_GATHERPFDPS,
30785 IX86_BUILTIN_GATHERPFDPD,
30786 IX86_BUILTIN_GATHERPFQPS,
30787 IX86_BUILTIN_SCATTERPFDPD,
30788 IX86_BUILTIN_SCATTERPFDPS,
30789 IX86_BUILTIN_SCATTERPFQPD,
30790 IX86_BUILTIN_SCATTERPFQPS,
30791 IX86_BUILTIN_CLWB,
30792 IX86_BUILTIN_CLFLUSHOPT,
30793 IX86_BUILTIN_INFQ,
30794 IX86_BUILTIN_HUGE_VALQ,
30795 IX86_BUILTIN_NANQ,
30796 IX86_BUILTIN_NANSQ,
30797 IX86_BUILTIN_XABORT,
30798 IX86_BUILTIN_ADDCARRYX32,
30799 IX86_BUILTIN_ADDCARRYX64,
30800 IX86_BUILTIN_SBB32,
30801 IX86_BUILTIN_SBB64,
30802 IX86_BUILTIN_RDRAND16_STEP,
30803 IX86_BUILTIN_RDRAND32_STEP,
30804 IX86_BUILTIN_RDRAND64_STEP,
30805 IX86_BUILTIN_RDSEED16_STEP,
30806 IX86_BUILTIN_RDSEED32_STEP,
30807 IX86_BUILTIN_RDSEED64_STEP,
30808 IX86_BUILTIN_MONITORX,
30809 IX86_BUILTIN_MWAITX,
30810 IX86_BUILTIN_CFSTRING,
30811 IX86_BUILTIN_CPU_INIT,
30812 IX86_BUILTIN_CPU_IS,
30813 IX86_BUILTIN_CPU_SUPPORTS,
30814 IX86_BUILTIN_READ_FLAGS,
30815 IX86_BUILTIN_WRITE_FLAGS,
30817 /* All the remaining builtins are tracked in bdesc_* arrays in
30818 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
30819 this point. */
30820 #define BDESC(mask, icode, name, code, comparison, flag) \
30821 code,
30822 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30823 code, \
30824 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
30825 #define BDESC_END(kind, next_kind)
30827 #include "i386-builtin.def"
30829 #undef BDESC
30830 #undef BDESC_FIRST
30831 #undef BDESC_END
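/* Illustrative sketch, not part of the original source: a hypothetical
   i386-builtin.def line such as

     BDESC (OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_foo",
            IX86_BUILTIN_FOO, UNKNOWN, (int) V4SI_FTYPE_V4SI)

   expands here to just "IX86_BUILTIN_FOO,", and the first entry of each
   kind additionally defines IX86_BUILTIN__BDESC_<KIND>_FIRST.  The
   enumerators therefore stay in exactly the same order as the bdesc_*
   tables generated from the same file further below.  */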
30833 IX86_BUILTIN_MAX,
30835 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
30837 /* Now just the aliases for bdesc_* start/end. */
30838 #define BDESC(mask, icode, name, code, comparison, flag)
30839 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
30840 #define BDESC_END(kind, next_kind) \
30841 IX86_BUILTIN__BDESC_##kind##_LAST \
30842 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
30844 #include "i386-builtin.def"
30846 #undef BDESC
30847 #undef BDESC_FIRST
30848 #undef BDESC_END
30850 /* Just to make sure there is no comma after the last enumerator. */
30851 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
30854 /* Table for the ix86 builtin decls. */
30855 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
30857 /* Table of all of the builtin functions that are possible with different ISA's
30858 but are waiting to be built until a function is declared to use that
30859 ISA. */
30860 struct builtin_isa {
30861 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
30862 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
30863 const char *name; /* function name */
30864 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
30865 unsigned char const_p:1; /* true if the declaration is constant */
30866 unsigned char pure_p:1; /* true if the declaration has pure attribute */
30867 bool leaf_p; /* true if the declaration has leaf attribute */
30868 bool nothrow_p; /* true if the declaration has nothrow attribute */
30869 bool set_and_not_built_p;
30872 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
30874 /* Bits that can still enable the inclusion of a builtin. */
30875 static HOST_WIDE_INT deferred_isa_values = 0;
30876 static HOST_WIDE_INT deferred_isa_values2 = 0;
30878 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
30879 of which isa_flags to use in the ix86_builtins_isa array. Stores the
30880 function decl in the ix86_builtins array. Returns the function decl or
30881 NULL_TREE, if the builtin was not added.
30883 If the front end has a special hook for builtin functions, delay adding
30884 builtin functions that aren't in the current ISA until the ISA is changed
30885 with function specific optimization. Doing so can save about 300K for the
30886 default compiler. When the builtin is expanded, check at that time whether
30887 it is valid.
30889 If the front end doesn't have a special hook, record all builtins, even
30890 those whose instruction set isn't in the current ISA, in case the user uses
30891 function specific options for a different ISA, so that we don't get scope
30892 errors if a builtin is added in the middle of a function scope. */
30894 static inline tree
30895 def_builtin (HOST_WIDE_INT mask, const char *name,
30896 enum ix86_builtin_func_type tcode,
30897 enum ix86_builtins code)
30899 tree decl = NULL_TREE;
30901 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
30903 ix86_builtins_isa[(int) code].isa = mask;
30905 mask &= ~OPTION_MASK_ISA_64BIT;
30907 /* Filter out the masks most often ORed together with others. */
30908 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
30909 && mask != OPTION_MASK_ISA_AVX512VL)
30910 mask &= ~OPTION_MASK_ISA_AVX512VL;
30911 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW)
30912 && mask != OPTION_MASK_ISA_AVX512BW)
30913 mask &= ~OPTION_MASK_ISA_AVX512BW;
30915 if (mask == 0
30916 || (mask & ix86_isa_flags) != 0
30917 || (lang_hooks.builtin_function
30918 == lang_hooks.builtin_function_ext_scope))
30920 tree type = ix86_get_builtin_func_type (tcode);
30921 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30922 NULL, NULL_TREE);
30923 ix86_builtins[(int) code] = decl;
30924 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30926 else
30928 /* Just a MASK where set_and_not_built_p == true can potentially
30929 include a builtin. */
30930 deferred_isa_values |= mask;
30931 ix86_builtins[(int) code] = NULL_TREE;
30932 ix86_builtins_isa[(int) code].tcode = tcode;
30933 ix86_builtins_isa[(int) code].name = name;
30934 ix86_builtins_isa[(int) code].leaf_p = false;
30935 ix86_builtins_isa[(int) code].nothrow_p = false;
30936 ix86_builtins_isa[(int) code].const_p = false;
30937 ix86_builtins_isa[(int) code].pure_p = false;
30938 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30942 return decl;
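/* Illustrative sketch, not part of the original source: when compiling
   without -mavx2 with a front end that has the ext_scope hook, a builtin
   whose mask requires AVX2 is only recorded above (name, type and isa
   flags in ix86_builtins_isa) rather than declared.  If the ISA is later
   extended, e.g. by a hypothetical

     __attribute__ ((target ("avx2"))) void f (void);

   the deferred declaration is created at that point by
   ix86_add_new_builtins below.  */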
30945 /* Like def_builtin, but also marks the function decl "const". */
30947 static inline tree
30948 def_builtin_const (HOST_WIDE_INT mask, const char *name,
30949 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30951 tree decl = def_builtin (mask, name, tcode, code);
30952 if (decl)
30953 TREE_READONLY (decl) = 1;
30954 else
30955 ix86_builtins_isa[(int) code].const_p = true;
30957 return decl;
30960 /* Like def_builtin, but also marks the function decl "pure". */
30962 static inline tree
30963 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
30964 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30966 tree decl = def_builtin (mask, name, tcode, code);
30967 if (decl)
30968 DECL_PURE_P (decl) = 1;
30969 else
30970 ix86_builtins_isa[(int) code].pure_p = true;
30972 return decl;
30975 /* Like def_builtin, but for additional isa2 flags. */
30977 static inline tree
30978 def_builtin2 (HOST_WIDE_INT mask, const char *name,
30979 enum ix86_builtin_func_type tcode,
30980 enum ix86_builtins code)
30982 tree decl = NULL_TREE;
30984 ix86_builtins_isa[(int) code].isa2 = mask;
30986 if (mask == 0
30987 || (mask & ix86_isa_flags2) != 0
30988 || (lang_hooks.builtin_function
30989 == lang_hooks.builtin_function_ext_scope))
30992 tree type = ix86_get_builtin_func_type (tcode);
30993 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30994 NULL, NULL_TREE);
30995 ix86_builtins[(int) code] = decl;
30996 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30998 else
31000 /* Just a MASK where set_and_not_built_p == true can potentially
31001 include a builtin. */
31002 deferred_isa_values2 |= mask;
31003 ix86_builtins[(int) code] = NULL_TREE;
31004 ix86_builtins_isa[(int) code].tcode = tcode;
31005 ix86_builtins_isa[(int) code].name = name;
31006 ix86_builtins_isa[(int) code].leaf_p = false;
31007 ix86_builtins_isa[(int) code].nothrow_p = false;
31008 ix86_builtins_isa[(int) code].const_p = false;
31009 ix86_builtins_isa[(int) code].pure_p = false;
31010 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
31013 return decl;
31016 /* Like def_builtin, but also marks the function decl "const". */
31018 static inline tree
31019 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
31020 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
31022 tree decl = def_builtin2 (mask, name, tcode, code);
31023 if (decl)
31024 TREE_READONLY (decl) = 1;
31025 else
31026 ix86_builtins_isa[(int) code].const_p = true;
31028 return decl;
31031 /* Like def_builtin, but also marks the function decl "pure". */
31033 static inline tree
31034 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
31035 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
31037 tree decl = def_builtin2 (mask, name, tcode, code);
31038 if (decl)
31039 DECL_PURE_P (decl) = 1;
31040 else
31041 ix86_builtins_isa[(int) code].pure_p = true;
31043 return decl;
31046 /* Add any new builtin functions for a given ISA that may not have been
31047 declared. This saves a bit of space compared to adding all of the
31048 declarations to the tree, even if we didn't use them. */
31050 static void
31051 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
31053 isa &= ~OPTION_MASK_ISA_64BIT;
31055 if ((isa & deferred_isa_values) == 0
31056 && (isa2 & deferred_isa_values2) == 0)
31057 return;
31059 /* Bits in ISA value can be removed from potential isa values. */
31060 deferred_isa_values &= ~isa;
31061 deferred_isa_values2 &= ~isa2;
31063 int i;
31064 tree saved_current_target_pragma = current_target_pragma;
31065 current_target_pragma = NULL_TREE;
31067 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
31069 if (((ix86_builtins_isa[i].isa & isa) != 0
31070 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
31071 && ix86_builtins_isa[i].set_and_not_built_p)
31073 tree decl, type;
31075 /* Don't define the builtin again. */
31076 ix86_builtins_isa[i].set_and_not_built_p = false;
31078 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
31079 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
31080 type, i, BUILT_IN_MD, NULL,
31081 NULL_TREE);
31083 ix86_builtins[i] = decl;
31084 if (ix86_builtins_isa[i].const_p)
31085 TREE_READONLY (decl) = 1;
31086 if (ix86_builtins_isa[i].pure_p)
31087 DECL_PURE_P (decl) = 1;
31088 if (ix86_builtins_isa[i].leaf_p)
31089 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31090 NULL_TREE);
31091 if (ix86_builtins_isa[i].nothrow_p)
31092 TREE_NOTHROW (decl) = 1;
31096 current_target_pragma = saved_current_target_pragma;
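/* Illustrative usage sketch, an assumption rather than the original
   source: after a target attribute or pragma has enabled extra ISA bits,
   a call along the lines of

     ix86_add_new_builtins (ix86_isa_flags, ix86_isa_flags2);

   creates the declarations that def_builtin/def_builtin2 deferred,
   using add_builtin_function_ext_scope so the new decls land at file
   scope even if the ISA change happens inside a function.  */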
31099 /* Bits for builtin_description.flag. */
31101 /* Set when we don't support the comparison natively, and should
31102 swap_comparison in order to support it. */
31103 #define BUILTIN_DESC_SWAP_OPERANDS 1
31105 struct builtin_description
31107 const HOST_WIDE_INT mask;
31108 const enum insn_code icode;
31109 const char *const name;
31110 const enum ix86_builtins code;
31111 const enum rtx_code comparison;
31112 const int flag;
31115 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
31116 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
31117 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
31118 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
31119 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
31120 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
31121 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
31122 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
31123 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
31124 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
31125 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
31126 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
31127 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
31128 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
31129 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
31130 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
31131 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
31132 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
31133 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
31134 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
31135 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
31136 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
31137 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
31138 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
31139 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
31140 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
31141 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
31142 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
31143 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
31144 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
31145 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
31146 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
31147 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
31148 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
31149 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
31150 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
31151 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
31152 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
31153 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
31154 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
31155 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
31156 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
31157 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
31158 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
31159 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
31160 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
31161 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
31162 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
31163 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
31164 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
31165 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
31166 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
31168 #define BDESC(mask, icode, name, code, comparison, flag) \
31169 { mask, icode, name, code, comparison, flag },
31170 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31171 static const struct builtin_description bdesc_##kind[] = \
31173 BDESC (mask, icode, name, code, comparison, flag)
31174 #define BDESC_END(kind, next_kind) \
31177 #include "i386-builtin.def"
31179 #undef BDESC
31180 #undef BDESC_FIRST
31181 #undef BDESC_END
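/* Illustrative sketch, not part of the original source: for kind "args"
   the macros above expand to roughly

     static const struct builtin_description bdesc_args[] =
     {
       { mask, icode, name, code, comparison, flag },
       ...
     };

   so every table entry sits at the same offset from
   IX86_BUILTIN__BDESC_ARGS_FIRST as its enumerator does in
   enum ix86_builtins; the BDESC_VERIFY checks below rely on that.  */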
31183 /* TM vector builtins. */
31185 /* Reuse the existing x86-specific `struct builtin_description' because
31186 we're lazy. Add casts to make them fit. */
31187 static const struct builtin_description bdesc_tm[] =
31189 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31190 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31191 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31192 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31193 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31194 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31195 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31197 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31198 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31199 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31200 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31201 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31202 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31203 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31205 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31206 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31207 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31208 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31209 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31210 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31211 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31213 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
31214 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31215 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31218 /* Initialize the transactional memory vector load/store builtins. */
31220 static void
31221 ix86_init_tm_builtins (void)
31223 enum ix86_builtin_func_type ftype;
31224 const struct builtin_description *d;
31225 size_t i;
31226 tree decl;
31227 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31228 tree attrs_log, attrs_type_log;
31230 if (!flag_tm)
31231 return;
31233 /* If there are no builtins defined, we must be compiling in a
31234 language without trans-mem support. */
31235 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31236 return;
31238 /* Use whatever attributes a normal TM load has. */
31239 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31240 attrs_load = DECL_ATTRIBUTES (decl);
31241 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31242 /* Use whatever attributes a normal TM store has. */
31243 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31244 attrs_store = DECL_ATTRIBUTES (decl);
31245 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31246 /* Use whatever attributes a normal TM log has. */
31247 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31248 attrs_log = DECL_ATTRIBUTES (decl);
31249 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31251 for (i = 0, d = bdesc_tm;
31252 i < ARRAY_SIZE (bdesc_tm);
31253 i++, d++)
31255 if ((d->mask & ix86_isa_flags) != 0
31256 || (lang_hooks.builtin_function
31257 == lang_hooks.builtin_function_ext_scope))
31259 tree type, attrs, attrs_type;
31260 enum built_in_function code = (enum built_in_function) d->code;
31262 ftype = (enum ix86_builtin_func_type) d->flag;
31263 type = ix86_get_builtin_func_type (ftype);
31265 if (BUILTIN_TM_LOAD_P (code))
31267 attrs = attrs_load;
31268 attrs_type = attrs_type_load;
31270 else if (BUILTIN_TM_STORE_P (code))
31272 attrs = attrs_store;
31273 attrs_type = attrs_type_store;
31275 else
31277 attrs = attrs_log;
31278 attrs_type = attrs_type_log;
31280 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31281 /* The builtin without the prefix for
31282 calling it directly. */
31283 d->name + strlen ("__builtin_"),
31284 attrs);
31285 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31286 set the TYPE_ATTRIBUTES. */
31287 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31289 set_builtin_decl (code, decl, false);
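/* Illustrative sketch, not part of the original source: the first SSE
   entry of bdesc_tm, "__builtin__ITM_WM128", is registered here with the
   VOID_FTYPE_PV4SF_V4SF signature, the attributes copied from
   BUILT_IN_TM_STORE_1, and "_ITM_WM128" (d->name + strlen ("__builtin_"))
   as the name used when the function is called directly.  */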
31294 /* Macros for verification of enum ix86_builtins order. */
31295 #define BDESC_VERIFY(x, y, z) \
31296 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31297 #define BDESC_VERIFYS(x, y, z) \
31298 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31300 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31301 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31302 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31303 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31304 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31305 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31306 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31307 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31308 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31309 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31310 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
31311 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31312 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
31313 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
31314 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31315 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST, 1);
31316 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31317 IX86_BUILTIN__BDESC_MPX_LAST, 1);
31318 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31319 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31320 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
31321 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31322 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
31323 IX86_BUILTIN__BDESC_CET_LAST, 1);
31324 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31325 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
31327 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
31328 in the current target ISA to allow the user to compile particular modules
31329 with different target specific options that differ from the command line
31330 options. */
31331 static void
31332 ix86_init_mmx_sse_builtins (void)
31334 const struct builtin_description * d;
31335 enum ix86_builtin_func_type ftype;
31336 size_t i;
31338 /* Add all special builtins with variable number of operands. */
31339 for (i = 0, d = bdesc_special_args;
31340 i < ARRAY_SIZE (bdesc_special_args);
31341 i++, d++)
31343 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31344 if (d->name == 0)
31345 continue;
31347 ftype = (enum ix86_builtin_func_type) d->flag;
31348 def_builtin (d->mask, d->name, ftype, d->code);
31350 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31351 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31352 ARRAY_SIZE (bdesc_special_args) - 1);
31354 /* Add all special builtins with variable number of operands (isa2 flags). */
31355 for (i = 0, d = bdesc_special_args2;
31356 i < ARRAY_SIZE (bdesc_special_args2);
31357 i++, d++)
31359 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST, i);
31360 if (d->name == 0)
31361 continue;
31363 ftype = (enum ix86_builtin_func_type) d->flag;
31364 def_builtin2 (d->mask, d->name, ftype, d->code);
31366 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST,
31367 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
31368 ARRAY_SIZE (bdesc_special_args2) - 1);
31370 /* Add all builtins with variable number of operands. */
31371 for (i = 0, d = bdesc_args;
31372 i < ARRAY_SIZE (bdesc_args);
31373 i++, d++)
31375 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31376 if (d->name == 0)
31377 continue;
31379 ftype = (enum ix86_builtin_func_type) d->flag;
31380 def_builtin_const (d->mask, d->name, ftype, d->code);
31382 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31383 IX86_BUILTIN__BDESC_ARGS_FIRST,
31384 ARRAY_SIZE (bdesc_args) - 1);
31386 /* Add all builtins with variable number of operands (isa2 flags). */
31387 for (i = 0, d = bdesc_args2;
31388 i < ARRAY_SIZE (bdesc_args2);
31389 i++, d++)
31391 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
31392 if (d->name == 0)
31393 continue;
31395 ftype = (enum ix86_builtin_func_type) d->flag;
31396 def_builtin_const2 (d->mask, d->name, ftype, d->code);
31398 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
31399 IX86_BUILTIN__BDESC_ARGS2_FIRST,
31400 ARRAY_SIZE (bdesc_args2) - 1);
31402 /* Add all builtins with rounding. */
31403 for (i = 0, d = bdesc_round_args;
31404 i < ARRAY_SIZE (bdesc_round_args);
31405 i++, d++)
31407 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31408 if (d->name == 0)
31409 continue;
31411 ftype = (enum ix86_builtin_func_type) d->flag;
31412 def_builtin_const (d->mask, d->name, ftype, d->code);
31414 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31415 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31416 ARRAY_SIZE (bdesc_round_args) - 1);
31418 /* pcmpestr[im] insns. */
31419 for (i = 0, d = bdesc_pcmpestr;
31420 i < ARRAY_SIZE (bdesc_pcmpestr);
31421 i++, d++)
31423 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31424 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31425 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31426 else
31427 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31428 def_builtin_const (d->mask, d->name, ftype, d->code);
31430 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31431 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31432 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31434 /* pcmpistr[im] insns. */
31435 for (i = 0, d = bdesc_pcmpistr;
31436 i < ARRAY_SIZE (bdesc_pcmpistr);
31437 i++, d++)
31439 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31440 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31441 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31442 else
31443 ftype = INT_FTYPE_V16QI_V16QI_INT;
31444 def_builtin_const (d->mask, d->name, ftype, d->code);
31446 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31447 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31448 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31450 /* comi/ucomi insns. */
31451 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31453 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31454 if (d->mask == OPTION_MASK_ISA_SSE2)
31455 ftype = INT_FTYPE_V2DF_V2DF;
31456 else
31457 ftype = INT_FTYPE_V4SF_V4SF;
31458 def_builtin_const (d->mask, d->name, ftype, d->code);
31460 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31461 IX86_BUILTIN__BDESC_COMI_FIRST,
31462 ARRAY_SIZE (bdesc_comi) - 1);
31464 /* SSE */
31465 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31466 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31467 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31468 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31470 /* SSE or 3DNow!A */
31471 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31472 /* As it uses V4HImode, we have to require -mmmx too. */
31473 | OPTION_MASK_ISA_MMX,
31474 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31475 IX86_BUILTIN_MASKMOVQ);
31477 /* SSE2 */
31478 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31479 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31481 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31482 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31483 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31484 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31486 /* SSE3. */
31487 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31488 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31489 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31490 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31492 /* AES */
31493 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31494 "__builtin_ia32_aesenc128",
31495 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31496 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31497 "__builtin_ia32_aesenclast128",
31498 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31499 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31500 "__builtin_ia32_aesdec128",
31501 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31502 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31503 "__builtin_ia32_aesdeclast128",
31504 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31505 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31506 "__builtin_ia32_aesimc128",
31507 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31508 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31509 "__builtin_ia32_aeskeygenassist128",
31510 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31512 /* PCLMUL */
31513 def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2,
31514 "__builtin_ia32_pclmulqdq128",
31515 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31517 /* RDRND */
31518 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31519 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31520 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31521 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31522 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31523 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31524 IX86_BUILTIN_RDRAND64_STEP);
31526 /* AVX2 */
31527 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31528 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31529 IX86_BUILTIN_GATHERSIV2DF);
31531 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31532 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31533 IX86_BUILTIN_GATHERSIV4DF);
31535 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31536 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31537 IX86_BUILTIN_GATHERDIV2DF);
31539 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31540 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31541 IX86_BUILTIN_GATHERDIV4DF);
31543 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31544 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31545 IX86_BUILTIN_GATHERSIV4SF);
31547 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31548 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31549 IX86_BUILTIN_GATHERSIV8SF);
31551 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31552 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31553 IX86_BUILTIN_GATHERDIV4SF);
31555 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31556 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31557 IX86_BUILTIN_GATHERDIV8SF);
31559 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31560 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31561 IX86_BUILTIN_GATHERSIV2DI);
31563 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31564 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31565 IX86_BUILTIN_GATHERSIV4DI);
31567 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31568 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31569 IX86_BUILTIN_GATHERDIV2DI);
31571 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31572 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31573 IX86_BUILTIN_GATHERDIV4DI);
31575 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31576 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31577 IX86_BUILTIN_GATHERSIV4SI);
31579 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31580 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31581 IX86_BUILTIN_GATHERSIV8SI);
31583 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31584 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31585 IX86_BUILTIN_GATHERDIV4SI);
31587 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31588 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31589 IX86_BUILTIN_GATHERDIV8SI);
31591 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31592 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31593 IX86_BUILTIN_GATHERALTSIV4DF);
31595 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31596 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31597 IX86_BUILTIN_GATHERALTDIV8SF);
31599 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31600 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31601 IX86_BUILTIN_GATHERALTSIV4DI);
31603 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31604 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31605 IX86_BUILTIN_GATHERALTDIV8SI);
31607 /* AVX512F */
31608 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31609 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
31610 IX86_BUILTIN_GATHER3SIV16SF);
31612 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31613 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
31614 IX86_BUILTIN_GATHER3SIV8DF);
31616 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31617 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
31618 IX86_BUILTIN_GATHER3DIV16SF);
31620 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31621 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
31622 IX86_BUILTIN_GATHER3DIV8DF);
31624 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31625 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
31626 IX86_BUILTIN_GATHER3SIV16SI);
31628 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31629 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
31630 IX86_BUILTIN_GATHER3SIV8DI);
31632 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31633 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
31634 IX86_BUILTIN_GATHER3DIV16SI);
31636 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31637 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
31638 IX86_BUILTIN_GATHER3DIV8DI);
31640 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31641 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31642 IX86_BUILTIN_GATHER3ALTSIV8DF);
31644 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31645 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31646 IX86_BUILTIN_GATHER3ALTDIV16SF);
31648 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31649 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31650 IX86_BUILTIN_GATHER3ALTSIV8DI);
31652 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31653 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31654 IX86_BUILTIN_GATHER3ALTDIV16SI);
31656 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31657 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
31658 IX86_BUILTIN_SCATTERSIV16SF);
31660 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31661 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
31662 IX86_BUILTIN_SCATTERSIV8DF);
31664 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31665 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
31666 IX86_BUILTIN_SCATTERDIV16SF);
31668 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31669 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
31670 IX86_BUILTIN_SCATTERDIV8DF);
31672 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31673 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
31674 IX86_BUILTIN_SCATTERSIV16SI);
31676 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31677 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
31678 IX86_BUILTIN_SCATTERSIV8DI);
31680 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31681 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
31682 IX86_BUILTIN_SCATTERDIV16SI);
31684 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31685 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
31686 IX86_BUILTIN_SCATTERDIV8DI);
31688 /* AVX512VL */
31689 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31690 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
31691 IX86_BUILTIN_GATHER3SIV2DF);
31693 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31694 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
31695 IX86_BUILTIN_GATHER3SIV4DF);
31697 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31698 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
31699 IX86_BUILTIN_GATHER3DIV2DF);
31701 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31702 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
31703 IX86_BUILTIN_GATHER3DIV4DF);
31705 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31706 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
31707 IX86_BUILTIN_GATHER3SIV4SF);
31709 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31710 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
31711 IX86_BUILTIN_GATHER3SIV8SF);
31713 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31714 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
31715 IX86_BUILTIN_GATHER3DIV4SF);
31717 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31718 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
31719 IX86_BUILTIN_GATHER3DIV8SF);
31721 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31722 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
31723 IX86_BUILTIN_GATHER3SIV2DI);
31725 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31726 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
31727 IX86_BUILTIN_GATHER3SIV4DI);
31729 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31730 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
31731 IX86_BUILTIN_GATHER3DIV2DI);
31733 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31734 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
31735 IX86_BUILTIN_GATHER3DIV4DI);
31737 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31738 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
31739 IX86_BUILTIN_GATHER3SIV4SI);
31741 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31742 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
31743 IX86_BUILTIN_GATHER3SIV8SI);
31745 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31746 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
31747 IX86_BUILTIN_GATHER3DIV4SI);
31749 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31750 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
31751 IX86_BUILTIN_GATHER3DIV8SI);
31753 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31754 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31755 IX86_BUILTIN_GATHER3ALTSIV4DF);
31757 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31758 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31759 IX86_BUILTIN_GATHER3ALTDIV8SF);
31761 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31762 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31763 IX86_BUILTIN_GATHER3ALTSIV4DI);
31765 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31766 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31767 IX86_BUILTIN_GATHER3ALTDIV8SI);
31769 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31770 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
31771 IX86_BUILTIN_SCATTERSIV8SF);
31773 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31774 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
31775 IX86_BUILTIN_SCATTERSIV4SF);
31777 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31778 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
31779 IX86_BUILTIN_SCATTERSIV4DF);
31781 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31782 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
31783 IX86_BUILTIN_SCATTERSIV2DF);
31785 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31786 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
31787 IX86_BUILTIN_SCATTERDIV8SF);
31789 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31790 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
31791 IX86_BUILTIN_SCATTERDIV4SF);
31793 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31794 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
31795 IX86_BUILTIN_SCATTERDIV4DF);
31797 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31798 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
31799 IX86_BUILTIN_SCATTERDIV2DF);
31801 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31802 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
31803 IX86_BUILTIN_SCATTERSIV8SI);
31805 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31806 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
31807 IX86_BUILTIN_SCATTERSIV4SI);
31809 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31810 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
31811 IX86_BUILTIN_SCATTERSIV4DI);
31813 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31814 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
31815 IX86_BUILTIN_SCATTERSIV2DI);
31817 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31818 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
31819 IX86_BUILTIN_SCATTERDIV8SI);
31821 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31822 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
31823 IX86_BUILTIN_SCATTERDIV4SI);
31825 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31826 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
31827 IX86_BUILTIN_SCATTERDIV4DI);
31829 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31830 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
31831 IX86_BUILTIN_SCATTERDIV2DI);
31832 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31833 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31834 IX86_BUILTIN_SCATTERALTSIV8DF);
31836 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31837 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31838 IX86_BUILTIN_SCATTERALTDIV16SF);
31840 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31841 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31842 IX86_BUILTIN_SCATTERALTSIV8DI);
31844 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31845 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31846 IX86_BUILTIN_SCATTERALTDIV16SI);
31848 /* AVX512PF */
31849 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31850 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31851 IX86_BUILTIN_GATHERPFDPD);
31852 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31853 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31854 IX86_BUILTIN_GATHERPFDPS);
31855 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31856 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31857 IX86_BUILTIN_GATHERPFQPD);
31858 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31859 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31860 IX86_BUILTIN_GATHERPFQPS);
31861 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31862 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31863 IX86_BUILTIN_SCATTERPFDPD);
31864 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31865 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31866 IX86_BUILTIN_SCATTERPFDPS);
31867 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31868 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31869 IX86_BUILTIN_SCATTERPFQPD);
31870 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31871 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31872 IX86_BUILTIN_SCATTERPFQPS);
31874 /* SHA */
31875 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31876 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31877 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31878 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31879 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31880 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31881 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31882 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31883 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31884 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31885 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31886 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31887 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31888 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31890 /* RTM. */
31891 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31892 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
31894 /* MMX access to the vec_init patterns. */
31895 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
31896 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
31898 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
31899 V4HI_FTYPE_HI_HI_HI_HI,
31900 IX86_BUILTIN_VEC_INIT_V4HI);
31902 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
31903 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
31904 IX86_BUILTIN_VEC_INIT_V8QI);
31906 /* Access to the vec_extract patterns. */
31907 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
31908 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
31909 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
31910 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
31911 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
31912 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
31913 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
31914 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
31915 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
31916 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
31918 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31919 /* As it uses V4HImode, we have to require -mmmx too. */
31920 | OPTION_MASK_ISA_MMX,
31921 "__builtin_ia32_vec_ext_v4hi",
31922 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
31924 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
31925 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
31927 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
31928 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
31930 /* Access to the vec_set patterns. */
31931 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
31932 "__builtin_ia32_vec_set_v2di",
31933 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
31935 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
31936 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
31938 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
31939 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
31941 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
31942 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
31944 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31945 /* As it uses V4HImode, we have to require -mmmx too. */
31946 | OPTION_MASK_ISA_MMX,
31947 "__builtin_ia32_vec_set_v4hi",
31948 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
31950 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
31951 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
31953 /* RDSEED */
31954 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
31955 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
31956 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
31957 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
31958 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
31959 "__builtin_ia32_rdseed_di_step",
31960 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
31962 /* ADCX */
31963 def_builtin (0, "__builtin_ia32_addcarryx_u32",
31964 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
31965 def_builtin (OPTION_MASK_ISA_64BIT,
31966 "__builtin_ia32_addcarryx_u64",
31967 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31968 IX86_BUILTIN_ADDCARRYX64);
31970 /* SBB */
31971 def_builtin (0, "__builtin_ia32_sbb_u32",
31972 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
31973 def_builtin (OPTION_MASK_ISA_64BIT,
31974 "__builtin_ia32_sbb_u64",
31975 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31976 IX86_BUILTIN_SBB64);
31978 /* Read/write FLAGS. */
31979 def_builtin (0, "__builtin_ia32_readeflags_u32",
31980 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31981 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
31982 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31983 def_builtin (0, "__builtin_ia32_writeeflags_u32",
31984 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
31985 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
31986 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
31988 /* CLFLUSHOPT. */
31989 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
31990 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
31992 /* CLWB. */
31993 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
31994 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
31996 /* MONITORX and MWAITX. */
31997 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
31998 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
31999 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
32000 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
32002 /* CLZERO. */
32003 def_builtin2 (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
32004 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
32006 /* Add FMA4 multi-arg instructions. */
32007 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32009 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
32010 if (d->name == 0)
32011 continue;
32013 ftype = (enum ix86_builtin_func_type) d->flag;
32014 def_builtin_const (d->mask, d->name, ftype, d->code);
32016 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
32017 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32018 ARRAY_SIZE (bdesc_multi_arg) - 1);
32020 /* Add CET intrinsics. */
32021 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
32023 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
32024 if (d->name == 0)
32025 continue;
32027 ftype = (enum ix86_builtin_func_type) d->flag;
32028 def_builtin (d->mask, d->name, ftype, d->code);
32030 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
32031 IX86_BUILTIN__BDESC_CET_FIRST,
32032 ARRAY_SIZE (bdesc_cet) - 1);
32034 for (i = 0, d = bdesc_cet_rdssp;
32035 i < ARRAY_SIZE (bdesc_cet_rdssp);
32036 i++, d++)
32038 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
32039 if (d->name == 0)
32040 continue;
32042 ftype = (enum ix86_builtin_func_type) d->flag;
32043 def_builtin (d->mask, d->name, ftype, d->code);
32045 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
32046 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
32047 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
32050 static void
32051 ix86_init_mpx_builtins ()
32053 const struct builtin_description * d;
32054 enum ix86_builtin_func_type ftype;
32055 tree decl;
32056 size_t i;
32058 for (i = 0, d = bdesc_mpx;
32059 i < ARRAY_SIZE (bdesc_mpx);
32060 i++, d++)
32062 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
32063 if (d->name == 0)
32064 continue;
32066 ftype = (enum ix86_builtin_func_type) d->flag;
32067 decl = def_builtin2 (d->mask, d->name, ftype, d->code);
32069 /* Without leaf and nothrow flags on MPX builtins,
32070 abnormal edges may follow their calls when setjmp
32071 is present in the function. Since we may have a lot
32072 of MPX builtin calls, this causes lots of useless
32073 edges and enormous PHI nodes. To avoid this we mark
32074 MPX builtins as leaf and nothrow. */
32075 if (decl)
32077 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32078 NULL_TREE);
32079 TREE_NOTHROW (decl) = 1;
32081 else
32083 ix86_builtins_isa[(int)d->code].leaf_p = true;
32084 ix86_builtins_isa[(int)d->code].nothrow_p = true;
32087 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
32088 IX86_BUILTIN__BDESC_MPX_FIRST,
32089 ARRAY_SIZE (bdesc_mpx) - 1);
32091 for (i = 0, d = bdesc_mpx_const;
32092 i < ARRAY_SIZE (bdesc_mpx_const);
32093 i++, d++)
32095 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
32096 if (d->name == 0)
32097 continue;
32099 ftype = (enum ix86_builtin_func_type) d->flag;
32100 decl = def_builtin_const2 (d->mask, d->name, ftype, d->code);
32102 if (decl)
32104 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32105 NULL_TREE);
32106 TREE_NOTHROW (decl) = 1;
32108 else
32110 ix86_builtins_isa[(int)d->code].leaf_p = true;
32111 ix86_builtins_isa[(int)d->code].nothrow_p = true;
32114 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
32115 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32116 ARRAY_SIZE (bdesc_mpx_const) - 1);
32118 #undef BDESC_VERIFY
32119 #undef BDESC_VERIFYS
32121 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
32122 to return a pointer to VERSION_DECL if the outcome of the expression
32123 formed by PREDICATE_CHAIN is true. This function will be called during
32124 version dispatch to decide which function version to execute. It returns
32125 the basic block at the end, to which more conditions can be added. */
32127 static basic_block
32128 add_condition_to_bb (tree function_decl, tree version_decl,
32129 tree predicate_chain, basic_block new_bb)
32131 gimple *return_stmt;
32132 tree convert_expr, result_var;
32133 gimple *convert_stmt;
32134 gimple *call_cond_stmt;
32135 gimple *if_else_stmt;
32137 basic_block bb1, bb2, bb3;
32138 edge e12, e23;
32140 tree cond_var, and_expr_var = NULL_TREE;
32141 gimple_seq gseq;
32143 tree predicate_decl, predicate_arg;
32145 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
32147 gcc_assert (new_bb != NULL);
32148 gseq = bb_seq (new_bb);
32151 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
32152 build_fold_addr_expr (version_decl));
32153 result_var = create_tmp_var (ptr_type_node);
32154 convert_stmt = gimple_build_assign (result_var, convert_expr);
32155 return_stmt = gimple_build_return (result_var);
32157 if (predicate_chain == NULL_TREE)
32159 gimple_seq_add_stmt (&gseq, convert_stmt);
32160 gimple_seq_add_stmt (&gseq, return_stmt);
32161 set_bb_seq (new_bb, gseq);
32162 gimple_set_bb (convert_stmt, new_bb);
32163 gimple_set_bb (return_stmt, new_bb);
32164 pop_cfun ();
32165 return new_bb;
32168 while (predicate_chain != NULL)
32170 cond_var = create_tmp_var (integer_type_node);
32171 predicate_decl = TREE_PURPOSE (predicate_chain);
32172 predicate_arg = TREE_VALUE (predicate_chain);
32173 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
32174 gimple_call_set_lhs (call_cond_stmt, cond_var);
32176 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
32177 gimple_set_bb (call_cond_stmt, new_bb);
32178 gimple_seq_add_stmt (&gseq, call_cond_stmt);
32180 predicate_chain = TREE_CHAIN (predicate_chain);
32182 if (and_expr_var == NULL)
32183 and_expr_var = cond_var;
32184 else
32186 gimple *assign_stmt;
32187 /* Use MIN_EXPR to check whether any integer is zero:
32188 and_expr_var = min_expr <cond_var, and_expr_var> */
32189 assign_stmt = gimple_build_assign (and_expr_var,
32190 build2 (MIN_EXPR, integer_type_node,
32191 cond_var, and_expr_var));
32193 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
32194 gimple_set_bb (assign_stmt, new_bb);
32195 gimple_seq_add_stmt (&gseq, assign_stmt);
32199 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
32200 integer_zero_node,
32201 NULL_TREE, NULL_TREE);
32202 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
32203 gimple_set_bb (if_else_stmt, new_bb);
32204 gimple_seq_add_stmt (&gseq, if_else_stmt);
32206 gimple_seq_add_stmt (&gseq, convert_stmt);
32207 gimple_seq_add_stmt (&gseq, return_stmt);
32208 set_bb_seq (new_bb, gseq);
32210 bb1 = new_bb;
32211 e12 = split_block (bb1, if_else_stmt);
32212 bb2 = e12->dest;
32213 e12->flags &= ~EDGE_FALLTHRU;
32214 e12->flags |= EDGE_TRUE_VALUE;
32216 e23 = split_block (bb2, return_stmt);
32218 gimple_set_bb (convert_stmt, bb2);
32219 gimple_set_bb (return_stmt, bb2);
32221 bb3 = e23->dest;
32222 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
32224 remove_edge (e23);
32225 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
32227 pop_cfun ();
32229 return bb3;
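/* Illustrative sketch, not part of the original source: for one version
   guarded by one predicate, the block built above looks roughly like

     cond = predicate_decl (predicate_arg);   e.g. a CPU-feature check
     if (cond > 0)
       {
         result = (void *) &version_decl;
         return result;
       }
     FALSE edge: fall through to the returned block for the next version

   and with several predicates the cond values are folded together with
   MIN_EXPR first, so the test only passes when every predicate is
   non-zero.  */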
32232 /* This parses the attribute arguments to target in DECL and determines
32233 the right builtin to use to match the platform specification.
32234 It returns the priority value for this version decl. If PREDICATE_LIST
32235 is not NULL, it stores the list of cpu features that need to be checked
32236 before dispatching this function. */
32238 static unsigned int
32239 get_builtin_code_for_version (tree decl, tree *predicate_list)
32241 tree attrs;
32242 struct cl_target_option cur_target;
32243 tree target_node;
32244 struct cl_target_option *new_target;
32245 const char *arg_str = NULL;
32246 const char *attrs_str = NULL;
32247 char *tok_str = NULL;
32248 char *token;
32250 /* Priority of i386 features, greater value is higher priority. This is
32251 used to decide the order in which function dispatch must happen. For
32252 instance, a version specialized for SSE4.2 should be checked for dispatch
32253 before a version for SSE3, as SSE4.2 implies SSE3. */
32254 enum feature_priority
32256 P_ZERO = 0,
32257 P_MMX,
32258 P_SSE,
32259 P_SSE2,
32260 P_SSE3,
32261 P_SSSE3,
32262 P_PROC_SSSE3,
32263 P_SSE4_A,
32264 P_PROC_SSE4_A,
32265 P_SSE4_1,
32266 P_SSE4_2,
32267 P_PROC_SSE4_2,
32268 P_POPCNT,
32269 P_AES,
32270 P_PCLMUL,
32271 P_AVX,
32272 P_PROC_AVX,
32273 P_BMI,
32274 P_PROC_BMI,
32275 P_FMA4,
32276 P_XOP,
32277 P_PROC_XOP,
32278 P_FMA,
32279 P_PROC_FMA,
32280 P_BMI2,
32281 P_AVX2,
32282 P_PROC_AVX2,
32283 P_AVX512F,
32284 P_PROC_AVX512F
32287 enum feature_priority priority = P_ZERO;
32289 /* These are the target attribute strings for which a dispatcher is
32290 available, from fold_builtin_cpu. */
32292 static struct _feature_list
32294 const char *const name;
32295 const enum feature_priority priority;
32297 const feature_list[] =
32299 {"mmx", P_MMX},
32300 {"sse", P_SSE},
32301 {"sse2", P_SSE2},
32302 {"sse3", P_SSE3},
32303 {"sse4a", P_SSE4_A},
32304 {"ssse3", P_SSSE3},
32305 {"sse4.1", P_SSE4_1},
32306 {"sse4.2", P_SSE4_2},
32307 {"popcnt", P_POPCNT},
32308 {"aes", P_AES},
32309 {"pclmul", P_PCLMUL},
32310 {"avx", P_AVX},
32311 {"bmi", P_BMI},
32312 {"fma4", P_FMA4},
32313 {"xop", P_XOP},
32314 {"fma", P_FMA},
32315 {"bmi2", P_BMI2},
32316 {"avx2", P_AVX2},
32317 {"avx512f", P_AVX512F}
32321 static unsigned int NUM_FEATURES
32322 = sizeof (feature_list) / sizeof (struct _feature_list);
32324 unsigned int i;
32326 tree predicate_chain = NULL_TREE;
32327 tree predicate_decl, predicate_arg;
32329 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32330 gcc_assert (attrs != NULL);
32332 attrs = TREE_VALUE (TREE_VALUE (attrs));
32334 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32335 attrs_str = TREE_STRING_POINTER (attrs);
32337 /* Return priority zero for default function. */
32338 if (strcmp (attrs_str, "default") == 0)
32339 return 0;
32341 /* Handle arch= if specified. For priority, set it to be 1 more than
32342 the best instruction set the processor can handle. For instance, if
32343 there is a version for atom and a version for ssse3 (the highest ISA
32344 priority for atom), the atom version must be checked for dispatch
32345 before the ssse3 version. */
32346 if (strstr (attrs_str, "arch=") != NULL)
32348 cl_target_option_save (&cur_target, &global_options);
32349 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32350 &global_options_set);
32352 gcc_assert (target_node);
32353 new_target = TREE_TARGET_OPTION (target_node);
32354 gcc_assert (new_target);
32356 if (new_target->arch_specified && new_target->arch > 0)
32358 switch (new_target->arch)
32360 case PROCESSOR_CORE2:
32361 arg_str = "core2";
32362 priority = P_PROC_SSSE3;
32363 break;
32364 case PROCESSOR_NEHALEM:
32365 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32367 arg_str = "westmere";
32368 priority = P_AES;
32370 else
32372 /* We translate "arch=corei7" and "arch=nehalem" to
32373 "corei7" so that it will be mapped to M_INTEL_COREI7
32374 as cpu type to cover all M_INTEL_COREI7_XXXs. */
32375 arg_str = "corei7";
32376 priority = P_PROC_SSE4_2;
32378 break;
32379 case PROCESSOR_SANDYBRIDGE:
32380 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32381 arg_str = "ivybridge";
32382 else
32383 arg_str = "sandybridge";
32384 priority = P_PROC_AVX;
32385 break;
32386 case PROCESSOR_HASWELL:
32387 case PROCESSOR_SKYLAKE_AVX512:
32388 if (new_target->x_ix86_isa_flags
32389 & OPTION_MASK_ISA_AVX512VBMI)
32390 arg_str = "cannonlake";
32391 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32392 arg_str = "skylake-avx512";
32393 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
32394 arg_str = "skylake";
32395 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32396 arg_str = "broadwell";
32397 else
32398 arg_str = "haswell";
32399 priority = P_PROC_AVX2;
32400 break;
32401 case PROCESSOR_ICELAKE_CLIENT:
32402 arg_str = "icelake-client";
32403 priority = P_PROC_AVX2;
32404 break;
32405 case PROCESSOR_ICELAKE_SERVER:
32406 arg_str = "icelake-server";
32407 priority = P_PROC_AVX2;
32408 break;
32409 case PROCESSOR_BONNELL:
32410 arg_str = "bonnell";
32411 priority = P_PROC_SSSE3;
32412 break;
32413 case PROCESSOR_KNL:
32414 arg_str = "knl";
32415 priority = P_PROC_AVX512F;
32416 break;
32417 case PROCESSOR_KNM:
32418 arg_str = "knm";
32419 priority = P_PROC_AVX512F;
32420 break;
32421 case PROCESSOR_SILVERMONT:
32422 arg_str = "silvermont";
32423 priority = P_PROC_SSE4_2;
32424 break;
32425 case PROCESSOR_AMDFAM10:
32426 arg_str = "amdfam10h";
32427 priority = P_PROC_SSE4_A;
32428 break;
32429 case PROCESSOR_BTVER1:
32430 arg_str = "btver1";
32431 priority = P_PROC_SSE4_A;
32432 break;
32433 case PROCESSOR_BTVER2:
32434 arg_str = "btver2";
32435 priority = P_PROC_BMI;
32436 break;
32437 case PROCESSOR_BDVER1:
32438 arg_str = "bdver1";
32439 priority = P_PROC_XOP;
32440 break;
32441 case PROCESSOR_BDVER2:
32442 arg_str = "bdver2";
32443 priority = P_PROC_FMA;
32444 break;
32445 case PROCESSOR_BDVER3:
32446 arg_str = "bdver3";
32447 priority = P_PROC_FMA;
32448 break;
32449 case PROCESSOR_BDVER4:
32450 arg_str = "bdver4";
32451 priority = P_PROC_AVX2;
32452 break;
32453 case PROCESSOR_ZNVER1:
32454 arg_str = "znver1";
32455 priority = P_PROC_AVX2;
32456 break;
32460 cl_target_option_restore (&global_options, &cur_target);
32462 if (predicate_list && arg_str == NULL)
32464 error_at (DECL_SOURCE_LOCATION (decl),
32465 "No dispatcher found for the versioning attributes");
32466 return 0;
32469 if (predicate_list)
32471 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32472 /* For a C string literal the length includes the trailing NULL. */
32473 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32474 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32475 predicate_chain);
32479 /* Process feature name. */
32480 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32481 strcpy (tok_str, attrs_str);
32482 token = strtok (tok_str, ",");
32483 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32485 while (token != NULL)
32487 /* Do not process "arch=" */
32488 if (strncmp (token, "arch=", 5) == 0)
32490 token = strtok (NULL, ",");
32491 continue;
32493 for (i = 0; i < NUM_FEATURES; ++i)
32495 if (strcmp (token, feature_list[i].name) == 0)
32497 if (predicate_list)
32499 predicate_arg = build_string_literal (
32500 strlen (feature_list[i].name) + 1,
32501 feature_list[i].name);
32502 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32503 predicate_chain);
32505 /* Find the maximum priority feature. */
32506 if (feature_list[i].priority > priority)
32507 priority = feature_list[i].priority;
32509 break;
32512 if (predicate_list && i == NUM_FEATURES)
32514 error_at (DECL_SOURCE_LOCATION (decl),
32515 "No dispatcher found for %s", token);
32516 return 0;
32518 token = strtok (NULL, ",");
32520 free (tok_str);
32522 if (predicate_list && predicate_chain == NULL_TREE)
32524 error_at (DECL_SOURCE_LOCATION (decl),
32525 "No dispatcher found for the versioning attributes : %s",
32526 attrs_str);
32527 return 0;
32529 else if (predicate_list)
32531 predicate_chain = nreverse (predicate_chain);
32532 *predicate_list = predicate_chain;
32535 return priority;
32538 /* This compares the priority of target features in function DECL1
32539 and DECL2. It returns positive value if DECL1 is higher priority,
32540 negative value if DECL2 is higher priority and 0 if they are the
32541 same. */
32543 static int
32544 ix86_compare_version_priority (tree decl1, tree decl2)
32546 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32547 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32549 return (int)priority1 - (int)priority2;
32552 /* V1 and V2 point to function versions with different priorities
32553 based on the target ISA. This function compares their priorities. */
32555 static int
32556 feature_compare (const void *v1, const void *v2)
32558 typedef struct _function_version_info
32560 tree version_decl;
32561 tree predicate_chain;
32562 unsigned int dispatch_priority;
32563 } function_version_info;
32565 const function_version_info c1 = *(const function_version_info *)v1;
32566 const function_version_info c2 = *(const function_version_info *)v2;
32567 return (c2.dispatch_priority - c1.dispatch_priority);
32570 /* This function generates the dispatch function for
32571 multi-versioned functions. DISPATCH_DECL is the function which will
32572 contain the dispatch logic. FNDECLS are the function choices for
32573 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
32574 in DISPATCH_DECL in which the dispatch code is generated. */
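/* Conceptually (an illustrative sketch; mangled names are shown
   schematically, assuming one non-default version compiled for
   "arch=core2"), the resolver body built here amounts to

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("core2"))
       return &foo.arch_core2;
     return &foo;

   with the default version returned last, and each test coming from the
   predicate chain produced by get_builtin_code_for_version.  */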
32576 static int
32577 dispatch_function_versions (tree dispatch_decl,
32578 void *fndecls_p,
32579 basic_block *empty_bb)
32581 tree default_decl;
32582 gimple *ifunc_cpu_init_stmt;
32583 gimple_seq gseq;
32584 int ix;
32585 tree ele;
32586 vec<tree> *fndecls;
32587 unsigned int num_versions = 0;
32588 unsigned int actual_versions = 0;
32589 unsigned int i;
32591 struct _function_version_info
32593 tree version_decl;
32594 tree predicate_chain;
32595 unsigned int dispatch_priority;
32596 }*function_version_info;
32598 gcc_assert (dispatch_decl != NULL
32599 && fndecls_p != NULL
32600 && empty_bb != NULL);
32602 /* fndecls_p is actually a vector. */
32603 fndecls = static_cast<vec<tree> *> (fndecls_p);
32605 /* At least one more version other than the default. */
32606 num_versions = fndecls->length ();
32607 gcc_assert (num_versions >= 2);
32609 function_version_info = (struct _function_version_info *)
32610 XNEWVEC (struct _function_version_info, (num_versions - 1));
32612 /* The first version in the vector is the default decl. */
32613 default_decl = (*fndecls)[0];
32615 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32617 gseq = bb_seq (*empty_bb);
32618 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32619 constructors, so explicitly call __builtin_cpu_init here. */
32620 ifunc_cpu_init_stmt = gimple_build_call_vec (
32621 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32622 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32623 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32624 set_bb_seq (*empty_bb, gseq);
32626 pop_cfun ();
32629 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32631 tree version_decl = ele;
32632 tree predicate_chain = NULL_TREE;
32633 unsigned int priority;
32634 /* Get attribute string, parse it and find the right predicate decl.
32635 The predicate function could be a lengthy combination of many
32636 features, like arch-type and various isa-variants. */
32637 priority = get_builtin_code_for_version (version_decl,
32638 &predicate_chain);
32640 if (predicate_chain == NULL_TREE)
32641 continue;
32643 function_version_info [actual_versions].version_decl = version_decl;
32644 function_version_info [actual_versions].predicate_chain
32645 = predicate_chain;
32646 function_version_info [actual_versions].dispatch_priority = priority;
32647 actual_versions++;
32650 /* Sort the versions according to descending order of dispatch priority. The
32651 priority is based on the ISA. This is not a perfect solution. There
32652 could still be ambiguity. If more than one function version is suitable
32653 to execute, which one should be dispatched? In future, allow the user
32654 to specify a dispatch priority next to the version. */
32655 qsort (function_version_info, actual_versions,
32656 sizeof (struct _function_version_info), feature_compare);
32658 for (i = 0; i < actual_versions; ++i)
32659 *empty_bb = add_condition_to_bb (dispatch_decl,
32660 function_version_info[i].version_decl,
32661 function_version_info[i].predicate_chain,
32662 *empty_bb);
32664 /* Dispatch the default version at the end. */
32665 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32666 NULL, *empty_bb);
32668 free (function_version_info);
32669 return 0;
32672 /* This function changes the assembler name for functions that are
32673 versions. If DECL is a function version and has a "target"
32674 attribute, it appends the attribute string to its assembler name. */
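/* For example (a sketch; the exact spelling comes from sorted_attr_string,
   defined earlier in this file), a version of foo declared with
   __attribute__ ((target ("arch=core2"))) is renamed here to
   "foo.arch_core2", while the "default" version keeps its original
   assembler name.  */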
32676 static tree
32677 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32679 tree version_attr;
32680 const char *orig_name, *version_string;
32681 char *attr_str, *assembler_name;
32683 if (DECL_DECLARED_INLINE_P (decl)
32684 && lookup_attribute ("gnu_inline",
32685 DECL_ATTRIBUTES (decl)))
32686 error_at (DECL_SOURCE_LOCATION (decl),
32687 "Function versions cannot be marked as gnu_inline,"
32688 " bodies have to be generated");
32690 if (DECL_VIRTUAL_P (decl)
32691 || DECL_VINDEX (decl))
32692 sorry ("Virtual function multiversioning not supported");
32694 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32696 /* target attribute string cannot be NULL. */
32697 gcc_assert (version_attr != NULL_TREE);
32699 orig_name = IDENTIFIER_POINTER (id);
32700 version_string
32701 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32703 if (strcmp (version_string, "default") == 0)
32704 return id;
32706 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32707 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32709 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32711 /* Allow assembler name to be modified if already set. */
32712 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32713 SET_DECL_RTL (decl, NULL);
32715 tree ret = get_identifier (assembler_name);
32716 XDELETEVEC (attr_str);
32717 XDELETEVEC (assembler_name);
32718 return ret;
32722 static tree
32723 ix86_mangle_decl_assembler_name (tree decl, tree id)
32725 /* For function version, add the target suffix to the assembler name. */
32726 if (TREE_CODE (decl) == FUNCTION_DECL
32727 && DECL_FUNCTION_VERSIONED (decl))
32728 id = ix86_mangle_function_version_assembler_name (decl, id);
32729 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32730 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32731 #endif
32733 return id;
32736 /* Make a dispatcher declaration for the multi-versioned function DECL.
32737 Calls to DECL function will be replaced with calls to the dispatcher
32738 by the front-end. Returns the decl of the dispatcher function. */
32740 static tree
32741 ix86_get_function_versions_dispatcher (void *decl)
32743 tree fn = (tree) decl;
32744 struct cgraph_node *node = NULL;
32745 struct cgraph_node *default_node = NULL;
32746 struct cgraph_function_version_info *node_v = NULL;
32747 struct cgraph_function_version_info *first_v = NULL;
32749 tree dispatch_decl = NULL;
32751 struct cgraph_function_version_info *default_version_info = NULL;
32753 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
32755 node = cgraph_node::get (fn);
32756 gcc_assert (node != NULL);
32758 node_v = node->function_version ();
32759 gcc_assert (node_v != NULL);
32761 if (node_v->dispatcher_resolver != NULL)
32762 return node_v->dispatcher_resolver;
32764 /* Find the default version and make it the first node. */
32765 first_v = node_v;
32766 /* Go to the beginning of the chain. */
32767 while (first_v->prev != NULL)
32768 first_v = first_v->prev;
32769 default_version_info = first_v;
32770 while (default_version_info != NULL)
32772 if (is_function_default_version
32773 (default_version_info->this_node->decl))
32774 break;
32775 default_version_info = default_version_info->next;
32778 /* If there is no default node, just return NULL. */
32779 if (default_version_info == NULL)
32780 return NULL;
32782 /* Make default info the first node. */
32783 if (first_v != default_version_info)
32785 default_version_info->prev->next = default_version_info->next;
32786 if (default_version_info->next)
32787 default_version_info->next->prev = default_version_info->prev;
32788 first_v->prev = default_version_info;
32789 default_version_info->next = first_v;
32790 default_version_info->prev = NULL;
32793 default_node = default_version_info->this_node;
32795 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32796 if (targetm.has_ifunc_p ())
32798 struct cgraph_function_version_info *it_v = NULL;
32799 struct cgraph_node *dispatcher_node = NULL;
32800 struct cgraph_function_version_info *dispatcher_version_info = NULL;
32802 /* Right now, the dispatching is done via ifunc. */
32803 dispatch_decl = make_dispatcher_decl (default_node->decl);
32805 dispatcher_node = cgraph_node::get_create (dispatch_decl);
32806 gcc_assert (dispatcher_node != NULL);
32807 dispatcher_node->dispatcher_function = 1;
32808 dispatcher_version_info
32809 = dispatcher_node->insert_new_function_version ();
32810 dispatcher_version_info->next = default_version_info;
32811 dispatcher_node->definition = 1;
32813 /* Set the dispatcher for all the versions. */
32814 it_v = default_version_info;
32815 while (it_v != NULL)
32817 it_v->dispatcher_resolver = dispatch_decl;
32818 it_v = it_v->next;
32821 else
32822 #endif
32824 error_at (DECL_SOURCE_LOCATION (default_node->decl),
32825 "multiversioning needs ifunc which is not supported "
32826 "on this target");
32829 return dispatch_decl;
32832 /* Make the resolver function decl to dispatch the versions of
32833 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
32834 ifunc alias that will point to the created resolver. Create an
32835 empty basic block in the resolver and store the pointer in
32836 EMPTY_BB. Return the decl of the resolver function. */
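/* The net effect (an illustrative sketch; the exact resolver symbol name
   comes from make_unique_name) is as if the dispatcher had been declared as

     int foo (void) __attribute__ ((ifunc ("foo.resolver")));

   with the resolver built below returning the address of the version
   selected at load time.  */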
32838 static tree
32839 make_resolver_func (const tree default_decl,
32840 const tree ifunc_alias_decl,
32841 basic_block *empty_bb)
32843 char *resolver_name;
32844 tree decl, type, decl_name, t;
32846 /* IFUNCs have to be globally visible. So, if the default_decl is
32847 not, then the name of the IFUNC should be made unique. */
32848 if (TREE_PUBLIC (default_decl) == 0)
32850 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
32851 symtab->change_decl_assembler_name (ifunc_alias_decl,
32852 get_identifier (ifunc_name));
32853 XDELETEVEC (ifunc_name);
32856 resolver_name = make_unique_name (default_decl, "resolver", false);
32858 /* The resolver function should return a (void *). */
32859 type = build_function_type_list (ptr_type_node, NULL_TREE);
32861 decl = build_fn_decl (resolver_name, type);
32862 decl_name = get_identifier (resolver_name);
32863 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
32865 DECL_NAME (decl) = decl_name;
32866 TREE_USED (decl) = 1;
32867 DECL_ARTIFICIAL (decl) = 1;
32868 DECL_IGNORED_P (decl) = 1;
32869 TREE_PUBLIC (decl) = 0;
32870 DECL_UNINLINABLE (decl) = 1;
32872 /* Resolver is not external, body is generated. */
32873 DECL_EXTERNAL (decl) = 0;
32874 DECL_EXTERNAL (ifunc_alias_decl) = 0;
32876 DECL_CONTEXT (decl) = NULL_TREE;
32877 DECL_INITIAL (decl) = make_node (BLOCK);
32878 DECL_STATIC_CONSTRUCTOR (decl) = 0;
32880 if (DECL_COMDAT_GROUP (default_decl)
32881 || TREE_PUBLIC (default_decl))
32883 /* In this case, each translation unit with a call to this
32884 versioned function will put out a resolver. Ensure it
32885 is comdat to keep just one copy. */
32886 DECL_COMDAT (decl) = 1;
32887 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
32889 /* Build result decl and add to function_decl. */
32890 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
32891 DECL_ARTIFICIAL (t) = 1;
32892 DECL_IGNORED_P (t) = 1;
32893 DECL_RESULT (decl) = t;
32895 gimplify_function_tree (decl);
32896 push_cfun (DECL_STRUCT_FUNCTION (decl));
32897 *empty_bb = init_lowered_empty_function (decl, false,
32898 profile_count::uninitialized ());
32900 cgraph_node::add_new_function (decl, true);
32901 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
32903 pop_cfun ();
32905 gcc_assert (ifunc_alias_decl != NULL);
32906 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
32907 DECL_ATTRIBUTES (ifunc_alias_decl)
32908 = make_attribute ("ifunc", resolver_name,
32909 DECL_ATTRIBUTES (ifunc_alias_decl));
32911 /* Create the alias for dispatch to resolver here. */
32912 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
32913 XDELETEVEC (resolver_name);
32914 return decl;
32917 /* Generate the dispatching code body to dispatch multi-versioned function
32918 DECL. The target hook is called to process the "target" attributes and
32919 provide the code to dispatch the right function at run-time. NODE points
32920 to the dispatcher decl whose body will be created. */
32922 static tree
32923 ix86_generate_version_dispatcher_body (void *node_p)
32925 tree resolver_decl;
32926 basic_block empty_bb;
32927 tree default_ver_decl;
32928 struct cgraph_node *versn;
32929 struct cgraph_node *node;
32931 struct cgraph_function_version_info *node_version_info = NULL;
32932 struct cgraph_function_version_info *versn_info = NULL;
32934 node = (cgraph_node *)node_p;
32936 node_version_info = node->function_version ();
32937 gcc_assert (node->dispatcher_function
32938 && node_version_info != NULL);
32940 if (node_version_info->dispatcher_resolver)
32941 return node_version_info->dispatcher_resolver;
32943 /* The first version in the chain corresponds to the default version. */
32944 default_ver_decl = node_version_info->next->this_node->decl;
32946 /* node is going to be an alias, so remove the finalized bit. */
32947 node->definition = false;
32949 resolver_decl = make_resolver_func (default_ver_decl,
32950 node->decl, &empty_bb);
32952 node_version_info->dispatcher_resolver = resolver_decl;
32954 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
32956 auto_vec<tree, 2> fn_ver_vec;
32958 for (versn_info = node_version_info->next; versn_info;
32959 versn_info = versn_info->next)
32961 versn = versn_info->this_node;
32962 /* Check for virtual functions here again, as by this time it should
32963 have been determined if this function needs a vtable index or
32964 not. This happens for methods in derived classes that override
32965 virtual methods in base classes but are not explicitly marked as
32966 virtual. */
32967 if (DECL_VINDEX (versn->decl))
32968 sorry ("Virtual function multiversioning not supported");
32970 fn_ver_vec.safe_push (versn->decl);
32973 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
32974 cgraph_edge::rebuild_edges ();
32975 pop_cfun ();
32976 return resolver_decl;
32978 /* This builds the processor_model struct type defined in
32979 libgcc/config/i386/cpuinfo.c */
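/* The layout mirrored here (see libgcc/config/i386/cpuinfo.c) is

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */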
32981 static tree
32982 build_processor_model_struct (void)
32984 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
32985 "__cpu_features"};
32986 tree field = NULL_TREE, field_chain = NULL_TREE;
32987 int i;
32988 tree type = make_node (RECORD_TYPE);
32990 /* The first 3 fields are unsigned int. */
32991 for (i = 0; i < 3; ++i)
32993 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32994 get_identifier (field_name[i]), unsigned_type_node);
32995 if (field_chain != NULL_TREE)
32996 DECL_CHAIN (field) = field_chain;
32997 field_chain = field;
33000 /* The last field is an array of unsigned integers of size one. */
33001 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
33002 get_identifier (field_name[3]),
33003 build_array_type (unsigned_type_node,
33004 build_index_type (size_one_node)));
33005 if (field_chain != NULL_TREE)
33006 DECL_CHAIN (field) = field_chain;
33007 field_chain = field;
33009 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
33010 return type;
33013 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
33015 static tree
33016 make_var_decl (tree type, const char *name)
33018 tree new_decl;
33020 new_decl = build_decl (UNKNOWN_LOCATION,
33021 VAR_DECL,
33022 get_identifier(name),
33023 type);
33025 DECL_EXTERNAL (new_decl) = 1;
33026 TREE_STATIC (new_decl) = 1;
33027 TREE_PUBLIC (new_decl) = 1;
33028 DECL_INITIAL (new_decl) = 0;
33029 DECL_ARTIFICIAL (new_decl) = 0;
33030 DECL_PRESERVE_P (new_decl) = 1;
33032 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
33033 assemble_variable (new_decl, 0, 0, 0);
33035 return new_decl;
33038 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
33039 into an integer defined in libgcc/config/i386/cpuinfo.c */
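/* For instance (a sketch of the folded form), a call

     __builtin_cpu_supports ("avx2")

   is folded below into the equivalent of

     (int) (__cpu_model.__cpu_features[0] & (1U << F_AVX2))

   and __builtin_cpu_is ("amd") into

     (int) (__cpu_model.__cpu_vendor == M_AMD).  */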
33041 static tree
33042 fold_builtin_cpu (tree fndecl, tree *args)
33044 unsigned int i;
33045 enum ix86_builtins fn_code = (enum ix86_builtins)
33046 DECL_FUNCTION_CODE (fndecl);
33047 tree param_string_cst = NULL;
33049 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
33050 enum processor_features
33052 F_CMOV = 0,
33053 F_MMX,
33054 F_POPCNT,
33055 F_SSE,
33056 F_SSE2,
33057 F_SSE3,
33058 F_SSSE3,
33059 F_SSE4_1,
33060 F_SSE4_2,
33061 F_AVX,
33062 F_AVX2,
33063 F_SSE4_A,
33064 F_FMA4,
33065 F_XOP,
33066 F_FMA,
33067 F_AVX512F,
33068 F_BMI,
33069 F_BMI2,
33070 F_AES,
33071 F_PCLMUL,
33072 F_AVX512VL,
33073 F_AVX512BW,
33074 F_AVX512DQ,
33075 F_AVX512CD,
33076 F_AVX512ER,
33077 F_AVX512PF,
33078 F_AVX512VBMI,
33079 F_AVX512IFMA,
33080 F_AVX5124VNNIW,
33081 F_AVX5124FMAPS,
33082 F_AVX512VPOPCNTDQ,
33083 F_AVX512VBMI2,
33084 F_GFNI,
33085 F_VPCLMULQDQ,
33086 F_AVX512VNNI,
33087 F_AVX512BITALG,
33088 F_MAX
33091 /* These are the values for vendor types and CPU types and subtypes
33092 in cpuinfo.c.  CPU type and subtype values must have the
33093 corresponding start value subtracted before use. */
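/* For example, for "corei7" the test built below is
     __cpu_model.__cpu_type == M_INTEL_COREI7 - M_CPU_TYPE_START
   and for "bdver2" it is
     __cpu_model.__cpu_subtype == M_AMDFAM15H_BDVER2 - M_CPU_SUBTYPE_START.  */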
33094 enum processor_model
33096 M_INTEL = 1,
33097 M_AMD,
33098 M_CPU_TYPE_START,
33099 M_INTEL_BONNELL,
33100 M_INTEL_CORE2,
33101 M_INTEL_COREI7,
33102 M_AMDFAM10H,
33103 M_AMDFAM15H,
33104 M_INTEL_SILVERMONT,
33105 M_INTEL_KNL,
33106 M_AMD_BTVER1,
33107 M_AMD_BTVER2,
33108 M_AMDFAM17H,
33109 M_INTEL_KNM,
33110 M_CPU_SUBTYPE_START,
33111 M_INTEL_COREI7_NEHALEM,
33112 M_INTEL_COREI7_WESTMERE,
33113 M_INTEL_COREI7_SANDYBRIDGE,
33114 M_AMDFAM10H_BARCELONA,
33115 M_AMDFAM10H_SHANGHAI,
33116 M_AMDFAM10H_ISTANBUL,
33117 M_AMDFAM15H_BDVER1,
33118 M_AMDFAM15H_BDVER2,
33119 M_AMDFAM15H_BDVER3,
33120 M_AMDFAM15H_BDVER4,
33121 M_AMDFAM17H_ZNVER1,
33122 M_INTEL_COREI7_IVYBRIDGE,
33123 M_INTEL_COREI7_HASWELL,
33124 M_INTEL_COREI7_BROADWELL,
33125 M_INTEL_COREI7_SKYLAKE,
33126 M_INTEL_COREI7_SKYLAKE_AVX512,
33127 M_INTEL_COREI7_CANNONLAKE,
33128 M_INTEL_COREI7_ICELAKE_CLIENT,
33129 M_INTEL_COREI7_ICELAKE_SERVER
33132 static struct _arch_names_table
33134 const char *const name;
33135 const enum processor_model model;
33137 const arch_names_table[] =
33139 {"amd", M_AMD},
33140 {"intel", M_INTEL},
33141 {"atom", M_INTEL_BONNELL},
33142 {"slm", M_INTEL_SILVERMONT},
33143 {"core2", M_INTEL_CORE2},
33144 {"corei7", M_INTEL_COREI7},
33145 {"nehalem", M_INTEL_COREI7_NEHALEM},
33146 {"westmere", M_INTEL_COREI7_WESTMERE},
33147 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
33148 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
33149 {"haswell", M_INTEL_COREI7_HASWELL},
33150 {"broadwell", M_INTEL_COREI7_BROADWELL},
33151 {"skylake", M_INTEL_COREI7_SKYLAKE},
33152 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
33153 {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
33154 {"icelake-client", M_INTEL_COREI7_ICELAKE_CLIENT},
33155 {"icelake-server", M_INTEL_COREI7_ICELAKE_SERVER},
33156 {"bonnell", M_INTEL_BONNELL},
33157 {"silvermont", M_INTEL_SILVERMONT},
33158 {"knl", M_INTEL_KNL},
33159 {"knm", M_INTEL_KNM},
33160 {"amdfam10h", M_AMDFAM10H},
33161 {"barcelona", M_AMDFAM10H_BARCELONA},
33162 {"shanghai", M_AMDFAM10H_SHANGHAI},
33163 {"istanbul", M_AMDFAM10H_ISTANBUL},
33164 {"btver1", M_AMD_BTVER1},
33165 {"amdfam15h", M_AMDFAM15H},
33166 {"bdver1", M_AMDFAM15H_BDVER1},
33167 {"bdver2", M_AMDFAM15H_BDVER2},
33168 {"bdver3", M_AMDFAM15H_BDVER3},
33169 {"bdver4", M_AMDFAM15H_BDVER4},
33170 {"btver2", M_AMD_BTVER2},
33171 {"amdfam17h", M_AMDFAM17H},
33172 {"znver1", M_AMDFAM17H_ZNVER1},
33175 static struct _isa_names_table
33177 const char *const name;
33178 const enum processor_features feature;
33180 const isa_names_table[] =
33182 {"cmov", F_CMOV},
33183 {"mmx", F_MMX},
33184 {"popcnt", F_POPCNT},
33185 {"sse", F_SSE},
33186 {"sse2", F_SSE2},
33187 {"sse3", F_SSE3},
33188 {"ssse3", F_SSSE3},
33189 {"sse4a", F_SSE4_A},
33190 {"sse4.1", F_SSE4_1},
33191 {"sse4.2", F_SSE4_2},
33192 {"avx", F_AVX},
33193 {"fma4", F_FMA4},
33194 {"xop", F_XOP},
33195 {"fma", F_FMA},
33196 {"avx2", F_AVX2},
33197 {"avx512f", F_AVX512F},
33198 {"bmi", F_BMI},
33199 {"bmi2", F_BMI2},
33200 {"aes", F_AES},
33201 {"pclmul", F_PCLMUL},
33202 {"avx512vl",F_AVX512VL},
33203 {"avx512bw",F_AVX512BW},
33204 {"avx512dq",F_AVX512DQ},
33205 {"avx512cd",F_AVX512CD},
33206 {"avx512er",F_AVX512ER},
33207 {"avx512pf",F_AVX512PF},
33208 {"avx512vbmi",F_AVX512VBMI},
33209 {"avx512ifma",F_AVX512IFMA},
33210 {"avx5124vnniw",F_AVX5124VNNIW},
33211 {"avx5124fmaps",F_AVX5124FMAPS},
33212 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ},
33213 {"avx512vbmi2", F_AVX512VBMI2},
33214 {"gfni", F_GFNI},
33215 {"vpclmulqdq", F_VPCLMULQDQ},
33216 {"avx512vnni", F_AVX512VNNI},
33217 {"avx512bitalg", F_AVX512BITALG}
33220 tree __processor_model_type = build_processor_model_struct ();
33221 tree __cpu_model_var = make_var_decl (__processor_model_type,
33222 "__cpu_model");
33225 varpool_node::add (__cpu_model_var);
33227 gcc_assert ((args != NULL) && (*args != NULL));
33229 param_string_cst = *args;
33230 while (param_string_cst
33231 && TREE_CODE (param_string_cst) != STRING_CST)
33233 /* *args must be an expr that can contain other EXPRS leading to a
33234 STRING_CST. */
33235 if (!EXPR_P (param_string_cst))
33237 error ("Parameter to builtin must be a string constant or literal");
33238 return integer_zero_node;
33240 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
33243 gcc_assert (param_string_cst);
33245 if (fn_code == IX86_BUILTIN_CPU_IS)
33247 tree ref;
33248 tree field;
33249 tree final;
33251 unsigned int field_val = 0;
33252 unsigned int NUM_ARCH_NAMES
33253 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33255 for (i = 0; i < NUM_ARCH_NAMES; i++)
33256 if (strcmp (arch_names_table[i].name,
33257 TREE_STRING_POINTER (param_string_cst)) == 0)
33258 break;
33260 if (i == NUM_ARCH_NAMES)
33262 error ("Parameter to builtin not valid: %s",
33263 TREE_STRING_POINTER (param_string_cst));
33264 return integer_zero_node;
33267 field = TYPE_FIELDS (__processor_model_type);
33268 field_val = arch_names_table[i].model;
33270 /* CPU types are stored in the next field. */
33271 if (field_val > M_CPU_TYPE_START
33272 && field_val < M_CPU_SUBTYPE_START)
33274 field = DECL_CHAIN (field);
33275 field_val -= M_CPU_TYPE_START;
33278 /* CPU subtypes are stored in the next field. */
33279 if (field_val > M_CPU_SUBTYPE_START)
33281 field = DECL_CHAIN ( DECL_CHAIN (field));
33282 field_val -= M_CPU_SUBTYPE_START;
33285 /* Get the appropriate field in __cpu_model. */
33286 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33287 field, NULL_TREE);
33289 /* Check the value. */
33290 final = build2 (EQ_EXPR, unsigned_type_node, ref,
33291 build_int_cstu (unsigned_type_node, field_val));
33292 return build1 (CONVERT_EXPR, integer_type_node, final);
33294 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33296 tree ref;
33297 tree array_elt;
33298 tree field;
33299 tree final;
33301 unsigned int field_val = 0;
33302 unsigned int NUM_ISA_NAMES
33303 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33305 for (i = 0; i < NUM_ISA_NAMES; i++)
33306 if (strcmp (isa_names_table[i].name,
33307 TREE_STRING_POINTER (param_string_cst)) == 0)
33308 break;
33310 if (i == NUM_ISA_NAMES)
33312 error ("Parameter to builtin not valid: %s",
33313 TREE_STRING_POINTER (param_string_cst));
33314 return integer_zero_node;
33317 if (isa_names_table[i].feature >= 32)
33319 tree __cpu_features2_var = make_var_decl (unsigned_type_node,
33320 "__cpu_features2");
33322 varpool_node::add (__cpu_features2_var);
33323 field_val = (1U << (isa_names_table[i].feature - 32));
33324 /* Return __cpu_features2 & field_val */
33325 final = build2 (BIT_AND_EXPR, unsigned_type_node,
33326 __cpu_features2_var,
33327 build_int_cstu (unsigned_type_node, field_val));
33328 return build1 (CONVERT_EXPR, integer_type_node, final);
33331 field = TYPE_FIELDS (__processor_model_type);
33332 /* Get the last field, which is __cpu_features. */
33333 while (DECL_CHAIN (field))
33334 field = DECL_CHAIN (field);
33336 /* Get the appropriate field: __cpu_model.__cpu_features */
33337 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33338 field, NULL_TREE);
33340 /* Access the 0th element of __cpu_features array. */
33341 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33342 integer_zero_node, NULL_TREE, NULL_TREE);
33344 field_val = (1U << isa_names_table[i].feature);
33345 /* Return __cpu_model.__cpu_features[0] & field_val */
33346 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33347 build_int_cstu (unsigned_type_node, field_val));
33348 return build1 (CONVERT_EXPR, integer_type_node, final);
33350 gcc_unreachable ();
33353 static tree
33354 ix86_fold_builtin (tree fndecl, int n_args,
33355 tree *args, bool ignore ATTRIBUTE_UNUSED)
33357 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33359 enum ix86_builtins fn_code = (enum ix86_builtins)
33360 DECL_FUNCTION_CODE (fndecl);
33361 switch (fn_code)
33363 case IX86_BUILTIN_CPU_IS:
33364 case IX86_BUILTIN_CPU_SUPPORTS:
33365 gcc_assert (n_args == 1);
33366 return fold_builtin_cpu (fndecl, args);
33368 case IX86_BUILTIN_NANQ:
33369 case IX86_BUILTIN_NANSQ:
33371 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33372 const char *str = c_getstr (*args);
33373 int quiet = fn_code == IX86_BUILTIN_NANQ;
33374 REAL_VALUE_TYPE real;
33376 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33377 return build_real (type, real);
33378 return NULL_TREE;
33381 case IX86_BUILTIN_INFQ:
33382 case IX86_BUILTIN_HUGE_VALQ:
33384 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33385 REAL_VALUE_TYPE inf;
33386 real_inf (&inf);
33387 return build_real (type, inf);
33390 case IX86_BUILTIN_TZCNT16:
33391 case IX86_BUILTIN_CTZS:
33392 case IX86_BUILTIN_TZCNT32:
33393 case IX86_BUILTIN_TZCNT64:
33394 gcc_assert (n_args == 1);
33395 if (TREE_CODE (args[0]) == INTEGER_CST)
33397 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33398 tree arg = args[0];
33399 if (fn_code == IX86_BUILTIN_TZCNT16
33400 || fn_code == IX86_BUILTIN_CTZS)
33401 arg = fold_convert (short_unsigned_type_node, arg);
33402 if (integer_zerop (arg))
33403 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33404 else
33405 return fold_const_call (CFN_CTZ, type, arg);
33407 break;
33409 case IX86_BUILTIN_LZCNT16:
33410 case IX86_BUILTIN_CLZS:
33411 case IX86_BUILTIN_LZCNT32:
33412 case IX86_BUILTIN_LZCNT64:
33413 gcc_assert (n_args == 1);
33414 if (TREE_CODE (args[0]) == INTEGER_CST)
33416 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33417 tree arg = args[0];
33418 if (fn_code == IX86_BUILTIN_LZCNT16
33419 || fn_code == IX86_BUILTIN_CLZS)
33420 arg = fold_convert (short_unsigned_type_node, arg);
33421 if (integer_zerop (arg))
33422 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33423 else
33424 return fold_const_call (CFN_CLZ, type, arg);
33426 break;
33428 case IX86_BUILTIN_BEXTR32:
33429 case IX86_BUILTIN_BEXTR64:
33430 case IX86_BUILTIN_BEXTRI32:
33431 case IX86_BUILTIN_BEXTRI64:
33432 gcc_assert (n_args == 2);
33433 if (tree_fits_uhwi_p (args[1]))
33435 unsigned HOST_WIDE_INT res = 0;
33436 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
33437 unsigned int start = tree_to_uhwi (args[1]);
33438 unsigned int len = (start & 0xff00) >> 8;
33439 start &= 0xff;
33440 if (start >= prec || len == 0)
33441 res = 0;
33442 else if (!tree_fits_uhwi_p (args[0]))
33443 break;
33444 else
33445 res = tree_to_uhwi (args[0]) >> start;
33446 if (len > prec)
33447 len = prec;
33448 if (len < HOST_BITS_PER_WIDE_INT)
33449 res &= (HOST_WIDE_INT_1U << len) - 1;
33450 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33452 break;
33454 case IX86_BUILTIN_BZHI32:
33455 case IX86_BUILTIN_BZHI64:
33456 gcc_assert (n_args == 2);
33457 if (tree_fits_uhwi_p (args[1]))
33459 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
33460 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
33461 return args[0];
33462 if (!tree_fits_uhwi_p (args[0]))
33463 break;
33464 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
33465 res &= ~(HOST_WIDE_INT_M1U << idx);
33466 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33468 break;
33470 case IX86_BUILTIN_PDEP32:
33471 case IX86_BUILTIN_PDEP64:
33472 gcc_assert (n_args == 2);
33473 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33475 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33476 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33477 unsigned HOST_WIDE_INT res = 0;
33478 unsigned HOST_WIDE_INT m, k = 1;
33479 for (m = 1; m; m <<= 1)
33480 if ((mask & m) != 0)
33482 if ((src & k) != 0)
33483 res |= m;
33484 k <<= 1;
33486 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33488 break;
33490 case IX86_BUILTIN_PEXT32:
33491 case IX86_BUILTIN_PEXT64:
33492 gcc_assert (n_args == 2);
33493 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33495 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33496 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33497 unsigned HOST_WIDE_INT res = 0;
33498 unsigned HOST_WIDE_INT m, k = 1;
33499 for (m = 1; m; m <<= 1)
33500 if ((mask & m) != 0)
33502 if ((src & m) != 0)
33503 res |= k;
33504 k <<= 1;
33506 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33508 break;
33510 default:
33511 break;
33515 #ifdef SUBTARGET_FOLD_BUILTIN
33516 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33517 #endif
33519 return NULL_TREE;
33522 /* Fold a MD builtin (use ix86_fold_builtin for folding into
33523 constant) in GIMPLE. */
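/* For example (a sketch of the rewrite done below), when x_2 is known to be
   non-zero a call

     _1 = __builtin_ia32_tzcnt_u32 (x_2);

   is replaced by the generic

     _3 = __builtin_ctz (x_2);
     _1 = (unsigned int) _3;

   which the middle-end understands and can optimize further.  */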
33525 bool
33526 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
33528 gimple *stmt = gsi_stmt (*gsi);
33529 tree fndecl = gimple_call_fndecl (stmt);
33530 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
33531 int n_args = gimple_call_num_args (stmt);
33532 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
33533 tree decl = NULL_TREE;
33534 tree arg0, arg1;
33536 switch (fn_code)
33538 case IX86_BUILTIN_TZCNT32:
33539 decl = builtin_decl_implicit (BUILT_IN_CTZ);
33540 goto fold_tzcnt_lzcnt;
33542 case IX86_BUILTIN_TZCNT64:
33543 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
33544 goto fold_tzcnt_lzcnt;
33546 case IX86_BUILTIN_LZCNT32:
33547 decl = builtin_decl_implicit (BUILT_IN_CLZ);
33548 goto fold_tzcnt_lzcnt;
33550 case IX86_BUILTIN_LZCNT64:
33551 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
33552 goto fold_tzcnt_lzcnt;
33554 fold_tzcnt_lzcnt:
33555 gcc_assert (n_args == 1);
33556 arg0 = gimple_call_arg (stmt, 0);
33557 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
33559 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
33560 /* If arg0 is provably non-zero, optimize into the generic
33561 __builtin_c[tl]z{,ll} function, which the middle-end handles
33562 better. */
33563 if (!expr_not_equal_to (arg0, wi::zero (prec)))
33564 return false;
33566 location_t loc = gimple_location (stmt);
33567 gimple *g = gimple_build_call (decl, 1, arg0);
33568 gimple_set_location (g, loc);
33569 tree lhs = make_ssa_name (integer_type_node);
33570 gimple_call_set_lhs (g, lhs);
33571 gsi_insert_before (gsi, g, GSI_SAME_STMT);
33572 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
33573 gimple_set_location (g, loc);
33574 gsi_replace (gsi, g, false);
33575 return true;
33577 break;
33579 case IX86_BUILTIN_BZHI32:
33580 case IX86_BUILTIN_BZHI64:
33581 gcc_assert (n_args == 2);
33582 arg1 = gimple_call_arg (stmt, 1);
33583 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
33585 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
33586 arg0 = gimple_call_arg (stmt, 0);
33587 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
33588 break;
33589 location_t loc = gimple_location (stmt);
33590 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33591 gimple_set_location (g, loc);
33592 gsi_replace (gsi, g, false);
33593 return true;
33595 break;
33597 case IX86_BUILTIN_PDEP32:
33598 case IX86_BUILTIN_PDEP64:
33599 case IX86_BUILTIN_PEXT32:
33600 case IX86_BUILTIN_PEXT64:
33601 gcc_assert (n_args == 2);
33602 arg1 = gimple_call_arg (stmt, 1);
33603 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
33605 location_t loc = gimple_location (stmt);
33606 arg0 = gimple_call_arg (stmt, 0);
33607 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33608 gimple_set_location (g, loc);
33609 gsi_replace (gsi, g, false);
33610 return true;
33612 break;
33614 default:
33615 break;
33618 return false;
33621 /* Make builtins to detect cpu type and features supported. NAME is
33622 the builtin name, CODE is the builtin code, and FTYPE is the function
33623 type of the builtin. */
33625 static void
33626 make_cpu_type_builtin (const char* name, int code,
33627 enum ix86_builtin_func_type ftype, bool is_const)
33629 tree decl;
33630 tree type;
33632 type = ix86_get_builtin_func_type (ftype);
33633 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33634 NULL, NULL_TREE);
33635 gcc_assert (decl != NULL_TREE);
33636 ix86_builtins[(int) code] = decl;
33637 TREE_READONLY (decl) = is_const;
33640 /* Make builtins to get CPU type and features supported. The created
33641 builtins are:
33643 __builtin_cpu_init (), to detect cpu type and features,
33644 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33645 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
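/* A minimal user-level sketch of these builtins:

     int
     have_avx2 (void)
     {
       __builtin_cpu_init ();
       return __builtin_cpu_is ("intel") && __builtin_cpu_supports ("avx2");
     }
*/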
33648 static void
33649 ix86_init_platform_type_builtins (void)
33651 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33652 INT_FTYPE_VOID, false);
33653 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33654 INT_FTYPE_PCCHAR, true);
33655 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33656 INT_FTYPE_PCCHAR, true);
33659 /* Internal method for ix86_init_builtins. */
33661 static void
33662 ix86_init_builtins_va_builtins_abi (void)
33664 tree ms_va_ref, sysv_va_ref;
33665 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33666 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33667 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33668 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33670 if (!TARGET_64BIT)
33671 return;
33672 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33673 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33674 ms_va_ref = build_reference_type (ms_va_list_type_node);
33675 sysv_va_ref =
33676 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33678 fnvoid_va_end_ms =
33679 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33680 fnvoid_va_start_ms =
33681 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33682 fnvoid_va_end_sysv =
33683 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33684 fnvoid_va_start_sysv =
33685 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33686 NULL_TREE);
33687 fnvoid_va_copy_ms =
33688 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33689 NULL_TREE);
33690 fnvoid_va_copy_sysv =
33691 build_function_type_list (void_type_node, sysv_va_ref,
33692 sysv_va_ref, NULL_TREE);
33694 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33695 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33696 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33697 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33698 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33699 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33700 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33701 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33702 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33703 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33704 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33705 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33708 static void
33709 ix86_init_builtin_types (void)
33711 tree float80_type_node, const_string_type_node;
33713 /* The __float80 type. */
33714 float80_type_node = long_double_type_node;
33715 if (TYPE_MODE (float80_type_node) != XFmode)
33717 if (float64x_type_node != NULL_TREE
33718 && TYPE_MODE (float64x_type_node) == XFmode)
33719 float80_type_node = float64x_type_node;
33720 else
33722 /* The __float80 type. */
33723 float80_type_node = make_node (REAL_TYPE);
33725 TYPE_PRECISION (float80_type_node) = 80;
33726 layout_type (float80_type_node);
33729 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
33731 /* The __float128 type. The node has already been created as
33732 _Float128, so we only need to register the __float128 name for
33733 it. */
33734 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
33736 const_string_type_node
33737 = build_pointer_type (build_qualified_type
33738 (char_type_node, TYPE_QUAL_CONST));
33740 /* This macro is built by i386-builtin-types.awk. */
33741 DEFINE_BUILTIN_PRIMITIVE_TYPES;
33744 static void
33745 ix86_init_builtins (void)
33747 tree ftype, decl;
33749 ix86_init_builtin_types ();
33751 /* Builtins to get CPU type and features. */
33752 ix86_init_platform_type_builtins ();
33754 /* TFmode support builtins. */
33755 def_builtin_const (0, "__builtin_infq",
33756 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
33757 def_builtin_const (0, "__builtin_huge_valq",
33758 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
33760 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
33761 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
33762 BUILT_IN_MD, "nanq", NULL_TREE);
33763 TREE_READONLY (decl) = 1;
33764 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
33766 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
33767 BUILT_IN_MD, "nansq", NULL_TREE);
33768 TREE_READONLY (decl) = 1;
33769 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
33771 /* We will expand them to normal calls if SSE isn't available since
33772 they are used by libgcc. */
33773 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
33774 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
33775 BUILT_IN_MD, "__fabstf2", NULL_TREE);
33776 TREE_READONLY (decl) = 1;
33777 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
33779 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
33780 decl = add_builtin_function ("__builtin_copysignq", ftype,
33781 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
33782 "__copysigntf3", NULL_TREE);
33783 TREE_READONLY (decl) = 1;
33784 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
33786 ix86_init_tm_builtins ();
33787 ix86_init_mmx_sse_builtins ();
33788 ix86_init_mpx_builtins ();
33790 if (TARGET_LP64)
33791 ix86_init_builtins_va_builtins_abi ();
33793 #ifdef SUBTARGET_INIT_BUILTINS
33794 SUBTARGET_INIT_BUILTINS;
33795 #endif
33798 /* Return the ix86 builtin for CODE. */
33800 static tree
33801 ix86_builtin_decl (unsigned code, bool)
33803 if (code >= IX86_BUILTIN_MAX)
33804 return error_mark_node;
33806 return ix86_builtins[code];
33809 /* Errors in the source file can cause expand_expr to return const0_rtx
33810 where we expect a vector. To avoid crashing, use one of the vector
33811 clear instructions. */
33812 static rtx
33813 safe_vector_operand (rtx x, machine_mode mode)
33815 if (x == const0_rtx)
33816 x = CONST0_RTX (mode);
33817 return x;
33820 /* Fix up modeless constants to fit the required mode. */
33821 static rtx
33822 fixup_modeless_constant (rtx x, machine_mode mode)
33824 if (GET_MODE (x) == VOIDmode)
33825 x = convert_to_mode (mode, x, 1);
33826 return x;
33829 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
33831 static rtx
33832 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
33834 rtx pat;
33835 tree arg0 = CALL_EXPR_ARG (exp, 0);
33836 tree arg1 = CALL_EXPR_ARG (exp, 1);
33837 rtx op0 = expand_normal (arg0);
33838 rtx op1 = expand_normal (arg1);
33839 machine_mode tmode = insn_data[icode].operand[0].mode;
33840 machine_mode mode0 = insn_data[icode].operand[1].mode;
33841 machine_mode mode1 = insn_data[icode].operand[2].mode;
33843 if (VECTOR_MODE_P (mode0))
33844 op0 = safe_vector_operand (op0, mode0);
33845 if (VECTOR_MODE_P (mode1))
33846 op1 = safe_vector_operand (op1, mode1);
33848 if (optimize || !target
33849 || GET_MODE (target) != tmode
33850 || !insn_data[icode].operand[0].predicate (target, tmode))
33851 target = gen_reg_rtx (tmode);
33853 if (GET_MODE (op1) == SImode && mode1 == TImode)
33855 rtx x = gen_reg_rtx (V4SImode);
33856 emit_insn (gen_sse2_loadd (x, op1));
33857 op1 = gen_lowpart (TImode, x);
33860 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33861 op0 = copy_to_mode_reg (mode0, op0);
33862 if (!insn_data[icode].operand[2].predicate (op1, mode1))
33863 op1 = copy_to_mode_reg (mode1, op1);
33865 pat = GEN_FCN (icode) (target, op0, op1);
33866 if (! pat)
33867 return 0;
33869 emit_insn (pat);
33871 return target;
33874 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
33876 static rtx
33877 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
33878 enum ix86_builtin_func_type m_type,
33879 enum rtx_code sub_code)
33881 rtx pat;
33882 int i;
33883 int nargs;
33884 bool comparison_p = false;
33885 bool tf_p = false;
33886 bool last_arg_constant = false;
33887 int num_memory = 0;
33888 struct {
33889 rtx op;
33890 machine_mode mode;
33891 } args[4];
33893 machine_mode tmode = insn_data[icode].operand[0].mode;
33895 switch (m_type)
33897 case MULTI_ARG_4_DF2_DI_I:
33898 case MULTI_ARG_4_DF2_DI_I1:
33899 case MULTI_ARG_4_SF2_SI_I:
33900 case MULTI_ARG_4_SF2_SI_I1:
33901 nargs = 4;
33902 last_arg_constant = true;
33903 break;
33905 case MULTI_ARG_3_SF:
33906 case MULTI_ARG_3_DF:
33907 case MULTI_ARG_3_SF2:
33908 case MULTI_ARG_3_DF2:
33909 case MULTI_ARG_3_DI:
33910 case MULTI_ARG_3_SI:
33911 case MULTI_ARG_3_SI_DI:
33912 case MULTI_ARG_3_HI:
33913 case MULTI_ARG_3_HI_SI:
33914 case MULTI_ARG_3_QI:
33915 case MULTI_ARG_3_DI2:
33916 case MULTI_ARG_3_SI2:
33917 case MULTI_ARG_3_HI2:
33918 case MULTI_ARG_3_QI2:
33919 nargs = 3;
33920 break;
33922 case MULTI_ARG_2_SF:
33923 case MULTI_ARG_2_DF:
33924 case MULTI_ARG_2_DI:
33925 case MULTI_ARG_2_SI:
33926 case MULTI_ARG_2_HI:
33927 case MULTI_ARG_2_QI:
33928 nargs = 2;
33929 break;
33931 case MULTI_ARG_2_DI_IMM:
33932 case MULTI_ARG_2_SI_IMM:
33933 case MULTI_ARG_2_HI_IMM:
33934 case MULTI_ARG_2_QI_IMM:
33935 nargs = 2;
33936 last_arg_constant = true;
33937 break;
33939 case MULTI_ARG_1_SF:
33940 case MULTI_ARG_1_DF:
33941 case MULTI_ARG_1_SF2:
33942 case MULTI_ARG_1_DF2:
33943 case MULTI_ARG_1_DI:
33944 case MULTI_ARG_1_SI:
33945 case MULTI_ARG_1_HI:
33946 case MULTI_ARG_1_QI:
33947 case MULTI_ARG_1_SI_DI:
33948 case MULTI_ARG_1_HI_DI:
33949 case MULTI_ARG_1_HI_SI:
33950 case MULTI_ARG_1_QI_DI:
33951 case MULTI_ARG_1_QI_SI:
33952 case MULTI_ARG_1_QI_HI:
33953 nargs = 1;
33954 break;
33956 case MULTI_ARG_2_DI_CMP:
33957 case MULTI_ARG_2_SI_CMP:
33958 case MULTI_ARG_2_HI_CMP:
33959 case MULTI_ARG_2_QI_CMP:
33960 nargs = 2;
33961 comparison_p = true;
33962 break;
33964 case MULTI_ARG_2_SF_TF:
33965 case MULTI_ARG_2_DF_TF:
33966 case MULTI_ARG_2_DI_TF:
33967 case MULTI_ARG_2_SI_TF:
33968 case MULTI_ARG_2_HI_TF:
33969 case MULTI_ARG_2_QI_TF:
33970 nargs = 2;
33971 tf_p = true;
33972 break;
33974 default:
33975 gcc_unreachable ();
33978 if (optimize || !target
33979 || GET_MODE (target) != tmode
33980 || !insn_data[icode].operand[0].predicate (target, tmode))
33981 target = gen_reg_rtx (tmode);
33982 else if (memory_operand (target, tmode))
33983 num_memory++;
33985 gcc_assert (nargs <= 4);
33987 for (i = 0; i < nargs; i++)
33989 tree arg = CALL_EXPR_ARG (exp, i);
33990 rtx op = expand_normal (arg);
33991 int adjust = (comparison_p) ? 1 : 0;
33992 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
33994 if (last_arg_constant && i == nargs - 1)
33996 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
33998 enum insn_code new_icode = icode;
33999 switch (icode)
34001 case CODE_FOR_xop_vpermil2v2df3:
34002 case CODE_FOR_xop_vpermil2v4sf3:
34003 case CODE_FOR_xop_vpermil2v4df3:
34004 case CODE_FOR_xop_vpermil2v8sf3:
34005 error ("the last argument must be a 2-bit immediate");
34006 return gen_reg_rtx (tmode);
34007 case CODE_FOR_xop_rotlv2di3:
34008 new_icode = CODE_FOR_rotlv2di3;
34009 goto xop_rotl;
34010 case CODE_FOR_xop_rotlv4si3:
34011 new_icode = CODE_FOR_rotlv4si3;
34012 goto xop_rotl;
34013 case CODE_FOR_xop_rotlv8hi3:
34014 new_icode = CODE_FOR_rotlv8hi3;
34015 goto xop_rotl;
34016 case CODE_FOR_xop_rotlv16qi3:
34017 new_icode = CODE_FOR_rotlv16qi3;
34018 xop_rotl:
34019 if (CONST_INT_P (op))
34021 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
34022 op = GEN_INT (INTVAL (op) & mask);
34023 gcc_checking_assert
34024 (insn_data[icode].operand[i + 1].predicate (op, mode));
34026 else
34028 gcc_checking_assert
34029 (nargs == 2
34030 && insn_data[new_icode].operand[0].mode == tmode
34031 && insn_data[new_icode].operand[1].mode == tmode
34032 && insn_data[new_icode].operand[2].mode == mode
34033 && insn_data[new_icode].operand[0].predicate
34034 == insn_data[icode].operand[0].predicate
34035 && insn_data[new_icode].operand[1].predicate
34036 == insn_data[icode].operand[1].predicate);
34037 icode = new_icode;
34038 goto non_constant;
34040 break;
34041 default:
34042 gcc_unreachable ();
34046 else
34048 non_constant:
34049 if (VECTOR_MODE_P (mode))
34050 op = safe_vector_operand (op, mode);
34052 /* If we aren't optimizing, only allow one memory operand to be
34053 generated. */
34054 if (memory_operand (op, mode))
34055 num_memory++;
34057 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
34059 if (optimize
34060 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
34061 || num_memory > 1)
34062 op = force_reg (mode, op);
34065 args[i].op = op;
34066 args[i].mode = mode;
34069 switch (nargs)
34071 case 1:
34072 pat = GEN_FCN (icode) (target, args[0].op);
34073 break;
34075 case 2:
34076 if (tf_p)
34077 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34078 GEN_INT ((int)sub_code));
34079 else if (! comparison_p)
34080 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34081 else
34083 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
34084 args[0].op,
34085 args[1].op);
34087 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
34089 break;
34091 case 3:
34092 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
34093 break;
34095 case 4:
34096 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
34097 break;
34099 default:
34100 gcc_unreachable ();
34103 if (! pat)
34104 return 0;
34106 emit_insn (pat);
34107 return target;
34110 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
34111 insns with vec_merge. */
34113 static rtx
34114 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
34115 rtx target)
34117 rtx pat;
34118 tree arg0 = CALL_EXPR_ARG (exp, 0);
34119 rtx op1, op0 = expand_normal (arg0);
34120 machine_mode tmode = insn_data[icode].operand[0].mode;
34121 machine_mode mode0 = insn_data[icode].operand[1].mode;
34123 if (optimize || !target
34124 || GET_MODE (target) != tmode
34125 || !insn_data[icode].operand[0].predicate (target, tmode))
34126 target = gen_reg_rtx (tmode);
34128 if (VECTOR_MODE_P (mode0))
34129 op0 = safe_vector_operand (op0, mode0);
34131 if ((optimize && !register_operand (op0, mode0))
34132 || !insn_data[icode].operand[1].predicate (op0, mode0))
34133 op0 = copy_to_mode_reg (mode0, op0);
34135 op1 = op0;
34136 if (!insn_data[icode].operand[2].predicate (op1, mode0))
34137 op1 = copy_to_mode_reg (mode0, op1);
34139 pat = GEN_FCN (icode) (target, op0, op1);
34140 if (! pat)
34141 return 0;
34142 emit_insn (pat);
34143 return target;
34146 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
34148 static rtx
34149 ix86_expand_sse_compare (const struct builtin_description *d,
34150 tree exp, rtx target, bool swap)
34152 rtx pat;
34153 tree arg0 = CALL_EXPR_ARG (exp, 0);
34154 tree arg1 = CALL_EXPR_ARG (exp, 1);
34155 rtx op0 = expand_normal (arg0);
34156 rtx op1 = expand_normal (arg1);
34157 rtx op2;
34158 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34159 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34160 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34161 enum rtx_code comparison = d->comparison;
34163 if (VECTOR_MODE_P (mode0))
34164 op0 = safe_vector_operand (op0, mode0);
34165 if (VECTOR_MODE_P (mode1))
34166 op1 = safe_vector_operand (op1, mode1);
34168 /* Swap operands if we have a comparison that isn't available in
34169 hardware. */
34170 if (swap)
34171 std::swap (op0, op1);
34173 if (optimize || !target
34174 || GET_MODE (target) != tmode
34175 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34176 target = gen_reg_rtx (tmode);
34178 if ((optimize && !register_operand (op0, mode0))
34179 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
34180 op0 = copy_to_mode_reg (mode0, op0);
34181 if ((optimize && !register_operand (op1, mode1))
34182 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
34183 op1 = copy_to_mode_reg (mode1, op1);
34185 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
34186 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34187 if (! pat)
34188 return 0;
34189 emit_insn (pat);
34190 return target;
34193 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
34195 static rtx
34196 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
34197 rtx target)
34199 rtx pat;
34200 tree arg0 = CALL_EXPR_ARG (exp, 0);
34201 tree arg1 = CALL_EXPR_ARG (exp, 1);
34202 rtx op0 = expand_normal (arg0);
34203 rtx op1 = expand_normal (arg1);
34204 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34205 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34206 enum rtx_code comparison = d->comparison;
34208 if (VECTOR_MODE_P (mode0))
34209 op0 = safe_vector_operand (op0, mode0);
34210 if (VECTOR_MODE_P (mode1))
34211 op1 = safe_vector_operand (op1, mode1);
34213 /* Swap operands if we have a comparison that isn't available in
34214 hardware. */
34215 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
34216 std::swap (op0, op1);
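/* The comi instruction leaves its result in the flags register; allocate an SImode result, zero it, and set its low byte from the flag comparison emitted after the insn. */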
34218 target = gen_reg_rtx (SImode);
34219 emit_move_insn (target, const0_rtx);
34220 target = gen_rtx_SUBREG (QImode, target, 0);
34222 if ((optimize && !register_operand (op0, mode0))
34223 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34224 op0 = copy_to_mode_reg (mode0, op0);
34225 if ((optimize && !register_operand (op1, mode1))
34226 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34227 op1 = copy_to_mode_reg (mode1, op1);
34229 pat = GEN_FCN (d->icode) (op0, op1);
34230 if (! pat)
34231 return 0;
34232 emit_insn (pat);
34233 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34234 gen_rtx_fmt_ee (comparison, QImode,
34235 SET_DEST (pat),
34236 const0_rtx)));
34238 return SUBREG_REG (target);
34241 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
34243 static rtx
34244 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
34245 rtx target)
34247 rtx pat;
34248 tree arg0 = CALL_EXPR_ARG (exp, 0);
34249 rtx op1, op0 = expand_normal (arg0);
34250 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34251 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34253 if (optimize || target == 0
34254 || GET_MODE (target) != tmode
34255 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34256 target = gen_reg_rtx (tmode);
34258 if (VECTOR_MODE_P (mode0))
34259 op0 = safe_vector_operand (op0, mode0);
34261 if ((optimize && !register_operand (op0, mode0))
34262 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34263 op0 = copy_to_mode_reg (mode0, op0);
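/* For the fixed-rounding builtins the rounding mode is stored in the descriptor's comparison field; pass it through as the immediate operand. */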
34265 op1 = GEN_INT (d->comparison);
34267 pat = GEN_FCN (d->icode) (target, op0, op1);
34268 if (! pat)
34269 return 0;
34270 emit_insn (pat);
34271 return target;
34274 static rtx
34275 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
34276 tree exp, rtx target)
34278 rtx pat;
34279 tree arg0 = CALL_EXPR_ARG (exp, 0);
34280 tree arg1 = CALL_EXPR_ARG (exp, 1);
34281 rtx op0 = expand_normal (arg0);
34282 rtx op1 = expand_normal (arg1);
34283 rtx op2;
34284 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34285 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34286 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34288 if (optimize || target == 0
34289 || GET_MODE (target) != tmode
34290 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34291 target = gen_reg_rtx (tmode);
34293 op0 = safe_vector_operand (op0, mode0);
34294 op1 = safe_vector_operand (op1, mode1);
34296 if ((optimize && !register_operand (op0, mode0))
34297 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34298 op0 = copy_to_mode_reg (mode0, op0);
34299 if ((optimize && !register_operand (op1, mode1))
34300 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34301 op1 = copy_to_mode_reg (mode1, op1);
34303 op2 = GEN_INT (d->comparison);
34305 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34306 if (! pat)
34307 return 0;
34308 emit_insn (pat);
34309 return target;
34312 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
34314 static rtx
34315 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34316 rtx target)
34318 rtx pat;
34319 tree arg0 = CALL_EXPR_ARG (exp, 0);
34320 tree arg1 = CALL_EXPR_ARG (exp, 1);
34321 rtx op0 = expand_normal (arg0);
34322 rtx op1 = expand_normal (arg1);
34323 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34324 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34325 enum rtx_code comparison = d->comparison;
34327 if (VECTOR_MODE_P (mode0))
34328 op0 = safe_vector_operand (op0, mode0);
34329 if (VECTOR_MODE_P (mode1))
34330 op1 = safe_vector_operand (op1, mode1);
34332 target = gen_reg_rtx (SImode);
34333 emit_move_insn (target, const0_rtx);
34334 target = gen_rtx_SUBREG (QImode, target, 0);
34336 if ((optimize && !register_operand (op0, mode0))
34337 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34338 op0 = copy_to_mode_reg (mode0, op0);
34339 if ((optimize && !register_operand (op1, mode1))
34340 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34341 op1 = copy_to_mode_reg (mode1, op1);
34343 pat = GEN_FCN (d->icode) (op0, op1);
34344 if (! pat)
34345 return 0;
34346 emit_insn (pat);
34347 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34348 gen_rtx_fmt_ee (comparison, QImode,
34349 SET_DEST (pat),
34350 const0_rtx)));
34352 return SUBREG_REG (target);
34355 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
34357 static rtx
34358 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34359 tree exp, rtx target)
34361 rtx pat;
34362 tree arg0 = CALL_EXPR_ARG (exp, 0);
34363 tree arg1 = CALL_EXPR_ARG (exp, 1);
34364 tree arg2 = CALL_EXPR_ARG (exp, 2);
34365 tree arg3 = CALL_EXPR_ARG (exp, 3);
34366 tree arg4 = CALL_EXPR_ARG (exp, 4);
34367 rtx scratch0, scratch1;
34368 rtx op0 = expand_normal (arg0);
34369 rtx op1 = expand_normal (arg1);
34370 rtx op2 = expand_normal (arg2);
34371 rtx op3 = expand_normal (arg3);
34372 rtx op4 = expand_normal (arg4);
34373 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34375 tmode0 = insn_data[d->icode].operand[0].mode;
34376 tmode1 = insn_data[d->icode].operand[1].mode;
34377 modev2 = insn_data[d->icode].operand[2].mode;
34378 modei3 = insn_data[d->icode].operand[3].mode;
34379 modev4 = insn_data[d->icode].operand[4].mode;
34380 modei5 = insn_data[d->icode].operand[5].mode;
34381 modeimm = insn_data[d->icode].operand[6].mode;
34383 if (VECTOR_MODE_P (modev2))
34384 op0 = safe_vector_operand (op0, modev2);
34385 if (VECTOR_MODE_P (modev4))
34386 op2 = safe_vector_operand (op2, modev4);
34388 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34389 op0 = copy_to_mode_reg (modev2, op0);
34390 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34391 op1 = copy_to_mode_reg (modei3, op1);
34392 if ((optimize && !register_operand (op2, modev4))
34393 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34394 op2 = copy_to_mode_reg (modev4, op2);
34395 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34396 op3 = copy_to_mode_reg (modei5, op3);
34398 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34400 error ("the fifth argument must be an 8-bit immediate");
34401 return const0_rtx;
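/* pcmpestri returns the index result in tmode0 and pcmpestrm the mask result in tmode1; the remaining (flag-extracting) builtins read their result from FLAGS_REG below. */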
34404 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34406 if (optimize || !target
34407 || GET_MODE (target) != tmode0
34408 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34409 target = gen_reg_rtx (tmode0);
34411 scratch1 = gen_reg_rtx (tmode1);
34413 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34415 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34417 if (optimize || !target
34418 || GET_MODE (target) != tmode1
34419 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34420 target = gen_reg_rtx (tmode1);
34422 scratch0 = gen_reg_rtx (tmode0);
34424 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34426 else
34428 gcc_assert (d->flag);
34430 scratch0 = gen_reg_rtx (tmode0);
34431 scratch1 = gen_reg_rtx (tmode1);
34433 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34436 if (! pat)
34437 return 0;
34439 emit_insn (pat);
34441 if (d->flag)
34443 target = gen_reg_rtx (SImode);
34444 emit_move_insn (target, const0_rtx);
34445 target = gen_rtx_SUBREG (QImode, target, 0);
34447 emit_insn
34448 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34449 gen_rtx_fmt_ee (EQ, QImode,
34450 gen_rtx_REG ((machine_mode) d->flag,
34451 FLAGS_REG),
34452 const0_rtx)));
34453 return SUBREG_REG (target);
34455 else
34456 return target;
34460 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
34462 static rtx
34463 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34464 tree exp, rtx target)
34466 rtx pat;
34467 tree arg0 = CALL_EXPR_ARG (exp, 0);
34468 tree arg1 = CALL_EXPR_ARG (exp, 1);
34469 tree arg2 = CALL_EXPR_ARG (exp, 2);
34470 rtx scratch0, scratch1;
34471 rtx op0 = expand_normal (arg0);
34472 rtx op1 = expand_normal (arg1);
34473 rtx op2 = expand_normal (arg2);
34474 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34476 tmode0 = insn_data[d->icode].operand[0].mode;
34477 tmode1 = insn_data[d->icode].operand[1].mode;
34478 modev2 = insn_data[d->icode].operand[2].mode;
34479 modev3 = insn_data[d->icode].operand[3].mode;
34480 modeimm = insn_data[d->icode].operand[4].mode;
34482 if (VECTOR_MODE_P (modev2))
34483 op0 = safe_vector_operand (op0, modev2);
34484 if (VECTOR_MODE_P (modev3))
34485 op1 = safe_vector_operand (op1, modev3);
34487 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34488 op0 = copy_to_mode_reg (modev2, op0);
34489 if ((optimize && !register_operand (op1, modev3))
34490 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34491 op1 = copy_to_mode_reg (modev3, op1);
34493 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34495 error ("the third argument must be an 8-bit immediate");
34496 return const0_rtx;
34499 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34501 if (optimize || !target
34502 || GET_MODE (target) != tmode0
34503 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34504 target = gen_reg_rtx (tmode0);
34506 scratch1 = gen_reg_rtx (tmode1);
34508 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34510 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34512 if (optimize || !target
34513 || GET_MODE (target) != tmode1
34514 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34515 target = gen_reg_rtx (tmode1);
34517 scratch0 = gen_reg_rtx (tmode0);
34519 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34521 else
34523 gcc_assert (d->flag);
34525 scratch0 = gen_reg_rtx (tmode0);
34526 scratch1 = gen_reg_rtx (tmode1);
34528 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34531 if (! pat)
34532 return 0;
34534 emit_insn (pat);
34536 if (d->flag)
34538 target = gen_reg_rtx (SImode);
34539 emit_move_insn (target, const0_rtx);
34540 target = gen_rtx_SUBREG (QImode, target, 0);
34542 emit_insn
34543 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34544 gen_rtx_fmt_ee (EQ, QImode,
34545 gen_rtx_REG ((machine_mode) d->flag,
34546 FLAGS_REG),
34547 const0_rtx)));
34548 return SUBREG_REG (target);
34550 else
34551 return target;
34554 /* Subroutine of ix86_expand_builtin to take care of insns with
34555 variable number of operands. */
34557 static rtx
34558 ix86_expand_args_builtin (const struct builtin_description *d,
34559 tree exp, rtx target)
34561 rtx pat, real_target;
34562 unsigned int i, nargs;
34563 unsigned int nargs_constant = 0;
34564 unsigned int mask_pos = 0;
34565 int num_memory = 0;
34566 struct
34567 {
34568 rtx op;
34569 machine_mode mode;
34570 } args[6];
34571 bool second_arg_count = false;
34572 enum insn_code icode = d->icode;
34573 const struct insn_data_d *insn_p = &insn_data[icode];
34574 machine_mode tmode = insn_p->operand[0].mode;
34575 machine_mode rmode = VOIDmode;
34576 bool swap = false;
34577 enum rtx_code comparison = d->comparison;
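/* The builtin's ix86_builtin_func_type name encodes the return and argument types plus any special-handling suffix (_ROUND, _PTEST, _SWAP, _COUNT, _CONVERT, ...); use it to select the number of operands, how many trailing operands must be immediates, and where a mask operand sits. */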
34579 switch ((enum ix86_builtin_func_type) d->flag)
34581 case V2DF_FTYPE_V2DF_ROUND:
34582 case V4DF_FTYPE_V4DF_ROUND:
34583 case V8DF_FTYPE_V8DF_ROUND:
34584 case V4SF_FTYPE_V4SF_ROUND:
34585 case V8SF_FTYPE_V8SF_ROUND:
34586 case V16SF_FTYPE_V16SF_ROUND:
34587 case V4SI_FTYPE_V4SF_ROUND:
34588 case V8SI_FTYPE_V8SF_ROUND:
34589 case V16SI_FTYPE_V16SF_ROUND:
34590 return ix86_expand_sse_round (d, exp, target);
34591 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34592 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34593 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34594 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34595 case INT_FTYPE_V8SF_V8SF_PTEST:
34596 case INT_FTYPE_V4DI_V4DI_PTEST:
34597 case INT_FTYPE_V4DF_V4DF_PTEST:
34598 case INT_FTYPE_V4SF_V4SF_PTEST:
34599 case INT_FTYPE_V2DI_V2DI_PTEST:
34600 case INT_FTYPE_V2DF_V2DF_PTEST:
34601 return ix86_expand_sse_ptest (d, exp, target);
34602 case FLOAT128_FTYPE_FLOAT128:
34603 case FLOAT_FTYPE_FLOAT:
34604 case INT_FTYPE_INT:
34605 case UINT_FTYPE_UINT:
34606 case UINT16_FTYPE_UINT16:
34607 case UINT64_FTYPE_INT:
34608 case UINT64_FTYPE_UINT64:
34609 case INT64_FTYPE_INT64:
34610 case INT64_FTYPE_V4SF:
34611 case INT64_FTYPE_V2DF:
34612 case INT_FTYPE_V16QI:
34613 case INT_FTYPE_V8QI:
34614 case INT_FTYPE_V8SF:
34615 case INT_FTYPE_V4DF:
34616 case INT_FTYPE_V4SF:
34617 case INT_FTYPE_V2DF:
34618 case INT_FTYPE_V32QI:
34619 case V16QI_FTYPE_V16QI:
34620 case V8SI_FTYPE_V8SF:
34621 case V8SI_FTYPE_V4SI:
34622 case V8HI_FTYPE_V8HI:
34623 case V8HI_FTYPE_V16QI:
34624 case V8QI_FTYPE_V8QI:
34625 case V8SF_FTYPE_V8SF:
34626 case V8SF_FTYPE_V8SI:
34627 case V8SF_FTYPE_V4SF:
34628 case V8SF_FTYPE_V8HI:
34629 case V4SI_FTYPE_V4SI:
34630 case V4SI_FTYPE_V16QI:
34631 case V4SI_FTYPE_V4SF:
34632 case V4SI_FTYPE_V8SI:
34633 case V4SI_FTYPE_V8HI:
34634 case V4SI_FTYPE_V4DF:
34635 case V4SI_FTYPE_V2DF:
34636 case V4HI_FTYPE_V4HI:
34637 case V4DF_FTYPE_V4DF:
34638 case V4DF_FTYPE_V4SI:
34639 case V4DF_FTYPE_V4SF:
34640 case V4DF_FTYPE_V2DF:
34641 case V4SF_FTYPE_V4SF:
34642 case V4SF_FTYPE_V4SI:
34643 case V4SF_FTYPE_V8SF:
34644 case V4SF_FTYPE_V4DF:
34645 case V4SF_FTYPE_V8HI:
34646 case V4SF_FTYPE_V2DF:
34647 case V2DI_FTYPE_V2DI:
34648 case V2DI_FTYPE_V16QI:
34649 case V2DI_FTYPE_V8HI:
34650 case V2DI_FTYPE_V4SI:
34651 case V2DF_FTYPE_V2DF:
34652 case V2DF_FTYPE_V4SI:
34653 case V2DF_FTYPE_V4DF:
34654 case V2DF_FTYPE_V4SF:
34655 case V2DF_FTYPE_V2SI:
34656 case V2SI_FTYPE_V2SI:
34657 case V2SI_FTYPE_V4SF:
34658 case V2SI_FTYPE_V2SF:
34659 case V2SI_FTYPE_V2DF:
34660 case V2SF_FTYPE_V2SF:
34661 case V2SF_FTYPE_V2SI:
34662 case V32QI_FTYPE_V32QI:
34663 case V32QI_FTYPE_V16QI:
34664 case V16HI_FTYPE_V16HI:
34665 case V16HI_FTYPE_V8HI:
34666 case V8SI_FTYPE_V8SI:
34667 case V16HI_FTYPE_V16QI:
34668 case V8SI_FTYPE_V16QI:
34669 case V4DI_FTYPE_V16QI:
34670 case V8SI_FTYPE_V8HI:
34671 case V4DI_FTYPE_V8HI:
34672 case V4DI_FTYPE_V4SI:
34673 case V4DI_FTYPE_V2DI:
34674 case UQI_FTYPE_UQI:
34675 case UHI_FTYPE_UHI:
34676 case USI_FTYPE_USI:
34677 case USI_FTYPE_UQI:
34678 case USI_FTYPE_UHI:
34679 case UDI_FTYPE_UDI:
34680 case UHI_FTYPE_V16QI:
34681 case USI_FTYPE_V32QI:
34682 case UDI_FTYPE_V64QI:
34683 case V16QI_FTYPE_UHI:
34684 case V32QI_FTYPE_USI:
34685 case V64QI_FTYPE_UDI:
34686 case V8HI_FTYPE_UQI:
34687 case V16HI_FTYPE_UHI:
34688 case V32HI_FTYPE_USI:
34689 case V4SI_FTYPE_UQI:
34690 case V8SI_FTYPE_UQI:
34691 case V4SI_FTYPE_UHI:
34692 case V8SI_FTYPE_UHI:
34693 case UQI_FTYPE_V8HI:
34694 case UHI_FTYPE_V16HI:
34695 case USI_FTYPE_V32HI:
34696 case UQI_FTYPE_V4SI:
34697 case UQI_FTYPE_V8SI:
34698 case UHI_FTYPE_V16SI:
34699 case UQI_FTYPE_V2DI:
34700 case UQI_FTYPE_V4DI:
34701 case UQI_FTYPE_V8DI:
34702 case V16SI_FTYPE_UHI:
34703 case V2DI_FTYPE_UQI:
34704 case V4DI_FTYPE_UQI:
34705 case V16SI_FTYPE_INT:
34706 case V16SF_FTYPE_V8SF:
34707 case V16SI_FTYPE_V8SI:
34708 case V16SF_FTYPE_V4SF:
34709 case V16SI_FTYPE_V4SI:
34710 case V16SI_FTYPE_V16SF:
34711 case V16SI_FTYPE_V16SI:
34712 case V64QI_FTYPE_V64QI:
34713 case V32HI_FTYPE_V32HI:
34714 case V16SF_FTYPE_V16SF:
34715 case V8DI_FTYPE_UQI:
34716 case V8DI_FTYPE_V8DI:
34717 case V8DF_FTYPE_V4DF:
34718 case V8DF_FTYPE_V2DF:
34719 case V8DF_FTYPE_V8DF:
34720 case V4DI_FTYPE_V4DI:
34721 nargs = 1;
34722 break;
34723 case V4SF_FTYPE_V4SF_VEC_MERGE:
34724 case V2DF_FTYPE_V2DF_VEC_MERGE:
34725 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
34726 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
34727 case V16QI_FTYPE_V16QI_V16QI:
34728 case V16QI_FTYPE_V8HI_V8HI:
34729 case V16SF_FTYPE_V16SF_V16SF:
34730 case V8QI_FTYPE_V8QI_V8QI:
34731 case V8QI_FTYPE_V4HI_V4HI:
34732 case V8HI_FTYPE_V8HI_V8HI:
34733 case V8HI_FTYPE_V16QI_V16QI:
34734 case V8HI_FTYPE_V4SI_V4SI:
34735 case V8SF_FTYPE_V8SF_V8SF:
34736 case V8SF_FTYPE_V8SF_V8SI:
34737 case V8DF_FTYPE_V8DF_V8DF:
34738 case V4SI_FTYPE_V4SI_V4SI:
34739 case V4SI_FTYPE_V8HI_V8HI:
34740 case V4SI_FTYPE_V2DF_V2DF:
34741 case V4HI_FTYPE_V4HI_V4HI:
34742 case V4HI_FTYPE_V8QI_V8QI:
34743 case V4HI_FTYPE_V2SI_V2SI:
34744 case V4DF_FTYPE_V4DF_V4DF:
34745 case V4DF_FTYPE_V4DF_V4DI:
34746 case V4SF_FTYPE_V4SF_V4SF:
34747 case V4SF_FTYPE_V4SF_V4SI:
34748 case V4SF_FTYPE_V4SF_V2SI:
34749 case V4SF_FTYPE_V4SF_V2DF:
34750 case V4SF_FTYPE_V4SF_UINT:
34751 case V4SF_FTYPE_V4SF_DI:
34752 case V4SF_FTYPE_V4SF_SI:
34753 case V2DI_FTYPE_V2DI_V2DI:
34754 case V2DI_FTYPE_V16QI_V16QI:
34755 case V2DI_FTYPE_V4SI_V4SI:
34756 case V2DI_FTYPE_V2DI_V16QI:
34757 case V2SI_FTYPE_V2SI_V2SI:
34758 case V2SI_FTYPE_V4HI_V4HI:
34759 case V2SI_FTYPE_V2SF_V2SF:
34760 case V2DF_FTYPE_V2DF_V2DF:
34761 case V2DF_FTYPE_V2DF_V4SF:
34762 case V2DF_FTYPE_V2DF_V2DI:
34763 case V2DF_FTYPE_V2DF_DI:
34764 case V2DF_FTYPE_V2DF_SI:
34765 case V2DF_FTYPE_V2DF_UINT:
34766 case V2SF_FTYPE_V2SF_V2SF:
34767 case V1DI_FTYPE_V1DI_V1DI:
34768 case V1DI_FTYPE_V8QI_V8QI:
34769 case V1DI_FTYPE_V2SI_V2SI:
34770 case V32QI_FTYPE_V16HI_V16HI:
34771 case V16HI_FTYPE_V8SI_V8SI:
34772 case V64QI_FTYPE_V64QI_V64QI:
34773 case V32QI_FTYPE_V32QI_V32QI:
34774 case V16HI_FTYPE_V32QI_V32QI:
34775 case V16HI_FTYPE_V16HI_V16HI:
34776 case V8SI_FTYPE_V4DF_V4DF:
34777 case V8SI_FTYPE_V8SI_V8SI:
34778 case V8SI_FTYPE_V16HI_V16HI:
34779 case V4DI_FTYPE_V4DI_V4DI:
34780 case V4DI_FTYPE_V8SI_V8SI:
34781 case V8DI_FTYPE_V64QI_V64QI:
34782 if (comparison == UNKNOWN)
34783 return ix86_expand_binop_builtin (icode, exp, target);
34784 nargs = 2;
34785 break;
34786 case V4SF_FTYPE_V4SF_V4SF_SWAP:
34787 case V2DF_FTYPE_V2DF_V2DF_SWAP:
34788 gcc_assert (comparison != UNKNOWN);
34789 nargs = 2;
34790 swap = true;
34791 break;
34792 case V16HI_FTYPE_V16HI_V8HI_COUNT:
34793 case V16HI_FTYPE_V16HI_SI_COUNT:
34794 case V8SI_FTYPE_V8SI_V4SI_COUNT:
34795 case V8SI_FTYPE_V8SI_SI_COUNT:
34796 case V4DI_FTYPE_V4DI_V2DI_COUNT:
34797 case V4DI_FTYPE_V4DI_INT_COUNT:
34798 case V8HI_FTYPE_V8HI_V8HI_COUNT:
34799 case V8HI_FTYPE_V8HI_SI_COUNT:
34800 case V4SI_FTYPE_V4SI_V4SI_COUNT:
34801 case V4SI_FTYPE_V4SI_SI_COUNT:
34802 case V4HI_FTYPE_V4HI_V4HI_COUNT:
34803 case V4HI_FTYPE_V4HI_SI_COUNT:
34804 case V2DI_FTYPE_V2DI_V2DI_COUNT:
34805 case V2DI_FTYPE_V2DI_SI_COUNT:
34806 case V2SI_FTYPE_V2SI_V2SI_COUNT:
34807 case V2SI_FTYPE_V2SI_SI_COUNT:
34808 case V1DI_FTYPE_V1DI_V1DI_COUNT:
34809 case V1DI_FTYPE_V1DI_SI_COUNT:
34810 nargs = 2;
34811 second_arg_count = true;
34812 break;
34813 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
34814 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
34815 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
34816 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
34817 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
34818 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
34819 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
34820 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
34821 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
34822 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
34823 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
34824 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
34825 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
34826 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
34827 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
34828 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
34829 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
34830 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
34831 nargs = 4;
34832 second_arg_count = true;
34833 break;
34834 case UINT64_FTYPE_UINT64_UINT64:
34835 case UINT_FTYPE_UINT_UINT:
34836 case UINT_FTYPE_UINT_USHORT:
34837 case UINT_FTYPE_UINT_UCHAR:
34838 case UINT16_FTYPE_UINT16_INT:
34839 case UINT8_FTYPE_UINT8_INT:
34840 case UQI_FTYPE_UQI_UQI:
34841 case UHI_FTYPE_UHI_UHI:
34842 case USI_FTYPE_USI_USI:
34843 case UDI_FTYPE_UDI_UDI:
34844 case V16SI_FTYPE_V8DF_V8DF:
34845 nargs = 2;
34846 break;
34847 case V2DI_FTYPE_V2DI_INT_CONVERT:
34848 nargs = 2;
34849 rmode = V1TImode;
34850 nargs_constant = 1;
34851 break;
34852 case V4DI_FTYPE_V4DI_INT_CONVERT:
34853 nargs = 2;
34854 rmode = V2TImode;
34855 nargs_constant = 1;
34856 break;
34857 case V8DI_FTYPE_V8DI_INT_CONVERT:
34858 nargs = 2;
34859 rmode = V4TImode;
34860 nargs_constant = 1;
34861 break;
34862 case V8HI_FTYPE_V8HI_INT:
34863 case V8HI_FTYPE_V8SF_INT:
34864 case V16HI_FTYPE_V16SF_INT:
34865 case V8HI_FTYPE_V4SF_INT:
34866 case V8SF_FTYPE_V8SF_INT:
34867 case V4SF_FTYPE_V16SF_INT:
34868 case V16SF_FTYPE_V16SF_INT:
34869 case V4SI_FTYPE_V4SI_INT:
34870 case V4SI_FTYPE_V8SI_INT:
34871 case V4HI_FTYPE_V4HI_INT:
34872 case V4DF_FTYPE_V4DF_INT:
34873 case V4DF_FTYPE_V8DF_INT:
34874 case V4SF_FTYPE_V4SF_INT:
34875 case V4SF_FTYPE_V8SF_INT:
34876 case V2DI_FTYPE_V2DI_INT:
34877 case V2DF_FTYPE_V2DF_INT:
34878 case V2DF_FTYPE_V4DF_INT:
34879 case V16HI_FTYPE_V16HI_INT:
34880 case V8SI_FTYPE_V8SI_INT:
34881 case V16SI_FTYPE_V16SI_INT:
34882 case V4SI_FTYPE_V16SI_INT:
34883 case V4DI_FTYPE_V4DI_INT:
34884 case V2DI_FTYPE_V4DI_INT:
34885 case V4DI_FTYPE_V8DI_INT:
34886 case QI_FTYPE_V4SF_INT:
34887 case QI_FTYPE_V2DF_INT:
34888 case UQI_FTYPE_UQI_UQI_CONST:
34889 case UHI_FTYPE_UHI_UQI:
34890 case USI_FTYPE_USI_UQI:
34891 case UDI_FTYPE_UDI_UQI:
34892 nargs = 2;
34893 nargs_constant = 1;
34894 break;
34895 case V16QI_FTYPE_V16QI_V16QI_V16QI:
34896 case V8SF_FTYPE_V8SF_V8SF_V8SF:
34897 case V4DF_FTYPE_V4DF_V4DF_V4DF:
34898 case V4SF_FTYPE_V4SF_V4SF_V4SF:
34899 case V2DF_FTYPE_V2DF_V2DF_V2DF:
34900 case V32QI_FTYPE_V32QI_V32QI_V32QI:
34901 case UHI_FTYPE_V16SI_V16SI_UHI:
34902 case UQI_FTYPE_V8DI_V8DI_UQI:
34903 case V16HI_FTYPE_V16SI_V16HI_UHI:
34904 case V16QI_FTYPE_V16SI_V16QI_UHI:
34905 case V16QI_FTYPE_V8DI_V16QI_UQI:
34906 case V16SF_FTYPE_V16SF_V16SF_UHI:
34907 case V16SF_FTYPE_V4SF_V16SF_UHI:
34908 case V16SI_FTYPE_SI_V16SI_UHI:
34909 case V16SI_FTYPE_V16HI_V16SI_UHI:
34910 case V16SI_FTYPE_V16QI_V16SI_UHI:
34911 case V8SF_FTYPE_V4SF_V8SF_UQI:
34912 case V4DF_FTYPE_V2DF_V4DF_UQI:
34913 case V8SI_FTYPE_V4SI_V8SI_UQI:
34914 case V8SI_FTYPE_SI_V8SI_UQI:
34915 case V4SI_FTYPE_V4SI_V4SI_UQI:
34916 case V4SI_FTYPE_SI_V4SI_UQI:
34917 case V4DI_FTYPE_V2DI_V4DI_UQI:
34918 case V4DI_FTYPE_DI_V4DI_UQI:
34919 case V2DI_FTYPE_V2DI_V2DI_UQI:
34920 case V2DI_FTYPE_DI_V2DI_UQI:
34921 case V64QI_FTYPE_V64QI_V64QI_UDI:
34922 case V64QI_FTYPE_V16QI_V64QI_UDI:
34923 case V64QI_FTYPE_QI_V64QI_UDI:
34924 case V32QI_FTYPE_V32QI_V32QI_USI:
34925 case V32QI_FTYPE_V16QI_V32QI_USI:
34926 case V32QI_FTYPE_QI_V32QI_USI:
34927 case V16QI_FTYPE_V16QI_V16QI_UHI:
34928 case V16QI_FTYPE_QI_V16QI_UHI:
34929 case V32HI_FTYPE_V8HI_V32HI_USI:
34930 case V32HI_FTYPE_HI_V32HI_USI:
34931 case V16HI_FTYPE_V8HI_V16HI_UHI:
34932 case V16HI_FTYPE_HI_V16HI_UHI:
34933 case V8HI_FTYPE_V8HI_V8HI_UQI:
34934 case V8HI_FTYPE_HI_V8HI_UQI:
34935 case V8SF_FTYPE_V8HI_V8SF_UQI:
34936 case V4SF_FTYPE_V8HI_V4SF_UQI:
34937 case V8SI_FTYPE_V8SF_V8SI_UQI:
34938 case V4SI_FTYPE_V4SF_V4SI_UQI:
34939 case V4DI_FTYPE_V4SF_V4DI_UQI:
34940 case V2DI_FTYPE_V4SF_V2DI_UQI:
34941 case V4SF_FTYPE_V4DI_V4SF_UQI:
34942 case V4SF_FTYPE_V2DI_V4SF_UQI:
34943 case V4DF_FTYPE_V4DI_V4DF_UQI:
34944 case V2DF_FTYPE_V2DI_V2DF_UQI:
34945 case V16QI_FTYPE_V8HI_V16QI_UQI:
34946 case V16QI_FTYPE_V16HI_V16QI_UHI:
34947 case V16QI_FTYPE_V4SI_V16QI_UQI:
34948 case V16QI_FTYPE_V8SI_V16QI_UQI:
34949 case V8HI_FTYPE_V4SI_V8HI_UQI:
34950 case V8HI_FTYPE_V8SI_V8HI_UQI:
34951 case V16QI_FTYPE_V2DI_V16QI_UQI:
34952 case V16QI_FTYPE_V4DI_V16QI_UQI:
34953 case V8HI_FTYPE_V2DI_V8HI_UQI:
34954 case V8HI_FTYPE_V4DI_V8HI_UQI:
34955 case V4SI_FTYPE_V2DI_V4SI_UQI:
34956 case V4SI_FTYPE_V4DI_V4SI_UQI:
34957 case V32QI_FTYPE_V32HI_V32QI_USI:
34958 case UHI_FTYPE_V16QI_V16QI_UHI:
34959 case USI_FTYPE_V32QI_V32QI_USI:
34960 case UDI_FTYPE_V64QI_V64QI_UDI:
34961 case UQI_FTYPE_V8HI_V8HI_UQI:
34962 case UHI_FTYPE_V16HI_V16HI_UHI:
34963 case USI_FTYPE_V32HI_V32HI_USI:
34964 case UQI_FTYPE_V4SI_V4SI_UQI:
34965 case UQI_FTYPE_V8SI_V8SI_UQI:
34966 case UQI_FTYPE_V2DI_V2DI_UQI:
34967 case UQI_FTYPE_V4DI_V4DI_UQI:
34968 case V4SF_FTYPE_V2DF_V4SF_UQI:
34969 case V4SF_FTYPE_V4DF_V4SF_UQI:
34970 case V16SI_FTYPE_V16SI_V16SI_UHI:
34971 case V16SI_FTYPE_V4SI_V16SI_UHI:
34972 case V2DI_FTYPE_V4SI_V2DI_UQI:
34973 case V2DI_FTYPE_V8HI_V2DI_UQI:
34974 case V2DI_FTYPE_V16QI_V2DI_UQI:
34975 case V4DI_FTYPE_V4DI_V4DI_UQI:
34976 case V4DI_FTYPE_V4SI_V4DI_UQI:
34977 case V4DI_FTYPE_V8HI_V4DI_UQI:
34978 case V4DI_FTYPE_V16QI_V4DI_UQI:
34979 case V4DI_FTYPE_V4DF_V4DI_UQI:
34980 case V2DI_FTYPE_V2DF_V2DI_UQI:
34981 case V4SI_FTYPE_V4DF_V4SI_UQI:
34982 case V4SI_FTYPE_V2DF_V4SI_UQI:
34983 case V4SI_FTYPE_V8HI_V4SI_UQI:
34984 case V4SI_FTYPE_V16QI_V4SI_UQI:
34985 case V4DI_FTYPE_V4DI_V4DI_V4DI:
34986 case V8DF_FTYPE_V2DF_V8DF_UQI:
34987 case V8DF_FTYPE_V4DF_V8DF_UQI:
34988 case V8DF_FTYPE_V8DF_V8DF_UQI:
34989 case V8SF_FTYPE_V8SF_V8SF_UQI:
34990 case V8SF_FTYPE_V8SI_V8SF_UQI:
34991 case V4DF_FTYPE_V4DF_V4DF_UQI:
34992 case V4SF_FTYPE_V4SF_V4SF_UQI:
34993 case V2DF_FTYPE_V2DF_V2DF_UQI:
34994 case V2DF_FTYPE_V4SF_V2DF_UQI:
34995 case V2DF_FTYPE_V4SI_V2DF_UQI:
34996 case V4SF_FTYPE_V4SI_V4SF_UQI:
34997 case V4DF_FTYPE_V4SF_V4DF_UQI:
34998 case V4DF_FTYPE_V4SI_V4DF_UQI:
34999 case V8SI_FTYPE_V8SI_V8SI_UQI:
35000 case V8SI_FTYPE_V8HI_V8SI_UQI:
35001 case V8SI_FTYPE_V16QI_V8SI_UQI:
35002 case V8DF_FTYPE_V8SI_V8DF_UQI:
35003 case V8DI_FTYPE_DI_V8DI_UQI:
35004 case V16SF_FTYPE_V8SF_V16SF_UHI:
35005 case V16SI_FTYPE_V8SI_V16SI_UHI:
35006 case V16HI_FTYPE_V16HI_V16HI_UHI:
35007 case V8HI_FTYPE_V16QI_V8HI_UQI:
35008 case V16HI_FTYPE_V16QI_V16HI_UHI:
35009 case V32HI_FTYPE_V32HI_V32HI_USI:
35010 case V32HI_FTYPE_V32QI_V32HI_USI:
35011 case V8DI_FTYPE_V16QI_V8DI_UQI:
35012 case V8DI_FTYPE_V2DI_V8DI_UQI:
35013 case V8DI_FTYPE_V4DI_V8DI_UQI:
35014 case V8DI_FTYPE_V8DI_V8DI_UQI:
35015 case V8DI_FTYPE_V8HI_V8DI_UQI:
35016 case V8DI_FTYPE_V8SI_V8DI_UQI:
35017 case V8HI_FTYPE_V8DI_V8HI_UQI:
35018 case V8SI_FTYPE_V8DI_V8SI_UQI:
35019 case V4SI_FTYPE_V4SI_V4SI_V4SI:
35020 case V16SI_FTYPE_V16SI_V16SI_V16SI:
35021 case V8DI_FTYPE_V8DI_V8DI_V8DI:
35022 case V32HI_FTYPE_V32HI_V32HI_V32HI:
35023 case V2DI_FTYPE_V2DI_V2DI_V2DI:
35024 case V16HI_FTYPE_V16HI_V16HI_V16HI:
35025 case V8SI_FTYPE_V8SI_V8SI_V8SI:
35026 case V8HI_FTYPE_V8HI_V8HI_V8HI:
35027 nargs = 3;
35028 break;
35029 case V32QI_FTYPE_V32QI_V32QI_INT:
35030 case V16HI_FTYPE_V16HI_V16HI_INT:
35031 case V16QI_FTYPE_V16QI_V16QI_INT:
35032 case V4DI_FTYPE_V4DI_V4DI_INT:
35033 case V8HI_FTYPE_V8HI_V8HI_INT:
35034 case V8SI_FTYPE_V8SI_V8SI_INT:
35035 case V8SI_FTYPE_V8SI_V4SI_INT:
35036 case V8SF_FTYPE_V8SF_V8SF_INT:
35037 case V8SF_FTYPE_V8SF_V4SF_INT:
35038 case V4SI_FTYPE_V4SI_V4SI_INT:
35039 case V4DF_FTYPE_V4DF_V4DF_INT:
35040 case V16SF_FTYPE_V16SF_V16SF_INT:
35041 case V16SF_FTYPE_V16SF_V4SF_INT:
35042 case V16SI_FTYPE_V16SI_V4SI_INT:
35043 case V4DF_FTYPE_V4DF_V2DF_INT:
35044 case V4SF_FTYPE_V4SF_V4SF_INT:
35045 case V2DI_FTYPE_V2DI_V2DI_INT:
35046 case V4DI_FTYPE_V4DI_V2DI_INT:
35047 case V2DF_FTYPE_V2DF_V2DF_INT:
35048 case UQI_FTYPE_V8DI_V8UDI_INT:
35049 case UQI_FTYPE_V8DF_V8DF_INT:
35050 case UQI_FTYPE_V2DF_V2DF_INT:
35051 case UQI_FTYPE_V4SF_V4SF_INT:
35052 case UHI_FTYPE_V16SI_V16SI_INT:
35053 case UHI_FTYPE_V16SF_V16SF_INT:
35054 case V64QI_FTYPE_V64QI_V64QI_INT:
35055 case V32HI_FTYPE_V32HI_V32HI_INT:
35056 case V16SI_FTYPE_V16SI_V16SI_INT:
35057 case V8DI_FTYPE_V8DI_V8DI_INT:
35058 nargs = 3;
35059 nargs_constant = 1;
35060 break;
35061 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
35062 nargs = 3;
35063 rmode = V4DImode;
35064 nargs_constant = 1;
35065 break;
35066 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
35067 nargs = 3;
35068 rmode = V2DImode;
35069 nargs_constant = 1;
35070 break;
35071 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
35072 nargs = 3;
35073 rmode = DImode;
35074 nargs_constant = 1;
35075 break;
35076 case V2DI_FTYPE_V2DI_UINT_UINT:
35077 nargs = 3;
35078 nargs_constant = 2;
35079 break;
35080 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
35081 nargs = 3;
35082 rmode = V8DImode;
35083 nargs_constant = 1;
35084 break;
35085 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
35086 nargs = 5;
35087 rmode = V8DImode;
35088 mask_pos = 2;
35089 nargs_constant = 1;
35090 break;
35091 case QI_FTYPE_V8DF_INT_UQI:
35092 case QI_FTYPE_V4DF_INT_UQI:
35093 case QI_FTYPE_V2DF_INT_UQI:
35094 case HI_FTYPE_V16SF_INT_UHI:
35095 case QI_FTYPE_V8SF_INT_UQI:
35096 case QI_FTYPE_V4SF_INT_UQI:
35097 case V4SI_FTYPE_V4SI_V4SI_UHI:
35098 case V8SI_FTYPE_V8SI_V8SI_UHI:
35099 nargs = 3;
35100 mask_pos = 1;
35101 nargs_constant = 1;
35102 break;
35103 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
35104 nargs = 5;
35105 rmode = V4DImode;
35106 mask_pos = 2;
35107 nargs_constant = 1;
35108 break;
35109 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
35110 nargs = 5;
35111 rmode = V2DImode;
35112 mask_pos = 2;
35113 nargs_constant = 1;
35114 break;
35115 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
35116 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
35117 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
35118 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
35119 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
35120 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
35121 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
35122 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
35123 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
35124 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
35125 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
35126 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
35127 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
35128 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
35129 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
35130 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
35131 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
35132 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
35133 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
35134 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
35135 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
35136 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
35137 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
35138 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
35139 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
35140 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
35141 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
35142 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
35143 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
35144 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
35145 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
35146 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
35147 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
35148 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
35149 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
35150 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
35151 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
35152 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
35153 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
35154 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
35155 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
35156 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
35157 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
35158 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
35159 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
35160 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
35161 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
35162 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
35163 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
35164 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
35165 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
35166 nargs = 4;
35167 break;
35168 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
35169 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
35170 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
35171 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
35172 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
35173 nargs = 4;
35174 nargs_constant = 1;
35175 break;
35176 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
35177 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
35178 case QI_FTYPE_V4DF_V4DF_INT_UQI:
35179 case QI_FTYPE_V8SF_V8SF_INT_UQI:
35180 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
35181 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
35182 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
35183 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
35184 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
35185 case USI_FTYPE_V32QI_V32QI_INT_USI:
35186 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
35187 case USI_FTYPE_V32HI_V32HI_INT_USI:
35188 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
35189 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
35190 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
35191 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
35192 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
35193 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
35194 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
35195 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
35196 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
35197 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
35198 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
35199 nargs = 4;
35200 mask_pos = 1;
35201 nargs_constant = 1;
35202 break;
35203 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
35204 nargs = 4;
35205 nargs_constant = 2;
35206 break;
35207 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
35208 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
35209 nargs = 4;
35210 break;
35211 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
35212 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
35213 mask_pos = 1;
35214 nargs = 4;
35215 nargs_constant = 1;
35216 break;
35217 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
35218 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
35219 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
35220 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
35221 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
35222 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
35223 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
35224 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
35225 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
35226 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
35227 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
35228 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
35229 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
35230 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
35231 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
35232 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
35233 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
35234 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
35235 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
35236 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
35237 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
35238 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
35239 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
35240 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
35241 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
35242 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
35243 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
35244 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
35245 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
35246 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
35247 nargs = 4;
35248 mask_pos = 2;
35249 nargs_constant = 1;
35250 break;
35251 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
35252 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
35253 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
35254 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
35255 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
35256 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
35257 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
35258 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
35259 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
35260 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
35261 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
35262 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
35263 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
35264 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
35265 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
35266 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
35267 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
35268 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
35269 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
35270 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
35271 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
35272 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
35273 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
35274 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
35275 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
35276 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
35277 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
35278 nargs = 5;
35279 mask_pos = 2;
35280 nargs_constant = 1;
35281 break;
35282 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
35283 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
35284 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
35285 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
35286 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
35287 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
35288 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
35289 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
35290 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
35291 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
35292 nargs = 5;
35293 mask_pos = 1;
35294 nargs_constant = 1;
35295 break;
35296 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
35297 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
35298 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
35299 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
35300 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
35301 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
35302 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
35303 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
35304 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
35305 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
35306 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
35307 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
35308 nargs = 5;
35309 mask_pos = 1;
35310 nargs_constant = 2;
35311 break;
35313 default:
35314 gcc_unreachable ();
35317 gcc_assert (nargs <= ARRAY_SIZE (args));
35319 if (comparison != UNKNOWN)
35321 gcc_assert (nargs == 2);
35322 return ix86_expand_sse_compare (d, exp, target, swap);
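/* _CONVERT builtins return their value in rmode while the insn produces tmode; generate the insn into a fresh tmode register and expose its rmode lowpart as the result. */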
35325 if (rmode == VOIDmode || rmode == tmode)
35327 if (optimize
35328 || target == 0
35329 || GET_MODE (target) != tmode
35330 || !insn_p->operand[0].predicate (target, tmode))
35331 target = gen_reg_rtx (tmode);
35332 else if (memory_operand (target, tmode))
35333 num_memory++;
35334 real_target = target;
35336 else
35338 real_target = gen_reg_rtx (tmode);
35339 target = lowpart_subreg (rmode, real_target, tmode);
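/* Expand each argument and force it into a form the insn's operand predicate accepts: shift counts are converted to the count mode, trailing immediates are range-checked, and anything else is copied into a register of the expected mode when necessary. */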
35342 for (i = 0; i < nargs; i++)
35344 tree arg = CALL_EXPR_ARG (exp, i);
35345 rtx op = expand_normal (arg);
35346 machine_mode mode = insn_p->operand[i + 1].mode;
35347 bool match = insn_p->operand[i + 1].predicate (op, mode);
35349 if (second_arg_count && i == 1)
35351 /* SIMD shift insns take either an 8-bit immediate or
35352 a register as the count, but the builtin functions
35353 take an int.  If the count does not satisfy the
35354 operand predicate, put it in a register.  The
35355 instructions use a 64-bit count; if op is only 32-bit,
35356 zero-extend it, since negative shift counts are undefined
35357 behavior and zero extension is more efficient. */
35358 if (!match)
35360 if (SCALAR_INT_MODE_P (GET_MODE (op)))
35361 op = convert_modes (mode, GET_MODE (op), op, 1);
35362 else
35363 op = lowpart_subreg (mode, op, GET_MODE (op));
35364 if (!insn_p->operand[i + 1].predicate (op, mode))
35365 op = copy_to_reg (op);
35368 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
35369 || (!mask_pos && (nargs - i) <= nargs_constant))
35371 if (!match)
35372 switch (icode)
35374 case CODE_FOR_avx_vinsertf128v4di:
35375 case CODE_FOR_avx_vextractf128v4di:
35376 error ("the last argument must be an 1-bit immediate");
35377 return const0_rtx;
35379 case CODE_FOR_avx512f_cmpv8di3_mask:
35380 case CODE_FOR_avx512f_cmpv16si3_mask:
35381 case CODE_FOR_avx512f_ucmpv8di3_mask:
35382 case CODE_FOR_avx512f_ucmpv16si3_mask:
35383 case CODE_FOR_avx512vl_cmpv4di3_mask:
35384 case CODE_FOR_avx512vl_cmpv8si3_mask:
35385 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35386 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35387 case CODE_FOR_avx512vl_cmpv2di3_mask:
35388 case CODE_FOR_avx512vl_cmpv4si3_mask:
35389 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35390 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35391 error ("the last argument must be a 3-bit immediate");
35392 return const0_rtx;
35394 case CODE_FOR_sse4_1_roundsd:
35395 case CODE_FOR_sse4_1_roundss:
35397 case CODE_FOR_sse4_1_roundpd:
35398 case CODE_FOR_sse4_1_roundps:
35399 case CODE_FOR_avx_roundpd256:
35400 case CODE_FOR_avx_roundps256:
35402 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35403 case CODE_FOR_sse4_1_roundps_sfix:
35404 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35405 case CODE_FOR_avx_roundps_sfix256:
35407 case CODE_FOR_sse4_1_blendps:
35408 case CODE_FOR_avx_blendpd256:
35409 case CODE_FOR_avx_vpermilv4df:
35410 case CODE_FOR_avx_vpermilv4df_mask:
35411 case CODE_FOR_avx512f_getmantv8df_mask:
35412 case CODE_FOR_avx512f_getmantv16sf_mask:
35413 case CODE_FOR_avx512vl_getmantv8sf_mask:
35414 case CODE_FOR_avx512vl_getmantv4df_mask:
35415 case CODE_FOR_avx512vl_getmantv4sf_mask:
35416 case CODE_FOR_avx512vl_getmantv2df_mask:
35417 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35418 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35419 case CODE_FOR_avx512dq_rangepv4df_mask:
35420 case CODE_FOR_avx512dq_rangepv8sf_mask:
35421 case CODE_FOR_avx512dq_rangepv2df_mask:
35422 case CODE_FOR_avx512dq_rangepv4sf_mask:
35423 case CODE_FOR_avx_shufpd256_mask:
35424 error ("the last argument must be a 4-bit immediate");
35425 return const0_rtx;
35427 case CODE_FOR_sha1rnds4:
35428 case CODE_FOR_sse4_1_blendpd:
35429 case CODE_FOR_avx_vpermilv2df:
35430 case CODE_FOR_avx_vpermilv2df_mask:
35431 case CODE_FOR_xop_vpermil2v2df3:
35432 case CODE_FOR_xop_vpermil2v4sf3:
35433 case CODE_FOR_xop_vpermil2v4df3:
35434 case CODE_FOR_xop_vpermil2v8sf3:
35435 case CODE_FOR_avx512f_vinsertf32x4_mask:
35436 case CODE_FOR_avx512f_vinserti32x4_mask:
35437 case CODE_FOR_avx512f_vextractf32x4_mask:
35438 case CODE_FOR_avx512f_vextracti32x4_mask:
35439 case CODE_FOR_sse2_shufpd:
35440 case CODE_FOR_sse2_shufpd_mask:
35441 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35442 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35443 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35444 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35445 error ("the last argument must be a 2-bit immediate");
35446 return const0_rtx;
35448 case CODE_FOR_avx_vextractf128v4df:
35449 case CODE_FOR_avx_vextractf128v8sf:
35450 case CODE_FOR_avx_vextractf128v8si:
35451 case CODE_FOR_avx_vinsertf128v4df:
35452 case CODE_FOR_avx_vinsertf128v8sf:
35453 case CODE_FOR_avx_vinsertf128v8si:
35454 case CODE_FOR_avx512f_vinsertf64x4_mask:
35455 case CODE_FOR_avx512f_vinserti64x4_mask:
35456 case CODE_FOR_avx512f_vextractf64x4_mask:
35457 case CODE_FOR_avx512f_vextracti64x4_mask:
35458 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35459 case CODE_FOR_avx512dq_vinserti32x8_mask:
35460 case CODE_FOR_avx512vl_vinsertv4df:
35461 case CODE_FOR_avx512vl_vinsertv4di:
35462 case CODE_FOR_avx512vl_vinsertv8sf:
35463 case CODE_FOR_avx512vl_vinsertv8si:
35464 error ("the last argument must be a 1-bit immediate");
35465 return const0_rtx;
35467 case CODE_FOR_avx_vmcmpv2df3:
35468 case CODE_FOR_avx_vmcmpv4sf3:
35469 case CODE_FOR_avx_cmpv2df3:
35470 case CODE_FOR_avx_cmpv4sf3:
35471 case CODE_FOR_avx_cmpv4df3:
35472 case CODE_FOR_avx_cmpv8sf3:
35473 case CODE_FOR_avx512f_cmpv8df3_mask:
35474 case CODE_FOR_avx512f_cmpv16sf3_mask:
35475 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35476 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35477 error ("the last argument must be a 5-bit immediate");
35478 return const0_rtx;
35480 default:
35481 switch (nargs_constant)
35483 case 2:
35484 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
35485 || (!mask_pos && (nargs - i) == nargs_constant))
35487 error ("the next to last argument must be an 8-bit immediate");
35488 break;
35490 /* FALLTHRU */
35491 case 1:
35492 error ("the last argument must be an 8-bit immediate");
35493 break;
35494 default:
35495 gcc_unreachable ();
35497 return const0_rtx;
35500 else
35502 if (VECTOR_MODE_P (mode))
35503 op = safe_vector_operand (op, mode);
35505 /* If we aren't optimizing, only allow one memory operand to
35506 be generated. */
35507 if (memory_operand (op, mode))
35508 num_memory++;
35510 op = fixup_modeless_constant (op, mode);
35512 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35514 if (optimize || !match || num_memory > 1)
35515 op = copy_to_mode_reg (mode, op);
35517 else
35519 op = copy_to_reg (op);
35520 op = lowpart_subreg (mode, op, GET_MODE (op));
35524 args[i].op = op;
35525 args[i].mode = mode;
35528 switch (nargs)
35530 case 1:
35531 pat = GEN_FCN (icode) (real_target, args[0].op);
35532 break;
35533 case 2:
35534 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35535 break;
35536 case 3:
35537 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35538 args[2].op);
35539 break;
35540 case 4:
35541 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35542 args[2].op, args[3].op);
35543 break;
35544 case 5:
35545 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35546 args[2].op, args[3].op, args[4].op);
35547 break;
35548 case 6:
35549 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35550 args[2].op, args[3].op, args[4].op,
35551 args[5].op);
35552 break;
35553 default:
35554 gcc_unreachable ();
35557 if (! pat)
35558 return 0;
35560 emit_insn (pat);
35561 return target;
35564 /* Transform a pattern of the following layout:
35565 (set A
35566 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
35568 into:
35569 (set A B) */
35571 static rtx
35572 ix86_erase_embedded_rounding (rtx pat)
35574 if (GET_CODE (pat) == INSN)
35575 pat = PATTERN (pat);
35577 gcc_assert (GET_CODE (pat) == SET);
35578 rtx src = SET_SRC (pat);
35579 gcc_assert (XVECLEN (src, 0) == 2);
35580 rtx p0 = XVECEXP (src, 0, 0);
35581 gcc_assert (GET_CODE (src) == UNSPEC
35582 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
35583 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
35584 return res;
35587 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35588 with rounding. */
35589 static rtx
35590 ix86_expand_sse_comi_round (const struct builtin_description *d,
35591 tree exp, rtx target)
35593 rtx pat, set_dst;
35594 tree arg0 = CALL_EXPR_ARG (exp, 0);
35595 tree arg1 = CALL_EXPR_ARG (exp, 1);
35596 tree arg2 = CALL_EXPR_ARG (exp, 2);
35597 tree arg3 = CALL_EXPR_ARG (exp, 3);
35598 rtx op0 = expand_normal (arg0);
35599 rtx op1 = expand_normal (arg1);
35600 rtx op2 = expand_normal (arg2);
35601 rtx op3 = expand_normal (arg3);
35602 enum insn_code icode = d->icode;
35603 const struct insn_data_d *insn_p = &insn_data[icode];
35604 machine_mode mode0 = insn_p->operand[0].mode;
35605 machine_mode mode1 = insn_p->operand[1].mode;
35606 enum rtx_code comparison = UNEQ;
35607 bool need_ucomi = false;
35609 /* See avxintrin.h for values. */
35610 enum rtx_code comi_comparisons[32] =
35611 {
35612 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35613 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35614 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35615 };
35616 bool need_ucomi_values[32] =
35617 {
35618 true, false, false, true, true, false, false, true,
35619 true, false, false, true, true, false, false, true,
35620 false, true, true, false, false, true, true, false,
35621 false, true, true, false, false, true, true, false
35622 };
35624 if (!CONST_INT_P (op2))
35626 error ("the third argument must be comparison constant");
35627 return const0_rtx;
35629 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35631 error ("incorrect comparison mode");
35632 return const0_rtx;
35635 if (!insn_p->operand[2].predicate (op3, SImode))
35637 error ("incorrect rounding operand");
35638 return const0_rtx;
35641 comparison = comi_comparisons[INTVAL (op2)];
35642 need_ucomi = need_ucomi_values[INTVAL (op2)];
35644 if (VECTOR_MODE_P (mode0))
35645 op0 = safe_vector_operand (op0, mode0);
35646 if (VECTOR_MODE_P (mode1))
35647 op1 = safe_vector_operand (op1, mode1);
35649 target = gen_reg_rtx (SImode);
35650 emit_move_insn (target, const0_rtx);
35651 target = gen_rtx_SUBREG (QImode, target, 0);
35653 if ((optimize && !register_operand (op0, mode0))
35654 || !insn_p->operand[0].predicate (op0, mode0))
35655 op0 = copy_to_mode_reg (mode0, op0);
35656 if ((optimize && !register_operand (op1, mode1))
35657 || !insn_p->operand[1].predicate (op1, mode1))
35658 op1 = copy_to_mode_reg (mode1, op1);
35660 if (need_ucomi)
35661 icode = icode == CODE_FOR_sse_comi_round
35662 ? CODE_FOR_sse_ucomi_round
35663 : CODE_FOR_sse2_ucomi_round;
35665 pat = GEN_FCN (icode) (op0, op1, op3);
35666 if (! pat)
35667 return 0;
35669 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35670 if (INTVAL (op3) == NO_ROUND)
35672 pat = ix86_erase_embedded_rounding (pat);
35673 if (! pat)
35674 return 0;
35676 set_dst = SET_DEST (pat);
35678 else
35680 gcc_assert (GET_CODE (pat) == SET);
35681 set_dst = SET_DEST (pat);
35684 emit_insn (pat);
35685 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35686 gen_rtx_fmt_ee (comparison, QImode,
35687 set_dst,
35688 const0_rtx)));
35690 return SUBREG_REG (target);
35693 static rtx
35694 ix86_expand_round_builtin (const struct builtin_description *d,
35695 tree exp, rtx target)
35697 rtx pat;
35698 unsigned int i, nargs;
35699 struct
35700 {
35701 rtx op;
35702 machine_mode mode;
35703 } args[6];
35704 enum insn_code icode = d->icode;
35705 const struct insn_data_d *insn_p = &insn_data[icode];
35706 machine_mode tmode = insn_p->operand[0].mode;
35707 unsigned int nargs_constant = 0;
35708 unsigned int redundant_embed_rnd = 0;
35710 switch ((enum ix86_builtin_func_type) d->flag)
35712 case UINT64_FTYPE_V2DF_INT:
35713 case UINT64_FTYPE_V4SF_INT:
35714 case UINT_FTYPE_V2DF_INT:
35715 case UINT_FTYPE_V4SF_INT:
35716 case INT64_FTYPE_V2DF_INT:
35717 case INT64_FTYPE_V4SF_INT:
35718 case INT_FTYPE_V2DF_INT:
35719 case INT_FTYPE_V4SF_INT:
35720 nargs = 2;
35721 break;
35722 case V4SF_FTYPE_V4SF_UINT_INT:
35723 case V4SF_FTYPE_V4SF_UINT64_INT:
35724 case V2DF_FTYPE_V2DF_UINT64_INT:
35725 case V4SF_FTYPE_V4SF_INT_INT:
35726 case V4SF_FTYPE_V4SF_INT64_INT:
35727 case V2DF_FTYPE_V2DF_INT64_INT:
35728 case V4SF_FTYPE_V4SF_V4SF_INT:
35729 case V2DF_FTYPE_V2DF_V2DF_INT:
35730 case V4SF_FTYPE_V4SF_V2DF_INT:
35731 case V2DF_FTYPE_V2DF_V4SF_INT:
35732 nargs = 3;
35733 break;
35734 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35735 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35736 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35737 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35738 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35739 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35740 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
35741 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
35742 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
35743 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
35744 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
35745 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
35746 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
35747 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
35748 nargs = 4;
35749 break;
35750 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
35751 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
35752 nargs_constant = 2;
35753 nargs = 4;
35754 break;
35755 case INT_FTYPE_V4SF_V4SF_INT_INT:
35756 case INT_FTYPE_V2DF_V2DF_INT_INT:
35757 return ix86_expand_sse_comi_round (d, exp, target);
35758 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
35759 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
35760 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
35761 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
35762 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
35763 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
35764 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
35765 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
35766 nargs = 5;
35767 break;
35768 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
35769 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
35770 nargs_constant = 4;
35771 nargs = 5;
35772 break;
35773 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
35774 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
35775 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
35776 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
35777 nargs_constant = 3;
35778 nargs = 5;
35779 break;
35780 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
35781 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
35782 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
35783 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
35784 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
35785 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
35786 nargs = 6;
35787 nargs_constant = 4;
35788 break;
35789 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
35790 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
35791 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
35792 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
35793 nargs = 6;
35794 nargs_constant = 3;
35795 break;
35796 default:
35797 gcc_unreachable ();
35799 gcc_assert (nargs <= ARRAY_SIZE (args));
35801 if (optimize
35802 || target == 0
35803 || GET_MODE (target) != tmode
35804 || !insn_p->operand[0].predicate (target, tmode))
35805 target = gen_reg_rtx (tmode);
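/* Argument nargs - nargs_constant, when present, must be an immediate in the insn's accepted range; the last argument is always the rounding-mode operand. */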
35807 for (i = 0; i < nargs; i++)
35809 tree arg = CALL_EXPR_ARG (exp, i);
35810 rtx op = expand_normal (arg);
35811 machine_mode mode = insn_p->operand[i + 1].mode;
35812 bool match = insn_p->operand[i + 1].predicate (op, mode);
35814 if (i == nargs - nargs_constant)
35816 if (!match)
35818 switch (icode)
35820 case CODE_FOR_avx512f_getmantv8df_mask_round:
35821 case CODE_FOR_avx512f_getmantv16sf_mask_round:
35822 case CODE_FOR_avx512f_vgetmantv2df_round:
35823 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
35824 case CODE_FOR_avx512f_vgetmantv4sf_round:
35825 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
35826 error ("the immediate argument must be a 4-bit immediate");
35827 return const0_rtx;
35828 case CODE_FOR_avx512f_cmpv8df3_mask_round:
35829 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
35830 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
35831 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
35832 error ("the immediate argument must be a 5-bit immediate");
35833 return const0_rtx;
35834 default:
35835 error ("the immediate argument must be an 8-bit immediate");
35836 return const0_rtx;
35840 else if (i == nargs - 1)
35842 if (!insn_p->operand[nargs].predicate (op, SImode))
35844 error ("incorrect rounding operand");
35845 return const0_rtx;
35848 /* If there is no rounding, use the normal version of the pattern. */
35849 if (INTVAL (op) == NO_ROUND)
35850 redundant_embed_rnd = 1;
35852 else
35854 if (VECTOR_MODE_P (mode))
35855 op = safe_vector_operand (op, mode);
35857 op = fixup_modeless_constant (op, mode);
35859 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35861 if (optimize || !match)
35862 op = copy_to_mode_reg (mode, op);
35864 else
35866 op = copy_to_reg (op);
35867 op = lowpart_subreg (mode, op, GET_MODE (op));
35871 args[i].op = op;
35872 args[i].mode = mode;
35875 switch (nargs)
35877 case 1:
35878 pat = GEN_FCN (icode) (target, args[0].op);
35879 break;
35880 case 2:
35881 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35882 break;
35883 case 3:
35884 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35885 args[2].op);
35886 break;
35887 case 4:
35888 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35889 args[2].op, args[3].op);
35890 break;
35891 case 5:
35892 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35893 args[2].op, args[3].op, args[4].op);
35894 break;
35895 case 6:
35896 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35897 args[2].op, args[3].op, args[4].op,
35898 args[5].op);
35899 break;
35900 default:
35901 gcc_unreachable ();
35904 if (!pat)
35905 return 0;
35907 if (redundant_embed_rnd)
35908 pat = ix86_erase_embedded_rounding (pat);
35910 emit_insn (pat);
35911 return target;
35914 /* Subroutine of ix86_expand_builtin to take care of special insns
35915 with variable number of operands. */
35917 static rtx
35918 ix86_expand_special_args_builtin (const struct builtin_description *d,
35919 tree exp, rtx target)
35921 tree arg;
35922 rtx pat, op;
35923 unsigned int i, nargs, arg_adjust, memory;
35924 bool aligned_mem = false;
35925 struct
35926 {
35927 rtx op;
35928 machine_mode mode;
35929 } args[3];
35930 enum insn_code icode = d->icode;
35931 bool last_arg_constant = false;
35932 const struct insn_data_d *insn_p = &insn_data[icode];
35933 machine_mode tmode = insn_p->operand[0].mode;
35934 enum { load, store } klass;
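/* Classify the builtin as a load or a store, record how many operands it takes and which of them is the memory reference, and note whether the memory access must be aligned. */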
35936 switch ((enum ix86_builtin_func_type) d->flag)
35938 case VOID_FTYPE_VOID:
35939 emit_insn (GEN_FCN (icode) (target));
35940 return 0;
35941 case VOID_FTYPE_UINT64:
35942 case VOID_FTYPE_UNSIGNED:
35943 nargs = 0;
35944 klass = store;
35945 memory = 0;
35946 break;
35948 case INT_FTYPE_VOID:
35949 case USHORT_FTYPE_VOID:
35950 case UINT64_FTYPE_VOID:
35951 case UINT_FTYPE_VOID:
35952 case UNSIGNED_FTYPE_VOID:
35953 nargs = 0;
35954 klass = load;
35955 memory = 0;
35956 break;
35957 case UINT64_FTYPE_PUNSIGNED:
35958 case V2DI_FTYPE_PV2DI:
35959 case V4DI_FTYPE_PV4DI:
35960 case V32QI_FTYPE_PCCHAR:
35961 case V16QI_FTYPE_PCCHAR:
35962 case V8SF_FTYPE_PCV4SF:
35963 case V8SF_FTYPE_PCFLOAT:
35964 case V4SF_FTYPE_PCFLOAT:
35965 case V4DF_FTYPE_PCV2DF:
35966 case V4DF_FTYPE_PCDOUBLE:
35967 case V2DF_FTYPE_PCDOUBLE:
35968 case VOID_FTYPE_PVOID:
35969 case V8DI_FTYPE_PV8DI:
35970 nargs = 1;
35971 klass = load;
35972 memory = 0;
35973 switch (icode)
35975 case CODE_FOR_sse4_1_movntdqa:
35976 case CODE_FOR_avx2_movntdqa:
35977 case CODE_FOR_avx512f_movntdqa:
35978 aligned_mem = true;
35979 break;
35980 default:
35981 break;
35983 break;
35984 case VOID_FTYPE_PV2SF_V4SF:
35985 case VOID_FTYPE_PV8DI_V8DI:
35986 case VOID_FTYPE_PV4DI_V4DI:
35987 case VOID_FTYPE_PV2DI_V2DI:
35988 case VOID_FTYPE_PCHAR_V32QI:
35989 case VOID_FTYPE_PCHAR_V16QI:
35990 case VOID_FTYPE_PFLOAT_V16SF:
35991 case VOID_FTYPE_PFLOAT_V8SF:
35992 case VOID_FTYPE_PFLOAT_V4SF:
35993 case VOID_FTYPE_PDOUBLE_V8DF:
35994 case VOID_FTYPE_PDOUBLE_V4DF:
35995 case VOID_FTYPE_PDOUBLE_V2DF:
35996 case VOID_FTYPE_PLONGLONG_LONGLONG:
35997 case VOID_FTYPE_PULONGLONG_ULONGLONG:
35998 case VOID_FTYPE_PINT_INT:
35999 nargs = 1;
36000 klass = store;
36001 /* Reserve memory operand for target. */
36002 memory = ARRAY_SIZE (args);
36003 switch (icode)
36005 /* These builtins and instructions require the memory
36006 to be properly aligned. */
36007 case CODE_FOR_avx_movntv4di:
36008 case CODE_FOR_sse2_movntv2di:
36009 case CODE_FOR_avx_movntv8sf:
36010 case CODE_FOR_sse_movntv4sf:
36011 case CODE_FOR_sse4a_vmmovntv4sf:
36012 case CODE_FOR_avx_movntv4df:
36013 case CODE_FOR_sse2_movntv2df:
36014 case CODE_FOR_sse4a_vmmovntv2df:
36015 case CODE_FOR_sse2_movntidi:
36016 case CODE_FOR_sse_movntq:
36017 case CODE_FOR_sse2_movntisi:
36018 case CODE_FOR_avx512f_movntv16sf:
36019 case CODE_FOR_avx512f_movntv8df:
36020 case CODE_FOR_avx512f_movntv8di:
36021 aligned_mem = true;
36022 break;
36023 default:
36024 break;
36026 break;
36027 case V4SF_FTYPE_V4SF_PCV2SF:
36028 case V2DF_FTYPE_V2DF_PCDOUBLE:
36029 nargs = 2;
36030 klass = load;
36031 memory = 1;
36032 break;
36033 case V8SF_FTYPE_PCV8SF_V8SI:
36034 case V4DF_FTYPE_PCV4DF_V4DI:
36035 case V4SF_FTYPE_PCV4SF_V4SI:
36036 case V2DF_FTYPE_PCV2DF_V2DI:
36037 case V8SI_FTYPE_PCV8SI_V8SI:
36038 case V4DI_FTYPE_PCV4DI_V4DI:
36039 case V4SI_FTYPE_PCV4SI_V4SI:
36040 case V2DI_FTYPE_PCV2DI_V2DI:
36041 case VOID_FTYPE_INT_INT64:
36042 nargs = 2;
36043 klass = load;
36044 memory = 0;
36045 break;
36046 case VOID_FTYPE_PV8DF_V8DF_UQI:
36047 case VOID_FTYPE_PV4DF_V4DF_UQI:
36048 case VOID_FTYPE_PV2DF_V2DF_UQI:
36049 case VOID_FTYPE_PV16SF_V16SF_UHI:
36050 case VOID_FTYPE_PV8SF_V8SF_UQI:
36051 case VOID_FTYPE_PV4SF_V4SF_UQI:
36052 case VOID_FTYPE_PV8DI_V8DI_UQI:
36053 case VOID_FTYPE_PV4DI_V4DI_UQI:
36054 case VOID_FTYPE_PV2DI_V2DI_UQI:
36055 case VOID_FTYPE_PV16SI_V16SI_UHI:
36056 case VOID_FTYPE_PV8SI_V8SI_UQI:
36057 case VOID_FTYPE_PV4SI_V4SI_UQI:
36058 case VOID_FTYPE_PV64QI_V64QI_UDI:
36059 case VOID_FTYPE_PV32HI_V32HI_USI:
36060 case VOID_FTYPE_PV32QI_V32QI_USI:
36061 case VOID_FTYPE_PV16QI_V16QI_UHI:
36062 case VOID_FTYPE_PV16HI_V16HI_UHI:
36063 case VOID_FTYPE_PV8HI_V8HI_UQI:
36064 switch (icode)
36066 /* These builtins and instructions require the memory
36067 to be properly aligned. */
36068 case CODE_FOR_avx512f_storev16sf_mask:
36069 case CODE_FOR_avx512f_storev16si_mask:
36070 case CODE_FOR_avx512f_storev8df_mask:
36071 case CODE_FOR_avx512f_storev8di_mask:
36072 case CODE_FOR_avx512vl_storev8sf_mask:
36073 case CODE_FOR_avx512vl_storev8si_mask:
36074 case CODE_FOR_avx512vl_storev4df_mask:
36075 case CODE_FOR_avx512vl_storev4di_mask:
36076 case CODE_FOR_avx512vl_storev4sf_mask:
36077 case CODE_FOR_avx512vl_storev4si_mask:
36078 case CODE_FOR_avx512vl_storev2df_mask:
36079 case CODE_FOR_avx512vl_storev2di_mask:
36080 aligned_mem = true;
36081 break;
36082 default:
36083 break;
36085 /* FALLTHRU */
36086 case VOID_FTYPE_PV8SF_V8SI_V8SF:
36087 case VOID_FTYPE_PV4DF_V4DI_V4DF:
36088 case VOID_FTYPE_PV4SF_V4SI_V4SF:
36089 case VOID_FTYPE_PV2DF_V2DI_V2DF:
36090 case VOID_FTYPE_PV8SI_V8SI_V8SI:
36091 case VOID_FTYPE_PV4DI_V4DI_V4DI:
36092 case VOID_FTYPE_PV4SI_V4SI_V4SI:
36093 case VOID_FTYPE_PV2DI_V2DI_V2DI:
36094 case VOID_FTYPE_PV8SI_V8DI_UQI:
36095 case VOID_FTYPE_PV8HI_V8DI_UQI:
36096 case VOID_FTYPE_PV16HI_V16SI_UHI:
36097 case VOID_FTYPE_PV16QI_V8DI_UQI:
36098 case VOID_FTYPE_PV16QI_V16SI_UHI:
36099 case VOID_FTYPE_PV4SI_V4DI_UQI:
36100 case VOID_FTYPE_PV4SI_V2DI_UQI:
36101 case VOID_FTYPE_PV8HI_V4DI_UQI:
36102 case VOID_FTYPE_PV8HI_V2DI_UQI:
36103 case VOID_FTYPE_PV8HI_V8SI_UQI:
36104 case VOID_FTYPE_PV8HI_V4SI_UQI:
36105 case VOID_FTYPE_PV16QI_V4DI_UQI:
36106 case VOID_FTYPE_PV16QI_V2DI_UQI:
36107 case VOID_FTYPE_PV16QI_V8SI_UQI:
36108 case VOID_FTYPE_PV16QI_V4SI_UQI:
36109 case VOID_FTYPE_PCHAR_V64QI_UDI:
36110 case VOID_FTYPE_PCHAR_V32QI_USI:
36111 case VOID_FTYPE_PCHAR_V16QI_UHI:
36112 case VOID_FTYPE_PSHORT_V32HI_USI:
36113 case VOID_FTYPE_PSHORT_V16HI_UHI:
36114 case VOID_FTYPE_PSHORT_V8HI_UQI:
36115 case VOID_FTYPE_PINT_V16SI_UHI:
36116 case VOID_FTYPE_PINT_V8SI_UQI:
36117 case VOID_FTYPE_PINT_V4SI_UQI:
36118 case VOID_FTYPE_PINT64_V8DI_UQI:
36119 case VOID_FTYPE_PINT64_V4DI_UQI:
36120 case VOID_FTYPE_PINT64_V2DI_UQI:
36121 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
36122 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
36123 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
36124 case VOID_FTYPE_PFLOAT_V16SF_UHI:
36125 case VOID_FTYPE_PFLOAT_V8SF_UQI:
36126 case VOID_FTYPE_PFLOAT_V4SF_UQI:
36127 case VOID_FTYPE_PV32QI_V32HI_USI:
36128 case VOID_FTYPE_PV16QI_V16HI_UHI:
36129 case VOID_FTYPE_PV8QI_V8HI_UQI:
36130 nargs = 2;
36131 klass = store;
36132 /* Reserve memory operand for target. */
36133 memory = ARRAY_SIZE (args);
36134 break;
36135 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
36136 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
36137 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
36138 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
36139 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
36140 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
36141 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
36142 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
36143 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
36144 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
36145 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
36146 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
36147 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
36148 case V32HI_FTYPE_PCV32HI_V32HI_USI:
36149 case V32QI_FTYPE_PCV32QI_V32QI_USI:
36150 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
36151 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
36152 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
36153 switch (icode)
36155 /* These builtins and instructions require the memory
36156 to be properly aligned. */
36157 case CODE_FOR_avx512f_loadv16sf_mask:
36158 case CODE_FOR_avx512f_loadv16si_mask:
36159 case CODE_FOR_avx512f_loadv8df_mask:
36160 case CODE_FOR_avx512f_loadv8di_mask:
36161 case CODE_FOR_avx512vl_loadv8sf_mask:
36162 case CODE_FOR_avx512vl_loadv8si_mask:
36163 case CODE_FOR_avx512vl_loadv4df_mask:
36164 case CODE_FOR_avx512vl_loadv4di_mask:
36165 case CODE_FOR_avx512vl_loadv4sf_mask:
36166 case CODE_FOR_avx512vl_loadv4si_mask:
36167 case CODE_FOR_avx512vl_loadv2df_mask:
36168 case CODE_FOR_avx512vl_loadv2di_mask:
36169 case CODE_FOR_avx512bw_loadv64qi_mask:
36170 case CODE_FOR_avx512vl_loadv32qi_mask:
36171 case CODE_FOR_avx512vl_loadv16qi_mask:
36172 case CODE_FOR_avx512bw_loadv32hi_mask:
36173 case CODE_FOR_avx512vl_loadv16hi_mask:
36174 case CODE_FOR_avx512vl_loadv8hi_mask:
36175 aligned_mem = true;
36176 break;
36177 default:
36178 break;
36180 /* FALLTHRU */
36181 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
36182 case V32QI_FTYPE_PCCHAR_V32QI_USI:
36183 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
36184 case V32HI_FTYPE_PCSHORT_V32HI_USI:
36185 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
36186 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
36187 case V16SI_FTYPE_PCINT_V16SI_UHI:
36188 case V8SI_FTYPE_PCINT_V8SI_UQI:
36189 case V4SI_FTYPE_PCINT_V4SI_UQI:
36190 case V8DI_FTYPE_PCINT64_V8DI_UQI:
36191 case V4DI_FTYPE_PCINT64_V4DI_UQI:
36192 case V2DI_FTYPE_PCINT64_V2DI_UQI:
36193 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
36194 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
36195 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
36196 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
36197 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
36198 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
36199 nargs = 3;
36200 klass = load;
36201 memory = 0;
36202 break;
36203 case VOID_FTYPE_UINT_UINT_UINT:
36204 case VOID_FTYPE_UINT64_UINT_UINT:
36205 case UCHAR_FTYPE_UINT_UINT_UINT:
36206 case UCHAR_FTYPE_UINT64_UINT_UINT:
36207 nargs = 3;
36208 klass = load;
36209 memory = ARRAY_SIZE (args);
36210 last_arg_constant = true;
36211 break;
36212 default:
36213 gcc_unreachable ();
36216 gcc_assert (nargs <= ARRAY_SIZE (args));
36218 if (klass == store)
36220 arg = CALL_EXPR_ARG (exp, 0);
36221 op = expand_normal (arg);
36222 gcc_assert (target == 0);
36223 if (memory)
36225 op = ix86_zero_extend_to_Pmode (op);
36226 target = gen_rtx_MEM (tmode, op);
36227 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
36228 on it. Try to improve it using get_pointer_alignment,
36229 and if the special builtin is one that requires strict
36230 mode alignment, also from its GET_MODE_ALIGNMENT.
36231 Failure to do so could lead to ix86_legitimate_combined_insn
36232 rejecting all changes to such insns. */
36233 unsigned int align = get_pointer_alignment (arg);
36234 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
36235 align = GET_MODE_ALIGNMENT (tmode);
36236 if (MEM_ALIGN (target) < align)
36237 set_mem_align (target, align);
36239 else
36240 target = force_reg (tmode, op);
36241 arg_adjust = 1;
36243 else
36245 arg_adjust = 0;
36246 if (optimize
36247 || target == 0
36248 || !register_operand (target, tmode)
36249 || GET_MODE (target) != tmode)
36250 target = gen_reg_rtx (tmode);
36253 for (i = 0; i < nargs; i++)
36255 machine_mode mode = insn_p->operand[i + 1].mode;
36256 bool match;
36258 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
36259 op = expand_normal (arg);
36260 match = insn_p->operand[i + 1].predicate (op, mode);
36262 if (last_arg_constant && (i + 1) == nargs)
36264 if (!match)
36266 if (icode == CODE_FOR_lwp_lwpvalsi3
36267 || icode == CODE_FOR_lwp_lwpinssi3
36268 || icode == CODE_FOR_lwp_lwpvaldi3
36269 || icode == CODE_FOR_lwp_lwpinsdi3)
36270 error ("the last argument must be a 32-bit immediate");
36271 else
36272 error ("the last argument must be an 8-bit immediate");
36273 return const0_rtx;
36276 else
36278 if (i == memory)
36280 /* This must be the memory operand. */
36281 op = ix86_zero_extend_to_Pmode (op);
36282 op = gen_rtx_MEM (mode, op);
36283 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
36284 on it. Try to improve it using get_pointer_alignment,
36285 and if the special builtin is one that requires strict
36286 mode alignment, also from its GET_MODE_ALIGNMENT.
36287 Failure to do so could lead to ix86_legitimate_combined_insn
36288 rejecting all changes to such insns. */
36289 unsigned int align = get_pointer_alignment (arg);
36290 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
36291 align = GET_MODE_ALIGNMENT (mode);
36292 if (MEM_ALIGN (op) < align)
36293 set_mem_align (op, align);
36295 else
36297 /* This must be a register. */
36298 if (VECTOR_MODE_P (mode))
36299 op = safe_vector_operand (op, mode);
36301 op = fixup_modeless_constant (op, mode);
36303 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36304 op = copy_to_mode_reg (mode, op);
36305 else
36307 op = copy_to_reg (op);
36308 op = lowpart_subreg (mode, op, GET_MODE (op));
36313 args[i].op = op;
36314 args[i].mode = mode;
36317 switch (nargs)
36319 case 0:
36320 pat = GEN_FCN (icode) (target);
36321 break;
36322 case 1:
36323 pat = GEN_FCN (icode) (target, args[0].op);
36324 break;
36325 case 2:
36326 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36327 break;
36328 case 3:
36329 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
36330 break;
36331 default:
36332 gcc_unreachable ();
36335 if (! pat)
36336 return 0;
36337 emit_insn (pat);
36338 return klass == store ? 0 : target;
36341 /* Return the integer constant in ARG. Constrain it to be in the range
36342 of the subparts of VEC_TYPE; issue an error if not. */
36344 static int
36345 get_element_number (tree vec_type, tree arg)
36347 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
36349 if (!tree_fits_uhwi_p (arg)
36350 || (elt = tree_to_uhwi (arg), elt > max))
36352 error ("selector must be an integer constant in the range 0..%wi", max);
36353 return 0;
36356 return elt;
36359 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36360 ix86_expand_vector_init. We DO have language-level syntax for this, in
36361 the form of (type){ init-list }. Except that since we can't place emms
36362 instructions from inside the compiler, we can't allow the use of MMX
36363 registers unless the user explicitly asks for it. So we do *not* define
36364 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36365 we have builtins invoked by mmintrin.h that gives us license to emit
36366 these sorts of instructions. */
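   For example, mmintrin.h implements _mm_set_pi16 in terms of
   __builtin_ia32_vec_init_v4hi, which ends up being expanded here rather
   than through a generic vec_init pattern.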
36368 static rtx
36369 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36371 machine_mode tmode = TYPE_MODE (type);
36372 machine_mode inner_mode = GET_MODE_INNER (tmode);
36373 int i, n_elt = GET_MODE_NUNITS (tmode);
36374 rtvec v = rtvec_alloc (n_elt);
36376 gcc_assert (VECTOR_MODE_P (tmode));
36377 gcc_assert (call_expr_nargs (exp) == n_elt);
36379 for (i = 0; i < n_elt; ++i)
36381 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36382 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36385 if (!target || !register_operand (target, tmode))
36386 target = gen_reg_rtx (tmode);
36388 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36389 return target;
36392 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36393 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36394 had a language-level syntax for referencing vector elements. */
36396 static rtx
36397 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36399 machine_mode tmode, mode0;
36400 tree arg0, arg1;
36401 int elt;
36402 rtx op0;
36404 arg0 = CALL_EXPR_ARG (exp, 0);
36405 arg1 = CALL_EXPR_ARG (exp, 1);
36407 op0 = expand_normal (arg0);
36408 elt = get_element_number (TREE_TYPE (arg0), arg1);
36410 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36411 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36412 gcc_assert (VECTOR_MODE_P (mode0));
36414 op0 = force_reg (mode0, op0);
36416 if (optimize || !target || !register_operand (target, tmode))
36417 target = gen_reg_rtx (tmode);
36419 ix86_expand_vector_extract (true, target, op0, elt);
36421 return target;
36424 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36425 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36426 a language-level syntax for referencing vector elements. */
36428 static rtx
36429 ix86_expand_vec_set_builtin (tree exp)
36431 machine_mode tmode, mode1;
36432 tree arg0, arg1, arg2;
36433 int elt;
36434 rtx op0, op1, target;
36436 arg0 = CALL_EXPR_ARG (exp, 0);
36437 arg1 = CALL_EXPR_ARG (exp, 1);
36438 arg2 = CALL_EXPR_ARG (exp, 2);
36440 tmode = TYPE_MODE (TREE_TYPE (arg0));
36441 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36442 gcc_assert (VECTOR_MODE_P (tmode));
36444 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36445 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36446 elt = get_element_number (TREE_TYPE (arg0), arg2);
36448 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36449 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36451 op0 = force_reg (tmode, op0);
36452 op1 = force_reg (mode1, op1);
36454 /* OP0 is the source of these builtin functions and shouldn't be
36455 modified. Create a copy, use it and return it as target. */
36456 target = gen_reg_rtx (tmode);
36457 emit_move_insn (target, op0);
36458 ix86_expand_vector_set (true, target, op1, elt);
36460 return target;
36463 /* Emit conditional move of SRC to DST with condition
36464 OP1 CODE OP2. */
36465 static void
36466 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36468 rtx t;
36470 if (TARGET_CMOVE)
36472 t = ix86_expand_compare (code, op1, op2);
36473 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36474 src, dst)));
36476 else
36478 rtx_code_label *nomove = gen_label_rtx ();
36479 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36480 const0_rtx, GET_MODE (op1), 1, nomove);
36481 emit_move_insn (dst, src);
36482 emit_label (nomove);
36486 /* Choose max of DST and SRC and put it to DST. */
36487 static void
36488 ix86_emit_move_max (rtx dst, rtx src)
36490 ix86_emit_cmove (dst, src, LTU, dst, src);
36493 /* Expand an expression EXP that calls a built-in function,
36494 with result going to TARGET if that's convenient
36495 (and in mode MODE if that's convenient).
36496 SUBTARGET may be used as the target for computing one of EXP's operands.
36497 IGNORE is nonzero if the value is to be ignored. */
36499 static rtx
36500 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36501 machine_mode mode, int ignore)
36503 size_t i;
36504 enum insn_code icode, icode2;
36505 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36506 tree arg0, arg1, arg2, arg3, arg4;
36507 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
36508 machine_mode mode0, mode1, mode2, mode3, mode4;
36509 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36511 /* For CPU builtins that can be folded, fold first and expand the fold. */
36512 switch (fcode)
36514 case IX86_BUILTIN_CPU_INIT:
36516 /* Make it call __cpu_indicator_init in libgcc. */
36517 tree call_expr, fndecl, type;
36518 type = build_function_type_list (integer_type_node, NULL_TREE);
36519 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36520 call_expr = build_call_expr (fndecl, 0);
36521 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36523 case IX86_BUILTIN_CPU_IS:
36524 case IX86_BUILTIN_CPU_SUPPORTS:
36526 tree arg0 = CALL_EXPR_ARG (exp, 0);
36527 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36528 gcc_assert (fold_expr != NULL_TREE);
36529 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36533 HOST_WIDE_INT isa = ix86_isa_flags;
36534 HOST_WIDE_INT isa2 = ix86_isa_flags2;
36535 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
36536 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
36537 /* The general case is we require all the ISAs specified in bisa{,2}
36538 to be enabled.
36539 The exceptions are:
36540 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
36541 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
36542 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
36543 where for each such pair it is sufficient if either of the ISAs is
36544 enabled; and if the pair is ORed with other options, those others must be enabled as well. */
36545 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36546 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36547 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
36548 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
36549 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36550 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36551 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
36552 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
36553 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36554 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36555 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
36556 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
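      /* So, for instance, a builtin whose mask is SSE | 3DNOW_A is accepted
	 when either of those two ISA bits is enabled: isa is widened to
	 cover the whole pair before the subset check below.  */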
36557 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
36559 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
36560 (enum fpmath_unit) 0, false);
36561 if (!opts)
36562 error ("%qE needs unknown isa option", fndecl);
36563 else
36565 gcc_assert (opts != NULL);
36566 error ("%qE needs isa option %s", fndecl, opts);
36567 free (opts);
36569 return expand_call (exp, target, ignore);
36572 switch (fcode)
36574 case IX86_BUILTIN_BNDMK:
36575 if (!target
36576 || GET_MODE (target) != BNDmode
36577 || !register_operand (target, BNDmode))
36578 target = gen_reg_rtx (BNDmode);
36580 arg0 = CALL_EXPR_ARG (exp, 0);
36581 arg1 = CALL_EXPR_ARG (exp, 1);
36583 op0 = expand_normal (arg0);
36584 op1 = expand_normal (arg1);
36586 if (!register_operand (op0, Pmode))
36587 op0 = ix86_zero_extend_to_Pmode (op0);
36588 if (!register_operand (op1, Pmode))
36589 op1 = ix86_zero_extend_to_Pmode (op1);
36591 /* Builtin arg1 is size of block but instruction op1 should
36592 be (size - 1). */
36593 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36594 NULL_RTX, 1, OPTAB_DIRECT);
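      /* For example, if the requested block size is 16, op1 becomes 15 and
	 the bndmk bounds cover [op0, op0 + 15], i.e. exactly 16 bytes.  */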
36596 emit_insn (BNDmode == BND64mode
36597 ? gen_bnd64_mk (target, op0, op1)
36598 : gen_bnd32_mk (target, op0, op1));
36599 return target;
36601 case IX86_BUILTIN_BNDSTX:
36602 arg0 = CALL_EXPR_ARG (exp, 0);
36603 arg1 = CALL_EXPR_ARG (exp, 1);
36604 arg2 = CALL_EXPR_ARG (exp, 2);
36606 op0 = expand_normal (arg0);
36607 op1 = expand_normal (arg1);
36608 op2 = expand_normal (arg2);
36610 if (!register_operand (op0, Pmode))
36611 op0 = ix86_zero_extend_to_Pmode (op0);
36612 if (!register_operand (op1, BNDmode))
36613 op1 = copy_to_mode_reg (BNDmode, op1);
36614 if (!register_operand (op2, Pmode))
36615 op2 = ix86_zero_extend_to_Pmode (op2);
36617 emit_insn (BNDmode == BND64mode
36618 ? gen_bnd64_stx (op2, op0, op1)
36619 : gen_bnd32_stx (op2, op0, op1));
36620 return 0;
36622 case IX86_BUILTIN_BNDLDX:
36623 if (!target
36624 || GET_MODE (target) != BNDmode
36625 || !register_operand (target, BNDmode))
36626 target = gen_reg_rtx (BNDmode);
36628 arg0 = CALL_EXPR_ARG (exp, 0);
36629 arg1 = CALL_EXPR_ARG (exp, 1);
36631 op0 = expand_normal (arg0);
36632 op1 = expand_normal (arg1);
36634 if (!register_operand (op0, Pmode))
36635 op0 = ix86_zero_extend_to_Pmode (op0);
36636 if (!register_operand (op1, Pmode))
36637 op1 = ix86_zero_extend_to_Pmode (op1);
36639 emit_insn (BNDmode == BND64mode
36640 ? gen_bnd64_ldx (target, op0, op1)
36641 : gen_bnd32_ldx (target, op0, op1));
36642 return target;
36644 case IX86_BUILTIN_BNDCL:
36645 arg0 = CALL_EXPR_ARG (exp, 0);
36646 arg1 = CALL_EXPR_ARG (exp, 1);
36648 op0 = expand_normal (arg0);
36649 op1 = expand_normal (arg1);
36651 if (!register_operand (op0, Pmode))
36652 op0 = ix86_zero_extend_to_Pmode (op0);
36653 if (!register_operand (op1, BNDmode))
36654 op1 = copy_to_mode_reg (BNDmode, op1);
36656 emit_insn (BNDmode == BND64mode
36657 ? gen_bnd64_cl (op1, op0)
36658 : gen_bnd32_cl (op1, op0));
36659 return 0;
36661 case IX86_BUILTIN_BNDCU:
36662 arg0 = CALL_EXPR_ARG (exp, 0);
36663 arg1 = CALL_EXPR_ARG (exp, 1);
36665 op0 = expand_normal (arg0);
36666 op1 = expand_normal (arg1);
36668 if (!register_operand (op0, Pmode))
36669 op0 = ix86_zero_extend_to_Pmode (op0);
36670 if (!register_operand (op1, BNDmode))
36671 op1 = copy_to_mode_reg (BNDmode, op1);
36673 emit_insn (BNDmode == BND64mode
36674 ? gen_bnd64_cu (op1, op0)
36675 : gen_bnd32_cu (op1, op0));
36676 return 0;
36678 case IX86_BUILTIN_BNDRET:
36679 arg0 = CALL_EXPR_ARG (exp, 0);
36680 target = chkp_get_rtl_bounds (arg0);
36682 /* If no bounds were specified for the returned value,
36683 then use INIT bounds. This usually happens when
36684 some built-in function is expanded. */
36685 if (!target)
36687 rtx t1 = gen_reg_rtx (Pmode);
36688 rtx t2 = gen_reg_rtx (Pmode);
36689 target = gen_reg_rtx (BNDmode);
36690 emit_move_insn (t1, const0_rtx);
36691 emit_move_insn (t2, constm1_rtx);
36692 emit_insn (BNDmode == BND64mode
36693 ? gen_bnd64_mk (target, t1, t2)
36694 : gen_bnd32_mk (target, t1, t2));
36697 gcc_assert (target && REG_P (target));
36698 return target;
36700 case IX86_BUILTIN_BNDNARROW:
36702 rtx m1, m1h1, m1h2, lb, ub, t1;
36704 /* Return value and lb. */
36705 arg0 = CALL_EXPR_ARG (exp, 0);
36706 /* Bounds. */
36707 arg1 = CALL_EXPR_ARG (exp, 1);
36708 /* Size. */
36709 arg2 = CALL_EXPR_ARG (exp, 2);
36711 lb = expand_normal (arg0);
36712 op1 = expand_normal (arg1);
36713 op2 = expand_normal (arg2);
36715 /* Size was passed but we need to use (size - 1) as for bndmk. */
36716 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36717 NULL_RTX, 1, OPTAB_DIRECT);
36719 /* Add LB to size and inverse to get UB. */
36720 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36721 op2, 1, OPTAB_DIRECT);
36722 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
36724 if (!register_operand (lb, Pmode))
36725 lb = ix86_zero_extend_to_Pmode (lb);
36726 if (!register_operand (ub, Pmode))
36727 ub = ix86_zero_extend_to_Pmode (ub);
36729 /* We need to move bounds to memory before any computations. */
36730 if (MEM_P (op1))
36731 m1 = op1;
36732 else
36734 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36735 emit_move_insn (m1, op1);
36738 /* Generate mem expression to be used for access to LB and UB. */
36739 m1h1 = adjust_address (m1, Pmode, 0);
36740 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36742 t1 = gen_reg_rtx (Pmode);
36744 /* Compute LB. */
36745 emit_move_insn (t1, m1h1);
36746 ix86_emit_move_max (t1, lb);
36747 emit_move_insn (m1h1, t1);
36749 /* Compute UB. UB is stored in 1's complement form. Therefore
36750 we also use max here. */
36751 emit_move_insn (t1, m1h2);
36752 ix86_emit_move_max (t1, ub);
36753 emit_move_insn (m1h2, t1);
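	/* Taking the unsigned max of the stored (inverted) values selects
	   the smaller real upper bound, so narrowing can only shrink the
	   range, never extend it.  */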
36755 op2 = gen_reg_rtx (BNDmode);
36756 emit_move_insn (op2, m1);
36758 return chkp_join_splitted_slot (lb, op2);
36761 case IX86_BUILTIN_BNDINT:
36763 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36765 if (!target
36766 || GET_MODE (target) != BNDmode
36767 || !register_operand (target, BNDmode))
36768 target = gen_reg_rtx (BNDmode);
36770 arg0 = CALL_EXPR_ARG (exp, 0);
36771 arg1 = CALL_EXPR_ARG (exp, 1);
36773 op0 = expand_normal (arg0);
36774 op1 = expand_normal (arg1);
36776 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
36777 rh1 = adjust_address (res, Pmode, 0);
36778 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
36780 /* Put first bounds to temporaries. */
36781 lb1 = gen_reg_rtx (Pmode);
36782 ub1 = gen_reg_rtx (Pmode);
36783 if (MEM_P (op0))
36785 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
36786 emit_move_insn (ub1, adjust_address (op0, Pmode,
36787 GET_MODE_SIZE (Pmode)));
36789 else
36791 emit_move_insn (res, op0);
36792 emit_move_insn (lb1, rh1);
36793 emit_move_insn (ub1, rh2);
36796 /* Put second bounds to temporaries. */
36797 lb2 = gen_reg_rtx (Pmode);
36798 ub2 = gen_reg_rtx (Pmode);
36799 if (MEM_P (op1))
36801 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
36802 emit_move_insn (ub2, adjust_address (op1, Pmode,
36803 GET_MODE_SIZE (Pmode)));
36805 else
36807 emit_move_insn (res, op1);
36808 emit_move_insn (lb2, rh1);
36809 emit_move_insn (ub2, rh2);
36812 /* Compute LB. */
36813 ix86_emit_move_max (lb1, lb2);
36814 emit_move_insn (rh1, lb1);
36816 /* Compute UB. UB is stored in 1's complement form. Therefore
36817 we also use max here. */
36818 ix86_emit_move_max (ub1, ub2);
36819 emit_move_insn (rh2, ub1);
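	/* The result is therefore (max (lb1, lb2), min (ub1, ub2)), i.e. the
	   intersection of the two input bounds.  */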
36821 emit_move_insn (target, res);
36823 return target;
36826 case IX86_BUILTIN_SIZEOF:
36828 tree name;
36829 rtx symbol;
36831 if (!target
36832 || GET_MODE (target) != Pmode
36833 || !register_operand (target, Pmode))
36834 target = gen_reg_rtx (Pmode);
36836 arg0 = CALL_EXPR_ARG (exp, 0);
36837 gcc_assert (VAR_P (arg0));
36839 name = DECL_ASSEMBLER_NAME (arg0);
36840 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
36842 emit_insn (Pmode == SImode
36843 ? gen_move_size_reloc_si (target, symbol)
36844 : gen_move_size_reloc_di (target, symbol));
36846 return target;
36849 case IX86_BUILTIN_BNDLOWER:
36851 rtx mem, hmem;
36853 if (!target
36854 || GET_MODE (target) != Pmode
36855 || !register_operand (target, Pmode))
36856 target = gen_reg_rtx (Pmode);
36858 arg0 = CALL_EXPR_ARG (exp, 0);
36859 op0 = expand_normal (arg0);
36861 /* We need to move bounds to memory first. */
36862 if (MEM_P (op0))
36863 mem = op0;
36864 else
36866 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36867 emit_move_insn (mem, op0);
36870 /* Generate mem expression to access LB and load it. */
36871 hmem = adjust_address (mem, Pmode, 0);
36872 emit_move_insn (target, hmem);
36874 return target;
36877 case IX86_BUILTIN_BNDUPPER:
36879 rtx mem, hmem, res;
36881 if (!target
36882 || GET_MODE (target) != Pmode
36883 || !register_operand (target, Pmode))
36884 target = gen_reg_rtx (Pmode);
36886 arg0 = CALL_EXPR_ARG (exp, 0);
36887 op0 = expand_normal (arg0);
36889 /* We need to move bounds to memory first. */
36890 if (MEM_P (op0))
36891 mem = op0;
36892 else
36894 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36895 emit_move_insn (mem, op0);
36898 /* Generate mem expression to access UB. */
36899 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
36902 /* We need to invert all bits of UB. */
36902 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
36904 if (res != target)
36905 emit_move_insn (target, res);
36907 return target;
36910 case IX86_BUILTIN_MASKMOVQ:
36911 case IX86_BUILTIN_MASKMOVDQU:
36912 icode = (fcode == IX86_BUILTIN_MASKMOVQ
36913 ? CODE_FOR_mmx_maskmovq
36914 : CODE_FOR_sse2_maskmovdqu);
36915 /* Note the arg order is different from the operand order. */
36916 arg1 = CALL_EXPR_ARG (exp, 0);
36917 arg2 = CALL_EXPR_ARG (exp, 1);
36918 arg0 = CALL_EXPR_ARG (exp, 2);
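      /* E.g. _mm_maskmove_si64 (data, mask, addr) passes the address as the
	 third argument, but the insn wants it first (as the destination
	 mem), hence arg0 is CALL_EXPR_ARG (exp, 2) above.  */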
36919 op0 = expand_normal (arg0);
36920 op1 = expand_normal (arg1);
36921 op2 = expand_normal (arg2);
36922 mode0 = insn_data[icode].operand[0].mode;
36923 mode1 = insn_data[icode].operand[1].mode;
36924 mode2 = insn_data[icode].operand[2].mode;
36926 op0 = ix86_zero_extend_to_Pmode (op0);
36927 op0 = gen_rtx_MEM (mode1, op0);
36929 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36930 op0 = copy_to_mode_reg (mode0, op0);
36931 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36932 op1 = copy_to_mode_reg (mode1, op1);
36933 if (!insn_data[icode].operand[2].predicate (op2, mode2))
36934 op2 = copy_to_mode_reg (mode2, op2);
36935 pat = GEN_FCN (icode) (op0, op1, op2);
36936 if (! pat)
36937 return 0;
36938 emit_insn (pat);
36939 return 0;
36941 case IX86_BUILTIN_LDMXCSR:
36942 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
36943 target = assign_386_stack_local (SImode, SLOT_TEMP);
36944 emit_move_insn (target, op0);
36945 emit_insn (gen_sse_ldmxcsr (target));
36946 return 0;
36948 case IX86_BUILTIN_STMXCSR:
36949 target = assign_386_stack_local (SImode, SLOT_TEMP);
36950 emit_insn (gen_sse_stmxcsr (target));
36951 return copy_to_mode_reg (SImode, target);
36953 case IX86_BUILTIN_CLFLUSH:
36954 arg0 = CALL_EXPR_ARG (exp, 0);
36955 op0 = expand_normal (arg0);
36956 icode = CODE_FOR_sse2_clflush;
36957 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36958 op0 = ix86_zero_extend_to_Pmode (op0);
36960 emit_insn (gen_sse2_clflush (op0));
36961 return 0;
36963 case IX86_BUILTIN_CLWB:
36964 arg0 = CALL_EXPR_ARG (exp, 0);
36965 op0 = expand_normal (arg0);
36966 icode = CODE_FOR_clwb;
36967 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36968 op0 = ix86_zero_extend_to_Pmode (op0);
36970 emit_insn (gen_clwb (op0));
36971 return 0;
36973 case IX86_BUILTIN_CLFLUSHOPT:
36974 arg0 = CALL_EXPR_ARG (exp, 0);
36975 op0 = expand_normal (arg0);
36976 icode = CODE_FOR_clflushopt;
36977 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36978 op0 = ix86_zero_extend_to_Pmode (op0);
36980 emit_insn (gen_clflushopt (op0));
36981 return 0;
36983 case IX86_BUILTIN_MONITOR:
36984 case IX86_BUILTIN_MONITORX:
36985 arg0 = CALL_EXPR_ARG (exp, 0);
36986 arg1 = CALL_EXPR_ARG (exp, 1);
36987 arg2 = CALL_EXPR_ARG (exp, 2);
36988 op0 = expand_normal (arg0);
36989 op1 = expand_normal (arg1);
36990 op2 = expand_normal (arg2);
36991 if (!REG_P (op0))
36992 op0 = ix86_zero_extend_to_Pmode (op0);
36993 if (!REG_P (op1))
36994 op1 = copy_to_mode_reg (SImode, op1);
36995 if (!REG_P (op2))
36996 op2 = copy_to_mode_reg (SImode, op2);
36998 emit_insn (fcode == IX86_BUILTIN_MONITOR
36999 ? ix86_gen_monitor (op0, op1, op2)
37000 : ix86_gen_monitorx (op0, op1, op2));
37001 return 0;
37003 case IX86_BUILTIN_MWAIT:
37004 arg0 = CALL_EXPR_ARG (exp, 0);
37005 arg1 = CALL_EXPR_ARG (exp, 1);
37006 op0 = expand_normal (arg0);
37007 op1 = expand_normal (arg1);
37008 if (!REG_P (op0))
37009 op0 = copy_to_mode_reg (SImode, op0);
37010 if (!REG_P (op1))
37011 op1 = copy_to_mode_reg (SImode, op1);
37012 emit_insn (gen_sse3_mwait (op0, op1));
37013 return 0;
37015 case IX86_BUILTIN_MWAITX:
37016 arg0 = CALL_EXPR_ARG (exp, 0);
37017 arg1 = CALL_EXPR_ARG (exp, 1);
37018 arg2 = CALL_EXPR_ARG (exp, 2);
37019 op0 = expand_normal (arg0);
37020 op1 = expand_normal (arg1);
37021 op2 = expand_normal (arg2);
37022 if (!REG_P (op0))
37023 op0 = copy_to_mode_reg (SImode, op0);
37024 if (!REG_P (op1))
37025 op1 = copy_to_mode_reg (SImode, op1);
37026 if (!REG_P (op2))
37027 op2 = copy_to_mode_reg (SImode, op2);
37028 emit_insn (gen_mwaitx (op0, op1, op2));
37029 return 0;
37031 case IX86_BUILTIN_CLZERO:
37032 arg0 = CALL_EXPR_ARG (exp, 0);
37033 op0 = expand_normal (arg0);
37034 if (!REG_P (op0))
37035 op0 = ix86_zero_extend_to_Pmode (op0);
37036 emit_insn (ix86_gen_clzero (op0));
37037 return 0;
37039 case IX86_BUILTIN_VEC_INIT_V2SI:
37040 case IX86_BUILTIN_VEC_INIT_V4HI:
37041 case IX86_BUILTIN_VEC_INIT_V8QI:
37042 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
37044 case IX86_BUILTIN_VEC_EXT_V2DF:
37045 case IX86_BUILTIN_VEC_EXT_V2DI:
37046 case IX86_BUILTIN_VEC_EXT_V4SF:
37047 case IX86_BUILTIN_VEC_EXT_V4SI:
37048 case IX86_BUILTIN_VEC_EXT_V8HI:
37049 case IX86_BUILTIN_VEC_EXT_V2SI:
37050 case IX86_BUILTIN_VEC_EXT_V4HI:
37051 case IX86_BUILTIN_VEC_EXT_V16QI:
37052 return ix86_expand_vec_ext_builtin (exp, target);
37054 case IX86_BUILTIN_VEC_SET_V2DI:
37055 case IX86_BUILTIN_VEC_SET_V4SF:
37056 case IX86_BUILTIN_VEC_SET_V4SI:
37057 case IX86_BUILTIN_VEC_SET_V8HI:
37058 case IX86_BUILTIN_VEC_SET_V4HI:
37059 case IX86_BUILTIN_VEC_SET_V16QI:
37060 return ix86_expand_vec_set_builtin (exp);
37062 case IX86_BUILTIN_NANQ:
37063 case IX86_BUILTIN_NANSQ:
37064 return expand_call (exp, target, ignore);
37066 case IX86_BUILTIN_RDPID:
37068 op0 = gen_reg_rtx (TARGET_64BIT ? DImode : SImode);
37070 if (TARGET_64BIT)
37072 insn = gen_rdpid_rex64 (op0);
37073 op0 = convert_to_mode (SImode, op0, 1);
37075 else
37076 insn = gen_rdpid (op0);
37077 emit_insn (insn);
37079 if (target == 0)
37081 /* mode is VOIDmode if __builtin_rdpid has been called
37082 without lhs. */
37083 if (mode == VOIDmode)
37084 return target;
37085 target = gen_reg_rtx (mode);
37087 emit_move_insn (target, op0);
37088 return target;
37089 case IX86_BUILTIN_RDPMC:
37090 case IX86_BUILTIN_RDTSC:
37091 case IX86_BUILTIN_RDTSCP:
37092 case IX86_BUILTIN_XGETBV:
37094 op0 = gen_reg_rtx (DImode);
37095 op1 = gen_reg_rtx (DImode);
37097 if (fcode == IX86_BUILTIN_RDPMC)
37099 arg0 = CALL_EXPR_ARG (exp, 0);
37100 op2 = expand_normal (arg0);
37101 if (!register_operand (op2, SImode))
37102 op2 = copy_to_mode_reg (SImode, op2);
37104 insn = (TARGET_64BIT
37105 ? gen_rdpmc_rex64 (op0, op1, op2)
37106 : gen_rdpmc (op0, op2));
37107 emit_insn (insn);
37109 else if (fcode == IX86_BUILTIN_XGETBV)
37111 arg0 = CALL_EXPR_ARG (exp, 0);
37112 op2 = expand_normal (arg0);
37113 if (!register_operand (op2, SImode))
37114 op2 = copy_to_mode_reg (SImode, op2);
37116 insn = (TARGET_64BIT
37117 ? gen_xgetbv_rex64 (op0, op1, op2)
37118 : gen_xgetbv (op0, op2));
37119 emit_insn (insn);
37121 else if (fcode == IX86_BUILTIN_RDTSC)
37123 insn = (TARGET_64BIT
37124 ? gen_rdtsc_rex64 (op0, op1)
37125 : gen_rdtsc (op0));
37126 emit_insn (insn);
37128 else
37130 op2 = gen_reg_rtx (SImode);
37132 insn = (TARGET_64BIT
37133 ? gen_rdtscp_rex64 (op0, op1, op2)
37134 : gen_rdtscp (op0, op2));
37135 emit_insn (insn);
37137 arg0 = CALL_EXPR_ARG (exp, 0);
37138 op4 = expand_normal (arg0);
37139 if (!address_operand (op4, VOIDmode))
37141 op4 = convert_memory_address (Pmode, op4);
37142 op4 = copy_addr_to_reg (op4);
37144 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
37147 if (target == 0)
37149 /* mode is VOIDmode if __builtin_rd* has been called
37150 without lhs. */
37151 if (mode == VOIDmode)
37152 return target;
37153 target = gen_reg_rtx (mode);
37156 if (TARGET_64BIT)
37158 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
37159 op1, 1, OPTAB_DIRECT);
37160 op0 = expand_simple_binop (DImode, IOR, op0, op1,
37161 op0, 1, OPTAB_DIRECT);
37164 emit_move_insn (target, op0);
37165 return target;
37167 case IX86_BUILTIN_FXSAVE:
37168 case IX86_BUILTIN_FXRSTOR:
37169 case IX86_BUILTIN_FXSAVE64:
37170 case IX86_BUILTIN_FXRSTOR64:
37171 case IX86_BUILTIN_FNSTENV:
37172 case IX86_BUILTIN_FLDENV:
37173 mode0 = BLKmode;
37174 switch (fcode)
37176 case IX86_BUILTIN_FXSAVE:
37177 icode = CODE_FOR_fxsave;
37178 break;
37179 case IX86_BUILTIN_FXRSTOR:
37180 icode = CODE_FOR_fxrstor;
37181 break;
37182 case IX86_BUILTIN_FXSAVE64:
37183 icode = CODE_FOR_fxsave64;
37184 break;
37185 case IX86_BUILTIN_FXRSTOR64:
37186 icode = CODE_FOR_fxrstor64;
37187 break;
37188 case IX86_BUILTIN_FNSTENV:
37189 icode = CODE_FOR_fnstenv;
37190 break;
37191 case IX86_BUILTIN_FLDENV:
37192 icode = CODE_FOR_fldenv;
37193 break;
37194 default:
37195 gcc_unreachable ();
37198 arg0 = CALL_EXPR_ARG (exp, 0);
37199 op0 = expand_normal (arg0);
37201 if (!address_operand (op0, VOIDmode))
37203 op0 = convert_memory_address (Pmode, op0);
37204 op0 = copy_addr_to_reg (op0);
37206 op0 = gen_rtx_MEM (mode0, op0);
37208 pat = GEN_FCN (icode) (op0);
37209 if (pat)
37210 emit_insn (pat);
37211 return 0;
37213 case IX86_BUILTIN_XSETBV:
37214 arg0 = CALL_EXPR_ARG (exp, 0);
37215 arg1 = CALL_EXPR_ARG (exp, 1);
37216 op0 = expand_normal (arg0);
37217 op1 = expand_normal (arg1);
37219 if (!REG_P (op0))
37220 op0 = copy_to_mode_reg (SImode, op0);
37222 if (TARGET_64BIT)
37224 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37225 NULL, 1, OPTAB_DIRECT);
37227 op2 = gen_lowpart (SImode, op2);
37228 op1 = gen_lowpart (SImode, op1);
37229 if (!REG_P (op1))
37230 op1 = copy_to_mode_reg (SImode, op1);
37231 if (!REG_P (op2))
37232 op2 = copy_to_mode_reg (SImode, op2);
37233 icode = CODE_FOR_xsetbv_rex64;
37234 pat = GEN_FCN (icode) (op0, op1, op2);
37236 else
37238 if (!REG_P (op1))
37239 op1 = copy_to_mode_reg (DImode, op1);
37240 icode = CODE_FOR_xsetbv;
37241 pat = GEN_FCN (icode) (op0, op1);
37243 if (pat)
37244 emit_insn (pat);
37245 return 0;
37247 case IX86_BUILTIN_XSAVE:
37248 case IX86_BUILTIN_XRSTOR:
37249 case IX86_BUILTIN_XSAVE64:
37250 case IX86_BUILTIN_XRSTOR64:
37251 case IX86_BUILTIN_XSAVEOPT:
37252 case IX86_BUILTIN_XSAVEOPT64:
37253 case IX86_BUILTIN_XSAVES:
37254 case IX86_BUILTIN_XRSTORS:
37255 case IX86_BUILTIN_XSAVES64:
37256 case IX86_BUILTIN_XRSTORS64:
37257 case IX86_BUILTIN_XSAVEC:
37258 case IX86_BUILTIN_XSAVEC64:
37259 arg0 = CALL_EXPR_ARG (exp, 0);
37260 arg1 = CALL_EXPR_ARG (exp, 1);
37261 op0 = expand_normal (arg0);
37262 op1 = expand_normal (arg1);
37264 if (!address_operand (op0, VOIDmode))
37266 op0 = convert_memory_address (Pmode, op0);
37267 op0 = copy_addr_to_reg (op0);
37269 op0 = gen_rtx_MEM (BLKmode, op0);
37271 op1 = force_reg (DImode, op1);
37273 if (TARGET_64BIT)
37275 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37276 NULL, 1, OPTAB_DIRECT);
37277 switch (fcode)
37279 case IX86_BUILTIN_XSAVE:
37280 icode = CODE_FOR_xsave_rex64;
37281 break;
37282 case IX86_BUILTIN_XRSTOR:
37283 icode = CODE_FOR_xrstor_rex64;
37284 break;
37285 case IX86_BUILTIN_XSAVE64:
37286 icode = CODE_FOR_xsave64;
37287 break;
37288 case IX86_BUILTIN_XRSTOR64:
37289 icode = CODE_FOR_xrstor64;
37290 break;
37291 case IX86_BUILTIN_XSAVEOPT:
37292 icode = CODE_FOR_xsaveopt_rex64;
37293 break;
37294 case IX86_BUILTIN_XSAVEOPT64:
37295 icode = CODE_FOR_xsaveopt64;
37296 break;
37297 case IX86_BUILTIN_XSAVES:
37298 icode = CODE_FOR_xsaves_rex64;
37299 break;
37300 case IX86_BUILTIN_XRSTORS:
37301 icode = CODE_FOR_xrstors_rex64;
37302 break;
37303 case IX86_BUILTIN_XSAVES64:
37304 icode = CODE_FOR_xsaves64;
37305 break;
37306 case IX86_BUILTIN_XRSTORS64:
37307 icode = CODE_FOR_xrstors64;
37308 break;
37309 case IX86_BUILTIN_XSAVEC:
37310 icode = CODE_FOR_xsavec_rex64;
37311 break;
37312 case IX86_BUILTIN_XSAVEC64:
37313 icode = CODE_FOR_xsavec64;
37314 break;
37315 default:
37316 gcc_unreachable ();
37319 op2 = gen_lowpart (SImode, op2);
37320 op1 = gen_lowpart (SImode, op1);
37321 pat = GEN_FCN (icode) (op0, op1, op2);
37323 else
37325 switch (fcode)
37327 case IX86_BUILTIN_XSAVE:
37328 icode = CODE_FOR_xsave;
37329 break;
37330 case IX86_BUILTIN_XRSTOR:
37331 icode = CODE_FOR_xrstor;
37332 break;
37333 case IX86_BUILTIN_XSAVEOPT:
37334 icode = CODE_FOR_xsaveopt;
37335 break;
37336 case IX86_BUILTIN_XSAVES:
37337 icode = CODE_FOR_xsaves;
37338 break;
37339 case IX86_BUILTIN_XRSTORS:
37340 icode = CODE_FOR_xrstors;
37341 break;
37342 case IX86_BUILTIN_XSAVEC:
37343 icode = CODE_FOR_xsavec;
37344 break;
37345 default:
37346 gcc_unreachable ();
37348 pat = GEN_FCN (icode) (op0, op1);
37351 if (pat)
37352 emit_insn (pat);
37353 return 0;
37355 case IX86_BUILTIN_LLWPCB:
37356 arg0 = CALL_EXPR_ARG (exp, 0);
37357 op0 = expand_normal (arg0);
37358 icode = CODE_FOR_lwp_llwpcb;
37359 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37360 op0 = ix86_zero_extend_to_Pmode (op0);
37361 emit_insn (gen_lwp_llwpcb (op0));
37362 return 0;
37364 case IX86_BUILTIN_SLWPCB:
37365 icode = CODE_FOR_lwp_slwpcb;
37366 if (!target
37367 || !insn_data[icode].operand[0].predicate (target, Pmode))
37368 target = gen_reg_rtx (Pmode);
37369 emit_insn (gen_lwp_slwpcb (target));
37370 return target;
37372 case IX86_BUILTIN_BEXTRI32:
37373 case IX86_BUILTIN_BEXTRI64:
37374 arg0 = CALL_EXPR_ARG (exp, 0);
37375 arg1 = CALL_EXPR_ARG (exp, 1);
37376 op0 = expand_normal (arg0);
37377 op1 = expand_normal (arg1);
37378 icode = (fcode == IX86_BUILTIN_BEXTRI32
37379 ? CODE_FOR_tbm_bextri_si
37380 : CODE_FOR_tbm_bextri_di);
37381 if (!CONST_INT_P (op1))
37383 error ("last argument must be an immediate");
37384 return const0_rtx;
37386 else
37388 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
37389 unsigned char lsb_index = INTVAL (op1) & 0xFF;
37390 op1 = GEN_INT (length);
37391 op2 = GEN_INT (lsb_index);
37392 pat = GEN_FCN (icode) (target, op0, op1, op2);
37393 if (pat)
37394 emit_insn (pat);
37395 return target;
37398 case IX86_BUILTIN_RDRAND16_STEP:
37399 icode = CODE_FOR_rdrandhi_1;
37400 mode0 = HImode;
37401 goto rdrand_step;
37403 case IX86_BUILTIN_RDRAND32_STEP:
37404 icode = CODE_FOR_rdrandsi_1;
37405 mode0 = SImode;
37406 goto rdrand_step;
37408 case IX86_BUILTIN_RDRAND64_STEP:
37409 icode = CODE_FOR_rdranddi_1;
37410 mode0 = DImode;
37412 rdrand_step:
37413 arg0 = CALL_EXPR_ARG (exp, 0);
37414 op1 = expand_normal (arg0);
37415 if (!address_operand (op1, VOIDmode))
37417 op1 = convert_memory_address (Pmode, op1);
37418 op1 = copy_addr_to_reg (op1);
37421 op0 = gen_reg_rtx (mode0);
37422 emit_insn (GEN_FCN (icode) (op0));
37424 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37426 op1 = gen_reg_rtx (SImode);
37427 emit_move_insn (op1, CONST1_RTX (SImode));
37429 /* Emit SImode conditional move. */
37430 if (mode0 == HImode)
37432 if (TARGET_ZERO_EXTEND_WITH_AND
37433 && optimize_function_for_speed_p (cfun))
37435 op2 = force_reg (SImode, const0_rtx);
37437 emit_insn (gen_movstricthi
37438 (gen_lowpart (HImode, op2), op0));
37440 else
37442 op2 = gen_reg_rtx (SImode);
37444 emit_insn (gen_zero_extendhisi2 (op2, op0));
37447 else if (mode0 == SImode)
37448 op2 = op0;
37449 else
37450 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37452 if (target == 0
37453 || !register_operand (target, SImode))
37454 target = gen_reg_rtx (SImode);
37456 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37457 const0_rtx);
37458 emit_insn (gen_rtx_SET (target,
37459 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
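      /* The rdrand insn sets CF when a random value was delivered; on
	 failure CF is clear and the destination is left zero, so the
	 IF_THEN_ELSE above evaluates to 1 on success and 0 on failure,
	 which is the value the *_step builtins are expected to return.  */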
37460 return target;
37462 case IX86_BUILTIN_RDSEED16_STEP:
37463 icode = CODE_FOR_rdseedhi_1;
37464 mode0 = HImode;
37465 goto rdseed_step;
37467 case IX86_BUILTIN_RDSEED32_STEP:
37468 icode = CODE_FOR_rdseedsi_1;
37469 mode0 = SImode;
37470 goto rdseed_step;
37472 case IX86_BUILTIN_RDSEED64_STEP:
37473 icode = CODE_FOR_rdseeddi_1;
37474 mode0 = DImode;
37476 rdseed_step:
37477 arg0 = CALL_EXPR_ARG (exp, 0);
37478 op1 = expand_normal (arg0);
37479 if (!address_operand (op1, VOIDmode))
37481 op1 = convert_memory_address (Pmode, op1);
37482 op1 = copy_addr_to_reg (op1);
37485 op0 = gen_reg_rtx (mode0);
37486 emit_insn (GEN_FCN (icode) (op0));
37488 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37490 op2 = gen_reg_rtx (QImode);
37492 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37493 const0_rtx);
37494 emit_insn (gen_rtx_SET (op2, pat));
37496 if (target == 0
37497 || !register_operand (target, SImode))
37498 target = gen_reg_rtx (SImode);
37500 emit_insn (gen_zero_extendqisi2 (target, op2));
37501 return target;
37503 case IX86_BUILTIN_SBB32:
37504 icode = CODE_FOR_subborrowsi;
37505 icode2 = CODE_FOR_subborrowsi_0;
37506 mode0 = SImode;
37507 mode1 = DImode;
37508 mode2 = CCmode;
37509 goto handlecarry;
37511 case IX86_BUILTIN_SBB64:
37512 icode = CODE_FOR_subborrowdi;
37513 icode2 = CODE_FOR_subborrowdi_0;
37514 mode0 = DImode;
37515 mode1 = TImode;
37516 mode2 = CCmode;
37517 goto handlecarry;
37519 case IX86_BUILTIN_ADDCARRYX32:
37520 icode = CODE_FOR_addcarrysi;
37521 icode2 = CODE_FOR_addcarrysi_0;
37522 mode0 = SImode;
37523 mode1 = DImode;
37524 mode2 = CCCmode;
37525 goto handlecarry;
37527 case IX86_BUILTIN_ADDCARRYX64:
37528 icode = CODE_FOR_addcarrydi;
37529 icode2 = CODE_FOR_addcarrydi_0;
37530 mode0 = DImode;
37531 mode1 = TImode;
37532 mode2 = CCCmode;
37534 handlecarry:
37535 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37536 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37537 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37538 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
37540 op1 = expand_normal (arg0);
37541 if (!integer_zerop (arg0))
37542 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37544 op2 = expand_normal (arg1);
37545 if (!register_operand (op2, mode0))
37546 op2 = copy_to_mode_reg (mode0, op2);
37548 op3 = expand_normal (arg2);
37549 if (!register_operand (op3, mode0))
37550 op3 = copy_to_mode_reg (mode0, op3);
37552 op4 = expand_normal (arg3);
37553 if (!address_operand (op4, VOIDmode))
37555 op4 = convert_memory_address (Pmode, op4);
37556 op4 = copy_addr_to_reg (op4);
37559 op0 = gen_reg_rtx (mode0);
37560 if (integer_zerop (arg0))
37562 /* If arg0 is 0, optimize right away into an add or sub
37563 instruction that sets the CCCmode flags. */
37564 op1 = gen_rtx_REG (mode2, FLAGS_REG);
37565 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
37567 else
37569 /* Generate CF from the input operand: op1 + (-1) sets CF iff the carry-in is nonzero. */
37570 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
37572 /* Generate instruction that consumes CF. */
37573 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37574 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
37575 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
37576 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
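      /* This is the expansion behind the _addcarry_u32/_addcarry_u64 and
	 _subborrow_u32/_subborrow_u64 style intrinsics: the carry-in is
	 turned into CF, a single adc/sbb-style pattern consumes it, the
	 sum is stored through *sum_out below, and the carry-out becomes
	 the return value.  */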
37579 /* Return current CF value. */
37580 if (target == 0)
37581 target = gen_reg_rtx (QImode);
37583 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
37584 emit_insn (gen_rtx_SET (target, pat));
37586 /* Store the result. */
37587 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37589 return target;
37591 case IX86_BUILTIN_READ_FLAGS:
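      /* The readeflags builtin: push the flags register and pop it straight
	 into the result (a plain pushf; pop sequence); WRITE_FLAGS below
	 does the reverse.  */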
37592 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37594 if (optimize
37595 || target == NULL_RTX
37596 || !nonimmediate_operand (target, word_mode)
37597 || GET_MODE (target) != word_mode)
37598 target = gen_reg_rtx (word_mode);
37600 emit_insn (gen_pop (target));
37601 return target;
37603 case IX86_BUILTIN_WRITE_FLAGS:
37605 arg0 = CALL_EXPR_ARG (exp, 0);
37606 op0 = expand_normal (arg0);
37607 if (!general_no_elim_operand (op0, word_mode))
37608 op0 = copy_to_mode_reg (word_mode, op0);
37610 emit_insn (gen_push (op0));
37611 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37612 return 0;
37614 case IX86_BUILTIN_KTESTC8:
37615 icode = CODE_FOR_ktestqi;
37616 mode3 = CCCmode;
37617 goto kortest;
37619 case IX86_BUILTIN_KTESTZ8:
37620 icode = CODE_FOR_ktestqi;
37621 mode3 = CCZmode;
37622 goto kortest;
37624 case IX86_BUILTIN_KTESTC16:
37625 icode = CODE_FOR_ktesthi;
37626 mode3 = CCCmode;
37627 goto kortest;
37629 case IX86_BUILTIN_KTESTZ16:
37630 icode = CODE_FOR_ktesthi;
37631 mode3 = CCZmode;
37632 goto kortest;
37634 case IX86_BUILTIN_KTESTC32:
37635 icode = CODE_FOR_ktestsi;
37636 mode3 = CCCmode;
37637 goto kortest;
37639 case IX86_BUILTIN_KTESTZ32:
37640 icode = CODE_FOR_ktestsi;
37641 mode3 = CCZmode;
37642 goto kortest;
37644 case IX86_BUILTIN_KTESTC64:
37645 icode = CODE_FOR_ktestdi;
37646 mode3 = CCCmode;
37647 goto kortest;
37649 case IX86_BUILTIN_KTESTZ64:
37650 icode = CODE_FOR_ktestdi;
37651 mode3 = CCZmode;
37652 goto kortest;
37654 case IX86_BUILTIN_KORTESTC8:
37655 icode = CODE_FOR_kortestqi;
37656 mode3 = CCCmode;
37657 goto kortest;
37659 case IX86_BUILTIN_KORTESTZ8:
37660 icode = CODE_FOR_kortestqi;
37661 mode3 = CCZmode;
37662 goto kortest;
37664 case IX86_BUILTIN_KORTESTC16:
37665 icode = CODE_FOR_kortesthi;
37666 mode3 = CCCmode;
37667 goto kortest;
37669 case IX86_BUILTIN_KORTESTZ16:
37670 icode = CODE_FOR_kortesthi;
37671 mode3 = CCZmode;
37672 goto kortest;
37674 case IX86_BUILTIN_KORTESTC32:
37675 icode = CODE_FOR_kortestsi;
37676 mode3 = CCCmode;
37677 goto kortest;
37679 case IX86_BUILTIN_KORTESTZ32:
37680 icode = CODE_FOR_kortestsi;
37681 mode3 = CCZmode;
37682 goto kortest;
37684 case IX86_BUILTIN_KORTESTC64:
37685 icode = CODE_FOR_kortestdi;
37686 mode3 = CCCmode;
37687 goto kortest;
37689 case IX86_BUILTIN_KORTESTZ64:
37690 icode = CODE_FOR_kortestdi;
37691 mode3 = CCZmode;
37693 kortest:
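      /* Common tail for the ktest/kortest builtins: emit the mask test and
	 materialize the result from the flags with setcc.  For the *Z
	 variants mode3 is CCZmode, so the setcc tests ZF; kortestz, for
	 instance, yields 1 exactly when the OR of the two masks is all
	 zeros.  */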
37694 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37695 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37696 op0 = expand_normal (arg0);
37697 op1 = expand_normal (arg1);
37699 mode0 = insn_data[icode].operand[0].mode;
37700 mode1 = insn_data[icode].operand[1].mode;
37702 if (GET_MODE (op0) != VOIDmode)
37703 op0 = force_reg (GET_MODE (op0), op0);
37705 op0 = gen_lowpart (mode0, op0);
37707 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37708 op0 = copy_to_mode_reg (mode0, op0);
37710 if (GET_MODE (op1) != VOIDmode)
37711 op1 = force_reg (GET_MODE (op1), op1);
37713 op1 = gen_lowpart (mode1, op1);
37715 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37716 op1 = copy_to_mode_reg (mode1, op1);
37718 target = gen_reg_rtx (QImode);
37720 /* Emit kortest. */
37721 emit_insn (GEN_FCN (icode) (op0, op1));
37722 /* And use setcc to return result from flags. */
37723 ix86_expand_setcc (target, EQ,
37724 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
37725 return target;
37727 case IX86_BUILTIN_GATHERSIV2DF:
37728 icode = CODE_FOR_avx2_gathersiv2df;
37729 goto gather_gen;
37730 case IX86_BUILTIN_GATHERSIV4DF:
37731 icode = CODE_FOR_avx2_gathersiv4df;
37732 goto gather_gen;
37733 case IX86_BUILTIN_GATHERDIV2DF:
37734 icode = CODE_FOR_avx2_gatherdiv2df;
37735 goto gather_gen;
37736 case IX86_BUILTIN_GATHERDIV4DF:
37737 icode = CODE_FOR_avx2_gatherdiv4df;
37738 goto gather_gen;
37739 case IX86_BUILTIN_GATHERSIV4SF:
37740 icode = CODE_FOR_avx2_gathersiv4sf;
37741 goto gather_gen;
37742 case IX86_BUILTIN_GATHERSIV8SF:
37743 icode = CODE_FOR_avx2_gathersiv8sf;
37744 goto gather_gen;
37745 case IX86_BUILTIN_GATHERDIV4SF:
37746 icode = CODE_FOR_avx2_gatherdiv4sf;
37747 goto gather_gen;
37748 case IX86_BUILTIN_GATHERDIV8SF:
37749 icode = CODE_FOR_avx2_gatherdiv8sf;
37750 goto gather_gen;
37751 case IX86_BUILTIN_GATHERSIV2DI:
37752 icode = CODE_FOR_avx2_gathersiv2di;
37753 goto gather_gen;
37754 case IX86_BUILTIN_GATHERSIV4DI:
37755 icode = CODE_FOR_avx2_gathersiv4di;
37756 goto gather_gen;
37757 case IX86_BUILTIN_GATHERDIV2DI:
37758 icode = CODE_FOR_avx2_gatherdiv2di;
37759 goto gather_gen;
37760 case IX86_BUILTIN_GATHERDIV4DI:
37761 icode = CODE_FOR_avx2_gatherdiv4di;
37762 goto gather_gen;
37763 case IX86_BUILTIN_GATHERSIV4SI:
37764 icode = CODE_FOR_avx2_gathersiv4si;
37765 goto gather_gen;
37766 case IX86_BUILTIN_GATHERSIV8SI:
37767 icode = CODE_FOR_avx2_gathersiv8si;
37768 goto gather_gen;
37769 case IX86_BUILTIN_GATHERDIV4SI:
37770 icode = CODE_FOR_avx2_gatherdiv4si;
37771 goto gather_gen;
37772 case IX86_BUILTIN_GATHERDIV8SI:
37773 icode = CODE_FOR_avx2_gatherdiv8si;
37774 goto gather_gen;
37775 case IX86_BUILTIN_GATHERALTSIV4DF:
37776 icode = CODE_FOR_avx2_gathersiv4df;
37777 goto gather_gen;
37778 case IX86_BUILTIN_GATHERALTDIV8SF:
37779 icode = CODE_FOR_avx2_gatherdiv8sf;
37780 goto gather_gen;
37781 case IX86_BUILTIN_GATHERALTSIV4DI:
37782 icode = CODE_FOR_avx2_gathersiv4di;
37783 goto gather_gen;
37784 case IX86_BUILTIN_GATHERALTDIV8SI:
37785 icode = CODE_FOR_avx2_gatherdiv8si;
37786 goto gather_gen;
37787 case IX86_BUILTIN_GATHER3SIV16SF:
37788 icode = CODE_FOR_avx512f_gathersiv16sf;
37789 goto gather_gen;
37790 case IX86_BUILTIN_GATHER3SIV8DF:
37791 icode = CODE_FOR_avx512f_gathersiv8df;
37792 goto gather_gen;
37793 case IX86_BUILTIN_GATHER3DIV16SF:
37794 icode = CODE_FOR_avx512f_gatherdiv16sf;
37795 goto gather_gen;
37796 case IX86_BUILTIN_GATHER3DIV8DF:
37797 icode = CODE_FOR_avx512f_gatherdiv8df;
37798 goto gather_gen;
37799 case IX86_BUILTIN_GATHER3SIV16SI:
37800 icode = CODE_FOR_avx512f_gathersiv16si;
37801 goto gather_gen;
37802 case IX86_BUILTIN_GATHER3SIV8DI:
37803 icode = CODE_FOR_avx512f_gathersiv8di;
37804 goto gather_gen;
37805 case IX86_BUILTIN_GATHER3DIV16SI:
37806 icode = CODE_FOR_avx512f_gatherdiv16si;
37807 goto gather_gen;
37808 case IX86_BUILTIN_GATHER3DIV8DI:
37809 icode = CODE_FOR_avx512f_gatherdiv8di;
37810 goto gather_gen;
37811 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37812 icode = CODE_FOR_avx512f_gathersiv8df;
37813 goto gather_gen;
37814 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37815 icode = CODE_FOR_avx512f_gatherdiv16sf;
37816 goto gather_gen;
37817 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37818 icode = CODE_FOR_avx512f_gathersiv8di;
37819 goto gather_gen;
37820 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37821 icode = CODE_FOR_avx512f_gatherdiv16si;
37822 goto gather_gen;
37823 case IX86_BUILTIN_GATHER3SIV2DF:
37824 icode = CODE_FOR_avx512vl_gathersiv2df;
37825 goto gather_gen;
37826 case IX86_BUILTIN_GATHER3SIV4DF:
37827 icode = CODE_FOR_avx512vl_gathersiv4df;
37828 goto gather_gen;
37829 case IX86_BUILTIN_GATHER3DIV2DF:
37830 icode = CODE_FOR_avx512vl_gatherdiv2df;
37831 goto gather_gen;
37832 case IX86_BUILTIN_GATHER3DIV4DF:
37833 icode = CODE_FOR_avx512vl_gatherdiv4df;
37834 goto gather_gen;
37835 case IX86_BUILTIN_GATHER3SIV4SF:
37836 icode = CODE_FOR_avx512vl_gathersiv4sf;
37837 goto gather_gen;
37838 case IX86_BUILTIN_GATHER3SIV8SF:
37839 icode = CODE_FOR_avx512vl_gathersiv8sf;
37840 goto gather_gen;
37841 case IX86_BUILTIN_GATHER3DIV4SF:
37842 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37843 goto gather_gen;
37844 case IX86_BUILTIN_GATHER3DIV8SF:
37845 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37846 goto gather_gen;
37847 case IX86_BUILTIN_GATHER3SIV2DI:
37848 icode = CODE_FOR_avx512vl_gathersiv2di;
37849 goto gather_gen;
37850 case IX86_BUILTIN_GATHER3SIV4DI:
37851 icode = CODE_FOR_avx512vl_gathersiv4di;
37852 goto gather_gen;
37853 case IX86_BUILTIN_GATHER3DIV2DI:
37854 icode = CODE_FOR_avx512vl_gatherdiv2di;
37855 goto gather_gen;
37856 case IX86_BUILTIN_GATHER3DIV4DI:
37857 icode = CODE_FOR_avx512vl_gatherdiv4di;
37858 goto gather_gen;
37859 case IX86_BUILTIN_GATHER3SIV4SI:
37860 icode = CODE_FOR_avx512vl_gathersiv4si;
37861 goto gather_gen;
37862 case IX86_BUILTIN_GATHER3SIV8SI:
37863 icode = CODE_FOR_avx512vl_gathersiv8si;
37864 goto gather_gen;
37865 case IX86_BUILTIN_GATHER3DIV4SI:
37866 icode = CODE_FOR_avx512vl_gatherdiv4si;
37867 goto gather_gen;
37868 case IX86_BUILTIN_GATHER3DIV8SI:
37869 icode = CODE_FOR_avx512vl_gatherdiv8si;
37870 goto gather_gen;
37871 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37872 icode = CODE_FOR_avx512vl_gathersiv4df;
37873 goto gather_gen;
37874 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37875 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37876 goto gather_gen;
37877 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37878 icode = CODE_FOR_avx512vl_gathersiv4di;
37879 goto gather_gen;
37880 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37881 icode = CODE_FOR_avx512vl_gatherdiv8si;
37882 goto gather_gen;
37883 case IX86_BUILTIN_SCATTERSIV16SF:
37884 icode = CODE_FOR_avx512f_scattersiv16sf;
37885 goto scatter_gen;
37886 case IX86_BUILTIN_SCATTERSIV8DF:
37887 icode = CODE_FOR_avx512f_scattersiv8df;
37888 goto scatter_gen;
37889 case IX86_BUILTIN_SCATTERDIV16SF:
37890 icode = CODE_FOR_avx512f_scatterdiv16sf;
37891 goto scatter_gen;
37892 case IX86_BUILTIN_SCATTERDIV8DF:
37893 icode = CODE_FOR_avx512f_scatterdiv8df;
37894 goto scatter_gen;
37895 case IX86_BUILTIN_SCATTERSIV16SI:
37896 icode = CODE_FOR_avx512f_scattersiv16si;
37897 goto scatter_gen;
37898 case IX86_BUILTIN_SCATTERSIV8DI:
37899 icode = CODE_FOR_avx512f_scattersiv8di;
37900 goto scatter_gen;
37901 case IX86_BUILTIN_SCATTERDIV16SI:
37902 icode = CODE_FOR_avx512f_scatterdiv16si;
37903 goto scatter_gen;
37904 case IX86_BUILTIN_SCATTERDIV8DI:
37905 icode = CODE_FOR_avx512f_scatterdiv8di;
37906 goto scatter_gen;
37907 case IX86_BUILTIN_SCATTERSIV8SF:
37908 icode = CODE_FOR_avx512vl_scattersiv8sf;
37909 goto scatter_gen;
37910 case IX86_BUILTIN_SCATTERSIV4SF:
37911 icode = CODE_FOR_avx512vl_scattersiv4sf;
37912 goto scatter_gen;
37913 case IX86_BUILTIN_SCATTERSIV4DF:
37914 icode = CODE_FOR_avx512vl_scattersiv4df;
37915 goto scatter_gen;
37916 case IX86_BUILTIN_SCATTERSIV2DF:
37917 icode = CODE_FOR_avx512vl_scattersiv2df;
37918 goto scatter_gen;
37919 case IX86_BUILTIN_SCATTERDIV8SF:
37920 icode = CODE_FOR_avx512vl_scatterdiv8sf;
37921 goto scatter_gen;
37922 case IX86_BUILTIN_SCATTERDIV4SF:
37923 icode = CODE_FOR_avx512vl_scatterdiv4sf;
37924 goto scatter_gen;
37925 case IX86_BUILTIN_SCATTERDIV4DF:
37926 icode = CODE_FOR_avx512vl_scatterdiv4df;
37927 goto scatter_gen;
37928 case IX86_BUILTIN_SCATTERDIV2DF:
37929 icode = CODE_FOR_avx512vl_scatterdiv2df;
37930 goto scatter_gen;
37931 case IX86_BUILTIN_SCATTERSIV8SI:
37932 icode = CODE_FOR_avx512vl_scattersiv8si;
37933 goto scatter_gen;
37934 case IX86_BUILTIN_SCATTERSIV4SI:
37935 icode = CODE_FOR_avx512vl_scattersiv4si;
37936 goto scatter_gen;
37937 case IX86_BUILTIN_SCATTERSIV4DI:
37938 icode = CODE_FOR_avx512vl_scattersiv4di;
37939 goto scatter_gen;
37940 case IX86_BUILTIN_SCATTERSIV2DI:
37941 icode = CODE_FOR_avx512vl_scattersiv2di;
37942 goto scatter_gen;
37943 case IX86_BUILTIN_SCATTERDIV8SI:
37944 icode = CODE_FOR_avx512vl_scatterdiv8si;
37945 goto scatter_gen;
37946 case IX86_BUILTIN_SCATTERDIV4SI:
37947 icode = CODE_FOR_avx512vl_scatterdiv4si;
37948 goto scatter_gen;
37949 case IX86_BUILTIN_SCATTERDIV4DI:
37950 icode = CODE_FOR_avx512vl_scatterdiv4di;
37951 goto scatter_gen;
37952 case IX86_BUILTIN_SCATTERDIV2DI:
37953 icode = CODE_FOR_avx512vl_scatterdiv2di;
37954 goto scatter_gen;
37955 case IX86_BUILTIN_GATHERPFDPD:
37956 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
37957 goto vec_prefetch_gen;
37958 case IX86_BUILTIN_SCATTERALTSIV8DF:
37959 icode = CODE_FOR_avx512f_scattersiv8df;
37960 goto scatter_gen;
37961 case IX86_BUILTIN_SCATTERALTDIV16SF:
37962 icode = CODE_FOR_avx512f_scatterdiv16sf;
37963 goto scatter_gen;
37964 case IX86_BUILTIN_SCATTERALTSIV8DI:
37965 icode = CODE_FOR_avx512f_scattersiv8di;
37966 goto scatter_gen;
37967 case IX86_BUILTIN_SCATTERALTDIV16SI:
37968 icode = CODE_FOR_avx512f_scatterdiv16si;
37969 goto scatter_gen;
37970 case IX86_BUILTIN_GATHERPFDPS:
37971 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
37972 goto vec_prefetch_gen;
37973 case IX86_BUILTIN_GATHERPFQPD:
37974 icode = CODE_FOR_avx512pf_gatherpfv8didf;
37975 goto vec_prefetch_gen;
37976 case IX86_BUILTIN_GATHERPFQPS:
37977 icode = CODE_FOR_avx512pf_gatherpfv8disf;
37978 goto vec_prefetch_gen;
37979 case IX86_BUILTIN_SCATTERPFDPD:
37980 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
37981 goto vec_prefetch_gen;
37982 case IX86_BUILTIN_SCATTERPFDPS:
37983 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
37984 goto vec_prefetch_gen;
37985 case IX86_BUILTIN_SCATTERPFQPD:
37986 icode = CODE_FOR_avx512pf_scatterpfv8didf;
37987 goto vec_prefetch_gen;
37988 case IX86_BUILTIN_SCATTERPFQPS:
37989 icode = CODE_FOR_avx512pf_scatterpfv8disf;
37990 goto vec_prefetch_gen;
37992 gather_gen:
37993 rtx half;
37994 rtx (*gen) (rtx, rtx);
37996 arg0 = CALL_EXPR_ARG (exp, 0);
37997 arg1 = CALL_EXPR_ARG (exp, 1);
37998 arg2 = CALL_EXPR_ARG (exp, 2);
37999 arg3 = CALL_EXPR_ARG (exp, 3);
38000 arg4 = CALL_EXPR_ARG (exp, 4);
38001 op0 = expand_normal (arg0);
38002 op1 = expand_normal (arg1);
38003 op2 = expand_normal (arg2);
38004 op3 = expand_normal (arg3);
38005 op4 = expand_normal (arg4);
38006 /* Note the arg order is different from the operand order. */
38007 mode0 = insn_data[icode].operand[1].mode;
38008 mode2 = insn_data[icode].operand[3].mode;
38009 mode3 = insn_data[icode].operand[4].mode;
38010 mode4 = insn_data[icode].operand[5].mode;
38012 if (target == NULL_RTX
38013 || GET_MODE (target) != insn_data[icode].operand[0].mode
38014 || !insn_data[icode].operand[0].predicate (target,
38015 GET_MODE (target)))
38016 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
38017 else
38018 subtarget = target;
38020 switch (fcode)
38022 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38023 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38024 half = gen_reg_rtx (V8SImode);
38025 if (!nonimmediate_operand (op2, V16SImode))
38026 op2 = copy_to_mode_reg (V16SImode, op2);
38027 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38028 op2 = half;
38029 break;
38030 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38031 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38032 case IX86_BUILTIN_GATHERALTSIV4DF:
38033 case IX86_BUILTIN_GATHERALTSIV4DI:
38034 half = gen_reg_rtx (V4SImode);
38035 if (!nonimmediate_operand (op2, V8SImode))
38036 op2 = copy_to_mode_reg (V8SImode, op2);
38037 emit_insn (gen_vec_extract_lo_v8si (half, op2));
38038 op2 = half;
38039 break;
38040 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38041 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38042 half = gen_reg_rtx (mode0);
38043 if (mode0 == V8SFmode)
38044 gen = gen_vec_extract_lo_v16sf;
38045 else
38046 gen = gen_vec_extract_lo_v16si;
38047 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38048 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38049 emit_insn (gen (half, op0));
38050 op0 = half;
38051 if (GET_MODE (op3) != VOIDmode)
38053 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38054 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38055 emit_insn (gen (half, op3));
38056 op3 = half;
38058 break;
38059 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38060 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38061 case IX86_BUILTIN_GATHERALTDIV8SF:
38062 case IX86_BUILTIN_GATHERALTDIV8SI:
38063 half = gen_reg_rtx (mode0);
38064 if (mode0 == V4SFmode)
38065 gen = gen_vec_extract_lo_v8sf;
38066 else
38067 gen = gen_vec_extract_lo_v8si;
38068 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38069 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38070 emit_insn (gen (half, op0));
38071 op0 = half;
38072 if (GET_MODE (op3) != VOIDmode)
38074 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38075 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38076 emit_insn (gen (half, op3));
38077 op3 = half;
38079 break;
38080 default:
38081 break;
38084 /* Force the memory operand to use only a base register here. But we
38085 don't want to do that for the memory operands of other builtin
38086 functions. */
38087 op1 = ix86_zero_extend_to_Pmode (op1);
38089 if (!insn_data[icode].operand[1].predicate (op0, mode0))
38090 op0 = copy_to_mode_reg (mode0, op0);
38091 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
38092 op1 = copy_to_mode_reg (Pmode, op1);
38093 if (!insn_data[icode].operand[3].predicate (op2, mode2))
38094 op2 = copy_to_mode_reg (mode2, op2);
38096 op3 = fixup_modeless_constant (op3, mode3);
38098 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
38100 if (!insn_data[icode].operand[4].predicate (op3, mode3))
38101 op3 = copy_to_mode_reg (mode3, op3);
38103 else
38105 op3 = copy_to_reg (op3);
38106 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
38108 if (!insn_data[icode].operand[5].predicate (op4, mode4))
38110 error ("the last argument must be scale 1, 2, 4, 8");
38111 return const0_rtx;
38114 /* Optimize. If mask is known to have all high bits set,
38115 replace op0 with pc_rtx to signal that the instruction
38116 overwrites the whole destination and doesn't use its
38117 previous contents. */
38118 if (optimize)
38120 if (TREE_CODE (arg3) == INTEGER_CST)
38122 if (integer_all_onesp (arg3))
38123 op0 = pc_rtx;
38125 else if (TREE_CODE (arg3) == VECTOR_CST)
38127 unsigned int negative = 0;
38128 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
38130 tree cst = VECTOR_CST_ELT (arg3, i);
38131 if (TREE_CODE (cst) == INTEGER_CST
38132 && tree_int_cst_sign_bit (cst))
38133 negative++;
38134 else if (TREE_CODE (cst) == REAL_CST
38135 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
38136 negative++;
38138 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
38139 op0 = pc_rtx;
38141 else if (TREE_CODE (arg3) == SSA_NAME
38142 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
38144 /* Recognize also when mask is like:
38145 __v2df src = _mm_setzero_pd ();
38146 __v2df mask = _mm_cmpeq_pd (src, src);
38148 __v8sf src = _mm256_setzero_ps ();
38149 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
38150 as that is a cheaper way to load all ones into
38151 a register than having to load a constant from
38152 memory. */
38153 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
38154 if (is_gimple_call (def_stmt))
38156 tree fndecl = gimple_call_fndecl (def_stmt);
38157 if (fndecl
38158 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
38159 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
38161 case IX86_BUILTIN_CMPPD:
38162 case IX86_BUILTIN_CMPPS:
38163 case IX86_BUILTIN_CMPPD256:
38164 case IX86_BUILTIN_CMPPS256:
38165 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
38166 break;
38167 /* FALLTHRU */
38168 case IX86_BUILTIN_CMPEQPD:
38169 case IX86_BUILTIN_CMPEQPS:
38170 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
38171 && initializer_zerop (gimple_call_arg (def_stmt,
38172 1)))
38173 op0 = pc_rtx;
38174 break;
38175 default:
38176 break;
38182 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
38183 if (! pat)
38184 return const0_rtx;
38185 emit_insn (pat);
38187 switch (fcode)
38189 case IX86_BUILTIN_GATHER3DIV16SF:
38190 if (target == NULL_RTX)
38191 target = gen_reg_rtx (V8SFmode);
38192 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
38193 break;
38194 case IX86_BUILTIN_GATHER3DIV16SI:
38195 if (target == NULL_RTX)
38196 target = gen_reg_rtx (V8SImode);
38197 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
38198 break;
38199 case IX86_BUILTIN_GATHER3DIV8SF:
38200 case IX86_BUILTIN_GATHERDIV8SF:
38201 if (target == NULL_RTX)
38202 target = gen_reg_rtx (V4SFmode);
38203 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
38204 break;
38205 case IX86_BUILTIN_GATHER3DIV8SI:
38206 case IX86_BUILTIN_GATHERDIV8SI:
38207 if (target == NULL_RTX)
38208 target = gen_reg_rtx (V4SImode);
38209 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
38210 break;
38211 default:
38212 target = subtarget;
38213 break;
38215 return target;
38217 scatter_gen:
38218 arg0 = CALL_EXPR_ARG (exp, 0);
38219 arg1 = CALL_EXPR_ARG (exp, 1);
38220 arg2 = CALL_EXPR_ARG (exp, 2);
38221 arg3 = CALL_EXPR_ARG (exp, 3);
38222 arg4 = CALL_EXPR_ARG (exp, 4);
38223 op0 = expand_normal (arg0);
38224 op1 = expand_normal (arg1);
38225 op2 = expand_normal (arg2);
38226 op3 = expand_normal (arg3);
38227 op4 = expand_normal (arg4);
38228 mode1 = insn_data[icode].operand[1].mode;
38229 mode2 = insn_data[icode].operand[2].mode;
38230 mode3 = insn_data[icode].operand[3].mode;
38231 mode4 = insn_data[icode].operand[4].mode;
38233 /* Scatter instruction stores operand op3 to memory with
38234 indices from op2 and scale from op4 under writemask op1.
38235 If index operand op2 has more elements than source operand
38236 op3, one needs to use only its low half. And vice versa. */
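   /* Illustrative sketch (not from the original comment): for
      IX86_BUILTIN_SCATTERALTSIV8DF the index operand arrives as a
      V16SI vector while the source operand is V8DF, so only the low
      V8SI half of the index carries meaningful lanes:

	 index  idx:  16 x SI, lanes 0..7 used
	 source src:   8 x DF
	 half = vec_extract_lo (idx);   keep lanes 0..7

      The switch below performs exactly this narrowing before the
      scatter pattern is emitted.  */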
38237 switch (fcode)
38239 case IX86_BUILTIN_SCATTERALTSIV8DF:
38240 case IX86_BUILTIN_SCATTERALTSIV8DI:
38241 half = gen_reg_rtx (V8SImode);
38242 if (!nonimmediate_operand (op2, V16SImode))
38243 op2 = copy_to_mode_reg (V16SImode, op2);
38244 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38245 op2 = half;
38246 break;
38247 case IX86_BUILTIN_SCATTERALTDIV16SF:
38248 case IX86_BUILTIN_SCATTERALTDIV16SI:
38249 half = gen_reg_rtx (mode3);
38250 if (mode3 == V8SFmode)
38251 gen = gen_vec_extract_lo_v16sf;
38252 else
38253 gen = gen_vec_extract_lo_v16si;
38254 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38255 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38256 emit_insn (gen (half, op3));
38257 op3 = half;
38258 break;
38259 default:
38260 break;
38263 /* Force the memory operand to use only a base register here. But we
38264 don't want to do that for the memory operands of other builtin
38265 functions. */
38266 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
38268 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38269 op0 = copy_to_mode_reg (Pmode, op0);
38271 op1 = fixup_modeless_constant (op1, mode1);
38273 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
38275 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38276 op1 = copy_to_mode_reg (mode1, op1);
38278 else
38280 op1 = copy_to_reg (op1);
38281 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
38284 if (!insn_data[icode].operand[2].predicate (op2, mode2))
38285 op2 = copy_to_mode_reg (mode2, op2);
38287 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38288 op3 = copy_to_mode_reg (mode3, op3);
38290 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38292 error ("the last argument must be scale 1, 2, 4, 8");
38293 return const0_rtx;
38296 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38297 if (! pat)
38298 return const0_rtx;
38300 emit_insn (pat);
38301 return 0;
38303 vec_prefetch_gen:
38304 arg0 = CALL_EXPR_ARG (exp, 0);
38305 arg1 = CALL_EXPR_ARG (exp, 1);
38306 arg2 = CALL_EXPR_ARG (exp, 2);
38307 arg3 = CALL_EXPR_ARG (exp, 3);
38308 arg4 = CALL_EXPR_ARG (exp, 4);
38309 op0 = expand_normal (arg0);
38310 op1 = expand_normal (arg1);
38311 op2 = expand_normal (arg2);
38312 op3 = expand_normal (arg3);
38313 op4 = expand_normal (arg4);
38314 mode0 = insn_data[icode].operand[0].mode;
38315 mode1 = insn_data[icode].operand[1].mode;
38316 mode3 = insn_data[icode].operand[3].mode;
38317 mode4 = insn_data[icode].operand[4].mode;
38319 op0 = fixup_modeless_constant (op0, mode0);
38321 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
38323 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38324 op0 = copy_to_mode_reg (mode0, op0);
38326 else
38328 op0 = copy_to_reg (op0);
38329 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
38332 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38333 op1 = copy_to_mode_reg (mode1, op1);
38335 /* Force the memory operand to use only a base register here. But we
38336 don't want to do that for the memory operands of other builtin
38337 functions. */
38338 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
38340 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
38341 op2 = copy_to_mode_reg (Pmode, op2);
38343 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38345 error ("the fourth argument must be scale 1, 2, 4, 8");
38346 return const0_rtx;
38349 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38351 error ("incorrect hint operand");
38352 return const0_rtx;
38355 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38356 if (! pat)
38357 return const0_rtx;
38359 emit_insn (pat);
38361 return 0;
38363 case IX86_BUILTIN_XABORT:
38364 icode = CODE_FOR_xabort;
38365 arg0 = CALL_EXPR_ARG (exp, 0);
38366 op0 = expand_normal (arg0);
38367 mode0 = insn_data[icode].operand[0].mode;
38368 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38370 error ("the xabort's argument must be an 8-bit immediate");
38371 return const0_rtx;
38373 emit_insn (gen_xabort (op0));
38374 return 0;
38376 case IX86_BUILTIN_RSTORSSP:
38377 case IX86_BUILTIN_CLRSSBSY:
38378 arg0 = CALL_EXPR_ARG (exp, 0);
38379 op0 = expand_normal (arg0);
38380 icode = (fcode == IX86_BUILTIN_RSTORSSP
38381 ? CODE_FOR_rstorssp
38382 : CODE_FOR_clrssbsy);
38383 if (!address_operand (op0, VOIDmode))
38385 op1 = convert_memory_address (Pmode, op0);
38386 op0 = copy_addr_to_reg (op1);
38388 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
38389 return 0;
38391 case IX86_BUILTIN_WRSSD:
38392 case IX86_BUILTIN_WRSSQ:
38393 case IX86_BUILTIN_WRUSSD:
38394 case IX86_BUILTIN_WRUSSQ:
38395 arg0 = CALL_EXPR_ARG (exp, 0);
38396 op0 = expand_normal (arg0);
38397 arg1 = CALL_EXPR_ARG (exp, 1);
38398 op1 = expand_normal (arg1);
38399 switch (fcode)
38401 case IX86_BUILTIN_WRSSD:
38402 icode = CODE_FOR_wrsssi;
38403 mode = SImode;
38404 break;
38405 case IX86_BUILTIN_WRSSQ:
38406 icode = CODE_FOR_wrssdi;
38407 mode = DImode;
38408 break;
38409 case IX86_BUILTIN_WRUSSD:
38410 icode = CODE_FOR_wrusssi;
38411 mode = SImode;
38412 break;
38413 case IX86_BUILTIN_WRUSSQ:
38414 icode = CODE_FOR_wrussdi;
38415 mode = DImode;
38416 break;
38418 op0 = force_reg (mode, op0);
38419 if (!address_operand (op1, VOIDmode))
38421 op2 = convert_memory_address (Pmode, op1);
38422 op1 = copy_addr_to_reg (op2);
38424 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
38425 return 0;
38427 default:
38428 break;
38431 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
38432 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
38434 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
38435 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
38436 target);
38439 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST
38440 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST)
38442 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST;
38443 return ix86_expand_special_args_builtin (bdesc_special_args2 + i, exp,
38444 target);
38447 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
38448 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
38450 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
38451 switch (fcode)
38453 case IX86_BUILTIN_FABSQ:
38454 case IX86_BUILTIN_COPYSIGNQ:
38455 if (!TARGET_SSE)
38456 /* Emit a normal call if SSE isn't available. */
38457 return expand_call (exp, target, ignore);
38458 /* FALLTHRU */
38459 default:
38460 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
38464 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
38465 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
38467 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
38468 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
38469 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
38470 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
38471 int masked = 1;
38472 machine_mode mode, wide_mode, nar_mode;
38474 nar_mode = V4SFmode;
38475 mode = V16SFmode;
38476 wide_mode = V64SFmode;
38477 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
38478 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
38480 switch (fcode)
38482 case IX86_BUILTIN_4FMAPS:
38483 fcn = gen_avx5124fmaddps_4fmaddps;
38484 masked = 0;
38485 goto v4fma_expand;
38487 case IX86_BUILTIN_4DPWSSD:
38488 nar_mode = V4SImode;
38489 mode = V16SImode;
38490 wide_mode = V64SImode;
38491 fcn = gen_avx5124vnniw_vp4dpwssd;
38492 masked = 0;
38493 goto v4fma_expand;
38495 case IX86_BUILTIN_4DPWSSDS:
38496 nar_mode = V4SImode;
38497 mode = V16SImode;
38498 wide_mode = V64SImode;
38499 fcn = gen_avx5124vnniw_vp4dpwssds;
38500 masked = 0;
38501 goto v4fma_expand;
38503 case IX86_BUILTIN_4FNMAPS:
38504 fcn = gen_avx5124fmaddps_4fnmaddps;
38505 masked = 0;
38506 goto v4fma_expand;
38508 case IX86_BUILTIN_4FNMAPS_MASK:
38509 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
38510 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
38511 goto v4fma_expand;
38513 case IX86_BUILTIN_4DPWSSD_MASK:
38514 nar_mode = V4SImode;
38515 mode = V16SImode;
38516 wide_mode = V64SImode;
38517 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
38518 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
38519 goto v4fma_expand;
38521 case IX86_BUILTIN_4DPWSSDS_MASK:
38522 nar_mode = V4SImode;
38523 mode = V16SImode;
38524 wide_mode = V64SImode;
38525 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
38526 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
38527 goto v4fma_expand;
38529 case IX86_BUILTIN_4FMAPS_MASK:
38531 tree args[4];
38532 rtx ops[4];
38533 rtx wide_reg;
38534 rtx accum;
38535 rtx addr;
38536 rtx mem;
38538 v4fma_expand:
38539 wide_reg = gen_reg_rtx (wide_mode);
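      /* Pack the four narrow source vectors into one wide (4x) register:
	 operand I is moved into WIDE_REG at byte offset I * 64, so the
	 V16SF/V16SI chunks are laid out back to back.  */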
38540 for (i = 0; i < 4; i++)
38542 args[i] = CALL_EXPR_ARG (exp, i);
38543 ops[i] = expand_normal (args[i]);
38545 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
38546 ops[i]);
38549 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38550 accum = force_reg (mode, accum);
38552 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38553 addr = force_reg (Pmode, addr);
38555 mem = gen_rtx_MEM (nar_mode, addr);
38557 target = gen_reg_rtx (mode);
38559 emit_move_insn (target, accum);
38561 if (! masked)
38562 emit_insn (fcn (target, accum, wide_reg, mem));
38563 else
38565 rtx merge, mask;
38566 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38568 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38570 if (CONST_INT_P (mask))
38571 mask = fixup_modeless_constant (mask, HImode);
38573 mask = force_reg (HImode, mask);
38575 if (GET_MODE (mask) != HImode)
38576 mask = gen_rtx_SUBREG (HImode, mask, 0);
38578 /* If merge is 0 then we're about to emit z-masked variant. */
38579 if (const0_operand (merge, mode))
38580 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38581 /* If merge is the same as accum then emit merge-masked variant. */
38582 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38584 merge = force_reg (mode, merge);
38585 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38588 /* Merging with something unknown might happen if we z-mask with -O0. */
38588 else
38590 target = gen_reg_rtx (mode);
38591 emit_move_insn (target, merge);
38592 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38595 return target;
38598 case IX86_BUILTIN_4FNMASS:
38599 fcn = gen_avx5124fmaddps_4fnmaddss;
38600 masked = 0;
38601 goto s4fma_expand;
38603 case IX86_BUILTIN_4FMASS:
38604 fcn = gen_avx5124fmaddps_4fmaddss;
38605 masked = 0;
38606 goto s4fma_expand;
38608 case IX86_BUILTIN_4FNMASS_MASK:
38609 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
38610 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
38611 goto s4fma_expand;
38613 case IX86_BUILTIN_4FMASS_MASK:
38615 tree args[4];
38616 rtx ops[4];
38617 rtx wide_reg;
38618 rtx accum;
38619 rtx addr;
38620 rtx mem;
38622 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
38623 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
38625 s4fma_expand:
38626 mode = V4SFmode;
38627 wide_reg = gen_reg_rtx (V64SFmode);
38628 for (i = 0; i < 4; i++)
38630 rtx tmp;
38631 args[i] = CALL_EXPR_ARG (exp, i);
38632 ops[i] = expand_normal (args[i]);
38634 tmp = gen_reg_rtx (SFmode);
38635 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
38637 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
38638 gen_rtx_SUBREG (V16SFmode, tmp, 0));
38641 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38642 accum = force_reg (V4SFmode, accum);
38644 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38645 addr = force_reg (Pmode, addr);
38647 mem = gen_rtx_MEM (V4SFmode, addr);
38649 target = gen_reg_rtx (V4SFmode);
38651 emit_move_insn (target, accum);
38653 if (! masked)
38654 emit_insn (fcn (target, accum, wide_reg, mem));
38655 else
38657 rtx merge, mask;
38658 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38660 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38662 if (CONST_INT_P (mask))
38663 mask = fixup_modeless_constant (mask, QImode);
38665 mask = force_reg (QImode, mask);
38667 if (GET_MODE (mask) != QImode)
38668 mask = gen_rtx_SUBREG (QImode, mask, 0);
38670 /* If merge is 0 then we're about to emit z-masked variant. */
38671 if (const0_operand (merge, mode))
38672 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38673 /* If merge is the same as accum then emit merge-masked
38674 variant. */
38675 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38677 merge = force_reg (mode, merge);
38678 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38680 /* Merging with something unknown might happen if we z-mask
38681 with -O0. */
38682 else
38684 target = gen_reg_rtx (mode);
38685 emit_move_insn (target, merge);
38686 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38689 return target;
38691 case IX86_BUILTIN_RDPID:
38692 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
38693 target);
38694 default:
38695 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
38699 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
38700 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
38702 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
38703 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
38706 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
38707 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
38709 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
38710 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
38713 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
38714 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
38716 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
38717 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
38720 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
38721 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
38723 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
38724 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
38727 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
38728 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
38730 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
38731 const struct builtin_description *d = bdesc_multi_arg + i;
38732 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
38733 (enum ix86_builtin_func_type)
38734 d->flag, d->comparison);
38737 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
38738 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
38740 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
38741 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
38742 target);
38745 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
38746 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
38748 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
38749 return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
38750 target);
38753 gcc_unreachable ();
38756 /* This returns the target-specific builtin with code CODE if
38757 current_function_decl has visibility on this builtin, which is checked
38758 using isa flags. Returns NULL_TREE otherwise. */
38760 static tree ix86_get_builtin (enum ix86_builtins code)
38762 struct cl_target_option *opts;
38763 tree target_tree = NULL_TREE;
38765 /* Determine the isa flags of current_function_decl. */
38767 if (current_function_decl)
38768 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
38770 if (target_tree == NULL)
38771 target_tree = target_option_default_node;
38773 opts = TREE_TARGET_OPTION (target_tree);
38775 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
38776 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
38777 return ix86_builtin_decl (code, true);
38778 else
38779 return NULL_TREE;
38782 /* Return the function decl of the target-specific builtin
38783 for a given MPX builtin passed in FCODE. */
38784 static tree
38785 ix86_builtin_mpx_function (unsigned fcode)
38787 switch (fcode)
38789 case BUILT_IN_CHKP_BNDMK:
38790 return ix86_builtins[IX86_BUILTIN_BNDMK];
38792 case BUILT_IN_CHKP_BNDSTX:
38793 return ix86_builtins[IX86_BUILTIN_BNDSTX];
38795 case BUILT_IN_CHKP_BNDLDX:
38796 return ix86_builtins[IX86_BUILTIN_BNDLDX];
38798 case BUILT_IN_CHKP_BNDCL:
38799 return ix86_builtins[IX86_BUILTIN_BNDCL];
38801 case BUILT_IN_CHKP_BNDCU:
38802 return ix86_builtins[IX86_BUILTIN_BNDCU];
38804 case BUILT_IN_CHKP_BNDRET:
38805 return ix86_builtins[IX86_BUILTIN_BNDRET];
38807 case BUILT_IN_CHKP_INTERSECT:
38808 return ix86_builtins[IX86_BUILTIN_BNDINT];
38810 case BUILT_IN_CHKP_NARROW:
38811 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
38813 case BUILT_IN_CHKP_SIZEOF:
38814 return ix86_builtins[IX86_BUILTIN_SIZEOF];
38816 case BUILT_IN_CHKP_EXTRACT_LOWER:
38817 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
38819 case BUILT_IN_CHKP_EXTRACT_UPPER:
38820 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
38822 default:
38823 return NULL_TREE;
38826 gcc_unreachable ();
38829 /* Helper function for ix86_load_bounds and ix86_store_bounds.
38831 Return an address to be used to load/store bounds for pointer
38832 passed in SLOT.
38834 SLOT_NO is an integer constant holding the number of a target
38835 dependent special slot to be used in case SLOT is not a memory.
38837 SPECIAL_BASE is a pointer to be used as a base of fake address
38838 to access special slots in Bounds Table. SPECIAL_BASE[-1],
38839 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
38841 static rtx
38842 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
38844 rtx addr = NULL;
38846 /* NULL slot means we pass bounds for pointer not passed to the
38847 function at all. Register slot means we pass pointer in a
38848 register. In both these cases bounds are passed via Bounds
38849 Table. Since we do not have actual pointer stored in memory,
38850 we have to use fake addresses to access Bounds Table. We
38851 start with (special_base - sizeof (void*)) and decrease this
38852 address by pointer size to get addresses for other slots. */
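      /* A small worked sketch (assuming a 64-bit Pmode, so
	 GET_MODE_SIZE (Pmode) == 8):

	    slot_no 0  ->  special_base - 8
	    slot_no 1  ->  special_base - 16
	    slot_no 2  ->  special_base - 24

	 i.e. the fake slots grow downwards from SPECIAL_BASE.  */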
38853 if (!slot || REG_P (slot))
38855 gcc_assert (CONST_INT_P (slot_no));
38856 addr = plus_constant (Pmode, special_base,
38857 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
38859 /* If pointer is passed in a memory then its address is used to
38860 access Bounds Table. */
38861 else if (MEM_P (slot))
38863 addr = XEXP (slot, 0);
38864 if (!register_operand (addr, Pmode))
38865 addr = copy_addr_to_reg (addr);
38867 else
38868 gcc_unreachable ();
38870 return addr;
38873 /* Expand pass uses this hook to load bounds for function parameter
38874 PTR passed in SLOT in case its bounds are not passed in a register.
38876 If SLOT is a memory, then bounds are loaded as for regular pointer
38877 loaded from memory. PTR may be NULL in case SLOT is a memory.
38878 In such case value of PTR (if required) may be loaded from SLOT.
38880 If SLOT is NULL or a register then SLOT_NO is an integer constant
38881 holding the number of the target-dependent special slot which should be
38882 used to obtain bounds.
38884 Return loaded bounds. */
38886 static rtx
38887 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38889 rtx reg = gen_reg_rtx (BNDmode);
38890 rtx addr;
38892 /* Get address to be used to access Bounds Table. Special slots start
38893 at the location of return address of the current function. */
38894 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38896 /* Load pointer value from a memory if we don't have it. */
38897 if (!ptr)
38899 gcc_assert (MEM_P (slot));
38900 ptr = copy_addr_to_reg (slot);
38903 if (!register_operand (ptr, Pmode))
38904 ptr = ix86_zero_extend_to_Pmode (ptr);
38906 emit_insn (BNDmode == BND64mode
38907 ? gen_bnd64_ldx (reg, addr, ptr)
38908 : gen_bnd32_ldx (reg, addr, ptr));
38910 return reg;
38913 /* Expand pass uses this hook to store BOUNDS for call argument PTR
38914 passed in SLOT in case BOUNDS are not passed in a register.
38916 If SLOT is a memory, then BOUNDS are stored as for regular pointer
38917 stored in memory. PTR may be NULL in case SLOT is a memory.
38918 In such case value of PTR (if required) may be loaded from SLOT.
38920 If SLOT is NULL or a register then SLOT_NO is an integer constant
38921 holding the number of the target-dependent special slot which should be
38922 used to store BOUNDS. */
38924 static void
38925 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38927 rtx addr;
38929 /* Get address to be used to access Bounds Table. Special slots start
38930 at the location of return address of a called function. */
38931 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38933 /* Load pointer value from a memory if we don't have it. */
38934 if (!ptr)
38936 gcc_assert (MEM_P (slot));
38937 ptr = copy_addr_to_reg (slot);
38940 if (!register_operand (ptr, Pmode))
38941 ptr = ix86_zero_extend_to_Pmode (ptr);
38943 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
38944 if (!register_operand (bounds, BNDmode))
38945 bounds = copy_to_mode_reg (BNDmode, bounds);
38947 emit_insn (BNDmode == BND64mode
38948 ? gen_bnd64_stx (addr, ptr, bounds)
38949 : gen_bnd32_stx (addr, ptr, bounds));
38952 /* Load and return bounds returned by function in SLOT. */
38954 static rtx
38955 ix86_load_returned_bounds (rtx slot)
38957 rtx res;
38959 gcc_assert (REG_P (slot));
38960 res = gen_reg_rtx (BNDmode);
38961 emit_move_insn (res, slot);
38963 return res;
38966 /* Store BOUNDS returned by function into SLOT. */
38968 static void
38969 ix86_store_returned_bounds (rtx slot, rtx bounds)
38971 gcc_assert (REG_P (slot));
38972 emit_move_insn (slot, bounds);
38975 /* Returns a function decl for a vectorized version of the combined function
38976 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
38977 if it is not available. */
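/* For instance (an illustrative sketch; the flags and vector width are
   assumptions, not taken from this file): with -mavx -fno-trapping-math,
   a loop such as

     void f (double *a, double *b, int n)
     {
       for (int i = 0; i < n; i++)
	 a[i] = __builtin_floor (b[i]);
     }

   can be vectorized with a V4DF result type, and the switch below then
   maps CFN_FLOOR with out_mode DFmode and out_n 4 to
   IX86_BUILTIN_FLOORPD256.  */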
38979 static tree
38980 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38981 tree type_in)
38983 machine_mode in_mode, out_mode;
38984 int in_n, out_n;
38986 if (TREE_CODE (type_out) != VECTOR_TYPE
38987 || TREE_CODE (type_in) != VECTOR_TYPE)
38988 return NULL_TREE;
38990 out_mode = TYPE_MODE (TREE_TYPE (type_out));
38991 out_n = TYPE_VECTOR_SUBPARTS (type_out);
38992 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38993 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38995 switch (fn)
38997 CASE_CFN_EXP2:
38998 if (out_mode == SFmode && in_mode == SFmode)
39000 if (out_n == 16 && in_n == 16)
39001 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
39003 break;
39005 CASE_CFN_IFLOOR:
39006 CASE_CFN_LFLOOR:
39007 CASE_CFN_LLFLOOR:
39008 /* The round insn does not trap on denormals. */
39009 if (flag_trapping_math || !TARGET_SSE4_1)
39010 break;
39012 if (out_mode == SImode && in_mode == DFmode)
39014 if (out_n == 4 && in_n == 2)
39015 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
39016 else if (out_n == 8 && in_n == 4)
39017 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
39018 else if (out_n == 16 && in_n == 8)
39019 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
39021 if (out_mode == SImode && in_mode == SFmode)
39023 if (out_n == 4 && in_n == 4)
39024 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
39025 else if (out_n == 8 && in_n == 8)
39026 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
39027 else if (out_n == 16 && in_n == 16)
39028 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
39030 break;
39032 CASE_CFN_ICEIL:
39033 CASE_CFN_LCEIL:
39034 CASE_CFN_LLCEIL:
39035 /* The round insn does not trap on denormals. */
39036 if (flag_trapping_math || !TARGET_SSE4_1)
39037 break;
39039 if (out_mode == SImode && in_mode == DFmode)
39041 if (out_n == 4 && in_n == 2)
39042 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
39043 else if (out_n == 8 && in_n == 4)
39044 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
39045 else if (out_n == 16 && in_n == 8)
39046 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
39048 if (out_mode == SImode && in_mode == SFmode)
39050 if (out_n == 4 && in_n == 4)
39051 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
39052 else if (out_n == 8 && in_n == 8)
39053 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
39054 else if (out_n == 16 && in_n == 16)
39055 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
39057 break;
39059 CASE_CFN_IRINT:
39060 CASE_CFN_LRINT:
39061 CASE_CFN_LLRINT:
39062 if (out_mode == SImode && in_mode == DFmode)
39064 if (out_n == 4 && in_n == 2)
39065 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
39066 else if (out_n == 8 && in_n == 4)
39067 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
39068 else if (out_n == 16 && in_n == 8)
39069 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
39071 if (out_mode == SImode && in_mode == SFmode)
39073 if (out_n == 4 && in_n == 4)
39074 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
39075 else if (out_n == 8 && in_n == 8)
39076 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
39077 else if (out_n == 16 && in_n == 16)
39078 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
39080 break;
39082 CASE_CFN_IROUND:
39083 CASE_CFN_LROUND:
39084 CASE_CFN_LLROUND:
39085 /* The round insn does not trap on denormals. */
39086 if (flag_trapping_math || !TARGET_SSE4_1)
39087 break;
39089 if (out_mode == SImode && in_mode == DFmode)
39091 if (out_n == 4 && in_n == 2)
39092 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
39093 else if (out_n == 8 && in_n == 4)
39094 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
39095 else if (out_n == 16 && in_n == 8)
39096 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
39098 if (out_mode == SImode && in_mode == SFmode)
39100 if (out_n == 4 && in_n == 4)
39101 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
39102 else if (out_n == 8 && in_n == 8)
39103 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
39104 else if (out_n == 16 && in_n == 16)
39105 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
39107 break;
39109 CASE_CFN_FLOOR:
39110 /* The round insn does not trap on denormals. */
39111 if (flag_trapping_math || !TARGET_SSE4_1)
39112 break;
39114 if (out_mode == DFmode && in_mode == DFmode)
39116 if (out_n == 2 && in_n == 2)
39117 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
39118 else if (out_n == 4 && in_n == 4)
39119 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
39120 else if (out_n == 8 && in_n == 8)
39121 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
39123 if (out_mode == SFmode && in_mode == SFmode)
39125 if (out_n == 4 && in_n == 4)
39126 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
39127 else if (out_n == 8 && in_n == 8)
39128 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
39129 else if (out_n == 16 && in_n == 16)
39130 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
39132 break;
39134 CASE_CFN_CEIL:
39135 /* The round insn does not trap on denormals. */
39136 if (flag_trapping_math || !TARGET_SSE4_1)
39137 break;
39139 if (out_mode == DFmode && in_mode == DFmode)
39141 if (out_n == 2 && in_n == 2)
39142 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
39143 else if (out_n == 4 && in_n == 4)
39144 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
39145 else if (out_n == 8 && in_n == 8)
39146 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
39148 if (out_mode == SFmode && in_mode == SFmode)
39150 if (out_n == 4 && in_n == 4)
39151 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
39152 else if (out_n == 8 && in_n == 8)
39153 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
39154 else if (out_n == 16 && in_n == 16)
39155 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
39157 break;
39159 CASE_CFN_TRUNC:
39160 /* The round insn does not trap on denormals. */
39161 if (flag_trapping_math || !TARGET_SSE4_1)
39162 break;
39164 if (out_mode == DFmode && in_mode == DFmode)
39166 if (out_n == 2 && in_n == 2)
39167 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
39168 else if (out_n == 4 && in_n == 4)
39169 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
39170 else if (out_n == 8 && in_n == 8)
39171 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
39173 if (out_mode == SFmode && in_mode == SFmode)
39175 if (out_n == 4 && in_n == 4)
39176 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
39177 else if (out_n == 8 && in_n == 8)
39178 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
39179 else if (out_n == 16 && in_n == 16)
39180 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
39182 break;
39184 CASE_CFN_RINT:
39185 /* The round insn does not trap on denormals. */
39186 if (flag_trapping_math || !TARGET_SSE4_1)
39187 break;
39189 if (out_mode == DFmode && in_mode == DFmode)
39191 if (out_n == 2 && in_n == 2)
39192 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
39193 else if (out_n == 4 && in_n == 4)
39194 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
39196 if (out_mode == SFmode && in_mode == SFmode)
39198 if (out_n == 4 && in_n == 4)
39199 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
39200 else if (out_n == 8 && in_n == 8)
39201 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
39203 break;
39205 CASE_CFN_FMA:
39206 if (out_mode == DFmode && in_mode == DFmode)
39208 if (out_n == 2 && in_n == 2)
39209 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
39210 if (out_n == 4 && in_n == 4)
39211 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
39213 if (out_mode == SFmode && in_mode == SFmode)
39215 if (out_n == 4 && in_n == 4)
39216 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
39217 if (out_n == 8 && in_n == 8)
39218 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
39220 break;
39222 default:
39223 break;
39226 /* Dispatch to a handler for a vectorization library. */
39227 if (ix86_veclib_handler)
39228 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
39230 return NULL_TREE;
39233 /* Handler for an SVML-style interface to
39234 a library with vectorized intrinsics. */
39236 static tree
39237 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
39239 char name[20];
39240 tree fntype, new_fndecl, args;
39241 unsigned arity;
39242 const char *bname;
39243 machine_mode el_mode, in_mode;
39244 int n, in_n;
39246 /* The SVML is suitable for unsafe math only. */
39247 if (!flag_unsafe_math_optimizations)
39248 return NULL_TREE;
39250 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39251 n = TYPE_VECTOR_SUBPARTS (type_out);
39252 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39253 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39254 if (el_mode != in_mode
39255 || n != in_n)
39256 return NULL_TREE;
39258 switch (fn)
39260 CASE_CFN_EXP:
39261 CASE_CFN_LOG:
39262 CASE_CFN_LOG10:
39263 CASE_CFN_POW:
39264 CASE_CFN_TANH:
39265 CASE_CFN_TAN:
39266 CASE_CFN_ATAN:
39267 CASE_CFN_ATAN2:
39268 CASE_CFN_ATANH:
39269 CASE_CFN_CBRT:
39270 CASE_CFN_SINH:
39271 CASE_CFN_SIN:
39272 CASE_CFN_ASINH:
39273 CASE_CFN_ASIN:
39274 CASE_CFN_COSH:
39275 CASE_CFN_COS:
39276 CASE_CFN_ACOSH:
39277 CASE_CFN_ACOS:
39278 if ((el_mode != DFmode || n != 2)
39279 && (el_mode != SFmode || n != 4))
39280 return NULL_TREE;
39281 break;
39283 default:
39284 return NULL_TREE;
39287 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39288 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39290 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
39291 strcpy (name, "vmlsLn4");
39292 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
39293 strcpy (name, "vmldLn2");
39294 else if (n == 4)
39296 sprintf (name, "vmls%s", bname+10);
39297 name[strlen (name)-1] = '4';
39299 else
39300 sprintf (name, "vmld%s2", bname+10);
39302 /* Convert to uppercase. */
39303 name[4] &= ~0x20;
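  /* Illustrative examples of the names produced above (a sketch, not an
     exhaustive list):

	sinf, 4 x SFmode  ->  "vmlsSin4"
	sin,  2 x DFmode  ->  "vmldSin2"
	logf, 4 x SFmode  ->  "vmlsLn4"   (special-cased above)
	log,  2 x DFmode  ->  "vmldLn2"   (special-cased above)  */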
39305 arity = 0;
39306 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39307 arity++;
39309 if (arity == 1)
39310 fntype = build_function_type_list (type_out, type_in, NULL);
39311 else
39312 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39314 /* Build a function declaration for the vectorized function. */
39315 new_fndecl = build_decl (BUILTINS_LOCATION,
39316 FUNCTION_DECL, get_identifier (name), fntype);
39317 TREE_PUBLIC (new_fndecl) = 1;
39318 DECL_EXTERNAL (new_fndecl) = 1;
39319 DECL_IS_NOVOPS (new_fndecl) = 1;
39320 TREE_READONLY (new_fndecl) = 1;
39322 return new_fndecl;
39325 /* Handler for an ACML-style interface to
39326 a library with vectorized intrinsics. */
39328 static tree
39329 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
39331 char name[20] = "__vr.._";
39332 tree fntype, new_fndecl, args;
39333 unsigned arity;
39334 const char *bname;
39335 machine_mode el_mode, in_mode;
39336 int n, in_n;
39338 /* The ACML is 64-bit only and suitable for unsafe math only, as
39339 it does not correctly support parts of IEEE with the required
39340 precision such as denormals. */
39341 if (!TARGET_64BIT
39342 || !flag_unsafe_math_optimizations)
39343 return NULL_TREE;
39345 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39346 n = TYPE_VECTOR_SUBPARTS (type_out);
39347 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39348 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39349 if (el_mode != in_mode
39350 || n != in_n)
39351 return NULL_TREE;
39353 switch (fn)
39355 CASE_CFN_SIN:
39356 CASE_CFN_COS:
39357 CASE_CFN_EXP:
39358 CASE_CFN_LOG:
39359 CASE_CFN_LOG2:
39360 CASE_CFN_LOG10:
39361 if (el_mode == DFmode && n == 2)
39363 name[4] = 'd';
39364 name[5] = '2';
39366 else if (el_mode == SFmode && n == 4)
39368 name[4] = 's';
39369 name[5] = '4';
39371 else
39372 return NULL_TREE;
39373 break;
39375 default:
39376 return NULL_TREE;
39379 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39380 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39381 sprintf (name + 7, "%s", bname+10);
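  /* E.g. (a sketch of the names produced by the code above):

	sin,  2 x DFmode  ->  "__vrd2_sin"
	sinf, 4 x SFmode  ->  "__vrs4_sinf"  */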
39383 arity = 0;
39384 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39385 arity++;
39387 if (arity == 1)
39388 fntype = build_function_type_list (type_out, type_in, NULL);
39389 else
39390 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39392 /* Build a function declaration for the vectorized function. */
39393 new_fndecl = build_decl (BUILTINS_LOCATION,
39394 FUNCTION_DECL, get_identifier (name), fntype);
39395 TREE_PUBLIC (new_fndecl) = 1;
39396 DECL_EXTERNAL (new_fndecl) = 1;
39397 DECL_IS_NOVOPS (new_fndecl) = 1;
39398 TREE_READONLY (new_fndecl) = 1;
39400 return new_fndecl;
39403 /* Returns a decl of a function that implements gather load with
39404 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
39405 Return NULL_TREE if it is not available. */
39407 static tree
39408 ix86_vectorize_builtin_gather (const_tree mem_vectype,
39409 const_tree index_type, int scale)
39411 bool si;
39412 enum ix86_builtins code;
39414 if (! TARGET_AVX2 || !TARGET_USE_GATHER)
39415 return NULL_TREE;
39417 if ((TREE_CODE (index_type) != INTEGER_TYPE
39418 && !POINTER_TYPE_P (index_type))
39419 || (TYPE_MODE (index_type) != SImode
39420 && TYPE_MODE (index_type) != DImode))
39421 return NULL_TREE;
39423 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39424 return NULL_TREE;
39426 /* v*gather* insn sign extends index to pointer mode. */
39427 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39428 && TYPE_UNSIGNED (index_type))
39429 return NULL_TREE;
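  /* Scale must be 1, 2, 4 or 8; the (scale & (scale - 1)) test below
     rejects any value that is not a power of two (e.g. 3 or 6).  */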
39431 if (scale <= 0
39432 || scale > 8
39433 || (scale & (scale - 1)) != 0)
39434 return NULL_TREE;
39436 si = TYPE_MODE (index_type) == SImode;
39437 switch (TYPE_MODE (mem_vectype))
39439 case E_V2DFmode:
39440 if (TARGET_AVX512VL)
39441 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
39442 else
39443 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
39444 break;
39445 case E_V4DFmode:
39446 if (TARGET_AVX512VL)
39447 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
39448 else
39449 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
39450 break;
39451 case E_V2DImode:
39452 if (TARGET_AVX512VL)
39453 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
39454 else
39455 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
39456 break;
39457 case E_V4DImode:
39458 if (TARGET_AVX512VL)
39459 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
39460 else
39461 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
39462 break;
39463 case E_V4SFmode:
39464 if (TARGET_AVX512VL)
39465 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
39466 else
39467 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
39468 break;
39469 case E_V8SFmode:
39470 if (TARGET_AVX512VL)
39471 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
39472 else
39473 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
39474 break;
39475 case E_V4SImode:
39476 if (TARGET_AVX512VL)
39477 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
39478 else
39479 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
39480 break;
39481 case E_V8SImode:
39482 if (TARGET_AVX512VL)
39483 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
39484 else
39485 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
39486 break;
39487 case E_V8DFmode:
39488 if (TARGET_AVX512F)
39489 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
39490 else
39491 return NULL_TREE;
39492 break;
39493 case E_V8DImode:
39494 if (TARGET_AVX512F)
39495 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
39496 else
39497 return NULL_TREE;
39498 break;
39499 case E_V16SFmode:
39500 if (TARGET_AVX512F)
39501 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
39502 else
39503 return NULL_TREE;
39504 break;
39505 case E_V16SImode:
39506 if (TARGET_AVX512F)
39507 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
39508 else
39509 return NULL_TREE;
39510 break;
39511 default:
39512 return NULL_TREE;
39515 return ix86_get_builtin (code);
39518 /* Returns a decl of a function that implements scatter store with
39519 register type VECTYPE and index type INDEX_TYPE and SCALE.
39520 Return NULL_TREE if it is not available. */
39522 static tree
39523 ix86_vectorize_builtin_scatter (const_tree vectype,
39524 const_tree index_type, int scale)
39526 bool si;
39527 enum ix86_builtins code;
39529 if (!TARGET_AVX512F)
39530 return NULL_TREE;
39532 if ((TREE_CODE (index_type) != INTEGER_TYPE
39533 && !POINTER_TYPE_P (index_type))
39534 || (TYPE_MODE (index_type) != SImode
39535 && TYPE_MODE (index_type) != DImode))
39536 return NULL_TREE;
39538 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39539 return NULL_TREE;
39541 /* v*scatter* insn sign extends index to pointer mode. */
39542 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39543 && TYPE_UNSIGNED (index_type))
39544 return NULL_TREE;
39546 /* Scale can be 1, 2, 4 or 8. */
39547 if (scale <= 0
39548 || scale > 8
39549 || (scale & (scale - 1)) != 0)
39550 return NULL_TREE;
39552 si = TYPE_MODE (index_type) == SImode;
39553 switch (TYPE_MODE (vectype))
39555 case E_V8DFmode:
39556 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
39557 break;
39558 case E_V8DImode:
39559 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
39560 break;
39561 case E_V16SFmode:
39562 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
39563 break;
39564 case E_V16SImode:
39565 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
39566 break;
39567 default:
39568 return NULL_TREE;
39571 return ix86_builtins[code];
39574 /* Return true if it is safe to use the rsqrt optabs to optimize
39575 1.0/sqrt. */
39577 static bool
39578 use_rsqrt_p ()
39580 return (TARGET_SSE_MATH
39581 && flag_finite_math_only
39582 && !flag_trapping_math
39583 && flag_unsafe_math_optimizations);
39586 /* Returns a code for a target-specific builtin that implements
39587 reciprocal of the function, or NULL_TREE if not available. */
39589 static tree
39590 ix86_builtin_reciprocal (tree fndecl)
39592 switch (DECL_FUNCTION_CODE (fndecl))
39594 /* Vectorized version of sqrt to rsqrt conversion. */
39595 case IX86_BUILTIN_SQRTPS_NR:
39596 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
39598 case IX86_BUILTIN_SQRTPS_NR256:
39599 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
39601 default:
39602 return NULL_TREE;
39606 /* Helper for avx_vpermilps256_operand et al. This is also used by
39607 the expansion functions to turn the parallel back into a mask.
39608 The return value is 0 for no match and the imm8+1 for a match. */
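/* A worked example (illustrative only): for V4SFmode the parallel
   [0 1 2 3] (identity) yields mask 0xe4, so the function returns 0xe5,
   while [3 2 1 0] yields mask 0x1b and returns 0x1c, i.e. the usual
   vpermilps immediate plus one.  */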
39611 avx_vpermilp_parallel (rtx par, machine_mode mode)
39613 unsigned i, nelt = GET_MODE_NUNITS (mode);
39614 unsigned mask = 0;
39615 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
39617 if (XVECLEN (par, 0) != (int) nelt)
39618 return 0;
39620 /* Validate that all of the elements are constants, and not totally
39621 out of range. Copy the data into an integral array to make the
39622 subsequent checks easier. */
39623 for (i = 0; i < nelt; ++i)
39625 rtx er = XVECEXP (par, 0, i);
39626 unsigned HOST_WIDE_INT ei;
39628 if (!CONST_INT_P (er))
39629 return 0;
39630 ei = INTVAL (er);
39631 if (ei >= nelt)
39632 return 0;
39633 ipar[i] = ei;
39636 switch (mode)
39638 case E_V8DFmode:
39639 /* In the 512-bit DFmode case, we can only move elements within
39640 a 128-bit lane. First fill the second part of the mask,
39641 then fallthru. */
39642 for (i = 4; i < 6; ++i)
39644 if (ipar[i] < 4 || ipar[i] >= 6)
39645 return 0;
39646 mask |= (ipar[i] - 4) << i;
39648 for (i = 6; i < 8; ++i)
39650 if (ipar[i] < 6)
39651 return 0;
39652 mask |= (ipar[i] - 6) << i;
39654 /* FALLTHRU */
39656 case E_V4DFmode:
39657 /* In the 256-bit DFmode case, we can only move elements within
39658 a 128-bit lane. */
39659 for (i = 0; i < 2; ++i)
39661 if (ipar[i] >= 2)
39662 return 0;
39663 mask |= ipar[i] << i;
39665 for (i = 2; i < 4; ++i)
39667 if (ipar[i] < 2)
39668 return 0;
39669 mask |= (ipar[i] - 2) << i;
39671 break;
39673 case E_V16SFmode:
39674 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
39675 must mirror the permutation in the lower 256 bits. */
39676 for (i = 0; i < 8; ++i)
39677 if (ipar[i] + 8 != ipar[i + 8])
39678 return 0;
39679 /* FALLTHRU */
39681 case E_V8SFmode:
39682 /* In the 256-bit SFmode case, we have full freedom of
39683 movement within the low 128-bit lane, but the high 128-bit
39684 lane must mirror the exact same pattern. */
39685 for (i = 0; i < 4; ++i)
39686 if (ipar[i] + 4 != ipar[i + 4])
39687 return 0;
39688 nelt = 4;
39689 /* FALLTHRU */
39691 case E_V2DFmode:
39692 case E_V4SFmode:
39693 /* In the 128-bit case, we have full freedom in the placement of
39694 the elements from the source operand. */
39695 for (i = 0; i < nelt; ++i)
39696 mask |= ipar[i] << (i * (nelt / 2));
39697 break;
39699 default:
39700 gcc_unreachable ();
39703 /* Make sure success has a non-zero value by adding one. */
39704 return mask + 1;
39707 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
39708 the expansion functions to turn the parallel back into a mask.
39709 The return value is 0 for no match and the imm8+1 for a match. */
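/* A worked example (illustrative only): for V4DFmode the parallel
   [0 1 4 5] selects the low lane of the first operand and the low lane
   of the second operand; it yields mask 0x20 and the function returns
   0x21, i.e. the vperm2f128 immediate plus one.  */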
39712 avx_vperm2f128_parallel (rtx par, machine_mode mode)
39714 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
39715 unsigned mask = 0;
39716 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
39718 if (XVECLEN (par, 0) != (int) nelt)
39719 return 0;
39721 /* Validate that all of the elements are constants, and not totally
39722 out of range. Copy the data into an integral array to make the
39723 subsequent checks easier. */
39724 for (i = 0; i < nelt; ++i)
39726 rtx er = XVECEXP (par, 0, i);
39727 unsigned HOST_WIDE_INT ei;
39729 if (!CONST_INT_P (er))
39730 return 0;
39731 ei = INTVAL (er);
39732 if (ei >= 2 * nelt)
39733 return 0;
39734 ipar[i] = ei;
39737 /* Validate that each half of the permute consists of consecutive elements. */
39738 for (i = 0; i < nelt2 - 1; ++i)
39739 if (ipar[i] + 1 != ipar[i + 1])
39740 return 0;
39741 for (i = nelt2; i < nelt - 1; ++i)
39742 if (ipar[i] + 1 != ipar[i + 1])
39743 return 0;
39745 /* Reconstruct the mask. */
39746 for (i = 0; i < 2; ++i)
39748 unsigned e = ipar[i * nelt2];
39749 if (e % nelt2)
39750 return 0;
39751 e /= nelt2;
39752 mask |= e << (i * 4);
39755 /* Make sure success has a non-zero value by adding one. */
39756 return mask + 1;
39759 /* Return a register priority for hard reg REGNO. */
39760 static int
39761 ix86_register_priority (int hard_regno)
39763 /* ebp and r13 as the base always want a displacement, r12 as the
39764 base always wants an index. So discourage their usage in an
39765 address. */
39766 if (hard_regno == R12_REG || hard_regno == R13_REG)
39767 return 0;
39768 if (hard_regno == BP_REG)
39769 return 1;
39770 /* New x86-64 int registers result in bigger code size. Discourage
39771 them. */
39772 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
39773 return 2;
39774 /* New x86-64 SSE registers result in bigger code size. Discourage
39775 them. */
39776 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
39777 return 2;
39778 /* Usage of AX register results in smaller code. Prefer it. */
39779 if (hard_regno == AX_REG)
39780 return 4;
39781 return 3;
39784 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
39786 Put float CONST_DOUBLE in the constant pool instead of fp regs.
39787 QImode must go into class Q_REGS.
39788 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
39789 movdf to do mem-to-mem moves through integer regs. */
39791 static reg_class_t
39792 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
39794 machine_mode mode = GET_MODE (x);
39796 /* We're only allowed to return a subclass of CLASS. Many of the
39797 following checks fail for NO_REGS, so eliminate that early. */
39798 if (regclass == NO_REGS)
39799 return NO_REGS;
39801 /* All classes can load zeros. */
39802 if (x == CONST0_RTX (mode))
39803 return regclass;
39805 /* Force constants into memory if we are loading a (nonzero) constant into
39806 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
39807 instructions to load from a constant. */
39808 if (CONSTANT_P (x)
39809 && (MAYBE_MMX_CLASS_P (regclass)
39810 || MAYBE_SSE_CLASS_P (regclass)
39811 || MAYBE_MASK_CLASS_P (regclass)))
39812 return NO_REGS;
39814 /* Floating-point constants need more complex checks. */
39815 if (CONST_DOUBLE_P (x))
39817 /* General regs can load everything. */
39818 if (INTEGER_CLASS_P (regclass))
39819 return regclass;
39821 /* Floats can load 0 and 1 plus some others. Note that we eliminated
39822 zero above. We only want to wind up preferring 80387 registers if
39823 we plan on doing computation with them. */
39824 if (IS_STACK_MODE (mode)
39825 && standard_80387_constant_p (x) > 0)
39827 /* Limit class to FP regs. */
39828 if (FLOAT_CLASS_P (regclass))
39829 return FLOAT_REGS;
39830 else if (regclass == FP_TOP_SSE_REGS)
39831 return FP_TOP_REG;
39832 else if (regclass == FP_SECOND_SSE_REGS)
39833 return FP_SECOND_REG;
39836 return NO_REGS;
39839 /* Prefer SSE regs only, if we can use them for math. */
39840 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39841 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
39843 /* Generally when we see PLUS here, it's the function invariant
39844 (plus soft-fp const_int), which can only be computed into general
39845 regs. */
39846 if (GET_CODE (x) == PLUS)
39847 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
39849 /* QImode constants are easy to load, but non-constant QImode data
39850 must go into Q_REGS. */
39851 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39853 if (Q_CLASS_P (regclass))
39854 return regclass;
39855 else if (reg_class_subset_p (Q_REGS, regclass))
39856 return Q_REGS;
39857 else
39858 return NO_REGS;
39861 return regclass;
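/* For illustration: a nonzero V4SF constant requested in an SSE class is
   forced to memory (NO_REGS) by the check above, while non-constant
   QImode data requested in GENERAL_REGS is narrowed to Q_REGS so that a
   byte register can be used.  */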
39864 /* Discourage putting floating-point values in SSE registers unless
39865 SSE math is being used, and likewise for the 387 registers. */
39866 static reg_class_t
39867 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39869 machine_mode mode = GET_MODE (x);
39871 /* Restrict the output reload class to the register bank that we are doing
39872 math on. Rather than return an unwanted subset of CLASS, reject this
39873 alternative: if reload cannot do this, it will still use its choice. */
39875 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39876 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39878 if (IS_STACK_MODE (mode))
39880 if (regclass == FP_TOP_SSE_REGS)
39881 return FP_TOP_REG;
39882 else if (regclass == FP_SECOND_SSE_REGS)
39883 return FP_SECOND_REG;
39884 else
39885 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39888 return regclass;
39891 static reg_class_t
39892 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39893 machine_mode mode, secondary_reload_info *sri)
39895 /* Double-word spills from general registers to non-offsettable memory
39896 references (zero-extended addresses) require special handling. */
39897 if (TARGET_64BIT
39898 && MEM_P (x)
39899 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39900 && INTEGER_CLASS_P (rclass)
39901 && !offsettable_memref_p (x))
39903 sri->icode = (in_p
39904 ? CODE_FOR_reload_noff_load
39905 : CODE_FOR_reload_noff_store);
39906 /* Add the cost of moving address to a temporary. */
39907 sri->extra_cost = 1;
39909 return NO_REGS;
39912 /* QImode spills from non-QI registers require an
39913 intermediate register on 32-bit targets. */
39914 if (mode == QImode
39915 && ((!TARGET_64BIT && !in_p
39916 && INTEGER_CLASS_P (rclass)
39917 && MAYBE_NON_Q_CLASS_P (rclass))
39918 || (!TARGET_AVX512DQ
39919 && MAYBE_MASK_CLASS_P (rclass))))
39921 int regno = true_regnum (x);
39923 /* Return Q_REGS if the operand is in memory. */
39924 if (regno == -1)
39925 return Q_REGS;
39927 return NO_REGS;
39930 /* This condition handles the corner case where an expression involving
39931 pointers gets vectorized: we are trying to use the address of a
39932 stack slot as a vector initializer.
39934 (set (reg:V2DI 74 [ vect_cst_.2 ])
39935 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39937 Eventually frame gets turned into sp+offset like this:
39939 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39940 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39941 (const_int 392 [0x188]))))
39943 That later gets turned into:
39945 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39946 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39947 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39949 We'll have the following reload recorded:
39951 Reload 0: reload_in (DI) =
39952 (plus:DI (reg/f:DI 7 sp)
39953 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39954 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39955 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39956 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39957 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39958 reload_reg_rtx: (reg:V2DI 22 xmm1)
39960 That isn't going to work, since SSE instructions can't handle scalar
39961 additions. Returning GENERAL_REGS forces the addition into an integer
39962 register, and reload can handle the subsequent reloads without problems. */
39964 if (in_p && GET_CODE (x) == PLUS
39965 && SSE_CLASS_P (rclass)
39966 && SCALAR_INT_MODE_P (mode))
39967 return GENERAL_REGS;
39969 return NO_REGS;
39972 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
39974 static bool
39975 ix86_class_likely_spilled_p (reg_class_t rclass)
39977 switch (rclass)
39979 case AREG:
39980 case DREG:
39981 case CREG:
39982 case BREG:
39983 case AD_REGS:
39984 case SIREG:
39985 case DIREG:
39986 case SSE_FIRST_REG:
39987 case FP_TOP_REG:
39988 case FP_SECOND_REG:
39989 case BND_REGS:
39990 return true;
39992 default:
39993 break;
39996 return false;
39999 /* If we are copying between registers from different register sets
40000 (e.g. FP and integer), we may need a memory location.
40002 The function can't work reliably when one of the CLASSES is a class
40003 containing registers from multiple sets. We avoid this by never combining
40004 different sets in a single alternative in the machine description.
40005 Ensure that this constraint holds to avoid unexpected surprises.
40007 When STRICT is false, we are being called from REGISTER_MOVE_COST,
40008 so do not enforce these sanity checks.
40010 To keep register_move_cost fast, an inline variant is defined. */
40012 static inline bool
40013 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
40014 reg_class_t class2, int strict)
40016 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
40017 return false;
40019 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
40020 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
40021 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
40022 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
40023 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
40024 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
40025 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
40026 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
40028 gcc_assert (!strict || lra_in_progress);
40029 return true;
40032 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
40033 return true;
40035 /* Between mask and general, we have moves no larger than word size. */
40036 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
40037 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
40038 return true;
40040 /* ??? This is a lie. We do have moves between mmx/general and between
40041 mmx/sse2. But by saying we need secondary memory we discourage the
40042 register allocator from using the mmx registers unless needed. */
40043 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
40044 return true;
40046 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40048 /* SSE1 doesn't have any direct moves from other classes. */
40049 if (!TARGET_SSE2)
40050 return true;
40052 /* If the target says that inter-unit moves are more expensive
40053 than moving through memory, then don't generate them. */
40054 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
40055 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
40056 return true;
40058 /* Between SSE and general, we have moves no larger than word size. */
40059 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40060 return true;
40063 return false;
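/* For illustration: on a 32-bit target a DImode value is wider than a
   word, so moving it between GENERAL_REGS and SSE_REGS is forced through
   memory by the checks above, whereas an SImode move between the same
   classes may go directly when SSE2 and the TARGET_INTER_UNIT_MOVES_*
   tunings allow it.  */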
40066 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
40068 static bool
40069 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
40070 reg_class_t class2)
40072 return inline_secondary_memory_needed (mode, class1, class2, true);
40075 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
40077 get_secondary_mem widens integral modes to BITS_PER_WORD.
40078 There is no need to emit a full 64-bit move on 64-bit targets
40079 for integral modes that can be moved using a 32-bit move. */
40081 static machine_mode
40082 ix86_secondary_memory_needed_mode (machine_mode mode)
40084 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
40085 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
40086 return mode;
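/* For illustration: QImode and HImode values spilled this way get an
   SImode-sized slot, while SImode and wider integral modes are left
   unchanged.  */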
40089 /* Implement the TARGET_CLASS_MAX_NREGS hook.
40091 On the 80386, this is the size of MODE in words,
40092 except in the FP regs, where a single reg is always enough. */
40094 static unsigned char
40095 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
40097 if (MAYBE_INTEGER_CLASS_P (rclass))
40099 if (mode == XFmode)
40100 return (TARGET_64BIT ? 2 : 3);
40101 else if (mode == XCmode)
40102 return (TARGET_64BIT ? 4 : 6);
40103 else
40104 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40106 else
40108 if (COMPLEX_MODE_P (mode))
40109 return 2;
40110 else
40111 return 1;
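/* For illustration: an XFmode value occupies three word-sized registers
   in an integer class on a 32-bit target (two on a 64-bit target), but
   only a single register in FLOAT_REGS.  */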
40115 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
40117 static bool
40118 ix86_can_change_mode_class (machine_mode from, machine_mode to,
40119 reg_class_t regclass)
40121 if (from == to)
40122 return true;
40124 /* x87 registers can't do subreg at all, as all values are reformatted
40125 to extended precision. */
40126 if (MAYBE_FLOAT_CLASS_P (regclass))
40127 return false;
40129 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
40131 /* Vector registers do not support QI or HImode loads. If we don't
40132 disallow a change to these modes, reload will assume it's ok to
40133 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
40134 the vec_dupv4hi pattern. */
40135 if (GET_MODE_SIZE (from) < 4)
40136 return false;
40139 return true;
40142 /* Return index of MODE in the sse load/store tables. */
40144 static inline int
40145 sse_store_index (machine_mode mode)
40147 switch (GET_MODE_SIZE (mode))
40149 case 4:
40150 return 0;
40151 case 8:
40152 return 1;
40153 case 16:
40154 return 2;
40155 case 32:
40156 return 3;
40157 case 64:
40158 return 4;
40159 default:
40160 return -1;
40164 /* Return the cost of moving data of mode M between a
40165 register and memory. A value of 2 is the default; this cost is
40166 relative to those in `REGISTER_MOVE_COST'.
40168 This function is used extensively by register_move_cost, which is used to
40169 build tables at startup, so it is declared inline.
40170 When IN is 2, return maximum of in and out move cost.
40172 If moving between registers and memory is more expensive than
40173 between two registers, you should define this macro to express the
40174 relative cost.
40176 Also model the increased cost of moving QImode registers in non
40177 Q_REGS classes. */
40179 static inline int
40180 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
40181 int in)
40183 int cost;
40184 if (FLOAT_CLASS_P (regclass))
40186 int index;
40187 switch (mode)
40189 case E_SFmode:
40190 index = 0;
40191 break;
40192 case E_DFmode:
40193 index = 1;
40194 break;
40195 case E_XFmode:
40196 index = 2;
40197 break;
40198 default:
40199 return 100;
40201 if (in == 2)
40202 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
40203 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
40205 if (SSE_CLASS_P (regclass))
40207 int index = sse_store_index (mode);
40208 if (index == -1)
40209 return 100;
40210 if (in == 2)
40211 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
40212 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
40214 if (MMX_CLASS_P (regclass))
40216 int index;
40217 switch (GET_MODE_SIZE (mode))
40219 case 4:
40220 index = 0;
40221 break;
40222 case 8:
40223 index = 1;
40224 break;
40225 default:
40226 return 100;
40228 if (in == 2)
40229 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
40230 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
40232 switch (GET_MODE_SIZE (mode))
40234 case 1:
40235 if (Q_CLASS_P (regclass) || TARGET_64BIT)
40237 if (!in)
40238 return ix86_cost->int_store[0];
40239 if (TARGET_PARTIAL_REG_DEPENDENCY
40240 && optimize_function_for_speed_p (cfun))
40241 cost = ix86_cost->movzbl_load;
40242 else
40243 cost = ix86_cost->int_load[0];
40244 if (in == 2)
40245 return MAX (cost, ix86_cost->int_store[0]);
40246 return cost;
40248 else
40250 if (in == 2)
40251 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
40252 if (in)
40253 return ix86_cost->movzbl_load;
40254 else
40255 return ix86_cost->int_store[0] + 4;
40257 break;
40258 case 2:
40259 if (in == 2)
40260 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
40261 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
40262 default:
40263 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
40264 if (mode == TFmode)
40265 mode = XFmode;
40266 if (in == 2)
40267 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
40268 else if (in)
40269 cost = ix86_cost->int_load[2];
40270 else
40271 cost = ix86_cost->int_store[2];
40272 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
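/* For illustration: an SFmode value in FLOAT_REGS is costed with
   fp_load[0] or fp_store[0], and with IN == 2 the maximum of the two is
   returned, which is what ix86_register_move_cost wants when it models a
   store followed by a load.  */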
40276 static int
40277 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
40278 bool in)
40280 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
40284 /* Return the cost of moving data from a register in class CLASS1 to
40285 one in class CLASS2.
40287 It is not required that the cost always equal 2 when FROM is the same as TO;
40288 on some machines it is expensive to move between registers if they are not
40289 general registers. */
40291 static int
40292 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
40293 reg_class_t class2_i)
40295 enum reg_class class1 = (enum reg_class) class1_i;
40296 enum reg_class class2 = (enum reg_class) class2_i;
40298 /* In case we require secondary memory, compute the cost of the store followed
40299 by the load. To avoid bad register allocation choices, this needs
40300 to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
40302 if (inline_secondary_memory_needed (mode, class1, class2, false))
40304 int cost = 1;
40306 cost += inline_memory_move_cost (mode, class1, 2);
40307 cost += inline_memory_move_cost (mode, class2, 2);
40309 /* When copying from a general-purpose register we may emit multiple
40310 stores followed by a single load, causing a memory size mismatch stall.
40311 Count this as an arbitrarily high cost of 20. */
40312 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
40313 && TARGET_MEMORY_MISMATCH_STALL
40314 && targetm.class_max_nregs (class1, mode)
40315 > targetm.class_max_nregs (class2, mode))
40316 cost += 20;
40318 /* In the case of FP/MMX moves, the registers actually overlap, and we
40319 have to switch modes in order to treat them differently. */
40320 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
40321 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
40322 cost += 20;
40324 return cost;
40327 /* Moves between SSE/MMX and integer unit are expensive. */
40328 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
40329 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40331 /* ??? By keeping the returned value relatively high, we limit the number
40332 of moves between integer and MMX/SSE registers for all targets.
40333 Additionally, a high value prevents a problem with x86_modes_tieable_p (),
40334 where integer modes in MMX/SSE registers are not tieable
40335 because of the missing QImode and HImode moves to, from or between
40336 MMX/SSE registers. */
40337 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
40338 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
40340 if (MAYBE_FLOAT_CLASS_P (class1))
40341 return ix86_cost->fp_move;
40342 if (MAYBE_SSE_CLASS_P (class1))
40344 if (GET_MODE_BITSIZE (mode) <= 128)
40345 return ix86_cost->xmm_move;
40346 if (GET_MODE_BITSIZE (mode) <= 256)
40347 return ix86_cost->ymm_move;
40348 return ix86_cost->zmm_move;
40350 if (MAYBE_MMX_CLASS_P (class1))
40351 return ix86_cost->mmx_move;
40352 return 2;
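/* For illustration: a DFmode copy between FLOAT_REGS and SSE_REGS needs
   secondary memory, so its cost is 1 plus the maximum in/out memory move
   cost of each class, whereas an SImode copy within GENERAL_REGS simply
   costs 2.  */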
40355 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
40356 words of a value of mode MODE, but can be less for certain modes in
40357 special long registers.
40359 Note that there are no two-word move instructions for consecutive
40360 registers, and only registers 0-3 may have mov byte instructions
40361 applied to them. */
40363 static unsigned int
40364 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
40366 if (GENERAL_REGNO_P (regno))
40368 if (mode == XFmode)
40369 return TARGET_64BIT ? 2 : 3;
40370 if (mode == XCmode)
40371 return TARGET_64BIT ? 4 : 6;
40372 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40374 if (COMPLEX_MODE_P (mode))
40375 return 2;
40376 if (mode == V64SFmode || mode == V64SImode)
40377 return 4;
40378 return 1;
40381 /* Implement TARGET_HARD_REGNO_MODE_OK. */
40383 static bool
40384 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
40386 /* Flags can hold only CCmode values, and only flags can hold them. */
40387 if (CC_REGNO_P (regno))
40388 return GET_MODE_CLASS (mode) == MODE_CC;
40389 if (GET_MODE_CLASS (mode) == MODE_CC
40390 || GET_MODE_CLASS (mode) == MODE_RANDOM
40391 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
40392 return false;
40393 if (STACK_REGNO_P (regno))
40394 return VALID_FP_MODE_P (mode);
40395 if (MASK_REGNO_P (regno))
40396 return (VALID_MASK_REG_MODE (mode)
40397 || (TARGET_AVX512BW
40398 && VALID_MASK_AVX512BW_MODE (mode)));
40399 if (BND_REGNO_P (regno))
40400 return VALID_BND_REG_MODE (mode);
40401 if (SSE_REGNO_P (regno))
40403 /* We implement the move patterns for all vector modes into and
40404 out of SSE registers, even when no operation instructions
40405 are available. */
40407 /* For AVX-512 we allow, regardless of regno:
40408 - XI mode
40409 - any of 512-bit wide vector mode
40410 - any scalar mode. */
40411 if (TARGET_AVX512F
40412 && (mode == XImode
40413 || VALID_AVX512F_REG_MODE (mode)
40414 || VALID_AVX512F_SCALAR_MODE (mode)))
40415 return true;
40417 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
40418 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40419 && MOD4_SSE_REGNO_P (regno)
40420 && mode == V64SFmode)
40421 return true;
40423 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
40424 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40425 && MOD4_SSE_REGNO_P (regno)
40426 && mode == V64SImode)
40427 return true;
40429 /* TODO check for QI/HI scalars. */
40430 /* AVX512VL allows SSE regs 16+ for 128/256-bit modes. */
40431 if (TARGET_AVX512VL
40432 && (mode == OImode
40433 || mode == TImode
40434 || VALID_AVX256_REG_MODE (mode)
40435 || VALID_AVX512VL_128_REG_MODE (mode)))
40436 return true;
40438 /* xmm16-xmm31 are only available for AVX-512. */
40439 if (EXT_REX_SSE_REGNO_P (regno))
40440 return false;
40442 /* OImode and AVX modes are available only when AVX is enabled. */
40443 return ((TARGET_AVX
40444 && VALID_AVX256_REG_OR_OI_MODE (mode))
40445 || VALID_SSE_REG_MODE (mode)
40446 || VALID_SSE2_REG_MODE (mode)
40447 || VALID_MMX_REG_MODE (mode)
40448 || VALID_MMX_REG_MODE_3DNOW (mode));
40450 if (MMX_REGNO_P (regno))
40452 /* We implement the move patterns for 3DNOW modes even in MMX mode,
40453 so if the register is available at all, then we can move data of
40454 the given mode into or out of it. */
40455 return (VALID_MMX_REG_MODE (mode)
40456 || VALID_MMX_REG_MODE_3DNOW (mode));
40459 if (mode == QImode)
40461 /* Take care with QImode values - they can be in non-QI regs,
40462 but then they do cause partial register stalls. */
40463 if (ANY_QI_REGNO_P (regno))
40464 return true;
40465 if (!TARGET_PARTIAL_REG_STALL)
40466 return true;
40467 /* LRA checks if the hard register is OK for the given mode.
40468 QImode values can live in non-QI regs, so we allow all
40469 registers here. */
40470 if (lra_in_progress)
40471 return true;
40472 return !can_create_pseudo_p ();
40474 /* We handle both integer and floats in the general purpose registers. */
40475 else if (VALID_INT_MODE_P (mode))
40476 return true;
40477 else if (VALID_FP_MODE_P (mode))
40478 return true;
40479 else if (VALID_DFP_MODE_P (mode))
40480 return true;
40481 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
40482 on to use that value in smaller contexts, this can easily force a
40483 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
40484 supporting DImode, allow it. */
40485 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
40486 return true;
40488 return false;
40491 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
40492 saves SSE registers across calls is Win64 (thus no need to check the
40493 current ABI here), and with AVX enabled Win64 only guarantees that
40494 the low 16 bytes are saved. */
40496 static bool
40497 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
40499 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
40502 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
40503 tieable integer mode. */
40505 static bool
40506 ix86_tieable_integer_mode_p (machine_mode mode)
40508 switch (mode)
40510 case E_HImode:
40511 case E_SImode:
40512 return true;
40514 case E_QImode:
40515 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
40517 case E_DImode:
40518 return TARGET_64BIT;
40520 default:
40521 return false;
40525 /* Implement TARGET_MODES_TIEABLE_P.
40527 Return true if MODE1 is accessible in a register that can hold MODE2
40528 without copying. That is, all register classes that can hold MODE2
40529 can also hold MODE1. */
40531 static bool
40532 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
40534 if (mode1 == mode2)
40535 return true;
40537 if (ix86_tieable_integer_mode_p (mode1)
40538 && ix86_tieable_integer_mode_p (mode2))
40539 return true;
40541 /* MODE2 being XFmode implies fp stack or general regs, which means we
40542 can tie any smaller floating point modes to it. Note that we do not
40543 tie this with TFmode. */
40544 if (mode2 == XFmode)
40545 return mode1 == SFmode || mode1 == DFmode;
40547 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
40548 that we can tie it with SFmode. */
40549 if (mode2 == DFmode)
40550 return mode1 == SFmode;
40552 /* If MODE2 is only appropriate for an SSE register, then tie with
40553 any other mode acceptable to SSE registers. */
40554 if (GET_MODE_SIZE (mode2) == 32
40555 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40556 return (GET_MODE_SIZE (mode1) == 32
40557 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40558 if (GET_MODE_SIZE (mode2) == 16
40559 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40560 return (GET_MODE_SIZE (mode1) == 16
40561 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40563 /* If MODE2 is appropriate for an MMX register, then tie
40564 with any other mode acceptable to MMX registers. */
40565 if (GET_MODE_SIZE (mode2) == 8
40566 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
40567 return (GET_MODE_SIZE (mode1) == 8
40568 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
40570 return false;
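/* For illustration: SImode and HImode tie with each other, DFmode ties
   with SFmode, and two 16-byte vector modes such as V4SFmode and V2DFmode
   tie when the SSE registers accept both; XFmode never ties with TFmode.  */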
40573 /* Return the cost of moving between two registers of mode MODE. */
40575 static int
40576 ix86_set_reg_reg_cost (machine_mode mode)
40578 unsigned int units = UNITS_PER_WORD;
40580 switch (GET_MODE_CLASS (mode))
40582 default:
40583 break;
40585 case MODE_CC:
40586 units = GET_MODE_SIZE (CCmode);
40587 break;
40589 case MODE_FLOAT:
40590 if ((TARGET_SSE && mode == TFmode)
40591 || (TARGET_80387 && mode == XFmode)
40592 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
40593 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
40594 units = GET_MODE_SIZE (mode);
40595 break;
40597 case MODE_COMPLEX_FLOAT:
40598 if ((TARGET_SSE && mode == TCmode)
40599 || (TARGET_80387 && mode == XCmode)
40600 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
40601 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
40602 units = GET_MODE_SIZE (mode);
40603 break;
40605 case MODE_VECTOR_INT:
40606 case MODE_VECTOR_FLOAT:
40607 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
40608 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
40609 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
40610 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
40611 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
40612 units = GET_MODE_SIZE (mode);
40615 /* Return the cost of moving between two registers of mode MODE,
40616 assuming that the move will be in pieces of at most UNITS bytes. */
40617 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
40620 /* Return the cost of a vector operation in MODE, given that the scalar
40621 version has cost COST. If PARALLEL is true, assume that the CPU has
40622 more than one unit performing the operation. */
40624 static int
40625 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
40627 if (!VECTOR_MODE_P (mode))
40628 return cost;
40630 if (!parallel)
40631 return cost * GET_MODE_NUNITS (mode);
40632 if (GET_MODE_BITSIZE (mode) == 128
40633 && TARGET_SSE_SPLIT_REGS)
40634 return cost * 2;
40635 if (GET_MODE_BITSIZE (mode) > 128
40636 && TARGET_AVX128_OPTIMAL)
40637 return cost * GET_MODE_BITSIZE (mode) / 128;
40638 return cost;
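/* For illustration: for a V8SFmode operation with scalar cost C, a CPU
   with TARGET_AVX128_OPTIMAL pays 2 * C because the 256-bit operation is
   split into two 128-bit halves, while a CPU without parallel vector
   units (PARALLEL == false) pays 8 * C, one operation per element.  */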
40641 /* Return cost of multiplication in MODE. */
40643 static int
40644 ix86_multiplication_cost (const struct processor_costs *cost,
40645 enum machine_mode mode)
40647 machine_mode inner_mode = mode;
40648 if (VECTOR_MODE_P (mode))
40649 inner_mode = GET_MODE_INNER (mode);
40651 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40652 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
40653 else if (X87_FLOAT_MODE_P (mode))
40654 return cost->fmul;
40655 else if (FLOAT_MODE_P (mode))
40656 return ix86_vec_cost (mode,
40657 inner_mode == DFmode
40658 ? cost->mulsd : cost->mulss, true);
40659 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40661 /* vpmullq is used in this case. No emulation is needed. */
40662 if (TARGET_AVX512DQ)
40663 return ix86_vec_cost (mode, cost->mulss, true);
40665 /* V*QImode is emulated with 7-13 insns. */
40666 if (mode == V16QImode || mode == V32QImode)
40668 int extra = 11;
40669 if (TARGET_XOP && mode == V16QImode)
40670 extra = 5;
40671 else if (TARGET_SSSE3)
40672 extra = 6;
40673 return ix86_vec_cost (mode,
40674 cost->mulss * 2 + cost->sse_op * extra,
40675 true);
40677 /* V*DImode is emulated with 5-8 insns. */
40678 else if (mode == V2DImode || mode == V4DImode)
40680 if (TARGET_XOP && mode == V2DImode)
40681 return ix86_vec_cost (mode,
40682 cost->mulss * 2 + cost->sse_op * 3,
40683 true);
40684 else
40685 return ix86_vec_cost (mode,
40686 cost->mulss * 3 + cost->sse_op * 5,
40687 true);
40689 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
40690 insns, including two PMULUDQ. */
40691 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
40692 return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
40693 true);
40694 else
40695 return ix86_vec_cost (mode, cost->mulss, true);
40697 else
40698 return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
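/* For illustration: with SSSE3 but neither XOP nor AVX512DQ, a V16QImode
   multiply is costed above as two scalar multiplies plus six extra SSE
   ops for the emulation sequence, all scaled by ix86_vec_cost.  */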
40701 /* Return cost of division in MODE. */
40703 static int
40704 ix86_division_cost (const struct processor_costs *cost,
40705 enum machine_mode mode)
40707 machine_mode inner_mode = mode;
40708 if (VECTOR_MODE_P (mode))
40709 inner_mode = GET_MODE_INNER (mode);
40711 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40712 return inner_mode == DFmode ? cost->divsd : cost->divss;
40713 else if (X87_FLOAT_MODE_P (mode))
40714 return cost->fdiv;
40715 else if (FLOAT_MODE_P (mode))
40716 return ix86_vec_cost (mode,
40717 inner_mode == DFmode ? cost->divsd : cost->divss,
40718 true);
40719 else
40720 return cost->divide[MODE_INDEX (mode)];
40723 /* Return cost of shift in MODE.
40724 If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
40725 AND_IN_OP1 specifies whether op1 is the result of an AND, and
40726 SHIFT_AND_TRUNCATE whether op1 is an AND wrapped in a subreg.
40728 SKIP_OP0/1 is set to true if the cost of OP0/1 should be ignored. */
40730 static int
40731 ix86_shift_rotate_cost (const struct processor_costs *cost,
40732 enum machine_mode mode, bool constant_op1,
40733 HOST_WIDE_INT op1_val,
40734 bool speed,
40735 bool and_in_op1,
40736 bool shift_and_truncate,
40737 bool *skip_op0, bool *skip_op1)
40739 if (skip_op0)
40740 *skip_op0 = *skip_op1 = false;
40741 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40743 /* V*QImode is emulated with 1-11 insns. */
40744 if (mode == V16QImode || mode == V32QImode)
40746 int count = 11;
40747 if (TARGET_XOP && mode == V16QImode)
40749 /* For XOP we use vpshab, which requires a broadcast of the
40750 value to the variable shift insn. For constants this
40751 means a V16Q const in mem; even when we can perform the
40752 shift with one insn, set the cost to prefer paddb. */
40753 if (constant_op1)
40755 if (skip_op1)
40756 *skip_op1 = true;
40757 return ix86_vec_cost (mode,
40758 cost->sse_op
40759 + (speed ? COSTS_N_INSNS (1)
40761 : COSTS_N_BYTES
40762 (GET_MODE_UNIT_SIZE (mode))), true);
40764 count = 3;
40766 else if (TARGET_SSSE3)
40767 count = 7;
40768 return ix86_vec_cost (mode, cost->sse_op * count, true);
40770 else
40771 return ix86_vec_cost (mode, cost->sse_op, true);
40773 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40775 if (constant_op1)
40777 if (op1_val > 32)
40778 return cost->shift_const + COSTS_N_INSNS (2);
40779 else
40780 return cost->shift_const * 2;
40782 else
40784 if (and_in_op1)
40785 return cost->shift_var * 2;
40786 else
40787 return cost->shift_var * 6 + COSTS_N_INSNS (2);
40790 else
40792 if (constant_op1)
40793 return cost->shift_const;
40794 else if (shift_and_truncate)
40796 if (skip_op0)
40797 *skip_op0 = *skip_op1 = true;
40798 /* Return the cost after shift-and truncation. */
40799 return cost->shift_var;
40801 else
40802 return cost->shift_var;
40804 return cost->shift_const;
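/* For illustration: a DImode shift on a 32-bit target (wider than a word)
   by a constant larger than 32 is costed above as shift_const plus two
   extra insns, and by a variable amount that is not masked by an AND as
   six variable shifts plus two insns, roughly matching the double-word
   shift sequence that is emitted.  */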
40807 /* Compute a (partial) cost for rtx X. Return true if the complete
40808 cost has been computed, and false if subexpressions should be
40809 scanned. In either case, *TOTAL contains the cost result. */
40811 static bool
40812 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
40813 int *total, bool speed)
40815 rtx mask;
40816 enum rtx_code code = GET_CODE (x);
40817 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
40818 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
40819 int src_cost;
40821 switch (code)
40823 case SET:
40824 if (register_operand (SET_DEST (x), VOIDmode)
40825 && reg_or_0_operand (SET_SRC (x), VOIDmode))
40827 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
40828 return true;
40831 if (register_operand (SET_SRC (x), VOIDmode))
40832 /* Avoid potentially incorrect high cost from rtx_costs
40833 for non-tieable SUBREGs. */
40834 src_cost = 0;
40835 else
40837 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
40839 if (CONSTANT_P (SET_SRC (x)))
40840 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
40841 a small value, possibly zero for cheap constants. */
40842 src_cost += COSTS_N_INSNS (1);
40845 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
40846 return true;
40848 case CONST_INT:
40849 case CONST:
40850 case LABEL_REF:
40851 case SYMBOL_REF:
40852 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
40853 *total = 3;
40854 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
40855 *total = 2;
40856 else if (flag_pic && SYMBOLIC_CONST (x)
40857 && !(TARGET_64BIT
40858 && (GET_CODE (x) == LABEL_REF
40859 || (GET_CODE (x) == SYMBOL_REF
40860 && SYMBOL_REF_LOCAL_P (x))))
40861 /* Use 0 cost for CONST to improve its propagation. */
40862 && (TARGET_64BIT || GET_CODE (x) != CONST))
40863 *total = 1;
40864 else
40865 *total = 0;
40866 return true;
40868 case CONST_DOUBLE:
40869 if (IS_STACK_MODE (mode))
40870 switch (standard_80387_constant_p (x))
40872 case -1:
40873 case 0:
40874 break;
40875 case 1: /* 0.0 */
40876 *total = 1;
40877 return true;
40878 default: /* Other constants */
40879 *total = 2;
40880 return true;
40882 /* FALLTHRU */
40884 case CONST_VECTOR:
40885 switch (standard_sse_constant_p (x, mode))
40887 case 0:
40888 break;
40889 case 1: /* 0: xor eliminates false dependency */
40890 *total = 0;
40891 return true;
40892 default: /* -1: cmp contains false dependency */
40893 *total = 1;
40894 return true;
40896 /* FALLTHRU */
40898 case CONST_WIDE_INT:
40899 /* Fall back to (MEM (SYMBOL_REF)), since that's where
40900 it'll probably end up. Add a penalty for size. */
40901 *total = (COSTS_N_INSNS (1)
40902 + (!TARGET_64BIT && flag_pic)
40903 + (GET_MODE_SIZE (mode) <= 4
40904 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
40905 return true;
40907 case ZERO_EXTEND:
40908 /* Zero extension is often completely free on x86_64, so make
40909 it as cheap as possible. */
40910 if (TARGET_64BIT && mode == DImode
40911 && GET_MODE (XEXP (x, 0)) == SImode)
40912 *total = 1;
40913 else if (TARGET_ZERO_EXTEND_WITH_AND)
40914 *total = cost->add;
40915 else
40916 *total = cost->movzx;
40917 return false;
40919 case SIGN_EXTEND:
40920 *total = cost->movsx;
40921 return false;
40923 case ASHIFT:
40924 if (SCALAR_INT_MODE_P (mode)
40925 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
40926 && CONST_INT_P (XEXP (x, 1)))
40928 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40929 if (value == 1)
40931 *total = cost->add;
40932 return false;
40934 if ((value == 2 || value == 3)
40935 && cost->lea <= cost->shift_const)
40937 *total = cost->lea;
40938 return false;
40941 /* FALLTHRU */
40943 case ROTATE:
40944 case ASHIFTRT:
40945 case LSHIFTRT:
40946 case ROTATERT:
40947 bool skip_op0, skip_op1;
40948 *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
40949 CONST_INT_P (XEXP (x, 1))
40950 ? INTVAL (XEXP (x, 1)) : -1,
40951 speed,
40952 GET_CODE (XEXP (x, 1)) == AND,
40953 SUBREG_P (XEXP (x, 1))
40954 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
40955 &skip_op0, &skip_op1);
40956 if (skip_op0 || skip_op1)
40958 if (!skip_op0)
40959 *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
40960 if (!skip_op1)
40961 *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
40962 return true;
40964 return false;
40966 case FMA:
40968 rtx sub;
40970 gcc_assert (FLOAT_MODE_P (mode));
40971 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
40973 *total = ix86_vec_cost (mode,
40974 mode == SFmode ? cost->fmass : cost->fmasd,
40975 true);
40976 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
40978 /* A negation in op0 or op2 is free: FMS, FNMA, FNMS. */
40979 sub = XEXP (x, 0);
40980 if (GET_CODE (sub) == NEG)
40981 sub = XEXP (sub, 0);
40982 *total += rtx_cost (sub, mode, FMA, 0, speed);
40984 sub = XEXP (x, 2);
40985 if (GET_CODE (sub) == NEG)
40986 sub = XEXP (sub, 0);
40987 *total += rtx_cost (sub, mode, FMA, 2, speed);
40988 return true;
40991 case MULT:
40992 if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
40994 rtx op0 = XEXP (x, 0);
40995 rtx op1 = XEXP (x, 1);
40996 int nbits;
40997 if (CONST_INT_P (XEXP (x, 1)))
40999 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41000 for (nbits = 0; value != 0; value &= value - 1)
41001 nbits++;
41003 else
41004 /* This is arbitrary. */
41005 nbits = 7;
41007 /* Compute costs correctly for widening multiplication. */
41008 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
41009 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
41010 == GET_MODE_SIZE (mode))
41012 int is_mulwiden = 0;
41013 machine_mode inner_mode = GET_MODE (op0);
41015 if (GET_CODE (op0) == GET_CODE (op1))
41016 is_mulwiden = 1, op1 = XEXP (op1, 0);
41017 else if (CONST_INT_P (op1))
41019 if (GET_CODE (op0) == SIGN_EXTEND)
41020 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
41021 == INTVAL (op1);
41022 else
41023 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
41026 if (is_mulwiden)
41027 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
41030 *total = (cost->mult_init[MODE_INDEX (mode)]
41031 + nbits * cost->mult_bit
41032 + rtx_cost (op0, mode, outer_code, opno, speed)
41033 + rtx_cost (op1, mode, outer_code, opno, speed));
41035 return true;
41037 *total = ix86_multiplication_cost (cost, mode);
41038 return false;
41040 case DIV:
41041 case UDIV:
41042 case MOD:
41043 case UMOD:
41044 *total = ix86_division_cost (cost, mode);
41045 return false;
41047 case PLUS:
41048 if (GET_MODE_CLASS (mode) == MODE_INT
41049 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
41051 if (GET_CODE (XEXP (x, 0)) == PLUS
41052 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
41053 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
41054 && CONSTANT_P (XEXP (x, 1)))
41056 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
41057 if (val == 2 || val == 4 || val == 8)
41059 *total = cost->lea;
41060 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41061 outer_code, opno, speed);
41062 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
41063 outer_code, opno, speed);
41064 *total += rtx_cost (XEXP (x, 1), mode,
41065 outer_code, opno, speed);
41066 return true;
41069 else if (GET_CODE (XEXP (x, 0)) == MULT
41070 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
41072 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
41073 if (val == 2 || val == 4 || val == 8)
41075 *total = cost->lea;
41076 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41077 outer_code, opno, speed);
41078 *total += rtx_cost (XEXP (x, 1), mode,
41079 outer_code, opno, speed);
41080 return true;
41083 else if (GET_CODE (XEXP (x, 0)) == PLUS)
41085 /* Add with carry, ignore the cost of adding a carry flag. */
41086 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
41087 *total = cost->add;
41088 else
41090 *total = cost->lea;
41091 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41092 outer_code, opno, speed);
41095 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41096 outer_code, opno, speed);
41097 *total += rtx_cost (XEXP (x, 1), mode,
41098 outer_code, opno, speed);
41099 return true;
41102 /* FALLTHRU */
41104 case MINUS:
41105 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
41106 if (GET_MODE_CLASS (mode) == MODE_INT
41107 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
41108 && GET_CODE (XEXP (x, 0)) == MINUS
41109 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
41111 *total = cost->add;
41112 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41113 outer_code, opno, speed);
41114 *total += rtx_cost (XEXP (x, 1), mode,
41115 outer_code, opno, speed);
41116 return true;
41119 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41121 *total = cost->addss;
41122 return false;
41124 else if (X87_FLOAT_MODE_P (mode))
41126 *total = cost->fadd;
41127 return false;
41129 else if (FLOAT_MODE_P (mode))
41131 *total = ix86_vec_cost (mode, cost->addss, true);
41132 return false;
41134 /* FALLTHRU */
41136 case AND:
41137 case IOR:
41138 case XOR:
41139 if (GET_MODE_CLASS (mode) == MODE_INT
41140 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41142 *total = (cost->add * 2
41143 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
41144 << (GET_MODE (XEXP (x, 0)) != DImode))
41145 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
41146 << (GET_MODE (XEXP (x, 1)) != DImode)));
41147 return true;
41149 /* FALLTHRU */
41151 case NEG:
41152 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41154 *total = cost->sse_op;
41155 return false;
41157 else if (X87_FLOAT_MODE_P (mode))
41159 *total = cost->fchs;
41160 return false;
41162 else if (FLOAT_MODE_P (mode))
41164 *total = ix86_vec_cost (mode, cost->sse_op, true);
41165 return false;
41167 /* FALLTHRU */
41169 case NOT:
41170 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41171 *total = ix86_vec_cost (mode, cost->sse_op, true);
41172 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41173 *total = cost->add * 2;
41174 else
41175 *total = cost->add;
41176 return false;
41178 case COMPARE:
41179 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41180 && XEXP (XEXP (x, 0), 1) == const1_rtx
41181 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41182 && XEXP (x, 1) == const0_rtx)
41184 /* This kind of construct is implemented using test[bwl].
41185 Treat it as if we had an AND. */
41186 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41187 *total = (cost->add
41188 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41189 opno, speed)
41190 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41191 return true;
41194 /* The embedded comparison operand is completely free. */
41195 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
41196 && XEXP (x, 1) == const0_rtx)
41197 *total = 0;
41199 return false;
41201 case FLOAT_EXTEND:
41202 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41203 *total = 0;
41204 else
41205 *total = ix86_vec_cost (mode, cost->addss, true);
41206 return false;
41208 case FLOAT_TRUNCATE:
41209 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41210 *total = cost->fadd;
41211 else
41212 *total = ix86_vec_cost (mode, cost->addss, true);
41213 return false;
41215 case ABS:
41216 /* SSE requires memory load for the constant operand. It may make
41217 sense to account for this. Of course the constant operand may or
41218 may not be reused. */
41219 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41220 *total = cost->sse_op;
41221 else if (X87_FLOAT_MODE_P (mode))
41222 *total = cost->fabs;
41223 else if (FLOAT_MODE_P (mode))
41224 *total = ix86_vec_cost (mode, cost->sse_op, true);
41225 return false;
41227 case SQRT:
41228 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41229 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
41230 else if (X87_FLOAT_MODE_P (mode))
41231 *total = cost->fsqrt;
41232 else if (FLOAT_MODE_P (mode))
41233 *total = ix86_vec_cost (mode,
41234 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
41235 true);
41236 return false;
41238 case UNSPEC:
41239 if (XINT (x, 1) == UNSPEC_TP)
41240 *total = 0;
41241 return false;
41243 case VEC_SELECT:
41244 case VEC_CONCAT:
41245 case VEC_DUPLICATE:
41246 /* ??? Assume all of these vector manipulation patterns are
41247 recognizable, in which case they all pretty much have the
41248 same cost. */
41249 *total = cost->sse_op;
41250 return true;
41251 case VEC_MERGE:
41252 mask = XEXP (x, 2);
41253 /* This is a masked instruction; assume the same cost
41254 as the non-masked variant. */
41255 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
41256 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
41257 else
41258 *total = cost->sse_op;
41259 return true;
41261 default:
41262 return false;
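/* For illustration: an address-style expression such as
   (plus (mult (reg) (const_int 4)) (reg)) hitting the PLUS case above is
   costed as a single lea plus the cost of its register operands, whereas
   a plain (plus (reg) (reg)) falls through to the generic add cost.  */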
41266 #if TARGET_MACHO
41268 static int current_machopic_label_num;
41270 /* Given a symbol name and its associated stub, write out the
41271 definition of the stub. */
41273 void
41274 machopic_output_stub (FILE *file, const char *symb, const char *stub)
41276 unsigned int length;
41277 char *binder_name, *symbol_name, lazy_ptr_name[32];
41278 int label = ++current_machopic_label_num;
41280 /* For 64-bit we shouldn't get here. */
41281 gcc_assert (!TARGET_64BIT);
41283 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
41284 symb = targetm.strip_name_encoding (symb);
41286 length = strlen (stub);
41287 binder_name = XALLOCAVEC (char, length + 32);
41288 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
41290 length = strlen (symb);
41291 symbol_name = XALLOCAVEC (char, length + 32);
41292 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
41294 sprintf (lazy_ptr_name, "L%d$lz", label);
41296 if (MACHOPIC_ATT_STUB)
41297 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
41298 else if (MACHOPIC_PURE)
41299 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
41300 else
41301 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
41303 fprintf (file, "%s:\n", stub);
41304 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41306 if (MACHOPIC_ATT_STUB)
41308 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
41310 else if (MACHOPIC_PURE)
41312 /* PIC stub. */
41313 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41314 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
41315 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
41316 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
41317 label, lazy_ptr_name, label);
41318 fprintf (file, "\tjmp\t*%%ecx\n");
41320 else
41321 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
41323 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
41324 it needs no stub-binding-helper. */
41325 if (MACHOPIC_ATT_STUB)
41326 return;
41328 fprintf (file, "%s:\n", binder_name);
41330 if (MACHOPIC_PURE)
41332 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
41333 fprintf (file, "\tpushl\t%%ecx\n");
41335 else
41336 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
41338 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
41340 /* N.B. Keep the correspondence of these
41341 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
41342 old-pic/new-pic/non-pic stubs; altering this will break
41343 compatibility with existing dylibs. */
41344 if (MACHOPIC_PURE)
41346 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41347 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
41349 else
41350 /* 16-byte -mdynamic-no-pic stub. */
41351 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
41353 fprintf (file, "%s:\n", lazy_ptr_name);
41354 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41355 fprintf (file, ASM_LONG "%s\n", binder_name);
41357 #endif /* TARGET_MACHO */
41359 /* Order the registers for register allocator. */
41361 void
41362 x86_order_regs_for_local_alloc (void)
41364 int pos = 0;
41365 int i;
41367 /* First allocate the local general purpose registers. */
41368 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41369 if (GENERAL_REGNO_P (i) && call_used_regs[i])
41370 reg_alloc_order [pos++] = i;
41372 /* Global general purpose registers. */
41373 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41374 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
41375 reg_alloc_order [pos++] = i;
41377 /* x87 registers come first in case we are doing FP math
41378 using them. */
41379 if (!TARGET_SSE_MATH)
41380 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41381 reg_alloc_order [pos++] = i;
41383 /* SSE registers. */
41384 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
41385 reg_alloc_order [pos++] = i;
41386 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
41387 reg_alloc_order [pos++] = i;
41389 /* Extended REX SSE registers. */
41390 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
41391 reg_alloc_order [pos++] = i;
41393 /* Mask registers. */
41394 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
41395 reg_alloc_order [pos++] = i;
41397 /* MPX bound registers. */
41398 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
41399 reg_alloc_order [pos++] = i;
41401 /* x87 registers. */
41402 if (TARGET_SSE_MATH)
41403 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41404 reg_alloc_order [pos++] = i;
41406 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
41407 reg_alloc_order [pos++] = i;
41409 /* Initialize the rest of the array, as some registers are not
41410 allocated at all. */
41411 while (pos < FIRST_PSEUDO_REGISTER)
41412 reg_alloc_order [pos++] = 0;
41415 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
41416 in struct attribute_spec handler. */
41417 static tree
41418 ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int,
41419 bool *no_add_attrs)
41421 if (TREE_CODE (*node) != FUNCTION_TYPE
41422 && TREE_CODE (*node) != METHOD_TYPE
41423 && TREE_CODE (*node) != FIELD_DECL
41424 && TREE_CODE (*node) != TYPE_DECL)
41426 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41427 name);
41428 *no_add_attrs = true;
41429 return NULL_TREE;
41431 if (TARGET_64BIT)
41433 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
41434 name);
41435 *no_add_attrs = true;
41436 return NULL_TREE;
41438 if (is_attribute_p ("callee_pop_aggregate_return", name))
41440 tree cst;
41442 cst = TREE_VALUE (args);
41443 if (TREE_CODE (cst) != INTEGER_CST)
41445 warning (OPT_Wattributes,
41446 "%qE attribute requires an integer constant argument",
41447 name);
41448 *no_add_attrs = true;
41450 else if (compare_tree_int (cst, 0) != 0
41451 && compare_tree_int (cst, 1) != 0)
41453 warning (OPT_Wattributes,
41454 "argument to %qE attribute is neither zero, nor one",
41455 name);
41456 *no_add_attrs = true;
41459 return NULL_TREE;
41462 return NULL_TREE;
41465 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
41466 struct attribute_spec.handler. */
41467 static tree
41468 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
41469 bool *no_add_attrs)
41471 if (TREE_CODE (*node) != FUNCTION_TYPE
41472 && TREE_CODE (*node) != METHOD_TYPE
41473 && TREE_CODE (*node) != FIELD_DECL
41474 && TREE_CODE (*node) != TYPE_DECL)
41476 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41477 name);
41478 *no_add_attrs = true;
41479 return NULL_TREE;
41482 /* Can combine regparm with all attributes but fastcall. */
41483 if (is_attribute_p ("ms_abi", name))
41485 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
41487 error ("ms_abi and sysv_abi attributes are not compatible");
41490 return NULL_TREE;
41492 else if (is_attribute_p ("sysv_abi", name))
41494 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
41496 error ("ms_abi and sysv_abi attributes are not compatible");
41499 return NULL_TREE;
41502 return NULL_TREE;
41505 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
41506 struct attribute_spec.handler. */
41507 static tree
41508 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
41509 bool *no_add_attrs)
41511 tree *type = NULL;
41512 if (DECL_P (*node))
41514 if (TREE_CODE (*node) == TYPE_DECL)
41515 type = &TREE_TYPE (*node);
41517 else
41518 type = node;
41520 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
41522 warning (OPT_Wattributes, "%qE attribute ignored",
41523 name);
41524 *no_add_attrs = true;
41527 else if ((is_attribute_p ("ms_struct", name)
41528 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
41529 || ((is_attribute_p ("gcc_struct", name)
41530 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
41532 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
41533 name);
41534 *no_add_attrs = true;
41537 return NULL_TREE;
41540 static tree
41541 ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int,
41542 bool *no_add_attrs)
41544 if (TREE_CODE (*node) != FUNCTION_DECL)
41546 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41547 name);
41548 *no_add_attrs = true;
41551 if (is_attribute_p ("indirect_branch", name))
41553 tree cst = TREE_VALUE (args);
41554 if (TREE_CODE (cst) != STRING_CST)
41556 warning (OPT_Wattributes,
41557 "%qE attribute requires a string constant argument",
41558 name);
41559 *no_add_attrs = true;
41561 else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41562 && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41563 && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41564 && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41566 warning (OPT_Wattributes,
41567 "argument to %qE attribute is not "
41568 "(keep|thunk|thunk-inline|thunk-extern)", name);
41569 *no_add_attrs = true;
41573 if (is_attribute_p ("function_return", name))
41575 tree cst = TREE_VALUE (args);
41576 if (TREE_CODE (cst) != STRING_CST)
41578 warning (OPT_Wattributes,
41579 "%qE attribute requires a string constant argument",
41580 name);
41581 *no_add_attrs = true;
41583 else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41584 && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41585 && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41586 && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41588 warning (OPT_Wattributes,
41589 "argument to %qE attribute is not "
41590 "(keep|thunk|thunk-inline|thunk-extern)", name);
41591 *no_add_attrs = true;
41595 return NULL_TREE;
41598 static tree
41599 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
41600 int, bool *)
41602 return NULL_TREE;
41605 static tree
41606 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
41608 /* DECL_RESULT and DECL_ARGUMENTS do not exist here yet,
41609 but the function type contains the argument and return type data. */
41610 tree func_type = *node;
41611 tree return_type = TREE_TYPE (func_type);
41613 int nargs = 0;
41614 tree current_arg_type = TYPE_ARG_TYPES (func_type);
41615 while (current_arg_type
41616 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
41618 if (nargs == 0)
41620 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
41621 error ("interrupt service routine should have a pointer "
41622 "as the first argument");
41624 else if (nargs == 1)
41626 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
41627 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
41628 error ("interrupt service routine should have unsigned %s"
41629 "int as the second argument",
41630 TARGET_64BIT
41631 ? (TARGET_X32 ? "long long " : "long ")
41632 : "");
41634 nargs++;
41635 current_arg_type = TREE_CHAIN (current_arg_type);
41637 if (!nargs || nargs > 2)
41638 error ("interrupt service routine can only have a pointer argument "
41639 "and an optional integer argument");
41640 if (! VOID_TYPE_P (return_type))
41641 error ("interrupt service routine can't have non-void return value");
41643 return NULL_TREE;
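/* For illustration, the checks above accept handlers declared roughly as

     struct interrupt_frame;
     typedef unsigned int uword_t __attribute__ ((mode (__word__)));

     void handler (struct interrupt_frame *frame)
       __attribute__ ((interrupt));
     void exc_handler (struct interrupt_frame *frame, uword_t error_code)
       __attribute__ ((interrupt));

   i.e. a pointer as the first argument, an optional word-sized unsigned
   integer as the second argument, and a void return type.  */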
41646 static bool
41647 ix86_ms_bitfield_layout_p (const_tree record_type)
41649 return ((TARGET_MS_BITFIELD_LAYOUT
41650 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
41651 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
41654 /* Returns an expression indicating where the this parameter is
41655 located on entry to FUNCTION. */
41657 static rtx
41658 x86_this_parameter (tree function)
41660 tree type = TREE_TYPE (function);
41661 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
41662 int nregs;
41664 if (TARGET_64BIT)
41666 const int *parm_regs;
41668 if (ix86_function_type_abi (type) == MS_ABI)
41669 parm_regs = x86_64_ms_abi_int_parameter_registers;
41670 else
41671 parm_regs = x86_64_int_parameter_registers;
41672 return gen_rtx_REG (Pmode, parm_regs[aggr]);
41675 nregs = ix86_function_regparm (type, function);
41677 if (nregs > 0 && !stdarg_p (type))
41679 int regno;
41680 unsigned int ccvt = ix86_get_callcvt (type);
41682 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41683 regno = aggr ? DX_REG : CX_REG;
41684 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41686 regno = CX_REG;
41687 if (aggr)
41688 return gen_rtx_MEM (SImode,
41689 plus_constant (Pmode, stack_pointer_rtx, 4));
41691 else
41693 regno = AX_REG;
41694 if (aggr)
41696 regno = DX_REG;
41697 if (nregs == 1)
41698 return gen_rtx_MEM (SImode,
41699 plus_constant (Pmode,
41700 stack_pointer_rtx, 4));
41703 return gen_rtx_REG (SImode, regno);
41706 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
41707 aggr ? 8 : 4));
41710 /* Determine whether x86_output_mi_thunk can succeed. */
41712 static bool
41713 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
41714 const_tree function)
41716 /* 64-bit can handle anything. */
41717 if (TARGET_64BIT)
41718 return true;
41720 /* For 32-bit, everything's fine if we have one free register. */
41721 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
41722 return true;
41724 /* Need a free register for vcall_offset. */
41725 if (vcall_offset)
41726 return false;
41728 /* Need a free register for GOT references. */
41729 if (flag_pic && !targetm.binds_local_p (function))
41730 return false;
41732 /* Otherwise ok. */
41733 return true;
41736 /* Output the assembler code for a thunk function. THUNK_DECL is the
41737 declaration for the thunk function itself, FUNCTION is the decl for
41738 the target function. DELTA is an immediate constant offset to be
41739 added to THIS. If VCALL_OFFSET is nonzero, the word at
41740 *(*this + vcall_offset) should be added to THIS. */
41742 static void
41743 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
41744 HOST_WIDE_INT vcall_offset, tree function)
41746 rtx this_param = x86_this_parameter (function);
41747 rtx this_reg, tmp, fnaddr;
41748 unsigned int tmp_regno;
41749 rtx_insn *insn;
41751 if (TARGET_64BIT)
41752 tmp_regno = R10_REG;
41753 else
41755 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
41756 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41757 tmp_regno = AX_REG;
41758 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41759 tmp_regno = DX_REG;
41760 else
41761 tmp_regno = CX_REG;
41764 emit_note (NOTE_INSN_PROLOGUE_END);
41766 /* If CET is enabled, insert an ENDBR instruction. */
41767 if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
41768 emit_insn (gen_nop_endbr ());
41770 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
41771 pull it in now and let DELTA benefit. */
41772 if (REG_P (this_param))
41773 this_reg = this_param;
41774 else if (vcall_offset)
41776 /* Put the this parameter into %eax. */
41777 this_reg = gen_rtx_REG (Pmode, AX_REG);
41778 emit_move_insn (this_reg, this_param);
41780 else
41781 this_reg = NULL_RTX;
41783 /* Adjust the this parameter by a fixed constant. */
41784 if (delta)
41786 rtx delta_rtx = GEN_INT (delta);
41787 rtx delta_dst = this_reg ? this_reg : this_param;
41789 if (TARGET_64BIT)
41791 if (!x86_64_general_operand (delta_rtx, Pmode))
41793 tmp = gen_rtx_REG (Pmode, tmp_regno);
41794 emit_move_insn (tmp, delta_rtx);
41795 delta_rtx = tmp;
41799 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
41802 /* Adjust the this parameter by a value stored in the vtable. */
41803 if (vcall_offset)
41805 rtx vcall_addr, vcall_mem, this_mem;
41807 tmp = gen_rtx_REG (Pmode, tmp_regno);
41809 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
41810 if (Pmode != ptr_mode)
41811 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
41812 emit_move_insn (tmp, this_mem);
41814 /* Adjust the this parameter. */
41815 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
41816 if (TARGET_64BIT
41817 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
41819 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
41820 emit_move_insn (tmp2, GEN_INT (vcall_offset));
41821 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
41824 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
41825 if (Pmode != ptr_mode)
41826 emit_insn (gen_addsi_1_zext (this_reg,
41827 gen_rtx_REG (ptr_mode,
41828 REGNO (this_reg)),
41829 vcall_mem));
41830 else
41831 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
41834 /* If necessary, drop THIS back to its stack slot. */
41835 if (this_reg && this_reg != this_param)
41836 emit_move_insn (this_param, this_reg);
41838 fnaddr = XEXP (DECL_RTL (function), 0);
41839 if (TARGET_64BIT)
41841 if (!flag_pic || targetm.binds_local_p (function)
41842 || TARGET_PECOFF)
41844 else
41846 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
41847 tmp = gen_rtx_CONST (Pmode, tmp);
41848 fnaddr = gen_const_mem (Pmode, tmp);
41851 else
41853 if (!flag_pic || targetm.binds_local_p (function))
41855 #if TARGET_MACHO
41856 else if (TARGET_MACHO)
41858 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
41859 fnaddr = XEXP (fnaddr, 0);
41861 #endif /* TARGET_MACHO */
41862 else
41864 tmp = gen_rtx_REG (Pmode, CX_REG);
41865 output_set_got (tmp, NULL_RTX);
41867 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
41868 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
41869 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
41870 fnaddr = gen_const_mem (Pmode, fnaddr);
41874 /* Our sibling call patterns do not allow memories, because we have no
41875 predicate that can distinguish between frame and non-frame memory.
41876 For our purposes here, we can get away with (ab)using a jump pattern,
41877 because we're going to do no optimization. */
41878 if (MEM_P (fnaddr))
41880 if (sibcall_insn_operand (fnaddr, word_mode))
41882 fnaddr = XEXP (DECL_RTL (function), 0);
41883 tmp = gen_rtx_MEM (QImode, fnaddr);
41884 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41885 tmp = emit_call_insn (tmp);
41886 SIBLING_CALL_P (tmp) = 1;
41888 else
41889 emit_jump_insn (gen_indirect_jump (fnaddr));
41891 else
41893 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
41896 // CM_LARGE_PIC always uses a pseudo PIC register, which is
41896 // uninitialized. Since FUNCTION is local and calling it
41897 // doesn't go through the PLT, we use scratch register %r11 as
41898 // the PIC register and initialize it here.
41899 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
41900 ix86_init_large_pic_reg (tmp_regno);
41901 fnaddr = legitimize_pic_address (fnaddr,
41902 gen_rtx_REG (Pmode, tmp_regno));
41905 if (!sibcall_insn_operand (fnaddr, word_mode))
41907 tmp = gen_rtx_REG (word_mode, tmp_regno);
41908 if (GET_MODE (fnaddr) != word_mode)
41909 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
41910 emit_move_insn (tmp, fnaddr);
41911 fnaddr = tmp;
41914 tmp = gen_rtx_MEM (QImode, fnaddr);
41915 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41916 tmp = emit_call_insn (tmp);
41917 SIBLING_CALL_P (tmp) = 1;
41919 emit_barrier ();
41921 /* Emit just enough of rest_of_compilation to get the insns emitted.
41922 Note that use_thunk calls assemble_start_function et al. */
41923 insn = get_insns ();
41924 shorten_branches (insn);
41925 final_start_function (insn, file, 1);
41926 final (insn, file, 1);
41927 final_end_function ();
41930 static void
41931 x86_file_start (void)
41933 default_file_start ();
41934 if (TARGET_16BIT)
41935 fputs ("\t.code16gcc\n", asm_out_file);
41936 #if TARGET_MACHO
41937 darwin_file_start ();
41938 #endif
41939 if (X86_FILE_START_VERSION_DIRECTIVE)
41940 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
41941 if (X86_FILE_START_FLTUSED)
41942 fputs ("\t.global\t__fltused\n", asm_out_file);
41943 if (ix86_asm_dialect == ASM_INTEL)
41944 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
41948 x86_field_alignment (tree type, int computed)
41950 machine_mode mode;
41952 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
41953 return computed;
41954 if (TARGET_IAMCU)
41955 return iamcu_alignment (type, computed);
41956 mode = TYPE_MODE (strip_array_types (type));
41957 if (mode == DFmode || mode == DCmode
41958 || GET_MODE_CLASS (mode) == MODE_INT
41959 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
41960 return MIN (32, computed);
41961 return computed;
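/* For example, on a typical 32-bit target without -malign-double the
   double field below has its alignment capped at 32 bits by the code
   above (a sketch of the effect, not a testcase from the tree):

     struct s { char c; double d; };   // d ends up at offset 4, not 8
*/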
41964 /* Print call to TARGET to FILE. */
41966 static void
41967 x86_print_call_or_nop (FILE *file, const char *target)
41969 if (flag_nop_mcount)
41970 /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
41971 fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
41972 else
41973 fprintf (file, "1:\tcall\t%s\n", target);
41976 /* Output assembler code to FILE to increment profiler label # LABELNO
41977 for profiling a function entry. */
41978 void
41979 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
41981 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
41982 : MCOUNT_NAME);
41983 if (TARGET_64BIT)
41985 #ifndef NO_PROFILE_COUNTERS
41986 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
41987 #endif
41989 if (!TARGET_PECOFF && flag_pic)
41990 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
41991 else
41992 x86_print_call_or_nop (file, mcount_name);
41994 else if (flag_pic)
41996 #ifndef NO_PROFILE_COUNTERS
41997 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
41998 LPREFIX, labelno);
41999 #endif
42000 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
42002 else
42004 #ifndef NO_PROFILE_COUNTERS
42005 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
42006 LPREFIX, labelno);
42007 #endif
42008 x86_print_call_or_nop (file, mcount_name);
42011 if (flag_record_mcount)
42013 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
42014 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
42015 fprintf (file, "\t.previous\n");
42019 /* We don't have exact information about the insn sizes, but we may assume
42020 quite safely that we are informed about all 1 byte insns and memory
42021 address sizes. This is enough to eliminate unnecessary padding in
42022 99% of cases. */
42024 static int
42025 ix86_min_insn_size (rtx_insn *insn)
42027 int l = 0, len;
42029 if (!INSN_P (insn) || !active_insn_p (insn))
42030 return 0;
42032 /* Discard alignments we've emitted, and jump instructions. */
42033 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
42034 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
42035 return 0;
42037 /* Important case - calls are always 5 bytes.
42038 It is common to have many calls in a row. */
42039 if (CALL_P (insn)
42040 && symbolic_reference_mentioned_p (PATTERN (insn))
42041 && !SIBLING_CALL_P (insn))
42042 return 5;
42043 len = get_attr_length (insn);
42044 if (len <= 1)
42045 return 1;
42047 /* For normal instructions we rely on get_attr_length being exact,
42048 with a few exceptions. */
42049 if (!JUMP_P (insn))
42051 enum attr_type type = get_attr_type (insn);
42053 switch (type)
42055 case TYPE_MULTI:
42056 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
42057 || asm_noperands (PATTERN (insn)) >= 0)
42058 return 0;
42059 break;
42060 case TYPE_OTHER:
42061 case TYPE_FCMP:
42062 break;
42063 default:
42064 /* Otherwise trust get_attr_length. */
42065 return len;
42068 l = get_attr_length_address (insn);
42069 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
42070 l = 4;
42072 if (l)
42073 return 1+l;
42074 else
42075 return 2;
42078 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42080 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
42081 window. */
42083 static void
42084 ix86_avoid_jump_mispredicts (void)
42086 rtx_insn *insn, *start = get_insns ();
42087 int nbytes = 0, njumps = 0;
42088 bool isjump = false;
42090 /* Look for all minimal intervals of instructions containing 4 jumps.
42091 The intervals are bounded by START and INSN. NBYTES is the total
42092 size of the instructions in the interval, including INSN and not including
42093 START. When NBYTES is smaller than 16, it is possible
42094 that START and INSN end up in the same 16-byte window.
42096 The smallest offset in the window at which INSN can start is the case where
42097 START ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
42098 We add a p2align to the 16-byte window with max_skip of 15 - NBYTES + sizeof (INSN).
42100 Don't consider an asm goto to be a jump: while it can contain a jump, it doesn't
42101 have to, control may transfer to its label(s) through other
42102 means, and we also estimate the minimum length of all asm stmts as 0. */
42103 for (insn = start; insn; insn = NEXT_INSN (insn))
42105 int min_size;
42107 if (LABEL_P (insn))
42109 int align = label_to_alignment (insn);
42110 int max_skip = label_to_max_skip (insn);
42112 if (max_skip > 15)
42113 max_skip = 15;
42114 /* If align > 3, only up to 16 - max_skip - 1 bytes can
42115 already be in the current 16-byte page, because otherwise
42116 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
42117 bytes to reach the 16-byte boundary. */
42118 if (align <= 0
42119 || (align <= 3 && max_skip != (1 << align) - 1))
42120 max_skip = 0;
42121 if (dump_file)
42122 fprintf (dump_file, "Label %i with max_skip %i\n",
42123 INSN_UID (insn), max_skip);
42124 if (max_skip)
42126 while (nbytes + max_skip >= 16)
42128 start = NEXT_INSN (start);
42129 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42130 || CALL_P (start))
42131 njumps--, isjump = true;
42132 else
42133 isjump = false;
42134 nbytes -= ix86_min_insn_size (start);
42137 continue;
42140 min_size = ix86_min_insn_size (insn);
42141 nbytes += min_size;
42142 if (dump_file)
42143 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
42144 INSN_UID (insn), min_size);
42145 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
42146 || CALL_P (insn))
42147 njumps++;
42148 else
42149 continue;
42151 while (njumps > 3)
42153 start = NEXT_INSN (start);
42154 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42155 || CALL_P (start))
42156 njumps--, isjump = true;
42157 else
42158 isjump = false;
42159 nbytes -= ix86_min_insn_size (start);
42161 gcc_assert (njumps >= 0);
42162 if (dump_file)
42163 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
42164 INSN_UID (start), INSN_UID (insn), nbytes);
42166 if (njumps == 3 && isjump && nbytes < 16)
42168 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
42170 if (dump_file)
42171 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
42172 INSN_UID (insn), padsize);
42173 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
42177 #endif
42179 /* AMD Athlon works faster
42180 when RET is not the destination of a conditional jump and is not directly
42181 preceded by another jump instruction. We avoid the penalty by inserting a
42182 NOP just before the RET instruction in such cases. */
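/* A sketch of the transformation (assuming the usual expansion of
   simple_return_internal_long, which carries a REP prefix):

     before:   jne  1f          after:   jne  1f
               ...                       ...
     1:        ret              1:       rep ret
*/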
42183 static void
42184 ix86_pad_returns (void)
42186 edge e;
42187 edge_iterator ei;
42189 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42191 basic_block bb = e->src;
42192 rtx_insn *ret = BB_END (bb);
42193 rtx_insn *prev;
42194 bool replace = false;
42196 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
42197 || optimize_bb_for_size_p (bb))
42198 continue;
42199 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
42200 if (active_insn_p (prev) || LABEL_P (prev))
42201 break;
42202 if (prev && LABEL_P (prev))
42204 edge e;
42205 edge_iterator ei;
42207 FOR_EACH_EDGE (e, ei, bb->preds)
42208 if (EDGE_FREQUENCY (e) && e->src->index >= 0
42209 && !(e->flags & EDGE_FALLTHRU))
42211 replace = true;
42212 break;
42215 if (!replace)
42217 prev = prev_active_insn (ret);
42218 if (prev
42219 && ((JUMP_P (prev) && any_condjump_p (prev))
42220 || CALL_P (prev)))
42221 replace = true;
42222 /* Empty functions get a branch mispredict even when
42223 the jump destination is not visible to us. */
42224 if (!prev && !optimize_function_for_size_p (cfun))
42225 replace = true;
42227 if (replace)
42229 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42230 delete_insn (ret);
42235 /* Count the minimum number of instructions in BB. Return 4 if the
42236 number of instructions >= 4. */
42238 static int
42239 ix86_count_insn_bb (basic_block bb)
42241 rtx_insn *insn;
42242 int insn_count = 0;
42244 /* Count number of instructions in this block. Return 4 if the number
42245 of instructions >= 4. */
42246 FOR_BB_INSNS (bb, insn)
42248 /* This only happens in exit blocks. */
42249 if (JUMP_P (insn)
42250 && ANY_RETURN_P (PATTERN (insn)))
42251 break;
42253 if (NONDEBUG_INSN_P (insn)
42254 && GET_CODE (PATTERN (insn)) != USE
42255 && GET_CODE (PATTERN (insn)) != CLOBBER)
42257 insn_count++;
42258 if (insn_count >= 4)
42259 return insn_count;
42263 return insn_count;
42267 /* Count the minimum number of instructions in code path in BB.
42268 Return 4 if the number of instructions >= 4. */
42270 static int
42271 ix86_count_insn (basic_block bb)
42273 edge e;
42274 edge_iterator ei;
42275 int min_prev_count;
42277 /* Only bother counting instructions along paths with no
42278 more than 2 basic blocks between entry and exit. Given
42279 that BB has an edge to exit, determine if a predecessor
42280 of BB has an edge from entry. If so, compute the number
42281 of instructions in the predecessor block. If there
42282 happen to be multiple such blocks, compute the minimum. */
42283 min_prev_count = 4;
42284 FOR_EACH_EDGE (e, ei, bb->preds)
42286 edge prev_e;
42287 edge_iterator prev_ei;
42289 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42291 min_prev_count = 0;
42292 break;
42294 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
42296 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42298 int count = ix86_count_insn_bb (e->src);
42299 if (count < min_prev_count)
42300 min_prev_count = count;
42301 break;
42306 if (min_prev_count < 4)
42307 min_prev_count += ix86_count_insn_bb (bb);
42309 return min_prev_count;
42312 /* Pad short function to 4 instructions. */
42314 static void
42315 ix86_pad_short_function (void)
42317 edge e;
42318 edge_iterator ei;
42320 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42322 rtx_insn *ret = BB_END (e->src);
42323 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
42325 int insn_count = ix86_count_insn (e->src);
42327 /* Pad short function. */
42328 if (insn_count < 4)
42330 rtx_insn *insn = ret;
42332 /* Find epilogue. */
42333 while (insn
42334 && (!NOTE_P (insn)
42335 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
42336 insn = PREV_INSN (insn);
42338 if (!insn)
42339 insn = ret;
42341 /* Two NOPs count as one instruction. */
42342 insn_count = 2 * (4 - insn_count);
42343 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
42349 /* Fix up a Windows system unwinder issue. If an EH region falls through into
42350 the epilogue, the Windows system unwinder will apply epilogue logic and
42351 produce incorrect offsets. This can be avoided by adding a nop between
42352 the last insn that can throw and the first insn of the epilogue. */
42354 static void
42355 ix86_seh_fixup_eh_fallthru (void)
42357 edge e;
42358 edge_iterator ei;
42360 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42362 rtx_insn *insn, *next;
42364 /* Find the beginning of the epilogue. */
42365 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42366 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42367 break;
42368 if (insn == NULL)
42369 continue;
42371 /* We only care about preceding insns that can throw. */
42372 insn = prev_active_insn (insn);
42373 if (insn == NULL || !can_throw_internal (insn))
42374 continue;
42376 /* Do not separate calls from their debug information. */
42377 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42378 if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION)
42379 insn = next;
42380 else
42381 break;
42383 emit_insn_after (gen_nops (const1_rtx), insn);
42387 /* Given a register number BASE, the lowest of a group of registers, update
42388 regsets IN and OUT with the registers that should be avoided in input
42389 and output operands respectively when trying to avoid generating a modr/m
42390 byte for -mmitigate-rop. */
42392 static void
42393 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
42395 SET_HARD_REG_BIT (out, base);
42396 SET_HARD_REG_BIT (out, base + 1);
42397 SET_HARD_REG_BIT (in, base + 2);
42398 SET_HARD_REG_BIT (in, base + 3);
42401 /* Called if -mmitigate-rop is in effect. Try to rewrite instructions so
42402 that certain encodings of modr/m bytes do not occur. */
42403 static void
42404 ix86_mitigate_rop (void)
42406 HARD_REG_SET input_risky;
42407 HARD_REG_SET output_risky;
42408 HARD_REG_SET inout_risky;
42410 CLEAR_HARD_REG_SET (output_risky);
42411 CLEAR_HARD_REG_SET (input_risky);
42412 SET_HARD_REG_BIT (output_risky, AX_REG);
42413 SET_HARD_REG_BIT (output_risky, CX_REG);
42414 SET_HARD_REG_BIT (input_risky, BX_REG);
42415 SET_HARD_REG_BIT (input_risky, DX_REG);
42416 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
42417 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
42418 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
42419 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
42420 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
42421 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
42422 COPY_HARD_REG_SET (inout_risky, input_risky);
42423 IOR_HARD_REG_SET (inout_risky, output_risky);
42425 df_note_add_problem ();
42426 /* Fix up what stack-regs did. */
42427 df_insn_rescan_all ();
42428 df_analyze ();
42430 regrename_init (true);
42431 regrename_analyze (NULL);
42433 auto_vec<du_head_p> cands;
42435 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
42437 if (!NONDEBUG_INSN_P (insn))
42438 continue;
42440 if (GET_CODE (PATTERN (insn)) == USE
42441 || GET_CODE (PATTERN (insn)) == CLOBBER)
42442 continue;
42444 extract_insn (insn);
42446 int opno0, opno1;
42447 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42448 recog_data.n_operands, &opno0,
42449 &opno1);
42451 if (!ix86_rop_should_change_byte_p (modrm))
42452 continue;
42454 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
42456 /* This happens when regrename has to fail a block. */
42457 if (!info->op_info)
42458 continue;
42460 if (info->op_info[opno0].n_chains != 0)
42462 gcc_assert (info->op_info[opno0].n_chains == 1);
42463 du_head_p op0c;
42464 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
42465 if (op0c->target_data_1 + op0c->target_data_2 == 0
42466 && !op0c->cannot_rename)
42467 cands.safe_push (op0c);
42469 op0c->target_data_1++;
42471 if (info->op_info[opno1].n_chains != 0)
42473 gcc_assert (info->op_info[opno1].n_chains == 1);
42474 du_head_p op1c;
42475 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
42476 if (op1c->target_data_1 + op1c->target_data_2 == 0
42477 && !op1c->cannot_rename)
42478 cands.safe_push (op1c);
42480 op1c->target_data_2++;
42484 int i;
42485 du_head_p head;
42486 FOR_EACH_VEC_ELT (cands, i, head)
42488 int old_reg, best_reg;
42489 HARD_REG_SET unavailable;
42491 CLEAR_HARD_REG_SET (unavailable);
42492 if (head->target_data_1)
42493 IOR_HARD_REG_SET (unavailable, output_risky);
42494 if (head->target_data_2)
42495 IOR_HARD_REG_SET (unavailable, input_risky);
42497 int n_uses;
42498 reg_class superclass = regrename_find_superclass (head, &n_uses,
42499 &unavailable);
42500 old_reg = head->regno;
42501 best_reg = find_rename_reg (head, superclass, &unavailable,
42502 old_reg, false);
42503 bool ok = regrename_do_replace (head, best_reg);
42504 gcc_assert (ok);
42505 if (dump_file)
42506 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
42507 reg_names[best_reg], reg_class_names[superclass]);
42511 regrename_finish ();
42513 df_analyze ();
42515 basic_block bb;
42516 regset_head live;
42518 INIT_REG_SET (&live);
42520 FOR_EACH_BB_FN (bb, cfun)
42522 rtx_insn *insn;
42524 COPY_REG_SET (&live, DF_LR_OUT (bb));
42525 df_simulate_initialize_backwards (bb, &live);
42527 FOR_BB_INSNS_REVERSE (bb, insn)
42529 if (!NONDEBUG_INSN_P (insn))
42530 continue;
42532 df_simulate_one_insn_backwards (bb, insn, &live);
42534 if (GET_CODE (PATTERN (insn)) == USE
42535 || GET_CODE (PATTERN (insn)) == CLOBBER)
42536 continue;
42538 extract_insn (insn);
42539 constrain_operands_cached (insn, reload_completed);
42540 int opno0, opno1;
42541 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42542 recog_data.n_operands, &opno0,
42543 &opno1);
42544 if (modrm < 0
42545 || !ix86_rop_should_change_byte_p (modrm)
42546 || opno0 == opno1)
42547 continue;
42549 rtx oldreg = recog_data.operand[opno1];
42550 preprocess_constraints (insn);
42551 const operand_alternative *alt = which_op_alt ();
42553 int i;
42554 for (i = 0; i < recog_data.n_operands; i++)
42555 if (i != opno1
42556 && alt[i].earlyclobber
42557 && reg_overlap_mentioned_p (recog_data.operand[i],
42558 oldreg))
42559 break;
42561 if (i < recog_data.n_operands)
42562 continue;
42564 if (dump_file)
42565 fprintf (dump_file,
42566 "attempting to fix modrm byte in insn %d:"
42567 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
42568 reg_class_names[alt[opno1].cl]);
42570 HARD_REG_SET unavailable;
42571 REG_SET_TO_HARD_REG_SET (unavailable, &live);
42572 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
42573 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
42574 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
42575 IOR_HARD_REG_SET (unavailable, output_risky);
42576 IOR_COMPL_HARD_REG_SET (unavailable,
42577 reg_class_contents[alt[opno1].cl]);
42579 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42580 if (!TEST_HARD_REG_BIT (unavailable, i))
42581 break;
42582 if (i == FIRST_PSEUDO_REGISTER)
42584 if (dump_file)
42585 fprintf (dump_file, ", none available\n");
42586 continue;
42588 if (dump_file)
42589 fprintf (dump_file, " -> %d\n", i);
42590 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
42591 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
42592 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
42597 /* Implement machine specific optimizations. We implement padding of returns
42598 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
42599 static void
42600 ix86_reorg (void)
42602 /* We are freeing block_for_insn in the toplev to keep compatibility
42603 with old MDEP_REORGS that are not CFG based. Recompute it now. */
42604 compute_bb_for_insn ();
42606 if (flag_mitigate_rop)
42607 ix86_mitigate_rop ();
42609 if (TARGET_SEH && current_function_has_exception_handlers ())
42610 ix86_seh_fixup_eh_fallthru ();
42612 if (optimize && optimize_function_for_speed_p (cfun))
42614 if (TARGET_PAD_SHORT_FUNCTION)
42615 ix86_pad_short_function ();
42616 else if (TARGET_PAD_RETURNS)
42617 ix86_pad_returns ();
42618 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42619 if (TARGET_FOUR_JUMP_LIMIT)
42620 ix86_avoid_jump_mispredicts ();
42621 #endif
42625 /* Return nonzero when a QImode register that must be represented via a REX
42626 prefix is used. */
42627 bool
42628 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
42630 int i;
42631 extract_insn_cached (insn);
42632 for (i = 0; i < recog_data.n_operands; i++)
42633 if (GENERAL_REG_P (recog_data.operand[i])
42634 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
42635 return true;
42636 return false;
42639 /* Return true when INSN mentions a register that must be encoded using a
42640 REX prefix. */
42641 bool
42642 x86_extended_reg_mentioned_p (rtx insn)
42644 subrtx_iterator::array_type array;
42645 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
42647 const_rtx x = *iter;
42648 if (REG_P (x)
42649 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
42650 return true;
42652 return false;
42655 /* If profitable, negate (without causing overflow) integer constant
42656 of mode MODE at location LOC. Return true in this case. */
42657 bool
42658 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
42660 HOST_WIDE_INT val;
42662 if (!CONST_INT_P (*loc))
42663 return false;
42665 switch (mode)
42667 case E_DImode:
42668 /* DImode x86_64 constants must fit in 32 bits. */
42669 gcc_assert (x86_64_immediate_operand (*loc, mode));
42671 mode = SImode;
42672 break;
42674 case E_SImode:
42675 case E_HImode:
42676 case E_QImode:
42677 break;
42679 default:
42680 gcc_unreachable ();
42683 /* Avoid overflows. */
42684 if (mode_signbit_p (mode, *loc))
42685 return false;
42687 val = INTVAL (*loc);
42689 /* Make things pretty: emit `subl $4,%eax' rather than `addl $-4,%eax'.
42690 Exceptions: -128 encodes smaller than 128, so swap the sign and the op. */
42691 if ((val < 0 && val != -128)
42692 || val == 128)
42694 *loc = GEN_INT (-val);
42695 return true;
42698 return false;
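/* For example, a caller that expands x - 4 as a PLUS with the constant -4
   has the constant rewritten to 4 here and then emits `subl $4, %eax'
   instead of `addl $-4, %eax'.  The value -128 is left alone (it already
   fits in a sign-extended 8-bit immediate, while 128 would not), and 128
   is negated to -128 for the same reason.  */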
42701 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
42702 optabs would emit if we didn't have TFmode patterns. */
42704 void
42705 x86_emit_floatuns (rtx operands[2])
42707 rtx_code_label *neglab, *donelab;
42708 rtx i0, i1, f0, in, out;
42709 machine_mode mode, inmode;
42711 inmode = GET_MODE (operands[1]);
42712 gcc_assert (inmode == SImode || inmode == DImode);
42714 out = operands[0];
42715 in = force_reg (inmode, operands[1]);
42716 mode = GET_MODE (out);
42717 neglab = gen_label_rtx ();
42718 donelab = gen_label_rtx ();
42719 f0 = gen_reg_rtx (mode);
42721 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
42723 expand_float (out, in, 0);
42725 emit_jump_insn (gen_jump (donelab));
42726 emit_barrier ();
42728 emit_label (neglab);
42730 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
42731 1, OPTAB_DIRECT);
42732 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
42733 1, OPTAB_DIRECT);
42734 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
42736 expand_float (f0, i0, 0);
42738 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
42740 emit_label (donelab);
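/* A minimal C sketch of the sequence emitted above, for a 64-bit input
   (names are illustrative only; assumes round-to-nearest, so folding the
   lost bit into bit 0 before halving preserves correct rounding):

     double
     floatuns_sketch (unsigned long long x)
     {
       if ((long long) x >= 0)
         return (double) (long long) x;        // fast path: fits in signed
       unsigned long long half = (x >> 1) | (x & 1);
       double d = (double) (long long) half;   // now non-negative
       return d + d;                           // scale back up
     }
*/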
42743 static bool canonicalize_perm (struct expand_vec_perm_d *d);
42744 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
42745 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
42746 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
42748 /* Get a vector mode of the same size as the original but with elements
42749 twice as wide. This is only guaranteed to apply to integral vectors. */
42751 static inline machine_mode
42752 get_mode_wider_vector (machine_mode o)
42754 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
42755 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
42756 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
42757 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
42758 return n;
42761 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
42762 fill target with val via vec_duplicate. */
42764 static bool
42765 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
42767 bool ok;
42768 rtx_insn *insn;
42769 rtx dup;
42771 /* First attempt to recognize VAL as-is. */
42772 dup = gen_vec_duplicate (mode, val);
42773 insn = emit_insn (gen_rtx_SET (target, dup));
42774 if (recog_memoized (insn) < 0)
42776 rtx_insn *seq;
42777 machine_mode innermode = GET_MODE_INNER (mode);
42778 rtx reg;
42780 /* If that fails, force VAL into a register. */
42782 start_sequence ();
42783 reg = force_reg (innermode, val);
42784 if (GET_MODE (reg) != innermode)
42785 reg = gen_lowpart (innermode, reg);
42786 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
42787 seq = get_insns ();
42788 end_sequence ();
42789 if (seq)
42790 emit_insn_before (seq, insn);
42792 ok = recog_memoized (insn) >= 0;
42793 gcc_assert (ok);
42795 return true;
42798 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42799 with all elements equal to VAR. Return true if successful. */
42801 static bool
42802 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
42803 rtx target, rtx val)
42805 bool ok;
42807 switch (mode)
42809 case E_V2SImode:
42810 case E_V2SFmode:
42811 if (!mmx_ok)
42812 return false;
42813 /* FALLTHRU */
42815 case E_V4DFmode:
42816 case E_V4DImode:
42817 case E_V8SFmode:
42818 case E_V8SImode:
42819 case E_V2DFmode:
42820 case E_V2DImode:
42821 case E_V4SFmode:
42822 case E_V4SImode:
42823 case E_V16SImode:
42824 case E_V8DImode:
42825 case E_V16SFmode:
42826 case E_V8DFmode:
42827 return ix86_vector_duplicate_value (mode, target, val);
42829 case E_V4HImode:
42830 if (!mmx_ok)
42831 return false;
42832 if (TARGET_SSE || TARGET_3DNOW_A)
42834 rtx x;
42836 val = gen_lowpart (SImode, val);
42837 x = gen_rtx_TRUNCATE (HImode, val);
42838 x = gen_rtx_VEC_DUPLICATE (mode, x);
42839 emit_insn (gen_rtx_SET (target, x));
42840 return true;
42842 goto widen;
42844 case E_V8QImode:
42845 if (!mmx_ok)
42846 return false;
42847 goto widen;
42849 case E_V8HImode:
42850 if (TARGET_AVX2)
42851 return ix86_vector_duplicate_value (mode, target, val);
42853 if (TARGET_SSE2)
42855 struct expand_vec_perm_d dperm;
42856 rtx tmp1, tmp2;
42858 permute:
42859 memset (&dperm, 0, sizeof (dperm));
42860 dperm.target = target;
42861 dperm.vmode = mode;
42862 dperm.nelt = GET_MODE_NUNITS (mode);
42863 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
42864 dperm.one_operand_p = true;
42866 /* Extend to SImode using a paradoxical SUBREG. */
42867 tmp1 = gen_reg_rtx (SImode);
42868 emit_move_insn (tmp1, gen_lowpart (SImode, val));
42870 /* Insert the SImode value as low element of a V4SImode vector. */
42871 tmp2 = gen_reg_rtx (V4SImode);
42872 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
42873 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
42875 ok = (expand_vec_perm_1 (&dperm)
42876 || expand_vec_perm_broadcast_1 (&dperm));
42877 gcc_assert (ok);
42878 return ok;
42880 goto widen;
42882 case E_V16QImode:
42883 if (TARGET_AVX2)
42884 return ix86_vector_duplicate_value (mode, target, val);
42886 if (TARGET_SSE2)
42887 goto permute;
42888 goto widen;
42890 widen:
42891 /* Replicate the value once into the next wider mode and recurse. */
42893 machine_mode smode, wsmode, wvmode;
42894 rtx x;
42896 smode = GET_MODE_INNER (mode);
42897 wvmode = get_mode_wider_vector (mode);
42898 wsmode = GET_MODE_INNER (wvmode);
42900 val = convert_modes (wsmode, smode, val, true);
42901 x = expand_simple_binop (wsmode, ASHIFT, val,
42902 GEN_INT (GET_MODE_BITSIZE (smode)),
42903 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42904 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
42906 x = gen_reg_rtx (wvmode);
42907 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
42908 gcc_assert (ok);
42909 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
42910 return ok;
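/* E.g. for V8QImode with byte value v, VAL is first widened to the HImode
   value (v << 8) | v, the duplicate is then expanded in V4HImode (recursing
   again if necessary), and the result is copied back via a lowpart move.  */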
42913 case E_V16HImode:
42914 case E_V32QImode:
42915 if (TARGET_AVX2)
42916 return ix86_vector_duplicate_value (mode, target, val);
42917 else
42919 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
42920 rtx x = gen_reg_rtx (hvmode);
42922 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42923 gcc_assert (ok);
42925 x = gen_rtx_VEC_CONCAT (mode, x, x);
42926 emit_insn (gen_rtx_SET (target, x));
42928 return true;
42930 case E_V64QImode:
42931 case E_V32HImode:
42932 if (TARGET_AVX512BW)
42933 return ix86_vector_duplicate_value (mode, target, val);
42934 else
42936 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
42937 rtx x = gen_reg_rtx (hvmode);
42939 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42940 gcc_assert (ok);
42942 x = gen_rtx_VEC_CONCAT (mode, x, x);
42943 emit_insn (gen_rtx_SET (target, x));
42945 return true;
42947 default:
42948 return false;
42952 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42953 whose ONE_VAR element is VAR, and other elements are zero. Return true
42954 if successful. */
42956 static bool
42957 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
42958 rtx target, rtx var, int one_var)
42960 machine_mode vsimode;
42961 rtx new_target;
42962 rtx x, tmp;
42963 bool use_vector_set = false;
42964 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
42966 switch (mode)
42968 case E_V2DImode:
42969 /* For SSE4.1, we normally use vector set. But if the second
42970 element is zero and inter-unit moves are OK, we use movq
42971 instead. */
42972 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
42973 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
42974 && one_var == 0));
42975 break;
42976 case E_V16QImode:
42977 case E_V4SImode:
42978 case E_V4SFmode:
42979 use_vector_set = TARGET_SSE4_1;
42980 break;
42981 case E_V8HImode:
42982 use_vector_set = TARGET_SSE2;
42983 break;
42984 case E_V4HImode:
42985 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
42986 break;
42987 case E_V32QImode:
42988 case E_V16HImode:
42989 use_vector_set = TARGET_AVX;
42990 break;
42991 case E_V8SImode:
42992 use_vector_set = TARGET_AVX;
42993 gen_vec_set_0 = gen_vec_setv8si_0;
42994 break;
42995 case E_V8SFmode:
42996 use_vector_set = TARGET_AVX;
42997 gen_vec_set_0 = gen_vec_setv8sf_0;
42998 break;
42999 case E_V4DFmode:
43000 use_vector_set = TARGET_AVX;
43001 gen_vec_set_0 = gen_vec_setv4df_0;
43002 break;
43003 case E_V4DImode:
43004 /* Use ix86_expand_vector_set in 64bit mode only. */
43005 use_vector_set = TARGET_AVX && TARGET_64BIT;
43006 gen_vec_set_0 = gen_vec_setv4di_0;
43007 break;
43008 case E_V16SImode:
43009 use_vector_set = TARGET_AVX512F && one_var == 0;
43010 gen_vec_set_0 = gen_vec_setv16si_0;
43011 break;
43012 case E_V16SFmode:
43013 use_vector_set = TARGET_AVX512F && one_var == 0;
43014 gen_vec_set_0 = gen_vec_setv16sf_0;
43015 break;
43016 case E_V8DFmode:
43017 use_vector_set = TARGET_AVX512F && one_var == 0;
43018 gen_vec_set_0 = gen_vec_setv8df_0;
43019 break;
43020 case E_V8DImode:
43021 /* Use ix86_expand_vector_set in 64bit mode only. */
43022 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
43023 gen_vec_set_0 = gen_vec_setv8di_0;
43024 break;
43025 default:
43026 break;
43029 if (use_vector_set)
43031 if (gen_vec_set_0 && one_var == 0)
43033 var = force_reg (GET_MODE_INNER (mode), var);
43034 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
43035 return true;
43037 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
43038 var = force_reg (GET_MODE_INNER (mode), var);
43039 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43040 return true;
43043 switch (mode)
43045 case E_V2SFmode:
43046 case E_V2SImode:
43047 if (!mmx_ok)
43048 return false;
43049 /* FALLTHRU */
43051 case E_V2DFmode:
43052 case E_V2DImode:
43053 if (one_var != 0)
43054 return false;
43055 var = force_reg (GET_MODE_INNER (mode), var);
43056 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
43057 emit_insn (gen_rtx_SET (target, x));
43058 return true;
43060 case E_V4SFmode:
43061 case E_V4SImode:
43062 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
43063 new_target = gen_reg_rtx (mode);
43064 else
43065 new_target = target;
43066 var = force_reg (GET_MODE_INNER (mode), var);
43067 x = gen_rtx_VEC_DUPLICATE (mode, var);
43068 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
43069 emit_insn (gen_rtx_SET (new_target, x));
43070 if (one_var != 0)
43072 /* We need to shuffle the value to the correct position, so
43073 create a new pseudo to store the intermediate result. */
43075 /* With SSE2, we can use the integer shuffle insns. */
43076 if (mode != V4SFmode && TARGET_SSE2)
43078 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
43079 const1_rtx,
43080 GEN_INT (one_var == 1 ? 0 : 1),
43081 GEN_INT (one_var == 2 ? 0 : 1),
43082 GEN_INT (one_var == 3 ? 0 : 1)));
43083 if (target != new_target)
43084 emit_move_insn (target, new_target);
43085 return true;
43088 /* Otherwise convert the intermediate result to V4SFmode and
43089 use the SSE1 shuffle instructions. */
43090 if (mode != V4SFmode)
43092 tmp = gen_reg_rtx (V4SFmode);
43093 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
43095 else
43096 tmp = new_target;
43098 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
43099 const1_rtx,
43100 GEN_INT (one_var == 1 ? 0 : 1),
43101 GEN_INT (one_var == 2 ? 0+4 : 1+4),
43102 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
43104 if (mode != V4SFmode)
43105 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
43106 else if (tmp != target)
43107 emit_move_insn (target, tmp);
43109 else if (target != new_target)
43110 emit_move_insn (target, new_target);
43111 return true;
43113 case E_V8HImode:
43114 case E_V16QImode:
43115 vsimode = V4SImode;
43116 goto widen;
43117 case E_V4HImode:
43118 case E_V8QImode:
43119 if (!mmx_ok)
43120 return false;
43121 vsimode = V2SImode;
43122 goto widen;
43123 widen:
43124 if (one_var != 0)
43125 return false;
43127 /* Zero extend the variable element to SImode and recurse. */
43128 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
43130 x = gen_reg_rtx (vsimode);
43131 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
43132 var, one_var))
43133 gcc_unreachable ();
43135 emit_move_insn (target, gen_lowpart (mode, x));
43136 return true;
43138 default:
43139 return false;
43143 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43144 consisting of the values in VALS. It is known that all elements
43145 except ONE_VAR are constants. Return true if successful. */
43147 static bool
43148 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
43149 rtx target, rtx vals, int one_var)
43151 rtx var = XVECEXP (vals, 0, one_var);
43152 machine_mode wmode;
43153 rtx const_vec, x;
43155 const_vec = copy_rtx (vals);
43156 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
43157 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
43159 switch (mode)
43161 case E_V2DFmode:
43162 case E_V2DImode:
43163 case E_V2SFmode:
43164 case E_V2SImode:
43165 /* For the two element vectors, it's just as easy to use
43166 the general case. */
43167 return false;
43169 case E_V4DImode:
43170 /* Use ix86_expand_vector_set in 64bit mode only. */
43171 if (!TARGET_64BIT)
43172 return false;
43173 /* FALLTHRU */
43174 case E_V4DFmode:
43175 case E_V8SFmode:
43176 case E_V8SImode:
43177 case E_V16HImode:
43178 case E_V32QImode:
43179 case E_V4SFmode:
43180 case E_V4SImode:
43181 case E_V8HImode:
43182 case E_V4HImode:
43183 break;
43185 case E_V16QImode:
43186 if (TARGET_SSE4_1)
43187 break;
43188 wmode = V8HImode;
43189 goto widen;
43190 case E_V8QImode:
43191 wmode = V4HImode;
43192 goto widen;
43193 widen:
43194 /* There's no way to set one QImode entry easily. Combine
43195 the variable value with its adjacent constant value, and
43196 promote to an HImode set. */
43197 x = XVECEXP (vals, 0, one_var ^ 1);
43198 if (one_var & 1)
43200 var = convert_modes (HImode, QImode, var, true);
43201 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
43202 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43203 x = GEN_INT (INTVAL (x) & 0xff);
43205 else
43207 var = convert_modes (HImode, QImode, var, true);
43208 x = gen_int_mode (INTVAL (x) << 8, HImode);
43210 if (x != const0_rtx)
43211 var = expand_simple_binop (HImode, IOR, var, x, var,
43212 1, OPTAB_LIB_WIDEN);
43214 x = gen_reg_rtx (wmode);
43215 emit_move_insn (x, gen_lowpart (wmode, const_vec));
43216 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
43218 emit_move_insn (target, gen_lowpart (mode, x));
43219 return true;
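/* E.g. for a V8QImode vector whose only variable element is element 3, the
   pair (element 2, element 3) is combined as (VAR << 8) | (constant & 0xff)
   and stored as HImode element 1 of the V4HImode view of the vector.  */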
43221 default:
43222 return false;
43225 emit_move_insn (target, const_vec);
43226 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43227 return true;
43230 /* A subroutine of ix86_expand_vector_init_general. Use vector
43231 concatenate to handle the most general case: all values variable,
43232 and none identical. */
43234 static void
43235 ix86_expand_vector_init_concat (machine_mode mode,
43236 rtx target, rtx *ops, int n)
43238 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
43239 rtx first[16], second[8], third[4];
43240 rtvec v;
43241 int i, j;
43243 switch (n)
43245 case 2:
43246 switch (mode)
43248 case E_V16SImode:
43249 cmode = V8SImode;
43250 break;
43251 case E_V16SFmode:
43252 cmode = V8SFmode;
43253 break;
43254 case E_V8DImode:
43255 cmode = V4DImode;
43256 break;
43257 case E_V8DFmode:
43258 cmode = V4DFmode;
43259 break;
43260 case E_V8SImode:
43261 cmode = V4SImode;
43262 break;
43263 case E_V8SFmode:
43264 cmode = V4SFmode;
43265 break;
43266 case E_V4DImode:
43267 cmode = V2DImode;
43268 break;
43269 case E_V4DFmode:
43270 cmode = V2DFmode;
43271 break;
43272 case E_V4SImode:
43273 cmode = V2SImode;
43274 break;
43275 case E_V4SFmode:
43276 cmode = V2SFmode;
43277 break;
43278 case E_V2DImode:
43279 cmode = DImode;
43280 break;
43281 case E_V2SImode:
43282 cmode = SImode;
43283 break;
43284 case E_V2DFmode:
43285 cmode = DFmode;
43286 break;
43287 case E_V2SFmode:
43288 cmode = SFmode;
43289 break;
43290 default:
43291 gcc_unreachable ();
43294 if (!register_operand (ops[1], cmode))
43295 ops[1] = force_reg (cmode, ops[1]);
43296 if (!register_operand (ops[0], cmode))
43297 ops[0] = force_reg (cmode, ops[0]);
43298 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
43299 ops[1])));
43300 break;
43302 case 4:
43303 switch (mode)
43305 case E_V4DImode:
43306 cmode = V2DImode;
43307 break;
43308 case E_V4DFmode:
43309 cmode = V2DFmode;
43310 break;
43311 case E_V4SImode:
43312 cmode = V2SImode;
43313 break;
43314 case E_V4SFmode:
43315 cmode = V2SFmode;
43316 break;
43317 default:
43318 gcc_unreachable ();
43320 goto half;
43322 case 8:
43323 switch (mode)
43325 case E_V8DImode:
43326 cmode = V2DImode;
43327 hmode = V4DImode;
43328 break;
43329 case E_V8DFmode:
43330 cmode = V2DFmode;
43331 hmode = V4DFmode;
43332 break;
43333 case E_V8SImode:
43334 cmode = V2SImode;
43335 hmode = V4SImode;
43336 break;
43337 case E_V8SFmode:
43338 cmode = V2SFmode;
43339 hmode = V4SFmode;
43340 break;
43341 default:
43342 gcc_unreachable ();
43344 goto half;
43346 case 16:
43347 switch (mode)
43349 case E_V16SImode:
43350 cmode = V2SImode;
43351 hmode = V4SImode;
43352 gmode = V8SImode;
43353 break;
43354 case E_V16SFmode:
43355 cmode = V2SFmode;
43356 hmode = V4SFmode;
43357 gmode = V8SFmode;
43358 break;
43359 default:
43360 gcc_unreachable ();
43362 goto half;
43364 half:
43365 /* FIXME: We process inputs backward to help RA. PR 36222. */
43366 i = n - 1;
43367 j = (n >> 1) - 1;
43368 for (; i > 0; i -= 2, j--)
43370 first[j] = gen_reg_rtx (cmode);
43371 v = gen_rtvec (2, ops[i - 1], ops[i]);
43372 ix86_expand_vector_init (false, first[j],
43373 gen_rtx_PARALLEL (cmode, v));
43376 n >>= 1;
43377 if (n > 4)
43379 gcc_assert (hmode != VOIDmode);
43380 gcc_assert (gmode != VOIDmode);
43381 for (i = j = 0; i < n; i += 2, j++)
43383 second[j] = gen_reg_rtx (hmode);
43384 ix86_expand_vector_init_concat (hmode, second [j],
43385 &first [i], 2);
43387 n >>= 1;
43388 for (i = j = 0; i < n; i += 2, j++)
43390 third[j] = gen_reg_rtx (gmode);
43391 ix86_expand_vector_init_concat (gmode, third[j],
43392 &second[i], 2);
43394 n >>= 1;
43395 ix86_expand_vector_init_concat (mode, target, third, n);
43397 else if (n > 2)
43399 gcc_assert (hmode != VOIDmode);
43400 for (i = j = 0; i < n; i += 2, j++)
43402 second[j] = gen_reg_rtx (hmode);
43403 ix86_expand_vector_init_concat (hmode, second [j],
43404 &first [i], 2);
43406 n >>= 1;
43407 ix86_expand_vector_init_concat (mode, target, second, n);
43409 else
43410 ix86_expand_vector_init_concat (mode, target, first, n);
43411 break;
43413 default:
43414 gcc_unreachable ();
43418 /* A subroutine of ix86_expand_vector_init_general. Use vector
43419 interleave to handle the most general case: all values variable,
43420 and none identical. */
43422 static void
43423 ix86_expand_vector_init_interleave (machine_mode mode,
43424 rtx target, rtx *ops, int n)
43426 machine_mode first_imode, second_imode, third_imode, inner_mode;
43427 int i, j;
43428 rtx op0, op1;
43429 rtx (*gen_load_even) (rtx, rtx, rtx);
43430 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
43431 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
43433 switch (mode)
43435 case E_V8HImode:
43436 gen_load_even = gen_vec_setv8hi;
43437 gen_interleave_first_low = gen_vec_interleave_lowv4si;
43438 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43439 inner_mode = HImode;
43440 first_imode = V4SImode;
43441 second_imode = V2DImode;
43442 third_imode = VOIDmode;
43443 break;
43444 case E_V16QImode:
43445 gen_load_even = gen_vec_setv16qi;
43446 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
43447 gen_interleave_second_low = gen_vec_interleave_lowv4si;
43448 inner_mode = QImode;
43449 first_imode = V8HImode;
43450 second_imode = V4SImode;
43451 third_imode = V2DImode;
43452 break;
43453 default:
43454 gcc_unreachable ();
43457 for (i = 0; i < n; i++)
43459 /* Extend the odd element to SImode using a paradoxical SUBREG. */
43460 op0 = gen_reg_rtx (SImode);
43461 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
43463 /* Insert the SImode value as low element of V4SImode vector. */
43464 op1 = gen_reg_rtx (V4SImode);
43465 op0 = gen_rtx_VEC_MERGE (V4SImode,
43466 gen_rtx_VEC_DUPLICATE (V4SImode,
43467 op0),
43468 CONST0_RTX (V4SImode),
43469 const1_rtx);
43470 emit_insn (gen_rtx_SET (op1, op0));
43472 /* Cast the V4SImode vector back to a vector in the original mode. */
43473 op0 = gen_reg_rtx (mode);
43474 emit_move_insn (op0, gen_lowpart (mode, op1));
43476 /* Load even elements into the second position. */
43477 emit_insn (gen_load_even (op0,
43478 force_reg (inner_mode,
43479 ops [i + i + 1]),
43480 const1_rtx));
43482 /* Cast vector to FIRST_IMODE vector. */
43483 ops[i] = gen_reg_rtx (first_imode);
43484 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
43487 /* Interleave low FIRST_IMODE vectors. */
43488 for (i = j = 0; i < n; i += 2, j++)
43490 op0 = gen_reg_rtx (first_imode);
43491 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
43493 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
43494 ops[j] = gen_reg_rtx (second_imode);
43495 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
43498 /* Interleave low SECOND_IMODE vectors. */
43499 switch (second_imode)
43501 case E_V4SImode:
43502 for (i = j = 0; i < n / 2; i += 2, j++)
43504 op0 = gen_reg_rtx (second_imode);
43505 emit_insn (gen_interleave_second_low (op0, ops[i],
43506 ops[i + 1]));
43508 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
43509 vector. */
43510 ops[j] = gen_reg_rtx (third_imode);
43511 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
43513 second_imode = V2DImode;
43514 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43515 /* FALLTHRU */
43517 case E_V2DImode:
43518 op0 = gen_reg_rtx (second_imode);
43519 emit_insn (gen_interleave_second_low (op0, ops[0],
43520 ops[1]));
43522 /* Cast the SECOND_IMODE vector back to a vector in the original
43523 mode. */
43524 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
43525 break;
43527 default:
43528 gcc_unreachable ();
43532 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
43533 all values variable, and none identical. */
43535 static void
43536 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
43537 rtx target, rtx vals)
43539 rtx ops[64], op0, op1, op2, op3, op4, op5;
43540 machine_mode half_mode = VOIDmode;
43541 machine_mode quarter_mode = VOIDmode;
43542 int n, i;
43544 switch (mode)
43546 case E_V2SFmode:
43547 case E_V2SImode:
43548 if (!mmx_ok && !TARGET_SSE)
43549 break;
43550 /* FALLTHRU */
43552 case E_V16SImode:
43553 case E_V16SFmode:
43554 case E_V8DFmode:
43555 case E_V8DImode:
43556 case E_V8SFmode:
43557 case E_V8SImode:
43558 case E_V4DFmode:
43559 case E_V4DImode:
43560 case E_V4SFmode:
43561 case E_V4SImode:
43562 case E_V2DFmode:
43563 case E_V2DImode:
43564 n = GET_MODE_NUNITS (mode);
43565 for (i = 0; i < n; i++)
43566 ops[i] = XVECEXP (vals, 0, i);
43567 ix86_expand_vector_init_concat (mode, target, ops, n);
43568 return;
43570 case E_V2TImode:
43571 for (i = 0; i < 2; i++)
43572 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43573 op0 = gen_reg_rtx (V4DImode);
43574 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
43575 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43576 return;
43578 case E_V4TImode:
43579 for (i = 0; i < 4; i++)
43580 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43581 ops[4] = gen_reg_rtx (V4DImode);
43582 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
43583 ops[5] = gen_reg_rtx (V4DImode);
43584 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
43585 op0 = gen_reg_rtx (V8DImode);
43586 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
43587 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43588 return;
43590 case E_V32QImode:
43591 half_mode = V16QImode;
43592 goto half;
43594 case E_V16HImode:
43595 half_mode = V8HImode;
43596 goto half;
43598 half:
43599 n = GET_MODE_NUNITS (mode);
43600 for (i = 0; i < n; i++)
43601 ops[i] = XVECEXP (vals, 0, i);
43602 op0 = gen_reg_rtx (half_mode);
43603 op1 = gen_reg_rtx (half_mode);
43604 ix86_expand_vector_init_interleave (half_mode, op0, ops,
43605 n >> 2);
43606 ix86_expand_vector_init_interleave (half_mode, op1,
43607 &ops [n >> 1], n >> 2);
43608 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
43609 return;
43611 case E_V64QImode:
43612 quarter_mode = V16QImode;
43613 half_mode = V32QImode;
43614 goto quarter;
43616 case E_V32HImode:
43617 quarter_mode = V8HImode;
43618 half_mode = V16HImode;
43619 goto quarter;
43621 quarter:
43622 n = GET_MODE_NUNITS (mode);
43623 for (i = 0; i < n; i++)
43624 ops[i] = XVECEXP (vals, 0, i);
43625 op0 = gen_reg_rtx (quarter_mode);
43626 op1 = gen_reg_rtx (quarter_mode);
43627 op2 = gen_reg_rtx (quarter_mode);
43628 op3 = gen_reg_rtx (quarter_mode);
43629 op4 = gen_reg_rtx (half_mode);
43630 op5 = gen_reg_rtx (half_mode);
43631 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
43632 n >> 3);
43633 ix86_expand_vector_init_interleave (quarter_mode, op1,
43634 &ops [n >> 2], n >> 3);
43635 ix86_expand_vector_init_interleave (quarter_mode, op2,
43636 &ops [n >> 1], n >> 3);
43637 ix86_expand_vector_init_interleave (quarter_mode, op3,
43638 &ops [(n >> 1) | (n >> 2)], n >> 3);
43639 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
43640 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
43641 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
43642 return;
43644 case E_V16QImode:
43645 if (!TARGET_SSE4_1)
43646 break;
43647 /* FALLTHRU */
43649 case E_V8HImode:
43650 if (!TARGET_SSE2)
43651 break;
43653 /* Don't use ix86_expand_vector_init_interleave if we can't
43654 move from GPR to SSE register directly. */
43655 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
43656 break;
43658 n = GET_MODE_NUNITS (mode);
43659 for (i = 0; i < n; i++)
43660 ops[i] = XVECEXP (vals, 0, i);
43661 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
43662 return;
43664 case E_V4HImode:
43665 case E_V8QImode:
43666 break;
43668 default:
43669 gcc_unreachable ();
43673 int i, j, n_elts, n_words, n_elt_per_word;
43674 machine_mode inner_mode;
43675 rtx words[4], shift;
43677 inner_mode = GET_MODE_INNER (mode);
43678 n_elts = GET_MODE_NUNITS (mode);
43679 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
43680 n_elt_per_word = n_elts / n_words;
43681 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
43683 for (i = 0; i < n_words; ++i)
43685 rtx word = NULL_RTX;
43687 for (j = 0; j < n_elt_per_word; ++j)
43689 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
43690 elt = convert_modes (word_mode, inner_mode, elt, true);
43692 if (j == 0)
43693 word = elt;
43694 else
43696 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
43697 word, 1, OPTAB_LIB_WIDEN);
43698 word = expand_simple_binop (word_mode, IOR, word, elt,
43699 word, 1, OPTAB_LIB_WIDEN);
43703 words[i] = word;
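/* E.g. for V4HImode {a, b, c, d} with a 32-bit word, the loop above builds
   words[0] = (b << 16) | a and words[1] = (d << 16) | c, i.e. each word-sized
   chunk is assembled from its elements in little-endian order.  */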
43706 if (n_words == 1)
43707 emit_move_insn (target, gen_lowpart (mode, words[0]));
43708 else if (n_words == 2)
43710 rtx tmp = gen_reg_rtx (mode);
43711 emit_clobber (tmp);
43712 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
43713 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
43714 emit_move_insn (target, tmp);
43716 else if (n_words == 4)
43718 rtx tmp = gen_reg_rtx (V4SImode);
43719 gcc_assert (word_mode == SImode);
43720 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
43721 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
43722 emit_move_insn (target, gen_lowpart (mode, tmp));
43724 else
43725 gcc_unreachable ();
43729 /* Initialize vector TARGET via VALS. Suppress the use of MMX
43730 instructions unless MMX_OK is true. */
43732 void
43733 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
43735 machine_mode mode = GET_MODE (target);
43736 machine_mode inner_mode = GET_MODE_INNER (mode);
43737 int n_elts = GET_MODE_NUNITS (mode);
43738 int n_var = 0, one_var = -1;
43739 bool all_same = true, all_const_zero = true;
43740 int i;
43741 rtx x;
43743 /* First, handle initialization from vector elts. */
43744 if (n_elts != XVECLEN (vals, 0))
43746 rtx subtarget = target;
43747 x = XVECEXP (vals, 0, 0);
43748 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
43749 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
43751 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
43752 if (inner_mode == QImode || inner_mode == HImode)
43754 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
43755 mode = mode_for_vector (SImode, n_bits / 4).require ();
43756 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
43757 ops[0] = gen_lowpart (inner_mode, ops[0]);
43758 ops[1] = gen_lowpart (inner_mode, ops[1]);
43759 subtarget = gen_reg_rtx (mode);
43761 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
43762 if (subtarget != target)
43763 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
43764 return;
43766 gcc_unreachable ();
43769 for (i = 0; i < n_elts; ++i)
43771 x = XVECEXP (vals, 0, i);
43772 if (!(CONST_SCALAR_INT_P (x)
43773 || CONST_DOUBLE_P (x)
43774 || CONST_FIXED_P (x)))
43775 n_var++, one_var = i;
43776 else if (x != CONST0_RTX (inner_mode))
43777 all_const_zero = false;
43778 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
43779 all_same = false;
43782 /* Constants are best loaded from the constant pool. */
43783 if (n_var == 0)
43785 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
43786 return;
43789 /* If all values are identical, broadcast the value. */
43790 if (all_same
43791 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
43792 XVECEXP (vals, 0, 0)))
43793 return;
43795 /* Values where only one field is non-constant are best loaded from
43796 the pool and overwritten via move later. */
43797 if (n_var == 1)
43799 if (all_const_zero
43800 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
43801 XVECEXP (vals, 0, one_var),
43802 one_var))
43803 return;
43805 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
43806 return;
43809 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
43812 void
43813 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
43815 machine_mode mode = GET_MODE (target);
43816 machine_mode inner_mode = GET_MODE_INNER (mode);
43817 machine_mode half_mode;
43818 bool use_vec_merge = false;
43819 rtx tmp;
43820 static rtx (*gen_extract[6][2]) (rtx, rtx)
43822 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
43823 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
43824 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
43825 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
43826 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
43827 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
43829 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
43831 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
43832 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
43833 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
43834 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
43835 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
43836 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
43838 int i, j, n;
43839 machine_mode mmode = VOIDmode;
43840 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
43842 switch (mode)
43844 case E_V2SFmode:
43845 case E_V2SImode:
43846 if (mmx_ok)
43848 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43849 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
43850 if (elt == 0)
43851 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43852 else
43853 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43854 emit_insn (gen_rtx_SET (target, tmp));
43855 return;
43857 break;
43859 case E_V2DImode:
43860 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
43861 if (use_vec_merge)
43862 break;
43864 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43865 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
43866 if (elt == 0)
43867 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43868 else
43869 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43870 emit_insn (gen_rtx_SET (target, tmp));
43871 return;
43873 case E_V2DFmode:
43875 rtx op0, op1;
43877 /* For the two element vectors, we implement a VEC_CONCAT with
43878 the extraction of the other element. */
43880 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
43881 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
43883 if (elt == 0)
43884 op0 = val, op1 = tmp;
43885 else
43886 op0 = tmp, op1 = val;
43888 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
43889 emit_insn (gen_rtx_SET (target, tmp));
43891 return;
43893 case E_V4SFmode:
43894 use_vec_merge = TARGET_SSE4_1;
43895 if (use_vec_merge)
43896 break;
43898 switch (elt)
43900 case 0:
43901 use_vec_merge = true;
43902 break;
43904 case 1:
43905 /* tmp = target = A B C D */
43906 tmp = copy_to_reg (target);
43907 /* target = A A B B */
43908 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
43909 /* target = X A B B */
43910 ix86_expand_vector_set (false, target, val, 0);
43911 /* target = A X C D */
43912 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43913 const1_rtx, const0_rtx,
43914 GEN_INT (2+4), GEN_INT (3+4)));
43915 return;
43917 case 2:
43918 /* tmp = target = A B C D */
43919 tmp = copy_to_reg (target);
43920 /* tmp = X B C D */
43921 ix86_expand_vector_set (false, tmp, val, 0);
43922 /* target = A B X D */
43923 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43924 const0_rtx, const1_rtx,
43925 GEN_INT (0+4), GEN_INT (3+4)));
43926 return;
43928 case 3:
43929 /* tmp = target = A B C D */
43930 tmp = copy_to_reg (target);
43931 /* tmp = X B C D */
43932 ix86_expand_vector_set (false, tmp, val, 0);
43933 /* target = A B C X */
43934 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43935 const0_rtx, const1_rtx,
43936 GEN_INT (2+4), GEN_INT (0+4)));
43937 return;
43939 default:
43940 gcc_unreachable ();
43942 break;
43944 case E_V4SImode:
43945 use_vec_merge = TARGET_SSE4_1;
43946 if (use_vec_merge)
43947 break;
43949 /* Element 0 handled by vec_merge below. */
43950 if (elt == 0)
43952 use_vec_merge = true;
43953 break;
43956 if (TARGET_SSE2)
43958 /* With SSE2, use integer shuffles to swap element 0 and ELT,
43959 store into element 0, then shuffle them back. */
43961 rtx order[4];
43963 order[0] = GEN_INT (elt);
43964 order[1] = const1_rtx;
43965 order[2] = const2_rtx;
43966 order[3] = GEN_INT (3);
43967 order[elt] = const0_rtx;
43969 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43970 order[1], order[2], order[3]));
43972 ix86_expand_vector_set (false, target, val, 0);
43974 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43975 order[1], order[2], order[3]));
43977 else
43979 /* For SSE1, we have to reuse the V4SF code. */
43980 rtx t = gen_reg_rtx (V4SFmode);
43981 emit_move_insn (t, gen_lowpart (V4SFmode, target));
43982 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
43983 emit_move_insn (target, gen_lowpart (mode, t));
43985 return;
43987 case E_V8HImode:
43988 use_vec_merge = TARGET_SSE2;
43989 break;
43990 case E_V4HImode:
43991 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43992 break;
43994 case E_V16QImode:
43995 use_vec_merge = TARGET_SSE4_1;
43996 break;
43998 case E_V8QImode:
43999 break;
44001 case E_V32QImode:
44002 half_mode = V16QImode;
44003 j = 0;
44004 n = 16;
44005 goto half;
44007 case E_V16HImode:
44008 half_mode = V8HImode;
44009 j = 1;
44010 n = 8;
44011 goto half;
44013 case E_V8SImode:
44014 half_mode = V4SImode;
44015 j = 2;
44016 n = 4;
44017 goto half;
44019 case E_V4DImode:
44020 half_mode = V2DImode;
44021 j = 3;
44022 n = 2;
44023 goto half;
44025 case E_V8SFmode:
44026 half_mode = V4SFmode;
44027 j = 4;
44028 n = 4;
44029 goto half;
44031 case E_V4DFmode:
44032 half_mode = V2DFmode;
44033 j = 5;
44034 n = 2;
44035 goto half;
44037 half:
44038 /* Compute offset. */
44039 i = elt / n;
44040 elt %= n;
44042 gcc_assert (i <= 1);
44044 /* Extract the half. */
44045 tmp = gen_reg_rtx (half_mode);
44046 emit_insn (gen_extract[j][i] (tmp, target));
44048 /* Put val in tmp at elt. */
44049 ix86_expand_vector_set (false, tmp, val, elt);
44051 /* Put it back. */
44052 emit_insn (gen_insert[j][i] (target, target, tmp));
44053 return;
44055 case E_V8DFmode:
44056 if (TARGET_AVX512F)
44058 mmode = QImode;
44059 gen_blendm = gen_avx512f_blendmv8df;
44061 break;
44063 case E_V8DImode:
44064 if (TARGET_AVX512F)
44066 mmode = QImode;
44067 gen_blendm = gen_avx512f_blendmv8di;
44069 break;
44071 case E_V16SFmode:
44072 if (TARGET_AVX512F)
44074 mmode = HImode;
44075 gen_blendm = gen_avx512f_blendmv16sf;
44077 break;
44079 case E_V16SImode:
44080 if (TARGET_AVX512F)
44082 mmode = HImode;
44083 gen_blendm = gen_avx512f_blendmv16si;
44085 break;
44087 case E_V32HImode:
44088 if (TARGET_AVX512BW)
44090 mmode = SImode;
44091 gen_blendm = gen_avx512bw_blendmv32hi;
44093 else if (TARGET_AVX512F)
44095 half_mode = E_V8HImode;
44096 n = 8;
44097 goto quarter;
44099 break;
44101 case E_V64QImode:
44102 if (TARGET_AVX512BW)
44104 mmode = DImode;
44105 gen_blendm = gen_avx512bw_blendmv64qi;
44107 else if (TARGET_AVX512F)
44109 half_mode = E_V16QImode;
44110 n = 16;
44111 goto quarter;
44113 break;
44115 quarter:
44116 /* Compute offset. */
44117 i = elt / n;
44118 elt %= n;
44120 gcc_assert (i <= 3);
44123 /* Extract the quarter. */
44124 tmp = gen_reg_rtx (V4SImode);
44125 rtx tmp2 = gen_lowpart (V16SImode, target);
44126 rtx mask = gen_reg_rtx (QImode);
44128 emit_move_insn (mask, constm1_rtx);
44129 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
44130 tmp, mask));
44132 tmp2 = gen_reg_rtx (half_mode);
44133 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
44134 tmp = tmp2;
44136 /* Put val in tmp at elt. */
44137 ix86_expand_vector_set (false, tmp, val, elt);
44139 /* Put it back. */
44140 tmp2 = gen_reg_rtx (V16SImode);
44141 rtx tmp3 = gen_lowpart (V16SImode, target);
44142 mask = gen_reg_rtx (HImode);
44143 emit_move_insn (mask, constm1_rtx);
44144 tmp = gen_lowpart (V4SImode, tmp);
44145 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
44146 tmp3, mask));
44147 emit_move_insn (target, gen_lowpart (mode, tmp2));
44149 return;
44151 default:
44152 break;
44155 if (mmode != VOIDmode)
44157 tmp = gen_reg_rtx (mode);
44158 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
44159 /* The avx512*_blendm<mode> expanders have different operand order
44160 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
44161 elements where the mask is set and second input operand otherwise,
44162 in {sse,avx}*_*blend* the first input operand is used for elements
44163 where the mask is clear and second input operand otherwise. */
44164 emit_insn (gen_blendm (target, target, tmp,
44165 force_reg (mmode,
44166 gen_int_mode (1 << elt, mmode))));
44168 else if (use_vec_merge)
44170 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
44171 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
44172 emit_insn (gen_rtx_SET (target, tmp));
44174 else
44176 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44178 emit_move_insn (mem, target);
44180 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44181 emit_move_insn (tmp, val);
44183 emit_move_insn (target, mem);
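/* Editor's note (an illustrative sketch, not part of the original file):
   for the 256-bit and 512-bit modes the insert above works recursively on
   halves (and on quarters for the AVX-512 byte/word cases without
   AVX512BW).  Setting element 5 of a V8SImode vector, for example, takes
   the E_V8SImode case with n == 4 and j == 2:

     i   = 5 / 4 = 1    the element lives in the high 128-bit half
     elt = 5 % 4 = 1    its position within that half

   so the high V4SImode half is extracted, the recursive call sets element
   1 there (a single vec_merge insert when SSE4.1 is available), and the
   half is written back with gen_vec_set_hi_v8si.  */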
44187 void
44188 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
44190 machine_mode mode = GET_MODE (vec);
44191 machine_mode inner_mode = GET_MODE_INNER (mode);
44192 bool use_vec_extr = false;
44193 rtx tmp;
44195 switch (mode)
44197 case E_V2SImode:
44198 case E_V2SFmode:
44199 if (!mmx_ok)
44200 break;
44201 /* FALLTHRU */
44203 case E_V2DFmode:
44204 case E_V2DImode:
44205 case E_V2TImode:
44206 case E_V4TImode:
44207 use_vec_extr = true;
44208 break;
44210 case E_V4SFmode:
44211 use_vec_extr = TARGET_SSE4_1;
44212 if (use_vec_extr)
44213 break;
44215 switch (elt)
44217 case 0:
44218 tmp = vec;
44219 break;
44221 case 1:
44222 case 3:
44223 tmp = gen_reg_rtx (mode);
44224 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
44225 GEN_INT (elt), GEN_INT (elt),
44226 GEN_INT (elt+4), GEN_INT (elt+4)));
44227 break;
44229 case 2:
44230 tmp = gen_reg_rtx (mode);
44231 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
44232 break;
44234 default:
44235 gcc_unreachable ();
44237 vec = tmp;
44238 use_vec_extr = true;
44239 elt = 0;
44240 break;
44242 case E_V4SImode:
44243 use_vec_extr = TARGET_SSE4_1;
44244 if (use_vec_extr)
44245 break;
44247 if (TARGET_SSE2)
44249 switch (elt)
44251 case 0:
44252 tmp = vec;
44253 break;
44255 case 1:
44256 case 3:
44257 tmp = gen_reg_rtx (mode);
44258 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
44259 GEN_INT (elt), GEN_INT (elt),
44260 GEN_INT (elt), GEN_INT (elt)));
44261 break;
44263 case 2:
44264 tmp = gen_reg_rtx (mode);
44265 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
44266 break;
44268 default:
44269 gcc_unreachable ();
44271 vec = tmp;
44272 use_vec_extr = true;
44273 elt = 0;
44275 else
44277 /* For SSE1, we have to reuse the V4SF code. */
44278 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
44279 gen_lowpart (V4SFmode, vec), elt);
44280 return;
44282 break;
44284 case E_V8HImode:
44285 use_vec_extr = TARGET_SSE2;
44286 break;
44287 case E_V4HImode:
44288 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44289 break;
44291 case E_V16QImode:
44292 use_vec_extr = TARGET_SSE4_1;
44293 break;
44295 case E_V8SFmode:
44296 if (TARGET_AVX)
44298 tmp = gen_reg_rtx (V4SFmode);
44299 if (elt < 4)
44300 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
44301 else
44302 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
44303 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44304 return;
44306 break;
44308 case E_V4DFmode:
44309 if (TARGET_AVX)
44311 tmp = gen_reg_rtx (V2DFmode);
44312 if (elt < 2)
44313 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44314 else
44315 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44316 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44317 return;
44319 break;
44321 case E_V32QImode:
44322 if (TARGET_AVX)
44324 tmp = gen_reg_rtx (V16QImode);
44325 if (elt < 16)
44326 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44327 else
44328 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
44329 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44330 return;
44332 break;
44334 case E_V16HImode:
44335 if (TARGET_AVX)
44337 tmp = gen_reg_rtx (V8HImode);
44338 if (elt < 8)
44339 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
44340 else
44341 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
44342 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44343 return;
44345 break;
44347 case E_V8SImode:
44348 if (TARGET_AVX)
44350 tmp = gen_reg_rtx (V4SImode);
44351 if (elt < 4)
44352 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
44353 else
44354 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
44355 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44356 return;
44358 break;
44360 case E_V4DImode:
44361 if (TARGET_AVX)
44363 tmp = gen_reg_rtx (V2DImode);
44364 if (elt < 2)
44365 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
44366 else
44367 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
44368 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44369 return;
44371 break;
44373 case E_V32HImode:
44374 if (TARGET_AVX512BW)
44376 tmp = gen_reg_rtx (V16HImode);
44377 if (elt < 16)
44378 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
44379 else
44380 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
44381 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44382 return;
44384 break;
44386 case E_V64QImode:
44387 if (TARGET_AVX512BW)
44389 tmp = gen_reg_rtx (V32QImode);
44390 if (elt < 32)
44391 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
44392 else
44393 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
44394 ix86_expand_vector_extract (false, target, tmp, elt & 31);
44395 return;
44397 break;
44399 case E_V16SFmode:
44400 tmp = gen_reg_rtx (V8SFmode);
44401 if (elt < 8)
44402 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
44403 else
44404 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
44405 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44406 return;
44408 case E_V8DFmode:
44409 tmp = gen_reg_rtx (V4DFmode);
44410 if (elt < 4)
44411 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44412 else
44413 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44414 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44415 return;
44417 case E_V16SImode:
44418 tmp = gen_reg_rtx (V8SImode);
44419 if (elt < 8)
44420 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44421 else
44422 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44423 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44424 return;
44426 case E_V8DImode:
44427 tmp = gen_reg_rtx (V4DImode);
44428 if (elt < 4)
44429 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44430 else
44431 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44432 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44433 return;
44435 case E_V8QImode:
44436 /* ??? Could extract the appropriate HImode element and shift. */
44437 default:
44438 break;
44441 if (use_vec_extr)
44443 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44444 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44446 /* Let the rtl optimizers know about the zero extension performed. */
44447 if (inner_mode == QImode || inner_mode == HImode)
44449 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44450 target = gen_lowpart (SImode, target);
44453 emit_insn (gen_rtx_SET (target, tmp));
44455 else
44457 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44459 emit_move_insn (mem, vec);
44461 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44462 emit_move_insn (target, tmp);
44466 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
44467 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
44468 The upper bits of DEST are undefined, though they shouldn't cause
44469 exceptions (some bits from src or all zeros are ok). */
44471 static void
44472 emit_reduc_half (rtx dest, rtx src, int i)
44474 rtx tem, d = dest;
44475 switch (GET_MODE (src))
44477 case E_V4SFmode:
44478 if (i == 128)
44479 tem = gen_sse_movhlps (dest, src, src);
44480 else
44481 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
44482 GEN_INT (1 + 4), GEN_INT (1 + 4));
44483 break;
44484 case E_V2DFmode:
44485 tem = gen_vec_interleave_highv2df (dest, src, src);
44486 break;
44487 case E_V16QImode:
44488 case E_V8HImode:
44489 case E_V4SImode:
44490 case E_V2DImode:
44491 d = gen_reg_rtx (V1TImode);
44492 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
44493 GEN_INT (i / 2));
44494 break;
44495 case E_V8SFmode:
44496 if (i == 256)
44497 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
44498 else
44499 tem = gen_avx_shufps256 (dest, src, src,
44500 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
44501 break;
44502 case E_V4DFmode:
44503 if (i == 256)
44504 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
44505 else
44506 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
44507 break;
44508 case E_V32QImode:
44509 case E_V16HImode:
44510 case E_V8SImode:
44511 case E_V4DImode:
44512 if (i == 256)
44514 if (GET_MODE (dest) != V4DImode)
44515 d = gen_reg_rtx (V4DImode);
44516 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
44517 gen_lowpart (V4DImode, src),
44518 const1_rtx);
44520 else
44522 d = gen_reg_rtx (V2TImode);
44523 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
44524 GEN_INT (i / 2));
44526 break;
44527 case E_V64QImode:
44528 case E_V32HImode:
44529 case E_V16SImode:
44530 case E_V16SFmode:
44531 case E_V8DImode:
44532 case E_V8DFmode:
44533 if (i > 128)
44534 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
44535 gen_lowpart (V16SImode, src),
44536 gen_lowpart (V16SImode, src),
44537 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
44538 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
44539 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
44540 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
44541 GEN_INT (0xC), GEN_INT (0xD),
44542 GEN_INT (0xE), GEN_INT (0xF),
44543 GEN_INT (0x10), GEN_INT (0x11),
44544 GEN_INT (0x12), GEN_INT (0x13),
44545 GEN_INT (0x14), GEN_INT (0x15),
44546 GEN_INT (0x16), GEN_INT (0x17));
44547 else
44548 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
44549 gen_lowpart (V16SImode, src),
44550 GEN_INT (i == 128 ? 0x2 : 0x1),
44551 GEN_INT (0x3),
44552 GEN_INT (0x3),
44553 GEN_INT (0x3),
44554 GEN_INT (i == 128 ? 0x6 : 0x5),
44555 GEN_INT (0x7),
44556 GEN_INT (0x7),
44557 GEN_INT (0x7),
44558 GEN_INT (i == 128 ? 0xA : 0x9),
44559 GEN_INT (0xB),
44560 GEN_INT (0xB),
44561 GEN_INT (0xB),
44562 GEN_INT (i == 128 ? 0xE : 0xD),
44563 GEN_INT (0xF),
44564 GEN_INT (0xF),
44565 GEN_INT (0xF));
44566 break;
44567 default:
44568 gcc_unreachable ();
44570 emit_insn (tem);
44571 if (d != dest)
44572 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
44575 /* Expand a vector reduction. FN is the binary pattern to reduce;
44576 DEST is the destination; IN is the input vector. */
44578 void
44579 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
44581 rtx half, dst, vec = in;
44582 machine_mode mode = GET_MODE (in);
44583 int i;
44585 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
44586 if (TARGET_SSE4_1
44587 && mode == V8HImode
44588 && fn == gen_uminv8hi3)
44590 emit_insn (gen_sse4_1_phminposuw (dest, in));
44591 return;
44594 for (i = GET_MODE_BITSIZE (mode);
44595 i > GET_MODE_UNIT_BITSIZE (mode);
44596 i >>= 1)
44598 half = gen_reg_rtx (mode);
44599 emit_reduc_half (half, vec, i);
44600 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
44601 dst = dest;
44602 else
44603 dst = gen_reg_rtx (mode);
44604 emit_insn (fn (dst, half, vec));
44605 vec = dst;
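/* Editor's note (an illustrative sketch, not part of the original file):
   the loop above halves the distance between combined lanes each step.
   For a V8SFmode addition (GET_MODE_BITSIZE == 256, unit size 32) it runs
   with i = 256, 128 and 64:

     step 1: bring the upper 128 bits down,     vec = fn (vec, half)
     step 2: shuffle lanes 2/3 into lanes 0/1,  vec = fn (vec, half)
     step 3: shuffle lane 1 into lane 0,        dest = fn (vec, half)

   after which element 0 of DEST holds the reduction of all eight inputs;
   the remaining lanes are undefined, as noted above emit_reduc_half.  */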
44609 /* Target hook for scalar_mode_supported_p. */
44610 static bool
44611 ix86_scalar_mode_supported_p (scalar_mode mode)
44613 if (DECIMAL_FLOAT_MODE_P (mode))
44614 return default_decimal_float_supported_p ();
44615 else if (mode == TFmode)
44616 return true;
44617 else
44618 return default_scalar_mode_supported_p (mode);
44621 /* Implements target hook vector_mode_supported_p. */
44622 static bool
44623 ix86_vector_mode_supported_p (machine_mode mode)
44625 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
44626 return true;
44627 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
44628 return true;
44629 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
44630 return true;
44631 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
44632 return true;
44633 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
44634 return true;
44635 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
44636 return true;
44637 return false;
44640 /* Target hook for c_mode_for_suffix. */
44641 static machine_mode
44642 ix86_c_mode_for_suffix (char suffix)
44644 if (suffix == 'q')
44645 return TFmode;
44646 if (suffix == 'w')
44647 return XFmode;
44649 return VOIDmode;
44652 /* Worker function for TARGET_MD_ASM_ADJUST.
44654 We implement asm flag outputs, and maintain source compatibility
44655 with the old cc0-based compiler. */
44657 static rtx_insn *
44658 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
44659 vec<const char *> &constraints,
44660 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
44662 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
44663 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
44665 bool saw_asm_flag = false;
44667 start_sequence ();
44668 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
44670 const char *con = constraints[i];
44671 if (strncmp (con, "=@cc", 4) != 0)
44672 continue;
44673 con += 4;
44674 if (strchr (con, ',') != NULL)
44676 error ("alternatives not allowed in asm flag output");
44677 continue;
44680 bool invert = false;
44681 if (con[0] == 'n')
44682 invert = true, con++;
44684 machine_mode mode = CCmode;
44685 rtx_code code = UNKNOWN;
44687 switch (con[0])
44689 case 'a':
44690 if (con[1] == 0)
44691 mode = CCAmode, code = EQ;
44692 else if (con[1] == 'e' && con[2] == 0)
44693 mode = CCCmode, code = NE;
44694 break;
44695 case 'b':
44696 if (con[1] == 0)
44697 mode = CCCmode, code = EQ;
44698 else if (con[1] == 'e' && con[2] == 0)
44699 mode = CCAmode, code = NE;
44700 break;
44701 case 'c':
44702 if (con[1] == 0)
44703 mode = CCCmode, code = EQ;
44704 break;
44705 case 'e':
44706 if (con[1] == 0)
44707 mode = CCZmode, code = EQ;
44708 break;
44709 case 'g':
44710 if (con[1] == 0)
44711 mode = CCGCmode, code = GT;
44712 else if (con[1] == 'e' && con[2] == 0)
44713 mode = CCGCmode, code = GE;
44714 break;
44715 case 'l':
44716 if (con[1] == 0)
44717 mode = CCGCmode, code = LT;
44718 else if (con[1] == 'e' && con[2] == 0)
44719 mode = CCGCmode, code = LE;
44720 break;
44721 case 'o':
44722 if (con[1] == 0)
44723 mode = CCOmode, code = EQ;
44724 break;
44725 case 'p':
44726 if (con[1] == 0)
44727 mode = CCPmode, code = EQ;
44728 break;
44729 case 's':
44730 if (con[1] == 0)
44731 mode = CCSmode, code = EQ;
44732 break;
44733 case 'z':
44734 if (con[1] == 0)
44735 mode = CCZmode, code = EQ;
44736 break;
44738 if (code == UNKNOWN)
44740 error ("unknown asm flag output %qs", constraints[i]);
44741 continue;
44743 if (invert)
44744 code = reverse_condition (code);
44746 rtx dest = outputs[i];
44747 if (!saw_asm_flag)
44749 /* This is the first asm flag output. Here we put the flags
44750 register in as the real output and adjust the condition to
44751 allow it. */
44752 constraints[i] = "=Bf";
44753 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
44754 saw_asm_flag = true;
44756 else
44758 /* We don't need the flags register as output twice. */
44759 constraints[i] = "=X";
44760 outputs[i] = gen_rtx_SCRATCH (SImode);
44763 rtx x = gen_rtx_REG (mode, FLAGS_REG);
44764 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
44766 machine_mode dest_mode = GET_MODE (dest);
44767 if (!SCALAR_INT_MODE_P (dest_mode))
44769 error ("invalid type for asm flag output");
44770 continue;
44773 if (dest_mode == DImode && !TARGET_64BIT)
44774 dest_mode = SImode;
44776 if (dest_mode != QImode)
44778 rtx destqi = gen_reg_rtx (QImode);
44779 emit_insn (gen_rtx_SET (destqi, x));
44781 if (TARGET_ZERO_EXTEND_WITH_AND
44782 && optimize_function_for_speed_p (cfun))
44784 x = force_reg (dest_mode, const0_rtx);
44786 emit_insn (gen_movstrictqi
44787 (gen_lowpart (QImode, x), destqi));
44789 else
44790 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
44793 if (dest_mode != GET_MODE (dest))
44795 rtx tmp = gen_reg_rtx (SImode);
44797 emit_insn (gen_rtx_SET (tmp, x));
44798 emit_insn (gen_zero_extendsidi2 (dest, tmp));
44800 else
44801 emit_insn (gen_rtx_SET (dest, x));
44803 rtx_insn *seq = get_insns ();
44804 end_sequence ();
44806 if (saw_asm_flag)
44807 return seq;
44808 else
44810 /* If we had no asm flag outputs, clobber the flags. */
44811 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
44812 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
44813 return NULL;
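/* Editor's note (an illustrative sketch, not part of the original file):
   the "=@cc<cond>" constraints rewritten above come from user inline asm
   such as:

     int add_overflows (unsigned a, unsigned b, unsigned *sum)
     {
       int carry;
       __asm__ ("addl %2, %0"
                : "=r" (*sum), "=@ccc" (carry)
                : "r" (b), "0" (a));
       return carry;
     }

   Here "=@ccc" asks for the carry flag; the code above replaces that
   output with a read of FLAGS_REG in CCCmode, emits a QImode setcc of the
   mapped condition, and zero-extends the result into the user's int.  */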
44817 /* Implements target vector targetm.asm.encode_section_info. */
44819 static void ATTRIBUTE_UNUSED
44820 ix86_encode_section_info (tree decl, rtx rtl, int first)
44822 default_encode_section_info (decl, rtl, first);
44824 if (ix86_in_large_data_p (decl))
44825 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
44828 /* Worker function for REVERSE_CONDITION. */
44830 enum rtx_code
44831 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
44833 return (mode == CCFPmode
44834 ? reverse_condition_maybe_unordered (code)
44835 : reverse_condition (code));
44838 /* Output code to perform an x87 FP register move, from OPERANDS[1]
44839 to OPERANDS[0]. */
44841 const char *
44842 output_387_reg_move (rtx_insn *insn, rtx *operands)
44844 if (REG_P (operands[0]))
44846 if (REG_P (operands[1])
44847 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44849 if (REGNO (operands[0]) == FIRST_STACK_REG)
44850 return output_387_ffreep (operands, 0);
44851 return "fstp\t%y0";
44853 if (STACK_TOP_P (operands[0]))
44854 return "fld%Z1\t%y1";
44855 return "fst\t%y0";
44857 else if (MEM_P (operands[0]))
44859 gcc_assert (REG_P (operands[1]));
44860 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44861 return "fstp%Z0\t%y0";
44862 else
44864 /* There is no non-popping store to memory for XFmode.
44865 So if we need one, follow the store with a load. */
44866 if (GET_MODE (operands[0]) == XFmode)
44867 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
44868 else
44869 return "fst%Z0\t%y0";
44872 else
44873 gcc_unreachable();
44876 /* Output code to perform a conditional jump to LABEL, if C2 flag in
44877 FP status register is set. */
44879 void
44880 ix86_emit_fp_unordered_jump (rtx label)
44882 rtx reg = gen_reg_rtx (HImode);
44883 rtx temp;
44885 emit_insn (gen_x86_fnstsw_1 (reg));
44887 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
44889 emit_insn (gen_x86_sahf_1 (reg));
44891 temp = gen_rtx_REG (CCmode, FLAGS_REG);
44892 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
44894 else
44896 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
44898 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
44899 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
44902 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
44903 gen_rtx_LABEL_REF (VOIDmode, label),
44904 pc_rtx);
44905 temp = gen_rtx_SET (pc_rtx, temp);
44907 emit_jump_insn (temp);
44908 predict_jump (REG_BR_PROB_BASE * 10 / 100);
44911 /* Output code to perform a log1p XFmode calculation. */
44913 void ix86_emit_i387_log1p (rtx op0, rtx op1)
44915 rtx_code_label *label1 = gen_label_rtx ();
44916 rtx_code_label *label2 = gen_label_rtx ();
44918 rtx tmp = gen_reg_rtx (XFmode);
44919 rtx tmp2 = gen_reg_rtx (XFmode);
44920 rtx test;
44922 emit_insn (gen_absxf2 (tmp, op1));
44923 test = gen_rtx_GE (VOIDmode, tmp,
44924 const_double_from_real_value (
44925 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
44926 XFmode));
44927 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
44929 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44930 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
44931 emit_jump (label2);
44933 emit_label (label1);
44934 emit_move_insn (tmp, CONST1_RTX (XFmode));
44935 emit_insn (gen_addxf3 (tmp, op1, tmp));
44936 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44937 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
44939 emit_label (label2);
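/* Editor's note (an illustrative sketch, not part of the original file):
   the threshold constant above is approximately 1 - sqrt(2)/2, the largest
   magnitude for which the x87 FYL2XP1 instruction is specified to accept
   its operand.  The expansion is therefore roughly:

     if (fabsl (x) < 0.29289321881345247561810596348408353L)
       result = fyl2xp1 (x, ln2);       ln2 * log2 (1 + x), accurate near 0
     else
       result = fyl2x (1.0L + x, ln2);  plain logarithm for larger magnitudes

   where fyl2xp1 and fyl2x stand for the x87 instructions and ln2 is the
   constant loaded by fldln2 (standard_80387_constant_rtx (4)).  */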
44942 /* Emit code for round calculation. */
44943 void ix86_emit_i387_round (rtx op0, rtx op1)
44945 machine_mode inmode = GET_MODE (op1);
44946 machine_mode outmode = GET_MODE (op0);
44947 rtx e1, e2, res, tmp, tmp1, half;
44948 rtx scratch = gen_reg_rtx (HImode);
44949 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
44950 rtx_code_label *jump_label = gen_label_rtx ();
44951 rtx insn;
44952 rtx (*gen_abs) (rtx, rtx);
44953 rtx (*gen_neg) (rtx, rtx);
44955 switch (inmode)
44957 case E_SFmode:
44958 gen_abs = gen_abssf2;
44959 break;
44960 case E_DFmode:
44961 gen_abs = gen_absdf2;
44962 break;
44963 case E_XFmode:
44964 gen_abs = gen_absxf2;
44965 break;
44966 default:
44967 gcc_unreachable ();
44970 switch (outmode)
44972 case E_SFmode:
44973 gen_neg = gen_negsf2;
44974 break;
44975 case E_DFmode:
44976 gen_neg = gen_negdf2;
44977 break;
44978 case E_XFmode:
44979 gen_neg = gen_negxf2;
44980 break;
44981 case E_HImode:
44982 gen_neg = gen_neghi2;
44983 break;
44984 case E_SImode:
44985 gen_neg = gen_negsi2;
44986 break;
44987 case E_DImode:
44988 gen_neg = gen_negdi2;
44989 break;
44990 default:
44991 gcc_unreachable ();
44994 e1 = gen_reg_rtx (inmode);
44995 e2 = gen_reg_rtx (inmode);
44996 res = gen_reg_rtx (outmode);
44998 half = const_double_from_real_value (dconsthalf, inmode);
45000 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
45002 /* scratch = fxam(op1) */
45003 emit_insn (gen_rtx_SET (scratch,
45004 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
45005 UNSPEC_FXAM)));
45006 /* e1 = fabs(op1) */
45007 emit_insn (gen_abs (e1, op1));
45009 /* e2 = e1 + 0.5 */
45010 half = force_reg (inmode, half);
45011 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
45013 /* res = floor(e2) */
45014 if (inmode != XFmode)
45016 tmp1 = gen_reg_rtx (XFmode);
45018 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
45020 else
45021 tmp1 = e2;
45023 switch (outmode)
45025 case E_SFmode:
45026 case E_DFmode:
45028 rtx tmp0 = gen_reg_rtx (XFmode);
45030 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
45032 emit_insn (gen_rtx_SET (res,
45033 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
45034 UNSPEC_TRUNC_NOOP)));
45036 break;
45037 case E_XFmode:
45038 emit_insn (gen_frndintxf2_floor (res, tmp1));
45039 break;
45040 case E_HImode:
45041 emit_insn (gen_lfloorxfhi2 (res, tmp1));
45042 break;
45043 case E_SImode:
45044 emit_insn (gen_lfloorxfsi2 (res, tmp1));
45045 break;
45046 case E_DImode:
45047 emit_insn (gen_lfloorxfdi2 (res, tmp1));
45048 break;
45049 default:
45050 gcc_unreachable ();
45053 /* flags = signbit(a) */
45054 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
45056 /* if (flags) then res = -res */
45057 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
45058 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
45059 gen_rtx_LABEL_REF (VOIDmode, jump_label),
45060 pc_rtx);
45061 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45062 predict_jump (REG_BR_PROB_BASE * 50 / 100);
45063 JUMP_LABEL (insn) = jump_label;
45065 emit_insn (gen_neg (res, res));
45067 emit_label (jump_label);
45068 LABEL_NUSES (jump_label) = 1;
45070 emit_move_insn (op0, res);
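/* Editor's note (an illustrative sketch, not part of the original file):
   the identity round(a) = sgn(a) * floor(fabs(a) + 0.5) used above rounds
   halfway cases away from zero, e.g.:

     round ( 2.3) =  floor (2.8) =  2
     round ( 2.5) =  floor (3.0) =  3
     round (-2.5) = -floor (3.0) = -3

   The sign of the input is taken from the C1 bit of the FPU status word
   written by FXAM (the 0x02 test of the high status byte above), and the
   magnitude result is negated when that bit is set.  */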
45073 /* Output code to perform a Newton-Raphson approximation of a single precision
45074 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
45076 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
45078 rtx x0, x1, e0, e1;
45080 x0 = gen_reg_rtx (mode);
45081 e0 = gen_reg_rtx (mode);
45082 e1 = gen_reg_rtx (mode);
45083 x1 = gen_reg_rtx (mode);
45085 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
45087 b = force_reg (mode, b);
45089 /* x0 = rcp(b) estimate */
45090 if (mode == V16SFmode || mode == V8DFmode)
45092 if (TARGET_AVX512ER)
45094 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45095 UNSPEC_RCP28)));
45096 /* res = a * x0 */
45097 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
45098 return;
45100 else
45101 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45102 UNSPEC_RCP14)));
45104 else
45105 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45106 UNSPEC_RCP)));
45108 /* e0 = x0 * b */
45109 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
45111 /* e0 = x0 * e0 */
45112 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
45114 /* e1 = x0 + x0 */
45115 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
45117 /* x1 = e1 - e0 */
45118 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
45120 /* res = a * x1 */
45121 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
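/* Editor's note (an illustrative sketch, not part of the original file):
   the temporaries above perform one Newton-Raphson refinement of the
   hardware reciprocal estimate x0 = rcp(b):

     x1 = (x0 + x0) - b * x0 * x0  =  x0 * (2 - b * x0)

   Each step roughly doubles the number of correct bits, so the ~12-bit
   rcpps (or ~14-bit rcp14) estimate reaches the ~24 bits needed for
   SFmode after a single step; the quotient is then formed as a * x1
   without ever issuing a divide.  */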
45124 /* Output code to perform a Newton-Raphson approximation of a
45125 single precision floating point [reciprocal] square root. */
45127 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
45129 rtx x0, e0, e1, e2, e3, mthree, mhalf;
45130 REAL_VALUE_TYPE r;
45131 int unspec;
45133 x0 = gen_reg_rtx (mode);
45134 e0 = gen_reg_rtx (mode);
45135 e1 = gen_reg_rtx (mode);
45136 e2 = gen_reg_rtx (mode);
45137 e3 = gen_reg_rtx (mode);
45139 if (TARGET_AVX512ER && mode == V16SFmode)
45141 if (recip)
45142 /* res = rsqrt28(a) estimate */
45143 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45144 UNSPEC_RSQRT28)));
45145 else
45147 /* x0 = rsqrt28(a) estimate */
45148 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45149 UNSPEC_RSQRT28)));
45150 /* res = rcp28(x0) estimate */
45151 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
45152 UNSPEC_RCP28)));
45154 return;
45157 real_from_integer (&r, VOIDmode, -3, SIGNED);
45158 mthree = const_double_from_real_value (r, SFmode);
45160 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
45161 mhalf = const_double_from_real_value (r, SFmode);
45162 unspec = UNSPEC_RSQRT;
45164 if (VECTOR_MODE_P (mode))
45166 mthree = ix86_build_const_vector (mode, true, mthree);
45167 mhalf = ix86_build_const_vector (mode, true, mhalf);
45168 /* There is no 512-bit rsqrt. There is however rsqrt14. */
45169 if (GET_MODE_SIZE (mode) == 64)
45170 unspec = UNSPEC_RSQRT14;
45173 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
45174 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
45176 a = force_reg (mode, a);
45178 /* x0 = rsqrt(a) estimate */
45179 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45180 unspec)));
45182 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
45183 if (!recip)
45185 rtx zero = force_reg (mode, CONST0_RTX(mode));
45186 rtx mask;
45188 /* Handle masked compare. */
45189 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
45191 mask = gen_reg_rtx (HImode);
45192 /* Imm value 0x4 corresponds to not-equal comparison. */
45193 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
45194 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
45196 else
45198 mask = gen_reg_rtx (mode);
45199 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
45200 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
45204 /* e0 = x0 * a */
45205 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
45206 /* e1 = e0 * x0 */
45207 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
45209 /* e2 = e1 - 3. */
45210 mthree = force_reg (mode, mthree);
45211 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
45213 mhalf = force_reg (mode, mhalf);
45214 if (recip)
45215 /* e3 = -.5 * x0 */
45216 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
45217 else
45218 /* e3 = -.5 * e0 */
45219 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
45220 /* ret = e2 * e3 */
45221 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
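/* Editor's note (an illustrative sketch, not part of the original file):
   the temporaries above are one Newton-Raphson step for 1/sqrt(a)
   starting from the hardware estimate x0 = rsqrt(a):

     e0  = a * x0
     e1  = e0 * x0                 (a * x0 * x0)
     e2  = e1 - 3.0
     e3  = -0.5 * x0  (recip)  or  -0.5 * e0  (sqrt)
     res = e2 * e3

   For the reciprocal this is x0 * (1.5 - 0.5 * a * x0 * x0), the standard
   rsqrt refinement; multiplying by e0 = a * x0 instead gives
   a * (1/sqrt(a)) ~= sqrt(a) without a division.  The a == 0.0 masking
   above keeps sqrt(0.0) from turning into 0 * inf = NaN, because
   rsqrt(0.0) is +inf.  */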
45224 #ifdef TARGET_SOLARIS
45225 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
45227 static void
45228 i386_solaris_elf_named_section (const char *name, unsigned int flags,
45229 tree decl)
45231 /* With Binutils 2.15, the "@unwind" marker must be specified on
45232 every occurrence of the ".eh_frame" section, not just the first
45233 one. */
45234 if (TARGET_64BIT
45235 && strcmp (name, ".eh_frame") == 0)
45237 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
45238 flags & SECTION_WRITE ? "aw" : "a");
45239 return;
45242 #ifndef USE_GAS
45243 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
45245 solaris_elf_asm_comdat_section (name, flags, decl);
45246 return;
45248 #endif
45250 default_elf_asm_named_section (name, flags, decl);
45252 #endif /* TARGET_SOLARIS */
45254 /* Return the mangling of TYPE if it is an extended fundamental type. */
45256 static const char *
45257 ix86_mangle_type (const_tree type)
45259 type = TYPE_MAIN_VARIANT (type);
45261 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
45262 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
45263 return NULL;
45265 switch (TYPE_MODE (type))
45267 case E_TFmode:
45268 /* __float128 is "g". */
45269 return "g";
45270 case E_XFmode:
45271 /* "long double" or __float80 is "e". */
45272 return "e";
45273 default:
45274 return NULL;
45278 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
45280 static tree
45281 ix86_stack_protect_guard (void)
45283 if (TARGET_SSP_TLS_GUARD)
45285 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
45286 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
45287 tree type = build_qualified_type (type_node, qual);
45288 tree t;
45290 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
45292 t = ix86_tls_stack_chk_guard_decl;
45294 if (t == NULL)
45296 rtx x;
45298 t = build_decl
45299 (UNKNOWN_LOCATION, VAR_DECL,
45300 get_identifier (ix86_stack_protector_guard_symbol_str),
45301 type);
45302 TREE_STATIC (t) = 1;
45303 TREE_PUBLIC (t) = 1;
45304 DECL_EXTERNAL (t) = 1;
45305 TREE_USED (t) = 1;
45306 TREE_THIS_VOLATILE (t) = 1;
45307 DECL_ARTIFICIAL (t) = 1;
45308 DECL_IGNORED_P (t) = 1;
45310 /* Do not share RTL as the declaration is visible outside of
45311 current function. */
45312 x = DECL_RTL (t);
45313 RTX_FLAG (x, used) = 1;
45315 ix86_tls_stack_chk_guard_decl = t;
45318 else
45320 tree asptrtype = build_pointer_type (type);
45322 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
45323 t = build2 (MEM_REF, asptrtype, t,
45324 build_int_cst (asptrtype, 0));
45327 return t;
45330 return default_stack_protect_guard ();
45333 /* For 32-bit code we can save PIC register setup by using
45334 __stack_chk_fail_local hidden function instead of calling
45335 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
45336 register, so it is better to call __stack_chk_fail directly. */
45338 static tree ATTRIBUTE_UNUSED
45339 ix86_stack_protect_fail (void)
45341 return TARGET_64BIT
45342 ? default_external_stack_protect_fail ()
45343 : default_hidden_stack_protect_fail ();
45346 /* Select a format to encode pointers in exception handling data. CODE
45347 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
45348 true if the symbol may be affected by dynamic relocations.
45350 ??? All x86 object file formats are capable of representing this.
45351 After all, the relocation needed is the same as for the call insn.
45352 Whether or not a particular assembler allows us to enter such, I
45353 guess we'll have to see. */
45355 int asm_preferred_eh_data_format (int code, int global)
45357 if (flag_pic)
45359 int type = DW_EH_PE_sdata8;
45360 if (!TARGET_64BIT
45361 || ix86_cmodel == CM_SMALL_PIC
45362 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
45363 type = DW_EH_PE_sdata4;
45364 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
45366 if (ix86_cmodel == CM_SMALL
45367 || (ix86_cmodel == CM_MEDIUM && code))
45368 return DW_EH_PE_udata4;
45369 return DW_EH_PE_absptr;
45372 /* Expand copysign from SIGN to the positive value ABS_VALUE
45373 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
45374 the sign-bit. */
45375 static void
45376 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
45378 machine_mode mode = GET_MODE (sign);
45379 rtx sgn = gen_reg_rtx (mode);
45380 if (mask == NULL_RTX)
45382 machine_mode vmode;
45384 if (mode == SFmode)
45385 vmode = V4SFmode;
45386 else if (mode == DFmode)
45387 vmode = V2DFmode;
45388 else
45389 vmode = mode;
45391 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
45392 if (!VECTOR_MODE_P (mode))
45394 /* We need to generate a scalar mode mask in this case. */
45395 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45396 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45397 mask = gen_reg_rtx (mode);
45398 emit_insn (gen_rtx_SET (mask, tmp));
45401 else
45402 mask = gen_rtx_NOT (mode, mask);
45403 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
45404 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
45407 /* Expand fabs (OP0) and return a new rtx that holds the result. The
45408 mask for masking out the sign-bit is stored in *SMASK, if that is
45409 non-null. */
45410 static rtx
45411 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45413 machine_mode vmode, mode = GET_MODE (op0);
45414 rtx xa, mask;
45416 xa = gen_reg_rtx (mode);
45417 if (mode == SFmode)
45418 vmode = V4SFmode;
45419 else if (mode == DFmode)
45420 vmode = V2DFmode;
45421 else
45422 vmode = mode;
45423 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45424 if (!VECTOR_MODE_P (mode))
45426 /* We need to generate a scalar mode mask in this case. */
45427 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45428 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45429 mask = gen_reg_rtx (mode);
45430 emit_insn (gen_rtx_SET (mask, tmp));
45432 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45434 if (smask)
45435 *smask = mask;
45437 return xa;
45440 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45441 swapping the operands if SWAP_OPERANDS is true. The expanded
45442 code is a forward jump to a newly created label in case the
45443 comparison is true. The generated label rtx is returned. */
45444 static rtx_code_label *
45445 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45446 bool swap_operands)
45448 bool unordered_compare = ix86_unordered_fp_compare (code);
45449 rtx_code_label *label;
45450 rtx tmp, reg;
45452 if (swap_operands)
45453 std::swap (op0, op1);
45455 label = gen_label_rtx ();
45456 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
45457 if (unordered_compare)
45458 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
45459 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
45460 emit_insn (gen_rtx_SET (reg, tmp));
45461 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
45462 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45463 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45464 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45465 JUMP_LABEL (tmp) = label;
45467 return label;
45470 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
45471 using comparison code CODE. Operands are swapped for the comparison if
45472 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
45473 static rtx
45474 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45475 bool swap_operands)
45477 rtx (*insn)(rtx, rtx, rtx, rtx);
45478 machine_mode mode = GET_MODE (op0);
45479 rtx mask = gen_reg_rtx (mode);
45481 if (swap_operands)
45482 std::swap (op0, op1);
45484 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45486 emit_insn (insn (mask, op0, op1,
45487 gen_rtx_fmt_ee (code, mode, op0, op1)));
45488 return mask;
45491 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45492 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
45493 static rtx
45494 ix86_gen_TWO52 (machine_mode mode)
45496 REAL_VALUE_TYPE TWO52r;
45497 rtx TWO52;
45499 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45500 TWO52 = const_double_from_real_value (TWO52r, mode);
45501 TWO52 = force_reg (mode, TWO52);
45503 return TWO52;
45506 /* Expand SSE sequence for computing lround from OP1 storing
45507 into OP0. */
45508 void
45509 ix86_expand_lround (rtx op0, rtx op1)
45511 /* C code for the stuff we're doing below:
45512 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
45513 return (long)tmp;
45515 machine_mode mode = GET_MODE (op1);
45516 const struct real_format *fmt;
45517 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45518 rtx adj;
45520 /* load nextafter (0.5, 0.0) */
45521 fmt = REAL_MODE_FORMAT (mode);
45522 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45523 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45525 /* adj = copysign (0.5, op1) */
45526 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
45527 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
45529 /* adj = op1 + adj */
45530 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
45532 /* op0 = (imode)adj */
45533 expand_fix (op0, adj, 0);
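/* Editor's note (an illustrative sketch, not part of the original file):
   the adjustment uses nextafter (0.5, 0.0), the largest value strictly
   below one half, instead of 0.5 itself.  With an exact 0.5 an input just
   below one half, e.g. 0.49999997f, would have its sum rounded up to 1.0
   (the exact sum falls halfway between representable values and ties go
   to even), so the final truncation would yield lround == 1 instead of 0.
   With the predecessor of 0.5 the sum stays below 1.0 and truncates to 0,
   while an input of exactly 0.5 still rounds up to 1 as required.  */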
45536 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
45537 into OPERAND0. */
45538 void
45539 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
45541 /* C code for the stuff we're doing below (for do_floor):
45542 xi = (long)op1;
45543 xi -= (double)xi > op1 ? 1 : 0;
45544 return xi;
45546 machine_mode fmode = GET_MODE (op1);
45547 machine_mode imode = GET_MODE (op0);
45548 rtx ireg, freg, tmp;
45549 rtx_code_label *label;
45551 /* reg = (long)op1 */
45552 ireg = gen_reg_rtx (imode);
45553 expand_fix (ireg, op1, 0);
45555 /* freg = (double)reg */
45556 freg = gen_reg_rtx (fmode);
45557 expand_float (freg, ireg, 0);
45559 /* ireg = (freg > op1) ? ireg - 1 : ireg */
45560 label = ix86_expand_sse_compare_and_jump (UNLE,
45561 freg, op1, !do_floor);
45562 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
45563 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
45564 emit_move_insn (ireg, tmp);
45566 emit_label (label);
45567 LABEL_NUSES (label) = 1;
45569 emit_move_insn (op0, ireg);
45572 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
45573 void
45574 ix86_expand_rint (rtx operand0, rtx operand1)
45576 /* C code for the stuff we're doing below:
45577 xa = fabs (operand1);
45578 if (!isless (xa, 2**52))
45579 return operand1;
45580 two52 = 2**52;
45581 if (flag_rounding_math)
45583 two52 = copysign (two52, operand1);
45584 xa = operand1;
45586 xa = xa + two52 - two52;
45587 return copysign (xa, operand1);
45589 machine_mode mode = GET_MODE (operand0);
45590 rtx res, xa, TWO52, two52, mask;
45591 rtx_code_label *label;
45593 res = gen_reg_rtx (mode);
45594 emit_move_insn (res, operand1);
45596 /* xa = abs (operand1) */
45597 xa = ix86_expand_sse_fabs (res, &mask);
45599 /* if (!isless (xa, TWO52)) goto label; */
45600 TWO52 = ix86_gen_TWO52 (mode);
45601 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45603 two52 = TWO52;
45604 if (flag_rounding_math)
45606 two52 = gen_reg_rtx (mode);
45607 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
45608 xa = res;
45611 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
45612 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
45614 ix86_sse_copysign_to_positive (res, xa, res, mask);
45616 emit_label (label);
45617 LABEL_NUSES (label) = 1;
45619 emit_move_insn (operand0, res);
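/* Editor's note (an illustrative sketch, not part of the original file):
   the "xa + 2**52 - 2**52" idiom works because every DFmode value in
   [2**52, 2**53) has a unit in the last place of exactly 1.0, so the
   addition forces a round-to-integer in the current rounding mode and the
   subtraction recovers the rounded value.  With round-to-nearest and
   xa = 3.3, for instance:

     3.3 + 2**52   rounds to   2**52 + 3.0
     subtracting 2**52 again leaves 3.0

   The early exit when !isless (xa, TWO52) is what makes this safe: values
   of magnitude 2**52 or larger are already integers.  (For SFmode the
   constant is 2**23, per ix86_gen_TWO52.)  */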
45622 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45623 into OPERAND0. */
45624 void
45625 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
45627 /* C code for the stuff we expand below.
45628 double xa = fabs (x), x2;
45629 if (!isless (xa, TWO52))
45630 return x;
45631 xa = xa + TWO52 - TWO52;
45632 x2 = copysign (xa, x);
45633 Compensate. Floor:
45634 if (x2 > x)
45635 x2 -= 1;
45636 Compensate. Ceil:
45637 if (x2 < x)
45638 x2 -= -1;
45639 return x2;
45641 machine_mode mode = GET_MODE (operand0);
45642 rtx xa, TWO52, tmp, one, res, mask;
45643 rtx_code_label *label;
45645 TWO52 = ix86_gen_TWO52 (mode);
45647 /* Temporary for holding the result, initialized to the input
45648 operand to ease control flow. */
45649 res = gen_reg_rtx (mode);
45650 emit_move_insn (res, operand1);
45652 /* xa = abs (operand1) */
45653 xa = ix86_expand_sse_fabs (res, &mask);
45655 /* if (!isless (xa, TWO52)) goto label; */
45656 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45658 /* xa = xa + TWO52 - TWO52; */
45659 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45660 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45662 /* xa = copysign (xa, operand1) */
45663 ix86_sse_copysign_to_positive (xa, xa, res, mask);
45665 /* generate 1.0 or -1.0 */
45666 one = force_reg (mode,
45667 const_double_from_real_value (do_floor
45668 ? dconst1 : dconstm1, mode));
45670 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45671 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45672 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45673 /* We always need to subtract here to preserve signed zero. */
45674 tmp = expand_simple_binop (mode, MINUS,
45675 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45676 emit_move_insn (res, tmp);
45678 emit_label (label);
45679 LABEL_NUSES (label) = 1;
45681 emit_move_insn (operand0, res);
45684 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45685 into OPERAND0. */
45686 void
45687 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
45689 /* C code for the stuff we expand below.
45690 double xa = fabs (x), x2;
45691 if (!isless (xa, TWO52))
45692 return x;
45693 x2 = (double)(long)x;
45694 Compensate. Floor:
45695 if (x2 > x)
45696 x2 -= 1;
45697 Compensate. Ceil:
45698 if (x2 < x)
45699 x2 += 1;
45700 if (HONOR_SIGNED_ZEROS (mode))
45701 return copysign (x2, x);
45702 return x2;
45704 machine_mode mode = GET_MODE (operand0);
45705 rtx xa, xi, TWO52, tmp, one, res, mask;
45706 rtx_code_label *label;
45708 TWO52 = ix86_gen_TWO52 (mode);
45710 /* Temporary for holding the result, initialized to the input
45711 operand to ease control flow. */
45712 res = gen_reg_rtx (mode);
45713 emit_move_insn (res, operand1);
45715 /* xa = abs (operand1) */
45716 xa = ix86_expand_sse_fabs (res, &mask);
45718 /* if (!isless (xa, TWO52)) goto label; */
45719 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45721 /* xa = (double)(long)x */
45722 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45723 expand_fix (xi, res, 0);
45724 expand_float (xa, xi, 0);
45726 /* generate 1.0 */
45727 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45729 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45730 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45731 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45732 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
45733 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45734 emit_move_insn (res, tmp);
45736 if (HONOR_SIGNED_ZEROS (mode))
45737 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45739 emit_label (label);
45740 LABEL_NUSES (label) = 1;
45742 emit_move_insn (operand0, res);
45745 /* Expand SSE sequence for computing round from OPERAND1 storing
45746 into OPERAND0. Sequence that works without relying on DImode truncation
45747 via cvttsd2siq that is only available on 64bit targets. */
45748 void
45749 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
45751 /* C code for the stuff we expand below.
45752 double xa = fabs (x), xa2, x2;
45753 if (!isless (xa, TWO52))
45754 return x;
45755 Using the absolute value and copying back sign makes
45756 -0.0 -> -0.0 correct.
45757 xa2 = xa + TWO52 - TWO52;
45758 Compensate.
45759 dxa = xa2 - xa;
45760 if (dxa <= -0.5)
45761 xa2 += 1;
45762 else if (dxa > 0.5)
45763 xa2 -= 1;
45764 x2 = copysign (xa2, x);
45765 return x2;
45767 machine_mode mode = GET_MODE (operand0);
45768 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
45769 rtx_code_label *label;
45771 TWO52 = ix86_gen_TWO52 (mode);
45773 /* Temporary for holding the result, initialized to the input
45774 operand to ease control flow. */
45775 res = gen_reg_rtx (mode);
45776 emit_move_insn (res, operand1);
45778 /* xa = abs (operand1) */
45779 xa = ix86_expand_sse_fabs (res, &mask);
45781 /* if (!isless (xa, TWO52)) goto label; */
45782 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45784 /* xa2 = xa + TWO52 - TWO52; */
45785 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45786 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
45788 /* dxa = xa2 - xa; */
45789 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
45791 /* generate 0.5, 1.0 and -0.5 */
45792 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
45793 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
45794 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
45795 0, OPTAB_DIRECT);
45797 /* Compensate. */
45798 tmp = gen_reg_rtx (mode);
45799 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
45800 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
45801 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45802 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45803 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
45804 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
45805 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45806 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45808 /* res = copysign (xa2, operand1) */
45809 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
45811 emit_label (label);
45812 LABEL_NUSES (label) = 1;
45814 emit_move_insn (operand0, res);
45817 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45818 into OPERAND0. */
45819 void
45820 ix86_expand_trunc (rtx operand0, rtx operand1)
45822 /* C code for SSE variant we expand below.
45823 double xa = fabs (x), x2;
45824 if (!isless (xa, TWO52))
45825 return x;
45826 x2 = (double)(long)x;
45827 if (HONOR_SIGNED_ZEROS (mode))
45828 return copysign (x2, x);
45829 return x2;
45831 machine_mode mode = GET_MODE (operand0);
45832 rtx xa, xi, TWO52, res, mask;
45833 rtx_code_label *label;
45835 TWO52 = ix86_gen_TWO52 (mode);
45837 /* Temporary for holding the result, initialized to the input
45838 operand to ease control flow. */
45839 res = gen_reg_rtx (mode);
45840 emit_move_insn (res, operand1);
45842 /* xa = abs (operand1) */
45843 xa = ix86_expand_sse_fabs (res, &mask);
45845 /* if (!isless (xa, TWO52)) goto label; */
45846 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45848 /* x = (double)(long)x */
45849 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45850 expand_fix (xi, res, 0);
45851 expand_float (res, xi, 0);
45853 if (HONOR_SIGNED_ZEROS (mode))
45854 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45856 emit_label (label);
45857 LABEL_NUSES (label) = 1;
45859 emit_move_insn (operand0, res);
45862 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45863 into OPERAND0. */
45864 void
45865 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
45867 machine_mode mode = GET_MODE (operand0);
45868 rtx xa, mask, TWO52, one, res, smask, tmp;
45869 rtx_code_label *label;
45871 /* C code for SSE variant we expand below.
45872 double xa = fabs (x), x2;
45873 if (!isless (xa, TWO52))
45874 return x;
45875 xa2 = xa + TWO52 - TWO52;
45876 Compensate:
45877 if (xa2 > xa)
45878 xa2 -= 1.0;
45879 x2 = copysign (xa2, x);
45880 return x2;
45883 TWO52 = ix86_gen_TWO52 (mode);
45885 /* Temporary for holding the result, initialized to the input
45886 operand to ease control flow. */
45887 res = gen_reg_rtx (mode);
45888 emit_move_insn (res, operand1);
45890 /* xa = abs (operand1) */
45891 xa = ix86_expand_sse_fabs (res, &smask);
45893 /* if (!isless (xa, TWO52)) goto label; */
45894 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45896 /* res = xa + TWO52 - TWO52; */
45897 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45898 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
45899 emit_move_insn (res, tmp);
45901 /* generate 1.0 */
45902 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45904 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
45905 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
45906 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
45907 tmp = expand_simple_binop (mode, MINUS,
45908 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
45909 emit_move_insn (res, tmp);
45911 /* res = copysign (res, operand1) */
45912 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
45914 emit_label (label);
45915 LABEL_NUSES (label) = 1;
45917 emit_move_insn (operand0, res);
45920 /* Expand SSE sequence for computing round from OPERAND1 storing
45921 into OPERAND0. */
45922 void
45923 ix86_expand_round (rtx operand0, rtx operand1)
45925 /* C code for the stuff we're doing below:
45926 double xa = fabs (x);
45927 if (!isless (xa, TWO52))
45928 return x;
45929 xa = (double)(long)(xa + nextafter (0.5, 0.0));
45930 return copysign (xa, x);
45932 machine_mode mode = GET_MODE (operand0);
45933 rtx res, TWO52, xa, xi, half, mask;
45934 rtx_code_label *label;
45935 const struct real_format *fmt;
45936 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45938 /* Temporary for holding the result, initialized to the input
45939 operand to ease control flow. */
45940 res = gen_reg_rtx (mode);
45941 emit_move_insn (res, operand1);
45943 TWO52 = ix86_gen_TWO52 (mode);
45944 xa = ix86_expand_sse_fabs (res, &mask);
45945 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45947 /* load nextafter (0.5, 0.0) */
45948 fmt = REAL_MODE_FORMAT (mode);
45949 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45950 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
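/* Using the largest representable value below 0.5 instead of 0.5
   itself avoids a wrong result at the boundary: for the largest
   double below 0.5, adding exactly 0.5 would round up to 1.0 and the
   later truncation would yield 1 instead of the correct 0. */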
45952 /* xa = xa + 0.5 */
45953 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
45954 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
45956 /* xa = (double)(int64_t)xa */
45957 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45958 expand_fix (xi, xa, 0);
45959 expand_float (xa, xi, 0);
45961 /* res = copysign (xa, operand1) */
45962 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
45964 emit_label (label);
45965 LABEL_NUSES (label) = 1;
45967 emit_move_insn (operand0, res);
45970 /* Expand SSE sequence for computing round
45971 from OP1 storing into OP0 using sse4 round insn. */
45972 void
45973 ix86_expand_round_sse4 (rtx op0, rtx op1)
45975 machine_mode mode = GET_MODE (op0);
45976 rtx e1, e2, res, half;
45977 const struct real_format *fmt;
45978 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45979 rtx (*gen_copysign) (rtx, rtx, rtx);
45980 rtx (*gen_round) (rtx, rtx, rtx);
45982 switch (mode)
45984 case E_SFmode:
45985 gen_copysign = gen_copysignsf3;
45986 gen_round = gen_sse4_1_roundsf2;
45987 break;
45988 case E_DFmode:
45989 gen_copysign = gen_copysigndf3;
45990 gen_round = gen_sse4_1_rounddf2;
45991 break;
45992 default:
45993 gcc_unreachable ();
45996 /* round (a) = trunc (a + copysign (0.5, a)) */
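/* A rough C sketch of the sequence below (the truncation itself is
   done with the SSE4.1 round insn and ROUND_TRUNC):
   double e1 = copysign (nextafter (0.5, 0.0), x);
   return trunc (x + e1); */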
45998 /* load nextafter (0.5, 0.0) */
45999 fmt = REAL_MODE_FORMAT (mode);
46000 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46001 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46002 half = const_double_from_real_value (pred_half, mode);
46004 /* e1 = copysign (0.5, op1) */
46005 e1 = gen_reg_rtx (mode);
46006 emit_insn (gen_copysign (e1, half, op1));
46008 /* e2 = op1 + e1 */
46009 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
46011 /* res = trunc (e2) */
46012 res = gen_reg_rtx (mode);
46013 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
46015 emit_move_insn (op0, res);
46019 /* Table of valid machine attributes. */
46020 static const struct attribute_spec ix86_attribute_table[] =
46022 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
46023 affects_type_identity, handler, exclude } */
46024 /* Stdcall attribute says callee is responsible for popping arguments
46025 if they are not variable. */
46026 { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
46027 NULL },
46028 /* Fastcall attribute says callee is responsible for popping arguments
46029 if they are not variable. */
46030 { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
46031 NULL },
46032 /* Thiscall attribute says callee is responsible for popping arguments
46033 if they are not variable. */
46034 { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
46035 NULL },
46037 /* Cdecl attribute says the callee is a normal C declaration. */
46037 { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
46038 NULL },
46039 /* Regparm attribute specifies how many integer arguments are to be
46040 passed in registers. */
46041 { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute,
46042 NULL },
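/* For example, int f (int, int, int) __attribute__ ((regparm (3)));
   requests that the first three integer arguments be passed in
   registers (EAX, EDX and ECX on IA-32). */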
46043 /* Sseregparm attribute says we are using x86_64 calling conventions
46044 for FP arguments. */
46045 { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
46046 NULL },
46047 /* The transactional memory builtins are implicitly regparm or fastcall
46048 depending on the ABI. Override the generic do-nothing attribute that
46049 these builtins were declared with. */
46050 { "*tm regparm", 0, 0, false, true, true, true,
46051 ix86_handle_tm_regparm_attribute, NULL },
46052 /* force_align_arg_pointer says this function realigns the stack at entry. */
46053 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
46054 false, true, true, false, ix86_handle_force_align_arg_pointer_attribute,
46055 NULL },
46056 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
46057 { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute,
46058 NULL },
46059 { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute,
46060 NULL },
46061 { "shared", 0, 0, true, false, false, false,
46062 ix86_handle_shared_attribute, NULL },
46063 #endif
46064 { "ms_struct", 0, 0, false, false, false, false,
46065 ix86_handle_struct_attribute, NULL },
46066 { "gcc_struct", 0, 0, false, false, false, false,
46067 ix86_handle_struct_attribute, NULL },
46068 #ifdef SUBTARGET_ATTRIBUTE_TABLE
46069 SUBTARGET_ATTRIBUTE_TABLE,
46070 #endif
46071 /* ms_abi and sysv_abi calling convention function attributes. */
46072 { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL },
46073 { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute,
46074 NULL },
46075 { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
46076 { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
46077 { "ms_hook_prologue", 0, 0, true, false, false, false,
46078 ix86_handle_fndecl_attribute, NULL },
46079 { "callee_pop_aggregate_return", 1, 1, false, true, true, true,
46080 ix86_handle_callee_pop_aggregate_return, NULL },
46081 { "interrupt", 0, 0, false, true, true, false,
46082 ix86_handle_interrupt_attribute, NULL },
46083 { "no_caller_saved_registers", 0, 0, false, true, true, false,
46084 ix86_handle_no_caller_saved_registers_attribute, NULL },
46085 { "naked", 0, 0, true, false, false, false,
46086 ix86_handle_fndecl_attribute, NULL },
46087 { "indirect_branch", 1, 1, true, false, false, false,
46088 ix86_handle_fndecl_attribute, NULL },
46089 { "function_return", 1, 1, true, false, false, false,
46090 ix86_handle_fndecl_attribute, NULL },
46092 /* End element. */
46093 { NULL, 0, 0, false, false, false, false, NULL, NULL }
46096 /* Implement targetm.vectorize.builtin_vectorization_cost. */
46097 static int
46098 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
46099 tree vectype, int)
46101 bool fp = false;
46102 machine_mode mode = TImode;
46103 int index;
46104 if (vectype != NULL)
46106 fp = FLOAT_TYPE_P (vectype);
46107 mode = TYPE_MODE (vectype);
46110 switch (type_of_cost)
46112 case scalar_stmt:
46113 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
46115 case scalar_load:
46116 /* Load/store costs are relative to a register move, which costs 2. Recompute
46117 them to COSTS_N_INSNS so everything has the same base. */
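/* E.g. an int_load[2] value of 4 (twice the cost of a register move)
   becomes COSTS_N_INSNS (4) / 2, i.e. the equivalent of two insns. */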
46118 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
46119 : ix86_cost->int_load [2]) / 2;
46121 case scalar_store:
46122 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
46123 : ix86_cost->int_store [2]) / 2;
46125 case vector_stmt:
46126 return ix86_vec_cost (mode,
46127 fp ? ix86_cost->addss : ix86_cost->sse_op,
46128 true);
46130 case vector_load:
46131 index = sse_store_index (mode);
46132 /* See PR82713 - we may end up being called on non-vector type. */
46133 if (index < 0)
46134 index = 2;
46135 return ix86_vec_cost (mode,
46136 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
46137 true);
46139 case vector_store:
46140 index = sse_store_index (mode);
46141 /* See PR82713 - we may end up being called on non-vector type. */
46142 if (index < 0)
46143 index = 2;
46144 return ix86_vec_cost (mode,
46145 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
46146 true);
46148 case vec_to_scalar:
46149 case scalar_to_vec:
46150 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
46152 /* We should have separate costs for unaligned loads and gather/scatter.
46153 Do that incrementally. */
46154 case unaligned_load:
46155 index = sse_store_index (mode);
46156 /* See PR82713 - we may end up being called on non-vector type. */
46157 if (index < 0)
46158 index = 2;
46159 return ix86_vec_cost (mode,
46160 COSTS_N_INSNS
46161 (ix86_cost->sse_unaligned_load[index]) / 2,
46162 true);
46164 case unaligned_store:
46165 index = sse_store_index (mode);
46166 /* See PR82713 - we may end up being called on non-vector type. */
46167 if (index < 0)
46168 index = 2;
46169 return ix86_vec_cost (mode,
46170 COSTS_N_INSNS
46171 (ix86_cost->sse_unaligned_store[index]) / 2,
46172 true);
46174 case vector_gather_load:
46175 return ix86_vec_cost (mode,
46176 COSTS_N_INSNS
46177 (ix86_cost->gather_static
46178 + ix86_cost->gather_per_elt
46179 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
46180 true);
46182 case vector_scatter_store:
46183 return ix86_vec_cost (mode,
46184 COSTS_N_INSNS
46185 (ix86_cost->scatter_static
46186 + ix86_cost->scatter_per_elt
46187 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
46188 true);
46190 case cond_branch_taken:
46191 return ix86_cost->cond_taken_branch_cost;
46193 case cond_branch_not_taken:
46194 return ix86_cost->cond_not_taken_branch_cost;
46196 case vec_perm:
46197 case vec_promote_demote:
46198 return ix86_vec_cost (mode,
46199 ix86_cost->sse_op, true);
46201 case vec_construct:
46203 /* N element inserts. */
46204 int cost = ix86_vec_cost (mode, ix86_cost->sse_op, false);
46205 /* One vinserti128 for combining two SSE vectors for AVX256. */
46206 if (GET_MODE_BITSIZE (mode) == 256)
46207 cost += ix86_vec_cost (mode, ix86_cost->addss, true);
46208 /* One vinserti64x4 and two vinserti128 for combining SSE
46209 and AVX256 vectors to AVX512. */
46210 else if (GET_MODE_BITSIZE (mode) == 512)
46211 cost += 3 * ix86_vec_cost (mode, ix86_cost->addss, true);
46212 return cost;
46215 default:
46216 gcc_unreachable ();
46220 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
46221 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
46222 insn every time. */
46224 static GTY(()) rtx_insn *vselect_insn;
46226 /* Initialize vselect_insn. */
46228 static void
46229 init_vselect_insn (void)
46231 unsigned i;
46232 rtx x;
46234 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
46235 for (i = 0; i < MAX_VECT_LEN; ++i)
46236 XVECEXP (x, 0, i) = const0_rtx;
46237 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
46238 const0_rtx), x);
46239 x = gen_rtx_SET (const0_rtx, x);
46240 start_sequence ();
46241 vselect_insn = emit_insn (x);
46242 end_sequence ();
46245 /* Construct (set target (vec_select op0 (parallel perm))) and
46246 return true if that's a valid instruction in the active ISA. */
46248 static bool
46249 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
46250 unsigned nelt, bool testing_p)
46252 unsigned int i;
46253 rtx x, save_vconcat;
46254 int icode;
46256 if (vselect_insn == NULL_RTX)
46257 init_vselect_insn ();
46259 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
46260 PUT_NUM_ELEM (XVEC (x, 0), nelt);
46261 for (i = 0; i < nelt; ++i)
46262 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
46263 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46264 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
46265 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
46266 SET_DEST (PATTERN (vselect_insn)) = target;
46267 icode = recog_memoized (vselect_insn);
46269 if (icode >= 0 && !testing_p)
46270 emit_insn (copy_rtx (PATTERN (vselect_insn)));
46272 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
46273 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
46274 INSN_CODE (vselect_insn) = -1;
46276 return icode >= 0;
46279 /* Similar, but generate a vec_concat from op0 and op1 as well. */
46281 static bool
46282 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
46283 const unsigned char *perm, unsigned nelt,
46284 bool testing_p)
46286 machine_mode v2mode;
46287 rtx x;
46288 bool ok;
46290 if (vselect_insn == NULL_RTX)
46291 init_vselect_insn ();
46293 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
46294 return false;
46295 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46296 PUT_MODE (x, v2mode);
46297 XEXP (x, 0) = op0;
46298 XEXP (x, 1) = op1;
46299 ok = expand_vselect (target, x, perm, nelt, testing_p);
46300 XEXP (x, 0) = const0_rtx;
46301 XEXP (x, 1) = const0_rtx;
46302 return ok;
46305 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46306 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
46308 static bool
46309 expand_vec_perm_blend (struct expand_vec_perm_d *d)
46311 machine_mode mmode, vmode = d->vmode;
46312 unsigned i, mask, nelt = d->nelt;
46313 rtx target, op0, op1, maskop, x;
46314 rtx rperm[32], vperm;
46316 if (d->one_operand_p)
46317 return false;
46318 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
46319 && (TARGET_AVX512BW
46320 || GET_MODE_UNIT_SIZE (vmode) >= 4))
46322 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46324 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46326 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46328 else
46329 return false;
46331 /* This is a blend, not a permute. Elements must stay in their
46332 respective lanes. */
46333 for (i = 0; i < nelt; ++i)
46335 unsigned e = d->perm[i];
46336 if (!(e == i || e == i + nelt))
46337 return false;
46340 if (d->testing_p)
46341 return true;
46343 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
46344 decision should be extracted elsewhere, so that we only try that
46345 sequence once all budget==3 options have been tried. */
46346 target = d->target;
46347 op0 = d->op0;
46348 op1 = d->op1;
46349 mask = 0;
46351 switch (vmode)
46353 case E_V8DFmode:
46354 case E_V16SFmode:
46355 case E_V4DFmode:
46356 case E_V8SFmode:
46357 case E_V2DFmode:
46358 case E_V4SFmode:
46359 case E_V8HImode:
46360 case E_V8SImode:
46361 case E_V32HImode:
46362 case E_V64QImode:
46363 case E_V16SImode:
46364 case E_V8DImode:
46365 for (i = 0; i < nelt; ++i)
46366 mask |= (d->perm[i] >= nelt) << i;
46367 break;
46369 case E_V2DImode:
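/* A V2DImode blend is performed as a V8HImode pblendw, so each
   DImode element maps to four HImode mask bits (0xf). */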
46370 for (i = 0; i < 2; ++i)
46371 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
46372 vmode = V8HImode;
46373 goto do_subreg;
46375 case E_V4SImode:
46376 for (i = 0; i < 4; ++i)
46377 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46378 vmode = V8HImode;
46379 goto do_subreg;
46381 case E_V16QImode:
46382 /* See if bytes move in pairs so we can use pblendw with
46383 an immediate argument, rather than pblendvb with a vector
46384 argument. */
46385 for (i = 0; i < 16; i += 2)
46386 if (d->perm[i] + 1 != d->perm[i + 1])
46388 use_pblendvb:
46389 for (i = 0; i < nelt; ++i)
46390 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
46392 finish_pblendvb:
46393 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
46394 vperm = force_reg (vmode, vperm);
46396 if (GET_MODE_SIZE (vmode) == 16)
46397 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
46398 else
46399 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
46400 if (target != d->target)
46401 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46402 return true;
46405 for (i = 0; i < 8; ++i)
46406 mask |= (d->perm[i * 2] >= 16) << i;
46407 vmode = V8HImode;
46408 /* FALLTHRU */
46410 do_subreg:
46411 target = gen_reg_rtx (vmode);
46412 op0 = gen_lowpart (vmode, op0);
46413 op1 = gen_lowpart (vmode, op1);
46414 break;
46416 case E_V32QImode:
46417 /* See if bytes move in pairs. If not, vpblendvb must be used. */
46418 for (i = 0; i < 32; i += 2)
46419 if (d->perm[i] + 1 != d->perm[i + 1])
46420 goto use_pblendvb;
46421 /* See if bytes move in quadruplets. If yes, vpblendd
46422 with immediate can be used. */
46423 for (i = 0; i < 32; i += 4)
46424 if (d->perm[i] + 2 != d->perm[i + 2])
46425 break;
46426 if (i < 32)
46428 /* See if bytes move the same in both lanes. If yes,
46429 vpblendw with immediate can be used. */
46430 for (i = 0; i < 16; i += 2)
46431 if (d->perm[i] + 16 != d->perm[i + 16])
46432 goto use_pblendvb;
46434 /* Use vpblendw. */
46435 for (i = 0; i < 16; ++i)
46436 mask |= (d->perm[i * 2] >= 32) << i;
46437 vmode = V16HImode;
46438 goto do_subreg;
46441 /* Use vpblendd. */
46442 for (i = 0; i < 8; ++i)
46443 mask |= (d->perm[i * 4] >= 32) << i;
46444 vmode = V8SImode;
46445 goto do_subreg;
46447 case E_V16HImode:
46448 /* See if words move in pairs. If yes, vpblendd can be used. */
46449 for (i = 0; i < 16; i += 2)
46450 if (d->perm[i] + 1 != d->perm[i + 1])
46451 break;
46452 if (i < 16)
46454 /* See if words move the same in both lanes. If not,
46455 vpblendvb must be used. */
46456 for (i = 0; i < 8; i++)
46457 if (d->perm[i] + 8 != d->perm[i + 8])
46459 /* Use vpblendvb. */
46460 for (i = 0; i < 32; ++i)
46461 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
46463 vmode = V32QImode;
46464 nelt = 32;
46465 target = gen_reg_rtx (vmode);
46466 op0 = gen_lowpart (vmode, op0);
46467 op1 = gen_lowpart (vmode, op1);
46468 goto finish_pblendvb;
46471 /* Use vpblendw. */
46472 for (i = 0; i < 16; ++i)
46473 mask |= (d->perm[i] >= 16) << i;
46474 break;
46477 /* Use vpblendd. */
46478 for (i = 0; i < 8; ++i)
46479 mask |= (d->perm[i * 2] >= 16) << i;
46480 vmode = V8SImode;
46481 goto do_subreg;
46483 case E_V4DImode:
46484 /* Use vpblendd. */
46485 for (i = 0; i < 4; ++i)
46486 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46487 vmode = V8SImode;
46488 goto do_subreg;
46490 default:
46491 gcc_unreachable ();
46494 switch (vmode)
46496 case E_V8DFmode:
46497 case E_V8DImode:
46498 mmode = QImode;
46499 break;
46500 case E_V16SFmode:
46501 case E_V16SImode:
46502 mmode = HImode;
46503 break;
46504 case E_V32HImode:
46505 mmode = SImode;
46506 break;
46507 case E_V64QImode:
46508 mmode = DImode;
46509 break;
46510 default:
46511 mmode = VOIDmode;
46514 if (mmode != VOIDmode)
46515 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46516 else
46517 maskop = GEN_INT (mask);
46519 /* This matches five different patterns with the different modes. */
46520 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46521 x = gen_rtx_SET (target, x);
46522 emit_insn (x);
46523 if (target != d->target)
46524 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46526 return true;
46529 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46530 in terms of the variable form of vpermilps.
46532 Note that we will have already failed the immediate input vpermilps,
46533 which requires that the high and low part shuffle be identical; the
46534 variable form doesn't require that. */
46536 static bool
46537 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46539 rtx rperm[8], vperm;
46540 unsigned i;
46542 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46543 return false;
46545 /* We can only permute within each 128-bit lane. */
46546 for (i = 0; i < 8; ++i)
46548 unsigned e = d->perm[i];
46549 if (i < 4 ? e >= 4 : e < 4)
46550 return false;
46553 if (d->testing_p)
46554 return true;
46556 for (i = 0; i < 8; ++i)
46558 unsigned e = d->perm[i];
46560 /* Within each 128-bit lane, the elements of op0 are numbered
46561 from 0 and the elements of op1 are numbered from 4. */
46562 if (e >= 8 + 4)
46563 e -= 8;
46564 else if (e >= 4)
46565 e -= 4;
46567 rperm[i] = GEN_INT (e);
46570 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46571 vperm = force_reg (V8SImode, vperm);
46572 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46574 return true;
46577 /* Return true if permutation D can be performed as a VMODE permutation
46578 instead. */
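/* For example, the V8HImode permutation { 0, 1, 4, 5, 2, 3, 6, 7 }
   keeps even/odd pairs together, so it is also valid as the V4SImode
   permutation { 0, 2, 1, 3 }. */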
46580 static bool
46581 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46583 unsigned int i, j, chunk;
46585 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46586 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46587 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46588 return false;
46590 if (GET_MODE_NUNITS (vmode) >= d->nelt)
46591 return true;
46593 chunk = d->nelt / GET_MODE_NUNITS (vmode);
46594 for (i = 0; i < d->nelt; i += chunk)
46595 if (d->perm[i] & (chunk - 1))
46596 return false;
46597 else
46598 for (j = 1; j < chunk; ++j)
46599 if (d->perm[i] + j != d->perm[i + j])
46600 return false;
46602 return true;
46605 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46606 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
46608 static bool
46609 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
46611 unsigned i, nelt, eltsz, mask;
46612 unsigned char perm[64];
46613 machine_mode vmode = V16QImode;
46614 rtx rperm[64], vperm, target, op0, op1;
46616 nelt = d->nelt;
46618 if (!d->one_operand_p)
46620 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
46622 if (TARGET_AVX2
46623 && valid_perm_using_mode_p (V2TImode, d))
46625 if (d->testing_p)
46626 return true;
46628 /* Use vperm2i128 insn. The pattern uses
46629 V4DImode instead of V2TImode. */
46630 target = d->target;
46631 if (d->vmode != V4DImode)
46632 target = gen_reg_rtx (V4DImode);
46633 op0 = gen_lowpart (V4DImode, d->op0);
46634 op1 = gen_lowpart (V4DImode, d->op1);
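/* The vperm2i128 immediate selects one of the four source lanes for
   each half of the result: the low bits (d->perm[0] / (nelt / 2))
   pick the lane of the low half, and (d->perm[nelt / 2] / (nelt / 2)) * 16,
   i.e. bits 4-5, picks the lane of the high half. */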
46635 rperm[0]
46636 = GEN_INT ((d->perm[0] / (nelt / 2))
46637 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
46638 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
46639 if (target != d->target)
46640 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46641 return true;
46643 return false;
46646 else
46648 if (GET_MODE_SIZE (d->vmode) == 16)
46650 if (!TARGET_SSSE3)
46651 return false;
46653 else if (GET_MODE_SIZE (d->vmode) == 32)
46655 if (!TARGET_AVX2)
46656 return false;
46658 /* V4DImode should already be handled through
46659 expand_vselect by the vpermq instruction. */
46660 gcc_assert (d->vmode != V4DImode);
46662 vmode = V32QImode;
46663 if (d->vmode == V8SImode
46664 || d->vmode == V16HImode
46665 || d->vmode == V32QImode)
46667 /* First see if vpermq can be used for
46668 V8SImode/V16HImode/V32QImode. */
46669 if (valid_perm_using_mode_p (V4DImode, d))
46671 for (i = 0; i < 4; i++)
46672 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
46673 if (d->testing_p)
46674 return true;
46675 target = gen_reg_rtx (V4DImode);
46676 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
46677 perm, 4, false))
46679 emit_move_insn (d->target,
46680 gen_lowpart (d->vmode, target));
46681 return true;
46683 return false;
46686 /* Next see if vpermd can be used. */
46687 if (valid_perm_using_mode_p (V8SImode, d))
46688 vmode = V8SImode;
46690 /* Or if vpermps can be used. */
46691 else if (d->vmode == V8SFmode)
46692 vmode = V8SImode;
46694 if (vmode == V32QImode)
46696 /* vpshufb only works within 128-bit lanes; it is not
46697 possible to shuffle bytes between the lanes. */
46698 for (i = 0; i < nelt; ++i)
46699 if ((d->perm[i] ^ i) & (nelt / 2))
46700 return false;
46703 else if (GET_MODE_SIZE (d->vmode) == 64)
46705 if (!TARGET_AVX512BW)
46706 return false;
46708 /* If vpermq didn't work, vpshufb won't work either. */
46709 if (d->vmode == V8DFmode || d->vmode == V8DImode)
46710 return false;
46712 vmode = V64QImode;
46713 if (d->vmode == V16SImode
46714 || d->vmode == V32HImode
46715 || d->vmode == V64QImode)
46717 /* First see if vpermq can be used for
46718 V16SImode/V32HImode/V64QImode. */
46719 if (valid_perm_using_mode_p (V8DImode, d))
46721 for (i = 0; i < 8; i++)
46722 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
46723 if (d->testing_p)
46724 return true;
46725 target = gen_reg_rtx (V8DImode);
46726 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
46727 perm, 8, false))
46729 emit_move_insn (d->target,
46730 gen_lowpart (d->vmode, target));
46731 return true;
46733 return false;
46736 /* Next see if vpermd can be used. */
46737 if (valid_perm_using_mode_p (V16SImode, d))
46738 vmode = V16SImode;
46740 /* Or if vpermps can be used. */
46741 else if (d->vmode == V16SFmode)
46742 vmode = V16SImode;
46743 if (vmode == V64QImode)
46745 /* vpshufb only works within 128-bit lanes; it is not
46746 possible to shuffle bytes between the lanes. */
46747 for (i = 0; i < nelt; ++i)
46748 if ((d->perm[i] ^ i) & (nelt / 4))
46749 return false;
46752 else
46753 return false;
46756 if (d->testing_p)
46757 return true;
46759 if (vmode == V8SImode)
46760 for (i = 0; i < 8; ++i)
46761 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
46762 else if (vmode == V16SImode)
46763 for (i = 0; i < 16; ++i)
46764 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
46765 else
46767 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46768 if (!d->one_operand_p)
46769 mask = 2 * nelt - 1;
46770 else if (vmode == V16QImode)
46771 mask = nelt - 1;
46772 else if (vmode == V64QImode)
46773 mask = nelt / 4 - 1;
46774 else
46775 mask = nelt / 2 - 1;
46777 for (i = 0; i < nelt; ++i)
46779 unsigned j, e = d->perm[i] & mask;
46780 for (j = 0; j < eltsz; ++j)
46781 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
46785 vperm = gen_rtx_CONST_VECTOR (vmode,
46786 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
46787 vperm = force_reg (vmode, vperm);
46789 target = d->target;
46790 if (d->vmode != vmode)
46791 target = gen_reg_rtx (vmode);
46792 op0 = gen_lowpart (vmode, d->op0);
46793 if (d->one_operand_p)
46795 if (vmode == V16QImode)
46796 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
46797 else if (vmode == V32QImode)
46798 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
46799 else if (vmode == V64QImode)
46800 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
46801 else if (vmode == V8SFmode)
46802 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
46803 else if (vmode == V8SImode)
46804 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
46805 else if (vmode == V16SFmode)
46806 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
46807 else if (vmode == V16SImode)
46808 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
46809 else
46810 gcc_unreachable ();
46812 else
46814 op1 = gen_lowpart (vmode, d->op1);
46815 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
46817 if (target != d->target)
46818 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46820 return true;
46823 /* For V*[QHS]Imode permutations, check whether the same permutation
46824 can be performed in a 2x, 4x or 8x wider inner mode. */
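/* For example, the V8HImode permutation { 2, 3, 0, 1, 6, 7, 4, 5 }
   moves adjacent pairs together, so it canonicalizes to the V4SImode
   permutation { 1, 0, 3, 2 }, which cannot be widened further. */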
46826 static bool
46827 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
46828 struct expand_vec_perm_d *nd)
46830 int i;
46831 machine_mode mode = VOIDmode;
46833 switch (d->vmode)
46835 case E_V16QImode: mode = V8HImode; break;
46836 case E_V32QImode: mode = V16HImode; break;
46837 case E_V64QImode: mode = V32HImode; break;
46838 case E_V8HImode: mode = V4SImode; break;
46839 case E_V16HImode: mode = V8SImode; break;
46840 case E_V32HImode: mode = V16SImode; break;
46841 case E_V4SImode: mode = V2DImode; break;
46842 case E_V8SImode: mode = V4DImode; break;
46843 case E_V16SImode: mode = V8DImode; break;
46844 default: return false;
46846 for (i = 0; i < d->nelt; i += 2)
46847 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
46848 return false;
46849 nd->vmode = mode;
46850 nd->nelt = d->nelt / 2;
46851 for (i = 0; i < nd->nelt; i++)
46852 nd->perm[i] = d->perm[2 * i] / 2;
46853 if (GET_MODE_INNER (mode) != DImode)
46854 canonicalize_vector_int_perm (nd, nd);
46855 if (nd != d)
46857 nd->one_operand_p = d->one_operand_p;
46858 nd->testing_p = d->testing_p;
46859 if (d->op0 == d->op1)
46860 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
46861 else
46863 nd->op0 = gen_lowpart (nd->vmode, d->op0);
46864 nd->op1 = gen_lowpart (nd->vmode, d->op1);
46866 if (d->testing_p)
46867 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
46868 else
46869 nd->target = gen_reg_rtx (nd->vmode);
46871 return true;
46874 /* Try to expand one-operand permutation with constant mask. */
46876 static bool
46877 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
46879 machine_mode mode = GET_MODE (d->op0);
46880 machine_mode maskmode = mode;
46881 rtx (*gen) (rtx, rtx, rtx) = NULL;
46882 rtx target, op0, mask;
46883 rtx vec[64];
46885 if (!rtx_equal_p (d->op0, d->op1))
46886 return false;
46888 if (!TARGET_AVX512F)
46889 return false;
46891 switch (mode)
46893 case E_V16SImode:
46894 gen = gen_avx512f_permvarv16si;
46895 break;
46896 case E_V16SFmode:
46897 gen = gen_avx512f_permvarv16sf;
46898 maskmode = V16SImode;
46899 break;
46900 case E_V8DImode:
46901 gen = gen_avx512f_permvarv8di;
46902 break;
46903 case E_V8DFmode:
46904 gen = gen_avx512f_permvarv8df;
46905 maskmode = V8DImode;
46906 break;
46907 default:
46908 return false;
46911 target = d->target;
46912 op0 = d->op0;
46913 for (int i = 0; i < d->nelt; ++i)
46914 vec[i] = GEN_INT (d->perm[i]);
46915 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
46916 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
46917 return true;
46920 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
46921 in a single instruction. */
46923 static bool
46924 expand_vec_perm_1 (struct expand_vec_perm_d *d)
46926 unsigned i, nelt = d->nelt;
46927 struct expand_vec_perm_d nd;
46929 /* Check plain VEC_SELECT first, because AVX has instructions that could
46930 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
46931 input where SEL+CONCAT may not. */
46932 if (d->one_operand_p)
46934 int mask = nelt - 1;
46935 bool identity_perm = true;
46936 bool broadcast_perm = true;
46938 for (i = 0; i < nelt; i++)
46940 nd.perm[i] = d->perm[i] & mask;
46941 if (nd.perm[i] != i)
46942 identity_perm = false;
46943 if (nd.perm[i])
46944 broadcast_perm = false;
46947 if (identity_perm)
46949 if (!d->testing_p)
46950 emit_move_insn (d->target, d->op0);
46951 return true;
46953 else if (broadcast_perm && TARGET_AVX2)
46955 /* Use vpbroadcast{b,w,d}. */
46956 rtx (*gen) (rtx, rtx) = NULL;
46957 switch (d->vmode)
46959 case E_V64QImode:
46960 if (TARGET_AVX512BW)
46961 gen = gen_avx512bw_vec_dupv64qi_1;
46962 break;
46963 case E_V32QImode:
46964 gen = gen_avx2_pbroadcastv32qi_1;
46965 break;
46966 case E_V32HImode:
46967 if (TARGET_AVX512BW)
46968 gen = gen_avx512bw_vec_dupv32hi_1;
46969 break;
46970 case E_V16HImode:
46971 gen = gen_avx2_pbroadcastv16hi_1;
46972 break;
46973 case E_V16SImode:
46974 if (TARGET_AVX512F)
46975 gen = gen_avx512f_vec_dupv16si_1;
46976 break;
46977 case E_V8SImode:
46978 gen = gen_avx2_pbroadcastv8si_1;
46979 break;
46980 case E_V16QImode:
46981 gen = gen_avx2_pbroadcastv16qi;
46982 break;
46983 case E_V8HImode:
46984 gen = gen_avx2_pbroadcastv8hi;
46985 break;
46986 case E_V16SFmode:
46987 if (TARGET_AVX512F)
46988 gen = gen_avx512f_vec_dupv16sf_1;
46989 break;
46990 case E_V8SFmode:
46991 gen = gen_avx2_vec_dupv8sf_1;
46992 break;
46993 case E_V8DFmode:
46994 if (TARGET_AVX512F)
46995 gen = gen_avx512f_vec_dupv8df_1;
46996 break;
46997 case E_V8DImode:
46998 if (TARGET_AVX512F)
46999 gen = gen_avx512f_vec_dupv8di_1;
47000 break;
47001 /* For other modes prefer other shuffles this function creates. */
47002 default: break;
47004 if (gen != NULL)
47006 if (!d->testing_p)
47007 emit_insn (gen (d->target, d->op0));
47008 return true;
47012 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
47013 return true;
47015 /* There are plenty of patterns in sse.md that are written for
47016 SEL+CONCAT and are not replicated for a single op. Perhaps
47017 that should be changed, to avoid the nastiness here. */
47019 /* Recognize interleave style patterns, which means incrementing
47020 every other permutation operand. */
47021 for (i = 0; i < nelt; i += 2)
47023 nd.perm[i] = d->perm[i] & mask;
47024 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
47026 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47027 d->testing_p))
47028 return true;
47030 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
47031 if (nelt >= 4)
47033 for (i = 0; i < nelt; i += 4)
47035 nd.perm[i + 0] = d->perm[i + 0] & mask;
47036 nd.perm[i + 1] = d->perm[i + 1] & mask;
47037 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
47038 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
47041 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47042 d->testing_p))
47043 return true;
47047 /* Finally, try the fully general two operand permute. */
47048 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
47049 d->testing_p))
47050 return true;
47052 /* Recognize interleave style patterns with reversed operands. */
47053 if (!d->one_operand_p)
47055 for (i = 0; i < nelt; ++i)
47057 unsigned e = d->perm[i];
47058 if (e >= nelt)
47059 e -= nelt;
47060 else
47061 e += nelt;
47062 nd.perm[i] = e;
47065 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
47066 d->testing_p))
47067 return true;
47070 /* Try the SSE4.1 blend variable merge instructions. */
47071 if (expand_vec_perm_blend (d))
47072 return true;
47074 /* Try one of the AVX vpermil variable permutations. */
47075 if (expand_vec_perm_vpermil (d))
47076 return true;
47078 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
47079 vpshufb, vpermd, vpermps or vpermq variable permutation. */
47080 if (expand_vec_perm_pshufb (d))
47081 return true;
47083 /* Try the AVX2 vpalignr instruction. */
47084 if (expand_vec_perm_palignr (d, true))
47085 return true;
47087 /* Try the AVX512F vperm{s,d} instructions. */
47088 if (ix86_expand_vec_one_operand_perm_avx512 (d))
47089 return true;
47091 /* Try the AVX512F vpermt2/vpermi2 instructions. */
47092 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
47093 return true;
47095 /* See if we can get the same permutation in different vector integer
47096 mode. */
47097 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47099 if (!d->testing_p)
47100 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47101 return true;
47103 return false;
47106 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47107 in terms of a pair of pshuflw + pshufhw instructions. */
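/* For example, { 2, 0, 3, 1, 5, 4, 7, 6 } is handled as a pshuflw
   { 2, 0, 3, 1, 4, 5, 6, 7 } followed by a pshufhw
   { 0, 1, 2, 3, 5, 4, 7, 6 }. */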
47109 static bool
47110 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
47112 unsigned char perm2[MAX_VECT_LEN];
47113 unsigned i;
47114 bool ok;
47116 if (d->vmode != V8HImode || !d->one_operand_p)
47117 return false;
47119 /* The two permutations only operate in 64-bit lanes. */
47120 for (i = 0; i < 4; ++i)
47121 if (d->perm[i] >= 4)
47122 return false;
47123 for (i = 4; i < 8; ++i)
47124 if (d->perm[i] < 4)
47125 return false;
47127 if (d->testing_p)
47128 return true;
47130 /* Emit the pshuflw. */
47131 memcpy (perm2, d->perm, 4);
47132 for (i = 4; i < 8; ++i)
47133 perm2[i] = i;
47134 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
47135 gcc_assert (ok);
47137 /* Emit the pshufhw. */
47138 memcpy (perm2 + 4, d->perm + 4, 4);
47139 for (i = 0; i < 4; ++i)
47140 perm2[i] = i;
47141 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
47142 gcc_assert (ok);
47144 return true;
47147 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47148 the permutation using the SSSE3 palignr instruction. This succeeds
47149 when all of the elements in PERM fit within one vector and we merely
47150 need to shift them down so that a single vector permutation has a
47151 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
47152 the vpalignr instruction itself can perform the requested permutation. */
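/* For example, the V8HImode permutation { 3, 4, 5, 6, 7, 8, 9, 10 }
   draws all elements from one window of eight consecutive elements of
   the concatenated operands, so a single palignr shifting by 3
   elements already produces the result (the in_order case below). */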
47154 static bool
47155 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
47157 unsigned i, nelt = d->nelt;
47158 unsigned min, max, minswap, maxswap;
47159 bool in_order, ok, swap = false;
47160 rtx shift, target;
47161 struct expand_vec_perm_d dcopy;
47163 /* Even with AVX, palignr only operates on 128-bit vectors;
47164 with AVX2, palignr operates on both 128-bit lanes. */
47165 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47166 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
47167 return false;
47169 min = 2 * nelt;
47170 max = 0;
47171 minswap = 2 * nelt;
47172 maxswap = 0;
47173 for (i = 0; i < nelt; ++i)
47175 unsigned e = d->perm[i];
47176 unsigned eswap = d->perm[i] ^ nelt;
47177 if (GET_MODE_SIZE (d->vmode) == 32)
47179 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
47180 eswap = e ^ (nelt / 2);
47182 if (e < min)
47183 min = e;
47184 if (e > max)
47185 max = e;
47186 if (eswap < minswap)
47187 minswap = eswap;
47188 if (eswap > maxswap)
47189 maxswap = eswap;
47191 if (min == 0
47192 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
47194 if (d->one_operand_p
47195 || minswap == 0
47196 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
47197 ? nelt / 2 : nelt))
47198 return false;
47199 swap = true;
47200 min = minswap;
47201 max = maxswap;
47204 /* Given that we have SSSE3, we know we'll be able to implement the
47205 single operand permutation after the palignr with pshufb for
47206 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
47207 first. */
47208 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
47209 return true;
47211 dcopy = *d;
47212 if (swap)
47214 dcopy.op0 = d->op1;
47215 dcopy.op1 = d->op0;
47216 for (i = 0; i < nelt; ++i)
47217 dcopy.perm[i] ^= nelt;
47220 in_order = true;
47221 for (i = 0; i < nelt; ++i)
47223 unsigned e = dcopy.perm[i];
47224 if (GET_MODE_SIZE (d->vmode) == 32
47225 && e >= nelt
47226 && (e & (nelt / 2 - 1)) < min)
47227 e = e - min - (nelt / 2);
47228 else
47229 e = e - min;
47230 if (e != i)
47231 in_order = false;
47232 dcopy.perm[i] = e;
47234 dcopy.one_operand_p = true;
47236 if (single_insn_only_p && !in_order)
47237 return false;
47239 /* For AVX2, test whether we can permute the result in one instruction. */
47240 if (d->testing_p)
47242 if (in_order)
47243 return true;
47244 dcopy.op1 = dcopy.op0;
47245 return expand_vec_perm_1 (&dcopy);
47248 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
47249 if (GET_MODE_SIZE (d->vmode) == 16)
47251 target = gen_reg_rtx (TImode);
47252 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
47253 gen_lowpart (TImode, dcopy.op0), shift));
47255 else
47257 target = gen_reg_rtx (V2TImode);
47258 emit_insn (gen_avx2_palignrv2ti (target,
47259 gen_lowpart (V2TImode, dcopy.op1),
47260 gen_lowpart (V2TImode, dcopy.op0),
47261 shift));
47264 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
47266 /* Test for the degenerate case where the alignment by itself
47267 produces the desired permutation. */
47268 if (in_order)
47270 emit_move_insn (d->target, dcopy.op0);
47271 return true;
47274 ok = expand_vec_perm_1 (&dcopy);
47275 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
47277 return ok;
47280 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47281 the permutation using the SSE4_1 pblendv instruction. Potentially
47282 reduces the permutation from 2 pshufb insns and an or to 1 pshufb and a pblendv. */
47284 static bool
47285 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
47287 unsigned i, which, nelt = d->nelt;
47288 struct expand_vec_perm_d dcopy, dcopy1;
47289 machine_mode vmode = d->vmode;
47290 bool ok;
47292 /* Use the same checks as in expand_vec_perm_blend. */
47293 if (d->one_operand_p)
47294 return false;
47295 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47297 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47299 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47301 else
47302 return false;
47304 /* Figure out which permutation elements are not in their
47305 respective positions, and which operand they come from. */
47306 for (i = 0, which = 0; i < nelt; ++i)
47308 unsigned e = d->perm[i];
47309 if (e != i)
47310 which |= (e < nelt ? 1 : 2);
47312 /* We can pblend the part where elements are not in their
47313 respective positions only when these elements all come from one
47314 operand of the permutation.
47315 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
47316 positions, but both are >= 8 (i.e. from the second operand).
47317 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
47318 respective positions, and 8 >= 8 but 2 is not. */
47319 if (which != 1 && which != 2)
47320 return false;
47321 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
47322 return true;
47324 /* First we apply a one-operand permutation to the part whose
47325 elements are not in their respective positions. */
47326 dcopy = *d;
47327 if (which == 2)
47328 dcopy.op0 = dcopy.op1 = d->op1;
47329 else
47330 dcopy.op0 = dcopy.op1 = d->op0;
47331 if (!d->testing_p)
47332 dcopy.target = gen_reg_rtx (vmode);
47333 dcopy.one_operand_p = true;
47335 for (i = 0; i < nelt; ++i)
47336 dcopy.perm[i] = d->perm[i] & (nelt - 1);
47338 ok = expand_vec_perm_1 (&dcopy);
47339 if (GET_MODE_SIZE (vmode) != 16 && !ok)
47340 return false;
47341 else
47342 gcc_assert (ok);
47343 if (d->testing_p)
47344 return true;
47346 /* Next we put permuted elements into their positions. */
47347 dcopy1 = *d;
47348 if (which == 2)
47349 dcopy1.op1 = dcopy.target;
47350 else
47351 dcopy1.op0 = dcopy.target;
47353 for (i = 0; i < nelt; ++i)
47354 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
47356 ok = expand_vec_perm_blend (&dcopy1);
47357 gcc_assert (ok);
47359 return true;
47362 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
47364 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47365 a two vector permutation into a single vector permutation by using
47366 an interleave operation to merge the vectors. */
47368 static bool
47369 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
47371 struct expand_vec_perm_d dremap, dfinal;
47372 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
47373 unsigned HOST_WIDE_INT contents;
47374 unsigned char remap[2 * MAX_VECT_LEN];
47375 rtx_insn *seq;
47376 bool ok, same_halves = false;
47378 if (GET_MODE_SIZE (d->vmode) == 16)
47380 if (d->one_operand_p)
47381 return false;
47383 else if (GET_MODE_SIZE (d->vmode) == 32)
47385 if (!TARGET_AVX)
47386 return false;
47387 /* For 32-byte modes allow even d->one_operand_p.
47388 The lack of cross-lane shuffling in some instructions
47389 might prevent a single insn shuffle. */
47390 dfinal = *d;
47391 dfinal.testing_p = true;
47392 /* If expand_vec_perm_interleave3 can expand this into
47393 a 3 insn sequence, give up and let it be expanded as
47394 a 3 insn sequence. While that is one insn longer,
47395 it doesn't need a memory operand, and in the common
47396 case where both the interleave low and interleave high
47397 permutations with the same operands are adjacent, it needs
47398 only 4 insns for both after CSE. */
47399 if (expand_vec_perm_interleave3 (&dfinal))
47400 return false;
47402 else
47403 return false;
47405 /* Examine from whence the elements come. */
47406 contents = 0;
47407 for (i = 0; i < nelt; ++i)
47408 contents |= HOST_WIDE_INT_1U << d->perm[i];
47410 memset (remap, 0xff, sizeof (remap));
47411 dremap = *d;
47413 if (GET_MODE_SIZE (d->vmode) == 16)
47415 unsigned HOST_WIDE_INT h1, h2, h3, h4;
47417 /* Split the two input vectors into 4 halves. */
47418 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47419 h2 = h1 << nelt2;
47420 h3 = h2 << nelt2;
47421 h4 = h3 << nelt2;
47423 /* If the elements all come from the low halves, use interleave low;
47424 similarly for interleave high. If the elements come from mis-matched
47425 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
47426 if ((contents & (h1 | h3)) == contents)
47428 /* punpckl* */
47429 for (i = 0; i < nelt2; ++i)
47431 remap[i] = i * 2;
47432 remap[i + nelt] = i * 2 + 1;
47433 dremap.perm[i * 2] = i;
47434 dremap.perm[i * 2 + 1] = i + nelt;
47436 if (!TARGET_SSE2 && d->vmode == V4SImode)
47437 dremap.vmode = V4SFmode;
47439 else if ((contents & (h2 | h4)) == contents)
47441 /* punpckh* */
47442 for (i = 0; i < nelt2; ++i)
47444 remap[i + nelt2] = i * 2;
47445 remap[i + nelt + nelt2] = i * 2 + 1;
47446 dremap.perm[i * 2] = i + nelt2;
47447 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
47449 if (!TARGET_SSE2 && d->vmode == V4SImode)
47450 dremap.vmode = V4SFmode;
47452 else if ((contents & (h1 | h4)) == contents)
47454 /* shufps */
47455 for (i = 0; i < nelt2; ++i)
47457 remap[i] = i;
47458 remap[i + nelt + nelt2] = i + nelt2;
47459 dremap.perm[i] = i;
47460 dremap.perm[i + nelt2] = i + nelt + nelt2;
47462 if (nelt != 4)
47464 /* shufpd */
47465 dremap.vmode = V2DImode;
47466 dremap.nelt = 2;
47467 dremap.perm[0] = 0;
47468 dremap.perm[1] = 3;
47471 else if ((contents & (h2 | h3)) == contents)
47473 /* shufps */
47474 for (i = 0; i < nelt2; ++i)
47476 remap[i + nelt2] = i;
47477 remap[i + nelt] = i + nelt2;
47478 dremap.perm[i] = i + nelt2;
47479 dremap.perm[i + nelt2] = i + nelt;
47481 if (nelt != 4)
47483 /* shufpd */
47484 dremap.vmode = V2DImode;
47485 dremap.nelt = 2;
47486 dremap.perm[0] = 1;
47487 dremap.perm[1] = 2;
47490 else
47491 return false;
47493 else
47495 unsigned int nelt4 = nelt / 4, nzcnt = 0;
47496 unsigned HOST_WIDE_INT q[8];
47497 unsigned int nonzero_halves[4];
47499 /* Split the two input vectors into 8 quarters. */
47500 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47501 for (i = 1; i < 8; ++i)
47502 q[i] = q[0] << (nelt4 * i);
47503 for (i = 0; i < 4; ++i)
47504 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47506 nonzero_halves[nzcnt] = i;
47507 ++nzcnt;
47510 if (nzcnt == 1)
47512 gcc_assert (d->one_operand_p);
47513 nonzero_halves[1] = nonzero_halves[0];
47514 same_halves = true;
47516 else if (d->one_operand_p)
47518 gcc_assert (nonzero_halves[0] == 0);
47519 gcc_assert (nonzero_halves[1] == 1);
47522 if (nzcnt <= 2)
47524 if (d->perm[0] / nelt2 == nonzero_halves[1])
47526 /* Attempt to increase the likelihood that dfinal
47527 shuffle will be intra-lane. */
47528 std::swap (nonzero_halves[0], nonzero_halves[1]);
47531 /* vperm2f128 or vperm2i128. */
47532 for (i = 0; i < nelt2; ++i)
47534 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47535 remap[i + nonzero_halves[0] * nelt2] = i;
47536 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47537 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47540 if (d->vmode != V8SFmode
47541 && d->vmode != V4DFmode
47542 && d->vmode != V8SImode)
47544 dremap.vmode = V8SImode;
47545 dremap.nelt = 8;
47546 for (i = 0; i < 4; ++i)
47548 dremap.perm[i] = i + nonzero_halves[0] * 4;
47549 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47553 else if (d->one_operand_p)
47554 return false;
47555 else if (TARGET_AVX2
47556 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47558 /* vpunpckl* */
47559 for (i = 0; i < nelt4; ++i)
47561 remap[i] = i * 2;
47562 remap[i + nelt] = i * 2 + 1;
47563 remap[i + nelt2] = i * 2 + nelt2;
47564 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47565 dremap.perm[i * 2] = i;
47566 dremap.perm[i * 2 + 1] = i + nelt;
47567 dremap.perm[i * 2 + nelt2] = i + nelt2;
47568 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47571 else if (TARGET_AVX2
47572 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47574 /* vpunpckh* */
47575 for (i = 0; i < nelt4; ++i)
47577 remap[i + nelt4] = i * 2;
47578 remap[i + nelt + nelt4] = i * 2 + 1;
47579 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47580 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47581 dremap.perm[i * 2] = i + nelt4;
47582 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47583 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47584 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47587 else
47588 return false;
47591 /* Use the remapping array set up above to move the elements from their
47592 swizzled locations into their final destinations. */
47593 dfinal = *d;
47594 for (i = 0; i < nelt; ++i)
47596 unsigned e = remap[d->perm[i]];
47597 gcc_assert (e < nelt);
47598 /* If same_halves is true, both halves of the remapped vector are the
47599 same. Avoid cross-lane accesses if possible. */
47600 if (same_halves && i >= nelt2)
47602 gcc_assert (e < nelt2);
47603 dfinal.perm[i] = e + nelt2;
47605 else
47606 dfinal.perm[i] = e;
47608 if (!d->testing_p)
47610 dremap.target = gen_reg_rtx (dremap.vmode);
47611 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47613 dfinal.op1 = dfinal.op0;
47614 dfinal.one_operand_p = true;
47616 /* Test if the final remap can be done with a single insn. For V4SFmode or
47617 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
47618 start_sequence ();
47619 ok = expand_vec_perm_1 (&dfinal);
47620 seq = get_insns ();
47621 end_sequence ();
47623 if (!ok)
47624 return false;
47626 if (d->testing_p)
47627 return true;
47629 if (dremap.vmode != dfinal.vmode)
47631 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
47632 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
47635 ok = expand_vec_perm_1 (&dremap);
47636 gcc_assert (ok);
47638 emit_insn (seq);
47639 return true;
47642 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47643 a single vector cross-lane permutation into vpermq followed
47644 by any of the single insn permutations. */
47646 static bool
47647 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
47649 struct expand_vec_perm_d dremap, dfinal;
47650 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
47651 unsigned contents[2];
47652 bool ok;
47654 if (!(TARGET_AVX2
47655 && (d->vmode == V32QImode || d->vmode == V16HImode)
47656 && d->one_operand_p))
47657 return false;
47659 contents[0] = 0;
47660 contents[1] = 0;
47661 for (i = 0; i < nelt2; ++i)
47663 contents[0] |= 1u << (d->perm[i] / nelt4);
47664 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
47667 for (i = 0; i < 2; ++i)
47669 unsigned int cnt = 0;
47670 for (j = 0; j < 4; ++j)
47671 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
47672 return false;
47675 if (d->testing_p)
47676 return true;
47678 dremap = *d;
47679 dremap.vmode = V4DImode;
47680 dremap.nelt = 4;
47681 dremap.target = gen_reg_rtx (V4DImode);
47682 dremap.op0 = gen_lowpart (V4DImode, d->op0);
47683 dremap.op1 = dremap.op0;
47684 dremap.one_operand_p = true;
47685 for (i = 0; i < 2; ++i)
47687 unsigned int cnt = 0;
47688 for (j = 0; j < 4; ++j)
47689 if ((contents[i] & (1u << j)) != 0)
47690 dremap.perm[2 * i + cnt++] = j;
47691 for (; cnt < 2; ++cnt)
47692 dremap.perm[2 * i + cnt] = 0;
47695 dfinal = *d;
47696 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47697 dfinal.op1 = dfinal.op0;
47698 dfinal.one_operand_p = true;
47699 for (i = 0, j = 0; i < nelt; ++i)
47701 if (i == nelt2)
47702 j = 2;
47703 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
47704 if ((d->perm[i] / nelt4) == dremap.perm[j])
47706 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
47707 dfinal.perm[i] |= nelt4;
47708 else
47709 gcc_unreachable ();
47712 ok = expand_vec_perm_1 (&dremap);
47713 gcc_assert (ok);
47715 ok = expand_vec_perm_1 (&dfinal);
47716 gcc_assert (ok);
47718 return true;
47721 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
47722 a vector permutation using two instructions: vperm2f128 (or
47723 vperm2i128) followed by any single in-lane permutation. */
47725 static bool
47726 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
47728 struct expand_vec_perm_d dfirst, dsecond;
47729 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
47730 bool ok;
47732 if (!TARGET_AVX
47733 || GET_MODE_SIZE (d->vmode) != 32
47734 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
47735 return false;
47737 dsecond = *d;
47738 dsecond.one_operand_p = false;
47739 dsecond.testing_p = true;
47741 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
47742 immediate. For perm < 16 the second permutation uses
47743 d->op0 as its first operand; for perm >= 16 it uses d->op1
47744 as its first operand. The second operand is the result of
47745 vperm2[fi]128. */
47746 for (perm = 0; perm < 32; perm++)
47748 /* Ignore permutations which do not move anything cross-lane. */
47749 if (perm < 16)
47751 /* The second shuffle for e.g. V4DFmode has
47752 0123 and ABCD operands.
47753 Ignore AB23, as 23 is already in the second lane
47754 of the first operand. */
47755 if ((perm & 0xc) == (1 << 2)) continue;
47756 /* And 01CD, as 01 is in the first lane of the first
47757 operand. */
47758 if ((perm & 3) == 0) continue;
47759 /* And 4567, as then the vperm2[fi]128 doesn't change
47760 anything on the original 4567 second operand. */
47761 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
47763 else
47765 /* The second shuffle for e.g. V4DFmode has
47766 4567 and ABCD operands.
47767 Ignore AB67, as 67 is already in the second lane
47768 of the first operand. */
47769 if ((perm & 0xc) == (3 << 2)) continue;
47770 /* And 45CD, as 45 is in the first lane of the first
47771 operand. */
47772 if ((perm & 3) == 2) continue;
47773 /* And 0123, as then the vperm2[fi]128 doesn't change
47774 anything on the original 0123 first operand. */
47775 if ((perm & 0xf) == (1 << 2)) continue;
47778 for (i = 0; i < nelt; i++)
47780 j = d->perm[i] / nelt2;
47781 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
47782 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
47783 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
47784 dsecond.perm[i] = d->perm[i] & (nelt - 1);
47785 else
47786 break;
47789 if (i == nelt)
47791 start_sequence ();
47792 ok = expand_vec_perm_1 (&dsecond);
47793 end_sequence ();
47795 else
47796 ok = false;
47798 if (ok)
47800 if (d->testing_p)
47801 return true;
47803 /* Found a usable second shuffle. dfirst will be
47804 vperm2f128 on d->op0 and d->op1. */
47805 dsecond.testing_p = false;
47806 dfirst = *d;
47807 dfirst.target = gen_reg_rtx (d->vmode);
47808 for (i = 0; i < nelt; i++)
47809 dfirst.perm[i] = (i & (nelt2 - 1))
47810 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
47812 canonicalize_perm (&dfirst);
47813 ok = expand_vec_perm_1 (&dfirst);
47814 gcc_assert (ok);
47816 /* And dsecond is some single insn shuffle, taking
47817 d->op0 and result of vperm2f128 (if perm < 16) or
47818 d->op1 and result of vperm2f128 (otherwise). */
47819 if (perm >= 16)
47820 dsecond.op0 = dsecond.op1;
47821 dsecond.op1 = dfirst.target;
47823 ok = expand_vec_perm_1 (&dsecond);
47824 gcc_assert (ok);
47826 return true;
47829 /* For one operand, the only useful vperm2f128 permutation is 0x01,
47830 i.e. swapping the two lanes. */
47831 if (d->one_operand_p)
47832 return false;
47835 return false;
47838 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47839 a two vector permutation using 2 intra-lane interleave insns
47840 and cross-lane shuffle for 32-byte vectors. */
47842 static bool
47843 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
47845 unsigned i, nelt;
47846 rtx (*gen) (rtx, rtx, rtx);
47848 if (d->one_operand_p)
47849 return false;
47850 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
47852 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
47854 else
47855 return false;
47857 nelt = d->nelt;
47858 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
47859 return false;
47860 for (i = 0; i < nelt; i += 2)
47861 if (d->perm[i] != d->perm[0] + i / 2
47862 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
47863 return false;
47865 if (d->testing_p)
47866 return true;
47868 switch (d->vmode)
47870 case E_V32QImode:
47871 if (d->perm[0])
47872 gen = gen_vec_interleave_highv32qi;
47873 else
47874 gen = gen_vec_interleave_lowv32qi;
47875 break;
47876 case E_V16HImode:
47877 if (d->perm[0])
47878 gen = gen_vec_interleave_highv16hi;
47879 else
47880 gen = gen_vec_interleave_lowv16hi;
47881 break;
47882 case E_V8SImode:
47883 if (d->perm[0])
47884 gen = gen_vec_interleave_highv8si;
47885 else
47886 gen = gen_vec_interleave_lowv8si;
47887 break;
47888 case E_V4DImode:
47889 if (d->perm[0])
47890 gen = gen_vec_interleave_highv4di;
47891 else
47892 gen = gen_vec_interleave_lowv4di;
47893 break;
47894 case E_V8SFmode:
47895 if (d->perm[0])
47896 gen = gen_vec_interleave_highv8sf;
47897 else
47898 gen = gen_vec_interleave_lowv8sf;
47899 break;
47900 case E_V4DFmode:
47901 if (d->perm[0])
47902 gen = gen_vec_interleave_highv4df;
47903 else
47904 gen = gen_vec_interleave_lowv4df;
47905 break;
47906 default:
47907 gcc_unreachable ();
47910 emit_insn (gen (d->target, d->op0, d->op1));
47911 return true;
47914 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
47915 a single vector permutation using a single intra-lane vector
47916 permutation, vperm2f128 swapping the lanes and vblend* insn blending
47917 the non-swapped and swapped vectors together. */
47919 static bool
47920 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
47922 struct expand_vec_perm_d dfirst, dsecond;
47923 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
47924 rtx_insn *seq;
47925 bool ok;
47926 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
47928 if (!TARGET_AVX
47929 || TARGET_AVX2
47930 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
47931 || !d->one_operand_p)
47932 return false;
47934 dfirst = *d;
47935 for (i = 0; i < nelt; i++)
47936 dfirst.perm[i] = 0xff;
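/* Build an intra-lane permutation DFIRST that places every requested
   element in the lane it already occupies; MSK marks the result positions
   whose element lands in the wrong lane and therefore has to be taken
   from the lane-swapped copy created below. */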
47937 for (i = 0, msk = 0; i < nelt; i++)
47939 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
47940 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
47941 return false;
47942 dfirst.perm[j] = d->perm[i];
47943 if (j != i)
47944 msk |= (1 << i);
47946 for (i = 0; i < nelt; i++)
47947 if (dfirst.perm[i] == 0xff)
47948 dfirst.perm[i] = i;
47950 if (!d->testing_p)
47951 dfirst.target = gen_reg_rtx (dfirst.vmode);
47953 start_sequence ();
47954 ok = expand_vec_perm_1 (&dfirst);
47955 seq = get_insns ();
47956 end_sequence ();
47958 if (!ok)
47959 return false;
47961 if (d->testing_p)
47962 return true;
47964 emit_insn (seq);
47966 dsecond = *d;
47967 dsecond.op0 = dfirst.target;
47968 dsecond.op1 = dfirst.target;
47969 dsecond.one_operand_p = true;
47970 dsecond.target = gen_reg_rtx (dsecond.vmode);
47971 for (i = 0; i < nelt; i++)
47972 dsecond.perm[i] = i ^ nelt2;
47974 ok = expand_vec_perm_1 (&dsecond);
47975 gcc_assert (ok);
47977 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
47978 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
47979 return true;
47982 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
47983 permutation using two vperm2f128, followed by a vshufpd insn blending
47984 the two vectors together. */
47986 static bool
47987 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
47989 struct expand_vec_perm_d dfirst, dsecond, dthird;
47990 bool ok;
47992 if (!TARGET_AVX || (d->vmode != V4DFmode))
47993 return false;
47995 if (d->testing_p)
47996 return true;
47998 dfirst = *d;
47999 dsecond = *d;
48000 dthird = *d;
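/* DFIRST gathers the even/odd pairs holding d->perm[0] and d->perm[2],
   DSECOND those holding d->perm[1] and d->perm[3]; DTHIRD then picks the
   wanted element of each pair, alternating between the two intermediate
   results, which is exactly what vshufpd can do. */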
48002 dfirst.perm[0] = (d->perm[0] & ~1);
48003 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
48004 dfirst.perm[2] = (d->perm[2] & ~1);
48005 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
48006 dsecond.perm[0] = (d->perm[1] & ~1);
48007 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
48008 dsecond.perm[2] = (d->perm[3] & ~1);
48009 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
48010 dthird.perm[0] = (d->perm[0] % 2);
48011 dthird.perm[1] = (d->perm[1] % 2) + 4;
48012 dthird.perm[2] = (d->perm[2] % 2) + 2;
48013 dthird.perm[3] = (d->perm[3] % 2) + 6;
48015 dfirst.target = gen_reg_rtx (dfirst.vmode);
48016 dsecond.target = gen_reg_rtx (dsecond.vmode);
48017 dthird.op0 = dfirst.target;
48018 dthird.op1 = dsecond.target;
48019 dthird.one_operand_p = false;
48021 canonicalize_perm (&dfirst);
48022 canonicalize_perm (&dsecond);
48024 ok = expand_vec_perm_1 (&dfirst)
48025 && expand_vec_perm_1 (&dsecond)
48026 && expand_vec_perm_1 (&dthird);
48028 gcc_assert (ok);
48030 return true;
48033 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
48034 permutation with two pshufb insns and an ior. We should have already
48035 failed all two instruction sequences. */
48037 static bool
48038 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
48040 rtx rperm[2][16], vperm, l, h, op, m128;
48041 unsigned int i, nelt, eltsz;
48043 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
48044 return false;
48045 gcc_assert (!d->one_operand_p);
48047 if (d->testing_p)
48048 return true;
48050 nelt = d->nelt;
48051 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48053 /* Generate two permutation masks. If the required element is within
48054 the given vector it is shuffled into the proper lane. If the required
48055 element is in the other vector, force a zero into the lane by setting
48056 bit 7 in the permutation mask. */
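/* For instance, extracting the even V8HImode elements { 0 2 4 6 8 10 12 14 }
   yields the byte masks { 0 1 4 5 8 9 12 13 } followed by eight -128s
   for op0, and eight -128s followed by { 0 1 4 5 8 9 12 13 } for op1. */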
48057 m128 = GEN_INT (-128);
48058 for (i = 0; i < nelt; ++i)
48060 unsigned j, e = d->perm[i];
48061 unsigned which = (e >= nelt);
48062 if (e >= nelt)
48063 e -= nelt;
48065 for (j = 0; j < eltsz; ++j)
48067 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
48068 rperm[1-which][i*eltsz + j] = m128;
48072 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
48073 vperm = force_reg (V16QImode, vperm);
48075 l = gen_reg_rtx (V16QImode);
48076 op = gen_lowpart (V16QImode, d->op0);
48077 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
48079 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
48080 vperm = force_reg (V16QImode, vperm);
48082 h = gen_reg_rtx (V16QImode);
48083 op = gen_lowpart (V16QImode, d->op1);
48084 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
48086 op = d->target;
48087 if (d->vmode != V16QImode)
48088 op = gen_reg_rtx (V16QImode);
48089 emit_insn (gen_iorv16qi3 (op, l, h));
48090 if (op != d->target)
48091 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48093 return true;
48096 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
48097 with two vpshufb insns, vpermq and vpor. We should have already failed
48098 all two- or three-instruction sequences. */
48100 static bool
48101 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
48103 rtx rperm[2][32], vperm, l, h, hp, op, m128;
48104 unsigned int i, nelt, eltsz;
48106 if (!TARGET_AVX2
48107 || !d->one_operand_p
48108 || (d->vmode != V32QImode && d->vmode != V16HImode))
48109 return false;
48111 if (d->testing_p)
48112 return true;
48114 nelt = d->nelt;
48115 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48117 /* Generate two permutation masks. If the required element is within
48118 the same lane, it is shuffled in. If the required element is in the
48119 other lane, force a zero by setting bit 7 in the permutation mask.
48120 The other mask has non-negative elements where the element is
48121 requested from the other lane, but it is also placed in the other lane,
48122 so that the result of vpshufb can have the two V2TImode halves
48123 swapped. */
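/* The cross-lane bytes are first gathered within their source lane using
   the rperm[1] mask; the vpermq below then swaps the two 128-bit lanes so
   they end up in their destination lane, and the final vpor merges them
   with the in-lane bytes selected by the rperm[0] mask. */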
48124 m128 = GEN_INT (-128);
48125 for (i = 0; i < nelt; ++i)
48127 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48128 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48130 for (j = 0; j < eltsz; ++j)
48132 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
48133 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
48137 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48138 vperm = force_reg (V32QImode, vperm);
48140 h = gen_reg_rtx (V32QImode);
48141 op = gen_lowpart (V32QImode, d->op0);
48142 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48144 /* Swap the 128-bit lanes of h into hp. */
48145 hp = gen_reg_rtx (V4DImode);
48146 op = gen_lowpart (V4DImode, h);
48147 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
48148 const1_rtx));
48150 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48151 vperm = force_reg (V32QImode, vperm);
48153 l = gen_reg_rtx (V32QImode);
48154 op = gen_lowpart (V32QImode, d->op0);
48155 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48157 op = d->target;
48158 if (d->vmode != V32QImode)
48159 op = gen_reg_rtx (V32QImode);
48160 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
48161 if (op != d->target)
48162 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48164 return true;
48167 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48168 and extract-odd permutations of two V32QImode or V16HImode operands
48169 with two vpshufb insns, vpor and vpermq. We should have already
48170 failed all two- or three-instruction sequences. */
48172 static bool
48173 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
48175 rtx rperm[2][32], vperm, l, h, ior, op, m128;
48176 unsigned int i, nelt, eltsz;
48178 if (!TARGET_AVX2
48179 || d->one_operand_p
48180 || (d->vmode != V32QImode && d->vmode != V16HImode))
48181 return false;
48183 for (i = 0; i < d->nelt; ++i)
48184 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
48185 return false;
48187 if (d->testing_p)
48188 return true;
48190 nelt = d->nelt;
48191 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48193 /* Generate two permutation masks. In the first permutation mask
48194 the first quarter will contain indexes for the first half
48195 of the op0, the second quarter will contain bit 7 set, third quarter
48196 will contain indexes for the second half of the op0 and the
48197 last quarter bit 7 set. In the second permutation mask
48198 the first quarter will contain bit 7 set, the second quarter
48199 indexes for the first half of the op1, the third quarter bit 7 set
48200 and last quarter indexes for the second half of the op1.
48201 I.e. the first mask e.g. for V32QImode extract even will be:
48202 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
48203 (all values masked with 0xf except for -128) and second mask
48204 for extract even will be
48205 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
48206 m128 = GEN_INT (-128);
48207 for (i = 0; i < nelt; ++i)
48209 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48210 unsigned which = d->perm[i] >= nelt;
48211 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
48213 for (j = 0; j < eltsz; ++j)
48215 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
48216 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
48220 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48221 vperm = force_reg (V32QImode, vperm);
48223 l = gen_reg_rtx (V32QImode);
48224 op = gen_lowpart (V32QImode, d->op0);
48225 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48227 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48228 vperm = force_reg (V32QImode, vperm);
48230 h = gen_reg_rtx (V32QImode);
48231 op = gen_lowpart (V32QImode, d->op1);
48232 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48234 ior = gen_reg_rtx (V32QImode);
48235 emit_insn (gen_iorv32qi3 (ior, l, h));
48237 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
48238 op = gen_reg_rtx (V4DImode);
48239 ior = gen_lowpart (V4DImode, ior);
48240 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
48241 const1_rtx, GEN_INT (3)));
48242 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48244 return true;
48247 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48248 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
48249 with two "and" and "pack" or two "shift" and "pack" insns. We should
48250 have already failed all two instruction sequences. */
48252 static bool
48253 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
48255 rtx op, dop0, dop1, t;
48256 unsigned i, odd, c, s, nelt = d->nelt;
48257 bool end_perm = false;
48258 machine_mode half_mode;
48259 rtx (*gen_and) (rtx, rtx, rtx);
48260 rtx (*gen_pack) (rtx, rtx, rtx);
48261 rtx (*gen_shift) (rtx, rtx, rtx);
48263 if (d->one_operand_p)
48264 return false;
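/* C is the mask that, ANDed with each wide element, keeps its low (even)
   half; S is the shift count that moves the high (odd) half down. Either
   way the subsequent pack instruction narrows the two intermediate results
   into D->TARGET. */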
48266 switch (d->vmode)
48268 case E_V8HImode:
48269 /* Required for "pack". */
48270 if (!TARGET_SSE4_1)
48271 return false;
48272 c = 0xffff;
48273 s = 16;
48274 half_mode = V4SImode;
48275 gen_and = gen_andv4si3;
48276 gen_pack = gen_sse4_1_packusdw;
48277 gen_shift = gen_lshrv4si3;
48278 break;
48279 case E_V16QImode:
48280 /* No check as all instructions are SSE2. */
48281 c = 0xff;
48282 s = 8;
48283 half_mode = V8HImode;
48284 gen_and = gen_andv8hi3;
48285 gen_pack = gen_sse2_packuswb;
48286 gen_shift = gen_lshrv8hi3;
48287 break;
48288 case E_V16HImode:
48289 if (!TARGET_AVX2)
48290 return false;
48291 c = 0xffff;
48292 s = 16;
48293 half_mode = V8SImode;
48294 gen_and = gen_andv8si3;
48295 gen_pack = gen_avx2_packusdw;
48296 gen_shift = gen_lshrv8si3;
48297 end_perm = true;
48298 break;
48299 case E_V32QImode:
48300 if (!TARGET_AVX2)
48301 return false;
48302 c = 0xff;
48303 s = 8;
48304 half_mode = V16HImode;
48305 gen_and = gen_andv16hi3;
48306 gen_pack = gen_avx2_packuswb;
48307 gen_shift = gen_lshrv16hi3;
48308 end_perm = true;
48309 break;
48310 default:
48311 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
48312 general shuffles. */
48313 return false;
48316 /* Check that permutation is even or odd. */
48317 odd = d->perm[0];
48318 if (odd > 1)
48319 return false;
48321 for (i = 1; i < nelt; ++i)
48322 if (d->perm[i] != 2 * i + odd)
48323 return false;
48325 if (d->testing_p)
48326 return true;
48328 dop0 = gen_reg_rtx (half_mode);
48329 dop1 = gen_reg_rtx (half_mode);
48330 if (odd == 0)
48332 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
48333 t = force_reg (half_mode, t);
48334 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
48335 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
48337 else
48339 emit_insn (gen_shift (dop0,
48340 gen_lowpart (half_mode, d->op0),
48341 GEN_INT (s)));
48342 emit_insn (gen_shift (dop1,
48343 gen_lowpart (half_mode, d->op1),
48344 GEN_INT (s)));
48346 /* In the AVX2 256-bit case we need to permute the pack result. */
48347 if (TARGET_AVX2 && end_perm)
48349 op = gen_reg_rtx (d->vmode);
48350 t = gen_reg_rtx (V4DImode);
48351 emit_insn (gen_pack (op, dop0, dop1));
48352 emit_insn (gen_avx2_permv4di_1 (t,
48353 gen_lowpart (V4DImode, op),
48354 const0_rtx,
48355 const2_rtx,
48356 const1_rtx,
48357 GEN_INT (3)));
48358 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
48360 else
48361 emit_insn (gen_pack (d->target, dop0, dop1));
48363 return true;
48366 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48367 and extract-odd permutations of two V64QI operands
48368 with two "shifts", two "truncs" and one "concat" insns for "odd"
48369 and two "truncs" and one "concat" insn for "even".
48370 We should have already failed all two-instruction sequences. */
48372 static bool
48373 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
48375 rtx t1, t2, t3, t4;
48376 unsigned i, odd, nelt = d->nelt;
48378 if (!TARGET_AVX512BW
48379 || d->one_operand_p
48380 || d->vmode != V64QImode)
48381 return false;
48383 /* Check that permutation is even or odd. */
48384 odd = d->perm[0];
48385 if (odd > 1)
48386 return false;
48388 for (i = 1; i < nelt; ++i)
48389 if (d->perm[i] != 2 * i + odd)
48390 return false;
48392 if (d->testing_p)
48393 return true;
48396 if (odd)
48398 t1 = gen_reg_rtx (V32HImode);
48399 t2 = gen_reg_rtx (V32HImode);
48400 emit_insn (gen_lshrv32hi3 (t1,
48401 gen_lowpart (V32HImode, d->op0),
48402 GEN_INT (8)));
48403 emit_insn (gen_lshrv32hi3 (t2,
48404 gen_lowpart (V32HImode, d->op1),
48405 GEN_INT (8)));
48407 else
48409 t1 = gen_lowpart (V32HImode, d->op0);
48410 t2 = gen_lowpart (V32HImode, d->op1);
48413 t3 = gen_reg_rtx (V32QImode);
48414 t4 = gen_reg_rtx (V32QImode);
48415 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
48416 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
48417 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48419 return true;
48422 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
48423 and extract-odd permutations. */
48425 static bool
48426 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
48428 rtx t1, t2, t3, t4, t5;
48430 switch (d->vmode)
48432 case E_V4DFmode:
48433 if (d->testing_p)
48434 break;
48435 t1 = gen_reg_rtx (V4DFmode);
48436 t2 = gen_reg_rtx (V4DFmode);
48438 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48439 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
48440 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
48442 /* Now an unpck[lh]pd will produce the result required. */
48443 if (odd)
48444 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
48445 else
48446 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
48447 emit_insn (t3);
48448 break;
48450 case E_V8SFmode:
48452 int mask = odd ? 0xdd : 0x88;
48454 if (d->testing_p)
48455 break;
48456 t1 = gen_reg_rtx (V8SFmode);
48457 t2 = gen_reg_rtx (V8SFmode);
48458 t3 = gen_reg_rtx (V8SFmode);
48460 /* Shuffle within the 128-bit lanes to produce:
48461 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
48462 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
48463 GEN_INT (mask)));
48465 /* Shuffle the lanes around to produce:
48466 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
48467 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
48468 GEN_INT (0x3)));
48470 /* Shuffle within the 128-bit lanes to produce:
48471 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
48472 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
48474 /* Shuffle within the 128-bit lanes to produce:
48475 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
48476 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
48478 /* Shuffle the lanes around to produce:
48479 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
48480 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
48481 GEN_INT (0x20)));
48483 break;
48485 case E_V2DFmode:
48486 case E_V4SFmode:
48487 case E_V2DImode:
48488 case E_V4SImode:
48489 /* These are always directly implementable by expand_vec_perm_1. */
48490 gcc_unreachable ();
48492 case E_V8HImode:
48493 if (TARGET_SSE4_1)
48494 return expand_vec_perm_even_odd_pack (d);
48495 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48496 return expand_vec_perm_pshufb2 (d);
48497 else
48499 if (d->testing_p)
48500 break;
48501 /* We need 2*log2(N)-1 operations to achieve odd/even
48502 with interleave. */
48503 t1 = gen_reg_rtx (V8HImode);
48504 t2 = gen_reg_rtx (V8HImode);
48505 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48506 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48507 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48508 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
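/* At this point d->target holds { 0 4 8 12 1 5 9 13 } and t2 holds
   { 2 6 10 14 3 7 11 15 } (numbering the 16 input elements 0..15), so the
   final interleave below produces the even resp. odd elements in order. */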
48509 if (odd)
48510 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48511 else
48512 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48513 emit_insn (t3);
48515 break;
48517 case E_V16QImode:
48518 return expand_vec_perm_even_odd_pack (d);
48520 case E_V16HImode:
48521 case E_V32QImode:
48522 return expand_vec_perm_even_odd_pack (d);
48524 case E_V64QImode:
48525 return expand_vec_perm_even_odd_trunc (d);
48527 case E_V4DImode:
48528 if (!TARGET_AVX2)
48530 struct expand_vec_perm_d d_copy = *d;
48531 d_copy.vmode = V4DFmode;
48532 if (d->testing_p)
48533 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48534 else
48535 d_copy.target = gen_reg_rtx (V4DFmode);
48536 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48537 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48538 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48540 if (!d->testing_p)
48541 emit_move_insn (d->target,
48542 gen_lowpart (V4DImode, d_copy.target));
48543 return true;
48545 return false;
48548 if (d->testing_p)
48549 break;
48551 t1 = gen_reg_rtx (V4DImode);
48552 t2 = gen_reg_rtx (V4DImode);
48554 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48555 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48556 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48558 /* Now a vpunpck[lh]qdq will produce the result required. */
48559 if (odd)
48560 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48561 else
48562 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48563 emit_insn (t3);
48564 break;
48566 case E_V8SImode:
48567 if (!TARGET_AVX2)
48569 struct expand_vec_perm_d d_copy = *d;
48570 d_copy.vmode = V8SFmode;
48571 if (d->testing_p)
48572 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48573 else
48574 d_copy.target = gen_reg_rtx (V8SFmode);
48575 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48576 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48577 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48579 if (!d->testing_p)
48580 emit_move_insn (d->target,
48581 gen_lowpart (V8SImode, d_copy.target));
48582 return true;
48584 return false;
48587 if (d->testing_p)
48588 break;
48590 t1 = gen_reg_rtx (V8SImode);
48591 t2 = gen_reg_rtx (V8SImode);
48592 t3 = gen_reg_rtx (V4DImode);
48593 t4 = gen_reg_rtx (V4DImode);
48594 t5 = gen_reg_rtx (V4DImode);
48596 /* Shuffle the lanes around into
48597 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
48598 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
48599 gen_lowpart (V4DImode, d->op1),
48600 GEN_INT (0x20)));
48601 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
48602 gen_lowpart (V4DImode, d->op1),
48603 GEN_INT (0x31)));
48605 /* Swap the 2nd and 3rd position in each lane into
48606 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
48607 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
48608 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48609 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
48610 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48612 /* Now a vpunpck[lh]qdq will produce
48613 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
48614 if (odd)
48615 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
48616 gen_lowpart (V4DImode, t2));
48617 else
48618 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
48619 gen_lowpart (V4DImode, t2));
48620 emit_insn (t3);
48621 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
48622 break;
48624 default:
48625 gcc_unreachable ();
48628 return true;
48631 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48632 extract-even and extract-odd permutations. */
48634 static bool
48635 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
48637 unsigned i, odd, nelt = d->nelt;
48639 odd = d->perm[0];
48640 if (odd != 0 && odd != 1)
48641 return false;
48643 for (i = 1; i < nelt; ++i)
48644 if (d->perm[i] != 2 * i + odd)
48645 return false;
48647 return expand_vec_perm_even_odd_1 (d, odd);
48650 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
48651 permutations. We assume that expand_vec_perm_1 has already failed. */
48653 static bool
48654 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
48656 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
48657 machine_mode vmode = d->vmode;
48658 unsigned char perm2[4];
48659 rtx op0 = d->op0, dest;
48660 bool ok;
48662 switch (vmode)
48664 case E_V4DFmode:
48665 case E_V8SFmode:
48666 /* These are special-cased in sse.md so that we can optionally
48667 use the vbroadcast instruction. They expand to two insns
48668 if the input happens to be in a register. */
48669 gcc_unreachable ();
48671 case E_V2DFmode:
48672 case E_V2DImode:
48673 case E_V4SFmode:
48674 case E_V4SImode:
48675 /* These are always implementable using standard shuffle patterns. */
48676 gcc_unreachable ();
48678 case E_V8HImode:
48679 case E_V16QImode:
48680 /* These can be implemented via interleave. We save one insn by
48681 stopping once we have promoted to V4SImode and then use pshufd. */
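/* E.g. to broadcast element 5 of a V8HImode vector, one interleave-high
   produces { 4 4 5 5 6 6 7 7 }; viewed as V4SImode the wanted pair is
   element 1, which the final pshufd then replicates. */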
48682 if (d->testing_p)
48683 return true;
48686 rtx dest;
48687 rtx (*gen) (rtx, rtx, rtx)
48688 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
48689 : gen_vec_interleave_lowv8hi;
48691 if (elt >= nelt2)
48693 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
48694 : gen_vec_interleave_highv8hi;
48695 elt -= nelt2;
48697 nelt2 /= 2;
48699 dest = gen_reg_rtx (vmode);
48700 emit_insn (gen (dest, op0, op0));
48701 vmode = get_mode_wider_vector (vmode);
48702 op0 = gen_lowpart (vmode, dest);
48704 while (vmode != V4SImode);
48706 memset (perm2, elt, 4);
48707 dest = gen_reg_rtx (V4SImode);
48708 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
48709 gcc_assert (ok);
48710 if (!d->testing_p)
48711 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
48712 return true;
48714 case E_V64QImode:
48715 case E_V32QImode:
48716 case E_V16HImode:
48717 case E_V8SImode:
48718 case E_V4DImode:
48719 /* For AVX2 broadcasts of the first element vpbroadcast* or
48720 vpermq should be used by expand_vec_perm_1. */
48721 gcc_assert (!TARGET_AVX2 || d->perm[0]);
48722 return false;
48724 default:
48725 gcc_unreachable ();
48729 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48730 broadcast permutations. */
48732 static bool
48733 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
48735 unsigned i, elt, nelt = d->nelt;
48737 if (!d->one_operand_p)
48738 return false;
48740 elt = d->perm[0];
48741 for (i = 1; i < nelt; ++i)
48742 if (d->perm[i] != elt)
48743 return false;
48745 return expand_vec_perm_broadcast_1 (d);
48748 /* Implement arbitrary permutations of two V64QImode operands
48749 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
48750 static bool
48751 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
48753 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
48754 return false;
48756 if (d->testing_p)
48757 return true;
48759 struct expand_vec_perm_d ds[2];
48760 rtx rperm[128], vperm, target0, target1;
48761 unsigned int i, nelt;
48762 machine_mode vmode;
48764 nelt = d->nelt;
48765 vmode = V64QImode;
48767 for (i = 0; i < 2; i++)
48769 ds[i] = *d;
48770 ds[i].vmode = V32HImode;
48771 ds[i].nelt = 32;
48772 ds[i].target = gen_reg_rtx (V32HImode);
48773 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
48774 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
48777 /* Prepare permutations such that the first one takes care of
48778 putting the even bytes into the right positions or one position
48779 higher (ds[0]) and the second one takes care of
48780 putting the odd bytes into the right positions or one position
48781 lower (ds[1]). */
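/* The byte masks built below then pick, for each result byte, the correct
   half of the word that the preceding vperm[it]2w placed there: TARGET0
   keeps the even result bytes (the odd ones are zeroed via -1), TARGET1
   the odd ones, and the final vpor merges the two. */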
48783 for (i = 0; i < nelt; i++)
48785 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
48786 if (i & 1)
48788 rperm[i] = constm1_rtx;
48789 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48791 else
48793 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48794 rperm[i + 64] = constm1_rtx;
48798 bool ok = expand_vec_perm_1 (&ds[0]);
48799 gcc_assert (ok);
48800 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
48802 ok = expand_vec_perm_1 (&ds[1]);
48803 gcc_assert (ok);
48804 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
48806 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
48807 vperm = force_reg (vmode, vperm);
48808 target0 = gen_reg_rtx (V64QImode);
48809 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
48811 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
48812 vperm = force_reg (vmode, vperm);
48813 target1 = gen_reg_rtx (V64QImode);
48814 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
48816 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
48817 return true;
48820 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
48821 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
48822 all the shorter instruction sequences. */
48824 static bool
48825 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
48827 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
48828 unsigned int i, nelt, eltsz;
48829 bool used[4];
48831 if (!TARGET_AVX2
48832 || d->one_operand_p
48833 || (d->vmode != V32QImode && d->vmode != V16HImode))
48834 return false;
48836 if (d->testing_p)
48837 return true;
48839 nelt = d->nelt;
48840 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48842 /* Generate 4 permutation masks. If the required element is within
48843 the same lane, it is shuffled in. If the required element is in the
48844 other lane, force a zero by setting bit 7 in the permutation mask.
48845 The other mask has non-negative elements where the element is
48846 requested from the other lane, but it is also placed in the other lane,
48847 so that the result of vpshufb can have the two V2TImode halves
48848 swapped. */
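/* Masks 0 and 1 are applied to d->op0 (same-lane and cross-lane bytes
   respectively), masks 2 and 3 to d->op1. */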
48849 m128 = GEN_INT (-128);
48850 for (i = 0; i < 32; ++i)
48852 rperm[0][i] = m128;
48853 rperm[1][i] = m128;
48854 rperm[2][i] = m128;
48855 rperm[3][i] = m128;
48857 used[0] = false;
48858 used[1] = false;
48859 used[2] = false;
48860 used[3] = false;
48861 for (i = 0; i < nelt; ++i)
48863 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48864 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48865 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
48867 for (j = 0; j < eltsz; ++j)
48868 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
48869 used[which] = true;
48872 for (i = 0; i < 2; ++i)
48874 if (!used[2 * i + 1])
48876 h[i] = NULL_RTX;
48877 continue;
48879 vperm = gen_rtx_CONST_VECTOR (V32QImode,
48880 gen_rtvec_v (32, rperm[2 * i + 1]));
48881 vperm = force_reg (V32QImode, vperm);
48882 h[i] = gen_reg_rtx (V32QImode);
48883 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48884 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
48887 /* Swap the 128-bit lanes of h[X]. */
48888 for (i = 0; i < 2; ++i)
48890 if (h[i] == NULL_RTX)
48891 continue;
48892 op = gen_reg_rtx (V4DImode);
48893 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
48894 const2_rtx, GEN_INT (3), const0_rtx,
48895 const1_rtx));
48896 h[i] = gen_lowpart (V32QImode, op);
48899 for (i = 0; i < 2; ++i)
48901 if (!used[2 * i])
48903 l[i] = NULL_RTX;
48904 continue;
48906 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
48907 vperm = force_reg (V32QImode, vperm);
48908 l[i] = gen_reg_rtx (V32QImode);
48909 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48910 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
48913 for (i = 0; i < 2; ++i)
48915 if (h[i] && l[i])
48917 op = gen_reg_rtx (V32QImode);
48918 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
48919 l[i] = op;
48921 else if (h[i])
48922 l[i] = h[i];
48925 gcc_assert (l[0] && l[1]);
48926 op = d->target;
48927 if (d->vmode != V32QImode)
48928 op = gen_reg_rtx (V32QImode);
48929 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
48930 if (op != d->target)
48931 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48932 return true;
48935 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
48936 taken care of, perform the expansion in D and return true on success. */
48938 static bool
48939 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
48941 /* Try a single instruction expansion. */
48942 if (expand_vec_perm_1 (d))
48943 return true;
48945 /* Try sequences of two instructions. */
48947 if (expand_vec_perm_pshuflw_pshufhw (d))
48948 return true;
48950 if (expand_vec_perm_palignr (d, false))
48951 return true;
48953 if (expand_vec_perm_interleave2 (d))
48954 return true;
48956 if (expand_vec_perm_broadcast (d))
48957 return true;
48959 if (expand_vec_perm_vpermq_perm_1 (d))
48960 return true;
48962 if (expand_vec_perm_vperm2f128 (d))
48963 return true;
48965 if (expand_vec_perm_pblendv (d))
48966 return true;
48968 /* Try sequences of three instructions. */
48970 if (expand_vec_perm_even_odd_pack (d))
48971 return true;
48973 if (expand_vec_perm_2vperm2f128_vshuf (d))
48974 return true;
48976 if (expand_vec_perm_pshufb2 (d))
48977 return true;
48979 if (expand_vec_perm_interleave3 (d))
48980 return true;
48982 if (expand_vec_perm_vperm2f128_vblend (d))
48983 return true;
48985 /* Try sequences of four instructions. */
48987 if (expand_vec_perm_even_odd_trunc (d))
48988 return true;
48989 if (expand_vec_perm_vpshufb2_vpermq (d))
48990 return true;
48992 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
48993 return true;
48995 if (expand_vec_perm_vpermt2_vpshub2 (d))
48996 return true;
48998 /* ??? Look for narrow permutations whose element orderings would
48999 allow the promotion to a wider mode. */
49001 /* ??? Look for sequences of interleave or a wider permute that place
49002 the data into the correct lanes for a half-vector shuffle like
49003 pshuf[lh]w or vpermilps. */
49005 /* ??? Look for sequences of interleave that produce the desired results.
49006 The combinatorics of punpck[lh] get pretty ugly... */
49008 if (expand_vec_perm_even_odd (d))
49009 return true;
49011 /* Even longer sequences. */
49012 if (expand_vec_perm_vpshufb4_vpermq2 (d))
49013 return true;
49015 /* See if we can get the same permutation in different vector integer
49016 mode. */
49017 struct expand_vec_perm_d nd;
49018 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
49020 if (!d->testing_p)
49021 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
49022 return true;
49025 return false;
49028 /* If a permutation only uses one operand, make it clear. Returns true
49029 if the permutation references both operands. */
49031 static bool
49032 canonicalize_perm (struct expand_vec_perm_d *d)
49034 int i, which, nelt = d->nelt;
49036 for (i = which = 0; i < nelt; ++i)
49037 which |= (d->perm[i] < nelt ? 1 : 2);
49039 d->one_operand_p = true;
49040 switch (which)
49042 default:
49043 gcc_unreachable();
49045 case 3:
49046 if (!rtx_equal_p (d->op0, d->op1))
49048 d->one_operand_p = false;
49049 break;
49051 /* The elements of PERM do not suggest that only the first operand
49052 is used, but both operands are identical. Allow easier matching
49053 of the permutation by folding the permutation into the single
49054 input vector. */
49055 /* FALLTHRU */
49057 case 2:
49058 for (i = 0; i < nelt; ++i)
49059 d->perm[i] &= nelt - 1;
49060 d->op0 = d->op1;
49061 break;
49063 case 1:
49064 d->op1 = d->op0;
49065 break;
49068 return (which == 3);
49071 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
49073 static bool
49074 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
49075 rtx op1, const vec_perm_indices &sel)
49077 struct expand_vec_perm_d d;
49078 unsigned char perm[MAX_VECT_LEN];
49079 unsigned int i, nelt, which;
49080 bool two_args;
49082 d.target = target;
49083 d.op0 = op0;
49084 d.op1 = op1;
49086 d.vmode = vmode;
49087 gcc_assert (VECTOR_MODE_P (d.vmode));
49088 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49089 d.testing_p = !target;
49091 gcc_assert (sel.length () == nelt);
49092 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
49094 /* Given sufficient ISA support we can just return true here
49095 for selected vector modes. */
49096 switch (d.vmode)
49098 case E_V16SFmode:
49099 case E_V16SImode:
49100 case E_V8DImode:
49101 case E_V8DFmode:
49102 if (!TARGET_AVX512F)
49103 return false;
49104 /* All implementable with a single vperm[it]2 insn. */
49105 if (d.testing_p)
49106 return true;
49107 break;
49108 case E_V32HImode:
49109 if (!TARGET_AVX512BW)
49110 return false;
49111 if (d.testing_p)
49112 /* All implementable with a single vperm[it]2 insn. */
49113 return true;
49114 break;
49115 case E_V64QImode:
49116 if (!TARGET_AVX512BW)
49117 return false;
49118 if (d.testing_p)
49119 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
49120 return true;
49121 break;
49122 case E_V8SImode:
49123 case E_V8SFmode:
49124 case E_V4DFmode:
49125 case E_V4DImode:
49126 if (!TARGET_AVX)
49127 return false;
49128 if (d.testing_p && TARGET_AVX512VL)
49129 /* All implementable with a single vperm[it]2 insn. */
49130 return true;
49131 break;
49132 case E_V16HImode:
49133 if (!TARGET_SSE2)
49134 return false;
49135 if (d.testing_p && TARGET_AVX2)
49136 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49137 return true;
49138 break;
49139 case E_V32QImode:
49140 if (!TARGET_SSE2)
49141 return false;
49142 if (d.testing_p && TARGET_AVX2)
49143 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49144 return true;
49145 break;
49146 case E_V8HImode:
49147 case E_V16QImode:
49148 if (!TARGET_SSE2)
49149 return false;
49150 /* Fall through. */
49151 case E_V4SImode:
49152 case E_V4SFmode:
49153 if (!TARGET_SSE)
49154 return false;
49155 /* All implementable with a single vpperm insn. */
49156 if (d.testing_p && TARGET_XOP)
49157 return true;
49158 /* All implementable with 2 pshufb + 1 ior. */
49159 if (d.testing_p && TARGET_SSSE3)
49160 return true;
49161 break;
49162 case E_V2DImode:
49163 case E_V2DFmode:
49164 if (!TARGET_SSE)
49165 return false;
49166 /* All implementable with shufpd or unpck[lh]pd. */
49167 if (d.testing_p)
49168 return true;
49169 break;
49170 default:
49171 return false;
49174 for (i = which = 0; i < nelt; ++i)
49176 unsigned char e = sel[i];
49177 gcc_assert (e < 2 * nelt);
49178 d.perm[i] = e;
49179 perm[i] = e;
49180 which |= (e < nelt ? 1 : 2);
49183 if (d.testing_p)
49185 /* For all elements from second vector, fold the elements to first. */
49186 if (which == 2)
49187 for (i = 0; i < nelt; ++i)
49188 d.perm[i] -= nelt;
49190 /* Check whether the mask can be applied to the vector type. */
49191 d.one_operand_p = (which != 3);
49193 /* Implementable with shufps or pshufd. */
49194 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
49195 return true;
49197 /* Otherwise we have to go through the motions and see if we can
49198 figure out how to generate the requested permutation. */
49199 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
49200 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
49201 if (!d.one_operand_p)
49202 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
49204 start_sequence ();
49205 bool ret = ix86_expand_vec_perm_const_1 (&d);
49206 end_sequence ();
49208 return ret;
49211 two_args = canonicalize_perm (&d);
49213 if (ix86_expand_vec_perm_const_1 (&d))
49214 return true;
49216 /* If the selector says both arguments are needed, but the operands are the
49217 same, the above tried to expand with one_operand_p and flattened selector.
49218 If that didn't work, retry without one_operand_p; we succeeded with that
49219 during testing. */
49220 if (two_args && d.one_operand_p)
49222 d.one_operand_p = false;
49223 memcpy (d.perm, perm, sizeof (perm));
49224 return ix86_expand_vec_perm_const_1 (&d);
49227 return false;
49230 void
49231 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
49233 struct expand_vec_perm_d d;
49234 unsigned i, nelt;
49236 d.target = targ;
49237 d.op0 = op0;
49238 d.op1 = op1;
49239 d.vmode = GET_MODE (targ);
49240 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49241 d.one_operand_p = false;
49242 d.testing_p = false;
49244 for (i = 0; i < nelt; ++i)
49245 d.perm[i] = i * 2 + odd;
49247 /* We'll either be able to implement the permutation directly... */
49248 if (expand_vec_perm_1 (&d))
49249 return;
49251 /* ... or we use the special-case patterns. */
49252 expand_vec_perm_even_odd_1 (&d, odd);
49255 static void
49256 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
49258 struct expand_vec_perm_d d;
49259 unsigned i, nelt, base;
49260 bool ok;
49262 d.target = targ;
49263 d.op0 = op0;
49264 d.op1 = op1;
49265 d.vmode = GET_MODE (targ);
49266 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49267 d.one_operand_p = false;
49268 d.testing_p = false;
49270 base = high_p ? nelt / 2 : 0;
49271 for (i = 0; i < nelt / 2; ++i)
49273 d.perm[i * 2] = i + base;
49274 d.perm[i * 2 + 1] = i + base + nelt;
49277 /* Note that for AVX this isn't one instruction. */
49278 ok = ix86_expand_vec_perm_const_1 (&d);
49279 gcc_assert (ok);
49283 /* Expand a vector operation CODE for a V*QImode in terms of the
49284 same operation on V*HImode. */
49286 void
49287 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
49289 machine_mode qimode = GET_MODE (dest);
49290 machine_mode himode;
49291 rtx (*gen_il) (rtx, rtx, rtx);
49292 rtx (*gen_ih) (rtx, rtx, rtx);
49293 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
49294 struct expand_vec_perm_d d;
49295 bool ok, full_interleave;
49296 bool uns_p = false;
49297 int i;
49299 switch (qimode)
49301 case E_V16QImode:
49302 himode = V8HImode;
49303 gen_il = gen_vec_interleave_lowv16qi;
49304 gen_ih = gen_vec_interleave_highv16qi;
49305 break;
49306 case E_V32QImode:
49307 himode = V16HImode;
49308 gen_il = gen_avx2_interleave_lowv32qi;
49309 gen_ih = gen_avx2_interleave_highv32qi;
49310 break;
49311 case E_V64QImode:
49312 himode = V32HImode;
49313 gen_il = gen_avx512bw_interleave_lowv64qi;
49314 gen_ih = gen_avx512bw_interleave_highv64qi;
49315 break;
49316 default:
49317 gcc_unreachable ();
49320 op2_l = op2_h = op2;
49321 switch (code)
49323 case MULT:
49324 /* Unpack data such that we've got a source byte in each low byte of
49325 each word. We don't care what goes into the high byte of each word.
49326 Rather than trying to get zero in there, it is most convenient to let
49327 it be a copy of the low byte. */
49328 op2_l = gen_reg_rtx (qimode);
49329 op2_h = gen_reg_rtx (qimode);
49330 emit_insn (gen_il (op2_l, op2, op2));
49331 emit_insn (gen_ih (op2_h, op2, op2));
49333 op1_l = gen_reg_rtx (qimode);
49334 op1_h = gen_reg_rtx (qimode);
49335 emit_insn (gen_il (op1_l, op1, op1));
49336 emit_insn (gen_ih (op1_h, op1, op1));
49337 full_interleave = qimode == V16QImode;
49338 break;
49340 case ASHIFT:
49341 case LSHIFTRT:
49342 uns_p = true;
49343 /* FALLTHRU */
49344 case ASHIFTRT:
49345 op1_l = gen_reg_rtx (himode);
49346 op1_h = gen_reg_rtx (himode);
49347 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
49348 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
49349 full_interleave = true;
49350 break;
49351 default:
49352 gcc_unreachable ();
49355 /* Perform the operation. */
49356 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
49357 1, OPTAB_DIRECT);
49358 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
49359 1, OPTAB_DIRECT);
49360 gcc_assert (res_l && res_h);
49362 /* Merge the data back into the right place. */
49363 d.target = dest;
49364 d.op0 = gen_lowpart (qimode, res_l);
49365 d.op1 = gen_lowpart (qimode, res_h);
49366 d.vmode = qimode;
49367 d.nelt = GET_MODE_NUNITS (qimode);
49368 d.one_operand_p = false;
49369 d.testing_p = false;
49371 if (full_interleave)
49373 /* For SSE2, we used a full interleave, so the desired
49374 results are in the even elements. */
49375 for (i = 0; i < d.nelt; ++i)
49376 d.perm[i] = i * 2;
49378 else
49380 /* For AVX, the interleave used above was not cross-lane. So the
49381 extraction picks the even elements, but with the second and third quarters swapped.
49382 Happily, that is even one insn shorter than even extraction.
49383 For AVX512BW we have 4 lanes. We extract evens from within a lane,
49384 always first from the first and then from the second source operand,
49385 the index bits above the low 4 bits remain the same.
49386 Thus, for d.nelt == 32 we want permutation
49387 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
49388 and for d.nelt == 64 we want permutation
49389 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
49390 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
49391 for (i = 0; i < d.nelt; ++i)
49392 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
49395 ok = ix86_expand_vec_perm_const_1 (&d);
49396 gcc_assert (ok);
49398 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49399 gen_rtx_fmt_ee (code, qimode, op1, op2));
49402 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
49403 if op is CONST_VECTOR with all odd elements equal to their
49404 preceding element. */
49406 static bool
49407 const_vector_equal_evenodd_p (rtx op)
49409 machine_mode mode = GET_MODE (op);
49410 int i, nunits = GET_MODE_NUNITS (mode);
49411 if (GET_CODE (op) != CONST_VECTOR
49412 || nunits != CONST_VECTOR_NUNITS (op))
49413 return false;
49414 for (i = 0; i < nunits; i += 2)
49415 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
49416 return false;
49417 return true;
49420 void
49421 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
49422 bool uns_p, bool odd_p)
49424 machine_mode mode = GET_MODE (op1);
49425 machine_mode wmode = GET_MODE (dest);
49426 rtx x;
49427 rtx orig_op1 = op1, orig_op2 = op2;
49429 if (!nonimmediate_operand (op1, mode))
49430 op1 = force_reg (mode, op1);
49431 if (!nonimmediate_operand (op2, mode))
49432 op2 = force_reg (mode, op2);
49434 /* We only play even/odd games with vectors of SImode. */
49435 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
49437 /* If we're looking for the odd results, shift those members down to
49438 the even slots. For some cpus this is faster than a PSHUFD. */
49439 if (odd_p)
49441 /* For XOP use vpmacsdqh, but only for smult, as it is only
49442 signed. */
49443 if (TARGET_XOP && mode == V4SImode && !uns_p)
49445 x = force_reg (wmode, CONST0_RTX (wmode));
49446 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
49447 return;
49450 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
49451 if (!const_vector_equal_evenodd_p (orig_op1))
49452 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
49453 x, NULL, 1, OPTAB_DIRECT);
49454 if (!const_vector_equal_evenodd_p (orig_op2))
49455 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
49456 x, NULL, 1, OPTAB_DIRECT);
49457 op1 = gen_lowpart (mode, op1);
49458 op2 = gen_lowpart (mode, op2);
49461 if (mode == V16SImode)
49463 if (uns_p)
49464 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
49465 else
49466 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
49468 else if (mode == V8SImode)
49470 if (uns_p)
49471 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
49472 else
49473 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
49475 else if (uns_p)
49476 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
49477 else if (TARGET_SSE4_1)
49478 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
49479 else
49481 rtx s1, s2, t0, t1, t2;
49483 /* The easiest way to implement this without PMULDQ is to go through
49484 the motions as if we are performing a full 64-bit multiply, with
49485 the exception that we need to do less shuffling of the elements. */
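/* With H(X) being 0 or 0xffffffff according to the sign of the SImode
   element X, the widened product is LO(A)*LO(B)
   + ((H(A)*LO(B) + H(B)*LO(A)) << 32), taken modulo 2^64, which is what
   the sequence below computes. */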
49487 /* Compute the sign-extension, aka highparts, of the two operands. */
49488 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49489 op1, pc_rtx, pc_rtx);
49490 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49491 op2, pc_rtx, pc_rtx);
49493 /* Multiply LO(A) * HI(B), and vice-versa. */
49494 t1 = gen_reg_rtx (wmode);
49495 t2 = gen_reg_rtx (wmode);
49496 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49497 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49499 /* Multiply LO(A) * LO(B). */
49500 t0 = gen_reg_rtx (wmode);
49501 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49503 /* Combine and shift the highparts into place. */
49504 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49505 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49506 1, OPTAB_DIRECT);
49508 /* Combine high and low parts. */
49509 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49510 return;
49512 emit_insn (x);
49515 void
49516 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49517 bool uns_p, bool high_p)
49519 machine_mode wmode = GET_MODE (dest);
49520 machine_mode mode = GET_MODE (op1);
49521 rtx t1, t2, t3, t4, mask;
49523 switch (mode)
49525 case E_V4SImode:
49526 t1 = gen_reg_rtx (mode);
49527 t2 = gen_reg_rtx (mode);
49528 if (TARGET_XOP && !uns_p)
49530 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
49531 shuffle the elements once so that all elements are in the right
49532 place for immediate use: { A C B D }. */
49533 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49534 const1_rtx, GEN_INT (3)));
49535 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49536 const1_rtx, GEN_INT (3)));
49538 else
49540 /* Put the elements into place for the multiply. */
49541 ix86_expand_vec_interleave (t1, op1, op1, high_p);
49542 ix86_expand_vec_interleave (t2, op2, op2, high_p);
49543 high_p = false;
49545 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49546 break;
49548 case E_V8SImode:
49549 /* Shuffle the elements between the lanes. After this we
49550 have { A B E F | C D G H } for each operand. */
49551 t1 = gen_reg_rtx (V4DImode);
49552 t2 = gen_reg_rtx (V4DImode);
49553 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49554 const0_rtx, const2_rtx,
49555 const1_rtx, GEN_INT (3)));
49556 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49557 const0_rtx, const2_rtx,
49558 const1_rtx, GEN_INT (3)));
49560 /* Shuffle the elements within the lanes. After this we
49561 have { A A B B | C C D D } or { E E F F | G G H H }. */
49562 t3 = gen_reg_rtx (V8SImode);
49563 t4 = gen_reg_rtx (V8SImode);
49564 mask = GEN_INT (high_p
49565 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49566 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
49567 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49568 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49570 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49571 break;
49573 case E_V8HImode:
49574 case E_V16HImode:
49575 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49576 uns_p, OPTAB_DIRECT);
49577 t2 = expand_binop (mode,
49578 uns_p ? umul_highpart_optab : smul_highpart_optab,
49579 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49580 gcc_assert (t1 && t2);
49582 t3 = gen_reg_rtx (mode);
49583 ix86_expand_vec_interleave (t3, t1, t2, high_p);
49584 emit_move_insn (dest, gen_lowpart (wmode, t3));
49585 break;
49587 case E_V16QImode:
49588 case E_V32QImode:
49589 case E_V32HImode:
49590 case E_V16SImode:
49591 case E_V64QImode:
49592 t1 = gen_reg_rtx (wmode);
49593 t2 = gen_reg_rtx (wmode);
49594 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
49595 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
49597 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
49598 break;
49600 default:
49601 gcc_unreachable ();
49605 void
49606 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
49608 rtx res_1, res_2, res_3, res_4;
49610 res_1 = gen_reg_rtx (V4SImode);
49611 res_2 = gen_reg_rtx (V4SImode);
49612 res_3 = gen_reg_rtx (V2DImode);
49613 res_4 = gen_reg_rtx (V2DImode);
49614 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
49615 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
49617 /* Move the results in element 2 down to element 1; we don't care
49618 what goes in elements 2 and 3. Then we can merge the parts
49619 back together with an interleave.
49621 Note that two other sequences were tried:
49622 (1) Use interleaves at the start instead of psrldq, which allows
49623 us to use a single shufps to merge things back at the end.
49624 (2) Use shufps here to combine the two vectors, then pshufd to
49625 put the elements in the correct order.
49626 In both cases the cost of the reformatting stall was too high
49627 and the overall sequence slower. */
49629 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
49630 const0_rtx, const2_rtx,
49631 const0_rtx, const0_rtx));
49632 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
49633 const0_rtx, const2_rtx,
49634 const0_rtx, const0_rtx));
49635 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
49637 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
49640 void
49641 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
49643 machine_mode mode = GET_MODE (op0);
49644 rtx t1, t2, t3, t4, t5, t6;
49646 if (TARGET_AVX512DQ && mode == V8DImode)
49647 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
49648 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
49649 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
49650 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
49651 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
49652 else if (TARGET_XOP && mode == V2DImode)
49654 /* op1: A,B,C,D, op2: E,F,G,H */
49655 op1 = gen_lowpart (V4SImode, op1);
49656 op2 = gen_lowpart (V4SImode, op2);
49658 t1 = gen_reg_rtx (V4SImode);
49659 t2 = gen_reg_rtx (V4SImode);
49660 t3 = gen_reg_rtx (V2DImode);
49661 t4 = gen_reg_rtx (V2DImode);
49663 /* t1: B,A,D,C */
49664 emit_insn (gen_sse2_pshufd_1 (t1, op1,
49665 GEN_INT (1),
49666 GEN_INT (0),
49667 GEN_INT (3),
49668 GEN_INT (2)));
49670 /* t2: (B*E),(A*F),(D*G),(C*H) */
49671 emit_insn (gen_mulv4si3 (t2, t1, op2));
49673 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
49674 emit_insn (gen_xop_phadddq (t3, t2));
49676 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
49677 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
49679 /* Multiply lower parts and add all. */
49680 t5 = gen_reg_rtx (V2DImode);
49681 emit_insn (gen_vec_widen_umult_even_v4si (t5,
49682 gen_lowpart (V4SImode, op1),
49683 gen_lowpart (V4SImode, op2)));
49684 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
49687 else
49689 machine_mode nmode;
49690 rtx (*umul) (rtx, rtx, rtx);
49692 if (mode == V2DImode)
49694 umul = gen_vec_widen_umult_even_v4si;
49695 nmode = V4SImode;
49697 else if (mode == V4DImode)
49699 umul = gen_vec_widen_umult_even_v8si;
49700 nmode = V8SImode;
49702 else if (mode == V8DImode)
49704 umul = gen_vec_widen_umult_even_v16si;
49705 nmode = V16SImode;
49707 else
49708 gcc_unreachable ();
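/* Compute A*B as LO(A)*LO(B) + ((HI(A)*LO(B) + HI(B)*LO(A)) << 32),
   modulo 2^64, using only widening unsigned multiplies of the even
   32-bit halves. */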
49711 /* Multiply low parts. */
49712 t1 = gen_reg_rtx (mode);
49713 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
49715 /* Shift input vectors right 32 bits so we can multiply high parts. */
49716 t6 = GEN_INT (32);
49717 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
49718 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
49720 /* Multiply high parts by low parts. */
49721 t4 = gen_reg_rtx (mode);
49722 t5 = gen_reg_rtx (mode);
49723 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
49724 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
49726 /* Combine and shift the highparts back. */
49727 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
49728 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
49730 /* Combine high and low parts. */
49731 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
49734 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49735 gen_rtx_MULT (mode, op1, op2));
49738 /* Return 1 if control transfer instruction INSN
49739 should be encoded with the bnd prefix.
49740 If INSN is NULL then return 1 when control
49741 transfer instructions should be prefixed with
49742 bnd by default for the current function. */
49744 bool
49745 ix86_bnd_prefixed_insn_p (rtx insn)
49747 /* For call insns check special flag. */
49748 if (insn && CALL_P (insn))
49750 rtx call = get_call_rtx_from (insn);
49751 if (call)
49752 return CALL_EXPR_WITH_BOUNDS_P (call);
49755 /* All other insns are prefixed only if function is instrumented. */
49756 return chkp_function_instrumented_p (current_function_decl);
49759 /* Return 1 if control transfer instruction INSN
49760 should be encoded with the notrack prefix. */
49762 static bool
49763 ix86_notrack_prefixed_insn_p (rtx insn)
49765 if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
49766 return false;
49768 if (CALL_P (insn))
49770 rtx call = get_call_rtx_from (insn);
49771 gcc_assert (call != NULL_RTX);
49772 rtx addr = XEXP (call, 0);
49774 /* Do not emit 'notrack' if it's not an indirect call. */
49775 if (MEM_P (addr)
49776 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49777 return false;
49778 else
49779 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
49782 if (JUMP_P (insn) && !flag_cet_switch)
49784 rtx target = JUMP_LABEL (insn);
49785 if (target == NULL_RTX || ANY_RETURN_P (target))
49786 return false;
49788 /* Check the jump is a switch table. */
49789 rtx_insn *label = as_a<rtx_insn *> (target);
49790 rtx_insn *table = next_insn (label);
49791 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
49792 return false;
49793 else
49794 return true;
49796 return false;
49799 /* Calculate integer abs() using only SSE2 instructions. */
49801 void
49802 ix86_expand_sse2_abs (rtx target, rtx input)
49804 machine_mode mode = GET_MODE (target);
49805 rtx tmp0, tmp1, x;
49807 switch (mode)
49809 /* For 32-bit signed integer X, the best way to calculate the absolute
49810 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
49811 case E_V4SImode:
49812 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
49813 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
49814 NULL, 0, OPTAB_DIRECT);
49815 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
49816 NULL, 0, OPTAB_DIRECT);
49817 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
49818 target, 0, OPTAB_DIRECT);
49819 break;
49821 /* For 16-bit signed integer X, the best way to calculate the absolute
49822 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
49823 case E_V8HImode:
49824 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49826 x = expand_simple_binop (mode, SMAX, tmp0, input,
49827 target, 0, OPTAB_DIRECT);
49828 break;
49830 /* For 8-bit signed integer X, the best way to calculate the absolute
49831 value of X is min ((unsigned char) X, (unsigned char) (-X)),
49832 as SSE2 provides the PMINUB insn. */
49833 case E_V16QImode:
49834 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49836 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
49837 target, 0, OPTAB_DIRECT);
49838 break;
49840 default:
49841 gcc_unreachable ();
49844 if (x != target)
49845 emit_move_insn (target, x);
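/* Editorial illustration (not part of i386.c): the V4SImode branch above uses
   the classic branch-free abs identity.  With s = x >> 31 (arithmetic shift,
   so s is 0 or -1), abs (x) == (x ^ s) - s; e.g. x = -5 gives s = -1,
   x ^ s = 4 and 4 - (-1) = 5.  A scalar sketch (like abs, undefined for
   INT_MIN):

     int
     abs32 (int x)
     {
       int s = x >> 31;      /* 0 for non-negative x, -1 for negative x */
       return (x ^ s) - s;   /* conditional two's-complement negation */
     }
*/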
49848 /* Expand an extract from a vector register through pextr insn.
49849 Return true if successful. */
49851 bool
49852 ix86_expand_pextr (rtx *operands)
49854 rtx dst = operands[0];
49855 rtx src = operands[1];
49857 unsigned int size = INTVAL (operands[2]);
49858 unsigned int pos = INTVAL (operands[3]);
49860 if (SUBREG_P (dst))
49862 /* Reject non-lowpart subregs. */
49863 if (SUBREG_BYTE (dst) > 0)
49864 return false;
49865 dst = SUBREG_REG (dst);
49868 if (SUBREG_P (src))
49870 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
49871 src = SUBREG_REG (src);
49874 switch (GET_MODE (src))
49876 case E_V16QImode:
49877 case E_V8HImode:
49878 case E_V4SImode:
49879 case E_V2DImode:
49880 case E_V1TImode:
49881 case E_TImode:
49883 machine_mode srcmode, dstmode;
49884 rtx d, pat;
49886 if (!int_mode_for_size (size, 0).exists (&dstmode))
49887 return false;
49889 switch (dstmode)
49891 case E_QImode:
49892 if (!TARGET_SSE4_1)
49893 return false;
49894 srcmode = V16QImode;
49895 break;
49897 case E_HImode:
49898 if (!TARGET_SSE2)
49899 return false;
49900 srcmode = V8HImode;
49901 break;
49903 case E_SImode:
49904 if (!TARGET_SSE4_1)
49905 return false;
49906 srcmode = V4SImode;
49907 break;
49909 case E_DImode:
49910 gcc_assert (TARGET_64BIT);
49911 if (!TARGET_SSE4_1)
49912 return false;
49913 srcmode = V2DImode;
49914 break;
49916 default:
49917 return false;
49920 /* Reject extractions from misaligned positions. */
49921 if (pos & (size-1))
49922 return false;
49924 if (GET_MODE (dst) == dstmode)
49925 d = dst;
49926 else
49927 d = gen_reg_rtx (dstmode);
49929 /* Construct insn pattern. */
49930 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
49931 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
49933 /* Let the rtl optimizers know about the zero extension performed. */
49934 if (dstmode == QImode || dstmode == HImode)
49936 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
49937 d = gen_lowpart (SImode, d);
49940 emit_insn (gen_rtx_SET (d, pat));
49942 if (d != dst)
49943 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49944 return true;
49947 default:
49948 return false;
49952 /* Expand an insert into a vector register through pinsr insn.
49953 Return true if successful. */
49955 bool
49956 ix86_expand_pinsr (rtx *operands)
49958 rtx dst = operands[0];
49959 rtx src = operands[3];
49961 unsigned int size = INTVAL (operands[1]);
49962 unsigned int pos = INTVAL (operands[2]);
49964 if (SUBREG_P (dst))
49966 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
49967 dst = SUBREG_REG (dst);
49970 switch (GET_MODE (dst))
49972 case E_V16QImode:
49973 case E_V8HImode:
49974 case E_V4SImode:
49975 case E_V2DImode:
49976 case E_V1TImode:
49977 case E_TImode:
49979 machine_mode srcmode, dstmode;
49980 rtx (*pinsr)(rtx, rtx, rtx, rtx);
49981 rtx d;
49983 if (!int_mode_for_size (size, 0).exists (&srcmode))
49984 return false;
49986 switch (srcmode)
49988 case E_QImode:
49989 if (!TARGET_SSE4_1)
49990 return false;
49991 dstmode = V16QImode;
49992 pinsr = gen_sse4_1_pinsrb;
49993 break;
49995 case E_HImode:
49996 if (!TARGET_SSE2)
49997 return false;
49998 dstmode = V8HImode;
49999 pinsr = gen_sse2_pinsrw;
50000 break;
50002 case E_SImode:
50003 if (!TARGET_SSE4_1)
50004 return false;
50005 dstmode = V4SImode;
50006 pinsr = gen_sse4_1_pinsrd;
50007 break;
50009 case E_DImode:
50010 gcc_assert (TARGET_64BIT);
50011 if (!TARGET_SSE4_1)
50012 return false;
50013 dstmode = V2DImode;
50014 pinsr = gen_sse4_1_pinsrq;
50015 break;
50017 default:
50018 return false;
50021 /* Reject insertions to misaligned positions. */
50022 if (pos & (size-1))
50023 return false;
50025 if (SUBREG_P (src))
50027 unsigned int srcpos = SUBREG_BYTE (src);
50029 if (srcpos > 0)
50031 rtx extr_ops[4];
50033 extr_ops[0] = gen_reg_rtx (srcmode);
50034 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
50035 extr_ops[2] = GEN_INT (size);
50036 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
50038 if (!ix86_expand_pextr (extr_ops))
50039 return false;
50041 src = extr_ops[0];
50043 else
50044 src = gen_lowpart (srcmode, SUBREG_REG (src));
50047 if (GET_MODE (dst) == dstmode)
50048 d = dst;
50049 else
50050 d = gen_reg_rtx (dstmode);
50052 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
50053 gen_lowpart (srcmode, src),
50054 GEN_INT (1 << (pos / size))));
50055 if (d != dst)
50056 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50057 return true;
50060 default:
50061 return false;
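/* Editorial illustration (not part of i386.c): viewed on a scalar value, the
   pinsr expansion above implements "overwrite the SIZE-bit field at bit POS",
   with POS required to be SIZE-aligned; a rough 64-bit sketch:

     unsigned long long
     insert_field (unsigned long long dst, unsigned long long src,
                   unsigned int size, unsigned int pos)
     {
       unsigned long long mask
         = (size < 64 ? (1ULL << size) - 1 : ~0ULL) << pos;
       return (dst & ~mask) | ((src << pos) & mask);
     }

   The lane index passed to the pinsr pattern is simply pos / size.  */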
50065 /* This function returns the calling abi specific va_list type node.
50066 It returns the FNDECL specific va_list type. */
50068 static tree
50069 ix86_fn_abi_va_list (tree fndecl)
50071 if (!TARGET_64BIT)
50072 return va_list_type_node;
50073 gcc_assert (fndecl != NULL_TREE);
50075 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
50076 return ms_va_list_type_node;
50077 else
50078 return sysv_va_list_type_node;
50081 /* Returns the canonical va_list type specified by TYPE. If there
50082 is no valid TYPE provided, it returns NULL_TREE. */
50084 static tree
50085 ix86_canonical_va_list_type (tree type)
50087 if (TARGET_64BIT)
50089 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
50090 return ms_va_list_type_node;
50092 if ((TREE_CODE (type) == ARRAY_TYPE
50093 && integer_zerop (array_type_nelts (type)))
50094 || POINTER_TYPE_P (type))
50096 tree elem_type = TREE_TYPE (type);
50097 if (TREE_CODE (elem_type) == RECORD_TYPE
50098 && lookup_attribute ("sysv_abi va_list",
50099 TYPE_ATTRIBUTES (elem_type)))
50100 return sysv_va_list_type_node;
50103 return NULL_TREE;
50106 return std_canonical_va_list_type (type);
50109 /* Iterate through the target-specific builtin types for va_list.
50110 IDX denotes the iterator, *PTREE is set to the result type of
50111 the va_list builtin, and *PNAME to its internal type.
50112 Returns zero if there is no element for this index, otherwise
50113 IDX should be increased upon the next call.
50114 Note, do not iterate a base builtin's name like __builtin_va_list.
50115 Used from c_common_nodes_and_builtins. */
50117 static int
50118 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
50120 if (TARGET_64BIT)
50122 switch (idx)
50124 default:
50125 break;
50127 case 0:
50128 *ptree = ms_va_list_type_node;
50129 *pname = "__builtin_ms_va_list";
50130 return 1;
50132 case 1:
50133 *ptree = sysv_va_list_type_node;
50134 *pname = "__builtin_sysv_va_list";
50135 return 1;
50139 return 0;
50142 #undef TARGET_SCHED_DISPATCH
50143 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
50144 #undef TARGET_SCHED_DISPATCH_DO
50145 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
50146 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50147 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50148 #undef TARGET_SCHED_REORDER
50149 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
50150 #undef TARGET_SCHED_ADJUST_PRIORITY
50151 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50152 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50153 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50154 ix86_dependencies_evaluation_hook
50157 /* Implementation of reassociation_width target hook used by
50158 reassoc phase to identify parallelism level in reassociated
50159 tree. The statement's tree_code is passed in OPC. The arguments' type
50160 is passed in MODE. */
50162 static int
50163 ix86_reassociation_width (unsigned int op, machine_mode mode)
50165 int width = 1;
50166 /* Vector part. */
50167 if (VECTOR_MODE_P (mode))
50169 int div = 1;
50170 if (INTEGRAL_MODE_P (mode))
50171 width = ix86_cost->reassoc_vec_int;
50172 else if (FLOAT_MODE_P (mode))
50173 width = ix86_cost->reassoc_vec_fp;
50175 if (width == 1)
50176 return 1;
50178 /* Integer vector instructions execute in FP unit
50179 and can execute 3 additions and one multiplication per cycle. */
50180 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
50181 && op != PLUS && op != MINUS)
50182 return 1;
50184 /* Account for targets that split wide vectors into multiple parts. */
50185 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
50186 div = GET_MODE_BITSIZE (mode) / 128;
50187 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
50188 div = GET_MODE_BITSIZE (mode) / 64;
50189 width = (width + div - 1) / div;
50191 /* Scalar part. */
50192 else if (INTEGRAL_MODE_P (mode))
50193 width = ix86_cost->reassoc_int;
50194 else if (FLOAT_MODE_P (mode))
50195 width = ix86_cost->reassoc_fp;
50197 /* Avoid using too many registers in 32bit mode. */
50198 if (!TARGET_64BIT && width > 2)
50199 width = 2;
50200 return width;
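/* Editorial note (not part of i386.c): as a worked example, on a hypothetical
   tune whose reassoc_vec_int is 4 and which has TARGET_AVX128_OPTIMAL set, a
   256-bit integer vector mode is split into two 128-bit halves, so div is 2
   and the resulting width is (4 + 2 - 1) / 2 == 2 independent chains.  */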
50203 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
50204 place emms and femms instructions. */
50206 static machine_mode
50207 ix86_preferred_simd_mode (scalar_mode mode)
50209 if (!TARGET_SSE)
50210 return word_mode;
50212 switch (mode)
50214 case E_QImode:
50215 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
50216 return V64QImode;
50217 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50218 return V32QImode;
50219 else
50220 return V16QImode;
50222 case E_HImode:
50223 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
50224 return V32HImode;
50225 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50226 return V16HImode;
50227 else
50228 return V8HImode;
50230 case E_SImode:
50231 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50232 return V16SImode;
50233 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50234 return V8SImode;
50235 else
50236 return V4SImode;
50238 case E_DImode:
50239 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50240 return V8DImode;
50241 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50242 return V4DImode;
50243 else
50244 return V2DImode;
50246 case E_SFmode:
50247 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50248 return V16SFmode;
50249 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50250 return V8SFmode;
50251 else
50252 return V4SFmode;
50254 case E_DFmode:
50255 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50256 return V8DFmode;
50257 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50258 return V4DFmode;
50259 else if (TARGET_SSE2)
50260 return V2DFmode;
50261 /* FALLTHRU */
50263 default:
50264 return word_mode;
50268 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
50269 of the upper against the lower halves down to SSE register size. */
50271 static machine_mode
50272 ix86_split_reduction (machine_mode mode)
50274 /* Reduce lowpart against highpart until we reach SSE reg width to
50275 avoid cross-lane operations. */
50276 switch (mode)
50278 case E_V8DImode:
50279 case E_V4DImode:
50280 return V2DImode;
50281 case E_V16SImode:
50282 case E_V8SImode:
50283 return V4SImode;
50284 case E_V32HImode:
50285 case E_V16HImode:
50286 return V8HImode;
50287 case E_V64QImode:
50288 case E_V32QImode:
50289 return V16QImode;
50290 case E_V16SFmode:
50291 case E_V8SFmode:
50292 return V4SFmode;
50293 case E_V8DFmode:
50294 case E_V4DFmode:
50295 return V2DFmode;
50296 default:
50297 return mode;
50301 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
50302 vectors. If AVX512F is enabled then try vectorizing with 512bit,
50303 256bit and 128bit vectors. */
50305 static void
50306 ix86_autovectorize_vector_sizes (vector_sizes *sizes)
50308 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50310 sizes->safe_push (64);
50311 sizes->safe_push (32);
50312 sizes->safe_push (16);
50314 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50316 sizes->safe_push (32);
50317 sizes->safe_push (16);
50321 /* Implementation of targetm.vectorize.get_mask_mode. */
50323 static opt_machine_mode
50324 ix86_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size)
50326 unsigned elem_size = vector_size / nunits;
50328 /* Scalar mask case. */
50329 if ((TARGET_AVX512F && vector_size == 64)
50330 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
50332 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
50333 return smallest_int_mode_for_size (nunits);
50336 scalar_int_mode elem_mode
50337 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
50339 gcc_assert (elem_size * nunits == vector_size);
50341 return mode_for_vector (elem_mode, nunits);
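/* Editorial note (not part of i386.c): as a worked example, with AVX-512F and
   a 64-byte V16SF vector (nunits == 16, elem_size == 4) the scalar-mask
   branch above yields HImode, i.e. a 16-bit k-register mask with one bit per
   element.  Without AVX-512 masks, e.g. a 32-byte V8SF vector on AVX2, the
   fallback builds V8SImode, a vector of 32-bit per-element masks.  */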
50346 /* Return class of registers which could be used for pseudo of MODE
50347 and of class RCLASS for spilling instead of memory. Return NO_REGS
50348 if it is not possible or non-profitable. */
50350 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50352 static reg_class_t
50353 ix86_spill_class (reg_class_t rclass, machine_mode mode)
50355 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
50356 && TARGET_SSE2
50357 && TARGET_INTER_UNIT_MOVES_TO_VEC
50358 && TARGET_INTER_UNIT_MOVES_FROM_VEC
50359 && (mode == SImode || (TARGET_64BIT && mode == DImode))
50360 && INTEGER_CLASS_P (rclass))
50361 return ALL_SSE_REGS;
50362 return NO_REGS;
50365 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
50366 but returns a lower bound. */
50368 static unsigned int
50369 ix86_max_noce_ifcvt_seq_cost (edge e)
50371 bool predictable_p = predictable_edge_p (e);
50373 enum compiler_param param
50374 = (predictable_p
50375 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
50376 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
50378 /* If we have a parameter set, use that, otherwise take a guess using
50379 BRANCH_COST. */
50380 if (global_options_set.x_param_values[param])
50381 return PARAM_VALUE (param);
50382 else
50383 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
50386 /* Return true if SEQ is a good candidate as a replacement for the
50387 if-convertible sequence described in IF_INFO. */
50389 static bool
50390 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
50392 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
50394 int cmov_cnt = 0;
50395 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
50396 Maybe we should allow even more conditional moves as long as they
50397 are used far enough not to stall the CPU, or also consider
50398 IF_INFO->TEST_BB succ edge probabilities. */
50399 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
50401 rtx set = single_set (insn);
50402 if (!set)
50403 continue;
50404 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
50405 continue;
50406 rtx src = SET_SRC (set);
50407 machine_mode mode = GET_MODE (src);
50408 if (GET_MODE_CLASS (mode) != MODE_INT
50409 && GET_MODE_CLASS (mode) != MODE_FLOAT)
50410 continue;
50411 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
50412 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
50413 continue;
50414 /* insn is CMOV or FCMOV. */
50415 if (++cmov_cnt > 1)
50416 return false;
50419 return default_noce_conversion_profitable_p (seq, if_info);
50422 /* Implement targetm.vectorize.init_cost. */
50424 static void *
50425 ix86_init_cost (struct loop *)
50427 unsigned *cost = XNEWVEC (unsigned, 3);
50428 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
50429 return cost;
50432 /* Implement targetm.vectorize.add_stmt_cost. */
50434 static unsigned
50435 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
50436 struct _stmt_vec_info *stmt_info, int misalign,
50437 enum vect_cost_model_location where)
50439 unsigned *cost = (unsigned *) data;
50440 unsigned retval = 0;
50442 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
50443 int stmt_cost = -1;
50445 if ((kind == vector_stmt || kind == scalar_stmt)
50446 && stmt_info
50447 && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
50449 tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
50450 bool fp = false;
50451 machine_mode mode = TImode;
50453 if (vectype != NULL)
50455 fp = FLOAT_TYPE_P (vectype);
50456 mode = TYPE_MODE (vectype);
50458 /*machine_mode inner_mode = mode;
50459 if (VECTOR_MODE_P (mode))
50460 inner_mode = GET_MODE_INNER (mode);*/
50462 switch (subcode)
50464 case PLUS_EXPR:
50465 case POINTER_PLUS_EXPR:
50466 case MINUS_EXPR:
50467 if (kind == scalar_stmt)
50469 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50470 stmt_cost = ix86_cost->addss;
50471 else if (X87_FLOAT_MODE_P (mode))
50472 stmt_cost = ix86_cost->fadd;
50473 else
50474 stmt_cost = ix86_cost->add;
50476 else
50477 stmt_cost = ix86_vec_cost (mode,
50478 fp ? ix86_cost->addss
50479 : ix86_cost->sse_op,
50480 true);
50481 break;
50483 case MULT_EXPR:
50484 case WIDEN_MULT_EXPR:
50485 case MULT_HIGHPART_EXPR:
50486 stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
50487 break;
50488 case FMA_EXPR:
50489 stmt_cost = ix86_vec_cost (mode,
50490 mode == SFmode ? ix86_cost->fmass
50491 : ix86_cost->fmasd,
50492 true);
50493 break;
50494 case NEGATE_EXPR:
50495 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50496 stmt_cost = ix86_cost->sse_op;
50497 else if (X87_FLOAT_MODE_P (mode))
50498 stmt_cost = ix86_cost->fchs;
50499 else if (VECTOR_MODE_P (mode))
50500 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50501 else
50502 stmt_cost = ix86_cost->add;
50503 break;
50504 case TRUNC_DIV_EXPR:
50505 case CEIL_DIV_EXPR:
50506 case FLOOR_DIV_EXPR:
50507 case ROUND_DIV_EXPR:
50508 case TRUNC_MOD_EXPR:
50509 case CEIL_MOD_EXPR:
50510 case FLOOR_MOD_EXPR:
50511 case RDIV_EXPR:
50512 case ROUND_MOD_EXPR:
50513 case EXACT_DIV_EXPR:
50514 stmt_cost = ix86_division_cost (ix86_cost, mode);
50515 break;
50517 case RSHIFT_EXPR:
50518 case LSHIFT_EXPR:
50519 case LROTATE_EXPR:
50520 case RROTATE_EXPR:
50522 tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
50523 stmt_cost = ix86_shift_rotate_cost
50524 (ix86_cost, mode,
50525 TREE_CODE (op2) == INTEGER_CST,
50526 cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
50527 true, false, false, NULL, NULL);
50529 break;
50530 case NOP_EXPR:
50531 /* Only sign-conversions are free. */
50532 if (tree_nop_conversion_p
50533 (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
50534 TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
50535 stmt_cost = 0;
50536 break;
50538 case BIT_IOR_EXPR:
50539 case ABS_EXPR:
50540 case MIN_EXPR:
50541 case MAX_EXPR:
50542 case BIT_XOR_EXPR:
50543 case BIT_AND_EXPR:
50544 case BIT_NOT_EXPR:
50545 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50546 stmt_cost = ix86_cost->sse_op;
50547 else if (VECTOR_MODE_P (mode))
50548 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50549 else
50550 stmt_cost = ix86_cost->add;
50551 break;
50552 default:
50553 break;
50556 /* If we do elementwise loads into a vector then we are bound by
50557 latency and execution resources for the many scalar loads
50558 (AGU and load ports). Try to account for this by scaling the
50559 construction cost by the number of elements involved. */
50560 if (kind == vec_construct
50561 && stmt_info
50562 && stmt_info->type == load_vec_info_type
50563 && stmt_info->memory_access_type == VMAT_ELEMENTWISE)
50565 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50566 stmt_cost *= TYPE_VECTOR_SUBPARTS (vectype);
50568 if (stmt_cost == -1)
50569 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50571 /* Penalize DFmode vector operations for Bonnell. */
50572 if (TARGET_BONNELL && kind == vector_stmt
50573 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
50574 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
50576 /* Statements in an inner loop relative to the loop being
50577 vectorized are weighted more heavily. The value here is
50578 arbitrary and could potentially be improved with analysis. */
50579 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
50580 count *= 50; /* FIXME. */
50582 retval = (unsigned) (count * stmt_cost);
50584 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
50585 for Silvermont, as it has an out-of-order integer pipeline and can execute
50586 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
50587 if ((TARGET_SILVERMONT || TARGET_INTEL)
50588 && stmt_info && stmt_info->stmt)
50590 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
50591 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
50592 retval = (retval * 17) / 10;
50595 cost[where] += retval;
50597 return retval;
50600 /* Implement targetm.vectorize.finish_cost. */
50602 static void
50603 ix86_finish_cost (void *data, unsigned *prologue_cost,
50604 unsigned *body_cost, unsigned *epilogue_cost)
50606 unsigned *cost = (unsigned *) data;
50607 *prologue_cost = cost[vect_prologue];
50608 *body_cost = cost[vect_body];
50609 *epilogue_cost = cost[vect_epilogue];
50612 /* Implement targetm.vectorize.destroy_cost_data. */
50614 static void
50615 ix86_destroy_cost_data (void *data)
50617 free (data);
50620 /* Validate target specific memory model bits in VAL. */
50622 static unsigned HOST_WIDE_INT
50623 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
50625 enum memmodel model = memmodel_from_int (val);
50626 bool strong;
50628 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
50629 |MEMMODEL_MASK)
50630 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
50632 warning (OPT_Winvalid_memory_model,
50633 "unknown architecture specific memory model");
50634 return MEMMODEL_SEQ_CST;
50636 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
50637 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
50639 warning (OPT_Winvalid_memory_model,
50640 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
50641 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
50643 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
50645 warning (OPT_Winvalid_memory_model,
50646 "HLE_RELEASE not used with RELEASE or stronger memory model");
50647 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
50649 return val;
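/* Editorial illustration (not part of i386.c): the extra bits validated above
   are the ones user code passes via the x86 __ATOMIC_HLE_* macros; a minimal
   sketch, assuming an existing int lock and HLE-capable hardware:

     void
     hle_unlock (int *lock)
     {
       /* RELEASE (or stronger) must accompany HLE_RELEASE, otherwise the
          check above warns and falls back to SEQ_CST.  */
       __atomic_store_n (lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
     }
*/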
50652 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
50653 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
50654 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
50655 or number of vecsize_mangle variants that should be emitted. */
50657 static int
50658 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
50659 struct cgraph_simd_clone *clonei,
50660 tree base_type, int num)
50662 int ret = 1;
50664 if (clonei->simdlen
50665 && (clonei->simdlen < 2
50666 || clonei->simdlen > 1024
50667 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
50669 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50670 "unsupported simdlen %d", clonei->simdlen);
50671 return 0;
50674 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
50675 if (TREE_CODE (ret_type) != VOID_TYPE)
50676 switch (TYPE_MODE (ret_type))
50678 case E_QImode:
50679 case E_HImode:
50680 case E_SImode:
50681 case E_DImode:
50682 case E_SFmode:
50683 case E_DFmode:
50684 /* case E_SCmode: */
50685 /* case E_DCmode: */
50686 break;
50687 default:
50688 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50689 "unsupported return type %qT for simd", ret_type);
50690 return 0;
50693 tree t;
50694 int i;
50696 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
50697 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
50698 switch (TYPE_MODE (TREE_TYPE (t)))
50700 case E_QImode:
50701 case E_HImode:
50702 case E_SImode:
50703 case E_DImode:
50704 case E_SFmode:
50705 case E_DFmode:
50706 /* case E_SCmode: */
50707 /* case E_DCmode: */
50708 break;
50709 default:
50710 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50711 "unsupported argument type %qT for simd", TREE_TYPE (t));
50712 return 0;
50715 if (!TREE_PUBLIC (node->decl))
50717 /* If the function isn't exported, we can pick up just one ISA
50718 for the clones. */
50719 if (TARGET_AVX512F)
50720 clonei->vecsize_mangle = 'e';
50721 else if (TARGET_AVX2)
50722 clonei->vecsize_mangle = 'd';
50723 else if (TARGET_AVX)
50724 clonei->vecsize_mangle = 'c';
50725 else
50726 clonei->vecsize_mangle = 'b';
50727 ret = 1;
50729 else
50731 clonei->vecsize_mangle = "bcde"[num];
50732 ret = 4;
50734 clonei->mask_mode = VOIDmode;
50735 switch (clonei->vecsize_mangle)
50737 case 'b':
50738 clonei->vecsize_int = 128;
50739 clonei->vecsize_float = 128;
50740 break;
50741 case 'c':
50742 clonei->vecsize_int = 128;
50743 clonei->vecsize_float = 256;
50744 break;
50745 case 'd':
50746 clonei->vecsize_int = 256;
50747 clonei->vecsize_float = 256;
50748 break;
50749 case 'e':
50750 clonei->vecsize_int = 512;
50751 clonei->vecsize_float = 512;
50752 if (TYPE_MODE (base_type) == QImode)
50753 clonei->mask_mode = DImode;
50754 else
50755 clonei->mask_mode = SImode;
50756 break;
50758 if (clonei->simdlen == 0)
50760 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
50761 clonei->simdlen = clonei->vecsize_int;
50762 else
50763 clonei->simdlen = clonei->vecsize_float;
50764 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
50766 else if (clonei->simdlen > 16)
50768 /* For compatibility with ICC, use the same upper bounds
50769 for simdlen. In particular, for CTYPE below, use the return type,
50770 unless the function returns void, in which case use the characteristic
50771 type. If it is possible for the given SIMDLEN to pass a CTYPE value
50772 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
50773 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
50774 emit corresponding clone. */
50775 tree ctype = ret_type;
50776 if (TREE_CODE (ret_type) == VOID_TYPE)
50777 ctype = base_type;
50778 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
50779 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
50780 cnt /= clonei->vecsize_int;
50781 else
50782 cnt /= clonei->vecsize_float;
50783 if (cnt > (TARGET_64BIT ? 16 : 8))
50785 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50786 "unsupported simdlen %d", clonei->simdlen);
50787 return 0;
50790 return ret;
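/* Editorial note (not part of i386.c): as a worked example, a non-exported
   clone on an AVX-512F target gets vecsize_mangle 'e', vecsize_int 512 and,
   for a char (QImode) characteristic type, mask_mode DImode and a default
   simdlen of 512 / 8 == 64 lanes; an exported function instead gets all four
   'b'..'e' variants (ret == 4).  */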
50793 /* Add target attribute to SIMD clone NODE if needed. */
50795 static void
50796 ix86_simd_clone_adjust (struct cgraph_node *node)
50798 const char *str = NULL;
50799 gcc_assert (node->decl == cfun->decl);
50800 switch (node->simdclone->vecsize_mangle)
50802 case 'b':
50803 if (!TARGET_SSE2)
50804 str = "sse2";
50805 break;
50806 case 'c':
50807 if (!TARGET_AVX)
50808 str = "avx";
50809 break;
50810 case 'd':
50811 if (!TARGET_AVX2)
50812 str = "avx2";
50813 break;
50814 case 'e':
50815 if (!TARGET_AVX512F)
50816 str = "avx512f";
50817 break;
50818 default:
50819 gcc_unreachable ();
50821 if (str == NULL)
50822 return;
50823 push_cfun (NULL);
50824 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
50825 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
50826 gcc_assert (ok);
50827 pop_cfun ();
50828 ix86_reset_previous_fndecl ();
50829 ix86_set_current_function (node->decl);
50832 /* If SIMD clone NODE can't be used in a vectorized loop
50833 in current function, return -1, otherwise return a badness of using it
50834 (0 if it is most desirable from vecsize_mangle point of view, 1
50835 slightly less desirable, etc.). */
50837 static int
50838 ix86_simd_clone_usable (struct cgraph_node *node)
50840 switch (node->simdclone->vecsize_mangle)
50842 case 'b':
50843 if (!TARGET_SSE2)
50844 return -1;
50845 if (!TARGET_AVX)
50846 return 0;
50847 return TARGET_AVX2 ? 2 : 1;
50848 case 'c':
50849 if (!TARGET_AVX)
50850 return -1;
50851 return TARGET_AVX2 ? 1 : 0;
50852 case 'd':
50853 if (!TARGET_AVX2)
50854 return -1;
50855 return 0;
50856 case 'e':
50857 if (!TARGET_AVX512F)
50858 return -1;
50859 return 0;
50860 default:
50861 gcc_unreachable ();
50865 /* This function adjusts the unroll factor based on
50866 the hardware capabilities. For example, bdver3 has
50867 a loop buffer which makes unrolling of smaller
50868 loops less important. This function decides the
50869 unroll factor using the number of memory references
50870 (the value 32 is used) as a heuristic. */
50872 static unsigned
50873 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
50875 basic_block *bbs;
50876 rtx_insn *insn;
50877 unsigned i;
50878 unsigned mem_count = 0;
50880 if (!TARGET_ADJUST_UNROLL)
50881 return nunroll;
50883 /* Count the number of memory references within the loop body.
50884 This value determines the unrolling factor for bdver3 and bdver4
50885 architectures. */
50886 subrtx_iterator::array_type array;
50887 bbs = get_loop_body (loop);
50888 for (i = 0; i < loop->num_nodes; i++)
50889 FOR_BB_INSNS (bbs[i], insn)
50890 if (NONDEBUG_INSN_P (insn))
50891 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
50892 if (const_rtx x = *iter)
50893 if (MEM_P (x))
50895 machine_mode mode = GET_MODE (x);
50896 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50897 if (n_words > 4)
50898 mem_count += 2;
50899 else
50900 mem_count += 1;
50902 free (bbs);
50904 if (mem_count && mem_count <= 32)
50905 return MIN (nunroll, 32 / mem_count);
50907 return nunroll;
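/* Editorial note (not part of i386.c): as a worked example, a loop body with
   eight counted memory references is capped at MIN (nunroll, 32 / 8), i.e. an
   unroll factor of at most 4; references wider than four words count double
   towards mem_count.  */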
50911 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50913 static bool
50914 ix86_float_exceptions_rounding_supported_p (void)
50916 /* For x87 floating point with standard excess precision handling,
50917 there is no adddf3 pattern (since x87 floating point only has
50918 XFmode operations) so the default hook implementation gets this
50919 wrong. */
50920 return TARGET_80387 || TARGET_SSE_MATH;
50923 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
50925 static void
50926 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50928 if (!TARGET_80387 && !TARGET_SSE_MATH)
50929 return;
50930 tree exceptions_var = create_tmp_var_raw (integer_type_node);
50931 if (TARGET_80387)
50933 tree fenv_index_type = build_index_type (size_int (6));
50934 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
50935 tree fenv_var = create_tmp_var_raw (fenv_type);
50936 TREE_ADDRESSABLE (fenv_var) = 1;
50937 tree fenv_ptr = build_pointer_type (fenv_type);
50938 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50939 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50940 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50941 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50942 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50943 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50944 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50945 tree hold_fnclex = build_call_expr (fnclex, 0);
50946 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50947 NULL_TREE, NULL_TREE);
50948 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50949 hold_fnclex);
50950 *clear = build_call_expr (fnclex, 0);
50951 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50952 tree fnstsw_call = build_call_expr (fnstsw, 0);
50953 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50954 sw_var, fnstsw_call);
50955 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50956 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50957 exceptions_var, exceptions_x87);
50958 *update = build2 (COMPOUND_EXPR, integer_type_node,
50959 sw_mod, update_mod);
50960 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50961 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50963 if (TARGET_SSE_MATH)
50965 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50966 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50967 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50968 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50969 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50970 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50971 mxcsr_orig_var, stmxcsr_hold_call);
50972 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50973 mxcsr_orig_var,
50974 build_int_cst (unsigned_type_node, 0x1f80));
50975 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50976 build_int_cst (unsigned_type_node, 0xffffffc0));
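/* Editorial note: in the two constants above, 0x1f80 sets the six MXCSR
   exception-mask bits (bits 7-12) and 0xffffffc0 clears the six sticky
   exception-flag bits (bits 0-5), so the environment installed for the hold
   sequence masks all SSE exceptions and starts with the flags cleared.  */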
50977 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50978 mxcsr_mod_var, hold_mod_val);
50979 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50980 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
50981 hold_assign_orig, hold_assign_mod);
50982 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
50983 ldmxcsr_hold_call);
50984 if (*hold)
50985 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
50986 else
50987 *hold = hold_all;
50988 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50989 if (*clear)
50990 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
50991 ldmxcsr_clear_call);
50992 else
50993 *clear = ldmxcsr_clear_call;
50994 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
50995 tree exceptions_sse = fold_convert (integer_type_node,
50996 stxmcsr_update_call);
50997 if (*update)
50999 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
51000 exceptions_var, exceptions_sse);
51001 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
51002 exceptions_var, exceptions_mod);
51003 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
51004 exceptions_assign);
51006 else
51007 *update = build2 (MODIFY_EXPR, integer_type_node,
51008 exceptions_var, exceptions_sse);
51009 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
51010 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51011 ldmxcsr_update_call);
51013 tree atomic_feraiseexcept
51014 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
51015 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
51016 1, exceptions_var);
51017 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51018 atomic_feraiseexcept_call);
51021 /* Return the mode to be used for bounds, or VOIDmode
51022 if bounds are not supported. */
51024 static machine_mode
51025 ix86_mpx_bound_mode ()
51027 /* Do not support pointer checker if MPX
51028 is not enabled. */
51029 if (!TARGET_MPX)
51031 if (flag_check_pointer_bounds)
51032 warning (0, "Pointer Checker requires MPX support on this target."
51033 " Use -mmpx options to enable MPX.");
51034 return VOIDmode;
51037 return BNDmode;
51040 /* Return constant used to statically initialize constant bounds.
51042 This function is used to create special bound values. For now
51043 only INIT bounds and NONE bounds are expected. More special
51044 values may be added later. */
51046 static tree
51047 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
51049 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
51050 : build_zero_cst (pointer_sized_int_node);
51051 tree high = ub ? build_zero_cst (pointer_sized_int_node)
51052 : build_minus_one_cst (pointer_sized_int_node);
51054 /* This function is supposed to be used to create INIT and
51055 NONE bounds only. */
51056 gcc_assert ((lb == 0 && ub == -1)
51057 || (lb == -1 && ub == 0));
51059 return build_complex (NULL, low, high);
51062 /* Generate a list of statements STMTS to initialize pointer bounds
51063 variable VAR with bounds LB and UB. Return the number of generated
51064 statements. */
51066 static int
51067 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
51069 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
51070 tree lhs, modify, var_p;
51072 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
51073 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
51075 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
51076 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
51077 append_to_statement_list (modify, stmts);
51079 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
51080 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
51081 TYPE_SIZE_UNIT (pointer_sized_int_node)));
51082 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
51083 append_to_statement_list (modify, stmts);
51085 return 2;
51088 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
51089 /* For i386, a common symbol is local only for non-PIE binaries. For
51090 x86-64, a common symbol is local only for non-PIE binaries or when the
51091 linker supports copy relocations in PIE binaries. */
51093 static bool
51094 ix86_binds_local_p (const_tree exp)
51096 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
51097 (!flag_pic
51098 || (TARGET_64BIT
51099 && HAVE_LD_PIE_COPYRELOC != 0)));
51101 #endif
51103 /* If MEM is in the form of [base+offset], extract the two parts
51104 of the address and store them in BASE and OFFSET, otherwise return false. */
51106 static bool
51107 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
51109 rtx addr;
51111 gcc_assert (MEM_P (mem));
51113 addr = XEXP (mem, 0);
51115 if (GET_CODE (addr) == CONST)
51116 addr = XEXP (addr, 0);
51118 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
51120 *base = addr;
51121 *offset = const0_rtx;
51122 return true;
51125 if (GET_CODE (addr) == PLUS
51126 && (REG_P (XEXP (addr, 0))
51127 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
51128 && CONST_INT_P (XEXP (addr, 1)))
51130 *base = XEXP (addr, 0);
51131 *offset = XEXP (addr, 1);
51132 return true;
51135 return false;
51138 /* Given OPERANDS of consecutive load/store, check if we can merge
51139 them into a move-multiple. LOAD is true if they are load instructions.
51140 MODE is the mode of memory operands. */
51142 bool
51143 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
51144 machine_mode mode)
51146 HOST_WIDE_INT offval_1, offval_2, msize;
51147 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
51149 if (load)
51151 mem_1 = operands[1];
51152 mem_2 = operands[3];
51153 reg_1 = operands[0];
51154 reg_2 = operands[2];
51156 else
51158 mem_1 = operands[0];
51159 mem_2 = operands[2];
51160 reg_1 = operands[1];
51161 reg_2 = operands[3];
51164 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
51166 if (REGNO (reg_1) != REGNO (reg_2))
51167 return false;
51169 /* Check if the addresses are in the form of [base+offset]. */
51170 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
51171 return false;
51172 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
51173 return false;
51175 /* Check if the bases are the same. */
51176 if (!rtx_equal_p (base_1, base_2))
51177 return false;
51179 offval_1 = INTVAL (offset_1);
51180 offval_2 = INTVAL (offset_2);
51181 msize = GET_MODE_SIZE (mode);
51182 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
51183 if (offval_1 + msize != offval_2)
51184 return false;
51186 return true;
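/* Editorial note (not part of i386.c): as a worked example with DFmode
   operands, two memory references [rdi+8] and [rdi+16] share the base rdi
   and their offsets differ by GET_MODE_SIZE (DFmode) == 8, so a pair of
   loads from them into the same register satisfies all of the checks above;
   swapping the two (higher address first) fails the adjacency test.  */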
51189 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
51191 static bool
51192 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
51193 optimization_type opt_type)
51195 switch (op)
51197 case asin_optab:
51198 case acos_optab:
51199 case log1p_optab:
51200 case exp_optab:
51201 case exp10_optab:
51202 case exp2_optab:
51203 case expm1_optab:
51204 case ldexp_optab:
51205 case scalb_optab:
51206 case round_optab:
51207 return opt_type == OPTIMIZE_FOR_SPEED;
51209 case rint_optab:
51210 if (SSE_FLOAT_MODE_P (mode1)
51211 && TARGET_SSE_MATH
51212 && !flag_trapping_math
51213 && !TARGET_SSE4_1)
51214 return opt_type == OPTIMIZE_FOR_SPEED;
51215 return true;
51217 case floor_optab:
51218 case ceil_optab:
51219 case btrunc_optab:
51220 if (SSE_FLOAT_MODE_P (mode1)
51221 && TARGET_SSE_MATH
51222 && !flag_trapping_math
51223 && TARGET_SSE4_1)
51224 return true;
51225 return opt_type == OPTIMIZE_FOR_SPEED;
51227 case rsqrt_optab:
51228 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
51230 default:
51231 return true;
51235 /* Address space support.
51237 This is not "far pointers" in the 16-bit sense, but an easy way
51238 to use %fs and %gs segment prefixes. Therefore:
51240 (a) All address spaces have the same modes,
51241 (b) All address spaces have the same address forms,
51242 (c) While %fs and %gs are technically subsets of the generic
51243 address space, they are probably not subsets of each other.
51244 (d) Since we have no access to the segment base register values
51245 without resorting to a system call, we cannot convert a
51246 non-default address space to a default address space.
51247 Therefore we do not claim %fs or %gs are subsets of generic.
51249 Therefore we can (mostly) use the default hooks. */
51251 /* All use of segmentation is assumed to make address 0 valid. */
51253 static bool
51254 ix86_addr_space_zero_address_valid (addr_space_t as)
51256 return as != ADDR_SPACE_GENERIC;
51259 static void
51260 ix86_init_libfuncs (void)
51262 if (TARGET_64BIT)
51264 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
51265 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
51267 else
51269 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
51270 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
51273 #if TARGET_MACHO
51274 darwin_rename_builtins ();
51275 #endif
51278 /* Generate call to __divmoddi4. */
51280 static void
51281 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
51282 rtx op0, rtx op1,
51283 rtx *quot_p, rtx *rem_p)
51285 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
51287 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
51288 mode,
51289 op0, GET_MODE (op0),
51290 op1, GET_MODE (op1),
51291 XEXP (rem, 0), Pmode);
51292 *quot_p = quot;
51293 *rem_p = rem;
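/* Editorial note (not part of i386.c): the libcall emitted above follows the
   libgcc convention in which the quotient is the return value and the
   remainder is stored through the last argument, i.e. roughly

     long long __divmoddi4 (long long a, long long b, long long *rem);

   so a single call produces both results of the division.  */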
51296 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
51297 FPU, assume that the fpcw is set to extended precision; when using
51298 only SSE, rounding is correct; when using both SSE and the FPU,
51299 the rounding precision is indeterminate, since either may be chosen
51300 apparently at random. */
51302 static enum flt_eval_method
51303 ix86_excess_precision (enum excess_precision_type type)
51305 switch (type)
51307 case EXCESS_PRECISION_TYPE_FAST:
51308 /* The fastest type to promote to will always be the native type,
51309 whether that occurs with implicit excess precision or
51310 otherwise. */
51311 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51312 case EXCESS_PRECISION_TYPE_STANDARD:
51313 case EXCESS_PRECISION_TYPE_IMPLICIT:
51314 /* Otherwise, the excess precision we want when we are
51315 in a standards compliant mode, and the implicit precision we
51316 provide would be identical were it not for the unpredictable
51317 cases. */
51318 if (!TARGET_80387)
51319 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51320 else if (!TARGET_MIX_SSE_I387)
51322 if (!TARGET_SSE_MATH)
51323 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
51324 else if (TARGET_SSE2)
51325 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51328 /* If we are in standards compliant mode, but we know we will
51329 calculate in unpredictable precision, return
51330 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
51331 excess precision if the target can't guarantee it will honor
51332 it. */
51333 return (type == EXCESS_PRECISION_TYPE_STANDARD
51334 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
51335 : FLT_EVAL_METHOD_UNPREDICTABLE);
51336 default:
51337 gcc_unreachable ();
51340 return FLT_EVAL_METHOD_UNPREDICTABLE;
51343 /* Implement PUSH_ROUNDING. On 386, we have a pushw instruction that
51344 decrements by exactly 2 no matter what the position was; there is no pushb.
51346 But as the CIE data alignment factor on this arch is -4 for 32-bit targets
51347 and -8 for 64-bit targets, we need to make sure all stack pointer adjustments
51348 are multiples of 4 for 32-bit targets and 8 for 64-bit targets. */
51350 poly_int64
51351 ix86_push_rounding (poly_int64 bytes)
51353 return ROUND_UP (bytes, UNITS_PER_WORD);
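/* Editorial note (not part of i386.c): e.g. pushing a single QImode argument
   still reserves UNITS_PER_WORD bytes, i.e. 4 bytes for 32-bit targets and
   8 bytes for 64-bit targets, keeping the stack pointer adjustment a multiple
   of the CIE data alignment factor.  */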
51356 /* Target-specific selftests. */
51358 #if CHECKING_P
51360 namespace selftest {
51362 /* Verify that hard regs are dumped as expected (in compact mode). */
51364 static void
51365 ix86_test_dumping_hard_regs ()
51367 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
51368 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
51371 /* Test dumping an insn with repeated references to the same SCRATCH,
51372 to verify the rtx_reuse code. */
51374 static void
51375 ix86_test_dumping_memory_blockage ()
51377 set_new_first_and_last_insn (NULL, NULL);
51379 rtx pat = gen_memory_blockage ();
51380 rtx_reuse_manager r;
51381 r.preprocess (pat);
51383 /* Verify that the repeated references to the SCRATCH show the use of
51384 reuse IDs. The first should be prefixed with a reuse ID,
51385 and the second should be dumped as a "reuse_rtx" of that ID.
51386 The expected string assumes Pmode == DImode. */
51387 if (Pmode == DImode)
51388 ASSERT_RTL_DUMP_EQ_WITH_REUSE
51389 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
51390 " (unspec:BLK [\n"
51391 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
51392 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
51395 /* Verify loading an RTL dump; specifically a dump of copying
51396 a param on x86_64 from a hard reg into the frame.
51397 This test is target-specific since the dump contains target-specific
51398 hard reg names. */
51400 static void
51401 ix86_test_loading_dump_fragment_1 ()
51403 rtl_dump_test t (SELFTEST_LOCATION,
51404 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
51406 rtx_insn *insn = get_insn_by_uid (1);
51408 /* The block structure and indentation here is purely for
51409 readability; it mirrors the structure of the rtx. */
51410 tree mem_expr;
51412 rtx pat = PATTERN (insn);
51413 ASSERT_EQ (SET, GET_CODE (pat));
51415 rtx dest = SET_DEST (pat);
51416 ASSERT_EQ (MEM, GET_CODE (dest));
51417 /* Verify the "/c" was parsed. */
51418 ASSERT_TRUE (RTX_FLAG (dest, call));
51419 ASSERT_EQ (SImode, GET_MODE (dest));
51421 rtx addr = XEXP (dest, 0);
51422 ASSERT_EQ (PLUS, GET_CODE (addr));
51423 ASSERT_EQ (DImode, GET_MODE (addr));
51425 rtx lhs = XEXP (addr, 0);
51426 /* Verify that the "frame" REG was consolidated. */
51427 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
51430 rtx rhs = XEXP (addr, 1);
51431 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
51432 ASSERT_EQ (-4, INTVAL (rhs));
51435 /* Verify the "[1 i+0 S4 A32]" was parsed. */
51436 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
51437 /* "i" should have been handled by synthesizing a global int
51438 variable named "i". */
51439 mem_expr = MEM_EXPR (dest);
51440 ASSERT_NE (mem_expr, NULL);
51441 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
51442 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
51443 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
51444 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
51445 /* "+0". */
51446 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
51447 ASSERT_EQ (0, MEM_OFFSET (dest));
51448 /* "S4". */
51449 ASSERT_EQ (4, MEM_SIZE (dest));
51450 /* "A32. */
51451 ASSERT_EQ (32, MEM_ALIGN (dest));
51454 rtx src = SET_SRC (pat);
51455 ASSERT_EQ (REG, GET_CODE (src));
51456 ASSERT_EQ (SImode, GET_MODE (src));
51457 ASSERT_EQ (5, REGNO (src));
51458 tree reg_expr = REG_EXPR (src);
51459 /* "i" here should point to the same var as for the MEM_EXPR. */
51460 ASSERT_EQ (reg_expr, mem_expr);
51465 /* Verify that the RTL loader copes with a call_insn dump.
51466 This test is target-specific since the dump contains a target-specific
51467 hard reg name. */
51469 static void
51470 ix86_test_loading_call_insn ()
51472 /* The test dump includes register "xmm0", which requires TARGET_SSE
51473 to exist. */
51474 if (!TARGET_SSE)
51475 return;
51477 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
51479 rtx_insn *insn = get_insns ();
51480 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
51482 /* "/j". */
51483 ASSERT_TRUE (RTX_FLAG (insn, jump));
51485 rtx pat = PATTERN (insn);
51486 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
51488 /* Verify REG_NOTES. */
51490 /* "(expr_list:REG_CALL_DECL". */
51491 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
51492 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
51493 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
51495 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
51496 rtx_expr_list *note1 = note0->next ();
51497 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
51499 ASSERT_EQ (NULL, note1->next ());
51502 /* Verify CALL_INSN_FUNCTION_USAGE. */
51504 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
51505 rtx_expr_list *usage
51506 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
51507 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
51508 ASSERT_EQ (DFmode, GET_MODE (usage));
51509 ASSERT_EQ (USE, GET_CODE (usage->element ()));
51510 ASSERT_EQ (NULL, usage->next ());
51514 /* Verify that the RTL loader copes with a dump from print_rtx_function.
51515 This test is target-specific since the dump contains target-specific
51516 hard reg names. */
51518 static void
51519 ix86_test_loading_full_dump ()
51521 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
51523 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51525 rtx_insn *insn_1 = get_insn_by_uid (1);
51526 ASSERT_EQ (NOTE, GET_CODE (insn_1));
51528 rtx_insn *insn_7 = get_insn_by_uid (7);
51529 ASSERT_EQ (INSN, GET_CODE (insn_7));
51530 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
51532 rtx_insn *insn_15 = get_insn_by_uid (15);
51533 ASSERT_EQ (INSN, GET_CODE (insn_15));
51534 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
51536 /* Verify crtl->return_rtx. */
51537 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
51538 ASSERT_EQ (0, REGNO (crtl->return_rtx));
51539 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
51542 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
51543 In particular, verify that it correctly loads the 2nd operand.
51544 This test is target-specific since these are machine-specific
51545 operands (and enums). */
51547 static void
51548 ix86_test_loading_unspec ()
51550 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
51552 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51554 ASSERT_TRUE (cfun);
51556 /* Test of an UNSPEC. */
51557 rtx_insn *insn = get_insns ();
51558 ASSERT_EQ (INSN, GET_CODE (insn));
51559 rtx set = single_set (insn);
51560 ASSERT_NE (NULL, set);
51561 rtx dst = SET_DEST (set);
51562 ASSERT_EQ (MEM, GET_CODE (dst));
51563 rtx src = SET_SRC (set);
51564 ASSERT_EQ (UNSPEC, GET_CODE (src));
51565 ASSERT_EQ (BLKmode, GET_MODE (src));
51566 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
51568 rtx v0 = XVECEXP (src, 0, 0);
51570 /* Verify that the two uses of the first SCRATCH have pointer
51571 equality. */
51572 rtx scratch_a = XEXP (dst, 0);
51573 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
51575 rtx scratch_b = XEXP (v0, 0);
51576 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
51578 ASSERT_EQ (scratch_a, scratch_b);
51580 /* Verify that the two mems are thus treated as equal. */
51581 ASSERT_TRUE (rtx_equal_p (dst, v0));
51583 /* Verify that the insn is recognized. */
51584 ASSERT_NE (-1, recog_memoized (insn));
51586 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
51587 insn = NEXT_INSN (insn);
51588 ASSERT_EQ (INSN, GET_CODE (insn));
51590 set = single_set (insn);
51591 ASSERT_NE (NULL, set);
51593 src = SET_SRC (set);
51594 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
51595 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
51598 /* Run all target-specific selftests. */
51600 static void
51601 ix86_run_selftests (void)
51603 ix86_test_dumping_hard_regs ();
51604 ix86_test_dumping_memory_blockage ();
51606 /* Various tests of loading RTL dumps, here because they contain
51607 ix86-isms (e.g. names of hard regs). */
51608 ix86_test_loading_dump_fragment_1 ();
51609 ix86_test_loading_call_insn ();
51610 ix86_test_loading_full_dump ();
51611 ix86_test_loading_unspec ();
51614 } // namespace selftest
51616 #endif /* CHECKING_P */
51618 /* Initialize the GCC target structure. */
51619 #undef TARGET_RETURN_IN_MEMORY
51620 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
51622 #undef TARGET_LEGITIMIZE_ADDRESS
51623 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
51625 #undef TARGET_ATTRIBUTE_TABLE
51626 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
51627 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
51628 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
51629 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51630 # undef TARGET_MERGE_DECL_ATTRIBUTES
51631 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
51632 #endif
51634 #undef TARGET_COMP_TYPE_ATTRIBUTES
51635 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
51637 #undef TARGET_INIT_BUILTINS
51638 #define TARGET_INIT_BUILTINS ix86_init_builtins
51639 #undef TARGET_BUILTIN_DECL
51640 #define TARGET_BUILTIN_DECL ix86_builtin_decl
51641 #undef TARGET_EXPAND_BUILTIN
51642 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
51644 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
51645 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
51646 ix86_builtin_vectorized_function
51648 #undef TARGET_VECTORIZE_BUILTIN_GATHER
51649 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
51651 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
51652 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
51654 #undef TARGET_BUILTIN_RECIPROCAL
51655 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
51657 #undef TARGET_ASM_FUNCTION_EPILOGUE
51658 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
51660 #undef TARGET_ENCODE_SECTION_INFO
51661 #ifndef SUBTARGET_ENCODE_SECTION_INFO
51662 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
51663 #else
51664 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
51665 #endif
51667 #undef TARGET_ASM_OPEN_PAREN
51668 #define TARGET_ASM_OPEN_PAREN ""
51669 #undef TARGET_ASM_CLOSE_PAREN
51670 #define TARGET_ASM_CLOSE_PAREN ""
51672 #undef TARGET_ASM_BYTE_OP
51673 #define TARGET_ASM_BYTE_OP ASM_BYTE
51675 #undef TARGET_ASM_ALIGNED_HI_OP
51676 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
51677 #undef TARGET_ASM_ALIGNED_SI_OP
51678 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
51679 #ifdef ASM_QUAD
51680 #undef TARGET_ASM_ALIGNED_DI_OP
51681 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
51682 #endif
51684 #undef TARGET_PROFILE_BEFORE_PROLOGUE
51685 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
51687 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
51688 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
51690 #undef TARGET_ASM_UNALIGNED_HI_OP
51691 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
51692 #undef TARGET_ASM_UNALIGNED_SI_OP
51693 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
51694 #undef TARGET_ASM_UNALIGNED_DI_OP
51695 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
51697 #undef TARGET_PRINT_OPERAND
51698 #define TARGET_PRINT_OPERAND ix86_print_operand
51699 #undef TARGET_PRINT_OPERAND_ADDRESS
51700 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
51701 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
51702 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
51703 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
51704 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
#undef TARGET_SCHED_INIT_GLOBAL
#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  ia32_multipass_dfa_lookahead
#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall

#undef TARGET_MEMMODEL_CHECK
#define TARGET_MEMMODEL_CHECK ix86_memmodel_check

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS true
#endif
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true

#undef TARGET_DELEGITIMIZE_ADDRESS
#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address

#undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
#define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p

#undef TARGET_MS_BITFIELD_LAYOUT_P
#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p

#if TARGET_MACHO
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P darwin_binds_local_p
#else
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P ix86_binds_local_p
#endif
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
#endif

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START x86_file_start

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE ix86_option_override

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM FLAGS_REG
#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
#undef TARGET_CC_MODES_COMPATIBLE
#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg

#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN ix86_fold_builtin

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin

#undef TARGET_COMPARE_VERSION_PRIORITY
#define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority

#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
  ix86_generate_version_dispatcher_body

#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
  ix86_get_function_versions_dispatcher

#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION ix86_excess_precision
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
#define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_INIT_PIC_REG
#define TARGET_INIT_PIC_REG ix86_init_pic_reg
#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_WARN_FUNC_RETURN
#define TARGET_WARN_FUNC_RETURN ix86_warn_func_return

#undef TARGET_LEGITIMATE_COMBINED_INSN
#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload
#undef TARGET_SECONDARY_MEMORY_NEEDED
#define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
#undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
#define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p

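/* Vectorizer cost model and SIMD mode selection hooks.  */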
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST ix86_vectorize_vec_perm_const
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_SPLIT_REDUCTION
#define TARGET_VECTORIZE_SPLIT_REDUCTION \
  ix86_split_reduction
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data

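/* Per-function target option handling (target attributes and function
   multiversioning).  */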
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_POST_STREAM_IN
#define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_REGISTER_USAGE_LEVELING_P
#define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_COMPUTE_FRAME_LAYOUT
#define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#undef TARGET_CANONICALIZE_COMPARISON
#define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison

#undef TARGET_LOOP_UNROLL_ADJUST
#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust

/* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657.  */
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class

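/* SIMD clone ("declare simd") hooks.  */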
#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  ix86_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST \
  ix86_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE \
  ix86_simd_clone_usable

#undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
#define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
  ix86_float_exceptions_rounding_supported_p

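/* Hooks for the mode switching pass (mode-switching.c).  */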
#undef TARGET_MODE_EMIT
#define TARGET_MODE_EMIT ix86_emit_mode_set

#undef TARGET_MODE_NEEDED
#define TARGET_MODE_NEEDED ix86_mode_needed

#undef TARGET_MODE_AFTER
#define TARGET_MODE_AFTER ix86_mode_after

#undef TARGET_MODE_ENTRY
#define TARGET_MODE_ENTRY ix86_mode_entry

#undef TARGET_MODE_EXIT
#define TARGET_MODE_EXIT ix86_mode_exit

#undef TARGET_MODE_PRIORITY
#define TARGET_MODE_PRIORITY ix86_mode_priority

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

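/* Pointer Bounds Checker (MPX) hooks.  */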
#undef TARGET_LOAD_BOUNDS_FOR_ARG
#define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds

#undef TARGET_STORE_BOUNDS_FOR_ARG
#define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds

#undef TARGET_LOAD_RETURNED_BOUNDS
#define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds

#undef TARGET_STORE_RETURNED_BOUNDS
#define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds

#undef TARGET_CHKP_BOUND_MODE
#define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode

#undef TARGET_BUILTIN_CHKP_FUNCTION
#define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function

#undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
#define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds

#undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
#define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant

#undef TARGET_CHKP_INITIALIZE_BOUNDS
#define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds

#undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
#define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds

#undef TARGET_OFFLOAD_OPTIONS
#define TARGET_OFFLOAD_OPTIONS \
  ix86_offload_options

#undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
#define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p

#undef TARGET_HARD_REGNO_SCRATCH_OK
#define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok

#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1

#undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
#define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p

#undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
#define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS ix86_init_libfuncs

#undef TARGET_EXPAND_DIVMOD_LIBFUNC
#define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc

#undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
#define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost

#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
#define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  ix86_hard_regno_call_part_clobbered

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class

#undef TARGET_STATIC_RTX_ALIGNMENT
#define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment

#undef TARGET_EMPTY_RECORD_P
#define TARGET_EMPTY_RECORD_P ix86_is_empty_record

#undef TARGET_WARN_PARAMETER_PASSING_ABI
#define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
#endif /* #if CHECKING_P */

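/* Build the complete target hook vector from the TARGET_* macros defined
   above; hooks not overridden here keep the defaults supplied by
   target-def.h.  */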
struct gcc_target targetm = TARGET_INITIALIZER;

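/* gengtype-generated garbage collection tables for this file.  */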
#include "gt-i386.h"