1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2018 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #define IN_TARGET_CODE 1
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "memmodel.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "cgraph.h"
41 #include "diagnostic.h"
42 #include "cfgbuild.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "attribs.h"
46 #include "calls.h"
47 #include "stor-layout.h"
48 #include "varasm.h"
49 #include "output.h"
50 #include "insn-attr.h"
51 #include "flags.h"
52 #include "except.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "cfgrtl.h"
56 #include "common/common-target.h"
57 #include "langhooks.h"
58 #include "reload.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "tm-constrs.h"
62 #include "params.h"
63 #include "cselib.h"
64 #include "sched-int.h"
65 #include "opts.h"
66 #include "tree-pass.h"
67 #include "context.h"
68 #include "pass_manager.h"
69 #include "target-globals.h"
70 #include "gimple-iterator.h"
71 #include "tree-vectorizer.h"
72 #include "shrink-wrap.h"
73 #include "builtins.h"
74 #include "rtl-iter.h"
75 #include "tree-iterator.h"
76 #include "tree-chkp.h"
77 #include "rtl-chkp.h"
78 #include "dbgcnt.h"
79 #include "case-cfn-macros.h"
80 #include "regrename.h"
81 #include "dojump.h"
82 #include "fold-const-call.h"
83 #include "tree-vrp.h"
84 #include "tree-ssanames.h"
85 #include "selftest.h"
86 #include "selftest-rtl.h"
87 #include "print-rtl.h"
88 #include "intl.h"
89 #include "ifcvt.h"
90 #include "symbol-summary.h"
91 #include "ipa-prop.h"
92 #include "ipa-fnsummary.h"
93 #include "wide-int-bitmask.h"
95 /* This file should be included last. */
96 #include "target-def.h"
98 #include "x86-tune-costs.h"
100 static rtx legitimize_dllimport_symbol (rtx, bool);
101 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
102 static rtx legitimize_pe_coff_symbol (rtx, bool);
103 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
104 static bool ix86_save_reg (unsigned int, bool, bool);
105 static bool ix86_function_naked (const_tree);
106 static bool ix86_notrack_prefixed_insn_p (rtx);
107 static void ix86_emit_restore_reg_using_pop (rtx);
110 #ifndef CHECK_STACK_LIMIT
111 #define CHECK_STACK_LIMIT (-1)
112 #endif
114 /* Return index of given mode in mult and division cost tables. */
115 #define MODE_INDEX(mode) \
116 ((mode) == QImode ? 0 \
117 : (mode) == HImode ? 1 \
118 : (mode) == SImode ? 2 \
119 : (mode) == DImode ? 3 \
120 : 4)
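/* Illustrative note, not part of the original sources: MODE_INDEX (QImode)
   is 0, MODE_INDEX (SImode) is 2, and any other mode (TImode, vector modes,
   ...) falls through to 4, so a per-mode cost lookup is written roughly as
   ix86_cost->mult_init[MODE_INDEX (mode)].  */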
123 /* Set by -mtune. */
124 const struct processor_costs *ix86_tune_cost = NULL;
126 /* Set by -mtune or -Os. */
127 const struct processor_costs *ix86_cost = NULL;
129 /* Processor feature/optimization bitmasks. */
130 #define m_386 (HOST_WIDE_INT_1U<<PROCESSOR_I386)
131 #define m_486 (HOST_WIDE_INT_1U<<PROCESSOR_I486)
132 #define m_PENT (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM)
133 #define m_LAKEMONT (HOST_WIDE_INT_1U<<PROCESSOR_LAKEMONT)
134 #define m_PPRO (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUMPRO)
135 #define m_PENT4 (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM4)
136 #define m_NOCONA (HOST_WIDE_INT_1U<<PROCESSOR_NOCONA)
137 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
138 #define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2)
139 #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM)
140 #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE)
141 #define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL)
142 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
143 #define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL)
144 #define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT)
145 #define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL)
146 #define m_KNM (HOST_WIDE_INT_1U<<PROCESSOR_KNM)
147 #define m_SKYLAKE_AVX512 (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512)
148 #define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE)
149 #define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT)
150 #define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)
151 #define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL)
153 #define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE)
154 #define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6)
155 #define m_K6_GEODE (m_K6 | m_GEODE)
156 #define m_K8 (HOST_WIDE_INT_1U<<PROCESSOR_K8)
157 #define m_ATHLON (HOST_WIDE_INT_1U<<PROCESSOR_ATHLON)
158 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
159 #define m_AMDFAM10 (HOST_WIDE_INT_1U<<PROCESSOR_AMDFAM10)
160 #define m_BDVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER1)
161 #define m_BDVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER2)
162 #define m_BDVER3 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER3)
163 #define m_BDVER4 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER4)
164 #define m_ZNVER1 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER1)
165 #define m_BTVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER1)
166 #define m_BTVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER2)
167 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
168 #define m_BTVER (m_BTVER1 | m_BTVER2)
169 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
170 | m_ZNVER1)
172 #define m_GENERIC (HOST_WIDE_INT_1U<<PROCESSOR_GENERIC)
174 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
175 #undef DEF_TUNE
176 #define DEF_TUNE(tune, name, selector) name,
177 #include "x86-tune.def"
178 #undef DEF_TUNE
181 /* Feature tests against the various tunings. */
182 unsigned char ix86_tune_features[X86_TUNE_LAST];
184 /* Feature tests against the various tunings used to create ix86_tune_features
185 based on the processor mask. */
186 static unsigned HOST_WIDE_INT initial_ix86_tune_features[X86_TUNE_LAST] = {
187 #undef DEF_TUNE
188 #define DEF_TUNE(tune, name, selector) selector,
189 #include "x86-tune.def"
190 #undef DEF_TUNE
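/* Illustrative sketch, not part of the original sources: an x86-tune.def
   entry has the (hypothetical) shape
     DEF_TUNE (X86_TUNE_SOMETHING, "something", m_CORE_ALL | m_GENERIC)
   so the first expansion above collects the feature names while this one
   collects the per-processor selector masks, which are later tested against
   the bit of the selected -mtune processor to fill in ix86_tune_features.  */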
193 /* Feature tests against the various architecture variations. */
194 unsigned char ix86_arch_features[X86_ARCH_LAST];
196 /* Feature tests against the various architecture variations, used to create
197 ix86_arch_features based on the processor mask. */
198 static unsigned HOST_WIDE_INT initial_ix86_arch_features[X86_ARCH_LAST] = {
199 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
200 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
202 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
203 ~m_386,
205 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
206 ~(m_386 | m_486),
208 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
209 ~m_386,
211 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
212 ~m_386,
215 /* In case the average insn count for a single function invocation is
216 lower than this constant, emit fast (but longer) prologue and
217 epilogue code. */
218 #define FAST_PROLOGUE_INSN_COUNT 20
220 /* Names for 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
221 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
222 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
223 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
225 /* Array of the smallest class containing reg number REGNO, indexed by
226 REGNO. Used by REGNO_REG_CLASS in i386.h. */
228 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
230 /* ax, dx, cx, bx */
231 AREG, DREG, CREG, BREG,
232 /* si, di, bp, sp */
233 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
234 /* FP registers */
235 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
236 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
237 /* arg pointer */
238 NON_Q_REGS,
239 /* flags, fpsr, fpcr, frame */
240 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
241 /* SSE registers */
242 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
243 SSE_REGS, SSE_REGS,
244 /* MMX registers */
245 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
246 MMX_REGS, MMX_REGS,
247 /* REX registers */
248 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
250 /* SSE REX registers */
251 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
252 SSE_REGS, SSE_REGS,
253 /* AVX-512 SSE registers */
254 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
255 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
256 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
257 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
258 /* Mask registers. */
259 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
260 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
261 /* MPX bound registers */
262 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
265 /* The "default" register map used in 32bit mode. */
267 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
269 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
270 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
271 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
272 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
273 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
274 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
275 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
276 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
277 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
278 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
279 101, 102, 103, 104, /* bound registers */
282 /* The "default" register map used in 64bit mode. */
284 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
286 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
287 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
288 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
289 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
290 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
291 8,9,10,11,12,13,14,15, /* extended integer registers */
292 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
293 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
294 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
295 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
296 126, 127, 128, 129, /* bound registers */
299 /* Define the register numbers to be used in Dwarf debugging information.
300 The SVR4 reference port C compiler uses the following register numbers
301 in its Dwarf output code:
302 0 for %eax (gcc regno = 0)
303 1 for %ecx (gcc regno = 2)
304 2 for %edx (gcc regno = 1)
305 3 for %ebx (gcc regno = 3)
306 4 for %esp (gcc regno = 7)
307 5 for %ebp (gcc regno = 6)
308 6 for %esi (gcc regno = 4)
309 7 for %edi (gcc regno = 5)
310 The following three DWARF register numbers are never generated by
311 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
312 believed these numbers to have these meanings.
313 8 for %eip (no gcc equivalent)
314 9 for %eflags (gcc regno = 17)
315 10 for %trapno (no gcc equivalent)
316 It is not at all clear how we should number the FP stack registers
317 for the x86 architecture. If the version of SDB on x86/svr4 were
318 a bit less brain dead with respect to floating-point then we would
319 have a precedent to follow with respect to DWARF register numbers
320 for x86 FP registers, but the SDB on x86/svr4 was so completely
321 broken with respect to FP registers that it is hardly worth thinking
322 of it as something to strive for compatibility with.
323 The version of x86/svr4 SDB I had does (partially)
324 seem to believe that DWARF register number 11 is associated with
325 the x86 register %st(0), but that's about all. Higher DWARF
326 register numbers don't seem to be associated with anything in
327 particular, and even for DWARF regno 11, SDB only seemed to under-
328 stand that it should say that a variable lives in %st(0) (when
329 asked via an `=' command) if we said it was in DWARF regno 11,
330 but SDB still printed garbage when asked for the value of the
331 variable in question (via a `/' command).
332 (Also note that the labels SDB printed for various FP stack regs
333 when doing an `x' command were all wrong.)
334 Note that these problems generally don't affect the native SVR4
335 C compiler because it doesn't allow the use of -O with -g and
336 because when it is *not* optimizing, it allocates a memory
337 location for each floating-point variable, and the memory
338 location is what gets described in the DWARF AT_location
339 attribute for the variable in question.
340 Regardless of the severe mental illness of the x86/svr4 SDB, we
341 do something sensible here and we use the following DWARF
342 register numbers. Note that these are all stack-top-relative
343 numbers.
344 11 for %st(0) (gcc regno = 8)
345 12 for %st(1) (gcc regno = 9)
346 13 for %st(2) (gcc regno = 10)
347 14 for %st(3) (gcc regno = 11)
348 15 for %st(4) (gcc regno = 12)
349 16 for %st(5) (gcc regno = 13)
350 17 for %st(6) (gcc regno = 14)
351 18 for %st(7) (gcc regno = 15)
353 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
355 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
356 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
357 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
358 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
359 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
360 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
361 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
362 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
363 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
364 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
365 101, 102, 103, 104, /* bound registers */
368 /* Define parameter passing and return registers. */
370 static int const x86_64_int_parameter_registers[6] =
372 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
375 static int const x86_64_ms_abi_int_parameter_registers[4] =
377 CX_REG, DX_REG, R8_REG, R9_REG
380 static int const x86_64_int_return_registers[4] =
382 AX_REG, DX_REG, DI_REG, SI_REG
385 /* Additional registers that are clobbered by SYSV calls. */
387 #define NUM_X86_64_MS_CLOBBERED_REGS 12
388 static int const x86_64_ms_sysv_extra_clobbered_registers
389 [NUM_X86_64_MS_CLOBBERED_REGS] =
391 SI_REG, DI_REG,
392 XMM6_REG, XMM7_REG,
393 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
394 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
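/* Illustrative note, not part of the original sources: RSI, RDI and
   XMM6-XMM15 are callee-saved under the Microsoft x64 ABI but call-clobbered
   under the System V ABI, so an ms_abi function calling a sysv_abi function
   must preserve them around the call; the xlogue machinery below saves and
   restores them through out-of-line stubs.  */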
397 enum xlogue_stub {
398 XLOGUE_STUB_SAVE,
399 XLOGUE_STUB_RESTORE,
400 XLOGUE_STUB_RESTORE_TAIL,
401 XLOGUE_STUB_SAVE_HFP,
402 XLOGUE_STUB_RESTORE_HFP,
403 XLOGUE_STUB_RESTORE_HFP_TAIL,
405 XLOGUE_STUB_COUNT
408 enum xlogue_stub_sets {
409 XLOGUE_SET_ALIGNED,
410 XLOGUE_SET_ALIGNED_PLUS_8,
411 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
412 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
414 XLOGUE_SET_COUNT
417 /* Register save/restore layout used by out-of-line stubs. */
418 class xlogue_layout {
419 public:
420 struct reginfo
422 unsigned regno;
423 HOST_WIDE_INT offset; /* Offset from the stub's base pointer (rax or
424 rsi) to where this register is stored. */
427 unsigned get_nregs () const {return m_nregs;}
428 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
430 const reginfo &get_reginfo (unsigned reg) const
432 gcc_assert (reg < m_nregs);
433 return m_regs[reg];
436 static const char *get_stub_name (enum xlogue_stub stub,
437 unsigned n_extra_args);
439 /* Returns an rtx for the stub's symbol based upon
440 1.) the specified stub (save, restore or restore_ret) and
441 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
442 3.) whether or not stack alignment is being performed. */
443 static rtx get_stub_rtx (enum xlogue_stub stub);
445 /* Returns the amount of stack space (including padding) that the stub
446 needs to store registers based upon data in the machine_function. */
447 HOST_WIDE_INT get_stack_space_used () const
449 const struct machine_function *m = cfun->machine;
450 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
452 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
453 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
456 /* Returns the offset for the base pointer used by the stub. */
457 HOST_WIDE_INT get_stub_ptr_offset () const
459 return STUB_INDEX_OFFSET + m_stack_align_off_in;
462 static const struct xlogue_layout &get_instance ();
463 static unsigned count_stub_managed_regs ();
464 static bool is_stub_managed_reg (unsigned regno, unsigned count);
466 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
467 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
468 static const unsigned MAX_REGS = 18;
469 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
470 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
471 static const unsigned STUB_NAME_MAX_LEN = 20;
472 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
473 static const unsigned REG_ORDER[MAX_REGS];
474 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
476 private:
477 xlogue_layout ();
478 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
479 xlogue_layout (const xlogue_layout &);
481 /* True if hard frame pointer is used. */
482 bool m_hfp;
484 /* Max number of registers this layout manages. */
485 unsigned m_nregs;
487 /* Incoming offset from 16-byte alignment. */
488 HOST_WIDE_INT m_stack_align_off_in;
490 /* Register order and offsets. */
491 struct reginfo m_regs[MAX_REGS];
493 /* Lazy-inited cache of symbol names for stubs. */
494 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
495 [STUB_NAME_MAX_LEN];
497 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
500 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
501 "savms64",
502 "resms64",
503 "resms64x",
504 "savms64f",
505 "resms64f",
506 "resms64fx"
509 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
510 /* The offset values below are where each register is stored for the layout
511 relative to the incoming stack pointer. The value of each m_regs[].offset will
512 be relative to the incoming base pointer (rax or rsi) used by the stub.
514 s_instances: 0 1 2 3
515 Offset: realigned or aligned + 8
516 Register aligned aligned + 8 aligned w/HFP w/HFP */
517 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
518 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
519 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
520 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
521 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
522 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
523 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
524 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
525 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
526 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
527 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
528 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
529 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
530 BP_REG, /* 0xc0 0xc8 N/A N/A */
531 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
532 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
533 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
534 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
537 /* Instantiate static const values. */
538 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
539 const unsigned xlogue_layout::MIN_REGS;
540 const unsigned xlogue_layout::MAX_REGS;
541 const unsigned xlogue_layout::MAX_EXTRA_REGS;
542 const unsigned xlogue_layout::VARIANT_COUNT;
543 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
545 /* Initialize xlogue_layout::s_stub_names to zero. */
546 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
547 [STUB_NAME_MAX_LEN];
549 /* Instantiates all xlogue_layout instances. */
550 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
551 xlogue_layout (0, false),
552 xlogue_layout (8, false),
553 xlogue_layout (0, true),
554 xlogue_layout (8, true)
557 /* Return an appropriate const instance of xlogue_layout based upon values
558 in cfun->machine and crtl. */
559 const struct xlogue_layout &
560 xlogue_layout::get_instance ()
562 enum xlogue_stub_sets stub_set;
563 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
565 if (stack_realign_fp)
566 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
567 else if (frame_pointer_needed)
568 stub_set = aligned_plus_8
569 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
570 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
571 else
572 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
574 return s_instances[stub_set];
577 /* Determine how many clobbered registers can be saved by the stub.
578 Returns the count of registers the stub will save and restore. */
579 unsigned
580 xlogue_layout::count_stub_managed_regs ()
582 bool hfp = frame_pointer_needed || stack_realign_fp;
583 unsigned i, count;
584 unsigned regno;
586 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
588 regno = REG_ORDER[i];
589 if (regno == BP_REG && hfp)
590 continue;
591 if (!ix86_save_reg (regno, false, false))
592 break;
593 ++count;
595 return count;
598 /* Determine if register REGNO is a stub managed register given the
599 total COUNT of stub managed registers. */
600 bool
601 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
603 bool hfp = frame_pointer_needed || stack_realign_fp;
604 unsigned i;
606 for (i = 0; i < count; ++i)
608 gcc_assert (i < MAX_REGS);
609 if (REG_ORDER[i] == BP_REG && hfp)
610 ++count;
611 else if (REG_ORDER[i] == regno)
612 return true;
614 return false;
617 /* Constructor for xlogue_layout. */
618 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
619 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
620 m_stack_align_off_in (stack_align_off_in)
622 HOST_WIDE_INT offset = stack_align_off_in;
623 unsigned i, j;
625 for (i = j = 0; i < MAX_REGS; ++i)
627 unsigned regno = REG_ORDER[i];
629 if (regno == BP_REG && hfp)
630 continue;
631 if (SSE_REGNO_P (regno))
633 offset += 16;
634 /* Verify that SSE regs are always aligned. */
635 gcc_assert (!((stack_align_off_in + offset) & 15));
637 else
638 offset += 8;
640 m_regs[j].regno = regno;
641 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
643 gcc_assert (j == m_nregs);
646 const char *
647 xlogue_layout::get_stub_name (enum xlogue_stub stub,
648 unsigned n_extra_regs)
650 const int have_avx = TARGET_AVX;
651 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
653 /* Lazy init */
654 if (!*name)
656 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
657 (have_avx ? "avx" : "sse"),
658 STUB_BASE_NAMES[stub],
659 MIN_REGS + n_extra_regs);
660 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
663 return name;
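/* Illustrative note, not part of the original sources: with MIN_REGS == 12,
   the names built above look like "__sse_savms64_12" (non-AVX save stub, no
   extra registers) or "__avx_resms64x_17" (AVX tail-call restore stub
   handling five extra registers).  */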
666 /* Return rtx of a symbol ref for the entry point (based upon
667 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
669 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
671 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
672 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
673 gcc_assert (stub < XLOGUE_STUB_COUNT);
674 gcc_assert (crtl->stack_realign_finalized);
676 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
679 /* Define the structure for the machine field in struct function. */
681 struct GTY(()) stack_local_entry {
682 unsigned short mode;
683 unsigned short n;
684 rtx rtl;
685 struct stack_local_entry *next;
688 /* Which cpu are we scheduling for. */
689 enum attr_cpu ix86_schedule;
691 /* Which cpu are we optimizing for. */
692 enum processor_type ix86_tune;
694 /* Which instruction set architecture to use. */
695 enum processor_type ix86_arch;
697 /* True if processor has SSE prefetch instruction. */
698 unsigned char x86_prefetch_sse;
700 /* -mstackrealign option */
701 static const char ix86_force_align_arg_pointer_string[]
702 = "force_align_arg_pointer";
704 static rtx (*ix86_gen_leave) (void);
705 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
706 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
707 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
708 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
709 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
710 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
711 static rtx (*ix86_gen_clzero) (rtx);
712 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
713 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
714 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
715 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
716 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
717 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
719 /* Preferred alignment for stack boundary in bits. */
720 unsigned int ix86_preferred_stack_boundary;
722 /* Alignment for incoming stack boundary in bits specified at
723 command line. */
724 static unsigned int ix86_user_incoming_stack_boundary;
726 /* Default alignment for incoming stack boundary in bits. */
727 static unsigned int ix86_default_incoming_stack_boundary;
729 /* Alignment for incoming stack boundary in bits. */
730 unsigned int ix86_incoming_stack_boundary;
732 /* Calling abi specific va_list type nodes. */
733 static GTY(()) tree sysv_va_list_type_node;
734 static GTY(()) tree ms_va_list_type_node;
736 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
737 char internal_label_prefix[16];
738 int internal_label_prefix_len;
740 /* Fence to use after loop using movnt. */
741 tree x86_mfence;
743 /* Register class used for passing a given 64-bit part of the argument.
744 These represent classes as documented by the psABI, with the exception
745 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
746 uses SF or DFmode moves instead of DImode moves to avoid reformatting penalties.
748 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
749 whenever possible (upper half does contain padding). */
750 enum x86_64_reg_class
752 X86_64_NO_CLASS,
753 X86_64_INTEGER_CLASS,
754 X86_64_INTEGERSI_CLASS,
755 X86_64_SSE_CLASS,
756 X86_64_SSESF_CLASS,
757 X86_64_SSEDF_CLASS,
758 X86_64_SSEUP_CLASS,
759 X86_64_X87_CLASS,
760 X86_64_X87UP_CLASS,
761 X86_64_COMPLEX_X87_CLASS,
762 X86_64_MEMORY_CLASS
765 #define MAX_CLASSES 8
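/* Illustrative note, not part of the original sources: under this scheme a
   plain 'double' argument classifies as X86_64_SSEDF_CLASS and is passed in
   an XMM register, a plain 'int' classifies as X86_64_INTEGERSI_CLASS and is
   passed in a general register, and anything that cannot be passed in
   registers ends up as X86_64_MEMORY_CLASS and is passed on the stack.  */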
767 /* Table of constants used by fldpi, fldln2, etc.... */
768 static REAL_VALUE_TYPE ext_80387_constants_table [5];
769 static bool ext_80387_constants_init;
772 static struct machine_function * ix86_init_machine_status (void);
773 static rtx ix86_function_value (const_tree, const_tree, bool);
774 static bool ix86_function_value_regno_p (const unsigned int);
775 static unsigned int ix86_function_arg_boundary (machine_mode,
776 const_tree);
777 static rtx ix86_static_chain (const_tree, bool);
778 static int ix86_function_regparm (const_tree, const_tree);
779 static void ix86_compute_frame_layout (void);
780 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
781 rtx, rtx, int);
782 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
783 static tree ix86_canonical_va_list_type (tree);
784 static void predict_jump (int);
785 static unsigned int split_stack_prologue_scratch_regno (void);
786 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
788 enum ix86_function_specific_strings
790 IX86_FUNCTION_SPECIFIC_ARCH,
791 IX86_FUNCTION_SPECIFIC_TUNE,
792 IX86_FUNCTION_SPECIFIC_MAX
795 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
796 const char *, const char *, enum fpmath_unit,
797 bool);
798 static void ix86_function_specific_save (struct cl_target_option *,
799 struct gcc_options *opts);
800 static void ix86_function_specific_restore (struct gcc_options *opts,
801 struct cl_target_option *);
802 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
803 static void ix86_function_specific_print (FILE *, int,
804 struct cl_target_option *);
805 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
806 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
807 struct gcc_options *,
808 struct gcc_options *,
809 struct gcc_options *);
810 static bool ix86_can_inline_p (tree, tree);
811 static void ix86_set_current_function (tree);
812 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
814 static enum calling_abi ix86_function_abi (const_tree);
817 #ifndef SUBTARGET32_DEFAULT_CPU
818 #define SUBTARGET32_DEFAULT_CPU "i386"
819 #endif
821 /* Whether -mtune= or -march= were specified */
822 static int ix86_tune_defaulted;
823 static int ix86_arch_specified;
825 /* Vectorization library interface and handlers. */
826 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
828 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
829 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
831 /* Processor target table, indexed by processor number */
832 struct ptt
834 const char *const name; /* processor name */
835 const struct processor_costs *cost; /* Processor costs */
836 const int align_loop; /* Default alignments. */
837 const int align_loop_max_skip;
838 const int align_jump;
839 const int align_jump_max_skip;
840 const int align_func;
843 /* This table must be in sync with enum processor_type in i386.h. */
844 static const struct ptt processor_target_table[PROCESSOR_max] =
846 {"generic", &generic_cost, 16, 10, 16, 10, 16},
847 {"i386", &i386_cost, 4, 3, 4, 3, 4},
848 {"i486", &i486_cost, 16, 15, 16, 15, 16},
849 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
850 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
851 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
852 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
853 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
854 {"core2", &core_cost, 16, 10, 16, 10, 16},
855 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
856 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
857 {"haswell", &core_cost, 16, 10, 16, 10, 16},
858 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
859 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
860 {"knl", &slm_cost, 16, 15, 16, 7, 16},
861 {"knm", &slm_cost, 16, 15, 16, 7, 16},
862 {"skylake-avx512", &skylake_cost, 16, 10, 16, 10, 16},
863 {"cannonlake", &skylake_cost, 16, 10, 16, 10, 16},
864 {"icelake-client", &skylake_cost, 16, 10, 16, 10, 16},
865 {"icelake-server", &skylake_cost, 16, 10, 16, 10, 16},
866 {"intel", &intel_cost, 16, 15, 16, 7, 16},
867 {"geode", &geode_cost, 0, 0, 0, 0, 0},
868 {"k6", &k6_cost, 32, 7, 32, 7, 32},
869 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
870 {"k8", &k8_cost, 16, 7, 16, 7, 16},
871 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
872 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
873 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
874 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
875 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
876 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
877 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
878 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
881 static unsigned int
882 rest_of_handle_insert_vzeroupper (void)
884 int i;
886 /* vzeroupper instructions are inserted immediately after reload to
887 account for possible spills from 256bit or 512bit registers. The pass
888 reuses mode switching infrastructure by re-running mode insertion
889 pass, so disable entities that have already been processed. */
890 for (i = 0; i < MAX_386_ENTITIES; i++)
891 ix86_optimize_mode_switching[i] = 0;
893 ix86_optimize_mode_switching[AVX_U128] = 1;
895 /* Call optimize_mode_switching. */
896 g->get_passes ()->execute_pass_mode_switching ();
897 return 0;
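/* Illustrative note, not part of the original sources: vzeroupper clears the
   bits above 127 of the vector registers, which avoids the SSE/AVX transition
   penalty when later code executes legacy SSE instructions; the AVX_U128 mode
   switching entity tracks where the upper halves may be live so the insertion
   points can be computed by the existing mode switching pass.  */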
900 /* Return 1 if INSN uses or defines a hard register.
901 Hard register uses in a memory address are ignored.
902 Clobbers and flags definitions are ignored. */
904 static bool
905 has_non_address_hard_reg (rtx_insn *insn)
907 df_ref ref;
908 FOR_EACH_INSN_DEF (ref, insn)
909 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
910 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
911 && DF_REF_REGNO (ref) != FLAGS_REG)
912 return true;
914 FOR_EACH_INSN_USE (ref, insn)
915 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
916 return true;
918 return false;
921 /* Check if comparison INSN may be transformed
922 into vector comparison. Currently we transform
923 zero checks only which look like:
925 (set (reg:CCZ 17 flags)
926 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
927 (subreg:SI (reg:DI x) 0))
928 (const_int 0 [0]))) */
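/* Illustrative note, not part of the original sources: such a pattern
   typically comes from a 64-bit equality test against zero in 32-bit code,
   e.g.
     int f (unsigned long long x) { return x == 0; }
   where the two SImode halves of X are IORed together and the result is
   compared with zero.  */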
930 static bool
931 convertible_comparison_p (rtx_insn *insn)
933 if (!TARGET_SSE4_1)
934 return false;
936 rtx def_set = single_set (insn);
938 gcc_assert (def_set);
940 rtx src = SET_SRC (def_set);
941 rtx dst = SET_DEST (def_set);
943 gcc_assert (GET_CODE (src) == COMPARE);
945 if (GET_CODE (dst) != REG
946 || REGNO (dst) != FLAGS_REG
947 || GET_MODE (dst) != CCZmode)
948 return false;
950 rtx op1 = XEXP (src, 0);
951 rtx op2 = XEXP (src, 1);
953 if (op2 != CONST0_RTX (GET_MODE (op2)))
954 return false;
956 if (GET_CODE (op1) != IOR)
957 return false;
959 op2 = XEXP (op1, 1);
960 op1 = XEXP (op1, 0);
962 if (!SUBREG_P (op1)
963 || !SUBREG_P (op2)
964 || GET_MODE (op1) != SImode
965 || GET_MODE (op2) != SImode
966 || ((SUBREG_BYTE (op1) != 0
967 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
968 && (SUBREG_BYTE (op2) != 0
969 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
970 return false;
972 op1 = SUBREG_REG (op1);
973 op2 = SUBREG_REG (op2);
975 if (op1 != op2
976 || !REG_P (op1)
977 || GET_MODE (op1) != DImode)
978 return false;
980 return true;
983 /* The DImode version of scalar_to_vector_candidate_p. */
985 static bool
986 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
988 rtx def_set = single_set (insn);
990 if (!def_set)
991 return false;
993 if (has_non_address_hard_reg (insn))
994 return false;
996 rtx src = SET_SRC (def_set);
997 rtx dst = SET_DEST (def_set);
999 if (GET_CODE (src) == COMPARE)
1000 return convertible_comparison_p (insn);
1002 /* We are interested in DImode promotion only. */
1003 if ((GET_MODE (src) != DImode
1004 && !CONST_INT_P (src))
1005 || GET_MODE (dst) != DImode)
1006 return false;
1008 if (!REG_P (dst) && !MEM_P (dst))
1009 return false;
1011 switch (GET_CODE (src))
1013 case ASHIFTRT:
1014 if (!TARGET_AVX512VL)
1015 return false;
1016 /* FALLTHRU */
1018 case ASHIFT:
1019 case LSHIFTRT:
1020 if (!REG_P (XEXP (src, 1))
1021 && (!SUBREG_P (XEXP (src, 1))
1022 || SUBREG_BYTE (XEXP (src, 1)) != 0
1023 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1024 && (!CONST_INT_P (XEXP (src, 1))
1025 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1026 return false;
1028 if (GET_MODE (XEXP (src, 1)) != QImode
1029 && !CONST_INT_P (XEXP (src, 1)))
1030 return false;
1031 break;
1033 case PLUS:
1034 case MINUS:
1035 case IOR:
1036 case XOR:
1037 case AND:
1038 if (!REG_P (XEXP (src, 1))
1039 && !MEM_P (XEXP (src, 1))
1040 && !CONST_INT_P (XEXP (src, 1)))
1041 return false;
1043 if (GET_MODE (XEXP (src, 1)) != DImode
1044 && !CONST_INT_P (XEXP (src, 1)))
1045 return false;
1046 break;
1048 case NEG:
1049 case NOT:
1050 break;
1052 case REG:
1053 return true;
1055 case MEM:
1056 case CONST_INT:
1057 return REG_P (dst);
1059 default:
1060 return false;
1063 if (!REG_P (XEXP (src, 0))
1064 && !MEM_P (XEXP (src, 0))
1065 && !CONST_INT_P (XEXP (src, 0))
1066 /* Check for andnot case. */
1067 && (GET_CODE (src) != AND
1068 || GET_CODE (XEXP (src, 0)) != NOT
1069 || !REG_P (XEXP (XEXP (src, 0), 0))))
1070 return false;
1072 if (GET_MODE (XEXP (src, 0)) != DImode
1073 && !CONST_INT_P (XEXP (src, 0)))
1074 return false;
1076 return true;
1079 /* The TImode version of scalar_to_vector_candidate_p. */
1081 static bool
1082 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1084 rtx def_set = single_set (insn);
1086 if (!def_set)
1087 return false;
1089 if (has_non_address_hard_reg (insn))
1090 return false;
1092 rtx src = SET_SRC (def_set);
1093 rtx dst = SET_DEST (def_set);
1095 /* Only TImode loads and stores are allowed. */
1096 if (GET_MODE (dst) != TImode)
1097 return false;
1099 if (MEM_P (dst))
1101 /* Check for a store. The memory must be aligned, or unaligned stores
1102 must be optimal on the target. Only support stores from a register, a
1103 standard SSE constant, or a CONST_WIDE_INT generated from a piecewise store.
1105 ??? Verify performance impact before enabling CONST_INT for
1106 __int128 store. */
1107 if (misaligned_operand (dst, TImode)
1108 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1109 return false;
1111 switch (GET_CODE (src))
1113 default:
1114 return false;
1116 case REG:
1117 case CONST_WIDE_INT:
1118 return true;
1120 case CONST_INT:
1121 return standard_sse_constant_p (src, TImode);
1124 else if (MEM_P (src))
1126 /* Check for a load. The memory must be aligned, or unaligned loads
1127 must be optimal on the target. */
1128 return (REG_P (dst)
1129 && (!misaligned_operand (src, TImode)
1130 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1133 return false;
1136 /* Return 1 if INSN may be converted into a vector
1137 instruction. */
1139 static bool
1140 scalar_to_vector_candidate_p (rtx_insn *insn)
1142 if (TARGET_64BIT)
1143 return timode_scalar_to_vector_candidate_p (insn);
1144 else
1145 return dimode_scalar_to_vector_candidate_p (insn);
1148 /* The DImode version of remove_non_convertible_regs. */
1150 static void
1151 dimode_remove_non_convertible_regs (bitmap candidates)
1153 bitmap_iterator bi;
1154 unsigned id;
1155 bitmap regs = BITMAP_ALLOC (NULL);
1157 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1159 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1160 rtx reg = SET_DEST (def_set);
1162 if (!REG_P (reg)
1163 || bitmap_bit_p (regs, REGNO (reg))
1164 || HARD_REGISTER_P (reg))
1165 continue;
1167 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1168 def;
1169 def = DF_REF_NEXT_REG (def))
1171 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1173 if (dump_file)
1174 fprintf (dump_file,
1175 "r%d has non convertible definition in insn %d\n",
1176 REGNO (reg), DF_REF_INSN_UID (def));
1178 bitmap_set_bit (regs, REGNO (reg));
1179 break;
1184 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1186 for (df_ref def = DF_REG_DEF_CHAIN (id);
1187 def;
1188 def = DF_REF_NEXT_REG (def))
1189 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1191 if (dump_file)
1192 fprintf (dump_file, "Removing insn %d from candidates list\n",
1193 DF_REF_INSN_UID (def));
1195 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1199 BITMAP_FREE (regs);
1202 /* For a register REGNO, scan instructions for its defs and uses.
1203 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1205 static void
1206 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1207 unsigned int regno)
1209 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1210 def;
1211 def = DF_REF_NEXT_REG (def))
1213 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1215 if (dump_file)
1216 fprintf (dump_file,
1217 "r%d has non convertible def in insn %d\n",
1218 regno, DF_REF_INSN_UID (def));
1220 bitmap_set_bit (regs, regno);
1221 break;
1225 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1226 ref;
1227 ref = DF_REF_NEXT_REG (ref))
1229 /* Debug instructions are skipped. */
1230 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1231 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1233 if (dump_file)
1234 fprintf (dump_file,
1235 "r%d has non convertible use in insn %d\n",
1236 regno, DF_REF_INSN_UID (ref));
1238 bitmap_set_bit (regs, regno);
1239 break;
1244 /* The TImode version of remove_non_convertible_regs. */
1246 static void
1247 timode_remove_non_convertible_regs (bitmap candidates)
1249 bitmap_iterator bi;
1250 unsigned id;
1251 bitmap regs = BITMAP_ALLOC (NULL);
1253 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1255 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1256 rtx dest = SET_DEST (def_set);
1257 rtx src = SET_SRC (def_set);
1259 if ((!REG_P (dest)
1260 || bitmap_bit_p (regs, REGNO (dest))
1261 || HARD_REGISTER_P (dest))
1262 && (!REG_P (src)
1263 || bitmap_bit_p (regs, REGNO (src))
1264 || HARD_REGISTER_P (src)))
1265 continue;
1267 if (REG_P (dest))
1268 timode_check_non_convertible_regs (candidates, regs,
1269 REGNO (dest));
1271 if (REG_P (src))
1272 timode_check_non_convertible_regs (candidates, regs,
1273 REGNO (src));
1276 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1278 for (df_ref def = DF_REG_DEF_CHAIN (id);
1279 def;
1280 def = DF_REF_NEXT_REG (def))
1281 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1283 if (dump_file)
1284 fprintf (dump_file, "Removing insn %d from candidates list\n",
1285 DF_REF_INSN_UID (def));
1287 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1290 for (df_ref ref = DF_REG_USE_CHAIN (id);
1291 ref;
1292 ref = DF_REF_NEXT_REG (ref))
1293 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1295 if (dump_file)
1296 fprintf (dump_file, "Removing insn %d from candidates list\n",
1297 DF_REF_INSN_UID (ref));
1299 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1303 BITMAP_FREE (regs);
1306 /* For a given bitmap of insn UIDs scan all instructions and
1307 remove an insn from CANDIDATES in case it has both convertible
1308 and non-convertible definitions.
1310 All insns in a bitmap are conversion candidates according to
1311 scalar_to_vector_candidate_p. Currently it implies all insns
1312 are single_set. */
1314 static void
1315 remove_non_convertible_regs (bitmap candidates)
1317 if (TARGET_64BIT)
1318 timode_remove_non_convertible_regs (candidates);
1319 else
1320 dimode_remove_non_convertible_regs (candidates);
1323 class scalar_chain
1325 public:
1326 scalar_chain ();
1327 virtual ~scalar_chain ();
1329 static unsigned max_id;
1331 /* ID of a chain. */
1332 unsigned int chain_id;
1333 /* A queue of instructions to be included into a chain. */
1334 bitmap queue;
1335 /* Instructions included into a chain. */
1336 bitmap insns;
1337 /* All registers defined by a chain. */
1338 bitmap defs;
1339 /* Registers used in both vector and scalar modes. */
1340 bitmap defs_conv;
1342 void build (bitmap candidates, unsigned insn_uid);
1343 virtual int compute_convert_gain () = 0;
1344 int convert ();
1346 protected:
1347 void add_to_queue (unsigned insn_uid);
1348 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1350 private:
1351 void add_insn (bitmap candidates, unsigned insn_uid);
1352 void analyze_register_chain (bitmap candidates, df_ref ref);
1353 virtual void mark_dual_mode_def (df_ref def) = 0;
1354 virtual void convert_insn (rtx_insn *insn) = 0;
1355 virtual void convert_registers () = 0;
1358 class dimode_scalar_chain : public scalar_chain
1360 public:
1361 int compute_convert_gain ();
1362 private:
1363 void mark_dual_mode_def (df_ref def);
1364 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1365 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1366 void convert_insn (rtx_insn *insn);
1367 void convert_op (rtx *op, rtx_insn *insn);
1368 void convert_reg (unsigned regno);
1369 void make_vector_copies (unsigned regno);
1370 void convert_registers ();
1371 int vector_const_cost (rtx exp);
1374 class timode_scalar_chain : public scalar_chain
1376 public:
1377 /* Converting from TImode to V1TImode is always faster. */
1378 int compute_convert_gain () { return 1; }
1380 private:
1381 void mark_dual_mode_def (df_ref def);
1382 void fix_debug_reg_uses (rtx reg);
1383 void convert_insn (rtx_insn *insn);
1384 /* We don't convert registers to a different size. */
1385 void convert_registers () {}
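/* Illustrative note, not part of the original sources: these chains implement
   the scalar-to-vector (STV) conversion.  In 32-bit code a DImode statement
   such as
     a = b | c;
   would otherwise need two 32-bit OR instructions; if every def and use in
   the chain is convertible and compute_convert_gain reports a positive gain,
   the whole chain is rewritten to operate on V2DImode subregs (a single POR)
   instead.  */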
1388 unsigned scalar_chain::max_id = 0;
1390 /* Initialize new chain. */
1392 scalar_chain::scalar_chain ()
1394 chain_id = ++max_id;
1396 if (dump_file)
1397 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1399 bitmap_obstack_initialize (NULL);
1400 insns = BITMAP_ALLOC (NULL);
1401 defs = BITMAP_ALLOC (NULL);
1402 defs_conv = BITMAP_ALLOC (NULL);
1403 queue = NULL;
1406 /* Free chain's data. */
1408 scalar_chain::~scalar_chain ()
1410 BITMAP_FREE (insns);
1411 BITMAP_FREE (defs);
1412 BITMAP_FREE (defs_conv);
1413 bitmap_obstack_release (NULL);
1416 /* Add instruction into chains' queue. */
1418 void
1419 scalar_chain::add_to_queue (unsigned insn_uid)
1421 if (bitmap_bit_p (insns, insn_uid)
1422 || bitmap_bit_p (queue, insn_uid))
1423 return;
1425 if (dump_file)
1426 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1427 insn_uid, chain_id);
1428 bitmap_set_bit (queue, insn_uid);
1431 /* For DImode conversion, mark register defined by DEF as requiring
1432 conversion. */
1434 void
1435 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1437 gcc_assert (DF_REF_REG_DEF_P (def));
1439 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1440 return;
1442 if (dump_file)
1443 fprintf (dump_file,
1444 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1445 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1447 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1450 /* For TImode conversion, it is unused. */
1452 void
1453 timode_scalar_chain::mark_dual_mode_def (df_ref)
1455 gcc_unreachable ();
1458 /* Check REF's chain to add new insns into a queue
1459 and find registers requiring conversion. */
1461 void
1462 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1464 df_link *chain;
1466 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1467 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1468 add_to_queue (DF_REF_INSN_UID (ref));
1470 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1472 unsigned uid = DF_REF_INSN_UID (chain->ref);
1474 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1475 continue;
1477 if (!DF_REF_REG_MEM_P (chain->ref))
1479 if (bitmap_bit_p (insns, uid))
1480 continue;
1482 if (bitmap_bit_p (candidates, uid))
1484 add_to_queue (uid);
1485 continue;
1489 if (DF_REF_REG_DEF_P (chain->ref))
1491 if (dump_file)
1492 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1493 DF_REF_REGNO (chain->ref), uid);
1494 mark_dual_mode_def (chain->ref);
1496 else
1498 if (dump_file)
1499 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1500 DF_REF_REGNO (chain->ref), uid);
1501 mark_dual_mode_def (ref);
1506 /* Add instruction into a chain. */
1508 void
1509 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1511 if (bitmap_bit_p (insns, insn_uid))
1512 return;
1514 if (dump_file)
1515 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1517 bitmap_set_bit (insns, insn_uid);
1519 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1520 rtx def_set = single_set (insn);
1521 if (def_set && REG_P (SET_DEST (def_set))
1522 && !HARD_REGISTER_P (SET_DEST (def_set)))
1523 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1525 df_ref ref;
1526 df_ref def;
1527 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1528 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1529 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1530 def;
1531 def = DF_REF_NEXT_REG (def))
1532 analyze_register_chain (candidates, def);
1533 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1534 if (!DF_REF_REG_MEM_P (ref))
1535 analyze_register_chain (candidates, ref);
1538 /* Build new chain starting from insn INSN_UID recursively
1539 adding all dependent uses and definitions. */
1541 void
1542 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1544 queue = BITMAP_ALLOC (NULL);
1545 bitmap_set_bit (queue, insn_uid);
1547 if (dump_file)
1548 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1550 while (!bitmap_empty_p (queue))
1552 insn_uid = bitmap_first_set_bit (queue);
1553 bitmap_clear_bit (queue, insn_uid);
1554 bitmap_clear_bit (candidates, insn_uid);
1555 add_insn (candidates, insn_uid);
1558 if (dump_file)
1560 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1561 fprintf (dump_file, " insns: ");
1562 dump_bitmap (dump_file, insns);
1563 if (!bitmap_empty_p (defs_conv))
1565 bitmap_iterator bi;
1566 unsigned id;
1567 const char *comma = "";
1568 fprintf (dump_file, " defs to convert: ");
1569 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1571 fprintf (dump_file, "%sr%d", comma, id);
1572 comma = ", ";
1574 fprintf (dump_file, "\n");
1578 BITMAP_FREE (queue);
1581 /* Return the cost of building a vector constant
1582 instead of using a scalar one. */
1585 dimode_scalar_chain::vector_const_cost (rtx exp)
1587 gcc_assert (CONST_INT_P (exp));
1589 if (standard_sse_constant_p (exp, V2DImode))
1590 return COSTS_N_INSNS (1);
1591 return ix86_cost->sse_load[1];
1594 /* Compute a gain for chain conversion. */
1597 dimode_scalar_chain::compute_convert_gain ()
1599 bitmap_iterator bi;
1600 unsigned insn_uid;
1601 int gain = 0;
1602 int cost = 0;
1604 if (dump_file)
1605 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1607 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1609 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1610 rtx def_set = single_set (insn);
1611 rtx src = SET_SRC (def_set);
1612 rtx dst = SET_DEST (def_set);
1614 if (REG_P (src) && REG_P (dst))
1615 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1616 else if (REG_P (src) && MEM_P (dst))
1617 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1618 else if (MEM_P (src) && REG_P (dst))
1619 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1620 else if (GET_CODE (src) == ASHIFT
1621 || GET_CODE (src) == ASHIFTRT
1622 || GET_CODE (src) == LSHIFTRT)
1624 if (CONST_INT_P (XEXP (src, 0)))
1625 gain -= vector_const_cost (XEXP (src, 0));
1626 if (CONST_INT_P (XEXP (src, 1)))
1628 gain += ix86_cost->shift_const;
1629 if (INTVAL (XEXP (src, 1)) >= 32)
1630 gain -= COSTS_N_INSNS (1);
1632 else
1633 /* Additional gain for omitting two CMOVs. */
1634 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1636 else if (GET_CODE (src) == PLUS
1637 || GET_CODE (src) == MINUS
1638 || GET_CODE (src) == IOR
1639 || GET_CODE (src) == XOR
1640 || GET_CODE (src) == AND)
1642 gain += ix86_cost->add;
1643 /* Additional gain for andnot for targets without BMI. */
1644 if (GET_CODE (XEXP (src, 0)) == NOT
1645 && !TARGET_BMI)
1646 gain += 2 * ix86_cost->add;
1648 if (CONST_INT_P (XEXP (src, 0)))
1649 gain -= vector_const_cost (XEXP (src, 0));
1650 if (CONST_INT_P (XEXP (src, 1)))
1651 gain -= vector_const_cost (XEXP (src, 1));
1653 else if (GET_CODE (src) == NEG
1654 || GET_CODE (src) == NOT)
1655 gain += ix86_cost->add - COSTS_N_INSNS (1);
1656 else if (GET_CODE (src) == COMPARE)
1658 /* Assume comparison cost is the same. */
1660 else if (CONST_INT_P (src))
1662 if (REG_P (dst))
1663 gain += COSTS_N_INSNS (2);
1664 else if (MEM_P (dst))
1665 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1666 gain -= vector_const_cost (src);
1668 else
1669 gcc_unreachable ();
1672 if (dump_file)
1673 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1675 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1676 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1678 if (dump_file)
1679 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1681 gain -= cost;
1683 if (dump_file)
1684 fprintf (dump_file, " Total gain: %d\n", gain);
1686 return gain;
1689 /* Replace REG in X with a V2DI subreg of NEW_REG. */
1692 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1694 if (x == reg)
1695 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1697 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1698 int i, j;
1699 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1701 if (fmt[i] == 'e')
1702 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1703 else if (fmt[i] == 'E')
1704 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1705 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1706 reg, new_reg);
1709 return x;
1712 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1714 void
1715 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1716 rtx reg, rtx new_reg)
1718 replace_with_subreg (single_set (insn), reg, new_reg);
1721 /* Insert generated conversion instruction sequence INSNS
1722 after instruction AFTER. A new BB may be required in case the
1723 instruction has an EH region attached. */
1725 void
1726 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1728 if (!control_flow_insn_p (after))
1730 emit_insn_after (insns, after);
1731 return;
1734 basic_block bb = BLOCK_FOR_INSN (after);
1735 edge e = find_fallthru_edge (bb->succs);
1736 gcc_assert (e);
1738 basic_block new_bb = split_edge (e);
1739 emit_insn_after (insns, BB_HEAD (new_bb));
1742 /* Make vector copies for all definitions of register REGNO
1743 and replace its uses in a chain. */
1745 void
1746 dimode_scalar_chain::make_vector_copies (unsigned regno)
1748 rtx reg = regno_reg_rtx[regno];
1749 rtx vreg = gen_reg_rtx (DImode);
1750 bool count_reg = false;
1751 df_ref ref;
1753 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1754 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1756 df_ref use;
1758 /* Detect the count register of a shift instruction. */
1759 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1760 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1762 rtx_insn *insn = DF_REF_INSN (use);
1763 rtx def_set = single_set (insn);
1765 gcc_assert (def_set);
1767 rtx src = SET_SRC (def_set);
1769 if ((GET_CODE (src) == ASHIFT
1770 || GET_CODE (src) == ASHIFTRT
1771 || GET_CODE (src) == LSHIFTRT)
1772 && !CONST_INT_P (XEXP (src, 1))
1773 && reg_or_subregno (XEXP (src, 1)) == regno)
1774 count_reg = true;
1777 start_sequence ();
1778 if (count_reg)
1780 rtx qreg = gen_lowpart (QImode, reg);
1781 rtx tmp = gen_reg_rtx (SImode);
1783 if (TARGET_ZERO_EXTEND_WITH_AND
1784 && optimize_function_for_speed_p (cfun))
1786 emit_move_insn (tmp, const0_rtx);
1787 emit_insn (gen_movstrictqi
1788 (gen_lowpart (QImode, tmp), qreg));
1790 else
1791 emit_insn (gen_rtx_SET
1792 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1794 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1796 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1797 emit_move_insn (slot, tmp);
1798 tmp = copy_rtx (slot);
1801 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1803 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1805 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1806 emit_move_insn (adjust_address (tmp, SImode, 0),
1807 gen_rtx_SUBREG (SImode, reg, 0));
1808 emit_move_insn (adjust_address (tmp, SImode, 4),
1809 gen_rtx_SUBREG (SImode, reg, 4));
1810 emit_move_insn (vreg, tmp);
1812 else if (TARGET_SSE4_1)
1814 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1815 CONST0_RTX (V4SImode),
1816 gen_rtx_SUBREG (SImode, reg, 0)));
1817 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1818 gen_rtx_SUBREG (V4SImode, vreg, 0),
1819 gen_rtx_SUBREG (SImode, reg, 4),
1820 GEN_INT (2)));
1822 else
1824 rtx tmp = gen_reg_rtx (DImode);
1825 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1826 CONST0_RTX (V4SImode),
1827 gen_rtx_SUBREG (SImode, reg, 0)));
1828 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1829 CONST0_RTX (V4SImode),
1830 gen_rtx_SUBREG (SImode, reg, 4)));
1831 emit_insn (gen_vec_interleave_lowv4si
1832 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1833 gen_rtx_SUBREG (V4SImode, vreg, 0),
1834 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1836 rtx_insn *seq = get_insns ();
1837 end_sequence ();
1838 rtx_insn *insn = DF_REF_INSN (ref);
1839 emit_conversion_insns (seq, insn);
1841 if (dump_file)
1842 fprintf (dump_file,
1843 " Copied r%d to a vector register r%d for insn %d\n",
1844 regno, REGNO (vreg), INSN_UID (insn));
1847 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1848 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1850 rtx_insn *insn = DF_REF_INSN (ref);
1851 if (count_reg)
1853 rtx def_set = single_set (insn);
1854 gcc_assert (def_set);
1856 rtx src = SET_SRC (def_set);
1858 if ((GET_CODE (src) == ASHIFT
1859 || GET_CODE (src) == ASHIFTRT
1860 || GET_CODE (src) == LSHIFTRT)
1861 && !CONST_INT_P (XEXP (src, 1))
1862 && reg_or_subregno (XEXP (src, 1)) == regno)
1863 XEXP (src, 1) = vreg;
1865 else
1866 replace_with_subreg_in_insn (insn, reg, vreg);
1868 if (dump_file)
1869 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1870 regno, REGNO (vreg), INSN_UID (insn));
1874 /* Convert all definitions of register REGNO
1875 and fix its uses. Scalar copies may be created
1876 in case the register is used in a non-convertible insn. */
1878 void
1879 dimode_scalar_chain::convert_reg (unsigned regno)
1881 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1882 rtx reg = regno_reg_rtx[regno];
1883 rtx scopy = NULL_RTX;
1884 df_ref ref;
1885 bitmap conv;
1887 conv = BITMAP_ALLOC (NULL);
1888 bitmap_copy (conv, insns);
1890 if (scalar_copy)
1891 scopy = gen_reg_rtx (DImode);
1893 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1895 rtx_insn *insn = DF_REF_INSN (ref);
1896 rtx def_set = single_set (insn);
1897 rtx src = SET_SRC (def_set);
1898 rtx reg = DF_REF_REG (ref);
1900 if (!MEM_P (src))
1902 replace_with_subreg_in_insn (insn, reg, reg);
1903 bitmap_clear_bit (conv, INSN_UID (insn));
1906 if (scalar_copy)
1908 start_sequence ();
1909 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1911 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1912 emit_move_insn (tmp, reg);
1913 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1914 adjust_address (tmp, SImode, 0));
1915 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1916 adjust_address (tmp, SImode, 4));
1918 else if (TARGET_SSE4_1)
1920 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1921 emit_insn
1922 (gen_rtx_SET
1923 (gen_rtx_SUBREG (SImode, scopy, 0),
1924 gen_rtx_VEC_SELECT (SImode,
1925 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1927 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1928 emit_insn
1929 (gen_rtx_SET
1930 (gen_rtx_SUBREG (SImode, scopy, 4),
1931 gen_rtx_VEC_SELECT (SImode,
1932 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1934 else
1936 rtx vcopy = gen_reg_rtx (V2DImode);
1937 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1938 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1939 gen_rtx_SUBREG (SImode, vcopy, 0));
1940 emit_move_insn (vcopy,
1941 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1942 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1943 gen_rtx_SUBREG (SImode, vcopy, 0));
1945 rtx_insn *seq = get_insns ();
1946 end_sequence ();
1947 emit_conversion_insns (seq, insn);
1949 if (dump_file)
1950 fprintf (dump_file,
1951 " Copied r%d to a scalar register r%d for insn %d\n",
1952 regno, REGNO (scopy), INSN_UID (insn));
1956 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1957 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1959 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1961 rtx_insn *insn = DF_REF_INSN (ref);
1963 rtx def_set = single_set (insn);
1964 gcc_assert (def_set);
1966 rtx src = SET_SRC (def_set);
1967 rtx dst = SET_DEST (def_set);
1969 if ((GET_CODE (src) == ASHIFT
1970 || GET_CODE (src) == ASHIFTRT
1971 || GET_CODE (src) == LSHIFTRT)
1972 && !CONST_INT_P (XEXP (src, 1))
1973 && reg_or_subregno (XEXP (src, 1)) == regno)
1975 rtx tmp2 = gen_reg_rtx (V2DImode);
1977 start_sequence ();
1979 if (TARGET_SSE4_1)
1980 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1981 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1982 else
1984 rtx vec_cst
1985 = gen_rtx_CONST_VECTOR (V2DImode,
1986 gen_rtvec (2, GEN_INT (0xff),
1987 const0_rtx));
1988 vec_cst
1989 = validize_mem (force_const_mem (V2DImode, vec_cst));
1991 emit_insn (gen_rtx_SET
1992 (tmp2,
1993 gen_rtx_AND (V2DImode,
1994 gen_rtx_SUBREG (V2DImode, reg, 0),
1995 vec_cst)));
1997 rtx_insn *seq = get_insns ();
1998 end_sequence ();
2000 emit_insn_before (seq, insn);
2002 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
2004 else if (!MEM_P (dst) || !REG_P (src))
2005 replace_with_subreg_in_insn (insn, reg, reg);
2007 bitmap_clear_bit (conv, INSN_UID (insn));
2010 /* Skip debug insns and uninitialized uses. */
2011 else if (DF_REF_CHAIN (ref)
2012 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2014 gcc_assert (scopy);
2015 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2016 df_insn_rescan (DF_REF_INSN (ref));
2019 BITMAP_FREE (conv);
2022 /* Convert operand OP in INSN. We should handle
2023 memory operands and uninitialized registers.
2024 All other register uses are converted during
2025 register conversion. */
2027 void
2028 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2030 *op = copy_rtx_if_shared (*op);
2032 if (GET_CODE (*op) == NOT)
2034 convert_op (&XEXP (*op, 0), insn);
2035 PUT_MODE (*op, V2DImode);
2037 else if (MEM_P (*op))
2039 rtx tmp = gen_reg_rtx (DImode);
2041 emit_insn_before (gen_move_insn (tmp, *op), insn);
2042 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2044 if (dump_file)
2045 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2046 INSN_UID (insn), REGNO (tmp));
2048 else if (REG_P (*op))
2050 /* We may not have converted this register use in case
2051 the register has no definition. Otherwise it
2052 should have been converted by convert_reg. */
2053 df_ref ref;
2054 FOR_EACH_INSN_USE (ref, insn)
2055 if (DF_REF_REGNO (ref) == REGNO (*op))
2057 gcc_assert (!DF_REF_CHAIN (ref));
2058 break;
2060 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
2062 else if (CONST_INT_P (*op))
2064 rtx vec_cst;
2065 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2067 /* Prefer all ones vector in case of -1. */
2068 if (constm1_operand (*op, GET_MODE (*op)))
2069 vec_cst = CONSTM1_RTX (V2DImode);
2070 else
2071 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2072 gen_rtvec (2, *op, const0_rtx));
2074 if (!standard_sse_constant_p (vec_cst, V2DImode))
2076 start_sequence ();
2077 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2078 rtx_insn *seq = get_insns ();
2079 end_sequence ();
2080 emit_insn_before (seq, insn);
2083 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2084 *op = tmp;
2086 else
2088 gcc_assert (SUBREG_P (*op));
2089 gcc_assert (GET_MODE (*op) == V2DImode);
2093 /* Convert INSN to vector mode. */
2095 void
2096 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2098 rtx def_set = single_set (insn);
2099 rtx src = SET_SRC (def_set);
2100 rtx dst = SET_DEST (def_set);
2101 rtx subreg;
2103 if (MEM_P (dst) && !REG_P (src))
2105 /* There is no vector instruction that can store the result directly
2106 to the scalar memory destination, therefore a temporary register is required. */
2107 rtx tmp = gen_reg_rtx (DImode);
2108 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2109 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2112 switch (GET_CODE (src))
2114 case ASHIFT:
2115 case ASHIFTRT:
2116 case LSHIFTRT:
2117 convert_op (&XEXP (src, 0), insn);
2118 PUT_MODE (src, V2DImode);
2119 break;
2121 case PLUS:
2122 case MINUS:
2123 case IOR:
2124 case XOR:
2125 case AND:
2126 convert_op (&XEXP (src, 0), insn);
2127 convert_op (&XEXP (src, 1), insn);
2128 PUT_MODE (src, V2DImode);
2129 break;
2131 case NEG:
2132 src = XEXP (src, 0);
2133 convert_op (&src, insn);
2134 subreg = gen_reg_rtx (V2DImode);
2135 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2136 src = gen_rtx_MINUS (V2DImode, subreg, src);
2137 break;
2139 case NOT:
2140 src = XEXP (src, 0);
2141 convert_op (&src, insn);
2142 subreg = gen_reg_rtx (V2DImode);
2143 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2144 src = gen_rtx_XOR (V2DImode, src, subreg);
2145 break;
2147 case MEM:
2148 if (!REG_P (dst))
2149 convert_op (&src, insn);
2150 break;
2152 case REG:
2153 if (!MEM_P (dst))
2154 convert_op (&src, insn);
2155 break;
2157 case SUBREG:
2158 gcc_assert (GET_MODE (src) == V2DImode);
2159 break;
2161 case COMPARE:
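/* The DImode value lives in the low quadword of the vector register;
   the punpcklqdq emitted below duplicates it into the high quadword so
   that the following ptest (which tests all 128 bits) reflects only the
   original DImode value and sets ZF exactly when that value is zero. */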
2162 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2164 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2165 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2167 if (REG_P (src))
2168 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2169 else
2170 subreg = copy_rtx_if_shared (src);
2171 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2172 copy_rtx_if_shared (subreg),
2173 copy_rtx_if_shared (subreg)),
2174 insn);
2175 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2176 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2177 copy_rtx_if_shared (src)),
2178 UNSPEC_PTEST);
2179 break;
2181 case CONST_INT:
2182 convert_op (&src, insn);
2183 break;
2185 default:
2186 gcc_unreachable ();
2189 SET_SRC (def_set) = src;
2190 SET_DEST (def_set) = dst;
2192 /* Drop possible dead definitions. */
2193 PATTERN (insn) = def_set;
2195 INSN_CODE (insn) = -1;
2196 recog_memoized (insn);
2197 df_insn_rescan (insn);
2200 /* Fix uses of converted REG in debug insns. */
2202 void
2203 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2205 if (!flag_var_tracking)
2206 return;
2208 df_ref ref, next;
2209 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2211 rtx_insn *insn = DF_REF_INSN (ref);
2212 /* Make sure the next ref is for a different instruction,
2213 so that we're not affected by the rescan. */
2214 next = DF_REF_NEXT_REG (ref);
2215 while (next && DF_REF_INSN (next) == insn)
2216 next = DF_REF_NEXT_REG (next);
2218 if (DEBUG_INSN_P (insn))
2220 /* It may be a debug insn with a TImode variable in
2221 a register. */
2222 bool changed = false;
2223 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2225 rtx *loc = DF_REF_LOC (ref);
2226 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2228 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2229 changed = true;
2232 if (changed)
2233 df_insn_rescan (insn);
2238 /* Convert INSN from TImode to V1TImode. */
2240 void
2241 timode_scalar_chain::convert_insn (rtx_insn *insn)
2243 rtx def_set = single_set (insn);
2244 rtx src = SET_SRC (def_set);
2245 rtx dst = SET_DEST (def_set);
2247 switch (GET_CODE (dst))
2249 case REG:
2251 rtx tmp = find_reg_equal_equiv_note (insn);
2252 if (tmp)
2253 PUT_MODE (XEXP (tmp, 0), V1TImode);
2254 PUT_MODE (dst, V1TImode);
2255 fix_debug_reg_uses (dst);
2257 break;
2258 case MEM:
2259 PUT_MODE (dst, V1TImode);
2260 break;
2262 default:
2263 gcc_unreachable ();
2266 switch (GET_CODE (src))
2268 case REG:
2269 PUT_MODE (src, V1TImode);
2270 /* Call fix_debug_reg_uses only if SRC is never defined. */
2271 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2272 fix_debug_reg_uses (src);
2273 break;
2275 case MEM:
2276 PUT_MODE (src, V1TImode);
2277 break;
2279 case CONST_WIDE_INT:
2280 if (NONDEBUG_INSN_P (insn))
2282 /* Since there is no instruction to store a 128-bit constant,
2283 a temporary register is required. */
2284 rtx tmp = gen_reg_rtx (V1TImode);
2285 start_sequence ();
2286 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2287 src = validize_mem (force_const_mem (V1TImode, src));
2288 rtx_insn *seq = get_insns ();
2289 end_sequence ();
2290 if (seq)
2291 emit_insn_before (seq, insn);
2292 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2293 dst = tmp;
2295 break;
2297 case CONST_INT:
2298 switch (standard_sse_constant_p (src, TImode))
2300 case 1:
2301 src = CONST0_RTX (GET_MODE (dst));
2302 break;
2303 case 2:
2304 src = CONSTM1_RTX (GET_MODE (dst));
2305 break;
2306 default:
2307 gcc_unreachable ();
2309 if (NONDEBUG_INSN_P (insn))
2311 rtx tmp = gen_reg_rtx (V1TImode);
2312 /* Since there is no instruction to store a standard SSE
2313 constant, a temporary register is required. */
2314 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2315 dst = tmp;
2317 break;
2319 default:
2320 gcc_unreachable ();
2323 SET_SRC (def_set) = src;
2324 SET_DEST (def_set) = dst;
2326 /* Drop possible dead definitions. */
2327 PATTERN (insn) = def_set;
2329 INSN_CODE (insn) = -1;
2330 recog_memoized (insn);
2331 df_insn_rescan (insn);
2334 void
2335 dimode_scalar_chain::convert_registers ()
2337 bitmap_iterator bi;
2338 unsigned id;
2340 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2341 convert_reg (id);
2343 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2344 make_vector_copies (id);
2347 /* Convert the whole chain, creating the required register
2348 conversions and copies. */
2351 scalar_chain::convert ()
2353 bitmap_iterator bi;
2354 unsigned id;
2355 int converted_insns = 0;
2357 if (!dbg_cnt (stv_conversion))
2358 return 0;
2360 if (dump_file)
2361 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2363 convert_registers ();
2365 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2367 convert_insn (DF_INSN_UID_GET (id)->insn);
2368 converted_insns++;
2371 return converted_insns;
2374 /* Main STV pass function. Find and convert scalar
2375 instructions into vector mode when profitable. */
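/* Illustrative sketch (not part of the pass itself): in 32-bit code a
   DImode chain such as

     long long f (long long a, long long b) { return a & b; }

   would normally be split into two SImode ANDs; when
   compute_convert_gain () is positive, the chain is instead rewritten as
   a single V2DImode AND on an SSE register, with the scalar<->vector
   copies at the chain boundaries created by make_vector_copies and
   convert_reg above. */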
2377 static unsigned int
2378 convert_scalars_to_vector ()
2380 basic_block bb;
2381 bitmap candidates;
2382 int converted_insns = 0;
2384 bitmap_obstack_initialize (NULL);
2385 candidates = BITMAP_ALLOC (NULL);
2387 calculate_dominance_info (CDI_DOMINATORS);
2388 df_set_flags (DF_DEFER_INSN_RESCAN);
2389 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2390 df_md_add_problem ();
2391 df_analyze ();
2393 /* Find all instructions we want to convert into vector mode. */
2394 if (dump_file)
2395 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2397 FOR_EACH_BB_FN (bb, cfun)
2399 rtx_insn *insn;
2400 FOR_BB_INSNS (bb, insn)
2401 if (scalar_to_vector_candidate_p (insn))
2403 if (dump_file)
2404 fprintf (dump_file, " insn %d is marked as a candidate\n",
2405 INSN_UID (insn));
2407 bitmap_set_bit (candidates, INSN_UID (insn));
2411 remove_non_convertible_regs (candidates);
2413 if (bitmap_empty_p (candidates))
2414 if (dump_file)
2415 fprintf (dump_file, "There are no candidates for optimization.\n");
2417 while (!bitmap_empty_p (candidates))
2419 unsigned uid = bitmap_first_set_bit (candidates);
2420 scalar_chain *chain;
2422 if (TARGET_64BIT)
2423 chain = new timode_scalar_chain;
2424 else
2425 chain = new dimode_scalar_chain;
2427 /* Find the instruction chain we want to convert to vector mode.
2428 Check all uses and definitions to estimate all required
2429 conversions. */
2430 chain->build (candidates, uid);
2432 if (chain->compute_convert_gain () > 0)
2433 converted_insns += chain->convert ();
2434 else
2435 if (dump_file)
2436 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2437 chain->chain_id);
2439 delete chain;
2442 if (dump_file)
2443 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2445 BITMAP_FREE (candidates);
2446 bitmap_obstack_release (NULL);
2447 df_process_deferred_rescans ();
2449 /* Conversion means we may have 128-bit register spills/fills
2450 which require an aligned stack. */
2451 if (converted_insns)
2453 if (crtl->stack_alignment_needed < 128)
2454 crtl->stack_alignment_needed = 128;
2455 if (crtl->stack_alignment_estimated < 128)
2456 crtl->stack_alignment_estimated = 128;
2457 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2458 if (TARGET_64BIT)
2459 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2460 parm; parm = DECL_CHAIN (parm))
2462 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2463 continue;
2464 if (DECL_RTL_SET_P (parm)
2465 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2467 rtx r = DECL_RTL (parm);
2468 if (REG_P (r))
2469 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2471 if (DECL_INCOMING_RTL (parm)
2472 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2474 rtx r = DECL_INCOMING_RTL (parm);
2475 if (REG_P (r))
2476 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2481 return 0;
2484 namespace {
2486 const pass_data pass_data_insert_vzeroupper =
2488 RTL_PASS, /* type */
2489 "vzeroupper", /* name */
2490 OPTGROUP_NONE, /* optinfo_flags */
2491 TV_MACH_DEP, /* tv_id */
2492 0, /* properties_required */
2493 0, /* properties_provided */
2494 0, /* properties_destroyed */
2495 0, /* todo_flags_start */
2496 TODO_df_finish, /* todo_flags_finish */
2499 class pass_insert_vzeroupper : public rtl_opt_pass
2501 public:
2502 pass_insert_vzeroupper(gcc::context *ctxt)
2503 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2506 /* opt_pass methods: */
2507 virtual bool gate (function *)
2509 return TARGET_AVX
2510 && TARGET_VZEROUPPER && flag_expensive_optimizations
2511 && !optimize_size;
2514 virtual unsigned int execute (function *)
2516 return rest_of_handle_insert_vzeroupper ();
2519 }; // class pass_insert_vzeroupper
2521 const pass_data pass_data_stv =
2523 RTL_PASS, /* type */
2524 "stv", /* name */
2525 OPTGROUP_NONE, /* optinfo_flags */
2526 TV_MACH_DEP, /* tv_id */
2527 0, /* properties_required */
2528 0, /* properties_provided */
2529 0, /* properties_destroyed */
2530 0, /* todo_flags_start */
2531 TODO_df_finish, /* todo_flags_finish */
2534 class pass_stv : public rtl_opt_pass
2536 public:
2537 pass_stv (gcc::context *ctxt)
2538 : rtl_opt_pass (pass_data_stv, ctxt),
2539 timode_p (false)
2542 /* opt_pass methods: */
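/* Note: this pass is instantiated more than once (see set_pass_param
   below); timode_p selects whether a given copy handles TImode chains
   (64-bit) or DImode chains (32-bit), hence the timode_p ==
   !!TARGET_64BIT test in the gate. */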
2543 virtual bool gate (function *)
2545 return (timode_p == !!TARGET_64BIT
2546 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2549 virtual unsigned int execute (function *)
2551 return convert_scalars_to_vector ();
2554 opt_pass *clone ()
2556 return new pass_stv (m_ctxt);
2559 void set_pass_param (unsigned int n, bool param)
2561 gcc_assert (n == 0);
2562 timode_p = param;
2565 private:
2566 bool timode_p;
2567 }; // class pass_stv
2569 } // anon namespace
2571 rtl_opt_pass *
2572 make_pass_insert_vzeroupper (gcc::context *ctxt)
2574 return new pass_insert_vzeroupper (ctxt);
2577 rtl_opt_pass *
2578 make_pass_stv (gcc::context *ctxt)
2580 return new pass_stv (ctxt);
2583 /* Inserting ENDBRANCH instructions. */
2585 static unsigned int
2586 rest_of_insert_endbranch (void)
2588 timevar_push (TV_MACH_DEP);
2590 rtx cet_eb;
2591 rtx_insn *insn;
2592 basic_block bb;
2594 /* Currently emit an EB (ENDBRANCH) if the function is tracked, i.e.
2595 'nocf_check' is absent from the function attributes. Later an optimization
2596 will be introduced to analyze whether the address of a static function is
2597 taken. A static function whose address is not taken will get a
2598 nocf_check attribute, which will reduce the number of EBs. */
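/* ENDBR32/ENDBR64 marks a valid target for an indirect branch or call
   under CET indirect branch tracking; on processors without CET the
   instruction executes as a NOP, so emitting it unconditionally is
   safe. */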
2600 if (!lookup_attribute ("nocf_check",
2601 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2602 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2604 cet_eb = gen_nop_endbr ();
2606 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2607 insn = BB_HEAD (bb);
2608 emit_insn_before (cet_eb, insn);
2611 bb = 0;
2612 FOR_EACH_BB_FN (bb, cfun)
2614 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2615 insn = NEXT_INSN (insn))
2617 if (CALL_P (insn))
2619 if (find_reg_note (insn, REG_SETJMP, NULL) == NULL)
2620 continue;
2621 /* Generate ENDBRANCH after a CALL that may return more than
2622 once (setjmp-like functions). */
2624 cet_eb = gen_nop_endbr ();
2625 emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn));
2626 continue;
2629 if (JUMP_P (insn) && flag_cet_switch)
2631 rtx target = JUMP_LABEL (insn);
2632 if (target == NULL_RTX || ANY_RETURN_P (target))
2633 continue;
2635 /* Check that the jump goes through a switch table. */
2636 rtx_insn *label = as_a<rtx_insn *> (target);
2637 rtx_insn *table = next_insn (label);
2638 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2639 continue;
2641 /* For the indirect jump find all places it can jump to and insert
2642 an ENDBRANCH there. This is done under a special flag that
2643 controls ENDBRANCH generation for switch statements. */
2644 edge_iterator ei;
2645 edge e;
2646 basic_block dest_blk;
2648 FOR_EACH_EDGE (e, ei, bb->succs)
2650 rtx_insn *insn;
2652 dest_blk = e->dest;
2653 insn = BB_HEAD (dest_blk);
2654 gcc_assert (LABEL_P (insn));
2655 cet_eb = gen_nop_endbr ();
2656 emit_insn_after (cet_eb, insn);
2658 continue;
2661 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2662 || (NOTE_P (insn)
2663 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2664 /* TODO. Check /s bit also. */
2666 cet_eb = gen_nop_endbr ();
2667 emit_insn_after (cet_eb, insn);
2668 continue;
2673 timevar_pop (TV_MACH_DEP);
2674 return 0;
2677 namespace {
2679 const pass_data pass_data_insert_endbranch =
2681 RTL_PASS, /* type. */
2682 "cet", /* name. */
2683 OPTGROUP_NONE, /* optinfo_flags. */
2684 TV_MACH_DEP, /* tv_id. */
2685 0, /* properties_required. */
2686 0, /* properties_provided. */
2687 0, /* properties_destroyed. */
2688 0, /* todo_flags_start. */
2689 0, /* todo_flags_finish. */
2692 class pass_insert_endbranch : public rtl_opt_pass
2694 public:
2695 pass_insert_endbranch (gcc::context *ctxt)
2696 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2699 /* opt_pass methods: */
2700 virtual bool gate (function *)
2702 return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
2705 virtual unsigned int execute (function *)
2707 return rest_of_insert_endbranch ();
2710 }; // class pass_insert_endbranch
2712 } // anon namespace
2714 rtl_opt_pass *
2715 make_pass_insert_endbranch (gcc::context *ctxt)
2717 return new pass_insert_endbranch (ctxt);
2720 /* Return true if the red zone is in use. We can't use the red zone when
2721 there are local indirect jumps, like "indirect_jump" or "tablejump",
2722 which jump to another place in the function, since the "call" in the
2723 indirect thunk pushes the return address onto the stack, destroying
2724 the red zone.
2726 TODO: If we can reserve the first 2 WORDs of the red zone, one for the
2727 PUSH and another for the CALL, we can allow local indirect jumps with
2728 the indirect thunk. */
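/* (Background: the red zone is the 128-byte area below the stack pointer
   that the x86-64 System V ABI guarantees signal and interrupt handlers
   will not clobber, so leaf code may use it without adjusting %rsp; the
   Microsoft x64 ABI provides no such area, hence the TARGET_64BIT_MS_ABI
   check below.) */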
2730 bool
2731 ix86_using_red_zone (void)
2733 return (TARGET_RED_ZONE
2734 && !TARGET_64BIT_MS_ABI
2735 && (!cfun->machine->has_local_indirect_jump
2736 || cfun->machine->indirect_branch_type == indirect_branch_keep));
2739 /* Return a string that documents the current -m options. The caller is
2740 responsible for freeing the string. */
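/* For illustration only (the exact contents depend on the masks passed
   in), the returned string looks something like

     "-march=haswell -mtune=haswell -m64 -mavx2 ... -mfpmath=sse"

   i.e. a space separated list of the -m options currently in effect. */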
2742 static char *
2743 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2744 int flags, int flags2,
2745 const char *arch, const char *tune,
2746 enum fpmath_unit fpmath, bool add_nl_p)
2748 struct ix86_target_opts
2750 const char *option; /* option string */
2751 HOST_WIDE_INT mask; /* isa mask options */
2754 /* This table is ordered so that options like -msse4.2 that imply other
2755 ISAs come first. The target string will be displayed in the same order. */
2756 static struct ix86_target_opts isa2_opts[] =
2758 { "-mcx16", OPTION_MASK_ISA_CX16 },
2759 { "-mmpx", OPTION_MASK_ISA_MPX },
2760 { "-mvaes", OPTION_MASK_ISA_VAES },
2761 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2762 { "-mpconfig", OPTION_MASK_ISA_PCONFIG },
2763 { "-mwbnoinvd", OPTION_MASK_ISA_WBNOINVD },
2764 { "-msgx", OPTION_MASK_ISA_SGX },
2765 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2766 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2767 { "-mibt", OPTION_MASK_ISA_IBT },
2768 { "-mhle", OPTION_MASK_ISA_HLE },
2769 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2770 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2771 { "-mmwaitx", OPTION_MASK_ISA_MWAITX }
2773 static struct ix86_target_opts isa_opts[] =
2775 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2776 { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG },
2777 { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ },
2778 { "-mgfni", OPTION_MASK_ISA_GFNI },
2779 { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI },
2780 { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
2781 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2782 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2783 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2784 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2785 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2786 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2787 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2788 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2789 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2790 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2791 { "-mfma", OPTION_MASK_ISA_FMA },
2792 { "-mxop", OPTION_MASK_ISA_XOP },
2793 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2794 { "-mf16c", OPTION_MASK_ISA_F16C },
2795 { "-mavx", OPTION_MASK_ISA_AVX },
2796 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2797 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2798 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2799 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2800 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2801 { "-msse3", OPTION_MASK_ISA_SSE3 },
2802 { "-maes", OPTION_MASK_ISA_AES },
2803 { "-msha", OPTION_MASK_ISA_SHA },
2804 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2805 { "-msse2", OPTION_MASK_ISA_SSE2 },
2806 { "-msse", OPTION_MASK_ISA_SSE },
2807 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2808 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2809 { "-mmmx", OPTION_MASK_ISA_MMX },
2810 { "-mrtm", OPTION_MASK_ISA_RTM },
2811 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2812 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2813 { "-madx", OPTION_MASK_ISA_ADX },
2814 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2815 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2816 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2817 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2818 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2819 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2820 { "-mabm", OPTION_MASK_ISA_ABM },
2821 { "-mbmi", OPTION_MASK_ISA_BMI },
2822 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2823 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2824 { "-mtbm", OPTION_MASK_ISA_TBM },
2825 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2826 { "-msahf", OPTION_MASK_ISA_SAHF },
2827 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2828 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2829 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2830 { "-mpku", OPTION_MASK_ISA_PKU },
2831 { "-mlwp", OPTION_MASK_ISA_LWP },
2832 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2833 { "-mclwb", OPTION_MASK_ISA_CLWB },
2834 { "-mshstk", OPTION_MASK_ISA_SHSTK }
2837 /* Flag options. */
2838 static struct ix86_target_opts flag_opts[] =
2840 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2841 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2842 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2843 { "-m80387", MASK_80387 },
2844 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2845 { "-malign-double", MASK_ALIGN_DOUBLE },
2846 { "-mcld", MASK_CLD },
2847 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2848 { "-mieee-fp", MASK_IEEE_FP },
2849 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2850 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2851 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2852 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2853 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2854 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2855 { "-mno-red-zone", MASK_NO_RED_ZONE },
2856 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2857 { "-mrecip", MASK_RECIP },
2858 { "-mrtd", MASK_RTD },
2859 { "-msseregparm", MASK_SSEREGPARM },
2860 { "-mstack-arg-probe", MASK_STACK_PROBE },
2861 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2862 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2863 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2864 { "-mvzeroupper", MASK_VZEROUPPER },
2865 { "-mstv", MASK_STV },
2866 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2867 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2868 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2871 /* Additional flag options. */
2872 static struct ix86_target_opts flag2_opts[] =
2874 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
2877 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2878 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2880 char isa_other[40];
2881 char isa2_other[40];
2882 char flags_other[40];
2883 char flags2_other[40];
2884 unsigned num = 0;
2885 unsigned i, j;
2886 char *ret;
2887 char *ptr;
2888 size_t len;
2889 size_t line_len;
2890 size_t sep_len;
2891 const char *abi;
2893 memset (opts, '\0', sizeof (opts));
2895 /* Add -march= option. */
2896 if (arch)
2898 opts[num][0] = "-march=";
2899 opts[num++][1] = arch;
2902 /* Add -mtune= option. */
2903 if (tune)
2905 opts[num][0] = "-mtune=";
2906 opts[num++][1] = tune;
2909 /* Add -m32/-m64/-mx32. */
2910 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2912 if ((isa & OPTION_MASK_ABI_64) != 0)
2913 abi = "-m64";
2914 else
2915 abi = "-mx32";
2916 isa &= ~ (OPTION_MASK_ISA_64BIT
2917 | OPTION_MASK_ABI_64
2918 | OPTION_MASK_ABI_X32);
2920 else
2921 abi = "-m32";
2922 opts[num++][0] = abi;
2924 /* Pick out the ISA2 options. */
2925 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2927 if ((isa2 & isa2_opts[i].mask) != 0)
2929 opts[num++][0] = isa2_opts[i].option;
2930 isa2 &= ~ isa2_opts[i].mask;
2934 if (isa2 && add_nl_p)
2936 opts[num++][0] = isa2_other;
2937 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2940 /* Pick out the ISA options. */
2941 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2943 if ((isa & isa_opts[i].mask) != 0)
2945 opts[num++][0] = isa_opts[i].option;
2946 isa &= ~ isa_opts[i].mask;
2950 if (isa && add_nl_p)
2952 opts[num++][0] = isa_other;
2953 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2956 /* Add flag options. */
2957 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2959 if ((flags & flag_opts[i].mask) != 0)
2961 opts[num++][0] = flag_opts[i].option;
2962 flags &= ~ flag_opts[i].mask;
2966 if (flags && add_nl_p)
2968 opts[num++][0] = flags_other;
2969 sprintf (flags_other, "(other flags: %#x)", flags);
2972 /* Add additional flag options. */
2973 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2975 if ((flags2 & flag2_opts[i].mask) != 0)
2977 opts[num++][0] = flag2_opts[i].option;
2978 flags2 &= ~ flag2_opts[i].mask;
2982 if (flags2 && add_nl_p)
2984 opts[num++][0] = flags2_other;
2985 sprintf (flags2_other, "(other flags2: %#x)", flags2);
2988 /* Add -fpmath= option. */
2989 if (fpmath)
2991 opts[num][0] = "-mfpmath=";
2992 switch ((int) fpmath)
2994 case FPMATH_387:
2995 opts[num++][1] = "387";
2996 break;
2998 case FPMATH_SSE:
2999 opts[num++][1] = "sse";
3000 break;
3002 case FPMATH_387 | FPMATH_SSE:
3003 opts[num++][1] = "sse+387";
3004 break;
3006 default:
3007 gcc_unreachable ();
3011 /* Any options? */
3012 if (num == 0)
3013 return NULL;
3015 gcc_assert (num < ARRAY_SIZE (opts));
3017 /* Size the string. */
3018 len = 0;
3019 sep_len = (add_nl_p) ? 3 : 1;
3020 for (i = 0; i < num; i++)
3022 len += sep_len;
3023 for (j = 0; j < 2; j++)
3024 if (opts[i][j])
3025 len += strlen (opts[i][j]);
3028 /* Build the string. */
3029 ret = ptr = (char *) xmalloc (len);
3030 line_len = 0;
3032 for (i = 0; i < num; i++)
3034 size_t len2[2];
3036 for (j = 0; j < 2; j++)
3037 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3039 if (i != 0)
3041 *ptr++ = ' ';
3042 line_len++;
3044 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3046 *ptr++ = '\\';
3047 *ptr++ = '\n';
3048 line_len = 0;
3052 for (j = 0; j < 2; j++)
3053 if (opts[i][j])
3055 memcpy (ptr, opts[i][j], len2[j]);
3056 ptr += len2[j];
3057 line_len += len2[j];
3061 *ptr = '\0';
3062 gcc_assert (ret + len >= ptr);
3064 return ret;
3067 /* Return true if profiling code should be emitted before the
3068 prologue, and false otherwise.
3069 Note: for x86, the "hotfix" (ms_hook_prologue) combination is handled with sorry (). */
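/* (With -mfentry the profiler call, __fentry__, is emitted before the
   prologue rather than after it, which is what flag_fentry reflects
   here.) */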
3070 static bool
3071 ix86_profile_before_prologue (void)
3073 return flag_fentry != 0;
3076 /* Function that is callable from the debugger to print the current
3077 options. */
3078 void ATTRIBUTE_UNUSED
3079 ix86_debug_options (void)
3081 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3082 target_flags, ix86_target_flags,
3083 ix86_arch_string,ix86_tune_string,
3084 ix86_fpmath, true);
3086 if (opts)
3088 fprintf (stderr, "%s\n\n", opts);
3089 free (opts);
3091 else
3092 fputs ("<no options>\n\n", stderr);
3094 return;
3097 /* Return true if T is one of the bytes we should avoid with
3098 -mmitigate-rop. */
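/* These are the x86 return opcodes: 0xc3/0xc2 encode near RET (without
   and with an immediate stack adjustment) and 0xcb/0xca the far RET
   forms, which is what makes them attractive ROP gadget terminators. */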
3100 static bool
3101 ix86_rop_should_change_byte_p (int t)
3103 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3106 static const char *stringop_alg_names[] = {
3107 #define DEF_ENUM
3108 #define DEF_ALG(alg, name) #name,
3109 #include "stringop.def"
3110 #undef DEF_ENUM
3111 #undef DEF_ALG
3114 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3115 The string is of the following form (or comma separated list of it):
3117 strategy_alg:max_size:[align|noalign]
3119 where the full size range for the strategy is either [0, max_size] or
3120 [min_size, max_size], in which min_size is the max_size + 1 of the
3121 preceding range. The last size range must have max_size == -1.
3123 Examples:
3126 -mmemcpy-strategy=libcall:-1:noalign
3128 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3132 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3134 This is to tell the compiler to use the following strategy for memset
3135 1) when the expected size is between [1, 16], use rep_8byte strategy;
3136 2) when the size is between [17, 2048], use vector_loop;
3137 3) when the size is > 2048, use libcall. */
3139 struct stringop_size_range
3141 int max;
3142 stringop_alg alg;
3143 bool noalign;
3146 static void
3147 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3149 const struct stringop_algs *default_algs;
3150 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3151 char *curr_range_str, *next_range_str;
3152 const char *opt = is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=";
3153 int i = 0, n = 0;
3155 if (is_memset)
3156 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3157 else
3158 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3160 curr_range_str = strategy_str;
3164 int maxs;
3165 char alg_name[128];
3166 char align[16];
3167 next_range_str = strchr (curr_range_str, ',');
3168 if (next_range_str)
3169 *next_range_str++ = '\0';
3171 if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs,
3172 align) != 3)
3174 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3175 return;
3178 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3180 error ("size ranges of option %qs should be increasing", opt);
3181 return;
3184 for (i = 0; i < last_alg; i++)
3185 if (!strcmp (alg_name, stringop_alg_names[i]))
3186 break;
3188 if (i == last_alg)
3190 error ("wrong strategy name %qs specified for option %qs",
3191 alg_name, opt);
3193 auto_vec <const char *> candidates;
3194 for (i = 0; i < last_alg; i++)
3195 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3196 candidates.safe_push (stringop_alg_names[i]);
3198 char *s;
3199 const char *hint
3200 = candidates_list_and_hint (alg_name, s, candidates);
3201 if (hint)
3202 inform (input_location,
3203 "valid arguments to %qs are: %s; did you mean %qs?",
3204 opt, s, hint);
3205 else
3206 inform (input_location, "valid arguments to %qs are: %s",
3207 opt, s);
3208 XDELETEVEC (s);
3209 return;
3212 if ((stringop_alg) i == rep_prefix_8_byte
3213 && !TARGET_64BIT)
3215 /* rep; movq isn't available in 32-bit code. */
3216 error ("strategy name %qs specified for option %qs "
3217 "not supported for 32-bit code", alg_name, opt);
3218 return;
3221 input_ranges[n].max = maxs;
3222 input_ranges[n].alg = (stringop_alg) i;
3223 if (!strcmp (align, "align"))
3224 input_ranges[n].noalign = false;
3225 else if (!strcmp (align, "noalign"))
3226 input_ranges[n].noalign = true;
3227 else
3229 error ("unknown alignment %qs specified for option %qs", align, opt);
3230 return;
3232 n++;
3233 curr_range_str = next_range_str;
3235 while (curr_range_str);
3237 if (input_ranges[n - 1].max != -1)
3239 error ("the max value for the last size range should be -1"
3240 " for option %qs", opt);
3241 return;
3244 if (n > MAX_STRINGOP_ALGS)
3246 error ("too many size ranges specified in option %qs", opt);
3247 return;
3250 /* Now override the default algs array. */
3251 for (i = 0; i < n; i++)
3253 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3254 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3255 = input_ranges[i].alg;
3256 *const_cast<int *>(&default_algs->size[i].noalign)
3257 = input_ranges[i].noalign;
3262 /* Parse the -mtune-ctrl= option. When DUMP is true,
3263 print the features that are explicitly set. */
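/* For illustration (the feature names below are placeholders; see
   x86-tune.def for the real list), the accepted syntax is a comma
   separated list of feature names, each optionally prefixed with '^' to
   clear it instead of setting it, e.g.

     -mtune-ctrl=some_feature,^other_feature  */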
3265 static void
3266 parse_mtune_ctrl_str (bool dump)
3268 if (!ix86_tune_ctrl_string)
3269 return;
3271 char *next_feature_string = NULL;
3272 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3273 char *orig = curr_feature_string;
3274 int i;
3277 bool clear = false;
3279 next_feature_string = strchr (curr_feature_string, ',');
3280 if (next_feature_string)
3281 *next_feature_string++ = '\0';
3282 if (*curr_feature_string == '^')
3284 curr_feature_string++;
3285 clear = true;
3287 for (i = 0; i < X86_TUNE_LAST; i++)
3289 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3291 ix86_tune_features[i] = !clear;
3292 if (dump)
3293 fprintf (stderr, "Explicitly %s feature %s\n",
3294 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3295 break;
3298 if (i == X86_TUNE_LAST)
3299 error ("unknown parameter to option -mtune-ctrl: %s",
3300 clear ? curr_feature_string - 1 : curr_feature_string);
3301 curr_feature_string = next_feature_string;
3303 while (curr_feature_string);
3304 free (orig);
3307 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3308 processor type. */
3310 static void
3311 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3313 unsigned HOST_WIDE_INT ix86_tune_mask = HOST_WIDE_INT_1U << ix86_tune;
3314 int i;
3316 for (i = 0; i < X86_TUNE_LAST; ++i)
3318 if (ix86_tune_no_default)
3319 ix86_tune_features[i] = 0;
3320 else
3321 ix86_tune_features[i]
3322 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3325 if (dump)
3327 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3328 for (i = 0; i < X86_TUNE_LAST; i++)
3329 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3330 ix86_tune_features[i] ? "on" : "off");
3333 parse_mtune_ctrl_str (dump);
3337 /* Default align_* from the processor table. */
3339 static void
3340 ix86_default_align (struct gcc_options *opts)
3342 if (opts->x_align_loops == 0)
3344 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3345 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3347 if (opts->x_align_jumps == 0)
3349 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3350 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3352 if (opts->x_align_functions == 0)
3354 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3358 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3360 static void
3361 ix86_override_options_after_change (void)
3363 ix86_default_align (&global_options);
3366 /* Override various settings based on options. If MAIN_ARGS_P, the
3367 options are from the command line, otherwise they are from
3368 attributes. Return true if there's an error related to the -march
3369 option. */
3371 static bool
3372 ix86_option_override_internal (bool main_args_p,
3373 struct gcc_options *opts,
3374 struct gcc_options *opts_set)
3376 int i;
3377 unsigned HOST_WIDE_INT ix86_arch_mask;
3378 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3380 const wide_int_bitmask PTA_3DNOW (HOST_WIDE_INT_1U << 0);
3381 const wide_int_bitmask PTA_3DNOW_A (HOST_WIDE_INT_1U << 1);
3382 const wide_int_bitmask PTA_64BIT (HOST_WIDE_INT_1U << 2);
3383 const wide_int_bitmask PTA_ABM (HOST_WIDE_INT_1U << 3);
3384 const wide_int_bitmask PTA_AES (HOST_WIDE_INT_1U << 4);
3385 const wide_int_bitmask PTA_AVX (HOST_WIDE_INT_1U << 5);
3386 const wide_int_bitmask PTA_BMI (HOST_WIDE_INT_1U << 6);
3387 const wide_int_bitmask PTA_CX16 (HOST_WIDE_INT_1U << 7);
3388 const wide_int_bitmask PTA_F16C (HOST_WIDE_INT_1U << 8);
3389 const wide_int_bitmask PTA_FMA (HOST_WIDE_INT_1U << 9);
3390 const wide_int_bitmask PTA_FMA4 (HOST_WIDE_INT_1U << 10);
3391 const wide_int_bitmask PTA_FSGSBASE (HOST_WIDE_INT_1U << 11);
3392 const wide_int_bitmask PTA_LWP (HOST_WIDE_INT_1U << 12);
3393 const wide_int_bitmask PTA_LZCNT (HOST_WIDE_INT_1U << 13);
3394 const wide_int_bitmask PTA_MMX (HOST_WIDE_INT_1U << 14);
3395 const wide_int_bitmask PTA_MOVBE (HOST_WIDE_INT_1U << 15);
3396 const wide_int_bitmask PTA_NO_SAHF (HOST_WIDE_INT_1U << 16);
3397 const wide_int_bitmask PTA_PCLMUL (HOST_WIDE_INT_1U << 17);
3398 const wide_int_bitmask PTA_POPCNT (HOST_WIDE_INT_1U << 18);
3399 const wide_int_bitmask PTA_PREFETCH_SSE (HOST_WIDE_INT_1U << 19);
3400 const wide_int_bitmask PTA_RDRND (HOST_WIDE_INT_1U << 20);
3401 const wide_int_bitmask PTA_SSE (HOST_WIDE_INT_1U << 21);
3402 const wide_int_bitmask PTA_SSE2 (HOST_WIDE_INT_1U << 22);
3403 const wide_int_bitmask PTA_SSE3 (HOST_WIDE_INT_1U << 23);
3404 const wide_int_bitmask PTA_SSE4_1 (HOST_WIDE_INT_1U << 24);
3405 const wide_int_bitmask PTA_SSE4_2 (HOST_WIDE_INT_1U << 25);
3406 const wide_int_bitmask PTA_SSE4A (HOST_WIDE_INT_1U << 26);
3407 const wide_int_bitmask PTA_SSSE3 (HOST_WIDE_INT_1U << 27);
3408 const wide_int_bitmask PTA_TBM (HOST_WIDE_INT_1U << 28);
3409 const wide_int_bitmask PTA_XOP (HOST_WIDE_INT_1U << 29);
3410 const wide_int_bitmask PTA_AVX2 (HOST_WIDE_INT_1U << 30);
3411 const wide_int_bitmask PTA_BMI2 (HOST_WIDE_INT_1U << 31);
3412 const wide_int_bitmask PTA_RTM (HOST_WIDE_INT_1U << 32);
3413 const wide_int_bitmask PTA_HLE (HOST_WIDE_INT_1U << 33);
3414 const wide_int_bitmask PTA_PRFCHW (HOST_WIDE_INT_1U << 34);
3415 const wide_int_bitmask PTA_RDSEED (HOST_WIDE_INT_1U << 35);
3416 const wide_int_bitmask PTA_ADX (HOST_WIDE_INT_1U << 36);
3417 const wide_int_bitmask PTA_FXSR (HOST_WIDE_INT_1U << 37);
3418 const wide_int_bitmask PTA_XSAVE (HOST_WIDE_INT_1U << 38);
3419 const wide_int_bitmask PTA_XSAVEOPT (HOST_WIDE_INT_1U << 39);
3420 const wide_int_bitmask PTA_AVX512F (HOST_WIDE_INT_1U << 40);
3421 const wide_int_bitmask PTA_AVX512ER (HOST_WIDE_INT_1U << 41);
3422 const wide_int_bitmask PTA_AVX512PF (HOST_WIDE_INT_1U << 42);
3423 const wide_int_bitmask PTA_AVX512CD (HOST_WIDE_INT_1U << 43);
3424 const wide_int_bitmask PTA_MPX (HOST_WIDE_INT_1U << 44);
3425 const wide_int_bitmask PTA_SHA (HOST_WIDE_INT_1U << 45);
3426 const wide_int_bitmask PTA_PREFETCHWT1 (HOST_WIDE_INT_1U << 46);
3427 const wide_int_bitmask PTA_CLFLUSHOPT (HOST_WIDE_INT_1U << 47);
3428 const wide_int_bitmask PTA_XSAVEC (HOST_WIDE_INT_1U << 48);
3429 const wide_int_bitmask PTA_XSAVES (HOST_WIDE_INT_1U << 49);
3430 const wide_int_bitmask PTA_AVX512DQ (HOST_WIDE_INT_1U << 50);
3431 const wide_int_bitmask PTA_AVX512BW (HOST_WIDE_INT_1U << 51);
3432 const wide_int_bitmask PTA_AVX512VL (HOST_WIDE_INT_1U << 52);
3433 const wide_int_bitmask PTA_AVX512IFMA (HOST_WIDE_INT_1U << 53);
3434 const wide_int_bitmask PTA_AVX512VBMI (HOST_WIDE_INT_1U << 54);
3435 const wide_int_bitmask PTA_CLWB (HOST_WIDE_INT_1U << 55);
3436 const wide_int_bitmask PTA_MWAITX (HOST_WIDE_INT_1U << 56);
3437 const wide_int_bitmask PTA_CLZERO (HOST_WIDE_INT_1U << 57);
3438 const wide_int_bitmask PTA_NO_80387 (HOST_WIDE_INT_1U << 58);
3439 const wide_int_bitmask PTA_PKU (HOST_WIDE_INT_1U << 59);
3440 const wide_int_bitmask PTA_AVX5124VNNIW (HOST_WIDE_INT_1U << 60);
3441 const wide_int_bitmask PTA_AVX5124FMAPS (HOST_WIDE_INT_1U << 61);
3442 const wide_int_bitmask PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1U << 62);
3443 const wide_int_bitmask PTA_SGX (HOST_WIDE_INT_1U << 63);
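/* Bit 63 is the last flag that fits in the low half of the mask; the
   remaining PTA bits use the two-argument wide_int_bitmask constructor
   so that they land in the upper 64-bit half. */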
3444 const wide_int_bitmask PTA_AVX512VNNI (0, HOST_WIDE_INT_1U);
3445 const wide_int_bitmask PTA_GFNI (0, HOST_WIDE_INT_1U << 1);
3446 const wide_int_bitmask PTA_VAES (0, HOST_WIDE_INT_1U << 2);
3447 const wide_int_bitmask PTA_AVX512VBMI2 (0, HOST_WIDE_INT_1U << 3);
3448 const wide_int_bitmask PTA_VPCLMULQDQ (0, HOST_WIDE_INT_1U << 4);
3449 const wide_int_bitmask PTA_AVX512BITALG (0, HOST_WIDE_INT_1U << 5);
3450 const wide_int_bitmask PTA_RDPID (0, HOST_WIDE_INT_1U << 6);
3451 const wide_int_bitmask PTA_PCONFIG (0, HOST_WIDE_INT_1U << 7);
3452 const wide_int_bitmask PTA_WBNOINVD (0, HOST_WIDE_INT_1U << 8);
3454 const wide_int_bitmask PTA_CORE2 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
3455 | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR;
3456 const wide_int_bitmask PTA_NEHALEM = PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2
3457 | PTA_POPCNT;
3458 const wide_int_bitmask PTA_WESTMERE = PTA_NEHALEM | PTA_AES | PTA_PCLMUL;
3459 const wide_int_bitmask PTA_SANDYBRIDGE = PTA_WESTMERE | PTA_AVX | PTA_XSAVE
3460 | PTA_XSAVEOPT;
3461 const wide_int_bitmask PTA_IVYBRIDGE = PTA_SANDYBRIDGE | PTA_FSGSBASE
3462 | PTA_RDRND | PTA_F16C;
3463 const wide_int_bitmask PTA_HASWELL = PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI
3464 | PTA_BMI2 | PTA_LZCNT | PTA_FMA | PTA_MOVBE | PTA_HLE;
3465 const wide_int_bitmask PTA_BROADWELL = PTA_HASWELL | PTA_ADX | PTA_PRFCHW
3466 | PTA_RDSEED;
3467 const wide_int_bitmask PTA_SKYLAKE = PTA_BROADWELL | PTA_CLFLUSHOPT
3468 | PTA_XSAVEC | PTA_XSAVES | PTA_SGX;
3469 const wide_int_bitmask PTA_SKYLAKE_AVX512 = PTA_SKYLAKE | PTA_AVX512F
3470 | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU
3471 | PTA_CLWB;
3472 const wide_int_bitmask PTA_CANNONLAKE = PTA_SKYLAKE | PTA_AVX512F
3473 | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU
3474 | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_SHA;
3475 const wide_int_bitmask PTA_ICELAKE_CLIENT = PTA_CANNONLAKE | PTA_AVX512VNNI
3476 | PTA_GFNI | PTA_VAES | PTA_AVX512VBMI2 | PTA_VPCLMULQDQ | PTA_AVX512BITALG
3477 | PTA_RDPID | PTA_CLWB;
3478 const wide_int_bitmask PTA_ICELAKE_SERVER = PTA_ICELAKE_CLIENT | PTA_PCONFIG
3479 | PTA_WBNOINVD;
3480 const wide_int_bitmask PTA_KNL = PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER
3481 | PTA_AVX512F | PTA_AVX512CD;
3482 const wide_int_bitmask PTA_BONNELL = PTA_CORE2 | PTA_MOVBE;
3483 const wide_int_bitmask PTA_SILVERMONT = PTA_WESTMERE | PTA_MOVBE | PTA_RDRND;
3484 const wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW
3485 | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ;
3487 static struct pta
3489 const char *const name; /* processor name or nickname. */
3490 const enum processor_type processor;
3491 const enum attr_cpu schedule;
3492 const wide_int_bitmask flags;
3494 const processor_alias_table[] =
3496 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3497 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3498 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3499 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3500 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3501 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3502 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3503 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3504 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3505 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3506 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3507 PTA_MMX | PTA_SSE | PTA_FXSR},
3508 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3509 PTA_MMX | PTA_SSE | PTA_FXSR},
3510 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3511 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3512 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3513 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3514 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3515 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3516 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3517 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3518 PTA_MMX | PTA_SSE | PTA_FXSR},
3519 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3520 PTA_MMX | PTA_SSE | PTA_FXSR},
3521 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3522 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3523 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3524 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3525 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3526 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3527 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3528 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3529 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3530 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3531 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3532 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3533 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3534 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3535 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3536 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3537 PTA_SANDYBRIDGE},
3538 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3539 PTA_SANDYBRIDGE},
3540 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3541 PTA_IVYBRIDGE},
3542 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3543 PTA_IVYBRIDGE},
3544 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3545 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3546 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3547 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
3548 {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3549 PTA_SKYLAKE_AVX512},
3550 {"cannonlake", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL, PTA_CANNONLAKE},
3551 {"icelake-client", PROCESSOR_ICELAKE_CLIENT, CPU_HASWELL,
3552 PTA_ICELAKE_CLIENT},
3553 {"icelake-server", PROCESSOR_ICELAKE_SERVER, CPU_HASWELL,
3554 PTA_ICELAKE_SERVER},
3555 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3556 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3557 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3558 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3559 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3560 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3561 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3562 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3563 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3564 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3565 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3566 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3567 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3568 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3569 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3570 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3571 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3572 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3573 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3574 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3575 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3576 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3577 {"x86-64", PROCESSOR_K8, CPU_K8,
3578 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3579 {"eden-x2", PROCESSOR_K8, CPU_K8,
3580 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3581 {"nano", PROCESSOR_K8, CPU_K8,
3582 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3583 | PTA_SSSE3 | PTA_FXSR},
3584 {"nano-1000", PROCESSOR_K8, CPU_K8,
3585 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3586 | PTA_SSSE3 | PTA_FXSR},
3587 {"nano-2000", PROCESSOR_K8, CPU_K8,
3588 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3589 | PTA_SSSE3 | PTA_FXSR},
3590 {"nano-3000", PROCESSOR_K8, CPU_K8,
3591 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3592 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3593 {"nano-x2", PROCESSOR_K8, CPU_K8,
3594 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3595 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3596 {"eden-x4", PROCESSOR_K8, CPU_K8,
3597 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3598 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3599 {"nano-x4", PROCESSOR_K8, CPU_K8,
3600 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3601 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3602 {"k8", PROCESSOR_K8, CPU_K8,
3603 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3604 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3605 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3606 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3607 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3608 {"opteron", PROCESSOR_K8, CPU_K8,
3609 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3610 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3611 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3612 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3613 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3614 {"athlon64", PROCESSOR_K8, CPU_K8,
3615 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3616 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3617 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3618 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3619 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3620 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3621 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3622 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3623 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3624 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3625 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3626 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3627 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3628 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3629 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3630 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3631 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3632 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3633 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3634 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3635 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3636 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3637 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3638 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3639 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3640 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3641 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3642 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3643 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3644 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3645 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3646 | PTA_XSAVEOPT | PTA_FSGSBASE},
3647 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3648 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3649 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3650 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3651 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3652 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3653 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3654 | PTA_MOVBE | PTA_MWAITX},
3655 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3656 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3657 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3658 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3659 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3660 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3661 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3662 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3663 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3664 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3665 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3666 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
3667 | PTA_FXSR | PTA_XSAVE},
3668 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3669 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3670 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
3671 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3672 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3673 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3675 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3676 PTA_64BIT
3677 | PTA_HLE /* flags are only used for -march switch. */ },
3680 /* -mrecip options. */
3681 static struct
3683 const char *string; /* option name */
3684 unsigned int mask; /* mask bits to set */
3686 const recip_options[] =
3688 { "all", RECIP_MASK_ALL },
3689 { "none", RECIP_MASK_NONE },
3690 { "div", RECIP_MASK_DIV },
3691 { "sqrt", RECIP_MASK_SQRT },
3692 { "vec-div", RECIP_MASK_VEC_DIV },
3693 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3696 int const pta_size = ARRAY_SIZE (processor_alias_table);
3698 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3699 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3700 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3701 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3702 #ifdef TARGET_BI_ARCH
3703 else
3705 #if TARGET_BI_ARCH == 1
3706 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3707 is on and OPTION_MASK_ABI_X32 is off. We turn off
3708 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3709 -mx32. */
3710 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3711 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3712 #else
3713 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3714 on and OPTION_MASK_ABI_64 is off. We turn off
3715 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3716 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3717 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3718 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3719 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3720 #endif
3721 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3722 && TARGET_IAMCU_P (opts->x_target_flags))
3723 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3724 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3726 #endif
3728 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3730 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3731 OPTION_MASK_ABI_64 for TARGET_X32. */
3732 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3733 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3735 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3736 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3737 | OPTION_MASK_ABI_X32
3738 | OPTION_MASK_ABI_64);
3739 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3741 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3742 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3743 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3744 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
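/* After the adjustments above the ISA/ABI selection is consistent:
   both x32 and LP64 code require OPTION_MASK_ISA_64BIT, while -m16
   code ends up with none of the 64-bit ISA/ABI bits set.  */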
3747 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3748 SUBTARGET_OVERRIDE_OPTIONS;
3749 #endif
3751 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3752 SUBSUBTARGET_OVERRIDE_OPTIONS;
3753 #endif
3755 /* -fPIC is the default for x86_64. */
3756 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3757 opts->x_flag_pic = 2;
3759 /* Need to check -mtune=generic first. */
3760 if (opts->x_ix86_tune_string)
3762 /* As special support for cross compilers we read -mtune=native
3763 as -mtune=generic. With native compilers we won't see the
3764 -mtune=native, as it was changed by the driver. */
3765 if (!strcmp (opts->x_ix86_tune_string, "native"))
3767 opts->x_ix86_tune_string = "generic";
3769 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3770 warning (OPT_Wdeprecated,
3771 main_args_p
3772 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3773 "or %<-mtune=generic%> instead as appropriate")
3774 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3775 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3776 " instead as appropriate"));
3778 else
3780 if (opts->x_ix86_arch_string)
3781 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3782 if (!opts->x_ix86_tune_string)
3784 opts->x_ix86_tune_string
3785 = processor_target_table[TARGET_CPU_DEFAULT].name;
3786 ix86_tune_defaulted = 1;
3789 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3790 or defaulted. We need to use a sensible tune option. */
3791 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3793 opts->x_ix86_tune_string = "generic";
3797 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3798 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3800 /* rep; movq isn't available in 32-bit code. */
3801 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3802 opts->x_ix86_stringop_alg = no_stringop;
3805 if (!opts->x_ix86_arch_string)
3806 opts->x_ix86_arch_string
3807 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3808 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3809 else
3810 ix86_arch_specified = 1;
3812 if (opts_set->x_ix86_pmode)
3814 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3815 && opts->x_ix86_pmode == PMODE_SI)
3816 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3817 && opts->x_ix86_pmode == PMODE_DI))
3818 error ("address mode %qs not supported in the %s bit mode",
3819 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3820 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3822 else
3823 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3824 ? PMODE_DI : PMODE_SI;
3826 if (!opts_set->x_ix86_abi)
3827 opts->x_ix86_abi = DEFAULT_ABI;
3829 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3830 error ("-mabi=ms not supported with X32 ABI");
3831 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3833 /* For targets using the MS ABI, enable ms-extensions if not
3834 explicitly turned off. For non-MS ABI we turn this option
3835 off. */
3836 if (!opts_set->x_flag_ms_extensions)
3837 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3839 if (opts_set->x_ix86_cmodel)
3841 switch (opts->x_ix86_cmodel)
3843 case CM_SMALL:
3844 case CM_SMALL_PIC:
3845 if (opts->x_flag_pic)
3846 opts->x_ix86_cmodel = CM_SMALL_PIC;
3847 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3848 error ("code model %qs not supported in the %s bit mode",
3849 "small", "32");
3850 break;
3852 case CM_MEDIUM:
3853 case CM_MEDIUM_PIC:
3854 if (opts->x_flag_pic)
3855 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3856 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3857 error ("code model %qs not supported in the %s bit mode",
3858 "medium", "32");
3859 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3860 error ("code model %qs not supported in x32 mode",
3861 "medium");
3862 break;
3864 case CM_LARGE:
3865 case CM_LARGE_PIC:
3866 if (opts->x_flag_pic)
3867 opts->x_ix86_cmodel = CM_LARGE_PIC;
3868 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3869 error ("code model %qs not supported in the %s bit mode",
3870 "large", "32");
3871 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3872 error ("code model %qs not supported in x32 mode",
3873 "large");
3874 break;
3876 case CM_32:
3877 if (opts->x_flag_pic)
3878 error ("code model %s does not support PIC mode", "32");
3879 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3880 error ("code model %qs not supported in the %s bit mode",
3881 "32", "64");
3882 break;
3884 case CM_KERNEL:
3885 if (opts->x_flag_pic)
3887 error ("code model %s does not support PIC mode", "kernel");
3888 opts->x_ix86_cmodel = CM_32;
3890 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3891 error ("code model %qs not supported in the %s bit mode",
3892 "kernel", "32");
3893 break;
3895 default:
3896 gcc_unreachable ();
3899 else
3901 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3902 use of rip-relative addressing. This eliminates fixups that
3903 would otherwise be needed if this object is to be placed in a
3904 DLL, and is essentially just as efficient as direct addressing. */
3905 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3906 && (TARGET_RDOS || TARGET_PECOFF))
3907 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3908 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3909 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3910 else
3911 opts->x_ix86_cmodel = CM_32;
3913 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3915 error ("-masm=intel not supported in this configuration");
3916 opts->x_ix86_asm_dialect = ASM_ATT;
3918 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3919 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3920 sorry ("%i-bit mode not compiled in",
3921 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3923 for (i = 0; i < pta_size; i++)
3924 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3926 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3928 error (main_args_p
3929 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3930 "switch")
3931 : G_("%<generic%> CPU can be used only for "
3932 "%<target(\"tune=\")%> attribute"));
3933 return false;
3935 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3937 error (main_args_p
3938 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3939 "switch")
3940 : G_("%<intel%> CPU can be used only for "
3941 "%<target(\"tune=\")%> attribute"));
3942 return false;
3945 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3946 && !((processor_alias_table[i].flags & PTA_64BIT) != 0))
3948 error ("CPU you selected does not support x86-64 "
3949 "instruction set");
3950 return false;
3953 ix86_schedule = processor_alias_table[i].schedule;
3954 ix86_arch = processor_alias_table[i].processor;
3955 /* Default cpu tuning to the architecture. */
3956 ix86_tune = ix86_arch;
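/* Translate the PTA_* feature bits of the matched -march= entry into
   the corresponding OPTION_MASK_ISA_* flags, but only where the user
   did not set that ISA explicitly; e.g. "-march=btver2 -mno-avx"
   keeps AVX disabled while the other btver2 features are enabled.  */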
3958 if (((processor_alias_table[i].flags & PTA_MMX) != 0)
3959 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3960 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3961 if (((processor_alias_table[i].flags & PTA_3DNOW) != 0)
3962 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3963 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3964 if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0)
3965 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3966 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3967 if (((processor_alias_table[i].flags & PTA_SSE) != 0)
3968 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3969 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3970 if (((processor_alias_table[i].flags & PTA_SSE2) != 0)
3971 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3972 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3973 if (((processor_alias_table[i].flags & PTA_SSE3) != 0)
3974 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3975 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3976 if (((processor_alias_table[i].flags & PTA_SSSE3) != 0)
3977 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3978 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3979 if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0)
3980 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3981 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3982 if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0)
3983 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3984 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3985 if (((processor_alias_table[i].flags & PTA_AVX) != 0)
3986 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3987 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3988 if (((processor_alias_table[i].flags & PTA_AVX2) != 0)
3989 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3990 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3991 if (((processor_alias_table[i].flags & PTA_FMA) != 0)
3992 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3993 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3994 if (((processor_alias_table[i].flags & PTA_SSE4A) != 0)
3995 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3996 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3997 if (((processor_alias_table[i].flags & PTA_FMA4) != 0)
3998 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3999 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
4000 if (((processor_alias_table[i].flags & PTA_XOP) != 0)
4001 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
4002 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
4003 if (((processor_alias_table[i].flags & PTA_LWP) != 0)
4004 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
4005 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
4006 if (((processor_alias_table[i].flags & PTA_ABM) != 0)
4007 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
4008 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
4009 if (((processor_alias_table[i].flags & PTA_BMI) != 0)
4010 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
4011 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
4012 if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0)
4013 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
4014 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
4015 if (((processor_alias_table[i].flags & PTA_TBM) != 0)
4016 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
4017 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
4018 if (((processor_alias_table[i].flags & PTA_BMI2) != 0)
4019 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
4020 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
4021 if (((processor_alias_table[i].flags & PTA_CX16) != 0)
4022 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16))
4023 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16;
4024 if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0)
4025 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
4026 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
4027 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
4028 && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0))
4029 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
4030 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
4031 if (((processor_alias_table[i].flags & PTA_MOVBE) != 0)
4032 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE))
4033 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE;
4034 if (((processor_alias_table[i].flags & PTA_AES) != 0)
4035 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
4036 ix86_isa_flags |= OPTION_MASK_ISA_AES;
4037 if (((processor_alias_table[i].flags & PTA_SHA) != 0)
4038 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4039 ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4040 if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0)
4041 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4042 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4043 if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0)
4044 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4045 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4046 if (((processor_alias_table[i].flags & PTA_RDRND) != 0)
4047 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4048 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4049 if (((processor_alias_table[i].flags & PTA_F16C) != 0)
4050 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4051 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4052 if (((processor_alias_table[i].flags & PTA_RTM) != 0)
4053 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4054 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4055 if (((processor_alias_table[i].flags & PTA_HLE) != 0)
4056 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE))
4057 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE;
4058 if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0)
4059 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4060 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4061 if (((processor_alias_table[i].flags & PTA_RDSEED) != 0)
4062 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4063 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4064 if (((processor_alias_table[i].flags & PTA_ADX) != 0)
4065 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4066 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4067 if (((processor_alias_table[i].flags & PTA_FXSR) != 0)
4068 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4069 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4070 if (((processor_alias_table[i].flags & PTA_XSAVE) != 0)
4071 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4072 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4073 if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0)
4074 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4075 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4076 if (((processor_alias_table[i].flags & PTA_AVX512F) != 0)
4077 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4078 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4079 if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0)
4080 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4081 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4082 if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0)
4083 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4084 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4085 if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0)
4086 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4087 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4088 if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0)
4089 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4090 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4091 if (((processor_alias_table[i].flags & PTA_CLWB) != 0)
4092 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4093 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4094 if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0)
4095 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4096 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4097 if (((processor_alias_table[i].flags & PTA_CLZERO) != 0)
4098 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO))
4099 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO;
4100 if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0)
4101 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4102 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4103 if (((processor_alias_table[i].flags & PTA_XSAVES) != 0)
4104 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4105 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4106 if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0)
4107 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4108 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4109 if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0)
4110 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4111 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4112 if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0)
4113 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4114 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4115 if (((processor_alias_table[i].flags & PTA_MPX) != 0)
4116 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MPX))
4117 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MPX;
4118 if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0)
4119 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4120 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4121 if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0)
4122 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4123 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4124 if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0)
4125 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI))
4126 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI;
4127 if (((processor_alias_table[i].flags & PTA_GFNI) != 0)
4128 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI))
4129 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI;
4130 if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0)
4131 && !(opts->x_ix86_isa_flags_explicit
4132 & OPTION_MASK_ISA_AVX512VBMI2))
4133 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2;
4134 if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0)
4135 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ))
4136 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ;
4137 if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0)
4138 && !(opts->x_ix86_isa_flags_explicit
4139 & OPTION_MASK_ISA_AVX512BITALG))
4140 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG;
4142 if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0)
4143 && !(opts->x_ix86_isa_flags2_explicit
4144 & OPTION_MASK_ISA_AVX5124VNNIW))
4145 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4146 if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0)
4147 && !(opts->x_ix86_isa_flags2_explicit
4148 & OPTION_MASK_ISA_AVX5124FMAPS))
4149 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4150 if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0)
4151 && !(opts->x_ix86_isa_flags_explicit
4152 & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4153 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4154 if (((processor_alias_table[i].flags & PTA_SGX) != 0)
4155 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4156 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4157 if (((processor_alias_table[i].flags & PTA_VAES) != 0)
4158 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES))
4159 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES;
4160 if (((processor_alias_table[i].flags & PTA_RDPID) != 0)
4161 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID))
4162 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID;
4163 if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0)
4164 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PCONFIG))
4165 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PCONFIG;
4166 if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0)
4167 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_WBNOINVD))
4168 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_WBNOINVD;
4170 if ((processor_alias_table[i].flags
4171 & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)
4172 x86_prefetch_sse = true;
4173 if (((processor_alias_table[i].flags & PTA_MWAITX) != 0)
4174 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX))
4175 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX;
4176 if (((processor_alias_table[i].flags & PTA_PKU) != 0)
4177 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4178 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4180 /* Don't enable x87 instructions if only
4181 general registers are allowed. */
4182 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4183 && !(opts_set->x_target_flags & MASK_80387))
4185 if (((processor_alias_table[i].flags & PTA_NO_80387) != 0))
4186 opts->x_target_flags &= ~MASK_80387;
4187 else
4188 opts->x_target_flags |= MASK_80387;
4190 break;
4193 if (TARGET_X32 && (opts->x_ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4194 error ("Intel MPX does not support x32");
4196 if (TARGET_X32 && (ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4197 error ("Intel MPX does not support x32");
4199 if (i == pta_size)
4201 error (main_args_p
4202 ? G_("bad value (%qs) for %<-march=%> switch")
4203 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4204 opts->x_ix86_arch_string);
4206 auto_vec <const char *> candidates;
4207 for (i = 0; i < pta_size; i++)
4208 if (strcmp (processor_alias_table[i].name, "generic")
4209 && strcmp (processor_alias_table[i].name, "intel")
4210 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4211 || ((processor_alias_table[i].flags & PTA_64BIT) != 0)))
4212 candidates.safe_push (processor_alias_table[i].name);
4214 #ifdef HAVE_LOCAL_CPU_DETECT
4215 /* Also add "native" as a possible value. */
4216 candidates.safe_push ("native");
4217 #endif
4219 char *s;
4220 const char *hint
4221 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4222 if (hint)
4223 inform (input_location,
4224 main_args_p
4225 ? G_("valid arguments to %<-march=%> switch are: "
4226 "%s; did you mean %qs?")
4227 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4228 "%s; did you mean %qs?"), s, hint);
4229 else
4230 inform (input_location,
4231 main_args_p
4232 ? G_("valid arguments to %<-march=%> switch are: %s")
4233 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4234 "are: %s"), s);
4235 XDELETEVEC (s);
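/* Besides listing the valid names, candidates_list_and_hint also does
   spelling correction, so a near miss such as -march=znver should get a
   "did you mean 'znver1'?" style hint from the inform call above.  */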
4238 ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch;
4239 for (i = 0; i < X86_ARCH_LAST; ++i)
4240 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4242 for (i = 0; i < pta_size; i++)
4243 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4245 ix86_schedule = processor_alias_table[i].schedule;
4246 ix86_tune = processor_alias_table[i].processor;
4247 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4249 if (!((processor_alias_table[i].flags & PTA_64BIT) != 0))
4251 if (ix86_tune_defaulted)
4253 opts->x_ix86_tune_string = "x86-64";
4254 for (i = 0; i < pta_size; i++)
4255 if (! strcmp (opts->x_ix86_tune_string,
4256 processor_alias_table[i].name))
4257 break;
4258 ix86_schedule = processor_alias_table[i].schedule;
4259 ix86_tune = processor_alias_table[i].processor;
4261 else
4262 error ("CPU you selected does not support x86-64 "
4263 "instruction set");
4266 /* Intel CPUs have always interpreted SSE prefetch instructions as
4267 NOPs; so, we can enable SSE prefetch instructions even when
4268 -mtune (rather than -march) points us to a processor that has them.
4269 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4270 higher processors. */
4271 if (TARGET_CMOV
4272 && ((processor_alias_table[i].flags
4273 & (PTA_PREFETCH_SSE | PTA_SSE)) != 0))
4274 x86_prefetch_sse = true;
4275 break;
4278 if (ix86_tune_specified && i == pta_size)
4280 error (main_args_p
4281 ? G_("bad value (%qs) for %<-mtune=%> switch")
4282 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4283 opts->x_ix86_tune_string);
4285 auto_vec <const char *> candidates;
4286 for (i = 0; i < pta_size; i++)
4287 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4288 || ((processor_alias_table[i].flags & PTA_64BIT) != 0))
4289 candidates.safe_push (processor_alias_table[i].name);
4291 #ifdef HAVE_LOCAL_CPU_DETECT
4292 /* Also add "native" as a possible value. */
4293 candidates.safe_push ("native");
4294 #endif
4296 char *s;
4297 const char *hint
4298 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4299 if (hint)
4300 inform (input_location,
4301 main_args_p
4302 ? G_("valid arguments to %<-mtune=%> switch are: "
4303 "%s; did you mean %qs?")
4304 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4305 "%s; did you mean %qs?"), s, hint);
4306 else
4307 inform (input_location,
4308 main_args_p
4309 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4310 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4311 "are: %s"), s);
4312 XDELETEVEC (s);
4315 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4317 #ifndef USE_IX86_FRAME_POINTER
4318 #define USE_IX86_FRAME_POINTER 0
4319 #endif
4321 #ifndef USE_X86_64_FRAME_POINTER
4322 #define USE_X86_64_FRAME_POINTER 0
4323 #endif
4325 /* Set the default values for switches whose default depends on TARGET_64BIT
4326 in case they weren't overwritten by command line options. */
4327 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4329 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4330 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4331 if (opts->x_flag_asynchronous_unwind_tables
4332 && !opts_set->x_flag_unwind_tables
4333 && TARGET_64BIT_MS_ABI)
4334 opts->x_flag_unwind_tables = 1;
4335 if (opts->x_flag_asynchronous_unwind_tables == 2)
4336 opts->x_flag_unwind_tables
4337 = opts->x_flag_asynchronous_unwind_tables = 1;
4338 if (opts->x_flag_pcc_struct_return == 2)
4339 opts->x_flag_pcc_struct_return = 0;
4341 else
4343 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4344 opts->x_flag_omit_frame_pointer
4345 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4346 if (opts->x_flag_asynchronous_unwind_tables == 2)
4347 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4348 if (opts->x_flag_pcc_struct_return == 2)
4350 /* Intel MCU psABI specifies that -freg-struct-return should
4351 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4352 we check -miamcu so that -freg-struct-return is always
4353 turned on if -miamcu is used. */
4354 if (TARGET_IAMCU_P (opts->x_target_flags))
4355 opts->x_flag_pcc_struct_return = 0;
4356 else
4357 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4361 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4362 /* TODO: ix86_cost should be chosen at instruction or function granularity,
4363 so that for cold code we use size_cost even in !optimize_size compilation. */
4364 if (opts->x_optimize_size)
4365 ix86_cost = &ix86_size_cost;
4366 else
4367 ix86_cost = ix86_tune_cost;
4369 /* Arrange to set up i386_stack_locals for all functions. */
4370 init_machine_status = ix86_init_machine_status;
4372 /* Validate -mregparm= value. */
4373 if (opts_set->x_ix86_regparm)
4375 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4376 warning (0, "-mregparm is ignored in 64-bit mode");
4377 else if (TARGET_IAMCU_P (opts->x_target_flags))
4378 warning (0, "-mregparm is ignored for Intel MCU psABI");
4379 if (opts->x_ix86_regparm > REGPARM_MAX)
4381 error ("-mregparm=%d is not between 0 and %d",
4382 opts->x_ix86_regparm, REGPARM_MAX);
4383 opts->x_ix86_regparm = 0;
4386 if (TARGET_IAMCU_P (opts->x_target_flags)
4387 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4388 opts->x_ix86_regparm = REGPARM_MAX;
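/* After this, ix86_regparm only matters for the 32-bit register-passing
   convention; 64-bit and IAMCU targets have fixed argument-passing rules,
   which is why the value is simply forced to REGPARM_MAX for them.  */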
4390 /* Default align_* from the processor table. */
4391 ix86_default_align (opts);
4393 /* Provide default for -mbranch-cost= value. */
4394 if (!opts_set->x_ix86_branch_cost)
4395 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4397 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4399 opts->x_target_flags
4400 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4402 /* Enable by default the SSE and MMX builtins. Do allow the user to
4403 explicitly disable any of these. In particular, disabling SSE and
4404 MMX for kernel code is extremely useful. */
4405 if (!ix86_arch_specified)
4406 opts->x_ix86_isa_flags
4407 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4408 | TARGET_SUBTARGET64_ISA_DEFAULT)
4409 & ~opts->x_ix86_isa_flags_explicit);
4411 if (TARGET_RTD_P (opts->x_target_flags))
4412 warning (0,
4413 main_args_p
4414 ? G_("%<-mrtd%> is ignored in 64bit mode")
4415 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4417 else
4419 opts->x_target_flags
4420 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4422 if (!ix86_arch_specified)
4423 opts->x_ix86_isa_flags
4424 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4426 /* The i386 ABI does not specify a red zone. It still makes sense to use
4427 one when the programmer takes care to keep the stack from being clobbered. */
4428 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4429 opts->x_target_flags |= MASK_NO_RED_ZONE;
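/* The red zone is the 128-byte area below the stack pointer that the
   x86-64 SysV ABI guarantees leaf functions may use; the 32-bit ABI
   gives no such guarantee, hence MASK_NO_RED_ZONE by default here.  */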
4432 /* Keep nonleaf frame pointers. */
4433 if (opts->x_flag_omit_frame_pointer)
4434 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4435 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4436 opts->x_flag_omit_frame_pointer = 1;
4438 /* If we're doing fast math, we don't care about comparison order
4439 wrt NaNs. This lets us use a shorter comparison sequence. */
4440 if (opts->x_flag_finite_math_only)
4441 opts->x_target_flags &= ~MASK_IEEE_FP;
4443 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4444 since the insns won't need emulation. */
4445 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4446 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4448 /* Likewise, if the target doesn't have a 387, or we've specified
4449 software floating point, don't use 387 inline intrinsics. */
4450 if (!TARGET_80387_P (opts->x_target_flags))
4451 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4453 /* Turn on MMX builtins for -msse. */
4454 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4455 opts->x_ix86_isa_flags
4456 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4458 /* Enable SSE prefetch. */
4459 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4460 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4461 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4462 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4463 x86_prefetch_sse = true;
4465 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4466 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4467 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4468 opts->x_ix86_isa_flags
4469 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4471 /* Enable lzcnt instruction for -mabm. */
4472 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
4473 opts->x_ix86_isa_flags
4474 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4476 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4477 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
4478 opts->x_ix86_isa_flags
4479 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4480 & ~opts->x_ix86_isa_flags_explicit);
4482 /* Validate -mpreferred-stack-boundary= value or default it to
4483 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4484 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
4485 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4487 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
4488 int max = TARGET_SEH ? 4 : 12;
4490 if (opts->x_ix86_preferred_stack_boundary_arg < min
4491 || opts->x_ix86_preferred_stack_boundary_arg > max)
4493 if (min == max)
4494 error ("-mpreferred-stack-boundary is not supported "
4495 "for this target");
4496 else
4497 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4498 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4500 else
4501 ix86_preferred_stack_boundary
4502 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
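/* The argument is a power-of-two exponent in bytes, so e.g.
   -mpreferred-stack-boundary=4 gives (1 << 4) * BITS_PER_UNIT = 128,
   i.e. the usual 16-byte stack alignment.  */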
4505 /* Set the default value for -mstackrealign. */
4506 if (!opts_set->x_ix86_force_align_arg_pointer)
4507 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4509 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4511 /* Validate -mincoming-stack-boundary= value or default it to
4512 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4513 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4514 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4516 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4518 if (opts->x_ix86_incoming_stack_boundary_arg < min
4519 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4520 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4521 opts->x_ix86_incoming_stack_boundary_arg, min);
4522 else
4524 ix86_user_incoming_stack_boundary
4525 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4526 ix86_incoming_stack_boundary
4527 = ix86_user_incoming_stack_boundary;
4531 #ifndef NO_PROFILE_COUNTERS
4532 if (flag_nop_mcount)
4533 error ("-mnop-mcount is not compatible with this target");
4534 #endif
4535 if (flag_nop_mcount && flag_pic)
4536 error ("-mnop-mcount is not implemented for -fPIC");
4538 /* Accept -msseregparm only if at least SSE support is enabled. */
4539 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4540 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4541 error (main_args_p
4542 ? G_("%<-msseregparm%> used without SSE enabled")
4543 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4545 if (opts_set->x_ix86_fpmath)
4547 if (opts->x_ix86_fpmath & FPMATH_SSE)
4549 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4551 if (TARGET_80387_P (opts->x_target_flags))
4553 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4554 opts->x_ix86_fpmath = FPMATH_387;
4557 else if ((opts->x_ix86_fpmath & FPMATH_387)
4558 && !TARGET_80387_P (opts->x_target_flags))
4560 warning (0, "387 instruction set disabled, using SSE arithmetics");
4561 opts->x_ix86_fpmath = FPMATH_SSE;
4565 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4566 -mfpmath=387. The latter is nevertheless the default on many targets,
4567 since the extra 80-bit precision of temporaries is considered part of
4568 the ABI. Override the default at least for -ffast-math.
4569 TODO: -mfpmath=both seems to produce equally fast code with slightly
4570 smaller binaries. It is however not clear whether register allocation
4571 is ready for this setting.
4572 Also, -mfpmath=387 codegen is overall noticeably more compact (about
4573 4-5%) than SSE codegen. We may switch to 387 with -ffast-math for
4574 size-optimized functions. */
4575 else if (fast_math_flags_set_p (&global_options)
4576 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4577 opts->x_ix86_fpmath = FPMATH_SSE;
4578 else
4579 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
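/* In practice this means that e.g. plain "-O2 -ffast-math" on an
   SSE2-capable -march selects -mfpmath=sse here, while without
   -ffast-math the configured default (typically 387 for 32-bit,
   SSE for 64-bit targets) is kept.  */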
4581 /* Use external vectorized library in vectorizing intrinsics. */
4582 if (opts_set->x_ix86_veclibabi_type)
4583 switch (opts->x_ix86_veclibabi_type)
4585 case ix86_veclibabi_type_svml:
4586 ix86_veclib_handler = ix86_veclibabi_svml;
4587 break;
4589 case ix86_veclibabi_type_acml:
4590 ix86_veclib_handler = ix86_veclibabi_acml;
4591 break;
4593 default:
4594 gcc_unreachable ();
4597 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4598 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4599 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4601 /* If stack probes are required, the space used for large function
4602 arguments on the stack must also be probed, so enable
4603 -maccumulate-outgoing-args so this happens in the prologue. */
4604 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4605 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4607 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4608 warning (0,
4609 main_args_p
4610 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4611 "for correctness")
4612 : G_("stack probing requires "
4613 "%<target(\"accumulate-outgoing-args\")%> for "
4614 "correctness"));
4615 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4618 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4619 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4620 if (fixed_regs[BP_REG]
4621 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4623 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4624 warning (0,
4625 main_args_p
4626 ? G_("fixed ebp register requires "
4627 "%<-maccumulate-outgoing-args%>")
4628 : G_("fixed ebp register requires "
4629 "%<target(\"accumulate-outgoing-args\")%>"));
4630 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4633 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4635 char *p;
4636 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4637 p = strchr (internal_label_prefix, 'X');
4638 internal_label_prefix_len = p - internal_label_prefix;
4639 *p = '\0';
4642 /* When no scheduling description is available, disable the scheduler pass
4643 so it won't slow down compilation or make x87 code slower. */
4644 if (!TARGET_SCHEDULE)
4645 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4647 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4648 ix86_tune_cost->simultaneous_prefetches,
4649 opts->x_param_values,
4650 opts_set->x_param_values);
4651 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4652 ix86_tune_cost->prefetch_block,
4653 opts->x_param_values,
4654 opts_set->x_param_values);
4655 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4656 ix86_tune_cost->l1_cache_size,
4657 opts->x_param_values,
4658 opts_set->x_param_values);
4659 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4660 ix86_tune_cost->l2_cache_size,
4661 opts->x_param_values,
4662 opts_set->x_param_values);
4664 /* Enable software prefetching at -O3 for CPUs where prefetching is beneficial. */
4665 if (opts->x_flag_prefetch_loop_arrays < 0
4666 && HAVE_prefetch
4667 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4668 && !opts->x_optimize_size
4669 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4670 opts->x_flag_prefetch_loop_arrays = 1;
4672 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4673 can be optimized to ap = __builtin_next_arg (0). */
4674 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4675 targetm.expand_builtin_va_start = NULL;
4677 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4679 ix86_gen_leave = gen_leave_rex64;
4680 if (Pmode == DImode)
4682 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4683 ix86_gen_tls_local_dynamic_base_64
4684 = gen_tls_local_dynamic_base_64_di;
4686 else
4688 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4689 ix86_gen_tls_local_dynamic_base_64
4690 = gen_tls_local_dynamic_base_64_si;
4693 else
4694 ix86_gen_leave = gen_leave;
4696 if (Pmode == DImode)
4698 ix86_gen_add3 = gen_adddi3;
4699 ix86_gen_sub3 = gen_subdi3;
4700 ix86_gen_sub3_carry = gen_subdi3_carry;
4701 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4702 ix86_gen_andsp = gen_anddi3;
4703 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4704 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4705 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4706 ix86_gen_monitor = gen_sse3_monitor_di;
4707 ix86_gen_monitorx = gen_monitorx_di;
4708 ix86_gen_clzero = gen_clzero_di;
4710 else
4712 ix86_gen_add3 = gen_addsi3;
4713 ix86_gen_sub3 = gen_subsi3;
4714 ix86_gen_sub3_carry = gen_subsi3_carry;
4715 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4716 ix86_gen_andsp = gen_andsi3;
4717 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4718 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4719 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4720 ix86_gen_monitor = gen_sse3_monitor_si;
4721 ix86_gen_monitorx = gen_monitorx_si;
4722 ix86_gen_clzero = gen_clzero_si;
4725 #ifdef USE_IX86_CLD
4726 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4727 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4728 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4729 #endif
4731 /* Set the default value for -mfentry. */
4732 if (!opts_set->x_flag_fentry)
4733 opts->x_flag_fentry = TARGET_SEH;
4734 else
4736 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4737 && opts->x_flag_fentry)
4738 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4739 "with -fpic");
4740 else if (TARGET_SEH && !opts->x_flag_fentry)
4741 sorry ("-mno-fentry isn%'t compatible with SEH");
4744 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4745 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4747 if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4748 && TARGET_EMIT_VZEROUPPER)
4749 opts->x_target_flags |= MASK_VZEROUPPER;
4750 if (!(opts_set->x_target_flags & MASK_STV))
4751 opts->x_target_flags |= MASK_STV;
4752 /* Disable STV if -mpreferred-stack-boundary={2,3} or
4753 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
4754 stack realignment would be an extra cost the pass doesn't take into
4755 account, and the pass can't realign the stack. */
4756 if (ix86_preferred_stack_boundary < 128
4757 || ix86_incoming_stack_boundary < 128
4758 || opts->x_ix86_force_align_arg_pointer)
4759 opts->x_target_flags &= ~MASK_STV;
4760 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4761 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4762 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4763 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4764 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4765 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4767 /* Enable 128-bit AVX instruction generation
4768 for the auto-vectorizer. */
4769 if (TARGET_AVX128_OPTIMAL
4770 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4771 opts->x_prefer_vector_width_type = PVW_AVX128;
4773 /* Use 256-bit AVX instruction generation
4774 in the auto-vectorizer. */
4775 if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4776 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4777 opts->x_prefer_vector_width_type = PVW_AVX256;
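/* The -mrecip= string is parsed below as a comma-separated list of the
   recip_options entries above, each optionally prefixed with '!' to turn
   that approximation back off; e.g. "-mrecip=all,!sqrt" would enable every
   reciprocal approximation except the scalar square root one.  */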
4779 if (opts->x_ix86_recip_name)
4781 char *p = ASTRDUP (opts->x_ix86_recip_name);
4782 char *q;
4783 unsigned int mask, i;
4784 bool invert;
4786 while ((q = strtok (p, ",")) != NULL)
4788 p = NULL;
4789 if (*q == '!')
4791 invert = true;
4792 q++;
4794 else
4795 invert = false;
4797 if (!strcmp (q, "default"))
4798 mask = RECIP_MASK_ALL;
4799 else
4801 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4802 if (!strcmp (q, recip_options[i].string))
4804 mask = recip_options[i].mask;
4805 break;
4808 if (i == ARRAY_SIZE (recip_options))
4810 error ("unknown option for -mrecip=%s", q);
4811 invert = false;
4812 mask = RECIP_MASK_NONE;
4816 opts->x_recip_mask_explicit |= mask;
4817 if (invert)
4818 opts->x_recip_mask &= ~mask;
4819 else
4820 opts->x_recip_mask |= mask;
4824 if (TARGET_RECIP_P (opts->x_target_flags))
4825 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4826 else if (opts_set->x_target_flags & MASK_RECIP)
4827 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4829 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4830 for 64-bit Bionic. Also default long double to 64-bit for Intel
4831 MCU psABI. */
4832 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4833 && !(opts_set->x_target_flags
4834 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4835 opts->x_target_flags |= (TARGET_64BIT
4836 ? MASK_LONG_DOUBLE_128
4837 : MASK_LONG_DOUBLE_64);
4839 /* Only one of them can be active. */
4840 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4841 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4843 /* Handle stack protector */
4844 if (!opts_set->x_ix86_stack_protector_guard)
4845 opts->x_ix86_stack_protector_guard
4846 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4848 #ifdef TARGET_THREAD_SSP_OFFSET
4849 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4850 #endif
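/* The -mstack-protector-guard-reg= and -mstack-protector-guard-offset=
   options handled below let freestanding code place the canary at an
   explicit segment-relative location, e.g. so that a kernel can keep
   the guard in per-CPU data rather than in the usual TLS slot.  */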
4852 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4854 char *endp;
4855 const char *str = ix86_stack_protector_guard_offset_str;
4857 errno = 0;
4858 int64_t offset;
4860 #if defined(INT64_T_IS_LONG)
4861 offset = strtol (str, &endp, 0);
4862 #else
4863 offset = strtoll (str, &endp, 0);
4864 #endif
4866 if (!*str || *endp || errno)
4867 error ("%qs is not a valid number "
4868 "in -mstack-protector-guard-offset=", str);
4870 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4871 HOST_WIDE_INT_C (0x7fffffff)))
4872 error ("%qs is not a valid offset "
4873 "in -mstack-protector-guard-offset=", str);
4875 ix86_stack_protector_guard_offset = offset;
4878 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4880 /* The kernel uses a different segment register for performance
4881 reasons; this way a system call does not have to swap the userspace
4882 segment register, which would be expensive. */
4883 if (ix86_cmodel == CM_KERNEL)
4884 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
4886 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4888 const char *str = ix86_stack_protector_guard_reg_str;
4889 addr_space_t seg = ADDR_SPACE_GENERIC;
4891 /* Discard optional register prefix. */
4892 if (str[0] == '%')
4893 str++;
4895 if (strlen (str) == 2 && str[1] == 's')
4897 if (str[0] == 'f')
4898 seg = ADDR_SPACE_SEG_FS;
4899 else if (str[0] == 'g')
4900 seg = ADDR_SPACE_SEG_GS;
4903 if (seg == ADDR_SPACE_GENERIC)
4904 error ("%qs is not a valid base register "
4905 "in -mstack-protector-guard-reg=",
4906 ix86_stack_protector_guard_reg_str);
4908 ix86_stack_protector_guard_reg = seg;
4911 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4912 if (opts->x_ix86_tune_memcpy_strategy)
4914 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4915 ix86_parse_stringop_strategy_string (str, false);
4916 free (str);
4919 if (opts->x_ix86_tune_memset_strategy)
4921 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4922 ix86_parse_stringop_strategy_string (str, true);
4923 free (str);
4926 /* Save the initial options in case the user does function specific
4927 options. */
4928 if (main_args_p)
4929 target_option_default_node = target_option_current_node
4930 = build_target_option_node (opts);
4932 /* Do not support control flow instrumentation if CET is not enabled. */
4933 cf_protection_level cf_protection
4934 = (cf_protection_level) (opts->x_flag_cf_protection & ~CF_SET);
4935 if (cf_protection != CF_NONE)
4937 switch (cf_protection)
4939 case CF_BRANCH:
4940 if (! TARGET_IBT_P (opts->x_ix86_isa_flags2))
4942 error ("%<-fcf-protection=branch%> requires Intel CET "
4943 "support. Use -mcet or -mibt option to enable CET");
4944 flag_cf_protection = CF_NONE;
4945 return false;
4947 break;
4948 case CF_RETURN:
4949 if (! TARGET_SHSTK_P (opts->x_ix86_isa_flags))
4951 error ("%<-fcf-protection=return%> requires Intel CET "
4952 "support. Use -mcet or -mshstk option to enable CET");
4953 flag_cf_protection = CF_NONE;
4954 return false;
4956 break;
4957 case CF_FULL:
4958 if ( ! TARGET_IBT_P (opts->x_ix86_isa_flags2)
4959 || ! TARGET_SHSTK_P (opts->x_ix86_isa_flags))
4961 error ("%<-fcf-protection=full%> requires Intel CET "
4962 "support. Use -mcet or both of -mibt and "
4963 "-mshstk options to enable CET");
4964 flag_cf_protection = CF_NONE;
4965 return false;
4967 break;
4968 default:
4969 gcc_unreachable ();
4972 opts->x_flag_cf_protection =
4973 (cf_protection_level) (cf_protection | CF_SET);
4976 if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS])
4977 maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128,
4978 opts->x_param_values,
4979 opts_set->x_param_values);
4981 return true;
4984 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4986 static void
4987 ix86_option_override (void)
4989 ix86_option_override_internal (true, &global_options, &global_options_set);
4992 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4993 static char *
4994 ix86_offload_options (void)
4996 if (TARGET_LP64)
4997 return xstrdup ("-foffload-abi=lp64");
4998 return xstrdup ("-foffload-abi=ilp32");
5001 /* Update register usage after having seen the compiler flags. */
5003 static void
5004 ix86_conditional_register_usage (void)
5006 int i, c_mask;
5008 /* If there are no caller-saved registers, preserve all registers
5009 except fixed_regs and registers used for the function return value,
5010 since aggregate_value_p checks call_used_regs[regno] on the return
5011 value. */
5012 if (cfun && cfun->machine->no_caller_saved_registers)
5013 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5014 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
5015 call_used_regs[i] = 0;
5017 /* For 32-bit targets, squash the REX registers. */
5018 if (! TARGET_64BIT)
5020 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
5021 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5022 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
5023 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5024 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
5025 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5028 /* See the definition of CALL_USED_REGISTERS in i386.h. */
5029 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
5031 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
5033 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5035 /* Set/reset conditionally defined registers from
5036 CALL_USED_REGISTERS initializer. */
5037 if (call_used_regs[i] > 1)
5038 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
5040 /* Calculate registers of CLOBBERED_REGS register set
5041 as call used registers from GENERAL_REGS register set. */
5042 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
5043 && call_used_regs[i])
5044 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
5047 /* If MMX is disabled, squash the registers. */
5048 if (! TARGET_MMX)
5049 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5050 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
5051 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5053 /* If SSE is disabled, squash the registers. */
5054 if (! TARGET_SSE)
5055 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5056 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
5057 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5059 /* If the FPU is disabled, squash the registers. */
5060 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
5061 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5062 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
5063 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5065 /* If AVX512F is disabled, squash the registers. */
5066 if (! TARGET_AVX512F)
5068 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
5069 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5071 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
5072 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5075 /* If MPX is disabled, squash the registers. */
5076 if (! TARGET_MPX)
5077 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
5078 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5081 /* Canonicalize a comparison from one we don't have to one we do have. */
5083 static void
5084 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
5085 bool op0_preserve_value)
5087 /* The order of operands in an x87 ficom compare is forced by combine in
5088 the simplify_comparison () function. The FLOAT operator is treated as
5089 RTX_OBJ with precedence over other operators and is always put first.
5090 Swap the condition and operands to match the ficom instruction. */
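/* For example, (lt (float (mem)) (reg)) is rewritten below as
   (gt (reg) (float (mem))) (swap_condition turns LT into GT), so the
   memory operand ends up where ficom expects it.  */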
5091 if (!op0_preserve_value
5092 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
5094 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
5096 /* We are called only for compares that are split to SAHF instruction.
5097 Ensure that we have setcc/jcc insn for the swapped condition. */
5098 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
5100 std::swap (*op0, *op1);
5101 *code = (int) scode;
5106 /* Save the current options */
5108 static void
5109 ix86_function_specific_save (struct cl_target_option *ptr,
5110 struct gcc_options *opts)
5112 ptr->arch = ix86_arch;
5113 ptr->schedule = ix86_schedule;
5114 ptr->prefetch_sse = x86_prefetch_sse;
5115 ptr->tune = ix86_tune;
5116 ptr->branch_cost = ix86_branch_cost;
5117 ptr->tune_defaulted = ix86_tune_defaulted;
5118 ptr->arch_specified = ix86_arch_specified;
5119 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5120 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5121 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5122 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5123 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5124 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5125 ptr->x_ix86_abi = opts->x_ix86_abi;
5126 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5127 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5128 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5129 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5130 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5131 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5132 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5133 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5134 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5135 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5136 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5137 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5138 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5139 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5140 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5141 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5142 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5143 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5144 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5145 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5147 /* The fields are char but the variables are not; make sure the
5148 values fit in the fields. */
5149 gcc_assert (ptr->arch == ix86_arch);
5150 gcc_assert (ptr->schedule == ix86_schedule);
5151 gcc_assert (ptr->tune == ix86_tune);
5152 gcc_assert (ptr->branch_cost == ix86_branch_cost);
5155 /* Restore the current options */
5157 static void
5158 ix86_function_specific_restore (struct gcc_options *opts,
5159 struct cl_target_option *ptr)
5161 enum processor_type old_tune = ix86_tune;
5162 enum processor_type old_arch = ix86_arch;
5163 unsigned HOST_WIDE_INT ix86_arch_mask;
5164 int i;
5166 /* We don't change -fPIC. */
5167 opts->x_flag_pic = flag_pic;
5169 ix86_arch = (enum processor_type) ptr->arch;
5170 ix86_schedule = (enum attr_cpu) ptr->schedule;
5171 ix86_tune = (enum processor_type) ptr->tune;
5172 x86_prefetch_sse = ptr->prefetch_sse;
5173 opts->x_ix86_branch_cost = ptr->branch_cost;
5174 ix86_tune_defaulted = ptr->tune_defaulted;
5175 ix86_arch_specified = ptr->arch_specified;
5176 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5177 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5178 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5179 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5180 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5181 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5182 opts->x_ix86_abi = ptr->x_ix86_abi;
5183 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5184 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5185 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5186 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5187 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5188 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5189 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5190 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5191 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5192 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5193 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5194 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5195 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5196 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5197 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5198 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5199 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5200 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5201 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5202 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5203 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5204 /* TODO: ix86_cost should be chosen at instruction or function granularity,
5205 so that for cold code we can use size_cost even in !optimize_size compilation. */
5206 if (opts->x_optimize_size)
5207 ix86_cost = &ix86_size_cost;
5208 else
5209 ix86_cost = ix86_tune_cost;
5211 /* Recreate the arch feature tests if the arch changed */
5212 if (old_arch != ix86_arch)
5214 ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch;
5215 for (i = 0; i < X86_ARCH_LAST; ++i)
5216 ix86_arch_features[i]
5217 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5220 /* Recreate the tune optimization tests */
5221 if (old_tune != ix86_tune)
5222 set_ix86_tune_features (ix86_tune, false);
5225 /* Adjust target options after streaming them in. This is mainly about
5226 reconciling them with global options. */
5228 static void
5229 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5231 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
5232 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
5233 for PIC, or error out. */
5234 if (flag_pic)
5235 switch (ptr->x_ix86_cmodel)
5237 case CM_SMALL:
5238 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5239 break;
5241 case CM_MEDIUM:
5242 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5243 break;
5245 case CM_LARGE:
5246 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5247 break;
5249 case CM_KERNEL:
5250 error ("code model %s does not support PIC mode", "kernel");
5251 break;
5253 default:
5254 break;
5256 else
5257 switch (ptr->x_ix86_cmodel)
5259 case CM_SMALL_PIC:
5260 ptr->x_ix86_cmodel = CM_SMALL;
5261 break;
5263 case CM_MEDIUM_PIC:
5264 ptr->x_ix86_cmodel = CM_MEDIUM;
5265 break;
5267 case CM_LARGE_PIC:
5268 ptr->x_ix86_cmodel = CM_LARGE;
5269 break;
5271 default:
5272 break;
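/* Illustrative sketch (not from the original sources): a target node streamed
   in from an LTO object built with -mcmodel=medium is remapped here to
   CM_MEDIUM_PIC when the link happens under -fPIC, and back to CM_MEDIUM when
   it does not; -mcmodel=kernel has no PIC variant, so that combination is
   diagnosed above instead of being remapped.  */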
5276 /* Print the current options */
5278 static void
5279 ix86_function_specific_print (FILE *file, int indent,
5280 struct cl_target_option *ptr)
5282 char *target_string
5283 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5284 ptr->x_target_flags, ptr->x_ix86_target_flags,
5285 NULL, NULL, ptr->x_ix86_fpmath, false);
5287 gcc_assert (ptr->arch < PROCESSOR_max);
5288 fprintf (file, "%*sarch = %d (%s)\n",
5289 indent, "",
5290 ptr->arch, processor_target_table[ptr->arch].name);
5292 gcc_assert (ptr->tune < PROCESSOR_max);
5293 fprintf (file, "%*stune = %d (%s)\n",
5294 indent, "",
5295 ptr->tune, processor_target_table[ptr->tune].name);
5297 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5299 if (target_string)
5301 fprintf (file, "%*s%s\n", indent, "", target_string);
5302 free (target_string);
5307 /* Inner function to process the attribute((target(...))); it takes an argument
5308 and sets the current options from that argument. If we have a list,
5309 recursively go over the list. */
5311 static bool
5312 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5313 struct gcc_options *opts,
5314 struct gcc_options *opts_set,
5315 struct gcc_options *enum_opts_set)
5317 char *next_optstr;
5318 bool ret = true;
5320 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5321 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5322 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5323 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5324 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
5326 enum ix86_opt_type
5328 ix86_opt_unknown,
5329 ix86_opt_yes,
5330 ix86_opt_no,
5331 ix86_opt_str,
5332 ix86_opt_enum,
5333 ix86_opt_isa
5336 static const struct
5338 const char *string;
5339 size_t len;
5340 enum ix86_opt_type type;
5341 int opt;
5342 int mask;
5343 } attrs[] = {
5344 /* isa options */
5345 IX86_ATTR_ISA ("pconfig", OPT_mpconfig),
5346 IX86_ATTR_ISA ("wbnoinvd", OPT_mwbnoinvd),
5347 IX86_ATTR_ISA ("sgx", OPT_msgx),
5348 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5349 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5350 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5351 IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5352 IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
5353 IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg),
5355 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5356 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5357 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5358 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5359 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5360 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5361 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5362 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5363 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5364 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5365 IX86_ATTR_ISA ("fma", OPT_mfma),
5366 IX86_ATTR_ISA ("xop", OPT_mxop),
5367 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5368 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5369 IX86_ATTR_ISA ("avx", OPT_mavx),
5370 IX86_ATTR_ISA ("sse4", OPT_msse4),
5371 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5372 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5373 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5374 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5375 IX86_ATTR_ISA ("sse3", OPT_msse3),
5376 IX86_ATTR_ISA ("aes", OPT_maes),
5377 IX86_ATTR_ISA ("sha", OPT_msha),
5378 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5379 IX86_ATTR_ISA ("sse2", OPT_msse2),
5380 IX86_ATTR_ISA ("sse", OPT_msse),
5381 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5382 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5383 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5384 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5385 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5386 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5387 IX86_ATTR_ISA ("adx", OPT_madx),
5388 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5389 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5390 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5391 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5392 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5393 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5394 IX86_ATTR_ISA ("abm", OPT_mabm),
5395 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5396 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5397 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5398 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5399 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5400 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5401 IX86_ATTR_ISA ("sahf", OPT_msahf),
5402 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5403 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5404 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5405 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5406 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5407 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5408 IX86_ATTR_ISA ("pku", OPT_mpku),
5409 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5410 IX86_ATTR_ISA ("hle", OPT_mhle),
5411 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5412 IX86_ATTR_ISA ("mpx", OPT_mmpx),
5413 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5414 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5415 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5416 IX86_ATTR_ISA ("ibt", OPT_mibt),
5417 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5418 IX86_ATTR_ISA ("vaes", OPT_mvaes),
5419 IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq),
5421 /* enum options */
5422 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5424 /* string options */
5425 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5426 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5428 /* flag options */
5429 IX86_ATTR_YES ("cld",
5430 OPT_mcld,
5431 MASK_CLD),
5433 IX86_ATTR_NO ("fancy-math-387",
5434 OPT_mfancy_math_387,
5435 MASK_NO_FANCY_MATH_387),
5437 IX86_ATTR_YES ("ieee-fp",
5438 OPT_mieee_fp,
5439 MASK_IEEE_FP),
5441 IX86_ATTR_YES ("inline-all-stringops",
5442 OPT_minline_all_stringops,
5443 MASK_INLINE_ALL_STRINGOPS),
5445 IX86_ATTR_YES ("inline-stringops-dynamically",
5446 OPT_minline_stringops_dynamically,
5447 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5449 IX86_ATTR_NO ("align-stringops",
5450 OPT_mno_align_stringops,
5451 MASK_NO_ALIGN_STRINGOPS),
5453 IX86_ATTR_YES ("recip",
5454 OPT_mrecip,
5455 MASK_RECIP),
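/* Illustrative example (hypothetical user code): an attribute string such as

     __attribute__((target ("avx2,no-fma,arch=haswell,fpmath=sse")))
     int foo (int x);

   is split on commas below; "avx2" and "no-fma" hit ix86_opt_isa entries,
   "arch=haswell" hits the ix86_opt_str entry for IX86_FUNCTION_SPECIFIC_ARCH,
   and "fpmath=sse" hits the ix86_opt_enum entry for OPT_mfpmath_.  */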
5459 /* If this is a list, recurse to get the options. */
5460 if (TREE_CODE (args) == TREE_LIST)
5462 bool ret = true;
5464 for (; args; args = TREE_CHAIN (args))
5465 if (TREE_VALUE (args)
5466 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5467 p_strings, opts, opts_set,
5468 enum_opts_set))
5469 ret = false;
5471 return ret;
5474 else if (TREE_CODE (args) != STRING_CST)
5476 error ("attribute %<target%> argument not a string");
5477 return false;
5480 /* Handle multiple arguments separated by commas. */
5481 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5483 while (next_optstr && *next_optstr != '\0')
5485 char *p = next_optstr;
5486 char *orig_p = p;
5487 char *comma = strchr (next_optstr, ',');
5488 const char *opt_string;
5489 size_t len, opt_len;
5490 int opt;
5491 bool opt_set_p;
5492 char ch;
5493 unsigned i;
5494 enum ix86_opt_type type = ix86_opt_unknown;
5495 int mask = 0;
5497 if (comma)
5499 *comma = '\0';
5500 len = comma - next_optstr;
5501 next_optstr = comma + 1;
5503 else
5505 len = strlen (p);
5506 next_optstr = NULL;
5509 /* Recognize no-xxx. */
5510 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5512 opt_set_p = false;
5513 p += 3;
5514 len -= 3;
5516 else
5517 opt_set_p = true;
5519 /* Find the option. */
5520 ch = *p;
5521 opt = N_OPTS;
5522 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5524 type = attrs[i].type;
5525 opt_len = attrs[i].len;
5526 if (ch == attrs[i].string[0]
5527 && ((type != ix86_opt_str && type != ix86_opt_enum)
5528 ? len == opt_len
5529 : len > opt_len)
5530 && memcmp (p, attrs[i].string, opt_len) == 0)
5532 opt = attrs[i].opt;
5533 mask = attrs[i].mask;
5534 opt_string = attrs[i].string;
5535 break;
5539 /* Process the option. */
5540 if (opt == N_OPTS)
5542 error ("attribute(target(\"%s\")) is unknown", orig_p);
5543 ret = false;
5546 else if (type == ix86_opt_isa)
5548 struct cl_decoded_option decoded;
5550 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5551 ix86_handle_option (opts, opts_set,
5552 &decoded, input_location);
5555 else if (type == ix86_opt_yes || type == ix86_opt_no)
5557 if (type == ix86_opt_no)
5558 opt_set_p = !opt_set_p;
5560 if (opt_set_p)
5561 opts->x_target_flags |= mask;
5562 else
5563 opts->x_target_flags &= ~mask;
5566 else if (type == ix86_opt_str)
5568 if (p_strings[opt])
5570 error ("option(\"%s\") was already specified", opt_string);
5571 ret = false;
5573 else
5574 p_strings[opt] = xstrdup (p + opt_len);
5577 else if (type == ix86_opt_enum)
5579 bool arg_ok;
5580 int value;
5582 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5583 if (arg_ok)
5584 set_option (opts, enum_opts_set, opt, value,
5585 p + opt_len, DK_UNSPECIFIED, input_location,
5586 global_dc);
5587 else
5589 error ("attribute(target(\"%s\")) is unknown", orig_p);
5590 ret = false;
5594 else
5595 gcc_unreachable ();
5598 return ret;
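/* Illustrative sketch (assumed behaviour, matching the loop above): for a flag
   entry such as "cld", writing "no-cld" flips opt_set_p, so the MASK_CLD bit is
   cleared from x_target_flags instead of set; for ix86_opt_no entries such as
   "align-stringops" the sense is inverted once more before the mask is
   applied.  */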
5601 /* Release allocated strings. */
5602 static void
5603 release_options_strings (char **option_strings)
5605 /* Free up memory allocated to hold the strings */
5606 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5607 free (option_strings[i]);
5610 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5612 tree
5613 ix86_valid_target_attribute_tree (tree args,
5614 struct gcc_options *opts,
5615 struct gcc_options *opts_set)
5617 const char *orig_arch_string = opts->x_ix86_arch_string;
5618 const char *orig_tune_string = opts->x_ix86_tune_string;
5619 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5620 int orig_tune_defaulted = ix86_tune_defaulted;
5621 int orig_arch_specified = ix86_arch_specified;
5622 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5623 tree t = NULL_TREE;
5624 struct cl_target_option *def
5625 = TREE_TARGET_OPTION (target_option_default_node);
5626 struct gcc_options enum_opts_set;
5628 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5630 /* Process each of the options on the chain. */
5631 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5632 opts_set, &enum_opts_set))
5633 return error_mark_node;
5635 /* If the changed options are different from the default, rerun
5636 ix86_option_override_internal, and then save the options away.
5637 The string options are attribute options, and will be undone
5638 when we copy the save structure. */
5639 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5640 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5641 || opts->x_target_flags != def->x_target_flags
5642 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5643 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5644 || enum_opts_set.x_ix86_fpmath)
5646 /* If we are using the default tune= or arch=, undo the string assigned,
5647 and use the default. */
5648 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5650 opts->x_ix86_arch_string
5651 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5653 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5654 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5655 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5656 | OPTION_MASK_ABI_64
5657 | OPTION_MASK_ABI_X32
5658 | OPTION_MASK_CODE16);
5659 opts->x_ix86_isa_flags2 = 0;
5661 else if (!orig_arch_specified)
5662 opts->x_ix86_arch_string = NULL;
5664 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5665 opts->x_ix86_tune_string
5666 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5667 else if (orig_tune_defaulted)
5668 opts->x_ix86_tune_string = NULL;
5670 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5671 if (enum_opts_set.x_ix86_fpmath)
5672 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5674 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5675 bool r = ix86_option_override_internal (false, opts, opts_set);
5676 if (!r)
5678 release_options_strings (option_strings);
5679 return error_mark_node;
5682 /* Add any builtin functions with the new isa if any. */
5683 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5685 /* Save the current options unless we are validating options for
5686 #pragma. */
5687 t = build_target_option_node (opts);
5689 opts->x_ix86_arch_string = orig_arch_string;
5690 opts->x_ix86_tune_string = orig_tune_string;
5691 opts_set->x_ix86_fpmath = orig_fpmath_set;
5693 release_options_strings (option_strings);
5696 return t;
5699 /* Hook to validate attribute((target("string"))). */
5701 static bool
5702 ix86_valid_target_attribute_p (tree fndecl,
5703 tree ARG_UNUSED (name),
5704 tree args,
5705 int ARG_UNUSED (flags))
5707 struct gcc_options func_options;
5708 tree new_target, new_optimize;
5709 bool ret = true;
5711 /* attribute((target("default"))) does nothing, beyond
5712 affecting multi-versioning. */
5713 if (TREE_VALUE (args)
5714 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5715 && TREE_CHAIN (args) == NULL_TREE
5716 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5717 return true;
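/* For illustration (hypothetical user code, C++ function multi-versioning):

     __attribute__((target ("default"))) int dispatchable (void) { return 0; }
     __attribute__((target ("avx2")))    int dispatchable (void) { return 1; }

   the "default" variant is accepted here without building a target node; only
   the non-default variants go through ix86_valid_target_attribute_tree.  */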
5719 tree old_optimize = build_optimization_node (&global_options);
5721 /* Get the optimization options of the current function. */
5722 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5724 if (!func_optimize)
5725 func_optimize = old_optimize;
5727 /* Init func_options. */
5728 memset (&func_options, 0, sizeof (func_options));
5729 init_options_struct (&func_options, NULL);
5730 lang_hooks.init_options_struct (&func_options);
5732 cl_optimization_restore (&func_options,
5733 TREE_OPTIMIZATION (func_optimize));
5735 /* Initialize func_options to the default before its target options can
5736 be set. */
5737 cl_target_option_restore (&func_options,
5738 TREE_TARGET_OPTION (target_option_default_node));
5740 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5741 &global_options_set);
5743 new_optimize = build_optimization_node (&func_options);
5745 if (new_target == error_mark_node)
5746 ret = false;
5748 else if (fndecl && new_target)
5750 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5752 if (old_optimize != new_optimize)
5753 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5756 finalize_options_struct (&func_options);
5758 return ret;
5762 /* Hook to determine if one function can safely inline another. */
5764 static bool
5765 ix86_can_inline_p (tree caller, tree callee)
5767 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5768 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5769 if (!callee_tree)
5770 callee_tree = target_option_default_node;
5771 if (!caller_tree)
5772 caller_tree = target_option_default_node;
5773 if (callee_tree == caller_tree)
5774 return true;
5776 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5777 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5778 bool ret = false;
5780 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
5781 function can inline an SSE2 function but an SSE2 function can't inline
5782 an SSE4 function. */
5783 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5784 != callee_opts->x_ix86_isa_flags)
5785 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5786 != callee_opts->x_ix86_isa_flags2))
5787 ret = false;
5789 /* See if we have the same non-isa options. */
5790 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
5791 ret = false;
5793 /* See if arch, tune, etc. are the same. */
5794 else if (caller_opts->arch != callee_opts->arch)
5795 ret = false;
5797 else if (caller_opts->tune != callee_opts->tune)
5798 ret = false;
5800 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5801 /* If the callee doesn't use FP expressions, differences in
5802 ix86_fpmath can be ignored. We are called from FEs
5803 for multi-versioning call optimization, so beware of
5804 ipa_fn_summaries not being available. */
5805 && (! ipa_fn_summaries
5806 || ipa_fn_summaries->get
5807 (cgraph_node::get (callee))->fp_expressions))
5808 ret = false;
5810 else if (caller_opts->branch_cost != callee_opts->branch_cost)
5811 ret = false;
5813 else
5814 ret = true;
5816 return ret;
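/* Rough example of the rule above (a sketch, not taken from the sources):
   a caller compiled with -mavx2 may inline a callee declared with
   __attribute__((target ("sse4.2"))), because AVX2 implies the callee's ISA
   bits, while the reverse inlining is refused; arch=, tune=, target-flag and
   branch-cost differences veto inlining as well.  */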
5820 /* Remember the last target of ix86_set_current_function. */
5821 static GTY(()) tree ix86_previous_fndecl;
5823 /* Set targets globals to the default (or current #pragma GCC target
5824 if active). Invalidate ix86_previous_fndecl cache. */
5826 void
5827 ix86_reset_previous_fndecl (void)
5829 tree new_tree = target_option_current_node;
5830 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5831 if (TREE_TARGET_GLOBALS (new_tree))
5832 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5833 else if (new_tree == target_option_default_node)
5834 restore_target_globals (&default_target_globals);
5835 else
5836 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5837 ix86_previous_fndecl = NULL_TREE;
5840 /* Set the func_type field from the function FNDECL. */
5842 static void
5843 ix86_set_func_type (tree fndecl)
5845 if (cfun->machine->func_type == TYPE_UNKNOWN)
5847 if (lookup_attribute ("interrupt",
5848 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5850 if (ix86_function_naked (fndecl))
5851 error_at (DECL_SOURCE_LOCATION (fndecl),
5852 "interrupt and naked attributes are not compatible");
5854 int nargs = 0;
5855 for (tree arg = DECL_ARGUMENTS (fndecl);
5856 arg;
5857 arg = TREE_CHAIN (arg))
5858 nargs++;
5859 cfun->machine->no_caller_saved_registers = true;
5860 cfun->machine->func_type
5861 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5863 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5865 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5866 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5867 sorry ("only DWARF debug format is supported for interrupt "
5868 "service routine");
5870 else
5872 cfun->machine->func_type = TYPE_NORMAL;
5873 if (lookup_attribute ("no_caller_saved_registers",
5874 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5875 cfun->machine->no_caller_saved_registers = true;
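/* For illustration (hypothetical user code; uword_t stands for an unsigned
   integer the width of a machine word):

     __attribute__((interrupt))
     void isr (struct interrupt_frame *frame);                  // 1 arg -> TYPE_INTERRUPT

     __attribute__((interrupt))
     void fault (struct interrupt_frame *frame, uword_t error); // 2 args -> TYPE_EXCEPTION

   which matches the nargs-based classification above.  */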
5880 /* Set the indirect_branch_type field from the function FNDECL. */
5882 static void
5883 ix86_set_indirect_branch_type (tree fndecl)
5885 if (cfun->machine->indirect_branch_type == indirect_branch_unset)
5887 tree attr = lookup_attribute ("indirect_branch",
5888 DECL_ATTRIBUTES (fndecl));
5889 if (attr != NULL)
5891 tree args = TREE_VALUE (attr);
5892 if (args == NULL)
5893 gcc_unreachable ();
5894 tree cst = TREE_VALUE (args);
5895 if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5896 cfun->machine->indirect_branch_type = indirect_branch_keep;
5897 else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5898 cfun->machine->indirect_branch_type = indirect_branch_thunk;
5899 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5900 cfun->machine->indirect_branch_type = indirect_branch_thunk_inline;
5901 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5902 cfun->machine->indirect_branch_type = indirect_branch_thunk_extern;
5903 else
5904 gcc_unreachable ();
5906 else
5907 cfun->machine->indirect_branch_type = ix86_indirect_branch;
5909 /* -mcmodel=large is not compatible with -mindirect-branch=thunk
5910 nor -mindirect-branch=thunk-extern. */
5911 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5912 && ((cfun->machine->indirect_branch_type
5913 == indirect_branch_thunk_extern)
5914 || (cfun->machine->indirect_branch_type
5915 == indirect_branch_thunk)))
5916 error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not "
5917 "compatible",
5918 ((cfun->machine->indirect_branch_type
5919 == indirect_branch_thunk_extern)
5920 ? "thunk-extern" : "thunk"));
5922 /* -mindirect-branch=thunk-extern, -fcf-protection=branch and
5923 -fcheck-pointer-bounds are not compatible. */
5924 if ((cfun->machine->indirect_branch_type
5925 == indirect_branch_thunk_extern)
5926 && flag_check_pointer_bounds
5927 && (flag_cf_protection & CF_BRANCH) != 0)
5928 error ("%<-mindirect-branch=thunk-extern%>, "
5929 "%<-fcf-protection=branch%> and "
5930 "%<-fcheck-pointer-bounds%> are not compatible");
5933 if (cfun->machine->function_return_type == indirect_branch_unset)
5935 tree attr = lookup_attribute ("function_return",
5936 DECL_ATTRIBUTES (fndecl));
5937 if (attr != NULL)
5939 tree args = TREE_VALUE (attr);
5940 if (args == NULL)
5941 gcc_unreachable ();
5942 tree cst = TREE_VALUE (args);
5943 if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5944 cfun->machine->function_return_type = indirect_branch_keep;
5945 else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5946 cfun->machine->function_return_type = indirect_branch_thunk;
5947 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5948 cfun->machine->function_return_type = indirect_branch_thunk_inline;
5949 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5950 cfun->machine->function_return_type = indirect_branch_thunk_extern;
5951 else
5952 gcc_unreachable ();
5954 else
5955 cfun->machine->function_return_type = ix86_function_return;
5957 /* -mcmodel=large is not compatible with -mfunction-return=thunk
5958 nor -mfunction-return=thunk-extern. */
5959 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5960 && ((cfun->machine->function_return_type
5961 == indirect_branch_thunk_extern)
5962 || (cfun->machine->function_return_type
5963 == indirect_branch_thunk)))
5964 error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not "
5965 "compatible",
5966 ((cfun->machine->function_return_type
5967 == indirect_branch_thunk_extern)
5968 ? "thunk-extern" : "thunk"));
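/* For illustration (hypothetical user code):

     __attribute__((indirect_branch ("thunk-extern")))
     int dispatch (int (*fp) (void)) { return fp (); }

     __attribute__((function_return ("thunk")))
     int leaf (void) { return 0; }

   these per-function attributes override -mindirect-branch= and
   -mfunction-return=, subject to the -mcmodel=large checks above.  */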
5972 /* Establish appropriate back-end context for processing the function
5973 FNDECL. The argument might be NULL to indicate processing at top
5974 level, outside of any function scope. */
5975 static void
5976 ix86_set_current_function (tree fndecl)
5978 /* Only change the context if the function changes. This hook is called
5979 several times in the course of compiling a function, and we don't want to
5980 slow things down too much or call target_reinit when it isn't safe. */
5981 if (fndecl == ix86_previous_fndecl)
5983 /* There may be 2 function bodies for the same function FNDECL,
5984 one is extern inline and one isn't. Call ix86_set_func_type
5985 to set the func_type field. */
5986 if (fndecl != NULL_TREE)
5988 ix86_set_func_type (fndecl);
5989 ix86_set_indirect_branch_type (fndecl);
5991 return;
5994 tree old_tree;
5995 if (ix86_previous_fndecl == NULL_TREE)
5996 old_tree = target_option_current_node;
5997 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5998 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5999 else
6000 old_tree = target_option_default_node;
6002 if (fndecl == NULL_TREE)
6004 if (old_tree != target_option_current_node)
6005 ix86_reset_previous_fndecl ();
6006 return;
6009 ix86_set_func_type (fndecl);
6010 ix86_set_indirect_branch_type (fndecl);
6012 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
6013 if (new_tree == NULL_TREE)
6014 new_tree = target_option_default_node;
6016 if (old_tree != new_tree)
6018 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
6019 if (TREE_TARGET_GLOBALS (new_tree))
6020 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
6021 else if (new_tree == target_option_default_node)
6022 restore_target_globals (&default_target_globals);
6023 else
6024 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
6026 ix86_previous_fndecl = fndecl;
6028 static bool prev_no_caller_saved_registers;
6030 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
6031 Avoid expensive re-initialization of init_regs each time we switch
6032 function context. */
6033 if (TARGET_64BIT
6034 && (call_used_regs[SI_REG]
6035 == (cfun->machine->call_abi == MS_ABI)))
6036 reinit_regs ();
6037 /* Need to re-initialize init_regs if caller-saved registers are
6038 changed. */
6039 else if (prev_no_caller_saved_registers
6040 != cfun->machine->no_caller_saved_registers)
6041 reinit_regs ();
6043 if (cfun->machine->func_type != TYPE_NORMAL
6044 || cfun->machine->no_caller_saved_registers)
6046 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
6047 may change processor state. */
6048 const char *isa;
6049 if (TARGET_MPX)
6050 isa = "MPX";
6051 else if (TARGET_SSE)
6052 isa = "SSE";
6053 else if (TARGET_MMX)
6054 isa = "MMX/3Dnow";
6055 else if (TARGET_80387)
6056 isa = "80387";
6057 else
6058 isa = NULL;
6059 if (isa != NULL)
6061 if (cfun->machine->func_type != TYPE_NORMAL)
6062 sorry ("%s instructions aren't allowed in %s service routine",
6063 isa, (cfun->machine->func_type == TYPE_EXCEPTION
6064 ? "exception" : "interrupt"));
6065 else
6066 sorry ("%s instructions aren't allowed in function with "
6067 "no_caller_saved_registers attribute", isa);
6068 /* Don't issue the same error twice. */
6069 cfun->machine->func_type = TYPE_NORMAL;
6070 cfun->machine->no_caller_saved_registers = false;
6074 prev_no_caller_saved_registers
6075 = cfun->machine->no_caller_saved_registers;
6079 /* Return true if this goes in large data/bss. */
6081 static bool
6082 ix86_in_large_data_p (tree exp)
6084 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
6085 return false;
6087 if (exp == NULL_TREE)
6088 return false;
6090 /* Functions are never large data. */
6091 if (TREE_CODE (exp) == FUNCTION_DECL)
6092 return false;
6094 /* Automatic variables are never large data. */
6095 if (VAR_P (exp) && !is_global_var (exp))
6096 return false;
6098 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
6100 const char *section = DECL_SECTION_NAME (exp);
6101 if (strcmp (section, ".ldata") == 0
6102 || strcmp (section, ".lbss") == 0)
6103 return true;
6104 return false;
6106 else
6108 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
6110 /* If this is an incomplete type with size 0, then we can't put it
6111 in data because it might be too big when completed. Also,
6112 int_size_in_bytes returns -1 if the size can vary or is larger than
6113 an integer, in which case it is also safer to assume that it goes in
6114 large data. */
6115 if (size <= 0 || size > ix86_section_threshold)
6116 return true;
6119 return false;
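/* Worked example (a sketch, hypothetical user code): with -mcmodel=medium and
   the default -mlarge-data-threshold of 65536, a definition such as

     static char big_buffer[1 << 20];

   exceeds ix86_section_threshold and is treated as large data, so the section
   selection below places it in .lbss (or .ldata if it were initialized).  */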
6122 /* i386-specific section flag to mark large sections. */
6123 #define SECTION_LARGE SECTION_MACH_DEP
6125 /* Switch to the appropriate section for output of DECL.
6126 DECL is either a `VAR_DECL' node or a constant of some sort.
6127 RELOC indicates whether forming the initial value of DECL requires
6128 link-time relocations. */
6130 ATTRIBUTE_UNUSED static section *
6131 x86_64_elf_select_section (tree decl, int reloc,
6132 unsigned HOST_WIDE_INT align)
6134 if (ix86_in_large_data_p (decl))
6136 const char *sname = NULL;
6137 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
6138 switch (categorize_decl_for_section (decl, reloc))
6140 case SECCAT_DATA:
6141 sname = ".ldata";
6142 break;
6143 case SECCAT_DATA_REL:
6144 sname = ".ldata.rel";
6145 break;
6146 case SECCAT_DATA_REL_LOCAL:
6147 sname = ".ldata.rel.local";
6148 break;
6149 case SECCAT_DATA_REL_RO:
6150 sname = ".ldata.rel.ro";
6151 break;
6152 case SECCAT_DATA_REL_RO_LOCAL:
6153 sname = ".ldata.rel.ro.local";
6154 break;
6155 case SECCAT_BSS:
6156 sname = ".lbss";
6157 flags |= SECTION_BSS;
6158 break;
6159 case SECCAT_RODATA:
6160 case SECCAT_RODATA_MERGE_STR:
6161 case SECCAT_RODATA_MERGE_STR_INIT:
6162 case SECCAT_RODATA_MERGE_CONST:
6163 sname = ".lrodata";
6164 flags &= ~SECTION_WRITE;
6165 break;
6166 case SECCAT_SRODATA:
6167 case SECCAT_SDATA:
6168 case SECCAT_SBSS:
6169 gcc_unreachable ();
6170 case SECCAT_TEXT:
6171 case SECCAT_TDATA:
6172 case SECCAT_TBSS:
6173 /* We don't split these for the medium model. Place them into
6174 default sections and hope for the best. */
6175 break;
6177 if (sname)
6179 /* We might get called with string constants, but get_named_section
6180 doesn't like them as they are not DECLs. Also, we need to set
6181 flags in that case. */
6182 if (!DECL_P (decl))
6183 return get_section (sname, flags, NULL);
6184 return get_named_section (decl, sname, reloc);
6187 return default_elf_select_section (decl, reloc, align);
6190 /* Select a set of attributes for section NAME based on the properties
6191 of DECL and whether or not RELOC indicates that DECL's initializer
6192 might contain runtime relocations. */
6194 static unsigned int ATTRIBUTE_UNUSED
6195 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6197 unsigned int flags = default_section_type_flags (decl, name, reloc);
6199 if (ix86_in_large_data_p (decl))
6200 flags |= SECTION_LARGE;
6202 if (decl == NULL_TREE
6203 && (strcmp (name, ".ldata.rel.ro") == 0
6204 || strcmp (name, ".ldata.rel.ro.local") == 0))
6205 flags |= SECTION_RELRO;
6207 if (strcmp (name, ".lbss") == 0
6208 || strncmp (name, ".lbss.", sizeof (".lbss.") - 1) == 0
6209 || strncmp (name, ".gnu.linkonce.lb.", sizeof (".gnu.linkonce.lb.") - 1) == 0)
6210 flags |= SECTION_BSS;
6212 return flags;
6215 /* Build up a unique section name, expressed as a
6216 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6217 RELOC indicates whether the initial value of EXP requires
6218 link-time relocations. */
6220 static void ATTRIBUTE_UNUSED
6221 x86_64_elf_unique_section (tree decl, int reloc)
6223 if (ix86_in_large_data_p (decl))
6225 const char *prefix = NULL;
6226 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6227 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6229 switch (categorize_decl_for_section (decl, reloc))
6231 case SECCAT_DATA:
6232 case SECCAT_DATA_REL:
6233 case SECCAT_DATA_REL_LOCAL:
6234 case SECCAT_DATA_REL_RO:
6235 case SECCAT_DATA_REL_RO_LOCAL:
6236 prefix = one_only ? ".ld" : ".ldata";
6237 break;
6238 case SECCAT_BSS:
6239 prefix = one_only ? ".lb" : ".lbss";
6240 break;
6241 case SECCAT_RODATA:
6242 case SECCAT_RODATA_MERGE_STR:
6243 case SECCAT_RODATA_MERGE_STR_INIT:
6244 case SECCAT_RODATA_MERGE_CONST:
6245 prefix = one_only ? ".lr" : ".lrodata";
6246 break;
6247 case SECCAT_SRODATA:
6248 case SECCAT_SDATA:
6249 case SECCAT_SBSS:
6250 gcc_unreachable ();
6251 case SECCAT_TEXT:
6252 case SECCAT_TDATA:
6253 case SECCAT_TBSS:
6254 /* We don't split these for the medium model. Place them into
6255 default sections and hope for the best. */
6256 break;
6258 if (prefix)
6260 const char *name, *linkonce;
6261 char *string;
6263 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6264 name = targetm.strip_name_encoding (name);
6266 /* If we're using one_only, then there needs to be a .gnu.linkonce
6267 prefix to the section name. */
6268 linkonce = one_only ? ".gnu.linkonce" : "";
6270 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6272 set_decl_section_name (decl, string);
6273 return;
6276 default_unique_section (decl, reloc);
6279 #ifdef COMMON_ASM_OP
6281 #ifndef LARGECOMM_SECTION_ASM_OP
6282 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6283 #endif
6285 /* This says how to output assembler code to declare an
6286 uninitialized external linkage data object.
6288 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive
6289 for large objects. */
6290 void
6291 x86_elf_aligned_decl_common (FILE *file, tree decl,
6292 const char *name, unsigned HOST_WIDE_INT size,
6293 int align)
6295 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6296 && size > (unsigned int)ix86_section_threshold)
6298 switch_to_section (get_named_section (decl, ".lbss", 0));
6299 fputs (LARGECOMM_SECTION_ASM_OP, file);
6301 else
6302 fputs (COMMON_ASM_OP, file);
6303 assemble_name (file, name);
6304 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6305 size, align / BITS_PER_UNIT);
6307 #endif
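/* Illustrative output (assumed, for a 1 MiB object aligned to 32 bytes):
   under -mcmodel=medium the function above switches to .lbss and emits

     .largecomm	big_buffer,1048576,32

   whereas small objects keep the ordinary

     .comm	small_buffer,64,8

   form.  */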
6309 /* Utility function for targets to use in implementing
6310 ASM_OUTPUT_ALIGNED_BSS. */
6312 void
6313 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6314 unsigned HOST_WIDE_INT size, int align)
6316 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6317 && size > (unsigned int)ix86_section_threshold)
6318 switch_to_section (get_named_section (decl, ".lbss", 0));
6319 else
6320 switch_to_section (bss_section);
6321 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6322 #ifdef ASM_DECLARE_OBJECT_NAME
6323 last_assemble_variable_decl = decl;
6324 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6325 #else
6326 /* Standard thing is just output label for the object. */
6327 ASM_OUTPUT_LABEL (file, name);
6328 #endif /* ASM_DECLARE_OBJECT_NAME */
6329 ASM_OUTPUT_SKIP (file, size ? size : 1);
6332 /* Decide whether we must probe the stack before any space allocation
6333 on this target. It's essentially TARGET_STACK_PROBE except when
6334 -fstack-check causes the stack to be already probed differently. */
6336 bool
6337 ix86_target_stack_probe (void)
6339 /* Do not probe the stack twice if static stack checking is enabled. */
6340 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6341 return false;
6343 return TARGET_STACK_PROBE;
6346 /* Decide whether we can make a sibling call to a function. DECL is the
6347 declaration of the function being targeted by the call and EXP is the
6348 CALL_EXPR representing the call. */
6350 static bool
6351 ix86_function_ok_for_sibcall (tree decl, tree exp)
6353 tree type, decl_or_type;
6354 rtx a, b;
6355 bool bind_global = decl && !targetm.binds_local_p (decl);
6357 if (ix86_function_naked (current_function_decl))
6358 return false;
6360 /* Sibling call isn't OK if there are no caller-saved registers
6361 since all registers must be preserved before return. */
6362 if (cfun->machine->no_caller_saved_registers)
6363 return false;
6365 /* If we are generating position-independent code, we cannot sibcall
6366 optimize direct calls to global functions, as the PLT requires
6367 %ebx be live. (Darwin does not have a PLT.) */
6368 if (!TARGET_MACHO
6369 && !TARGET_64BIT
6370 && flag_pic
6371 && flag_plt
6372 && bind_global)
6373 return false;
6375 /* If we need to align the outgoing stack, then sibcalling would
6376 unalign the stack, which may break the called function. */
6377 if (ix86_minimum_incoming_stack_boundary (true)
6378 < PREFERRED_STACK_BOUNDARY)
6379 return false;
6381 if (decl)
6383 decl_or_type = decl;
6384 type = TREE_TYPE (decl);
6386 else
6388 /* We're looking at the CALL_EXPR, we need the type of the function. */
6389 type = CALL_EXPR_FN (exp); /* pointer expression */
6390 type = TREE_TYPE (type); /* pointer type */
6391 type = TREE_TYPE (type); /* function type */
6392 decl_or_type = type;
6395 /* Check that the return value locations are the same. Like
6396 if we are returning floats on the 80387 register stack, we cannot
6397 make a sibcall from a function that doesn't return a float to a
6398 function that does or, conversely, from a function that does return
6399 a float to a function that doesn't; the necessary stack adjustment
6400 would not be executed. This is also the place we notice
6401 differences in the return value ABI. Note that it is ok for one
6402 of the functions to have void return type as long as the return
6403 value of the other is passed in a register. */
6404 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6405 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6406 cfun->decl, false);
6407 if (STACK_REG_P (a) || STACK_REG_P (b))
6409 if (!rtx_equal_p (a, b))
6410 return false;
6412 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6414 else if (!rtx_equal_p (a, b))
6415 return false;
6417 if (TARGET_64BIT)
6419 /* The SYSV ABI has more call-clobbered registers;
6420 disallow sibcalls from MS to SYSV. */
6421 if (cfun->machine->call_abi == MS_ABI
6422 && ix86_function_type_abi (type) == SYSV_ABI)
6423 return false;
6425 else
6427 /* If this call is indirect, we'll need to be able to use a
6428 call-clobbered register for the address of the target function.
6429 Make sure that all such registers are not used for passing
6430 parameters. Note that DLLIMPORT functions and call to global
6431 function via GOT slot are indirect. */
6432 if (!decl
6433 || (bind_global && flag_pic && !flag_plt)
6434 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl))
6435 || flag_force_indirect_call)
6437 /* Check if regparm >= 3 since arg_reg_available is set to
6438 false if regparm == 0. If regparm is 1 or 2, there is
6439 always a call-clobbered register available.
6441 ??? The symbol indirect call doesn't need a call-clobbered
6442 register. But we don't know if this is a symbol indirect
6443 call or not here. */
6444 if (ix86_function_regparm (type, decl) >= 3
6445 && !cfun->machine->arg_reg_available)
6446 return false;
6450 /* Otherwise okay. That also includes certain types of indirect calls. */
6451 return true;
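/* Example of a rejected case (a sketch, hypothetical user code): in 32-bit
   PIC code with the PLT enabled,

     extern int bar (int);
     int foo (int x) { return bar (x); }

   cannot use a sibcall when bar binds globally, because the PLT entry
   requires %ebx to be live at the call.  */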
6454 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6455 and "sseregparm" calling convention attributes;
6456 arguments as in struct attribute_spec.handler. */
6458 static tree
6459 ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
6460 bool *no_add_attrs)
6462 if (TREE_CODE (*node) != FUNCTION_TYPE
6463 && TREE_CODE (*node) != METHOD_TYPE
6464 && TREE_CODE (*node) != FIELD_DECL
6465 && TREE_CODE (*node) != TYPE_DECL)
6467 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6468 name);
6469 *no_add_attrs = true;
6470 return NULL_TREE;
6473 /* Can combine regparm with all attributes but fastcall, and thiscall. */
6474 if (is_attribute_p ("regparm", name))
6476 tree cst;
6478 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6480 error ("fastcall and regparm attributes are not compatible");
6483 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6485 error ("regparam and thiscall attributes are not compatible");
6488 cst = TREE_VALUE (args);
6489 if (TREE_CODE (cst) != INTEGER_CST)
6491 warning (OPT_Wattributes,
6492 "%qE attribute requires an integer constant argument",
6493 name);
6494 *no_add_attrs = true;
6496 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6498 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6499 name, REGPARM_MAX);
6500 *no_add_attrs = true;
6503 return NULL_TREE;
6506 if (TARGET_64BIT)
6508 /* Do not warn when emulating the MS ABI. */
6509 if ((TREE_CODE (*node) != FUNCTION_TYPE
6510 && TREE_CODE (*node) != METHOD_TYPE)
6511 || ix86_function_type_abi (*node) != MS_ABI)
6512 warning (OPT_Wattributes, "%qE attribute ignored",
6513 name);
6514 *no_add_attrs = true;
6515 return NULL_TREE;
6518 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6519 if (is_attribute_p ("fastcall", name))
6521 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6523 error ("fastcall and cdecl attributes are not compatible");
6525 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6527 error ("fastcall and stdcall attributes are not compatible");
6529 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6531 error ("fastcall and regparm attributes are not compatible");
6533 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6535 error ("fastcall and thiscall attributes are not compatible");
6539 /* Can combine stdcall with fastcall (redundant), regparm and
6540 sseregparm. */
6541 else if (is_attribute_p ("stdcall", name))
6543 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6545 error ("stdcall and cdecl attributes are not compatible");
6547 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6549 error ("stdcall and fastcall attributes are not compatible");
6551 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6553 error ("stdcall and thiscall attributes are not compatible");
6557 /* Can combine cdecl with regparm and sseregparm. */
6558 else if (is_attribute_p ("cdecl", name))
6560 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6562 error ("stdcall and cdecl attributes are not compatible");
6564 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6566 error ("fastcall and cdecl attributes are not compatible");
6568 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6570 error ("cdecl and thiscall attributes are not compatible");
6573 else if (is_attribute_p ("thiscall", name))
6575 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6576 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6577 name);
6578 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6580 error ("stdcall and thiscall attributes are not compatible");
6582 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6584 error ("fastcall and thiscall attributes are not compatible");
6586 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6588 error ("cdecl and thiscall attributes are not compatible");
6592 /* Can combine sseregparm with all attributes. */
6594 return NULL_TREE;
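/* For illustration (hypothetical user code): a declaration such as

     __attribute__((fastcall, cdecl)) void f (void);

   is rejected by the checks above ("fastcall and cdecl attributes are not
   compatible"), while combinations like stdcall + sseregparm are accepted.  */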
6597 /* The transactional memory builtins are implicitly regparm or fastcall
6598 depending on the ABI. Override the generic do-nothing attribute that
6599 these builtins were declared with, and replace it with one of the two
6600 attributes that we expect elsewhere. */
6602 static tree
6603 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6604 int flags, bool *no_add_attrs)
6606 tree alt;
6608 /* In no case do we want to add the placeholder attribute. */
6609 *no_add_attrs = true;
6611 /* The 64-bit ABI is unchanged for transactional memory. */
6612 if (TARGET_64BIT)
6613 return NULL_TREE;
6615 /* ??? Is there a better way to validate 32-bit windows? We have
6616 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6617 if (CHECK_STACK_LIMIT > 0)
6618 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6619 else
6621 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6622 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6624 decl_attributes (node, alt, flags);
6626 return NULL_TREE;
6629 /* This function determines from TYPE the calling-convention. */
6631 unsigned int
6632 ix86_get_callcvt (const_tree type)
6634 unsigned int ret = 0;
6635 bool is_stdarg;
6636 tree attrs;
6638 if (TARGET_64BIT)
6639 return IX86_CALLCVT_CDECL;
6641 attrs = TYPE_ATTRIBUTES (type);
6642 if (attrs != NULL_TREE)
6644 if (lookup_attribute ("cdecl", attrs))
6645 ret |= IX86_CALLCVT_CDECL;
6646 else if (lookup_attribute ("stdcall", attrs))
6647 ret |= IX86_CALLCVT_STDCALL;
6648 else if (lookup_attribute ("fastcall", attrs))
6649 ret |= IX86_CALLCVT_FASTCALL;
6650 else if (lookup_attribute ("thiscall", attrs))
6651 ret |= IX86_CALLCVT_THISCALL;
6653 /* Regparm isn't allowed for thiscall and fastcall. */
6654 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6656 if (lookup_attribute ("regparm", attrs))
6657 ret |= IX86_CALLCVT_REGPARM;
6658 if (lookup_attribute ("sseregparm", attrs))
6659 ret |= IX86_CALLCVT_SSEREGPARM;
6662 if (IX86_BASE_CALLCVT(ret) != 0)
6663 return ret;
6666 is_stdarg = stdarg_p (type);
6667 if (TARGET_RTD && !is_stdarg)
6668 return IX86_CALLCVT_STDCALL | ret;
6670 if (ret != 0
6671 || is_stdarg
6672 || TREE_CODE (type) != METHOD_TYPE
6673 || ix86_function_type_abi (type) != MS_ABI)
6674 return IX86_CALLCVT_CDECL | ret;
6676 return IX86_CALLCVT_THISCALL;
6679 /* Return 0 if the attributes for two types are incompatible, 1 if they
6680 are compatible, and 2 if they are nearly compatible (which causes a
6681 warning to be generated). */
6683 static int
6684 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6686 unsigned int ccvt1, ccvt2;
6688 if (TREE_CODE (type1) != FUNCTION_TYPE
6689 && TREE_CODE (type1) != METHOD_TYPE)
6690 return 1;
6692 ccvt1 = ix86_get_callcvt (type1);
6693 ccvt2 = ix86_get_callcvt (type2);
6694 if (ccvt1 != ccvt2)
6695 return 0;
6696 if (ix86_function_regparm (type1, NULL)
6697 != ix86_function_regparm (type2, NULL))
6698 return 0;
6700 return 1;
6703 /* Return the regparm value for a function with the indicated TYPE and DECL.
6704 DECL may be NULL when calling function indirectly
6705 or considering a libcall. */
6707 static int
6708 ix86_function_regparm (const_tree type, const_tree decl)
6710 tree attr;
6711 int regparm;
6712 unsigned int ccvt;
6714 if (TARGET_64BIT)
6715 return (ix86_function_type_abi (type) == SYSV_ABI
6716 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6717 ccvt = ix86_get_callcvt (type);
6718 regparm = ix86_regparm;
6720 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6722 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6723 if (attr)
6725 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6726 return regparm;
6729 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6730 return 2;
6731 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6732 return 1;
6734 /* Use register calling convention for local functions when possible. */
6735 if (decl
6736 && TREE_CODE (decl) == FUNCTION_DECL)
6738 cgraph_node *target = cgraph_node::get (decl);
6739 if (target)
6740 target = target->function_symbol ();
6742 /* Caller and callee must agree on the calling convention, so
6743 checking just the 'optimize' flag of the caller here would mean that with
6744 __attribute__((optimize (...))) the caller could use the regparm convention
6745 and the callee not, or vice versa. Instead look at whether the callee
6746 itself is optimized or not. */
6747 if (target && opt_for_fn (target->decl, optimize)
6748 && !(profile_flag && !flag_fentry))
6750 cgraph_local_info *i = &target->local;
6751 if (i && i->local && i->can_change_signature)
6753 int local_regparm, globals = 0, regno;
6755 /* Make sure no regparm register is taken by a
6756 fixed register variable. */
6757 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6758 local_regparm++)
6759 if (fixed_regs[local_regparm])
6760 break;
6762 /* We don't want to use regparm(3) for nested functions as
6763 these use a static chain pointer in the third argument. */
6764 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6765 local_regparm = 2;
6767 /* Save a register for the split stack. */
6768 if (flag_split_stack)
6770 if (local_regparm == 3)
6771 local_regparm = 2;
6772 else if (local_regparm == 2
6773 && DECL_STATIC_CHAIN (target->decl))
6774 local_regparm = 1;
6777 /* Each fixed register usage increases register pressure,
6778 so fewer registers should be used for argument passing.
6779 This functionality can be overridden by an explicit
6780 regparm value. */
6781 for (regno = AX_REG; regno <= DI_REG; regno++)
6782 if (fixed_regs[regno])
6783 globals++;
6785 local_regparm
6786 = globals < local_regparm ? local_regparm - globals : 0;
6788 if (local_regparm > regparm)
6789 regparm = local_regparm;
6794 return regparm;
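/* Rough examples of the logic above (a sketch): an explicit
   __attribute__((regparm (3))) passes the first three integer arguments in
   %eax, %edx and %ecx; for a local, optimized static function the same limit
   is chosen automatically, and each fixed register (e.g. via -ffixed-REG) in
   the EAX..EDI range lowers that automatic count by one.  */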
6797 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6798 DFmode (2) arguments in SSE registers for a function with the
6799 indicated TYPE and DECL. DECL may be NULL when calling function
6800 indirectly or considering a libcall. Return -1 if any FP parameter
6801 should be rejected with an error. This is used in the situation where we
6802 imply the SSE calling convention but the function is called from another
6803 function with SSE disabled. Otherwise return 0. */
6805 static int
6806 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6808 gcc_assert (!TARGET_64BIT);
6810 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6811 by the sseregparm attribute. */
6812 if (TARGET_SSEREGPARM
6813 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6815 if (!TARGET_SSE)
6817 if (warn)
6819 if (decl)
6820 error ("calling %qD with attribute sseregparm without "
6821 "SSE/SSE2 enabled", decl);
6822 else
6823 error ("calling %qT with attribute sseregparm without "
6824 "SSE/SSE2 enabled", type);
6826 return 0;
6829 return 2;
6832 if (!decl)
6833 return 0;
6835 cgraph_node *target = cgraph_node::get (decl);
6836 if (target)
6837 target = target->function_symbol ();
6839 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6840 (and DFmode for SSE2) arguments in SSE registers. */
6841 if (target
6842 /* TARGET_SSE_MATH */
6843 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6844 && opt_for_fn (target->decl, optimize)
6845 && !(profile_flag && !flag_fentry))
6847 cgraph_local_info *i = &target->local;
6848 if (i && i->local && i->can_change_signature)
6850 /* Refuse to produce wrong code when local function with SSE enabled
6851 is called from SSE disabled function.
6852 FIXME: We need a way to detect these cases cross-ltrans partition
6853 and avoid using SSE calling conventions on local functions called
6854 from function with SSE disabled. For now at least delay the
6855 warning until we know we are going to produce wrong code.
6856 See PR66047 */
6857 if (!TARGET_SSE && warn)
6858 return -1;
6859 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6860 ->x_ix86_isa_flags) ? 2 : 1;
6864 return 0;
6867 /* Return true if EAX is live at the start of the function. Used by
6868 ix86_expand_prologue to determine if we need special help before
6869 calling allocate_stack_worker. */
6871 static bool
6872 ix86_eax_live_at_start_p (void)
6874 /* Cheat. Don't bother working forward from ix86_function_regparm
6875 to the function type to whether an actual argument is located in
6876 eax. Instead just look at cfg info, which is still close enough
6877 to correct at this point. This gives false positives for broken
6878 functions that might use uninitialized data that happens to be
6879 allocated in eax, but who cares? */
6880 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6883 static bool
6884 ix86_keep_aggregate_return_pointer (tree fntype)
6886 tree attr;
6888 if (!TARGET_64BIT)
6890 attr = lookup_attribute ("callee_pop_aggregate_return",
6891 TYPE_ATTRIBUTES (fntype));
6892 if (attr)
6893 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6895 /* For 32-bit MS-ABI the default is to keep aggregate
6896 return pointer. */
6897 if (ix86_function_type_abi (fntype) == MS_ABI)
6898 return true;
6900 return KEEP_AGGREGATE_RETURN_POINTER != 0;
6903 /* Value is the number of bytes of arguments automatically
6904 popped when returning from a subroutine call.
6905 FUNDECL is the declaration node of the function (as a tree),
6906 FUNTYPE is the data type of the function (as a tree),
6907 or for a library call it is an identifier node for the subroutine name.
6908 SIZE is the number of bytes of arguments passed on the stack.
6910 On the 80386, the RTD insn may be used to pop them if the number
6911 of args is fixed, but if the number is variable then the caller
6912 must pop them all. RTD can't be used for library calls now
6913 because the library is compiled with the Unix compiler.
6914 Use of RTD is a selectable option, since it is incompatible with
6915 standard Unix calling sequences. If the option is not selected,
6916 the caller must always pop the args.
6918 The attribute stdcall is equivalent to RTD on a per module basis. */
6920 static poly_int64
6921 ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size)
6923 unsigned int ccvt;
6925 /* None of the 64-bit ABIs pop arguments. */
6926 if (TARGET_64BIT)
6927 return 0;
6929 ccvt = ix86_get_callcvt (funtype);
6931 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6932 | IX86_CALLCVT_THISCALL)) != 0
6933 && ! stdarg_p (funtype))
6934 return size;
6936 /* Lose any fake structure return argument if it is passed on the stack. */
6937 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6938 && !ix86_keep_aggregate_return_pointer (funtype))
6940 int nregs = ix86_function_regparm (funtype, fundecl);
6941 if (nregs == 0)
6942 return GET_MODE_SIZE (Pmode);
6945 return 0;
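/* Worked example (a sketch, hypothetical user code): for

     __attribute__((stdcall)) int f (int a, int b, int c);

   SIZE is 12 on ia32 and is returned here, so the callee pops its own
   arguments with "ret $12"; a stdarg prototype would instead return 0 and
   leave the pop to the caller.  */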
6948 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6950 static bool
6951 ix86_legitimate_combined_insn (rtx_insn *insn)
6953 int i;
6955 /* Check operand constraints in case hard registers were propagated
6956 into insn pattern. This check prevents combine pass from
6957 generating insn patterns with invalid hard register operands.
6958 These invalid insns can eventually confuse reload to error out
6959 with a spill failure. See also PRs 46829 and 46843. */
6961 gcc_assert (INSN_CODE (insn) >= 0);
6963 extract_insn (insn);
6964 preprocess_constraints (insn);
6966 int n_operands = recog_data.n_operands;
6967 int n_alternatives = recog_data.n_alternatives;
6968 for (i = 0; i < n_operands; i++)
6970 rtx op = recog_data.operand[i];
6971 machine_mode mode = GET_MODE (op);
6972 const operand_alternative *op_alt;
6973 int offset = 0;
6974 bool win;
6975 int j;
6977 /* A unary operator may be accepted by the predicate, but it
6978 is irrelevant for matching constraints. */
6979 if (UNARY_P (op))
6980 op = XEXP (op, 0);
6982 if (SUBREG_P (op))
6984 if (REG_P (SUBREG_REG (op))
6985 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6986 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6987 GET_MODE (SUBREG_REG (op)),
6988 SUBREG_BYTE (op),
6989 GET_MODE (op));
6990 op = SUBREG_REG (op);
6993 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6994 continue;
6996 op_alt = recog_op_alt;
6998 /* Operand has no constraints, anything is OK. */
6999 win = !n_alternatives;
7001 alternative_mask preferred = get_preferred_alternatives (insn);
7002 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
7004 if (!TEST_BIT (preferred, j))
7005 continue;
7006 if (op_alt[i].anything_ok
7007 || (op_alt[i].matches != -1
7008 && operands_match_p
7009 (recog_data.operand[i],
7010 recog_data.operand[op_alt[i].matches]))
7011 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
7013 win = true;
7014 break;
7018 if (!win)
7019 return false;
7022 return true;
7025 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
7027 static unsigned HOST_WIDE_INT
7028 ix86_asan_shadow_offset (void)
7030 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
7031 : HOST_WIDE_INT_C (0x7fff8000))
7032 : (HOST_WIDE_INT_1 << 29);
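/* Worked example (a sketch): AddressSanitizer computes the shadow address as
   shadow = (addr >> 3) + offset, so with the Linux/x86-64 (LP64, non-Macho)
   offset returned above:

     shadow (0x7fffffffe000) = (0x7fffffffe000 >> 3) + 0x7fff8000
                             = 0xffffffffc00 + 0x7fff8000
                             = 0x10007fff7c00  */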
7035 /* Argument support functions. */
7037 /* Return true when register may be used to pass function parameters. */
7038 bool
7039 ix86_function_arg_regno_p (int regno)
7041 int i;
7042 enum calling_abi call_abi;
7043 const int *parm_regs;
7045 if (TARGET_MPX && BND_REGNO_P (regno))
7046 return true;
7048 if (!TARGET_64BIT)
7050 if (TARGET_MACHO)
7051 return (regno < REGPARM_MAX
7052 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
7053 else
7054 return (regno < REGPARM_MAX
7055 || (TARGET_MMX && MMX_REGNO_P (regno)
7056 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
7057 || (TARGET_SSE && SSE_REGNO_P (regno)
7058 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
7061 if (TARGET_SSE && SSE_REGNO_P (regno)
7062 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
7063 return true;
7065 /* TODO: The function should depend on current function ABI but
7066 builtins.c would need updating then. Therefore we use the
7067 default ABI. */
7068 call_abi = ix86_cfun_abi ();
7070 /* RAX is used as hidden argument to va_arg functions. */
7071 if (call_abi == SYSV_ABI && regno == AX_REG)
7072 return true;
7074 if (call_abi == MS_ABI)
7075 parm_regs = x86_64_ms_abi_int_parameter_registers;
7076 else
7077 parm_regs = x86_64_int_parameter_registers;
7079 for (i = 0; i < (call_abi == MS_ABI
7080 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
7081 if (regno == parm_regs[i])
7082 return true;
7083 return false;
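/* For reference (a sketch based on the psABI documents, not lifted from
   the register tables consulted above):

     SYSV_ABI integer parameter registers: rdi, rsi, rdx, rcx, r8, r9
     MS_ABI   integer parameter registers: rcx, rdx, r8, r9

   so, for example, a SysV call f (a, b, c) with three integer arguments
   passes them in rdi, rsi and rdx.  */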
7086 /* Return true if we do not know how to pass TYPE solely in registers. */
7088 static bool
7089 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
7091 if (must_pass_in_stack_var_size_or_pad (mode, type))
7092 return true;
7094 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
7095 The layout_type routine is crafty and tries to trick us into passing
7096 currently unsupported vector types on the stack by using TImode. */
7097 return (!TARGET_64BIT && mode == TImode
7098 && type && TREE_CODE (type) != VECTOR_TYPE);
7101 /* Return the size, in bytes, of the area reserved for arguments passed
7102 in registers for the function represented by FNDECL, depending on the
7103 ABI format used. */
7105 ix86_reg_parm_stack_space (const_tree fndecl)
7107 enum calling_abi call_abi = SYSV_ABI;
7108 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
7109 call_abi = ix86_function_abi (fndecl);
7110 else
7111 call_abi = ix86_function_type_abi (fndecl);
7112 if (TARGET_64BIT && call_abi == MS_ABI)
7113 return 32;
7114 return 0;
7117 /* We add this as a workaround so that the libc_has_function
7118 hook can be used from i386.md. */
7119 bool
7120 ix86_libc_has_function (enum function_class fn_class)
7122 return targetm.libc_has_function (fn_class);
7125 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE,
7126 specifying the calling ABI used. */
7127 enum calling_abi
7128 ix86_function_type_abi (const_tree fntype)
7130 enum calling_abi abi = ix86_abi;
7132 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
7133 return abi;
7135 if (abi == SYSV_ABI
7136 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
7138 static int warned;
7139 if (TARGET_X32 && !warned)
7141 error ("X32 does not support ms_abi attribute");
7142 warned = 1;
7145 abi = MS_ABI;
7147 else if (abi == MS_ABI
7148 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
7149 abi = SYSV_ABI;
7151 return abi;
7154 static enum calling_abi
7155 ix86_function_abi (const_tree fndecl)
7157 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
7160 /* Return SYSV_ABI or MS_ABI, depending on CFUN,
7161 specifying the calling ABI used. */
7162 enum calling_abi
7163 ix86_cfun_abi (void)
7165 return cfun ? cfun->machine->call_abi : ix86_abi;
7168 static bool
7169 ix86_function_ms_hook_prologue (const_tree fn)
7171 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
7173 if (decl_function_context (fn) != NULL_TREE)
7174 error_at (DECL_SOURCE_LOCATION (fn),
7175 "ms_hook_prologue is not compatible with nested function");
7176 else
7177 return true;
7179 return false;
7182 static bool
7183 ix86_function_naked (const_tree fn)
7185 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
7186 return true;
7188 return false;
7191 /* Write the extra assembler code needed to declare a function properly. */
7193 void
7194 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7195 tree decl)
7197 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7199 if (is_ms_hook)
7201 int i, filler_count = (TARGET_64BIT ? 32 : 16);
7202 unsigned int filler_cc = 0xcccccccc;
7204 for (i = 0; i < filler_count; i += 4)
7205 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7208 #ifdef SUBTARGET_ASM_UNWIND_INIT
7209 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7210 #endif
7212 ASM_OUTPUT_LABEL (asm_out_file, fname);
7214 /* Output magic byte marker, if hot-patch attribute is set. */
7215 if (is_ms_hook)
7217 if (TARGET_64BIT)
7219 /* leaq [%rsp + 0], %rsp */
7220 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7221 asm_out_file);
7223 else
7225 /* movl.s %edi, %edi
7226 push %ebp
7227 movl.s %esp, %ebp */
7228 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
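/* Sketch of the 32-bit output produced above for an ms_hook_prologue
   function (illustrative, not emitted verbatim):

	.long 0xcccccccc	; repeated 4 times: 16 bytes of pad
     fn:
	mov	%edi, %edi	; 8b ff - two-byte no-op
	push	%ebp		; 55
	mov	%esp, %ebp	; 8b ec

   The pad leaves room for the patcher to place a long jump, and the
   two-byte mov is an entry instruction it can overwrite atomically.  */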
7233 /* Implementation of the call ABI switching target hook. The call
7234 register sets specific to FNDECL are set up. See also
7235 ix86_conditional_register_usage for more details. */
7236 void
7237 ix86_call_abi_override (const_tree fndecl)
7239 cfun->machine->call_abi = ix86_function_abi (fndecl);
7242 /* Return true if a pseudo register should be created and used to hold
7243 the GOT address for PIC code. */
7244 bool
7245 ix86_use_pseudo_pic_reg (void)
7247 if ((TARGET_64BIT
7248 && (ix86_cmodel == CM_SMALL_PIC
7249 || TARGET_PECOFF))
7250 || !flag_pic)
7251 return false;
7252 return true;
7255 /* Initialize large model PIC register. */
7257 static void
7258 ix86_init_large_pic_reg (unsigned int tmp_regno)
7260 rtx_code_label *label;
7261 rtx tmp_reg;
7263 gcc_assert (Pmode == DImode);
7264 label = gen_label_rtx ();
7265 emit_label (label);
7266 LABEL_PRESERVE_P (label) = 1;
7267 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7268 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7269 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7270 label));
7271 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7272 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7273 pic_offset_table_rtx, tmp_reg));
7274 const char *name = LABEL_NAME (label);
7275 PUT_CODE (label, NOTE);
7276 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7277 NOTE_DELETED_LABEL_NAME (label) = name;
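/* The sequence built above roughly corresponds to (a sketch; the PIC
   register is shown as %rbx for illustration only):

     .L1:
	lea	.L1(%rip), %rbx				; set_rip_rex64
	movabs	$_GLOBAL_OFFSET_TABLE_-.L1, %r11	; set_got_offset_rex64
	add	%r11, %rbx

   leaving the GOT address in the PIC register.  */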
7280 /* Create and initialize PIC register if required. */
7281 static void
7282 ix86_init_pic_reg (void)
7284 edge entry_edge;
7285 rtx_insn *seq;
7287 if (!ix86_use_pseudo_pic_reg ())
7288 return;
7290 start_sequence ();
7292 if (TARGET_64BIT)
7294 if (ix86_cmodel == CM_LARGE_PIC)
7295 ix86_init_large_pic_reg (R11_REG);
7296 else
7297 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7299 else
7301 /* If there is a future mcount call in the function, it is more profitable
7302 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
7303 rtx reg = crtl->profile
7304 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7305 : pic_offset_table_rtx;
7306 rtx_insn *insn = emit_insn (gen_set_got (reg));
7307 RTX_FRAME_RELATED_P (insn) = 1;
7308 if (crtl->profile)
7309 emit_move_insn (pic_offset_table_rtx, reg);
7310 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7313 seq = get_insns ();
7314 end_sequence ();
7316 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7317 insert_insn_on_edge (seq, entry_edge);
7318 commit_one_edge_insertion (entry_edge);
7321 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7322 for a call to a function whose data type is FNTYPE.
7323 For a library call, FNTYPE is 0. */
7325 void
7326 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7327 tree fntype, /* tree ptr for function decl */
7328 rtx libname, /* SYMBOL_REF of library name or 0 */
7329 tree fndecl,
7330 int caller)
7332 struct cgraph_local_info *i = NULL;
7333 struct cgraph_node *target = NULL;
7335 memset (cum, 0, sizeof (*cum));
7337 if (fndecl)
7339 target = cgraph_node::get (fndecl);
7340 if (target)
7342 target = target->function_symbol ();
7343 i = cgraph_node::local_info (target->decl);
7344 cum->call_abi = ix86_function_abi (target->decl);
7346 else
7347 cum->call_abi = ix86_function_abi (fndecl);
7349 else
7350 cum->call_abi = ix86_function_type_abi (fntype);
7352 cum->caller = caller;
7354 /* Set up the number of registers to use for passing arguments. */
7355 cum->nregs = ix86_regparm;
7356 if (TARGET_64BIT)
7358 cum->nregs = (cum->call_abi == SYSV_ABI
7359 ? X86_64_REGPARM_MAX
7360 : X86_64_MS_REGPARM_MAX);
7362 if (TARGET_SSE)
7364 cum->sse_nregs = SSE_REGPARM_MAX;
7365 if (TARGET_64BIT)
7367 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7368 ? X86_64_SSE_REGPARM_MAX
7369 : X86_64_MS_SSE_REGPARM_MAX);
7372 if (TARGET_MMX)
7373 cum->mmx_nregs = MMX_REGPARM_MAX;
7374 cum->warn_avx512f = true;
7375 cum->warn_avx = true;
7376 cum->warn_sse = true;
7377 cum->warn_mmx = true;
7379 /* Because the type might mismatch between caller and callee, we need to
7380 use the actual type of the function for local calls.
7381 FIXME: cgraph_analyze can be told to actually record whether a function
7382 uses va_start, so for local functions maybe_vaarg can be made more
7383 aggressive, helping K&R code.
7384 FIXME: once the type system is fixed, we won't need this code anymore. */
7385 if (i && i->local && i->can_change_signature)
7386 fntype = TREE_TYPE (target->decl);
7387 cum->stdarg = stdarg_p (fntype);
7388 cum->maybe_vaarg = (fntype
7389 ? (!prototype_p (fntype) || stdarg_p (fntype))
7390 : !libname);
7392 cum->bnd_regno = FIRST_BND_REG;
7393 cum->bnds_in_bt = 0;
7394 cum->force_bnd_pass = 0;
7395 cum->decl = fndecl;
7397 cum->warn_empty = !warn_abi || cum->stdarg;
7398 if (!cum->warn_empty && fntype)
7400 function_args_iterator iter;
7401 tree argtype;
7402 bool seen_empty_type = false;
7403 FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7405 if (argtype == error_mark_node || VOID_TYPE_P (argtype))
7406 break;
7407 if (TYPE_EMPTY_P (argtype))
7408 seen_empty_type = true;
7409 else if (seen_empty_type)
7411 cum->warn_empty = true;
7412 break;
7417 if (!TARGET_64BIT)
7419 /* If there are variable arguments, then we won't pass anything
7420 in registers in 32-bit mode. */
7421 if (stdarg_p (fntype))
7423 cum->nregs = 0;
7424 /* Since in 32-bit mode variable arguments are always passed on
7425 the stack, there is a scratch register available for an indirect
7426 sibcall. */
7427 cfun->machine->arg_reg_available = true;
7428 cum->sse_nregs = 0;
7429 cum->mmx_nregs = 0;
7430 cum->warn_avx512f = false;
7431 cum->warn_avx = false;
7432 cum->warn_sse = false;
7433 cum->warn_mmx = false;
7434 return;
7437 /* Use the ecx and edx registers if the function has the fastcall
7438 attribute; otherwise look for regparm information. */
7439 if (fntype)
7441 unsigned int ccvt = ix86_get_callcvt (fntype);
7442 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7444 cum->nregs = 1;
7445 cum->fastcall = 1; /* Same first register as in fastcall. */
7447 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7449 cum->nregs = 2;
7450 cum->fastcall = 1;
7452 else
7453 cum->nregs = ix86_function_regparm (fntype, fndecl);
7456 /* Set up the number of SSE registers used for passing SFmode
7457 and DFmode arguments. Warn for mismatching ABI. */
7458 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7461 cfun->machine->arg_reg_available = (cum->nregs > 0);
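/* A sketch of the resulting 32-bit register budgets (illustration only):

     default				cum->nregs = 0 (ix86_regparm)
     __attribute__ ((regparm (3)))	cum->nregs = 3 (eax, edx, ecx)
     __attribute__ ((fastcall))		cum->nregs = 2 (ecx, edx)
     __attribute__ ((thiscall))		cum->nregs = 1 (ecx)

   while variadic functions fall back to passing everything on the stack,
   as handled above.  */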
7464 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7465 But in the case of vector types, it is some vector mode.
7467 When we have only some of our vector isa extensions enabled, then there
7468 are some modes for which vector_mode_supported_p is false. For these
7469 modes, the generic vector support in gcc will choose some non-vector mode
7470 in order to implement the type. By computing the natural mode, we'll
7471 select the proper ABI location for the operand and not depend on whatever
7472 the middle-end decides to do with these vector types.
7474 The middle-end can't deal with vector types > 16 bytes. In this
7475 case, we return the original mode and warn about the ABI change if
7476 CUM isn't NULL.
7478 If IN_RETURN is true, warn about the ABI change if the vector mode
7479 isn't available for the function return value. */
7481 static machine_mode
7482 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7483 bool in_return)
7485 machine_mode mode = TYPE_MODE (type);
7487 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7489 HOST_WIDE_INT size = int_size_in_bytes (type);
7490 if ((size == 8 || size == 16 || size == 32 || size == 64)
7491 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7492 && TYPE_VECTOR_SUBPARTS (type) > 1)
7494 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7496 /* There are no XFmode vector modes. */
7497 if (innermode == XFmode)
7498 return mode;
7500 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7501 mode = MIN_MODE_VECTOR_FLOAT;
7502 else
7503 mode = MIN_MODE_VECTOR_INT;
7505 /* Get the mode which has this inner mode and number of units. */
7506 FOR_EACH_MODE_FROM (mode, mode)
7507 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7508 && GET_MODE_INNER (mode) == innermode)
7510 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7512 static bool warnedavx512f;
7513 static bool warnedavx512f_ret;
7515 if (cum && cum->warn_avx512f && !warnedavx512f)
7517 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7518 "without AVX512F enabled changes the ABI"))
7519 warnedavx512f = true;
7521 else if (in_return && !warnedavx512f_ret)
7523 if (warning (OPT_Wpsabi, "AVX512F vector return "
7524 "without AVX512F enabled changes the ABI"))
7525 warnedavx512f_ret = true;
7528 return TYPE_MODE (type);
7530 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7532 static bool warnedavx;
7533 static bool warnedavx_ret;
7535 if (cum && cum->warn_avx && !warnedavx)
7537 if (warning (OPT_Wpsabi, "AVX vector argument "
7538 "without AVX enabled changes the ABI"))
7539 warnedavx = true;
7541 else if (in_return && !warnedavx_ret)
7543 if (warning (OPT_Wpsabi, "AVX vector return "
7544 "without AVX enabled changes the ABI"))
7545 warnedavx_ret = true;
7548 return TYPE_MODE (type);
7550 else if (((size == 8 && TARGET_64BIT) || size == 16)
7551 && !TARGET_SSE
7552 && !TARGET_IAMCU)
7554 static bool warnedsse;
7555 static bool warnedsse_ret;
7557 if (cum && cum->warn_sse && !warnedsse)
7559 if (warning (OPT_Wpsabi, "SSE vector argument "
7560 "without SSE enabled changes the ABI"))
7561 warnedsse = true;
7563 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7565 if (warning (OPT_Wpsabi, "SSE vector return "
7566 "without SSE enabled changes the ABI"))
7567 warnedsse_ret = true;
7570 else if ((size == 8 && !TARGET_64BIT)
7571 && (!cfun
7572 || cfun->machine->func_type == TYPE_NORMAL)
7573 && !TARGET_MMX
7574 && !TARGET_IAMCU)
7576 static bool warnedmmx;
7577 static bool warnedmmx_ret;
7579 if (cum && cum->warn_mmx && !warnedmmx)
7581 if (warning (OPT_Wpsabi, "MMX vector argument "
7582 "without MMX enabled changes the ABI"))
7583 warnedmmx = true;
7585 else if (in_return && !warnedmmx_ret)
7587 if (warning (OPT_Wpsabi, "MMX vector return "
7588 "without MMX enabled changes the ABI"))
7589 warnedmmx_ret = true;
7592 return mode;
7595 gcc_unreachable ();
7599 return mode;
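/* Example of the effect of the function above (the typedef is
   hypothetical, not from the original sources):

     typedef double v4df __attribute__ ((vector_size (32)));

   With -mavx the natural mode is V4DFmode and the value travels in a
   %ymm register; without -mavx the vector mode is unsupported,
   TYPE_MODE is returned instead and a -Wpsabi note about the ABI
   change may be emitted, as in the warnings above.  */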
7602 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7603 this may not agree with the mode that the type system has chosen for the
7604 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7605 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7607 static rtx
7608 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7609 unsigned int regno)
7611 rtx tmp;
7613 if (orig_mode != BLKmode)
7614 tmp = gen_rtx_REG (orig_mode, regno);
7615 else
7617 tmp = gen_rtx_REG (mode, regno);
7618 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7619 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7622 return tmp;
7625 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
7626 of this code is to classify each 8bytes of incoming argument by the register
7627 class and assign registers accordingly. */
7629 /* Return the union class of CLASS1 and CLASS2.
7630 See the x86-64 PS ABI for details. */
7632 static enum x86_64_reg_class
7633 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7635 /* Rule #1: If both classes are equal, this is the resulting class. */
7636 if (class1 == class2)
7637 return class1;
7639 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7640 the other class. */
7641 if (class1 == X86_64_NO_CLASS)
7642 return class2;
7643 if (class2 == X86_64_NO_CLASS)
7644 return class1;
7646 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7647 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7648 return X86_64_MEMORY_CLASS;
7650 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7651 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7652 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7653 return X86_64_INTEGERSI_CLASS;
7654 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7655 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7656 return X86_64_INTEGER_CLASS;
7658 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7659 MEMORY is used. */
7660 if (class1 == X86_64_X87_CLASS
7661 || class1 == X86_64_X87UP_CLASS
7662 || class1 == X86_64_COMPLEX_X87_CLASS
7663 || class2 == X86_64_X87_CLASS
7664 || class2 == X86_64_X87UP_CLASS
7665 || class2 == X86_64_COMPLEX_X87_CLASS)
7666 return X86_64_MEMORY_CLASS;
7668 /* Rule #6: Otherwise class SSE is used. */
7669 return X86_64_SSE_CLASS;
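/* For example (a sketch, not taken from the original sources): a struct
   such as

     struct s { int i; float f; };

   occupies a single eightbyte whose halves classify as
   X86_64_INTEGERSI_CLASS and X86_64_SSESF_CLASS; rule #4 above merges
   them into X86_64_INTEGERSI_CLASS, so the whole struct ends up in a
   general purpose register.  */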
7672 /* Classify the argument of type TYPE and mode MODE.
7673 CLASSES will be filled by the register class used to pass each word
7674 of the operand. The number of words is returned. In case the parameter
7675 should be passed in memory, 0 is returned. As a special case for zero
7676 sized containers, classes[0] will be NO_CLASS and 1 is returned.
7678 BIT_OFFSET is used internally for handling records and specifies the
7679 offset in bits modulo 512 to avoid overflow cases.
7681 See the x86-64 PS ABI for details.
7684 static int
7685 classify_argument (machine_mode mode, const_tree type,
7686 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7688 HOST_WIDE_INT bytes =
7689 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7690 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7692 /* Variable sized entities are always passed/returned in memory. */
7693 if (bytes < 0)
7694 return 0;
7696 if (mode != VOIDmode
7697 && targetm.calls.must_pass_in_stack (mode, type))
7698 return 0;
7700 if (type && AGGREGATE_TYPE_P (type))
7702 int i;
7703 tree field;
7704 enum x86_64_reg_class subclasses[MAX_CLASSES];
7706 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7707 if (bytes > 64)
7708 return 0;
7710 for (i = 0; i < words; i++)
7711 classes[i] = X86_64_NO_CLASS;
7713 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
7714 signal the memory class, so handle this as a special case. */
7715 if (!words)
7717 classes[0] = X86_64_NO_CLASS;
7718 return 1;
7721 /* Classify each field of record and merge classes. */
7722 switch (TREE_CODE (type))
7724 case RECORD_TYPE:
7725 /* And now merge the fields of structure. */
7726 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7728 if (TREE_CODE (field) == FIELD_DECL)
7730 int num;
7732 if (TREE_TYPE (field) == error_mark_node)
7733 continue;
7735 /* Bitfields are always classified as integer. Handle them
7736 early, since later code would consider them to be
7737 misaligned integers. */
7738 if (DECL_BIT_FIELD (field))
7740 for (i = (int_bit_position (field)
7741 + (bit_offset % 64)) / 8 / 8;
7742 i < ((int_bit_position (field) + (bit_offset % 64))
7743 + tree_to_shwi (DECL_SIZE (field))
7744 + 63) / 8 / 8; i++)
7745 classes[i] =
7746 merge_classes (X86_64_INTEGER_CLASS,
7747 classes[i]);
7749 else
7751 int pos;
7753 type = TREE_TYPE (field);
7755 /* Flexible array member is ignored. */
7756 if (TYPE_MODE (type) == BLKmode
7757 && TREE_CODE (type) == ARRAY_TYPE
7758 && TYPE_SIZE (type) == NULL_TREE
7759 && TYPE_DOMAIN (type) != NULL_TREE
7760 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7761 == NULL_TREE))
7763 static bool warned;
7765 if (!warned && warn_psabi)
7767 warned = true;
7768 inform (input_location,
7769 "the ABI of passing struct with"
7770 " a flexible array member has"
7771 " changed in GCC 4.4");
7773 continue;
7775 num = classify_argument (TYPE_MODE (type), type,
7776 subclasses,
7777 (int_bit_position (field)
7778 + bit_offset) % 512);
7779 if (!num)
7780 return 0;
7781 pos = (int_bit_position (field)
7782 + (bit_offset % 64)) / 8 / 8;
7783 for (i = 0; i < num && (i + pos) < words; i++)
7784 classes[i + pos] =
7785 merge_classes (subclasses[i], classes[i + pos]);
7789 break;
7791 case ARRAY_TYPE:
7792 /* Arrays are handled as small records. */
7794 int num;
7795 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7796 TREE_TYPE (type), subclasses, bit_offset);
7797 if (!num)
7798 return 0;
7800 /* The partial classes are now full classes. */
7801 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7802 subclasses[0] = X86_64_SSE_CLASS;
7803 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7804 && !((bit_offset % 64) == 0 && bytes == 4))
7805 subclasses[0] = X86_64_INTEGER_CLASS;
7807 for (i = 0; i < words; i++)
7808 classes[i] = subclasses[i % num];
7810 break;
7812 case UNION_TYPE:
7813 case QUAL_UNION_TYPE:
7814 /* Unions are similar to RECORD_TYPE but the offset is always 0.
7816 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7818 if (TREE_CODE (field) == FIELD_DECL)
7820 int num;
7822 if (TREE_TYPE (field) == error_mark_node)
7823 continue;
7825 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7826 TREE_TYPE (field), subclasses,
7827 bit_offset);
7828 if (!num)
7829 return 0;
7830 for (i = 0; i < num && i < words; i++)
7831 classes[i] = merge_classes (subclasses[i], classes[i]);
7834 break;
7836 default:
7837 gcc_unreachable ();
7840 if (words > 2)
7842 /* When the size is > 16 bytes, if the first eightbyte isn't
7843 X86_64_SSE_CLASS or any of the remaining ones isn't
7844 X86_64_SSEUP_CLASS, everything should be passed in
7845 memory. */
7846 if (classes[0] != X86_64_SSE_CLASS)
7847 return 0;
7849 for (i = 1; i < words; i++)
7850 if (classes[i] != X86_64_SSEUP_CLASS)
7851 return 0;
7854 /* Final merger cleanup. */
7855 for (i = 0; i < words; i++)
7857 /* If one class is MEMORY, everything should be passed in
7858 memory. */
7859 if (classes[i] == X86_64_MEMORY_CLASS)
7860 return 0;
7862 /* The X86_64_SSEUP_CLASS should always be preceded by
7863 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
7864 if (classes[i] == X86_64_SSEUP_CLASS
7865 && classes[i - 1] != X86_64_SSE_CLASS
7866 && classes[i - 1] != X86_64_SSEUP_CLASS)
7868 /* The first one should never be X86_64_SSEUP_CLASS. */
7869 gcc_assert (i != 0);
7870 classes[i] = X86_64_SSE_CLASS;
7873 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7874 everything should be passed in memory. */
7875 if (classes[i] == X86_64_X87UP_CLASS
7876 && (classes[i - 1] != X86_64_X87_CLASS))
7878 static bool warned;
7880 /* The first one should never be X86_64_X87UP_CLASS. */
7881 gcc_assert (i != 0);
7882 if (!warned && warn_psabi)
7884 warned = true;
7885 inform (input_location,
7886 "the ABI of passing union with long double"
7887 " has changed in GCC 4.4");
7889 return 0;
7892 return words;
7895 /* Compute the alignment needed. We align all types to their natural
7896 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
7897 if (mode != VOIDmode && mode != BLKmode)
7899 int mode_alignment = GET_MODE_BITSIZE (mode);
7901 if (mode == XFmode)
7902 mode_alignment = 128;
7903 else if (mode == XCmode)
7904 mode_alignment = 256;
7905 if (COMPLEX_MODE_P (mode))
7906 mode_alignment /= 2;
7907 /* Misaligned fields are always returned in memory. */
7908 if (bit_offset % mode_alignment)
7909 return 0;
7912 /* For V1xx modes, just use the base mode. */
7913 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7914 && GET_MODE_UNIT_SIZE (mode) == bytes)
7915 mode = GET_MODE_INNER (mode);
7917 /* Classification of atomic types. */
7918 switch (mode)
7920 case E_SDmode:
7921 case E_DDmode:
7922 classes[0] = X86_64_SSE_CLASS;
7923 return 1;
7924 case E_TDmode:
7925 classes[0] = X86_64_SSE_CLASS;
7926 classes[1] = X86_64_SSEUP_CLASS;
7927 return 2;
7928 case E_DImode:
7929 case E_SImode:
7930 case E_HImode:
7931 case E_QImode:
7932 case E_CSImode:
7933 case E_CHImode:
7934 case E_CQImode:
7936 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7938 /* Analyze last 128 bits only. */
7939 size = (size - 1) & 0x7f;
7941 if (size < 32)
7943 classes[0] = X86_64_INTEGERSI_CLASS;
7944 return 1;
7946 else if (size < 64)
7948 classes[0] = X86_64_INTEGER_CLASS;
7949 return 1;
7951 else if (size < 64+32)
7953 classes[0] = X86_64_INTEGER_CLASS;
7954 classes[1] = X86_64_INTEGERSI_CLASS;
7955 return 2;
7957 else if (size < 64+64)
7959 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7960 return 2;
7962 else
7963 gcc_unreachable ();
7965 case E_CDImode:
7966 case E_TImode:
7967 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7968 return 2;
7969 case E_COImode:
7970 case E_OImode:
7971 /* OImode shouldn't be used directly. */
7972 gcc_unreachable ();
7973 case E_CTImode:
7974 return 0;
7975 case E_SFmode:
7976 if (!(bit_offset % 64))
7977 classes[0] = X86_64_SSESF_CLASS;
7978 else
7979 classes[0] = X86_64_SSE_CLASS;
7980 return 1;
7981 case E_DFmode:
7982 classes[0] = X86_64_SSEDF_CLASS;
7983 return 1;
7984 case E_XFmode:
7985 classes[0] = X86_64_X87_CLASS;
7986 classes[1] = X86_64_X87UP_CLASS;
7987 return 2;
7988 case E_TFmode:
7989 classes[0] = X86_64_SSE_CLASS;
7990 classes[1] = X86_64_SSEUP_CLASS;
7991 return 2;
7992 case E_SCmode:
7993 classes[0] = X86_64_SSE_CLASS;
7994 if (!(bit_offset % 64))
7995 return 1;
7996 else
7998 static bool warned;
8000 if (!warned && warn_psabi)
8002 warned = true;
8003 inform (input_location,
8004 "the ABI of passing structure with complex float"
8005 " member has changed in GCC 4.4");
8007 classes[1] = X86_64_SSESF_CLASS;
8008 return 2;
8010 case E_DCmode:
8011 classes[0] = X86_64_SSEDF_CLASS;
8012 classes[1] = X86_64_SSEDF_CLASS;
8013 return 2;
8014 case E_XCmode:
8015 classes[0] = X86_64_COMPLEX_X87_CLASS;
8016 return 1;
8017 case E_TCmode:
8018 /* This mode is larger than 16 bytes. */
8019 return 0;
8020 case E_V8SFmode:
8021 case E_V8SImode:
8022 case E_V32QImode:
8023 case E_V16HImode:
8024 case E_V4DFmode:
8025 case E_V4DImode:
8026 classes[0] = X86_64_SSE_CLASS;
8027 classes[1] = X86_64_SSEUP_CLASS;
8028 classes[2] = X86_64_SSEUP_CLASS;
8029 classes[3] = X86_64_SSEUP_CLASS;
8030 return 4;
8031 case E_V8DFmode:
8032 case E_V16SFmode:
8033 case E_V8DImode:
8034 case E_V16SImode:
8035 case E_V32HImode:
8036 case E_V64QImode:
8037 classes[0] = X86_64_SSE_CLASS;
8038 classes[1] = X86_64_SSEUP_CLASS;
8039 classes[2] = X86_64_SSEUP_CLASS;
8040 classes[3] = X86_64_SSEUP_CLASS;
8041 classes[4] = X86_64_SSEUP_CLASS;
8042 classes[5] = X86_64_SSEUP_CLASS;
8043 classes[6] = X86_64_SSEUP_CLASS;
8044 classes[7] = X86_64_SSEUP_CLASS;
8045 return 8;
8046 case E_V4SFmode:
8047 case E_V4SImode:
8048 case E_V16QImode:
8049 case E_V8HImode:
8050 case E_V2DFmode:
8051 case E_V2DImode:
8052 classes[0] = X86_64_SSE_CLASS;
8053 classes[1] = X86_64_SSEUP_CLASS;
8054 return 2;
8055 case E_V1TImode:
8056 case E_V1DImode:
8057 case E_V2SFmode:
8058 case E_V2SImode:
8059 case E_V4HImode:
8060 case E_V8QImode:
8061 classes[0] = X86_64_SSE_CLASS;
8062 return 1;
8063 case E_BLKmode:
8064 case E_VOIDmode:
8065 return 0;
8066 default:
8067 gcc_assert (VECTOR_MODE_P (mode));
8069 if (bytes > 16)
8070 return 0;
8072 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
8074 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
8075 classes[0] = X86_64_INTEGERSI_CLASS;
8076 else
8077 classes[0] = X86_64_INTEGER_CLASS;
8078 classes[1] = X86_64_INTEGER_CLASS;
8079 return 1 + (bytes > 8);
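/* Worked example for the classification above (a sketch, not part of
   the original sources).  For the 16-byte struct

     struct p { long l; double d; };

   classify_argument returns 2 with classes[0] = X86_64_INTEGER_CLASS and
   classes[1] = X86_64_SSEDF_CLASS, so examine_argument below counts one
   integer and one SSE register and, in the first argument slot, the value
   is passed in %rdi and %xmm0.  */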
8083 /* Examine the argument and set the number of registers required in each
8084 class. Return true iff the parameter should be passed in memory. */
8086 static bool
8087 examine_argument (machine_mode mode, const_tree type, int in_return,
8088 int *int_nregs, int *sse_nregs)
8090 enum x86_64_reg_class regclass[MAX_CLASSES];
8091 int n = classify_argument (mode, type, regclass, 0);
8093 *int_nregs = 0;
8094 *sse_nregs = 0;
8096 if (!n)
8097 return true;
8098 for (n--; n >= 0; n--)
8099 switch (regclass[n])
8101 case X86_64_INTEGER_CLASS:
8102 case X86_64_INTEGERSI_CLASS:
8103 (*int_nregs)++;
8104 break;
8105 case X86_64_SSE_CLASS:
8106 case X86_64_SSESF_CLASS:
8107 case X86_64_SSEDF_CLASS:
8108 (*sse_nregs)++;
8109 break;
8110 case X86_64_NO_CLASS:
8111 case X86_64_SSEUP_CLASS:
8112 break;
8113 case X86_64_X87_CLASS:
8114 case X86_64_X87UP_CLASS:
8115 case X86_64_COMPLEX_X87_CLASS:
8116 if (!in_return)
8117 return true;
8118 break;
8119 case X86_64_MEMORY_CLASS:
8120 gcc_unreachable ();
8123 return false;
8126 /* Construct a container for the argument used by the GCC interface. See
8127 FUNCTION_ARG for a detailed description. */
8129 static rtx
8130 construct_container (machine_mode mode, machine_mode orig_mode,
8131 const_tree type, int in_return, int nintregs, int nsseregs,
8132 const int *intreg, int sse_regno)
8134 /* The following variables hold the static issued_error state. */
8135 static bool issued_sse_arg_error;
8136 static bool issued_sse_ret_error;
8137 static bool issued_x87_ret_error;
8139 machine_mode tmpmode;
8140 int bytes =
8141 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8142 enum x86_64_reg_class regclass[MAX_CLASSES];
8143 int n;
8144 int i;
8145 int nexps = 0;
8146 int needed_sseregs, needed_intregs;
8147 rtx exp[MAX_CLASSES];
8148 rtx ret;
8150 n = classify_argument (mode, type, regclass, 0);
8151 if (!n)
8152 return NULL;
8153 if (examine_argument (mode, type, in_return, &needed_intregs,
8154 &needed_sseregs))
8155 return NULL;
8156 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
8157 return NULL;
8159 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
8160 some less clueful developer tries to use floating-point anyway. */
8161 if (needed_sseregs && !TARGET_SSE)
8163 if (in_return)
8165 if (!issued_sse_ret_error)
8167 error ("SSE register return with SSE disabled");
8168 issued_sse_ret_error = true;
8171 else if (!issued_sse_arg_error)
8173 error ("SSE register argument with SSE disabled");
8174 issued_sse_arg_error = true;
8176 return NULL;
8179 /* Likewise, error if the ABI requires us to return values in the
8180 x87 registers and the user specified -mno-80387. */
8181 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
8182 for (i = 0; i < n; i++)
8183 if (regclass[i] == X86_64_X87_CLASS
8184 || regclass[i] == X86_64_X87UP_CLASS
8185 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
8187 if (!issued_x87_ret_error)
8189 error ("x87 register return with x87 disabled");
8190 issued_x87_ret_error = true;
8192 return NULL;
8195 /* First construct the simple cases. Avoid SCmode, since we want to use
8196 a single register to pass this type. */
8197 if (n == 1 && mode != SCmode)
8198 switch (regclass[0])
8200 case X86_64_INTEGER_CLASS:
8201 case X86_64_INTEGERSI_CLASS:
8202 return gen_rtx_REG (mode, intreg[0]);
8203 case X86_64_SSE_CLASS:
8204 case X86_64_SSESF_CLASS:
8205 case X86_64_SSEDF_CLASS:
8206 if (mode != BLKmode)
8207 return gen_reg_or_parallel (mode, orig_mode,
8208 SSE_REGNO (sse_regno));
8209 break;
8210 case X86_64_X87_CLASS:
8211 case X86_64_COMPLEX_X87_CLASS:
8212 return gen_rtx_REG (mode, FIRST_STACK_REG);
8213 case X86_64_NO_CLASS:
8214 /* Zero sized array, struct or class. */
8215 return NULL;
8216 default:
8217 gcc_unreachable ();
8219 if (n == 2
8220 && regclass[0] == X86_64_SSE_CLASS
8221 && regclass[1] == X86_64_SSEUP_CLASS
8222 && mode != BLKmode)
8223 return gen_reg_or_parallel (mode, orig_mode,
8224 SSE_REGNO (sse_regno));
8225 if (n == 4
8226 && regclass[0] == X86_64_SSE_CLASS
8227 && regclass[1] == X86_64_SSEUP_CLASS
8228 && regclass[2] == X86_64_SSEUP_CLASS
8229 && regclass[3] == X86_64_SSEUP_CLASS
8230 && mode != BLKmode)
8231 return gen_reg_or_parallel (mode, orig_mode,
8232 SSE_REGNO (sse_regno));
8233 if (n == 8
8234 && regclass[0] == X86_64_SSE_CLASS
8235 && regclass[1] == X86_64_SSEUP_CLASS
8236 && regclass[2] == X86_64_SSEUP_CLASS
8237 && regclass[3] == X86_64_SSEUP_CLASS
8238 && regclass[4] == X86_64_SSEUP_CLASS
8239 && regclass[5] == X86_64_SSEUP_CLASS
8240 && regclass[6] == X86_64_SSEUP_CLASS
8241 && regclass[7] == X86_64_SSEUP_CLASS
8242 && mode != BLKmode)
8243 return gen_reg_or_parallel (mode, orig_mode,
8244 SSE_REGNO (sse_regno));
8245 if (n == 2
8246 && regclass[0] == X86_64_X87_CLASS
8247 && regclass[1] == X86_64_X87UP_CLASS)
8248 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8250 if (n == 2
8251 && regclass[0] == X86_64_INTEGER_CLASS
8252 && regclass[1] == X86_64_INTEGER_CLASS
8253 && (mode == CDImode || mode == TImode)
8254 && intreg[0] + 1 == intreg[1])
8255 return gen_rtx_REG (mode, intreg[0]);
8257 /* Otherwise figure out the entries of the PARALLEL. */
8258 for (i = 0; i < n; i++)
8260 int pos;
8262 switch (regclass[i])
8264 case X86_64_NO_CLASS:
8265 break;
8266 case X86_64_INTEGER_CLASS:
8267 case X86_64_INTEGERSI_CLASS:
8268 /* Merge TImodes on aligned occasions here too. */
8269 if (i * 8 + 8 > bytes)
8271 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8272 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8273 /* We've requested 24 bytes for which we
8274 don't have a mode. Use DImode. */
8275 tmpmode = DImode;
8277 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8278 tmpmode = SImode;
8279 else
8280 tmpmode = DImode;
8281 exp [nexps++]
8282 = gen_rtx_EXPR_LIST (VOIDmode,
8283 gen_rtx_REG (tmpmode, *intreg),
8284 GEN_INT (i*8));
8285 intreg++;
8286 break;
8287 case X86_64_SSESF_CLASS:
8288 exp [nexps++]
8289 = gen_rtx_EXPR_LIST (VOIDmode,
8290 gen_rtx_REG (SFmode,
8291 SSE_REGNO (sse_regno)),
8292 GEN_INT (i*8));
8293 sse_regno++;
8294 break;
8295 case X86_64_SSEDF_CLASS:
8296 exp [nexps++]
8297 = gen_rtx_EXPR_LIST (VOIDmode,
8298 gen_rtx_REG (DFmode,
8299 SSE_REGNO (sse_regno)),
8300 GEN_INT (i*8));
8301 sse_regno++;
8302 break;
8303 case X86_64_SSE_CLASS:
8304 pos = i;
8305 switch (n)
8307 case 1:
8308 tmpmode = DImode;
8309 break;
8310 case 2:
8311 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8313 tmpmode = TImode;
8314 i++;
8316 else
8317 tmpmode = DImode;
8318 break;
8319 case 4:
8320 gcc_assert (i == 0
8321 && regclass[1] == X86_64_SSEUP_CLASS
8322 && regclass[2] == X86_64_SSEUP_CLASS
8323 && regclass[3] == X86_64_SSEUP_CLASS);
8324 tmpmode = OImode;
8325 i += 3;
8326 break;
8327 case 8:
8328 gcc_assert (i == 0
8329 && regclass[1] == X86_64_SSEUP_CLASS
8330 && regclass[2] == X86_64_SSEUP_CLASS
8331 && regclass[3] == X86_64_SSEUP_CLASS
8332 && regclass[4] == X86_64_SSEUP_CLASS
8333 && regclass[5] == X86_64_SSEUP_CLASS
8334 && regclass[6] == X86_64_SSEUP_CLASS
8335 && regclass[7] == X86_64_SSEUP_CLASS);
8336 tmpmode = XImode;
8337 i += 7;
8338 break;
8339 default:
8340 gcc_unreachable ();
8342 exp [nexps++]
8343 = gen_rtx_EXPR_LIST (VOIDmode,
8344 gen_rtx_REG (tmpmode,
8345 SSE_REGNO (sse_regno)),
8346 GEN_INT (pos*8));
8347 sse_regno++;
8348 break;
8349 default:
8350 gcc_unreachable ();
8354 /* Empty aligned struct, union or class. */
8355 if (nexps == 0)
8356 return NULL;
8358 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8359 for (i = 0; i < nexps; i++)
8360 XVECEXP (ret, 0, i) = exp [i];
8361 return ret;
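/* For the struct from the classification example above, the container
   built here looks roughly like (sketch only):

     (parallel:BLK [
	(expr_list (reg:DI di) (const_int 0))
	(expr_list (reg:DF xmm0) (const_int 8))])

   i.e. bytes 0-7 live in %rdi and bytes 8-15 in %xmm0.  */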
8364 /* Update the data in CUM to advance over an argument of mode MODE
8365 and data type TYPE. (TYPE is null for libcalls where that information
8366 may not be available.)
8368 Return the number of integer registers advanced over. */
8370 static int
8371 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8372 const_tree type, HOST_WIDE_INT bytes,
8373 HOST_WIDE_INT words)
8375 int res = 0;
8376 bool error_p = false;
8378 if (TARGET_IAMCU)
8380 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8381 bytes in registers. */
8382 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8383 goto pass_in_reg;
8384 return res;
8387 switch (mode)
8389 default:
8390 break;
8392 case E_BLKmode:
8393 if (bytes < 0)
8394 break;
8395 /* FALLTHRU */
8397 case E_DImode:
8398 case E_SImode:
8399 case E_HImode:
8400 case E_QImode:
8401 pass_in_reg:
8402 cum->words += words;
8403 cum->nregs -= words;
8404 cum->regno += words;
8405 if (cum->nregs >= 0)
8406 res = words;
8407 if (cum->nregs <= 0)
8409 cum->nregs = 0;
8410 cfun->machine->arg_reg_available = false;
8411 cum->regno = 0;
8413 break;
8415 case E_OImode:
8416 /* OImode shouldn't be used directly. */
8417 gcc_unreachable ();
8419 case E_DFmode:
8420 if (cum->float_in_sse == -1)
8421 error_p = true;
8422 if (cum->float_in_sse < 2)
8423 break;
8424 /* FALLTHRU */
8425 case E_SFmode:
8426 if (cum->float_in_sse == -1)
8427 error_p = true;
8428 if (cum->float_in_sse < 1)
8429 break;
8430 /* FALLTHRU */
8432 case E_V8SFmode:
8433 case E_V8SImode:
8434 case E_V64QImode:
8435 case E_V32HImode:
8436 case E_V16SImode:
8437 case E_V8DImode:
8438 case E_V16SFmode:
8439 case E_V8DFmode:
8440 case E_V32QImode:
8441 case E_V16HImode:
8442 case E_V4DFmode:
8443 case E_V4DImode:
8444 case E_TImode:
8445 case E_V16QImode:
8446 case E_V8HImode:
8447 case E_V4SImode:
8448 case E_V2DImode:
8449 case E_V4SFmode:
8450 case E_V2DFmode:
8451 if (!type || !AGGREGATE_TYPE_P (type))
8453 cum->sse_words += words;
8454 cum->sse_nregs -= 1;
8455 cum->sse_regno += 1;
8456 if (cum->sse_nregs <= 0)
8458 cum->sse_nregs = 0;
8459 cum->sse_regno = 0;
8462 break;
8464 case E_V8QImode:
8465 case E_V4HImode:
8466 case E_V2SImode:
8467 case E_V2SFmode:
8468 case E_V1TImode:
8469 case E_V1DImode:
8470 if (!type || !AGGREGATE_TYPE_P (type))
8472 cum->mmx_words += words;
8473 cum->mmx_nregs -= 1;
8474 cum->mmx_regno += 1;
8475 if (cum->mmx_nregs <= 0)
8477 cum->mmx_nregs = 0;
8478 cum->mmx_regno = 0;
8481 break;
8483 if (error_p)
8485 cum->float_in_sse = 0;
8486 error ("calling %qD with SSE calling convention without "
8487 "SSE/SSE2 enabled", cum->decl);
8488 sorry ("this is a GCC bug that can be worked around by adding "
8489 "attribute used to function called");
8492 return res;
8495 static int
8496 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8497 const_tree type, HOST_WIDE_INT words, bool named)
8499 int int_nregs, sse_nregs;
8501 /* Unnamed 512- and 256-bit vector mode parameters are passed on the stack. */
8502 if (!named && (VALID_AVX512F_REG_MODE (mode)
8503 || VALID_AVX256_REG_MODE (mode)))
8504 return 0;
8506 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8507 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8509 cum->nregs -= int_nregs;
8510 cum->sse_nregs -= sse_nregs;
8511 cum->regno += int_nregs;
8512 cum->sse_regno += sse_nregs;
8513 return int_nregs;
8515 else
8517 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8518 cum->words = ROUND_UP (cum->words, align);
8519 cum->words += words;
8520 return 0;
8524 static int
8525 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8526 HOST_WIDE_INT words)
8528 /* Otherwise, this should be passed indirectly. */
8529 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8531 cum->words += words;
8532 if (cum->nregs > 0)
8534 cum->nregs -= 1;
8535 cum->regno += 1;
8536 return 1;
8538 return 0;
8541 /* Update the data in CUM to advance over an argument of mode MODE and
8542 data type TYPE. (TYPE is null for libcalls where that information
8543 may not be available.) */
8545 static void
8546 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8547 const_tree type, bool named)
8549 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8550 HOST_WIDE_INT bytes, words;
8551 int nregs;
8553 /* The argument of an interrupt handler is a special case and is
8554 handled in ix86_function_arg. */
8555 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8556 return;
8558 if (mode == BLKmode)
8559 bytes = int_size_in_bytes (type);
8560 else
8561 bytes = GET_MODE_SIZE (mode);
8562 words = CEIL (bytes, UNITS_PER_WORD);
8564 if (type)
8565 mode = type_natural_mode (type, NULL, false);
8567 if ((type && POINTER_BOUNDS_TYPE_P (type))
8568 || POINTER_BOUNDS_MODE_P (mode))
8570 /* If we pass bounds in the BT then just update the remaining bounds count. */
8571 if (cum->bnds_in_bt)
8573 cum->bnds_in_bt--;
8574 return;
8577 /* Update the remaining number of bounds to force. */
8578 if (cum->force_bnd_pass)
8579 cum->force_bnd_pass--;
8581 cum->bnd_regno++;
8583 return;
8586 /* The first arg not going to Bounds Tables resets this counter. */
8587 cum->bnds_in_bt = 0;
8588 /* For unnamed args we always pass bounds to avoid a bounds mess when
8589 the passed and received types do not match. If bounds do not follow an
8590 unnamed arg, still pretend the required number of bounds was passed. */
8591 if (cum->force_bnd_pass)
8593 cum->bnd_regno += cum->force_bnd_pass;
8594 cum->force_bnd_pass = 0;
8597 if (TARGET_64BIT)
8599 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8601 if (call_abi == MS_ABI)
8602 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8603 else
8604 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8606 else
8607 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8609 /* For stdarg we expect bounds to be passed for each value passed
8610 in register. */
8611 if (cum->stdarg)
8612 cum->force_bnd_pass = nregs;
8613 /* For pointers passed in memory we expect bounds passed in Bounds
8614 Table. */
8615 if (!nregs)
8617 /* Track if there are outgoing arguments on stack. */
8618 if (cum->caller)
8619 cfun->machine->outgoing_args_on_stack = true;
8621 cum->bnds_in_bt = chkp_type_bounds_count (type);
8625 /* Define where to put the arguments to a function.
8626 Value is zero to push the argument on the stack,
8627 or a hard register in which to store the argument.
8629 MODE is the argument's machine mode.
8630 TYPE is the data type of the argument (as a tree).
8631 This is null for libcalls where that information may
8632 not be available.
8633 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8634 the preceding args and about the function being called.
8635 NAMED is nonzero if this argument is a named parameter
8636 (otherwise it is an extra parameter matching an ellipsis). */
8638 static rtx
8639 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8640 machine_mode orig_mode, const_tree type,
8641 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8643 bool error_p = false;
8645 /* Avoid the AL settings for the Unix64 ABI. */
8646 if (mode == VOIDmode)
8647 return constm1_rtx;
8649 if (TARGET_IAMCU)
8651 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8652 bytes in registers. */
8653 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8654 goto pass_in_reg;
8655 return NULL_RTX;
8658 switch (mode)
8660 default:
8661 break;
8663 case E_BLKmode:
8664 if (bytes < 0)
8665 break;
8666 /* FALLTHRU */
8667 case E_DImode:
8668 case E_SImode:
8669 case E_HImode:
8670 case E_QImode:
8671 pass_in_reg:
8672 if (words <= cum->nregs)
8674 int regno = cum->regno;
8676 /* Fastcall allocates the first two DWORD (SImode) or
8677 smaller arguments to ECX and EDX if the argument isn't an
8678 aggregate type. */
8679 if (cum->fastcall)
8681 if (mode == BLKmode
8682 || mode == DImode
8683 || (type && AGGREGATE_TYPE_P (type)))
8684 break;
8686 /* ECX, not EAX, is the first allocated register. */
8687 if (regno == AX_REG)
8688 regno = CX_REG;
8690 return gen_rtx_REG (mode, regno);
8692 break;
8694 case E_DFmode:
8695 if (cum->float_in_sse == -1)
8696 error_p = true;
8697 if (cum->float_in_sse < 2)
8698 break;
8699 /* FALLTHRU */
8700 case E_SFmode:
8701 if (cum->float_in_sse == -1)
8702 error_p = true;
8703 if (cum->float_in_sse < 1)
8704 break;
8705 /* FALLTHRU */
8706 case E_TImode:
8707 /* In 32-bit mode, we pass TImode in xmm registers. */
8708 case E_V16QImode:
8709 case E_V8HImode:
8710 case E_V4SImode:
8711 case E_V2DImode:
8712 case E_V4SFmode:
8713 case E_V2DFmode:
8714 if (!type || !AGGREGATE_TYPE_P (type))
8716 if (cum->sse_nregs)
8717 return gen_reg_or_parallel (mode, orig_mode,
8718 cum->sse_regno + FIRST_SSE_REG);
8720 break;
8722 case E_OImode:
8723 case E_XImode:
8724 /* OImode and XImode shouldn't be used directly. */
8725 gcc_unreachable ();
8727 case E_V64QImode:
8728 case E_V32HImode:
8729 case E_V16SImode:
8730 case E_V8DImode:
8731 case E_V16SFmode:
8732 case E_V8DFmode:
8733 case E_V8SFmode:
8734 case E_V8SImode:
8735 case E_V32QImode:
8736 case E_V16HImode:
8737 case E_V4DFmode:
8738 case E_V4DImode:
8739 if (!type || !AGGREGATE_TYPE_P (type))
8741 if (cum->sse_nregs)
8742 return gen_reg_or_parallel (mode, orig_mode,
8743 cum->sse_regno + FIRST_SSE_REG);
8745 break;
8747 case E_V8QImode:
8748 case E_V4HImode:
8749 case E_V2SImode:
8750 case E_V2SFmode:
8751 case E_V1TImode:
8752 case E_V1DImode:
8753 if (!type || !AGGREGATE_TYPE_P (type))
8755 if (cum->mmx_nregs)
8756 return gen_reg_or_parallel (mode, orig_mode,
8757 cum->mmx_regno + FIRST_MMX_REG);
8759 break;
8761 if (error_p)
8763 cum->float_in_sse = 0;
8764 error ("calling %qD with SSE calling convention without "
8765 "SSE/SSE2 enabled", cum->decl);
8766 sorry ("this is a GCC bug that can be worked around by adding "
8767 "attribute used to function called");
8770 return NULL_RTX;
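/* Example of the 32-bit rules above (the prototype is hypothetical):

     void __attribute__ ((fastcall)) f (int a, int b, int c);

   passes A in %ecx, B in %edx and C on the stack, while a DImode or
   aggregate argument to the same function goes on the stack, as the
   fastcall check above arranges.  */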
8773 static rtx
8774 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8775 machine_mode orig_mode, const_tree type, bool named)
8777 /* Handle a hidden AL argument containing number of registers
8778 for varargs x86-64 functions. */
8779 if (mode == VOIDmode)
8780 return GEN_INT (cum->maybe_vaarg
8781 ? (cum->sse_nregs < 0
8782 ? X86_64_SSE_REGPARM_MAX
8783 : cum->sse_regno)
8784 : -1);
8786 switch (mode)
8788 default:
8789 break;
8791 case E_V8SFmode:
8792 case E_V8SImode:
8793 case E_V32QImode:
8794 case E_V16HImode:
8795 case E_V4DFmode:
8796 case E_V4DImode:
8797 case E_V16SFmode:
8798 case E_V16SImode:
8799 case E_V64QImode:
8800 case E_V32HImode:
8801 case E_V8DFmode:
8802 case E_V8DImode:
8803 /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack. */
8804 if (!named)
8805 return NULL;
8806 break;
8809 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8810 cum->sse_nregs,
8811 &x86_64_int_parameter_registers [cum->regno],
8812 cum->sse_regno);
8815 static rtx
8816 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8817 machine_mode orig_mode, bool named,
8818 HOST_WIDE_INT bytes)
8820 unsigned int regno;
8822 /* We need to add a clobber for MS_ABI -> SYSV ABI calls in expand_call.
8823 We use the value -2 to specify that the current function call is MS ABI. */
8824 if (mode == VOIDmode)
8825 return GEN_INT (-2);
8827 /* If we've run out of registers, it goes on the stack. */
8828 if (cum->nregs == 0)
8829 return NULL_RTX;
8831 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8833 /* Only floating point modes are passed in anything but integer regs. */
8834 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8836 if (named)
8837 regno = cum->regno + FIRST_SSE_REG;
8838 else
8840 rtx t1, t2;
8842 /* Unnamed floating parameters are passed in both the
8843 SSE and integer registers. */
8844 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8845 t2 = gen_rtx_REG (mode, regno);
8846 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8847 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8848 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8851 /* Handle aggregate types passed in registers. */
8852 if (orig_mode == BLKmode)
8854 if (bytes > 0 && bytes <= 8)
8855 mode = (bytes > 4 ? DImode : SImode);
8856 if (mode == BLKmode)
8857 mode = DImode;
8860 return gen_reg_or_parallel (mode, orig_mode, regno);
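/* A sketch of the MS_ABI rules implemented above (illustration only):

     void f (int a, double b, int c, double d);

   passes A in %ecx, B in %xmm1, C in %r8d and D in %xmm3; the slot
   position, not the argument class, selects the register.  An unnamed
   double is additionally duplicated into the matching integer register
   so a varargs callee can find it in the integer register save area.  */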
8863 /* Return where to put the arguments to a function.
8864 Return zero to push the argument on the stack, or a hard register in which to store the argument.
8866 MODE is the argument's machine mode. TYPE is the data type of the
8867 argument. It is null for libcalls where that information may not be
8868 available. CUM gives information about the preceding args and about
8869 the function being called. NAMED is nonzero if this argument is a
8870 named parameter (otherwise it is an extra parameter matching an
8871 ellipsis). */
8873 static rtx
8874 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8875 const_tree type, bool named)
8877 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8878 machine_mode mode = omode;
8879 HOST_WIDE_INT bytes, words;
8880 rtx arg;
8882 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8884 gcc_assert (type != NULL_TREE);
8885 if (POINTER_TYPE_P (type))
8887 /* This is the pointer argument. */
8888 gcc_assert (TYPE_MODE (type) == Pmode);
8889 /* It is at -WORD(AP) in the current frame in interrupt and
8890 exception handlers. */
8891 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8893 else
8895 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8896 && TREE_CODE (type) == INTEGER_TYPE
8897 && TYPE_MODE (type) == word_mode);
8898 /* The error code is the word-mode integer argument at
8899 -2 * WORD(AP) in the current frame of the exception
8900 handler. */
8901 arg = gen_rtx_MEM (word_mode,
8902 plus_constant (Pmode,
8903 arg_pointer_rtx,
8904 -2 * UNITS_PER_WORD));
8906 return arg;
8909 /* All pointer bounds arguments are handled separately here. */
8910 if ((type && POINTER_BOUNDS_TYPE_P (type))
8911 || POINTER_BOUNDS_MODE_P (mode))
8913 /* Return NULL if bounds are forced to go in Bounds Table. */
8914 if (cum->bnds_in_bt)
8915 arg = NULL;
8916 /* Return the next available bound reg if any. */
8917 else if (cum->bnd_regno <= LAST_BND_REG)
8918 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8919 /* Return the next special slot number otherwise. */
8920 else
8921 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8923 return arg;
8926 if (mode == BLKmode)
8927 bytes = int_size_in_bytes (type);
8928 else
8929 bytes = GET_MODE_SIZE (mode);
8930 words = CEIL (bytes, UNITS_PER_WORD);
8932 /* To simplify the code below, represent vector types with a vector mode
8933 even if MMX/SSE are not active. */
8934 if (type && TREE_CODE (type) == VECTOR_TYPE)
8935 mode = type_natural_mode (type, cum, false);
8937 if (TARGET_64BIT)
8939 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8941 if (call_abi == MS_ABI)
8942 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8943 else
8944 arg = function_arg_64 (cum, mode, omode, type, named);
8946 else
8947 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8949 /* Track if there are outgoing arguments on stack. */
8950 if (arg == NULL_RTX && cum->caller)
8951 cfun->machine->outgoing_args_on_stack = true;
8953 return arg;
8956 /* A C expression that indicates when an argument must be passed by
8957 reference. If nonzero for an argument, a copy of that argument is
8958 made in memory and a pointer to the argument is passed instead of
8959 the argument itself. The pointer is passed in whatever way is
8960 appropriate for passing a pointer to that type. */
8962 static bool
8963 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8964 const_tree type, bool)
8966 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8968 /* Bounds are never passed by reference. */
8969 if ((type && POINTER_BOUNDS_TYPE_P (type))
8970 || POINTER_BOUNDS_MODE_P (mode))
8971 return false;
8973 if (TARGET_64BIT)
8975 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8977 /* See Windows x64 Software Convention. */
8978 if (call_abi == MS_ABI)
8980 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8982 if (type)
8984 /* Arrays are passed by reference. */
8985 if (TREE_CODE (type) == ARRAY_TYPE)
8986 return true;
8988 if (RECORD_OR_UNION_TYPE_P (type))
8990 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8991 are passed by reference. */
8992 msize = int_size_in_bytes (type);
8996 /* __m128 is passed by reference. */
8997 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8999 else if (type && int_size_in_bytes (type) == -1)
9000 return true;
9003 return false;
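/* Examples of the MS_ABI rule above (illustration only): a 12-byte
   struct, whose size is not 1, 2, 4 or 8 bytes, is passed by reference,
   while an 8-byte struct is passed by value in a single register;
   __m128 (16 bytes) is likewise passed by reference.  */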
9006 /* Return true when TYPE should be 128bit aligned for 32bit argument
9007 passing ABI. XXX: This function is obsolete and is only used for
9008 checking psABI compatibility with previous versions of GCC. */
9010 static bool
9011 ix86_compat_aligned_value_p (const_tree type)
9013 machine_mode mode = TYPE_MODE (type);
9014 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
9015 || mode == TDmode
9016 || mode == TFmode
9017 || mode == TCmode)
9018 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
9019 return true;
9020 if (TYPE_ALIGN (type) < 128)
9021 return false;
9023 if (AGGREGATE_TYPE_P (type))
9025 /* Walk the aggregates recursively. */
9026 switch (TREE_CODE (type))
9028 case RECORD_TYPE:
9029 case UNION_TYPE:
9030 case QUAL_UNION_TYPE:
9032 tree field;
9034 /* Walk all the structure fields. */
9035 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9037 if (TREE_CODE (field) == FIELD_DECL
9038 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
9039 return true;
9041 break;
9044 case ARRAY_TYPE:
9045 /* Just in case some languages pass arrays by value. */
9046 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
9047 return true;
9048 break;
9050 default:
9051 gcc_unreachable ();
9054 return false;
9057 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
9058 XXX: This function is obsolete and is only used for checking psABI
9059 compatibility with previous versions of GCC. */
9061 static unsigned int
9062 ix86_compat_function_arg_boundary (machine_mode mode,
9063 const_tree type, unsigned int align)
9065 /* In 32bit, only _Decimal128 and __float128 are aligned to their
9066 natural boundaries. */
9067 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
9069 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
9070 make an exception for SSE modes since these require 128bit
9071 alignment.
9073 The handling here differs from field_alignment. ICC aligns MMX
9074 arguments to 4 byte boundaries, while structure fields are aligned
9075 to 8 byte boundaries. */
9076 if (!type)
9078 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
9079 align = PARM_BOUNDARY;
9081 else
9083 if (!ix86_compat_aligned_value_p (type))
9084 align = PARM_BOUNDARY;
9087 if (align > BIGGEST_ALIGNMENT)
9088 align = BIGGEST_ALIGNMENT;
9089 return align;
9092 /* Return true when TYPE should be 128bit aligned for 32bit argument
9093 passing ABI. */
9095 static bool
9096 ix86_contains_aligned_value_p (const_tree type)
9098 machine_mode mode = TYPE_MODE (type);
9100 if (mode == XFmode || mode == XCmode)
9101 return false;
9103 if (TYPE_ALIGN (type) < 128)
9104 return false;
9106 if (AGGREGATE_TYPE_P (type))
9108 /* Walk the aggregates recursively. */
9109 switch (TREE_CODE (type))
9111 case RECORD_TYPE:
9112 case UNION_TYPE:
9113 case QUAL_UNION_TYPE:
9115 tree field;
9117 /* Walk all the structure fields. */
9118 for (field = TYPE_FIELDS (type);
9119 field;
9120 field = DECL_CHAIN (field))
9122 if (TREE_CODE (field) == FIELD_DECL
9123 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
9124 return true;
9126 break;
9129 case ARRAY_TYPE:
9130 /* Just in case some languages pass arrays by value. */
9131 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
9132 return true;
9133 break;
9135 default:
9136 gcc_unreachable ();
9139 else
9140 return TYPE_ALIGN (type) >= 128;
9142 return false;
9145 /* Gives the alignment boundary, in bits, of an argument with the
9146 specified mode and type. */
9148 static unsigned int
9149 ix86_function_arg_boundary (machine_mode mode, const_tree type)
9151 unsigned int align;
9152 if (type)
9154 /* Since the main variant type is used for the call, convert TYPE
9155 to its main variant. */
9156 type = TYPE_MAIN_VARIANT (type);
9157 align = TYPE_ALIGN (type);
9158 if (TYPE_EMPTY_P (type))
9159 return PARM_BOUNDARY;
9161 else
9162 align = GET_MODE_ALIGNMENT (mode);
9163 if (align < PARM_BOUNDARY)
9164 align = PARM_BOUNDARY;
9165 else
9167 static bool warned;
9168 unsigned int saved_align = align;
9170 if (!TARGET_64BIT)
9172 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
9173 if (!type)
9175 if (mode == XFmode || mode == XCmode)
9176 align = PARM_BOUNDARY;
9178 else if (!ix86_contains_aligned_value_p (type))
9179 align = PARM_BOUNDARY;
9181 if (align < 128)
9182 align = PARM_BOUNDARY;
9185 if (warn_psabi
9186 && !warned
9187 && align != ix86_compat_function_arg_boundary (mode, type,
9188 saved_align))
9190 warned = true;
9191 inform (input_location,
9192 "The ABI for passing parameters with %d-byte"
9193 " alignment has changed in GCC 4.6",
9194 align / BITS_PER_UNIT);
9198 return align;
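/* Examples of the resulting boundaries (illustration only): a plain int
   argument gets PARM_BOUNDARY (32 bits on ia32), while an __m128 or a
   struct containing one is aligned to 128 bits; when that differs from
   what GCC 4.5 and earlier produced, the informational note above may
   be emitted once per compilation.  */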
9201 /* Return true if N is a possible register number of function value. */
9203 static bool
9204 ix86_function_value_regno_p (const unsigned int regno)
9206 switch (regno)
9208 case AX_REG:
9209 return true;
9210 case DX_REG:
9211 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9212 case DI_REG:
9213 case SI_REG:
9214 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9216 case BND0_REG:
9217 case BND1_REG:
9218 return chkp_function_instrumented_p (current_function_decl);
9220 /* Complex values are returned in %st(0)/%st(1) pair. */
9221 case ST0_REG:
9222 case ST1_REG:
9223 /* TODO: The function should depend on current function ABI but
9224 builtins.c would need updating then. Therefore we use the
9225 default ABI. */
9226 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9227 return false;
9228 return TARGET_FLOAT_RETURNS_IN_80387;
9230 /* Complex values are returned in %xmm0/%xmm1 pair. */
9231 case XMM0_REG:
9232 case XMM1_REG:
9233 return TARGET_SSE;
9235 case MM0_REG:
9236 if (TARGET_MACHO || TARGET_64BIT)
9237 return false;
9238 return TARGET_MMX;
9241 return false;
9244 /* Define how to find the value returned by a function.
9245 VALTYPE is the data type of the value (as a tree).
9246 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9247 otherwise, FUNC is 0. */
9249 static rtx
9250 function_value_32 (machine_mode orig_mode, machine_mode mode,
9251 const_tree fntype, const_tree fn)
9253 unsigned int regno;
9255 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9256 we normally prevent this case when mmx is not available. However
9257 some ABIs may require the result to be returned like DImode. */
9258 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9259 regno = FIRST_MMX_REG;
9261 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9262 we prevent this case when sse is not available. However some ABIs
9263 may require the result to be returned like integer TImode. */
9264 else if (mode == TImode
9265 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9266 regno = FIRST_SSE_REG;
9268 /* 32-byte vector modes in %ymm0. */
9269 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9270 regno = FIRST_SSE_REG;
9272 /* 64-byte vector modes in %zmm0. */
9273 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9274 regno = FIRST_SSE_REG;
9276 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9277 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9278 regno = FIRST_FLOAT_REG;
9279 else
9280 /* Most things go in %eax. */
9281 regno = AX_REG;
9283 /* Override FP return register with %xmm0 for local functions when
9284 SSE math is enabled or for functions with sseregparm attribute. */
9285 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9287 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9288 if (sse_level == -1)
9290 error ("calling %qD with SSE calling convention without "
9291 "SSE/SSE2 enabled", fn);
9292 sorry ("this is a GCC bug that can be worked around by adding "
9293 "attribute used to function called");
9295 else if ((sse_level >= 1 && mode == SFmode)
9296 || (sse_level == 2 && mode == DFmode))
9297 regno = FIRST_SSE_REG;
9300 /* OImode shouldn't be used directly. */
9301 gcc_assert (mode != OImode);
9303 return gen_rtx_REG (orig_mode, regno);
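/* Illustrative sketch, not part of the original source: typical 32-bit
   return locations chosen above are

     int f (void);          -> %eax     (AX_REG)
     long double g (void);  -> %st(0)   (x87 return enabled)
     __m128 h (void);       -> %xmm0    (SSE enabled)

   SFmode/DFmode values are redirected to %xmm0 only for sseregparm
   functions or local functions using SSE math, per the override above.  */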
9306 static rtx
9307 function_value_64 (machine_mode orig_mode, machine_mode mode,
9308 const_tree valtype)
9310 rtx ret;
9312 /* Handle libcalls, which don't provide a type node. */
9313 if (valtype == NULL)
9315 unsigned int regno;
9317 switch (mode)
9319 case E_SFmode:
9320 case E_SCmode:
9321 case E_DFmode:
9322 case E_DCmode:
9323 case E_TFmode:
9324 case E_SDmode:
9325 case E_DDmode:
9326 case E_TDmode:
9327 regno = FIRST_SSE_REG;
9328 break;
9329 case E_XFmode:
9330 case E_XCmode:
9331 regno = FIRST_FLOAT_REG;
9332 break;
9333 case E_TCmode:
9334 return NULL;
9335 default:
9336 regno = AX_REG;
9339 return gen_rtx_REG (mode, regno);
9341 else if (POINTER_TYPE_P (valtype))
9343 /* Pointers are always returned in word_mode. */
9344 mode = word_mode;
9347 ret = construct_container (mode, orig_mode, valtype, 1,
9348 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9349 x86_64_int_return_registers, 0);
9351 /* For zero-sized structures, construct_container returns NULL, but we
9352 need to keep the rest of the compiler happy by returning a meaningful value. */
9353 if (!ret)
9354 ret = gen_rtx_REG (orig_mode, AX_REG);
9356 return ret;
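/* Illustrative sketch, not part of the original source, assuming the SysV
   x86-64 classification that construct_container implements:

     long f (void);                     -> %rax
     double g (void);                   -> %xmm0
     struct { long a, b; } h (void);    -> %rax:%rdx   (two INTEGER eightbytes)
     struct { double x, y; } k (void);  -> %xmm0:%xmm1 (two SSE eightbytes)

   Zero-sized aggregates get the dummy %rax register noted above.  */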
9359 static rtx
9360 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9361 const_tree valtype)
9363 unsigned int regno = AX_REG;
9365 if (TARGET_SSE)
9367 switch (GET_MODE_SIZE (mode))
9369 case 16:
9370 if (valtype != NULL_TREE
9371 && !VECTOR_INTEGER_TYPE_P (valtype)
9372 && !VECTOR_INTEGER_TYPE_P (valtype)
9373 && !INTEGRAL_TYPE_P (valtype)
9374 && !VECTOR_FLOAT_TYPE_P (valtype))
9375 break;
9376 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9377 && !COMPLEX_MODE_P (mode))
9378 regno = FIRST_SSE_REG;
9379 break;
9380 case 8:
9381 case 4:
9382 if (mode == SFmode || mode == DFmode)
9383 regno = FIRST_SSE_REG;
9384 break;
9385 default:
9386 break;
9389 return gen_rtx_REG (orig_mode, regno);
9392 static rtx
9393 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9394 machine_mode orig_mode, machine_mode mode)
9396 const_tree fn, fntype;
9398 fn = NULL_TREE;
9399 if (fntype_or_decl && DECL_P (fntype_or_decl))
9400 fn = fntype_or_decl;
9401 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9403 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9404 || POINTER_BOUNDS_MODE_P (mode))
9405 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9406 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9407 return function_value_ms_64 (orig_mode, mode, valtype);
9408 else if (TARGET_64BIT)
9409 return function_value_64 (orig_mode, mode, valtype);
9410 else
9411 return function_value_32 (orig_mode, mode, fntype, fn);
9414 static rtx
9415 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9417 machine_mode mode, orig_mode;
9419 orig_mode = TYPE_MODE (valtype);
9420 mode = type_natural_mode (valtype, NULL, true);
9421 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9424 /* Return an RTX representing a place where a function returns
9425 or receives pointer bounds, or NULL if no bounds are returned.
9427 VALTYPE is a data type of a value returned by the function.
9429 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9430 or FUNCTION_TYPE of the function.
9432 If OUTGOING is false, return a place in which the caller will
9433 see the return value. Otherwise, return a place where a
9434 function returns a value. */
9436 static rtx
9437 ix86_function_value_bounds (const_tree valtype,
9438 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9439 bool outgoing ATTRIBUTE_UNUSED)
9441 rtx res = NULL_RTX;
9443 if (BOUNDED_TYPE_P (valtype))
9444 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9445 else if (chkp_type_has_pointer (valtype))
9447 bitmap slots;
9448 rtx bounds[2];
9449 bitmap_iterator bi;
9450 unsigned i, bnd_no = 0;
9452 bitmap_obstack_initialize (NULL);
9453 slots = BITMAP_ALLOC (NULL);
9454 chkp_find_bound_slots (valtype, slots);
9456 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9458 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9459 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9460 gcc_assert (bnd_no < 2);
9461 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9464 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9466 BITMAP_FREE (slots);
9467 bitmap_obstack_release (NULL);
9469 else
9470 res = NULL_RTX;
9472 return res;
9475 /* Pointer function arguments and return values are promoted to
9476 word_mode for normal functions. */
9478 static machine_mode
9479 ix86_promote_function_mode (const_tree type, machine_mode mode,
9480 int *punsignedp, const_tree fntype,
9481 int for_return)
9483 if (cfun->machine->func_type == TYPE_NORMAL
9484 && type != NULL_TREE
9485 && POINTER_TYPE_P (type))
9487 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9488 return word_mode;
9490 return default_promote_function_mode (type, mode, punsignedp, fntype,
9491 for_return);
9494 /* Return true if a structure, union or array with MODE containing FIELD
9495 should be accessed using BLKmode. */
9497 static bool
9498 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9500 /* Union with XFmode must be in BLKmode. */
9501 return (mode == XFmode
9502 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9503 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9507 ix86_libcall_value (machine_mode mode)
9509 return ix86_function_value_1 (NULL, NULL, mode, mode);
9512 /* Return true iff type is returned in memory. */
9514 static bool
9515 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9517 #ifdef SUBTARGET_RETURN_IN_MEMORY
9518 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9519 #else
9520 const machine_mode mode = type_natural_mode (type, NULL, true);
9521 HOST_WIDE_INT size;
9523 if (POINTER_BOUNDS_TYPE_P (type))
9524 return false;
9526 if (TARGET_64BIT)
9528 if (ix86_function_type_abi (fntype) == MS_ABI)
9530 size = int_size_in_bytes (type);
9532 /* __m128 is returned in xmm0. */
9533 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9534 || INTEGRAL_TYPE_P (type)
9535 || VECTOR_FLOAT_TYPE_P (type))
9536 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9537 && !COMPLEX_MODE_P (mode)
9538 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9539 return false;
9541 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
9542 return size != 1 && size != 2 && size != 4 && size != 8;
9544 else
9546 int needed_intregs, needed_sseregs;
9548 return examine_argument (mode, type, 1,
9549 &needed_intregs, &needed_sseregs);
9552 else
9554 size = int_size_in_bytes (type);
9556 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9557 bytes in registers. */
9558 if (TARGET_IAMCU)
9559 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9561 if (mode == BLKmode)
9562 return true;
9564 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9565 return false;
9567 if (VECTOR_MODE_P (mode) || mode == TImode)
9569 /* User-created vectors small enough to fit in EAX. */
9570 if (size < 8)
9571 return false;
9573 /* Unless the ABI prescribes otherwise,
9574 MMX/3dNow values are returned in MM0 if available. */
9576 if (size == 8)
9577 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9579 /* SSE values are returned in XMM0 if available. */
9580 if (size == 16)
9581 return !TARGET_SSE;
9583 /* AVX values are returned in YMM0 if available. */
9584 if (size == 32)
9585 return !TARGET_AVX;
9587 /* AVX512F values are returned in ZMM0 if available. */
9588 if (size == 64)
9589 return !TARGET_AVX512F;
9592 if (mode == XFmode)
9593 return false;
9595 if (size > 12)
9596 return true;
9598 /* OImode shouldn't be used directly. */
9599 gcc_assert (mode != OImode);
9601 return false;
9603 #endif
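/* Illustrative sketch, not part of the original source, of the 32-bit
   decisions above (non-MCU, default options):

     struct { int a, b, c; } f (void);  -> BLKmode, 12 bytes -> memory
     long double g (void);              -> XFmode            -> %st(0)
     __m128 h (void);                   -> 16-byte vector    -> memory only
                                           when SSE is disabled

   On 64-bit SysV the decision falls out of examine_argument instead, and
   under the MS ABI only 1/2/4/8-byte values (plus the __m128-style case
   above) stay in registers.  */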
9607 /* Create the va_list data type. */
9609 static tree
9610 ix86_build_builtin_va_list_64 (void)
9612 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9614 record = lang_hooks.types.make_type (RECORD_TYPE);
9615 type_decl = build_decl (BUILTINS_LOCATION,
9616 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9618 f_gpr = build_decl (BUILTINS_LOCATION,
9619 FIELD_DECL, get_identifier ("gp_offset"),
9620 unsigned_type_node);
9621 f_fpr = build_decl (BUILTINS_LOCATION,
9622 FIELD_DECL, get_identifier ("fp_offset"),
9623 unsigned_type_node);
9624 f_ovf = build_decl (BUILTINS_LOCATION,
9625 FIELD_DECL, get_identifier ("overflow_arg_area"),
9626 ptr_type_node);
9627 f_sav = build_decl (BUILTINS_LOCATION,
9628 FIELD_DECL, get_identifier ("reg_save_area"),
9629 ptr_type_node);
9631 va_list_gpr_counter_field = f_gpr;
9632 va_list_fpr_counter_field = f_fpr;
9634 DECL_FIELD_CONTEXT (f_gpr) = record;
9635 DECL_FIELD_CONTEXT (f_fpr) = record;
9636 DECL_FIELD_CONTEXT (f_ovf) = record;
9637 DECL_FIELD_CONTEXT (f_sav) = record;
9639 TYPE_STUB_DECL (record) = type_decl;
9640 TYPE_NAME (record) = type_decl;
9641 TYPE_FIELDS (record) = f_gpr;
9642 DECL_CHAIN (f_gpr) = f_fpr;
9643 DECL_CHAIN (f_fpr) = f_ovf;
9644 DECL_CHAIN (f_ovf) = f_sav;
9646 layout_type (record);
9648 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9649 NULL_TREE, TYPE_ATTRIBUTES (record));
9651 /* The correct type is an array type of one element. */
9652 return build_array_type (record, build_index_type (size_zero_node));
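/* For reference, a sketch (not part of the original source) of the C-level
   type the record above corresponds to, as described by the SysV x86-64
   psABI:

     typedef struct __va_list_tag
     {
       unsigned int gp_offset;    // offset into reg_save_area of next GPR arg
       unsigned int fp_offset;    // offset into reg_save_area of next SSE arg
       void *overflow_arg_area;   // next argument passed on the stack
       void *reg_save_area;       // base of the register save area
     } va_list[1];
*/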
9655 /* Setup the builtin va_list data type and for 64-bit the additional
9656 calling convention specific va_list data types. */
9658 static tree
9659 ix86_build_builtin_va_list (void)
9661 if (TARGET_64BIT)
9663 /* Initialize ABI specific va_list builtin types.
9665 In lto1, we can encounter two va_list types:
9666 - one as a result of the type-merge across TUs, and
9667 - the one constructed here.
9668 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9669 a type identity check in canonical_va_list_type based on
9670 TYPE_MAIN_VARIANT (which we used to have) will not work.
9671 Instead, we tag each va_list_type_node with its unique attribute, and
9672 look for the attribute in the type identity check in
9673 canonical_va_list_type.
9675 Tagging sysv_va_list_type_node directly with the attribute is
9676 problematic since it's an array of one record, which will decay into a
9677 pointer to record when used as parameter (see build_va_arg comments for
9678 an example), dropping the attribute in the process. So we tag the
9679 record instead. */
9681 /* For SYSV_ABI we use an array of one record. */
9682 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9684 /* For MS_ABI we use plain pointer to argument area. */
9685 tree char_ptr_type = build_pointer_type (char_type_node);
9686 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9687 TYPE_ATTRIBUTES (char_ptr_type));
9688 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9690 return ((ix86_abi == MS_ABI)
9691 ? ms_va_list_type_node
9692 : sysv_va_list_type_node);
9694 else
9696 /* For i386 we use plain pointer to argument area. */
9697 return build_pointer_type (char_type_node);
9701 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
9703 static void
9704 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9706 rtx save_area, mem;
9707 alias_set_type set;
9708 int i, max;
9710 /* GPR size of varargs save area. */
9711 if (cfun->va_list_gpr_size)
9712 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9713 else
9714 ix86_varargs_gpr_size = 0;
9716 /* FPR size of varargs save area. We don't need it if we don't pass
9717 anything in SSE registers. */
9718 if (TARGET_SSE && cfun->va_list_fpr_size)
9719 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9720 else
9721 ix86_varargs_fpr_size = 0;
9723 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9724 return;
9726 save_area = frame_pointer_rtx;
9727 set = get_varargs_alias_set ();
9729 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9730 if (max > X86_64_REGPARM_MAX)
9731 max = X86_64_REGPARM_MAX;
9733 for (i = cum->regno; i < max; i++)
9735 mem = gen_rtx_MEM (word_mode,
9736 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9737 MEM_NOTRAP_P (mem) = 1;
9738 set_mem_alias_set (mem, set);
9739 emit_move_insn (mem,
9740 gen_rtx_REG (word_mode,
9741 x86_64_int_parameter_registers[i]));
9744 if (ix86_varargs_fpr_size)
9746 machine_mode smode;
9747 rtx_code_label *label;
9748 rtx test;
9750 /* Now emit code to save SSE registers.  The AX parameter contains the
9751 number of SSE parameter registers used to call this function, though
9752 all we actually check here is the zero/non-zero status. */
9754 label = gen_label_rtx ();
9755 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9756 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9757 label));
9759 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9760 we used movdqa (i.e. TImode) instead? Perhaps even better would
9761 be if we could determine the real mode of the data, via a hook
9762 into pass_stdarg. Ignore all that for now. */
9763 smode = V4SFmode;
9764 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9765 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9767 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9768 if (max > X86_64_SSE_REGPARM_MAX)
9769 max = X86_64_SSE_REGPARM_MAX;
9771 for (i = cum->sse_regno; i < max; ++i)
9773 mem = plus_constant (Pmode, save_area,
9774 i * 16 + ix86_varargs_gpr_size);
9775 mem = gen_rtx_MEM (smode, mem);
9776 MEM_NOTRAP_P (mem) = 1;
9777 set_mem_alias_set (mem, set);
9778 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9780 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9783 emit_label (label);
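/* Resulting register save area layout, sketched for reference (not part of
   the original source); with X86_64_REGPARM_MAX == 6 and
   X86_64_SSE_REGPARM_MAX == 8 and the full GPR area allocated:

     bytes   0.. 47:  %rdi, %rsi, %rdx, %rcx, %r8, %r9   (8 bytes each)
     bytes  48..175:  %xmm0 .. %xmm7                     (16 bytes each)

   The SSE half is written only when %al is non-zero at entry, per the
   cbranch emitted above.  */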
9787 static void
9788 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9790 alias_set_type set = get_varargs_alias_set ();
9791 int i;
9793 /* Reset to zero, as there might have been a sysv va_arg used
9794 before. */
9795 ix86_varargs_gpr_size = 0;
9796 ix86_varargs_fpr_size = 0;
9798 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9800 rtx reg, mem;
9802 mem = gen_rtx_MEM (Pmode,
9803 plus_constant (Pmode, virtual_incoming_args_rtx,
9804 i * UNITS_PER_WORD));
9805 MEM_NOTRAP_P (mem) = 1;
9806 set_mem_alias_set (mem, set);
9808 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9809 emit_move_insn (mem, reg);
9813 static void
9814 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9815 tree type, int *, int no_rtl)
9817 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9818 CUMULATIVE_ARGS next_cum;
9819 tree fntype;
9821 /* This argument doesn't appear to be used anymore. Which is good,
9822 because the old code here didn't suppress rtl generation. */
9823 gcc_assert (!no_rtl);
9825 if (!TARGET_64BIT)
9826 return;
9828 fntype = TREE_TYPE (current_function_decl);
9830 /* For varargs, we do not want to skip the dummy va_dcl argument.
9831 For stdargs, we do want to skip the last named argument. */
9832 next_cum = *cum;
9833 if (stdarg_p (fntype))
9834 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9835 true);
9837 if (cum->call_abi == MS_ABI)
9838 setup_incoming_varargs_ms_64 (&next_cum);
9839 else
9840 setup_incoming_varargs_64 (&next_cum);
9843 static void
9844 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9845 machine_mode mode,
9846 tree type,
9847 int *pretend_size ATTRIBUTE_UNUSED,
9848 int no_rtl)
9850 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9851 CUMULATIVE_ARGS next_cum;
9852 tree fntype;
9853 rtx save_area;
9854 int bnd_reg, i, max;
9856 gcc_assert (!no_rtl);
9858 /* Do nothing if we use plain pointer to argument area. */
9859 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9860 return;
9862 fntype = TREE_TYPE (current_function_decl);
9864 /* For varargs, we do not want to skip the dummy va_dcl argument.
9865 For stdargs, we do want to skip the last named argument. */
9866 next_cum = *cum;
9867 if (stdarg_p (fntype))
9868 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9869 true);
9870 save_area = frame_pointer_rtx;
9872 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9873 if (max > X86_64_REGPARM_MAX)
9874 max = X86_64_REGPARM_MAX;
9876 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9877 if (chkp_function_instrumented_p (current_function_decl))
9878 for (i = cum->regno; i < max; i++)
9880 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9881 rtx ptr = gen_rtx_REG (Pmode,
9882 x86_64_int_parameter_registers[i]);
9883 rtx bounds;
9885 if (bnd_reg <= LAST_BND_REG)
9886 bounds = gen_rtx_REG (BNDmode, bnd_reg);
9887 else
9889 rtx ldx_addr =
9890 plus_constant (Pmode, arg_pointer_rtx,
9891 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9892 bounds = gen_reg_rtx (BNDmode);
9893 emit_insn (BNDmode == BND64mode
9894 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9895 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9898 emit_insn (BNDmode == BND64mode
9899 ? gen_bnd64_stx (addr, ptr, bounds)
9900 : gen_bnd32_stx (addr, ptr, bounds));
9902 bnd_reg++;
9907 /* Return true if TYPE is the char * kind of va_list. */
9909 static bool
9910 is_va_list_char_pointer (tree type)
9912 tree canonic;
9914 /* For 32-bit it is always true. */
9915 if (!TARGET_64BIT)
9916 return true;
9917 canonic = ix86_canonical_va_list_type (type);
9918 return (canonic == ms_va_list_type_node
9919 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9922 /* Implement va_start. */
9924 static void
9925 ix86_va_start (tree valist, rtx nextarg)
9927 HOST_WIDE_INT words, n_gpr, n_fpr;
9928 tree f_gpr, f_fpr, f_ovf, f_sav;
9929 tree gpr, fpr, ovf, sav, t;
9930 tree type;
9931 rtx ovf_rtx;
9933 if (flag_split_stack
9934 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9936 unsigned int scratch_regno;
9938 /* When we are splitting the stack, we can't refer to the stack
9939 arguments using internal_arg_pointer, because they may be on
9940 the old stack. The split stack prologue will arrange to
9941 leave a pointer to the old stack arguments in a scratch
9942 register, which we here copy to a pseudo-register. The split
9943 stack prologue can't set the pseudo-register directly because
9944 it (the prologue) runs before any registers have been saved. */
9946 scratch_regno = split_stack_prologue_scratch_regno ();
9947 if (scratch_regno != INVALID_REGNUM)
9949 rtx reg;
9950 rtx_insn *seq;
9952 reg = gen_reg_rtx (Pmode);
9953 cfun->machine->split_stack_varargs_pointer = reg;
9955 start_sequence ();
9956 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9957 seq = get_insns ();
9958 end_sequence ();
9960 push_topmost_sequence ();
9961 emit_insn_after (seq, entry_of_function ());
9962 pop_topmost_sequence ();
9966 /* Only the 64-bit target needs something special. */
9967 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9969 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9970 std_expand_builtin_va_start (valist, nextarg);
9971 else
9973 rtx va_r, next;
9975 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9976 next = expand_binop (ptr_mode, add_optab,
9977 cfun->machine->split_stack_varargs_pointer,
9978 crtl->args.arg_offset_rtx,
9979 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9980 convert_move (va_r, next, 0);
9982 /* Store zero bounds for va_list. */
9983 if (chkp_function_instrumented_p (current_function_decl))
9984 chkp_expand_bounds_reset_for_mem (valist,
9985 make_tree (TREE_TYPE (valist),
9986 next));
9989 return;
9992 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9993 f_fpr = DECL_CHAIN (f_gpr);
9994 f_ovf = DECL_CHAIN (f_fpr);
9995 f_sav = DECL_CHAIN (f_ovf);
9997 valist = build_simple_mem_ref (valist);
9998 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9999 /* The following should be folded into the MEM_REF offset. */
10000 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
10001 f_gpr, NULL_TREE);
10002 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
10003 f_fpr, NULL_TREE);
10004 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
10005 f_ovf, NULL_TREE);
10006 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
10007 f_sav, NULL_TREE);
10009 /* Count number of gp and fp argument registers used. */
10010 words = crtl->args.info.words;
10011 n_gpr = crtl->args.info.regno;
10012 n_fpr = crtl->args.info.sse_regno;
10014 if (cfun->va_list_gpr_size)
10016 type = TREE_TYPE (gpr);
10017 t = build2 (MODIFY_EXPR, type,
10018 gpr, build_int_cst (type, n_gpr * 8));
10019 TREE_SIDE_EFFECTS (t) = 1;
10020 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10023 if (TARGET_SSE && cfun->va_list_fpr_size)
10025 type = TREE_TYPE (fpr);
10026 t = build2 (MODIFY_EXPR, type, fpr,
10027 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
10028 TREE_SIDE_EFFECTS (t) = 1;
10029 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10032 /* Find the overflow area. */
10033 type = TREE_TYPE (ovf);
10034 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
10035 ovf_rtx = crtl->args.internal_arg_pointer;
10036 else
10037 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
10038 t = make_tree (type, ovf_rtx);
10039 if (words != 0)
10040 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
10042 /* Store zero bounds for overflow area pointer. */
10043 if (chkp_function_instrumented_p (current_function_decl))
10044 chkp_expand_bounds_reset_for_mem (ovf, t);
10046 t = build2 (MODIFY_EXPR, type, ovf, t);
10047 TREE_SIDE_EFFECTS (t) = 1;
10048 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10050 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
10052 /* Find the register save area.
10053 The function prologue saves it right above the stack frame. */
10054 type = TREE_TYPE (sav);
10055 t = make_tree (type, frame_pointer_rtx);
10056 if (!ix86_varargs_gpr_size)
10057 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
10059 /* Store zero bounds for save area pointer. */
10060 if (chkp_function_instrumented_p (current_function_decl))
10061 chkp_expand_bounds_reset_for_mem (sav, t);
10063 t = build2 (MODIFY_EXPR, type, sav, t);
10064 TREE_SIDE_EFFECTS (t) = 1;
10065 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
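/* Worked example, not part of the original source: for

     void f (int n, ...)

   with no named SSE arguments, the code above initializes roughly

     gp_offset         = 1 * 8                         // one named GPR arg used
     fp_offset         = 0 * 16 + 8 * X86_64_REGPARM_MAX   // == 48
     overflow_arg_area = first stack argument
     reg_save_area     = area written by setup_incoming_varargs_64

   matching the save-area layout sketched earlier.  */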
10069 /* Implement va_arg. */
10071 static tree
10072 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
10073 gimple_seq *post_p)
10075 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
10076 tree f_gpr, f_fpr, f_ovf, f_sav;
10077 tree gpr, fpr, ovf, sav, t;
10078 int size, rsize;
10079 tree lab_false, lab_over = NULL_TREE;
10080 tree addr, t2;
10081 rtx container;
10082 int indirect_p = 0;
10083 tree ptrtype;
10084 machine_mode nat_mode;
10085 unsigned int arg_boundary;
10087 /* Only the 64-bit target needs something special. */
10088 if (is_va_list_char_pointer (TREE_TYPE (valist)))
10089 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
10091 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
10092 f_fpr = DECL_CHAIN (f_gpr);
10093 f_ovf = DECL_CHAIN (f_fpr);
10094 f_sav = DECL_CHAIN (f_ovf);
10096 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
10097 valist, f_gpr, NULL_TREE);
10099 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
10100 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
10101 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
10103 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10104 if (indirect_p)
10105 type = build_pointer_type (type);
10106 size = arg_int_size_in_bytes (type);
10107 rsize = CEIL (size, UNITS_PER_WORD);
10109 nat_mode = type_natural_mode (type, NULL, false);
10110 switch (nat_mode)
10112 case E_V8SFmode:
10113 case E_V8SImode:
10114 case E_V32QImode:
10115 case E_V16HImode:
10116 case E_V4DFmode:
10117 case E_V4DImode:
10118 case E_V16SFmode:
10119 case E_V16SImode:
10120 case E_V64QImode:
10121 case E_V32HImode:
10122 case E_V8DFmode:
10123 case E_V8DImode:
10124 /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack. */
10125 if (!TARGET_64BIT_MS_ABI)
10127 container = NULL;
10128 break;
10130 /* FALLTHRU */
10132 default:
10133 container = construct_container (nat_mode, TYPE_MODE (type),
10134 type, 0, X86_64_REGPARM_MAX,
10135 X86_64_SSE_REGPARM_MAX, intreg,
10137 break;
10140 /* Pull the value out of the saved registers. */
10142 addr = create_tmp_var (ptr_type_node, "addr");
10144 if (container)
10146 int needed_intregs, needed_sseregs;
10147 bool need_temp;
10148 tree int_addr, sse_addr;
10150 lab_false = create_artificial_label (UNKNOWN_LOCATION);
10151 lab_over = create_artificial_label (UNKNOWN_LOCATION);
10153 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
10155 need_temp = (!REG_P (container)
10156 && ((needed_intregs && TYPE_ALIGN (type) > 64)
10157 || TYPE_ALIGN (type) > 128));
10159 /* In case we are passing a structure, verify that it is a consecutive block
10160 in the register save area. If not, we need to do moves. */
10161 if (!need_temp && !REG_P (container))
10163 /* Verify that all registers are strictly consecutive. */
10164 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
10166 int i;
10168 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10170 rtx slot = XVECEXP (container, 0, i);
10171 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
10172 || INTVAL (XEXP (slot, 1)) != i * 16)
10173 need_temp = true;
10176 else
10178 int i;
10180 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10182 rtx slot = XVECEXP (container, 0, i);
10183 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
10184 || INTVAL (XEXP (slot, 1)) != i * 8)
10185 need_temp = true;
10189 if (!need_temp)
10191 int_addr = addr;
10192 sse_addr = addr;
10194 else
10196 int_addr = create_tmp_var (ptr_type_node, "int_addr");
10197 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10200 /* First ensure that we fit completely in registers. */
10201 if (needed_intregs)
10203 t = build_int_cst (TREE_TYPE (gpr),
10204 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10205 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10206 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10207 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10208 gimplify_and_add (t, pre_p);
10210 if (needed_sseregs)
10212 t = build_int_cst (TREE_TYPE (fpr),
10213 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10214 + X86_64_REGPARM_MAX * 8);
10215 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10216 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10217 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10218 gimplify_and_add (t, pre_p);
10221 /* Compute index to start of area used for integer regs. */
10222 if (needed_intregs)
10224 /* int_addr = gpr + sav; */
10225 t = fold_build_pointer_plus (sav, gpr);
10226 gimplify_assign (int_addr, t, pre_p);
10228 if (needed_sseregs)
10230 /* sse_addr = fpr + sav; */
10231 t = fold_build_pointer_plus (sav, fpr);
10232 gimplify_assign (sse_addr, t, pre_p);
10234 if (need_temp)
10236 int i, prev_size = 0;
10237 tree temp = create_tmp_var (type, "va_arg_tmp");
10239 /* addr = &temp; */
10240 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10241 gimplify_assign (addr, t, pre_p);
10243 for (i = 0; i < XVECLEN (container, 0); i++)
10245 rtx slot = XVECEXP (container, 0, i);
10246 rtx reg = XEXP (slot, 0);
10247 machine_mode mode = GET_MODE (reg);
10248 tree piece_type;
10249 tree addr_type;
10250 tree daddr_type;
10251 tree src_addr, src;
10252 int src_offset;
10253 tree dest_addr, dest;
10254 int cur_size = GET_MODE_SIZE (mode);
10256 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10257 prev_size = INTVAL (XEXP (slot, 1));
10258 if (prev_size + cur_size > size)
10260 cur_size = size - prev_size;
10261 unsigned int nbits = cur_size * BITS_PER_UNIT;
10262 if (!int_mode_for_size (nbits, 1).exists (&mode))
10263 mode = QImode;
10265 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10266 if (mode == GET_MODE (reg))
10267 addr_type = build_pointer_type (piece_type);
10268 else
10269 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10270 true);
10271 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10272 true);
10274 if (SSE_REGNO_P (REGNO (reg)))
10276 src_addr = sse_addr;
10277 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10279 else
10281 src_addr = int_addr;
10282 src_offset = REGNO (reg) * 8;
10284 src_addr = fold_convert (addr_type, src_addr);
10285 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10287 dest_addr = fold_convert (daddr_type, addr);
10288 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10289 if (cur_size == GET_MODE_SIZE (mode))
10291 src = build_va_arg_indirect_ref (src_addr);
10292 dest = build_va_arg_indirect_ref (dest_addr);
10294 gimplify_assign (dest, src, pre_p);
10296 else
10298 tree copy
10299 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10300 3, dest_addr, src_addr,
10301 size_int (cur_size));
10302 gimplify_and_add (copy, pre_p);
10304 prev_size += cur_size;
10308 if (needed_intregs)
10310 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10311 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10312 gimplify_assign (gpr, t, pre_p);
10315 if (needed_sseregs)
10317 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10318 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10319 gimplify_assign (unshare_expr (fpr), t, pre_p);
10322 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10324 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10327 /* ... otherwise out of the overflow area. */
10329 /* When we align a parameter on the stack for the caller, a requested
10330 alignment beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
10331 MAX_SUPPORTED_STACK_ALIGNMENT.  Match the callee here with the
10332 caller. */
10333 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10334 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10335 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10337 /* Care for on-stack alignment if needed. */
10338 if (arg_boundary <= 64 || size == 0)
10339 t = ovf;
10340 else
10342 HOST_WIDE_INT align = arg_boundary / 8;
10343 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10344 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10345 build_int_cst (TREE_TYPE (t), -align));
10348 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10349 gimplify_assign (addr, t, pre_p);
10351 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10352 gimplify_assign (unshare_expr (ovf), t, pre_p);
10354 if (container)
10355 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10357 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10358 addr = fold_convert (ptrtype, addr);
10360 if (indirect_p)
10361 addr = build_va_arg_indirect_ref (addr);
10362 return build_va_arg_indirect_ref (addr);
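/* Rough sketch, not part of the original source, of what the code above
   gimplifies for va_arg (ap, int):

     if (ap->gp_offset >= 48)       // (X86_64_REGPARM_MAX - 1 + 1) * 8
       goto overflow;
     addr = ap->reg_save_area + ap->gp_offset;
     ap->gp_offset += 8;
     goto done;
   overflow:
     addr = align (ap->overflow_arg_area, arg_boundary);
     ap->overflow_arg_area = addr + rsize * UNITS_PER_WORD;
   done:
     result = *(int *) addr;

   SSE-classed arguments use fp_offset and 16-byte slots instead, and
   multi-register aggregates may be copied into a temporary first.  */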
10365 /* Return true if OPNUM's MEM should be matched
10366 in movabs* patterns. */
10368 bool
10369 ix86_check_movabs (rtx insn, int opnum)
10371 rtx set, mem;
10373 set = PATTERN (insn);
10374 if (GET_CODE (set) == PARALLEL)
10375 set = XVECEXP (set, 0, 0);
10376 gcc_assert (GET_CODE (set) == SET);
10377 mem = XEXP (set, opnum);
10378 while (SUBREG_P (mem))
10379 mem = SUBREG_REG (mem);
10380 gcc_assert (MEM_P (mem));
10381 return volatile_ok || !MEM_VOLATILE_P (mem);
10384 /* Return false if INSN contains a MEM with a non-default address space. */
10385 bool
10386 ix86_check_no_addr_space (rtx insn)
10388 subrtx_var_iterator::array_type array;
10389 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10391 rtx x = *iter;
10392 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10393 return false;
10395 return true;
10398 /* Initialize the table of extra 80387 mathematical constants. */
10400 static void
10401 init_ext_80387_constants (void)
10403 static const char * cst[5] =
10405 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10406 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10407 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10408 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10409 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10411 int i;
10413 for (i = 0; i < 5; i++)
10415 real_from_string (&ext_80387_constants_table[i], cst[i]);
10416 /* Ensure each constant is rounded to XFmode precision. */
10417 real_convert (&ext_80387_constants_table[i],
10418 XFmode, &ext_80387_constants_table[i]);
10421 ext_80387_constants_init = 1;
10424 /* Return non-zero if the constant is something that
10425 can be loaded with a special instruction. */
10428 standard_80387_constant_p (rtx x)
10430 machine_mode mode = GET_MODE (x);
10432 const REAL_VALUE_TYPE *r;
10434 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10435 return -1;
10437 if (x == CONST0_RTX (mode))
10438 return 1;
10439 if (x == CONST1_RTX (mode))
10440 return 2;
10442 r = CONST_DOUBLE_REAL_VALUE (x);
10444 /* For XFmode constants, try to find a special 80387 instruction when
10445 optimizing for size or on those CPUs that benefit from them. */
10446 if (mode == XFmode
10447 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10449 int i;
10451 if (! ext_80387_constants_init)
10452 init_ext_80387_constants ();
10454 for (i = 0; i < 5; i++)
10455 if (real_identical (r, &ext_80387_constants_table[i]))
10456 return i + 3;
10459 /* Load of the constant -0.0 or -1.0 will be split as
10460 fldz;fchs or fld1;fchs sequence. */
10461 if (real_isnegzero (r))
10462 return 8;
10463 if (real_identical (r, &dconstm1))
10464 return 9;
10466 return 0;
10469 /* Return the opcode of the special instruction to be used to load
10470 the constant X. */
10472 const char *
10473 standard_80387_constant_opcode (rtx x)
10475 switch (standard_80387_constant_p (x))
10477 case 1:
10478 return "fldz";
10479 case 2:
10480 return "fld1";
10481 case 3:
10482 return "fldlg2";
10483 case 4:
10484 return "fldln2";
10485 case 5:
10486 return "fldl2e";
10487 case 6:
10488 return "fldl2t";
10489 case 7:
10490 return "fldpi";
10491 case 8:
10492 case 9:
10493 return "#";
10494 default:
10495 gcc_unreachable ();
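/* Usage sketch, not part of the original source: for an XFmode CONST_DOUBLE
   equal to 1.0, standard_80387_constant_p returns 2 and the switch above
   yields "fld1"; for the value of pi it returns 7 ("fldpi"), provided the
   extended constant table is enabled (optimizing for size or
   TARGET_EXT_80387_CONSTANTS).  */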
10499 /* Return the CONST_DOUBLE representing the 80387 constant that is
10500 loaded by the specified special instruction. The argument IDX
10501 matches the return value from standard_80387_constant_p. */
10504 standard_80387_constant_rtx (int idx)
10506 int i;
10508 if (! ext_80387_constants_init)
10509 init_ext_80387_constants ();
10511 switch (idx)
10513 case 3:
10514 case 4:
10515 case 5:
10516 case 6:
10517 case 7:
10518 i = idx - 3;
10519 break;
10521 default:
10522 gcc_unreachable ();
10525 return const_double_from_real_value (ext_80387_constants_table[i],
10526 XFmode);
10529 /* Return 1 if X is all-bits-zero and 2 if X is all-bits-one, in a
10530 supported SSE/AVX vector mode.  Return 0 otherwise. */
10533 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10535 machine_mode mode;
10537 if (!TARGET_SSE)
10538 return 0;
10540 mode = GET_MODE (x);
10542 if (x == const0_rtx || const0_operand (x, mode))
10543 return 1;
10545 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10547 /* VOIDmode integer constant, get mode from the predicate. */
10548 if (mode == VOIDmode)
10549 mode = pred_mode;
10551 switch (GET_MODE_SIZE (mode))
10553 case 64:
10554 if (TARGET_AVX512F)
10555 return 2;
10556 break;
10557 case 32:
10558 if (TARGET_AVX2)
10559 return 2;
10560 break;
10561 case 16:
10562 if (TARGET_SSE2)
10563 return 2;
10564 break;
10565 case 0:
10566 /* VOIDmode */
10567 gcc_unreachable ();
10568 default:
10569 break;
10573 return 0;
10576 /* Return the opcode of the special instruction to be used to load
10577 the constant operands[1] into operands[0]. */
10579 const char *
10580 standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
10582 machine_mode mode;
10583 rtx x = operands[1];
10585 gcc_assert (TARGET_SSE);
10587 mode = GET_MODE (x);
10589 if (x == const0_rtx || const0_operand (x, mode))
10591 switch (get_attr_mode (insn))
10593 case MODE_TI:
10594 if (!EXT_REX_SSE_REG_P (operands[0]))
10595 return "%vpxor\t%0, %d0";
10596 /* FALLTHRU */
10597 case MODE_XI:
10598 case MODE_OI:
10599 if (EXT_REX_SSE_REG_P (operands[0]))
10600 return (TARGET_AVX512VL
10601 ? "vpxord\t%x0, %x0, %x0"
10602 : "vpxord\t%g0, %g0, %g0");
10603 return "vpxor\t%x0, %x0, %x0";
10605 case MODE_V2DF:
10606 if (!EXT_REX_SSE_REG_P (operands[0]))
10607 return "%vxorpd\t%0, %d0";
10608 /* FALLTHRU */
10609 case MODE_V8DF:
10610 case MODE_V4DF:
10611 if (!EXT_REX_SSE_REG_P (operands[0]))
10612 return "vxorpd\t%x0, %x0, %x0";
10613 else if (TARGET_AVX512DQ)
10614 return (TARGET_AVX512VL
10615 ? "vxorpd\t%x0, %x0, %x0"
10616 : "vxorpd\t%g0, %g0, %g0");
10617 else
10618 return (TARGET_AVX512VL
10619 ? "vpxorq\t%x0, %x0, %x0"
10620 : "vpxorq\t%g0, %g0, %g0");
10622 case MODE_V4SF:
10623 if (!EXT_REX_SSE_REG_P (operands[0]))
10624 return "%vxorps\t%0, %d0";
10625 /* FALLTHRU */
10626 case MODE_V16SF:
10627 case MODE_V8SF:
10628 if (!EXT_REX_SSE_REG_P (operands[0]))
10629 return "vxorps\t%x0, %x0, %x0";
10630 else if (TARGET_AVX512DQ)
10631 return (TARGET_AVX512VL
10632 ? "vxorps\t%x0, %x0, %x0"
10633 : "vxorps\t%g0, %g0, %g0");
10634 else
10635 return (TARGET_AVX512VL
10636 ? "vpxord\t%x0, %x0, %x0"
10637 : "vpxord\t%g0, %g0, %g0");
10639 default:
10640 gcc_unreachable ();
10643 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10645 enum attr_mode insn_mode = get_attr_mode (insn);
10647 switch (insn_mode)
10649 case MODE_XI:
10650 case MODE_V8DF:
10651 case MODE_V16SF:
10652 gcc_assert (TARGET_AVX512F);
10653 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10655 case MODE_OI:
10656 case MODE_V4DF:
10657 case MODE_V8SF:
10658 gcc_assert (TARGET_AVX2);
10659 /* FALLTHRU */
10660 case MODE_TI:
10661 case MODE_V2DF:
10662 case MODE_V4SF:
10663 gcc_assert (TARGET_SSE2);
10664 if (!EXT_REX_SSE_REG_P (operands[0]))
10665 return (TARGET_AVX
10666 ? "vpcmpeqd\t%0, %0, %0"
10667 : "pcmpeqd\t%0, %0");
10668 else if (TARGET_AVX512VL)
10669 return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
10670 else
10671 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10673 default:
10674 gcc_unreachable ();
10678 gcc_unreachable ();
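/* Example output, sketched for reference (not part of the original source):
   an all-zeros V4SF constant moved into an ordinary %xmm destination uses
   the "%vxorps\t%0, %d0" template above (xorps or vxorps), while an
   all-ones V4SI constant uses "pcmpeqd\t%0, %0" (or vpcmpeqd under AVX,
   and the vpternlogd forms for extended AVX-512 registers).  */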
10681 /* Returns true if INSN can be transformed from a memory load
10682 to a supported FP constant load. */
10684 bool
10685 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10687 rtx src = find_constant_src (insn);
10689 gcc_assert (REG_P (dst));
10691 if (src == NULL
10692 || (SSE_REGNO_P (REGNO (dst))
10693 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10694 || (STACK_REGNO_P (REGNO (dst))
10695 && standard_80387_constant_p (src) < 1))
10696 return false;
10698 return true;
10701 /* Returns true if OP contains a symbol reference */
10703 bool
10704 symbolic_reference_mentioned_p (rtx op)
10706 const char *fmt;
10707 int i;
10709 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10710 return true;
10712 fmt = GET_RTX_FORMAT (GET_CODE (op));
10713 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10715 if (fmt[i] == 'E')
10717 int j;
10719 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10720 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10721 return true;
10724 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10725 return true;
10728 return false;
10731 /* Return true if it is appropriate to emit `ret' instructions in the
10732 body of a function. Do this only if the epilogue is simple, needing a
10733 couple of insns. Prior to reloading, we can't tell how many registers
10734 must be saved, so return false then. Return false if there is no frame
10735 marker to de-allocate. */
10737 bool
10738 ix86_can_use_return_insn_p (void)
10740 if (ix86_function_naked (current_function_decl))
10741 return false;
10743 /* Don't use `ret' instruction in interrupt handler. */
10744 if (! reload_completed
10745 || frame_pointer_needed
10746 || cfun->machine->func_type != TYPE_NORMAL)
10747 return 0;
10749 /* Don't allow more than 32k pop, since that's all we can do
10750 with one instruction. */
10751 if (crtl->args.pops_args && crtl->args.size >= 32768)
10752 return 0;
10754 struct ix86_frame &frame = cfun->machine->frame;
10755 return (frame.stack_pointer_offset == UNITS_PER_WORD
10756 && (frame.nregs + frame.nsseregs) == 0);
10759 /* Value should be nonzero if functions must have frame pointers.
10760 Zero means the frame pointer need not be set up (and parms may
10761 be accessed via the stack pointer) in functions that seem suitable. */
10763 static bool
10764 ix86_frame_pointer_required (void)
10766 /* If we accessed previous frames, then the generated code expects
10767 to be able to access the saved ebp value in our frame. */
10768 if (cfun->machine->accesses_prev_frame)
10769 return true;
10771 /* Several x86 OSes need a frame pointer for other reasons,
10772 usually pertaining to setjmp. */
10773 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10774 return true;
10776 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
10777 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10778 return true;
10780 /* Win64 SEH: very large frames need a frame pointer, as the maximum
10781 stack allocation is 4GB. */
10782 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10783 return true;
10785 /* SSE saves require a frame pointer when the stack is misaligned. */
10786 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10787 return true;
10789 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10790 turns off the frame pointer by default. Turn it back on now if
10791 we've not got a leaf function. */
10792 if (TARGET_OMIT_LEAF_FRAME_POINTER
10793 && (!crtl->is_leaf
10794 || ix86_current_function_calls_tls_descriptor))
10795 return true;
10797 if (crtl->profile && !flag_fentry)
10798 return true;
10800 return false;
10803 /* Record that the current function accesses previous call frames. */
10805 void
10806 ix86_setup_frame_addresses (void)
10808 cfun->machine->accesses_prev_frame = 1;
10811 #ifndef USE_HIDDEN_LINKONCE
10812 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10813 # define USE_HIDDEN_LINKONCE 1
10814 # else
10815 # define USE_HIDDEN_LINKONCE 0
10816 # endif
10817 #endif
10819 /* Label count for call and return thunks. It is used to make unique
10820 labels in call and return thunks. */
10821 static int indirectlabelno;
10823 /* True if call thunk function is needed. */
10824 static bool indirect_thunk_needed = false;
10825 /* True if call thunk function with the BND prefix is needed. */
10826 static bool indirect_thunk_bnd_needed = false;
10828 /* Bit mask of integer registers which contain the branch target, used
10829 by call thunk functions. */
10830 static int indirect_thunks_used;
10831 /* Bit mask of integer registers which contain the branch target, used
10832 by call thunk functions with the BND prefix. */
10833 static int indirect_thunks_bnd_used;
10835 /* True if return thunk function is needed. */
10836 static bool indirect_return_needed = false;
10837 /* True if return thunk function with the BND prefix is needed. */
10838 static bool indirect_return_bnd_needed = false;
10840 /* True if return thunk function via CX is needed. */
10841 static bool indirect_return_via_cx;
10842 /* True if return thunk function via CX with the BND prefix is
10843 needed. */
10844 static bool indirect_return_via_cx_bnd;
10846 #ifndef INDIRECT_LABEL
10847 # define INDIRECT_LABEL "LIND"
10848 #endif
10850 /* Indicate what prefix is needed for an indirect branch. */
10851 enum indirect_thunk_prefix
10853 indirect_thunk_prefix_none,
10854 indirect_thunk_prefix_bnd,
10855 indirect_thunk_prefix_nt
10858 /* Return the prefix needed for an indirect branch INSN. */
10860 enum indirect_thunk_prefix
10861 indirect_thunk_need_prefix (rtx_insn *insn)
10863 enum indirect_thunk_prefix need_prefix;
10864 if (ix86_bnd_prefixed_insn_p (insn))
10865 need_prefix = indirect_thunk_prefix_bnd;
10866 else if ((cfun->machine->indirect_branch_type
10867 == indirect_branch_thunk_extern)
10868 && ix86_notrack_prefixed_insn_p (insn))
10870 /* NOTRACK prefix is only used with external thunk so that it
10871 can be properly updated to support CET at run-time. */
10872 need_prefix = indirect_thunk_prefix_nt;
10874 else
10875 need_prefix = indirect_thunk_prefix_none;
10876 return need_prefix;
10879 /* Fills in the label name that should be used for the indirect thunk. */
10881 static void
10882 indirect_thunk_name (char name[32], unsigned int regno,
10883 enum indirect_thunk_prefix need_prefix,
10884 bool ret_p)
10886 if (regno != INVALID_REGNUM && regno != CX_REG && ret_p)
10887 gcc_unreachable ();
10889 if (USE_HIDDEN_LINKONCE)
10891 const char *prefix;
10893 if (need_prefix == indirect_thunk_prefix_bnd)
10894 prefix = "_bnd";
10895 else if (need_prefix == indirect_thunk_prefix_nt
10896 && regno != INVALID_REGNUM)
10898 /* The NOTRACK prefix is only used with an external thunk via a
10899 register, so that the prefix can be added to the indirect
10900 branch through that register to support CET at run time. */
10901 prefix = "_nt";
10903 else
10904 prefix = "";
10906 const char *ret = ret_p ? "return" : "indirect";
10908 if (regno != INVALID_REGNUM)
10910 const char *reg_prefix;
10911 if (LEGACY_INT_REGNO_P (regno))
10912 reg_prefix = TARGET_64BIT ? "r" : "e";
10913 else
10914 reg_prefix = "";
10915 sprintf (name, "__x86_%s_thunk%s_%s%s",
10916 ret, prefix, reg_prefix, reg_names[regno]);
10918 else
10919 sprintf (name, "__x86_%s_thunk%s", ret, prefix);
10921 else
10923 if (regno != INVALID_REGNUM)
10925 if (need_prefix == indirect_thunk_prefix_bnd)
10926 ASM_GENERATE_INTERNAL_LABEL (name, "LITBR", regno);
10927 else
10928 ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno);
10930 else
10932 if (ret_p)
10934 if (need_prefix == indirect_thunk_prefix_bnd)
10935 ASM_GENERATE_INTERNAL_LABEL (name, "LRTB", 0);
10936 else
10937 ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0);
10939 else
10941 if (need_prefix == indirect_thunk_prefix_bnd)
10942 ASM_GENERATE_INTERNAL_LABEL (name, "LITB", 0);
10943 else
10944 ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0);
10950 /* Output a call and return thunk for indirect branch.  If NEED_PREFIX is
10951 indirect_thunk_prefix_bnd, the BND prefix is needed.  If REGNO !=
10952 INVALID_REGNUM, the function address is in REGNO and the call and return thunk looks like:
10954 call L2
10955 L1:
10956 pause
10957 lfence
10958 jmp L1
10959 L2:
10960 mov %REG, (%sp)
10961 ret
10963 Otherwise, the function address is on the top of stack and the
10964 call and return thunk looks like:
10966 call L2
10967 L1:
10968 pause
10969 lfence
10970 jmp L1
10971 L2:
10972 lea WORD_SIZE(%sp), %sp
10973 ret
10976 static void
10977 output_indirect_thunk (enum indirect_thunk_prefix need_prefix,
10978 unsigned int regno)
10980 char indirectlabel1[32];
10981 char indirectlabel2[32];
10983 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL,
10984 indirectlabelno++);
10985 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL,
10986 indirectlabelno++);
10988 /* Call */
10989 if (need_prefix == indirect_thunk_prefix_bnd)
10990 fputs ("\tbnd call\t", asm_out_file);
10991 else
10992 fputs ("\tcall\t", asm_out_file);
10993 assemble_name_raw (asm_out_file, indirectlabel2);
10994 fputc ('\n', asm_out_file);
10996 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
10998 /* AMD and Intel CPUs each prefer a different instruction as a loop filler.
10999 Using both pause + lfence is a compromise solution. */
11000 fprintf (asm_out_file, "\tpause\n\tlfence\n");
11002 /* Jump. */
11003 fputs ("\tjmp\t", asm_out_file);
11004 assemble_name_raw (asm_out_file, indirectlabel1);
11005 fputc ('\n', asm_out_file);
11007 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
11009 if (regno != INVALID_REGNUM)
11011 /* MOV. */
11012 rtx xops[2];
11013 xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx);
11014 xops[1] = gen_rtx_REG (word_mode, regno);
11015 output_asm_insn ("mov\t{%1, %0|%0, %1}", xops);
11017 else
11019 /* LEA. */
11020 rtx xops[2];
11021 xops[0] = stack_pointer_rtx;
11022 xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
11023 output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops);
11026 if (need_prefix == indirect_thunk_prefix_bnd)
11027 fputs ("\tbnd ret\n", asm_out_file);
11028 else
11029 fputs ("\tret\n", asm_out_file);
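/* For reference, a sketch (not part of the original source) of the thunk
   body emitted above for a 64-bit target register %rax, shown inside the
   __x86_indirect_thunk_rax function produced below; exact label spelling
   depends on the target's internal-label syntax:

	__x86_indirect_thunk_rax:
		call	.LIND1
	.LIND0:
		pause
		lfence
		jmp	.LIND0
	.LIND1:
		mov	%rax, (%rsp)
		ret

   The stack-based variant replaces the mov with "lea 8(%rsp), %rsp", as in
   the comment above.  */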
11032 /* Output a function with a call and return thunk for indirect branch.
11033 If NEED_PREFIX is indirect_thunk_prefix_bnd, the BND prefix is needed.
11034 If REGNO != INVALID_REGNUM, the function address is in REGNO. Otherwise,
11035 the function address is on the top of the stack. The thunk is used for
11036 a function return if RET_P is true. */
11038 static void
11039 output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix,
11040 unsigned int regno, bool ret_p)
11042 char name[32];
11043 tree decl;
11045 /* Create __x86_indirect_thunk/__x86_indirect_thunk_bnd. */
11046 indirect_thunk_name (name, regno, need_prefix, ret_p);
11047 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11048 get_identifier (name),
11049 build_function_type_list (void_type_node, NULL_TREE));
11050 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11051 NULL_TREE, void_type_node);
11052 TREE_PUBLIC (decl) = 1;
11053 TREE_STATIC (decl) = 1;
11054 DECL_IGNORED_P (decl) = 1;
11056 #if TARGET_MACHO
11057 if (TARGET_MACHO)
11059 switch_to_section (darwin_sections[picbase_thunk_section]);
11060 fputs ("\t.weak_definition\t", asm_out_file);
11061 assemble_name (asm_out_file, name);
11062 fputs ("\n\t.private_extern\t", asm_out_file);
11063 assemble_name (asm_out_file, name);
11064 putc ('\n', asm_out_file);
11065 ASM_OUTPUT_LABEL (asm_out_file, name);
11066 DECL_WEAK (decl) = 1;
11068 else
11069 #endif
11070 if (USE_HIDDEN_LINKONCE)
11072 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11074 targetm.asm_out.unique_section (decl, 0);
11075 switch_to_section (get_named_section (decl, NULL, 0));
11077 targetm.asm_out.globalize_label (asm_out_file, name);
11078 fputs ("\t.hidden\t", asm_out_file);
11079 assemble_name (asm_out_file, name);
11080 putc ('\n', asm_out_file);
11081 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11083 else
11085 switch_to_section (text_section);
11086 ASM_OUTPUT_LABEL (asm_out_file, name);
11089 DECL_INITIAL (decl) = make_node (BLOCK);
11090 current_function_decl = decl;
11091 allocate_struct_function (decl, false);
11092 init_function_start (decl);
11093 /* We're about to hide the function body from callees of final_* by
11094 emitting it directly; tell them we're a thunk, if they care. */
11095 cfun->is_thunk = true;
11096 first_function_block_is_cold = false;
11097 /* Make sure unwind info is emitted for the thunk if needed. */
11098 final_start_function (emit_barrier (), asm_out_file, 1);
11100 output_indirect_thunk (need_prefix, regno);
11102 final_end_function ();
11103 init_insn_lengths ();
11104 free_after_compilation (cfun);
11105 set_cfun (NULL);
11106 current_function_decl = NULL;
11109 static int pic_labels_used;
11111 /* Fills in the label name that should be used for a pc thunk for
11112 the given register. */
11114 static void
11115 get_pc_thunk_name (char name[32], unsigned int regno)
11117 gcc_assert (!TARGET_64BIT);
11119 if (USE_HIDDEN_LINKONCE)
11120 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11121 else
11122 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
11126 /* This function generates code for -fpic that loads %ebx with
11127 the return address of the caller and then returns. */
11129 static void
11130 ix86_code_end (void)
11132 rtx xops[2];
11133 unsigned int regno;
11135 if (indirect_return_needed)
11136 output_indirect_thunk_function (indirect_thunk_prefix_none,
11137 INVALID_REGNUM, true);
11138 if (indirect_return_bnd_needed)
11139 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11140 INVALID_REGNUM, true);
11142 if (indirect_return_via_cx)
11143 output_indirect_thunk_function (indirect_thunk_prefix_none,
11144 CX_REG, true);
11145 if (indirect_return_via_cx_bnd)
11146 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11147 CX_REG, true);
11149 if (indirect_thunk_needed)
11150 output_indirect_thunk_function (indirect_thunk_prefix_none,
11151 INVALID_REGNUM, false);
11152 if (indirect_thunk_bnd_needed)
11153 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11154 INVALID_REGNUM, false);
11156 for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++)
11158 unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1;
11159 if ((indirect_thunks_used & (1 << i)))
11160 output_indirect_thunk_function (indirect_thunk_prefix_none,
11161 regno, false);
11163 if ((indirect_thunks_bnd_used & (1 << i)))
11164 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11165 regno, false);
11168 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
11170 char name[32];
11171 tree decl;
11173 if ((indirect_thunks_used & (1 << regno)))
11174 output_indirect_thunk_function (indirect_thunk_prefix_none,
11175 regno, false);
11177 if ((indirect_thunks_bnd_used & (1 << regno)))
11178 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11179 regno, false);
11181 if (!(pic_labels_used & (1 << regno)))
11182 continue;
11184 get_pc_thunk_name (name, regno);
11186 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11187 get_identifier (name),
11188 build_function_type_list (void_type_node, NULL_TREE));
11189 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11190 NULL_TREE, void_type_node);
11191 TREE_PUBLIC (decl) = 1;
11192 TREE_STATIC (decl) = 1;
11193 DECL_IGNORED_P (decl) = 1;
11195 #if TARGET_MACHO
11196 if (TARGET_MACHO)
11198 switch_to_section (darwin_sections[picbase_thunk_section]);
11199 fputs ("\t.weak_definition\t", asm_out_file);
11200 assemble_name (asm_out_file, name);
11201 fputs ("\n\t.private_extern\t", asm_out_file);
11202 assemble_name (asm_out_file, name);
11203 putc ('\n', asm_out_file);
11204 ASM_OUTPUT_LABEL (asm_out_file, name);
11205 DECL_WEAK (decl) = 1;
11207 else
11208 #endif
11209 if (USE_HIDDEN_LINKONCE)
11211 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11213 targetm.asm_out.unique_section (decl, 0);
11214 switch_to_section (get_named_section (decl, NULL, 0));
11216 targetm.asm_out.globalize_label (asm_out_file, name);
11217 fputs ("\t.hidden\t", asm_out_file);
11218 assemble_name (asm_out_file, name);
11219 putc ('\n', asm_out_file);
11220 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11222 else
11224 switch_to_section (text_section);
11225 ASM_OUTPUT_LABEL (asm_out_file, name);
11228 DECL_INITIAL (decl) = make_node (BLOCK);
11229 current_function_decl = decl;
11230 allocate_struct_function (decl, false);
11231 init_function_start (decl);
11232 /* We're about to hide the function body from callees of final_* by
11233 emitting it directly; tell them we're a thunk, if they care. */
11234 cfun->is_thunk = true;
11235 first_function_block_is_cold = false;
11236 /* Make sure unwind info is emitted for the thunk if needed. */
11237 final_start_function (emit_barrier (), asm_out_file, 1);
11239 /* Pad stack IP move with 4 instructions (two NOPs count
11240 as one instruction). */
11241 if (TARGET_PAD_SHORT_FUNCTION)
11243 int i = 8;
11245 while (i--)
11246 fputs ("\tnop\n", asm_out_file);
11249 xops[0] = gen_rtx_REG (Pmode, regno);
11250 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
11251 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
11252 output_asm_insn ("%!ret", NULL);
11253 final_end_function ();
11254 init_insn_lengths ();
11255 free_after_compilation (cfun);
11256 set_cfun (NULL);
11257 current_function_decl = NULL;
11260 if (flag_split_stack)
11261 file_end_indicate_split_stack ();
11264 /* Emit code for the SET_GOT patterns. */
11266 const char *
11267 output_set_got (rtx dest, rtx label)
11269 rtx xops[3];
11271 xops[0] = dest;
11273 if (TARGET_VXWORKS_RTP && flag_pic)
11275 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
11276 xops[2] = gen_rtx_MEM (Pmode,
11277 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
11278 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
11280 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
11281 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
11282 an unadorned address. */
11283 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
11284 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
11285 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
11286 return "";
11289 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
11291 if (flag_pic)
11293 char name[32];
11294 get_pc_thunk_name (name, REGNO (dest));
11295 pic_labels_used |= 1 << REGNO (dest);
11297 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
11298 xops[2] = gen_rtx_MEM (QImode, xops[2]);
11299 output_asm_insn ("%!call\t%X2", xops);
11301 #if TARGET_MACHO
11302 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
11303 This is what will be referenced by the Mach-O PIC subsystem. */
11304 if (machopic_should_output_picbase_label () || !label)
11305 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
11307 /* When we are restoring the pic base at the site of a nonlocal label,
11308 and we decided to emit the pic base above, we will still output a
11309 local label used for calculating the correction offset (even though
11310 the offset will be 0 in that case). */
11311 if (label)
11312 targetm.asm_out.internal_label (asm_out_file, "L",
11313 CODE_LABEL_NUMBER (label));
11314 #endif
11316 else
11318 if (TARGET_MACHO)
11319 /* We don't need a pic base, we're not producing pic. */
11320 gcc_unreachable ();
11322 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
11323 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
11324 targetm.asm_out.internal_label (asm_out_file, "L",
11325 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
11328 if (!TARGET_MACHO)
11329 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
11331 return "";
11334 /* Generate a "push" pattern for input ARG. */
11336 static rtx
11337 gen_push (rtx arg)
11339 struct machine_function *m = cfun->machine;
11341 if (m->fs.cfa_reg == stack_pointer_rtx)
11342 m->fs.cfa_offset += UNITS_PER_WORD;
11343 m->fs.sp_offset += UNITS_PER_WORD;
11345 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11346 arg = gen_rtx_REG (word_mode, REGNO (arg));
11348 return gen_rtx_SET (gen_rtx_MEM (word_mode,
11349 gen_rtx_PRE_DEC (Pmode,
11350 stack_pointer_rtx)),
11351 arg);
11354 /* Generate a "pop" pattern for input ARG. */
11356 static rtx
11357 gen_pop (rtx arg)
11359 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11360 arg = gen_rtx_REG (word_mode, REGNO (arg));
11362 return gen_rtx_SET (arg,
11363 gen_rtx_MEM (word_mode,
11364 gen_rtx_POST_INC (Pmode,
11365 stack_pointer_rtx)));
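/* Illustrative sketch of the RTL built by the two helpers above, assuming
   64-bit code where word_mode and Pmode are both DImode:

       gen_push (gen_rtx_REG (word_mode, BX_REG))
         => (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI bx))

       gen_pop (gen_rtx_REG (word_mode, BX_REG))
         => (set (reg:DI bx) (mem:DI (post_inc:DI (reg:DI sp))))

   Note that gen_push also advances the tracked sp_offset (and cfa_offset
   while the CFA is still the stack pointer) by UNITS_PER_WORD, whereas
   gen_pop leaves the frame-state bookkeeping to its callers.  */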
11368 /* Return >= 0 if there is an unused call-clobbered register available
11369 for the entire function. */
11371 static unsigned int
11372 ix86_select_alt_pic_regnum (void)
11374 if (ix86_use_pseudo_pic_reg ())
11375 return INVALID_REGNUM;
11377 if (crtl->is_leaf
11378 && !crtl->profile
11379 && !ix86_current_function_calls_tls_descriptor)
11381 int i, drap;
11382 /* Can't use the same register for both PIC and DRAP. */
11383 if (crtl->drap_reg)
11384 drap = REGNO (crtl->drap_reg);
11385 else
11386 drap = -1;
11387 for (i = 2; i >= 0; --i)
11388 if (i != drap && !df_regs_ever_live_p (i))
11389 return i;
11392 return INVALID_REGNUM;
11395 /* Return true if REGNO is used by the epilogue. */
11397 bool
11398 ix86_epilogue_uses (int regno)
11400 /* If there are no caller-saved registers, we preserve all registers,
11401 except for MMX and x87 registers which aren't supported when saving
11402 and restoring registers. Don't explicitly save SP register since
11403 it is always preserved. */
11404 return (epilogue_completed
11405 && cfun->machine->no_caller_saved_registers
11406 && !fixed_regs[regno]
11407 && !STACK_REGNO_P (regno)
11408 && !MMX_REGNO_P (regno));
11411 /* Return nonzero if register REGNO can be used as a scratch register
11412 in peephole2. */
11414 static bool
11415 ix86_hard_regno_scratch_ok (unsigned int regno)
11417 /* If there are no caller-saved registers, we can't use any register
11418 as a scratch register after epilogue and use REGNO as scratch
11419 register only if it has been used before to avoid saving and
11420 restoring it. */
11421 return (!cfun->machine->no_caller_saved_registers
11422 || (!epilogue_completed
11423 && df_regs_ever_live_p (regno)));
11426 /* Return true if register class CL should be an additional allocno
11427 class. */
11429 static bool
11430 ix86_additional_allocno_class_p (reg_class_t cl)
11432 return cl == MOD4_SSE_REGS;
11435 /* Return TRUE if we need to save REGNO. */
11437 static bool
11438 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
11440 /* If there are no caller-saved registers, we preserve all registers,
11441 except for MMX and x87 registers which aren't supported when saving
11442 and restoring registers. Don't explicitly save SP register since
11443 it is always preserved. */
11444 if (cfun->machine->no_caller_saved_registers)
11446 /* Don't preserve registers used for function return value. */
11447 rtx reg = crtl->return_rtx;
11448 if (reg)
11450 unsigned int i = REGNO (reg);
11451 unsigned int nregs = REG_NREGS (reg);
11452 while (nregs-- > 0)
11453 if ((i + nregs) == regno)
11454 return false;
11456 reg = crtl->return_bnd;
11457 if (reg)
11459 i = REGNO (reg);
11460 nregs = REG_NREGS (reg);
11461 while (nregs-- > 0)
11462 if ((i + nregs) == regno)
11463 return false;
11467 return (df_regs_ever_live_p (regno)
11468 && !fixed_regs[regno]
11469 && !STACK_REGNO_P (regno)
11470 && !MMX_REGNO_P (regno)
11471 && (regno != HARD_FRAME_POINTER_REGNUM
11472 || !frame_pointer_needed));
11475 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
11476 && pic_offset_table_rtx)
11478 if (ix86_use_pseudo_pic_reg ())
11480 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
11481 _mcount in prologue. */
11482 if (!TARGET_64BIT && flag_pic && crtl->profile)
11483 return true;
11485 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
11486 || crtl->profile
11487 || crtl->calls_eh_return
11488 || crtl->uses_const_pool
11489 || cfun->has_nonlocal_label)
11490 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
11493 if (crtl->calls_eh_return && maybe_eh_return)
11495 unsigned i;
11496 for (i = 0; ; i++)
11498 unsigned test = EH_RETURN_DATA_REGNO (i);
11499 if (test == INVALID_REGNUM)
11500 break;
11501 if (test == regno)
11502 return true;
11506 if (ignore_outlined && cfun->machine->call_ms2sysv)
11508 unsigned count = cfun->machine->call_ms2sysv_extra_regs
11509 + xlogue_layout::MIN_REGS;
11510 if (xlogue_layout::is_stub_managed_reg (regno, count))
11511 return false;
11514 if (crtl->drap_reg
11515 && regno == REGNO (crtl->drap_reg)
11516 && !cfun->machine->no_drap_save_restore)
11517 return true;
11519 return (df_regs_ever_live_p (regno)
11520 && !call_used_regs[regno]
11521 && !fixed_regs[regno]
11522 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
11525 /* Return the number of saved general purpose registers. */
11527 static int
11528 ix86_nsaved_regs (void)
11530 int nregs = 0;
11531 int regno;
11533 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11534 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11535 nregs ++;
11536 return nregs;
11539 /* Return number of saved SSE registers. */
11541 static int
11542 ix86_nsaved_sseregs (void)
11544 int nregs = 0;
11545 int regno;
11547 if (!TARGET_64BIT_MS_ABI)
11548 return 0;
11549 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11550 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11551 nregs ++;
11552 return nregs;
11555 /* Given FROM and TO register numbers, say whether this elimination is
11556 allowed. If stack alignment is needed, we can only replace argument
11557 pointer with hard frame pointer, or replace frame pointer with stack
11558 pointer. Otherwise, frame pointer elimination is automatically
11559 handled and all other eliminations are valid. */
11561 static bool
11562 ix86_can_eliminate (const int from, const int to)
11564 if (stack_realign_fp)
11565 return ((from == ARG_POINTER_REGNUM
11566 && to == HARD_FRAME_POINTER_REGNUM)
11567 || (from == FRAME_POINTER_REGNUM
11568 && to == STACK_POINTER_REGNUM));
11569 else
11570 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
11573 /* Return the offset between two registers, one to be eliminated, and the other
11574 its replacement, at the start of a routine. */
11576 HOST_WIDE_INT
11577 ix86_initial_elimination_offset (int from, int to)
11579 struct ix86_frame &frame = cfun->machine->frame;
11581 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11582 return frame.hard_frame_pointer_offset;
11583 else if (from == FRAME_POINTER_REGNUM
11584 && to == HARD_FRAME_POINTER_REGNUM)
11585 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11586 else
11588 gcc_assert (to == STACK_POINTER_REGNUM);
11590 if (from == ARG_POINTER_REGNUM)
11591 return frame.stack_pointer_offset;
11593 gcc_assert (from == FRAME_POINTER_REGNUM);
11594 return frame.stack_pointer_offset - frame.frame_pointer_offset;
11598 /* In a dynamically-aligned function, we can't know the offset from
11599 stack pointer to frame pointer, so we must ensure that setjmp
11600 eliminates fp against the hard fp (%ebp) rather than trying to
11601 index from %esp up to the top of the frame across a gap that is
11602 of unknown (at compile-time) size. */
11603 static rtx
11604 ix86_builtin_setjmp_frame_value (void)
11606 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11609 /* Emit a one-time warning for unsupported ms_abi to sysv prologues/epilogues. */
11610 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11612 static bool warned_once = false;
11613 if (!warned_once)
11615 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11616 feature);
11617 warned_once = true;
11621 /* Return the probing interval for -fstack-clash-protection. */
11623 static HOST_WIDE_INT
11624 get_probe_interval (void)
11626 if (flag_stack_clash_protection)
11627 return (HOST_WIDE_INT_1U
11628 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
11629 else
11630 return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
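/* Illustrative arithmetic, assuming the usual default of 12 for both the
   stack-clash-protection-probe-interval parameter and
   STACK_CHECK_PROBE_INTERVAL_EXP: either branch above then yields
   1 << 12 == 4096 bytes, i.e. one probe per typical 4 KiB page.  */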
11633 /* When using -fsplit-stack, the allocation routines set a field in
11634 the TCB to the bottom of the stack plus this much space, measured
11635 in bytes. */
11637 #define SPLIT_STACK_AVAILABLE 256
11639 /* Fill the ix86_frame structure with the frame layout of the current function. */
11641 static void
11642 ix86_compute_frame_layout (void)
11644 struct ix86_frame *frame = &cfun->machine->frame;
11645 struct machine_function *m = cfun->machine;
11646 unsigned HOST_WIDE_INT stack_alignment_needed;
11647 HOST_WIDE_INT offset;
11648 unsigned HOST_WIDE_INT preferred_alignment;
11649 HOST_WIDE_INT size = get_frame_size ();
11650 HOST_WIDE_INT to_allocate;
11652 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11653 * ms_abi functions that call a sysv function. We now need to prune away
11654 * cases where it should be disabled. */
11655 if (TARGET_64BIT && m->call_ms2sysv)
11657 gcc_assert (TARGET_64BIT_MS_ABI);
11658 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11659 gcc_assert (!TARGET_SEH);
11660 gcc_assert (TARGET_SSE);
11661 gcc_assert (!ix86_using_red_zone ());
11663 if (crtl->calls_eh_return)
11665 gcc_assert (!reload_completed);
11666 m->call_ms2sysv = false;
11667 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11670 else if (ix86_static_chain_on_stack)
11672 gcc_assert (!reload_completed);
11673 m->call_ms2sysv = false;
11674 warn_once_call_ms2sysv_xlogues ("static call chains");
11677 /* Finally, compute which registers the stub will manage. */
11678 else
11680 unsigned count = xlogue_layout::count_stub_managed_regs ();
11681 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11682 m->call_ms2sysv_pad_in = 0;
11686 frame->nregs = ix86_nsaved_regs ();
11687 frame->nsseregs = ix86_nsaved_sseregs ();
11689 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
11690 except for function prologues, leaf functions, and when the default
11691 incoming stack boundary is overridden on the command line or via the
11692 force_align_arg_pointer attribute. */
11693 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11694 && (!crtl->is_leaf || cfun->calls_alloca != 0
11695 || ix86_current_function_calls_tls_descriptor
11696 || ix86_incoming_stack_boundary < 128))
11698 crtl->preferred_stack_boundary = 128;
11699 crtl->stack_alignment_needed = 128;
11702 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11703 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11705 gcc_assert (!size || stack_alignment_needed);
11706 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11707 gcc_assert (preferred_alignment <= stack_alignment_needed);
11709 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11710 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11711 if (TARGET_64BIT && m->call_ms2sysv)
11713 gcc_assert (stack_alignment_needed >= 16);
11714 gcc_assert (!frame->nsseregs);
11717 /* For SEH we have to limit the amount of code movement into the prologue.
11718 At present we do this via a BLOCKAGE, at which point there's very little
11719 scheduling that can be done, which means that there's very little point
11720 in doing anything except PUSHs. */
11721 if (TARGET_SEH)
11722 m->use_fast_prologue_epilogue = false;
11723 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11725 int count = frame->nregs;
11726 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11728 /* The fast prologue uses move instead of push to save registers. This
11729 is significantly longer, but also executes faster as modern hardware
11730 can execute the moves in parallel, but can't do that for push/pop.
11732 Be careful about choosing which prologue to emit: when the function takes
11733 many instructions to execute, we may as well use the slow version, and
11734 likewise when the function is known to be outside a hot spot (this is
11735 known with profile feedback only). Weight the size of the function by the
11736 number of registers to save, as it is cheap to use one or two push
11737 instructions but very slow to use many of them. */
11738 if (count)
11739 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11740 if (node->frequency < NODE_FREQUENCY_NORMAL
11741 || (flag_branch_probabilities
11742 && node->frequency < NODE_FREQUENCY_HOT))
11743 m->use_fast_prologue_epilogue = false;
11744 else
11745 m->use_fast_prologue_epilogue
11746 = !expensive_function_p (count);
11749 frame->save_regs_using_mov
11750 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11751 /* If static stack checking is enabled and done with probes,
11752 the registers need to be saved before allocating the frame. */
11753 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11755 /* Skip return address and error code in exception handler. */
11756 offset = INCOMING_FRAME_SP_OFFSET;
11758 /* Skip pushed static chain. */
11759 if (ix86_static_chain_on_stack)
11760 offset += UNITS_PER_WORD;
11762 /* Skip saved base pointer. */
11763 if (frame_pointer_needed)
11764 offset += UNITS_PER_WORD;
11765 frame->hfp_save_offset = offset;
11767 /* The traditional frame pointer location is at the top of the frame. */
11768 frame->hard_frame_pointer_offset = offset;
11770 /* Register save area */
11771 offset += frame->nregs * UNITS_PER_WORD;
11772 frame->reg_save_offset = offset;
11774 /* On SEH target, registers are pushed just before the frame pointer
11775 location. */
11776 if (TARGET_SEH)
11777 frame->hard_frame_pointer_offset = offset;
11779 /* Calculate the size of the va-arg area (not including padding, if any). */
11780 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11782 /* Also adjust stack_realign_offset for the largest alignment of
11783 stack slot actually used. */
11784 if (stack_realign_fp
11785 || (cfun->machine->max_used_stack_alignment != 0
11786 && (offset % cfun->machine->max_used_stack_alignment) != 0))
11788 /* We may need a 16-byte aligned stack for the remainder of the
11789 register save area, but the stack frame for the local function
11790 may require a greater alignment if using AVX, AVX2 or AVX-512. In order
11791 to avoid wasting space, we first calculate the space needed for
11792 the rest of the register saves, add that to the stack pointer,
11793 and then realign the stack to the boundary of the start of the
11794 frame for the local function. */
11795 HOST_WIDE_INT space_needed = 0;
11796 HOST_WIDE_INT sse_reg_space_needed = 0;
11798 if (TARGET_64BIT)
11800 if (m->call_ms2sysv)
11802 m->call_ms2sysv_pad_in = 0;
11803 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11806 else if (frame->nsseregs)
11807 /* The only ABI that has saved SSE registers (Win64) also has a
11808 16-byte aligned default stack. However, many programs violate
11809 the ABI, and Wine64 forces stack realignment to compensate. */
11810 space_needed = frame->nsseregs * 16;
11812 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11814 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11815 rounding to be pedantic. */
11816 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11818 else
11819 space_needed = frame->va_arg_size;
11821 /* Record the allocation size required prior to the realignment AND. */
11822 frame->stack_realign_allocate = space_needed;
11824 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11825 before this point are not directly comparable with values below
11826 this point. Use sp_valid_at to determine if the stack pointer is
11827 valid for a given offset, fp_valid_at for the frame pointer, or
11828 choose_baseaddr to have a base register chosen for you.
11830 Note that the result of (frame->stack_realign_offset
11831 & (stack_alignment_needed - 1)) may not equal zero. */
11832 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11833 frame->stack_realign_offset = offset - space_needed;
11834 frame->sse_reg_save_offset = frame->stack_realign_offset
11835 + sse_reg_space_needed;
11837 else
11839 frame->stack_realign_offset = offset;
11841 if (TARGET_64BIT && m->call_ms2sysv)
11843 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11844 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11847 /* Align and set SSE register save area. */
11848 else if (frame->nsseregs)
11850 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11851 required and the DRAP re-alignment boundary is at least 16 bytes,
11852 then we want the SSE register save area properly aligned. */
11853 if (ix86_incoming_stack_boundary >= 128
11854 || (stack_realign_drap && stack_alignment_needed >= 16))
11855 offset = ROUND_UP (offset, 16);
11856 offset += frame->nsseregs * 16;
11858 frame->sse_reg_save_offset = offset;
11859 offset += frame->va_arg_size;
11862 /* Align the start of the frame for the local function. When a function
11863 call is removed, it may become a leaf function. But if arguments may
11864 be passed on the stack, we still need to align the stack when there is
11865 no tail call. */
11866 if (m->call_ms2sysv
11867 || frame->va_arg_size != 0
11868 || size != 0
11869 || !crtl->is_leaf
11870 || (!crtl->tail_call_emit
11871 && cfun->machine->outgoing_args_on_stack)
11872 || cfun->calls_alloca
11873 || ix86_current_function_calls_tls_descriptor)
11874 offset = ROUND_UP (offset, stack_alignment_needed);
11876 /* Frame pointer points here. */
11877 frame->frame_pointer_offset = offset;
11879 offset += size;
11881 /* Add the outgoing arguments area. It can be skipped if we eliminated
11882 all the function calls as dead code.
11883 Skipping is however impossible when the function calls alloca: the
11884 alloca expander assumes that the last crtl->outgoing_args_size bytes
11885 of the stack frame are unused. */
11886 if (ACCUMULATE_OUTGOING_ARGS
11887 && (!crtl->is_leaf || cfun->calls_alloca
11888 || ix86_current_function_calls_tls_descriptor))
11890 offset += crtl->outgoing_args_size;
11891 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11893 else
11894 frame->outgoing_arguments_size = 0;
11896 /* Align stack boundary. Only needed if we're calling another function
11897 or using alloca. */
11898 if (!crtl->is_leaf || cfun->calls_alloca
11899 || ix86_current_function_calls_tls_descriptor)
11900 offset = ROUND_UP (offset, preferred_alignment);
11902 /* We've reached end of stack frame. */
11903 frame->stack_pointer_offset = offset;
11905 /* Size prologue needs to allocate. */
11906 to_allocate = offset - frame->sse_reg_save_offset;
11908 if ((!to_allocate && frame->nregs <= 1)
11909 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
11910 /* If stack clash probing needs a loop, then it needs a
11911 scratch register. But the returned register is only guaranteed
11912 to be safe to use after register saves are complete. So if
11913 stack clash protections are enabled and the allocated frame is
11914 larger than the probe interval, then use pushes to save
11915 callee saved registers. */
11916 || (flag_stack_clash_protection && to_allocate > get_probe_interval ()))
11917 frame->save_regs_using_mov = false;
11919 if (ix86_using_red_zone ()
11920 && crtl->sp_is_unchanging
11921 && crtl->is_leaf
11922 && !ix86_pc_thunk_call_expanded
11923 && !ix86_current_function_calls_tls_descriptor)
11925 frame->red_zone_size = to_allocate;
11926 if (frame->save_regs_using_mov)
11927 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11928 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11929 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11931 else
11932 frame->red_zone_size = 0;
11933 frame->stack_pointer_offset -= frame->red_zone_size;
11935 /* The SEH frame pointer location is near the bottom of the frame.
11936 This is enforced by the fact that the difference between the
11937 stack pointer and the frame pointer is limited to 240 bytes in
11938 the unwind data structure. */
11939 if (TARGET_SEH)
11941 HOST_WIDE_INT diff;
11943 /* If we can leave the frame pointer where it is, do so. Also, returns
11944 the establisher frame for __builtin_frame_address (0). */
11945 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11946 if (diff <= SEH_MAX_FRAME_SIZE
11947 && (diff > 240 || (diff & 15) != 0)
11948 && !crtl->accesses_prior_frames)
11950 /* Ideally we'd determine what portion of the local stack frame
11951 (within the constraint of the lowest 240) is most heavily used.
11952 But without that complication, simply bias the frame pointer
11953 by 128 bytes so as to maximize the amount of the local stack
11954 frame that is addressable with 8-bit offsets. */
11955 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
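/* A rough, illustrative summary of the layout just computed, going from the
   CFA towards lower addresses: the return address (plus the error code for
   exception handlers), the pushed static chain, the saved frame pointer
   (hfp_save_offset / hard_frame_pointer_offset), the general register save
   area (reg_save_offset), any realignment padding and the SSE register save
   area (sse_reg_save_offset), the va_arg save area, the local frame starting
   at frame_pointer_offset, then locals and the outgoing argument area ending
   at stack_pointer_offset, from which a leaf function's red zone may have
   been subtracted.  The SEH and stack-realignment cases adjust the frame
   pointer placement as described above.  */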
11960 /* This is semi-inlined memory_address_length, but simplified
11961 since we know that we're always dealing with reg+offset, and
11962 to avoid having to create and discard all that rtl. */
11964 static inline int
11965 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11967 int len = 4;
11969 if (offset == 0)
11971 /* EBP and R13 cannot be encoded without an offset. */
11972 len = (regno == BP_REG || regno == R13_REG);
11974 else if (IN_RANGE (offset, -128, 127))
11975 len = 1;
11977 /* ESP and R12 must be encoded with a SIB byte. */
11978 if (regno == SP_REG || regno == R12_REG)
11979 len++;
11981 return len;
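/* Worked examples of the length computation above (illustrative only):
     (EAX, 0)     -> 0   no displacement byte needed
     (EBP, 0)     -> 1   %ebp/%r13 always need at least a disp8
     (EAX, 8)     -> 1   disp8
     (ESP, 8)     -> 2   disp8 plus the mandatory SIB byte
     (EAX, 1024)  -> 4   disp32
     (ESP, 1024)  -> 5   disp32 plus SIB  */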
11984 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11985 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11987 static bool
11988 sp_valid_at (HOST_WIDE_INT cfa_offset)
11990 const struct machine_frame_state &fs = cfun->machine->fs;
11991 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11993 /* Validate that the cfa_offset isn't in a "no-man's land". */
11994 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11995 return false;
11997 return fs.sp_valid;
12000 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
12001 the frame save area. The register is saved at CFA - CFA_OFFSET. */
12003 static inline bool
12004 fp_valid_at (HOST_WIDE_INT cfa_offset)
12006 const struct machine_frame_state &fs = cfun->machine->fs;
12007 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
12009 /* Validate that the cfa_offset isn't in a "no-man's land". */
12010 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
12011 return false;
12013 return fs.fp_valid;
12016 /* Choose a base register based upon alignment requested, speed and/or
12017 size. */
12019 static void
12020 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
12021 HOST_WIDE_INT &base_offset,
12022 unsigned int align_reqested, unsigned int *align)
12024 const struct machine_function *m = cfun->machine;
12025 unsigned int hfp_align;
12026 unsigned int drap_align;
12027 unsigned int sp_align;
12028 bool hfp_ok = fp_valid_at (cfa_offset);
12029 bool drap_ok = m->fs.drap_valid;
12030 bool sp_ok = sp_valid_at (cfa_offset);
12032 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
12034 /* Filter out any registers that don't meet the requested alignment
12035 criteria. */
12036 if (align_reqested)
12038 if (m->fs.realigned)
12039 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
12040 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
12041 notes (which we would need in order to use a realigned stack pointer),
12042 so disable this on SEH targets. */
12043 else if (m->fs.sp_realigned)
12044 sp_align = crtl->stack_alignment_needed;
12046 hfp_ok = hfp_ok && hfp_align >= align_reqested;
12047 drap_ok = drap_ok && drap_align >= align_reqested;
12048 sp_ok = sp_ok && sp_align >= align_reqested;
12051 if (m->use_fast_prologue_epilogue)
12053 /* Choose the base register most likely to allow the most scheduling
12054 opportunities. Generally FP is valid throughout the function,
12055 while DRAP must be reloaded within the epilogue. But choose either
12056 over the SP due to increased encoding size. */
12058 if (hfp_ok)
12060 base_reg = hard_frame_pointer_rtx;
12061 base_offset = m->fs.fp_offset - cfa_offset;
12063 else if (drap_ok)
12065 base_reg = crtl->drap_reg;
12066 base_offset = 0 - cfa_offset;
12068 else if (sp_ok)
12070 base_reg = stack_pointer_rtx;
12071 base_offset = m->fs.sp_offset - cfa_offset;
12074 else
12076 HOST_WIDE_INT toffset;
12077 int len = 16, tlen;
12079 /* Choose the base register with the smallest address encoding.
12080 With a tie, choose FP > DRAP > SP. */
12081 if (sp_ok)
12083 base_reg = stack_pointer_rtx;
12084 base_offset = m->fs.sp_offset - cfa_offset;
12085 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
12087 if (drap_ok)
12089 toffset = 0 - cfa_offset;
12090 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
12091 if (tlen <= len)
12093 base_reg = crtl->drap_reg;
12094 base_offset = toffset;
12095 len = tlen;
12098 if (hfp_ok)
12100 toffset = m->fs.fp_offset - cfa_offset;
12101 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
12102 if (tlen <= len)
12104 base_reg = hard_frame_pointer_rtx;
12105 base_offset = toffset;
12106 len = tlen;
12111 /* Set the align return value. */
12112 if (align)
12114 if (base_reg == stack_pointer_rtx)
12115 *align = sp_align;
12116 else if (base_reg == crtl->drap_reg)
12117 *align = drap_align;
12118 else if (base_reg == hard_frame_pointer_rtx)
12119 *align = hfp_align;
12123 /* Return an RTX that points to CFA_OFFSET within the stack frame and
12124 the alignment of that address. If ALIGN is non-null, it should point to
12125 an alignment value (in bits) that is preferred or zero and will
12126 receive the alignment of the base register that was selected,
12127 irrespective of whether or not CFA_OFFSET is a multiple of that
12128 alignment value. If it is possible for the base register offset to be
12129 non-immediate then SCRATCH_REGNO should specify a scratch register to
12130 use.
12132 The valid base registers are taken from CFUN->MACHINE->FS. */
12134 static rtx
12135 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
12136 unsigned int scratch_regno = INVALID_REGNUM)
12138 rtx base_reg = NULL;
12139 HOST_WIDE_INT base_offset = 0;
12141 /* If a specific alignment is requested, try to get a base register
12142 with that alignment first. */
12143 if (align && *align)
12144 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
12146 if (!base_reg)
12147 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
12149 gcc_assert (base_reg != NULL);
12151 rtx base_offset_rtx = GEN_INT (base_offset);
12153 if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
12155 gcc_assert (scratch_regno != INVALID_REGNUM);
12157 rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
12158 emit_move_insn (scratch_reg, base_offset_rtx);
12160 return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
12163 return plus_constant (Pmode, base_reg, base_offset);
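/* Usage sketch: callers such as ix86_emit_save_reg_using_mov below do

       unsigned int align = GET_MODE_ALIGNMENT (mode);
       rtx addr = choose_baseaddr (cfa_offset, &align);

   and wrap the returned address in a frame mem.  SCRATCH_REGNO is only
   needed when the chosen base offset cannot be encoded as an immediate
   (i.e. for very large frames); otherwise a simple
   (plus base_reg (const_int base_offset)) is returned.  */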
12166 /* Emit code to save registers in the prologue. */
12168 static void
12169 ix86_emit_save_regs (void)
12171 unsigned int regno;
12172 rtx_insn *insn;
12174 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12175 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12177 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12178 RTX_FRAME_RELATED_P (insn) = 1;
12182 /* Emit a single register save at CFA - CFA_OFFSET. */
12184 static void
12185 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12186 HOST_WIDE_INT cfa_offset)
12188 struct machine_function *m = cfun->machine;
12189 rtx reg = gen_rtx_REG (mode, regno);
12190 rtx mem, addr, base, insn;
12191 unsigned int align = GET_MODE_ALIGNMENT (mode);
12193 addr = choose_baseaddr (cfa_offset, &align);
12194 mem = gen_frame_mem (mode, addr);
12196 /* The location alignment depends upon the base register. */
12197 align = MIN (GET_MODE_ALIGNMENT (mode), align);
12198 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
12199 set_mem_align (mem, align);
12201 insn = emit_insn (gen_rtx_SET (mem, reg));
12202 RTX_FRAME_RELATED_P (insn) = 1;
12204 base = addr;
12205 if (GET_CODE (base) == PLUS)
12206 base = XEXP (base, 0);
12207 gcc_checking_assert (REG_P (base));
12209 /* When saving registers into a re-aligned local stack frame, avoid
12210 any tricky guessing by dwarf2out. */
12211 if (m->fs.realigned)
12213 gcc_checking_assert (stack_realign_drap);
12215 if (regno == REGNO (crtl->drap_reg))
12217 /* A bit of a hack. We force the DRAP register to be saved in
12218 the re-aligned stack frame, which provides us with a copy
12219 of the CFA that will last past the prologue. Install it. */
12220 gcc_checking_assert (cfun->machine->fs.fp_valid);
12221 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12222 cfun->machine->fs.fp_offset - cfa_offset);
12223 mem = gen_rtx_MEM (mode, addr);
12224 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12226 else
12228 /* The frame pointer is a stable reference within the
12229 aligned frame. Use it. */
12230 gcc_checking_assert (cfun->machine->fs.fp_valid);
12231 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12232 cfun->machine->fs.fp_offset - cfa_offset);
12233 mem = gen_rtx_MEM (mode, addr);
12234 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12238 else if (base == stack_pointer_rtx && m->fs.sp_realigned
12239 && cfa_offset >= m->fs.sp_realigned_offset)
12241 gcc_checking_assert (stack_realign_fp);
12242 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12245 /* The memory may not be relative to the current CFA register,
12246 which means that we may need to generate a new pattern for
12247 use by the unwind info. */
12248 else if (base != m->fs.cfa_reg)
12250 addr = plus_constant (Pmode, m->fs.cfa_reg,
12251 m->fs.cfa_offset - cfa_offset);
12252 mem = gen_rtx_MEM (mode, addr);
12253 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12257 /* Emit code to save registers using MOV insns.
12258 First register is stored at CFA - CFA_OFFSET. */
12259 static void
12260 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12262 unsigned int regno;
12264 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12265 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12267 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12268 cfa_offset -= UNITS_PER_WORD;
12272 /* Emit code to save SSE registers using MOV insns.
12273 First register is stored at CFA - CFA_OFFSET. */
12274 static void
12275 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12277 unsigned int regno;
12279 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12280 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12282 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12283 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12287 static GTY(()) rtx queued_cfa_restores;
12289 /* Add a REG_CFA_RESTORE note for REG to INSN, or queue it until the next
12290 stack manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
12291 Don't add the note if the previously saved value will be left untouched
12292 within the stack red zone until return, as unwinders can find the same
12293 value in the register and on the stack. */
12295 static void
12296 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12298 if (!crtl->shrink_wrapped
12299 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12300 return;
12302 if (insn)
12304 add_reg_note (insn, REG_CFA_RESTORE, reg);
12305 RTX_FRAME_RELATED_P (insn) = 1;
12307 else
12308 queued_cfa_restores
12309 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12312 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12314 static void
12315 ix86_add_queued_cfa_restore_notes (rtx insn)
12317 rtx last;
12318 if (!queued_cfa_restores)
12319 return;
12320 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12322 XEXP (last, 1) = REG_NOTES (insn);
12323 REG_NOTES (insn) = queued_cfa_restores;
12324 queued_cfa_restores = NULL_RTX;
12325 RTX_FRAME_RELATED_P (insn) = 1;
12328 /* Expand prologue or epilogue stack adjustment.
12329 The pattern exists to put a dependency on all ebp-based memory accesses.
12330 STYLE should be negative if instructions should be marked as frame related,
12331 zero if %r11 register is live and cannot be freely used and positive
12332 otherwise. */
12334 static rtx
12335 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12336 int style, bool set_cfa)
12338 struct machine_function *m = cfun->machine;
12339 rtx insn;
12340 bool add_frame_related_expr = false;
12342 if (Pmode == SImode)
12343 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12344 else if (x86_64_immediate_operand (offset, DImode))
12345 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12346 else
12348 rtx tmp;
12349 /* r11 is used by indirect sibcall return as well, set before the
12350 epilogue and used after the epilogue. */
12351 if (style)
12352 tmp = gen_rtx_REG (DImode, R11_REG);
12353 else
12355 gcc_assert (src != hard_frame_pointer_rtx
12356 && dest != hard_frame_pointer_rtx);
12357 tmp = hard_frame_pointer_rtx;
12359 insn = emit_insn (gen_rtx_SET (tmp, offset));
12360 if (style < 0)
12361 add_frame_related_expr = true;
12363 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12366 insn = emit_insn (insn);
12367 if (style >= 0)
12368 ix86_add_queued_cfa_restore_notes (insn);
12370 if (set_cfa)
12372 rtx r;
12374 gcc_assert (m->fs.cfa_reg == src);
12375 m->fs.cfa_offset += INTVAL (offset);
12376 m->fs.cfa_reg = dest;
12378 r = gen_rtx_PLUS (Pmode, src, offset);
12379 r = gen_rtx_SET (dest, r);
12380 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12381 RTX_FRAME_RELATED_P (insn) = 1;
12383 else if (style < 0)
12385 RTX_FRAME_RELATED_P (insn) = 1;
12386 if (add_frame_related_expr)
12388 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12389 r = gen_rtx_SET (dest, r);
12390 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12394 if (dest == stack_pointer_rtx)
12396 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12397 bool valid = m->fs.sp_valid;
12398 bool realigned = m->fs.sp_realigned;
12400 if (src == hard_frame_pointer_rtx)
12402 valid = m->fs.fp_valid;
12403 realigned = false;
12404 ooffset = m->fs.fp_offset;
12406 else if (src == crtl->drap_reg)
12408 valid = m->fs.drap_valid;
12409 realigned = false;
12410 ooffset = 0;
12412 else
12414 /* Else there are two possibilities: SP itself, which we set
12415 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
12416 taken care of by hand along the eh_return path. */
12417 gcc_checking_assert (src == stack_pointer_rtx
12418 || offset == const0_rtx);
12421 m->fs.sp_offset = ooffset - INTVAL (offset);
12422 m->fs.sp_valid = valid;
12423 m->fs.sp_realigned = realigned;
12425 return insn;
12428 /* Find an available register to be used as the dynamic realign argument
12429 pointer register. Such a register will be written in the prologue and
12430 used at the beginning of the body, so it must not be
12431 1. parameter passing register.
12432 2. GOT pointer.
12433 We reuse static-chain register if it is available. Otherwise, we
12434 use DI for i386 and R13 for x86-64. We chose R13 since it has
12435 shorter encoding.
12437 Return: the regno of chosen register. */
12439 static unsigned int
12440 find_drap_reg (void)
12442 tree decl = cfun->decl;
12444 /* Always use callee-saved register if there are no caller-saved
12445 registers. */
12446 if (TARGET_64BIT)
12448 /* Use R13 for a nested function or a function that needs a static
12449 chain. Since a function with a tail call may use any caller-saved
12450 register in its epilogue, DRAP must not use a caller-saved
12451 register in such a case. */
12452 if (DECL_STATIC_CHAIN (decl)
12453 || cfun->machine->no_caller_saved_registers
12454 || crtl->tail_call_emit)
12455 return R13_REG;
12457 return R10_REG;
12459 else
12461 /* Use DI for a nested function or a function that needs a static
12462 chain. Since a function with a tail call may use any caller-saved
12463 register in its epilogue, DRAP must not use a caller-saved
12464 register in such a case. */
12465 if (DECL_STATIC_CHAIN (decl)
12466 || cfun->machine->no_caller_saved_registers
12467 || crtl->tail_call_emit)
12468 return DI_REG;
12470 /* Reuse static chain register if it isn't used for parameter
12471 passing. */
12472 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
12474 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
12475 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
12476 return CX_REG;
12478 return DI_REG;
12482 /* Handle a "force_align_arg_pointer" attribute. */
12484 static tree
12485 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
12486 tree, int, bool *no_add_attrs)
12488 if (TREE_CODE (*node) != FUNCTION_TYPE
12489 && TREE_CODE (*node) != METHOD_TYPE
12490 && TREE_CODE (*node) != FIELD_DECL
12491 && TREE_CODE (*node) != TYPE_DECL)
12493 warning (OPT_Wattributes, "%qE attribute only applies to functions",
12494 name);
12495 *no_add_attrs = true;
12498 return NULL_TREE;
12501 /* Return minimum incoming stack alignment. */
12503 static unsigned int
12504 ix86_minimum_incoming_stack_boundary (bool sibcall)
12506 unsigned int incoming_stack_boundary;
12508 /* The stack of an interrupt handler is aligned to 128 bits in 64-bit mode. */
12509 if (cfun->machine->func_type != TYPE_NORMAL)
12510 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
12511 /* Prefer the one specified at command line. */
12512 else if (ix86_user_incoming_stack_boundary)
12513 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
12514 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
12515 if -mstackrealign is used, this isn't a sibcall check, and the
12516 estimated stack alignment is 128 bits. */
12517 else if (!sibcall
12518 && ix86_force_align_arg_pointer
12519 && crtl->stack_alignment_estimated == 128)
12520 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12521 else
12522 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
12524 /* Incoming stack alignment can be changed on individual functions
12525 via force_align_arg_pointer attribute. We use the smallest
12526 incoming stack boundary. */
12527 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
12528 && lookup_attribute (ix86_force_align_arg_pointer_string,
12529 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
12530 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12532 /* The incoming stack frame has to be aligned at least at
12533 parm_stack_boundary. */
12534 if (incoming_stack_boundary < crtl->parm_stack_boundary)
12535 incoming_stack_boundary = crtl->parm_stack_boundary;
12537 /* The stack at the entry of main is aligned by the runtime. We use the
12538 smallest incoming stack boundary. */
12539 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12540 && DECL_NAME (current_function_decl)
12541 && MAIN_NAME_P (DECL_NAME (current_function_decl))
12542 && DECL_FILE_SCOPE_P (current_function_decl))
12543 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12545 return incoming_stack_boundary;
12548 /* Update incoming stack boundary and estimated stack alignment. */
12550 static void
12551 ix86_update_stack_boundary (void)
12553 ix86_incoming_stack_boundary
12554 = ix86_minimum_incoming_stack_boundary (false);
12556 /* x86_64 varargs need 16-byte stack alignment for the register save
12557 area. */
12558 if (TARGET_64BIT
12559 && cfun->stdarg
12560 && crtl->stack_alignment_estimated < 128)
12561 crtl->stack_alignment_estimated = 128;
12563 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12564 if (ix86_tls_descriptor_calls_expanded_in_cfun
12565 && crtl->preferred_stack_boundary < 128)
12566 crtl->preferred_stack_boundary = 128;
12569 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12570 needed or an rtx for DRAP otherwise. */
12572 static rtx
12573 ix86_get_drap_rtx (void)
12575 /* We must use DRAP if there are outgoing arguments on stack and
12576 ACCUMULATE_OUTGOING_ARGS is false. */
12577 if (ix86_force_drap
12578 || (cfun->machine->outgoing_args_on_stack
12579 && !ACCUMULATE_OUTGOING_ARGS))
12580 crtl->need_drap = true;
12582 if (stack_realign_drap)
12584 /* Assign DRAP to vDRAP and return vDRAP. */
12585 unsigned int regno = find_drap_reg ();
12586 rtx drap_vreg;
12587 rtx arg_ptr;
12588 rtx_insn *seq, *insn;
12590 arg_ptr = gen_rtx_REG (Pmode, regno);
12591 crtl->drap_reg = arg_ptr;
12593 start_sequence ();
12594 drap_vreg = copy_to_reg (arg_ptr);
12595 seq = get_insns ();
12596 end_sequence ();
12598 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12599 if (!optimize)
12601 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12602 RTX_FRAME_RELATED_P (insn) = 1;
12604 return drap_vreg;
12606 else
12607 return NULL;
12610 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12612 static rtx
12613 ix86_internal_arg_pointer (void)
12615 return virtual_incoming_args_rtx;
12618 struct scratch_reg {
12619 rtx reg;
12620 bool saved;
12623 /* Return a short-lived scratch register for use on function entry.
12624 In 32-bit mode, it is valid only after the registers are saved
12625 in the prologue. This register must be released by means of
12626 release_scratch_register_on_entry once it is dead. */
12628 static void
12629 get_scratch_register_on_entry (struct scratch_reg *sr)
12631 int regno;
12633 sr->saved = false;
12635 if (TARGET_64BIT)
12637 /* We always use R11 in 64-bit mode. */
12638 regno = R11_REG;
12640 else
12642 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12643 bool fastcall_p
12644 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12645 bool thiscall_p
12646 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12647 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12648 int regparm = ix86_function_regparm (fntype, decl);
12649 int drap_regno
12650 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12652 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12653 for the static chain register. */
12654 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12655 && drap_regno != AX_REG)
12656 regno = AX_REG;
12657 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12658 for the static chain register. */
12659 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12660 regno = AX_REG;
12661 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12662 regno = DX_REG;
12663 /* ecx is the static chain register. */
12664 else if (regparm < 3 && !fastcall_p && !thiscall_p
12665 && !static_chain_p
12666 && drap_regno != CX_REG)
12667 regno = CX_REG;
12668 else if (ix86_save_reg (BX_REG, true, false))
12669 regno = BX_REG;
12670 /* esi is the static chain register. */
12671 else if (!(regparm == 3 && static_chain_p)
12672 && ix86_save_reg (SI_REG, true, false))
12673 regno = SI_REG;
12674 else if (ix86_save_reg (DI_REG, true, false))
12675 regno = DI_REG;
12676 else
12678 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12679 sr->saved = true;
12683 sr->reg = gen_rtx_REG (Pmode, regno);
12684 if (sr->saved)
12686 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12687 RTX_FRAME_RELATED_P (insn) = 1;
12691 /* Release a scratch register obtained from the preceding function.
12693 If RELEASE_VIA_POP is true, we just pop the register off the stack
12694 to release it. This is what non-Linux systems use with -fstack-check.
12696 Otherwise we use OFFSET to locate the saved register and the
12697 allocated stack space becomes part of the local frame and is
12698 deallocated by the epilogue. */
12700 static void
12701 release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset,
12702 bool release_via_pop)
12704 if (sr->saved)
12706 if (release_via_pop)
12708 struct machine_function *m = cfun->machine;
12709 rtx x, insn = emit_insn (gen_pop (sr->reg));
12711 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12712 RTX_FRAME_RELATED_P (insn) = 1;
12713 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12714 x = gen_rtx_SET (stack_pointer_rtx, x);
12715 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12716 m->fs.sp_offset -= UNITS_PER_WORD;
12718 else
12720 rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset));
12721 x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x));
12722 emit_insn (x);
12727 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12729 This differs from the next routine in that it tries hard to prevent
12730 attacks that jump the stack guard. Thus it is never allowed to allocate
12731 more than PROBE_INTERVAL bytes of stack space without a suitable
12732 probe.
12734 INT_REGISTERS_SAVED is true if integer registers have already been
12735 pushed on the stack. */
12737 static void
12738 ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
12739 const bool int_registers_saved)
12741 struct machine_function *m = cfun->machine;
12743 /* If this function does not statically allocate stack space, then
12744 no probes are needed. */
12745 if (!size)
12747 /* However, the allocation of space via pushes for register
12748 saves could be viewed as allocating space, but without the
12749 need to probe. */
12750 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12751 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12752 else
12753 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12754 return;
12757 /* If we are a noreturn function, then we have to consider the
12758 possibility that we're called via a jump rather than a call.
12760 Thus we don't have the implicit probe generated by saving the
12761 return address into the stack at the call. Thus, the stack
12762 pointer could be anywhere in the guard page. The safe thing
12763 to do is emit a probe now.
12765 The probe can be avoided if we have already emitted any callee
12766 register saves into the stack or have a frame pointer (which will
12767 have been saved as well). Those saves will function as implicit
12768 probes.
12770 ?!? This should be revamped to work like aarch64 and s390 where
12771 we track the offset from the most recent probe. Normally that
12772 offset would be zero. For a noreturn function we would reset
12773 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12774 we just probe when we cross PROBE_INTERVAL. */
12775 if (TREE_THIS_VOLATILE (cfun->decl)
12776 && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed))
12778 /* We can safely use any register here since we're just going to push
12779 its value and immediately pop it back. But we do try and avoid
12780 argument passing registers so as not to introduce dependencies in
12781 the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
12782 rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12783 rtx_insn *insn_push = emit_insn (gen_push (dummy_reg));
12784 rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg));
12785 m->fs.sp_offset -= UNITS_PER_WORD;
12786 if (m->fs.cfa_reg == stack_pointer_rtx)
12788 m->fs.cfa_offset -= UNITS_PER_WORD;
12789 rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
12790 x = gen_rtx_SET (stack_pointer_rtx, x);
12791 add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x);
12792 RTX_FRAME_RELATED_P (insn_push) = 1;
12793 x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
12794 x = gen_rtx_SET (stack_pointer_rtx, x);
12795 add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x);
12796 RTX_FRAME_RELATED_P (insn_pop) = 1;
12798 emit_insn (gen_blockage ());
12801 /* If we allocate less than the size of the guard statically,
12802 then no probing is necessary, but we do need to allocate
12803 the stack. */
12804 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12806 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12807 GEN_INT (-size), -1,
12808 m->fs.cfa_reg == stack_pointer_rtx);
12809 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12810 return;
12813 /* We're allocating a large enough stack frame that we need to
12814 emit probes. Either emit them inline or in a loop depending
12815 on the size. */
12816 HOST_WIDE_INT probe_interval = get_probe_interval ();
12817 if (size <= 4 * probe_interval)
12819 HOST_WIDE_INT i;
12820 for (i = probe_interval; i <= size; i += probe_interval)
12822 /* Allocate PROBE_INTERVAL bytes. */
12823 rtx insn
12824 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12825 GEN_INT (-probe_interval), -1,
12826 m->fs.cfa_reg == stack_pointer_rtx);
12827 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12829 /* And probe at *sp. */
12830 emit_stack_probe (stack_pointer_rtx);
12831 emit_insn (gen_blockage ());
12834 /* We need to allocate space for the residual, but we do not need
12835 to probe the residual. */
12836 HOST_WIDE_INT residual = (i - probe_interval - size);
12837 if (residual)
12838 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12839 GEN_INT (residual), -1,
12840 m->fs.cfa_reg == stack_pointer_rtx);
12841 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12843 else
12845 /* We expect the GP registers to be saved when probes are used
12846 as the probing sequences might need a scratch register and
12847 the routine to allocate one assumes the integer registers
12848 have already been saved. */
12849 gcc_assert (int_registers_saved);
12851 struct scratch_reg sr;
12852 get_scratch_register_on_entry (&sr);
12854 /* If we needed to save a register, then account for any space
12855 that was pushed (we are not going to pop the register when
12856 we do the restore). */
12857 if (sr.saved)
12858 size -= UNITS_PER_WORD;
12860 /* Step 1: round SIZE down to a multiple of the interval. */
12861 HOST_WIDE_INT rounded_size = size & -probe_interval;
12863 /* Step 2: compute final value of the loop counter. Use lea if
12864 possible. */
12865 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12866 rtx insn;
12867 if (address_no_seg_operand (addr, Pmode))
12868 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12869 else
12871 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12872 insn = emit_insn (gen_rtx_SET (sr.reg,
12873 gen_rtx_PLUS (Pmode, sr.reg,
12874 stack_pointer_rtx)));
12876 if (m->fs.cfa_reg == stack_pointer_rtx)
12878 add_reg_note (insn, REG_CFA_DEF_CFA,
12879 plus_constant (Pmode, sr.reg,
12880 m->fs.cfa_offset + rounded_size));
12881 RTX_FRAME_RELATED_P (insn) = 1;
12884 /* Step 3: the loop. */
12885 rtx size_rtx = GEN_INT (rounded_size);
12886 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12887 size_rtx));
12888 if (m->fs.cfa_reg == stack_pointer_rtx)
12890 m->fs.cfa_offset += rounded_size;
12891 add_reg_note (insn, REG_CFA_DEF_CFA,
12892 plus_constant (Pmode, stack_pointer_rtx,
12893 m->fs.cfa_offset));
12894 RTX_FRAME_RELATED_P (insn) = 1;
12896 m->fs.sp_offset += rounded_size;
12897 emit_insn (gen_blockage ());
12899 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12900 is equal to ROUNDED_SIZE. */
12902 if (size != rounded_size)
12903 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12904 GEN_INT (rounded_size - size), -1,
12905 m->fs.cfa_reg == stack_pointer_rtx);
12906 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12908 /* This does not deallocate the space reserved for the scratch
12909 register. That will be deallocated in the epilogue. */
12910 release_scratch_register_on_entry (&sr, size, false);
12913 /* Make sure nothing is scheduled before we are done. */
12914 emit_insn (gen_blockage ());
12917 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12919 INT_REGISTERS_SAVED is true if integer registers have already been
12920 pushed on the stack. */
12922 static void
12923 ix86_adjust_stack_and_probe (HOST_WIDE_INT size,
12924 const bool int_registers_saved)
12926 /* We skip the probe for the first interval + a small dope of 4 words and
12927 probe that many bytes past the specified size to maintain a protection
12928 area at the bottom of the stack. */
12929 const int dope = 4 * UNITS_PER_WORD;
12930 rtx size_rtx = GEN_INT (size), last;
12932 /* See if we have a constant small number of probes to generate. If so,
12933 that's the easy case. The run-time loop is made up of 9 insns in the
12934 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12935 for n # of intervals. */
12936 if (size <= 4 * get_probe_interval ())
12938 HOST_WIDE_INT i, adjust;
12939 bool first_probe = true;
12941 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12942 values of N from 1 until it exceeds SIZE. If only one probe is
12943 needed, this will not generate any code. Then adjust and probe
12944 to PROBE_INTERVAL + SIZE. */
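/* Illustrative walk-through, assuming a 64-bit target, a 4 KiB probe interval
   and 32 bytes of dope: for SIZE = 0x2800 the loop emits sub $0x2020/probe and
   sub $0x1000/probe, the tail emits sub $0x800/probe, and the final adjustment
   adds back 0x1020, for a net stack decrement of exactly 0x2800 bytes.  */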
12945 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12947 if (first_probe)
12949 adjust = 2 * get_probe_interval () + dope;
12950 first_probe = false;
12952 else
12953 adjust = get_probe_interval ();
12955 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12956 plus_constant (Pmode, stack_pointer_rtx,
12957 -adjust)));
12958 emit_stack_probe (stack_pointer_rtx);
12961 if (first_probe)
12962 adjust = size + get_probe_interval () + dope;
12963 else
12964 adjust = size + get_probe_interval () - i;
12966 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12967 plus_constant (Pmode, stack_pointer_rtx,
12968 -adjust)));
12969 emit_stack_probe (stack_pointer_rtx);
12971 /* Adjust back to account for the additional first interval. */
12972 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12973 plus_constant (Pmode, stack_pointer_rtx,
12974 (get_probe_interval ()
12975 + dope))));
12978 /* Otherwise, do the same as above, but in a loop. Note that we must be
12979 extra careful with variables wrapping around because we might be at
12980 the very top (or the very bottom) of the address space and we have
12981 to be able to handle this case properly; in particular, we use an
12982 equality test for the loop condition. */
12983 else
12985 /* We expect the GP registers to be saved when probes are used
12986 as the probing sequences might need a scratch register and
12987 the routine to allocate one assumes the integer registers
12988 have already been saved. */
12989 gcc_assert (int_registers_saved);
12991 HOST_WIDE_INT rounded_size;
12992 struct scratch_reg sr;
12994 get_scratch_register_on_entry (&sr);
12996 /* If we needed to save a register, then account for any space
12997 that was pushed (we are not going to pop the register when
12998 we do the restore). */
12999 if (sr.saved)
13000 size -= UNITS_PER_WORD;
13002 /* Step 1: round SIZE to the previous multiple of the interval. */
13004 rounded_size = ROUND_DOWN (size, get_probe_interval ());
13007 /* Step 2: compute initial and final value of the loop counter. */
13009 /* SP = SP_0 + PROBE_INTERVAL. */
13010 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13011 plus_constant (Pmode, stack_pointer_rtx,
13012 - (get_probe_interval () + dope))));
13014 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13015 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13016 emit_insn (gen_rtx_SET (sr.reg,
13017 plus_constant (Pmode, stack_pointer_rtx,
13018 -rounded_size)));
13019 else
13021 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13022 emit_insn (gen_rtx_SET (sr.reg,
13023 gen_rtx_PLUS (Pmode, sr.reg,
13024 stack_pointer_rtx)));
13028 /* Step 3: the loop
13032 SP = SP + PROBE_INTERVAL
13033 probe at SP
13035 while (SP != LAST_ADDR)
13037 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13038 values of N from 1 until it is equal to ROUNDED_SIZE. */
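/* At output time the insn emitted here is rendered by
   output_adjust_stack_and_probe (defined later in this file) as a
   sub/or/cmp/jne probing loop.  */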
13040 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13043 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13044 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13046 if (size != rounded_size)
13048 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13049 plus_constant (Pmode, stack_pointer_rtx,
13050 rounded_size - size)));
13051 emit_stack_probe (stack_pointer_rtx);
13054 /* Adjust back to account for the additional first interval. */
13055 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13056 plus_constant (Pmode, stack_pointer_rtx,
13057 (get_probe_interval ()
13058 + dope))));
13060 /* This does not deallocate the space reserved for the scratch
13061 register. That will be deallocated in the epilogue. */
13062 release_scratch_register_on_entry (&sr, size, false);
13065 /* Even if the stack pointer isn't the CFA register, we need to correctly
13066 describe the adjustments made to it, in particular differentiate the
13067 frame-related ones from the frame-unrelated ones. */
13068 if (size > 0)
13070 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13071 XVECEXP (expr, 0, 0)
13072 = gen_rtx_SET (stack_pointer_rtx,
13073 plus_constant (Pmode, stack_pointer_rtx, -size));
13074 XVECEXP (expr, 0, 1)
13075 = gen_rtx_SET (stack_pointer_rtx,
13076 plus_constant (Pmode, stack_pointer_rtx,
13077 get_probe_interval () + dope + size));
13078 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13079 RTX_FRAME_RELATED_P (last) = 1;
13081 cfun->machine->fs.sp_offset += size;
13084 /* Make sure nothing is scheduled before we are done. */
13085 emit_insn (gen_blockage ());
13088 /* Adjust the stack pointer up to REG while probing it. */
13090 const char *
13091 output_adjust_stack_and_probe (rtx reg)
13093 static int labelno = 0;
13094 char loop_lab[32];
13095 rtx xops[2];
13097 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13099 /* Loop. */
13100 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13102 /* SP = SP + PROBE_INTERVAL. */
13103 xops[0] = stack_pointer_rtx;
13104 xops[1] = GEN_INT (get_probe_interval ());
13105 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13107 /* Probe at SP. */
13108 xops[1] = const0_rtx;
13109 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13111 /* Test if SP == LAST_ADDR. */
13112 xops[0] = stack_pointer_rtx;
13113 xops[1] = reg;
13114 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13116 /* Branch. */
13117 fputs ("\tjne\t", asm_out_file);
13118 assemble_name_raw (asm_out_file, loop_lab);
13119 fputc ('\n', asm_out_file);
13121 return "";
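/* Illustrative output of the loop above on a 64-bit target, assuming the
   default 4 KiB probe interval and %r11 as the register holding LAST_ADDR
   (AT&T syntax; exact label and register are target-dependent):

   .LPSRL0:
	subq	$4096, %rsp
	orq	$0, (%rsp)
	cmpq	%r11, %rsp
	jne	.LPSRL0
*/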
13124 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13125 inclusive. These are offsets from the current stack pointer.
13127 INT_REGISTERS_SAVED is true if integer registers have already been
13128 pushed on the stack. */
13130 static void
13131 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
13132 const bool int_registers_saved)
13134 /* See if we have a constant small number of probes to generate. If so,
13135 that's the easy case. The run-time loop is made up of 6 insns in the
13136 generic case while the compile-time loop is made up of n insns for n #
13137 of intervals. */
13138 if (size <= 6 * get_probe_interval ())
13140 HOST_WIDE_INT i;
13142 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13143 it exceeds SIZE. If only one probe is needed, this will not
13144 generate any code. Then probe at FIRST + SIZE. */
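/* For example (illustrative values), with FIRST = 0x1000, SIZE = 0x2800 and a
   4 KiB probe interval, probes are emitted at sp-0x2000, sp-0x3000 and finally
   at sp-0x3800.  */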
13145 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
13146 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13147 -(first + i)));
13149 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13150 -(first + size)));
13153 /* Otherwise, do the same as above, but in a loop. Note that we must be
13154 extra careful with variables wrapping around because we might be at
13155 the very top (or the very bottom) of the address space and we have
13156 to be able to handle this case properly; in particular, we use an
13157 equality test for the loop condition. */
13158 else
13160 /* We expect the GP registers to be saved when probes are used
13161 as the probing sequences might need a scratch register and
13162 the routine to allocate one assumes the integer registers
13163 have already been saved. */
13164 gcc_assert (int_registers_saved);
13166 HOST_WIDE_INT rounded_size, last;
13167 struct scratch_reg sr;
13169 get_scratch_register_on_entry (&sr);
13172 /* Step 1: round SIZE to the previous multiple of the interval. */
13174 rounded_size = ROUND_DOWN (size, get_probe_interval ());
13177 /* Step 2: compute initial and final value of the loop counter. */
13179 /* TEST_OFFSET = FIRST. */
13180 emit_move_insn (sr.reg, GEN_INT (-first));
13182 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13183 last = first + rounded_size;
13186 /* Step 3: the loop
13190 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13191 probe at TEST_ADDR
13193 while (TEST_ADDR != LAST_ADDR)
13195 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13196 until it is equal to ROUNDED_SIZE. */
13198 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13201 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13202 that SIZE is equal to ROUNDED_SIZE. */
13204 if (size != rounded_size)
13205 emit_stack_probe (plus_constant (Pmode,
13206 gen_rtx_PLUS (Pmode,
13207 stack_pointer_rtx,
13208 sr.reg),
13209 rounded_size - size));
13211 release_scratch_register_on_entry (&sr, size, true);
13214 /* Make sure nothing is scheduled before we are done. */
13215 emit_insn (gen_blockage ());
13218 /* Probe a range of stack addresses from REG to END, inclusive. These are
13219 offsets from the current stack pointer. */
13221 const char *
13222 output_probe_stack_range (rtx reg, rtx end)
13224 static int labelno = 0;
13225 char loop_lab[32];
13226 rtx xops[3];
13228 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13230 /* Loop. */
13231 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13233 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13234 xops[0] = reg;
13235 xops[1] = GEN_INT (get_probe_interval ());
13236 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13238 /* Probe at TEST_ADDR. */
13239 xops[0] = stack_pointer_rtx;
13240 xops[1] = reg;
13241 xops[2] = const0_rtx;
13242 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13244 /* Test if TEST_ADDR == LAST_ADDR. */
13245 xops[0] = reg;
13246 xops[1] = end;
13247 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13249 /* Branch. */
13250 fputs ("\tjne\t", asm_out_file);
13251 assemble_name_raw (asm_out_file, loop_lab);
13252 fputc ('\n', asm_out_file);
13254 return "";
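/* Illustrative output of the loop above on a 64-bit target, assuming a 4 KiB
   probe interval, %r11 as the test-offset register and an immediate LAST_ADDR
   operand (AT&T syntax; values are examples only):

   .LPSRL1:
	subq	$4096, %r11
	orq	$0, (%rsp,%r11)
	cmpq	$-16384, %r11
	jne	.LPSRL1
*/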
13257 /* Return true if a stack frame is required. Update STACK_ALIGNMENT
13258 to the largest alignment, in bits, of any stack slot used, if a stack
13259 frame is required and CHECK_STACK_SLOT is true. */
13261 static bool
13262 ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
13263 bool check_stack_slot)
13265 HARD_REG_SET set_up_by_prologue, prologue_used;
13266 basic_block bb;
13268 CLEAR_HARD_REG_SET (prologue_used);
13269 CLEAR_HARD_REG_SET (set_up_by_prologue);
13270 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13271 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13272 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13273 HARD_FRAME_POINTER_REGNUM);
13275 /* The preferred stack alignment is the minimum stack alignment. */
13276 if (stack_alignment > crtl->preferred_stack_boundary)
13277 stack_alignment = crtl->preferred_stack_boundary;
13279 bool require_stack_frame = false;
13281 FOR_EACH_BB_FN (bb, cfun)
13283 rtx_insn *insn;
13284 FOR_BB_INSNS (bb, insn)
13285 if (NONDEBUG_INSN_P (insn)
13286 && requires_stack_frame_p (insn, prologue_used,
13287 set_up_by_prologue))
13289 require_stack_frame = true;
13291 if (check_stack_slot)
13293 /* Find the maximum stack alignment. */
13294 subrtx_iterator::array_type array;
13295 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
13296 if (MEM_P (*iter)
13297 && (reg_mentioned_p (stack_pointer_rtx,
13298 *iter)
13299 || reg_mentioned_p (frame_pointer_rtx,
13300 *iter)))
13302 unsigned int alignment = MEM_ALIGN (*iter);
13303 if (alignment > stack_alignment)
13304 stack_alignment = alignment;
13310 return require_stack_frame;
13313 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
13314 will guide prologue/epilogue to be generated in correct form. */
13316 static void
13317 ix86_finalize_stack_frame_flags (void)
13319 /* Check whether stack realignment is really needed after reload, and
13320 store the result in cfun. */
13321 unsigned int incoming_stack_boundary
13322 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13323 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13324 unsigned int stack_alignment
13325 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13326 ? crtl->max_used_stack_slot_alignment
13327 : crtl->stack_alignment_needed);
13328 unsigned int stack_realign
13329 = (incoming_stack_boundary < stack_alignment);
13330 bool recompute_frame_layout_p = false;
13332 if (crtl->stack_realign_finalized)
13334 /* After stack_realign_needed is finalized, we can no longer
13335 change it. */
13336 gcc_assert (crtl->stack_realign_needed == stack_realign);
13337 return;
13340 /* If the only reason for frame_pointer_needed is that we conservatively
13341 assumed stack realignment might be needed or -fno-omit-frame-pointer
13342 is used, but in the end nothing that needed the stack alignment was
13343 spilled and there is no stack access, clear frame_pointer_needed and
13344 say we don't need stack realignment. */
13345 if ((stack_realign || !flag_omit_frame_pointer)
13346 && frame_pointer_needed
13347 && crtl->is_leaf
13348 && crtl->sp_is_unchanging
13349 && !ix86_current_function_calls_tls_descriptor
13350 && !crtl->accesses_prior_frames
13351 && !cfun->calls_alloca
13352 && !crtl->calls_eh_return
13353 /* See ira_setup_eliminable_regset for the rationale. */
13354 && !(STACK_CHECK_MOVING_SP
13355 && flag_stack_check
13356 && flag_exceptions
13357 && cfun->can_throw_non_call_exceptions)
13358 && !ix86_frame_pointer_required ()
13359 && get_frame_size () == 0
13360 && ix86_nsaved_sseregs () == 0
13361 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13363 if (ix86_find_max_used_stack_alignment (stack_alignment,
13364 stack_realign))
13366 /* A stack frame is required. If the stack alignment needed is less
13367 than the incoming stack boundary, don't realign the stack. */
13368 stack_realign = incoming_stack_boundary < stack_alignment;
13369 if (!stack_realign)
13371 crtl->max_used_stack_slot_alignment
13372 = incoming_stack_boundary;
13373 crtl->stack_alignment_needed
13374 = incoming_stack_boundary;
13375 /* Also update preferred_stack_boundary for leaf
13376 functions. */
13377 crtl->preferred_stack_boundary
13378 = incoming_stack_boundary;
13381 else
13383 /* If drap has been set, but it actually isn't live at the
13384 start of the function, there is no reason to set it up. */
13385 if (crtl->drap_reg)
13387 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13388 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
13389 REGNO (crtl->drap_reg)))
13391 crtl->drap_reg = NULL_RTX;
13392 crtl->need_drap = false;
13395 else
13396 cfun->machine->no_drap_save_restore = true;
13398 frame_pointer_needed = false;
13399 stack_realign = false;
13400 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13401 crtl->stack_alignment_needed = incoming_stack_boundary;
13402 crtl->stack_alignment_estimated = incoming_stack_boundary;
13403 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13404 crtl->preferred_stack_boundary = incoming_stack_boundary;
13405 df_finish_pass (true);
13406 df_scan_alloc (NULL);
13407 df_scan_blocks ();
13408 df_compute_regs_ever_live (true);
13409 df_analyze ();
13411 if (flag_var_tracking)
13413 /* Since the frame pointer is no longer available, replace it with
13414 stack pointer - UNITS_PER_WORD in debug insns. */
13415 df_ref ref, next;
13416 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
13417 ref; ref = next)
13419 next = DF_REF_NEXT_REG (ref);
13420 if (!DF_REF_INSN_INFO (ref))
13421 continue;
13423 /* Make sure the next ref is for a different instruction,
13424 so that we're not affected by the rescan. */
13425 rtx_insn *insn = DF_REF_INSN (ref);
13426 while (next && DF_REF_INSN (next) == insn)
13427 next = DF_REF_NEXT_REG (next);
13429 if (DEBUG_INSN_P (insn))
13431 bool changed = false;
13432 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
13434 rtx *loc = DF_REF_LOC (ref);
13435 if (*loc == hard_frame_pointer_rtx)
13437 *loc = plus_constant (Pmode,
13438 stack_pointer_rtx,
13439 -UNITS_PER_WORD);
13440 changed = true;
13443 if (changed)
13444 df_insn_rescan (insn);
13449 recompute_frame_layout_p = true;
13452 else if (crtl->max_used_stack_slot_alignment
13453 > crtl->preferred_stack_boundary)
13455 /* We don't need to realign the stack, but we still need to keep the
13456 stack frame properly aligned to satisfy the largest alignment
13457 of stack slots. */
13458 if (ix86_find_max_used_stack_alignment (stack_alignment, true))
13459 cfun->machine->max_used_stack_alignment
13460 = stack_alignment / BITS_PER_UNIT;
13463 if (crtl->stack_realign_needed != stack_realign)
13464 recompute_frame_layout_p = true;
13465 crtl->stack_realign_needed = stack_realign;
13466 crtl->stack_realign_finalized = true;
13467 if (recompute_frame_layout_p)
13468 ix86_compute_frame_layout ();
13471 /* Delete SET_GOT right after entry block if it is allocated to reg. */
13473 static void
13474 ix86_elim_entry_set_got (rtx reg)
13476 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13477 rtx_insn *c_insn = BB_HEAD (bb);
13478 if (!NONDEBUG_INSN_P (c_insn))
13479 c_insn = next_nonnote_nondebug_insn (c_insn);
13480 if (c_insn && NONJUMP_INSN_P (c_insn))
13482 rtx pat = PATTERN (c_insn);
13483 if (GET_CODE (pat) == PARALLEL)
13485 rtx vec = XVECEXP (pat, 0, 0);
13486 if (GET_CODE (vec) == SET
13487 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13488 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13489 delete_insn (c_insn);
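/* Return a SET rtx that stores REG to the frame slot at FRAME_REG + OFFSET
   when STORE is true, or loads REG from that slot when STORE is false.  */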
13494 static rtx
13495 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
13497 rtx addr, mem;
13499 if (offset)
13500 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
13501 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
13502 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
13505 static inline rtx
13506 gen_frame_load (rtx reg, rtx frame_reg, int offset)
13508 return gen_frame_set (reg, frame_reg, offset, false);
13511 static inline rtx
13512 gen_frame_store (rtx reg, rtx frame_reg, int offset)
13514 return gen_frame_set (reg, frame_reg, offset, true);
13517 static void
13518 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
13520 struct machine_function *m = cfun->machine;
13521 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13522 + m->call_ms2sysv_extra_regs;
13523 rtvec v = rtvec_alloc (ncregs + 1);
13524 unsigned int align, i, vi = 0;
13525 rtx_insn *insn;
13526 rtx sym, addr;
13527 rtx rax = gen_rtx_REG (word_mode, AX_REG);
13528 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13530 /* AL should only be live with sysv_abi. */
13531 gcc_assert (!ix86_eax_live_at_start_p ());
13532 gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
13534 /* Set up RAX as the stub's base pointer. We use stack_realign_offset so the
13535 base address is the same whether we've actually realigned the stack or not. */
13536 align = GET_MODE_ALIGNMENT (V4SFmode);
13537 addr = choose_baseaddr (frame.stack_realign_offset
13538 + xlogue.get_stub_ptr_offset (), &align, AX_REG);
13539 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13541 emit_insn (gen_rtx_SET (rax, addr));
13543 /* Get the stub symbol. */
13544 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
13545 : XLOGUE_STUB_SAVE);
13546 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13548 for (i = 0; i < ncregs; ++i)
13550 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13551 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
13552 r.regno);
13553 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
13556 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
13558 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
13559 RTX_FRAME_RELATED_P (insn) = true;
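/* The PARALLEL emitted above pairs a USE of the out-of-line stub symbol with
   one frame store per clobbered register, addressed relative to RAX; at run
   time the stores themselves are performed by the stub.  */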
13562 /* Expand the prologue into a bunch of separate insns. */
13564 void
13565 ix86_expand_prologue (void)
13567 struct machine_function *m = cfun->machine;
13568 rtx insn, t;
13569 HOST_WIDE_INT allocate;
13570 bool int_registers_saved;
13571 bool sse_registers_saved;
13572 bool save_stub_call_needed;
13573 rtx static_chain = NULL_RTX;
13575 if (ix86_function_naked (current_function_decl))
13576 return;
13578 ix86_finalize_stack_frame_flags ();
13580 /* DRAP should not coexist with stack_realign_fp */
13581 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13583 memset (&m->fs, 0, sizeof (m->fs));
13585 /* Initialize CFA state for before the prologue. */
13586 m->fs.cfa_reg = stack_pointer_rtx;
13587 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13589 /* Track SP offset to the CFA. We continue tracking this after we've
13590 swapped the CFA register away from SP. In the case of re-alignment
13591 this is fudged; we're interested in offsets within the local frame. */
13592 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13593 m->fs.sp_valid = true;
13594 m->fs.sp_realigned = false;
13596 const struct ix86_frame &frame = cfun->machine->frame;
13598 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13600 /* We should have already generated an error for any use of
13601 ms_hook on a nested function. */
13602 gcc_checking_assert (!ix86_static_chain_on_stack);
13604 /* Check if profiling is active and we shall use the profiling before
13605 prologue variant. If so, sorry. */
13606 if (crtl->profile && flag_fentry != 0)
13607 sorry ("ms_hook_prologue attribute isn%'t compatible "
13608 "with -mfentry for 32-bit");
13610 /* In ix86_asm_output_function_label we emitted:
13611 8b ff movl.s %edi,%edi
13612 55 push %ebp
13613 8b ec movl.s %esp,%ebp
13615 This matches the hookable function prologue in Win32 API
13616 functions in Microsoft Windows XP Service Pack 2 and newer.
13617 Wine uses this to enable Windows apps to hook the Win32 API
13618 functions provided by Wine.
13620 What that means is that we've already set up the frame pointer. */
13622 if (frame_pointer_needed
13623 && !(crtl->drap_reg && crtl->stack_realign_needed))
13625 rtx push, mov;
13627 /* We've decided to use the frame pointer already set up.
13628 Describe this to the unwinder by pretending that both
13629 push and mov insns happen right here.
13631 Putting the unwind info here at the end of the ms_hook
13632 is done so that we can make absolutely certain we get
13633 the required byte sequence at the start of the function,
13634 rather than relying on an assembler that can produce
13635 the exact encoding required.
13637 However it does mean (in the unpatched case) that we have
13638 a 1 insn window where the asynchronous unwind info is
13639 incorrect. However, if we placed the unwind info at
13640 its correct location we would have incorrect unwind info
13641 in the patched case. Which is probably all moot since
13642 I don't expect Wine generates dwarf2 unwind info for the
13643 system libraries that use this feature. */
13645 insn = emit_insn (gen_blockage ());
13647 push = gen_push (hard_frame_pointer_rtx);
13648 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13649 stack_pointer_rtx);
13650 RTX_FRAME_RELATED_P (push) = 1;
13651 RTX_FRAME_RELATED_P (mov) = 1;
13653 RTX_FRAME_RELATED_P (insn) = 1;
13654 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13655 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13657 /* Note that gen_push incremented m->fs.cfa_offset, even
13658 though we didn't emit the push insn here. */
13659 m->fs.cfa_reg = hard_frame_pointer_rtx;
13660 m->fs.fp_offset = m->fs.cfa_offset;
13661 m->fs.fp_valid = true;
13663 else
13665 /* The frame pointer is not needed so pop %ebp again.
13666 This leaves us with a pristine state. */
13667 emit_insn (gen_pop (hard_frame_pointer_rtx));
13671 /* The first insn of a function that accepts its static chain on the
13672 stack is to push the register that would be filled in by a direct
13673 call. This insn will be skipped by the trampoline. */
13674 else if (ix86_static_chain_on_stack)
13676 static_chain = ix86_static_chain (cfun->decl, false);
13677 insn = emit_insn (gen_push (static_chain));
13678 emit_insn (gen_blockage ());
13680 /* We don't want to interpret this push insn as a register save,
13681 only as a stack adjustment. The real copy of the register as
13682 a save will be done later, if needed. */
13683 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13684 t = gen_rtx_SET (stack_pointer_rtx, t);
13685 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13686 RTX_FRAME_RELATED_P (insn) = 1;
13689 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13690 DRAP is needed and stack realignment is really needed after reload. */
13691 if (stack_realign_drap)
13693 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13695 /* Can't use DRAP in interrupt function. */
13696 if (cfun->machine->func_type != TYPE_NORMAL)
13697 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13698 "in interrupt service routine. This may be worked "
13699 "around by avoiding functions with aggregate return.");
13701 /* Only need to push parameter pointer reg if it is caller saved. */
13702 if (!call_used_regs[REGNO (crtl->drap_reg)])
13704 /* Push arg pointer reg */
13705 insn = emit_insn (gen_push (crtl->drap_reg));
13706 RTX_FRAME_RELATED_P (insn) = 1;
13709 /* Grab the argument pointer. */
13710 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13711 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13712 RTX_FRAME_RELATED_P (insn) = 1;
13713 m->fs.cfa_reg = crtl->drap_reg;
13714 m->fs.cfa_offset = 0;
13716 /* Align the stack. */
13717 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13718 stack_pointer_rtx,
13719 GEN_INT (-align_bytes)));
13720 RTX_FRAME_RELATED_P (insn) = 1;
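/* For example, with a 32-byte alignment requirement align_bytes is 32 and the
   insn emitted above masks the low bits of the stack pointer, e.g.
   "andq $-32, %rsp" in 64-bit AT&T syntax (illustrative).  */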
13722 /* Replicate the return address on the stack so that the return
13723 address can be reached via the (argp - 1) slot. This is needed
13724 to implement macro RETURN_ADDR_RTX and intrinsic function
13725 expand_builtin_return_addr etc. */
13726 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13727 t = gen_frame_mem (word_mode, t);
13728 insn = emit_insn (gen_push (t));
13729 RTX_FRAME_RELATED_P (insn) = 1;
13731 /* For the purposes of frame and register save area addressing,
13732 we've started over with a new frame. */
13733 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13734 m->fs.realigned = true;
13736 if (static_chain)
13738 /* Replicate the static chain on the stack so that the static chain
13739 can be reached via the (argp - 2) slot. This is needed for
13740 nested functions with stack realignment. */
13741 insn = emit_insn (gen_push (static_chain));
13742 RTX_FRAME_RELATED_P (insn) = 1;
13746 int_registers_saved = (frame.nregs == 0);
13747 sse_registers_saved = (frame.nsseregs == 0);
13748 save_stub_call_needed = (m->call_ms2sysv);
13749 gcc_assert (sse_registers_saved || !save_stub_call_needed);
13751 if (frame_pointer_needed && !m->fs.fp_valid)
13753 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13754 slower on all targets. Also sdb didn't like it. */
13755 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13756 RTX_FRAME_RELATED_P (insn) = 1;
13758 /* Push registers now, before setting the frame pointer
13759 on SEH target. */
13760 if (!int_registers_saved
13761 && TARGET_SEH
13762 && !frame.save_regs_using_mov)
13764 ix86_emit_save_regs ();
13765 int_registers_saved = true;
13766 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13769 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13771 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13772 RTX_FRAME_RELATED_P (insn) = 1;
13774 if (m->fs.cfa_reg == stack_pointer_rtx)
13775 m->fs.cfa_reg = hard_frame_pointer_rtx;
13776 m->fs.fp_offset = m->fs.sp_offset;
13777 m->fs.fp_valid = true;
13781 if (!int_registers_saved)
13783 /* If saving registers via PUSH, do so now. */
13784 if (!frame.save_regs_using_mov)
13786 ix86_emit_save_regs ();
13787 int_registers_saved = true;
13788 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13791 /* When using the red zone we may start register saving before allocating
13792 the stack frame, saving one cycle of the prologue. However, avoid
13793 doing this if we have to probe the stack; at least on x86_64 the
13794 stack probe can turn into a call that clobbers a red zone location. */
13795 else if (ix86_using_red_zone ()
13796 && (! TARGET_STACK_PROBE
13797 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13799 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13800 int_registers_saved = true;
13804 if (stack_realign_fp)
13806 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13807 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13809 /* Record last valid frame pointer offset. */
13810 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13812 /* The computation of the size of the re-aligned stack frame means
13813 that we must allocate the size of the register save area before
13814 performing the actual alignment. Otherwise we cannot guarantee
13815 that there's enough storage above the realignment point. */
13816 allocate = frame.reg_save_offset - m->fs.sp_offset
13817 + frame.stack_realign_allocate;
13818 if (allocate)
13819 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13820 GEN_INT (-allocate), -1, false);
13822 /* Align the stack. */
13823 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13824 stack_pointer_rtx,
13825 GEN_INT (-align_bytes)));
13826 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13827 m->fs.sp_realigned_offset = m->fs.sp_offset
13828 - frame.stack_realign_allocate;
13829 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13830 Beyond this point, stack access should be done via choose_baseaddr or
13831 by using sp_valid_at and fp_valid_at to determine the correct base
13832 register. Henceforth, any CFA offset should be thought of as logical
13833 and not physical. */
13834 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13835 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13836 m->fs.sp_realigned = true;
13838 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13839 is needed to describe where a register is saved using a realigned
13840 stack pointer, so we need to invalidate the stack pointer for that
13841 target. */
13842 if (TARGET_SEH)
13843 m->fs.sp_valid = false;
13845 /* If SP offset is non-immediate after allocation of the stack frame,
13846 then emit SSE saves or stub call prior to allocating the rest of the
13847 stack frame. This is less efficient for the out-of-line stub because
13848 we can't combine allocations across the call barrier, but it's better
13849 than using a scratch register. */
13850 else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13851 - m->fs.sp_realigned_offset),
13852 Pmode))
13854 if (!sse_registers_saved)
13856 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13857 sse_registers_saved = true;
13859 else if (save_stub_call_needed)
13861 ix86_emit_outlined_ms2sysv_save (frame);
13862 save_stub_call_needed = false;
13867 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13869 if (flag_stack_usage_info)
13871 /* We start to count from ARG_POINTER. */
13872 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13874 /* If it was realigned, take into account the fake frame. */
13875 if (stack_realign_drap)
13877 if (ix86_static_chain_on_stack)
13878 stack_size += UNITS_PER_WORD;
13880 if (!call_used_regs[REGNO (crtl->drap_reg)])
13881 stack_size += UNITS_PER_WORD;
13883 /* This over-estimates by 1 minimal-stack-alignment-unit but
13884 mitigates that by counting in the new return address slot. */
13885 current_function_dynamic_stack_size
13886 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13889 current_function_static_stack_size = stack_size;
13892 /* On SEH targets with a very large frame size, allocate an area to save
13893 SSE registers (as the very large allocation won't be described). */
13894 if (TARGET_SEH
13895 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13896 && !sse_registers_saved)
13898 HOST_WIDE_INT sse_size =
13899 frame.sse_reg_save_offset - frame.reg_save_offset;
13901 gcc_assert (int_registers_saved);
13903 /* No need to do stack checking as the area will be immediately
13904 written. */
13905 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13906 GEN_INT (-sse_size), -1,
13907 m->fs.cfa_reg == stack_pointer_rtx);
13908 allocate -= sse_size;
13909 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13910 sse_registers_saved = true;
13913 /* The stack has already been decremented by the instruction calling us
13914 so probe if the size is non-negative to preserve the protection area. */
13915 if (allocate >= 0
13916 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13917 || flag_stack_clash_protection))
13919 if (flag_stack_clash_protection)
13921 ix86_adjust_stack_and_probe_stack_clash (allocate,
13922 int_registers_saved);
13923 allocate = 0;
13925 else if (STACK_CHECK_MOVING_SP)
13927 if (!(crtl->is_leaf && !cfun->calls_alloca
13928 && allocate <= get_probe_interval ()))
13930 ix86_adjust_stack_and_probe (allocate, int_registers_saved);
13931 allocate = 0;
13934 else
13936 HOST_WIDE_INT size = allocate;
13938 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13939 size = 0x80000000 - get_stack_check_protect () - 1;
13941 if (TARGET_STACK_PROBE)
13943 if (crtl->is_leaf && !cfun->calls_alloca)
13945 if (size > get_probe_interval ())
13946 ix86_emit_probe_stack_range (0, size, int_registers_saved);
13948 else
13949 ix86_emit_probe_stack_range (0,
13950 size + get_stack_check_protect (),
13951 int_registers_saved);
13953 else
13955 if (crtl->is_leaf && !cfun->calls_alloca)
13957 if (size > get_probe_interval ()
13958 && size > get_stack_check_protect ())
13959 ix86_emit_probe_stack_range (get_stack_check_protect (),
13960 (size
13961 - get_stack_check_protect ()),
13962 int_registers_saved);
13964 else
13965 ix86_emit_probe_stack_range (get_stack_check_protect (), size,
13966 int_registers_saved);
13971 if (allocate == 0)
13973 else if (!ix86_target_stack_probe ()
13974 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13976 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13977 GEN_INT (-allocate), -1,
13978 m->fs.cfa_reg == stack_pointer_rtx);
13980 else
13982 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13983 rtx r10 = NULL;
13984 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13985 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13986 bool eax_live = ix86_eax_live_at_start_p ();
13987 bool r10_live = false;
13989 if (TARGET_64BIT)
13990 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13992 if (eax_live)
13994 insn = emit_insn (gen_push (eax));
13995 allocate -= UNITS_PER_WORD;
13996 /* Note that SEH directives need to continue tracking the stack
13997 pointer even after the frame pointer has been set up. */
13998 if (sp_is_cfa_reg || TARGET_SEH)
14000 if (sp_is_cfa_reg)
14001 m->fs.cfa_offset += UNITS_PER_WORD;
14002 RTX_FRAME_RELATED_P (insn) = 1;
14003 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14004 gen_rtx_SET (stack_pointer_rtx,
14005 plus_constant (Pmode, stack_pointer_rtx,
14006 -UNITS_PER_WORD)));
14010 if (r10_live)
14012 r10 = gen_rtx_REG (Pmode, R10_REG);
14013 insn = emit_insn (gen_push (r10));
14014 allocate -= UNITS_PER_WORD;
14015 if (sp_is_cfa_reg || TARGET_SEH)
14017 if (sp_is_cfa_reg)
14018 m->fs.cfa_offset += UNITS_PER_WORD;
14019 RTX_FRAME_RELATED_P (insn) = 1;
14020 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14021 gen_rtx_SET (stack_pointer_rtx,
14022 plus_constant (Pmode, stack_pointer_rtx,
14023 -UNITS_PER_WORD)));
14027 emit_move_insn (eax, GEN_INT (allocate));
14028 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
14030 /* Use the fact that AX still contains ALLOCATE. */
14031 adjust_stack_insn = (Pmode == DImode
14032 ? gen_pro_epilogue_adjust_stack_di_sub
14033 : gen_pro_epilogue_adjust_stack_si_sub);
14035 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14036 stack_pointer_rtx, eax));
14038 if (sp_is_cfa_reg || TARGET_SEH)
14040 if (sp_is_cfa_reg)
14041 m->fs.cfa_offset += allocate;
14042 RTX_FRAME_RELATED_P (insn) = 1;
14043 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14044 gen_rtx_SET (stack_pointer_rtx,
14045 plus_constant (Pmode, stack_pointer_rtx,
14046 -allocate)));
14048 m->fs.sp_offset += allocate;
14050 /* Use stack_pointer_rtx for relative addressing so that code
14051 works for realigned stack, too. */
14052 if (r10_live && eax_live)
14054 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14055 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14056 gen_frame_mem (word_mode, t));
14057 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14058 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14059 gen_frame_mem (word_mode, t));
14061 else if (eax_live || r10_live)
14063 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14064 emit_move_insn (gen_rtx_REG (word_mode,
14065 (eax_live ? AX_REG : R10_REG)),
14066 gen_frame_mem (word_mode, t));
14069 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
14071 /* If we haven't already set up the frame pointer, do so now. */
14072 if (frame_pointer_needed && !m->fs.fp_valid)
14074 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14075 GEN_INT (frame.stack_pointer_offset
14076 - frame.hard_frame_pointer_offset));
14077 insn = emit_insn (insn);
14078 RTX_FRAME_RELATED_P (insn) = 1;
14079 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14081 if (m->fs.cfa_reg == stack_pointer_rtx)
14082 m->fs.cfa_reg = hard_frame_pointer_rtx;
14083 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14084 m->fs.fp_valid = true;
14087 if (!int_registers_saved)
14088 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14089 if (!sse_registers_saved)
14090 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14091 else if (save_stub_call_needed)
14092 ix86_emit_outlined_ms2sysv_save (frame);
14094 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
14095 in PROLOGUE. */
14096 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14098 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14099 insn = emit_insn (gen_set_got (pic));
14100 RTX_FRAME_RELATED_P (insn) = 1;
14101 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14102 emit_insn (gen_prologue_use (pic));
14103 /* Delete an already emitted SET_GOT if it exists and is allocated to
14104 REAL_PIC_OFFSET_TABLE_REGNUM. */
14105 ix86_elim_entry_set_got (pic);
14108 if (crtl->drap_reg && !crtl->stack_realign_needed)
14110 /* vDRAP is set up, but after reload it turns out stack realignment
14111 isn't necessary; here we emit prologue code to set up DRAP
14112 without the stack realignment adjustment. */
14113 t = choose_baseaddr (0, NULL);
14114 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14117 /* Prevent instructions from being scheduled into the register save push
14118 sequence when access to the redzone area is done through the frame pointer.
14119 The offset between the frame pointer and the stack pointer is calculated
14120 relative to the value of the stack pointer at the end of the function
14121 prologue, and moving instructions that access the redzone area via the
14122 frame pointer inside the push sequence violates this assumption. */
14123 if (frame_pointer_needed && frame.red_zone_size)
14124 emit_insn (gen_memory_blockage ());
14126 /* SEH requires that the prologue end within 256 bytes of the start of
14127 the function. Prevent instruction schedules that would extend that.
14128 Further, prevent alloca modifications to the stack pointer from being
14129 combined with prologue modifications. */
14130 if (TARGET_SEH)
14131 emit_insn (gen_prologue_use (stack_pointer_rtx));
14134 /* Emit code to restore REG using a POP insn. */
14136 static void
14137 ix86_emit_restore_reg_using_pop (rtx reg)
14139 struct machine_function *m = cfun->machine;
14140 rtx_insn *insn = emit_insn (gen_pop (reg));
14142 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14143 m->fs.sp_offset -= UNITS_PER_WORD;
14145 if (m->fs.cfa_reg == crtl->drap_reg
14146 && REGNO (reg) == REGNO (crtl->drap_reg))
14148 /* Previously we'd represented the CFA as an expression
14149 like *(%ebp - 8). We've just popped that value from
14150 the stack, which means we need to reset the CFA to
14151 the drap register. This will remain until we restore
14152 the stack pointer. */
14153 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14154 RTX_FRAME_RELATED_P (insn) = 1;
14156 /* This means that the DRAP register is valid for addressing too. */
14157 m->fs.drap_valid = true;
14158 return;
14161 if (m->fs.cfa_reg == stack_pointer_rtx)
14163 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14164 x = gen_rtx_SET (stack_pointer_rtx, x);
14165 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14166 RTX_FRAME_RELATED_P (insn) = 1;
14168 m->fs.cfa_offset -= UNITS_PER_WORD;
14171 /* When the frame pointer is the CFA, and we pop it, we are
14172 swapping back to the stack pointer as the CFA. This happens
14173 for stack frames that don't allocate other data, so we assume
14174 the stack pointer is now pointing at the return address, i.e.
14175 the function entry state, which makes the offset be 1 word. */
14176 if (reg == hard_frame_pointer_rtx)
14178 m->fs.fp_valid = false;
14179 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14181 m->fs.cfa_reg = stack_pointer_rtx;
14182 m->fs.cfa_offset -= UNITS_PER_WORD;
14184 add_reg_note (insn, REG_CFA_DEF_CFA,
14185 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14186 GEN_INT (m->fs.cfa_offset)));
14187 RTX_FRAME_RELATED_P (insn) = 1;
14192 /* Emit code to restore saved registers using POP insns. */
14194 static void
14195 ix86_emit_restore_regs_using_pop (void)
14197 unsigned int regno;
14199 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14200 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
14201 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14204 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
14205 omits the emit and only attaches the notes. */
14207 static void
14208 ix86_emit_leave (rtx_insn *insn)
14210 struct machine_function *m = cfun->machine;
14211 if (!insn)
14212 insn = emit_insn (ix86_gen_leave ());
14214 ix86_add_queued_cfa_restore_notes (insn);
14216 gcc_assert (m->fs.fp_valid);
14217 m->fs.sp_valid = true;
14218 m->fs.sp_realigned = false;
14219 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14220 m->fs.fp_valid = false;
14222 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14224 m->fs.cfa_reg = stack_pointer_rtx;
14225 m->fs.cfa_offset = m->fs.sp_offset;
14227 add_reg_note (insn, REG_CFA_DEF_CFA,
14228 plus_constant (Pmode, stack_pointer_rtx,
14229 m->fs.sp_offset));
14230 RTX_FRAME_RELATED_P (insn) = 1;
14232 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14233 m->fs.fp_offset);
14236 /* Emit code to restore saved registers using MOV insns.
14237 First register is restored from CFA - CFA_OFFSET. */
14238 static void
14239 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14240 bool maybe_eh_return)
14242 struct machine_function *m = cfun->machine;
14243 unsigned int regno;
14245 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14246 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14248 rtx reg = gen_rtx_REG (word_mode, regno);
14249 rtx mem;
14250 rtx_insn *insn;
14252 mem = choose_baseaddr (cfa_offset, NULL);
14253 mem = gen_frame_mem (word_mode, mem);
14254 insn = emit_move_insn (reg, mem);
14256 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14258 /* Previously we'd represented the CFA as an expression
14259 like *(%ebp - 8). We've just popped that value from
14260 the stack, which means we need to reset the CFA to
14261 the drap register. This will remain until we restore
14262 the stack pointer. */
14263 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14264 RTX_FRAME_RELATED_P (insn) = 1;
14266 /* This means that the DRAP register is valid for addressing. */
14267 m->fs.drap_valid = true;
14269 else
14270 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14272 cfa_offset -= UNITS_PER_WORD;
14276 /* Emit code to restore saved registers using MOV insns.
14277 First register is restored from CFA - CFA_OFFSET. */
14278 static void
14279 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14280 bool maybe_eh_return)
14282 unsigned int regno;
14284 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14285 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14287 rtx reg = gen_rtx_REG (V4SFmode, regno);
14288 rtx mem;
14289 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
14291 mem = choose_baseaddr (cfa_offset, &align);
14292 mem = gen_rtx_MEM (V4SFmode, mem);
14294 /* The location alignment depends upon the base register. */
14295 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
14296 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
14297 set_mem_align (mem, align);
14298 emit_insn (gen_rtx_SET (reg, mem));
14300 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14302 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14306 static void
14307 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
14308 bool use_call, int style)
14310 struct machine_function *m = cfun->machine;
14311 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14312 + m->call_ms2sysv_extra_regs;
14313 rtvec v;
14314 unsigned int elems_needed, align, i, vi = 0;
14315 rtx_insn *insn;
14316 rtx sym, tmp;
14317 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
14318 rtx r10 = NULL_RTX;
14319 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14320 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
14321 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
14322 rtx rsi_frame_load = NULL_RTX;
14323 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
14324 enum xlogue_stub stub;
14326 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
14328 /* If using a realigned stack, we should never start with padding. */
14329 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
14331 /* Setup RSI as the stub's base pointer. */
14332 align = GET_MODE_ALIGNMENT (V4SFmode);
14333 tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
14334 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14336 emit_insn (gen_rtx_SET (rsi, tmp));
14338 /* Get a symbol for the stub. */
14339 if (frame_pointer_needed)
14340 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
14341 : XLOGUE_STUB_RESTORE_HFP_TAIL;
14342 else
14343 stub = use_call ? XLOGUE_STUB_RESTORE
14344 : XLOGUE_STUB_RESTORE_TAIL;
14345 sym = xlogue.get_stub_rtx (stub);
14347 elems_needed = ncregs;
14348 if (use_call)
14349 elems_needed += 1;
14350 else
14351 elems_needed += frame_pointer_needed ? 5 : 3;
14352 v = rtvec_alloc (elems_needed);
14354 /* We call the epilogue stub when we need to pop incoming args or we are
14355 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
14356 epilogue stub and it is the tail-call. */
14357 if (use_call)
14358 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14359 else
14361 RTVEC_ELT (v, vi++) = ret_rtx;
14362 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14363 if (frame_pointer_needed)
14365 rtx rbp = gen_rtx_REG (DImode, BP_REG);
14366 gcc_assert (m->fs.fp_valid);
14367 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
14369 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
14370 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
14371 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
14372 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
14373 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
14375 else
14377 /* If no hard frame pointer, we set R10 to the SP restore value. */
14378 gcc_assert (!m->fs.fp_valid);
14379 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14380 gcc_assert (m->fs.sp_valid);
14382 r10 = gen_rtx_REG (DImode, R10_REG);
14383 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
14384 emit_insn (gen_rtx_SET (r10, tmp));
14386 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
14390 /* Generate frame load insns and restore notes. */
14391 for (i = 0; i < ncregs; ++i)
14393 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14394 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
14395 rtx reg, frame_load;
14397 reg = gen_rtx_REG (mode, r.regno);
14398 frame_load = gen_frame_load (reg, rsi, r.offset);
14400 /* Save RSI frame load insn & note to add last. */
14401 if (r.regno == SI_REG)
14403 gcc_assert (!rsi_frame_load);
14404 rsi_frame_load = frame_load;
14405 rsi_restore_offset = r.offset;
14407 else
14409 RTVEC_ELT (v, vi++) = frame_load;
14410 ix86_add_cfa_restore_note (NULL, reg, r.offset);
14414 /* Add RSI frame load & restore note at the end. */
14415 gcc_assert (rsi_frame_load);
14416 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
14417 RTVEC_ELT (v, vi++) = rsi_frame_load;
14418 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
14419 rsi_restore_offset);
14421 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
14422 if (!use_call && !frame_pointer_needed)
14424 gcc_assert (m->fs.sp_valid);
14425 gcc_assert (!m->fs.sp_realigned);
14427 /* At this point, R10 should point to frame.stack_realign_offset. */
14428 if (m->fs.cfa_reg == stack_pointer_rtx)
14429 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
14430 m->fs.sp_offset = frame.stack_realign_offset;
14433 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
14434 tmp = gen_rtx_PARALLEL (VOIDmode, v);
14435 if (use_call)
14436 insn = emit_insn (tmp);
14437 else
14439 insn = emit_jump_insn (tmp);
14440 JUMP_LABEL (insn) = ret_rtx;
14442 if (frame_pointer_needed)
14443 ix86_emit_leave (insn);
14444 else
14446 /* Need CFA adjust note. */
14447 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
14448 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
14452 RTX_FRAME_RELATED_P (insn) = true;
14453 ix86_add_queued_cfa_restore_notes (insn);
14455 /* If we're not doing a tail-call, we need to adjust the stack. */
14456 if (use_call && m->fs.sp_valid)
14458 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
14459 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14460 GEN_INT (dealloc), style,
14461 m->fs.cfa_reg == stack_pointer_rtx);
14465 /* Restore function stack, frame, and registers. */
14467 void
14468 ix86_expand_epilogue (int style)
14470 struct machine_function *m = cfun->machine;
14471 struct machine_frame_state frame_state_save = m->fs;
14472 bool restore_regs_via_mov;
14473 bool using_drap;
14474 bool restore_stub_is_tail = false;
14476 if (ix86_function_naked (current_function_decl))
14478 /* The program should not reach this point. */
14479 emit_insn (gen_ud2 ());
14480 return;
14483 ix86_finalize_stack_frame_flags ();
14484 const struct ix86_frame &frame = cfun->machine->frame;
14486 m->fs.sp_realigned = stack_realign_fp;
14487 m->fs.sp_valid = stack_realign_fp
14488 || !frame_pointer_needed
14489 || crtl->sp_is_unchanging;
14490 gcc_assert (!m->fs.sp_valid
14491 || m->fs.sp_offset == frame.stack_pointer_offset);
14493 /* The FP must be valid if the frame pointer is present. */
14494 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14495 gcc_assert (!m->fs.fp_valid
14496 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14498 /* We must have *some* valid pointer to the stack frame. */
14499 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14501 /* The DRAP is never valid at this point. */
14502 gcc_assert (!m->fs.drap_valid);
14504 /* See the comment about red zone and frame
14505 pointer usage in ix86_expand_prologue. */
14506 if (frame_pointer_needed && frame.red_zone_size)
14507 emit_insn (gen_memory_blockage ());
14509 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14510 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14512 /* Determine the CFA offset of the end of the red-zone. */
14513 m->fs.red_zone_offset = 0;
14514 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14516 /* The red-zone begins below the return address and the error code in
14517 an exception handler. */
14518 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
14520 /* When the register save area is in the aligned portion of
14521 the stack, determine the maximum runtime displacement that
14522 matches up with the aligned frame. */
14523 if (stack_realign_drap)
14524 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14525 + UNITS_PER_WORD);
14528 HOST_WIDE_INT reg_save_offset = frame.reg_save_offset;
14530 /* Special care must be taken for the normal return case of a function
14531 using eh_return: the eax and edx registers are marked as saved, but
14532 not restored along this path. Adjust the save location to match. */
14533 if (crtl->calls_eh_return && style != 2)
14534 reg_save_offset -= 2 * UNITS_PER_WORD;
14536 /* EH_RETURN requires the use of moves to function properly. */
14537 if (crtl->calls_eh_return)
14538 restore_regs_via_mov = true;
14539 /* SEH requires the use of pops to identify the epilogue. */
14540 else if (TARGET_SEH)
14541 restore_regs_via_mov = false;
14542 /* If we're only restoring one register and sp cannot be used, then
14543 use a move instruction to restore the register since it's
14544 less work than reloading sp and popping the register. */
14545 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
14546 restore_regs_via_mov = true;
14547 else if (TARGET_EPILOGUE_USING_MOVE
14548 && cfun->machine->use_fast_prologue_epilogue
14549 && (frame.nregs > 1
14550 || m->fs.sp_offset != reg_save_offset))
14551 restore_regs_via_mov = true;
14552 else if (frame_pointer_needed
14553 && !frame.nregs
14554 && m->fs.sp_offset != reg_save_offset)
14555 restore_regs_via_mov = true;
14556 else if (frame_pointer_needed
14557 && TARGET_USE_LEAVE
14558 && cfun->machine->use_fast_prologue_epilogue
14559 && frame.nregs == 1)
14560 restore_regs_via_mov = true;
14561 else
14562 restore_regs_via_mov = false;
14564 if (restore_regs_via_mov || frame.nsseregs)
14566 /* Ensure that the entire register save area is addressable via
14567 the stack pointer, if we will restore SSE regs via sp. */
14568 if (TARGET_64BIT
14569 && m->fs.sp_offset > 0x7fffffff
14570 && sp_valid_at (frame.stack_realign_offset + 1)
14571 && (frame.nsseregs + frame.nregs) != 0)
14573 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14574 GEN_INT (m->fs.sp_offset
14575 - frame.sse_reg_save_offset),
14576 style,
14577 m->fs.cfa_reg == stack_pointer_rtx);
14581 /* If there are any SSE registers to restore, then we have to do it
14582 via moves, since there's obviously no pop for SSE regs. */
14583 if (frame.nsseregs)
14584 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14585 style == 2);
14587 if (m->call_ms2sysv)
14589 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
14591 /* We cannot use a tail-call for the stub if:
14592 1. We have to pop incoming args,
14593 2. We have additional int regs to restore, or
14594 3. A sibling call will be the tail-call, or
14595 4. We are emitting an eh_return_internal epilogue.
14597 TODO: Item 4 has not yet been tested!
14599 If any of the above are true, we will call the stub rather than
14600 jump to it. */
14601 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
14602 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
14605 /* If using out-of-line stub that is a tail-call, then...*/
14606 if (m->call_ms2sysv && restore_stub_is_tail)
14608 /* TODO: paranoid tests. (remove eventually) */
14609 gcc_assert (m->fs.sp_valid);
14610 gcc_assert (!m->fs.sp_realigned);
14611 gcc_assert (!m->fs.fp_valid);
14612 gcc_assert (!m->fs.realigned);
14613 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
14614 gcc_assert (!crtl->drap_reg);
14615 gcc_assert (!frame.nregs);
14617 else if (restore_regs_via_mov)
14619 rtx t;
14621 if (frame.nregs)
14622 ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2);
14624 /* eh_return epilogues need %ecx added to the stack pointer. */
14625 if (style == 2)
14627 rtx sa = EH_RETURN_STACKADJ_RTX;
14628 rtx_insn *insn;
14630 /* %ecx can't be used for both DRAP register and eh_return. */
14631 if (crtl->drap_reg)
14632 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14634 /* regparm nested functions don't work with eh_return. */
14635 gcc_assert (!ix86_static_chain_on_stack);
14637 if (frame_pointer_needed)
14639 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14640 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14641 emit_insn (gen_rtx_SET (sa, t));
14643 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14644 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14646 /* Note that we use SA as a temporary CFA, as the return
14647 address is at the proper place relative to it. We
14648 pretend this happens at the FP restore insn because
14649 prior to this insn the FP would be stored at the wrong
14650 offset relative to SA, and after this insn we have no
14651 other reasonable register to use for the CFA. We don't
14652 bother resetting the CFA to the SP for the duration of
14653 the return insn, unless the control flow instrumentation
14654 is done. In this case the SP is used later and we have
14655 to reset CFA to SP. */
14656 add_reg_note (insn, REG_CFA_DEF_CFA,
14657 plus_constant (Pmode, sa, UNITS_PER_WORD));
14658 ix86_add_queued_cfa_restore_notes (insn);
14659 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14660 RTX_FRAME_RELATED_P (insn) = 1;
14662 m->fs.cfa_reg = sa;
14663 m->fs.cfa_offset = UNITS_PER_WORD;
14664 m->fs.fp_valid = false;
14666 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14667 const0_rtx, style,
14668 flag_cf_protection);
14670 else
14672 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14673 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14674 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14675 ix86_add_queued_cfa_restore_notes (insn);
14677 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14678 if (m->fs.cfa_offset != UNITS_PER_WORD)
14680 m->fs.cfa_offset = UNITS_PER_WORD;
14681 add_reg_note (insn, REG_CFA_DEF_CFA,
14682 plus_constant (Pmode, stack_pointer_rtx,
14683 UNITS_PER_WORD));
14684 RTX_FRAME_RELATED_P (insn) = 1;
14687 m->fs.sp_offset = UNITS_PER_WORD;
14688 m->fs.sp_valid = true;
14689 m->fs.sp_realigned = false;
14692 else
14694 /* SEH requires that the function end with (1) a stack adjustment
14695 if necessary, (2) a sequence of pops, and (3) a return or
14696 jump instruction. Prevent insns from the function body from
14697 being scheduled into this sequence. */
14698 if (TARGET_SEH)
14700 /* Prevent a catch region from being adjacent to the standard
14701 epilogue sequence. Unfortunately neither crtl->uses_eh_lsda
14702 nor several other flags that would be interesting to test are
14703 set up yet. */
14704 if (flag_non_call_exceptions)
14705 emit_insn (gen_nops (const1_rtx));
14706 else
14707 emit_insn (gen_blockage ());
14710       /* The first step is to deallocate the stack frame so that we can
14711 	 pop the registers.  If the stack pointer was realigned, it needs
14712 	 to be restored now.  Also do it on SEH targets for very large
14713 	 frames, as the emitted instructions aren't allowed by the ABI
14714 	 in epilogues.  */
14715 if (!m->fs.sp_valid || m->fs.sp_realigned
14716 || (TARGET_SEH
14717 && (m->fs.sp_offset - reg_save_offset
14718 >= SEH_MAX_FRAME_SIZE)))
14720 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14721 GEN_INT (m->fs.fp_offset
14722 - reg_save_offset),
14723 style, false);
14725 else if (m->fs.sp_offset != reg_save_offset)
14727 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14728 GEN_INT (m->fs.sp_offset
14729 - reg_save_offset),
14730 style,
14731 m->fs.cfa_reg == stack_pointer_rtx);
14734 ix86_emit_restore_regs_using_pop ();
14737   /* If we used a frame pointer and haven't already got rid of it,
14738 then do so now. */
14739 if (m->fs.fp_valid)
14741 /* If the stack pointer is valid and pointing at the frame
14742 pointer store address, then we only need a pop. */
14743 if (sp_valid_at (frame.hfp_save_offset)
14744 && m->fs.sp_offset == frame.hfp_save_offset)
14745 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14746 /* Leave results in shorter dependency chains on CPUs that are
14747 able to grok it fast. */
14748 else if (TARGET_USE_LEAVE
14749 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14750 || !cfun->machine->use_fast_prologue_epilogue)
14751 ix86_emit_leave (NULL);
14752 else
14754 pro_epilogue_adjust_stack (stack_pointer_rtx,
14755 hard_frame_pointer_rtx,
14756 const0_rtx, style, !using_drap);
14757 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14761 if (using_drap)
14763 int param_ptr_offset = UNITS_PER_WORD;
14764 rtx_insn *insn;
14766 gcc_assert (stack_realign_drap);
14768 if (ix86_static_chain_on_stack)
14769 param_ptr_offset += UNITS_PER_WORD;
14770 if (!call_used_regs[REGNO (crtl->drap_reg)])
14771 param_ptr_offset += UNITS_PER_WORD;
14773 insn = emit_insn (gen_rtx_SET
14774 (stack_pointer_rtx,
14775 gen_rtx_PLUS (Pmode,
14776 crtl->drap_reg,
14777 GEN_INT (-param_ptr_offset))));
14778 m->fs.cfa_reg = stack_pointer_rtx;
14779 m->fs.cfa_offset = param_ptr_offset;
14780 m->fs.sp_offset = param_ptr_offset;
14781 m->fs.realigned = false;
14783 add_reg_note (insn, REG_CFA_DEF_CFA,
14784 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14785 GEN_INT (param_ptr_offset)));
14786 RTX_FRAME_RELATED_P (insn) = 1;
14788 if (!call_used_regs[REGNO (crtl->drap_reg)])
14789 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14792 /* At this point the stack pointer must be valid, and we must have
14793 restored all of the registers. We may not have deallocated the
14794 entire stack frame. We've delayed this until now because it may
14795 be possible to merge the local stack deallocation with the
14796 deallocation forced by ix86_static_chain_on_stack. */
14797 gcc_assert (m->fs.sp_valid);
14798 gcc_assert (!m->fs.sp_realigned);
14799 gcc_assert (!m->fs.fp_valid);
14800 gcc_assert (!m->fs.realigned);
14801 if (m->fs.sp_offset != UNITS_PER_WORD)
14803 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14804 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14805 style, true);
14807 else
14808 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14810 /* Sibcall epilogues don't want a return instruction. */
14811 if (style == 0)
14813 m->fs = frame_state_save;
14814 return;
14817 if (cfun->machine->func_type != TYPE_NORMAL)
14818 emit_jump_insn (gen_interrupt_return ());
14819 else if (crtl->args.pops_args && crtl->args.size)
14821 rtx popc = GEN_INT (crtl->args.pops_args);
14823       /* i386 can only pop 64K bytes.  If asked to pop more, pop the return
14824 	 address, do an explicit add, and jump indirectly to the caller.  */
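      /* An illustrative sketch of that fallback (register choice matches
	 the code below; N stands for crtl->args.pops_args):

	     popl  %ecx		# fetch the return address
	     addl  $N, %esp	# pop the incoming arguments
	     jmp   *%ecx	# return to the caller indirectly

	 since "ret $imm" can only encode a 16-bit immediate.  */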
14826 if (crtl->args.pops_args >= 65536)
14828 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14829 rtx_insn *insn;
14831 /* There is no "pascal" calling convention in any 64bit ABI. */
14832 gcc_assert (!TARGET_64BIT);
14834 insn = emit_insn (gen_pop (ecx));
14835 m->fs.cfa_offset -= UNITS_PER_WORD;
14836 m->fs.sp_offset -= UNITS_PER_WORD;
14838 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14839 x = gen_rtx_SET (stack_pointer_rtx, x);
14840 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14841 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14842 RTX_FRAME_RELATED_P (insn) = 1;
14844 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14845 popc, -1, true);
14846 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14848 else
14849 emit_jump_insn (gen_simple_return_pop_internal (popc));
14851 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14853       /* In case of return from EH a simple return cannot be used,
14854 	 as the return address will be compared with a shadow stack
14855 	 return address.  Use an indirect jump instead.  */
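      /* Roughly, what gets emitted in that case is

	     pop   %rcx		# or %ecx outside 64-bit mode
	     jmp   *%rcx

	 so no "ret" is executed against a potentially mismatching shadow
	 stack entry.  */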
14856 if (style == 2 && flag_cf_protection)
14858 /* Register used in indirect jump must be in word_mode. But
14859 Pmode may not be the same as word_mode for x32. */
14860 rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14861 rtx_insn *insn;
14863 insn = emit_insn (gen_pop (ecx));
14864 m->fs.cfa_offset -= UNITS_PER_WORD;
14865 m->fs.sp_offset -= UNITS_PER_WORD;
14867 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14868 x = gen_rtx_SET (stack_pointer_rtx, x);
14869 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14870 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14871 RTX_FRAME_RELATED_P (insn) = 1;
14873 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14875 else
14876 emit_jump_insn (gen_simple_return_internal ());
14879 /* Restore the state back to the state from the prologue,
14880 so that it's correct for the next epilogue. */
14881 m->fs = frame_state_save;
14884 /* Reset from the function's potential modifications. */
14886 static void
14887 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14889 if (pic_offset_table_rtx
14890 && !ix86_use_pseudo_pic_reg ())
14891 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14893 if (TARGET_MACHO)
14895 rtx_insn *insn = get_last_insn ();
14896 rtx_insn *deleted_debug_label = NULL;
14898 /* Mach-O doesn't support labels at the end of objects, so if
14899 it looks like we might want one, take special action.
14900 First, collect any sequence of deleted debug labels. */
14901 while (insn
14902 && NOTE_P (insn)
14903 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14905 	  /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14906 	     notes only; instead set their CODE_LABEL_NUMBER to -1,
14907 	     otherwise there would be code generation differences
14908 	     between -g and -g0.  */
14909 if (NOTE_P (insn) && NOTE_KIND (insn)
14910 == NOTE_INSN_DELETED_DEBUG_LABEL)
14911 deleted_debug_label = insn;
14912 insn = PREV_INSN (insn);
14915 /* If we have:
14916 label:
14917 barrier
14918 then this needs to be detected, so skip past the barrier. */
14920 if (insn && BARRIER_P (insn))
14921 insn = PREV_INSN (insn);
14923 /* Up to now we've only seen notes or barriers. */
14924 if (insn)
14926 if (LABEL_P (insn)
14927 || (NOTE_P (insn)
14928 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14929 /* Trailing label. */
14930 fputs ("\tnop\n", file);
14931 else if (cfun && ! cfun->is_thunk)
14933 /* See if we have a completely empty function body, skipping
14934 the special case of the picbase thunk emitted as asm. */
14935 while (insn && ! INSN_P (insn))
14936 insn = PREV_INSN (insn);
14937 /* If we don't find any insns, we've got an empty function body;
14938 	     i.e. completely empty - without a return or branch.  This is
14939 taken as the case where a function body has been removed
14940 because it contains an inline __builtin_unreachable(). GCC
14941 declares that reaching __builtin_unreachable() means UB so
14942 we're not obliged to do anything special; however, we want
14943 non-zero-sized function bodies. To meet this, and help the
14944 user out, let's trap the case. */
14945 if (insn == NULL)
14946 fputs ("\tud2\n", file);
14949 else if (deleted_debug_label)
14950 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14951 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14952 CODE_LABEL_NUMBER (insn) = -1;
14956 /* Return a scratch register to use in the split stack prologue. The
14957    split stack prologue is used for -fsplit-stack.  It consists of the
14958    first instructions in the function, even before the regular prologue.
14959 The scratch register can be any caller-saved register which is not
14960 used for parameters or for the static chain. */
14962 static unsigned int
14963 split_stack_prologue_scratch_regno (void)
14965 if (TARGET_64BIT)
14966 return R11_REG;
14967 else
14969 bool is_fastcall, is_thiscall;
14970 int regparm;
14972 is_fastcall = (lookup_attribute ("fastcall",
14973 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14974 != NULL);
14975 is_thiscall = (lookup_attribute ("thiscall",
14976 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14977 != NULL);
14978 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14980 if (is_fastcall)
14982 if (DECL_STATIC_CHAIN (cfun->decl))
14984 sorry ("-fsplit-stack does not support fastcall with "
14985 "nested function");
14986 return INVALID_REGNUM;
14988 return AX_REG;
14990 else if (is_thiscall)
14992 if (!DECL_STATIC_CHAIN (cfun->decl))
14993 return DX_REG;
14994 return AX_REG;
14996 else if (regparm < 3)
14998 if (!DECL_STATIC_CHAIN (cfun->decl))
14999 return CX_REG;
15000 else
15002 if (regparm >= 2)
15004 sorry ("-fsplit-stack does not support 2 register "
15005 "parameters for a nested function");
15006 return INVALID_REGNUM;
15008 return DX_REG;
15011 else
15013 /* FIXME: We could make this work by pushing a register
15014 around the addition and comparison. */
15015 sorry ("-fsplit-stack does not support 3 register parameters");
15016 return INVALID_REGNUM;
15021 /* A SYMBOL_REF for the function which allocates new stackspace for
15022 -fsplit-stack. */
15024 static GTY(()) rtx split_stack_fn;
15026 /* A SYMBOL_REF for the more stack function when using the large
15027 model. */
15029 static GTY(()) rtx split_stack_fn_large;
15031 /* Return location of the stack guard value in the TLS block. */
15033 rtx
15034 ix86_split_stack_guard (void)
15036 int offset;
15037 addr_space_t as = DEFAULT_TLS_SEG_REG;
15038 rtx r;
15040 gcc_assert (flag_split_stack);
15042 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15043 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15044 #else
15045 gcc_unreachable ();
15046 #endif
15048 r = GEN_INT (offset);
15049 r = gen_const_mem (Pmode, r);
15050 set_mem_addr_space (r, as);
15052 return r;
15055 /* Handle -fsplit-stack. These are the first instructions in the
15056 function, even before the regular prologue. */
15058 void
15059 ix86_expand_split_stack_prologue (void)
15061 HOST_WIDE_INT allocate;
15062 unsigned HOST_WIDE_INT args_size;
15063 rtx_code_label *label;
15064 rtx limit, current, allocate_rtx, call_insn, call_fusage;
15065 rtx scratch_reg = NULL_RTX;
15066 rtx_code_label *varargs_label = NULL;
15067 rtx fn;
15069 gcc_assert (flag_split_stack && reload_completed);
15071 ix86_finalize_stack_frame_flags ();
15072 struct ix86_frame &frame = cfun->machine->frame;
15073 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
15075 /* This is the label we will branch to if we have enough stack
15076 space. We expect the basic block reordering pass to reverse this
15077 branch if optimizing, so that we branch in the unlikely case. */
15078 label = gen_label_rtx ();
15080 /* We need to compare the stack pointer minus the frame size with
15081 the stack boundary in the TCB. The stack boundary always gives
15082 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
15083 can compare directly. Otherwise we need to do an addition. */
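  /* As a rough sketch, the 64-bit check expanded below comes out as
     something like (the label and OFFSET names are illustrative):

	 lea	-FRAME(%rsp), %r11   # only when FRAME >= SPLIT_STACK_AVAILABLE
	 cmp	%fs:OFFSET, %r11     # OFFSET = TARGET_THREAD_SPLIT_STACK_OFFSET
	 jae	.Lenough	     # enough stack available, skip __morestack
	 call	__morestack	     # with the argument setup shown below
     .Lenough:  */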
15085 limit = ix86_split_stack_guard ();
15087 if (allocate < SPLIT_STACK_AVAILABLE)
15088 current = stack_pointer_rtx;
15089 else
15091 unsigned int scratch_regno;
15092 rtx offset;
15094 /* We need a scratch register to hold the stack pointer minus
15095 the required frame size. Since this is the very start of the
15096 function, the scratch register can be any caller-saved
15097 	 register which is not used for parameters.  */
15098 offset = GEN_INT (- allocate);
15099 scratch_regno = split_stack_prologue_scratch_regno ();
15100 if (scratch_regno == INVALID_REGNUM)
15101 return;
15102 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15103 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
15105 /* We don't use ix86_gen_add3 in this case because it will
15106 want to split to lea, but when not optimizing the insn
15107 will not be split after this point. */
15108 emit_insn (gen_rtx_SET (scratch_reg,
15109 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15110 offset)));
15112 else
15114 emit_move_insn (scratch_reg, offset);
15115 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
15116 stack_pointer_rtx));
15118 current = scratch_reg;
15121 ix86_expand_branch (GEU, current, limit, label);
15122 rtx_insn *jump_insn = get_last_insn ();
15123 JUMP_LABEL (jump_insn) = label;
15125 /* Mark the jump as very likely to be taken. */
15126 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
15128 if (split_stack_fn == NULL_RTX)
15130 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
15131 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
15133 fn = split_stack_fn;
15135 /* Get more stack space. We pass in the desired stack space and the
15136 size of the arguments to copy to the new stack. In 32-bit mode
15137 we push the parameters; __morestack will return on a new stack
15138 anyhow. In 64-bit mode we pass the parameters in r10 and
15139 r11. */
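  /* Concretely, in the plain 64-bit case below %r10 receives the frame
     size to allocate and %r11 the incoming argument size; the large code
     models instead pack both values into %r10 and use %r11 for the
     function address.  */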
15140 allocate_rtx = GEN_INT (allocate);
15141 args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0;
15142 call_fusage = NULL_RTX;
15143 rtx pop = NULL_RTX;
15144 if (TARGET_64BIT)
15146 rtx reg10, reg11;
15148 reg10 = gen_rtx_REG (Pmode, R10_REG);
15149 reg11 = gen_rtx_REG (Pmode, R11_REG);
15151 /* If this function uses a static chain, it will be in %r10.
15152 Preserve it across the call to __morestack. */
15153 if (DECL_STATIC_CHAIN (cfun->decl))
15155 rtx rax;
15157 rax = gen_rtx_REG (word_mode, AX_REG);
15158 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
15159 use_reg (&call_fusage, rax);
15162 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
15163 && !TARGET_PECOFF)
15165 HOST_WIDE_INT argval;
15167 gcc_assert (Pmode == DImode);
15168 /* When using the large model we need to load the address
15169 into a register, and we've run out of registers. So we
15170 switch to a different calling convention, and we call a
15171 different function: __morestack_large. We pass the
15172 argument size in the upper 32 bits of r10 and pass the
15173 	     frame size in the lower 32 bits.  */
15174 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15175 gcc_assert ((args_size & 0xffffffff) == args_size);
15177 if (split_stack_fn_large == NULL_RTX)
15179 split_stack_fn_large =
15180 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15181 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15183 if (ix86_cmodel == CM_LARGE_PIC)
15185 rtx_code_label *label;
15186 rtx x;
15188 label = gen_label_rtx ();
15189 emit_label (label);
15190 LABEL_PRESERVE_P (label) = 1;
15191 emit_insn (gen_set_rip_rex64 (reg10, label));
15192 emit_insn (gen_set_got_offset_rex64 (reg11, label));
15193 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15194 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15195 UNSPEC_GOT);
15196 x = gen_rtx_CONST (Pmode, x);
15197 emit_move_insn (reg11, x);
15198 x = gen_rtx_PLUS (Pmode, reg10, reg11);
15199 x = gen_const_mem (Pmode, x);
15200 emit_move_insn (reg11, x);
15202 else
15203 emit_move_insn (reg11, split_stack_fn_large);
15205 fn = reg11;
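	  /* Worked example of the packing below: assuming args_size == 0x20
	     and allocate == 0x1000, argval becomes 0x0000002000001000, i.e.
	     the argument size ends up in the upper and the frame size in
	     the lower 32 bits of %r10.  */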
15207 argval = ((args_size << 16) << 16) + allocate;
15208 emit_move_insn (reg10, GEN_INT (argval));
15210 else
15212 emit_move_insn (reg10, allocate_rtx);
15213 emit_move_insn (reg11, GEN_INT (args_size));
15214 use_reg (&call_fusage, reg11);
15217 use_reg (&call_fusage, reg10);
15219 else
15221 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15222 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15223 insn = emit_insn (gen_push (allocate_rtx));
15224 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15225 pop = GEN_INT (2 * UNITS_PER_WORD);
15227 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15228 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15229 pop, false);
15230 add_function_usage_to (call_insn, call_fusage);
15231 if (!TARGET_64BIT)
15232 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15233 /* Indicate that this function can't jump to non-local gotos. */
15234 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
15236 /* In order to make call/return prediction work right, we now need
15237 to execute a return instruction. See
15238 libgcc/config/i386/morestack.S for the details on how this works.
15240 For flow purposes gcc must not see this as a return
15241 instruction--we need control flow to continue at the subsequent
15242      label.  Therefore, we use an unspec.  */
15243 gcc_assert (crtl->args.pops_args < 65536);
15244 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15246 /* If we are in 64-bit mode and this function uses a static chain,
15247      we saved %r10 in %rax before calling __morestack.  */
15248 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15249 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15250 gen_rtx_REG (word_mode, AX_REG));
15252 /* If this function calls va_start, we need to store a pointer to
15253 the arguments on the old stack, because they may not have been
15254 all copied to the new stack. At this point the old stack can be
15255 found at the frame pointer value used by __morestack, because
15256 __morestack has set that up before calling back to us. Here we
15257 store that pointer in a scratch register, and in
15258 ix86_expand_prologue we store the scratch register in a stack
15259 slot. */
15260 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15262 unsigned int scratch_regno;
15263 rtx frame_reg;
15264 int words;
15266 scratch_regno = split_stack_prologue_scratch_regno ();
15267 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15268 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15270 /* 64-bit:
15271 fp -> old fp value
15272 return address within this function
15273 return address of caller of this function
15274 stack arguments
15275 So we add three words to get to the stack arguments.
15277 32-bit:
15278 fp -> old fp value
15279 return address within this function
15280 first argument to __morestack
15281 second argument to __morestack
15282 return address of caller of this function
15283 stack arguments
15284 	 So we add five words to get to the stack arguments.  */
15286 words = TARGET_64BIT ? 3 : 5;
15287 emit_insn (gen_rtx_SET (scratch_reg,
15288 gen_rtx_PLUS (Pmode, frame_reg,
15289 GEN_INT (words * UNITS_PER_WORD))));
15291 varargs_label = gen_label_rtx ();
15292 emit_jump_insn (gen_jump (varargs_label));
15293 JUMP_LABEL (get_last_insn ()) = varargs_label;
15295 emit_barrier ();
15298 emit_label (label);
15299 LABEL_NUSES (label) = 1;
15301 /* If this function calls va_start, we now have to set the scratch
15302 register for the case where we do not call __morestack. In this
15303 case we need to set it based on the stack pointer. */
15304 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15306 emit_insn (gen_rtx_SET (scratch_reg,
15307 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15308 GEN_INT (UNITS_PER_WORD))));
15310 emit_label (varargs_label);
15311 LABEL_NUSES (varargs_label) = 1;
15315 /* We may have to tell the dataflow pass that the split stack prologue
15316 is initializing a scratch register. */
15318 static void
15319 ix86_live_on_entry (bitmap regs)
15321 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15323 gcc_assert (flag_split_stack);
15324 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
15328 /* Extract the parts of an RTL expression that is a valid memory address
15329 for an instruction. Return 0 if the structure of the address is
15330    for an instruction.  Return 0 if the structure of the address is
15331    grossly off.  Return -1 if the address contains ASHIFT, so it is not
15332    strictly valid, but is still used for computing the length of an lea
15333    instruction.  */
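/* The target of the decomposition is the general x86 effective address
   base + index*scale + disp, possibly with a segment override.  For
   example, the (illustrative) canonical RTL

       (plus:DI (plus:DI (mult:DI (reg:DI %rcx) (const_int 4))
			 (reg:DI %rbx))
		(const_int 8))

   decomposes to base = %rbx, index = %rcx, scale = 4, disp = 8 and
   seg = ADDR_SPACE_GENERIC, i.e. the operand 8(%rbx,%rcx,4).  */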
int
15334 ix86_decompose_address (rtx addr, struct ix86_address *out)
15336 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
15337 rtx base_reg, index_reg;
15338 HOST_WIDE_INT scale = 1;
15339 rtx scale_rtx = NULL_RTX;
15340 rtx tmp;
15341 int retval = 1;
15342 addr_space_t seg = ADDR_SPACE_GENERIC;
15344   /* Allow zero-extended SImode addresses;
15345      they will be emitted with the addr32 prefix.  */
15346 if (TARGET_64BIT && GET_MODE (addr) == DImode)
15348 if (GET_CODE (addr) == ZERO_EXTEND
15349 && GET_MODE (XEXP (addr, 0)) == SImode)
15351 addr = XEXP (addr, 0);
15352 if (CONST_INT_P (addr))
15353 return 0;
15355 else if (GET_CODE (addr) == AND
15356 && const_32bit_mask (XEXP (addr, 1), DImode))
15358 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
15359 if (addr == NULL_RTX)
15360 return 0;
15362 if (CONST_INT_P (addr))
15363 return 0;
15367   /* Allow SImode subregs of DImode addresses;
15368      they will be emitted with the addr32 prefix.  */
15369 if (TARGET_64BIT && GET_MODE (addr) == SImode)
15371 if (SUBREG_P (addr)
15372 && GET_MODE (SUBREG_REG (addr)) == DImode)
15374 addr = SUBREG_REG (addr);
15375 if (CONST_INT_P (addr))
15376 return 0;
15380 if (REG_P (addr))
15381 base = addr;
15382 else if (SUBREG_P (addr))
15384 if (REG_P (SUBREG_REG (addr)))
15385 base = addr;
15386 else
15387 return 0;
15389 else if (GET_CODE (addr) == PLUS)
15391 rtx addends[4], op;
15392 int n = 0, i;
15394 op = addr;
15397 if (n >= 4)
15398 return 0;
15399 addends[n++] = XEXP (op, 1);
15400 op = XEXP (op, 0);
15402 while (GET_CODE (op) == PLUS);
15403 if (n >= 4)
15404 return 0;
15405 addends[n] = op;
15407 for (i = n; i >= 0; --i)
15409 op = addends[i];
15410 switch (GET_CODE (op))
15412 case MULT:
15413 if (index)
15414 return 0;
15415 index = XEXP (op, 0);
15416 scale_rtx = XEXP (op, 1);
15417 break;
15419 case ASHIFT:
15420 if (index)
15421 return 0;
15422 index = XEXP (op, 0);
15423 tmp = XEXP (op, 1);
15424 if (!CONST_INT_P (tmp))
15425 return 0;
15426 scale = INTVAL (tmp);
15427 if ((unsigned HOST_WIDE_INT) scale > 3)
15428 return 0;
15429 scale = 1 << scale;
15430 break;
15432 case ZERO_EXTEND:
15433 op = XEXP (op, 0);
15434 if (GET_CODE (op) != UNSPEC)
15435 return 0;
15436 /* FALLTHRU */
15438 case UNSPEC:
15439 if (XINT (op, 1) == UNSPEC_TP
15440 && TARGET_TLS_DIRECT_SEG_REFS
15441 && seg == ADDR_SPACE_GENERIC)
15442 seg = DEFAULT_TLS_SEG_REG;
15443 else
15444 return 0;
15445 break;
15447 case SUBREG:
15448 if (!REG_P (SUBREG_REG (op)))
15449 return 0;
15450 /* FALLTHRU */
15452 case REG:
15453 if (!base)
15454 base = op;
15455 else if (!index)
15456 index = op;
15457 else
15458 return 0;
15459 break;
15461 case CONST:
15462 case CONST_INT:
15463 case SYMBOL_REF:
15464 case LABEL_REF:
15465 if (disp)
15466 return 0;
15467 disp = op;
15468 break;
15470 default:
15471 return 0;
15475 else if (GET_CODE (addr) == MULT)
15477 index = XEXP (addr, 0); /* index*scale */
15478 scale_rtx = XEXP (addr, 1);
15480 else if (GET_CODE (addr) == ASHIFT)
15482 /* We're called for lea too, which implements ashift on occasion. */
15483 index = XEXP (addr, 0);
15484 tmp = XEXP (addr, 1);
15485 if (!CONST_INT_P (tmp))
15486 return 0;
15487 scale = INTVAL (tmp);
15488 if ((unsigned HOST_WIDE_INT) scale > 3)
15489 return 0;
15490 scale = 1 << scale;
15491 retval = -1;
15493 else
15494 disp = addr; /* displacement */
15496 if (index)
15498 if (REG_P (index))
15500 else if (SUBREG_P (index)
15501 && REG_P (SUBREG_REG (index)))
15503 else
15504 return 0;
15507 /* Extract the integral value of scale. */
15508 if (scale_rtx)
15510 if (!CONST_INT_P (scale_rtx))
15511 return 0;
15512 scale = INTVAL (scale_rtx);
15515 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15516 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15518 /* Avoid useless 0 displacement. */
15519 if (disp == const0_rtx && (base || index))
15520 disp = NULL_RTX;
15522   /* Allow arg pointer and stack pointer as index if there is no scaling.  */
15523 if (base_reg && index_reg && scale == 1
15524 && (REGNO (index_reg) == ARG_POINTER_REGNUM
15525 || REGNO (index_reg) == FRAME_POINTER_REGNUM
15526 || REGNO (index_reg) == SP_REG))
15528 std::swap (base, index);
15529 std::swap (base_reg, index_reg);
15532 /* Special case: %ebp cannot be encoded as a base without a displacement.
15533 Similarly %r13. */
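  /* This follows from the ModR/M encoding: a base field of 0b101 with
     mod = 00 means "disp32 only, no base", so %ebp and %r13 can only be
     encoded as a base together with an 8- or 32-bit displacement; an
     explicit zero displacement costs one extra byte.  */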
15534 if (!disp && base_reg
15535 && (REGNO (base_reg) == ARG_POINTER_REGNUM
15536 || REGNO (base_reg) == FRAME_POINTER_REGNUM
15537 || REGNO (base_reg) == BP_REG
15538 || REGNO (base_reg) == R13_REG))
15539 disp = const0_rtx;
15541   /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
15542      Avoid this by transforming it to [%esi+0].
15543 Reload calls address legitimization without cfun defined, so we need
15544 to test cfun for being non-NULL. */
15545 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15546 && base_reg && !index_reg && !disp
15547 && REGNO (base_reg) == SI_REG)
15548 disp = const0_rtx;
15550 /* Special case: encode reg+reg instead of reg*2. */
15551 if (!base && index && scale == 2)
15552 base = index, base_reg = index_reg, scale = 1;
15554 /* Special case: scaling cannot be encoded without base or displacement. */
15555 if (!base && !disp && index && scale != 1)
15556 disp = const0_rtx;
15558 out->base = base;
15559 out->index = index;
15560 out->disp = disp;
15561 out->scale = scale;
15562 out->seg = seg;
15564 return retval;
15567 /* Return cost of the memory address x.
15568 For i386, it is better to use a complex address than let gcc copy
15569 the address into a reg and make a new pseudo. But not if the address
15570    requires two regs - that would mean more pseudos with longer
15571 lifetimes. */
15572 static int
15573 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15575 struct ix86_address parts;
15576 int cost = 1;
15577 int ok = ix86_decompose_address (x, &parts);
15579 gcc_assert (ok);
15581 if (parts.base && SUBREG_P (parts.base))
15582 parts.base = SUBREG_REG (parts.base);
15583 if (parts.index && SUBREG_P (parts.index))
15584 parts.index = SUBREG_REG (parts.index);
15586   /* Attempt to minimize the number of registers in the address by increasing
15587      the address cost for each register used.  We don't increase the address
15588      cost for "pic_offset_table_rtx".  When a memory operand with
15589      "pic_offset_table_rtx" is not invariant itself, it most likely means
15590      that the base or index is not
15590 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15591 which is not profitable for x86. */
15592 if (parts.base
15593 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15594 && (current_pass->type == GIMPLE_PASS
15595 || !pic_offset_table_rtx
15596 || !REG_P (parts.base)
15597 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15598 cost++;
15600 if (parts.index
15601 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15602 && (current_pass->type == GIMPLE_PASS
15603 || !pic_offset_table_rtx
15604 || !REG_P (parts.index)
15605 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15606 cost++;
15608   /* The AMD K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15609      since its predecode logic can't detect the length of instructions
15610      and they degenerate to vector decoding.  Increase the cost of such
15611      addresses here.  The penalty is minimally 2 cycles.  It may be worthwhile
15612      to split such addresses or even refuse such addresses at all.
15614 Following addressing modes are affected:
15615 [base+scale*index]
15616 [scale*index+disp]
15617 [base+index]
15619      The first and last case may be avoidable by explicitly coding the zero in
15620      the memory address, but I don't have an AMD-K6 machine handy to check this
15621      theory.  */
15623 if (TARGET_K6
15624 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15625 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15626 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15627 cost += 10;
15629 return cost;
15632 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15633    this is used to form addresses to local data when -fPIC is in
15634 use. */
15636 static bool
15637 darwin_local_data_pic (rtx disp)
15639 return (GET_CODE (disp) == UNSPEC
15640 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15643 /* True if operand X should be loaded from GOT. */
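/* For example, with -fno-plt -fno-pic a call to an undefined function can
   then be emitted as

       call	*foo@GOTPCREL(%rip)

   rather than going through the PLT (64-bit case; 32-bit additionally
   requires an assembler with GOT32X support, as checked below).  */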
15645 bool
15646 ix86_force_load_from_GOT_p (rtx x)
15648 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15649 && !TARGET_PECOFF && !TARGET_MACHO
15650 && !flag_plt && !flag_pic
15651 && ix86_cmodel != CM_LARGE
15652 && GET_CODE (x) == SYMBOL_REF
15653 && SYMBOL_REF_FUNCTION_P (x)
15654 && !SYMBOL_REF_LOCAL_P (x));
15657 /* Determine if a given RTX is a valid constant. We already know this
15658 satisfies CONSTANT_P. */
15660 static bool
15661 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15663 /* Pointer bounds constants are not valid. */
15664 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15665 return false;
15667 switch (GET_CODE (x))
15669 case CONST:
15670 x = XEXP (x, 0);
15672 if (GET_CODE (x) == PLUS)
15674 if (!CONST_INT_P (XEXP (x, 1)))
15675 return false;
15676 x = XEXP (x, 0);
15679 if (TARGET_MACHO && darwin_local_data_pic (x))
15680 return true;
15682 /* Only some unspecs are valid as "constants". */
15683 if (GET_CODE (x) == UNSPEC)
15684 switch (XINT (x, 1))
15686 case UNSPEC_GOT:
15687 case UNSPEC_GOTOFF:
15688 case UNSPEC_PLTOFF:
15689 return TARGET_64BIT;
15690 case UNSPEC_TPOFF:
15691 case UNSPEC_NTPOFF:
15692 x = XVECEXP (x, 0, 0);
15693 return (GET_CODE (x) == SYMBOL_REF
15694 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15695 case UNSPEC_DTPOFF:
15696 x = XVECEXP (x, 0, 0);
15697 return (GET_CODE (x) == SYMBOL_REF
15698 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15699 default:
15700 return false;
15703 /* We must have drilled down to a symbol. */
15704 if (GET_CODE (x) == LABEL_REF)
15705 return true;
15706 if (GET_CODE (x) != SYMBOL_REF)
15707 return false;
15708 /* FALLTHRU */
15710 case SYMBOL_REF:
15711 /* TLS symbols are never valid. */
15712 if (SYMBOL_REF_TLS_MODEL (x))
15713 return false;
15715 /* DLLIMPORT symbols are never valid. */
15716 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15717 && SYMBOL_REF_DLLIMPORT_P (x))
15718 return false;
15720 #if TARGET_MACHO
15721 /* mdynamic-no-pic */
15722 if (MACHO_DYNAMIC_NO_PIC_P)
15723 return machopic_symbol_defined_p (x);
15724 #endif
15726 /* External function address should be loaded
15727 via the GOT slot to avoid PLT. */
15728 if (ix86_force_load_from_GOT_p (x))
15729 return false;
15731 break;
15733 CASE_CONST_SCALAR_INT:
15734 switch (mode)
15736 case E_TImode:
15737 if (TARGET_64BIT)
15738 return true;
15739 /* FALLTHRU */
15740 case E_OImode:
15741 case E_XImode:
15742 if (!standard_sse_constant_p (x, mode))
15743 return false;
15744 default:
15745 break;
15747 break;
15749 case CONST_VECTOR:
15750 if (!standard_sse_constant_p (x, mode))
15751 return false;
15753 default:
15754 break;
15757 /* Otherwise we handle everything else in the move patterns. */
15758 return true;
15761 /* Determine if it's legal to put X into the constant pool. This
15762 is not possible for the address of thread-local symbols, which
15763 is checked above. */
15765 static bool
15766 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15768 /* We can put any immediate constant in memory. */
15769 switch (GET_CODE (x))
15771 CASE_CONST_ANY:
15772 return false;
15774 default:
15775 break;
15778 return !ix86_legitimate_constant_p (mode, x);
15781 /* Nonzero if the symbol is marked as dllimport, or as stub-variable,
15782 otherwise zero. */
15784 static bool
15785 is_imported_p (rtx x)
15787 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15788 || GET_CODE (x) != SYMBOL_REF)
15789 return false;
15791 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15795 /* Nonzero if the constant value X is a legitimate general operand
15796 when generating PIC code. It is given that flag_pic is on and
15797 that X satisfies CONSTANT_P. */
15799 bool
15800 legitimate_pic_operand_p (rtx x)
15802 rtx inner;
15804 switch (GET_CODE (x))
15806 case CONST:
15807 inner = XEXP (x, 0);
15808 if (GET_CODE (inner) == PLUS
15809 && CONST_INT_P (XEXP (inner, 1)))
15810 inner = XEXP (inner, 0);
15812 /* Only some unspecs are valid as "constants". */
15813 if (GET_CODE (inner) == UNSPEC)
15814 switch (XINT (inner, 1))
15816 case UNSPEC_GOT:
15817 case UNSPEC_GOTOFF:
15818 case UNSPEC_PLTOFF:
15819 return TARGET_64BIT;
15820 case UNSPEC_TPOFF:
15821 x = XVECEXP (inner, 0, 0);
15822 return (GET_CODE (x) == SYMBOL_REF
15823 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15824 case UNSPEC_MACHOPIC_OFFSET:
15825 return legitimate_pic_address_disp_p (x);
15826 default:
15827 return false;
15829 /* FALLTHRU */
15831 case SYMBOL_REF:
15832 case LABEL_REF:
15833 return legitimate_pic_address_disp_p (x);
15835 default:
15836 return true;
15840 /* Determine if a given CONST RTX is a valid memory displacement
15841 in PIC mode. */
15843 bool
15844 legitimate_pic_address_disp_p (rtx disp)
15846 bool saw_plus;
15848 /* In 64bit mode we can allow direct addresses of symbols and labels
15849 when they are not dynamic symbols. */
15850 if (TARGET_64BIT)
15852 rtx op0 = disp, op1;
15854 switch (GET_CODE (disp))
15856 case LABEL_REF:
15857 return true;
15859 case CONST:
15860 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15861 break;
15862 op0 = XEXP (XEXP (disp, 0), 0);
15863 op1 = XEXP (XEXP (disp, 0), 1);
15864 if (!CONST_INT_P (op1))
15865 break;
15866 if (GET_CODE (op0) == UNSPEC
15867 && (XINT (op0, 1) == UNSPEC_DTPOFF
15868 || XINT (op0, 1) == UNSPEC_NTPOFF)
15869 && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15870 return true;
15871 if (INTVAL (op1) >= 16*1024*1024
15872 || INTVAL (op1) < -16*1024*1024)
15873 break;
15874 if (GET_CODE (op0) == LABEL_REF)
15875 return true;
15876 if (GET_CODE (op0) == CONST
15877 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15878 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15879 return true;
15880 if (GET_CODE (op0) == UNSPEC
15881 && XINT (op0, 1) == UNSPEC_PCREL)
15882 return true;
15883 if (GET_CODE (op0) != SYMBOL_REF)
15884 break;
15885 /* FALLTHRU */
15887 case SYMBOL_REF:
15888 /* TLS references should always be enclosed in UNSPEC.
15889 	     A dllimported symbol always needs to be resolved.  */
15890 if (SYMBOL_REF_TLS_MODEL (op0)
15891 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15892 return false;
15894 if (TARGET_PECOFF)
15896 if (is_imported_p (op0))
15897 return true;
15899 if (SYMBOL_REF_FAR_ADDR_P (op0)
15900 || !SYMBOL_REF_LOCAL_P (op0))
15901 break;
15903 	      /* Function symbols need to be resolved only for
15904 	         the large model.
15905 		 For the small model we don't need to resolve anything
15906 		 here.  */
15907 if ((ix86_cmodel != CM_LARGE_PIC
15908 && SYMBOL_REF_FUNCTION_P (op0))
15909 || ix86_cmodel == CM_SMALL_PIC)
15910 return true;
15911 	      /* Non-external symbols don't need to be resolved for
15912 	         the large and medium models.  */
15913 if ((ix86_cmodel == CM_LARGE_PIC
15914 || ix86_cmodel == CM_MEDIUM_PIC)
15915 && !SYMBOL_REF_EXTERNAL_P (op0))
15916 return true;
15918 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15919 && (SYMBOL_REF_LOCAL_P (op0)
15920 || (HAVE_LD_PIE_COPYRELOC
15921 && flag_pie
15922 && !SYMBOL_REF_WEAK (op0)
15923 && !SYMBOL_REF_FUNCTION_P (op0)))
15924 && ix86_cmodel != CM_LARGE_PIC)
15925 return true;
15926 break;
15928 default:
15929 break;
15932 if (GET_CODE (disp) != CONST)
15933 return false;
15934 disp = XEXP (disp, 0);
15936 if (TARGET_64BIT)
15938       /* It is unsafe to allow PLUS expressions.  This limits the allowed
15939 	 distance of GOT references.  We should not need these anyway.  */
15940 if (GET_CODE (disp) != UNSPEC
15941 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15942 && XINT (disp, 1) != UNSPEC_GOTOFF
15943 && XINT (disp, 1) != UNSPEC_PCREL
15944 && XINT (disp, 1) != UNSPEC_PLTOFF))
15945 return false;
15947 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15948 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15949 return false;
15950 return true;
15953 saw_plus = false;
15954 if (GET_CODE (disp) == PLUS)
15956 if (!CONST_INT_P (XEXP (disp, 1)))
15957 return false;
15958 disp = XEXP (disp, 0);
15959 saw_plus = true;
15962 if (TARGET_MACHO && darwin_local_data_pic (disp))
15963 return true;
15965 if (GET_CODE (disp) != UNSPEC)
15966 return false;
15968 switch (XINT (disp, 1))
15970 case UNSPEC_GOT:
15971 if (saw_plus)
15972 return false;
15973 /* We need to check for both symbols and labels because VxWorks loads
15974 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15975 details. */
15976 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15977 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15978 case UNSPEC_GOTOFF:
15979 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15980 	 While the ABI also specifies a 32bit relocation, we don't produce it in
15981 	 the small PIC model at all.  */
15982 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15983 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15984 && !TARGET_64BIT)
15985 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15986 return false;
15987 case UNSPEC_GOTTPOFF:
15988 case UNSPEC_GOTNTPOFF:
15989 case UNSPEC_INDNTPOFF:
15990 if (saw_plus)
15991 return false;
15992 disp = XVECEXP (disp, 0, 0);
15993 return (GET_CODE (disp) == SYMBOL_REF
15994 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15995 case UNSPEC_NTPOFF:
15996 disp = XVECEXP (disp, 0, 0);
15997 return (GET_CODE (disp) == SYMBOL_REF
15998 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15999 case UNSPEC_DTPOFF:
16000 disp = XVECEXP (disp, 0, 0);
16001 return (GET_CODE (disp) == SYMBOL_REF
16002 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
16005 return false;
16008 /* Determine if op is a suitable RTX for an address register.
16009    Return the naked register if a register or a register subreg is
16010 found, otherwise return NULL_RTX. */
16012 static rtx
16013 ix86_validate_address_register (rtx op)
16015 machine_mode mode = GET_MODE (op);
16017 /* Only SImode or DImode registers can form the address. */
16018 if (mode != SImode && mode != DImode)
16019 return NULL_RTX;
16021 if (REG_P (op))
16022 return op;
16023 else if (SUBREG_P (op))
16025 rtx reg = SUBREG_REG (op);
16027 if (!REG_P (reg))
16028 return NULL_RTX;
16030 mode = GET_MODE (reg);
16032 /* Don't allow SUBREGs that span more than a word. It can
16033 lead to spill failures when the register is one word out
16034 of a two word structure. */
16035 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
16036 return NULL_RTX;
16038 /* Allow only SUBREGs of non-eliminable hard registers. */
16039 if (register_no_elim_operand (reg, mode))
16040 return reg;
16043 /* Op is not a register. */
16044 return NULL_RTX;
16047 /* Recognizes RTL expressions that are valid memory addresses for an
16048 instruction. The MODE argument is the machine mode for the MEM
16049 expression that wants to use this address.
16051    It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
16052 convert common non-canonical forms to canonical form so that they will
16053 be recognized. */
16055 static bool
16056 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
16058 struct ix86_address parts;
16059 rtx base, index, disp;
16060 HOST_WIDE_INT scale;
16061 addr_space_t seg;
16063 if (ix86_decompose_address (addr, &parts) <= 0)
16064 /* Decomposition failed. */
16065 return false;
16067 base = parts.base;
16068 index = parts.index;
16069 disp = parts.disp;
16070 scale = parts.scale;
16071 seg = parts.seg;
16073 /* Validate base register. */
16074 if (base)
16076 rtx reg = ix86_validate_address_register (base);
16078 if (reg == NULL_RTX)
16079 return false;
16081 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
16082 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
16083 /* Base is not valid. */
16084 return false;
16087 /* Validate index register. */
16088 if (index)
16090 rtx reg = ix86_validate_address_register (index);
16092 if (reg == NULL_RTX)
16093 return false;
16095 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
16096 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
16097 /* Index is not valid. */
16098 return false;
16101 /* Index and base should have the same mode. */
16102 if (base && index
16103 && GET_MODE (base) != GET_MODE (index))
16104 return false;
16106 /* Address override works only on the (%reg) part of %fs:(%reg). */
16107 if (seg != ADDR_SPACE_GENERIC
16108 && ((base && GET_MODE (base) != word_mode)
16109 || (index && GET_MODE (index) != word_mode)))
16110 return false;
16112 /* Validate scale factor. */
16113 if (scale != 1)
16115 if (!index)
16116 /* Scale without index. */
16117 return false;
16119 if (scale != 2 && scale != 4 && scale != 8)
16120 /* Scale is not a valid multiplier. */
16121 return false;
16124 /* Validate displacement. */
16125 if (disp)
16127 if (GET_CODE (disp) == CONST
16128 && GET_CODE (XEXP (disp, 0)) == UNSPEC
16129 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
16130 switch (XINT (XEXP (disp, 0), 1))
16132 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
16133 	     when used.  While the ABI also specifies 32bit relocations, we
16134 	     don't produce them at all and use IP-relative addressing instead.
16135 	     Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
16136 	     should be loaded via the GOT.  */
16137 case UNSPEC_GOT:
16138 if (!TARGET_64BIT
16139 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16140 goto is_legitimate_pic;
16141 /* FALLTHRU */
16142 case UNSPEC_GOTOFF:
16143 gcc_assert (flag_pic);
16144 if (!TARGET_64BIT)
16145 goto is_legitimate_pic;
16147 /* 64bit address unspec. */
16148 return false;
16150 case UNSPEC_GOTPCREL:
16151 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16152 goto is_legitimate_pic;
16153 /* FALLTHRU */
16154 case UNSPEC_PCREL:
16155 gcc_assert (flag_pic);
16156 goto is_legitimate_pic;
16158 case UNSPEC_GOTTPOFF:
16159 case UNSPEC_GOTNTPOFF:
16160 case UNSPEC_INDNTPOFF:
16161 case UNSPEC_NTPOFF:
16162 case UNSPEC_DTPOFF:
16163 break;
16165 default:
16166 /* Invalid address unspec. */
16167 return false;
16170 else if (SYMBOLIC_CONST (disp)
16171 && (flag_pic
16172 || (TARGET_MACHO
16173 #if TARGET_MACHO
16174 && MACHOPIC_INDIRECT
16175 && !machopic_operand_p (disp)
16176 #endif
16180 is_legitimate_pic:
16181 if (TARGET_64BIT && (index || base))
16183 /* foo@dtpoff(%rX) is ok. */
16184 if (GET_CODE (disp) != CONST
16185 || GET_CODE (XEXP (disp, 0)) != PLUS
16186 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16187 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16188 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16189 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16190 /* Non-constant pic memory reference. */
16191 return false;
16193 else if ((!TARGET_MACHO || flag_pic)
16194 && ! legitimate_pic_address_disp_p (disp))
16195 /* Displacement is an invalid pic construct. */
16196 return false;
16197 #if TARGET_MACHO
16198 else if (MACHO_DYNAMIC_NO_PIC_P
16199 && !ix86_legitimate_constant_p (Pmode, disp))
16200 	       /* displacement must be referenced via non_lazy_pointer */
16201 return false;
16202 #endif
16204 /* This code used to verify that a symbolic pic displacement
16205 includes the pic_offset_table_rtx register.
16207 	 While this is a good idea, unfortunately these constructs may
16208 	 be created by the "adds using lea" optimization for incorrect
16209 code like:
16211 int a;
16212 int foo(int i)
16214 return *(&a+i);
16217 	 This code is nonsensical, but results in addressing the
16218 	 GOT table with a pic_offset_table_rtx base.  We can't
16219 	 just refuse it easily, since it gets matched by the
16220 	 "addsi3" pattern, which later gets split to lea in case
16221 	 the output register differs from the input.  While this
16222 	 could be handled by a separate addsi pattern for this case
16223 	 that never results in lea, disabling this test seems to be
16224 	 the easier and correct fix for the crash.  */
16226 else if (GET_CODE (disp) != LABEL_REF
16227 && !CONST_INT_P (disp)
16228 && (GET_CODE (disp) != CONST
16229 || !ix86_legitimate_constant_p (Pmode, disp))
16230 && (GET_CODE (disp) != SYMBOL_REF
16231 || !ix86_legitimate_constant_p (Pmode, disp)))
16232 /* Displacement is not constant. */
16233 return false;
16234 else if (TARGET_64BIT
16235 && !x86_64_immediate_operand (disp, VOIDmode))
16236 /* Displacement is out of range. */
16237 return false;
16238 /* In x32 mode, constant addresses are sign extended to 64bit, so
16239 we have to prevent addresses from 0x80000000 to 0xffffffff. */
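      /* For example, the constant address 0x80000000 would be sign extended
	 to 0xffffffff80000000 by the 64-bit address arithmetic and reference
	 the wrong location, hence the rejection below.  */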
16240 else if (TARGET_X32 && !(index || base)
16241 && CONST_INT_P (disp)
16242 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16243 return false;
16246 /* Everything looks valid. */
16247 return true;
16250 /* Determine if a given RTX is a valid constant address. */
16252 bool
16253 constant_address_p (rtx x)
16255 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16258 /* Return a unique alias set for the GOT. */
16260 static alias_set_type
16261 ix86_GOT_alias_set (void)
16263 static alias_set_type set = -1;
16264 if (set == -1)
16265 set = new_alias_set ();
16266 return set;
16269 /* Return a legitimate reference for ORIG (an address) using the
16270 register REG. If REG is 0, a new pseudo is generated.
16272 There are two types of references that must be handled:
16274 1. Global data references must load the address from the GOT, via
16275 the PIC reg. An insn is emitted to do this load, and the reg is
16276 returned.
16278 2. Static data references, constant pool addresses, and code labels
16279 compute the address as an offset from the GOT, whose base is in
16280 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16281 differentiate them from global data objects. The returned
16282 address is the PIC reg + an unspec constant.
16284 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16285 reg also appears in the address. */
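/* Illustrative 32-bit PIC code for the two cases, with %ebx holding the
   PIC register:

       movl	foo@GOT(%ebx), %eax	# case 1: load address of global data
       leal	bar@GOTOFF(%ebx), %eax	# case 2: local/static data

   On 64-bit targets the GOT slot is instead addressed RIP-relatively,
   e.g. foo@GOTPCREL(%rip).  */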
16287 static rtx
16288 legitimize_pic_address (rtx orig, rtx reg)
16290 rtx addr = orig;
16291 rtx new_rtx = orig;
16293 #if TARGET_MACHO
16294 if (TARGET_MACHO && !TARGET_64BIT)
16296 if (reg == 0)
16297 reg = gen_reg_rtx (Pmode);
16298 /* Use the generic Mach-O PIC machinery. */
16299 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16301 #endif
16303 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16305 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16306 if (tmp)
16307 return tmp;
16310 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16311 new_rtx = addr;
16312 else if ((!TARGET_64BIT
16313 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16314 && !TARGET_PECOFF
16315 && gotoff_operand (addr, Pmode))
16317 /* This symbol may be referenced via a displacement
16318 from the PIC base address (@GOTOFF). */
16319 if (GET_CODE (addr) == CONST)
16320 addr = XEXP (addr, 0);
16322 if (GET_CODE (addr) == PLUS)
16324 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16325 UNSPEC_GOTOFF);
16326 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16328 else
16329 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
16331 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16333 if (TARGET_64BIT)
16334 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16336 if (reg != 0)
16338 gcc_assert (REG_P (reg));
16339 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
16340 new_rtx, reg, 1, OPTAB_DIRECT);
16342 else
16343 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16345 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
16346 /* We can't use @GOTOFF for text labels
16347 on VxWorks, see gotoff_operand. */
16348 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
16350 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16351 if (tmp)
16352 return tmp;
16354 /* For x64 PE-COFF there is no GOT table,
16355 	 so we use the address directly.  */
16356 if (TARGET_64BIT && TARGET_PECOFF)
16358 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
16359 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16361 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
16363 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
16364 UNSPEC_GOTPCREL);
16365 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16366 new_rtx = gen_const_mem (Pmode, new_rtx);
16367 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16369 else
16371 /* This symbol must be referenced via a load
16372 from the Global Offset Table (@GOT). */
16373 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
16374 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16375 if (TARGET_64BIT)
16376 new_rtx = force_reg (Pmode, new_rtx);
16377 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16378 new_rtx = gen_const_mem (Pmode, new_rtx);
16379 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16382 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16384 else
16386 if (CONST_INT_P (addr)
16387 && !x86_64_immediate_operand (addr, VOIDmode))
16388 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16389 else if (GET_CODE (addr) == CONST)
16391 addr = XEXP (addr, 0);
16393 /* We must match stuff we generate before. Assume the only
16394 unspecs that can get here are ours. Not that we could do
16395 anything with them anyway.... */
16396 if (GET_CODE (addr) == UNSPEC
16397 || (GET_CODE (addr) == PLUS
16398 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16399 return orig;
16400 gcc_assert (GET_CODE (addr) == PLUS);
16403 if (GET_CODE (addr) == PLUS)
16405 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16407 /* Check first to see if this is a constant
16408 offset from a @GOTOFF symbol reference. */
16409 if (!TARGET_PECOFF
16410 && gotoff_operand (op0, Pmode)
16411 && CONST_INT_P (op1))
16413 if (!TARGET_64BIT)
16415 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16416 UNSPEC_GOTOFF);
16417 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16418 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16420 if (reg != 0)
16422 gcc_assert (REG_P (reg));
16423 new_rtx = expand_simple_binop (Pmode, PLUS,
16424 pic_offset_table_rtx,
16425 new_rtx, reg, 1,
16426 OPTAB_DIRECT);
16428 else
16429 new_rtx
16430 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16432 else
16434 if (INTVAL (op1) < -16*1024*1024
16435 || INTVAL (op1) >= 16*1024*1024)
16437 if (!x86_64_immediate_operand (op1, Pmode))
16438 op1 = force_reg (Pmode, op1);
16440 new_rtx
16441 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16445 else
16447 rtx base = legitimize_pic_address (op0, reg);
16448 machine_mode mode = GET_MODE (base);
16449 new_rtx
16450 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16452 if (CONST_INT_P (new_rtx))
16454 if (INTVAL (new_rtx) < -16*1024*1024
16455 || INTVAL (new_rtx) >= 16*1024*1024)
16457 if (!x86_64_immediate_operand (new_rtx, mode))
16458 new_rtx = force_reg (mode, new_rtx);
16460 new_rtx
16461 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16463 else
16464 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16466 else
16468 /* For %rip addressing, we have to use
16469 just disp32, not base nor index. */
16470 if (TARGET_64BIT
16471 && (GET_CODE (base) == SYMBOL_REF
16472 || GET_CODE (base) == LABEL_REF))
16473 base = force_reg (mode, base);
16474 if (GET_CODE (new_rtx) == PLUS
16475 && CONSTANT_P (XEXP (new_rtx, 1)))
16477 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16478 new_rtx = XEXP (new_rtx, 1);
16480 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16485 return new_rtx;
16488 /* Load the thread pointer. If TO_REG is true, force it into a register. */
16490 static rtx
16491 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16493 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16495 if (GET_MODE (tp) != tp_mode)
16497 gcc_assert (GET_MODE (tp) == SImode);
16498 gcc_assert (tp_mode == DImode);
16500 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16503 if (to_reg)
16504 tp = copy_to_mode_reg (tp_mode, tp);
16506 return tp;
16509 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16511 static GTY(()) rtx ix86_tls_symbol;
16513 static rtx
16514 ix86_tls_get_addr (void)
16516 if (!ix86_tls_symbol)
16518 const char *sym
16519 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16520 ? "___tls_get_addr" : "__tls_get_addr");
16522 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16525 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16527 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16528 UNSPEC_PLTOFF);
16529 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16530 gen_rtx_CONST (Pmode, unspec));
16533 return ix86_tls_symbol;
16536 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16538 static GTY(()) rtx ix86_tls_module_base_symbol;
rtx
16541 ix86_tls_module_base (void)
16543 if (!ix86_tls_module_base_symbol)
16545 ix86_tls_module_base_symbol
16546 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16548 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16549 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16552 return ix86_tls_module_base_symbol;
16555 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16556 false if we expect this to be used for a memory address and true if
16557 we expect to load the address into a register. */
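/* As a rough orientation, the 64-bit global-dynamic access expanded below
   amounts to the usual sequence

       leaq	x@tlsgd(%rip), %rdi
       call	__tls_get_addr@PLT	# result returned in %rax

   whereas the initial-exec and local-exec models avoid the call and address
   the variable relative to the thread pointer, via a GOT-loaded or a
   link-time-constant offset respectively.  */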
16559 static rtx
16560 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16562 rtx dest, base, off;
16563 rtx pic = NULL_RTX, tp = NULL_RTX;
16564 machine_mode tp_mode = Pmode;
16565 int type;
16567   /* Fall back to the global dynamic model if the toolchain cannot support
16568      local dynamic.  */
16569 if (TARGET_SUN_TLS && !TARGET_64BIT
16570 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16571 && model == TLS_MODEL_LOCAL_DYNAMIC)
16572 model = TLS_MODEL_GLOBAL_DYNAMIC;
16574 switch (model)
16576 case TLS_MODEL_GLOBAL_DYNAMIC:
16577 dest = gen_reg_rtx (Pmode);
16579 if (!TARGET_64BIT)
16581 if (flag_pic && !TARGET_PECOFF)
16582 pic = pic_offset_table_rtx;
16583 else
16585 pic = gen_reg_rtx (Pmode);
16586 emit_insn (gen_set_got (pic));
16590 if (TARGET_GNU2_TLS)
16592 if (TARGET_64BIT)
16593 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16594 else
16595 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16597 tp = get_thread_pointer (Pmode, true);
16598 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16600 if (GET_MODE (x) != Pmode)
16601 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16603 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16605 else
16607 rtx caddr = ix86_tls_get_addr ();
16609 if (TARGET_64BIT)
16611 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16612 rtx_insn *insns;
16614 start_sequence ();
16615 emit_call_insn
16616 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16617 insns = get_insns ();
16618 end_sequence ();
16620 if (GET_MODE (x) != Pmode)
16621 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16623 RTL_CONST_CALL_P (insns) = 1;
16624 emit_libcall_block (insns, dest, rax, x);
16626 else
16627 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16629 break;
16631 case TLS_MODEL_LOCAL_DYNAMIC:
16632 base = gen_reg_rtx (Pmode);
16634 if (!TARGET_64BIT)
16636 if (flag_pic)
16637 pic = pic_offset_table_rtx;
16638 else
16640 pic = gen_reg_rtx (Pmode);
16641 emit_insn (gen_set_got (pic));
16645 if (TARGET_GNU2_TLS)
16647 rtx tmp = ix86_tls_module_base ();
16649 if (TARGET_64BIT)
16650 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16651 else
16652 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16654 tp = get_thread_pointer (Pmode, true);
16655 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16656 gen_rtx_MINUS (Pmode, tmp, tp));
16658 else
16660 rtx caddr = ix86_tls_get_addr ();
16662 if (TARGET_64BIT)
16664 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16665 rtx_insn *insns;
16666 rtx eqv;
16668 start_sequence ();
16669 emit_call_insn
16670 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16671 insns = get_insns ();
16672 end_sequence ();
16674 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16675 share the LD_BASE result with other LD model accesses. */
16676 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16677 UNSPEC_TLS_LD_BASE);
16679 RTL_CONST_CALL_P (insns) = 1;
16680 emit_libcall_block (insns, base, rax, eqv);
16682 else
16683 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16686 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16687 off = gen_rtx_CONST (Pmode, off);
16689 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16691 if (TARGET_GNU2_TLS)
16693 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16695 if (GET_MODE (x) != Pmode)
16696 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16698 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16700 break;
16702 case TLS_MODEL_INITIAL_EXEC:
16703 if (TARGET_64BIT)
16705 if (TARGET_SUN_TLS && !TARGET_X32)
16707 /* The Sun linker took the AMD64 TLS spec literally
16708 and can only handle %rax as the destination of the
16709 initial-exec code sequence. */
16711 dest = gen_reg_rtx (DImode);
16712 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16713 return dest;
16716 /* Generate DImode references to avoid %fs:(%reg32)
16717 problems and the linker IE->LE relaxation bug. */
16718 tp_mode = DImode;
16719 pic = NULL;
16720 type = UNSPEC_GOTNTPOFF;
16722 else if (flag_pic)
16724 pic = pic_offset_table_rtx;
16725 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16727 else if (!TARGET_ANY_GNU_TLS)
16729 pic = gen_reg_rtx (Pmode);
16730 emit_insn (gen_set_got (pic));
16731 type = UNSPEC_GOTTPOFF;
16733 else
16735 pic = NULL;
16736 type = UNSPEC_INDNTPOFF;
16739 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16740 off = gen_rtx_CONST (tp_mode, off);
16741 if (pic)
16742 off = gen_rtx_PLUS (tp_mode, pic, off);
16743 off = gen_const_mem (tp_mode, off);
16744 set_mem_alias_set (off, ix86_GOT_alias_set ());
16746 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16748 base = get_thread_pointer (tp_mode,
16749 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16750 off = force_reg (tp_mode, off);
16751 dest = gen_rtx_PLUS (tp_mode, base, off);
16752 if (tp_mode != Pmode)
16753 dest = convert_to_mode (Pmode, dest, 1);
16755 else
16757 base = get_thread_pointer (Pmode, true);
16758 dest = gen_reg_rtx (Pmode);
16759 emit_insn (ix86_gen_sub3 (dest, base, off));
16761 break;
16763 case TLS_MODEL_LOCAL_EXEC:
16764 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16765 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16766 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16767 off = gen_rtx_CONST (Pmode, off);
16769 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16771 base = get_thread_pointer (Pmode,
16772 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16773 return gen_rtx_PLUS (Pmode, base, off);
16775 else
16777 base = get_thread_pointer (Pmode, true);
16778 dest = gen_reg_rtx (Pmode);
16779 emit_insn (ix86_gen_sub3 (dest, base, off));
16781 break;
16783 default:
16784 gcc_unreachable ();
16787 return dest;
16790 /* Return true if OP refers to a TLS address. */
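/* Concretely (an illustrative shape, mirroring the walk below): we look
   for a MEM whose address is a PLUS chain containing a possibly
   zero-extended (unspec [(const_int 0)] UNSPEC_TP) term, as built by
   get_thread_pointer above.  */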
16791 bool
16792 ix86_tls_address_pattern_p (rtx op)
16794 subrtx_var_iterator::array_type array;
16795 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16797 rtx op = *iter;
16798 if (MEM_P (op))
16800 rtx *x = &XEXP (op, 0);
16801 while (GET_CODE (*x) == PLUS)
16803 int i;
16804 for (i = 0; i < 2; i++)
16806 rtx u = XEXP (*x, i);
16807 if (GET_CODE (u) == ZERO_EXTEND)
16808 u = XEXP (u, 0);
16809 if (GET_CODE (u) == UNSPEC
16810 && XINT (u, 1) == UNSPEC_TP)
16811 return true;
16813 x = &XEXP (*x, 0);
16816 iter.skip_subrtxes ();
16820 return false;
16823 /* Rewrite *LOC so that it refers to a default TLS address space. */
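/* Sketch of the transformation performed below: the UNSPEC_TP term is
   dropped from the address and the containing MEM is moved into the
   DEFAULT_TLS_SEG_REG address space, so the thread pointer is expressed
   through a segment override (%fs or %gs, depending on the target)
   instead of an explicit add.  */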
16824 void
16825 ix86_rewrite_tls_address_1 (rtx *loc)
16827 subrtx_ptr_iterator::array_type array;
16828 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16830 rtx *loc = *iter;
16831 if (MEM_P (*loc))
16833 rtx addr = XEXP (*loc, 0);
16834 rtx *x = &addr;
16835 while (GET_CODE (*x) == PLUS)
16837 int i;
16838 for (i = 0; i < 2; i++)
16840 rtx u = XEXP (*x, i);
16841 if (GET_CODE (u) == ZERO_EXTEND)
16842 u = XEXP (u, 0);
16843 if (GET_CODE (u) == UNSPEC
16844 && XINT (u, 1) == UNSPEC_TP)
16846 addr_space_t as = DEFAULT_TLS_SEG_REG;
16848 *x = XEXP (*x, 1 - i);
16850 *loc = replace_equiv_address_nv (*loc, addr, true);
16851 set_mem_addr_space (*loc, as);
16852 return;
16855 x = &XEXP (*x, 0);
16858 iter.skip_subrtxes ();
16863 /* Rewrite an instruction pattern involving a TLS address
16864 so that it refers to the default TLS address space. */
16866 ix86_rewrite_tls_address (rtx pattern)
16868 pattern = copy_insn (pattern);
16869 ix86_rewrite_tls_address_1 (&pattern);
16870 return pattern;
16873 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16874 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16875 unique refptr-DECL symbol corresponding to symbol DECL. */
16877 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16879 static inline hashval_t hash (tree_map *m) { return m->hash; }
16880 static inline bool
16881 equal (tree_map *a, tree_map *b)
16883 return a->base.from == b->base.from;
16886 static int
16887 keep_cache_entry (tree_map *&m)
16889 return ggc_marked_p (m->base.from);
16893 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16895 static tree
16896 get_dllimport_decl (tree decl, bool beimport)
16898 struct tree_map *h, in;
16899 const char *name;
16900 const char *prefix;
16901 size_t namelen, prefixlen;
16902 char *imp_name;
16903 tree to;
16904 rtx rtl;
16906 if (!dllimport_map)
16907 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16909 in.hash = htab_hash_pointer (decl);
16910 in.base.from = decl;
16911 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16912 h = *loc;
16913 if (h)
16914 return h->to;
16916 *loc = h = ggc_alloc<tree_map> ();
16917 h->hash = in.hash;
16918 h->base.from = decl;
16919 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16920 VAR_DECL, NULL, ptr_type_node);
16921 DECL_ARTIFICIAL (to) = 1;
16922 DECL_IGNORED_P (to) = 1;
16923 DECL_EXTERNAL (to) = 1;
16924 TREE_READONLY (to) = 1;
16926 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16927 name = targetm.strip_name_encoding (name);
16928 if (beimport)
16929 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16930 ? "*__imp_" : "*__imp__";
16931 else
16932 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16933 namelen = strlen (name);
16934 prefixlen = strlen (prefix);
16935 imp_name = (char *) alloca (namelen + prefixlen + 1);
16936 memcpy (imp_name, prefix, prefixlen);
16937 memcpy (imp_name + prefixlen, name, namelen + 1);
16939 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16940 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16941 SET_SYMBOL_REF_DECL (rtl, to);
16942 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16943 if (!beimport)
16945 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16946 #ifdef SUB_TARGET_RECORD_STUB
16947 SUB_TARGET_RECORD_STUB (name);
16948 #endif
16951 rtl = gen_const_mem (Pmode, rtl);
16952 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16954 SET_DECL_RTL (to, rtl);
16955 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16957 return to;
16960 /* Expand SYMBOL into its corresponding far-address symbol.
16961 WANT_REG is true if we require the result be a register. */
16963 static rtx
16964 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16966 tree imp_decl;
16967 rtx x;
16969 gcc_assert (SYMBOL_REF_DECL (symbol));
16970 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16972 x = DECL_RTL (imp_decl);
16973 if (want_reg)
16974 x = force_reg (Pmode, x);
16975 return x;
16978 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16979 true if we require the result be a register. */
16981 static rtx
16982 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16984 tree imp_decl;
16985 rtx x;
16987 gcc_assert (SYMBOL_REF_DECL (symbol));
16988 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16990 x = DECL_RTL (imp_decl);
16991 if (want_reg)
16992 x = force_reg (Pmode, x);
16993 return x;
16996 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
16997 is true if we require the result to be a register. */
16999 static rtx
17000 legitimize_pe_coff_symbol (rtx addr, bool inreg)
17002 if (!TARGET_PECOFF)
17003 return NULL_RTX;
17005 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17007 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
17008 return legitimize_dllimport_symbol (addr, inreg);
17009 if (GET_CODE (addr) == CONST
17010 && GET_CODE (XEXP (addr, 0)) == PLUS
17011 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17012 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
17014 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
17015 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17019 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
17020 return NULL_RTX;
17021 if (GET_CODE (addr) == SYMBOL_REF
17022 && !is_imported_p (addr)
17023 && SYMBOL_REF_EXTERNAL_P (addr)
17024 && SYMBOL_REF_DECL (addr))
17025 return legitimize_pe_coff_extern_decl (addr, inreg);
17027 if (GET_CODE (addr) == CONST
17028 && GET_CODE (XEXP (addr, 0)) == PLUS
17029 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17030 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
17031 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
17032 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
17034 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
17035 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17037 return NULL_RTX;
17040 /* Try machine-dependent ways of modifying an illegitimate address
17041 to be legitimate. If we find one, return the new, valid address.
17042 This macro is used in only one place: `memory_address' in explow.c.
17044 OLDX is the address as it was before break_out_memory_refs was called.
17045 In some cases it is useful to look at this to decide what needs to be done.
17047 It is always safe for this macro to do nothing. It exists to recognize
17048 opportunities to optimize the output.
17050 For the 80386, we handle X+REG by loading X into a register R and
17051 using R+REG. R will go in a general reg and indexing will be used.
17052 However, if REG is a broken-out memory address or multiplication,
17053 nothing needs to be done because REG can certainly go in a general reg.
17055 When -fpic is used, special handling is needed for symbolic references.
17056 See comments by legitimize_pic_address in i386.c for details. */
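/* For instance (an illustrative case of the canonicalization below), an
   address such as
   (plus (ashift (reg) (const_int 2)) (reg))
   is rewritten as
   (plus (mult (reg) (const_int 4)) (reg))
   so that it can match the base + index*scale addressing form.  */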
17058 static rtx
17059 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
17061 bool changed = false;
17062 unsigned log;
17064 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
17065 if (log)
17066 return legitimize_tls_address (x, (enum tls_model) log, false);
17067 if (GET_CODE (x) == CONST
17068 && GET_CODE (XEXP (x, 0)) == PLUS
17069 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
17070 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
17072 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
17073 (enum tls_model) log, false);
17074 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
17077 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17079 rtx tmp = legitimize_pe_coff_symbol (x, true);
17080 if (tmp)
17081 return tmp;
17084 if (flag_pic && SYMBOLIC_CONST (x))
17085 return legitimize_pic_address (x, 0);
17087 #if TARGET_MACHO
17088 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
17089 return machopic_indirect_data_reference (x, 0);
17090 #endif
17092 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17093 if (GET_CODE (x) == ASHIFT
17094 && CONST_INT_P (XEXP (x, 1))
17095 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
17097 changed = true;
17098 log = INTVAL (XEXP (x, 1));
17099 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
17100 GEN_INT (1 << log));
17103 if (GET_CODE (x) == PLUS)
17105 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17107 if (GET_CODE (XEXP (x, 0)) == ASHIFT
17108 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17109 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
17111 changed = true;
17112 log = INTVAL (XEXP (XEXP (x, 0), 1));
17113 XEXP (x, 0) = gen_rtx_MULT (Pmode,
17114 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
17115 GEN_INT (1 << log));
17118 if (GET_CODE (XEXP (x, 1)) == ASHIFT
17119 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
17120 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
17122 changed = true;
17123 log = INTVAL (XEXP (XEXP (x, 1), 1));
17124 XEXP (x, 1) = gen_rtx_MULT (Pmode,
17125 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
17126 GEN_INT (1 << log));
17129 /* Put multiply first if it isn't already. */
17130 if (GET_CODE (XEXP (x, 1)) == MULT)
17132 std::swap (XEXP (x, 0), XEXP (x, 1));
17133 changed = true;
17136 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
17137 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
17138 created by virtual register instantiation, register elimination, and
17139 similar optimizations. */
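/* An illustrative instance of the transformation described above:
   (plus (mult (reg A) (const_int 4)) (plus (reg B) (const_int 16)))
   becomes
   (plus (plus (mult (reg A) (const_int 4)) (reg B)) (const_int 16)).  */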
17140 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
17142 changed = true;
17143 x = gen_rtx_PLUS (Pmode,
17144 gen_rtx_PLUS (Pmode, XEXP (x, 0),
17145 XEXP (XEXP (x, 1), 0)),
17146 XEXP (XEXP (x, 1), 1));
17149 /* Canonicalize
17150 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
17151 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
17152 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
17153 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
17154 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
17155 && CONSTANT_P (XEXP (x, 1)))
17157 rtx constant;
17158 rtx other = NULL_RTX;
17160 if (CONST_INT_P (XEXP (x, 1)))
17162 constant = XEXP (x, 1);
17163 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17165 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17167 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17168 other = XEXP (x, 1);
17170 else
17171 constant = 0;
17173 if (constant)
17175 changed = true;
17176 x = gen_rtx_PLUS (Pmode,
17177 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17178 XEXP (XEXP (XEXP (x, 0), 1), 0)),
17179 plus_constant (Pmode, other,
17180 INTVAL (constant)));
17184 if (changed && ix86_legitimate_address_p (mode, x, false))
17185 return x;
17187 if (GET_CODE (XEXP (x, 0)) == MULT)
17189 changed = true;
17190 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17193 if (GET_CODE (XEXP (x, 1)) == MULT)
17195 changed = true;
17196 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17199 if (changed
17200 && REG_P (XEXP (x, 1))
17201 && REG_P (XEXP (x, 0)))
17202 return x;
17204 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17206 changed = true;
17207 x = legitimize_pic_address (x, 0);
17210 if (changed && ix86_legitimate_address_p (mode, x, false))
17211 return x;
17213 if (REG_P (XEXP (x, 0)))
17215 rtx temp = gen_reg_rtx (Pmode);
17216 rtx val = force_operand (XEXP (x, 1), temp);
17217 if (val != temp)
17219 val = convert_to_mode (Pmode, val, 1);
17220 emit_move_insn (temp, val);
17223 XEXP (x, 1) = temp;
17224 return x;
17227 else if (REG_P (XEXP (x, 1)))
17229 rtx temp = gen_reg_rtx (Pmode);
17230 rtx val = force_operand (XEXP (x, 0), temp);
17231 if (val != temp)
17233 val = convert_to_mode (Pmode, val, 1);
17234 emit_move_insn (temp, val);
17237 XEXP (x, 0) = temp;
17238 return x;
17242 return x;
17245 /* Print an integer constant expression in assembler syntax. Addition
17246 and subtraction are the only arithmetic that may appear in these
17247 expressions. FILE is the stdio stream to write to, X is the rtx, and
17248 CODE is the operand print code from the output string. */
17250 static void
17251 output_pic_addr_const (FILE *file, rtx x, int code)
17253 char buf[256];
17255 switch (GET_CODE (x))
17257 case PC:
17258 gcc_assert (flag_pic);
17259 putc ('.', file);
17260 break;
17262 case SYMBOL_REF:
17263 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17264 output_addr_const (file, x);
17265 else
17267 const char *name = XSTR (x, 0);
17269 /* Mark the decl as referenced so that cgraph will
17270 output the function. */
17271 if (SYMBOL_REF_DECL (x))
17272 mark_decl_referenced (SYMBOL_REF_DECL (x));
17274 #if TARGET_MACHO
17275 if (MACHOPIC_INDIRECT
17276 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17277 name = machopic_indirection_name (x, /*stub_p=*/true);
17278 #endif
17279 assemble_name (file, name);
17281 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17282 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17283 fputs ("@PLT", file);
17284 break;
17286 case LABEL_REF:
17287 x = XEXP (x, 0);
17288 /* FALLTHRU */
17289 case CODE_LABEL:
17290 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17291 assemble_name (asm_out_file, buf);
17292 break;
17294 case CONST_INT:
17295 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17296 break;
17298 case CONST:
17299 /* This used to output parentheses around the expression,
17300 but that does not work on the 386 (either ATT or BSD assembler). */
17301 output_pic_addr_const (file, XEXP (x, 0), code);
17302 break;
17304 case CONST_DOUBLE:
17305 /* We can't handle floating point constants;
17306 TARGET_PRINT_OPERAND must handle them. */
17307 output_operand_lossage ("floating constant misused");
17308 break;
17310 case PLUS:
17311 /* Some assemblers need integer constants to appear first. */
17312 if (CONST_INT_P (XEXP (x, 0)))
17314 output_pic_addr_const (file, XEXP (x, 0), code);
17315 putc ('+', file);
17316 output_pic_addr_const (file, XEXP (x, 1), code);
17318 else
17320 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17321 output_pic_addr_const (file, XEXP (x, 1), code);
17322 putc ('+', file);
17323 output_pic_addr_const (file, XEXP (x, 0), code);
17325 break;
17327 case MINUS:
17328 if (!TARGET_MACHO)
17329 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17330 output_pic_addr_const (file, XEXP (x, 0), code);
17331 putc ('-', file);
17332 output_pic_addr_const (file, XEXP (x, 1), code);
17333 if (!TARGET_MACHO)
17334 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17335 break;
17337 case UNSPEC:
17338 gcc_assert (XVECLEN (x, 0) == 1);
17339 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17340 switch (XINT (x, 1))
17342 case UNSPEC_GOT:
17343 fputs ("@GOT", file);
17344 break;
17345 case UNSPEC_GOTOFF:
17346 fputs ("@GOTOFF", file);
17347 break;
17348 case UNSPEC_PLTOFF:
17349 fputs ("@PLTOFF", file);
17350 break;
17351 case UNSPEC_PCREL:
17352 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17353 "(%rip)" : "[rip]", file);
17354 break;
17355 case UNSPEC_GOTPCREL:
17356 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17357 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17358 break;
17359 case UNSPEC_GOTTPOFF:
17360 /* FIXME: This might be @TPOFF in Sun ld too. */
17361 fputs ("@gottpoff", file);
17362 break;
17363 case UNSPEC_TPOFF:
17364 fputs ("@tpoff", file);
17365 break;
17366 case UNSPEC_NTPOFF:
17367 if (TARGET_64BIT)
17368 fputs ("@tpoff", file);
17369 else
17370 fputs ("@ntpoff", file);
17371 break;
17372 case UNSPEC_DTPOFF:
17373 fputs ("@dtpoff", file);
17374 break;
17375 case UNSPEC_GOTNTPOFF:
17376 if (TARGET_64BIT)
17377 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17378 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17379 else
17380 fputs ("@gotntpoff", file);
17381 break;
17382 case UNSPEC_INDNTPOFF:
17383 fputs ("@indntpoff", file);
17384 break;
17385 #if TARGET_MACHO
17386 case UNSPEC_MACHOPIC_OFFSET:
17387 putc ('-', file);
17388 machopic_output_function_base_name (file);
17389 break;
17390 #endif
17391 default:
17392 output_operand_lossage ("invalid UNSPEC as operand");
17393 break;
17395 break;
17397 default:
17398 output_operand_lossage ("invalid expression as operand");
17402 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17403 We need to emit DTP-relative relocations. */
17405 static void ATTRIBUTE_UNUSED
17406 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
17408 fputs (ASM_LONG, file);
17409 output_addr_const (file, x);
17410 fputs ("@dtpoff", file);
17411 switch (size)
17413 case 4:
17414 break;
17415 case 8:
17416 fputs (", 0", file);
17417 break;
17418 default:
17419 gcc_unreachable ();
17423 /* Return true if X is a representation of the PIC register. This copes
17424 with calls from ix86_find_base_term, where the register might have
17425 been replaced by a cselib value. */
17427 static bool
17428 ix86_pic_register_p (rtx x)
17430 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
17431 return (pic_offset_table_rtx
17432 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
17433 else if (!REG_P (x))
17434 return false;
17435 else if (pic_offset_table_rtx)
17437 if (REGNO (x) == REGNO (pic_offset_table_rtx))
17438 return true;
17439 if (HARD_REGISTER_P (x)
17440 && !HARD_REGISTER_P (pic_offset_table_rtx)
17441 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
17442 return true;
17443 return false;
17445 else
17446 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
17449 /* Helper function for ix86_delegitimize_address.
17450 Attempt to delegitimize TLS local-exec accesses. */
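/* Roughly (an illustrative sketch of the code below): a direct
   segment-based access such as %fs:X@tpoff, i.e. a MEM in the TLS
   address space whose displacement wraps (unspec [X] UNSPEC_NTPOFF), is
   rewritten back into a plain reference built around the SYMBOL_REF X,
   with any base and scaled index re-applied.  */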
17452 static rtx
17453 ix86_delegitimize_tls_address (rtx orig_x)
17455 rtx x = orig_x, unspec;
17456 struct ix86_address addr;
17458 if (!TARGET_TLS_DIRECT_SEG_REFS)
17459 return orig_x;
17460 if (MEM_P (x))
17461 x = XEXP (x, 0);
17462 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17463 return orig_x;
17464 if (ix86_decompose_address (x, &addr) == 0
17465 || addr.seg != DEFAULT_TLS_SEG_REG
17466 || addr.disp == NULL_RTX
17467 || GET_CODE (addr.disp) != CONST)
17468 return orig_x;
17469 unspec = XEXP (addr.disp, 0);
17470 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17471 unspec = XEXP (unspec, 0);
17472 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17473 return orig_x;
17474 x = XVECEXP (unspec, 0, 0);
17475 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17476 if (unspec != XEXP (addr.disp, 0))
17477 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17478 if (addr.index)
17480 rtx idx = addr.index;
17481 if (addr.scale != 1)
17482 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17483 x = gen_rtx_PLUS (Pmode, idx, x);
17485 if (addr.base)
17486 x = gen_rtx_PLUS (Pmode, addr.base, x);
17487 if (MEM_P (orig_x))
17488 x = replace_equiv_address_nv (orig_x, x);
17489 return x;
17492 /* In the name of slightly smaller debug output, and to cater to
17493 general assembler lossage, recognize PIC+GOTOFF and turn it back
17494 into a direct symbol reference.
17496 On Darwin, this is necessary to avoid a crash, because Darwin
17497 has a different PIC label for each routine but the DWARF debugging
17498 information is not associated with any particular routine, so it's
17499 necessary to remove references to the PIC label from RTL stored by
17500 the DWARF output code.
17502 This helper is used in the normal ix86_delegitimize_address
17503 entrypoint (e.g. used in the target delegitimization hook) and
17504 in ix86_find_base_term. As compile time memory optimization, we
17505 avoid allocating rtxes that will not change anything on the outcome
17506 of the callers (find_base_value and find_base_term). */
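/* A typical case handled below (illustrative): for -m32 PIC,
   (plus pic_offset_table_rtx (const (unspec [foo] UNSPEC_GOTOFF)))
   is turned back into the bare SYMBOL_REF foo, with any register and
   constant addends re-applied around the result.  */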
17508 static inline rtx
17509 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
17511 rtx orig_x = delegitimize_mem_from_attrs (x);
17512 /* addend is NULL or some rtx if x is something+GOTOFF where
17513 something doesn't include the PIC register. */
17514 rtx addend = NULL_RTX;
17515 /* reg_addend is NULL or a multiple of some register. */
17516 rtx reg_addend = NULL_RTX;
17517 /* const_addend is NULL or a const_int. */
17518 rtx const_addend = NULL_RTX;
17519 /* This is the result, or NULL. */
17520 rtx result = NULL_RTX;
17522 x = orig_x;
17524 if (MEM_P (x))
17525 x = XEXP (x, 0);
17527 if (TARGET_64BIT)
17529 if (GET_CODE (x) == CONST
17530 && GET_CODE (XEXP (x, 0)) == PLUS
17531 && GET_MODE (XEXP (x, 0)) == Pmode
17532 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17533 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17534 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17536 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
17537 base. A CONST can't be arg_pointer_rtx based. */
17538 if (base_term_p && MEM_P (orig_x))
17539 return orig_x;
17540 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17541 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17542 if (MEM_P (orig_x))
17543 x = replace_equiv_address_nv (orig_x, x);
17544 return x;
17547 if (GET_CODE (x) == CONST
17548 && GET_CODE (XEXP (x, 0)) == UNSPEC
17549 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17550 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17551 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17553 x = XVECEXP (XEXP (x, 0), 0, 0);
17554 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17556 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17557 if (x == NULL_RTX)
17558 return orig_x;
17560 return x;
17563 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17564 return ix86_delegitimize_tls_address (orig_x);
17566 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17567 and -mcmodel=medium -fpic. */
17570 if (GET_CODE (x) != PLUS
17571 || GET_CODE (XEXP (x, 1)) != CONST)
17572 return ix86_delegitimize_tls_address (orig_x);
17574 if (ix86_pic_register_p (XEXP (x, 0)))
17575 /* %ebx + GOT/GOTOFF */
17577 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17579 /* %ebx + %reg * scale + GOT/GOTOFF */
17580 reg_addend = XEXP (x, 0);
17581 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17582 reg_addend = XEXP (reg_addend, 1);
17583 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17584 reg_addend = XEXP (reg_addend, 0);
17585 else
17587 reg_addend = NULL_RTX;
17588 addend = XEXP (x, 0);
17591 else
17592 addend = XEXP (x, 0);
17594 x = XEXP (XEXP (x, 1), 0);
17595 if (GET_CODE (x) == PLUS
17596 && CONST_INT_P (XEXP (x, 1)))
17598 const_addend = XEXP (x, 1);
17599 x = XEXP (x, 0);
17602 if (GET_CODE (x) == UNSPEC
17603 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17604 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17605 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17606 && !MEM_P (orig_x) && !addend)))
17607 result = XVECEXP (x, 0, 0);
17609 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17610 && !MEM_P (orig_x))
17611 result = XVECEXP (x, 0, 0);
17613 if (! result)
17614 return ix86_delegitimize_tls_address (orig_x);
17616 /* For (PLUS something CONST_INT) both find_base_{value,term} just
17617 recurse on the first operand. */
17618 if (const_addend && !base_term_p)
17619 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17620 if (reg_addend)
17621 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17622 if (addend)
17624 /* If the rest of original X doesn't involve the PIC register, add
17625 addend and subtract pic_offset_table_rtx. This can happen e.g.
17626 for code like:
17627 leal (%ebx, %ecx, 4), %ecx
17629 movl foo@GOTOFF(%ecx), %edx
17630 in which case we return (%ecx - %ebx) + foo
17631 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17632 and reload has completed. Don't do the latter for debug,
17633 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
17634 if (pic_offset_table_rtx
17635 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17636 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17637 pic_offset_table_rtx),
17638 result);
17639 else if (base_term_p
17640 && pic_offset_table_rtx
17641 && !TARGET_MACHO
17642 && !TARGET_VXWORKS_RTP)
17644 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17645 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17646 result = gen_rtx_PLUS (Pmode, tmp, result);
17648 else
17649 return orig_x;
17651 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17653 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17654 if (result == NULL_RTX)
17655 return orig_x;
17657 return result;
17660 /* The normal (non-base-term) entry point wrapping the helper above. */
17662 static rtx
17663 ix86_delegitimize_address (rtx x)
17665 return ix86_delegitimize_address_1 (x, false);
17668 /* If X is a machine specific address (i.e. a symbol or label being
17669 referenced as a displacement from the GOT implemented using an
17670 UNSPEC), then return the base term. Otherwise return X. */
17673 ix86_find_base_term (rtx x)
17675 rtx term;
17677 if (TARGET_64BIT)
17679 if (GET_CODE (x) != CONST)
17680 return x;
17681 term = XEXP (x, 0);
17682 if (GET_CODE (term) == PLUS
17683 && CONST_INT_P (XEXP (term, 1)))
17684 term = XEXP (term, 0);
17685 if (GET_CODE (term) != UNSPEC
17686 || (XINT (term, 1) != UNSPEC_GOTPCREL
17687 && XINT (term, 1) != UNSPEC_PCREL))
17688 return x;
17690 return XVECEXP (term, 0, 0);
17693 return ix86_delegitimize_address_1 (x, true);
17696 /* Return true if X shouldn't be emitted into the debug info.
17697 Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_
17698 symbol easily into the .debug_info section, so we need not
17699 delegitimize, but can instead assemble it as @gotoff.
17700 Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17701 assembles that as a _GLOBAL_OFFSET_TABLE_-. expression. */
17703 static bool
17704 ix86_const_not_ok_for_debug_p (rtx x)
17706 if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17707 return true;
17709 if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17710 return true;
17712 return false;
17715 static void
17716 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17717 bool fp, FILE *file)
17719 const char *suffix;
17721 if (mode == CCFPmode)
17723 code = ix86_fp_compare_code_to_integer (code);
17724 mode = CCmode;
17726 if (reverse)
17727 code = reverse_condition (code);
17729 switch (code)
17731 case EQ:
17732 gcc_assert (mode != CCGZmode);
17733 switch (mode)
17735 case E_CCAmode:
17736 suffix = "a";
17737 break;
17738 case E_CCCmode:
17739 suffix = "c";
17740 break;
17741 case E_CCOmode:
17742 suffix = "o";
17743 break;
17744 case E_CCPmode:
17745 suffix = "p";
17746 break;
17747 case E_CCSmode:
17748 suffix = "s";
17749 break;
17750 default:
17751 suffix = "e";
17752 break;
17754 break;
17755 case NE:
17756 gcc_assert (mode != CCGZmode);
17757 switch (mode)
17759 case E_CCAmode:
17760 suffix = "na";
17761 break;
17762 case E_CCCmode:
17763 suffix = "nc";
17764 break;
17765 case E_CCOmode:
17766 suffix = "no";
17767 break;
17768 case E_CCPmode:
17769 suffix = "np";
17770 break;
17771 case E_CCSmode:
17772 suffix = "ns";
17773 break;
17774 default:
17775 suffix = "ne";
17776 break;
17778 break;
17779 case GT:
17780 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17781 suffix = "g";
17782 break;
17783 case GTU:
17784 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17785 Those same assemblers have the same but opposite lossage on cmov. */
17786 if (mode == CCmode)
17787 suffix = fp ? "nbe" : "a";
17788 else
17789 gcc_unreachable ();
17790 break;
17791 case LT:
17792 switch (mode)
17794 case E_CCNOmode:
17795 case E_CCGOCmode:
17796 suffix = "s";
17797 break;
17799 case E_CCmode:
17800 case E_CCGCmode:
17801 case E_CCGZmode:
17802 suffix = "l";
17803 break;
17805 default:
17806 gcc_unreachable ();
17808 break;
17809 case LTU:
17810 if (mode == CCmode || mode == CCGZmode)
17811 suffix = "b";
17812 else if (mode == CCCmode)
17813 suffix = fp ? "b" : "c";
17814 else
17815 gcc_unreachable ();
17816 break;
17817 case GE:
17818 switch (mode)
17820 case E_CCNOmode:
17821 case E_CCGOCmode:
17822 suffix = "ns";
17823 break;
17825 case E_CCmode:
17826 case E_CCGCmode:
17827 case E_CCGZmode:
17828 suffix = "ge";
17829 break;
17831 default:
17832 gcc_unreachable ();
17834 break;
17835 case GEU:
17836 if (mode == CCmode || mode == CCGZmode)
17837 suffix = "nb";
17838 else if (mode == CCCmode)
17839 suffix = fp ? "nb" : "nc";
17840 else
17841 gcc_unreachable ();
17842 break;
17843 case LE:
17844 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17845 suffix = "le";
17846 break;
17847 case LEU:
17848 if (mode == CCmode)
17849 suffix = "be";
17850 else
17851 gcc_unreachable ();
17852 break;
17853 case UNORDERED:
17854 suffix = fp ? "u" : "p";
17855 break;
17856 case ORDERED:
17857 suffix = fp ? "nu" : "np";
17858 break;
17859 default:
17860 gcc_unreachable ();
17862 fputs (suffix, file);
17865 /* Print the name of register X to FILE based on its machine mode and number.
17866 If CODE is 'w', pretend the mode is HImode.
17867 If CODE is 'b', pretend the mode is QImode.
17868 If CODE is 'k', pretend the mode is SImode.
17869 If CODE is 'q', pretend the mode is DImode.
17870 If CODE is 'x', pretend the mode is V4SFmode.
17871 If CODE is 't', pretend the mode is V8SFmode.
17872 If CODE is 'g', pretend the mode is V16SFmode.
17873 If CODE is 'h', pretend the reg is the 'high' byte register.
17874 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
17875 If CODE is 'd', duplicate the operand for AVX instruction.
17876 If CODE is 'V', print naked full integer register name without %.
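For example (an illustrative case, AT&T syntax): (reg:SI AX_REG) prints
as %eax, while code 'w' prints %ax, code 'b' prints %al and code 'h'
prints %ah.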
17879 void
17880 print_reg (rtx x, int code, FILE *file)
17882 const char *reg;
17883 int msize;
17884 unsigned int regno;
17885 bool duplicated;
17887 if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V')
17888 putc ('%', file);
17890 if (x == pc_rtx)
17892 gcc_assert (TARGET_64BIT);
17893 fputs ("rip", file);
17894 return;
17897 if (code == 'y' && STACK_TOP_P (x))
17899 fputs ("st(0)", file);
17900 return;
17903 if (code == 'w')
17904 msize = 2;
17905 else if (code == 'b')
17906 msize = 1;
17907 else if (code == 'k')
17908 msize = 4;
17909 else if (code == 'q')
17910 msize = 8;
17911 else if (code == 'h')
17912 msize = 0;
17913 else if (code == 'x')
17914 msize = 16;
17915 else if (code == 't')
17916 msize = 32;
17917 else if (code == 'g')
17918 msize = 64;
17919 else
17920 msize = GET_MODE_SIZE (GET_MODE (x));
17922 regno = REGNO (x);
17924 if (regno == ARG_POINTER_REGNUM
17925 || regno == FRAME_POINTER_REGNUM
17926 || regno == FPSR_REG
17927 || regno == FPCR_REG)
17929 output_operand_lossage
17930 ("invalid use of register '%s'", reg_names[regno]);
17931 return;
17933 else if (regno == FLAGS_REG)
17935 output_operand_lossage ("invalid use of asm flag output");
17936 return;
17939 if (code == 'V')
17941 if (GENERAL_REGNO_P (regno))
17942 msize = GET_MODE_SIZE (word_mode);
17943 else
17944 error ("'V' modifier on non-integer register");
17947 duplicated = code == 'd' && TARGET_AVX;
17949 switch (msize)
17951 case 16:
17952 case 12:
17953 case 8:
17954 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17955 warning (0, "unsupported size for integer register");
17956 /* FALLTHRU */
17957 case 4:
17958 if (LEGACY_INT_REGNO_P (regno))
17959 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17960 /* FALLTHRU */
17961 case 2:
17962 normal:
17963 reg = hi_reg_name[regno];
17964 break;
17965 case 1:
17966 if (regno >= ARRAY_SIZE (qi_reg_name))
17967 goto normal;
17968 if (!ANY_QI_REGNO_P (regno))
17969 error ("unsupported size for integer register");
17970 reg = qi_reg_name[regno];
17971 break;
17972 case 0:
17973 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17974 goto normal;
17975 reg = qi_high_reg_name[regno];
17976 break;
17977 case 32:
17978 case 64:
17979 if (SSE_REGNO_P (regno))
17981 gcc_assert (!duplicated);
17982 putc (msize == 32 ? 'y' : 'z', file);
17983 reg = hi_reg_name[regno] + 1;
17984 break;
17986 goto normal;
17987 default:
17988 gcc_unreachable ();
17991 fputs (reg, file);
17993 /* Irritatingly, AMD extended registers use
17994 a different naming convention: "r%d[bwd]". */
17995 if (REX_INT_REGNO_P (regno))
17997 gcc_assert (TARGET_64BIT);
17998 switch (msize)
18000 case 0:
18001 error ("extended registers have no high halves");
18002 break;
18003 case 1:
18004 putc ('b', file);
18005 break;
18006 case 2:
18007 putc ('w', file);
18008 break;
18009 case 4:
18010 putc ('d', file);
18011 break;
18012 case 8:
18013 /* no suffix */
18014 break;
18015 default:
18016 error ("unsupported operand size for extended register");
18017 break;
18019 return;
18022 if (duplicated)
18024 if (ASSEMBLER_DIALECT == ASM_ATT)
18025 fprintf (file, ", %%%s", reg);
18026 else
18027 fprintf (file, ", %s", reg);
18031 /* Meaning of CODE:
18032 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
18033 C -- print opcode suffix for set/cmov insn.
18034 c -- like C, but print reversed condition
18035 F,f -- likewise, but for floating-point.
18036 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
18037 otherwise nothing
18038 R -- print embedded rounding and sae.
18039 r -- print only sae.
18040 z -- print the opcode suffix for the size of the current operand.
18041 Z -- likewise, with special suffixes for x87 instructions.
18042 * -- print a star (in certain assembler syntax)
18043 A -- print an absolute memory reference.
18044 E -- print address with DImode register names if TARGET_64BIT.
18045 w -- print the operand as if it's a "word" (HImode) even if it isn't.
18046 s -- print a shift double count, followed by the assembler's argument
18047 delimiter.
18048 b -- print the QImode name of the register for the indicated operand.
18049 %b0 would print %al if operands[0] is reg 0.
18050 w -- likewise, print the HImode name of the register.
18051 k -- likewise, print the SImode name of the register.
18052 q -- likewise, print the DImode name of the register.
18053 x -- likewise, print the V4SFmode name of the register.
18054 t -- likewise, print the V8SFmode name of the register.
18055 g -- likewise, print the V16SFmode name of the register.
18056 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
18057 y -- print "st(0)" instead of "st" as a register.
18058 d -- print duplicated register operand for AVX instruction.
18059 D -- print condition for SSE cmp instruction.
18060 P -- if PIC, print an @PLT suffix.
18061 p -- print raw symbol name.
18062 X -- don't print any sort of PIC '@' suffix for a symbol.
18063 & -- print some in-use local-dynamic symbol name.
18064 H -- print a memory address offset by 8; used for sse high-parts
18065 Y -- print condition for XOP pcom* instruction.
18066 V -- print naked full integer register name without %.
18067 + -- print a branch hint as 'cs' or 'ds' prefix
18068 ; -- print a semicolon (after prefixes, due to a bug in older gas).
18069 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
18070 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
18071 ! -- print MPX prefix for jxx/call/ret instructions if required.
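As an illustrative example of the size-suffix codes: in an insn template
"%z0" expands to "b", "w", "l" or "q" according to the mode of operand 0,
so a hypothetical "add%z0" would become "addl" for an SImode operand.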
18074 void
18075 ix86_print_operand (FILE *file, rtx x, int code)
18077 if (code)
18079 switch (code)
18081 case 'A':
18082 switch (ASSEMBLER_DIALECT)
18084 case ASM_ATT:
18085 putc ('*', file);
18086 break;
18088 case ASM_INTEL:
18089 /* Intel syntax. For absolute addresses, registers should not
18090 be surrounded by braces. */
18091 if (!REG_P (x))
18093 putc ('[', file);
18094 ix86_print_operand (file, x, 0);
18095 putc (']', file);
18096 return;
18098 break;
18100 default:
18101 gcc_unreachable ();
18104 ix86_print_operand (file, x, 0);
18105 return;
18107 case 'E':
18108 /* Wrap address in an UNSPEC to declare special handling. */
18109 if (TARGET_64BIT)
18110 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
18112 output_address (VOIDmode, x);
18113 return;
18115 case 'L':
18116 if (ASSEMBLER_DIALECT == ASM_ATT)
18117 putc ('l', file);
18118 return;
18120 case 'W':
18121 if (ASSEMBLER_DIALECT == ASM_ATT)
18122 putc ('w', file);
18123 return;
18125 case 'B':
18126 if (ASSEMBLER_DIALECT == ASM_ATT)
18127 putc ('b', file);
18128 return;
18130 case 'Q':
18131 if (ASSEMBLER_DIALECT == ASM_ATT)
18132 putc ('l', file);
18133 return;
18135 case 'S':
18136 if (ASSEMBLER_DIALECT == ASM_ATT)
18137 putc ('s', file);
18138 return;
18140 case 'T':
18141 if (ASSEMBLER_DIALECT == ASM_ATT)
18142 putc ('t', file);
18143 return;
18145 case 'O':
18146 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18147 if (ASSEMBLER_DIALECT != ASM_ATT)
18148 return;
18150 switch (GET_MODE_SIZE (GET_MODE (x)))
18152 case 2:
18153 putc ('w', file);
18154 break;
18156 case 4:
18157 putc ('l', file);
18158 break;
18160 case 8:
18161 putc ('q', file);
18162 break;
18164 default:
18165 output_operand_lossage ("invalid operand size for operand "
18166 "code 'O'");
18167 return;
18170 putc ('.', file);
18171 #endif
18172 return;
18174 case 'z':
18175 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18177 /* Opcodes don't get size suffixes if using Intel opcodes. */
18178 if (ASSEMBLER_DIALECT == ASM_INTEL)
18179 return;
18181 switch (GET_MODE_SIZE (GET_MODE (x)))
18183 case 1:
18184 putc ('b', file);
18185 return;
18187 case 2:
18188 putc ('w', file);
18189 return;
18191 case 4:
18192 putc ('l', file);
18193 return;
18195 case 8:
18196 putc ('q', file);
18197 return;
18199 default:
18200 output_operand_lossage ("invalid operand size for operand "
18201 "code 'z'");
18202 return;
18206 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18207 warning (0, "non-integer operand used with operand code 'z'");
18208 /* FALLTHRU */
18210 case 'Z':
18211 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
18212 if (ASSEMBLER_DIALECT == ASM_INTEL)
18213 return;
18215 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18217 switch (GET_MODE_SIZE (GET_MODE (x)))
18219 case 2:
18220 #ifdef HAVE_AS_IX86_FILDS
18221 putc ('s', file);
18222 #endif
18223 return;
18225 case 4:
18226 putc ('l', file);
18227 return;
18229 case 8:
18230 #ifdef HAVE_AS_IX86_FILDQ
18231 putc ('q', file);
18232 #else
18233 fputs ("ll", file);
18234 #endif
18235 return;
18237 default:
18238 break;
18241 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18243 /* 387 opcodes don't get size suffixes
18244 if the operands are registers. */
18245 if (STACK_REG_P (x))
18246 return;
18248 switch (GET_MODE_SIZE (GET_MODE (x)))
18250 case 4:
18251 putc ('s', file);
18252 return;
18254 case 8:
18255 putc ('l', file);
18256 return;
18258 case 12:
18259 case 16:
18260 putc ('t', file);
18261 return;
18263 default:
18264 break;
18267 else
18269 output_operand_lossage ("invalid operand type used with "
18270 "operand code 'Z'");
18271 return;
18274 output_operand_lossage ("invalid operand size for operand code 'Z'");
18275 return;
18277 case 'd':
18278 case 'b':
18279 case 'w':
18280 case 'k':
18281 case 'q':
18282 case 'h':
18283 case 't':
18284 case 'g':
18285 case 'y':
18286 case 'x':
18287 case 'X':
18288 case 'P':
18289 case 'p':
18290 case 'V':
18291 break;
18293 case 's':
18294 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
18296 ix86_print_operand (file, x, 0);
18297 fputs (", ", file);
18299 return;
18301 case 'Y':
18302 switch (GET_CODE (x))
18304 case NE:
18305 fputs ("neq", file);
18306 break;
18307 case EQ:
18308 fputs ("eq", file);
18309 break;
18310 case GE:
18311 case GEU:
18312 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
18313 break;
18314 case GT:
18315 case GTU:
18316 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18317 break;
18318 case LE:
18319 case LEU:
18320 fputs ("le", file);
18321 break;
18322 case LT:
18323 case LTU:
18324 fputs ("lt", file);
18325 break;
18326 case UNORDERED:
18327 fputs ("unord", file);
18328 break;
18329 case ORDERED:
18330 fputs ("ord", file);
18331 break;
18332 case UNEQ:
18333 fputs ("ueq", file);
18334 break;
18335 case UNGE:
18336 fputs ("nlt", file);
18337 break;
18338 case UNGT:
18339 fputs ("nle", file);
18340 break;
18341 case UNLE:
18342 fputs ("ule", file);
18343 break;
18344 case UNLT:
18345 fputs ("ult", file);
18346 break;
18347 case LTGT:
18348 fputs ("une", file);
18349 break;
18350 default:
18351 output_operand_lossage ("operand is not a condition code, "
18352 "invalid operand code 'Y'");
18353 return;
18355 return;
18357 case 'D':
18358 /* Little bit of braindamage here. The SSE compare instructions
18359 use completely different names for the comparisons than the
18360 fp conditional moves do. */
18361 switch (GET_CODE (x))
18363 case UNEQ:
18364 if (TARGET_AVX)
18366 fputs ("eq_us", file);
18367 break;
18369 /* FALLTHRU */
18370 case EQ:
18371 fputs ("eq", file);
18372 break;
18373 case UNLT:
18374 if (TARGET_AVX)
18376 fputs ("nge", file);
18377 break;
18379 /* FALLTHRU */
18380 case LT:
18381 fputs ("lt", file);
18382 break;
18383 case UNLE:
18384 if (TARGET_AVX)
18386 fputs ("ngt", file);
18387 break;
18389 /* FALLTHRU */
18390 case LE:
18391 fputs ("le", file);
18392 break;
18393 case UNORDERED:
18394 fputs ("unord", file);
18395 break;
18396 case LTGT:
18397 if (TARGET_AVX)
18399 fputs ("neq_oq", file);
18400 break;
18402 /* FALLTHRU */
18403 case NE:
18404 fputs ("neq", file);
18405 break;
18406 case GE:
18407 if (TARGET_AVX)
18409 fputs ("ge", file);
18410 break;
18412 /* FALLTHRU */
18413 case UNGE:
18414 fputs ("nlt", file);
18415 break;
18416 case GT:
18417 if (TARGET_AVX)
18419 fputs ("gt", file);
18420 break;
18422 /* FALLTHRU */
18423 case UNGT:
18424 fputs ("nle", file);
18425 break;
18426 case ORDERED:
18427 fputs ("ord", file);
18428 break;
18429 default:
18430 output_operand_lossage ("operand is not a condition code, "
18431 "invalid operand code 'D'");
18432 return;
18434 return;
18436 case 'F':
18437 case 'f':
18438 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18439 if (ASSEMBLER_DIALECT == ASM_ATT)
18440 putc ('.', file);
18441 gcc_fallthrough ();
18442 #endif
18444 case 'C':
18445 case 'c':
18446 if (!COMPARISON_P (x))
18448 output_operand_lossage ("operand is not a condition code, "
18449 "invalid operand code '%c'", code);
18450 return;
18452 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
18453 code == 'c' || code == 'f',
18454 code == 'F' || code == 'f',
18455 file);
18456 return;
18458 case 'H':
18459 if (!offsettable_memref_p (x))
18461 output_operand_lossage ("operand is not an offsettable memory "
18462 "reference, invalid operand code 'H'");
18463 return;
18465 /* It doesn't actually matter what mode we use here, as we're
18466 only going to use this for printing. */
18467 x = adjust_address_nv (x, DImode, 8);
18468 /* Output 'qword ptr' for intel assembler dialect. */
18469 if (ASSEMBLER_DIALECT == ASM_INTEL)
18470 code = 'q';
18471 break;
18473 case 'K':
18474 if (!CONST_INT_P (x))
18476 output_operand_lossage ("operand is not an integer, invalid "
18477 "operand code 'K'");
18478 return;
18481 if (INTVAL (x) & IX86_HLE_ACQUIRE)
18482 #ifdef HAVE_AS_IX86_HLE
18483 fputs ("xacquire ", file);
18484 #else
18485 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
18486 #endif
18487 else if (INTVAL (x) & IX86_HLE_RELEASE)
18488 #ifdef HAVE_AS_IX86_HLE
18489 fputs ("xrelease ", file);
18490 #else
18491 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
18492 #endif
18493 /* We do not want to print value of the operand. */
18494 return;
18496 case 'N':
18497 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
18498 fputs ("{z}", file);
18499 return;
18501 case 'r':
18502 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
18504 output_operand_lossage ("operand is not a specific integer, "
18505 "invalid operand code 'r'");
18506 return;
18509 if (ASSEMBLER_DIALECT == ASM_INTEL)
18510 fputs (", ", file);
18512 fputs ("{sae}", file);
18514 if (ASSEMBLER_DIALECT == ASM_ATT)
18515 fputs (", ", file);
18517 return;
18519 case 'R':
18520 if (!CONST_INT_P (x))
18522 output_operand_lossage ("operand is not an integer, invalid "
18523 "operand code 'R'");
18524 return;
18527 if (ASSEMBLER_DIALECT == ASM_INTEL)
18528 fputs (", ", file);
18530 switch (INTVAL (x))
18532 case ROUND_NEAREST_INT | ROUND_SAE:
18533 fputs ("{rn-sae}", file);
18534 break;
18535 case ROUND_NEG_INF | ROUND_SAE:
18536 fputs ("{rd-sae}", file);
18537 break;
18538 case ROUND_POS_INF | ROUND_SAE:
18539 fputs ("{ru-sae}", file);
18540 break;
18541 case ROUND_ZERO | ROUND_SAE:
18542 fputs ("{rz-sae}", file);
18543 break;
18544 default:
18545 output_operand_lossage ("operand is not a specific integer, "
18546 "invalid operand code 'R'");
18549 if (ASSEMBLER_DIALECT == ASM_ATT)
18550 fputs (", ", file);
18552 return;
18554 case '*':
18555 if (ASSEMBLER_DIALECT == ASM_ATT)
18556 putc ('*', file);
18557 return;
18559 case '&':
18561 const char *name = get_some_local_dynamic_name ();
18562 if (name == NULL)
18563 output_operand_lossage ("'%%&' used without any "
18564 "local dynamic TLS references");
18565 else
18566 assemble_name (file, name);
18567 return;
18570 case '+':
18572 rtx x;
18574 if (!optimize
18575 || optimize_function_for_size_p (cfun)
18576 || !TARGET_BRANCH_PREDICTION_HINTS)
18577 return;
18579 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18580 if (x)
18582 int pred_val = profile_probability::from_reg_br_prob_note
18583 (XINT (x, 0)).to_reg_br_prob_base ();
18585 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18586 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18588 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18589 bool cputaken
18590 = final_forward_branch_p (current_output_insn) == 0;
18592 /* Emit hints only when the default branch prediction
18593 heuristics would fail. */
18594 if (taken != cputaken)
18596 /* We use 3e (DS) prefix for taken branches and
18597 2e (CS) prefix for not taken branches. */
18598 if (taken)
18599 fputs ("ds ; ", file);
18600 else
18601 fputs ("cs ; ", file);
18605 return;
18608 case ';':
18609 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18610 putc (';', file);
18611 #endif
18612 return;
18614 case '~':
18615 putc (TARGET_AVX2 ? 'i' : 'f', file);
18616 return;
18618 case '^':
18619 if (TARGET_64BIT && Pmode != word_mode)
18620 fputs ("addr32 ", file);
18621 return;
18623 case '!':
18624 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18625 fputs ("bnd ", file);
18626 if (ix86_notrack_prefixed_insn_p (current_output_insn))
18627 fputs ("notrack ", file);
18628 return;
18630 default:
18631 output_operand_lossage ("invalid operand code '%c'", code);
18635 if (REG_P (x))
18636 print_reg (x, code, file);
18638 else if (MEM_P (x))
18640 rtx addr = XEXP (x, 0);
18642 /* No `byte ptr' prefix for call instructions ... */
18643 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18645 machine_mode mode = GET_MODE (x);
18646 const char *size;
18648 /* Check for explicit size override codes. */
18649 if (code == 'b')
18650 size = "BYTE";
18651 else if (code == 'w')
18652 size = "WORD";
18653 else if (code == 'k')
18654 size = "DWORD";
18655 else if (code == 'q')
18656 size = "QWORD";
18657 else if (code == 'x')
18658 size = "XMMWORD";
18659 else if (code == 't')
18660 size = "YMMWORD";
18661 else if (code == 'g')
18662 size = "ZMMWORD";
18663 else if (mode == BLKmode)
18664 /* ... or BLKmode operands, when not overridden. */
18665 size = NULL;
18666 else
18667 switch (GET_MODE_SIZE (mode))
18669 case 1: size = "BYTE"; break;
18670 case 2: size = "WORD"; break;
18671 case 4: size = "DWORD"; break;
18672 case 8: size = "QWORD"; break;
18673 case 12: size = "TBYTE"; break;
18674 case 16:
18675 if (mode == XFmode)
18676 size = "TBYTE";
18677 else
18678 size = "XMMWORD";
18679 break;
18680 case 32: size = "YMMWORD"; break;
18681 case 64: size = "ZMMWORD"; break;
18682 default:
18683 gcc_unreachable ();
18685 if (size)
18687 fputs (size, file);
18688 fputs (" PTR ", file);
18692 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18693 output_operand_lossage ("invalid constraints for operand");
18694 else
18695 ix86_print_operand_address_as
18696 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18699 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18701 long l;
18703 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18705 if (ASSEMBLER_DIALECT == ASM_ATT)
18706 putc ('$', file);
18707 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18708 if (code == 'q')
18709 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18710 (unsigned long long) (int) l);
18711 else
18712 fprintf (file, "0x%08x", (unsigned int) l);
18715 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18717 long l[2];
18719 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18721 if (ASSEMBLER_DIALECT == ASM_ATT)
18722 putc ('$', file);
18723 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18726 /* These float cases don't actually occur as immediate operands. */
18727 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18729 char dstr[30];
18731 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18732 fputs (dstr, file);
18735 else
18737 /* We have patterns that allow zero sets of memory, for instance.
18738 In 64-bit mode, we should probably support all 8-byte vectors,
18739 since we can in fact encode that into an immediate. */
18740 if (GET_CODE (x) == CONST_VECTOR)
18742 if (x != CONST0_RTX (GET_MODE (x)))
18743 output_operand_lossage ("invalid vector immediate");
18744 x = const0_rtx;
18747 if (code != 'P' && code != 'p')
18749 if (CONST_INT_P (x))
18751 if (ASSEMBLER_DIALECT == ASM_ATT)
18752 putc ('$', file);
18754 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18755 || GET_CODE (x) == LABEL_REF)
18757 if (ASSEMBLER_DIALECT == ASM_ATT)
18758 putc ('$', file);
18759 else
18760 fputs ("OFFSET FLAT:", file);
18763 if (CONST_INT_P (x))
18764 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18765 else if (flag_pic || MACHOPIC_INDIRECT)
18766 output_pic_addr_const (file, x, code);
18767 else
18768 output_addr_const (file, x);
18772 static bool
18773 ix86_print_operand_punct_valid_p (unsigned char code)
18775 return (code == '*' || code == '+' || code == '&' || code == ';'
18776 || code == '~' || code == '^' || code == '!');
18779 /* Print a memory operand whose address is ADDR. */
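/* Illustrative output (the formatting itself follows further below):
   after ix86_decompose_address splits ADDR into base, index, scale and
   displacement, an address with base %rbp and displacement 8 typically
   prints as 8(%rbp) in AT&T syntax and as [rbp+8] in Intel syntax.  */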
18781 static void
18782 ix86_print_operand_address_as (FILE *file, rtx addr,
18783 addr_space_t as, bool no_rip)
18785 struct ix86_address parts;
18786 rtx base, index, disp;
18787 int scale;
18788 int ok;
18789 bool vsib = false;
18790 int code = 0;
18792 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18794 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18795 gcc_assert (parts.index == NULL_RTX);
18796 parts.index = XVECEXP (addr, 0, 1);
18797 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18798 addr = XVECEXP (addr, 0, 0);
18799 vsib = true;
18801 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18803 gcc_assert (TARGET_64BIT);
18804 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18805 code = 'q';
18807 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18809 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18810 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18811 if (parts.base != NULL_RTX)
18813 parts.index = parts.base;
18814 parts.scale = 1;
18816 parts.base = XVECEXP (addr, 0, 0);
18817 addr = XVECEXP (addr, 0, 0);
18819 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18821 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18822 gcc_assert (parts.index == NULL_RTX);
18823 parts.index = XVECEXP (addr, 0, 1);
18824 addr = XVECEXP (addr, 0, 0);
18826 else
18827 ok = ix86_decompose_address (addr, &parts);
18829 gcc_assert (ok);
18831 base = parts.base;
18832 index = parts.index;
18833 disp = parts.disp;
18834 scale = parts.scale;
18836 if (ADDR_SPACE_GENERIC_P (as))
18837 as = parts.seg;
18838 else
18839 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18841 if (!ADDR_SPACE_GENERIC_P (as))
18843 const char *string;
18845 if (as == ADDR_SPACE_SEG_FS)
18846 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18847 else if (as == ADDR_SPACE_SEG_GS)
18848 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18849 else
18850 gcc_unreachable ();
18851 fputs (string, file);
18854 /* Use one byte shorter RIP relative addressing for 64bit mode. */
18855 if (TARGET_64BIT && !base && !index && !no_rip)
18857 rtx symbol = disp;
18859 if (GET_CODE (disp) == CONST
18860 && GET_CODE (XEXP (disp, 0)) == PLUS
18861 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18862 symbol = XEXP (XEXP (disp, 0), 0);
18864 if (GET_CODE (symbol) == LABEL_REF
18865 || (GET_CODE (symbol) == SYMBOL_REF
18866 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18867 base = pc_rtx;
18870 if (!base && !index)
18872 /* Displacement-only addresses require special attention. */
18873 if (CONST_INT_P (disp))
18875 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18876 fputs ("ds:", file);
18877 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18879 /* Load the external function address via the GOT slot to avoid PLT. */
18880 else if (GET_CODE (disp) == CONST
18881 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18882 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18883 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18884 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18885 output_pic_addr_const (file, disp, 0);
18886 else if (flag_pic)
18887 output_pic_addr_const (file, disp, 0);
18888 else
18889 output_addr_const (file, disp);
18891 else
18893 /* Print SImode register names to force addr32 prefix. */
18894 if (SImode_address_operand (addr, VOIDmode))
18896 if (flag_checking)
18898 gcc_assert (TARGET_64BIT);
18899 switch (GET_CODE (addr))
18901 case SUBREG:
18902 gcc_assert (GET_MODE (addr) == SImode);
18903 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18904 break;
18905 case ZERO_EXTEND:
18906 case AND:
18907 gcc_assert (GET_MODE (addr) == DImode);
18908 break;
18909 default:
18910 gcc_unreachable ();
18913 gcc_assert (!code);
18914 code = 'k';
18916 else if (code == 0
18917 && TARGET_X32
18918 && disp
18919 && CONST_INT_P (disp)
18920 && INTVAL (disp) < -16*1024*1024)
18922 /* X32 runs in 64-bit mode, where displacement, DISP, in
18923 address DISP(%r64), is encoded as 32-bit immediate sign-
18924 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18925 address is %r64 + 0xffffffffbffffd00. When %r64 <
18926 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18927 which is invalid for x32. The correct address is %r64
18928 - 0x40000300 == 0xf7ffdd64. To properly encode
18929 -0x40000300(%r64) for x32, we zero-extend negative
18930 displacement by forcing addr32 prefix which truncates
18931 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18932 zero-extend all negative displacements, including -1(%rsp).
18933 However, for small negative displacements, sign-extension
18934 won't cause overflow. We only zero-extend negative
18935 displacements if they are < -16*1024*1024, which is also used
18936 to check legitimate address displacements for PIC. */
18937 code = 'k';
18940 /* Since the upper 32 bits of RSP are always zero for x32,
18941 we can encode %esp as %rsp to avoid 0x67 prefix if
18942 there is no index register. */
18943 if (TARGET_X32 && Pmode == SImode
18944 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18945 code = 'q';
18947 if (ASSEMBLER_DIALECT == ASM_ATT)
18949 if (disp)
18951 if (flag_pic)
18952 output_pic_addr_const (file, disp, 0);
18953 else if (GET_CODE (disp) == LABEL_REF)
18954 output_asm_label (disp);
18955 else
18956 output_addr_const (file, disp);
18959 putc ('(', file);
18960 if (base)
18961 print_reg (base, code, file);
18962 if (index)
18964 putc (',', file);
18965 print_reg (index, vsib ? 0 : code, file);
18966 if (scale != 1 || vsib)
18967 fprintf (file, ",%d", scale);
18969 putc (')', file);
18971 else
18973 rtx offset = NULL_RTX;
18975 if (disp)
18977 /* Pull out the offset of a symbol; print any symbol itself. */
18978 if (GET_CODE (disp) == CONST
18979 && GET_CODE (XEXP (disp, 0)) == PLUS
18980 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18982 offset = XEXP (XEXP (disp, 0), 1);
18983 disp = gen_rtx_CONST (VOIDmode,
18984 XEXP (XEXP (disp, 0), 0));
18987 if (flag_pic)
18988 output_pic_addr_const (file, disp, 0);
18989 else if (GET_CODE (disp) == LABEL_REF)
18990 output_asm_label (disp);
18991 else if (CONST_INT_P (disp))
18992 offset = disp;
18993 else
18994 output_addr_const (file, disp);
18997 putc ('[', file);
18998 if (base)
19000 print_reg (base, code, file);
19001 if (offset)
19003 if (INTVAL (offset) >= 0)
19004 putc ('+', file);
19005 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19008 else if (offset)
19009 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19010 else
19011 putc ('0', file);
19013 if (index)
19015 putc ('+', file);
19016 print_reg (index, vsib ? 0 : code, file);
19017 if (scale != 1 || vsib)
19018 fprintf (file, "*%d", scale);
19020 putc (']', file);
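/* Illustrative only (operands are hypothetical, not tied to a specific
insn): for base=%rax, index=%rbx, scale=4, disp=16 this routine prints
"16(%rax,%rbx,4)" in AT&T syntax and "[rax+rbx*4+16]" in Intel syntax;
a RIP-relative symbol prints as "sym(%rip)" in AT&T syntax. When CODE
is 'k' (addr32), the 32-bit register names are used instead, e.g.
"(%eax,%ebx,4)". */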
19025 static void
19026 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
19028 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
19031 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
19033 static bool
19034 i386_asm_output_addr_const_extra (FILE *file, rtx x)
19036 rtx op;
19038 if (GET_CODE (x) != UNSPEC)
19039 return false;
19041 op = XVECEXP (x, 0, 0);
19042 switch (XINT (x, 1))
19044 case UNSPEC_GOTOFF:
19045 output_addr_const (file, op);
19046 fputs ("@gotoff", file);
19047 break;
19048 case UNSPEC_GOTTPOFF:
19049 output_addr_const (file, op);
19050 /* FIXME: This might be @TPOFF in Sun ld. */
19051 fputs ("@gottpoff", file);
19052 break;
19053 case UNSPEC_TPOFF:
19054 output_addr_const (file, op);
19055 fputs ("@tpoff", file);
19056 break;
19057 case UNSPEC_NTPOFF:
19058 output_addr_const (file, op);
19059 if (TARGET_64BIT)
19060 fputs ("@tpoff", file);
19061 else
19062 fputs ("@ntpoff", file);
19063 break;
19064 case UNSPEC_DTPOFF:
19065 output_addr_const (file, op);
19066 fputs ("@dtpoff", file);
19067 break;
19068 case UNSPEC_GOTNTPOFF:
19069 output_addr_const (file, op);
19070 if (TARGET_64BIT)
19071 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
19072 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
19073 else
19074 fputs ("@gotntpoff", file);
19075 break;
19076 case UNSPEC_INDNTPOFF:
19077 output_addr_const (file, op);
19078 fputs ("@indntpoff", file);
19079 break;
19080 #if TARGET_MACHO
19081 case UNSPEC_MACHOPIC_OFFSET:
19082 output_addr_const (file, op);
19083 putc ('-', file);
19084 machopic_output_function_base_name (file);
19085 break;
19086 #endif
19088 default:
19089 return false;
19092 return true;
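/* For illustration (a sketch, not from a particular test case): the
UNSPEC_NTPOFF case above is what makes a local-exec TLS load come out
as roughly "movl %gs:x@ntpoff, %eax" on 32-bit and
"movl %fs:x@tpoff, %eax" on 64-bit, matching the suffixes emitted
here. */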
19095 /* Split one or more double-mode RTL references into pairs of half-mode
19096 references. The RTL can be REG, offsettable MEM, integer constant, or
19097 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
19098 split and "num" is its length. lo_half and hi_half are output arrays
19099 that parallel "operands". */
19101 void
19102 split_double_mode (machine_mode mode, rtx operands[],
19103 int num, rtx lo_half[], rtx hi_half[])
19105 machine_mode half_mode;
19106 unsigned int byte;
19108 switch (mode)
19110 case E_TImode:
19111 half_mode = DImode;
19112 break;
19113 case E_DImode:
19114 half_mode = SImode;
19115 break;
19116 default:
19117 gcc_unreachable ();
19120 byte = GET_MODE_SIZE (half_mode);
19122 while (num--)
19124 rtx op = operands[num];
19126 /* simplify_subreg refuses to split volatile memory addresses,
19127 but we still have to handle them. */
19128 if (MEM_P (op))
19130 lo_half[num] = adjust_address (op, half_mode, 0);
19131 hi_half[num] = adjust_address (op, half_mode, byte);
19133 else
19135 lo_half[num] = simplify_gen_subreg (half_mode, op,
19136 GET_MODE (op) == VOIDmode
19137 ? mode : GET_MODE (op), 0);
19138 hi_half[num] = simplify_gen_subreg (half_mode, op,
19139 GET_MODE (op) == VOIDmode
19140 ? mode : GET_MODE (op), byte);
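/* Example (register numbers are hypothetical): splitting a DImode value
on a 32-bit target yields SImode halves; (reg:DI 100) becomes
(subreg:SI (reg:DI 100) 0) and (subreg:SI (reg:DI 100) 4), while
(mem:DI (reg:SI 101)) becomes (mem:SI (reg:SI 101)) and
(mem:SI (plus:SI (reg:SI 101) (const_int 4))). */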
19145 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
19146 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
19147 is the expression of the binary operation. The output may either be
19148 emitted here, or returned to the caller, like all output_* functions.
19150 There is no guarantee that the operands are the same mode, as they
19151 might be within FLOAT or FLOAT_EXTEND expressions. */
19153 #ifndef SYSV386_COMPAT
19154 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
19155 wants to fix the assemblers because that causes incompatibility
19156 with gcc. No-one wants to fix gcc because that causes
19157 incompatibility with assemblers... You can use the option of
19158 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19159 #define SYSV386_COMPAT 1
19160 #endif
19162 const char *
19163 output_387_binary_op (rtx_insn *insn, rtx *operands)
19165 static char buf[40];
19166 const char *p;
19167 bool is_sse
19168 = (SSE_REG_P (operands[0])
19169 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
19171 if (is_sse)
19172 p = "%v";
19173 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19174 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19175 p = "fi";
19176 else
19177 p = "f";
19179 strcpy (buf, p);
19181 switch (GET_CODE (operands[3]))
19183 case PLUS:
19184 p = "add"; break;
19185 case MINUS:
19186 p = "sub"; break;
19187 case MULT:
19188 p = "mul"; break;
19189 case DIV:
19190 p = "div"; break;
19191 default:
19192 gcc_unreachable ();
19195 strcat (buf, p);
19197 if (is_sse)
19199 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
19200 strcat (buf, p);
19202 if (TARGET_AVX)
19203 p = "\t{%2, %1, %0|%0, %1, %2}";
19204 else
19205 p = "\t{%2, %0|%0, %2}";
19207 strcat (buf, p);
19208 return buf;
19211 /* Even if we do not want to check the inputs, this documents the input
19212 constraints, which helps in understanding the following code. */
19213 if (flag_checking)
19215 if (STACK_REG_P (operands[0])
19216 && ((REG_P (operands[1])
19217 && REGNO (operands[0]) == REGNO (operands[1])
19218 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19219 || (REG_P (operands[2])
19220 && REGNO (operands[0]) == REGNO (operands[2])
19221 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19222 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19223 ; /* ok */
19224 else
19225 gcc_unreachable ();
19228 switch (GET_CODE (operands[3]))
19230 case MULT:
19231 case PLUS:
19232 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19233 std::swap (operands[1], operands[2]);
19235 /* know operands[0] == operands[1]. */
19237 if (MEM_P (operands[2]))
19239 p = "%Z2\t%2";
19240 break;
19243 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19245 if (STACK_TOP_P (operands[0]))
19246 /* How is it that we are storing to a dead operand[2]?
19247 Well, presumably operands[1] is dead too. We can't
19248 store the result to st(0) as st(0) gets popped on this
19249 instruction. Instead store to operands[2] (which I
19250 think has to be st(1)). st(1) will be popped later.
19251 gcc <= 2.8.1 didn't have this check and generated
19252 assembly code that the Unixware assembler rejected. */
19253 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19254 else
19255 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19256 break;
19259 if (STACK_TOP_P (operands[0]))
19260 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19261 else
19262 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19263 break;
19265 case MINUS:
19266 case DIV:
19267 if (MEM_P (operands[1]))
19269 p = "r%Z1\t%1";
19270 break;
19273 if (MEM_P (operands[2]))
19275 p = "%Z2\t%2";
19276 break;
19279 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19281 #if SYSV386_COMPAT
19282 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19283 derived assemblers, confusingly reverse the direction of
19284 the operation for fsub{r} and fdiv{r} when the
19285 destination register is not st(0). The Intel assembler
19286 doesn't have this brain damage. Read !SYSV386_COMPAT to
19287 figure out what the hardware really does. */
19288 if (STACK_TOP_P (operands[0]))
19289 p = "{p\t%0, %2|rp\t%2, %0}";
19290 else
19291 p = "{rp\t%2, %0|p\t%0, %2}";
19292 #else
19293 if (STACK_TOP_P (operands[0]))
19294 /* As above for fmul/fadd, we can't store to st(0). */
19295 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19296 else
19297 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19298 #endif
19299 break;
19302 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19304 #if SYSV386_COMPAT
19305 if (STACK_TOP_P (operands[0]))
19306 p = "{rp\t%0, %1|p\t%1, %0}";
19307 else
19308 p = "{p\t%1, %0|rp\t%0, %1}";
19309 #else
19310 if (STACK_TOP_P (operands[0]))
19311 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
19312 else
19313 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
19314 #endif
19315 break;
19318 if (STACK_TOP_P (operands[0]))
19320 if (STACK_TOP_P (operands[1]))
19321 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19322 else
19323 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
19324 break;
19326 else if (STACK_TOP_P (operands[1]))
19328 #if SYSV386_COMPAT
19329 p = "{\t%1, %0|r\t%0, %1}";
19330 #else
19331 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
19332 #endif
19334 else
19336 #if SYSV386_COMPAT
19337 p = "{r\t%2, %0|\t%0, %2}";
19338 #else
19339 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19340 #endif
19342 break;
19344 default:
19345 gcc_unreachable ();
19348 strcat (buf, p);
19349 return buf;
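/* A hedged example of the SSE path above (operand numbering is the
template's own, the registers are hypothetical): for an SFmode PLUS
with AVX enabled the template becomes "%vaddss\t{%2, %1, %0|%0, %1, %2}",
i.e. something like "vaddss %xmm2, %xmm1, %xmm0"; without AVX it
degenerates to the two-operand form "addss %xmm2, %xmm0". */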
19352 /* Return needed mode for entity in optimize_mode_switching pass. */
19354 static int
19355 ix86_dirflag_mode_needed (rtx_insn *insn)
19357 if (CALL_P (insn))
19359 if (cfun->machine->func_type == TYPE_NORMAL)
19360 return X86_DIRFLAG_ANY;
19361 else
19362 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
19363 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19366 if (recog_memoized (insn) < 0)
19367 return X86_DIRFLAG_ANY;
19369 if (get_attr_type (insn) == TYPE_STR)
19371 /* Emit cld instruction if stringops are used in the function. */
19372 if (cfun->machine->func_type == TYPE_NORMAL)
19373 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19374 else
19375 return X86_DIRFLAG_RESET;
19378 return X86_DIRFLAG_ANY;
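/* Background for the choices above (informational): string insns such
as "rep movsb" walk forward or backward depending on EFLAGS.DF. The
psABI guarantees DF is clear on entry to a normal function, which is
why X86_DIRFLAG_ANY suffices there unless -mcld requests explicit CLDs;
an interrupt handler cannot rely on that guarantee, so string insns in
it always require X86_DIRFLAG_RESET. */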
19381 /* Check if a 256bit or 512bit AVX register is referenced inside of EXP. */
19383 static bool
19384 ix86_check_avx_upper_register (const_rtx exp)
19386 if (SUBREG_P (exp))
19387 exp = SUBREG_REG (exp);
19389 return (REG_P (exp)
19390 && (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
19391 || VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
19394 /* Return needed mode for entity in optimize_mode_switching pass. */
19396 static int
19397 ix86_avx_u128_mode_needed (rtx_insn *insn)
19399 if (CALL_P (insn))
19401 rtx link;
19403 /* Needed mode is set to AVX_U128_CLEAN if there are
19404 no 256bit or 512bit modes used in function arguments. */
19405 for (link = CALL_INSN_FUNCTION_USAGE (insn);
19406 link;
19407 link = XEXP (link, 1))
19409 if (GET_CODE (XEXP (link, 0)) == USE)
19411 rtx arg = XEXP (XEXP (link, 0), 0);
19413 if (ix86_check_avx_upper_register (arg))
19414 return AVX_U128_DIRTY;
19418 return AVX_U128_CLEAN;
19421 /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
19422 Hardware changes state only when a 256bit register is written to,
19423 but we need to prevent the compiler from moving the optimal insertion
19424 point above an eventual read from a 256bit or 512bit register. */
19425 subrtx_iterator::array_type array;
19426 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
19427 if (ix86_check_avx_upper_register (*iter))
19428 return AVX_U128_DIRTY;
19430 return AVX_U128_ANY;
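/* A sketch of the motivating case (the callee name is made up):

__m256 v = _mm256_add_ps (a, b);	/* upper halves become "dirty" */
legacy_sse_callee ();			/* compiled without AVX */

Mode switching uses the DIRTY/CLEAN states computed here to insert a
vzeroupper before such calls, avoiding the AVX/SSE transition penalty
on hardware that charges for mixing the two encodings. */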
19433 /* Return mode that i387 must be switched into
19434 prior to the execution of insn. */
19436 static int
19437 ix86_i387_mode_needed (int entity, rtx_insn *insn)
19439 enum attr_i387_cw mode;
19441 /* The mode UNINITIALIZED is used to store the control word after a
19442 function call or ASM pattern. The mode ANY specifies that the function
19443 has no requirements on the control word and makes no changes in the
19444 bits we are interested in. */
19446 if (CALL_P (insn)
19447 || (NONJUMP_INSN_P (insn)
19448 && (asm_noperands (PATTERN (insn)) >= 0
19449 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
19450 return I387_CW_UNINITIALIZED;
19452 if (recog_memoized (insn) < 0)
19453 return I387_CW_ANY;
19455 mode = get_attr_i387_cw (insn);
19457 switch (entity)
19459 case I387_TRUNC:
19460 if (mode == I387_CW_TRUNC)
19461 return mode;
19462 break;
19464 case I387_FLOOR:
19465 if (mode == I387_CW_FLOOR)
19466 return mode;
19467 break;
19469 case I387_CEIL:
19470 if (mode == I387_CW_CEIL)
19471 return mode;
19472 break;
19474 case I387_MASK_PM:
19475 if (mode == I387_CW_MASK_PM)
19476 return mode;
19477 break;
19479 default:
19480 gcc_unreachable ();
19483 return I387_CW_ANY;
19486 /* Return mode that entity must be switched into
19487 prior to the execution of insn. */
19489 static int
19490 ix86_mode_needed (int entity, rtx_insn *insn)
19492 switch (entity)
19494 case X86_DIRFLAG:
19495 return ix86_dirflag_mode_needed (insn);
19496 case AVX_U128:
19497 return ix86_avx_u128_mode_needed (insn);
19498 case I387_TRUNC:
19499 case I387_FLOOR:
19500 case I387_CEIL:
19501 case I387_MASK_PM:
19502 return ix86_i387_mode_needed (entity, insn);
19503 default:
19504 gcc_unreachable ();
19506 return 0;
19509 /* Check if a 256bit or 512bit AVX register is referenced in stores. */
19511 static void
19512 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
19514 if (ix86_check_avx_upper_register (dest))
19516 bool *used = (bool *) data;
19517 *used = true;
19521 /* Calculate mode of upper 128bit AVX registers after the insn. */
19523 static int
19524 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19526 rtx pat = PATTERN (insn);
19528 if (vzeroupper_operation (pat, VOIDmode)
19529 || vzeroall_operation (pat, VOIDmode))
19530 return AVX_U128_CLEAN;
19532 /* We know that the state is clean after a CALL insn if no 256bit
19533 or 512bit register is used as the function return register. */
19534 if (CALL_P (insn))
19536 bool avx_upper_reg_found = false;
19537 note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
19539 return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19542 /* Otherwise, return current mode. Remember that if insn
19543 references AVX 256bit or 512bit registers, the mode was already
19544 changed to DIRTY from MODE_NEEDED. */
19545 return mode;
19548 /* Return the mode that an insn results in. */
19550 static int
19551 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19553 switch (entity)
19555 case X86_DIRFLAG:
19556 return mode;
19557 case AVX_U128:
19558 return ix86_avx_u128_mode_after (mode, insn);
19559 case I387_TRUNC:
19560 case I387_FLOOR:
19561 case I387_CEIL:
19562 case I387_MASK_PM:
19563 return mode;
19564 default:
19565 gcc_unreachable ();
19569 static int
19570 ix86_dirflag_mode_entry (void)
19572 /* For TARGET_CLD or in the interrupt handler we can't assume
19573 direction flag state at function entry. */
19574 if (TARGET_CLD
19575 || cfun->machine->func_type != TYPE_NORMAL)
19576 return X86_DIRFLAG_ANY;
19578 return X86_DIRFLAG_RESET;
19581 static int
19582 ix86_avx_u128_mode_entry (void)
19584 tree arg;
19586 /* Entry mode is set to AVX_U128_DIRTY if there are
19587 256bit or 512bit modes used in function arguments. */
19588 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19589 arg = TREE_CHAIN (arg))
19591 rtx incoming = DECL_INCOMING_RTL (arg);
19593 if (incoming && ix86_check_avx_upper_register (incoming))
19594 return AVX_U128_DIRTY;
19597 return AVX_U128_CLEAN;
19600 /* Return a mode that ENTITY is assumed to be
19601 switched to at function entry. */
19603 static int
19604 ix86_mode_entry (int entity)
19606 switch (entity)
19608 case X86_DIRFLAG:
19609 return ix86_dirflag_mode_entry ();
19610 case AVX_U128:
19611 return ix86_avx_u128_mode_entry ();
19612 case I387_TRUNC:
19613 case I387_FLOOR:
19614 case I387_CEIL:
19615 case I387_MASK_PM:
19616 return I387_CW_ANY;
19617 default:
19618 gcc_unreachable ();
19622 static int
19623 ix86_avx_u128_mode_exit (void)
19625 rtx reg = crtl->return_rtx;
19627 /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
19628 or 512bit modes used in the function return register. */
19629 if (reg && ix86_check_avx_upper_register (reg))
19630 return AVX_U128_DIRTY;
19632 return AVX_U128_CLEAN;
19635 /* Return a mode that ENTITY is assumed to be
19636 switched to at function exit. */
19638 static int
19639 ix86_mode_exit (int entity)
19641 switch (entity)
19643 case X86_DIRFLAG:
19644 return X86_DIRFLAG_ANY;
19645 case AVX_U128:
19646 return ix86_avx_u128_mode_exit ();
19647 case I387_TRUNC:
19648 case I387_FLOOR:
19649 case I387_CEIL:
19650 case I387_MASK_PM:
19651 return I387_CW_ANY;
19652 default:
19653 gcc_unreachable ();
19657 static int
19658 ix86_mode_priority (int, int n)
19660 return n;
19663 /* Output code to initialize control word copies used by trunc?f?i and
19664 rounding patterns. CURRENT_MODE is set to current control word,
19665 while NEW_MODE is set to new control word. */
19667 static void
19668 emit_i387_cw_initialization (int mode)
19670 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19671 rtx new_mode;
19673 enum ix86_stack_slot slot;
19675 rtx reg = gen_reg_rtx (HImode);
19677 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19678 emit_move_insn (reg, copy_rtx (stored_mode));
19680 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19681 || optimize_insn_for_size_p ())
19683 switch (mode)
19685 case I387_CW_TRUNC:
19686 /* round toward zero (truncate) */
19687 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19688 slot = SLOT_CW_TRUNC;
19689 break;
19691 case I387_CW_FLOOR:
19692 /* round down toward -oo */
19693 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19694 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19695 slot = SLOT_CW_FLOOR;
19696 break;
19698 case I387_CW_CEIL:
19699 /* round up toward +oo */
19700 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19701 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19702 slot = SLOT_CW_CEIL;
19703 break;
19705 case I387_CW_MASK_PM:
19706 /* mask precision exception for nearbyint() */
19707 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19708 slot = SLOT_CW_MASK_PM;
19709 break;
19711 default:
19712 gcc_unreachable ();
19715 else
19717 switch (mode)
19719 case I387_CW_TRUNC:
19720 /* round toward zero (truncate) */
19721 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19722 slot = SLOT_CW_TRUNC;
19723 break;
19725 case I387_CW_FLOOR:
19726 /* round down toward -oo */
19727 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19728 slot = SLOT_CW_FLOOR;
19729 break;
19731 case I387_CW_CEIL:
19732 /* round up toward +oo */
19733 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19734 slot = SLOT_CW_CEIL;
19735 break;
19737 case I387_CW_MASK_PM:
19738 /* mask precision exception for nearbyint() */
19739 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19740 slot = SLOT_CW_MASK_PM;
19741 break;
19743 default:
19744 gcc_unreachable ();
19748 gcc_assert (slot < MAX_386_STACK_LOCALS);
19750 new_mode = assign_386_stack_local (HImode, slot);
19751 emit_move_insn (new_mode, reg);
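/* The magic constants above follow the x87 control word layout: the
rounding-control field is bits 11:10 (mask 0x0c00) with 00 = to nearest,
01 = down (floor), 10 = up (ceil), 11 = toward zero (truncate), and the
precision-exception mask PM is bit 5 (0x0020). For example, or-ing
0x0c00 into the default control word 0x037f gives 0x0f7f, which
truncates. */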
19754 /* Emit vzeroupper. */
19756 void
19757 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19759 int i;
19761 /* Cancel automatic vzeroupper insertion if there are
19762 live call-saved SSE registers at the insertion point. */
19764 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19765 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19766 return;
19768 if (TARGET_64BIT)
19769 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19770 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19771 return;
19773 emit_insn (gen_avx_vzeroupper ());
19778 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
19779 is the set of hard registers live at the point where the insn(s)
19780 are to be inserted. */
19782 static void
19783 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19784 HARD_REG_SET regs_live)
19786 switch (entity)
19788 case X86_DIRFLAG:
19789 if (mode == X86_DIRFLAG_RESET)
19790 emit_insn (gen_cld ());
19791 break;
19792 case AVX_U128:
19793 if (mode == AVX_U128_CLEAN)
19794 ix86_avx_emit_vzeroupper (regs_live);
19795 break;
19796 case I387_TRUNC:
19797 case I387_FLOOR:
19798 case I387_CEIL:
19799 case I387_MASK_PM:
19800 if (mode != I387_CW_ANY
19801 && mode != I387_CW_UNINITIALIZED)
19802 emit_i387_cw_initialization (mode);
19803 break;
19804 default:
19805 gcc_unreachable ();
19809 /* Output code for INSN to convert a float to a signed int. OPERANDS
19810 are the insn operands. The output may be [HSD]Imode and the input
19811 operand may be [SDX]Fmode. */
19813 const char *
19814 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19816 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19817 bool dimode_p = GET_MODE (operands[0]) == DImode;
19818 int round_mode = get_attr_i387_cw (insn);
19820 static char buf[40];
19821 const char *p;
19823 /* Jump through a hoop or two for DImode, since the hardware has no
19824 non-popping instruction. We used to do this a different way, but
19825 that was somewhat fragile and broke with post-reload splitters. */
19826 if ((dimode_p || fisttp) && !stack_top_dies)
19827 output_asm_insn ("fld\t%y1", operands);
19829 gcc_assert (STACK_TOP_P (operands[1]));
19830 gcc_assert (MEM_P (operands[0]));
19831 gcc_assert (GET_MODE (operands[1]) != TFmode);
19833 if (fisttp)
19834 return "fisttp%Z0\t%0";
19836 strcpy (buf, "fist");
19838 if (round_mode != I387_CW_ANY)
19839 output_asm_insn ("fldcw\t%3", operands);
19841 p = "p%Z0\t%0";
19842 strcat (buf, p + !(stack_top_dies || dimode_p));
19844 output_asm_insn (buf, operands);
19846 if (round_mode != I387_CW_ANY)
19847 output_asm_insn ("fldcw\t%2", operands);
19849 return "";
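/* Roughly, for the common non-fisttp case this emits (a sketch only):
fldcw	%3		; switch to the truncating control word
fistp	%0		; or plain "fist" when the value must stay live
fldcw	%2		; restore the original control word
with an extra "fld %y1" up front when a DImode store would otherwise
pop a value that is still needed on the x87 stack. */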
19852 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19853 have the values zero or one, indicates the ffreep insn's operand
19854 from the OPERANDS array. */
19856 static const char *
19857 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19859 if (TARGET_USE_FFREEP)
19860 #ifdef HAVE_AS_IX86_FFREEP
19861 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19862 #else
19864 static char retval[32];
19865 int regno = REGNO (operands[opno]);
19867 gcc_assert (STACK_REGNO_P (regno));
19869 regno -= FIRST_STACK_REG;
19871 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19872 return retval;
19874 #endif
19876 return opno ? "fstp\t%y1" : "fstp\t%y0";
19880 /* Output code for INSN to compare OPERANDS. EFLAGS_P is true when fcomi
19881 should be used. UNORDERED_P is true when fucom should be used. */
19883 const char *
19884 output_fp_compare (rtx_insn *insn, rtx *operands,
19885 bool eflags_p, bool unordered_p)
19887 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19888 bool stack_top_dies;
19890 static char buf[40];
19891 const char *p;
19893 gcc_assert (STACK_TOP_P (xops[0]));
19895 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19897 if (eflags_p)
19899 p = unordered_p ? "fucomi" : "fcomi";
19900 strcpy (buf, p);
19902 p = "p\t{%y1, %0|%0, %y1}";
19903 strcat (buf, p + !stack_top_dies);
19905 return buf;
19908 if (STACK_REG_P (xops[1])
19909 && stack_top_dies
19910 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19912 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19914 /* If both the top of the 387 stack die, and the other operand
19915 is also a stack register that dies, then this must be a
19916 `fcompp' float compare. */
19917 p = unordered_p ? "fucompp" : "fcompp";
19918 strcpy (buf, p);
19920 else if (const0_operand (xops[1], VOIDmode))
19922 gcc_assert (!unordered_p);
19923 strcpy (buf, "ftst");
19925 else
19927 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19929 gcc_assert (!unordered_p);
19930 p = "ficom";
19932 else
19933 p = unordered_p ? "fucom" : "fcom";
19935 strcpy (buf, p);
19937 p = "p%Z2\t%y2";
19938 strcat (buf, p + !stack_top_dies);
19941 output_asm_insn (buf, operands);
19942 return "fnstsw\t%0";
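/* For context (informational): fcomi/fucomi write ZF/PF/CF in EFLAGS
directly, so the first branch above returns a single insn. The legacy
fcom/fucom/ftst path only sets the x87 status word, which is why it is
followed by the "fnstsw %0" returned at the end; the caller then tests
or sahf's that result to get a usable condition. */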
19945 void
19946 ix86_output_addr_vec_elt (FILE *file, int value)
19948 const char *directive = ASM_LONG;
19950 #ifdef ASM_QUAD
19951 if (TARGET_LP64)
19952 directive = ASM_QUAD;
19953 #else
19954 gcc_assert (!TARGET_64BIT);
19955 #endif
19957 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19960 void
19961 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19963 const char *directive = ASM_LONG;
19965 #ifdef ASM_QUAD
19966 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19967 directive = ASM_QUAD;
19968 #else
19969 gcc_assert (!TARGET_64BIT);
19970 #endif
19971 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19972 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19973 fprintf (file, "%s%s%d-%s%d\n",
19974 directive, LPREFIX, value, LPREFIX, rel);
19975 else if (HAVE_AS_GOTOFF_IN_DATA)
19976 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19977 #if TARGET_MACHO
19978 else if (TARGET_MACHO)
19980 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19981 machopic_output_function_base_name (file);
19982 putc ('\n', file);
19984 #endif
19985 else
19986 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19987 GOT_SYMBOL_NAME, LPREFIX, value);
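/* Example output (label numbers are made up): on 64-bit targets a jump
table entry prints as a label difference such as ".long .L5-.L2"; on
32-bit PIC targets where @GOTOFF is allowed in data it is
".long .L5@GOTOFF"; otherwise the _GLOBAL_OFFSET_TABLE_-relative
fallback above is used. */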
19990 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19991 for the target. */
19993 void
19994 ix86_expand_clear (rtx dest)
19996 rtx tmp;
19998 /* We play register width games, which are only valid after reload. */
19999 gcc_assert (reload_completed);
20001 /* Avoid HImode and its attendant prefix byte. */
20002 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
20003 dest = gen_rtx_REG (SImode, REGNO (dest));
20004 tmp = gen_rtx_SET (dest, const0_rtx);
20006 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
20008 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20009 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
20012 emit_insn (tmp);
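/* Rationale, with an illustrative encoding comparison:
"xorl %eax, %eax" is 2 bytes while "movl $0, %eax" is 5, but the xor
form writes EFLAGS, which is why the parallel above attaches a
(clobber (reg:CC flags)) whenever the xor form may be chosen. */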
20015 void
20016 ix86_expand_move (machine_mode mode, rtx operands[])
20018 rtx op0, op1;
20019 rtx tmp, addend = NULL_RTX;
20020 enum tls_model model;
20022 op0 = operands[0];
20023 op1 = operands[1];
20025 switch (GET_CODE (op1))
20027 case CONST:
20028 tmp = XEXP (op1, 0);
20030 if (GET_CODE (tmp) != PLUS
20031 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
20032 break;
20034 op1 = XEXP (tmp, 0);
20035 addend = XEXP (tmp, 1);
20036 /* FALLTHRU */
20038 case SYMBOL_REF:
20039 model = SYMBOL_REF_TLS_MODEL (op1);
20041 if (model)
20042 op1 = legitimize_tls_address (op1, model, true);
20043 else if (ix86_force_load_from_GOT_p (op1))
20045 /* Load the external function address via GOT slot to avoid PLT. */
20046 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
20047 (TARGET_64BIT
20048 ? UNSPEC_GOTPCREL
20049 : UNSPEC_GOT));
20050 op1 = gen_rtx_CONST (Pmode, op1);
20051 op1 = gen_const_mem (Pmode, op1);
20052 set_mem_alias_set (op1, ix86_GOT_alias_set ());
20054 else
20056 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
20057 if (tmp)
20059 op1 = tmp;
20060 if (!addend)
20061 break;
20063 else
20065 op1 = operands[1];
20066 break;
20070 if (addend)
20072 op1 = force_operand (op1, NULL_RTX);
20073 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
20074 op0, 1, OPTAB_DIRECT);
20076 else
20077 op1 = force_operand (op1, op0);
20079 if (op1 == op0)
20080 return;
20082 op1 = convert_to_mode (mode, op1, 1);
20084 default:
20085 break;
20088 if ((flag_pic || MACHOPIC_INDIRECT)
20089 && symbolic_operand (op1, mode))
20091 if (TARGET_MACHO && !TARGET_64BIT)
20093 #if TARGET_MACHO
20094 /* dynamic-no-pic */
20095 if (MACHOPIC_INDIRECT)
20097 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
20098 ? op0 : gen_reg_rtx (Pmode);
20099 op1 = machopic_indirect_data_reference (op1, temp);
20100 if (MACHOPIC_PURE)
20101 op1 = machopic_legitimize_pic_address (op1, mode,
20102 temp == op1 ? 0 : temp);
20104 if (op0 != op1 && GET_CODE (op0) != MEM)
20106 rtx insn = gen_rtx_SET (op0, op1);
20107 emit_insn (insn);
20108 return;
20110 if (GET_CODE (op0) == MEM)
20111 op1 = force_reg (Pmode, op1);
20112 else
20114 rtx temp = op0;
20115 if (GET_CODE (temp) != REG)
20116 temp = gen_reg_rtx (Pmode);
20117 temp = legitimize_pic_address (op1, temp);
20118 if (temp == op0)
20119 return;
20120 op1 = temp;
20122 /* dynamic-no-pic */
20123 #endif
20125 else
20127 if (MEM_P (op0))
20128 op1 = force_reg (mode, op1);
20129 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
20131 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
20132 op1 = legitimize_pic_address (op1, reg);
20133 if (op0 == op1)
20134 return;
20135 op1 = convert_to_mode (mode, op1, 1);
20139 else
20141 if (MEM_P (op0)
20142 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
20143 || !push_operand (op0, mode))
20144 && MEM_P (op1))
20145 op1 = force_reg (mode, op1);
20147 if (push_operand (op0, mode)
20148 && ! general_no_elim_operand (op1, mode))
20149 op1 = copy_to_mode_reg (mode, op1);
20151 /* Force large constants in 64bit compilation into registers
20152 to get them CSEed. */
20153 if (can_create_pseudo_p ()
20154 && (mode == DImode) && TARGET_64BIT
20155 && immediate_operand (op1, mode)
20156 && !x86_64_zext_immediate_operand (op1, VOIDmode)
20157 && !register_operand (op0, mode)
20158 && optimize)
20159 op1 = copy_to_mode_reg (mode, op1);
20161 if (can_create_pseudo_p ()
20162 && CONST_DOUBLE_P (op1))
20164 /* If we are loading a floating point constant to a register,
20165 force the value to memory now, since we'll get better code
20166 out the back end. */
20168 op1 = validize_mem (force_const_mem (mode, op1));
20169 if (!register_operand (op0, mode))
20171 rtx temp = gen_reg_rtx (mode);
20172 emit_insn (gen_rtx_SET (temp, op1));
20173 emit_move_insn (op0, temp);
20174 return;
20179 emit_insn (gen_rtx_SET (op0, op1));
20182 void
20183 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20185 rtx op0 = operands[0], op1 = operands[1];
20186 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
20187 psABI since the biggest alignment is 4 byte for IA MCU psABI. */
20188 unsigned int align = (TARGET_IAMCU
20189 ? GET_MODE_BITSIZE (mode)
20190 : GET_MODE_ALIGNMENT (mode));
20192 if (push_operand (op0, VOIDmode))
20193 op0 = emit_move_resolve_push (mode, op0);
20195 /* Force constants other than zero into memory. We do not know how
20196 the instructions used to build constants modify the upper 64 bits
20197 of the register; once we have that information we may be able
20198 to handle some of them more efficiently. */
20199 if (can_create_pseudo_p ()
20200 && (CONSTANT_P (op1)
20201 || (SUBREG_P (op1)
20202 && CONSTANT_P (SUBREG_REG (op1))))
20203 && ((register_operand (op0, mode)
20204 && !standard_sse_constant_p (op1, mode))
20205 /* ix86_expand_vector_move_misalign() does not like constants. */
20206 || (SSE_REG_MODE_P (mode)
20207 && MEM_P (op0)
20208 && MEM_ALIGN (op0) < align)))
20210 if (SUBREG_P (op1))
20212 machine_mode imode = GET_MODE (SUBREG_REG (op1));
20213 rtx r = force_const_mem (imode, SUBREG_REG (op1));
20214 if (r)
20215 r = validize_mem (r);
20216 else
20217 r = force_reg (imode, SUBREG_REG (op1));
20218 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20220 else
20221 op1 = validize_mem (force_const_mem (mode, op1));
20224 /* We need to check memory alignment for SSE mode since attributes
20225 can make operands unaligned. */
20226 if (can_create_pseudo_p ()
20227 && SSE_REG_MODE_P (mode)
20228 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20229 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20231 rtx tmp[2];
20233 /* ix86_expand_vector_move_misalign() does not like both
20234 arguments in memory. */
20235 if (!register_operand (op0, mode)
20236 && !register_operand (op1, mode))
20237 op1 = force_reg (mode, op1);
20239 tmp[0] = op0; tmp[1] = op1;
20240 ix86_expand_vector_move_misalign (mode, tmp);
20241 return;
20244 /* Make operand1 a register if it isn't already. */
20245 if (can_create_pseudo_p ()
20246 && !register_operand (op0, mode)
20247 && !register_operand (op1, mode))
20249 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20250 return;
20253 emit_insn (gen_rtx_SET (op0, op1));
20256 /* Split 32-byte AVX unaligned load and store if needed. */
20258 static void
20259 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20261 rtx m;
20262 rtx (*extract) (rtx, rtx, rtx);
20263 machine_mode mode;
20265 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20266 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20268 emit_insn (gen_rtx_SET (op0, op1));
20269 return;
20272 rtx orig_op0 = NULL_RTX;
20273 mode = GET_MODE (op0);
20274 switch (GET_MODE_CLASS (mode))
20276 case MODE_VECTOR_INT:
20277 case MODE_INT:
20278 if (mode != V32QImode)
20280 if (!MEM_P (op0))
20282 orig_op0 = op0;
20283 op0 = gen_reg_rtx (V32QImode);
20285 else
20286 op0 = gen_lowpart (V32QImode, op0);
20287 op1 = gen_lowpart (V32QImode, op1);
20288 mode = V32QImode;
20290 break;
20291 case MODE_VECTOR_FLOAT:
20292 break;
20293 default:
20294 gcc_unreachable ();
20297 switch (mode)
20299 default:
20300 gcc_unreachable ();
20301 case E_V32QImode:
20302 extract = gen_avx_vextractf128v32qi;
20303 mode = V16QImode;
20304 break;
20305 case E_V8SFmode:
20306 extract = gen_avx_vextractf128v8sf;
20307 mode = V4SFmode;
20308 break;
20309 case E_V4DFmode:
20310 extract = gen_avx_vextractf128v4df;
20311 mode = V2DFmode;
20312 break;
20315 if (MEM_P (op1))
20317 rtx r = gen_reg_rtx (mode);
20318 m = adjust_address (op1, mode, 0);
20319 emit_move_insn (r, m);
20320 m = adjust_address (op1, mode, 16);
20321 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20322 emit_move_insn (op0, r);
20324 else if (MEM_P (op0))
20326 m = adjust_address (op0, mode, 0);
20327 emit_insn (extract (m, op1, const0_rtx));
20328 m = adjust_address (op0, mode, 16);
20329 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20331 else
20332 gcc_unreachable ();
20334 if (orig_op0)
20335 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
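/* Sketch of the resulting code (the mnemonics are the usual ones, the
operands hypothetical): a split unaligned 256-bit load becomes roughly
vmovups		mem, %xmm0
vinsertf128	$1, mem+16, %ymm0, %ymm0
and a split store becomes
vmovups		%xmm0, mem
vextractf128	$1, %ymm0, mem+16
on targets where the split tuning flags checked above are set. */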
20338 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
20339 straight to ix86_expand_vector_move. */
20340 /* Code generation for scalar reg-reg moves of single and double precision data:
20341 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
20342 movaps reg, reg
20343 else
20344 movss reg, reg
20345 if (x86_sse_partial_reg_dependency == true)
20346 movapd reg, reg
20347 else
20348 movsd reg, reg
20350 Code generation for scalar loads of double precision data:
20351 if (x86_sse_split_regs == true)
20352 movlpd mem, reg (gas syntax)
20353 else
20354 movsd mem, reg
20356 Code generation for unaligned packed loads of single precision data
20357 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
20358 if (x86_sse_unaligned_move_optimal)
20359 movups mem, reg
20361 if (x86_sse_partial_reg_dependency == true)
20363 xorps reg, reg
20364 movlps mem, reg
20365 movhps mem+8, reg
20367 else
20369 movlps mem, reg
20370 movhps mem+8, reg
20373 Code generation for unaligned packed loads of double precision data
20374 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
20375 if (x86_sse_unaligned_move_optimal)
20376 movupd mem, reg
20378 if (x86_sse_split_regs == true)
20380 movlpd mem, reg
20381 movhpd mem+8, reg
20383 else
20385 movsd mem, reg
20386 movhpd mem+8, reg
20390 void
20391 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
20393 rtx op0, op1, m;
20395 op0 = operands[0];
20396 op1 = operands[1];
20398 /* Use unaligned load/store for AVX512 or when optimizing for size. */
20399 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
20401 emit_insn (gen_rtx_SET (op0, op1));
20402 return;
20405 if (TARGET_AVX)
20407 if (GET_MODE_SIZE (mode) == 32)
20408 ix86_avx256_split_vector_move_misalign (op0, op1);
20409 else
20410 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
20411 emit_insn (gen_rtx_SET (op0, op1));
20412 return;
20415 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
20416 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
20418 emit_insn (gen_rtx_SET (op0, op1));
20419 return;
20422 /* ??? If we have typed data, then it would appear that using
20423 movdqu is the only way to get unaligned data loaded with
20424 integer type. */
20425 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
20427 emit_insn (gen_rtx_SET (op0, op1));
20428 return;
20431 if (MEM_P (op1))
20433 if (TARGET_SSE2 && mode == V2DFmode)
20435 rtx zero;
20437 /* When SSE registers are split into halves, we can avoid
20438 writing to the top half twice. */
20439 if (TARGET_SSE_SPLIT_REGS)
20441 emit_clobber (op0);
20442 zero = op0;
20444 else
20446 /* ??? Not sure about the best option for the Intel chips.
20447 The following would seem to satisfy; the register is
20448 entirely cleared, breaking the dependency chain. We
20449 then store to the upper half, with a dependency depth
20450 of one. A rumor has it that Intel recommends two movsd
20451 followed by an unpacklpd, but this is unconfirmed. And
20452 given that the dependency depth of the unpacklpd would
20453 still be one, I'm not sure why this would be better. */
20454 zero = CONST0_RTX (V2DFmode);
20457 m = adjust_address (op1, DFmode, 0);
20458 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20459 m = adjust_address (op1, DFmode, 8);
20460 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20462 else
20464 rtx t;
20466 if (mode != V4SFmode)
20467 t = gen_reg_rtx (V4SFmode);
20468 else
20469 t = op0;
20471 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20472 emit_move_insn (t, CONST0_RTX (V4SFmode));
20473 else
20474 emit_clobber (t);
20476 m = adjust_address (op1, V2SFmode, 0);
20477 emit_insn (gen_sse_loadlps (t, t, m));
20478 m = adjust_address (op1, V2SFmode, 8);
20479 emit_insn (gen_sse_loadhps (t, t, m));
20480 if (mode != V4SFmode)
20481 emit_move_insn (op0, gen_lowpart (mode, t));
20484 else if (MEM_P (op0))
20486 if (TARGET_SSE2 && mode == V2DFmode)
20488 m = adjust_address (op0, DFmode, 0);
20489 emit_insn (gen_sse2_storelpd (m, op1));
20490 m = adjust_address (op0, DFmode, 8);
20491 emit_insn (gen_sse2_storehpd (m, op1));
20493 else
20495 if (mode != V4SFmode)
20496 op1 = gen_lowpart (V4SFmode, op1);
20498 m = adjust_address (op0, V2SFmode, 0);
20499 emit_insn (gen_sse_storelps (m, op1));
20500 m = adjust_address (op0, V2SFmode, 8);
20501 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20504 else
20505 gcc_unreachable ();
20508 /* Helper function of ix86_fixup_binary_operands to canonicalize
20509 operand order. Returns true if the operands should be swapped. */
20511 static bool
20512 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20513 rtx operands[])
20515 rtx dst = operands[0];
20516 rtx src1 = operands[1];
20517 rtx src2 = operands[2];
20519 /* If the operation is not commutative, we can't do anything. */
20520 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
20521 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
20522 return false;
20524 /* Highest priority is that src1 should match dst. */
20525 if (rtx_equal_p (dst, src1))
20526 return false;
20527 if (rtx_equal_p (dst, src2))
20528 return true;
20530 /* Next highest priority is that immediate constants come second. */
20531 if (immediate_operand (src2, mode))
20532 return false;
20533 if (immediate_operand (src1, mode))
20534 return true;
20536 /* Lowest priority is that memory references should come second. */
20537 if (MEM_P (src2))
20538 return false;
20539 if (MEM_P (src1))
20540 return true;
20542 return false;
20546 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20547 destination to use for the operation. If different from the true
20548 destination in operands[0], a copy operation will be required. */
20551 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20552 rtx operands[])
20554 rtx dst = operands[0];
20555 rtx src1 = operands[1];
20556 rtx src2 = operands[2];
20558 /* Canonicalize operand order. */
20559 if (ix86_swap_binary_operands_p (code, mode, operands))
20561 /* It is invalid to swap operands of different modes. */
20562 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20564 std::swap (src1, src2);
20567 /* Both source operands cannot be in memory. */
20568 if (MEM_P (src1) && MEM_P (src2))
20570 /* Optimization: Only read from memory once. */
20571 if (rtx_equal_p (src1, src2))
20573 src2 = force_reg (mode, src2);
20574 src1 = src2;
20576 else if (rtx_equal_p (dst, src1))
20577 src2 = force_reg (mode, src2);
20578 else
20579 src1 = force_reg (mode, src1);
20582 /* If the destination is memory, and we do not have matching source
20583 operands, do things in registers. */
20584 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20585 dst = gen_reg_rtx (mode);
20587 /* Source 1 cannot be a constant. */
20588 if (CONSTANT_P (src1))
20589 src1 = force_reg (mode, src1);
20591 /* Source 1 cannot be a non-matching memory. */
20592 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20593 src1 = force_reg (mode, src1);
20595 /* Improve address combine. */
20596 if (code == PLUS
20597 && GET_MODE_CLASS (mode) == MODE_INT
20598 && MEM_P (src2))
20599 src2 = force_reg (mode, src2);
20601 operands[1] = src1;
20602 operands[2] = src2;
20603 return dst;
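/* Illustration (hypothetical source): for "*p = *p + *q" the two-address
x86 forms cannot take two memory inputs, so one source is forced into a
register here; and because a memory destination must match the first
source, "*p = x + y" gets a fresh register destination here, followed
by a store back in ix86_expand_binary_operator. */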
20606 /* Similarly, but assume that the destination has already been
20607 set up properly. */
20609 void
20610 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20611 machine_mode mode, rtx operands[])
20613 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20614 gcc_assert (dst == operands[0]);
20617 /* Attempt to expand a binary operator. Make the expansion closer to the
20618 actual machine than just general_operand, which will allow 3 separate
20619 memory references (one output, two input) in a single insn. */
20621 void
20622 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20623 rtx operands[])
20625 rtx src1, src2, dst, op, clob;
20627 dst = ix86_fixup_binary_operands (code, mode, operands);
20628 src1 = operands[1];
20629 src2 = operands[2];
20631 /* Emit the instruction. */
20633 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20635 if (reload_completed
20636 && code == PLUS
20637 && !rtx_equal_p (dst, src1))
20639 /* This is going to be an LEA; avoid splitting it later. */
20640 emit_insn (op);
20642 else
20644 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20645 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20648 /* Fix up the destination if needed. */
20649 if (dst != operands[0])
20650 emit_move_insn (operands[0], dst);
20653 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20654 the given OPERANDS. */
20656 void
20657 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20658 rtx operands[])
20660 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20661 if (SUBREG_P (operands[1]))
20663 op1 = operands[1];
20664 op2 = operands[2];
20666 else if (SUBREG_P (operands[2]))
20668 op1 = operands[2];
20669 op2 = operands[1];
20671 /* Optimize (__m128i) d | (__m128i) e and similar code
20672 when d and e are float vectors into float vector logical
20673 insn. In C/C++ without using intrinsics there is no other way
20674 to express vector logical operation on float vectors than
20675 to cast them temporarily to integer vectors. */
20676 if (op1
20677 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20678 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20679 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20680 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20681 && SUBREG_BYTE (op1) == 0
20682 && (GET_CODE (op2) == CONST_VECTOR
20683 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20684 && SUBREG_BYTE (op2) == 0))
20685 && can_create_pseudo_p ())
20687 rtx dst;
20688 switch (GET_MODE (SUBREG_REG (op1)))
20690 case E_V4SFmode:
20691 case E_V8SFmode:
20692 case E_V16SFmode:
20693 case E_V2DFmode:
20694 case E_V4DFmode:
20695 case E_V8DFmode:
20696 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20697 if (GET_CODE (op2) == CONST_VECTOR)
20699 op2 = gen_lowpart (GET_MODE (dst), op2);
20700 op2 = force_reg (GET_MODE (dst), op2);
20702 else
20704 op1 = operands[1];
20705 op2 = SUBREG_REG (operands[2]);
20706 if (!vector_operand (op2, GET_MODE (dst)))
20707 op2 = force_reg (GET_MODE (dst), op2);
20709 op1 = SUBREG_REG (op1);
20710 if (!vector_operand (op1, GET_MODE (dst)))
20711 op1 = force_reg (GET_MODE (dst), op1);
20712 emit_insn (gen_rtx_SET (dst,
20713 gen_rtx_fmt_ee (code, GET_MODE (dst),
20714 op1, op2)));
20715 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20716 return;
20717 default:
20718 break;
20721 if (!vector_operand (operands[1], mode))
20722 operands[1] = force_reg (mode, operands[1]);
20723 if (!vector_operand (operands[2], mode))
20724 operands[2] = force_reg (mode, operands[2]);
20725 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20726 emit_insn (gen_rtx_SET (operands[0],
20727 gen_rtx_fmt_ee (code, mode, operands[1],
20728 operands[2])));
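/* The case this optimizes, as a sketch (GNU vector-extension syntax):

__m128 d, e;
__m128i r = (__m128i) d | (__m128i) e;

Instead of bouncing the float data into the integer domain for a "por",
the code above performs the IOR in the float domain (e.g. "orps"),
which typically avoids a bypass-delay penalty between the SSE integer
and floating-point units. */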
20731 /* Return TRUE or FALSE depending on whether the binary operator meets the
20732 appropriate constraints. */
20734 bool
20735 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20736 rtx operands[3])
20738 rtx dst = operands[0];
20739 rtx src1 = operands[1];
20740 rtx src2 = operands[2];
20742 /* Both source operands cannot be in memory. */
20743 if (MEM_P (src1) && MEM_P (src2))
20744 return false;
20746 /* Canonicalize operand order for commutative operators. */
20747 if (ix86_swap_binary_operands_p (code, mode, operands))
20748 std::swap (src1, src2);
20750 /* If the destination is memory, we must have a matching source operand. */
20751 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20752 return false;
20754 /* Source 1 cannot be a constant. */
20755 if (CONSTANT_P (src1))
20756 return false;
20758 /* Source 1 cannot be a non-matching memory. */
20759 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20760 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20761 return (code == AND
20762 && (mode == HImode
20763 || mode == SImode
20764 || (TARGET_64BIT && mode == DImode))
20765 && satisfies_constraint_L (src2));
20767 return true;
20770 /* Attempt to expand a unary operator. Make the expansion closer to the
20771 actual machine than just general_operand, which will allow 2 separate
20772 memory references (one output, one input) in a single insn. */
20774 void
20775 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20776 rtx operands[])
20778 bool matching_memory = false;
20779 rtx src, dst, op, clob;
20781 dst = operands[0];
20782 src = operands[1];
20784 /* If the destination is memory, and we do not have matching source
20785 operands, do things in registers. */
20786 if (MEM_P (dst))
20788 if (rtx_equal_p (dst, src))
20789 matching_memory = true;
20790 else
20791 dst = gen_reg_rtx (mode);
20794 /* When source operand is memory, destination must match. */
20795 if (MEM_P (src) && !matching_memory)
20796 src = force_reg (mode, src);
20798 /* Emit the instruction. */
20800 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20802 if (code == NOT)
20803 emit_insn (op);
20804 else
20806 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20807 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20810 /* Fix up the destination if needed. */
20811 if (dst != operands[0])
20812 emit_move_insn (operands[0], dst);
20815 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20816 divisor are within the range [0-255]. */
20818 void
20819 ix86_split_idivmod (machine_mode mode, rtx operands[],
20820 bool signed_p)
20822 rtx_code_label *end_label, *qimode_label;
20823 rtx div, mod;
20824 rtx_insn *insn;
20825 rtx scratch, tmp0, tmp1, tmp2;
20826 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20827 rtx (*gen_zero_extend) (rtx, rtx);
20828 rtx (*gen_test_ccno_1) (rtx, rtx);
20830 switch (mode)
20832 case E_SImode:
20833 if (GET_MODE (operands[0]) == SImode)
20835 if (GET_MODE (operands[1]) == SImode)
20836 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20837 else
20838 gen_divmod4_1
20839 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20840 gen_zero_extend = gen_zero_extendqisi2;
20842 else
20844 gen_divmod4_1
20845 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20846 gen_zero_extend = gen_zero_extendqidi2;
20848 gen_test_ccno_1 = gen_testsi_ccno_1;
20849 break;
20850 case E_DImode:
20851 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20852 gen_test_ccno_1 = gen_testdi_ccno_1;
20853 gen_zero_extend = gen_zero_extendqidi2;
20854 break;
20855 default:
20856 gcc_unreachable ();
20859 end_label = gen_label_rtx ();
20860 qimode_label = gen_label_rtx ();
20862 scratch = gen_reg_rtx (mode);
20864 /* Use 8bit unsigned divmod if dividend and divisor are within
20865 the range [0-255]. */
20866 emit_move_insn (scratch, operands[2]);
20867 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20868 scratch, 1, OPTAB_DIRECT);
20869 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20870 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20871 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20872 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20873 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20874 pc_rtx);
20875 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20876 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20877 JUMP_LABEL (insn) = qimode_label;
20879 /* Generate the original signed/unsigned divmod. */
20880 div = gen_divmod4_1 (operands[0], operands[1],
20881 operands[2], operands[3]);
20882 emit_insn (div);
20884 /* Branch to the end. */
20885 emit_jump_insn (gen_jump (end_label));
20886 emit_barrier ();
20888 /* Generate 8bit unsigned divide. */
20889 emit_label (qimode_label);
20890 /* Don't use operands[0] for the result of the 8bit divide since not all
20891 registers support QImode ZERO_EXTRACT. */
20892 tmp0 = lowpart_subreg (HImode, scratch, mode);
20893 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20894 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20895 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20897 if (signed_p)
20899 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20900 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20902 else
20904 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20905 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20907 if (mode == SImode)
20909 if (GET_MODE (operands[0]) != SImode)
20910 div = gen_rtx_ZERO_EXTEND (DImode, div);
20911 if (GET_MODE (operands[1]) != SImode)
20912 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20915 /* Extract remainder from AH. */
20916 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20917 tmp0, GEN_INT (8), GEN_INT (8));
20918 if (REG_P (operands[1]))
20919 insn = emit_move_insn (operands[1], tmp1);
20920 else
20922 /* Need a new scratch register since the old one has the result
20923 of the 8bit divide. */
20924 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20925 emit_move_insn (scratch, tmp1);
20926 insn = emit_move_insn (operands[1], scratch);
20928 set_unique_reg_note (insn, REG_EQUAL, mod);
20930 /* Zero extend quotient from AL. */
20931 tmp1 = gen_lowpart (QImode, tmp0);
20932 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20933 set_unique_reg_note (insn, REG_EQUAL, div);
20935 emit_label (end_label);
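/* Shape of the emitted sequence, roughly (SImode, unsigned; labels and
register choices are illustrative, the allocator decides the latter):
movl	dividend, %scratch
orl	divisor, %scratch
testl	$-256, %scratch		; do both operands fit in 8 bits?
je	.Lqimode
...full 32-bit div...
jmp	.Ldone
.Lqimode:
divb	...			; AL = quotient, AH = remainder
.Ldone:
The QImode path is worthwhile because an 8-bit divide is much cheaper
on most cores when the operands turn out to be small at run time. */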
20938 #define LEA_MAX_STALL (3)
20939 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20941 /* Increase given DISTANCE in half-cycles according to
20942 dependencies between PREV and NEXT instructions.
20943 Add 1 half-cycle if there is no dependency and
20944 go to the next cycle if there is some dependency. */
20946 static unsigned int
20947 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20949 df_ref def, use;
20951 if (!prev || !next)
20952 return distance + (distance & 1) + 2;
20954 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20955 return distance + 1;
20957 FOR_EACH_INSN_USE (use, next)
20958 FOR_EACH_INSN_DEF (def, prev)
20959 if (!DF_REF_IS_ARTIFICIAL (def)
20960 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20961 return distance + (distance & 1) + 2;
20963 return distance + 1;
20966 /* Function checks if instruction INSN defines register number
20967 REGNO1 or REGNO2. */
20969 static bool
20970 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20971 rtx_insn *insn)
20973 df_ref def;
20975 FOR_EACH_INSN_DEF (def, insn)
20976 if (DF_REF_REG_DEF_P (def)
20977 && !DF_REF_IS_ARTIFICIAL (def)
20978 && (regno1 == DF_REF_REGNO (def)
20979 || regno2 == DF_REF_REGNO (def)))
20980 return true;
20982 return false;
20985 /* Function checks if instruction INSN uses register number
20986 REGNO as a part of address expression. */
20988 static bool
20989 insn_uses_reg_mem (unsigned int regno, rtx insn)
20991 df_ref use;
20993 FOR_EACH_INSN_USE (use, insn)
20994 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20995 return true;
20997 return false;
21000 /* Search backward for a non-AGU definition of register number REGNO1
21001 or register number REGNO2 in the basic block, starting from instruction
21002 START up to the head of the basic block or instruction INSN.
21004 Set *FOUND to true if a definition was found
21005 and to false otherwise.
21007 The distance in half-cycles between START and the found instruction, or
21008 the head of the BB, is added to DISTANCE and returned. */
21010 static int
21011 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
21012 rtx_insn *insn, int distance,
21013 rtx_insn *start, bool *found)
21015 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
21016 rtx_insn *prev = start;
21017 rtx_insn *next = NULL;
21019 *found = false;
21021 while (prev
21022 && prev != insn
21023 && distance < LEA_SEARCH_THRESHOLD)
21025 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
21027 distance = increase_distance (prev, next, distance);
21028 if (insn_defines_reg (regno1, regno2, prev))
21030 if (recog_memoized (prev) < 0
21031 || get_attr_type (prev) != TYPE_LEA)
21033 *found = true;
21034 return distance;
21038 next = prev;
21040 if (prev == BB_HEAD (bb))
21041 break;
21043 prev = PREV_INSN (prev);
21046 return distance;
21049 /* Search backward for non-agu definition of register number REGNO1
21050 or register number REGNO2 in INSN's basic block until
21051 1. Pass LEA_SEARCH_THRESHOLD instructions, or
21052 2. Reach neighbor BBs boundary, or
21053 3. Reach agu definition.
21054 Returns the distance between the non-agu definition point and INSN.
21055 If no definition point, returns -1. */
21057 static int
21058 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
21059 rtx_insn *insn)
21061 basic_block bb = BLOCK_FOR_INSN (insn);
21062 int distance = 0;
21063 bool found = false;
21065 if (insn != BB_HEAD (bb))
21066 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
21067 distance, PREV_INSN (insn),
21068 &found);
21070 if (!found && distance < LEA_SEARCH_THRESHOLD)
21072 edge e;
21073 edge_iterator ei;
21074 bool simple_loop = false;
21076 FOR_EACH_EDGE (e, ei, bb->preds)
21077 if (e->src == bb)
21079 simple_loop = true;
21080 break;
21083 if (simple_loop)
21084 distance = distance_non_agu_define_in_bb (regno1, regno2,
21085 insn, distance,
21086 BB_END (bb), &found);
21087 else
21089 int shortest_dist = -1;
21090 bool found_in_bb = false;
21092 FOR_EACH_EDGE (e, ei, bb->preds)
21094 int bb_dist
21095 = distance_non_agu_define_in_bb (regno1, regno2,
21096 insn, distance,
21097 BB_END (e->src),
21098 &found_in_bb);
21099 if (found_in_bb)
21101 if (shortest_dist < 0)
21102 shortest_dist = bb_dist;
21103 else if (bb_dist > 0)
21104 shortest_dist = MIN (bb_dist, shortest_dist);
21106 found = true;
21110 distance = shortest_dist;
21114 /* get_attr_type may modify recog data. We want to make sure
21115 that recog data is valid for instruction INSN, on which
21116 distance_non_agu_define is called. INSN is unchanged here. */
21117 extract_insn_cached (insn);
21119 if (!found)
21120 return -1;
21122 return distance >> 1;
21125 /* Return the distance in half-cycles between INSN and the next
21126 insn that uses register number REGNO in a memory address, added
21127 to DISTANCE. Return -1 if REGNO is set.
21129 Set *FOUND to true if a register use was found and
21130 to false otherwise.
21131 Set *REDEFINED to true if a register redefinition was
21132 found and to false otherwise. */
21134 static int
21135 distance_agu_use_in_bb (unsigned int regno,
21136 rtx_insn *insn, int distance, rtx_insn *start,
21137 bool *found, bool *redefined)
21139 basic_block bb = NULL;
21140 rtx_insn *next = start;
21141 rtx_insn *prev = NULL;
21143 *found = false;
21144 *redefined = false;
21146 if (start != NULL_RTX)
21148 bb = BLOCK_FOR_INSN (start);
21149 if (start != BB_HEAD (bb))
21150 /* If insn and start belong to the same bb, set prev to insn,
21151 so the call to increase_distance will increase the distance
21152 between insns by 1. */
21153 prev = insn;
21156 while (next
21157 && next != insn
21158 && distance < LEA_SEARCH_THRESHOLD)
21160 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
21162 distance = increase_distance(prev, next, distance);
21163 if (insn_uses_reg_mem (regno, next))
21165 /* Return DISTANCE if OP0 is used in memory
21166 address in NEXT. */
21167 *found = true;
21168 return distance;
21171 if (insn_defines_reg (regno, INVALID_REGNUM, next))
21173 /* Return -1 if OP0 is set in NEXT. */
21174 *redefined = true;
21175 return -1;
21178 prev = next;
21181 if (next == BB_END (bb))
21182 break;
21184 next = NEXT_INSN (next);
21187 return distance;
21190 /* Return the distance between INSN and the next insn that uses
21191 register number REGNO0 in a memory address. Return -1 if no such
21192 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
21194 static int
21195 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21197 basic_block bb = BLOCK_FOR_INSN (insn);
21198 int distance = 0;
21199 bool found = false;
21200 bool redefined = false;
21202 if (insn != BB_END (bb))
21203 distance = distance_agu_use_in_bb (regno0, insn, distance,
21204 NEXT_INSN (insn),
21205 &found, &redefined);
21207 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21209 edge e;
21210 edge_iterator ei;
21211 bool simple_loop = false;
21213 FOR_EACH_EDGE (e, ei, bb->succs)
21214 if (e->dest == bb)
21216 simple_loop = true;
21217 break;
21220 if (simple_loop)
21221 distance = distance_agu_use_in_bb (regno0, insn,
21222 distance, BB_HEAD (bb),
21223 &found, &redefined);
21224 else
21226 int shortest_dist = -1;
21227 bool found_in_bb = false;
21228 bool redefined_in_bb = false;
21230 FOR_EACH_EDGE (e, ei, bb->succs)
21232 int bb_dist
21233 = distance_agu_use_in_bb (regno0, insn,
21234 distance, BB_HEAD (e->dest),
21235 &found_in_bb, &redefined_in_bb);
21236 if (found_in_bb)
21238 if (shortest_dist < 0)
21239 shortest_dist = bb_dist;
21240 else if (bb_dist > 0)
21241 shortest_dist = MIN (bb_dist, shortest_dist);
21243 found = true;
21247 distance = shortest_dist;
21251 if (!found || redefined)
21252 return -1;
21254 return distance >> 1;
21257 /* Define this macro to tune LEA priority vs ADD; it takes effect when
21258 there is a dilemma of choosing LEA or ADD.
21259 Negative value: ADD is preferred over LEA
21260 Zero: Neutral
21261 Positive value: LEA is preferred over ADD */
21262 #define IX86_LEA_PRIORITY 0
21264 /* Return true if using lea INSN has a performance advantage
21265 over a sequence of instructions. The instruction sequence has
21266 SPLIT_COST cycles higher latency than the lea latency. */
21268 static bool
21269 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21270 unsigned int regno2, int split_cost, bool has_scale)
21272 int dist_define, dist_use;
21274 /* For Silvermont, if a 2-source or 3-source LEA is used for a
21275 non-destructive destination, or because the ability to use
21276 SCALE is wanted, the use of LEA is justified. */
21277 if (TARGET_SILVERMONT || TARGET_INTEL)
21279 if (has_scale)
21280 return true;
21281 if (split_cost < 1)
21282 return false;
21283 if (regno0 == regno1 || regno0 == regno2)
21284 return false;
21285 return true;
21288 dist_define = distance_non_agu_define (regno1, regno2, insn);
21289 dist_use = distance_agu_use (regno0, insn);
21291 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21293 /* If there is no non-AGU operand definition, no AGU
21294 operand usage and the split cost is 0, then both the lea
21295 and non-lea variants have the same priority. Currently
21296 we prefer lea for 64-bit code and the non-lea variant for
21297 32-bit code. */
21298 if (dist_use < 0 && split_cost == 0)
21299 return TARGET_64BIT || IX86_LEA_PRIORITY;
21300 else
21301 return true;
21304 /* With a longer definition distance, lea is preferable.
21305 Here we adjust the distance to take into account the splitting
21306 cost and the lea priority. */
21307 dist_define += split_cost + IX86_LEA_PRIORITY;
21309 /* If there is no use in a memory address then we just check
21310 that the split cost exceeds the AGU stall. */
21311 if (dist_use < 0)
21312 return dist_define > LEA_MAX_STALL;
21314 /* If this insn has both a backward non-AGU dependence and a forward
21315 AGU dependence, the one with the shorter distance takes effect. */
21316 return dist_define >= dist_use;
21319 /* Return true if it is legal to clobber flags by INSN and
21320 false otherwise. */
21322 static bool
21323 ix86_ok_to_clobber_flags (rtx_insn *insn)
21325 basic_block bb = BLOCK_FOR_INSN (insn);
21326 df_ref use;
21327 bitmap live;
21329 while (insn)
21331 if (NONDEBUG_INSN_P (insn))
21333 FOR_EACH_INSN_USE (use, insn)
21334 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21335 return false;
21337 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21338 return true;
21341 if (insn == BB_END (bb))
21342 break;
21344 insn = NEXT_INSN (insn);
21347 live = df_get_live_out(bb);
21348 return !REGNO_REG_SET_P (live, FLAGS_REG);
21351 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21352 move and add to avoid AGU stalls. */
21354 bool
21355 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21357 unsigned int regno0, regno1, regno2;
21359 /* Check if we need to optimize. */
21360 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21361 return false;
21363 /* Check it is correct to split here. */
21364 if (!ix86_ok_to_clobber_flags(insn))
21365 return false;
21367 regno0 = true_regnum (operands[0]);
21368 regno1 = true_regnum (operands[1]);
21369 regno2 = true_regnum (operands[2]);
21371 /* We need to split only adds with a non-destructive
21372 destination operand. */
21373 if (regno0 == regno1 || regno0 == regno2)
21374 return false;
21375 else
21376 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
21379 /* Return true if we should emit lea instruction instead of mov
21380 instruction. */
21382 bool
21383 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
21385 unsigned int regno0, regno1;
21387 /* Check if we need to optimize. */
21388 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21389 return false;
21391 /* Use lea for reg to reg moves only. */
21392 if (!REG_P (operands[0]) || !REG_P (operands[1]))
21393 return false;
21395 regno0 = true_regnum (operands[0]);
21396 regno1 = true_regnum (operands[1]);
21398 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
21401 /* Return true if we need to split lea into a sequence of
21402 instructions to avoid AGU stalls. */
21404 bool
21405 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
21407 unsigned int regno0, regno1, regno2;
21408 int split_cost;
21409 struct ix86_address parts;
21410 int ok;
21412 /* Check we need to optimize. */
21413 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
21414 return false;
21416 /* The "at least two components" test below might not catch simple
21417 move or zero extension insns if parts.base is non-NULL and parts.disp
21418 is const0_rtx as the only components in the address, e.g. if the
21419 register is %rbp or %r13. As this test is much cheaper and moves or
21420 zero extensions are the common case, do this check first. */
21421 if (REG_P (operands[1])
21422 || (SImode_address_operand (operands[1], VOIDmode)
21423 && REG_P (XEXP (operands[1], 0))))
21424 return false;
21426 /* Check if it is OK to split here. */
21427 if (!ix86_ok_to_clobber_flags (insn))
21428 return false;
21430 ok = ix86_decompose_address (operands[1], &parts);
21431 gcc_assert (ok);
21433 /* There should be at least two components in the address. */
21434 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
21435 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
21436 return false;
21438 /* We should not split into add if a non-legitimate PIC
21439 operand is used as the displacement. */
21440 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
21441 return false;
21443 regno0 = true_regnum (operands[0]);
21444 regno1 = INVALID_REGNUM;
21445 regno2 = INVALID_REGNUM;
21447 if (parts.base)
21448 regno1 = true_regnum (parts.base);
21449 if (parts.index)
21450 regno2 = true_regnum (parts.index);
21452 split_cost = 0;
21454 /* Compute how many cycles we will add to the execution time
21455 if we split the lea into a sequence of instructions. */
21456 if (parts.base || parts.index)
21458 /* Have to use a mov instruction if the non-destructive
21459 destination form is used. */
21460 if (regno1 != regno0 && regno2 != regno0)
21461 split_cost += 1;
21463 /* Have to add index to base if both exist. */
21464 if (parts.base && parts.index)
21465 split_cost += 1;
21467 /* Have to use shift and adds if scale is 2 or greater. */
21468 if (parts.scale > 1)
21470 if (regno0 != regno1)
21471 split_cost += 1;
21472 else if (regno2 == regno0)
21473 split_cost += 4;
21474 else
21475 split_cost += parts.scale;
21478 /* Have to use add instruction with immediate if
21479 disp is non zero. */
21480 if (parts.disp && parts.disp != const0_rtx)
21481 split_cost += 1;
21483 /* Subtract the price of lea. */
21484 split_cost -= 1;
21487 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21488 parts.scale > 1);
21491 /* Emit x86 binary operator CODE in mode MODE, where the first operand
21492 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
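/* For instance, ix86_emit_binop (PLUS, SImode, dst, src) emits the single
   insn
     (parallel [(set dst (plus:SI dst src))
                (clobber (reg:CC flags))])
   so that it can match the flag-clobbering arithmetic patterns.  */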
21494 static void
21495 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21496 rtx dst, rtx src)
21498 rtx op, clob;
21500 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21501 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21503 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21506 /* Return true if the definition of regno1 is nearest to the insn. */
21508 static bool
21509 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21511 rtx_insn *prev = insn;
21512 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21514 if (insn == start)
21515 return false;
21516 while (prev && prev != start)
21518 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21520 prev = PREV_INSN (prev);
21521 continue;
21523 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21524 return true;
21525 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21526 return false;
21527 prev = PREV_INSN (prev);
21530 /* None of the regs is defined in the bb. */
21531 return false;
21534 /* Split a lea instruction into a sequence of instructions
21535 which are executed on the ALU to avoid AGU stalls.
21536 It is assumed that it is allowed to clobber the flags register
21537 at the lea position. */
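/* As a rough illustration of the cases handled below: for an SImode
   address of the form base + index * 4 + disp, with a destination
   register distinct from base and index, the lea is replaced by
   something like
       target = index
       target <<= 2
       target += base
       target += disp
   while degenerate addresses (a single register, a constant only, or a
   scale of 1) fall into the simpler branches further down.  */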
21539 void
21540 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21542 unsigned int regno0, regno1, regno2;
21543 struct ix86_address parts;
21544 rtx target, tmp;
21545 int ok, adds;
21547 ok = ix86_decompose_address (operands[1], &parts);
21548 gcc_assert (ok);
21550 target = gen_lowpart (mode, operands[0]);
21552 regno0 = true_regnum (target);
21553 regno1 = INVALID_REGNUM;
21554 regno2 = INVALID_REGNUM;
21556 if (parts.base)
21558 parts.base = gen_lowpart (mode, parts.base);
21559 regno1 = true_regnum (parts.base);
21562 if (parts.index)
21564 parts.index = gen_lowpart (mode, parts.index);
21565 regno2 = true_regnum (parts.index);
21568 if (parts.disp)
21569 parts.disp = gen_lowpart (mode, parts.disp);
21571 if (parts.scale > 1)
21573 /* Case r1 = r1 + ... */
21574 if (regno1 == regno0)
21576 /* If we have the case r1 = r1 + C * r2 then we
21577 would have to use multiplication, which is very
21578 expensive. Assume the cost model is wrong if we
21579 get such a case here. */
21580 gcc_assert (regno2 != regno0);
21582 for (adds = parts.scale; adds > 0; adds--)
21583 ix86_emit_binop (PLUS, mode, target, parts.index);
21585 else
21587 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21588 if (regno0 != regno2)
21589 emit_insn (gen_rtx_SET (target, parts.index));
21591 /* Use shift for scaling. */
21592 ix86_emit_binop (ASHIFT, mode, target,
21593 GEN_INT (exact_log2 (parts.scale)));
21595 if (parts.base)
21596 ix86_emit_binop (PLUS, mode, target, parts.base);
21598 if (parts.disp && parts.disp != const0_rtx)
21599 ix86_emit_binop (PLUS, mode, target, parts.disp);
21602 else if (!parts.base && !parts.index)
21604 gcc_assert(parts.disp);
21605 emit_insn (gen_rtx_SET (target, parts.disp));
21607 else
21609 if (!parts.base)
21611 if (regno0 != regno2)
21612 emit_insn (gen_rtx_SET (target, parts.index));
21614 else if (!parts.index)
21616 if (regno0 != regno1)
21617 emit_insn (gen_rtx_SET (target, parts.base));
21619 else
21621 if (regno0 == regno1)
21622 tmp = parts.index;
21623 else if (regno0 == regno2)
21624 tmp = parts.base;
21625 else
21627 rtx tmp1;
21629 /* Find better operand for SET instruction, depending
21630 on which definition is farther from the insn. */
21631 if (find_nearest_reg_def (insn, regno1, regno2))
21632 tmp = parts.index, tmp1 = parts.base;
21633 else
21634 tmp = parts.base, tmp1 = parts.index;
21636 emit_insn (gen_rtx_SET (target, tmp));
21638 if (parts.disp && parts.disp != const0_rtx)
21639 ix86_emit_binop (PLUS, mode, target, parts.disp);
21641 ix86_emit_binop (PLUS, mode, target, tmp1);
21642 return;
21645 ix86_emit_binop (PLUS, mode, target, tmp);
21648 if (parts.disp && parts.disp != const0_rtx)
21649 ix86_emit_binop (PLUS, mode, target, parts.disp);
21653 /* Return true if it is OK to optimize an ADD operation to a LEA
21654 operation to avoid flag register consumption. For most processors,
21655 ADD is faster than LEA. For processors like BONNELL, if the
21656 destination register of the LEA holds an actual address which will be
21657 used soon, LEA is better; otherwise ADD is better. */
21659 bool
21660 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21662 unsigned int regno0 = true_regnum (operands[0]);
21663 unsigned int regno1 = true_regnum (operands[1]);
21664 unsigned int regno2 = true_regnum (operands[2]);
21666 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21667 if (regno0 != regno1 && regno0 != regno2)
21668 return true;
21670 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21671 return false;
21673 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21676 /* Return true if destination reg of SET_BODY is shift count of
21677 USE_BODY. */
21679 static bool
21680 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21682 rtx set_dest;
21683 rtx shift_rtx;
21684 int i;
21686 /* Retrieve destination of SET_BODY. */
21687 switch (GET_CODE (set_body))
21689 case SET:
21690 set_dest = SET_DEST (set_body);
21691 if (!set_dest || !REG_P (set_dest))
21692 return false;
21693 break;
21694 case PARALLEL:
21695 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21696 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21697 use_body))
21698 return true;
21699 /* FALLTHROUGH */
21700 default:
21701 return false;
21704 /* Retrieve shift count of USE_BODY. */
21705 switch (GET_CODE (use_body))
21707 case SET:
21708 shift_rtx = XEXP (use_body, 1);
21709 break;
21710 case PARALLEL:
21711 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21712 if (ix86_dep_by_shift_count_body (set_body,
21713 XVECEXP (use_body, 0, i)))
21714 return true;
21715 /* FALLTHROUGH */
21716 default:
21717 return false;
21720 if (shift_rtx
21721 && (GET_CODE (shift_rtx) == ASHIFT
21722 || GET_CODE (shift_rtx) == LSHIFTRT
21723 || GET_CODE (shift_rtx) == ASHIFTRT
21724 || GET_CODE (shift_rtx) == ROTATE
21725 || GET_CODE (shift_rtx) == ROTATERT))
21727 rtx shift_count = XEXP (shift_rtx, 1);
21729 /* Return true if shift count is dest of SET_BODY. */
21730 if (REG_P (shift_count))
21732 /* Add check since it can be invoked before register
21733 allocation in pre-reload schedule. */
21734 if (reload_completed
21735 && true_regnum (set_dest) == true_regnum (shift_count))
21736 return true;
21737 else if (REGNO(set_dest) == REGNO(shift_count))
21738 return true;
21742 return false;
21745 /* Return true if destination reg of SET_INSN is shift count of
21746 USE_INSN. */
21748 bool
21749 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21751 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21752 PATTERN (use_insn));
21755 /* Return TRUE or FALSE depending on whether the unary operator meets the
21756 appropriate constraints. */
21758 bool
21759 ix86_unary_operator_ok (enum rtx_code,
21760 machine_mode,
21761 rtx operands[2])
21763 /* If one of operands is memory, source and destination must match. */
21764 if ((MEM_P (operands[0])
21765 || MEM_P (operands[1]))
21766 && ! rtx_equal_p (operands[0], operands[1]))
21767 return false;
21768 return true;
21771 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21772 are ok, keeping in mind the possible movddup alternative. */
21774 bool
21775 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21777 if (MEM_P (operands[0]))
21778 return rtx_equal_p (operands[0], operands[1 + high]);
21779 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21780 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21781 return true;
21784 /* Post-reload splitter for converting an SF or DFmode value in an
21785 SSE register into an unsigned SImode. */
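/* The strategy used below, roughly: inputs that are >= 0x1p31 cannot be
   handled by the signed truncating conversion, so 0x1p31 is subtracted
   from them first and bit 31 is xor-ed back into the integer result
   afterwards; inputs below 0x1p31 are truncated directly.  The
   comparison mask LARGE selects between the two cases.  */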
21787 void
21788 ix86_split_convert_uns_si_sse (rtx operands[])
21790 machine_mode vecmode;
21791 rtx value, large, zero_or_two31, input, two31, x;
21793 large = operands[1];
21794 zero_or_two31 = operands[2];
21795 input = operands[3];
21796 two31 = operands[4];
21797 vecmode = GET_MODE (large);
21798 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21800 /* Load up the value into the low element. We must ensure that the other
21801 elements are valid floats -- zero is the easiest such value. */
21802 if (MEM_P (input))
21804 if (vecmode == V4SFmode)
21805 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21806 else
21807 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21809 else
21811 input = gen_rtx_REG (vecmode, REGNO (input));
21812 emit_move_insn (value, CONST0_RTX (vecmode));
21813 if (vecmode == V4SFmode)
21814 emit_insn (gen_sse_movss (value, value, input));
21815 else
21816 emit_insn (gen_sse2_movsd (value, value, input));
21819 emit_move_insn (large, two31);
21820 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21822 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21823 emit_insn (gen_rtx_SET (large, x));
21825 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21826 emit_insn (gen_rtx_SET (zero_or_two31, x));
21828 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21829 emit_insn (gen_rtx_SET (value, x));
21831 large = gen_rtx_REG (V4SImode, REGNO (large));
21832 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21834 x = gen_rtx_REG (V4SImode, REGNO (value));
21835 if (vecmode == V4SFmode)
21836 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21837 else
21838 emit_insn (gen_sse2_cvttpd2dq (x, value));
21839 value = x;
21841 emit_insn (gen_xorv4si3 (value, value, large));
21844 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21845 Expects the 64-bit DImode to be supplied in a pair of integral
21846 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21847 -mfpmath=sse, !optimize_size only. */
21849 void
21850 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21852 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21853 rtx int_xmm, fp_xmm;
21854 rtx biases, exponents;
21855 rtx x;
21857 int_xmm = gen_reg_rtx (V4SImode);
21858 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21859 emit_insn (gen_movdi_to_sse (int_xmm, input));
21860 else if (TARGET_SSE_SPLIT_REGS)
21862 emit_clobber (int_xmm);
21863 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21865 else
21867 x = gen_reg_rtx (V2DImode);
21868 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21869 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21872 x = gen_rtx_CONST_VECTOR (V4SImode,
21873 gen_rtvec (4, GEN_INT (0x43300000UL),
21874 GEN_INT (0x45300000UL),
21875 const0_rtx, const0_rtx));
21876 exponents = validize_mem (force_const_mem (V4SImode, x));
21878 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21879 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21881 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21882 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21883 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21884 (0x1.0p84 + double(fp_value_hi_xmm)).
21885 Note these exponents differ by 32. */
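/* Numerically, with lo and hi the unsigned 32-bit halves of the input,
   (0x1.0p52 + (double) lo) - 0x1.0p52 == (double) lo              exactly,
   (0x1.0p84 + (double) hi * 0x1.0p32) - 0x1.0p84 == (double) hi * 0x1.0p32
   exactly as well, so after the bias subtraction below the horizontal add
   produces hi * 2**32 + lo, rounded only once.  */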
21887 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21889 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21890 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21891 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21892 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21893 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21894 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21895 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21896 biases = validize_mem (force_const_mem (V2DFmode, biases));
21897 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21899 /* Add the upper and lower DFmode values together. */
21900 if (TARGET_SSE3)
21901 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21902 else
21904 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21905 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21906 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21909 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21912 /* Not used, but eases macroization of patterns. */
21913 void
21914 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21916 gcc_unreachable ();
21919 /* Convert an unsigned SImode value into a DFmode. Only currently used
21920 for SSE, but applicable anywhere. */
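/* A sketch of the trick used here: the unsigned input is biased by -2**31
   so that it fits in a signed SImode, converted with the ordinary signed
   conversion, and then 2**31 is added back as a DFmode constant.  E.g. the
   input 0xffffffff becomes 0x7fffffff, converts to 2147483647.0, and the
   final addition restores 4294967295.0.  */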
21922 void
21923 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21925 REAL_VALUE_TYPE TWO31r;
21926 rtx x, fp;
21928 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21929 NULL, 1, OPTAB_DIRECT);
21931 fp = gen_reg_rtx (DFmode);
21932 emit_insn (gen_floatsidf2 (fp, x));
21934 real_ldexp (&TWO31r, &dconst1, 31);
21935 x = const_double_from_real_value (TWO31r, DFmode);
21937 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21938 if (x != target)
21939 emit_move_insn (target, x);
21942 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21943 32-bit mode; otherwise we have a direct convert instruction. */
21945 void
21946 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21948 REAL_VALUE_TYPE TWO32r;
21949 rtx fp_lo, fp_hi, x;
21951 fp_lo = gen_reg_rtx (DFmode);
21952 fp_hi = gen_reg_rtx (DFmode);
21954 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21956 real_ldexp (&TWO32r, &dconst1, 32);
21957 x = const_double_from_real_value (TWO32r, DFmode);
21958 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21960 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21962 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21963 0, OPTAB_DIRECT);
21964 if (x != target)
21965 emit_move_insn (target, x);
21968 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21969 For x86_32, -mfpmath=sse, !optimize_size only. */
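/* Sketch: the input is split into its low and high 16-bit halves, each
   half is converted exactly to SFmode, and the result is recombined as
   fp_hi * 0x1.0p16 + fp_lo, so only the final addition can round.  */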
21970 void
21971 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21973 REAL_VALUE_TYPE ONE16r;
21974 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21976 real_ldexp (&ONE16r, &dconst1, 16);
21977 x = const_double_from_real_value (ONE16r, SFmode);
21978 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
21979 NULL, 0, OPTAB_DIRECT);
21980 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
21981 NULL, 0, OPTAB_DIRECT);
21982 fp_hi = gen_reg_rtx (SFmode);
21983 fp_lo = gen_reg_rtx (SFmode);
21984 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21985 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21986 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21987 0, OPTAB_DIRECT);
21988 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21989 0, OPTAB_DIRECT);
21990 if (!rtx_equal_p (target, fp_hi))
21991 emit_move_insn (target, fp_hi);
21994 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21995 a vector of unsigned ints VAL to vector of floats TARGET. */
21997 void
21998 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
22000 rtx tmp[8];
22001 REAL_VALUE_TYPE TWO16r;
22002 machine_mode intmode = GET_MODE (val);
22003 machine_mode fltmode = GET_MODE (target);
22004 rtx (*cvt) (rtx, rtx);
22006 if (intmode == V4SImode)
22007 cvt = gen_floatv4siv4sf2;
22008 else
22009 cvt = gen_floatv8siv8sf2;
22010 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
22011 tmp[0] = force_reg (intmode, tmp[0]);
22012 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
22013 OPTAB_DIRECT);
22014 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
22015 NULL_RTX, 1, OPTAB_DIRECT);
22016 tmp[3] = gen_reg_rtx (fltmode);
22017 emit_insn (cvt (tmp[3], tmp[1]));
22018 tmp[4] = gen_reg_rtx (fltmode);
22019 emit_insn (cvt (tmp[4], tmp[2]));
22020 real_ldexp (&TWO16r, &dconst1, 16);
22021 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
22022 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
22023 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
22024 OPTAB_DIRECT);
22025 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
22026 OPTAB_DIRECT);
22027 if (tmp[7] != target)
22028 emit_move_insn (target, tmp[7]);
22031 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
22032 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
22033 This is done by doing just signed conversion if < 0x1p31, and otherwise by
22034 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
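/* E.g. a V4SFmode lane holding 3e9 (which is >= 0x1p31) is reduced to
   3e9 - 0x1p31 = 852516352.0 before the signed truncation, and the
   corresponding lane of *XORP becomes 0x80000000, so xoring it in after
   the fix_trunc restores 3000000000; lanes below 0x1p31 are left alone
   and get a zero lane in *XORP.  */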
22037 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
22039 REAL_VALUE_TYPE TWO31r;
22040 rtx two31r, tmp[4];
22041 machine_mode mode = GET_MODE (val);
22042 machine_mode scalarmode = GET_MODE_INNER (mode);
22043 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
22044 rtx (*cmp) (rtx, rtx, rtx, rtx);
22045 int i;
22047 for (i = 0; i < 3; i++)
22048 tmp[i] = gen_reg_rtx (mode);
22049 real_ldexp (&TWO31r, &dconst1, 31);
22050 two31r = const_double_from_real_value (TWO31r, scalarmode);
22051 two31r = ix86_build_const_vector (mode, 1, two31r);
22052 two31r = force_reg (mode, two31r);
22053 switch (mode)
22055 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
22056 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
22057 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
22058 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
22059 default: gcc_unreachable ();
22061 tmp[3] = gen_rtx_LE (mode, two31r, val);
22062 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
22063 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
22064 0, OPTAB_DIRECT);
22065 if (intmode == V4SImode || TARGET_AVX2)
22066 *xorp = expand_simple_binop (intmode, ASHIFT,
22067 gen_lowpart (intmode, tmp[0]),
22068 GEN_INT (31), NULL_RTX, 0,
22069 OPTAB_DIRECT);
22070 else
22072 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
22073 two31 = ix86_build_const_vector (intmode, 1, two31);
22074 *xorp = expand_simple_binop (intmode, AND,
22075 gen_lowpart (intmode, tmp[0]),
22076 two31, NULL_RTX, 0,
22077 OPTAB_DIRECT);
22079 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
22080 0, OPTAB_DIRECT);
22083 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
22084 then replicate the value for all elements of the vector
22085 register. */
22088 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
22090 int i, n_elt;
22091 rtvec v;
22092 machine_mode scalar_mode;
22094 switch (mode)
22096 case E_V64QImode:
22097 case E_V32QImode:
22098 case E_V16QImode:
22099 case E_V32HImode:
22100 case E_V16HImode:
22101 case E_V8HImode:
22102 case E_V16SImode:
22103 case E_V8SImode:
22104 case E_V4SImode:
22105 case E_V8DImode:
22106 case E_V4DImode:
22107 case E_V2DImode:
22108 gcc_assert (vect);
22109 /* FALLTHRU */
22110 case E_V16SFmode:
22111 case E_V8SFmode:
22112 case E_V4SFmode:
22113 case E_V8DFmode:
22114 case E_V4DFmode:
22115 case E_V2DFmode:
22116 n_elt = GET_MODE_NUNITS (mode);
22117 v = rtvec_alloc (n_elt);
22118 scalar_mode = GET_MODE_INNER (mode);
22120 RTVEC_ELT (v, 0) = value;
22122 for (i = 1; i < n_elt; ++i)
22123 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
22125 return gen_rtx_CONST_VECTOR (mode, v);
22127 default:
22128 gcc_unreachable ();
22132 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
22133 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
22134 for an SSE register. If VECT is true, then replicate the mask for
22135 all elements of the vector register. If INVERT is true, then create
22136 a mask excluding the sign bit. */
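/* For example, for V2DFmode the result is the vector constant
   { 0x8000000000000000, 0x8000000000000000 } reinterpreted as DFmode
   elements, or its complement { 0x7fffffffffffffff, 0x7fffffffffffffff }
   when INVERT is true; when VECT is false only the low element carries
   the mask and the remaining elements are zero.  */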
22139 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
22141 machine_mode vec_mode, imode;
22142 wide_int w;
22143 rtx mask, v;
22145 switch (mode)
22147 case E_V16SImode:
22148 case E_V16SFmode:
22149 case E_V8SImode:
22150 case E_V4SImode:
22151 case E_V8SFmode:
22152 case E_V4SFmode:
22153 vec_mode = mode;
22154 imode = SImode;
22155 break;
22157 case E_V8DImode:
22158 case E_V4DImode:
22159 case E_V2DImode:
22160 case E_V8DFmode:
22161 case E_V4DFmode:
22162 case E_V2DFmode:
22163 vec_mode = mode;
22164 imode = DImode;
22165 break;
22167 case E_TImode:
22168 case E_TFmode:
22169 vec_mode = VOIDmode;
22170 imode = TImode;
22171 break;
22173 default:
22174 gcc_unreachable ();
22177 machine_mode inner_mode = GET_MODE_INNER (mode);
22178 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
22179 GET_MODE_BITSIZE (inner_mode));
22180 if (invert)
22181 w = wi::bit_not (w);
22183 /* Force this value into the low part of a fp vector constant. */
22184 mask = immed_wide_int_const (w, imode);
22185 mask = gen_lowpart (inner_mode, mask);
22187 if (vec_mode == VOIDmode)
22188 return force_reg (inner_mode, mask);
22190 v = ix86_build_const_vector (vec_mode, vect, mask);
22191 return force_reg (vec_mode, v);
22194 /* Generate code for floating point ABS or NEG. */
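/* With SSE, NEG is implemented by xoring the sign-bit mask into the value
   and ABS by anding the value with the complement of the sign-bit mask;
   this is why the mask built below is inverted exactly when CODE is ABS.  */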
22196 void
22197 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22198 rtx operands[])
22200 rtx mask, set, dst, src;
22201 bool use_sse = false;
22202 bool vector_mode = VECTOR_MODE_P (mode);
22203 machine_mode vmode = mode;
22205 if (vector_mode)
22206 use_sse = true;
22207 else if (mode == TFmode)
22208 use_sse = true;
22209 else if (TARGET_SSE_MATH)
22211 use_sse = SSE_FLOAT_MODE_P (mode);
22212 if (mode == SFmode)
22213 vmode = V4SFmode;
22214 else if (mode == DFmode)
22215 vmode = V2DFmode;
22218 /* NEG and ABS performed with SSE use bitwise mask operations.
22219 Create the appropriate mask now. */
22220 if (use_sse)
22221 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22222 else
22223 mask = NULL_RTX;
22225 dst = operands[0];
22226 src = operands[1];
22228 set = gen_rtx_fmt_e (code, mode, src);
22229 set = gen_rtx_SET (dst, set);
22231 if (mask)
22233 rtx use, clob;
22234 rtvec par;
22236 use = gen_rtx_USE (VOIDmode, mask);
22237 if (vector_mode)
22238 par = gen_rtvec (2, set, use);
22239 else
22241 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22242 par = gen_rtvec (3, set, use, clob);
22244 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22246 else
22247 emit_insn (set);
22250 /* Expand a copysign operation. Special case operand 0 being a constant. */
22252 void
22253 ix86_expand_copysign (rtx operands[])
22255 machine_mode mode, vmode;
22256 rtx dest, op0, op1, mask, nmask;
22258 dest = operands[0];
22259 op0 = operands[1];
22260 op1 = operands[2];
22262 mode = GET_MODE (dest);
22264 if (mode == SFmode)
22265 vmode = V4SFmode;
22266 else if (mode == DFmode)
22267 vmode = V2DFmode;
22268 else
22269 vmode = mode;
22271 if (CONST_DOUBLE_P (op0))
22273 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22275 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22276 op0 = simplify_unary_operation (ABS, mode, op0, mode);
22278 if (mode == SFmode || mode == DFmode)
22280 if (op0 == CONST0_RTX (mode))
22281 op0 = CONST0_RTX (vmode);
22282 else
22284 rtx v = ix86_build_const_vector (vmode, false, op0);
22286 op0 = force_reg (vmode, v);
22289 else if (op0 != CONST0_RTX (mode))
22290 op0 = force_reg (mode, op0);
22292 mask = ix86_build_signbit_mask (vmode, 0, 0);
22294 if (mode == SFmode)
22295 copysign_insn = gen_copysignsf3_const;
22296 else if (mode == DFmode)
22297 copysign_insn = gen_copysigndf3_const;
22298 else
22299 copysign_insn = gen_copysigntf3_const;
22301 emit_insn (copysign_insn (dest, op0, op1, mask));
22303 else
22305 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22307 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22308 mask = ix86_build_signbit_mask (vmode, 0, 0);
22310 if (mode == SFmode)
22311 copysign_insn = gen_copysignsf3_var;
22312 else if (mode == DFmode)
22313 copysign_insn = gen_copysigndf3_var;
22314 else
22315 copysign_insn = gen_copysigntf3_var;
22317 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22321 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22322 be a constant, and so has already been expanded into a vector constant. */
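/* The split below computes dest = (dest & mask) | op0, where MASK is the
   sign-bit mask built in ix86_expand_copysign and OP0 is the constant
   magnitude (its own sign bit already cleared there); the IOR is skipped
   when the constant is zero.  */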
22324 void
22325 ix86_split_copysign_const (rtx operands[])
22327 machine_mode mode, vmode;
22328 rtx dest, op0, mask, x;
22330 dest = operands[0];
22331 op0 = operands[1];
22332 mask = operands[3];
22334 mode = GET_MODE (dest);
22335 vmode = GET_MODE (mask);
22337 dest = lowpart_subreg (vmode, dest, mode);
22338 x = gen_rtx_AND (vmode, dest, mask);
22339 emit_insn (gen_rtx_SET (dest, x));
22341 if (op0 != CONST0_RTX (vmode))
22343 x = gen_rtx_IOR (vmode, dest, op0);
22344 emit_insn (gen_rtx_SET (dest, x));
22348 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22349 so we have to do two masks. */
22351 void
22352 ix86_split_copysign_var (rtx operands[])
22354 machine_mode mode, vmode;
22355 rtx dest, scratch, op0, op1, mask, nmask, x;
22357 dest = operands[0];
22358 scratch = operands[1];
22359 op0 = operands[2];
22360 op1 = operands[3];
22361 nmask = operands[4];
22362 mask = operands[5];
22364 mode = GET_MODE (dest);
22365 vmode = GET_MODE (mask);
22367 if (rtx_equal_p (op0, op1))
22369 /* Shouldn't happen often (it's useless, obviously), but when it does
22370 we'd generate incorrect code if we continue below. */
22371 emit_move_insn (dest, op0);
22372 return;
22375 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
22377 gcc_assert (REGNO (op1) == REGNO (scratch));
22379 x = gen_rtx_AND (vmode, scratch, mask);
22380 emit_insn (gen_rtx_SET (scratch, x));
22382 dest = mask;
22383 op0 = lowpart_subreg (vmode, op0, mode);
22384 x = gen_rtx_NOT (vmode, dest);
22385 x = gen_rtx_AND (vmode, x, op0);
22386 emit_insn (gen_rtx_SET (dest, x));
22388 else
22390 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
22392 x = gen_rtx_AND (vmode, scratch, mask);
22394 else /* alternative 2,4 */
22396 gcc_assert (REGNO (mask) == REGNO (scratch));
22397 op1 = lowpart_subreg (vmode, op1, mode);
22398 x = gen_rtx_AND (vmode, scratch, op1);
22400 emit_insn (gen_rtx_SET (scratch, x));
22402 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
22404 dest = lowpart_subreg (vmode, op0, mode);
22405 x = gen_rtx_AND (vmode, dest, nmask);
22407 else /* alternative 3,4 */
22409 gcc_assert (REGNO (nmask) == REGNO (dest));
22410 dest = nmask;
22411 op0 = lowpart_subreg (vmode, op0, mode);
22412 x = gen_rtx_AND (vmode, dest, op0);
22414 emit_insn (gen_rtx_SET (dest, x));
22417 x = gen_rtx_IOR (vmode, dest, scratch);
22418 emit_insn (gen_rtx_SET (dest, x));
22421 /* Return TRUE or FALSE depending on whether the first SET in INSN
22422 has source and destination with matching CC modes, and that the
22423 CC mode is at least as constrained as REQ_MODE. */
22425 bool
22426 ix86_match_ccmode (rtx insn, machine_mode req_mode)
22428 rtx set;
22429 machine_mode set_mode;
22431 set = PATTERN (insn);
22432 if (GET_CODE (set) == PARALLEL)
22433 set = XVECEXP (set, 0, 0);
22434 gcc_assert (GET_CODE (set) == SET);
22435 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
22437 set_mode = GET_MODE (SET_DEST (set));
22438 switch (set_mode)
22440 case E_CCNOmode:
22441 if (req_mode != CCNOmode
22442 && (req_mode != CCmode
22443 || XEXP (SET_SRC (set), 1) != const0_rtx))
22444 return false;
22445 break;
22446 case E_CCmode:
22447 if (req_mode == CCGCmode)
22448 return false;
22449 /* FALLTHRU */
22450 case E_CCGCmode:
22451 if (req_mode == CCGOCmode || req_mode == CCNOmode)
22452 return false;
22453 /* FALLTHRU */
22454 case E_CCGOCmode:
22455 if (req_mode == CCZmode)
22456 return false;
22457 /* FALLTHRU */
22458 case E_CCZmode:
22459 break;
22461 case E_CCGZmode:
22463 case E_CCAmode:
22464 case E_CCCmode:
22465 case E_CCOmode:
22466 case E_CCPmode:
22467 case E_CCSmode:
22468 if (set_mode != req_mode)
22469 return false;
22470 break;
22472 default:
22473 gcc_unreachable ();
22476 return GET_MODE (SET_SRC (set)) == set_mode;
22479 /* Generate insn patterns to do an integer compare of OPERANDS. */
22481 static rtx
22482 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22484 machine_mode cmpmode;
22485 rtx tmp, flags;
22487 cmpmode = SELECT_CC_MODE (code, op0, op1);
22488 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22490 /* This is very simple, but making the interface the same as in the
22491 FP case makes the rest of the code easier. */
22492 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22493 emit_insn (gen_rtx_SET (flags, tmp));
22495 /* Return the test that should be put into the flags user, i.e.
22496 the bcc, scc, or cmov instruction. */
22497 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22500 /* Figure out whether to use unordered fp comparisons. */
22502 static bool
22503 ix86_unordered_fp_compare (enum rtx_code code)
22505 if (!TARGET_IEEE_FP)
22506 return false;
22508 switch (code)
22510 case GT:
22511 case GE:
22512 case LT:
22513 case LE:
22514 return false;
22516 case EQ:
22517 case NE:
22519 case LTGT:
22520 case UNORDERED:
22521 case ORDERED:
22522 case UNLT:
22523 case UNLE:
22524 case UNGT:
22525 case UNGE:
22526 case UNEQ:
22527 return true;
22529 default:
22530 gcc_unreachable ();
22534 machine_mode
22535 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22537 machine_mode mode = GET_MODE (op0);
22539 if (SCALAR_FLOAT_MODE_P (mode))
22541 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22542 return CCFPmode;
22545 switch (code)
22547 /* Only zero flag is needed. */
22548 case EQ: /* ZF=0 */
22549 case NE: /* ZF!=0 */
22550 return CCZmode;
22551 /* Codes needing carry flag. */
22552 case GEU: /* CF=0 */
22553 case LTU: /* CF=1 */
22554 /* Detect overflow checks. They need just the carry flag. */
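/* E.g. for the unsigned-overflow idiom 'sum = a + b; if (sum < a) ...'
   the comparison reaches us as LTU with op0 == (plus a b) and op1 == a,
   and only the carry flag is needed.  */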
22555 if (GET_CODE (op0) == PLUS
22556 && (rtx_equal_p (op1, XEXP (op0, 0))
22557 || rtx_equal_p (op1, XEXP (op0, 1))))
22558 return CCCmode;
22559 else
22560 return CCmode;
22561 case GTU: /* CF=0 & ZF=0 */
22562 case LEU: /* CF=1 | ZF=1 */
22563 return CCmode;
22564 /* Codes possibly doable only with sign flag when
22565 comparing against zero. */
22566 case GE: /* SF=OF or SF=0 */
22567 case LT: /* SF<>OF or SF=1 */
22568 if (op1 == const0_rtx)
22569 return CCGOCmode;
22570 else
22571 /* For other cases Carry flag is not required. */
22572 return CCGCmode;
22573 /* Codes doable only with the sign flag when comparing
22574 against zero, but for which we miss the jump instruction,
22575 so we need to use relational tests against overflow,
22576 which thus needs to be zero. */
22577 case GT: /* ZF=0 & SF=OF */
22578 case LE: /* ZF=1 | SF<>OF */
22579 if (op1 == const0_rtx)
22580 return CCNOmode;
22581 else
22582 return CCGCmode;
22583 /* The strcmp pattern does (use flags) and combine may ask us
22584 for a proper mode. */
22585 case USE:
22586 return CCmode;
22587 default:
22588 gcc_unreachable ();
22592 /* Return the fixed registers used for condition codes. */
22594 static bool
22595 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22597 *p1 = FLAGS_REG;
22598 *p2 = FPSR_REG;
22599 return true;
22602 /* If two condition code modes are compatible, return a condition code
22603 mode which is compatible with both. Otherwise, return
22604 VOIDmode. */
22606 static machine_mode
22607 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22609 if (m1 == m2)
22610 return m1;
22612 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22613 return VOIDmode;
22615 if ((m1 == CCGCmode && m2 == CCGOCmode)
22616 || (m1 == CCGOCmode && m2 == CCGCmode))
22617 return CCGCmode;
22619 if ((m1 == CCNOmode && m2 == CCGOCmode)
22620 || (m1 == CCGOCmode && m2 == CCNOmode))
22621 return CCNOmode;
22623 if (m1 == CCZmode
22624 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
22625 return m2;
22626 else if (m2 == CCZmode
22627 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
22628 return m1;
22630 switch (m1)
22632 default:
22633 gcc_unreachable ();
22635 case E_CCmode:
22636 case E_CCGCmode:
22637 case E_CCGOCmode:
22638 case E_CCNOmode:
22639 case E_CCAmode:
22640 case E_CCCmode:
22641 case E_CCOmode:
22642 case E_CCPmode:
22643 case E_CCSmode:
22644 case E_CCZmode:
22645 switch (m2)
22647 default:
22648 return VOIDmode;
22650 case E_CCmode:
22651 case E_CCGCmode:
22652 case E_CCGOCmode:
22653 case E_CCNOmode:
22654 case E_CCAmode:
22655 case E_CCCmode:
22656 case E_CCOmode:
22657 case E_CCPmode:
22658 case E_CCSmode:
22659 case E_CCZmode:
22660 return CCmode;
22663 case E_CCFPmode:
22664 /* These are only compatible with themselves, which we already
22665 checked above. */
22666 return VOIDmode;
22671 /* Return a comparison we can do that is equivalent to
22672 swap_condition (code), apart possibly from orderedness.
22673 But never change orderedness if TARGET_IEEE_FP, returning
22674 UNKNOWN in that case if necessary. */
22676 static enum rtx_code
22677 ix86_fp_swap_condition (enum rtx_code code)
22679 switch (code)
22681 case GT: /* GTU - CF=0 & ZF=0 */
22682 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22683 case GE: /* GEU - CF=0 */
22684 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22685 case UNLT: /* LTU - CF=1 */
22686 return TARGET_IEEE_FP ? UNKNOWN : GT;
22687 case UNLE: /* LEU - CF=1 | ZF=1 */
22688 return TARGET_IEEE_FP ? UNKNOWN : GE;
22689 default:
22690 return swap_condition (code);
22694 /* Return the cost of comparison CODE using the best strategy for performance.
22695 All following functions use the number of instructions as the cost metric.
22696 In the future this should be tweaked to compute bytes for optimize_size and
22697 take into account the performance of various instructions on various CPUs. */
22699 static int
22700 ix86_fp_comparison_cost (enum rtx_code code)
22702 int arith_cost;
22704 /* The cost of code using bit-twiddling on %ah. */
22705 switch (code)
22707 case UNLE:
22708 case UNLT:
22709 case LTGT:
22710 case GT:
22711 case GE:
22712 case UNORDERED:
22713 case ORDERED:
22714 case UNEQ:
22715 arith_cost = 4;
22716 break;
22717 case LT:
22718 case NE:
22719 case EQ:
22720 case UNGE:
22721 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22722 break;
22723 case LE:
22724 case UNGT:
22725 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22726 break;
22727 default:
22728 gcc_unreachable ();
22731 switch (ix86_fp_comparison_strategy (code))
22733 case IX86_FPCMP_COMI:
22734 return arith_cost > 4 ? 3 : 2;
22735 case IX86_FPCMP_SAHF:
22736 return arith_cost > 4 ? 4 : 3;
22737 default:
22738 return arith_cost;
22742 /* Return the strategy to use for floating-point comparisons. We assume that
22743 fcomi is always preferable where available, since that is also true when
22744 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22746 enum ix86_fpcmp_strategy
22747 ix86_fp_comparison_strategy (enum rtx_code)
22749 /* Do fcomi/sahf based test when profitable. */
22751 if (TARGET_CMOVE)
22752 return IX86_FPCMP_COMI;
22754 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22755 return IX86_FPCMP_SAHF;
22757 return IX86_FPCMP_ARITH;
22760 /* Swap, force into registers, or otherwise massage the two operands
22761 to a fp comparison. The operands are updated in place; the new
22762 comparison code is returned. */
22764 static enum rtx_code
22765 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22767 bool unordered_compare = ix86_unordered_fp_compare (code);
22768 rtx op0 = *pop0, op1 = *pop1;
22769 machine_mode op_mode = GET_MODE (op0);
22770 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22772 /* All of the unordered compare instructions only work on registers.
22773 The same is true of the fcomi compare instructions. The XFmode
22774 compare instructions require registers except when comparing
22775 against zero or when converting operand 1 from fixed point to
22776 floating point. */
22778 if (!is_sse
22779 && (unordered_compare
22780 || (op_mode == XFmode
22781 && ! (standard_80387_constant_p (op0) == 1
22782 || standard_80387_constant_p (op1) == 1)
22783 && GET_CODE (op1) != FLOAT)
22784 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22786 op0 = force_reg (op_mode, op0);
22787 op1 = force_reg (op_mode, op1);
22789 else
22791 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22792 things around if they appear profitable, otherwise force op0
22793 into a register. */
22795 if (standard_80387_constant_p (op0) == 0
22796 || (MEM_P (op0)
22797 && ! (standard_80387_constant_p (op1) == 0
22798 || MEM_P (op1))))
22800 enum rtx_code new_code = ix86_fp_swap_condition (code);
22801 if (new_code != UNKNOWN)
22803 std::swap (op0, op1);
22804 code = new_code;
22808 if (!REG_P (op0))
22809 op0 = force_reg (op_mode, op0);
22811 if (CONSTANT_P (op1))
22813 int tmp = standard_80387_constant_p (op1);
22814 if (tmp == 0)
22815 op1 = validize_mem (force_const_mem (op_mode, op1));
22816 else if (tmp == 1)
22818 if (TARGET_CMOVE)
22819 op1 = force_reg (op_mode, op1);
22821 else
22822 op1 = force_reg (op_mode, op1);
22826 /* Try to rearrange the comparison to make it cheaper. */
22827 if (ix86_fp_comparison_cost (code)
22828 > ix86_fp_comparison_cost (swap_condition (code))
22829 && (REG_P (op1) || can_create_pseudo_p ()))
22831 std::swap (op0, op1);
22832 code = swap_condition (code);
22833 if (!REG_P (op0))
22834 op0 = force_reg (op_mode, op0);
22837 *pop0 = op0;
22838 *pop1 = op1;
22839 return code;
22842 /* Convert comparison codes we use to represent FP comparison to integer
22843 code that will result in proper branch. Return UNKNOWN if no such code
22844 is available. */
22846 enum rtx_code
22847 ix86_fp_compare_code_to_integer (enum rtx_code code)
22849 switch (code)
22851 case GT:
22852 return GTU;
22853 case GE:
22854 return GEU;
22855 case ORDERED:
22856 case UNORDERED:
22857 return code;
22858 case UNEQ:
22859 return EQ;
22860 case UNLT:
22861 return LTU;
22862 case UNLE:
22863 return LEU;
22864 case LTGT:
22865 return NE;
22866 default:
22867 return UNKNOWN;
22871 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22873 static rtx
22874 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22876 bool unordered_compare = ix86_unordered_fp_compare (code);
22877 machine_mode intcmp_mode;
22878 rtx tmp, tmp2;
22880 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22882 /* Do fcomi/sahf based test when profitable. */
22883 switch (ix86_fp_comparison_strategy (code))
22885 case IX86_FPCMP_COMI:
22886 intcmp_mode = CCFPmode;
22887 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22888 if (unordered_compare)
22889 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22890 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22891 break;
22893 case IX86_FPCMP_SAHF:
22894 intcmp_mode = CCFPmode;
22895 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22896 if (unordered_compare)
22897 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22898 tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22899 if (!scratch)
22900 scratch = gen_reg_rtx (HImode);
22901 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22902 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22903 break;
22905 case IX86_FPCMP_ARITH:
22906 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22907 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22908 if (unordered_compare)
22909 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22910 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22911 if (!scratch)
22912 scratch = gen_reg_rtx (HImode);
22913 emit_insn (gen_rtx_SET (scratch, tmp));
22915 /* In the unordered case, we have to check C2 for NaNs, which
22916 doesn't happen to work out to anything nice combination-wise.
22917 So do some bit twiddling on the value we've got in AH to come
22918 up with an appropriate set of condition codes. */
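/* A reminder of the fnstsw layout assumed below: AH receives bits 8-15 of
   the x87 status word, so C0 appears as 0x01, C2 as 0x04 and C3 as 0x40;
   the constants 0x45, 0x44, 0x40, 0x05, 0x04 and 0x01 used by the bit
   twiddling are masks over subsets of {C3, C2, C0}.  */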
22920 intcmp_mode = CCNOmode;
22921 switch (code)
22923 case GT:
22924 case UNGT:
22925 if (code == GT || !TARGET_IEEE_FP)
22927 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22928 code = EQ;
22930 else
22932 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22933 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22934 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22935 intcmp_mode = CCmode;
22936 code = GEU;
22938 break;
22939 case LT:
22940 case UNLT:
22941 if (code == LT && TARGET_IEEE_FP)
22943 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22944 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22945 intcmp_mode = CCmode;
22946 code = EQ;
22948 else
22950 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22951 code = NE;
22953 break;
22954 case GE:
22955 case UNGE:
22956 if (code == GE || !TARGET_IEEE_FP)
22958 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22959 code = EQ;
22961 else
22963 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22964 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22965 code = NE;
22967 break;
22968 case LE:
22969 case UNLE:
22970 if (code == LE && TARGET_IEEE_FP)
22972 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22973 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22974 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22975 intcmp_mode = CCmode;
22976 code = LTU;
22978 else
22980 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22981 code = NE;
22983 break;
22984 case EQ:
22985 case UNEQ:
22986 if (code == EQ && TARGET_IEEE_FP)
22988 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22989 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22990 intcmp_mode = CCmode;
22991 code = EQ;
22993 else
22995 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22996 code = NE;
22998 break;
22999 case NE:
23000 case LTGT:
23001 if (code == NE && TARGET_IEEE_FP)
23003 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23004 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
23005 GEN_INT (0x40)));
23006 code = NE;
23008 else
23010 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23011 code = EQ;
23013 break;
23015 case UNORDERED:
23016 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23017 code = NE;
23018 break;
23019 case ORDERED:
23020 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23021 code = EQ;
23022 break;
23024 default:
23025 gcc_unreachable ();
23027 break;
23029 default:
23030 gcc_unreachable();
23033 /* Return the test that should be put into the flags user, i.e.
23034 the bcc, scc, or cmov instruction. */
23035 return gen_rtx_fmt_ee (code, VOIDmode,
23036 gen_rtx_REG (intcmp_mode, FLAGS_REG),
23037 const0_rtx);
23040 static rtx
23041 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
23043 rtx ret;
23045 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
23046 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
23048 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
23050 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
23051 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23053 else
23054 ret = ix86_expand_int_compare (code, op0, op1);
23056 return ret;
23059 void
23060 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
23062 machine_mode mode = GET_MODE (op0);
23063 rtx tmp;
23065 /* Handle special case - vector comparison with boolean result, transform
23066 it using ptest instruction. */
23067 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
23069 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
23070 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
23072 gcc_assert (code == EQ || code == NE);
23073 /* Generate XOR since we can't check that one operand is zero vector. */
23074 tmp = gen_reg_rtx (mode);
23075 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
23076 tmp = gen_lowpart (p_mode, tmp);
23077 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
23078 gen_rtx_UNSPEC (CCmode,
23079 gen_rtvec (2, tmp, tmp),
23080 UNSPEC_PTEST)));
23081 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
23082 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23083 gen_rtx_LABEL_REF (VOIDmode, label),
23084 pc_rtx);
23085 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23086 return;
23089 switch (mode)
23091 case E_SFmode:
23092 case E_DFmode:
23093 case E_XFmode:
23094 case E_QImode:
23095 case E_HImode:
23096 case E_SImode:
23097 simple:
23098 tmp = ix86_expand_compare (code, op0, op1);
23099 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23100 gen_rtx_LABEL_REF (VOIDmode, label),
23101 pc_rtx);
23102 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23103 return;
23105 case E_DImode:
23106 if (TARGET_64BIT)
23107 goto simple;
23108 /* For a 32-bit target, DI comparison may be performed in
23109 SSE registers. To allow this we should avoid splitting
23110 to SI mode, which is achieved by doing the xor in DI mode
23111 and then comparing with zero (which is recognized by the
23112 STV pass). We don't compare using xor when optimizing
23113 for size. */
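   /* I.e. (a == b) is rewritten as ((a ^ b) == 0), so both the xor and
      the compare against zero can stay in DImode and be kept in SSE
      registers by the STV pass.  */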
23114 if (!optimize_insn_for_size_p ()
23115 && TARGET_STV
23116 && (code == EQ || code == NE))
23118 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
23119 op1 = const0_rtx;
23121 /* FALLTHRU */
23122 case E_TImode:
23123 /* Expand DImode branch into multiple compare+branch. */
23125 rtx lo[2], hi[2];
23126 rtx_code_label *label2;
23127 enum rtx_code code1, code2, code3;
23128 machine_mode submode;
23130 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
23132 std::swap (op0, op1);
23133 code = swap_condition (code);
23136 split_double_mode (mode, &op0, 1, lo+0, hi+0);
23137 split_double_mode (mode, &op1, 1, lo+1, hi+1);
23139 submode = mode == DImode ? SImode : DImode;
23141 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
23142 avoid two branches. This costs one extra insn, so disable when
23143 optimizing for size. */
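   /* A double-word value is zero iff both of its halves are zero, so
      a == b holds exactly when ((hi0 ^ hi1) | (lo0 ^ lo1)) == 0.  */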
23145 if ((code == EQ || code == NE)
23146 && (!optimize_insn_for_size_p ()
23147 || hi[1] == const0_rtx || lo[1] == const0_rtx))
23149 rtx xor0, xor1;
23151 xor1 = hi[0];
23152 if (hi[1] != const0_rtx)
23153 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
23154 NULL_RTX, 0, OPTAB_WIDEN);
23156 xor0 = lo[0];
23157 if (lo[1] != const0_rtx)
23158 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
23159 NULL_RTX, 0, OPTAB_WIDEN);
23161 tmp = expand_binop (submode, ior_optab, xor1, xor0,
23162 NULL_RTX, 0, OPTAB_WIDEN);
23164 ix86_expand_branch (code, tmp, const0_rtx, label);
23165 return;
23168 /* Otherwise, if we are doing less-than or greater-or-equal-than,
23169 op1 is a constant and the low word is zero, then we can just
23170 examine the high word. Similarly for low word -1 and
23171 less-or-equal-than or greater-than. */
23173 if (CONST_INT_P (hi[1]))
23174 switch (code)
23176 case LT: case LTU: case GE: case GEU:
23177 if (lo[1] == const0_rtx)
23179 ix86_expand_branch (code, hi[0], hi[1], label);
23180 return;
23182 break;
23183 case LE: case LEU: case GT: case GTU:
23184 if (lo[1] == constm1_rtx)
23186 ix86_expand_branch (code, hi[0], hi[1], label);
23187 return;
23189 break;
23190 default:
23191 break;
23194 /* Emulate comparisons that do not depend on Zero flag with
23195 double-word subtraction. Note that only Overflow, Sign
23196 and Carry flags are valid, so swap arguments and condition
23197 of comparisons that would otherwise test Zero flag. */
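   /* E.g. an unsigned double-word "a < b" becomes: compare the low
      halves, then subtract the high halves with borrow into a scratch
      register; the carry flag (or the sign/overflow pair for signed
      compares) left by the sbb is the result of the whole comparison.  */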
23199 switch (code)
23201 case LE: case LEU: case GT: case GTU:
23202 std::swap (lo[0], lo[1]);
23203 std::swap (hi[0], hi[1]);
23204 code = swap_condition (code);
23205 /* FALLTHRU */
23207 case LT: case LTU: case GE: case GEU:
23209 rtx (*cmp_insn) (rtx, rtx);
23210 rtx (*sbb_insn) (rtx, rtx, rtx);
23211 bool uns = (code == LTU || code == GEU);
23213 if (TARGET_64BIT)
23215 cmp_insn = gen_cmpdi_1;
23216 sbb_insn
23217 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
23219 else
23221 cmp_insn = gen_cmpsi_1;
23222 sbb_insn
23223 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
23226 if (!nonimmediate_operand (lo[0], submode))
23227 lo[0] = force_reg (submode, lo[0]);
23228 if (!x86_64_general_operand (lo[1], submode))
23229 lo[1] = force_reg (submode, lo[1]);
23231 if (!register_operand (hi[0], submode))
23232 hi[0] = force_reg (submode, hi[0]);
23233 if ((uns && !nonimmediate_operand (hi[1], submode))
23234 || (!uns && !x86_64_general_operand (hi[1], submode)))
23235 hi[1] = force_reg (submode, hi[1]);
23237 emit_insn (cmp_insn (lo[0], lo[1]));
23238 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
23240 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
23242 ix86_expand_branch (code, tmp, const0_rtx, label);
23243 return;
23246 default:
23247 break;
23250 /* Otherwise, we need two or three jumps. */
23252 label2 = gen_label_rtx ();
23254 code1 = code;
23255 code2 = swap_condition (code);
23256 code3 = unsigned_condition (code);
23258 switch (code)
23260 case LT: case GT: case LTU: case GTU:
23261 break;
23263 case LE: code1 = LT; code2 = GT; break;
23264 case GE: code1 = GT; code2 = LT; break;
23265 case LEU: code1 = LTU; code2 = GTU; break;
23266 case GEU: code1 = GTU; code2 = LTU; break;
23268 case EQ: code1 = UNKNOWN; code2 = NE; break;
23269 case NE: code2 = UNKNOWN; break;
23271 default:
23272 gcc_unreachable ();
23276 * a < b =>
23277 * if (hi(a) < hi(b)) goto true;
23278 * if (hi(a) > hi(b)) goto false;
23279 * if (lo(a) < lo(b)) goto true;
23280 * false:
23283 if (code1 != UNKNOWN)
23284 ix86_expand_branch (code1, hi[0], hi[1], label);
23285 if (code2 != UNKNOWN)
23286 ix86_expand_branch (code2, hi[0], hi[1], label2);
23288 ix86_expand_branch (code3, lo[0], lo[1], label);
23290 if (code2 != UNKNOWN)
23291 emit_label (label2);
23292 return;
23295 default:
23296 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23297 goto simple;
23301 void
23302 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23304 rtx ret;
23306 gcc_assert (GET_MODE (dest) == QImode);
23308 ret = ix86_expand_compare (code, op0, op1);
23309 PUT_MODE (ret, QImode);
23310 emit_insn (gen_rtx_SET (dest, ret));
23313 /* Expand comparison setting or clearing carry flag. Return true when
23314 successful and set pop for the operation. */
23315 static bool
23316 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23318 machine_mode mode =
23319 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23321 /* Do not handle double-mode compares that go through special path. */
23322 if (mode == (TARGET_64BIT ? TImode : DImode))
23323 return false;
23325 if (SCALAR_FLOAT_MODE_P (mode))
23327 rtx compare_op;
23328 rtx_insn *compare_seq;
23330 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23332 /* Shortcut: following common codes never translate
23333 into carry flag compares. */
23334 if (code == EQ || code == NE || code == UNEQ || code == LTGT
23335 || code == ORDERED || code == UNORDERED)
23336 return false;
23338 /* These comparisons require the zero flag; swap the operands so they won't. */
23339 if ((code == GT || code == UNLE || code == LE || code == UNGT)
23340 && !TARGET_IEEE_FP)
23342 std::swap (op0, op1);
23343 code = swap_condition (code);
23346 /* Try to expand the comparison and verify that we end up with
23347 a carry-flag-based comparison. This fails only when we decide
23348 to expand the comparison using arithmetic, which is not a
23349 common scenario. */
23350 start_sequence ();
23351 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23352 compare_seq = get_insns ();
23353 end_sequence ();
23355 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
23356 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23357 else
23358 code = GET_CODE (compare_op);
23360 if (code != LTU && code != GEU)
23361 return false;
23363 emit_insn (compare_seq);
23364 *pop = compare_op;
23365 return true;
23368 if (!INTEGRAL_MODE_P (mode))
23369 return false;
23371 switch (code)
23373 case LTU:
23374 case GEU:
23375 break;
23377 /* Convert a==0 into (unsigned)a<1. */
23378 case EQ:
23379 case NE:
23380 if (op1 != const0_rtx)
23381 return false;
23382 op1 = const1_rtx;
23383 code = (code == EQ ? LTU : GEU);
23384 break;
23386 /* Convert a>b into b<a or a>=b+1. */
23387 case GTU:
23388 case LEU:
23389 if (CONST_INT_P (op1))
23391 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23392 /* Bail out on overflow. We still can swap operands but that
23393 would force loading of the constant into register. */
23394 if (op1 == const0_rtx
23395 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23396 return false;
23397 code = (code == GTU ? GEU : LTU);
23399 else
23401 std::swap (op0, op1);
23402 code = (code == GTU ? LTU : GEU);
23404 break;
23406 /* Convert a>=0 into (unsigned)a<0x80000000. */
23407 case LT:
23408 case GE:
23409 if (mode == DImode || op1 != const0_rtx)
23410 return false;
23411 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23412 code = (code == LT ? GEU : LTU);
23413 break;
23414 case LE:
23415 case GT:
23416 if (mode == DImode || op1 != constm1_rtx)
23417 return false;
23418 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23419 code = (code == LE ? GEU : LTU);
23420 break;
23422 default:
23423 return false;
23425 /* Swapping operands may cause constant to appear as first operand. */
23426 if (!nonimmediate_operand (op0, VOIDmode))
23428 if (!can_create_pseudo_p ())
23429 return false;
23430 op0 = force_reg (mode, op0);
23432 *pop = ix86_expand_compare (code, op0, op1);
23433 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
23434 return true;
23437 bool
23438 ix86_expand_int_movcc (rtx operands[])
23440 enum rtx_code code = GET_CODE (operands[1]), compare_code;
23441 rtx_insn *compare_seq;
23442 rtx compare_op;
23443 machine_mode mode = GET_MODE (operands[0]);
23444 bool sign_bit_compare_p = false;
23445 rtx op0 = XEXP (operands[1], 0);
23446 rtx op1 = XEXP (operands[1], 1);
23448 if (GET_MODE (op0) == TImode
23449 || (GET_MODE (op0) == DImode
23450 && !TARGET_64BIT))
23451 return false;
23453 start_sequence ();
23454 compare_op = ix86_expand_compare (code, op0, op1);
23455 compare_seq = get_insns ();
23456 end_sequence ();
23458 compare_code = GET_CODE (compare_op);
23460 if ((op1 == const0_rtx && (code == GE || code == LT))
23461 || (op1 == constm1_rtx && (code == GT || code == LE)))
23462 sign_bit_compare_p = true;
23464 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
23465 HImode insns, we'd be swallowed in word prefix ops. */
23467 if ((mode != HImode || TARGET_FAST_PREFIX)
23468 && (mode != (TARGET_64BIT ? TImode : DImode))
23469 && CONST_INT_P (operands[2])
23470 && CONST_INT_P (operands[3]))
23472 rtx out = operands[0];
23473 HOST_WIDE_INT ct = INTVAL (operands[2]);
23474 HOST_WIDE_INT cf = INTVAL (operands[3]);
23475 HOST_WIDE_INT diff;
23477 diff = ct - cf;
23478 /* Sign bit compares are better done using shifts than by using
23479 sbb. */
23480 if (sign_bit_compare_p
23481 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
23483 /* Detect overlap between destination and compare sources. */
23484 rtx tmp = out;
23486 if (!sign_bit_compare_p)
23488 rtx flags;
23489 bool fpcmp = false;
23491 compare_code = GET_CODE (compare_op);
23493 flags = XEXP (compare_op, 0);
23495 if (GET_MODE (flags) == CCFPmode)
23497 fpcmp = true;
23498 compare_code
23499 = ix86_fp_compare_code_to_integer (compare_code);
23502 /* To simplify the rest of the code, restrict to the GEU case. */
23503 if (compare_code == LTU)
23505 std::swap (ct, cf);
23506 compare_code = reverse_condition (compare_code);
23507 code = reverse_condition (code);
23509 else
23511 if (fpcmp)
23512 PUT_CODE (compare_op,
23513 reverse_condition_maybe_unordered
23514 (GET_CODE (compare_op)));
23515 else
23516 PUT_CODE (compare_op,
23517 reverse_condition (GET_CODE (compare_op)));
23519 diff = ct - cf;
23521 if (reg_overlap_mentioned_p (out, op0)
23522 || reg_overlap_mentioned_p (out, op1))
23523 tmp = gen_reg_rtx (mode);
23525 if (mode == DImode)
23526 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23527 else
23528 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
23529 flags, compare_op));
23531 else
23533 if (code == GT || code == GE)
23534 code = reverse_condition (code);
23535 else
23537 std::swap (ct, cf);
23538 diff = ct - cf;
23540 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23543 if (diff == 1)
23546 * cmpl op0,op1
23547 * sbbl dest,dest
23548 * [addl dest, ct]
23550 * Size 5 - 8.
23552 if (ct)
23553 tmp = expand_simple_binop (mode, PLUS,
23554 tmp, GEN_INT (ct),
23555 copy_rtx (tmp), 1, OPTAB_DIRECT);
23557 else if (cf == -1)
23560 * cmpl op0,op1
23561 * sbbl dest,dest
23562 * orl $ct, dest
23564 * Size 8.
23566 tmp = expand_simple_binop (mode, IOR,
23567 tmp, GEN_INT (ct),
23568 copy_rtx (tmp), 1, OPTAB_DIRECT);
23570 else if (diff == -1 && ct)
23573 * cmpl op0,op1
23574 * sbbl dest,dest
23575 * notl dest
23576 * [addl dest, cf]
23578 * Size 8 - 11.
23580 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23581 if (cf)
23582 tmp = expand_simple_binop (mode, PLUS,
23583 copy_rtx (tmp), GEN_INT (cf),
23584 copy_rtx (tmp), 1, OPTAB_DIRECT);
23586 else
23589 * cmpl op0,op1
23590 * sbbl dest,dest
23591 * [notl dest]
23592 * andl cf - ct, dest
23593 * [addl dest, ct]
23595 * Size 8 - 11.
23598 if (cf == 0)
23600 cf = ct;
23601 ct = 0;
23602 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23605 tmp = expand_simple_binop (mode, AND,
23606 copy_rtx (tmp),
23607 gen_int_mode (cf - ct, mode),
23608 copy_rtx (tmp), 1, OPTAB_DIRECT);
23609 if (ct)
23610 tmp = expand_simple_binop (mode, PLUS,
23611 copy_rtx (tmp), GEN_INT (ct),
23612 copy_rtx (tmp), 1, OPTAB_DIRECT);
23615 if (!rtx_equal_p (tmp, out))
23616 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23618 return true;
23621 if (diff < 0)
23623 machine_mode cmp_mode = GET_MODE (op0);
23624 enum rtx_code new_code;
23626 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23628 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23630 /* We may be reversing an unordered compare to a normal compare, which
23631 is not valid in general (we may convert a non-trapping condition
23632 to a trapping one); however, on i386 we currently emit all
23633 comparisons unordered. */
23634 new_code = reverse_condition_maybe_unordered (code);
23636 else
23637 new_code = ix86_reverse_condition (code, cmp_mode);
23638 if (new_code != UNKNOWN)
23640 std::swap (ct, cf);
23641 diff = -diff;
23642 code = new_code;
23646 compare_code = UNKNOWN;
23647 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23648 && CONST_INT_P (op1))
23650 if (op1 == const0_rtx
23651 && (code == LT || code == GE))
23652 compare_code = code;
23653 else if (op1 == constm1_rtx)
23655 if (code == LE)
23656 compare_code = LT;
23657 else if (code == GT)
23658 compare_code = GE;
23662 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23663 if (compare_code != UNKNOWN
23664 && GET_MODE (op0) == GET_MODE (out)
23665 && (cf == -1 || ct == -1))
23667 /* If the lea code below could be used, only optimize
23668 if it results in a 2-insn sequence. */
23670 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23671 || diff == 3 || diff == 5 || diff == 9)
23672 || (compare_code == LT && ct == -1)
23673 || (compare_code == GE && cf == -1))
23676 * notl op1 (if necessary)
23677 * sarl $31, op1
23678 * orl cf, op1
23680 if (ct != -1)
23682 cf = ct;
23683 ct = -1;
23684 code = reverse_condition (code);
23687 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23689 out = expand_simple_binop (mode, IOR,
23690 out, GEN_INT (cf),
23691 out, 1, OPTAB_DIRECT);
23692 if (out != operands[0])
23693 emit_move_insn (operands[0], out);
23695 return true;
23700 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23701 || diff == 3 || diff == 5 || diff == 9)
23702 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23703 && (mode != DImode
23704 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23707 * xorl dest,dest
23708 * cmpl op1,op2
23709 * setcc dest
23710 * lea cf(dest*(ct-cf)),dest
23712 * Size 14.
23714 * This also catches the degenerate setcc-only case.
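	   * For instance, with ct = 5 and cf = 2 (diff = 3) the lea step is
	   * lea 2(dest,dest,2),dest, i.e. dest = 3*setcc + 2.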
23717 rtx tmp;
23718 int nops;
23720 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23722 nops = 0;
23723 /* On x86_64 the lea instruction operates on Pmode, so we need
23724 to get the arithmetic done in the proper mode to match. */
23725 if (diff == 1)
23726 tmp = copy_rtx (out);
23727 else
23729 rtx out1;
23730 out1 = copy_rtx (out);
23731 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23732 nops++;
23733 if (diff & 1)
23735 tmp = gen_rtx_PLUS (mode, tmp, out1);
23736 nops++;
23739 if (cf != 0)
23741 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23742 nops++;
23744 if (!rtx_equal_p (tmp, out))
23746 if (nops == 1)
23747 out = force_operand (tmp, copy_rtx (out));
23748 else
23749 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23751 if (!rtx_equal_p (out, operands[0]))
23752 emit_move_insn (operands[0], copy_rtx (out));
23754 return true;
23758 * General case: Jumpful:
23759 * xorl dest,dest cmpl op1, op2
23760 * cmpl op1, op2 movl ct, dest
23761 * setcc dest jcc 1f
23762 * decl dest movl cf, dest
23763 * andl (cf-ct),dest 1:
23764 * addl ct,dest
23766 * Size 20. Size 14.
23768 * This is reasonably steep, but branch mispredict costs are
23769 * high on modern cpus, so consider failing only if optimizing
23770 * for space.
23773 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23774 && BRANCH_COST (optimize_insn_for_speed_p (),
23775 false) >= 2)
23777 if (cf == 0)
23779 machine_mode cmp_mode = GET_MODE (op0);
23780 enum rtx_code new_code;
23782 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23784 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23786 /* We may be reversing an unordered compare to a normal compare,
23787 which is not valid in general (we may convert a non-trapping
23788 condition to a trapping one); however, on i386 we currently
23789 emit all comparisons unordered. */
23790 new_code = reverse_condition_maybe_unordered (code);
23792 else
23794 new_code = ix86_reverse_condition (code, cmp_mode);
23795 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23796 compare_code = reverse_condition (compare_code);
23799 if (new_code != UNKNOWN)
23801 cf = ct;
23802 ct = 0;
23803 code = new_code;
23807 if (compare_code != UNKNOWN)
23809 /* notl op1 (if needed)
23810 sarl $31, op1
23811 andl (cf-ct), op1
23812 addl ct, op1
23814 For x < 0 (resp. x <= -1) there will be no notl,
23815 so if possible swap the constants to get rid of the
23816 complement.
23817 True/false will be -1/0 while code below (store flag
23818 followed by decrement) is 0/-1, so the constants need
23819 to be exchanged once more. */
23821 if (compare_code == GE || !cf)
23823 code = reverse_condition (code);
23824 compare_code = LT;
23826 else
23827 std::swap (ct, cf);
23829 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23831 else
23833 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23835 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23836 constm1_rtx,
23837 copy_rtx (out), 1, OPTAB_DIRECT);
23840 out = expand_simple_binop (mode, AND, copy_rtx (out),
23841 gen_int_mode (cf - ct, mode),
23842 copy_rtx (out), 1, OPTAB_DIRECT);
23843 if (ct)
23844 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23845 copy_rtx (out), 1, OPTAB_DIRECT);
23846 if (!rtx_equal_p (out, operands[0]))
23847 emit_move_insn (operands[0], copy_rtx (out));
23849 return true;
23853 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23855 /* Try a few more things with specific constants and a variable. */
23857 optab op;
23858 rtx var, orig_out, out, tmp;
23860 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23861 return false;
23863 /* If one of the two operands is an interesting constant, load a
23864 constant with the above and mask it in with a logical operation. */
23866 if (CONST_INT_P (operands[2]))
23868 var = operands[3];
23869 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23870 operands[3] = constm1_rtx, op = and_optab;
23871 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23872 operands[3] = const0_rtx, op = ior_optab;
23873 else
23874 return false;
23876 else if (CONST_INT_P (operands[3]))
23878 var = operands[2];
23879 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23880 operands[2] = constm1_rtx, op = and_optab;
23881 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23882 operands[2] = const0_rtx, op = ior_optab;
23883 else
23884 return false;
23886 else
23887 return false;
23889 orig_out = operands[0];
23890 tmp = gen_reg_rtx (mode);
23891 operands[0] = tmp;
23893 /* Recurse to get the constant loaded. */
23894 if (!ix86_expand_int_movcc (operands))
23895 return false;
23897 /* Mask in the interesting variable. */
23898 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23899 OPTAB_WIDEN);
23900 if (!rtx_equal_p (out, orig_out))
23901 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23903 return true;
23907 * For comparison with above,
23909 * movl cf,dest
23910 * movl ct,tmp
23911 * cmpl op1,op2
23912 * cmovcc tmp,dest
23914 * Size 15.
23917 if (! nonimmediate_operand (operands[2], mode))
23918 operands[2] = force_reg (mode, operands[2]);
23919 if (! nonimmediate_operand (operands[3], mode))
23920 operands[3] = force_reg (mode, operands[3]);
23922 if (! register_operand (operands[2], VOIDmode)
23923 && (mode == QImode
23924 || ! register_operand (operands[3], VOIDmode)))
23925 operands[2] = force_reg (mode, operands[2]);
23927 if (mode == QImode
23928 && ! register_operand (operands[3], VOIDmode))
23929 operands[3] = force_reg (mode, operands[3]);
23931 emit_insn (compare_seq);
23932 emit_insn (gen_rtx_SET (operands[0],
23933 gen_rtx_IF_THEN_ELSE (mode,
23934 compare_op, operands[2],
23935 operands[3])));
23936 return true;
23939 /* Swap, force into registers, or otherwise massage the two operands
23940 to an sse comparison with a mask result. Thus we differ a bit from
23941 ix86_prepare_fp_compare_args which expects to produce a flags result.
23943 The DEST operand exists to help determine whether to commute commutative
23944 operators. The POP0/POP1 operands are updated in place. The new
23945 comparison code is returned, or UNKNOWN if not implementable. */
23947 static enum rtx_code
23948 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23949 rtx *pop0, rtx *pop1)
23951 switch (code)
23953 case LTGT:
23954 case UNEQ:
23955 /* AVX supports all the needed comparisons. */
23956 if (TARGET_AVX)
23957 break;
23958 /* We have no LTGT as an operator. We could implement it with
23959 NE & ORDERED, but this requires an extra temporary. It's
23960 not clear that it's worth it. */
23961 return UNKNOWN;
23963 case LT:
23964 case LE:
23965 case UNGT:
23966 case UNGE:
23967 /* These are supported directly. */
23968 break;
23970 case EQ:
23971 case NE:
23972 case UNORDERED:
23973 case ORDERED:
23974 /* AVX has 3 operand comparisons, no need to swap anything. */
23975 if (TARGET_AVX)
23976 break;
23977 /* For commutative operators, try to canonicalize the destination
23978 operand to be first in the comparison - this helps reload to
23979 avoid extra moves. */
23980 if (!dest || !rtx_equal_p (dest, *pop1))
23981 break;
23982 /* FALLTHRU */
23984 case GE:
23985 case GT:
23986 case UNLE:
23987 case UNLT:
23988 /* These are not supported directly before AVX, and furthermore
23989 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23990 comparison operands to transform into something that is
23991 supported. */
23992 std::swap (*pop0, *pop1);
23993 code = swap_condition (code);
23994 break;
23996 default:
23997 gcc_unreachable ();
24000 return code;
24003 /* Detect conditional moves that exactly match min/max operational
24004 semantics. Note that this is IEEE safe, as long as we don't
24005 interchange the operands.
24007 Returns FALSE if this conditional move doesn't match a MIN/MAX,
24008 and TRUE if the operation is successful and instructions are emitted. */
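/* The operand order matters because the SSE/AVX min and max instructions
   are asymmetric: MINSS/MAXSS and their packed forms return the second
   source operand when either input is a NaN or when both inputs are zero,
   so IF_TRUE/IF_FALSE may only be matched against the comparison operands
   in the orientation tested below.  */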
24010 static bool
24011 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
24012 rtx cmp_op1, rtx if_true, rtx if_false)
24014 machine_mode mode;
24015 bool is_min;
24016 rtx tmp;
24018 if (code == LT)
24020 else if (code == UNGE)
24021 std::swap (if_true, if_false);
24022 else
24023 return false;
24025 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
24026 is_min = true;
24027 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
24028 is_min = false;
24029 else
24030 return false;
24032 mode = GET_MODE (dest);
24034 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
24035 but MODE may be a vector mode and thus not appropriate. */
24036 if (!flag_finite_math_only || flag_signed_zeros)
24038 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
24039 rtvec v;
24041 if_true = force_reg (mode, if_true);
24042 v = gen_rtvec (2, if_true, if_false);
24043 tmp = gen_rtx_UNSPEC (mode, v, u);
24045 else
24047 code = is_min ? SMIN : SMAX;
24048 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
24051 emit_insn (gen_rtx_SET (dest, tmp));
24052 return true;
24055 /* Expand an sse vector comparison. Return the register with the result. */
24057 static rtx
24058 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
24059 rtx op_true, rtx op_false)
24061 machine_mode mode = GET_MODE (dest);
24062 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
24064 /* In general case result of comparison can differ from operands' type. */
24065 machine_mode cmp_mode;
24067 /* In AVX512F the result of comparison is an integer mask. */
24068 bool maskcmp = false;
24069 rtx x;
24071 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
24073 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
24074 cmp_mode = int_mode_for_size (nbits, 0).require ();
24075 maskcmp = true;
24077 else
24078 cmp_mode = cmp_ops_mode;
24081 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
24082 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
24083 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
24085 if (optimize
24086 || (maskcmp && cmp_mode != mode)
24087 || (op_true && reg_overlap_mentioned_p (dest, op_true))
24088 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
24089 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
24091 /* Compare patterns for int modes are unspec in AVX512F only. */
24092 if (maskcmp && (code == GT || code == EQ))
24094 rtx (*gen)(rtx, rtx, rtx);
24096 switch (cmp_ops_mode)
24098 case E_V64QImode:
24099 gcc_assert (TARGET_AVX512BW);
24100 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
24101 break;
24102 case E_V32HImode:
24103 gcc_assert (TARGET_AVX512BW);
24104 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
24105 break;
24106 case E_V16SImode:
24107 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
24108 break;
24109 case E_V8DImode:
24110 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
24111 break;
24112 default:
24113 gen = NULL;
24116 if (gen)
24118 emit_insn (gen (dest, cmp_op0, cmp_op1));
24119 return dest;
24122 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
24124 if (cmp_mode != mode && !maskcmp)
24126 x = force_reg (cmp_ops_mode, x);
24127 convert_move (dest, x, false);
24129 else
24130 emit_insn (gen_rtx_SET (dest, x));
24132 return dest;
24135 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
24136 operations. This is used for both scalar and vector conditional moves. */
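/* When no blend instruction is available, the generic fallback computes
   DEST = (CMP & OP_TRUE) | (~CMP & OP_FALSE), relying on CMP being
   all-ones or all-zeros per element.  */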
24138 void
24139 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
24141 machine_mode mode = GET_MODE (dest);
24142 machine_mode cmpmode = GET_MODE (cmp);
24144 /* In AVX512F the result of comparison is an integer mask. */
24145 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
24147 rtx t2, t3, x;
24149 /* If we have an integer mask and FP value then we need
24150 to cast mask to FP mode. */
24151 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
24153 cmp = force_reg (cmpmode, cmp);
24154 cmp = gen_rtx_SUBREG (mode, cmp, 0);
24157 if (vector_all_ones_operand (op_true, mode)
24158 && rtx_equal_p (op_false, CONST0_RTX (mode))
24159 && !maskcmp)
24161 emit_insn (gen_rtx_SET (dest, cmp));
24163 else if (op_false == CONST0_RTX (mode)
24164 && !maskcmp)
24166 op_true = force_reg (mode, op_true);
24167 x = gen_rtx_AND (mode, cmp, op_true);
24168 emit_insn (gen_rtx_SET (dest, x));
24170 else if (op_true == CONST0_RTX (mode)
24171 && !maskcmp)
24173 op_false = force_reg (mode, op_false);
24174 x = gen_rtx_NOT (mode, cmp);
24175 x = gen_rtx_AND (mode, x, op_false);
24176 emit_insn (gen_rtx_SET (dest, x));
24178 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
24179 && !maskcmp)
24181 op_false = force_reg (mode, op_false);
24182 x = gen_rtx_IOR (mode, cmp, op_false);
24183 emit_insn (gen_rtx_SET (dest, x));
24185 else if (TARGET_XOP
24186 && !maskcmp)
24188 op_true = force_reg (mode, op_true);
24190 if (!nonimmediate_operand (op_false, mode))
24191 op_false = force_reg (mode, op_false);
24193 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24194 op_true,
24195 op_false)));
24197 else
24199 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24200 rtx d = dest;
24202 if (!nonimmediate_operand (op_true, mode))
24203 op_true = force_reg (mode, op_true);
24205 op_false = force_reg (mode, op_false);
24207 switch (mode)
24209 case E_V4SFmode:
24210 if (TARGET_SSE4_1)
24211 gen = gen_sse4_1_blendvps;
24212 break;
24213 case E_V2DFmode:
24214 if (TARGET_SSE4_1)
24215 gen = gen_sse4_1_blendvpd;
24216 break;
24217 case E_V16QImode:
24218 case E_V8HImode:
24219 case E_V4SImode:
24220 case E_V2DImode:
24221 if (TARGET_SSE4_1)
24223 gen = gen_sse4_1_pblendvb;
24224 if (mode != V16QImode)
24225 d = gen_reg_rtx (V16QImode);
24226 op_false = gen_lowpart (V16QImode, op_false);
24227 op_true = gen_lowpart (V16QImode, op_true);
24228 cmp = gen_lowpart (V16QImode, cmp);
24230 break;
24231 case E_V8SFmode:
24232 if (TARGET_AVX)
24233 gen = gen_avx_blendvps256;
24234 break;
24235 case E_V4DFmode:
24236 if (TARGET_AVX)
24237 gen = gen_avx_blendvpd256;
24238 break;
24239 case E_V32QImode:
24240 case E_V16HImode:
24241 case E_V8SImode:
24242 case E_V4DImode:
24243 if (TARGET_AVX2)
24245 gen = gen_avx2_pblendvb;
24246 if (mode != V32QImode)
24247 d = gen_reg_rtx (V32QImode);
24248 op_false = gen_lowpart (V32QImode, op_false);
24249 op_true = gen_lowpart (V32QImode, op_true);
24250 cmp = gen_lowpart (V32QImode, cmp);
24252 break;
24254 case E_V64QImode:
24255 gen = gen_avx512bw_blendmv64qi;
24256 break;
24257 case E_V32HImode:
24258 gen = gen_avx512bw_blendmv32hi;
24259 break;
24260 case E_V16SImode:
24261 gen = gen_avx512f_blendmv16si;
24262 break;
24263 case E_V8DImode:
24264 gen = gen_avx512f_blendmv8di;
24265 break;
24266 case E_V8DFmode:
24267 gen = gen_avx512f_blendmv8df;
24268 break;
24269 case E_V16SFmode:
24270 gen = gen_avx512f_blendmv16sf;
24271 break;
24273 default:
24274 break;
24277 if (gen != NULL)
24279 emit_insn (gen (d, op_false, op_true, cmp));
24280 if (d != dest)
24281 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
24283 else
24285 op_true = force_reg (mode, op_true);
24287 t2 = gen_reg_rtx (mode);
24288 if (optimize)
24289 t3 = gen_reg_rtx (mode);
24290 else
24291 t3 = dest;
24293 x = gen_rtx_AND (mode, op_true, cmp);
24294 emit_insn (gen_rtx_SET (t2, x));
24296 x = gen_rtx_NOT (mode, cmp);
24297 x = gen_rtx_AND (mode, x, op_false);
24298 emit_insn (gen_rtx_SET (t3, x));
24300 x = gen_rtx_IOR (mode, t3, t2);
24301 emit_insn (gen_rtx_SET (dest, x));
24306 /* Expand a floating-point conditional move. Return true if successful. */
24308 bool
24309 ix86_expand_fp_movcc (rtx operands[])
24311 machine_mode mode = GET_MODE (operands[0]);
24312 enum rtx_code code = GET_CODE (operands[1]);
24313 rtx tmp, compare_op;
24314 rtx op0 = XEXP (operands[1], 0);
24315 rtx op1 = XEXP (operands[1], 1);
24317 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24319 machine_mode cmode;
24321 /* Since we've no cmove for sse registers, don't force bad register
24322 allocation just to gain access to it. Deny movcc when the
24323 comparison mode doesn't match the move mode. */
24324 cmode = GET_MODE (op0);
24325 if (cmode == VOIDmode)
24326 cmode = GET_MODE (op1);
24327 if (cmode != mode)
24328 return false;
24330 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24331 if (code == UNKNOWN)
24332 return false;
24334 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24335 operands[2], operands[3]))
24336 return true;
24338 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24339 operands[2], operands[3]);
24340 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24341 return true;
24344 if (GET_MODE (op0) == TImode
24345 || (GET_MODE (op0) == DImode
24346 && !TARGET_64BIT))
24347 return false;
24349 /* The floating point conditional move instructions don't directly
24350 support conditions resulting from a signed integer comparison. */
24352 compare_op = ix86_expand_compare (code, op0, op1);
24353 if (!fcmov_comparison_operator (compare_op, VOIDmode))
24355 tmp = gen_reg_rtx (QImode);
24356 ix86_expand_setcc (tmp, code, op0, op1);
24358 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24361 emit_insn (gen_rtx_SET (operands[0],
24362 gen_rtx_IF_THEN_ELSE (mode, compare_op,
24363 operands[2], operands[3])));
24365 return true;
24368 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24370 static int
24371 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24373 switch (code)
24375 case EQ:
24376 return 0;
24377 case LT:
24378 case LTU:
24379 return 1;
24380 case LE:
24381 case LEU:
24382 return 2;
24383 case NE:
24384 return 4;
24385 case GE:
24386 case GEU:
24387 return 5;
24388 case GT:
24389 case GTU:
24390 return 6;
24391 default:
24392 gcc_unreachable ();
24396 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
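/* The values returned below are the standard vcmpps/vcmppd predicate
   immediates (0x00 EQ_OQ, 0x01 LT_OS, 0x03 UNORD_Q, 0x0e GT_OS, and so on),
   with the unordered rtx codes mapped onto the corresponding unordered
   predicates.  */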
24398 static int
24399 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24401 switch (code)
24403 case EQ:
24404 return 0x00;
24405 case NE:
24406 return 0x04;
24407 case GT:
24408 return 0x0e;
24409 case LE:
24410 return 0x02;
24411 case GE:
24412 return 0x0d;
24413 case LT:
24414 return 0x01;
24415 case UNLE:
24416 return 0x0a;
24417 case UNLT:
24418 return 0x09;
24419 case UNGE:
24420 return 0x05;
24421 case UNGT:
24422 return 0x06;
24423 case UNEQ:
24424 return 0x18;
24425 case LTGT:
24426 return 0x0c;
24427 case ORDERED:
24428 return 0x07;
24429 case UNORDERED:
24430 return 0x03;
24431 default:
24432 gcc_unreachable ();
24436 /* Return immediate value to be used in UNSPEC_PCMP
24437 for comparison CODE in MODE. */
24439 static int
24440 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
24442 if (FLOAT_MODE_P (mode))
24443 return ix86_fp_cmp_code_to_pcmp_immediate (code);
24444 return ix86_int_cmp_code_to_pcmp_immediate (code);
24447 /* Expand AVX-512 vector comparison. */
24449 bool
24450 ix86_expand_mask_vec_cmp (rtx operands[])
24452 machine_mode mask_mode = GET_MODE (operands[0]);
24453 machine_mode cmp_mode = GET_MODE (operands[2]);
24454 enum rtx_code code = GET_CODE (operands[1]);
24455 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
24456 int unspec_code;
24457 rtx unspec;
24459 switch (code)
24461 case LEU:
24462 case GTU:
24463 case GEU:
24464 case LTU:
24465 unspec_code = UNSPEC_UNSIGNED_PCMP;
24466 break;
24468 default:
24469 unspec_code = UNSPEC_PCMP;
24472 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
24473 operands[3], imm),
24474 unspec_code);
24475 emit_insn (gen_rtx_SET (operands[0], unspec));
24477 return true;
24480 /* Expand fp vector comparison. */
24482 bool
24483 ix86_expand_fp_vec_cmp (rtx operands[])
24485 enum rtx_code code = GET_CODE (operands[1]);
24486 rtx cmp;
24488 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24489 &operands[2], &operands[3]);
24490 if (code == UNKNOWN)
24492 rtx temp;
24493 switch (GET_CODE (operands[1]))
24495 case LTGT:
24496 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
24497 operands[3], NULL, NULL);
24498 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
24499 operands[3], NULL, NULL);
24500 code = AND;
24501 break;
24502 case UNEQ:
24503 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
24504 operands[3], NULL, NULL);
24505 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
24506 operands[3], NULL, NULL);
24507 code = IOR;
24508 break;
24509 default:
24510 gcc_unreachable ();
24512 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24513 OPTAB_DIRECT);
24515 else
24516 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
24517 operands[1], operands[2]);
24519 if (operands[0] != cmp)
24520 emit_move_insn (operands[0], cmp);
24522 return true;
24525 static rtx
24526 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24527 rtx op_true, rtx op_false, bool *negate)
24529 machine_mode data_mode = GET_MODE (dest);
24530 machine_mode mode = GET_MODE (cop0);
24531 rtx x;
24533 *negate = false;
24535 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24536 if (TARGET_XOP
24537 && (mode == V16QImode || mode == V8HImode
24538 || mode == V4SImode || mode == V2DImode))
24540 else
24542 /* Canonicalize the comparison to EQ, GT, GTU. */
24543 switch (code)
24545 case EQ:
24546 case GT:
24547 case GTU:
24548 break;
24550 case NE:
24551 case LE:
24552 case LEU:
24553 code = reverse_condition (code);
24554 *negate = true;
24555 break;
24557 case GE:
24558 case GEU:
24559 code = reverse_condition (code);
24560 *negate = true;
24561 /* FALLTHRU */
24563 case LT:
24564 case LTU:
24565 std::swap (cop0, cop1);
24566 code = swap_condition (code);
24567 break;
24569 default:
24570 gcc_unreachable ();
24573 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24574 if (mode == V2DImode)
24576 switch (code)
24578 case EQ:
24579 /* SSE4.1 supports EQ. */
24580 if (!TARGET_SSE4_1)
24581 return NULL;
24582 break;
24584 case GT:
24585 case GTU:
24586 /* SSE4.2 supports GT/GTU. */
24587 if (!TARGET_SSE4_2)
24588 return NULL;
24589 break;
24591 default:
24592 gcc_unreachable ();
24596 /* Unsigned parallel compare is not supported by the hardware.
24597 Play some tricks to turn this into a signed comparison
24598 against 0. */
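   /* Subtracting the sign-bit constant 0x80...0 from both operands flips
      their sign bits, after which a >u b holds exactly when the biased
      values satisfy a >s b; the narrow element modes below instead use a
      saturating subtract, since a >u b iff (a -us b) != 0.  */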
24599 if (code == GTU)
24601 cop0 = force_reg (mode, cop0);
24603 switch (mode)
24605 case E_V16SImode:
24606 case E_V8DImode:
24607 case E_V8SImode:
24608 case E_V4DImode:
24609 case E_V4SImode:
24610 case E_V2DImode:
24612 rtx t1, t2, mask;
24613 rtx (*gen_sub3) (rtx, rtx, rtx);
24615 switch (mode)
24617 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
24618 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
24619 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
24620 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
24621 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
24622 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
24623 default:
24624 gcc_unreachable ();
24626 /* Subtract (-(INT MAX) - 1) from both operands to make
24627 them signed. */
24628 mask = ix86_build_signbit_mask (mode, true, false);
24629 t1 = gen_reg_rtx (mode);
24630 emit_insn (gen_sub3 (t1, cop0, mask));
24632 t2 = gen_reg_rtx (mode);
24633 emit_insn (gen_sub3 (t2, cop1, mask));
24635 cop0 = t1;
24636 cop1 = t2;
24637 code = GT;
24639 break;
24641 case E_V64QImode:
24642 case E_V32HImode:
24643 case E_V32QImode:
24644 case E_V16HImode:
24645 case E_V16QImode:
24646 case E_V8HImode:
24647 /* Perform a parallel unsigned saturating subtraction. */
24648 x = gen_reg_rtx (mode);
24649 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24650 cop1)));
24652 cop0 = x;
24653 cop1 = CONST0_RTX (mode);
24654 code = EQ;
24655 *negate = !*negate;
24656 break;
24658 default:
24659 gcc_unreachable ();
24664 if (*negate)
24665 std::swap (op_true, op_false);
24667 /* Allow the comparison to be done in one mode, but the movcc to
24668 happen in another mode. */
24669 if (data_mode == mode)
24671 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24672 op_true, op_false);
24674 else
24676 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24677 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24678 op_true, op_false);
24679 if (GET_MODE (x) == mode)
24680 x = gen_lowpart (data_mode, x);
24683 return x;
24686 /* Expand integer vector comparison. */
24688 bool
24689 ix86_expand_int_vec_cmp (rtx operands[])
24691 rtx_code code = GET_CODE (operands[1]);
24692 bool negate = false;
24693 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24694 operands[3], NULL, NULL, &negate);
24696 if (!cmp)
24697 return false;
24699 if (negate)
24700 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24701 CONST0_RTX (GET_MODE (cmp)),
24702 NULL, NULL, &negate);
24704 gcc_assert (!negate);
24706 if (operands[0] != cmp)
24707 emit_move_insn (operands[0], cmp);
24709 return true;
24712 /* Expand a floating-point vector conditional move; a vcond operation
24713 rather than a movcc operation. */
24715 bool
24716 ix86_expand_fp_vcond (rtx operands[])
24718 enum rtx_code code = GET_CODE (operands[3]);
24719 rtx cmp;
24721 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24722 &operands[4], &operands[5]);
24723 if (code == UNKNOWN)
24725 rtx temp;
24726 switch (GET_CODE (operands[3]))
24728 case LTGT:
24729 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24730 operands[5], operands[0], operands[0]);
24731 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24732 operands[5], operands[1], operands[2]);
24733 code = AND;
24734 break;
24735 case UNEQ:
24736 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24737 operands[5], operands[0], operands[0]);
24738 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24739 operands[5], operands[1], operands[2]);
24740 code = IOR;
24741 break;
24742 default:
24743 gcc_unreachable ();
24745 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24746 OPTAB_DIRECT);
24747 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24748 return true;
24751 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24752 operands[5], operands[1], operands[2]))
24753 return true;
24755 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24756 operands[1], operands[2]);
24757 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24758 return true;
24761 /* Expand a signed/unsigned integral vector conditional move. */
24763 bool
24764 ix86_expand_int_vcond (rtx operands[])
24766 machine_mode data_mode = GET_MODE (operands[0]);
24767 machine_mode mode = GET_MODE (operands[4]);
24768 enum rtx_code code = GET_CODE (operands[3]);
24769 bool negate = false;
24770 rtx x, cop0, cop1;
24772 cop0 = operands[4];
24773 cop1 = operands[5];
24775 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24776 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
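   /* An arithmetic shift by the element width minus one smears the sign
      bit across the whole element, yielding -1 for negative elements and 0
      otherwise; the logical shift leaves only the sign bit, yielding 1
      or 0.  */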
24777 if ((code == LT || code == GE)
24778 && data_mode == mode
24779 && cop1 == CONST0_RTX (mode)
24780 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24781 && GET_MODE_UNIT_SIZE (data_mode) > 1
24782 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24783 && (GET_MODE_SIZE (data_mode) == 16
24784 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24786 rtx negop = operands[2 - (code == LT)];
24787 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24788 if (negop == CONST1_RTX (data_mode))
24790 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24791 operands[0], 1, OPTAB_DIRECT);
24792 if (res != operands[0])
24793 emit_move_insn (operands[0], res);
24794 return true;
24796 else if (GET_MODE_INNER (data_mode) != DImode
24797 && vector_all_ones_operand (negop, data_mode))
24799 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24800 operands[0], 0, OPTAB_DIRECT);
24801 if (res != operands[0])
24802 emit_move_insn (operands[0], res);
24803 return true;
24807 if (!nonimmediate_operand (cop1, mode))
24808 cop1 = force_reg (mode, cop1);
24809 if (!general_operand (operands[1], data_mode))
24810 operands[1] = force_reg (data_mode, operands[1]);
24811 if (!general_operand (operands[2], data_mode))
24812 operands[2] = force_reg (data_mode, operands[2]);
24814 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24815 operands[1], operands[2], &negate);
24817 if (!x)
24818 return false;
24820 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24821 operands[2-negate]);
24822 return true;
24825 /* AVX512F does support 64-byte integer vector operations,
24826 thus the longest vector we are faced with is V64QImode. */
24827 #define MAX_VECT_LEN 64
24829 struct expand_vec_perm_d
24831 rtx target, op0, op1;
24832 unsigned char perm[MAX_VECT_LEN];
24833 machine_mode vmode;
24834 unsigned char nelt;
24835 bool one_operand_p;
24836 bool testing_p;
24839 static bool
24840 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24841 struct expand_vec_perm_d *d)
24843 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24844 expander, so args are either in d, or in op0, op1 etc. */
24845 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24846 machine_mode maskmode = mode;
24847 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24849 switch (mode)
24851 case E_V8HImode:
24852 if (TARGET_AVX512VL && TARGET_AVX512BW)
24853 gen = gen_avx512vl_vpermt2varv8hi3;
24854 break;
24855 case E_V16HImode:
24856 if (TARGET_AVX512VL && TARGET_AVX512BW)
24857 gen = gen_avx512vl_vpermt2varv16hi3;
24858 break;
24859 case E_V64QImode:
24860 if (TARGET_AVX512VBMI)
24861 gen = gen_avx512bw_vpermt2varv64qi3;
24862 break;
24863 case E_V32HImode:
24864 if (TARGET_AVX512BW)
24865 gen = gen_avx512bw_vpermt2varv32hi3;
24866 break;
24867 case E_V4SImode:
24868 if (TARGET_AVX512VL)
24869 gen = gen_avx512vl_vpermt2varv4si3;
24870 break;
24871 case E_V8SImode:
24872 if (TARGET_AVX512VL)
24873 gen = gen_avx512vl_vpermt2varv8si3;
24874 break;
24875 case E_V16SImode:
24876 if (TARGET_AVX512F)
24877 gen = gen_avx512f_vpermt2varv16si3;
24878 break;
24879 case E_V4SFmode:
24880 if (TARGET_AVX512VL)
24882 gen = gen_avx512vl_vpermt2varv4sf3;
24883 maskmode = V4SImode;
24885 break;
24886 case E_V8SFmode:
24887 if (TARGET_AVX512VL)
24889 gen = gen_avx512vl_vpermt2varv8sf3;
24890 maskmode = V8SImode;
24892 break;
24893 case E_V16SFmode:
24894 if (TARGET_AVX512F)
24896 gen = gen_avx512f_vpermt2varv16sf3;
24897 maskmode = V16SImode;
24899 break;
24900 case E_V2DImode:
24901 if (TARGET_AVX512VL)
24902 gen = gen_avx512vl_vpermt2varv2di3;
24903 break;
24904 case E_V4DImode:
24905 if (TARGET_AVX512VL)
24906 gen = gen_avx512vl_vpermt2varv4di3;
24907 break;
24908 case E_V8DImode:
24909 if (TARGET_AVX512F)
24910 gen = gen_avx512f_vpermt2varv8di3;
24911 break;
24912 case E_V2DFmode:
24913 if (TARGET_AVX512VL)
24915 gen = gen_avx512vl_vpermt2varv2df3;
24916 maskmode = V2DImode;
24918 break;
24919 case E_V4DFmode:
24920 if (TARGET_AVX512VL)
24922 gen = gen_avx512vl_vpermt2varv4df3;
24923 maskmode = V4DImode;
24925 break;
24926 case E_V8DFmode:
24927 if (TARGET_AVX512F)
24929 gen = gen_avx512f_vpermt2varv8df3;
24930 maskmode = V8DImode;
24932 break;
24933 default:
24934 break;
24937 if (gen == NULL)
24938 return false;
24940 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24941 expander, so args are either in d, or in op0, op1 etc. */
24942 if (d)
24944 rtx vec[64];
24945 target = d->target;
24946 op0 = d->op0;
24947 op1 = d->op1;
24948 for (int i = 0; i < d->nelt; ++i)
24949 vec[i] = GEN_INT (d->perm[i]);
24950 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24953 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24954 return true;
24957 /* Expand a variable vector permutation. */
24959 void
24960 ix86_expand_vec_perm (rtx operands[])
24962 rtx target = operands[0];
24963 rtx op0 = operands[1];
24964 rtx op1 = operands[2];
24965 rtx mask = operands[3];
24966 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24967 machine_mode mode = GET_MODE (op0);
24968 machine_mode maskmode = GET_MODE (mask);
24969 int w, e, i;
24970 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24972 /* Number of elements in the vector. */
24973 w = GET_MODE_NUNITS (mode);
24974 e = GET_MODE_UNIT_SIZE (mode);
24975 gcc_assert (w <= 64);
24977 if (TARGET_AVX512F && one_operand_shuffle)
24979 rtx (*gen) (rtx, rtx, rtx) = NULL;
24980 switch (mode)
24982 case E_V16SImode:
24983 gen = gen_avx512f_permvarv16si;
24984 break;
24985 case E_V16SFmode:
24986 gen = gen_avx512f_permvarv16sf;
24987 break;
24988 case E_V8DImode:
24989 gen = gen_avx512f_permvarv8di;
24990 break;
24991 case E_V8DFmode:
24992 gen = gen_avx512f_permvarv8df;
24993 break;
24994 default:
24995 break;
24997 if (gen != NULL)
24999 emit_insn (gen (target, op0, mask));
25000 return;
25004 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
25005 return;
25007 if (TARGET_AVX2)
25009 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
25011 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
25012 a constant shuffle operand. With a tiny bit of effort we can
25013 use VPERMD instead. A re-interpretation stall for V4DFmode is
25014 unfortunate but there's no avoiding it.
25015 Similarly for V16HImode we don't have instructions for variable
25016 shuffling, while for V32QImode we can use vpshufb; vpshufb;
25017 vpermq; vpor after preparing suitable masks. */
25019 if (mode == V16HImode)
25021 maskmode = mode = V32QImode;
25022 w = 32;
25023 e = 1;
25025 else
25027 maskmode = mode = V8SImode;
25028 w = 8;
25029 e = 4;
25031 t1 = gen_reg_rtx (maskmode);
25033 /* Replicate the low bits of the V4DImode mask into V8SImode:
25034 mask = { A B C D }
25035 t1 = { A A B B C C D D }. */
25036 for (i = 0; i < w / 2; ++i)
25037 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
25038 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25039 vt = force_reg (maskmode, vt);
25040 mask = gen_lowpart (maskmode, mask);
25041 if (maskmode == V8SImode)
25042 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
25043 else
25044 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
25046 /* Multiply the shuffle indices by two. */
25047 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
25048 OPTAB_DIRECT);
25050 /* Add one to the odd shuffle indices:
25051 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
25052 for (i = 0; i < w / 2; ++i)
25054 vec[i * 2] = const0_rtx;
25055 vec[i * 2 + 1] = const1_rtx;
25057 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25058 vt = validize_mem (force_const_mem (maskmode, vt));
25059 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
25060 OPTAB_DIRECT);
25062 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
25063 operands[3] = mask = t1;
25064 target = gen_reg_rtx (mode);
25065 op0 = gen_lowpart (mode, op0);
25066 op1 = gen_lowpart (mode, op1);
25069 switch (mode)
25071 case E_V8SImode:
25072 /* The VPERMD and VPERMPS instructions already properly ignore
25073 the high bits of the shuffle elements. No need for us to
25074 perform an AND ourselves. */
25075 if (one_operand_shuffle)
25077 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
25078 if (target != operands[0])
25079 emit_move_insn (operands[0],
25080 gen_lowpart (GET_MODE (operands[0]), target));
25082 else
25084 t1 = gen_reg_rtx (V8SImode);
25085 t2 = gen_reg_rtx (V8SImode);
25086 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
25087 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
25088 goto merge_two;
25090 return;
25092 case E_V8SFmode:
25093 mask = gen_lowpart (V8SImode, mask);
25094 if (one_operand_shuffle)
25095 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
25096 else
25098 t1 = gen_reg_rtx (V8SFmode);
25099 t2 = gen_reg_rtx (V8SFmode);
25100 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
25101 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
25102 goto merge_two;
25104 return;
25106 case E_V4SImode:
25107 /* By combining the two 128-bit input vectors into one 256-bit
25108 input vector, we can use VPERMD and VPERMPS for the full
25109 two-operand shuffle. */
25110 t1 = gen_reg_rtx (V8SImode);
25111 t2 = gen_reg_rtx (V8SImode);
25112 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
25113 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25114 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
25115 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
25116 return;
25118 case E_V4SFmode:
25119 t1 = gen_reg_rtx (V8SFmode);
25120 t2 = gen_reg_rtx (V8SImode);
25121 mask = gen_lowpart (V4SImode, mask);
25122 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
25123 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25124 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
25125 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
25126 return;
25128 case E_V32QImode:
25129 t1 = gen_reg_rtx (V32QImode);
25130 t2 = gen_reg_rtx (V32QImode);
25131 t3 = gen_reg_rtx (V32QImode);
25132 vt2 = GEN_INT (-128);
25133 vt = gen_const_vec_duplicate (V32QImode, vt2);
25134 vt = force_reg (V32QImode, vt);
25135 for (i = 0; i < 32; i++)
25136 vec[i] = i < 16 ? vt2 : const0_rtx;
25137 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25138 vt2 = force_reg (V32QImode, vt2);
25139 /* From mask create two adjusted masks, which contain the same
25140 bits as mask in the low 7 bits of each vector element.
25141 The first mask will have the most significant bit clear
25142 if it requests element from the same 128-bit lane
25143 and MSB set if it requests element from the other 128-bit lane.
25144 The second mask will have the opposite values of the MSB,
25145 and additionally will have its 128-bit lanes swapped.
25146 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
25147 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
25148 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
25149 stands for other 12 bytes. */
25150 /* The bit that tells whether an element is from the same lane or
25151 the other lane is bit 4, so shift it up by 3 to the MSB position. */
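   /* The construction relies on vpshufb writing a zero byte whenever the
      most significant bit of the corresponding selector byte is set, so
      each of the two shuffles contributes only the elements it can reach
      and the partial results can simply be or-ed together.  */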
25152 t5 = gen_reg_rtx (V4DImode);
25153 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
25154 GEN_INT (3)));
25155 /* Clear MSB bits from the mask just in case it had them set. */
25156 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
25157 /* After this t1 will have MSB set for elements from other lane. */
25158 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
25159 /* Clear bits other than MSB. */
25160 emit_insn (gen_andv32qi3 (t1, t1, vt));
25161 /* Or in the lower bits from mask into t3. */
25162 emit_insn (gen_iorv32qi3 (t3, t1, t2));
25163 /* And invert MSB bits in t1, so MSB is set for elements from the same
25164 lane. */
25165 emit_insn (gen_xorv32qi3 (t1, t1, vt));
25166 /* Swap 128-bit lanes in t3. */
25167 t6 = gen_reg_rtx (V4DImode);
25168 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
25169 const2_rtx, GEN_INT (3),
25170 const0_rtx, const1_rtx));
25171 /* And or in the lower bits from mask into t1. */
25172 emit_insn (gen_iorv32qi3 (t1, t1, t2));
25173 if (one_operand_shuffle)
25175 /* Each of these shuffles will put 0s in places where
25176 element from the other 128-bit lane is needed, otherwise
25177 will shuffle in the requested value. */
25178 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
25179 gen_lowpart (V32QImode, t6)));
25180 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
25181 /* For t3 the 128-bit lanes are swapped again. */
25182 t7 = gen_reg_rtx (V4DImode);
25183 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
25184 const2_rtx, GEN_INT (3),
25185 const0_rtx, const1_rtx));
25186 /* And oring both together leads to the result. */
25187 emit_insn (gen_iorv32qi3 (target, t1,
25188 gen_lowpart (V32QImode, t7)));
25189 if (target != operands[0])
25190 emit_move_insn (operands[0],
25191 gen_lowpart (GET_MODE (operands[0]), target));
25192 return;
25195 t4 = gen_reg_rtx (V32QImode);
25196 /* Similar to the one_operand_shuffle code above, just repeated
25197 twice, once for each operand. The merge_two: code below will
25198 merge the two results together. */
25199 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25200 gen_lowpart (V32QImode, t6)));
25201 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25202 gen_lowpart (V32QImode, t6)));
25203 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25204 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25205 t7 = gen_reg_rtx (V4DImode);
25206 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25207 const2_rtx, GEN_INT (3),
25208 const0_rtx, const1_rtx));
25209 t8 = gen_reg_rtx (V4DImode);
25210 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25211 const2_rtx, GEN_INT (3),
25212 const0_rtx, const1_rtx));
25213 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25214 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25215 t1 = t4;
25216 t2 = t3;
25217 goto merge_two;
25219 default:
25220 gcc_assert (GET_MODE_SIZE (mode) <= 16);
25221 break;
25225 if (TARGET_XOP)
25227 /* The XOP VPPERM insn supports three inputs. By ignoring the
25228 one_operand_shuffle special case, we avoid creating another
25229 set of constant vectors in memory. */
25230 one_operand_shuffle = false;
25232 /* mask = mask & {2*w-1, ...} */
25233 vt = GEN_INT (2*w - 1);
25235 else
25237 /* mask = mask & {w-1, ...} */
25238 vt = GEN_INT (w - 1);
25241 vt = gen_const_vec_duplicate (maskmode, vt);
25242 mask = expand_simple_binop (maskmode, AND, mask, vt,
25243 NULL_RTX, 0, OPTAB_DIRECT);
25245 /* For non-QImode operations, convert the word permutation control
25246 into a byte permutation control. */
25247 if (mode != V16QImode)
25249 mask = expand_simple_binop (maskmode, ASHIFT, mask,
25250 GEN_INT (exact_log2 (e)),
25251 NULL_RTX, 0, OPTAB_DIRECT);
25253 /* Convert mask to vector of chars. */
25254 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25256 /* Replicate each of the input bytes into byte positions:
25257 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25258 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25259 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25260 for (i = 0; i < 16; ++i)
25261 vec[i] = GEN_INT (i/e * e);
25262 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25263 vt = validize_mem (force_const_mem (V16QImode, vt));
25264 if (TARGET_XOP)
25265 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25266 else
25267 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25269 /* Convert it into the byte positions by doing
25270 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
25271 for (i = 0; i < 16; ++i)
25272 vec[i] = GEN_INT (i % e);
25273 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25274 vt = validize_mem (force_const_mem (V16QImode, vt));
25275 emit_insn (gen_addv16qi3 (mask, mask, vt));
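/* Illustrative example, not part of the original source: for V4SImode,
   where e == 4, a mask element selecting word 2 is shifted left by
   log2 (4) to give byte offset 8, replicated into all four byte
   positions of its element ({8,8,8,8}) and then incremented by
   {0,1,2,3}, producing the byte-level control {8,9,10,11} that the
   pshufb/vpperm shuffles below expect for that word.  */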
25278 /* The actual shuffle operations all operate on V16QImode. */
25279 op0 = gen_lowpart (V16QImode, op0);
25280 op1 = gen_lowpart (V16QImode, op1);
25282 if (TARGET_XOP)
25284 if (GET_MODE (target) != V16QImode)
25285 target = gen_reg_rtx (V16QImode);
25286 emit_insn (gen_xop_pperm (target, op0, op1, mask));
25287 if (target != operands[0])
25288 emit_move_insn (operands[0],
25289 gen_lowpart (GET_MODE (operands[0]), target));
25291 else if (one_operand_shuffle)
25293 if (GET_MODE (target) != V16QImode)
25294 target = gen_reg_rtx (V16QImode);
25295 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25296 if (target != operands[0])
25297 emit_move_insn (operands[0],
25298 gen_lowpart (GET_MODE (operands[0]), target));
25300 else
25302 rtx xops[6];
25303 bool ok;
25305 /* Shuffle the two input vectors independently. */
25306 t1 = gen_reg_rtx (V16QImode);
25307 t2 = gen_reg_rtx (V16QImode);
25308 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25309 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25311 merge_two:
25312 /* Then merge them together. The key is whether any given control
25313 element contained a bit set that indicates the second word. */
25314 mask = operands[3];
25315 vt = GEN_INT (w);
25316 if (maskmode == V2DImode && !TARGET_SSE4_1)
25318 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
25319 more shuffle to convert the V2DI input mask into a V4SI
25320 input mask. At which point the masking that expand_int_vcond
25321 will work as desired. */
25322 rtx t3 = gen_reg_rtx (V4SImode);
25323 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25324 const0_rtx, const0_rtx,
25325 const2_rtx, const2_rtx));
25326 mask = t3;
25327 maskmode = V4SImode;
25328 e = w = 4;
25331 vt = gen_const_vec_duplicate (maskmode, vt);
25332 vt = force_reg (maskmode, vt);
25333 mask = expand_simple_binop (maskmode, AND, mask, vt,
25334 NULL_RTX, 0, OPTAB_DIRECT);
25336 if (GET_MODE (target) != mode)
25337 target = gen_reg_rtx (mode);
25338 xops[0] = target;
25339 xops[1] = gen_lowpart (mode, t2);
25340 xops[2] = gen_lowpart (mode, t1);
25341 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25342 xops[4] = mask;
25343 xops[5] = vt;
25344 ok = ix86_expand_int_vcond (xops);
25345 gcc_assert (ok);
25346 if (target != operands[0])
25347 emit_move_insn (operands[0],
25348 gen_lowpart (GET_MODE (operands[0]), target));
25352 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
25353 true if we should do zero extension, else sign extension. HIGH_P is
25354 true if we want the N/2 high elements, else the low elements. */
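/* Illustrative example, not part of the original source: for SRC in
   V8HImode with UNSIGNED_P and !HIGH_P, the SSE4.1 path emits a single
   pmovzx-style extension of the four low HImode elements into the
   V4SImode DEST; with HIGH_P the upper eight bytes are first shifted
   down by 64 bits.  Without SSE4.1, the same result is obtained by
   interleaving SRC with zeros (or, for the signed case, with a sign
   mask computed by comparing zero against SRC).  */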
25356 void
25357 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25359 machine_mode imode = GET_MODE (src);
25360 rtx tmp;
25362 if (TARGET_SSE4_1)
25364 rtx (*unpack)(rtx, rtx);
25365 rtx (*extract)(rtx, rtx) = NULL;
25366 machine_mode halfmode = BLKmode;
25368 switch (imode)
25370 case E_V64QImode:
25371 if (unsigned_p)
25372 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25373 else
25374 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25375 halfmode = V32QImode;
25376 extract
25377 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25378 break;
25379 case E_V32QImode:
25380 if (unsigned_p)
25381 unpack = gen_avx2_zero_extendv16qiv16hi2;
25382 else
25383 unpack = gen_avx2_sign_extendv16qiv16hi2;
25384 halfmode = V16QImode;
25385 extract
25386 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25387 break;
25388 case E_V32HImode:
25389 if (unsigned_p)
25390 unpack = gen_avx512f_zero_extendv16hiv16si2;
25391 else
25392 unpack = gen_avx512f_sign_extendv16hiv16si2;
25393 halfmode = V16HImode;
25394 extract
25395 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25396 break;
25397 case E_V16HImode:
25398 if (unsigned_p)
25399 unpack = gen_avx2_zero_extendv8hiv8si2;
25400 else
25401 unpack = gen_avx2_sign_extendv8hiv8si2;
25402 halfmode = V8HImode;
25403 extract
25404 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25405 break;
25406 case E_V16SImode:
25407 if (unsigned_p)
25408 unpack = gen_avx512f_zero_extendv8siv8di2;
25409 else
25410 unpack = gen_avx512f_sign_extendv8siv8di2;
25411 halfmode = V8SImode;
25412 extract
25413 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
25414 break;
25415 case E_V8SImode:
25416 if (unsigned_p)
25417 unpack = gen_avx2_zero_extendv4siv4di2;
25418 else
25419 unpack = gen_avx2_sign_extendv4siv4di2;
25420 halfmode = V4SImode;
25421 extract
25422 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
25423 break;
25424 case E_V16QImode:
25425 if (unsigned_p)
25426 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
25427 else
25428 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
25429 break;
25430 case E_V8HImode:
25431 if (unsigned_p)
25432 unpack = gen_sse4_1_zero_extendv4hiv4si2;
25433 else
25434 unpack = gen_sse4_1_sign_extendv4hiv4si2;
25435 break;
25436 case E_V4SImode:
25437 if (unsigned_p)
25438 unpack = gen_sse4_1_zero_extendv2siv2di2;
25439 else
25440 unpack = gen_sse4_1_sign_extendv2siv2di2;
25441 break;
25442 default:
25443 gcc_unreachable ();
25446 if (GET_MODE_SIZE (imode) >= 32)
25448 tmp = gen_reg_rtx (halfmode);
25449 emit_insn (extract (tmp, src));
25451 else if (high_p)
25453 /* Shift higher 8 bytes to lower 8 bytes. */
25454 tmp = gen_reg_rtx (V1TImode);
25455 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
25456 GEN_INT (64)));
25457 tmp = gen_lowpart (imode, tmp);
25459 else
25460 tmp = src;
25462 emit_insn (unpack (dest, tmp));
25464 else
25466 rtx (*unpack)(rtx, rtx, rtx);
25468 switch (imode)
25470 case E_V16QImode:
25471 if (high_p)
25472 unpack = gen_vec_interleave_highv16qi;
25473 else
25474 unpack = gen_vec_interleave_lowv16qi;
25475 break;
25476 case E_V8HImode:
25477 if (high_p)
25478 unpack = gen_vec_interleave_highv8hi;
25479 else
25480 unpack = gen_vec_interleave_lowv8hi;
25481 break;
25482 case E_V4SImode:
25483 if (high_p)
25484 unpack = gen_vec_interleave_highv4si;
25485 else
25486 unpack = gen_vec_interleave_lowv4si;
25487 break;
25488 default:
25489 gcc_unreachable ();
25492 if (unsigned_p)
25493 tmp = force_reg (imode, CONST0_RTX (imode));
25494 else
25495 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
25496 src, pc_rtx, pc_rtx);
25498 rtx tmp2 = gen_reg_rtx (imode);
25499 emit_insn (unpack (tmp2, src, tmp));
25500 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
25504 /* Expand conditional increment or decrement using adc/sbb instructions.
25505 The default case using setcc followed by the conditional move can be
25506 done by generic code. */
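/* Illustrative example, not part of the original source: for unsigned A
   and B, a conditional increment such as

     if (a < b) x++;

   can be expanded here as a compare that leaves the carry flag set when
   A < B, followed by an adc of 0 into X, avoiding both a setcc and a
   conditional move; the sbb form covers the decrement and the
   reversed-condition cases.  */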
25507 bool
25508 ix86_expand_int_addcc (rtx operands[])
25510 enum rtx_code code = GET_CODE (operands[1]);
25511 rtx flags;
25512 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
25513 rtx compare_op;
25514 rtx val = const0_rtx;
25515 bool fpcmp = false;
25516 machine_mode mode;
25517 rtx op0 = XEXP (operands[1], 0);
25518 rtx op1 = XEXP (operands[1], 1);
25520 if (operands[3] != const1_rtx
25521 && operands[3] != constm1_rtx)
25522 return false;
25523 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25524 return false;
25525 code = GET_CODE (compare_op);
25527 flags = XEXP (compare_op, 0);
25529 if (GET_MODE (flags) == CCFPmode)
25531 fpcmp = true;
25532 code = ix86_fp_compare_code_to_integer (code);
25535 if (code != LTU)
25537 val = constm1_rtx;
25538 if (fpcmp)
25539 PUT_CODE (compare_op,
25540 reverse_condition_maybe_unordered
25541 (GET_CODE (compare_op)));
25542 else
25543 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25546 mode = GET_MODE (operands[0]);
25548 /* Construct either adc or sbb insn. */
25549 if ((code == LTU) == (operands[3] == constm1_rtx))
25551 switch (mode)
25553 case E_QImode:
25554 insn = gen_subqi3_carry;
25555 break;
25556 case E_HImode:
25557 insn = gen_subhi3_carry;
25558 break;
25559 case E_SImode:
25560 insn = gen_subsi3_carry;
25561 break;
25562 case E_DImode:
25563 insn = gen_subdi3_carry;
25564 break;
25565 default:
25566 gcc_unreachable ();
25569 else
25571 switch (mode)
25573 case E_QImode:
25574 insn = gen_addqi3_carry;
25575 break;
25576 case E_HImode:
25577 insn = gen_addhi3_carry;
25578 break;
25579 case E_SImode:
25580 insn = gen_addsi3_carry;
25581 break;
25582 case E_DImode:
25583 insn = gen_adddi3_carry;
25584 break;
25585 default:
25586 gcc_unreachable ();
25589 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25591 return true;
25595 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
25596 but works for floating point parameters and non-offsettable memories.
25597 For pushes, it returns just stack offsets; the values will be saved
25598 in the right order. At most four parts are generated. */
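/* Illustrative example, not part of the original source: on a 32-bit
   target a DFmode constant splits into two SImode immediates holding
   the low and high words of its IEEE-754 image, an XFmode value into
   three SImode parts, and a DImode register pair into its two SImode
   halves.  */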
25600 static int
25601 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25603 int size;
25605 if (!TARGET_64BIT)
25606 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25607 else
25608 size = (GET_MODE_SIZE (mode) + 4) / 8;
25610 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25611 gcc_assert (size >= 2 && size <= 4);
25613 /* Optimize constant pool reference to immediates. This is used by fp
25614 moves, that force all constants to memory to allow combining. */
25615 if (MEM_P (operand) && MEM_READONLY_P (operand))
25616 operand = avoid_constant_pool_reference (operand);
25618 if (MEM_P (operand) && !offsettable_memref_p (operand))
25620 /* The only non-offsetable memories we handle are pushes. */
25621 int ok = push_operand (operand, VOIDmode);
25623 gcc_assert (ok);
25625 operand = copy_rtx (operand);
25626 PUT_MODE (operand, word_mode);
25627 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25628 return size;
25631 if (GET_CODE (operand) == CONST_VECTOR)
25633 scalar_int_mode imode = int_mode_for_mode (mode).require ();
25634 /* Caution: if we looked through a constant pool memory above,
25635 the operand may actually have a different mode now. That's
25636 ok, since we want to pun this all the way back to an integer. */
25637 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25638 gcc_assert (operand != NULL);
25639 mode = imode;
25642 if (!TARGET_64BIT)
25644 if (mode == DImode)
25645 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25646 else
25648 int i;
25650 if (REG_P (operand))
25652 gcc_assert (reload_completed);
25653 for (i = 0; i < size; i++)
25654 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25656 else if (offsettable_memref_p (operand))
25658 operand = adjust_address (operand, SImode, 0);
25659 parts[0] = operand;
25660 for (i = 1; i < size; i++)
25661 parts[i] = adjust_address (operand, SImode, 4 * i);
25663 else if (CONST_DOUBLE_P (operand))
25665 const REAL_VALUE_TYPE *r;
25666 long l[4];
25668 r = CONST_DOUBLE_REAL_VALUE (operand);
25669 switch (mode)
25671 case E_TFmode:
25672 real_to_target (l, r, mode);
25673 parts[3] = gen_int_mode (l[3], SImode);
25674 parts[2] = gen_int_mode (l[2], SImode);
25675 break;
25676 case E_XFmode:
25677 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25678 long double may not be 80-bit. */
25679 real_to_target (l, r, mode);
25680 parts[2] = gen_int_mode (l[2], SImode);
25681 break;
25682 case E_DFmode:
25683 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25684 break;
25685 default:
25686 gcc_unreachable ();
25688 parts[1] = gen_int_mode (l[1], SImode);
25689 parts[0] = gen_int_mode (l[0], SImode);
25691 else
25692 gcc_unreachable ();
25695 else
25697 if (mode == TImode)
25698 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25699 if (mode == XFmode || mode == TFmode)
25701 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25702 if (REG_P (operand))
25704 gcc_assert (reload_completed);
25705 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25706 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25708 else if (offsettable_memref_p (operand))
25710 operand = adjust_address (operand, DImode, 0);
25711 parts[0] = operand;
25712 parts[1] = adjust_address (operand, upper_mode, 8);
25714 else if (CONST_DOUBLE_P (operand))
25716 long l[4];
25718 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25720 /* real_to_target puts 32-bit pieces in each long. */
25721 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25722 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25723 << 32), DImode);
25725 if (upper_mode == SImode)
25726 parts[1] = gen_int_mode (l[2], SImode);
25727 else
25728 parts[1]
25729 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25730 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25731 << 32), DImode);
25733 else
25734 gcc_unreachable ();
25738 return size;
25741 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25742 The value is split into half-mode parts, with the destination parts
25743 placed in operands 2 onwards and the source parts in operands 6
25744 onwards, in the correct order, before the moves are emitted. */
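/* Illustrative example, not part of the original source: on a 32-bit
   target, copying a DImode value from memory addressed by %eax into the
   %eax/%edx pair must load the half that does not overlap the address
   register first (or rebuild the address with an lea when there are
   several collisions), so that %eax is not clobbered before the second
   SImode load; the collision handling below performs this reordering.  */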
25746 void
25747 ix86_split_long_move (rtx operands[])
25749 rtx part[2][4];
25750 int nparts, i, j;
25751 int push = 0;
25752 int collisions = 0;
25753 machine_mode mode = GET_MODE (operands[0]);
25754 bool collisionparts[4];
25756 /* The DFmode expanders may ask us to move a double.
25757 For a 64bit target this is a single move. By hiding the fact
25758 here we simplify the i386.md splitters. */
25759 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25761 /* Optimize constant pool reference to immediates. This is used by
25762 fp moves, that force all constants to memory to allow combining. */
25764 if (MEM_P (operands[1])
25765 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25766 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25767 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25768 if (push_operand (operands[0], VOIDmode))
25770 operands[0] = copy_rtx (operands[0]);
25771 PUT_MODE (operands[0], word_mode);
25773 else
25774 operands[0] = gen_lowpart (DImode, operands[0]);
25775 operands[1] = gen_lowpart (DImode, operands[1]);
25776 emit_move_insn (operands[0], operands[1]);
25777 return;
25780 /* The only non-offsettable memory we handle is push. */
25781 if (push_operand (operands[0], VOIDmode))
25782 push = 1;
25783 else
25784 gcc_assert (!MEM_P (operands[0])
25785 || offsettable_memref_p (operands[0]));
25787 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25788 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25790 /* When emitting push, take care for source operands on the stack. */
25791 if (push && MEM_P (operands[1])
25792 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25794 rtx src_base = XEXP (part[1][nparts - 1], 0);
25796 /* Compensate for the stack decrement by 4. */
25797 if (!TARGET_64BIT && nparts == 3
25798 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25799 src_base = plus_constant (Pmode, src_base, 4);
25801 /* src_base refers to the stack pointer and is
25802 automatically decreased by emitted push. */
25803 for (i = 0; i < nparts; i++)
25804 part[1][i] = change_address (part[1][i],
25805 GET_MODE (part[1][i]), src_base);
25808 /* We need to do copy in the right order in case an address register
25809 of the source overlaps the destination. */
25810 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25812 rtx tmp;
25814 for (i = 0; i < nparts; i++)
25816 collisionparts[i]
25817 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25818 if (collisionparts[i])
25819 collisions++;
25822 /* Collision in the middle part can be handled by reordering. */
25823 if (collisions == 1 && nparts == 3 && collisionparts [1])
25825 std::swap (part[0][1], part[0][2]);
25826 std::swap (part[1][1], part[1][2]);
25828 else if (collisions == 1
25829 && nparts == 4
25830 && (collisionparts [1] || collisionparts [2]))
25832 if (collisionparts [1])
25834 std::swap (part[0][1], part[0][2]);
25835 std::swap (part[1][1], part[1][2]);
25837 else
25839 std::swap (part[0][2], part[0][3]);
25840 std::swap (part[1][2], part[1][3]);
25844 /* If there are more collisions, we can't handle it by reordering.
25845 Do an lea to the last part and use only one colliding move. */
25846 else if (collisions > 1)
25848 rtx base, addr;
25850 collisions = 1;
25852 base = part[0][nparts - 1];
25854 /* Handle the case when the last part isn't valid for lea.
25855 Happens in 64-bit mode storing the 12-byte XFmode. */
25856 if (GET_MODE (base) != Pmode)
25857 base = gen_rtx_REG (Pmode, REGNO (base));
25859 addr = XEXP (part[1][0], 0);
25860 if (TARGET_TLS_DIRECT_SEG_REFS)
25862 struct ix86_address parts;
25863 int ok = ix86_decompose_address (addr, &parts);
25864 gcc_assert (ok);
25865 /* It is not valid to use %gs: or %fs: in lea. */
25866 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25868 emit_insn (gen_rtx_SET (base, addr));
25869 part[1][0] = replace_equiv_address (part[1][0], base);
25870 for (i = 1; i < nparts; i++)
25872 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25873 part[1][i] = replace_equiv_address (part[1][i], tmp);
25878 if (push)
25880 if (!TARGET_64BIT)
25882 if (nparts == 3)
25884 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25885 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25886 stack_pointer_rtx, GEN_INT (-4)));
25887 emit_move_insn (part[0][2], part[1][2]);
25889 else if (nparts == 4)
25891 emit_move_insn (part[0][3], part[1][3]);
25892 emit_move_insn (part[0][2], part[1][2]);
25895 else
25897 /* In 64bit mode we don't have a 32bit push available. In case this is
25898 a register, it is OK - we will just use the larger counterpart. We also
25899 retype memory - this comes from an attempt to avoid a REX prefix on
25900 the move of the second half of a TFmode value. */
25901 if (GET_MODE (part[1][1]) == SImode)
25903 switch (GET_CODE (part[1][1]))
25905 case MEM:
25906 part[1][1] = adjust_address (part[1][1], DImode, 0);
25907 break;
25909 case REG:
25910 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25911 break;
25913 default:
25914 gcc_unreachable ();
25917 if (GET_MODE (part[1][0]) == SImode)
25918 part[1][0] = part[1][1];
25921 emit_move_insn (part[0][1], part[1][1]);
25922 emit_move_insn (part[0][0], part[1][0]);
25923 return;
25926 /* Choose correct order to not overwrite the source before it is copied. */
25927 if ((REG_P (part[0][0])
25928 && REG_P (part[1][1])
25929 && (REGNO (part[0][0]) == REGNO (part[1][1])
25930 || (nparts == 3
25931 && REGNO (part[0][0]) == REGNO (part[1][2]))
25932 || (nparts == 4
25933 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25934 || (collisions > 0
25935 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25937 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25939 operands[2 + i] = part[0][j];
25940 operands[6 + i] = part[1][j];
25943 else
25945 for (i = 0; i < nparts; i++)
25947 operands[2 + i] = part[0][i];
25948 operands[6 + i] = part[1][i];
25952 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25953 if (optimize_insn_for_size_p ())
25955 for (j = 0; j < nparts - 1; j++)
25956 if (CONST_INT_P (operands[6 + j])
25957 && operands[6 + j] != const0_rtx
25958 && REG_P (operands[2 + j]))
25959 for (i = j; i < nparts - 1; i++)
25960 if (CONST_INT_P (operands[7 + i])
25961 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25962 operands[7 + i] = operands[2 + j];
25965 for (i = 0; i < nparts; i++)
25966 emit_move_insn (operands[2 + i], operands[6 + i]);
25968 return;
25971 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25972 left shift by a constant, either using a single shift or
25973 a sequence of add instructions. */
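/* Illustrative example, not part of the original source: a left shift
   by 1 is always emitted as a single "add reg, reg", and shifts by 2 or
   3 may also become repeated adds when the cost tables say adds are
   cheaper than a constant shift; larger counts fall through to a single
   shl instruction.  */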
25975 static void
25976 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25978 rtx (*insn)(rtx, rtx, rtx);
25980 if (count == 1
25981 || (count * ix86_cost->add <= ix86_cost->shift_const
25982 && !optimize_insn_for_size_p ()))
25984 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25985 while (count-- > 0)
25986 emit_insn (insn (operand, operand, operand));
25988 else
25990 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25991 emit_insn (insn (operand, operand, GEN_INT (count)));
25995 void
25996 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25998 rtx (*gen_ashl3)(rtx, rtx, rtx);
25999 rtx (*gen_shld)(rtx, rtx, rtx);
26000 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26002 rtx low[2], high[2];
26003 int count;
26005 if (CONST_INT_P (operands[2]))
26007 split_double_mode (mode, operands, 2, low, high);
26008 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26010 if (count >= half_width)
26012 emit_move_insn (high[0], low[1]);
26013 emit_move_insn (low[0], const0_rtx);
26015 if (count > half_width)
26016 ix86_expand_ashl_const (high[0], count - half_width, mode);
26018 else
26020 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26022 if (!rtx_equal_p (operands[0], operands[1]))
26023 emit_move_insn (operands[0], operands[1]);
26025 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
26026 ix86_expand_ashl_const (low[0], count, mode);
26028 return;
26031 split_double_mode (mode, operands, 1, low, high);
26033 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26035 if (operands[1] == const1_rtx)
26037 /* Assuming we've chosen QImode-capable registers, 1 << N
26038 can be done with two 32/64-bit shifts, no branches, no cmoves. */
26039 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
26041 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
26043 ix86_expand_clear (low[0]);
26044 ix86_expand_clear (high[0]);
26045 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
26047 d = gen_lowpart (QImode, low[0]);
26048 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26049 s = gen_rtx_EQ (QImode, flags, const0_rtx);
26050 emit_insn (gen_rtx_SET (d, s));
26052 d = gen_lowpart (QImode, high[0]);
26053 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26054 s = gen_rtx_NE (QImode, flags, const0_rtx);
26055 emit_insn (gen_rtx_SET (d, s));
26058 /* Otherwise, we can get the same results by manually performing
26059 a bit extract operation on bit 5/6, and then performing the two
26060 shifts. The two methods of getting 0/1 into low/high are exactly
26061 the same size. Avoiding the shift in the bit extract case helps
26062 pentium4 a bit; no one else seems to care much either way. */
26063 else
26065 machine_mode half_mode;
26066 rtx (*gen_lshr3)(rtx, rtx, rtx);
26067 rtx (*gen_and3)(rtx, rtx, rtx);
26068 rtx (*gen_xor3)(rtx, rtx, rtx);
26069 HOST_WIDE_INT bits;
26070 rtx x;
26072 if (mode == DImode)
26074 half_mode = SImode;
26075 gen_lshr3 = gen_lshrsi3;
26076 gen_and3 = gen_andsi3;
26077 gen_xor3 = gen_xorsi3;
26078 bits = 5;
26080 else
26082 half_mode = DImode;
26083 gen_lshr3 = gen_lshrdi3;
26084 gen_and3 = gen_anddi3;
26085 gen_xor3 = gen_xordi3;
26086 bits = 6;
26089 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
26090 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
26091 else
26092 x = gen_lowpart (half_mode, operands[2]);
26093 emit_insn (gen_rtx_SET (high[0], x));
26095 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
26096 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
26097 emit_move_insn (low[0], high[0]);
26098 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
26101 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26102 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
26103 return;
26106 if (operands[1] == constm1_rtx)
26108 /* For -1 << N, we can avoid the shld instruction, because we
26109 know that we're shifting 0...31/63 ones into a -1. */
26110 emit_move_insn (low[0], constm1_rtx);
26111 if (optimize_insn_for_size_p ())
26112 emit_move_insn (high[0], low[0]);
26113 else
26114 emit_move_insn (high[0], constm1_rtx);
26116 else
26118 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26120 if (!rtx_equal_p (operands[0], operands[1]))
26121 emit_move_insn (operands[0], operands[1]);
26123 split_double_mode (mode, operands, 1, low, high);
26124 emit_insn (gen_shld (high[0], low[0], operands[2]));
26127 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26129 if (TARGET_CMOVE && scratch)
26131 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26132 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26134 ix86_expand_clear (scratch);
26135 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
26137 else
26139 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26140 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26142 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
26146 void
26147 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
26149 rtx (*gen_ashr3)(rtx, rtx, rtx)
26150 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
26151 rtx (*gen_shrd)(rtx, rtx, rtx);
26152 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26154 rtx low[2], high[2];
26155 int count;
26157 if (CONST_INT_P (operands[2]))
26159 split_double_mode (mode, operands, 2, low, high);
26160 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26162 if (count == GET_MODE_BITSIZE (mode) - 1)
26164 emit_move_insn (high[0], high[1]);
26165 emit_insn (gen_ashr3 (high[0], high[0],
26166 GEN_INT (half_width - 1)));
26167 emit_move_insn (low[0], high[0]);
26170 else if (count >= half_width)
26172 emit_move_insn (low[0], high[1]);
26173 emit_move_insn (high[0], low[0]);
26174 emit_insn (gen_ashr3 (high[0], high[0],
26175 GEN_INT (half_width - 1)));
26177 if (count > half_width)
26178 emit_insn (gen_ashr3 (low[0], low[0],
26179 GEN_INT (count - half_width)));
26181 else
26183 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26185 if (!rtx_equal_p (operands[0], operands[1]))
26186 emit_move_insn (operands[0], operands[1]);
26188 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26189 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
26192 else
26194 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26196 if (!rtx_equal_p (operands[0], operands[1]))
26197 emit_move_insn (operands[0], operands[1]);
26199 split_double_mode (mode, operands, 1, low, high);
26201 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26202 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
26204 if (TARGET_CMOVE && scratch)
26206 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26207 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26209 emit_move_insn (scratch, high[0]);
26210 emit_insn (gen_ashr3 (scratch, scratch,
26211 GEN_INT (half_width - 1)));
26212 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26213 scratch));
26215 else
26217 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26218 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26220 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
26225 void
26226 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26228 rtx (*gen_lshr3)(rtx, rtx, rtx)
26229 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26230 rtx (*gen_shrd)(rtx, rtx, rtx);
26231 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26233 rtx low[2], high[2];
26234 int count;
26236 if (CONST_INT_P (operands[2]))
26238 split_double_mode (mode, operands, 2, low, high);
26239 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26241 if (count >= half_width)
26243 emit_move_insn (low[0], high[1]);
26244 ix86_expand_clear (high[0]);
26246 if (count > half_width)
26247 emit_insn (gen_lshr3 (low[0], low[0],
26248 GEN_INT (count - half_width)));
26250 else
26252 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26254 if (!rtx_equal_p (operands[0], operands[1]))
26255 emit_move_insn (operands[0], operands[1]);
26257 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26258 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26261 else
26263 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26265 if (!rtx_equal_p (operands[0], operands[1]))
26266 emit_move_insn (operands[0], operands[1]);
26268 split_double_mode (mode, operands, 1, low, high);
26270 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26271 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26273 if (TARGET_CMOVE && scratch)
26275 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26276 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26278 ix86_expand_clear (scratch);
26279 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26280 scratch));
26282 else
26284 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26285 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26287 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26292 /* Predict just emitted jump instruction to be taken with probability PROB. */
26293 static void
26294 predict_jump (int prob)
26296 rtx_insn *insn = get_last_insn ();
26297 gcc_assert (JUMP_P (insn));
26298 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
26301 /* Helper function for the string operations below. Test whether VARIABLE
26302 is aligned to VALUE bytes. If true, jump to the label. */
26303 static rtx_code_label *
26304 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26306 rtx_code_label *label = gen_label_rtx ();
26307 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26308 if (GET_MODE (variable) == DImode)
26309 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26310 else
26311 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26312 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26313 1, label);
26314 if (epilogue)
26315 predict_jump (REG_BR_PROB_BASE * 50 / 100);
26316 else
26317 predict_jump (REG_BR_PROB_BASE * 90 / 100);
26318 return label;
26321 /* Adjust COUNTER by the VALUE. */
26322 static void
26323 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26325 rtx (*gen_add)(rtx, rtx, rtx)
26326 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26328 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26331 /* Zero extend possibly SImode EXP to Pmode register. */
26332 rtx
26333 ix86_zero_extend_to_Pmode (rtx exp)
26335 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26338 /* Divide COUNTREG by SCALE. */
26339 static rtx
26340 scale_counter (rtx countreg, int scale)
26342 rtx sc;
26344 if (scale == 1)
26345 return countreg;
26346 if (CONST_INT_P (countreg))
26347 return GEN_INT (INTVAL (countreg) / scale);
26348 gcc_assert (REG_P (countreg));
26350 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26351 GEN_INT (exact_log2 (scale)),
26352 NULL, 1, OPTAB_DIRECT);
26353 return sc;
26356 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26357 DImode for constant loop counts. */
26359 static machine_mode
26360 counter_mode (rtx count_exp)
26362 if (GET_MODE (count_exp) != VOIDmode)
26363 return GET_MODE (count_exp);
26364 if (!CONST_INT_P (count_exp))
26365 return Pmode;
26366 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26367 return DImode;
26368 return SImode;
26371 /* Copy the address to a Pmode register. This is used for x32 to
26372 truncate DImode TLS address to a SImode register. */
26374 static rtx
26375 ix86_copy_addr_to_reg (rtx addr)
26377 rtx reg;
26378 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
26380 reg = copy_addr_to_reg (addr);
26381 REG_POINTER (reg) = 1;
26382 return reg;
26384 else
26386 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
26387 reg = copy_to_mode_reg (DImode, addr);
26388 REG_POINTER (reg) = 1;
26389 return gen_rtx_SUBREG (SImode, reg, 0);
26393 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
26394 SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size
26395 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
26396 loop to set memory by VALUE (supposed to be in MODE).
26398 The size is rounded down to whole number of chunk size moved at once.
26399 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
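/* Illustrative sketch, not part of the original source: for a copy in
   DImode unrolled 4 times, the emitted loop first rounds COUNT down to
   a multiple of 32 bytes, then on each iteration loads four DImode
   temporaries from SRCMEM, stores them to DESTMEM and advances the
   induction variable by 32; the remaining tail bytes are left for the
   epilogue code generated by the callers.  */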
26402 static void
26403 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
26404 rtx destptr, rtx srcptr, rtx value,
26405 rtx count, machine_mode mode, int unroll,
26406 int expected_size, bool issetmem)
26408 rtx_code_label *out_label, *top_label;
26409 rtx iter, tmp;
26410 machine_mode iter_mode = counter_mode (count);
26411 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
26412 rtx piece_size = GEN_INT (piece_size_n);
26413 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
26414 rtx size;
26415 int i;
26417 top_label = gen_label_rtx ();
26418 out_label = gen_label_rtx ();
26419 iter = gen_reg_rtx (iter_mode);
26421 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
26422 NULL, 1, OPTAB_DIRECT);
26423 /* Those two should combine. */
26424 if (piece_size == const1_rtx)
26426 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
26427 true, out_label);
26428 predict_jump (REG_BR_PROB_BASE * 10 / 100);
26430 emit_move_insn (iter, const0_rtx);
26432 emit_label (top_label);
26434 tmp = convert_modes (Pmode, iter_mode, iter, true);
26436 /* This assert could be relaxed - in this case we'll need to compute
26437 the smallest power of two containing PIECE_SIZE_N and pass it to
26438 offset_address. */
26439 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
26440 destmem = offset_address (destmem, tmp, piece_size_n);
26441 destmem = adjust_address (destmem, mode, 0);
26443 if (!issetmem)
26445 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
26446 srcmem = adjust_address (srcmem, mode, 0);
26448 /* When unrolling for chips that reorder memory reads and writes,
26449 we can save registers by using a single temporary.
26450 Also, using 4 temporaries is overkill in 32bit mode. */
26451 if (!TARGET_64BIT && 0)
26453 for (i = 0; i < unroll; i++)
26455 if (i)
26457 destmem =
26458 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26459 srcmem =
26460 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26462 emit_move_insn (destmem, srcmem);
26465 else
26467 rtx tmpreg[4];
26468 gcc_assert (unroll <= 4);
26469 for (i = 0; i < unroll; i++)
26471 tmpreg[i] = gen_reg_rtx (mode);
26472 if (i)
26474 srcmem =
26475 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26477 emit_move_insn (tmpreg[i], srcmem);
26479 for (i = 0; i < unroll; i++)
26481 if (i)
26483 destmem =
26484 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26486 emit_move_insn (destmem, tmpreg[i]);
26490 else
26491 for (i = 0; i < unroll; i++)
26493 if (i)
26494 destmem =
26495 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26496 emit_move_insn (destmem, value);
26499 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26500 true, OPTAB_LIB_WIDEN);
26501 if (tmp != iter)
26502 emit_move_insn (iter, tmp);
26504 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26505 true, top_label);
26506 if (expected_size != -1)
26508 expected_size /= GET_MODE_SIZE (mode) * unroll;
26509 if (expected_size == 0)
26510 predict_jump (0);
26511 else if (expected_size > REG_BR_PROB_BASE)
26512 predict_jump (REG_BR_PROB_BASE - 1);
26513 else
26514 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26516 else
26517 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26518 iter = ix86_zero_extend_to_Pmode (iter);
26519 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26520 true, OPTAB_LIB_WIDEN);
26521 if (tmp != destptr)
26522 emit_move_insn (destptr, tmp);
26523 if (!issetmem)
26525 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26526 true, OPTAB_LIB_WIDEN);
26527 if (tmp != srcptr)
26528 emit_move_insn (srcptr, tmp);
26530 emit_label (out_label);
26533 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
26534 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26535 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26536 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26537 ORIG_VALUE is the original value passed to memset to fill the memory with.
26538 Other arguments have same meaning as for previous function. */
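/* Illustrative example, not part of the original source: clearing a
   block whose constant size is a multiple of 4 promotes MODE from
   QImode to SImode below, divides the count by 4 and emits a single
   "rep stos" on 4-byte elements; the corresponding memcpy becomes
   "rep movs", with the pointer registers advanced by the string
   instruction itself.  */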
26540 static void
26541 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26542 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26543 rtx count,
26544 machine_mode mode, bool issetmem)
26546 rtx destexp;
26547 rtx srcexp;
26548 rtx countreg;
26549 HOST_WIDE_INT rounded_count;
26551 /* If possible, it is shorter to use rep movs.
26552 TODO: Maybe it is better to move this logic to decide_alg. */
26553 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26554 && (!issetmem || orig_value == const0_rtx))
26555 mode = SImode;
26557 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26558 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26560 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26561 GET_MODE_SIZE (mode)));
26562 if (mode != QImode)
26564 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26565 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26566 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26568 else
26569 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26570 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26572 rounded_count
26573 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26574 destmem = shallow_copy_rtx (destmem);
26575 set_mem_size (destmem, rounded_count);
26577 else if (MEM_SIZE_KNOWN_P (destmem))
26578 clear_mem_size (destmem);
26580 if (issetmem)
26582 value = force_reg (mode, gen_lowpart (mode, value));
26583 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26585 else
26587 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26588 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26589 if (mode != QImode)
26591 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26592 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26593 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26595 else
26596 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26597 if (CONST_INT_P (count))
26599 rounded_count
26600 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26601 srcmem = shallow_copy_rtx (srcmem);
26602 set_mem_size (srcmem, rounded_count);
26604 else
26606 if (MEM_SIZE_KNOWN_P (srcmem))
26607 clear_mem_size (srcmem);
26609 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26610 destexp, srcexp));
26614 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26615 DESTMEM.
26616 SRCMEM is passed by pointer so that it can be updated on return.
26617 The return value is the updated DESTMEM. */
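/* Illustrative example, not part of the original source: for
   SIZE_TO_MOVE == 16 on a 64-bit target with SSE2, the code below picks
   a 16-byte vector mode and emits one load/store pair through a vector
   temporary, advancing DESTPTR and SRCPTR by 16; without a usable
   vector move it falls back to two DImode load/store pairs instead.  */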
26618 static rtx
26619 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26620 HOST_WIDE_INT size_to_move)
26622 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26623 enum insn_code code;
26624 machine_mode move_mode;
26625 int piece_size, i;
26627 /* Find the widest mode in which we could perform moves.
26628 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
26629 it until move of such size is supported. */
26630 piece_size = 1 << floor_log2 (size_to_move);
26631 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
26632 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26634 gcc_assert (piece_size > 1);
26635 piece_size >>= 1;
26638 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26639 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26640 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26642 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26643 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
26644 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26646 move_mode = word_mode;
26647 piece_size = GET_MODE_SIZE (move_mode);
26648 code = optab_handler (mov_optab, move_mode);
26651 gcc_assert (code != CODE_FOR_nothing);
26653 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26654 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26656 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26657 gcc_assert (size_to_move % piece_size == 0);
26658 adjust = GEN_INT (piece_size);
26659 for (i = 0; i < size_to_move; i += piece_size)
26661 /* We move from memory to memory, so we'll need to do it via
26662 a temporary register. */
26663 tempreg = gen_reg_rtx (move_mode);
26664 emit_insn (GEN_FCN (code) (tempreg, src));
26665 emit_insn (GEN_FCN (code) (dst, tempreg));
26667 emit_move_insn (destptr,
26668 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26669 emit_move_insn (srcptr,
26670 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26672 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26673 piece_size);
26674 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26675 piece_size);
26678 /* Update DST and SRC rtx. */
26679 *srcmem = src;
26680 return dst;
26683 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
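/* Illustrative example, not part of the original source: with a
   constant COUNT and MAX_SIZE == 16, one move is emitted for each set
   bit of COUNT % 16 (an 8-, 4-, 2- and/or 1-byte copy); for a variable
   COUNT, large epilogues fall back to a byte-copy loop while small ones
   test the individual bits of COUNT at run time and conditionally
   perform each copy.  */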
26684 static void
26685 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26686 rtx destptr, rtx srcptr, rtx count, int max_size)
26688 rtx src, dest;
26689 if (CONST_INT_P (count))
26691 HOST_WIDE_INT countval = INTVAL (count);
26692 HOST_WIDE_INT epilogue_size = countval % max_size;
26693 int i;
26695 /* For now MAX_SIZE should be a power of 2. This assert could be
26696 relaxed, but it'll require a bit more complicated epilogue
26697 expanding. */
26698 gcc_assert ((max_size & (max_size - 1)) == 0);
26699 for (i = max_size; i >= 1; i >>= 1)
26701 if (epilogue_size & i)
26702 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26704 return;
26706 if (max_size > 8)
26708 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26709 count, 1, OPTAB_DIRECT);
26710 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26711 count, QImode, 1, 4, false);
26712 return;
26715 /* When there are stringops, we can cheaply increase dest and src pointers.
26716 Otherwise we save code size by maintaining offset (zero is readily
26717 available from preceding rep operation) and using x86 addressing modes. */
26719 if (TARGET_SINGLE_STRINGOP)
26721 if (max_size > 4)
26723 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26724 src = change_address (srcmem, SImode, srcptr);
26725 dest = change_address (destmem, SImode, destptr);
26726 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26727 emit_label (label);
26728 LABEL_NUSES (label) = 1;
26730 if (max_size > 2)
26732 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26733 src = change_address (srcmem, HImode, srcptr);
26734 dest = change_address (destmem, HImode, destptr);
26735 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26736 emit_label (label);
26737 LABEL_NUSES (label) = 1;
26739 if (max_size > 1)
26741 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26742 src = change_address (srcmem, QImode, srcptr);
26743 dest = change_address (destmem, QImode, destptr);
26744 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26745 emit_label (label);
26746 LABEL_NUSES (label) = 1;
26749 else
26751 rtx offset = force_reg (Pmode, const0_rtx);
26752 rtx tmp;
26754 if (max_size > 4)
26756 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26757 src = change_address (srcmem, SImode, srcptr);
26758 dest = change_address (destmem, SImode, destptr);
26759 emit_move_insn (dest, src);
26760 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26761 true, OPTAB_LIB_WIDEN);
26762 if (tmp != offset)
26763 emit_move_insn (offset, tmp);
26764 emit_label (label);
26765 LABEL_NUSES (label) = 1;
26767 if (max_size > 2)
26769 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26770 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26771 src = change_address (srcmem, HImode, tmp);
26772 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26773 dest = change_address (destmem, HImode, tmp);
26774 emit_move_insn (dest, src);
26775 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26776 true, OPTAB_LIB_WIDEN);
26777 if (tmp != offset)
26778 emit_move_insn (offset, tmp);
26779 emit_label (label);
26780 LABEL_NUSES (label) = 1;
26782 if (max_size > 1)
26784 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26785 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26786 src = change_address (srcmem, QImode, tmp);
26787 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26788 dest = change_address (destmem, QImode, tmp);
26789 emit_move_insn (dest, src);
26790 emit_label (label);
26791 LABEL_NUSES (label) = 1;
26796 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26797 with value PROMOTED_VAL.
26798 The return value is the updated DESTMEM. */
26800 static rtx
26801 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26802 HOST_WIDE_INT size_to_move)
26804 rtx dst = destmem, adjust;
26805 enum insn_code code;
26806 machine_mode move_mode;
26807 int piece_size, i;
26809 /* Find the widest mode in which we could perform moves.
26810 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
26811 it until move of such size is supported. */
26812 move_mode = GET_MODE (promoted_val);
26813 if (move_mode == VOIDmode)
26814 move_mode = QImode;
26815 if (size_to_move < GET_MODE_SIZE (move_mode))
26817 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26818 move_mode = int_mode_for_size (move_bits, 0).require ();
26819 promoted_val = gen_lowpart (move_mode, promoted_val);
26821 piece_size = GET_MODE_SIZE (move_mode);
26822 code = optab_handler (mov_optab, move_mode);
26823 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26825 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26827 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26828 gcc_assert (size_to_move % piece_size == 0);
26829 adjust = GEN_INT (piece_size);
26830 for (i = 0; i < size_to_move; i += piece_size)
26832 if (piece_size <= GET_MODE_SIZE (word_mode))
26834 emit_insn (gen_strset (destptr, dst, promoted_val));
26835 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26836 piece_size);
26837 continue;
26840 emit_insn (GEN_FCN (code) (dst, promoted_val));
26842 emit_move_insn (destptr,
26843 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26845 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26846 piece_size);
26849 /* Update DST rtx. */
26850 return dst;
26852 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26853 static void
26854 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26855 rtx count, int max_size)
26857 count =
26858 expand_simple_binop (counter_mode (count), AND, count,
26859 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26860 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26861 gen_lowpart (QImode, value), count, QImode,
26862 1, max_size / 2, true);
26865 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26866 static void
26867 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26868 rtx count, int max_size)
26870 rtx dest;
26872 if (CONST_INT_P (count))
26874 HOST_WIDE_INT countval = INTVAL (count);
26875 HOST_WIDE_INT epilogue_size = countval % max_size;
26876 int i;
26878 /* For now MAX_SIZE should be a power of 2. This assert could be
26879 relaxed, but it'll require a bit more complicated epilogue
26880 expanding. */
26881 gcc_assert ((max_size & (max_size - 1)) == 0);
26882 for (i = max_size; i >= 1; i >>= 1)
26884 if (epilogue_size & i)
26886 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26887 destmem = emit_memset (destmem, destptr, vec_value, i);
26888 else
26889 destmem = emit_memset (destmem, destptr, value, i);
26892 return;
26894 if (max_size > 32)
26896 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26897 return;
26899 if (max_size > 16)
26901 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26902 if (TARGET_64BIT)
26904 dest = change_address (destmem, DImode, destptr);
26905 emit_insn (gen_strset (destptr, dest, value));
26906 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26907 emit_insn (gen_strset (destptr, dest, value));
26909 else
26911 dest = change_address (destmem, SImode, destptr);
26912 emit_insn (gen_strset (destptr, dest, value));
26913 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26914 emit_insn (gen_strset (destptr, dest, value));
26915 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26916 emit_insn (gen_strset (destptr, dest, value));
26917 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26918 emit_insn (gen_strset (destptr, dest, value));
26920 emit_label (label);
26921 LABEL_NUSES (label) = 1;
26923 if (max_size > 8)
26925 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26926 if (TARGET_64BIT)
26928 dest = change_address (destmem, DImode, destptr);
26929 emit_insn (gen_strset (destptr, dest, value));
26931 else
26933 dest = change_address (destmem, SImode, destptr);
26934 emit_insn (gen_strset (destptr, dest, value));
26935 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26936 emit_insn (gen_strset (destptr, dest, value));
26938 emit_label (label);
26939 LABEL_NUSES (label) = 1;
26941 if (max_size > 4)
26943 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26944 dest = change_address (destmem, SImode, destptr);
26945 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26946 emit_label (label);
26947 LABEL_NUSES (label) = 1;
26949 if (max_size > 2)
26951 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26952 dest = change_address (destmem, HImode, destptr);
26953 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26954 emit_label (label);
26955 LABEL_NUSES (label) = 1;
26957 if (max_size > 1)
26959 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26960 dest = change_address (destmem, QImode, destptr);
26961 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26962 emit_label (label);
26963 LABEL_NUSES (label) = 1;
26967 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
26968 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26969 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26970 ignored.
26971 Return value is updated DESTMEM. */
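/* Illustrative example, not part of the original source: with ALIGN == 4
   and DESIRED_ALIGNMENT == 16 the loop below emits two conditional
   steps: if bit 2 of DESTPTR is set, copy (or set) 4 bytes and adjust
   COUNT; then, if bit 3 is set, handle 8 more bytes, leaving DESTPTR
   16-byte aligned for the main loop.  */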
26972 static rtx
26973 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26974 rtx destptr, rtx srcptr, rtx value,
26975 rtx vec_value, rtx count, int align,
26976 int desired_alignment, bool issetmem)
26978 int i;
26979 for (i = 1; i < desired_alignment; i <<= 1)
26981 if (align <= i)
26983 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26984 if (issetmem)
26986 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26987 destmem = emit_memset (destmem, destptr, vec_value, i);
26988 else
26989 destmem = emit_memset (destmem, destptr, value, i);
26991 else
26992 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26993 ix86_adjust_counter (count, i);
26994 emit_label (label);
26995 LABEL_NUSES (label) = 1;
26996 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26999 return destmem;
27002 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
27003 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
27004 and jump to DONE_LABEL. */
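/* Illustrative example, not part of the original source: for SIZE == 4
   and a runtime COUNT in the range 4..7, the code below copies (or
   stores) the first 4 bytes of the block and then the last 4 bytes
   ending at DESTPTR + COUNT; the two accesses may overlap, but together
   they cover the whole block, after which control jumps to
   DONE_LABEL.  */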
27005 static void
27006 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
27007 rtx destptr, rtx srcptr,
27008 rtx value, rtx vec_value,
27009 rtx count, int size,
27010 rtx done_label, bool issetmem)
27012 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
27013 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
27014 rtx modesize;
27015 int n;
27017 /* If we do not have a vector value to copy, we must reduce the size. */
27018 if (issetmem)
27020 if (!vec_value)
27022 if (GET_MODE (value) == VOIDmode && size > 8)
27023 mode = Pmode;
27024 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
27025 mode = GET_MODE (value);
27027 else
27028 mode = GET_MODE (vec_value), value = vec_value;
27030 else
27032 /* Choose appropriate vector mode. */
27033 if (size >= 32)
27034 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
27035 else if (size >= 16)
27036 mode = TARGET_SSE ? V16QImode : DImode;
27037 srcmem = change_address (srcmem, mode, srcptr);
27039 destmem = change_address (destmem, mode, destptr);
27040 modesize = GEN_INT (GET_MODE_SIZE (mode));
27041 gcc_assert (GET_MODE_SIZE (mode) <= size);
27042 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27044 if (issetmem)
27045 emit_move_insn (destmem, gen_lowpart (mode, value));
27046 else
27048 emit_move_insn (destmem, srcmem);
27049 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27051 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27054 destmem = offset_address (destmem, count, 1);
27055 destmem = offset_address (destmem, GEN_INT (-2 * size),
27056 GET_MODE_SIZE (mode));
27057 if (!issetmem)
27059 srcmem = offset_address (srcmem, count, 1);
27060 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
27061 GET_MODE_SIZE (mode));
27063 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27065 if (issetmem)
27066 emit_move_insn (destmem, gen_lowpart (mode, value));
27067 else
27069 emit_move_insn (destmem, srcmem);
27070 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27072 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27074 emit_jump_insn (gen_jump (done_label));
27075 emit_barrier ();
27077 emit_label (label);
27078 LABEL_NUSES (label) = 1;
27081 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
27082 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
27083 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
27084 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
27085 DONE_LABEL is a label after the whole copying sequence. The label is created
27086 on demand if *DONE_LABEL is NULL.
27087 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
27088 bounds after the initial copies.
27090 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
27091 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
27092 we will dispatch to a library call for large blocks.
27094 In pseudocode we do:
27096 if (COUNT < SIZE)
27098 Assume that SIZE is 4. Bigger sizes are handled analogously
27099 if (COUNT & 4)
27101 copy 4 bytes from SRCPTR to DESTPTR
27102 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
27103 goto done_label
27105 if (!COUNT)
27106 goto done_label;
27107 copy 1 byte from SRCPTR to DESTPTR
27108 if (COUNT & 2)
27110 copy 2 bytes from SRCPTR to DESTPTR
27111 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
27114 else
27116 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
27117 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
27119 OLD_DESTPTR = DESTPTR;
27120 Align DESTPTR up to DESIRED_ALIGN
27121 SRCPTR += DESTPTR - OLD_DESTPTR
27122 COUNT -= DESTPTR - OLD_DESTPTR
27123 if (DYNAMIC_CHECK)
27124 Round COUNT down to multiple of SIZE
27125 << optional caller supplied zero size guard is here >>
27126 << optional caller supplied dynamic check is here >>
27127 << caller supplied main copy loop is here >>
27129 done_label:  */
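/* For example, with SIZE == 4 a 7-byte copy takes the (COUNT & 4) path and
is done as two overlapping 4-byte moves (bytes 0..3 and 3..6), while a
3-byte copy falls through to the single byte move plus the overlapping pair
of 2-byte moves (bytes 0..1 and 1..2).  */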
27131 static void
27132 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
27133 rtx *destptr, rtx *srcptr,
27134 machine_mode mode,
27135 rtx value, rtx vec_value,
27136 rtx *count,
27137 rtx_code_label **done_label,
27138 int size,
27139 int desired_align,
27140 int align,
27141 unsigned HOST_WIDE_INT *min_size,
27142 bool dynamic_check,
27143 bool issetmem)
27145 rtx_code_label *loop_label = NULL, *label;
27146 int n;
27147 rtx modesize;
27148 int prolog_size = 0;
27149 rtx mode_value;
27151 /* Choose the proper value to copy. */
27152 if (issetmem && VECTOR_MODE_P (mode))
27153 mode_value = vec_value;
27154 else
27155 mode_value = value;
27156 gcc_assert (GET_MODE_SIZE (mode) <= size);
27158 /* See if block is big or small, handle small blocks. */
27159 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
27161 int size2 = size;
27162 loop_label = gen_label_rtx ();
27164 if (!*done_label)
27165 *done_label = gen_label_rtx ();
27167 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
27168 1, loop_label);
27169 size2 >>= 1;
27171 /* Handle sizes > 3. */
27172 for (;size2 > 2; size2 >>= 1)
27173 expand_small_movmem_or_setmem (destmem, srcmem,
27174 *destptr, *srcptr,
27175 value, vec_value,
27176 *count,
27177 size2, *done_label, issetmem);
27178 /* Nothing to copy? Jump to DONE_LABEL if so. */
27179 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
27180 1, *done_label);
27182 /* Do a byte copy. */
27183 destmem = change_address (destmem, QImode, *destptr);
27184 if (issetmem)
27185 emit_move_insn (destmem, gen_lowpart (QImode, value));
27186 else
27188 srcmem = change_address (srcmem, QImode, *srcptr);
27189 emit_move_insn (destmem, srcmem);
27192 /* Handle sizes 2 and 3. */
27193 label = ix86_expand_aligntest (*count, 2, false);
27194 destmem = change_address (destmem, HImode, *destptr);
27195 destmem = offset_address (destmem, *count, 1);
27196 destmem = offset_address (destmem, GEN_INT (-2), 2);
27197 if (issetmem)
27198 emit_move_insn (destmem, gen_lowpart (HImode, value));
27199 else
27201 srcmem = change_address (srcmem, HImode, *srcptr);
27202 srcmem = offset_address (srcmem, *count, 1);
27203 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27204 emit_move_insn (destmem, srcmem);
27207 emit_label (label);
27208 LABEL_NUSES (label) = 1;
27209 emit_jump_insn (gen_jump (*done_label));
27210 emit_barrier ();
27212 else
27213 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27214 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27216 /* Start memcpy for COUNT >= SIZE. */
27217 if (loop_label)
27219 emit_label (loop_label);
27220 LABEL_NUSES (loop_label) = 1;
27223 /* Copy first desired_align bytes. */
27224 if (!issetmem)
27225 srcmem = change_address (srcmem, mode, *srcptr);
27226 destmem = change_address (destmem, mode, *destptr);
27227 modesize = GEN_INT (GET_MODE_SIZE (mode));
27228 for (n = 0; prolog_size < desired_align - align; n++)
27230 if (issetmem)
27231 emit_move_insn (destmem, mode_value);
27232 else
27234 emit_move_insn (destmem, srcmem);
27235 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27237 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27238 prolog_size += GET_MODE_SIZE (mode);
27242 /* Copy last SIZE bytes. */
27243 destmem = offset_address (destmem, *count, 1);
27244 destmem = offset_address (destmem,
27245 GEN_INT (-size - prolog_size),
27247 if (issetmem)
27248 emit_move_insn (destmem, mode_value);
27249 else
27251 srcmem = offset_address (srcmem, *count, 1);
27252 srcmem = offset_address (srcmem,
27253 GEN_INT (-size - prolog_size),
27255 emit_move_insn (destmem, srcmem);
27257 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27259 destmem = offset_address (destmem, modesize, 1);
27260 if (issetmem)
27261 emit_move_insn (destmem, mode_value);
27262 else
27264 srcmem = offset_address (srcmem, modesize, 1);
27265 emit_move_insn (destmem, srcmem);
27269 /* Align destination. */
27270 if (desired_align > 1 && desired_align > align)
27272 rtx saveddest = *destptr;
27274 gcc_assert (desired_align <= size);
27275 /* Align destptr up, place it in a new register. */
27276 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27277 GEN_INT (prolog_size),
27278 NULL_RTX, 1, OPTAB_DIRECT);
27279 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27280 REG_POINTER (*destptr) = 1;
27281 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27282 GEN_INT (-desired_align),
27283 *destptr, 1, OPTAB_DIRECT);
27284 /* See how many bytes we skipped. */
27285 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27286 *destptr,
27287 saveddest, 1, OPTAB_DIRECT);
27288 /* Adjust srcptr and count. */
27289 if (!issetmem)
27290 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27291 saveddest, *srcptr, 1, OPTAB_DIRECT);
27292 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27293 saveddest, *count, 1, OPTAB_DIRECT);
27294 /* We copied at most size + prolog_size. */
27295 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27296 *min_size
27297 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27298 else
27299 *min_size = 0;
27301 /* Our loops always round down the block size, but for dispatch to
27302 the library we need the precise value. */
27303 if (dynamic_check)
27304 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
27305 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27307 else
27309 gcc_assert (prolog_size == 0);
27310 /* Decrease count, so we won't end up copying the last word twice. */
27311 if (!CONST_INT_P (*count))
27312 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27313 constm1_rtx, *count, 1, OPTAB_DIRECT);
27314 else
27315 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27316 (unsigned HOST_WIDE_INT)size));
27317 if (*min_size)
27318 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
27323 /* This function is like the previous one, except here we know how many bytes
27324 need to be copied. That allows us to update alignment not only of DST, which
27325 is returned, but also of SRC, which is passed as a pointer for that
27326 reason. */
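/* For example, if ALIGN_BYTES is 7 the loop below emits a 1-byte, a 2-byte
and a 4-byte copy (or store), one for each bit set in ALIGN_BYTES, and then
updates the recorded alignment and size of DST (and of SRC when copying).  */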
27327 static rtx
27328 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27329 rtx srcreg, rtx value, rtx vec_value,
27330 int desired_align, int align_bytes,
27331 bool issetmem)
27333 rtx src = NULL;
27334 rtx orig_dst = dst;
27335 rtx orig_src = NULL;
27336 int piece_size = 1;
27337 int copied_bytes = 0;
27339 if (!issetmem)
27341 gcc_assert (srcp != NULL);
27342 src = *srcp;
27343 orig_src = src;
27346 for (piece_size = 1;
27347 piece_size <= desired_align && copied_bytes < align_bytes;
27348 piece_size <<= 1)
27350 if (align_bytes & piece_size)
27352 if (issetmem)
27354 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27355 dst = emit_memset (dst, destreg, vec_value, piece_size);
27356 else
27357 dst = emit_memset (dst, destreg, value, piece_size);
27359 else
27360 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27361 copied_bytes += piece_size;
27364 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27365 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27366 if (MEM_SIZE_KNOWN_P (orig_dst))
27367 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27369 if (!issetmem)
27371 int src_align_bytes = get_mem_align_offset (src, desired_align
27372 * BITS_PER_UNIT);
27373 if (src_align_bytes >= 0)
27374 src_align_bytes = desired_align - src_align_bytes;
27375 if (src_align_bytes >= 0)
27377 unsigned int src_align;
27378 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
27380 if ((src_align_bytes & (src_align - 1))
27381 == (align_bytes & (src_align - 1)))
27382 break;
27384 if (src_align > (unsigned int) desired_align)
27385 src_align = desired_align;
27386 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
27387 set_mem_align (src, src_align * BITS_PER_UNIT);
27389 if (MEM_SIZE_KNOWN_P (orig_src))
27390 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
27391 *srcp = src;
27394 return dst;
27397 /* Return true if ALG can be used in current context.
27398 Assume we expand memset if MEMSET is true. */
27399 static bool
27400 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
27402 if (alg == no_stringop)
27403 return false;
27404 if (alg == vector_loop)
27405 return TARGET_SSE || TARGET_AVX;
27406 /* Algorithms using the rep prefix want at least edi and ecx;
27407 additionally, memset wants eax and memcpy wants esi. Don't
27408 consider such algorithms if the user has appropriated those
27409 registers for their own purposes, or if we have a non-default
27410 address space, since some string insns cannot override the segment. */
27411 if (alg == rep_prefix_1_byte
27412 || alg == rep_prefix_4_byte
27413 || alg == rep_prefix_8_byte)
27415 if (have_as)
27416 return false;
27417 if (fixed_regs[CX_REG]
27418 || fixed_regs[DI_REG]
27419 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
27420 return false;
27422 return true;
27425 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
27426 static enum stringop_alg
27427 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
27428 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
27429 bool memset, bool zero_memset, bool have_as,
27430 int *dynamic_check, bool *noalign, bool recur)
27432 const struct stringop_algs *algs;
27433 bool optimize_for_speed;
27434 int max = 0;
27435 const struct processor_costs *cost;
27436 int i;
27437 bool any_alg_usable_p = false;
27439 *noalign = false;
27440 *dynamic_check = -1;
27442 /* Even if the string operation call is cold, we still might spend a lot
27443 of time processing large blocks. */
27444 if (optimize_function_for_size_p (cfun)
27445 || (optimize_insn_for_size_p ()
27446 && (max_size < 256
27447 || (expected_size != -1 && expected_size < 256))))
27448 optimize_for_speed = false;
27449 else
27450 optimize_for_speed = true;
27452 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
27453 if (memset)
27454 algs = &cost->memset[TARGET_64BIT != 0];
27455 else
27456 algs = &cost->memcpy[TARGET_64BIT != 0];
27458 /* See maximal size for user defined algorithm. */
27459 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27461 enum stringop_alg candidate = algs->size[i].alg;
27462 bool usable = alg_usable_p (candidate, memset, have_as);
27463 any_alg_usable_p |= usable;
27465 if (candidate != libcall && candidate && usable)
27466 max = algs->size[i].max;
27469 /* If the expected size is not known but the max size is small enough
27470 so that the inline version is a win, set the expected size into
27471 the range. */
27472 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
27473 && expected_size == -1)
27474 expected_size = min_size / 2 + max_size / 2;
27476 /* If user specified the algorithm, honor it if possible. */
27477 if (ix86_stringop_alg != no_stringop
27478 && alg_usable_p (ix86_stringop_alg, memset, have_as))
27479 return ix86_stringop_alg;
27480 /* rep; movq or rep; movl is the smallest variant. */
27481 else if (!optimize_for_speed)
27483 *noalign = true;
27484 if (!count || (count & 3) || (memset && !zero_memset))
27485 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27486 ? rep_prefix_1_byte : loop_1_byte;
27487 else
27488 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27489 ? rep_prefix_4_byte : loop;
27491 /* Very tiny blocks are best handled via the loop; REP is expensive to
27492 set up. */
27493 else if (expected_size != -1 && expected_size < 4)
27494 return loop_1_byte;
27495 else if (expected_size != -1)
27497 enum stringop_alg alg = libcall;
27498 bool alg_noalign = false;
27499 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27501 /* We get here if the algorithms that were not libcall-based
27502 were rep-prefix based and we are unable to use rep prefixes
27503 based on global register usage. Break out of the loop and
27504 use the heuristic below. */
27505 if (algs->size[i].max == 0)
27506 break;
27507 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27509 enum stringop_alg candidate = algs->size[i].alg;
27511 if (candidate != libcall
27512 && alg_usable_p (candidate, memset, have_as))
27514 alg = candidate;
27515 alg_noalign = algs->size[i].noalign;
27517 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
27518 the last non-libcall inline algorithm. */
27519 if (TARGET_INLINE_ALL_STRINGOPS)
27521 /* When the current size is best copied by a libcall,
27522 but we are still forced to inline, run the heuristic below
27523 that will pick code for medium sized blocks. */
27524 if (alg != libcall)
27526 *noalign = alg_noalign;
27527 return alg;
27529 else if (!any_alg_usable_p)
27530 break;
27532 else if (alg_usable_p (candidate, memset, have_as))
27534 *noalign = algs->size[i].noalign;
27535 return candidate;
27540 /* When asked to inline the call anyway, try to pick a meaningful choice.
27541 We look for the maximal size of a block that is faster to copy by hand and
27542 take blocks of at most that size, guessing that the average size will
27543 be roughly half of the block.
27545 If this turns out to be bad, we might simply specify the preferred
27546 choice in ix86_costs. */
27547 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27548 && (algs->unknown_size == libcall
27549 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27551 enum stringop_alg alg;
27552 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27554 /* If there aren't any usable algorithms or if recursing already,
27555 then recursing on smaller sizes or same size isn't going to
27556 find anything. Just return the simple byte-at-a-time copy loop. */
27557 if (!any_alg_usable_p || recur)
27559 /* Pick something reasonable. */
27560 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27561 *dynamic_check = 128;
27562 return loop_1_byte;
27564 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27565 zero_memset, have_as, dynamic_check, noalign, true);
27566 gcc_assert (*dynamic_check == -1);
27567 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27568 *dynamic_check = max;
27569 else
27570 gcc_assert (alg != libcall);
27571 return alg;
27573 return (alg_usable_p (algs->unknown_size, memset, have_as)
27574 ? algs->unknown_size : libcall);
27577 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27578 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
27579 static int
27580 decide_alignment (int align,
27581 enum stringop_alg alg,
27582 int expected_size,
27583 machine_mode move_mode)
27585 int desired_align = 0;
27587 gcc_assert (alg != no_stringop);
27589 if (alg == libcall)
27590 return 0;
27591 if (move_mode == VOIDmode)
27592 return 0;
27594 desired_align = GET_MODE_SIZE (move_mode);
27595 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
27596 copying a whole cacheline at once. */
27597 if (TARGET_PENTIUMPRO
27598 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27599 desired_align = 8;
27601 if (optimize_size)
27602 desired_align = 1;
27603 if (desired_align < align)
27604 desired_align = align;
27605 if (expected_size != -1 && expected_size < 4)
27606 desired_align = align;
27608 return desired_align;
27612 /* Helper function for memset. For a QImode value 0xXY produce
27613 0xXYXYXYXY of the width specified by MODE. This is essentially
27614 a * 0x01010101, but we can do slightly better than
27615 synth_mult by unwinding the sequence by hand on CPUs with
27616 slow multiply. */
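/* For example, 0xAB * 0x01010101 == 0xABABABAB; the shift/IOR sequence
emitted below computes the same thing as v |= v << 8; v |= v << 16;
(plus v |= v << 32 for DImode) when the multiply would be slower.  */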
27617 static rtx
27618 promote_duplicated_reg (machine_mode mode, rtx val)
27620 machine_mode valmode = GET_MODE (val);
27621 rtx tmp;
27622 int nops = mode == DImode ? 3 : 2;
27624 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27625 if (val == const0_rtx)
27626 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27627 if (CONST_INT_P (val))
27629 HOST_WIDE_INT v = INTVAL (val) & 255;
27631 v |= v << 8;
27632 v |= v << 16;
27633 if (mode == DImode)
27634 v |= (v << 16) << 16;
27635 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27638 if (valmode == VOIDmode)
27639 valmode = QImode;
27640 if (valmode != QImode)
27641 val = gen_lowpart (QImode, val);
27642 if (mode == QImode)
27643 return val;
27644 if (!TARGET_PARTIAL_REG_STALL)
27645 nops--;
27646 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27647 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27648 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27649 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27651 rtx reg = convert_modes (mode, QImode, val, true);
27652 tmp = promote_duplicated_reg (mode, const1_rtx);
27653 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27654 OPTAB_DIRECT);
27656 else
27658 rtx reg = convert_modes (mode, QImode, val, true);
27660 if (!TARGET_PARTIAL_REG_STALL)
27661 if (mode == SImode)
27662 emit_insn (gen_insvsi_1 (reg, reg));
27663 else
27664 emit_insn (gen_insvdi_1 (reg, reg));
27665 else
27667 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27668 NULL, 1, OPTAB_DIRECT);
27669 reg =
27670 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27672 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27673 NULL, 1, OPTAB_DIRECT);
27674 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27675 if (mode == SImode)
27676 return reg;
27677 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27678 NULL, 1, OPTAB_DIRECT);
27679 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27680 return reg;
27684 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that will
27685 be needed by the main loop copying SIZE_NEEDED chunks and by the prologue getting
27686 alignment from ALIGN to DESIRED_ALIGN. */
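/* For example, a 64-bit memset whose main loop stores 8-byte chunks
(SIZE_NEEDED == 8) gets VAL promoted to DImode, while SIZE_NEEDED == 4 with
no extra alignment requirement only needs an SImode promotion.  */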
27687 static rtx
27688 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27689 int align)
27691 rtx promoted_val;
27693 if (TARGET_64BIT
27694 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27695 promoted_val = promote_duplicated_reg (DImode, val);
27696 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27697 promoted_val = promote_duplicated_reg (SImode, val);
27698 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27699 promoted_val = promote_duplicated_reg (HImode, val);
27700 else
27701 promoted_val = val;
27703 return promoted_val;
27706 /* Expand a string move (memcpy) or store (memset) operation. Use i386 string
27707 operations when profitable. The code depends upon architecture, block size
27708 and alignment, but always has one of the following overall structures:
27710 Aligned move sequence:
27712 1) Prologue guard: Conditional that jumps up to epilogues for small
27713 blocks that can be handled by epilogue alone. This is faster
27714 but also needed for correctness, since the prologue assumes the block
27715 is larger than the desired alignment.
27717 Optional dynamic check for size and libcall for large
27718 blocks is emitted here too, with -minline-stringops-dynamically.
27720 2) Prologue: copy first few bytes in order to get destination
27721 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27722 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27723 copied. We emit either a jump tree on power of two sized
27724 blocks, or a byte loop.
27726 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27727 with specified algorithm.
27729 4) Epilogue: code copying tail of the block that is too small to be
27730 handled by main body (or up to size guarded by prologue guard).
27732 Misaligned move sequence
27734 1) misaligned move prologue/epilogue containing:
27735 a) Prologue handling small memory blocks and jumping to done_label
27736 (skipped if blocks are known to be large enough)
27737 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
27738 needed by single possibly misaligned move
27739 (skipped if alignment is not needed)
27740 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27742 2) Zero size guard dispatching to done_label, if needed
27744 3) dispatch to library call, if needed,
27746 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27747 with specified algorithm. */
27748 bool
27749 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27750 rtx align_exp, rtx expected_align_exp,
27751 rtx expected_size_exp, rtx min_size_exp,
27752 rtx max_size_exp, rtx probable_max_size_exp,
27753 bool issetmem)
27755 rtx destreg;
27756 rtx srcreg = NULL;
27757 rtx_code_label *label = NULL;
27758 rtx tmp;
27759 rtx_code_label *jump_around_label = NULL;
27760 HOST_WIDE_INT align = 1;
27761 unsigned HOST_WIDE_INT count = 0;
27762 HOST_WIDE_INT expected_size = -1;
27763 int size_needed = 0, epilogue_size_needed;
27764 int desired_align = 0, align_bytes = 0;
27765 enum stringop_alg alg;
27766 rtx promoted_val = NULL;
27767 rtx vec_promoted_val = NULL;
27768 bool force_loopy_epilogue = false;
27769 int dynamic_check;
27770 bool need_zero_guard = false;
27771 bool noalign;
27772 machine_mode move_mode = VOIDmode;
27773 machine_mode wider_mode;
27774 int unroll_factor = 1;
27775 /* TODO: Once value ranges are available, fill in proper data. */
27776 unsigned HOST_WIDE_INT min_size = 0;
27777 unsigned HOST_WIDE_INT max_size = -1;
27778 unsigned HOST_WIDE_INT probable_max_size = -1;
27779 bool misaligned_prologue_used = false;
27780 bool have_as;
27782 if (CONST_INT_P (align_exp))
27783 align = INTVAL (align_exp);
27784 /* i386 can do misaligned access at a reasonably increased cost. */
27785 if (CONST_INT_P (expected_align_exp)
27786 && INTVAL (expected_align_exp) > align)
27787 align = INTVAL (expected_align_exp);
27788 /* ALIGN is the minimum of destination and source alignment, but we care here
27789 just about destination alignment. */
27790 else if (!issetmem
27791 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27792 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27794 if (CONST_INT_P (count_exp))
27796 min_size = max_size = probable_max_size = count = expected_size
27797 = INTVAL (count_exp);
27798 /* When COUNT is 0, there is nothing to do. */
27799 if (!count)
27800 return true;
27802 else
27804 if (min_size_exp)
27805 min_size = INTVAL (min_size_exp);
27806 if (max_size_exp)
27807 max_size = INTVAL (max_size_exp);
27808 if (probable_max_size_exp)
27809 probable_max_size = INTVAL (probable_max_size_exp);
27810 if (CONST_INT_P (expected_size_exp))
27811 expected_size = INTVAL (expected_size_exp);
27814 /* Make sure we don't need to care about overflow later on. */
27815 if (count > (HOST_WIDE_INT_1U << 30))
27816 return false;
27818 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27819 if (!issetmem)
27820 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27822 /* Step 0: Decide on preferred algorithm, desired alignment and
27823 size of chunks to be copied by main loop. */
27824 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27825 issetmem,
27826 issetmem && val_exp == const0_rtx, have_as,
27827 &dynamic_check, &noalign, false);
27828 if (alg == libcall)
27829 return false;
27830 gcc_assert (alg != no_stringop);
27832 /* For now the vector version of memset is generated only for memory zeroing, as
27833 creating the promoted vector value is very cheap in this case. */
27834 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27835 alg = unrolled_loop;
27837 if (!count)
27838 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27839 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27840 if (!issetmem)
27841 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27843 unroll_factor = 1;
27844 move_mode = word_mode;
27845 switch (alg)
27847 case libcall:
27848 case no_stringop:
27849 case last_alg:
27850 gcc_unreachable ();
27851 case loop_1_byte:
27852 need_zero_guard = true;
27853 move_mode = QImode;
27854 break;
27855 case loop:
27856 need_zero_guard = true;
27857 break;
27858 case unrolled_loop:
27859 need_zero_guard = true;
27860 unroll_factor = (TARGET_64BIT ? 4 : 2);
27861 break;
27862 case vector_loop:
27863 need_zero_guard = true;
27864 unroll_factor = 4;
27865 /* Find the widest supported mode. */
27866 move_mode = word_mode;
27867 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27868 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27869 move_mode = wider_mode;
27871 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27872 move_mode = TImode;
27874 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27875 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27876 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27878 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27879 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27880 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27881 move_mode = word_mode;
27883 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27884 break;
27885 case rep_prefix_8_byte:
27886 move_mode = DImode;
27887 break;
27888 case rep_prefix_4_byte:
27889 move_mode = SImode;
27890 break;
27891 case rep_prefix_1_byte:
27892 move_mode = QImode;
27893 break;
27895 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27896 epilogue_size_needed = size_needed;
27898 /* If we are going to emit any library calls conditionally, make sure any
27899 pending stack adjustments happen before the first conditional branch,
27900 otherwise they will be emitted before the library call only and won't
27901 happen from the other branches. */
27902 if (dynamic_check != -1)
27903 do_pending_stack_adjust ();
27905 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27906 if (!TARGET_ALIGN_STRINGOPS || noalign)
27907 align = desired_align;
27909 /* Step 1: Prologue guard. */
27911 /* Alignment code needs count to be in register. */
27912 if (CONST_INT_P (count_exp) && desired_align > align)
27914 if (INTVAL (count_exp) > desired_align
27915 && INTVAL (count_exp) > size_needed)
27917 align_bytes
27918 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27919 if (align_bytes <= 0)
27920 align_bytes = 0;
27921 else
27922 align_bytes = desired_align - align_bytes;
27924 if (align_bytes == 0)
27925 count_exp = force_reg (counter_mode (count_exp), count_exp);
27927 gcc_assert (desired_align >= 1 && align >= 1);
27929 /* Misaligned move sequences handle both prologue and epilogue at once.
27930 Default code generation results in smaller code for large alignments
27931 and also avoids redundant work when sizes are known precisely. */
27932 misaligned_prologue_used
27933 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27934 && MAX (desired_align, epilogue_size_needed) <= 32
27935 && desired_align <= epilogue_size_needed
27936 && ((desired_align > align && !align_bytes)
27937 || (!count && epilogue_size_needed > 1)));
27939 /* Do the cheap promotion to allow better CSE across the
27940 main loop and epilogue (i.e. one load of the big constant in
27941 front of all code).
27942 For now the misaligned move sequences do not have a fast path
27943 without broadcasting. */
27944 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27946 if (alg == vector_loop)
27948 gcc_assert (val_exp == const0_rtx);
27949 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27950 promoted_val = promote_duplicated_reg_to_size (val_exp,
27951 GET_MODE_SIZE (word_mode),
27952 desired_align, align);
27954 else
27956 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27957 desired_align, align);
27960 /* Misaligned move sequences handle both prologues and epilogues at once.
27961 Default code generation results in smaller code for large alignments and
27962 also avoids redundant work when sizes are known precisely. */
27963 if (misaligned_prologue_used)
27965 /* The misaligned move prologue has already handled small blocks by itself. */
27966 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27967 (dst, src, &destreg, &srcreg,
27968 move_mode, promoted_val, vec_promoted_val,
27969 &count_exp,
27970 &jump_around_label,
27971 desired_align < align
27972 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27973 desired_align, align, &min_size, dynamic_check, issetmem);
27974 if (!issetmem)
27975 src = change_address (src, BLKmode, srcreg);
27976 dst = change_address (dst, BLKmode, destreg);
27977 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27978 epilogue_size_needed = 0;
27979 if (need_zero_guard
27980 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27982 /* It is possible that we copied enough so the main loop will not
27983 execute. */
27984 gcc_assert (size_needed > 1);
27985 if (jump_around_label == NULL_RTX)
27986 jump_around_label = gen_label_rtx ();
27987 emit_cmp_and_jump_insns (count_exp,
27988 GEN_INT (size_needed),
27989 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27990 if (expected_size == -1
27991 || expected_size < (desired_align - align) / 2 + size_needed)
27992 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27993 else
27994 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27997 /* Ensure that alignment prologue won't copy past end of block. */
27998 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
28000 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
28001 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
28002 Make sure it is power of 2. */
28003 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
28005 /* To improve performance of small blocks, we jump around the VAL
28006 promoting mode. This means that if the promoted VAL is not constant,
28007 we might not use it in the epilogue and have to use the byte
28008 loop variant. */
28009 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
28010 force_loopy_epilogue = true;
28011 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28012 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28014 /* If main algorithm works on QImode, no epilogue is needed.
28015 For small sizes just don't align anything. */
28016 if (size_needed == 1)
28017 desired_align = align;
28018 else
28019 goto epilogue;
28021 else if (!count
28022 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28024 label = gen_label_rtx ();
28025 emit_cmp_and_jump_insns (count_exp,
28026 GEN_INT (epilogue_size_needed),
28027 LTU, 0, counter_mode (count_exp), 1, label);
28028 if (expected_size == -1 || expected_size < epilogue_size_needed)
28029 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28030 else
28031 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28035 /* Emit code to decide at runtime whether a library call or inline code should be
28036 used. */
28037 if (dynamic_check != -1)
28039 if (!issetmem && CONST_INT_P (count_exp))
28041 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
28043 emit_block_copy_via_libcall (dst, src, count_exp);
28044 count_exp = const0_rtx;
28045 goto epilogue;
28048 else
28050 rtx_code_label *hot_label = gen_label_rtx ();
28051 if (jump_around_label == NULL_RTX)
28052 jump_around_label = gen_label_rtx ();
28053 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
28054 LEU, 0, counter_mode (count_exp),
28055 1, hot_label);
28056 predict_jump (REG_BR_PROB_BASE * 90 / 100);
28057 if (issetmem)
28058 set_storage_via_libcall (dst, count_exp, val_exp);
28059 else
28060 emit_block_copy_via_libcall (dst, src, count_exp);
28061 emit_jump (jump_around_label);
28062 emit_label (hot_label);
28066 /* Step 2: Alignment prologue. */
28067 /* Do the expensive promotion once we branched off the small blocks. */
28068 if (issetmem && !promoted_val)
28069 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28070 desired_align, align);
28072 if (desired_align > align && !misaligned_prologue_used)
28074 if (align_bytes == 0)
28076 /* Except for the first move in the prologue, we no longer know
28077 the constant offset in the aliasing info. It does not seem worth
28078 the pain to maintain it for the first move, so throw away
28079 the info early. */
28080 dst = change_address (dst, BLKmode, destreg);
28081 if (!issetmem)
28082 src = change_address (src, BLKmode, srcreg);
28083 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
28084 promoted_val, vec_promoted_val,
28085 count_exp, align, desired_align,
28086 issetmem);
28087 /* At most desired_align - align bytes are copied. */
28088 if (min_size < (unsigned)(desired_align - align))
28089 min_size = 0;
28090 else
28091 min_size -= desired_align - align;
28093 else
28095 /* If we know how many bytes need to be stored before dst is
28096 sufficiently aligned, maintain aliasing info accurately. */
28097 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
28098 srcreg,
28099 promoted_val,
28100 vec_promoted_val,
28101 desired_align,
28102 align_bytes,
28103 issetmem);
28105 count_exp = plus_constant (counter_mode (count_exp),
28106 count_exp, -align_bytes);
28107 count -= align_bytes;
28108 min_size -= align_bytes;
28109 max_size -= align_bytes;
28111 if (need_zero_guard
28112 && min_size < (unsigned HOST_WIDE_INT) size_needed
28113 && (count < (unsigned HOST_WIDE_INT) size_needed
28114 || (align_bytes == 0
28115 && count < ((unsigned HOST_WIDE_INT) size_needed
28116 + desired_align - align))))
28118 /* It is possible that we copied enough so the main loop will not
28119 execute. */
28120 gcc_assert (size_needed > 1);
28121 if (label == NULL_RTX)
28122 label = gen_label_rtx ();
28123 emit_cmp_and_jump_insns (count_exp,
28124 GEN_INT (size_needed),
28125 LTU, 0, counter_mode (count_exp), 1, label);
28126 if (expected_size == -1
28127 || expected_size < (desired_align - align) / 2 + size_needed)
28128 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28129 else
28130 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28133 if (label && size_needed == 1)
28135 emit_label (label);
28136 LABEL_NUSES (label) = 1;
28137 label = NULL;
28138 epilogue_size_needed = 1;
28139 if (issetmem)
28140 promoted_val = val_exp;
28142 else if (label == NULL_RTX && !misaligned_prologue_used)
28143 epilogue_size_needed = size_needed;
28145 /* Step 3: Main loop. */
28147 switch (alg)
28149 case libcall:
28150 case no_stringop:
28151 case last_alg:
28152 gcc_unreachable ();
28153 case loop_1_byte:
28154 case loop:
28155 case unrolled_loop:
28156 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
28157 count_exp, move_mode, unroll_factor,
28158 expected_size, issetmem);
28159 break;
28160 case vector_loop:
28161 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
28162 vec_promoted_val, count_exp, move_mode,
28163 unroll_factor, expected_size, issetmem);
28164 break;
28165 case rep_prefix_8_byte:
28166 case rep_prefix_4_byte:
28167 case rep_prefix_1_byte:
28168 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
28169 val_exp, count_exp, move_mode, issetmem);
28170 break;
28172 /* Properly adjust the offset of src and dest memory for aliasing. */
28173 if (CONST_INT_P (count_exp))
28175 if (!issetmem)
28176 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
28177 (count / size_needed) * size_needed);
28178 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
28179 (count / size_needed) * size_needed);
28181 else
28183 if (!issetmem)
28184 src = change_address (src, BLKmode, srcreg);
28185 dst = change_address (dst, BLKmode, destreg);
28188 /* Step 4: Epilogue to copy the remaining bytes. */
28189 epilogue:
28190 if (label)
28192 /* When the main loop is done, COUNT_EXP might hold the original count,
28193 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
28194 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
28195 bytes. Compensate if needed. */
28197 if (size_needed < epilogue_size_needed)
28199 tmp =
28200 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28201 GEN_INT (size_needed - 1), count_exp, 1,
28202 OPTAB_DIRECT);
28203 if (tmp != count_exp)
28204 emit_move_insn (count_exp, tmp);
28206 emit_label (label);
28207 LABEL_NUSES (label) = 1;
28210 if (count_exp != const0_rtx && epilogue_size_needed > 1)
28212 if (force_loopy_epilogue)
28213 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28214 epilogue_size_needed);
28215 else
28217 if (issetmem)
28218 expand_setmem_epilogue (dst, destreg, promoted_val,
28219 vec_promoted_val, count_exp,
28220 epilogue_size_needed);
28221 else
28222 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28223 epilogue_size_needed);
28226 if (jump_around_label)
28227 emit_label (jump_around_label);
28228 return true;
28232 /* Expand the appropriate insns for doing strlen if not just doing
28233 repnz; scasb
28235 out = result, initialized with the start address
28236 align_rtx = alignment of the address.
28237 scratch = scratch register, initialized with the start address when
28238 not aligned, otherwise undefined
28240 This is just the body. It needs the initializations mentioned above and
28241 some address computing at the end. These things are done in i386.md. */
28243 static void
28244 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
28246 int align;
28247 rtx tmp;
28248 rtx_code_label *align_2_label = NULL;
28249 rtx_code_label *align_3_label = NULL;
28250 rtx_code_label *align_4_label = gen_label_rtx ();
28251 rtx_code_label *end_0_label = gen_label_rtx ();
28252 rtx mem;
28253 rtx tmpreg = gen_reg_rtx (SImode);
28254 rtx scratch = gen_reg_rtx (SImode);
28255 rtx cmp;
28257 align = 0;
28258 if (CONST_INT_P (align_rtx))
28259 align = INTVAL (align_rtx);
28261 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
28263 /* Is there a known alignment and is it less than 4? */
28264 if (align < 4)
28266 rtx scratch1 = gen_reg_rtx (Pmode);
28267 emit_move_insn (scratch1, out);
28268 /* Is there a known alignment and is it not 2? */
28269 if (align != 2)
28271 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28272 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28274 /* Leave just the 3 lower bits. */
28275 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28276 NULL_RTX, 0, OPTAB_WIDEN);
28278 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28279 Pmode, 1, align_4_label);
28280 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28281 Pmode, 1, align_2_label);
28282 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28283 Pmode, 1, align_3_label);
28285 else
28287 /* Since the alignment is 2, we have to check 2 or 0 bytes;
28288 check if it is aligned to a 4-byte boundary. */
28290 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28291 NULL_RTX, 0, OPTAB_WIDEN);
28293 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28294 Pmode, 1, align_4_label);
28297 mem = change_address (src, QImode, out);
28299 /* Now compare the bytes. */
28301 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
28302 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28303 QImode, 1, end_0_label);
28305 /* Increment the address. */
28306 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28308 /* Not needed with an alignment of 2. */
28309 if (align != 2)
28311 emit_label (align_2_label);
28313 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28314 end_0_label);
28316 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28318 emit_label (align_3_label);
28321 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28322 end_0_label);
28324 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28327 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
28328 align this loop; it only makes the program larger and does not help
28329 to speed it up. */
28330 emit_label (align_4_label);
28332 mem = change_address (src, SImode, out);
28333 emit_move_insn (scratch, mem);
28334 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28336 /* This formula yields a nonzero result iff one of the bytes is zero.
28337 This saves three branches inside the loop and many cycles. */
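/* In C terms the instructions below compute
(v - 0x01010101) & ~v & 0x80808080
for the word V just loaded; e.g. for V == 0x41420043 (zero in byte 1) the
result is 0x00008000, while for V == 0x41424344 it is 0.  */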
28339 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28340 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28341 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28342 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28343 gen_int_mode (0x80808080, SImode)));
28344 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28345 align_4_label);
28347 if (TARGET_CMOVE)
28349 rtx reg = gen_reg_rtx (SImode);
28350 rtx reg2 = gen_reg_rtx (Pmode);
28351 emit_move_insn (reg, tmpreg);
28352 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28354 /* If zero is not in the first two bytes, move two bytes forward. */
28355 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28356 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28357 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28358 emit_insn (gen_rtx_SET (tmpreg,
28359 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28360 reg,
28361 tmpreg)));
28362 /* Emit lea manually to avoid clobbering of flags. */
28363 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28365 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28366 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28367 emit_insn (gen_rtx_SET (out,
28368 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28369 reg2,
28370 out)));
28372 else
28374 rtx_code_label *end_2_label = gen_label_rtx ();
28375 /* Is zero in the first two bytes? */
28377 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28378 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28379 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
28380 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28381 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
28382 pc_rtx);
28383 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
28384 JUMP_LABEL (tmp) = end_2_label;
28386 /* Not in the first two. Move two bytes forward. */
28387 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
28388 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
28390 emit_label (end_2_label);
28394 /* Avoid branch in fixing the byte. */
28395 tmpreg = gen_lowpart (QImode, tmpreg);
28396 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
28397 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
28398 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
28399 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
28401 emit_label (end_0_label);
28404 /* Expand strlen. */
28406 bool
28407 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
28409 rtx addr, scratch1, scratch2, scratch3, scratch4;
28411 /* The generic case of the strlen expander is long. Avoid expanding it
28412 unless TARGET_INLINE_ALL_STRINGOPS. */
28414 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28415 && !TARGET_INLINE_ALL_STRINGOPS
28416 && !optimize_insn_for_size_p ()
28417 && (!CONST_INT_P (align) || INTVAL (align) < 4))
28418 return false;
28420 addr = force_reg (Pmode, XEXP (src, 0));
28421 scratch1 = gen_reg_rtx (Pmode);
28423 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28424 && !optimize_insn_for_size_p ())
28426 /* Well it seems that some optimizer does not combine a call like
28427 foo(strlen(bar), strlen(bar));
28428 when the move and the subtraction are done here. It does calculate
28429 the length just once when these instructions are done inside of
28430 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
28431 often used and I use one fewer register for the lifetime of
28432 output_strlen_unroll() this is better. */
28434 emit_move_insn (out, addr);
28436 ix86_expand_strlensi_unroll_1 (out, src, align);
28438 /* strlensi_unroll_1 returns the address of the zero at the end of
28439 the string, like memchr(), so compute the length by subtracting
28440 the start address. */
28441 emit_insn (ix86_gen_sub3 (out, out, addr));
28443 else
28445 rtx unspec;
28447 /* Can't use this if the user has appropriated eax, ecx, or edi. */
28448 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
28449 return false;
28450 /* Can't use this for non-default address spaces. */
28451 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
28452 return false;
28454 scratch2 = gen_reg_rtx (Pmode);
28455 scratch3 = gen_reg_rtx (Pmode);
28456 scratch4 = force_reg (Pmode, constm1_rtx);
28458 emit_move_insn (scratch3, addr);
28459 eoschar = force_reg (QImode, eoschar);
28461 src = replace_equiv_address_nv (src, scratch3);
28463 /* If .md starts supporting :P, this can be done in .md. */
28464 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
28465 scratch4), UNSPEC_SCAS);
28466 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
28467 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
28468 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
28470 return true;
28473 /* For a given symbol (function) construct code to compute the address of its PLT
28474 entry in the large x86-64 PIC model. */
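/* Roughly, for a call to foo this loads the constant foo@PLTOFF (a movabs
in the large model) into a fresh register and adds the PIC/GOT base
register, so the caller can then do an indirect call through that
register.  */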
28475 static rtx
28476 construct_plt_address (rtx symbol)
28478 rtx tmp, unspec;
28480 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
28481 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
28482 gcc_assert (Pmode == DImode);
28484 tmp = gen_reg_rtx (Pmode);
28485 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28487 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28488 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28489 return tmp;
28493 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28494 rtx callarg2,
28495 rtx pop, bool sibcall)
28497 rtx vec[3];
28498 rtx use = NULL, call;
28499 unsigned int vec_len = 0;
28500 tree fndecl;
28502 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28504 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28505 if (fndecl
28506 && (lookup_attribute ("interrupt",
28507 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28508 error ("interrupt service routine can't be called directly");
28510 else
28511 fndecl = NULL_TREE;
28513 if (pop == const0_rtx)
28514 pop = NULL;
28515 gcc_assert (!TARGET_64BIT || !pop);
28517 if (TARGET_MACHO && !TARGET_64BIT)
28519 #if TARGET_MACHO
28520 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28521 fnaddr = machopic_indirect_call_target (fnaddr);
28522 #endif
28524 else
28526 /* Static functions and indirect calls don't need the pic register. Also,
28527 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
28528 it an indirect call. */
28529 rtx addr = XEXP (fnaddr, 0);
28530 if (flag_pic
28531 && GET_CODE (addr) == SYMBOL_REF
28532 && !SYMBOL_REF_LOCAL_P (addr))
28534 if (flag_plt
28535 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28536 || !lookup_attribute ("noplt",
28537 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28539 if (!TARGET_64BIT
28540 || (ix86_cmodel == CM_LARGE_PIC
28541 && DEFAULT_ABI != MS_ABI))
28543 use_reg (&use, gen_rtx_REG (Pmode,
28544 REAL_PIC_OFFSET_TABLE_REGNUM));
28545 if (ix86_use_pseudo_pic_reg ())
28546 emit_move_insn (gen_rtx_REG (Pmode,
28547 REAL_PIC_OFFSET_TABLE_REGNUM),
28548 pic_offset_table_rtx);
28551 else if (!TARGET_PECOFF && !TARGET_MACHO)
28553 if (TARGET_64BIT)
28555 fnaddr = gen_rtx_UNSPEC (Pmode,
28556 gen_rtvec (1, addr),
28557 UNSPEC_GOTPCREL);
28558 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28560 else
28562 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28563 UNSPEC_GOT);
28564 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28565 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28566 fnaddr);
28568 fnaddr = gen_const_mem (Pmode, fnaddr);
28569 /* Pmode may not be the same as word_mode for x32, which
28570 doesn't support indirect branch via 32-bit memory slot.
28571 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28572 indirect branch via x32 GOT slot is OK. */
28573 if (GET_MODE (fnaddr) != word_mode)
28574 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28575 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28580 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28581 parameters passed in vector registers. */
28582 if (TARGET_64BIT
28583 && (INTVAL (callarg2) > 0
28584 || (INTVAL (callarg2) == 0
28585 && (TARGET_SSE || !flag_skip_rax_setup))))
28587 rtx al = gen_rtx_REG (QImode, AX_REG);
28588 emit_move_insn (al, callarg2);
28589 use_reg (&use, al);
28592 if (ix86_cmodel == CM_LARGE_PIC
28593 && !TARGET_PECOFF
28594 && MEM_P (fnaddr)
28595 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28596 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28597 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28598 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28599 branch via x32 GOT slot is OK. */
28600 else if (!(TARGET_X32
28601 && MEM_P (fnaddr)
28602 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28603 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28604 && (sibcall
28605 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28606 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28608 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28609 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28612 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28614 if (retval)
28616 /* We should add bound registers as destinations in case
28617 a pointer with bounds may be returned. */
28618 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28620 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28621 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28622 if (GET_CODE (retval) == PARALLEL)
28624 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28625 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28626 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28627 retval = chkp_join_splitted_slot (retval, par);
28629 else
28631 retval = gen_rtx_PARALLEL (VOIDmode,
28632 gen_rtvec (3, retval, b0, b1));
28633 chkp_put_regs_to_expr_list (retval);
28637 call = gen_rtx_SET (retval, call);
28639 vec[vec_len++] = call;
28641 if (pop)
28643 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28644 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28645 vec[vec_len++] = pop;
28648 if (cfun->machine->no_caller_saved_registers
28649 && (!fndecl
28650 || (!TREE_THIS_VOLATILE (fndecl)
28651 && !lookup_attribute ("no_caller_saved_registers",
28652 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28654 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28655 bool is_64bit_ms_abi = (TARGET_64BIT
28656 && ix86_function_abi (fndecl) == MS_ABI);
28657 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28659 /* If there are no caller-saved registers, add all registers
28660 that are clobbered by the call which returns. */
28661 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28662 if (!fixed_regs[i]
28663 && (ix86_call_used_regs[i] == 1
28664 || (ix86_call_used_regs[i] & c_mask))
28665 && !STACK_REGNO_P (i)
28666 && !MMX_REGNO_P (i))
28667 clobber_reg (&use,
28668 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28670 else if (TARGET_64BIT_MS_ABI
28671 && (!callarg2 || INTVAL (callarg2) != -2))
28673 unsigned i;
28675 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28677 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28678 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28680 clobber_reg (&use, gen_rtx_REG (mode, regno));
28683 /* Set here, but it may get cleared later. */
28684 if (TARGET_CALL_MS2SYSV_XLOGUES)
28686 if (!TARGET_SSE)
28689 /* Don't break hot-patched functions. */
28690 else if (ix86_function_ms_hook_prologue (current_function_decl))
28693 /* TODO: Cases not yet examined. */
28694 else if (flag_split_stack)
28695 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28697 else
28699 gcc_assert (!reload_completed);
28700 cfun->machine->call_ms2sysv = true;
28705 if (vec_len > 1)
28706 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28707 call = emit_call_insn (call);
28708 if (use)
28709 CALL_INSN_FUNCTION_USAGE (call) = use;
28711 return call;
28714 /* Return true if the function being called was marked with attribute
28715 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28716 to handle the non-PIC case in the backend because there is no easy
28717 interface for the front-end to force non-PLT calls to use the GOT.
28718 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28719 to call the function marked "noplt" indirectly. */
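/* For example, with -fno-plt (or __attribute__((noplt))) and no PIC on
x86-64, a call to an external function foo is typically emitted as
call *foo@GOTPCREL(%rip)
instead of going through a PLT stub.  */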
28721 static bool
28722 ix86_nopic_noplt_attribute_p (rtx call_op)
28724 if (flag_pic || ix86_cmodel == CM_LARGE
28725 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28726 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28727 || SYMBOL_REF_LOCAL_P (call_op))
28728 return false;
28730 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28732 if (!flag_plt
28733 || (symbol_decl != NULL_TREE
28734 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28735 return true;
28737 return false;
28740 /* Output an indirect branch via a call and return thunk. CALL_OP is a
28741 register which contains the branch target. The branch is a tail call
28742 if SIBCALL_P is true. A normal call is converted to:
28745 call __x86_indirect_thunk_reg
28747 and a tail call is converted to:
28749 jmp __x86_indirect_thunk_reg  */
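/* With -mindirect-branch=thunk-inline the thunk body is emitted in place;
for a normal (non-tail) call the code below lays it out roughly as
jmp L2
L1:
<inline thunk body using the target register>
L2:
call L1
using internal labels generated on the fly.  */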
28752 static void
28753 ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p)
28755 char thunk_name_buf[32];
28756 char *thunk_name;
28757 enum indirect_thunk_prefix need_prefix
28758 = indirect_thunk_need_prefix (current_output_insn);
28759 int regno = REGNO (call_op);
28761 if (cfun->machine->indirect_branch_type
28762 != indirect_branch_thunk_inline)
28764 if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28766 int i = regno;
28767 if (i >= FIRST_REX_INT_REG)
28768 i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1);
28769 if (need_prefix == indirect_thunk_prefix_bnd)
28770 indirect_thunks_bnd_used |= 1 << i;
28771 else
28772 indirect_thunks_used |= 1 << i;
28774 indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
28775 thunk_name = thunk_name_buf;
28777 else
28778 thunk_name = NULL;
28780 if (sibcall_p)
28782 if (thunk_name != NULL)
28784 if (need_prefix == indirect_thunk_prefix_bnd)
28785 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28786 else
28787 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28789 else
28790 output_indirect_thunk (need_prefix, regno);
28792 else
28794 if (thunk_name != NULL)
28796 if (need_prefix == indirect_thunk_prefix_bnd)
28797 fprintf (asm_out_file, "\tbnd call\t%s\n", thunk_name);
28798 else
28799 fprintf (asm_out_file, "\tcall\t%s\n", thunk_name);
28800 return;
28803 char indirectlabel1[32];
28804 char indirectlabel2[32];
28806 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28807 INDIRECT_LABEL,
28808 indirectlabelno++);
28809 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28810 INDIRECT_LABEL,
28811 indirectlabelno++);
28813 /* Jump. */
28814 if (need_prefix == indirect_thunk_prefix_bnd)
28815 fputs ("\tbnd jmp\t", asm_out_file);
28816 else
28817 fputs ("\tjmp\t", asm_out_file);
28818 assemble_name_raw (asm_out_file, indirectlabel2);
28819 fputc ('\n', asm_out_file);
28821 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28823 if (thunk_name != NULL)
28825 if (need_prefix == indirect_thunk_prefix_bnd)
28826 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28827 else
28828 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28830 else
28831 output_indirect_thunk (need_prefix, regno);
28833 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28835 /* Call. */
28836 if (need_prefix == indirect_thunk_prefix_bnd)
28837 fputs ("\tbnd call\t", asm_out_file);
28838 else
28839 fputs ("\tcall\t", asm_out_file);
28840 assemble_name_raw (asm_out_file, indirectlabel1);
28841 fputc ('\n', asm_out_file);
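/* For reference, a sketch of the external thunk this branches to (the
   actual body is emitted by output_indirect_thunk; shown here for the
   %rax case, assuming the usual retpoline sequence):

   __x86_indirect_thunk_rax:
	call	.LIND1
   .LIND0:
	pause
	lfence
	jmp	.LIND0
   .LIND1:
	mov	%rax, (%rsp)
	ret
*/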
28845 /* Output indirect branch via a call and return thunk. CALL_OP is
28846 the branch target. XASM is the assembly template for CALL_OP.
28847 Branch is a tail call if SIBCALL_P is true. A normal call is
28848 converted to:
28850 jmp L2
28852 push CALL_OP
28853 jmp __x86_indirect_thunk
28855 call L1
28857 and a tail call is converted to:
28859 push CALL_OP
28860 jmp __x86_indirect_thunk
28863 static void
28864 ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm,
28865 bool sibcall_p)
28867 char thunk_name_buf[32];
28868 char *thunk_name;
28869 char push_buf[64];
28870 enum indirect_thunk_prefix need_prefix
28871 = indirect_thunk_need_prefix (current_output_insn);
28872 int regno = -1;
28874 if (cfun->machine->indirect_branch_type
28875 != indirect_branch_thunk_inline)
28877 if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28879 if (need_prefix == indirect_thunk_prefix_bnd)
28880 indirect_thunk_bnd_needed = true;
28881 else
28882 indirect_thunk_needed = true;
28884 indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
28885 thunk_name = thunk_name_buf;
28887 else
28888 thunk_name = NULL;
28890 snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s",
28891 TARGET_64BIT ? 'q' : 'l', xasm);
28893 if (sibcall_p)
28895 output_asm_insn (push_buf, &call_op);
28896 if (thunk_name != NULL)
28898 if (need_prefix == indirect_thunk_prefix_bnd)
28899 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28900 else
28901 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28903 else
28904 output_indirect_thunk (need_prefix, regno);
28906 else
28908 char indirectlabel1[32];
28909 char indirectlabel2[32];
28911 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28912 INDIRECT_LABEL,
28913 indirectlabelno++);
28914 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28915 INDIRECT_LABEL,
28916 indirectlabelno++);
28918 /* Jump. */
28919 if (need_prefix == indirect_thunk_prefix_bnd)
28920 fputs ("\tbnd jmp\t", asm_out_file);
28921 else
28922 fputs ("\tjmp\t", asm_out_file);
28923 assemble_name_raw (asm_out_file, indirectlabel2);
28924 fputc ('\n', asm_out_file);
28926 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28928 /* An external function may be called via GOT, instead of PLT. */
28929 if (MEM_P (call_op))
28931 struct ix86_address parts;
28932 rtx addr = XEXP (call_op, 0);
28933 if (ix86_decompose_address (addr, &parts)
28934 && parts.base == stack_pointer_rtx)
28936 /* Since call will adjust stack by -UNITS_PER_WORD,
28937 we must convert "disp(stack, index, scale)" to
28938 "disp+UNITS_PER_WORD(stack, index, scale)". */
28939 if (parts.index)
28941 addr = gen_rtx_MULT (Pmode, parts.index,
28942 GEN_INT (parts.scale));
28943 addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx,
28944 addr);
28946 else
28947 addr = stack_pointer_rtx;
28949 rtx disp;
28950 if (parts.disp != NULL_RTX)
28951 disp = plus_constant (Pmode, parts.disp,
28952 UNITS_PER_WORD);
28953 else
28954 disp = GEN_INT (UNITS_PER_WORD);
28956 addr = gen_rtx_PLUS (Pmode, addr, disp);
28957 call_op = gen_rtx_MEM (GET_MODE (call_op), addr);
28961 output_asm_insn (push_buf, &call_op);
28963 if (thunk_name != NULL)
28965 if (need_prefix == indirect_thunk_prefix_bnd)
28966 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28967 else
28968 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28970 else
28971 output_indirect_thunk (need_prefix, regno);
28973 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28975 /* Call. */
28976 if (need_prefix == indirect_thunk_prefix_bnd)
28977 fputs ("\tbnd call\t", asm_out_file);
28978 else
28979 fputs ("\tcall\t", asm_out_file);
28980 assemble_name_raw (asm_out_file, indirectlabel1);
28981 fputc ('\n', asm_out_file);
28985 /* Output indirect branch via a call and return thunk. CALL_OP is
28986 the branch target. XASM is the assembly template for CALL_OP.
28987 Branch is a tail call if SIBCALL_P is true. */
28989 static void
28990 ix86_output_indirect_branch (rtx call_op, const char *xasm,
28991 bool sibcall_p)
28993 if (REG_P (call_op))
28994 ix86_output_indirect_branch_via_reg (call_op, sibcall_p);
28995 else
28996 ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p);
28999 /* Output indirect jump. CALL_OP is the jump target. */
29001 const char *
29002 ix86_output_indirect_jmp (rtx call_op)
29004 if (cfun->machine->indirect_branch_type != indirect_branch_keep)
29006 /* We can't have a red zone, since the "call" in the indirect thunk
29007 pushes the return address onto the stack, destroying the red zone.  */
29008 if (ix86_red_zone_size != 0)
29009 gcc_unreachable ();
29011 ix86_output_indirect_branch (call_op, "%0", true);
29012 return "";
29014 else
29015 return "%!jmp\t%A0";
29018 /* Output function return.  Add a REP
29019 prefix to RET if LONG_P is true and the function return is kept.  */
29021 const char *
29022 ix86_output_function_return (bool long_p)
29024 if (cfun->machine->function_return_type != indirect_branch_keep)
29026 char thunk_name[32];
29027 enum indirect_thunk_prefix need_prefix
29028 = indirect_thunk_need_prefix (current_output_insn);
29030 if (cfun->machine->function_return_type
29031 != indirect_branch_thunk_inline)
29033 bool need_thunk = (cfun->machine->function_return_type
29034 == indirect_branch_thunk);
29035 indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix,
29036 true);
29037 if (need_prefix == indirect_thunk_prefix_bnd)
29039 indirect_return_bnd_needed |= need_thunk;
29040 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
29042 else
29044 indirect_return_needed |= need_thunk;
29045 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
29048 else
29049 output_indirect_thunk (need_prefix, INVALID_REGNUM);
29051 return "";
29054 if (!long_p || ix86_bnd_prefixed_insn_p (current_output_insn))
29055 return "%!ret";
29057 return "rep%; ret";
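/* The "rep" prefix above is only padding: on some AMD processors a
   one-byte "ret" that is a branch target or immediately follows a
   conditional branch is predicted poorly, so a two-byte "rep ret" is
   emitted instead when LONG_P requests the padded form.  */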
29060 /* Output indirect function return. RET_OP is the function return
29061 target. */
29063 const char *
29064 ix86_output_indirect_function_return (rtx ret_op)
29066 if (cfun->machine->function_return_type != indirect_branch_keep)
29068 char thunk_name[32];
29069 enum indirect_thunk_prefix need_prefix
29070 = indirect_thunk_need_prefix (current_output_insn);
29071 unsigned int regno = REGNO (ret_op);
29072 gcc_assert (regno == CX_REG);
29074 if (cfun->machine->function_return_type
29075 != indirect_branch_thunk_inline)
29077 bool need_thunk = (cfun->machine->function_return_type
29078 == indirect_branch_thunk);
29079 indirect_thunk_name (thunk_name, regno, need_prefix, true);
29080 if (need_prefix == indirect_thunk_prefix_bnd)
29082 if (need_thunk)
29084 indirect_return_via_cx_bnd = true;
29085 indirect_thunks_bnd_used |= 1 << CX_REG;
29087 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
29089 else
29091 if (need_thunk)
29093 indirect_return_via_cx = true;
29094 indirect_thunks_used |= 1 << CX_REG;
29096 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
29099 else
29100 output_indirect_thunk (need_prefix, regno);
29102 return "";
29104 else
29105 return "%!jmp\t%A0";
29108 /* Split a simple return that pops POPC bytes from the stack into an
29109 indirect branch with a stack adjustment.  */
29111 void
29112 ix86_split_simple_return_pop_internal (rtx popc)
29114 struct machine_function *m = cfun->machine;
29115 rtx ecx = gen_rtx_REG (SImode, CX_REG);
29116 rtx_insn *insn;
29118 /* There is no "pascal" calling convention in any 64bit ABI. */
29119 gcc_assert (!TARGET_64BIT);
29121 insn = emit_insn (gen_pop (ecx));
29122 m->fs.cfa_offset -= UNITS_PER_WORD;
29123 m->fs.sp_offset -= UNITS_PER_WORD;
29125 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
29126 x = gen_rtx_SET (stack_pointer_rtx, x);
29127 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
29128 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
29129 RTX_FRAME_RELATED_P (insn) = 1;
29131 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
29132 x = gen_rtx_SET (stack_pointer_rtx, x);
29133 insn = emit_insn (x);
29134 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
29135 RTX_FRAME_RELATED_P (insn) = 1;
29137 /* Now return address is in ECX. */
29138 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
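/* For example (a sketch, assuming -m32 and no return thunk), splitting
   a "ret $4" that pops four bytes of arguments yields:

	popl	%ecx		# return address -> %ecx
	addl	$4, %esp	# drop the callee-popped argument bytes
	jmp	*%ecx		# indirect return
*/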
29141 /* Output the assembly for a call instruction. */
29143 const char *
29144 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
29146 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
29147 bool output_indirect_p
29148 = (!TARGET_SEH
29149 && cfun->machine->indirect_branch_type != indirect_branch_keep);
29150 bool seh_nop_p = false;
29151 const char *xasm;
29153 if (SIBLING_CALL_P (insn))
29155 if (direct_p)
29157 if (ix86_nopic_noplt_attribute_p (call_op))
29159 direct_p = false;
29160 if (TARGET_64BIT)
29162 if (output_indirect_p)
29163 xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29164 else
29165 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29167 else
29169 if (output_indirect_p)
29170 xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
29171 else
29172 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29175 else
29176 xasm = "%!jmp\t%P0";
29178 /* SEH epilogue detection requires the indirect branch case
29179 to include REX.W. */
29180 else if (TARGET_SEH)
29181 xasm = "%!rex.W jmp\t%A0";
29182 else
29184 if (output_indirect_p)
29185 xasm = "%0";
29186 else
29187 xasm = "%!jmp\t%A0";
29190 if (output_indirect_p && !direct_p)
29191 ix86_output_indirect_branch (call_op, xasm, true);
29192 else
29193 output_asm_insn (xasm, &call_op);
29194 return "";
29197 /* SEH unwinding can require an extra nop to be emitted in several
29198 circumstances. Determine if we have one of those. */
29199 if (TARGET_SEH)
29201 rtx_insn *i;
29203 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29205 /* Prevent a catch region from being adjacent to a jump that would
29206 be interpreted as an epilogue sequence by the unwinder. */
29207 if (JUMP_P(i) && CROSSING_JUMP_P (i))
29209 seh_nop_p = true;
29210 break;
29213 /* If we get to another real insn, we don't need the nop. */
29214 if (INSN_P (i))
29215 break;
29217 /* If we get to the epilogue note, prevent a catch region from
29218 being adjacent to the standard epilogue sequence.  With non-call
29219 exceptions, we'll have done this during epilogue emission.  */
29220 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29221 && !flag_non_call_exceptions
29222 && !can_throw_internal (insn))
29224 seh_nop_p = true;
29225 break;
29229 /* If we didn't find a real insn following the call, prevent the
29230 unwinder from looking into the next function. */
29231 if (i == NULL)
29232 seh_nop_p = true;
29235 if (direct_p)
29237 if (ix86_nopic_noplt_attribute_p (call_op))
29239 direct_p = false;
29240 if (TARGET_64BIT)
29242 if (output_indirect_p)
29243 xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29244 else
29245 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29247 else
29249 if (output_indirect_p)
29250 xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
29251 else
29252 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29255 else
29256 xasm = "%!call\t%P0";
29258 else
29260 if (output_indirect_p)
29261 xasm = "%0";
29262 else
29263 xasm = "%!call\t%A0";
29266 if (output_indirect_p && !direct_p)
29267 ix86_output_indirect_branch (call_op, xasm, false);
29268 else
29269 output_asm_insn (xasm, &call_op);
29271 if (seh_nop_p)
29272 return "nop";
29274 return "";
29277 /* Clear stack slot assignments remembered from previous functions.
29278 This is called from INIT_EXPANDERS once before RTL is emitted for each
29279 function. */
29281 static struct machine_function *
29282 ix86_init_machine_status (void)
29284 struct machine_function *f;
29286 f = ggc_cleared_alloc<machine_function> ();
29287 f->call_abi = ix86_abi;
29289 return f;
29292 /* Return a MEM corresponding to a stack slot with mode MODE.
29293 Allocate a new slot if necessary.
29295 The RTL for a function can have several slots available: N is
29296 which slot to use. */
29299 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29301 struct stack_local_entry *s;
29303 gcc_assert (n < MAX_386_STACK_LOCALS);
29305 for (s = ix86_stack_locals; s; s = s->next)
29306 if (s->mode == mode && s->n == n)
29307 return validize_mem (copy_rtx (s->rtl));
29309 s = ggc_alloc<stack_local_entry> ();
29310 s->n = n;
29311 s->mode = mode;
29312 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29314 s->next = ix86_stack_locals;
29315 ix86_stack_locals = s;
29316 return validize_mem (copy_rtx (s->rtl));
29319 static void
29320 ix86_instantiate_decls (void)
29322 struct stack_local_entry *s;
29324 for (s = ix86_stack_locals; s; s = s->next)
29325 if (s->rtl != NULL_RTX)
29326 instantiate_decl_rtl (s->rtl);
29329 /* Return the number used for encoding REG, in the range 0..7. */
29331 static int
29332 reg_encoded_number (rtx reg)
29334 unsigned regno = REGNO (reg);
29335 switch (regno)
29337 case AX_REG:
29338 return 0;
29339 case CX_REG:
29340 return 1;
29341 case DX_REG:
29342 return 2;
29343 case BX_REG:
29344 return 3;
29345 case SP_REG:
29346 return 4;
29347 case BP_REG:
29348 return 5;
29349 case SI_REG:
29350 return 6;
29351 case DI_REG:
29352 return 7;
29353 default:
29354 break;
29356 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29357 return regno - FIRST_STACK_REG;
29358 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29359 return regno - FIRST_SSE_REG;
29360 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29361 return regno - FIRST_MMX_REG;
29362 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29363 return regno - FIRST_REX_SSE_REG;
29364 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29365 return regno - FIRST_REX_INT_REG;
29366 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29367 return regno - FIRST_MASK_REG;
29368 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29369 return regno - FIRST_BND_REG;
29370 return -1;
29373 /* Given an insn INSN with NOPERANDS operands in OPERANDS, return the modr/m byte used
29374 in its encoding if it could be relevant for ROP mitigation, otherwise
29375 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29376 used for calculating it into them. */
29378 static int
29379 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29380 int *popno0 = 0, int *popno1 = 0)
29382 if (asm_noperands (PATTERN (insn)) >= 0)
29383 return -1;
29384 int has_modrm = get_attr_modrm (insn);
29385 if (!has_modrm)
29386 return -1;
29387 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29388 rtx op0, op1;
29389 switch (cls)
29391 case MODRM_CLASS_OP02:
29392 gcc_assert (noperands >= 3);
29393 if (popno0)
29395 *popno0 = 0;
29396 *popno1 = 2;
29398 op0 = operands[0];
29399 op1 = operands[2];
29400 break;
29401 case MODRM_CLASS_OP01:
29402 gcc_assert (noperands >= 2);
29403 if (popno0)
29405 *popno0 = 0;
29406 *popno1 = 1;
29408 op0 = operands[0];
29409 op1 = operands[1];
29410 break;
29411 default:
29412 return -1;
29414 if (REG_P (op0) && REG_P (op1))
29416 int enc0 = reg_encoded_number (op0);
29417 int enc1 = reg_encoded_number (op1);
29418 return 0xc0 + (enc1 << 3) + enc0;
29420 return -1;
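/* For example (illustration only, assuming the insn's modrm_class
   attribute is OP01): for "addl %eax, %ebx", op0 is %ebx (encoding 3)
   and op1 is %eax (encoding 0), so this returns
   0xc0 + (0 << 3) + 3 = 0xc3, the modr/m byte of the "01 c3" encoding
   of that instruction.  */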
29423 /* Check whether x86 address PARTS is a pc-relative address. */
29425 bool
29426 ix86_rip_relative_addr_p (struct ix86_address *parts)
29428 rtx base, index, disp;
29430 base = parts->base;
29431 index = parts->index;
29432 disp = parts->disp;
29434 if (disp && !base && !index)
29436 if (TARGET_64BIT)
29438 rtx symbol = disp;
29440 if (GET_CODE (disp) == CONST)
29441 symbol = XEXP (disp, 0);
29442 if (GET_CODE (symbol) == PLUS
29443 && CONST_INT_P (XEXP (symbol, 1)))
29444 symbol = XEXP (symbol, 0);
29446 if (GET_CODE (symbol) == LABEL_REF
29447 || (GET_CODE (symbol) == SYMBOL_REF
29448 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29449 || (GET_CODE (symbol) == UNSPEC
29450 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29451 || XINT (symbol, 1) == UNSPEC_PCREL
29452 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29453 return true;
29456 return false;
29459 /* Calculate the length of the memory address in the instruction encoding.
29460 Includes addr32 prefix, does not include the one-byte modrm, opcode,
29461 or other prefixes. We never generate addr32 prefix for LEA insn. */
29464 memory_address_length (rtx addr, bool lea)
29466 struct ix86_address parts;
29467 rtx base, index, disp;
29468 int len;
29469 int ok;
29471 if (GET_CODE (addr) == PRE_DEC
29472 || GET_CODE (addr) == POST_INC
29473 || GET_CODE (addr) == PRE_MODIFY
29474 || GET_CODE (addr) == POST_MODIFY)
29475 return 0;
29477 ok = ix86_decompose_address (addr, &parts);
29478 gcc_assert (ok);
29480 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29482 /* If this is not LEA instruction, add the length of addr32 prefix. */
29483 if (TARGET_64BIT && !lea
29484 && (SImode_address_operand (addr, VOIDmode)
29485 || (parts.base && GET_MODE (parts.base) == SImode)
29486 || (parts.index && GET_MODE (parts.index) == SImode)))
29487 len++;
29489 base = parts.base;
29490 index = parts.index;
29491 disp = parts.disp;
29493 if (base && SUBREG_P (base))
29494 base = SUBREG_REG (base);
29495 if (index && SUBREG_P (index))
29496 index = SUBREG_REG (index);
29498 gcc_assert (base == NULL_RTX || REG_P (base));
29499 gcc_assert (index == NULL_RTX || REG_P (index));
29501 /* Rule of thumb:
29502 - esp as the base always wants an index,
29503 - ebp as the base always wants a displacement,
29504 - r12 as the base always wants an index,
29505 - r13 as the base always wants a displacement. */
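/* Concretely: with mod = 00 in the modr/m byte, r/m = 100 means "a SIB
   byte follows", so "(%esp)" needs modr/m + SIB, and r/m = 101 means
   disp32 (or disp32(%rip) in 64-bit mode), so "(%ebp)" must instead be
   encoded with mod = 01 and a zero disp8.  The same holds for r12 and
   r13, whose low three encoding bits match esp and ebp.  */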
29507 /* Register Indirect. */
29508 if (base && !index && !disp)
29510 /* esp (for its index) and ebp (for its displacement) need
29511 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29512 code. */
29513 if (base == arg_pointer_rtx
29514 || base == frame_pointer_rtx
29515 || REGNO (base) == SP_REG
29516 || REGNO (base) == BP_REG
29517 || REGNO (base) == R12_REG
29518 || REGNO (base) == R13_REG)
29519 len++;
29522 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29523 is not disp32, but disp32(%rip), so for disp32 a
29524 SIB byte is needed, unless print_operand_address
29525 optimizes it into disp32(%rip) or (%rip) is implied
29526 by UNSPEC. */
29527 else if (disp && !base && !index)
29529 len += 4;
29530 if (!ix86_rip_relative_addr_p (&parts))
29531 len++;
29533 else
29535 /* Find the length of the displacement constant. */
29536 if (disp)
29538 if (base && satisfies_constraint_K (disp))
29539 len += 1;
29540 else
29541 len += 4;
29543 /* ebp always wants a displacement. Similarly r13. */
29544 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29545 len++;
29547 /* An index requires the two-byte modrm form.... */
29548 if (index
29549 /* ...like esp (or r12), which always wants an index. */
29550 || base == arg_pointer_rtx
29551 || base == frame_pointer_rtx
29552 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29553 len++;
29556 return len;
29559 /* Compute default value for "length_immediate" attribute. When SHORTFORM
29560 is set, expect that the insn has an 8-bit immediate alternative.  */
29562 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29564 int len = 0;
29565 int i;
29566 extract_insn_cached (insn);
29567 for (i = recog_data.n_operands - 1; i >= 0; --i)
29568 if (CONSTANT_P (recog_data.operand[i]))
29570 enum attr_mode mode = get_attr_mode (insn);
29572 gcc_assert (!len);
29573 if (shortform && CONST_INT_P (recog_data.operand[i]))
29575 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29576 switch (mode)
29578 case MODE_QI:
29579 len = 1;
29580 continue;
29581 case MODE_HI:
29582 ival = trunc_int_for_mode (ival, HImode);
29583 break;
29584 case MODE_SI:
29585 ival = trunc_int_for_mode (ival, SImode);
29586 break;
29587 default:
29588 break;
29590 if (IN_RANGE (ival, -128, 127))
29592 len = 1;
29593 continue;
29596 switch (mode)
29598 case MODE_QI:
29599 len = 1;
29600 break;
29601 case MODE_HI:
29602 len = 2;
29603 break;
29604 case MODE_SI:
29605 len = 4;
29606 break;
29607 /* Immediates for DImode instructions are encoded
29608 as 32bit sign extended values. */
29609 case MODE_DI:
29610 len = 4;
29611 break;
29612 default:
29613 fatal_insn ("unknown insn mode", insn);
29616 return len;
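/* For example, "addl $100, %eax" can use the sign-extended 8-bit
   immediate form (length 1) when the insn pattern offers that
   alternative, while "addl $200, %eax" needs a full 4-byte immediate,
   since 200 is outside the [-128, 127] range.  */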
29619 /* Compute default value for "length_address" attribute. */
29621 ix86_attr_length_address_default (rtx_insn *insn)
29623 int i;
29625 if (get_attr_type (insn) == TYPE_LEA)
29627 rtx set = PATTERN (insn), addr;
29629 if (GET_CODE (set) == PARALLEL)
29630 set = XVECEXP (set, 0, 0);
29632 gcc_assert (GET_CODE (set) == SET);
29634 addr = SET_SRC (set);
29636 return memory_address_length (addr, true);
29639 extract_insn_cached (insn);
29640 for (i = recog_data.n_operands - 1; i >= 0; --i)
29642 rtx op = recog_data.operand[i];
29643 if (MEM_P (op))
29645 constrain_operands_cached (insn, reload_completed);
29646 if (which_alternative != -1)
29648 const char *constraints = recog_data.constraints[i];
29649 int alt = which_alternative;
29651 while (*constraints == '=' || *constraints == '+')
29652 constraints++;
29653 while (alt-- > 0)
29654 while (*constraints++ != ',')
29656 /* Skip ignored operands. */
29657 if (*constraints == 'X')
29658 continue;
29661 int len = memory_address_length (XEXP (op, 0), false);
29663 /* Account for segment prefix for non-default addr spaces. */
29664 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29665 len++;
29667 return len;
29670 return 0;
29673 /* Compute default value for "length_vex" attribute. It includes
29674 2 or 3 byte VEX prefix and 1 opcode byte. */
29677 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29678 bool has_vex_w)
29680 int i;
29682 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX.W
29683 bit requires the 3-byte VEX prefix.  */
29684 if (!has_0f_opcode || has_vex_w)
29685 return 3 + 1;
29687 /* We can always use 2 byte VEX prefix in 32bit. */
29688 if (!TARGET_64BIT)
29689 return 2 + 1;
29691 extract_insn_cached (insn);
29693 for (i = recog_data.n_operands - 1; i >= 0; --i)
29694 if (REG_P (recog_data.operand[i]))
29696 /* REX.W bit uses 3 byte VEX prefix. */
29697 if (GET_MODE (recog_data.operand[i]) == DImode
29698 && GENERAL_REG_P (recog_data.operand[i]))
29699 return 3 + 1;
29701 else
29703 /* REX.X or REX.B bits use 3 byte VEX prefix. */
29704 if (MEM_P (recog_data.operand[i])
29705 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
29706 return 3 + 1;
29709 return 2 + 1;
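/* Background for the checks above: the 2-byte VEX prefix (C5 xx) can
   only encode the implied 0F opcode map, VEX.W = 0 and the inverted
   REX.R bit, while the 3-byte form (C4 xx xx) also carries REX.X,
   REX.B, REX.W and the other opcode maps.  Hence a DImode general
   register operand (which needs REX.W) or an extended register used in
   a memory operand (which needs REX.X or REX.B) forces the 3-byte
   prefix.  */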
29713 static bool
29714 ix86_class_likely_spilled_p (reg_class_t);
29716 /* Return true if the lhs of INSN is a HW function argument register; set
29717 *IS_SPILLED to true if it is a likely-spilled HW register.  */
29718 static bool
29719 insn_is_function_arg (rtx insn, bool* is_spilled)
29721 rtx dst;
29723 if (!NONDEBUG_INSN_P (insn))
29724 return false;
29725 /* Call instructions are not movable; ignore them.  */
29726 if (CALL_P (insn))
29727 return false;
29728 insn = PATTERN (insn);
29729 if (GET_CODE (insn) == PARALLEL)
29730 insn = XVECEXP (insn, 0, 0);
29731 if (GET_CODE (insn) != SET)
29732 return false;
29733 dst = SET_DEST (insn);
29734 if (REG_P (dst) && HARD_REGISTER_P (dst)
29735 && ix86_function_arg_regno_p (REGNO (dst)))
29737 /* Is it likely spilled HW register? */
29738 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29739 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29740 *is_spilled = true;
29741 return true;
29743 return false;
29746 /* Add output dependencies for a chain of adjacent function arguments, but
29747 only if there is a move to a likely-spilled HW register.  Return the first
29748 argument if at least one dependence was added, or NULL otherwise.  */
29749 static rtx_insn *
29750 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29752 rtx_insn *insn;
29753 rtx_insn *last = call;
29754 rtx_insn *first_arg = NULL;
29755 bool is_spilled = false;
29757 head = PREV_INSN (head);
29759 /* Find the argument-passing instruction nearest to the call.  */
29760 while (true)
29762 last = PREV_INSN (last);
29763 if (last == head)
29764 return NULL;
29765 if (!NONDEBUG_INSN_P (last))
29766 continue;
29767 if (insn_is_function_arg (last, &is_spilled))
29768 break;
29769 return NULL;
29772 first_arg = last;
29773 while (true)
29775 insn = PREV_INSN (last);
29776 if (!INSN_P (insn))
29777 break;
29778 if (insn == head)
29779 break;
29780 if (!NONDEBUG_INSN_P (insn))
29782 last = insn;
29783 continue;
29785 if (insn_is_function_arg (insn, &is_spilled))
29787 /* Add an output dependence between two function arguments if the chain
29788 of output arguments contains likely-spilled HW registers.  */
29789 if (is_spilled)
29790 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29791 first_arg = last = insn;
29793 else
29794 break;
29796 if (!is_spilled)
29797 return NULL;
29798 return first_arg;
29801 /* Add output or anti dependency from insn to first_arg to restrict its code
29802 motion. */
29803 static void
29804 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29806 rtx set;
29807 rtx tmp;
29809 /* Add anti dependencies for bounds stores. */
29810 if (INSN_P (insn)
29811 && GET_CODE (PATTERN (insn)) == PARALLEL
29812 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29813 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29815 add_dependence (first_arg, insn, REG_DEP_ANTI);
29816 return;
29819 set = single_set (insn);
29820 if (!set)
29821 return;
29822 tmp = SET_DEST (set);
29823 if (REG_P (tmp))
29825 /* Add output dependency to the first function argument. */
29826 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29827 return;
29829 /* Add anti dependency. */
29830 add_dependence (first_arg, insn, REG_DEP_ANTI);
29833 /* Avoid cross-block motion of a function argument by adding a dependency
29834 from the first non-jump instruction in BB.  */
29835 static void
29836 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29838 rtx_insn *insn = BB_END (bb);
29840 while (insn)
29842 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29844 rtx set = single_set (insn);
29845 if (set)
29847 avoid_func_arg_motion (arg, insn);
29848 return;
29851 if (insn == BB_HEAD (bb))
29852 return;
29853 insn = PREV_INSN (insn);
29857 /* Hook for pre-reload schedule - avoid motion of function arguments
29858 passed in likely spilled HW registers. */
29859 static void
29860 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29862 rtx_insn *insn;
29863 rtx_insn *first_arg = NULL;
29864 if (reload_completed)
29865 return;
29866 while (head != tail && DEBUG_INSN_P (head))
29867 head = NEXT_INSN (head);
29868 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29869 if (INSN_P (insn) && CALL_P (insn))
29871 first_arg = add_parameter_dependencies (insn, head);
29872 if (first_arg)
29874 /* Add a dependee for the first argument to predecessors, but only if the
29875 region contains more than one block.  */
29876 basic_block bb = BLOCK_FOR_INSN (insn);
29877 int rgn = CONTAINING_RGN (bb->index);
29878 int nr_blks = RGN_NR_BLOCKS (rgn);
29879 /* Skip trivial regions and region head blocks that can have
29880 predecessors outside of region. */
29881 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29883 edge e;
29884 edge_iterator ei;
29886 /* Regions are SCCs with the exception of selective
29887 scheduling with pipelining of outer blocks enabled.
29888 So also check that immediate predecessors of a non-head
29889 block are in the same region. */
29890 FOR_EACH_EDGE (e, ei, bb->preds)
29892 /* Avoid creating loop-carried dependencies by using the
29893 topological ordering in the region.  */
29894 if (rgn == CONTAINING_RGN (e->src->index)
29895 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29896 add_dependee_for_func_arg (first_arg, e->src);
29899 insn = first_arg;
29900 if (insn == head)
29901 break;
29904 else if (first_arg)
29905 avoid_func_arg_motion (first_arg, insn);
29908 /* Hook for pre-reload schedule - set priority of moves from likely spilled
29909 HW registers to maximum, to schedule them as soon as possible.  These are
29910 moves from function argument registers at the top of the function entry
29911 and moves from function return value registers after call. */
29912 static int
29913 ix86_adjust_priority (rtx_insn *insn, int priority)
29915 rtx set;
29917 if (reload_completed)
29918 return priority;
29920 if (!NONDEBUG_INSN_P (insn))
29921 return priority;
29923 set = single_set (insn);
29924 if (set)
29926 rtx tmp = SET_SRC (set);
29927 if (REG_P (tmp)
29928 && HARD_REGISTER_P (tmp)
29929 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29930 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29931 return current_sched_info->sched_max_insns_priority;
29934 return priority;
29937 /* Prepare for scheduling pass. */
29938 static void
29939 ix86_sched_init_global (FILE *, int, int)
29941 /* Install scheduling hooks for current CPU. Some of these hooks are used
29942 in time-critical parts of the scheduler, so we only set them up when
29943 they are actually used. */
29944 switch (ix86_tune)
29946 case PROCESSOR_CORE2:
29947 case PROCESSOR_NEHALEM:
29948 case PROCESSOR_SANDYBRIDGE:
29949 case PROCESSOR_HASWELL:
29950 case PROCESSOR_GENERIC:
29951 /* Do not perform multipass scheduling for pre-reload schedule
29952 to save compile time. */
29953 if (reload_completed)
29955 ix86_core2i7_init_hooks ();
29956 break;
29958 /* Fall through. */
29959 default:
29960 targetm.sched.dfa_post_advance_cycle = NULL;
29961 targetm.sched.first_cycle_multipass_init = NULL;
29962 targetm.sched.first_cycle_multipass_begin = NULL;
29963 targetm.sched.first_cycle_multipass_issue = NULL;
29964 targetm.sched.first_cycle_multipass_backtrack = NULL;
29965 targetm.sched.first_cycle_multipass_end = NULL;
29966 targetm.sched.first_cycle_multipass_fini = NULL;
29967 break;
29972 /* Implement TARGET_STATIC_RTX_ALIGNMENT. */
29974 static HOST_WIDE_INT
29975 ix86_static_rtx_alignment (machine_mode mode)
29977 if (mode == DFmode)
29978 return 64;
29979 if (ALIGN_MODE_128 (mode))
29980 return MAX (128, GET_MODE_ALIGNMENT (mode));
29981 return GET_MODE_ALIGNMENT (mode);
29984 /* Implement TARGET_CONSTANT_ALIGNMENT. */
29986 static HOST_WIDE_INT
29987 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
29989 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
29990 || TREE_CODE (exp) == INTEGER_CST)
29992 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
29993 HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
29994 return MAX (mode_align, align);
29996 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
29997 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
29998 return BITS_PER_WORD;
30000 return align;
30003 /* Implement TARGET_EMPTY_RECORD_P. */
30005 static bool
30006 ix86_is_empty_record (const_tree type)
30008 if (!TARGET_64BIT)
30009 return false;
30010 return default_is_empty_record (type);
30013 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
30015 static void
30016 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
30018 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
30020 if (!cum->warn_empty)
30021 return;
30023 if (!TYPE_EMPTY_P (type))
30024 return;
30026 const_tree ctx = get_ultimate_context (cum->decl);
30027 if (ctx != NULL_TREE
30028 && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
30029 return;
30031 /* If the actual size of the type is zero, then there is no change
30032 in how objects of this size are passed. */
30033 if (int_size_in_bytes (type) == 0)
30034 return;
30036 warning (OPT_Wabi, "empty class %qT parameter passing ABI "
30037 "changes in -fabi-version=12 (GCC 8)", type);
30039 /* Only warn once. */
30040 cum->warn_empty = false;
30043 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
30044 the data type, and ALIGN is the alignment that the object would
30045 ordinarily have. */
30047 static int
30048 iamcu_alignment (tree type, int align)
30050 machine_mode mode;
30052 if (align < 32 || TYPE_USER_ALIGN (type))
30053 return align;
30055 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
30056 bytes. */
30057 mode = TYPE_MODE (strip_array_types (type));
30058 switch (GET_MODE_CLASS (mode))
30060 case MODE_INT:
30061 case MODE_COMPLEX_INT:
30062 case MODE_COMPLEX_FLOAT:
30063 case MODE_FLOAT:
30064 case MODE_DECIMAL_FLOAT:
30065 return 32;
30066 default:
30067 return align;
30071 /* Compute the alignment for a static variable.
30072 TYPE is the data type, and ALIGN is the alignment that
30073 the object would ordinarily have. The value of this function is used
30074 instead of that alignment to align the object. */
30077 ix86_data_alignment (tree type, int align, bool opt)
30079 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
30080 for symbols from other compilation units or symbols that don't need
30081 to bind locally. In order to preserve some ABI compatibility with
30082 those compilers, ensure we don't decrease alignment from what we
30083 used to assume. */
30085 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
30087 /* A data structure equal to or greater than the size of a cache line
30088 (64 bytes in the Pentium 4 and other recent Intel processors, including
30089 processors based on the Intel Core microarchitecture) should be aligned
30090 so that its base address is a multiple of the cache line size.  */
30092 int max_align
30093 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
30095 if (max_align < BITS_PER_WORD)
30096 max_align = BITS_PER_WORD;
30098 switch (ix86_align_data_type)
30100 case ix86_align_data_type_abi: opt = false; break;
30101 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
30102 case ix86_align_data_type_cacheline: break;
30105 if (TARGET_IAMCU)
30106 align = iamcu_alignment (type, align);
30108 if (opt
30109 && AGGREGATE_TYPE_P (type)
30110 && TYPE_SIZE (type)
30111 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
30113 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
30114 && align < max_align_compat)
30115 align = max_align_compat;
30116 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
30117 && align < max_align)
30118 align = max_align;
30121 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
30122 to a 16-byte boundary.  */
30123 if (TARGET_64BIT)
30125 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
30126 && TYPE_SIZE (type)
30127 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30128 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
30129 && align < 128)
30130 return 128;
30133 if (!opt)
30134 return align;
30136 if (TREE_CODE (type) == ARRAY_TYPE)
30138 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30139 return 64;
30140 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30141 return 128;
30143 else if (TREE_CODE (type) == COMPLEX_TYPE)
30146 if (TYPE_MODE (type) == DCmode && align < 64)
30147 return 64;
30148 if ((TYPE_MODE (type) == XCmode
30149 || TYPE_MODE (type) == TCmode) && align < 128)
30150 return 128;
30152 else if ((TREE_CODE (type) == RECORD_TYPE
30153 || TREE_CODE (type) == UNION_TYPE
30154 || TREE_CODE (type) == QUAL_UNION_TYPE)
30155 && TYPE_FIELDS (type))
30157 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30158 return 64;
30159 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30160 return 128;
30162 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30163 || TREE_CODE (type) == INTEGER_TYPE)
30165 if (TYPE_MODE (type) == DFmode && align < 64)
30166 return 64;
30167 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30168 return 128;
30171 return align;
30174 /* Compute the alignment for a local variable or a stack slot. EXP is
30175 the data type or decl itself, MODE is the widest mode available and
30176 ALIGN is the alignment that the object would ordinarily have. The
30177 value of this macro is used instead of that alignment to align the
30178 object. */
30180 unsigned int
30181 ix86_local_alignment (tree exp, machine_mode mode,
30182 unsigned int align)
30184 tree type, decl;
30186 if (exp && DECL_P (exp))
30188 type = TREE_TYPE (exp);
30189 decl = exp;
30191 else
30193 type = exp;
30194 decl = NULL;
30197 /* Don't do dynamic stack realignment for long long objects with
30198 -mpreferred-stack-boundary=2. */
30199 if (!TARGET_64BIT
30200 && align == 64
30201 && ix86_preferred_stack_boundary < 64
30202 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
30203 && (!type || !TYPE_USER_ALIGN (type))
30204 && (!decl || !DECL_USER_ALIGN (decl)))
30205 align = 32;
30207 /* If TYPE is NULL, we are allocating a stack slot for caller-save
30208 register in MODE. We will return the largest alignment of XF
30209 and DF. */
30210 if (!type)
30212 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
30213 align = GET_MODE_ALIGNMENT (DFmode);
30214 return align;
30217 /* Don't increase alignment for Intel MCU psABI. */
30218 if (TARGET_IAMCU)
30219 return align;
30221 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
30222 to a 16-byte boundary.  Exact wording is:
30224 An array uses the same alignment as its elements, except that a local or
30225 global array variable of length at least 16 bytes or
30226 a C99 variable-length array variable always has alignment of at least 16 bytes.
30228 This was added to allow use of aligned SSE instructions on arrays.  This
30229 rule is meant for static storage (where the compiler cannot do the
30230 analysis by itself).  We follow it for automatic variables only when
30231 convenient.  We fully control everything in the function being compiled,
30232 and functions from other units cannot rely on the alignment.
30234 Exclude the va_list type.  It is the common case of a local array where
30235 we cannot benefit from the alignment.
30237 TODO: Probably one should optimize for size only when var is not escaping. */
30238 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30239 && TARGET_SSE)
30241 if (AGGREGATE_TYPE_P (type)
30242 && (va_list_type_node == NULL_TREE
30243 || (TYPE_MAIN_VARIANT (type)
30244 != TYPE_MAIN_VARIANT (va_list_type_node)))
30245 && TYPE_SIZE (type)
30246 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30247 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
30248 && align < 128)
30249 return 128;
30251 if (TREE_CODE (type) == ARRAY_TYPE)
30253 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30254 return 64;
30255 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30256 return 128;
30258 else if (TREE_CODE (type) == COMPLEX_TYPE)
30260 if (TYPE_MODE (type) == DCmode && align < 64)
30261 return 64;
30262 if ((TYPE_MODE (type) == XCmode
30263 || TYPE_MODE (type) == TCmode) && align < 128)
30264 return 128;
30266 else if ((TREE_CODE (type) == RECORD_TYPE
30267 || TREE_CODE (type) == UNION_TYPE
30268 || TREE_CODE (type) == QUAL_UNION_TYPE)
30269 && TYPE_FIELDS (type))
30271 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30272 return 64;
30273 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30274 return 128;
30276 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30277 || TREE_CODE (type) == INTEGER_TYPE)
30280 if (TYPE_MODE (type) == DFmode && align < 64)
30281 return 64;
30282 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30283 return 128;
30285 return align;
30288 /* Compute the minimum required alignment for dynamic stack realignment
30289 purposes for a local variable, parameter or a stack slot. EXP is
30290 the data type or decl itself, MODE is its mode and ALIGN is the
30291 alignment that the object would ordinarily have. */
30293 unsigned int
30294 ix86_minimum_alignment (tree exp, machine_mode mode,
30295 unsigned int align)
30297 tree type, decl;
30299 if (exp && DECL_P (exp))
30301 type = TREE_TYPE (exp);
30302 decl = exp;
30304 else
30306 type = exp;
30307 decl = NULL;
30310 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30311 return align;
30313 /* Don't do dynamic stack realignment for long long objects with
30314 -mpreferred-stack-boundary=2. */
30315 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30316 && (!type || !TYPE_USER_ALIGN (type))
30317 && (!decl || !DECL_USER_ALIGN (decl)))
30319 gcc_checking_assert (!TARGET_STV);
30320 return 32;
30323 return align;
30326 /* Find a location for the static chain incoming to a nested function.
30327 This is a register, unless all free registers are used by arguments. */
30329 static rtx
30330 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30332 unsigned regno;
30334 if (TARGET_64BIT)
30336 /* We always use R10 in 64-bit mode. */
30337 regno = R10_REG;
30339 else
30341 const_tree fntype, fndecl;
30342 unsigned int ccvt;
30344 /* By default in 32-bit mode we use ECX to pass the static chain. */
30345 regno = CX_REG;
30347 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30349 fntype = TREE_TYPE (fndecl_or_type);
30350 fndecl = fndecl_or_type;
30352 else
30354 fntype = fndecl_or_type;
30355 fndecl = NULL;
30358 ccvt = ix86_get_callcvt (fntype);
30359 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30361 /* Fastcall functions use ecx/edx for arguments, which leaves
30362 us with EAX for the static chain.
30363 Thiscall functions use ecx for arguments, which also
30364 leaves us with EAX for the static chain. */
30365 regno = AX_REG;
30367 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30369 /* Thiscall functions use ecx for arguments, which leaves
30370 us with EAX and EDX for the static chain.
30371 For ABI compatibility we use EAX.  */
30372 regno = AX_REG;
30374 else if (ix86_function_regparm (fntype, fndecl) == 3)
30376 /* For regparm 3, we have no free call-clobbered registers in
30377 which to store the static chain. In order to implement this,
30378 we have the trampoline push the static chain to the stack.
30379 However, we can't push a value below the return address when
30380 we call the nested function directly, so we have to use an
30381 alternate entry point. For this we use ESI, and have the
30382 alternate entry point push ESI, so that things appear the
30383 same once we're executing the nested function. */
30384 if (incoming_p)
30386 if (fndecl == current_function_decl
30387 && !ix86_static_chain_on_stack)
30389 gcc_assert (!reload_completed);
30390 ix86_static_chain_on_stack = true;
30392 return gen_frame_mem (SImode,
30393 plus_constant (Pmode,
30394 arg_pointer_rtx, -8));
30396 regno = SI_REG;
30400 return gen_rtx_REG (Pmode, regno);
30403 /* Emit RTL insns to initialize the variable parts of a trampoline.
30404 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30405 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30406 to be passed to the target function. */
30408 static void
30409 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30411 rtx mem, fnaddr;
30412 int opcode;
30413 int offset = 0;
30415 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30417 if (TARGET_64BIT)
30419 int size;
30421 /* Load the function address to r11. Try to load address using
30422 the shorter movl instead of movabs. We may want to support
30423 movq for kernel mode, but kernel does not use trampolines at
30424 the moment. FNADDR is a 32bit address and may not be in
30425 DImode when ptr_mode == SImode. Always use movl in this
30426 case. */
30427 if (ptr_mode == SImode
30428 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30430 fnaddr = copy_addr_to_reg (fnaddr);
30432 mem = adjust_address (m_tramp, HImode, offset);
30433 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30435 mem = adjust_address (m_tramp, SImode, offset + 2);
30436 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30437 offset += 6;
30439 else
30441 mem = adjust_address (m_tramp, HImode, offset);
30442 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30444 mem = adjust_address (m_tramp, DImode, offset + 2);
30445 emit_move_insn (mem, fnaddr);
30446 offset += 10;
30449 /* Load static chain using movabs to r10. Use the shorter movl
30450 instead of movabs when ptr_mode == SImode. */
30451 if (ptr_mode == SImode)
30453 opcode = 0xba41;
30454 size = 6;
30456 else
30458 opcode = 0xba49;
30459 size = 10;
30462 mem = adjust_address (m_tramp, HImode, offset);
30463 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30465 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30466 emit_move_insn (mem, chain_value);
30467 offset += size;
30469 /* Jump to r11; the last (unused) byte is a nop, only there to
30470 pad the write out to a single 32-bit store. */
30471 mem = adjust_address (m_tramp, SImode, offset);
30472 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30473 offset += 4;
30475 else
30477 rtx disp, chain;
30479 /* Depending on the static chain location, either load a register
30480 with a constant, or push the constant to the stack. All of the
30481 instructions are the same size. */
30482 chain = ix86_static_chain (fndecl, true);
30483 if (REG_P (chain))
30485 switch (REGNO (chain))
30487 case AX_REG:
30488 opcode = 0xb8; break;
30489 case CX_REG:
30490 opcode = 0xb9; break;
30491 default:
30492 gcc_unreachable ();
30495 else
30496 opcode = 0x68;
30498 mem = adjust_address (m_tramp, QImode, offset);
30499 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30501 mem = adjust_address (m_tramp, SImode, offset + 1);
30502 emit_move_insn (mem, chain_value);
30503 offset += 5;
30505 mem = adjust_address (m_tramp, QImode, offset);
30506 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30508 mem = adjust_address (m_tramp, SImode, offset + 1);
30510 /* Compute offset from the end of the jmp to the target function.
30511 In the case in which the trampoline stores the static chain on
30512 the stack, we need to skip the first insn which pushes the
30513 (call-saved) register static chain; this push is 1 byte. */
30514 offset += 5;
30515 disp = expand_binop (SImode, sub_optab, fnaddr,
30516 plus_constant (Pmode, XEXP (m_tramp, 0),
30517 offset - (MEM_P (chain) ? 1 : 0)),
30518 NULL_RTX, 1, OPTAB_DIRECT);
30519 emit_move_insn (mem, disp);
30522 gcc_assert (offset <= TRAMPOLINE_SIZE);
30524 #ifdef HAVE_ENABLE_EXECUTE_STACK
30525 #ifdef CHECK_EXECUTE_STACK_ENABLED
30526 if (CHECK_EXECUTE_STACK_ENABLED)
30527 #endif
30528 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30529 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
30530 #endif
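/* For reference, with ptr_mode == DImode the 64-bit trampoline written
   above decodes roughly as follows (the function address may instead be
   loaded with a shorter movl into %r11d when it fits in 32 bits):

	49 bb <imm64>	movabs	$<fnaddr>, %r11
	49 ba <imm64>	movabs	$<chain_value>, %r10
	49 ff e3	rex.WB jmp *%r11
	90		nop	# pads the final 32-bit store
*/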
30533 static bool
30534 ix86_allocate_stack_slots_for_args (void)
30536 /* Naked functions should not allocate stack slots for arguments. */
30537 return !ix86_function_naked (current_function_decl);
30540 static bool
30541 ix86_warn_func_return (tree decl)
30543 /* Naked functions are implemented entirely in assembly, including the
30544 return sequence, so suppress warnings about this. */
30545 return !ix86_function_naked (decl);
30548 /* The following file contains several enumerations and data structures
30549 built from the definitions in i386-builtin-types.def. */
30551 #include "i386-builtin-types.inc"
30553 /* Table for the ix86 builtin non-function types. */
30554 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30556 /* Retrieve an element from the above table, building some of
30557 the types lazily. */
30559 static tree
30560 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30562 unsigned int index;
30563 tree type, itype;
30565 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30567 type = ix86_builtin_type_tab[(int) tcode];
30568 if (type != NULL)
30569 return type;
30571 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30572 if (tcode <= IX86_BT_LAST_VECT)
30574 machine_mode mode;
30576 index = tcode - IX86_BT_LAST_PRIM - 1;
30577 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30578 mode = ix86_builtin_type_vect_mode[index];
30580 type = build_vector_type_for_mode (itype, mode);
30582 else
30584 int quals;
30586 index = tcode - IX86_BT_LAST_VECT - 1;
30587 if (tcode <= IX86_BT_LAST_PTR)
30588 quals = TYPE_UNQUALIFIED;
30589 else
30590 quals = TYPE_QUAL_CONST;
30592 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30593 if (quals != TYPE_UNQUALIFIED)
30594 itype = build_qualified_type (itype, quals);
30596 type = build_pointer_type (itype);
30599 ix86_builtin_type_tab[(int) tcode] = type;
30600 return type;
30603 /* Table for the ix86 builtin function types. */
30604 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30606 /* Retrieve an element from the above table, building some of
30607 the types lazily. */
30609 static tree
30610 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30612 tree type;
30614 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30616 type = ix86_builtin_func_type_tab[(int) tcode];
30617 if (type != NULL)
30618 return type;
30620 if (tcode <= IX86_BT_LAST_FUNC)
30622 unsigned start = ix86_builtin_func_start[(int) tcode];
30623 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30624 tree rtype, atype, args = void_list_node;
30625 unsigned i;
30627 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30628 for (i = after - 1; i > start; --i)
30630 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30631 args = tree_cons (NULL, atype, args);
30634 type = build_function_type (rtype, args);
30636 else
30638 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30639 enum ix86_builtin_func_type icode;
30641 icode = ix86_builtin_func_alias_base[index];
30642 type = ix86_get_builtin_func_type (icode);
30645 ix86_builtin_func_type_tab[(int) tcode] = type;
30646 return type;
30650 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30651 bdesc_* arrays below should come first, then builtins for each bdesc_*
30652 array in ascending order, so that we can use direct array accesses. */
30653 enum ix86_builtins
30655 IX86_BUILTIN_MASKMOVQ,
30656 IX86_BUILTIN_LDMXCSR,
30657 IX86_BUILTIN_STMXCSR,
30658 IX86_BUILTIN_MASKMOVDQU,
30659 IX86_BUILTIN_PSLLDQ128,
30660 IX86_BUILTIN_CLFLUSH,
30661 IX86_BUILTIN_MONITOR,
30662 IX86_BUILTIN_MWAIT,
30663 IX86_BUILTIN_CLZERO,
30664 IX86_BUILTIN_VEC_INIT_V2SI,
30665 IX86_BUILTIN_VEC_INIT_V4HI,
30666 IX86_BUILTIN_VEC_INIT_V8QI,
30667 IX86_BUILTIN_VEC_EXT_V2DF,
30668 IX86_BUILTIN_VEC_EXT_V2DI,
30669 IX86_BUILTIN_VEC_EXT_V4SF,
30670 IX86_BUILTIN_VEC_EXT_V4SI,
30671 IX86_BUILTIN_VEC_EXT_V8HI,
30672 IX86_BUILTIN_VEC_EXT_V2SI,
30673 IX86_BUILTIN_VEC_EXT_V4HI,
30674 IX86_BUILTIN_VEC_EXT_V16QI,
30675 IX86_BUILTIN_VEC_SET_V2DI,
30676 IX86_BUILTIN_VEC_SET_V4SF,
30677 IX86_BUILTIN_VEC_SET_V4SI,
30678 IX86_BUILTIN_VEC_SET_V8HI,
30679 IX86_BUILTIN_VEC_SET_V4HI,
30680 IX86_BUILTIN_VEC_SET_V16QI,
30681 IX86_BUILTIN_GATHERSIV2DF,
30682 IX86_BUILTIN_GATHERSIV4DF,
30683 IX86_BUILTIN_GATHERDIV2DF,
30684 IX86_BUILTIN_GATHERDIV4DF,
30685 IX86_BUILTIN_GATHERSIV4SF,
30686 IX86_BUILTIN_GATHERSIV8SF,
30687 IX86_BUILTIN_GATHERDIV4SF,
30688 IX86_BUILTIN_GATHERDIV8SF,
30689 IX86_BUILTIN_GATHERSIV2DI,
30690 IX86_BUILTIN_GATHERSIV4DI,
30691 IX86_BUILTIN_GATHERDIV2DI,
30692 IX86_BUILTIN_GATHERDIV4DI,
30693 IX86_BUILTIN_GATHERSIV4SI,
30694 IX86_BUILTIN_GATHERSIV8SI,
30695 IX86_BUILTIN_GATHERDIV4SI,
30696 IX86_BUILTIN_GATHERDIV8SI,
30697 IX86_BUILTIN_VFMSUBSD3_MASK3,
30698 IX86_BUILTIN_VFMSUBSS3_MASK3,
30699 IX86_BUILTIN_GATHER3SIV8SF,
30700 IX86_BUILTIN_GATHER3SIV4SF,
30701 IX86_BUILTIN_GATHER3SIV4DF,
30702 IX86_BUILTIN_GATHER3SIV2DF,
30703 IX86_BUILTIN_GATHER3DIV8SF,
30704 IX86_BUILTIN_GATHER3DIV4SF,
30705 IX86_BUILTIN_GATHER3DIV4DF,
30706 IX86_BUILTIN_GATHER3DIV2DF,
30707 IX86_BUILTIN_GATHER3SIV8SI,
30708 IX86_BUILTIN_GATHER3SIV4SI,
30709 IX86_BUILTIN_GATHER3SIV4DI,
30710 IX86_BUILTIN_GATHER3SIV2DI,
30711 IX86_BUILTIN_GATHER3DIV8SI,
30712 IX86_BUILTIN_GATHER3DIV4SI,
30713 IX86_BUILTIN_GATHER3DIV4DI,
30714 IX86_BUILTIN_GATHER3DIV2DI,
30715 IX86_BUILTIN_SCATTERSIV8SF,
30716 IX86_BUILTIN_SCATTERSIV4SF,
30717 IX86_BUILTIN_SCATTERSIV4DF,
30718 IX86_BUILTIN_SCATTERSIV2DF,
30719 IX86_BUILTIN_SCATTERDIV8SF,
30720 IX86_BUILTIN_SCATTERDIV4SF,
30721 IX86_BUILTIN_SCATTERDIV4DF,
30722 IX86_BUILTIN_SCATTERDIV2DF,
30723 IX86_BUILTIN_SCATTERSIV8SI,
30724 IX86_BUILTIN_SCATTERSIV4SI,
30725 IX86_BUILTIN_SCATTERSIV4DI,
30726 IX86_BUILTIN_SCATTERSIV2DI,
30727 IX86_BUILTIN_SCATTERDIV8SI,
30728 IX86_BUILTIN_SCATTERDIV4SI,
30729 IX86_BUILTIN_SCATTERDIV4DI,
30730 IX86_BUILTIN_SCATTERDIV2DI,
30731 /* Alternate 4 and 8 element gather/scatter for the vectorizer
30732 where all operands are 32-byte or 64-byte wide respectively. */
30733 IX86_BUILTIN_GATHERALTSIV4DF,
30734 IX86_BUILTIN_GATHERALTDIV8SF,
30735 IX86_BUILTIN_GATHERALTSIV4DI,
30736 IX86_BUILTIN_GATHERALTDIV8SI,
30737 IX86_BUILTIN_GATHER3ALTDIV16SF,
30738 IX86_BUILTIN_GATHER3ALTDIV16SI,
30739 IX86_BUILTIN_GATHER3ALTSIV4DF,
30740 IX86_BUILTIN_GATHER3ALTDIV8SF,
30741 IX86_BUILTIN_GATHER3ALTSIV4DI,
30742 IX86_BUILTIN_GATHER3ALTDIV8SI,
30743 IX86_BUILTIN_GATHER3ALTSIV8DF,
30744 IX86_BUILTIN_GATHER3ALTSIV8DI,
30745 IX86_BUILTIN_GATHER3DIV16SF,
30746 IX86_BUILTIN_GATHER3DIV16SI,
30747 IX86_BUILTIN_GATHER3DIV8DF,
30748 IX86_BUILTIN_GATHER3DIV8DI,
30749 IX86_BUILTIN_GATHER3SIV16SF,
30750 IX86_BUILTIN_GATHER3SIV16SI,
30751 IX86_BUILTIN_GATHER3SIV8DF,
30752 IX86_BUILTIN_GATHER3SIV8DI,
30753 IX86_BUILTIN_SCATTERALTSIV8DF,
30754 IX86_BUILTIN_SCATTERALTDIV16SF,
30755 IX86_BUILTIN_SCATTERALTSIV8DI,
30756 IX86_BUILTIN_SCATTERALTDIV16SI,
30757 IX86_BUILTIN_SCATTERDIV16SF,
30758 IX86_BUILTIN_SCATTERDIV16SI,
30759 IX86_BUILTIN_SCATTERDIV8DF,
30760 IX86_BUILTIN_SCATTERDIV8DI,
30761 IX86_BUILTIN_SCATTERSIV16SF,
30762 IX86_BUILTIN_SCATTERSIV16SI,
30763 IX86_BUILTIN_SCATTERSIV8DF,
30764 IX86_BUILTIN_SCATTERSIV8DI,
30765 IX86_BUILTIN_GATHERPFQPD,
30766 IX86_BUILTIN_GATHERPFDPS,
30767 IX86_BUILTIN_GATHERPFDPD,
30768 IX86_BUILTIN_GATHERPFQPS,
30769 IX86_BUILTIN_SCATTERPFDPD,
30770 IX86_BUILTIN_SCATTERPFDPS,
30771 IX86_BUILTIN_SCATTERPFQPD,
30772 IX86_BUILTIN_SCATTERPFQPS,
30773 IX86_BUILTIN_CLWB,
30774 IX86_BUILTIN_CLFLUSHOPT,
30775 IX86_BUILTIN_INFQ,
30776 IX86_BUILTIN_HUGE_VALQ,
30777 IX86_BUILTIN_NANQ,
30778 IX86_BUILTIN_NANSQ,
30779 IX86_BUILTIN_XABORT,
30780 IX86_BUILTIN_ADDCARRYX32,
30781 IX86_BUILTIN_ADDCARRYX64,
30782 IX86_BUILTIN_SBB32,
30783 IX86_BUILTIN_SBB64,
30784 IX86_BUILTIN_RDRAND16_STEP,
30785 IX86_BUILTIN_RDRAND32_STEP,
30786 IX86_BUILTIN_RDRAND64_STEP,
30787 IX86_BUILTIN_RDSEED16_STEP,
30788 IX86_BUILTIN_RDSEED32_STEP,
30789 IX86_BUILTIN_RDSEED64_STEP,
30790 IX86_BUILTIN_MONITORX,
30791 IX86_BUILTIN_MWAITX,
30792 IX86_BUILTIN_CFSTRING,
30793 IX86_BUILTIN_CPU_INIT,
30794 IX86_BUILTIN_CPU_IS,
30795 IX86_BUILTIN_CPU_SUPPORTS,
30796 IX86_BUILTIN_READ_FLAGS,
30797 IX86_BUILTIN_WRITE_FLAGS,
30799 /* All the remaining builtins are tracked in bdesc_* arrays in
30800 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
30801 this point. */
30802 #define BDESC(mask, icode, name, code, comparison, flag) \
30803 code,
30804 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30805 code, \
30806 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
30807 #define BDESC_END(kind, next_kind)
30809 #include "i386-builtin.def"
30811 #undef BDESC
30812 #undef BDESC_FIRST
30813 #undef BDESC_END
30815 IX86_BUILTIN_MAX,
30817 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
30819 /* Now just the aliases for bdesc_* start/end. */
30820 #define BDESC(mask, icode, name, code, comparison, flag)
30821 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
30822 #define BDESC_END(kind, next_kind) \
30823 IX86_BUILTIN__BDESC_##kind##_LAST \
30824 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
30826 #include "i386-builtin.def"
30828 #undef BDESC
30829 #undef BDESC_FIRST
30830 #undef BDESC_END
30832 /* Just to make sure there is no comma after the last enumerator. */
30833 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
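/* Illustrative sketch of the X-macro expansion above (the entry shown is
   hypothetical and not part of the build): for an i386-builtin.def line

     BDESC_FIRST (args, ARGS, OPTION_MASK_ISA_SSE2, CODE_FOR_nothing,
                  "__builtin_ia32_example", IX86_BUILTIN_EXAMPLE, UNKNOWN, 0)

   the first inclusion of i386-builtin.def contributes

     IX86_BUILTIN_EXAMPLE,
     IX86_BUILTIN__BDESC_ARGS_FIRST = IX86_BUILTIN_EXAMPLE,

   while the second inclusion only emits the matching
   IX86_BUILTIN__BDESC_<kind>_LAST aliases, so every bdesc_* table ends up
   with a contiguous enumerator range bracketed by _FIRST/_LAST.  */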
30836 /* Table for the ix86 builtin decls. */
30837 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
30839 /* Table of all the builtin functions that are possible with different ISAs
30840 but are waiting to be built until a function is declared to use that
30841 ISA. */
30842 struct builtin_isa {
30843 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
30844 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
30845 const char *name; /* function name */
30846 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
30847 unsigned char const_p:1; /* true if the declaration is constant */
30848 unsigned char pure_p:1; /* true if the declaration has pure attribute */
30849 bool leaf_p; /* true if the declaration has leaf attribute */
30850 bool nothrow_p; /* true if the declaration has nothrow attribute */
30851 bool set_and_not_built_p;
30854 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
30856 /* Bits that can still trigger the inclusion of a deferred builtin. */
30857 static HOST_WIDE_INT deferred_isa_values = 0;
30858 static HOST_WIDE_INT deferred_isa_values2 = 0;
30860 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
30861 of which isa_flags to use in the ix86_builtins_isa array. Stores the
30862 function decl in the ix86_builtins array. Returns the function decl or
30863 NULL_TREE, if the builtin was not added.
30865 If the front end has a special hook for builtin functions, delay adding
30866 builtin functions that aren't in the current ISA until the ISA is changed
30867 with function-specific optimization. Doing so can save about 300K for the
30868 default compiler. When the builtin is expanded, check at that time whether
30869 it is valid.
30871 If the front end doesn't have a special hook, record all builtins, even
30872 those whose instruction set isn't in the current ISA, in case the user uses
30873 function-specific options for a different ISA, so that we don't get scope
30874 errors if a builtin is added in the middle of a function scope. */
30876 static inline tree
30877 def_builtin (HOST_WIDE_INT mask, const char *name,
30878 enum ix86_builtin_func_type tcode,
30879 enum ix86_builtins code)
30881 tree decl = NULL_TREE;
30883 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
30885 ix86_builtins_isa[(int) code].isa = mask;
30887 mask &= ~OPTION_MASK_ISA_64BIT;
30889 /* Filter out the masks most often ORed together with others. */
30890 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
30891 && mask != OPTION_MASK_ISA_AVX512VL)
30892 mask &= ~OPTION_MASK_ISA_AVX512VL;
30893 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW)
30894 && mask != OPTION_MASK_ISA_AVX512BW)
30895 mask &= ~OPTION_MASK_ISA_AVX512BW;
30897 if (mask == 0
30898 || (mask & ix86_isa_flags) != 0
30899 || (lang_hooks.builtin_function
30900 == lang_hooks.builtin_function_ext_scope))
30902 tree type = ix86_get_builtin_func_type (tcode);
30903 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30904 NULL, NULL_TREE);
30905 ix86_builtins[(int) code] = decl;
30906 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30908 else
30910 /* Record the MASK; only a mask recorded with set_and_not_built_p == true
30911 can still cause this deferred builtin to be added later. */
30912 deferred_isa_values |= mask;
30913 ix86_builtins[(int) code] = NULL_TREE;
30914 ix86_builtins_isa[(int) code].tcode = tcode;
30915 ix86_builtins_isa[(int) code].name = name;
30916 ix86_builtins_isa[(int) code].leaf_p = false;
30917 ix86_builtins_isa[(int) code].nothrow_p = false;
30918 ix86_builtins_isa[(int) code].const_p = false;
30919 ix86_builtins_isa[(int) code].pure_p = false;
30920 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30924 return decl;
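/* Usage sketch (illustrative only; IX86_BUILTIN_EXAMPLE and its name are
   hypothetical): a call such as

     def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_example",
                  V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_EXAMPLE);

   creates the decl immediately when AVX2 is enabled (or when the front end
   registers builtins at file scope anyway); otherwise it only records the
   mask, name and type in ix86_builtins_isa and sets the AVX2 bit in
   deferred_isa_values, leaving ix86_add_new_builtins to build the decl once
   a target attribute or pragma turns the ISA on.  */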
30927 /* Like def_builtin, but also marks the function decl "const". */
30929 static inline tree
30930 def_builtin_const (HOST_WIDE_INT mask, const char *name,
30931 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30933 tree decl = def_builtin (mask, name, tcode, code);
30934 if (decl)
30935 TREE_READONLY (decl) = 1;
30936 else
30937 ix86_builtins_isa[(int) code].const_p = true;
30939 return decl;
30942 /* Like def_builtin, but also marks the function decl "pure". */
30944 static inline tree
30945 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
30946 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30948 tree decl = def_builtin (mask, name, tcode, code);
30949 if (decl)
30950 DECL_PURE_P (decl) = 1;
30951 else
30952 ix86_builtins_isa[(int) code].pure_p = true;
30954 return decl;
30957 /* Like def_builtin, but for additional isa2 flags. */
30959 static inline tree
30960 def_builtin2 (HOST_WIDE_INT mask, const char *name,
30961 enum ix86_builtin_func_type tcode,
30962 enum ix86_builtins code)
30964 tree decl = NULL_TREE;
30966 ix86_builtins_isa[(int) code].isa2 = mask;
30968 if (mask == 0
30969 || (mask & ix86_isa_flags2) != 0
30970 || (lang_hooks.builtin_function
30971 == lang_hooks.builtin_function_ext_scope))
30974 tree type = ix86_get_builtin_func_type (tcode);
30975 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30976 NULL, NULL_TREE);
30977 ix86_builtins[(int) code] = decl;
30978 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30980 else
30982 /* Record the MASK; only a mask recorded with set_and_not_built_p == true
30983 can still cause this deferred builtin to be added later. */
30984 deferred_isa_values2 |= mask;
30985 ix86_builtins[(int) code] = NULL_TREE;
30986 ix86_builtins_isa[(int) code].tcode = tcode;
30987 ix86_builtins_isa[(int) code].name = name;
30988 ix86_builtins_isa[(int) code].leaf_p = false;
30989 ix86_builtins_isa[(int) code].nothrow_p = false;
30990 ix86_builtins_isa[(int) code].const_p = false;
30991 ix86_builtins_isa[(int) code].pure_p = false;
30992 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30995 return decl;
30998 /* Like def_builtin2, but also marks the function decl "const". */
31000 static inline tree
31001 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
31002 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
31004 tree decl = def_builtin2 (mask, name, tcode, code);
31005 if (decl)
31006 TREE_READONLY (decl) = 1;
31007 else
31008 ix86_builtins_isa[(int) code].const_p = true;
31010 return decl;
31013 /* Like def_builtin2, but also marks the function decl "pure". */
31015 static inline tree
31016 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
31017 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
31019 tree decl = def_builtin2 (mask, name, tcode, code);
31020 if (decl)
31021 DECL_PURE_P (decl) = 1;
31022 else
31023 ix86_builtins_isa[(int) code].pure_p = true;
31025 return decl;
31028 /* Add any new builtin functions for a given ISA that may not have been
31029 declared. This saves a bit of space compared to adding all of the
31030 declarations to the tree, even if we didn't use them. */
31032 static void
31033 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
31035 isa &= ~OPTION_MASK_ISA_64BIT;
31037 if ((isa & deferred_isa_values) == 0
31038 && (isa2 & deferred_isa_values2) == 0)
31039 return;
31041 /* Bits of the ISA being enabled now can be removed from the deferred isa values. */
31042 deferred_isa_values &= ~isa;
31043 deferred_isa_values2 &= ~isa2;
31045 int i;
31046 tree saved_current_target_pragma = current_target_pragma;
31047 current_target_pragma = NULL_TREE;
31049 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
31051 if (((ix86_builtins_isa[i].isa & isa) != 0
31052 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
31053 && ix86_builtins_isa[i].set_and_not_built_p)
31055 tree decl, type;
31057 /* Don't define the builtin again. */
31058 ix86_builtins_isa[i].set_and_not_built_p = false;
31060 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
31061 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
31062 type, i, BUILT_IN_MD, NULL,
31063 NULL_TREE);
31065 ix86_builtins[i] = decl;
31066 if (ix86_builtins_isa[i].const_p)
31067 TREE_READONLY (decl) = 1;
31068 if (ix86_builtins_isa[i].pure_p)
31069 DECL_PURE_P (decl) = 1;
31070 if (ix86_builtins_isa[i].leaf_p)
31071 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31072 NULL_TREE);
31073 if (ix86_builtins_isa[i].nothrow_p)
31074 TREE_NOTHROW (decl) = 1;
31078 current_target_pragma = saved_current_target_pragma;
31081 /* Bits for builtin_description.flag. */
31083 /* Set when we don't support the comparison natively, and should
31084 swap_comparison in order to support it. */
31085 #define BUILTIN_DESC_SWAP_OPERANDS 1
31087 struct builtin_description
31089 const HOST_WIDE_INT mask;
31090 const enum insn_code icode;
31091 const char *const name;
31092 const enum ix86_builtins code;
31093 const enum rtx_code comparison;
31094 const int flag;
31097 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
31098 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
31099 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
31100 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
31101 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
31102 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
31103 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
31104 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
31105 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
31106 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
31107 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
31108 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
31109 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
31110 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
31111 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
31112 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
31113 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
31114 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
31115 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
31116 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
31117 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
31118 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
31119 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
31120 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
31121 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
31122 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
31123 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
31124 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
31125 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
31126 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
31127 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
31128 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
31129 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
31130 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
31131 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
31132 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
31133 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
31134 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
31135 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
31136 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
31137 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
31138 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
31139 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
31140 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
31141 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
31142 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
31143 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
31144 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
31145 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
31146 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
31147 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
31148 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
31150 #define BDESC(mask, icode, name, code, comparison, flag) \
31151 { mask, icode, name, code, comparison, flag },
31152 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31153 static const struct builtin_description bdesc_##kind[] = \
31155 BDESC (mask, icode, name, code, comparison, flag)
31156 #define BDESC_END(kind, next_kind) \
31159 #include "i386-builtin.def"
31161 #undef BDESC
31162 #undef BDESC_FIRST
31163 #undef BDESC_END
31165 /* TM vector builtins. */
31167 /* Reuse the existing x86-specific `struct builtin_description' because
31168 we're lazy. Add casts to make them fit. */
31169 static const struct builtin_description bdesc_tm[] =
31171 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31172 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31173 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31174 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31175 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31176 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31177 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31179 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31180 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31181 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31182 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31183 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31184 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31185 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31187 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31188 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31189 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31190 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31191 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31192 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31193 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31195 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
31196 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31197 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31200 /* Initialize the transactional memory vector load/store builtins. */
31202 static void
31203 ix86_init_tm_builtins (void)
31205 enum ix86_builtin_func_type ftype;
31206 const struct builtin_description *d;
31207 size_t i;
31208 tree decl;
31209 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31210 tree attrs_log, attrs_type_log;
31212 if (!flag_tm)
31213 return;
31215 /* If there are no builtins defined, we must be compiling in a
31216 language without trans-mem support. */
31217 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31218 return;
31220 /* Use whatever attributes a normal TM load has. */
31221 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31222 attrs_load = DECL_ATTRIBUTES (decl);
31223 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31224 /* Use whatever attributes a normal TM store has. */
31225 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31226 attrs_store = DECL_ATTRIBUTES (decl);
31227 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31228 /* Use whatever attributes a normal TM log has. */
31229 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31230 attrs_log = DECL_ATTRIBUTES (decl);
31231 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31233 for (i = 0, d = bdesc_tm;
31234 i < ARRAY_SIZE (bdesc_tm);
31235 i++, d++)
31237 if ((d->mask & ix86_isa_flags) != 0
31238 || (lang_hooks.builtin_function
31239 == lang_hooks.builtin_function_ext_scope))
31241 tree type, attrs, attrs_type;
31242 enum built_in_function code = (enum built_in_function) d->code;
31244 ftype = (enum ix86_builtin_func_type) d->flag;
31245 type = ix86_get_builtin_func_type (ftype);
31247 if (BUILTIN_TM_LOAD_P (code))
31249 attrs = attrs_load;
31250 attrs_type = attrs_type_load;
31252 else if (BUILTIN_TM_STORE_P (code))
31254 attrs = attrs_store;
31255 attrs_type = attrs_type_store;
31257 else
31259 attrs = attrs_log;
31260 attrs_type = attrs_type_log;
31262 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31263 /* The builtin without the prefix for
31264 calling it directly. */
31265 d->name + strlen ("__builtin_"),
31266 attrs);
31267 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31268 set the TYPE_ATTRIBUTES. */
31269 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31271 set_builtin_decl (code, decl, false);
31276 /* Macros for verification of enum ix86_builtins order. */
31277 #define BDESC_VERIFY(x, y, z) \
31278 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31279 #define BDESC_VERIFYS(x, y, z) \
31280 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
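/* What the chain of checks below guarantees (sketch): each BDESC_VERIFYS
   asserts that the _FIRST enumerator of one bdesc_* range is exactly one
   past the _LAST enumerator of the preceding range, e.g.

     IX86_BUILTIN__BDESC_PCMPESTR_FIRST == IX86_BUILTIN__BDESC_COMI_LAST + 1

   and each BDESC_VERIFY in the init loops checks d->code against
   <range>_FIRST + i, so an enumerator inserted out of order breaks the
   build instead of silently skewing the enum/table-index correspondence.  */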
31282 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31283 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31284 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31285 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31286 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31287 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31288 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31289 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31290 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31291 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31292 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
31293 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31294 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
31295 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
31296 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31297 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST, 1);
31298 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31299 IX86_BUILTIN__BDESC_MPX_LAST, 1);
31300 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31301 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31302 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
31303 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31304 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
31305 IX86_BUILTIN__BDESC_CET_LAST, 1);
31306 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31307 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
31309 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
31310 in the current target ISA, to allow the user to compile particular modules
31311 with target-specific options that differ from the command-line
31312 options. */
31313 static void
31314 ix86_init_mmx_sse_builtins (void)
31316 const struct builtin_description * d;
31317 enum ix86_builtin_func_type ftype;
31318 size_t i;
31320 /* Add all special builtins with variable number of operands. */
31321 for (i = 0, d = bdesc_special_args;
31322 i < ARRAY_SIZE (bdesc_special_args);
31323 i++, d++)
31325 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31326 if (d->name == 0)
31327 continue;
31329 ftype = (enum ix86_builtin_func_type) d->flag;
31330 def_builtin (d->mask, d->name, ftype, d->code);
31332 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31333 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31334 ARRAY_SIZE (bdesc_special_args) - 1);
31336 /* Add all special builtins with variable number of operands (second ISA flag set). */
31337 for (i = 0, d = bdesc_special_args2;
31338 i < ARRAY_SIZE (bdesc_special_args2);
31339 i++, d++)
31341 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST, i);
31342 if (d->name == 0)
31343 continue;
31345 ftype = (enum ix86_builtin_func_type) d->flag;
31346 def_builtin2 (d->mask, d->name, ftype, d->code);
31348 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST,
31349 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
31350 ARRAY_SIZE (bdesc_special_args2) - 1);
31352 /* Add all builtins with variable number of operands. */
31353 for (i = 0, d = bdesc_args;
31354 i < ARRAY_SIZE (bdesc_args);
31355 i++, d++)
31357 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31358 if (d->name == 0)
31359 continue;
31361 ftype = (enum ix86_builtin_func_type) d->flag;
31362 def_builtin_const (d->mask, d->name, ftype, d->code);
31364 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31365 IX86_BUILTIN__BDESC_ARGS_FIRST,
31366 ARRAY_SIZE (bdesc_args) - 1);
31368 /* Add all builtins with variable number of operands (second ISA flag set). */
31369 for (i = 0, d = bdesc_args2;
31370 i < ARRAY_SIZE (bdesc_args2);
31371 i++, d++)
31373 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
31374 if (d->name == 0)
31375 continue;
31377 ftype = (enum ix86_builtin_func_type) d->flag;
31378 def_builtin_const2 (d->mask, d->name, ftype, d->code);
31380 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
31381 IX86_BUILTIN__BDESC_ARGS2_FIRST,
31382 ARRAY_SIZE (bdesc_args2) - 1);
31384 /* Add all builtins with rounding. */
31385 for (i = 0, d = bdesc_round_args;
31386 i < ARRAY_SIZE (bdesc_round_args);
31387 i++, d++)
31389 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31390 if (d->name == 0)
31391 continue;
31393 ftype = (enum ix86_builtin_func_type) d->flag;
31394 def_builtin_const (d->mask, d->name, ftype, d->code);
31396 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31397 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31398 ARRAY_SIZE (bdesc_round_args) - 1);
31400 /* pcmpestr[im] insns. */
31401 for (i = 0, d = bdesc_pcmpestr;
31402 i < ARRAY_SIZE (bdesc_pcmpestr);
31403 i++, d++)
31405 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31406 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31407 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31408 else
31409 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31410 def_builtin_const (d->mask, d->name, ftype, d->code);
31412 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31413 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31414 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31416 /* pcmpistr[im] insns. */
31417 for (i = 0, d = bdesc_pcmpistr;
31418 i < ARRAY_SIZE (bdesc_pcmpistr);
31419 i++, d++)
31421 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31422 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31423 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31424 else
31425 ftype = INT_FTYPE_V16QI_V16QI_INT;
31426 def_builtin_const (d->mask, d->name, ftype, d->code);
31428 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31429 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31430 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31432 /* comi/ucomi insns. */
31433 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31435 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31436 if (d->mask == OPTION_MASK_ISA_SSE2)
31437 ftype = INT_FTYPE_V2DF_V2DF;
31438 else
31439 ftype = INT_FTYPE_V4SF_V4SF;
31440 def_builtin_const (d->mask, d->name, ftype, d->code);
31442 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31443 IX86_BUILTIN__BDESC_COMI_FIRST,
31444 ARRAY_SIZE (bdesc_comi) - 1);
31446 /* SSE */
31447 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31448 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31449 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31450 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31452 /* SSE or 3DNow!A */
31453 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31454 /* As it uses V4HImode, we have to require -mmmx too. */
31455 | OPTION_MASK_ISA_MMX,
31456 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31457 IX86_BUILTIN_MASKMOVQ);
31459 /* SSE2 */
31460 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31461 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31463 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31464 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31465 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31466 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31468 /* SSE3. */
31469 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31470 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31471 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31472 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31474 /* AES */
31475 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31476 "__builtin_ia32_aesenc128",
31477 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31478 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31479 "__builtin_ia32_aesenclast128",
31480 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31481 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31482 "__builtin_ia32_aesdec128",
31483 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31484 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31485 "__builtin_ia32_aesdeclast128",
31486 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31487 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31488 "__builtin_ia32_aesimc128",
31489 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31490 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31491 "__builtin_ia32_aeskeygenassist128",
31492 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31494 /* PCLMUL */
31495 def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2,
31496 "__builtin_ia32_pclmulqdq128",
31497 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31499 /* RDRND */
31500 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31501 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31502 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31503 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31504 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31505 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31506 IX86_BUILTIN_RDRAND64_STEP);
31508 /* AVX2 */
31509 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31510 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31511 IX86_BUILTIN_GATHERSIV2DF);
31513 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31514 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31515 IX86_BUILTIN_GATHERSIV4DF);
31517 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31518 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31519 IX86_BUILTIN_GATHERDIV2DF);
31521 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31522 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31523 IX86_BUILTIN_GATHERDIV4DF);
31525 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31526 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31527 IX86_BUILTIN_GATHERSIV4SF);
31529 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31530 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31531 IX86_BUILTIN_GATHERSIV8SF);
31533 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31534 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31535 IX86_BUILTIN_GATHERDIV4SF);
31537 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31538 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31539 IX86_BUILTIN_GATHERDIV8SF);
31541 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31542 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31543 IX86_BUILTIN_GATHERSIV2DI);
31545 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31546 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31547 IX86_BUILTIN_GATHERSIV4DI);
31549 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31550 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31551 IX86_BUILTIN_GATHERDIV2DI);
31553 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31554 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31555 IX86_BUILTIN_GATHERDIV4DI);
31557 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31558 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31559 IX86_BUILTIN_GATHERSIV4SI);
31561 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31562 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31563 IX86_BUILTIN_GATHERSIV8SI);
31565 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31566 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31567 IX86_BUILTIN_GATHERDIV4SI);
31569 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31570 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31571 IX86_BUILTIN_GATHERDIV8SI);
31573 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31574 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31575 IX86_BUILTIN_GATHERALTSIV4DF);
31577 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31578 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31579 IX86_BUILTIN_GATHERALTDIV8SF);
31581 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31582 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31583 IX86_BUILTIN_GATHERALTSIV4DI);
31585 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31586 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31587 IX86_BUILTIN_GATHERALTDIV8SI);
31589 /* AVX512F */
31590 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31591 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
31592 IX86_BUILTIN_GATHER3SIV16SF);
31594 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31595 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
31596 IX86_BUILTIN_GATHER3SIV8DF);
31598 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31599 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
31600 IX86_BUILTIN_GATHER3DIV16SF);
31602 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31603 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
31604 IX86_BUILTIN_GATHER3DIV8DF);
31606 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31607 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
31608 IX86_BUILTIN_GATHER3SIV16SI);
31610 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31611 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
31612 IX86_BUILTIN_GATHER3SIV8DI);
31614 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31615 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
31616 IX86_BUILTIN_GATHER3DIV16SI);
31618 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31619 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
31620 IX86_BUILTIN_GATHER3DIV8DI);
31622 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31623 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31624 IX86_BUILTIN_GATHER3ALTSIV8DF);
31626 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31627 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31628 IX86_BUILTIN_GATHER3ALTDIV16SF);
31630 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31631 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31632 IX86_BUILTIN_GATHER3ALTSIV8DI);
31634 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31635 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31636 IX86_BUILTIN_GATHER3ALTDIV16SI);
31638 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31639 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
31640 IX86_BUILTIN_SCATTERSIV16SF);
31642 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31643 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
31644 IX86_BUILTIN_SCATTERSIV8DF);
31646 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31647 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
31648 IX86_BUILTIN_SCATTERDIV16SF);
31650 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31651 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
31652 IX86_BUILTIN_SCATTERDIV8DF);
31654 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31655 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
31656 IX86_BUILTIN_SCATTERSIV16SI);
31658 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31659 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
31660 IX86_BUILTIN_SCATTERSIV8DI);
31662 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31663 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
31664 IX86_BUILTIN_SCATTERDIV16SI);
31666 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31667 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
31668 IX86_BUILTIN_SCATTERDIV8DI);
31670 /* AVX512VL */
31671 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31672 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
31673 IX86_BUILTIN_GATHER3SIV2DF);
31675 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31676 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
31677 IX86_BUILTIN_GATHER3SIV4DF);
31679 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31680 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
31681 IX86_BUILTIN_GATHER3DIV2DF);
31683 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31684 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
31685 IX86_BUILTIN_GATHER3DIV4DF);
31687 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31688 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
31689 IX86_BUILTIN_GATHER3SIV4SF);
31691 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31692 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
31693 IX86_BUILTIN_GATHER3SIV8SF);
31695 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31696 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
31697 IX86_BUILTIN_GATHER3DIV4SF);
31699 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31700 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
31701 IX86_BUILTIN_GATHER3DIV8SF);
31703 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31704 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
31705 IX86_BUILTIN_GATHER3SIV2DI);
31707 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31708 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
31709 IX86_BUILTIN_GATHER3SIV4DI);
31711 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31712 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
31713 IX86_BUILTIN_GATHER3DIV2DI);
31715 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31716 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
31717 IX86_BUILTIN_GATHER3DIV4DI);
31719 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31720 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
31721 IX86_BUILTIN_GATHER3SIV4SI);
31723 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31724 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
31725 IX86_BUILTIN_GATHER3SIV8SI);
31727 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31728 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
31729 IX86_BUILTIN_GATHER3DIV4SI);
31731 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31732 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
31733 IX86_BUILTIN_GATHER3DIV8SI);
31735 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31736 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31737 IX86_BUILTIN_GATHER3ALTSIV4DF);
31739 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31740 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31741 IX86_BUILTIN_GATHER3ALTDIV8SF);
31743 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31744 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31745 IX86_BUILTIN_GATHER3ALTSIV4DI);
31747 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31748 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31749 IX86_BUILTIN_GATHER3ALTDIV8SI);
31751 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31752 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
31753 IX86_BUILTIN_SCATTERSIV8SF);
31755 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31756 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
31757 IX86_BUILTIN_SCATTERSIV4SF);
31759 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31760 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
31761 IX86_BUILTIN_SCATTERSIV4DF);
31763 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31764 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
31765 IX86_BUILTIN_SCATTERSIV2DF);
31767 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31768 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
31769 IX86_BUILTIN_SCATTERDIV8SF);
31771 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31772 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
31773 IX86_BUILTIN_SCATTERDIV4SF);
31775 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31776 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
31777 IX86_BUILTIN_SCATTERDIV4DF);
31779 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31780 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
31781 IX86_BUILTIN_SCATTERDIV2DF);
31783 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31784 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
31785 IX86_BUILTIN_SCATTERSIV8SI);
31787 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31788 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
31789 IX86_BUILTIN_SCATTERSIV4SI);
31791 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31792 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
31793 IX86_BUILTIN_SCATTERSIV4DI);
31795 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31796 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
31797 IX86_BUILTIN_SCATTERSIV2DI);
31799 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31800 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
31801 IX86_BUILTIN_SCATTERDIV8SI);
31803 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31804 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
31805 IX86_BUILTIN_SCATTERDIV4SI);
31807 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31808 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
31809 IX86_BUILTIN_SCATTERDIV4DI);
31811 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31812 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
31813 IX86_BUILTIN_SCATTERDIV2DI);
31814 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31815 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31816 IX86_BUILTIN_SCATTERALTSIV8DF);
31818 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31819 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31820 IX86_BUILTIN_SCATTERALTDIV16SF);
31822 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31823 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31824 IX86_BUILTIN_SCATTERALTSIV8DI);
31826 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31827 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31828 IX86_BUILTIN_SCATTERALTDIV16SI);
31830 /* AVX512PF */
31831 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31832 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31833 IX86_BUILTIN_GATHERPFDPD);
31834 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31835 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31836 IX86_BUILTIN_GATHERPFDPS);
31837 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31838 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31839 IX86_BUILTIN_GATHERPFQPD);
31840 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31841 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31842 IX86_BUILTIN_GATHERPFQPS);
31843 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31844 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31845 IX86_BUILTIN_SCATTERPFDPD);
31846 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31847 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31848 IX86_BUILTIN_SCATTERPFDPS);
31849 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31850 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31851 IX86_BUILTIN_SCATTERPFQPD);
31852 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31853 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31854 IX86_BUILTIN_SCATTERPFQPS);
31856 /* SHA */
31857 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31858 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31859 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31860 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31861 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31862 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31863 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31864 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31865 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31866 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31867 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31868 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31869 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31870 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31872 /* RTM. */
31873 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31874 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
31876 /* MMX access to the vec_init patterns. */
31877 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
31878 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
31880 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
31881 V4HI_FTYPE_HI_HI_HI_HI,
31882 IX86_BUILTIN_VEC_INIT_V4HI);
31884 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
31885 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
31886 IX86_BUILTIN_VEC_INIT_V8QI);
31888 /* Access to the vec_extract patterns. */
31889 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
31890 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
31891 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
31892 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
31893 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
31894 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
31895 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
31896 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
31897 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
31898 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
31900 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31901 /* As it uses V4HImode, we have to require -mmmx too. */
31902 | OPTION_MASK_ISA_MMX,
31903 "__builtin_ia32_vec_ext_v4hi",
31904 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
31906 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
31907 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
31909 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
31910 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
31912 /* Access to the vec_set patterns. */
31913 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
31914 "__builtin_ia32_vec_set_v2di",
31915 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
31917 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
31918 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
31920 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
31921 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
31923 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
31924 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
31926 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31927 /* As it uses V4HImode, we have to require -mmmx too. */
31928 | OPTION_MASK_ISA_MMX,
31929 "__builtin_ia32_vec_set_v4hi",
31930 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
31932 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
31933 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
31935 /* RDSEED */
31936 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
31937 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
31938 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
31939 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
31940 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
31941 "__builtin_ia32_rdseed_di_step",
31942 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
31944 /* ADCX */
31945 def_builtin (0, "__builtin_ia32_addcarryx_u32",
31946 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
31947 def_builtin (OPTION_MASK_ISA_64BIT,
31948 "__builtin_ia32_addcarryx_u64",
31949 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31950 IX86_BUILTIN_ADDCARRYX64);
31952 /* SBB */
31953 def_builtin (0, "__builtin_ia32_sbb_u32",
31954 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
31955 def_builtin (OPTION_MASK_ISA_64BIT,
31956 "__builtin_ia32_sbb_u64",
31957 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31958 IX86_BUILTIN_SBB64);
31960 /* Read/write FLAGS. */
31961 def_builtin (0, "__builtin_ia32_readeflags_u32",
31962 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31963 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
31964 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31965 def_builtin (0, "__builtin_ia32_writeeflags_u32",
31966 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
31967 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
31968 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
31970 /* CLFLUSHOPT. */
31971 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
31972 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
31974 /* CLWB. */
31975 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
31976 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
31978 /* MONITORX and MWAITX. */
31979 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
31980 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
31981 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
31982 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
31984 /* CLZERO. */
31985 def_builtin2 (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
31986 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
31988 /* Add FMA4 multi-arg instructions. */
31989 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31991 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
31992 if (d->name == 0)
31993 continue;
31995 ftype = (enum ix86_builtin_func_type) d->flag;
31996 def_builtin_const (d->mask, d->name, ftype, d->code);
31998 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
31999 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32000 ARRAY_SIZE (bdesc_multi_arg) - 1);
32002 /* Add CET intrinsics. */
32003 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
32005 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
32006 if (d->name == 0)
32007 continue;
32009 ftype = (enum ix86_builtin_func_type) d->flag;
32010 def_builtin (d->mask, d->name, ftype, d->code);
32012 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
32013 IX86_BUILTIN__BDESC_CET_FIRST,
32014 ARRAY_SIZE (bdesc_cet) - 1);
32016 for (i = 0, d = bdesc_cet_rdssp;
32017 i < ARRAY_SIZE (bdesc_cet_rdssp);
32018 i++, d++)
32020 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
32021 if (d->name == 0)
32022 continue;
32024 ftype = (enum ix86_builtin_func_type) d->flag;
32025 def_builtin (d->mask, d->name, ftype, d->code);
32027 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
32028 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
32029 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
32032 static void
32033 ix86_init_mpx_builtins ()
32035 const struct builtin_description * d;
32036 enum ix86_builtin_func_type ftype;
32037 tree decl;
32038 size_t i;
32040 for (i = 0, d = bdesc_mpx;
32041 i < ARRAY_SIZE (bdesc_mpx);
32042 i++, d++)
32044 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
32045 if (d->name == 0)
32046 continue;
32048 ftype = (enum ix86_builtin_func_type) d->flag;
32049 decl = def_builtin2 (d->mask, d->name, ftype, d->code);
32051 /* Without leaf and nothrow flags, calls to MPX builtins
32052 may get abnormal edges when setjmp is present
32053 in the function. Since we may have a lot
32054 of MPX builtin calls, this causes lots of useless
32055 edges and enormous PHI nodes. To avoid this we mark
32056 MPX builtins as leaf and nothrow. */
32057 if (decl)
32059 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32060 NULL_TREE);
32061 TREE_NOTHROW (decl) = 1;
32063 else
32065 ix86_builtins_isa[(int)d->code].leaf_p = true;
32066 ix86_builtins_isa[(int)d->code].nothrow_p = true;
32069 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
32070 IX86_BUILTIN__BDESC_MPX_FIRST,
32071 ARRAY_SIZE (bdesc_mpx) - 1);
32073 for (i = 0, d = bdesc_mpx_const;
32074 i < ARRAY_SIZE (bdesc_mpx_const);
32075 i++, d++)
32077 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
32078 if (d->name == 0)
32079 continue;
32081 ftype = (enum ix86_builtin_func_type) d->flag;
32082 decl = def_builtin_const2 (d->mask, d->name, ftype, d->code);
32084 if (decl)
32086 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32087 NULL_TREE);
32088 TREE_NOTHROW (decl) = 1;
32090 else
32092 ix86_builtins_isa[(int)d->code].leaf_p = true;
32093 ix86_builtins_isa[(int)d->code].nothrow_p = true;
32096 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
32097 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32098 ARRAY_SIZE (bdesc_mpx_const) - 1);
32100 #undef BDESC_VERIFY
32101 #undef BDESC_VERIFYS
32103 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
32104 to return a pointer to VERSION_DECL if the outcome of the expression
32105 formed by PREDICATE_CHAIN is true. This function will be called during
32106 version dispatch to decide which function version to execute. It returns
32107 the basic block at the end, to which more conditions can be added. */
32109 static basic_block
32110 add_condition_to_bb (tree function_decl, tree version_decl,
32111 tree predicate_chain, basic_block new_bb)
32113 gimple *return_stmt;
32114 tree convert_expr, result_var;
32115 gimple *convert_stmt;
32116 gimple *call_cond_stmt;
32117 gimple *if_else_stmt;
32119 basic_block bb1, bb2, bb3;
32120 edge e12, e23;
32122 tree cond_var, and_expr_var = NULL_TREE;
32123 gimple_seq gseq;
32125 tree predicate_decl, predicate_arg;
32127 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
32129 gcc_assert (new_bb != NULL);
32130 gseq = bb_seq (new_bb);
32133 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
32134 build_fold_addr_expr (version_decl));
32135 result_var = create_tmp_var (ptr_type_node);
32136 convert_stmt = gimple_build_assign (result_var, convert_expr);
32137 return_stmt = gimple_build_return (result_var);
32139 if (predicate_chain == NULL_TREE)
32141 gimple_seq_add_stmt (&gseq, convert_stmt);
32142 gimple_seq_add_stmt (&gseq, return_stmt);
32143 set_bb_seq (new_bb, gseq);
32144 gimple_set_bb (convert_stmt, new_bb);
32145 gimple_set_bb (return_stmt, new_bb);
32146 pop_cfun ();
32147 return new_bb;
32150 while (predicate_chain != NULL)
32152 cond_var = create_tmp_var (integer_type_node);
32153 predicate_decl = TREE_PURPOSE (predicate_chain);
32154 predicate_arg = TREE_VALUE (predicate_chain);
32155 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
32156 gimple_call_set_lhs (call_cond_stmt, cond_var);
32158 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
32159 gimple_set_bb (call_cond_stmt, new_bb);
32160 gimple_seq_add_stmt (&gseq, call_cond_stmt);
32162 predicate_chain = TREE_CHAIN (predicate_chain);
32164 if (and_expr_var == NULL)
32165 and_expr_var = cond_var;
32166 else
32168 gimple *assign_stmt;
32169 /* Use MIN_EXPR to check whether any of the integers is zero:
32170 and_expr_var = min_expr <cond_var, and_expr_var>. */
32171 assign_stmt = gimple_build_assign (and_expr_var,
32172 build2 (MIN_EXPR, integer_type_node,
32173 cond_var, and_expr_var));
32175 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
32176 gimple_set_bb (assign_stmt, new_bb);
32177 gimple_seq_add_stmt (&gseq, assign_stmt);
32181 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
32182 integer_zero_node,
32183 NULL_TREE, NULL_TREE);
32184 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
32185 gimple_set_bb (if_else_stmt, new_bb);
32186 gimple_seq_add_stmt (&gseq, if_else_stmt);
32188 gimple_seq_add_stmt (&gseq, convert_stmt);
32189 gimple_seq_add_stmt (&gseq, return_stmt);
32190 set_bb_seq (new_bb, gseq);
32192 bb1 = new_bb;
32193 e12 = split_block (bb1, if_else_stmt);
32194 bb2 = e12->dest;
32195 e12->flags &= ~EDGE_FALLTHRU;
32196 e12->flags |= EDGE_TRUE_VALUE;
32198 e23 = split_block (bb2, return_stmt);
32200 gimple_set_bb (convert_stmt, bb2);
32201 gimple_set_bb (return_stmt, bb2);
32203 bb3 = e23->dest;
32204 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
32206 remove_edge (e23);
32207 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
32209 pop_cfun ();
32211 return bb3;
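/* Shape of the code built by add_condition_to_bb (simplified sketch for a
   two-element PREDICATE_CHAIN; the names are illustrative):

     cond_1 = predicate_1 (arg_1);
     cond_2 = predicate_2 (arg_2);
     and_tmp = MIN_EXPR <cond_2, cond_1>;
     if (and_tmp > 0)
       return (void *) &version_decl;      <- bb2, split off above
     ...                                   <- bb3, returned to the caller so
                                              further versions can append
                                              their own checks here  */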
32214 /* This parses the attribute arguments to target in DECL and determines
32215 the right builtin to use to match the platform specification.
32216 It returns the priority value for this version decl. If PREDICATE_LIST
32217 is not NULL, it stores the list of cpu features that need to be checked
32218 before dispatching this function. */
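/* Illustration (sketch of the intended behaviour, simplified): a version
   declared with __attribute__ ((target ("arch=haswell"))) gets priority
   P_PROC_AVX2 and a __builtin_cpu_is ("haswell") test added to
   *PREDICATE_LIST, whereas a plain ISA tag such as target ("sse4.2") gets
   P_SSE4_2 with a __builtin_cpu_supports ("sse4.2") test.  */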
32220 static unsigned int
32221 get_builtin_code_for_version (tree decl, tree *predicate_list)
32223 tree attrs;
32224 struct cl_target_option cur_target;
32225 tree target_node;
32226 struct cl_target_option *new_target;
32227 const char *arg_str = NULL;
32228 const char *attrs_str = NULL;
32229 char *tok_str = NULL;
32230 char *token;
32232 /* Priority of i386 features, greater value is higher priority. This is
32233 used to decide the order in which function dispatch must happen. For
32234 instance, a version specialized for SSE4.2 should be checked for dispatch
32235 before a version for SSE3, as SSE4.2 implies SSE3. */
32236 enum feature_priority
32238 P_ZERO = 0,
32239 P_MMX,
32240 P_SSE,
32241 P_SSE2,
32242 P_SSE3,
32243 P_SSSE3,
32244 P_PROC_SSSE3,
32245 P_SSE4_A,
32246 P_PROC_SSE4_A,
32247 P_SSE4_1,
32248 P_SSE4_2,
32249 P_PROC_SSE4_2,
32250 P_POPCNT,
32251 P_AES,
32252 P_PCLMUL,
32253 P_AVX,
32254 P_PROC_AVX,
32255 P_BMI,
32256 P_PROC_BMI,
32257 P_FMA4,
32258 P_XOP,
32259 P_PROC_XOP,
32260 P_FMA,
32261 P_PROC_FMA,
32262 P_BMI2,
32263 P_AVX2,
32264 P_PROC_AVX2,
32265 P_AVX512F,
32266 P_PROC_AVX512F
32269 enum feature_priority priority = P_ZERO;
32271 /* These are the target attribute strings for which a dispatcher is
32272 available, from fold_builtin_cpu. */
32274 static struct _feature_list
32276 const char *const name;
32277 const enum feature_priority priority;
32279 const feature_list[] =
32281 {"mmx", P_MMX},
32282 {"sse", P_SSE},
32283 {"sse2", P_SSE2},
32284 {"sse3", P_SSE3},
32285 {"sse4a", P_SSE4_A},
32286 {"ssse3", P_SSSE3},
32287 {"sse4.1", P_SSE4_1},
32288 {"sse4.2", P_SSE4_2},
32289 {"popcnt", P_POPCNT},
32290 {"aes", P_AES},
32291 {"pclmul", P_PCLMUL},
32292 {"avx", P_AVX},
32293 {"bmi", P_BMI},
32294 {"fma4", P_FMA4},
32295 {"xop", P_XOP},
32296 {"fma", P_FMA},
32297 {"bmi2", P_BMI2},
32298 {"avx2", P_AVX2},
32299 {"avx512f", P_AVX512F}
32303 static unsigned int NUM_FEATURES
32304 = sizeof (feature_list) / sizeof (struct _feature_list);
32306 unsigned int i;
32308 tree predicate_chain = NULL_TREE;
32309 tree predicate_decl, predicate_arg;
32311 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32312 gcc_assert (attrs != NULL);
32314 attrs = TREE_VALUE (TREE_VALUE (attrs));
32316 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32317 attrs_str = TREE_STRING_POINTER (attrs);
32319 /* Return priority zero for default function. */
32320 if (strcmp (attrs_str, "default") == 0)
32321 return 0;
32323 /* Handle arch= if specified. For priority, set it to be 1 more than
32324 the best instruction set the processor can handle. For instance, if
32325 there is a version for atom and a version for ssse3 (the highest ISA
32326 priority for atom), the atom version must be checked for dispatch
32327 before the ssse3 version. */
32328 if (strstr (attrs_str, "arch=") != NULL)
32330 cl_target_option_save (&cur_target, &global_options);
32331 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32332 &global_options_set);
32334 gcc_assert (target_node);
32335 new_target = TREE_TARGET_OPTION (target_node);
32336 gcc_assert (new_target);
32338 if (new_target->arch_specified && new_target->arch > 0)
32340 switch (new_target->arch)
32342 case PROCESSOR_CORE2:
32343 arg_str = "core2";
32344 priority = P_PROC_SSSE3;
32345 break;
32346 case PROCESSOR_NEHALEM:
32347 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32349 arg_str = "westmere";
32350 priority = P_AES;
32352 else
32354 /* We translate "arch=corei7" and "arch=nehalem" to
32355 "corei7" so that it will be mapped to M_INTEL_COREI7
32356 as cpu type to cover all M_INTEL_COREI7_XXXs. */
32357 arg_str = "corei7";
32358 priority = P_PROC_SSE4_2;
32360 break;
32361 case PROCESSOR_SANDYBRIDGE:
32362 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32363 arg_str = "ivybridge";
32364 else
32365 arg_str = "sandybridge";
32366 priority = P_PROC_AVX;
32367 break;
32368 case PROCESSOR_HASWELL:
32369 case PROCESSOR_SKYLAKE_AVX512:
32370 if (new_target->x_ix86_isa_flags
32371 & OPTION_MASK_ISA_AVX512VBMI)
32372 arg_str = "cannonlake";
32373 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32374 arg_str = "skylake-avx512";
32375 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
32376 arg_str = "skylake";
32377 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32378 arg_str = "broadwell";
32379 else
32380 arg_str = "haswell";
32381 priority = P_PROC_AVX2;
32382 break;
32383 case PROCESSOR_ICELAKE_CLIENT:
32384 arg_str = "icelake-client";
32385 priority = P_PROC_AVX2;
32386 break;
32387 case PROCESSOR_ICELAKE_SERVER:
32388 arg_str = "icelake-server";
32389 priority = P_PROC_AVX2;
32390 break;
32391 case PROCESSOR_BONNELL:
32392 arg_str = "bonnell";
32393 priority = P_PROC_SSSE3;
32394 break;
32395 case PROCESSOR_KNL:
32396 arg_str = "knl";
32397 priority = P_PROC_AVX512F;
32398 break;
32399 case PROCESSOR_KNM:
32400 arg_str = "knm";
32401 priority = P_PROC_AVX512F;
32402 break;
32403 case PROCESSOR_SILVERMONT:
32404 arg_str = "silvermont";
32405 priority = P_PROC_SSE4_2;
32406 break;
32407 case PROCESSOR_AMDFAM10:
32408 arg_str = "amdfam10h";
32409 priority = P_PROC_SSE4_A;
32410 break;
32411 case PROCESSOR_BTVER1:
32412 arg_str = "btver1";
32413 priority = P_PROC_SSE4_A;
32414 break;
32415 case PROCESSOR_BTVER2:
32416 arg_str = "btver2";
32417 priority = P_PROC_BMI;
32418 break;
32419 case PROCESSOR_BDVER1:
32420 arg_str = "bdver1";
32421 priority = P_PROC_XOP;
32422 break;
32423 case PROCESSOR_BDVER2:
32424 arg_str = "bdver2";
32425 priority = P_PROC_FMA;
32426 break;
32427 case PROCESSOR_BDVER3:
32428 arg_str = "bdver3";
32429 priority = P_PROC_FMA;
32430 break;
32431 case PROCESSOR_BDVER4:
32432 arg_str = "bdver4";
32433 priority = P_PROC_AVX2;
32434 break;
32435 case PROCESSOR_ZNVER1:
32436 arg_str = "znver1";
32437 priority = P_PROC_AVX2;
32438 break;
32442 cl_target_option_restore (&global_options, &cur_target);
32444 if (predicate_list && arg_str == NULL)
32446 error_at (DECL_SOURCE_LOCATION (decl),
32447 "No dispatcher found for the versioning attributes");
32448 return 0;
32451 if (predicate_list)
32453 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32454 /* For a C string literal the length includes the trailing NULL. */
32455 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32456 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32457 predicate_chain);
32461 /* Process feature name. */
32462 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32463 strcpy (tok_str, attrs_str);
32464 token = strtok (tok_str, ",");
32465 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32467 while (token != NULL)
32469 /* Do not process "arch=" */
32470 if (strncmp (token, "arch=", 5) == 0)
32472 token = strtok (NULL, ",");
32473 continue;
32475 for (i = 0; i < NUM_FEATURES; ++i)
32477 if (strcmp (token, feature_list[i].name) == 0)
32479 if (predicate_list)
32481 predicate_arg = build_string_literal (
32482 strlen (feature_list[i].name) + 1,
32483 feature_list[i].name);
32484 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32485 predicate_chain);
32487 /* Find the maximum priority feature. */
32488 if (feature_list[i].priority > priority)
32489 priority = feature_list[i].priority;
32491 break;
32494 if (predicate_list && i == NUM_FEATURES)
32496 error_at (DECL_SOURCE_LOCATION (decl),
32497 "No dispatcher found for %s", token);
32498 return 0;
32500 token = strtok (NULL, ",");
32502 free (tok_str);
32504 if (predicate_list && predicate_chain == NULL_TREE)
32506 error_at (DECL_SOURCE_LOCATION (decl),
32507 "No dispatcher found for the versioning attributes : %s",
32508 attrs_str);
32509 return 0;
32511 else if (predicate_list)
32513 predicate_chain = nreverse (predicate_chain);
32514 *predicate_list = predicate_chain;
32517 return priority;
32520 /* This compares the priority of target features in function DECL1
32521 and DECL2. It returns positive value if DECL1 is higher priority,
32522 negative value if DECL2 is higher priority and 0 if they are the
32523 same. */
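/* For example (hypothetical decls): comparing a version compiled with
   target ("arch=haswell") (priority P_PROC_AVX2) against one compiled
   with target ("sse4.2") (priority P_SSE4_2) yields a positive value,
   so the haswell version is tested first by the dispatcher.  */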
32525 static int
32526 ix86_compare_version_priority (tree decl1, tree decl2)
32528 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32529 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32531 return (int)priority1 - (int)priority2;
32534 /* V1 and V2 point to function versions with different priorities
32535 based on the target ISA. This function compares their priorities. */
32537 static int
32538 feature_compare (const void *v1, const void *v2)
32540 typedef struct _function_version_info
32542 tree version_decl;
32543 tree predicate_chain;
32544 unsigned int dispatch_priority;
32545 } function_version_info;
32547 const function_version_info c1 = *(const function_version_info *)v1;
32548 const function_version_info c2 = *(const function_version_info *)v2;
32549 return (c2.dispatch_priority - c1.dispatch_priority);
32552 /* This function generates the dispatch function for
32553 multi-versioned functions. DISPATCH_DECL is the function which will
32554 contain the dispatch logic. FNDECLS are the function choices for
32555 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
32556 in DISPATCH_DECL in which the dispatch code is generated. */
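/* Roughly speaking, the body built here amounts to a chain of
   __builtin_cpu_is / __builtin_cpu_supports tests, highest dispatch
   priority first, falling through to the default version when no
   predicate matches.  This is a simplified sketch of the generated
   resolver, not literal output.  */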
32558 static int
32559 dispatch_function_versions (tree dispatch_decl,
32560 void *fndecls_p,
32561 basic_block *empty_bb)
32563 tree default_decl;
32564 gimple *ifunc_cpu_init_stmt;
32565 gimple_seq gseq;
32566 int ix;
32567 tree ele;
32568 vec<tree> *fndecls;
32569 unsigned int num_versions = 0;
32570 unsigned int actual_versions = 0;
32571 unsigned int i;
32573 struct _function_version_info
32575 tree version_decl;
32576 tree predicate_chain;
32577 unsigned int dispatch_priority;
32578 }*function_version_info;
32580 gcc_assert (dispatch_decl != NULL
32581 && fndecls_p != NULL
32582 && empty_bb != NULL);
32584 /* fndecls_p is actually a vector. */
32585 fndecls = static_cast<vec<tree> *> (fndecls_p);
32587 /* At least one more version other than the default. */
32588 num_versions = fndecls->length ();
32589 gcc_assert (num_versions >= 2);
32591 function_version_info = (struct _function_version_info *)
32592 XNEWVEC (struct _function_version_info, (num_versions - 1));
32594 /* The first version in the vector is the default decl. */
32595 default_decl = (*fndecls)[0];
32597 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32599 gseq = bb_seq (*empty_bb);
32600 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32601 constructors, so explicitly call __builtin_cpu_init here. */
32602 ifunc_cpu_init_stmt = gimple_build_call_vec (
32603 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32604 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32605 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32606 set_bb_seq (*empty_bb, gseq);
32608 pop_cfun ();
32611 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32613 tree version_decl = ele;
32614 tree predicate_chain = NULL_TREE;
32615 unsigned int priority;
32616 /* Get attribute string, parse it and find the right predicate decl.
32617 The predicate function could be a lengthy combination of many
32618 features, like arch-type and various isa-variants. */
32619 priority = get_builtin_code_for_version (version_decl,
32620 &predicate_chain);
32622 if (predicate_chain == NULL_TREE)
32623 continue;
32625 function_version_info [actual_versions].version_decl = version_decl;
32626 function_version_info [actual_versions].predicate_chain
32627 = predicate_chain;
32628 function_version_info [actual_versions].dispatch_priority = priority;
32629 actual_versions++;
32632 /* Sort the versions according to descending order of dispatch priority. The
32633 priority is based on the ISA. This is not a perfect solution. There
32634 could still be ambiguity. If more than one function version is suitable
32635 to execute, which one should be dispatched? In future, allow the user
32636 to specify a dispatch priority next to the version. */
32637 qsort (function_version_info, actual_versions,
32638 sizeof (struct _function_version_info), feature_compare);
32640 for (i = 0; i < actual_versions; ++i)
32641 *empty_bb = add_condition_to_bb (dispatch_decl,
32642 function_version_info[i].version_decl,
32643 function_version_info[i].predicate_chain,
32644 *empty_bb);
32646 /* Dispatch the default version at the end. */
32647 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32648 NULL, *empty_bb);
32650 free (function_version_info);
32651 return 0;
32654 /* This function changes the assembler name for functions that are
32655 versions. If DECL is a function version and has a "target"
32656 attribute, it appends the attribute string to its assembler name. */
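/* For instance (a hypothetical decl, not from a testcase): a version of
   foo declared with __attribute__ ((target ("avx2"))) ends up with the
   assembler name "foo.avx2"; the attribute string is first canonicalized
   by sorted_attr_string so that equivalent attribute spellings map to
   the same suffix.  */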
32658 static tree
32659 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32661 tree version_attr;
32662 const char *orig_name, *version_string;
32663 char *attr_str, *assembler_name;
32665 if (DECL_DECLARED_INLINE_P (decl)
32666 && lookup_attribute ("gnu_inline",
32667 DECL_ATTRIBUTES (decl)))
32668 error_at (DECL_SOURCE_LOCATION (decl),
32669 "Function versions cannot be marked as gnu_inline,"
32670 " bodies have to be generated");
32672 if (DECL_VIRTUAL_P (decl)
32673 || DECL_VINDEX (decl))
32674 sorry ("Virtual function multiversioning not supported");
32676 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32678 /* The target attribute string cannot be NULL. */
32679 gcc_assert (version_attr != NULL_TREE);
32681 orig_name = IDENTIFIER_POINTER (id);
32682 version_string
32683 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32685 if (strcmp (version_string, "default") == 0)
32686 return id;
32688 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32689 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32691 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32693 /* Allow assembler name to be modified if already set. */
32694 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32695 SET_DECL_RTL (decl, NULL);
32697 tree ret = get_identifier (assembler_name);
32698 XDELETEVEC (attr_str);
32699 XDELETEVEC (assembler_name);
32700 return ret;
32704 static tree
32705 ix86_mangle_decl_assembler_name (tree decl, tree id)
32707 /* For function version, add the target suffix to the assembler name. */
32708 if (TREE_CODE (decl) == FUNCTION_DECL
32709 && DECL_FUNCTION_VERSIONED (decl))
32710 id = ix86_mangle_function_version_assembler_name (decl, id);
32711 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32712 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32713 #endif
32715 return id;
32718 /* Make a dispatcher declaration for the multi-versioned function DECL.
32719 Calls to DECL function will be replaced with calls to the dispatcher
32720 by the front-end. Returns the decl of the dispatcher function. */
32722 static tree
32723 ix86_get_function_versions_dispatcher (void *decl)
32725 tree fn = (tree) decl;
32726 struct cgraph_node *node = NULL;
32727 struct cgraph_node *default_node = NULL;
32728 struct cgraph_function_version_info *node_v = NULL;
32729 struct cgraph_function_version_info *first_v = NULL;
32731 tree dispatch_decl = NULL;
32733 struct cgraph_function_version_info *default_version_info = NULL;
32735 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
32737 node = cgraph_node::get (fn);
32738 gcc_assert (node != NULL);
32740 node_v = node->function_version ();
32741 gcc_assert (node_v != NULL);
32743 if (node_v->dispatcher_resolver != NULL)
32744 return node_v->dispatcher_resolver;
32746 /* Find the default version and make it the first node. */
32747 first_v = node_v;
32748 /* Go to the beginning of the chain. */
32749 while (first_v->prev != NULL)
32750 first_v = first_v->prev;
32751 default_version_info = first_v;
32752 while (default_version_info != NULL)
32754 if (is_function_default_version
32755 (default_version_info->this_node->decl))
32756 break;
32757 default_version_info = default_version_info->next;
32760 /* If there is no default node, just return NULL. */
32761 if (default_version_info == NULL)
32762 return NULL;
32764 /* Make default info the first node. */
32765 if (first_v != default_version_info)
32767 default_version_info->prev->next = default_version_info->next;
32768 if (default_version_info->next)
32769 default_version_info->next->prev = default_version_info->prev;
32770 first_v->prev = default_version_info;
32771 default_version_info->next = first_v;
32772 default_version_info->prev = NULL;
32775 default_node = default_version_info->this_node;
32777 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32778 if (targetm.has_ifunc_p ())
32780 struct cgraph_function_version_info *it_v = NULL;
32781 struct cgraph_node *dispatcher_node = NULL;
32782 struct cgraph_function_version_info *dispatcher_version_info = NULL;
32784 /* Right now, the dispatching is done via ifunc. */
32785 dispatch_decl = make_dispatcher_decl (default_node->decl);
32787 dispatcher_node = cgraph_node::get_create (dispatch_decl);
32788 gcc_assert (dispatcher_node != NULL);
32789 dispatcher_node->dispatcher_function = 1;
32790 dispatcher_version_info
32791 = dispatcher_node->insert_new_function_version ();
32792 dispatcher_version_info->next = default_version_info;
32793 dispatcher_node->definition = 1;
32795 /* Set the dispatcher for all the versions. */
32796 it_v = default_version_info;
32797 while (it_v != NULL)
32799 it_v->dispatcher_resolver = dispatch_decl;
32800 it_v = it_v->next;
32803 else
32804 #endif
32806 error_at (DECL_SOURCE_LOCATION (default_node->decl),
32807 "multiversioning needs ifunc which is not supported "
32808 "on this target");
32811 return dispatch_decl;
32814 /* Make the resolver function decl to dispatch the versions of
32815 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
32816 ifunc alias that will point to the created resolver. Create an
32817 empty basic block in the resolver and store the pointer in
32818 EMPTY_BB. Return the decl of the resolver function. */
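/* A loose sketch of the resulting layout (names are illustrative): the
   original symbol becomes an IFUNC alias whose "ifunc" attribute names
   the resolver built here, and the resolver body is later filled in by
   dispatch_function_versions with the run-time selection logic.  */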
32820 static tree
32821 make_resolver_func (const tree default_decl,
32822 const tree ifunc_alias_decl,
32823 basic_block *empty_bb)
32825 char *resolver_name;
32826 tree decl, type, decl_name, t;
32828 /* IFUNCs have to be globally visible. So, if the default_decl is
32829 not, then the name of the IFUNC should be made unique. */
32830 if (TREE_PUBLIC (default_decl) == 0)
32832 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
32833 symtab->change_decl_assembler_name (ifunc_alias_decl,
32834 get_identifier (ifunc_name));
32835 XDELETEVEC (ifunc_name);
32838 resolver_name = make_unique_name (default_decl, "resolver", false);
32840 /* The resolver function should return a (void *). */
32841 type = build_function_type_list (ptr_type_node, NULL_TREE);
32843 decl = build_fn_decl (resolver_name, type);
32844 decl_name = get_identifier (resolver_name);
32845 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
32847 DECL_NAME (decl) = decl_name;
32848 TREE_USED (decl) = 1;
32849 DECL_ARTIFICIAL (decl) = 1;
32850 DECL_IGNORED_P (decl) = 1;
32851 TREE_PUBLIC (decl) = 0;
32852 DECL_UNINLINABLE (decl) = 1;
32854 /* Resolver is not external, body is generated. */
32855 DECL_EXTERNAL (decl) = 0;
32856 DECL_EXTERNAL (ifunc_alias_decl) = 0;
32858 DECL_CONTEXT (decl) = NULL_TREE;
32859 DECL_INITIAL (decl) = make_node (BLOCK);
32860 DECL_STATIC_CONSTRUCTOR (decl) = 0;
32862 if (DECL_COMDAT_GROUP (default_decl)
32863 || TREE_PUBLIC (default_decl))
32865 /* In this case, each translation unit with a call to this
32866 versioned function will put out a resolver. Ensure it
32867 is comdat to keep just one copy. */
32868 DECL_COMDAT (decl) = 1;
32869 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
32871 /* Build result decl and add to function_decl. */
32872 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
32873 DECL_ARTIFICIAL (t) = 1;
32874 DECL_IGNORED_P (t) = 1;
32875 DECL_RESULT (decl) = t;
32877 gimplify_function_tree (decl);
32878 push_cfun (DECL_STRUCT_FUNCTION (decl));
32879 *empty_bb = init_lowered_empty_function (decl, false,
32880 profile_count::uninitialized ());
32882 cgraph_node::add_new_function (decl, true);
32883 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
32885 pop_cfun ();
32887 gcc_assert (ifunc_alias_decl != NULL);
32888 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
32889 DECL_ATTRIBUTES (ifunc_alias_decl)
32890 = make_attribute ("ifunc", resolver_name,
32891 DECL_ATTRIBUTES (ifunc_alias_decl));
32893 /* Create the alias for dispatch to resolver here. */
32894 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
32895 XDELETEVEC (resolver_name);
32896 return decl;
32899 /* Generate the dispatching code body to dispatch multi-versioned function
32900 DECL. The target hook is called to process the "target" attributes and
32901 provide the code to dispatch the right function at run-time. NODE points
32902 to the dispatcher decl whose body will be created. */
32904 static tree
32905 ix86_generate_version_dispatcher_body (void *node_p)
32907 tree resolver_decl;
32908 basic_block empty_bb;
32909 tree default_ver_decl;
32910 struct cgraph_node *versn;
32911 struct cgraph_node *node;
32913 struct cgraph_function_version_info *node_version_info = NULL;
32914 struct cgraph_function_version_info *versn_info = NULL;
32916 node = (cgraph_node *)node_p;
32918 node_version_info = node->function_version ();
32919 gcc_assert (node->dispatcher_function
32920 && node_version_info != NULL);
32922 if (node_version_info->dispatcher_resolver)
32923 return node_version_info->dispatcher_resolver;
32925 /* The first version in the chain corresponds to the default version. */
32926 default_ver_decl = node_version_info->next->this_node->decl;
32928 /* node is going to be an alias, so remove the finalized bit. */
32929 node->definition = false;
32931 resolver_decl = make_resolver_func (default_ver_decl,
32932 node->decl, &empty_bb);
32934 node_version_info->dispatcher_resolver = resolver_decl;
32936 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
32938 auto_vec<tree, 2> fn_ver_vec;
32940 for (versn_info = node_version_info->next; versn_info;
32941 versn_info = versn_info->next)
32943 versn = versn_info->this_node;
32944 /* Check for virtual functions here again, as by this time it should
32945 have been determined if this function needs a vtable index or
32946 not. This happens for methods in derived classes that override
32947 virtual methods in base classes but are not explicitly marked as
32948 virtual. */
32949 if (DECL_VINDEX (versn->decl))
32950 sorry ("Virtual function multiversioning not supported");
32952 fn_ver_vec.safe_push (versn->decl);
32955 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
32956 cgraph_edge::rebuild_edges ();
32957 pop_cfun ();
32958 return resolver_decl;
32960 /* This builds the processor_model struct type defined in
32961 libgcc/config/i386/cpuinfo.c */
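/* The record built below mirrors (as a sketch) the libgcc definition:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */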
32963 static tree
32964 build_processor_model_struct (void)
32966 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
32967 "__cpu_features"};
32968 tree field = NULL_TREE, field_chain = NULL_TREE;
32969 int i;
32970 tree type = make_node (RECORD_TYPE);
32972 /* The first 3 fields are unsigned int. */
32973 for (i = 0; i < 3; ++i)
32975 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32976 get_identifier (field_name[i]), unsigned_type_node);
32977 if (field_chain != NULL_TREE)
32978 DECL_CHAIN (field) = field_chain;
32979 field_chain = field;
32982 /* The last field is an array of unsigned integers of size one. */
32983 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32984 get_identifier (field_name[3]),
32985 build_array_type (unsigned_type_node,
32986 build_index_type (size_one_node)));
32987 if (field_chain != NULL_TREE)
32988 DECL_CHAIN (field) = field_chain;
32989 field_chain = field;
32991 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
32992 return type;
32995 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
32997 static tree
32998 make_var_decl (tree type, const char *name)
33000 tree new_decl;
33002 new_decl = build_decl (UNKNOWN_LOCATION,
33003 VAR_DECL,
33004 get_identifier(name),
33005 type);
33007 DECL_EXTERNAL (new_decl) = 1;
33008 TREE_STATIC (new_decl) = 1;
33009 TREE_PUBLIC (new_decl) = 1;
33010 DECL_INITIAL (new_decl) = 0;
33011 DECL_ARTIFICIAL (new_decl) = 0;
33012 DECL_PRESERVE_P (new_decl) = 1;
33014 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
33015 assemble_variable (new_decl, 0, 0, 0);
33017 return new_decl;
33020 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
33021 into an integer defined in libgcc/config/i386/cpuinfo.c */
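/* For example (a sketch of the folded form, not literal GIMPLE):
   __builtin_cpu_is ("amd") becomes a test of
   __cpu_model.__cpu_vendor == M_AMD, and __builtin_cpu_supports ("avx2")
   becomes __cpu_model.__cpu_features[0] & (1 << F_AVX2); features whose
   bit index is 32 or larger are tested against __cpu_features2
   instead.  */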
33023 static tree
33024 fold_builtin_cpu (tree fndecl, tree *args)
33026 unsigned int i;
33027 enum ix86_builtins fn_code = (enum ix86_builtins)
33028 DECL_FUNCTION_CODE (fndecl);
33029 tree param_string_cst = NULL;
33031 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
33032 enum processor_features
33034 F_CMOV = 0,
33035 F_MMX,
33036 F_POPCNT,
33037 F_SSE,
33038 F_SSE2,
33039 F_SSE3,
33040 F_SSSE3,
33041 F_SSE4_1,
33042 F_SSE4_2,
33043 F_AVX,
33044 F_AVX2,
33045 F_SSE4_A,
33046 F_FMA4,
33047 F_XOP,
33048 F_FMA,
33049 F_AVX512F,
33050 F_BMI,
33051 F_BMI2,
33052 F_AES,
33053 F_PCLMUL,
33054 F_AVX512VL,
33055 F_AVX512BW,
33056 F_AVX512DQ,
33057 F_AVX512CD,
33058 F_AVX512ER,
33059 F_AVX512PF,
33060 F_AVX512VBMI,
33061 F_AVX512IFMA,
33062 F_AVX5124VNNIW,
33063 F_AVX5124FMAPS,
33064 F_AVX512VPOPCNTDQ,
33065 F_AVX512VBMI2,
33066 F_GFNI,
33067 F_VPCLMULQDQ,
33068 F_AVX512VNNI,
33069 F_AVX512BITALG,
33070 F_MAX
33073 /* These are the values for vendor types and cpu types and subtypes
33074 in cpuinfo.c. Cpu types and subtypes should have the corresponding
33075 start value subtracted before use. */
33076 enum processor_model
33078 M_INTEL = 1,
33079 M_AMD,
33080 M_CPU_TYPE_START,
33081 M_INTEL_BONNELL,
33082 M_INTEL_CORE2,
33083 M_INTEL_COREI7,
33084 M_AMDFAM10H,
33085 M_AMDFAM15H,
33086 M_INTEL_SILVERMONT,
33087 M_INTEL_KNL,
33088 M_AMD_BTVER1,
33089 M_AMD_BTVER2,
33090 M_AMDFAM17H,
33091 M_INTEL_KNM,
33092 M_CPU_SUBTYPE_START,
33093 M_INTEL_COREI7_NEHALEM,
33094 M_INTEL_COREI7_WESTMERE,
33095 M_INTEL_COREI7_SANDYBRIDGE,
33096 M_AMDFAM10H_BARCELONA,
33097 M_AMDFAM10H_SHANGHAI,
33098 M_AMDFAM10H_ISTANBUL,
33099 M_AMDFAM15H_BDVER1,
33100 M_AMDFAM15H_BDVER2,
33101 M_AMDFAM15H_BDVER3,
33102 M_AMDFAM15H_BDVER4,
33103 M_AMDFAM17H_ZNVER1,
33104 M_INTEL_COREI7_IVYBRIDGE,
33105 M_INTEL_COREI7_HASWELL,
33106 M_INTEL_COREI7_BROADWELL,
33107 M_INTEL_COREI7_SKYLAKE,
33108 M_INTEL_COREI7_SKYLAKE_AVX512,
33109 M_INTEL_COREI7_CANNONLAKE,
33110 M_INTEL_COREI7_ICELAKE_CLIENT,
33111 M_INTEL_COREI7_ICELAKE_SERVER
33114 static struct _arch_names_table
33116 const char *const name;
33117 const enum processor_model model;
33119 const arch_names_table[] =
33121 {"amd", M_AMD},
33122 {"intel", M_INTEL},
33123 {"atom", M_INTEL_BONNELL},
33124 {"slm", M_INTEL_SILVERMONT},
33125 {"core2", M_INTEL_CORE2},
33126 {"corei7", M_INTEL_COREI7},
33127 {"nehalem", M_INTEL_COREI7_NEHALEM},
33128 {"westmere", M_INTEL_COREI7_WESTMERE},
33129 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
33130 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
33131 {"haswell", M_INTEL_COREI7_HASWELL},
33132 {"broadwell", M_INTEL_COREI7_BROADWELL},
33133 {"skylake", M_INTEL_COREI7_SKYLAKE},
33134 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
33135 {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
33136 {"icelake-client", M_INTEL_COREI7_ICELAKE_CLIENT},
33137 {"icelake-server", M_INTEL_COREI7_ICELAKE_SERVER},
33138 {"bonnell", M_INTEL_BONNELL},
33139 {"silvermont", M_INTEL_SILVERMONT},
33140 {"knl", M_INTEL_KNL},
33141 {"knm", M_INTEL_KNM},
33142 {"amdfam10h", M_AMDFAM10H},
33143 {"barcelona", M_AMDFAM10H_BARCELONA},
33144 {"shanghai", M_AMDFAM10H_SHANGHAI},
33145 {"istanbul", M_AMDFAM10H_ISTANBUL},
33146 {"btver1", M_AMD_BTVER1},
33147 {"amdfam15h", M_AMDFAM15H},
33148 {"bdver1", M_AMDFAM15H_BDVER1},
33149 {"bdver2", M_AMDFAM15H_BDVER2},
33150 {"bdver3", M_AMDFAM15H_BDVER3},
33151 {"bdver4", M_AMDFAM15H_BDVER4},
33152 {"btver2", M_AMD_BTVER2},
33153 {"amdfam17h", M_AMDFAM17H},
33154 {"znver1", M_AMDFAM17H_ZNVER1},
33157 static struct _isa_names_table
33159 const char *const name;
33160 const enum processor_features feature;
33162 const isa_names_table[] =
33164 {"cmov", F_CMOV},
33165 {"mmx", F_MMX},
33166 {"popcnt", F_POPCNT},
33167 {"sse", F_SSE},
33168 {"sse2", F_SSE2},
33169 {"sse3", F_SSE3},
33170 {"ssse3", F_SSSE3},
33171 {"sse4a", F_SSE4_A},
33172 {"sse4.1", F_SSE4_1},
33173 {"sse4.2", F_SSE4_2},
33174 {"avx", F_AVX},
33175 {"fma4", F_FMA4},
33176 {"xop", F_XOP},
33177 {"fma", F_FMA},
33178 {"avx2", F_AVX2},
33179 {"avx512f", F_AVX512F},
33180 {"bmi", F_BMI},
33181 {"bmi2", F_BMI2},
33182 {"aes", F_AES},
33183 {"pclmul", F_PCLMUL},
33184 {"avx512vl",F_AVX512VL},
33185 {"avx512bw",F_AVX512BW},
33186 {"avx512dq",F_AVX512DQ},
33187 {"avx512cd",F_AVX512CD},
33188 {"avx512er",F_AVX512ER},
33189 {"avx512pf",F_AVX512PF},
33190 {"avx512vbmi",F_AVX512VBMI},
33191 {"avx512ifma",F_AVX512IFMA},
33192 {"avx5124vnniw",F_AVX5124VNNIW},
33193 {"avx5124fmaps",F_AVX5124FMAPS},
33194 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ},
33195 {"avx512vbmi2", F_AVX512VBMI2},
33196 {"gfni", F_GFNI},
33197 {"vpclmulqdq", F_VPCLMULQDQ},
33198 {"avx512vnni", F_AVX512VNNI},
33199 {"avx512bitalg", F_AVX512BITALG}
33202 tree __processor_model_type = build_processor_model_struct ();
33203 tree __cpu_model_var = make_var_decl (__processor_model_type,
33204 "__cpu_model");
33207 varpool_node::add (__cpu_model_var);
33209 gcc_assert ((args != NULL) && (*args != NULL));
33211 param_string_cst = *args;
33212 while (param_string_cst
33213 && TREE_CODE (param_string_cst) != STRING_CST)
33215 /* *args must be an expr that can contain other EXPRS leading to a
33216 STRING_CST. */
33217 if (!EXPR_P (param_string_cst))
33219 error ("Parameter to builtin must be a string constant or literal");
33220 return integer_zero_node;
33222 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
33225 gcc_assert (param_string_cst);
33227 if (fn_code == IX86_BUILTIN_CPU_IS)
33229 tree ref;
33230 tree field;
33231 tree final;
33233 unsigned int field_val = 0;
33234 unsigned int NUM_ARCH_NAMES
33235 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33237 for (i = 0; i < NUM_ARCH_NAMES; i++)
33238 if (strcmp (arch_names_table[i].name,
33239 TREE_STRING_POINTER (param_string_cst)) == 0)
33240 break;
33242 if (i == NUM_ARCH_NAMES)
33244 error ("Parameter to builtin not valid: %s",
33245 TREE_STRING_POINTER (param_string_cst));
33246 return integer_zero_node;
33249 field = TYPE_FIELDS (__processor_model_type);
33250 field_val = arch_names_table[i].model;
33252 /* CPU types are stored in the next field. */
33253 if (field_val > M_CPU_TYPE_START
33254 && field_val < M_CPU_SUBTYPE_START)
33256 field = DECL_CHAIN (field);
33257 field_val -= M_CPU_TYPE_START;
33260 /* CPU subtypes are stored in the next field. */
33261 if (field_val > M_CPU_SUBTYPE_START)
33263 field = DECL_CHAIN ( DECL_CHAIN (field));
33264 field_val -= M_CPU_SUBTYPE_START;
33267 /* Get the appropriate field in __cpu_model. */
33268 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33269 field, NULL_TREE);
33271 /* Check the value. */
33272 final = build2 (EQ_EXPR, unsigned_type_node, ref,
33273 build_int_cstu (unsigned_type_node, field_val));
33274 return build1 (CONVERT_EXPR, integer_type_node, final);
33276 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33278 tree ref;
33279 tree array_elt;
33280 tree field;
33281 tree final;
33283 unsigned int field_val = 0;
33284 unsigned int NUM_ISA_NAMES
33285 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33287 for (i = 0; i < NUM_ISA_NAMES; i++)
33288 if (strcmp (isa_names_table[i].name,
33289 TREE_STRING_POINTER (param_string_cst)) == 0)
33290 break;
33292 if (i == NUM_ISA_NAMES)
33294 error ("Parameter to builtin not valid: %s",
33295 TREE_STRING_POINTER (param_string_cst));
33296 return integer_zero_node;
33299 if (isa_names_table[i].feature >= 32)
33301 tree __cpu_features2_var = make_var_decl (unsigned_type_node,
33302 "__cpu_features2");
33304 varpool_node::add (__cpu_features2_var);
33305 field_val = (1U << (isa_names_table[i].feature - 32));
33306 /* Return __cpu_features2 & field_val */
33307 final = build2 (BIT_AND_EXPR, unsigned_type_node,
33308 __cpu_features2_var,
33309 build_int_cstu (unsigned_type_node, field_val));
33310 return build1 (CONVERT_EXPR, integer_type_node, final);
33313 field = TYPE_FIELDS (__processor_model_type);
33314 /* Get the last field, which is __cpu_features. */
33315 while (DECL_CHAIN (field))
33316 field = DECL_CHAIN (field);
33318 /* Get the appropriate field: __cpu_model.__cpu_features. */
33319 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33320 field, NULL_TREE);
33322 /* Access the 0th element of __cpu_features array. */
33323 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33324 integer_zero_node, NULL_TREE, NULL_TREE);
33326 field_val = (1U << isa_names_table[i].feature);
33327 /* Return __cpu_model.__cpu_features[0] & field_val */
33328 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33329 build_int_cstu (unsigned_type_node, field_val));
33330 return build1 (CONVERT_EXPR, integer_type_node, final);
33332 gcc_unreachable ();
33335 static tree
33336 ix86_fold_builtin (tree fndecl, int n_args,
33337 tree *args, bool ignore ATTRIBUTE_UNUSED)
33339 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33341 enum ix86_builtins fn_code = (enum ix86_builtins)
33342 DECL_FUNCTION_CODE (fndecl);
33343 switch (fn_code)
33345 case IX86_BUILTIN_CPU_IS:
33346 case IX86_BUILTIN_CPU_SUPPORTS:
33347 gcc_assert (n_args == 1);
33348 return fold_builtin_cpu (fndecl, args);
33350 case IX86_BUILTIN_NANQ:
33351 case IX86_BUILTIN_NANSQ:
33353 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33354 const char *str = c_getstr (*args);
33355 int quiet = fn_code == IX86_BUILTIN_NANQ;
33356 REAL_VALUE_TYPE real;
33358 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33359 return build_real (type, real);
33360 return NULL_TREE;
33363 case IX86_BUILTIN_INFQ:
33364 case IX86_BUILTIN_HUGE_VALQ:
33366 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33367 REAL_VALUE_TYPE inf;
33368 real_inf (&inf);
33369 return build_real (type, inf);
33372 case IX86_BUILTIN_TZCNT16:
33373 case IX86_BUILTIN_CTZS:
33374 case IX86_BUILTIN_TZCNT32:
33375 case IX86_BUILTIN_TZCNT64:
33376 gcc_assert (n_args == 1);
33377 if (TREE_CODE (args[0]) == INTEGER_CST)
33379 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33380 tree arg = args[0];
33381 if (fn_code == IX86_BUILTIN_TZCNT16
33382 || fn_code == IX86_BUILTIN_CTZS)
33383 arg = fold_convert (short_unsigned_type_node, arg);
33384 if (integer_zerop (arg))
33385 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33386 else
33387 return fold_const_call (CFN_CTZ, type, arg);
33389 break;
33391 case IX86_BUILTIN_LZCNT16:
33392 case IX86_BUILTIN_CLZS:
33393 case IX86_BUILTIN_LZCNT32:
33394 case IX86_BUILTIN_LZCNT64:
33395 gcc_assert (n_args == 1);
33396 if (TREE_CODE (args[0]) == INTEGER_CST)
33398 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33399 tree arg = args[0];
33400 if (fn_code == IX86_BUILTIN_LZCNT16
33401 || fn_code == IX86_BUILTIN_CLZS)
33402 arg = fold_convert (short_unsigned_type_node, arg);
33403 if (integer_zerop (arg))
33404 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33405 else
33406 return fold_const_call (CFN_CLZ, type, arg);
33408 break;
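      /* The BEXTR cases below fold a constant extraction: in the second
	 operand, bits 7:0 give the start position and bits 15:8 the field
	 length, and a start past the operand width or a zero length yields
	 0, matching the hardware behaviour.  */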
33410 case IX86_BUILTIN_BEXTR32:
33411 case IX86_BUILTIN_BEXTR64:
33412 case IX86_BUILTIN_BEXTRI32:
33413 case IX86_BUILTIN_BEXTRI64:
33414 gcc_assert (n_args == 2);
33415 if (tree_fits_uhwi_p (args[1]))
33417 unsigned HOST_WIDE_INT res = 0;
33418 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
33419 unsigned int start = tree_to_uhwi (args[1]);
33420 unsigned int len = (start & 0xff00) >> 8;
33421 start &= 0xff;
33422 if (start >= prec || len == 0)
33423 res = 0;
33424 else if (!tree_fits_uhwi_p (args[0]))
33425 break;
33426 else
33427 res = tree_to_uhwi (args[0]) >> start;
33428 if (len > prec)
33429 len = prec;
33430 if (len < HOST_BITS_PER_WIDE_INT)
33431 res &= (HOST_WIDE_INT_1U << len) - 1;
33432 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33434 break;
33436 case IX86_BUILTIN_BZHI32:
33437 case IX86_BUILTIN_BZHI64:
33438 gcc_assert (n_args == 2);
33439 if (tree_fits_uhwi_p (args[1]))
33441 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
33442 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
33443 return args[0];
33444 if (!tree_fits_uhwi_p (args[0]))
33445 break;
33446 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
33447 res &= ~(HOST_WIDE_INT_M1U << idx);
33448 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33450 break;
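      /* PDEP scatters the low-order bits of the first operand to the set
	 bit positions of the mask, and PEXT gathers the mask-selected bits
	 of the first operand into the low-order bits of the result.  As a
	 small worked example (values chosen purely for illustration),
	 _pdep_u32 (0x5, 0x1a) == 0x12 and _pext_u32 (0x12, 0x1a) == 0x5.  */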
33452 case IX86_BUILTIN_PDEP32:
33453 case IX86_BUILTIN_PDEP64:
33454 gcc_assert (n_args == 2);
33455 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33457 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33458 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33459 unsigned HOST_WIDE_INT res = 0;
33460 unsigned HOST_WIDE_INT m, k = 1;
33461 for (m = 1; m; m <<= 1)
33462 if ((mask & m) != 0)
33464 if ((src & k) != 0)
33465 res |= m;
33466 k <<= 1;
33468 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33470 break;
33472 case IX86_BUILTIN_PEXT32:
33473 case IX86_BUILTIN_PEXT64:
33474 gcc_assert (n_args == 2);
33475 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33477 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33478 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33479 unsigned HOST_WIDE_INT res = 0;
33480 unsigned HOST_WIDE_INT m, k = 1;
33481 for (m = 1; m; m <<= 1)
33482 if ((mask & m) != 0)
33484 if ((src & m) != 0)
33485 res |= k;
33486 k <<= 1;
33488 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33490 break;
33492 default:
33493 break;
33497 #ifdef SUBTARGET_FOLD_BUILTIN
33498 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33499 #endif
33501 return NULL_TREE;
33504 /* Fold a MD builtin (use ix86_fold_builtin for folding into
33505 constant) in GIMPLE. */
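/* For instance, a call such as __builtin_ia32_tzcnt_u32 (x) whose
   argument is provably non-zero is rewritten into the generic
   __builtin_ctz (x), plus a conversion of the result, so the middle-end
   can reason about it.  This is a sketch of the tzcnt/lzcnt handling
   below rather than an exhaustive description.  */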
33507 bool
33508 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
33510 gimple *stmt = gsi_stmt (*gsi);
33511 tree fndecl = gimple_call_fndecl (stmt);
33512 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
33513 int n_args = gimple_call_num_args (stmt);
33514 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
33515 tree decl = NULL_TREE;
33516 tree arg0, arg1;
33518 switch (fn_code)
33520 case IX86_BUILTIN_TZCNT32:
33521 decl = builtin_decl_implicit (BUILT_IN_CTZ);
33522 goto fold_tzcnt_lzcnt;
33524 case IX86_BUILTIN_TZCNT64:
33525 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
33526 goto fold_tzcnt_lzcnt;
33528 case IX86_BUILTIN_LZCNT32:
33529 decl = builtin_decl_implicit (BUILT_IN_CLZ);
33530 goto fold_tzcnt_lzcnt;
33532 case IX86_BUILTIN_LZCNT64:
33533 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
33534 goto fold_tzcnt_lzcnt;
33536 fold_tzcnt_lzcnt:
33537 gcc_assert (n_args == 1);
33538 arg0 = gimple_call_arg (stmt, 0);
33539 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
33541 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
33542 /* If arg0 is provably non-zero, optimize into generic
33543 __builtin_c[tl]z{,ll} function the middle-end handles
33544 better. */
33545 if (!expr_not_equal_to (arg0, wi::zero (prec)))
33546 return false;
33548 location_t loc = gimple_location (stmt);
33549 gimple *g = gimple_build_call (decl, 1, arg0);
33550 gimple_set_location (g, loc);
33551 tree lhs = make_ssa_name (integer_type_node);
33552 gimple_call_set_lhs (g, lhs);
33553 gsi_insert_before (gsi, g, GSI_SAME_STMT);
33554 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
33555 gimple_set_location (g, loc);
33556 gsi_replace (gsi, g, false);
33557 return true;
33559 break;
33561 case IX86_BUILTIN_BZHI32:
33562 case IX86_BUILTIN_BZHI64:
33563 gcc_assert (n_args == 2);
33564 arg1 = gimple_call_arg (stmt, 1);
33565 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
33567 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
33568 arg0 = gimple_call_arg (stmt, 0);
33569 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
33570 break;
33571 location_t loc = gimple_location (stmt);
33572 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33573 gimple_set_location (g, loc);
33574 gsi_replace (gsi, g, false);
33575 return true;
33577 break;
33579 case IX86_BUILTIN_PDEP32:
33580 case IX86_BUILTIN_PDEP64:
33581 case IX86_BUILTIN_PEXT32:
33582 case IX86_BUILTIN_PEXT64:
33583 gcc_assert (n_args == 2);
33584 arg1 = gimple_call_arg (stmt, 1);
33585 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
33587 location_t loc = gimple_location (stmt);
33588 arg0 = gimple_call_arg (stmt, 0);
33589 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33590 gimple_set_location (g, loc);
33591 gsi_replace (gsi, g, false);
33592 return true;
33594 break;
33596 default:
33597 break;
33600 return false;
33603 /* Make builtins to detect cpu type and features supported. NAME is
33604 the builtin name, CODE is the builtin code, and FTYPE is the function
33605 type of the builtin. */
33607 static void
33608 make_cpu_type_builtin (const char* name, int code,
33609 enum ix86_builtin_func_type ftype, bool is_const)
33611 tree decl;
33612 tree type;
33614 type = ix86_get_builtin_func_type (ftype);
33615 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33616 NULL, NULL_TREE);
33617 gcc_assert (decl != NULL_TREE);
33618 ix86_builtins[(int) code] = decl;
33619 TREE_READONLY (decl) = is_const;
33622 /* Make builtins to get CPU type and features supported. The created
33623 builtins are:
33625 __builtin_cpu_init (), to detect cpu type and features,
33626 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33627 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
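   A minimal usage sketch (hypothetical user code with made-up helper
   names, not part of GCC):

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7") || __builtin_cpu_supports ("avx2"))
       run_tuned_path ();
     else
       run_generic_path ();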
33630 static void
33631 ix86_init_platform_type_builtins (void)
33633 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33634 INT_FTYPE_VOID, false);
33635 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33636 INT_FTYPE_PCCHAR, true);
33637 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33638 INT_FTYPE_PCCHAR, true);
33641 /* Internal method for ix86_init_builtins. */
33643 static void
33644 ix86_init_builtins_va_builtins_abi (void)
33646 tree ms_va_ref, sysv_va_ref;
33647 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33648 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33649 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33650 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33652 if (!TARGET_64BIT)
33653 return;
33654 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33655 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33656 ms_va_ref = build_reference_type (ms_va_list_type_node);
33657 sysv_va_ref =
33658 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33660 fnvoid_va_end_ms =
33661 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33662 fnvoid_va_start_ms =
33663 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33664 fnvoid_va_end_sysv =
33665 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33666 fnvoid_va_start_sysv =
33667 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33668 NULL_TREE);
33669 fnvoid_va_copy_ms =
33670 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33671 NULL_TREE);
33672 fnvoid_va_copy_sysv =
33673 build_function_type_list (void_type_node, sysv_va_ref,
33674 sysv_va_ref, NULL_TREE);
33676 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33677 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33678 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33679 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33680 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33681 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33682 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33683 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33684 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33685 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33686 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33687 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33690 static void
33691 ix86_init_builtin_types (void)
33693 tree float80_type_node, const_string_type_node;
33695 /* The __float80 type. */
33696 float80_type_node = long_double_type_node;
33697 if (TYPE_MODE (float80_type_node) != XFmode)
33699 if (float64x_type_node != NULL_TREE
33700 && TYPE_MODE (float64x_type_node) == XFmode)
33701 float80_type_node = float64x_type_node;
33702 else
33704 /* The __float80 type. */
33705 float80_type_node = make_node (REAL_TYPE);
33707 TYPE_PRECISION (float80_type_node) = 80;
33708 layout_type (float80_type_node);
33711 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
33713 /* The __float128 type. The node has already been created as
33714 _Float128, so we only need to register the __float128 name for
33715 it. */
33716 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
33718 const_string_type_node
33719 = build_pointer_type (build_qualified_type
33720 (char_type_node, TYPE_QUAL_CONST));
33722 /* This macro is built by i386-builtin-types.awk. */
33723 DEFINE_BUILTIN_PRIMITIVE_TYPES;
33726 static void
33727 ix86_init_builtins (void)
33729 tree ftype, decl;
33731 ix86_init_builtin_types ();
33733 /* Builtins to get CPU type and features. */
33734 ix86_init_platform_type_builtins ();
33736 /* TFmode support builtins. */
33737 def_builtin_const (0, "__builtin_infq",
33738 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
33739 def_builtin_const (0, "__builtin_huge_valq",
33740 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
33742 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
33743 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
33744 BUILT_IN_MD, "nanq", NULL_TREE);
33745 TREE_READONLY (decl) = 1;
33746 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
33748 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
33749 BUILT_IN_MD, "nansq", NULL_TREE);
33750 TREE_READONLY (decl) = 1;
33751 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
33753 /* We will expand them to a normal call if SSE isn't available since
33754 they are used by libgcc. */
33755 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
33756 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
33757 BUILT_IN_MD, "__fabstf2", NULL_TREE);
33758 TREE_READONLY (decl) = 1;
33759 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
33761 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
33762 decl = add_builtin_function ("__builtin_copysignq", ftype,
33763 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
33764 "__copysigntf3", NULL_TREE);
33765 TREE_READONLY (decl) = 1;
33766 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
33768 ix86_init_tm_builtins ();
33769 ix86_init_mmx_sse_builtins ();
33770 ix86_init_mpx_builtins ();
33772 if (TARGET_LP64)
33773 ix86_init_builtins_va_builtins_abi ();
33775 #ifdef SUBTARGET_INIT_BUILTINS
33776 SUBTARGET_INIT_BUILTINS;
33777 #endif
33780 /* Return the ix86 builtin for CODE. */
33782 static tree
33783 ix86_builtin_decl (unsigned code, bool)
33785 if (code >= IX86_BUILTIN_MAX)
33786 return error_mark_node;
33788 return ix86_builtins[code];
33791 /* Errors in the source file can cause expand_expr to return const0_rtx
33792 where we expect a vector. To avoid crashing, use one of the vector
33793 clear instructions. */
33794 static rtx
33795 safe_vector_operand (rtx x, machine_mode mode)
33797 if (x == const0_rtx)
33798 x = CONST0_RTX (mode);
33799 return x;
33802 /* Fix up modeless constants to fit the required mode. */
33803 static rtx
33804 fixup_modeless_constant (rtx x, machine_mode mode)
33806 if (GET_MODE (x) == VOIDmode)
33807 x = convert_to_mode (mode, x, 1);
33808 return x;
33811 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
33813 static rtx
33814 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
33816 rtx pat;
33817 tree arg0 = CALL_EXPR_ARG (exp, 0);
33818 tree arg1 = CALL_EXPR_ARG (exp, 1);
33819 rtx op0 = expand_normal (arg0);
33820 rtx op1 = expand_normal (arg1);
33821 machine_mode tmode = insn_data[icode].operand[0].mode;
33822 machine_mode mode0 = insn_data[icode].operand[1].mode;
33823 machine_mode mode1 = insn_data[icode].operand[2].mode;
33825 if (VECTOR_MODE_P (mode0))
33826 op0 = safe_vector_operand (op0, mode0);
33827 if (VECTOR_MODE_P (mode1))
33828 op1 = safe_vector_operand (op1, mode1);
33830 if (optimize || !target
33831 || GET_MODE (target) != tmode
33832 || !insn_data[icode].operand[0].predicate (target, tmode))
33833 target = gen_reg_rtx (tmode);
33835 if (GET_MODE (op1) == SImode && mode1 == TImode)
33837 rtx x = gen_reg_rtx (V4SImode);
33838 emit_insn (gen_sse2_loadd (x, op1));
33839 op1 = gen_lowpart (TImode, x);
33842 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33843 op0 = copy_to_mode_reg (mode0, op0);
33844 if (!insn_data[icode].operand[2].predicate (op1, mode1))
33845 op1 = copy_to_mode_reg (mode1, op1);
33847 pat = GEN_FCN (icode) (target, op0, op1);
33848 if (! pat)
33849 return 0;
33851 emit_insn (pat);
33853 return target;
33856 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
33858 static rtx
33859 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
33860 enum ix86_builtin_func_type m_type,
33861 enum rtx_code sub_code)
33863 rtx pat;
33864 int i;
33865 int nargs;
33866 bool comparison_p = false;
33867 bool tf_p = false;
33868 bool last_arg_constant = false;
33869 int num_memory = 0;
33870 struct {
33871 rtx op;
33872 machine_mode mode;
33873 } args[4];
33875 machine_mode tmode = insn_data[icode].operand[0].mode;
33877 switch (m_type)
33879 case MULTI_ARG_4_DF2_DI_I:
33880 case MULTI_ARG_4_DF2_DI_I1:
33881 case MULTI_ARG_4_SF2_SI_I:
33882 case MULTI_ARG_4_SF2_SI_I1:
33883 nargs = 4;
33884 last_arg_constant = true;
33885 break;
33887 case MULTI_ARG_3_SF:
33888 case MULTI_ARG_3_DF:
33889 case MULTI_ARG_3_SF2:
33890 case MULTI_ARG_3_DF2:
33891 case MULTI_ARG_3_DI:
33892 case MULTI_ARG_3_SI:
33893 case MULTI_ARG_3_SI_DI:
33894 case MULTI_ARG_3_HI:
33895 case MULTI_ARG_3_HI_SI:
33896 case MULTI_ARG_3_QI:
33897 case MULTI_ARG_3_DI2:
33898 case MULTI_ARG_3_SI2:
33899 case MULTI_ARG_3_HI2:
33900 case MULTI_ARG_3_QI2:
33901 nargs = 3;
33902 break;
33904 case MULTI_ARG_2_SF:
33905 case MULTI_ARG_2_DF:
33906 case MULTI_ARG_2_DI:
33907 case MULTI_ARG_2_SI:
33908 case MULTI_ARG_2_HI:
33909 case MULTI_ARG_2_QI:
33910 nargs = 2;
33911 break;
33913 case MULTI_ARG_2_DI_IMM:
33914 case MULTI_ARG_2_SI_IMM:
33915 case MULTI_ARG_2_HI_IMM:
33916 case MULTI_ARG_2_QI_IMM:
33917 nargs = 2;
33918 last_arg_constant = true;
33919 break;
33921 case MULTI_ARG_1_SF:
33922 case MULTI_ARG_1_DF:
33923 case MULTI_ARG_1_SF2:
33924 case MULTI_ARG_1_DF2:
33925 case MULTI_ARG_1_DI:
33926 case MULTI_ARG_1_SI:
33927 case MULTI_ARG_1_HI:
33928 case MULTI_ARG_1_QI:
33929 case MULTI_ARG_1_SI_DI:
33930 case MULTI_ARG_1_HI_DI:
33931 case MULTI_ARG_1_HI_SI:
33932 case MULTI_ARG_1_QI_DI:
33933 case MULTI_ARG_1_QI_SI:
33934 case MULTI_ARG_1_QI_HI:
33935 nargs = 1;
33936 break;
33938 case MULTI_ARG_2_DI_CMP:
33939 case MULTI_ARG_2_SI_CMP:
33940 case MULTI_ARG_2_HI_CMP:
33941 case MULTI_ARG_2_QI_CMP:
33942 nargs = 2;
33943 comparison_p = true;
33944 break;
33946 case MULTI_ARG_2_SF_TF:
33947 case MULTI_ARG_2_DF_TF:
33948 case MULTI_ARG_2_DI_TF:
33949 case MULTI_ARG_2_SI_TF:
33950 case MULTI_ARG_2_HI_TF:
33951 case MULTI_ARG_2_QI_TF:
33952 nargs = 2;
33953 tf_p = true;
33954 break;
33956 default:
33957 gcc_unreachable ();
33960 if (optimize || !target
33961 || GET_MODE (target) != tmode
33962 || !insn_data[icode].operand[0].predicate (target, tmode))
33963 target = gen_reg_rtx (tmode);
33964 else if (memory_operand (target, tmode))
33965 num_memory++;
33967 gcc_assert (nargs <= 4);
33969 for (i = 0; i < nargs; i++)
33971 tree arg = CALL_EXPR_ARG (exp, i);
33972 rtx op = expand_normal (arg);
33973 int adjust = (comparison_p) ? 1 : 0;
33974 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
33976 if (last_arg_constant && i == nargs - 1)
33978 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
33980 enum insn_code new_icode = icode;
33981 switch (icode)
33983 case CODE_FOR_xop_vpermil2v2df3:
33984 case CODE_FOR_xop_vpermil2v4sf3:
33985 case CODE_FOR_xop_vpermil2v4df3:
33986 case CODE_FOR_xop_vpermil2v8sf3:
33987 error ("the last argument must be a 2-bit immediate");
33988 return gen_reg_rtx (tmode);
33989 case CODE_FOR_xop_rotlv2di3:
33990 new_icode = CODE_FOR_rotlv2di3;
33991 goto xop_rotl;
33992 case CODE_FOR_xop_rotlv4si3:
33993 new_icode = CODE_FOR_rotlv4si3;
33994 goto xop_rotl;
33995 case CODE_FOR_xop_rotlv8hi3:
33996 new_icode = CODE_FOR_rotlv8hi3;
33997 goto xop_rotl;
33998 case CODE_FOR_xop_rotlv16qi3:
33999 new_icode = CODE_FOR_rotlv16qi3;
34000 xop_rotl:
34001 if (CONST_INT_P (op))
34003 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
34004 op = GEN_INT (INTVAL (op) & mask);
34005 gcc_checking_assert
34006 (insn_data[icode].operand[i + 1].predicate (op, mode));
34008 else
34010 gcc_checking_assert
34011 (nargs == 2
34012 && insn_data[new_icode].operand[0].mode == tmode
34013 && insn_data[new_icode].operand[1].mode == tmode
34014 && insn_data[new_icode].operand[2].mode == mode
34015 && insn_data[new_icode].operand[0].predicate
34016 == insn_data[icode].operand[0].predicate
34017 && insn_data[new_icode].operand[1].predicate
34018 == insn_data[icode].operand[1].predicate);
34019 icode = new_icode;
34020 goto non_constant;
34022 break;
34023 default:
34024 gcc_unreachable ();
34028 else
34030 non_constant:
34031 if (VECTOR_MODE_P (mode))
34032 op = safe_vector_operand (op, mode);
34034 /* If we aren't optimizing, only allow one memory operand to be
34035 generated. */
34036 if (memory_operand (op, mode))
34037 num_memory++;
34039 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
34041 if (optimize
34042 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
34043 || num_memory > 1)
34044 op = force_reg (mode, op);
34047 args[i].op = op;
34048 args[i].mode = mode;
34051 switch (nargs)
34053 case 1:
34054 pat = GEN_FCN (icode) (target, args[0].op);
34055 break;
34057 case 2:
34058 if (tf_p)
34059 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34060 GEN_INT ((int)sub_code));
34061 else if (! comparison_p)
34062 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34063 else
34065 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
34066 args[0].op,
34067 args[1].op);
34069 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
34071 break;
34073 case 3:
34074 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
34075 break;
34077 case 4:
34078 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
34079 break;
34081 default:
34082 gcc_unreachable ();
34085 if (! pat)
34086 return 0;
34088 emit_insn (pat);
34089 return target;
34092 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
34093 insns with vec_merge. */
34095 static rtx
34096 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
34097 rtx target)
34099 rtx pat;
34100 tree arg0 = CALL_EXPR_ARG (exp, 0);
34101 rtx op1, op0 = expand_normal (arg0);
34102 machine_mode tmode = insn_data[icode].operand[0].mode;
34103 machine_mode mode0 = insn_data[icode].operand[1].mode;
34105 if (optimize || !target
34106 || GET_MODE (target) != tmode
34107 || !insn_data[icode].operand[0].predicate (target, tmode))
34108 target = gen_reg_rtx (tmode);
34110 if (VECTOR_MODE_P (mode0))
34111 op0 = safe_vector_operand (op0, mode0);
34113 if ((optimize && !register_operand (op0, mode0))
34114 || !insn_data[icode].operand[1].predicate (op0, mode0))
34115 op0 = copy_to_mode_reg (mode0, op0);
34117 op1 = op0;
34118 if (!insn_data[icode].operand[2].predicate (op1, mode0))
34119 op1 = copy_to_mode_reg (mode0, op1);
34121 pat = GEN_FCN (icode) (target, op0, op1);
34122 if (! pat)
34123 return 0;
34124 emit_insn (pat);
34125 return target;
34128 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
34130 static rtx
34131 ix86_expand_sse_compare (const struct builtin_description *d,
34132 tree exp, rtx target, bool swap)
34134 rtx pat;
34135 tree arg0 = CALL_EXPR_ARG (exp, 0);
34136 tree arg1 = CALL_EXPR_ARG (exp, 1);
34137 rtx op0 = expand_normal (arg0);
34138 rtx op1 = expand_normal (arg1);
34139 rtx op2;
34140 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34141 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34142 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34143 enum rtx_code comparison = d->comparison;
34145 if (VECTOR_MODE_P (mode0))
34146 op0 = safe_vector_operand (op0, mode0);
34147 if (VECTOR_MODE_P (mode1))
34148 op1 = safe_vector_operand (op1, mode1);
34150 /* Swap operands if we have a comparison that isn't available in
34151 hardware. */
34152 if (swap)
34153 std::swap (op0, op1);
34155 if (optimize || !target
34156 || GET_MODE (target) != tmode
34157 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34158 target = gen_reg_rtx (tmode);
34160 if ((optimize && !register_operand (op0, mode0))
34161 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
34162 op0 = copy_to_mode_reg (mode0, op0);
34163 if ((optimize && !register_operand (op1, mode1))
34164 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
34165 op1 = copy_to_mode_reg (mode1, op1);
34167 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
34168 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34169 if (! pat)
34170 return 0;
34171 emit_insn (pat);
34172 return target;
34175 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
34177 static rtx
34178 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
34179 rtx target)
34181 rtx pat;
34182 tree arg0 = CALL_EXPR_ARG (exp, 0);
34183 tree arg1 = CALL_EXPR_ARG (exp, 1);
34184 rtx op0 = expand_normal (arg0);
34185 rtx op1 = expand_normal (arg1);
34186 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34187 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34188 enum rtx_code comparison = d->comparison;
34190 if (VECTOR_MODE_P (mode0))
34191 op0 = safe_vector_operand (op0, mode0);
34192 if (VECTOR_MODE_P (mode1))
34193 op1 = safe_vector_operand (op1, mode1);
34195 /* Swap operands if we have a comparison that isn't available in
34196 hardware. */
34197 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
34198 std::swap (op0, op1);
34200 target = gen_reg_rtx (SImode);
34201 emit_move_insn (target, const0_rtx);
34202 target = gen_rtx_SUBREG (QImode, target, 0);
34204 if ((optimize && !register_operand (op0, mode0))
34205 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34206 op0 = copy_to_mode_reg (mode0, op0);
34207 if ((optimize && !register_operand (op1, mode1))
34208 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34209 op1 = copy_to_mode_reg (mode1, op1);
34211 pat = GEN_FCN (d->icode) (op0, op1);
34212 if (! pat)
34213 return 0;
34214 emit_insn (pat);
34215 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34216 gen_rtx_fmt_ee (comparison, QImode,
34217 SET_DEST (pat),
34218 const0_rtx)));
34220 return SUBREG_REG (target);
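/* Editorial note, a sketch derived from the code above rather than from any
   separate documentation: the SImode-register / QImode-subreg dance (also
   used by the ptest and pcmpestr/pcmpistr expanders below) first zeroes a
   full SImode pseudo and then writes only its low byte through a
   STRICT_LOW_PART set, roughly

     (set (reg:SI tmp) (const_int 0))
     (set (strict_low_part (subreg:QI (reg:SI tmp) 0))
          (<comparison>:QI (reg flags) (const_int 0)))

   so the returned SUBREG_REG is already a correct zero-extended 0/1 value
   and no separate zero-extension insn is needed.  */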
34223 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
34225 static rtx
34226 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
34227 rtx target)
34229 rtx pat;
34230 tree arg0 = CALL_EXPR_ARG (exp, 0);
34231 rtx op1, op0 = expand_normal (arg0);
34232 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34233 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34235 if (optimize || target == 0
34236 || GET_MODE (target) != tmode
34237 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34238 target = gen_reg_rtx (tmode);
34240 if (VECTOR_MODE_P (mode0))
34241 op0 = safe_vector_operand (op0, mode0);
34243 if ((optimize && !register_operand (op0, mode0))
34244 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34245 op0 = copy_to_mode_reg (mode0, op0);
34247 op1 = GEN_INT (d->comparison);
34249 pat = GEN_FCN (d->icode) (target, op0, op1);
34250 if (! pat)
34251 return 0;
34252 emit_insn (pat);
34253 return target;
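/* Editorial note: for the *_ROUND builtins handled just above, the
   builtin_description's "comparison" field is not an rtx comparison code at
   all; it is reused to carry the rounding-mode immediate, which is why the
   expander emits

     op1 = GEN_INT (d->comparison);

   as the insn's last operand (e.g. a ROUND_* constant for the roundps and
   roundpd builtins -- an assumption about the concrete values, not taken
   from the tables here).  */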
34256 static rtx
34257 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
34258 tree exp, rtx target)
34260 rtx pat;
34261 tree arg0 = CALL_EXPR_ARG (exp, 0);
34262 tree arg1 = CALL_EXPR_ARG (exp, 1);
34263 rtx op0 = expand_normal (arg0);
34264 rtx op1 = expand_normal (arg1);
34265 rtx op2;
34266 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34267 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34268 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34270 if (optimize || target == 0
34271 || GET_MODE (target) != tmode
34272 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34273 target = gen_reg_rtx (tmode);
34275 op0 = safe_vector_operand (op0, mode0);
34276 op1 = safe_vector_operand (op1, mode1);
34278 if ((optimize && !register_operand (op0, mode0))
34279 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34280 op0 = copy_to_mode_reg (mode0, op0);
34281 if ((optimize && !register_operand (op1, mode1))
34282 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34283 op1 = copy_to_mode_reg (mode1, op1);
34285 op2 = GEN_INT (d->comparison);
34287 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34288 if (! pat)
34289 return 0;
34290 emit_insn (pat);
34291 return target;
34294 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
34296 static rtx
34297 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34298 rtx target)
34300 rtx pat;
34301 tree arg0 = CALL_EXPR_ARG (exp, 0);
34302 tree arg1 = CALL_EXPR_ARG (exp, 1);
34303 rtx op0 = expand_normal (arg0);
34304 rtx op1 = expand_normal (arg1);
34305 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34306 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34307 enum rtx_code comparison = d->comparison;
34309 if (VECTOR_MODE_P (mode0))
34310 op0 = safe_vector_operand (op0, mode0);
34311 if (VECTOR_MODE_P (mode1))
34312 op1 = safe_vector_operand (op1, mode1);
34314 target = gen_reg_rtx (SImode);
34315 emit_move_insn (target, const0_rtx);
34316 target = gen_rtx_SUBREG (QImode, target, 0);
34318 if ((optimize && !register_operand (op0, mode0))
34319 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34320 op0 = copy_to_mode_reg (mode0, op0);
34321 if ((optimize && !register_operand (op1, mode1))
34322 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34323 op1 = copy_to_mode_reg (mode1, op1);
34325 pat = GEN_FCN (d->icode) (op0, op1);
34326 if (! pat)
34327 return 0;
34328 emit_insn (pat);
34329 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34330 gen_rtx_fmt_ee (comparison, QImode,
34331 SET_DEST (pat),
34332 const0_rtx)));
34334 return SUBREG_REG (target);
34337 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
34339 static rtx
34340 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34341 tree exp, rtx target)
34343 rtx pat;
34344 tree arg0 = CALL_EXPR_ARG (exp, 0);
34345 tree arg1 = CALL_EXPR_ARG (exp, 1);
34346 tree arg2 = CALL_EXPR_ARG (exp, 2);
34347 tree arg3 = CALL_EXPR_ARG (exp, 3);
34348 tree arg4 = CALL_EXPR_ARG (exp, 4);
34349 rtx scratch0, scratch1;
34350 rtx op0 = expand_normal (arg0);
34351 rtx op1 = expand_normal (arg1);
34352 rtx op2 = expand_normal (arg2);
34353 rtx op3 = expand_normal (arg3);
34354 rtx op4 = expand_normal (arg4);
34355 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34357 tmode0 = insn_data[d->icode].operand[0].mode;
34358 tmode1 = insn_data[d->icode].operand[1].mode;
34359 modev2 = insn_data[d->icode].operand[2].mode;
34360 modei3 = insn_data[d->icode].operand[3].mode;
34361 modev4 = insn_data[d->icode].operand[4].mode;
34362 modei5 = insn_data[d->icode].operand[5].mode;
34363 modeimm = insn_data[d->icode].operand[6].mode;
34365 if (VECTOR_MODE_P (modev2))
34366 op0 = safe_vector_operand (op0, modev2);
34367 if (VECTOR_MODE_P (modev4))
34368 op2 = safe_vector_operand (op2, modev4);
34370 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34371 op0 = copy_to_mode_reg (modev2, op0);
34372 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34373 op1 = copy_to_mode_reg (modei3, op1);
34374 if ((optimize && !register_operand (op2, modev4))
34375 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34376 op2 = copy_to_mode_reg (modev4, op2);
34377 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34378 op3 = copy_to_mode_reg (modei5, op3);
34380 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34382 error ("the fifth argument must be an 8-bit immediate");
34383 return const0_rtx;
34386 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34388 if (optimize || !target
34389 || GET_MODE (target) != tmode0
34390 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34391 target = gen_reg_rtx (tmode0);
34393 scratch1 = gen_reg_rtx (tmode1);
34395 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34397 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34399 if (optimize || !target
34400 || GET_MODE (target) != tmode1
34401 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34402 target = gen_reg_rtx (tmode1);
34404 scratch0 = gen_reg_rtx (tmode0);
34406 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34408 else
34410 gcc_assert (d->flag);
34412 scratch0 = gen_reg_rtx (tmode0);
34413 scratch1 = gen_reg_rtx (tmode1);
34415 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34418 if (! pat)
34419 return 0;
34421 emit_insn (pat);
34423 if (d->flag)
34425 target = gen_reg_rtx (SImode);
34426 emit_move_insn (target, const0_rtx);
34427 target = gen_rtx_SUBREG (QImode, target, 0);
34429 emit_insn
34430 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34431 gen_rtx_fmt_ee (EQ, QImode,
34432 gen_rtx_REG ((machine_mode) d->flag,
34433 FLAGS_REG),
34434 const0_rtx)));
34435 return SUBREG_REG (target);
34437 else
34438 return target;
34442 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
34444 static rtx
34445 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34446 tree exp, rtx target)
34448 rtx pat;
34449 tree arg0 = CALL_EXPR_ARG (exp, 0);
34450 tree arg1 = CALL_EXPR_ARG (exp, 1);
34451 tree arg2 = CALL_EXPR_ARG (exp, 2);
34452 rtx scratch0, scratch1;
34453 rtx op0 = expand_normal (arg0);
34454 rtx op1 = expand_normal (arg1);
34455 rtx op2 = expand_normal (arg2);
34456 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34458 tmode0 = insn_data[d->icode].operand[0].mode;
34459 tmode1 = insn_data[d->icode].operand[1].mode;
34460 modev2 = insn_data[d->icode].operand[2].mode;
34461 modev3 = insn_data[d->icode].operand[3].mode;
34462 modeimm = insn_data[d->icode].operand[4].mode;
34464 if (VECTOR_MODE_P (modev2))
34465 op0 = safe_vector_operand (op0, modev2);
34466 if (VECTOR_MODE_P (modev3))
34467 op1 = safe_vector_operand (op1, modev3);
34469 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34470 op0 = copy_to_mode_reg (modev2, op0);
34471 if ((optimize && !register_operand (op1, modev3))
34472 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34473 op1 = copy_to_mode_reg (modev3, op1);
34475 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34477 error ("the third argument must be an 8-bit immediate");
34478 return const0_rtx;
34481 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34483 if (optimize || !target
34484 || GET_MODE (target) != tmode0
34485 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34486 target = gen_reg_rtx (tmode0);
34488 scratch1 = gen_reg_rtx (tmode1);
34490 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34492 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34494 if (optimize || !target
34495 || GET_MODE (target) != tmode1
34496 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34497 target = gen_reg_rtx (tmode1);
34499 scratch0 = gen_reg_rtx (tmode0);
34501 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34503 else
34505 gcc_assert (d->flag);
34507 scratch0 = gen_reg_rtx (tmode0);
34508 scratch1 = gen_reg_rtx (tmode1);
34510 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34513 if (! pat)
34514 return 0;
34516 emit_insn (pat);
34518 if (d->flag)
34520 target = gen_reg_rtx (SImode);
34521 emit_move_insn (target, const0_rtx);
34522 target = gen_rtx_SUBREG (QImode, target, 0);
34524 emit_insn
34525 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34526 gen_rtx_fmt_ee (EQ, QImode,
34527 gen_rtx_REG ((machine_mode) d->flag,
34528 FLAGS_REG),
34529 const0_rtx)));
34530 return SUBREG_REG (target);
34532 else
34533 return target;
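/* Editorial note, a hedged summary of the two expanders above: a pcmpestr /
   pcmpistr pattern has both an index result (tmode0, a general register,
   presumably ECX) and a mask result (tmode1, an XMM register), plus the
   flags.  Which one the builtin returns is selected by d->code / d->flag:

     IX86_BUILTIN_PCMP?STRI128  ->  return the index, mask goes to a scratch
     IX86_BUILTIN_PCMP?STRM128  ->  return the mask, index goes to a scratch
     otherwise (d->flag != 0)   ->  both go to scratches and one bit of the
                                    flags register (mode d->flag) becomes the
                                    int result, e.g. for the _mm_cmpistrz
                                    style tests (assumed user-level names).  */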
34536 /* Subroutine of ix86_expand_builtin to take care of insns with
34537 a variable number of operands. */
34539 static rtx
34540 ix86_expand_args_builtin (const struct builtin_description *d,
34541 tree exp, rtx target)
34543 rtx pat, real_target;
34544 unsigned int i, nargs;
34545 unsigned int nargs_constant = 0;
34546 unsigned int mask_pos = 0;
34547 int num_memory = 0;
34548 struct
34550 rtx op;
34551 machine_mode mode;
34552 } args[6];
34553 bool second_arg_count = false;
34554 enum insn_code icode = d->icode;
34555 const struct insn_data_d *insn_p = &insn_data[icode];
34556 machine_mode tmode = insn_p->operand[0].mode;
34557 machine_mode rmode = VOIDmode;
34558 bool swap = false;
34559 enum rtx_code comparison = d->comparison;
34561 switch ((enum ix86_builtin_func_type) d->flag)
34563 case V2DF_FTYPE_V2DF_ROUND:
34564 case V4DF_FTYPE_V4DF_ROUND:
34565 case V8DF_FTYPE_V8DF_ROUND:
34566 case V4SF_FTYPE_V4SF_ROUND:
34567 case V8SF_FTYPE_V8SF_ROUND:
34568 case V16SF_FTYPE_V16SF_ROUND:
34569 case V4SI_FTYPE_V4SF_ROUND:
34570 case V8SI_FTYPE_V8SF_ROUND:
34571 case V16SI_FTYPE_V16SF_ROUND:
34572 return ix86_expand_sse_round (d, exp, target);
34573 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34574 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34575 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34576 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34577 case INT_FTYPE_V8SF_V8SF_PTEST:
34578 case INT_FTYPE_V4DI_V4DI_PTEST:
34579 case INT_FTYPE_V4DF_V4DF_PTEST:
34580 case INT_FTYPE_V4SF_V4SF_PTEST:
34581 case INT_FTYPE_V2DI_V2DI_PTEST:
34582 case INT_FTYPE_V2DF_V2DF_PTEST:
34583 return ix86_expand_sse_ptest (d, exp, target);
34584 case FLOAT128_FTYPE_FLOAT128:
34585 case FLOAT_FTYPE_FLOAT:
34586 case INT_FTYPE_INT:
34587 case UINT_FTYPE_UINT:
34588 case UINT16_FTYPE_UINT16:
34589 case UINT64_FTYPE_INT:
34590 case UINT64_FTYPE_UINT64:
34591 case INT64_FTYPE_INT64:
34592 case INT64_FTYPE_V4SF:
34593 case INT64_FTYPE_V2DF:
34594 case INT_FTYPE_V16QI:
34595 case INT_FTYPE_V8QI:
34596 case INT_FTYPE_V8SF:
34597 case INT_FTYPE_V4DF:
34598 case INT_FTYPE_V4SF:
34599 case INT_FTYPE_V2DF:
34600 case INT_FTYPE_V32QI:
34601 case V16QI_FTYPE_V16QI:
34602 case V8SI_FTYPE_V8SF:
34603 case V8SI_FTYPE_V4SI:
34604 case V8HI_FTYPE_V8HI:
34605 case V8HI_FTYPE_V16QI:
34606 case V8QI_FTYPE_V8QI:
34607 case V8SF_FTYPE_V8SF:
34608 case V8SF_FTYPE_V8SI:
34609 case V8SF_FTYPE_V4SF:
34610 case V8SF_FTYPE_V8HI:
34611 case V4SI_FTYPE_V4SI:
34612 case V4SI_FTYPE_V16QI:
34613 case V4SI_FTYPE_V4SF:
34614 case V4SI_FTYPE_V8SI:
34615 case V4SI_FTYPE_V8HI:
34616 case V4SI_FTYPE_V4DF:
34617 case V4SI_FTYPE_V2DF:
34618 case V4HI_FTYPE_V4HI:
34619 case V4DF_FTYPE_V4DF:
34620 case V4DF_FTYPE_V4SI:
34621 case V4DF_FTYPE_V4SF:
34622 case V4DF_FTYPE_V2DF:
34623 case V4SF_FTYPE_V4SF:
34624 case V4SF_FTYPE_V4SI:
34625 case V4SF_FTYPE_V8SF:
34626 case V4SF_FTYPE_V4DF:
34627 case V4SF_FTYPE_V8HI:
34628 case V4SF_FTYPE_V2DF:
34629 case V2DI_FTYPE_V2DI:
34630 case V2DI_FTYPE_V16QI:
34631 case V2DI_FTYPE_V8HI:
34632 case V2DI_FTYPE_V4SI:
34633 case V2DF_FTYPE_V2DF:
34634 case V2DF_FTYPE_V4SI:
34635 case V2DF_FTYPE_V4DF:
34636 case V2DF_FTYPE_V4SF:
34637 case V2DF_FTYPE_V2SI:
34638 case V2SI_FTYPE_V2SI:
34639 case V2SI_FTYPE_V4SF:
34640 case V2SI_FTYPE_V2SF:
34641 case V2SI_FTYPE_V2DF:
34642 case V2SF_FTYPE_V2SF:
34643 case V2SF_FTYPE_V2SI:
34644 case V32QI_FTYPE_V32QI:
34645 case V32QI_FTYPE_V16QI:
34646 case V16HI_FTYPE_V16HI:
34647 case V16HI_FTYPE_V8HI:
34648 case V8SI_FTYPE_V8SI:
34649 case V16HI_FTYPE_V16QI:
34650 case V8SI_FTYPE_V16QI:
34651 case V4DI_FTYPE_V16QI:
34652 case V8SI_FTYPE_V8HI:
34653 case V4DI_FTYPE_V8HI:
34654 case V4DI_FTYPE_V4SI:
34655 case V4DI_FTYPE_V2DI:
34656 case UQI_FTYPE_UQI:
34657 case UHI_FTYPE_UHI:
34658 case USI_FTYPE_USI:
34659 case USI_FTYPE_UQI:
34660 case USI_FTYPE_UHI:
34661 case UDI_FTYPE_UDI:
34662 case UHI_FTYPE_V16QI:
34663 case USI_FTYPE_V32QI:
34664 case UDI_FTYPE_V64QI:
34665 case V16QI_FTYPE_UHI:
34666 case V32QI_FTYPE_USI:
34667 case V64QI_FTYPE_UDI:
34668 case V8HI_FTYPE_UQI:
34669 case V16HI_FTYPE_UHI:
34670 case V32HI_FTYPE_USI:
34671 case V4SI_FTYPE_UQI:
34672 case V8SI_FTYPE_UQI:
34673 case V4SI_FTYPE_UHI:
34674 case V8SI_FTYPE_UHI:
34675 case UQI_FTYPE_V8HI:
34676 case UHI_FTYPE_V16HI:
34677 case USI_FTYPE_V32HI:
34678 case UQI_FTYPE_V4SI:
34679 case UQI_FTYPE_V8SI:
34680 case UHI_FTYPE_V16SI:
34681 case UQI_FTYPE_V2DI:
34682 case UQI_FTYPE_V4DI:
34683 case UQI_FTYPE_V8DI:
34684 case V16SI_FTYPE_UHI:
34685 case V2DI_FTYPE_UQI:
34686 case V4DI_FTYPE_UQI:
34687 case V16SI_FTYPE_INT:
34688 case V16SF_FTYPE_V8SF:
34689 case V16SI_FTYPE_V8SI:
34690 case V16SF_FTYPE_V4SF:
34691 case V16SI_FTYPE_V4SI:
34692 case V16SI_FTYPE_V16SF:
34693 case V16SI_FTYPE_V16SI:
34694 case V64QI_FTYPE_V64QI:
34695 case V32HI_FTYPE_V32HI:
34696 case V16SF_FTYPE_V16SF:
34697 case V8DI_FTYPE_UQI:
34698 case V8DI_FTYPE_V8DI:
34699 case V8DF_FTYPE_V4DF:
34700 case V8DF_FTYPE_V2DF:
34701 case V8DF_FTYPE_V8DF:
34702 case V4DI_FTYPE_V4DI:
34703 nargs = 1;
34704 break;
34705 case V4SF_FTYPE_V4SF_VEC_MERGE:
34706 case V2DF_FTYPE_V2DF_VEC_MERGE:
34707 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
34708 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
34709 case V16QI_FTYPE_V16QI_V16QI:
34710 case V16QI_FTYPE_V8HI_V8HI:
34711 case V16SF_FTYPE_V16SF_V16SF:
34712 case V8QI_FTYPE_V8QI_V8QI:
34713 case V8QI_FTYPE_V4HI_V4HI:
34714 case V8HI_FTYPE_V8HI_V8HI:
34715 case V8HI_FTYPE_V16QI_V16QI:
34716 case V8HI_FTYPE_V4SI_V4SI:
34717 case V8SF_FTYPE_V8SF_V8SF:
34718 case V8SF_FTYPE_V8SF_V8SI:
34719 case V8DF_FTYPE_V8DF_V8DF:
34720 case V4SI_FTYPE_V4SI_V4SI:
34721 case V4SI_FTYPE_V8HI_V8HI:
34722 case V4SI_FTYPE_V2DF_V2DF:
34723 case V4HI_FTYPE_V4HI_V4HI:
34724 case V4HI_FTYPE_V8QI_V8QI:
34725 case V4HI_FTYPE_V2SI_V2SI:
34726 case V4DF_FTYPE_V4DF_V4DF:
34727 case V4DF_FTYPE_V4DF_V4DI:
34728 case V4SF_FTYPE_V4SF_V4SF:
34729 case V4SF_FTYPE_V4SF_V4SI:
34730 case V4SF_FTYPE_V4SF_V2SI:
34731 case V4SF_FTYPE_V4SF_V2DF:
34732 case V4SF_FTYPE_V4SF_UINT:
34733 case V4SF_FTYPE_V4SF_DI:
34734 case V4SF_FTYPE_V4SF_SI:
34735 case V2DI_FTYPE_V2DI_V2DI:
34736 case V2DI_FTYPE_V16QI_V16QI:
34737 case V2DI_FTYPE_V4SI_V4SI:
34738 case V2DI_FTYPE_V2DI_V16QI:
34739 case V2SI_FTYPE_V2SI_V2SI:
34740 case V2SI_FTYPE_V4HI_V4HI:
34741 case V2SI_FTYPE_V2SF_V2SF:
34742 case V2DF_FTYPE_V2DF_V2DF:
34743 case V2DF_FTYPE_V2DF_V4SF:
34744 case V2DF_FTYPE_V2DF_V2DI:
34745 case V2DF_FTYPE_V2DF_DI:
34746 case V2DF_FTYPE_V2DF_SI:
34747 case V2DF_FTYPE_V2DF_UINT:
34748 case V2SF_FTYPE_V2SF_V2SF:
34749 case V1DI_FTYPE_V1DI_V1DI:
34750 case V1DI_FTYPE_V8QI_V8QI:
34751 case V1DI_FTYPE_V2SI_V2SI:
34752 case V32QI_FTYPE_V16HI_V16HI:
34753 case V16HI_FTYPE_V8SI_V8SI:
34754 case V64QI_FTYPE_V64QI_V64QI:
34755 case V32QI_FTYPE_V32QI_V32QI:
34756 case V16HI_FTYPE_V32QI_V32QI:
34757 case V16HI_FTYPE_V16HI_V16HI:
34758 case V8SI_FTYPE_V4DF_V4DF:
34759 case V8SI_FTYPE_V8SI_V8SI:
34760 case V8SI_FTYPE_V16HI_V16HI:
34761 case V4DI_FTYPE_V4DI_V4DI:
34762 case V4DI_FTYPE_V8SI_V8SI:
34763 case V8DI_FTYPE_V64QI_V64QI:
34764 if (comparison == UNKNOWN)
34765 return ix86_expand_binop_builtin (icode, exp, target);
34766 nargs = 2;
34767 break;
34768 case V4SF_FTYPE_V4SF_V4SF_SWAP:
34769 case V2DF_FTYPE_V2DF_V2DF_SWAP:
34770 gcc_assert (comparison != UNKNOWN);
34771 nargs = 2;
34772 swap = true;
34773 break;
34774 case V16HI_FTYPE_V16HI_V8HI_COUNT:
34775 case V16HI_FTYPE_V16HI_SI_COUNT:
34776 case V8SI_FTYPE_V8SI_V4SI_COUNT:
34777 case V8SI_FTYPE_V8SI_SI_COUNT:
34778 case V4DI_FTYPE_V4DI_V2DI_COUNT:
34779 case V4DI_FTYPE_V4DI_INT_COUNT:
34780 case V8HI_FTYPE_V8HI_V8HI_COUNT:
34781 case V8HI_FTYPE_V8HI_SI_COUNT:
34782 case V4SI_FTYPE_V4SI_V4SI_COUNT:
34783 case V4SI_FTYPE_V4SI_SI_COUNT:
34784 case V4HI_FTYPE_V4HI_V4HI_COUNT:
34785 case V4HI_FTYPE_V4HI_SI_COUNT:
34786 case V2DI_FTYPE_V2DI_V2DI_COUNT:
34787 case V2DI_FTYPE_V2DI_SI_COUNT:
34788 case V2SI_FTYPE_V2SI_V2SI_COUNT:
34789 case V2SI_FTYPE_V2SI_SI_COUNT:
34790 case V1DI_FTYPE_V1DI_V1DI_COUNT:
34791 case V1DI_FTYPE_V1DI_SI_COUNT:
34792 nargs = 2;
34793 second_arg_count = true;
34794 break;
34795 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
34796 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
34797 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
34798 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
34799 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
34800 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
34801 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
34802 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
34803 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
34804 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
34805 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
34806 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
34807 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
34808 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
34809 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
34810 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
34811 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
34812 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
34813 nargs = 4;
34814 second_arg_count = true;
34815 break;
34816 case UINT64_FTYPE_UINT64_UINT64:
34817 case UINT_FTYPE_UINT_UINT:
34818 case UINT_FTYPE_UINT_USHORT:
34819 case UINT_FTYPE_UINT_UCHAR:
34820 case UINT16_FTYPE_UINT16_INT:
34821 case UINT8_FTYPE_UINT8_INT:
34822 case UQI_FTYPE_UQI_UQI:
34823 case UHI_FTYPE_UHI_UHI:
34824 case USI_FTYPE_USI_USI:
34825 case UDI_FTYPE_UDI_UDI:
34826 case V16SI_FTYPE_V8DF_V8DF:
34827 nargs = 2;
34828 break;
34829 case V2DI_FTYPE_V2DI_INT_CONVERT:
34830 nargs = 2;
34831 rmode = V1TImode;
34832 nargs_constant = 1;
34833 break;
34834 case V4DI_FTYPE_V4DI_INT_CONVERT:
34835 nargs = 2;
34836 rmode = V2TImode;
34837 nargs_constant = 1;
34838 break;
34839 case V8DI_FTYPE_V8DI_INT_CONVERT:
34840 nargs = 2;
34841 rmode = V4TImode;
34842 nargs_constant = 1;
34843 break;
34844 case V8HI_FTYPE_V8HI_INT:
34845 case V8HI_FTYPE_V8SF_INT:
34846 case V16HI_FTYPE_V16SF_INT:
34847 case V8HI_FTYPE_V4SF_INT:
34848 case V8SF_FTYPE_V8SF_INT:
34849 case V4SF_FTYPE_V16SF_INT:
34850 case V16SF_FTYPE_V16SF_INT:
34851 case V4SI_FTYPE_V4SI_INT:
34852 case V4SI_FTYPE_V8SI_INT:
34853 case V4HI_FTYPE_V4HI_INT:
34854 case V4DF_FTYPE_V4DF_INT:
34855 case V4DF_FTYPE_V8DF_INT:
34856 case V4SF_FTYPE_V4SF_INT:
34857 case V4SF_FTYPE_V8SF_INT:
34858 case V2DI_FTYPE_V2DI_INT:
34859 case V2DF_FTYPE_V2DF_INT:
34860 case V2DF_FTYPE_V4DF_INT:
34861 case V16HI_FTYPE_V16HI_INT:
34862 case V8SI_FTYPE_V8SI_INT:
34863 case V16SI_FTYPE_V16SI_INT:
34864 case V4SI_FTYPE_V16SI_INT:
34865 case V4DI_FTYPE_V4DI_INT:
34866 case V2DI_FTYPE_V4DI_INT:
34867 case V4DI_FTYPE_V8DI_INT:
34868 case QI_FTYPE_V4SF_INT:
34869 case QI_FTYPE_V2DF_INT:
34870 case UQI_FTYPE_UQI_UQI_CONST:
34871 case UHI_FTYPE_UHI_UQI:
34872 case USI_FTYPE_USI_UQI:
34873 case UDI_FTYPE_UDI_UQI:
34874 nargs = 2;
34875 nargs_constant = 1;
34876 break;
34877 case V16QI_FTYPE_V16QI_V16QI_V16QI:
34878 case V8SF_FTYPE_V8SF_V8SF_V8SF:
34879 case V4DF_FTYPE_V4DF_V4DF_V4DF:
34880 case V4SF_FTYPE_V4SF_V4SF_V4SF:
34881 case V2DF_FTYPE_V2DF_V2DF_V2DF:
34882 case V32QI_FTYPE_V32QI_V32QI_V32QI:
34883 case UHI_FTYPE_V16SI_V16SI_UHI:
34884 case UQI_FTYPE_V8DI_V8DI_UQI:
34885 case V16HI_FTYPE_V16SI_V16HI_UHI:
34886 case V16QI_FTYPE_V16SI_V16QI_UHI:
34887 case V16QI_FTYPE_V8DI_V16QI_UQI:
34888 case V16SF_FTYPE_V16SF_V16SF_UHI:
34889 case V16SF_FTYPE_V4SF_V16SF_UHI:
34890 case V16SI_FTYPE_SI_V16SI_UHI:
34891 case V16SI_FTYPE_V16HI_V16SI_UHI:
34892 case V16SI_FTYPE_V16QI_V16SI_UHI:
34893 case V8SF_FTYPE_V4SF_V8SF_UQI:
34894 case V4DF_FTYPE_V2DF_V4DF_UQI:
34895 case V8SI_FTYPE_V4SI_V8SI_UQI:
34896 case V8SI_FTYPE_SI_V8SI_UQI:
34897 case V4SI_FTYPE_V4SI_V4SI_UQI:
34898 case V4SI_FTYPE_SI_V4SI_UQI:
34899 case V4DI_FTYPE_V2DI_V4DI_UQI:
34900 case V4DI_FTYPE_DI_V4DI_UQI:
34901 case V2DI_FTYPE_V2DI_V2DI_UQI:
34902 case V2DI_FTYPE_DI_V2DI_UQI:
34903 case V64QI_FTYPE_V64QI_V64QI_UDI:
34904 case V64QI_FTYPE_V16QI_V64QI_UDI:
34905 case V64QI_FTYPE_QI_V64QI_UDI:
34906 case V32QI_FTYPE_V32QI_V32QI_USI:
34907 case V32QI_FTYPE_V16QI_V32QI_USI:
34908 case V32QI_FTYPE_QI_V32QI_USI:
34909 case V16QI_FTYPE_V16QI_V16QI_UHI:
34910 case V16QI_FTYPE_QI_V16QI_UHI:
34911 case V32HI_FTYPE_V8HI_V32HI_USI:
34912 case V32HI_FTYPE_HI_V32HI_USI:
34913 case V16HI_FTYPE_V8HI_V16HI_UHI:
34914 case V16HI_FTYPE_HI_V16HI_UHI:
34915 case V8HI_FTYPE_V8HI_V8HI_UQI:
34916 case V8HI_FTYPE_HI_V8HI_UQI:
34917 case V8SF_FTYPE_V8HI_V8SF_UQI:
34918 case V4SF_FTYPE_V8HI_V4SF_UQI:
34919 case V8SI_FTYPE_V8SF_V8SI_UQI:
34920 case V4SI_FTYPE_V4SF_V4SI_UQI:
34921 case V4DI_FTYPE_V4SF_V4DI_UQI:
34922 case V2DI_FTYPE_V4SF_V2DI_UQI:
34923 case V4SF_FTYPE_V4DI_V4SF_UQI:
34924 case V4SF_FTYPE_V2DI_V4SF_UQI:
34925 case V4DF_FTYPE_V4DI_V4DF_UQI:
34926 case V2DF_FTYPE_V2DI_V2DF_UQI:
34927 case V16QI_FTYPE_V8HI_V16QI_UQI:
34928 case V16QI_FTYPE_V16HI_V16QI_UHI:
34929 case V16QI_FTYPE_V4SI_V16QI_UQI:
34930 case V16QI_FTYPE_V8SI_V16QI_UQI:
34931 case V8HI_FTYPE_V4SI_V8HI_UQI:
34932 case V8HI_FTYPE_V8SI_V8HI_UQI:
34933 case V16QI_FTYPE_V2DI_V16QI_UQI:
34934 case V16QI_FTYPE_V4DI_V16QI_UQI:
34935 case V8HI_FTYPE_V2DI_V8HI_UQI:
34936 case V8HI_FTYPE_V4DI_V8HI_UQI:
34937 case V4SI_FTYPE_V2DI_V4SI_UQI:
34938 case V4SI_FTYPE_V4DI_V4SI_UQI:
34939 case V32QI_FTYPE_V32HI_V32QI_USI:
34940 case UHI_FTYPE_V16QI_V16QI_UHI:
34941 case USI_FTYPE_V32QI_V32QI_USI:
34942 case UDI_FTYPE_V64QI_V64QI_UDI:
34943 case UQI_FTYPE_V8HI_V8HI_UQI:
34944 case UHI_FTYPE_V16HI_V16HI_UHI:
34945 case USI_FTYPE_V32HI_V32HI_USI:
34946 case UQI_FTYPE_V4SI_V4SI_UQI:
34947 case UQI_FTYPE_V8SI_V8SI_UQI:
34948 case UQI_FTYPE_V2DI_V2DI_UQI:
34949 case UQI_FTYPE_V4DI_V4DI_UQI:
34950 case V4SF_FTYPE_V2DF_V4SF_UQI:
34951 case V4SF_FTYPE_V4DF_V4SF_UQI:
34952 case V16SI_FTYPE_V16SI_V16SI_UHI:
34953 case V16SI_FTYPE_V4SI_V16SI_UHI:
34954 case V2DI_FTYPE_V4SI_V2DI_UQI:
34955 case V2DI_FTYPE_V8HI_V2DI_UQI:
34956 case V2DI_FTYPE_V16QI_V2DI_UQI:
34957 case V4DI_FTYPE_V4DI_V4DI_UQI:
34958 case V4DI_FTYPE_V4SI_V4DI_UQI:
34959 case V4DI_FTYPE_V8HI_V4DI_UQI:
34960 case V4DI_FTYPE_V16QI_V4DI_UQI:
34961 case V4DI_FTYPE_V4DF_V4DI_UQI:
34962 case V2DI_FTYPE_V2DF_V2DI_UQI:
34963 case V4SI_FTYPE_V4DF_V4SI_UQI:
34964 case V4SI_FTYPE_V2DF_V4SI_UQI:
34965 case V4SI_FTYPE_V8HI_V4SI_UQI:
34966 case V4SI_FTYPE_V16QI_V4SI_UQI:
34967 case V4DI_FTYPE_V4DI_V4DI_V4DI:
34968 case V8DF_FTYPE_V2DF_V8DF_UQI:
34969 case V8DF_FTYPE_V4DF_V8DF_UQI:
34970 case V8DF_FTYPE_V8DF_V8DF_UQI:
34971 case V8SF_FTYPE_V8SF_V8SF_UQI:
34972 case V8SF_FTYPE_V8SI_V8SF_UQI:
34973 case V4DF_FTYPE_V4DF_V4DF_UQI:
34974 case V4SF_FTYPE_V4SF_V4SF_UQI:
34975 case V2DF_FTYPE_V2DF_V2DF_UQI:
34976 case V2DF_FTYPE_V4SF_V2DF_UQI:
34977 case V2DF_FTYPE_V4SI_V2DF_UQI:
34978 case V4SF_FTYPE_V4SI_V4SF_UQI:
34979 case V4DF_FTYPE_V4SF_V4DF_UQI:
34980 case V4DF_FTYPE_V4SI_V4DF_UQI:
34981 case V8SI_FTYPE_V8SI_V8SI_UQI:
34982 case V8SI_FTYPE_V8HI_V8SI_UQI:
34983 case V8SI_FTYPE_V16QI_V8SI_UQI:
34984 case V8DF_FTYPE_V8SI_V8DF_UQI:
34985 case V8DI_FTYPE_DI_V8DI_UQI:
34986 case V16SF_FTYPE_V8SF_V16SF_UHI:
34987 case V16SI_FTYPE_V8SI_V16SI_UHI:
34988 case V16HI_FTYPE_V16HI_V16HI_UHI:
34989 case V8HI_FTYPE_V16QI_V8HI_UQI:
34990 case V16HI_FTYPE_V16QI_V16HI_UHI:
34991 case V32HI_FTYPE_V32HI_V32HI_USI:
34992 case V32HI_FTYPE_V32QI_V32HI_USI:
34993 case V8DI_FTYPE_V16QI_V8DI_UQI:
34994 case V8DI_FTYPE_V2DI_V8DI_UQI:
34995 case V8DI_FTYPE_V4DI_V8DI_UQI:
34996 case V8DI_FTYPE_V8DI_V8DI_UQI:
34997 case V8DI_FTYPE_V8HI_V8DI_UQI:
34998 case V8DI_FTYPE_V8SI_V8DI_UQI:
34999 case V8HI_FTYPE_V8DI_V8HI_UQI:
35000 case V8SI_FTYPE_V8DI_V8SI_UQI:
35001 case V4SI_FTYPE_V4SI_V4SI_V4SI:
35002 case V16SI_FTYPE_V16SI_V16SI_V16SI:
35003 case V8DI_FTYPE_V8DI_V8DI_V8DI:
35004 case V32HI_FTYPE_V32HI_V32HI_V32HI:
35005 case V2DI_FTYPE_V2DI_V2DI_V2DI:
35006 case V16HI_FTYPE_V16HI_V16HI_V16HI:
35007 case V8SI_FTYPE_V8SI_V8SI_V8SI:
35008 case V8HI_FTYPE_V8HI_V8HI_V8HI:
35009 nargs = 3;
35010 break;
35011 case V32QI_FTYPE_V32QI_V32QI_INT:
35012 case V16HI_FTYPE_V16HI_V16HI_INT:
35013 case V16QI_FTYPE_V16QI_V16QI_INT:
35014 case V4DI_FTYPE_V4DI_V4DI_INT:
35015 case V8HI_FTYPE_V8HI_V8HI_INT:
35016 case V8SI_FTYPE_V8SI_V8SI_INT:
35017 case V8SI_FTYPE_V8SI_V4SI_INT:
35018 case V8SF_FTYPE_V8SF_V8SF_INT:
35019 case V8SF_FTYPE_V8SF_V4SF_INT:
35020 case V4SI_FTYPE_V4SI_V4SI_INT:
35021 case V4DF_FTYPE_V4DF_V4DF_INT:
35022 case V16SF_FTYPE_V16SF_V16SF_INT:
35023 case V16SF_FTYPE_V16SF_V4SF_INT:
35024 case V16SI_FTYPE_V16SI_V4SI_INT:
35025 case V4DF_FTYPE_V4DF_V2DF_INT:
35026 case V4SF_FTYPE_V4SF_V4SF_INT:
35027 case V2DI_FTYPE_V2DI_V2DI_INT:
35028 case V4DI_FTYPE_V4DI_V2DI_INT:
35029 case V2DF_FTYPE_V2DF_V2DF_INT:
35030 case UQI_FTYPE_V8DI_V8UDI_INT:
35031 case UQI_FTYPE_V8DF_V8DF_INT:
35032 case UQI_FTYPE_V2DF_V2DF_INT:
35033 case UQI_FTYPE_V4SF_V4SF_INT:
35034 case UHI_FTYPE_V16SI_V16SI_INT:
35035 case UHI_FTYPE_V16SF_V16SF_INT:
35036 case V64QI_FTYPE_V64QI_V64QI_INT:
35037 case V32HI_FTYPE_V32HI_V32HI_INT:
35038 case V16SI_FTYPE_V16SI_V16SI_INT:
35039 case V8DI_FTYPE_V8DI_V8DI_INT:
35040 nargs = 3;
35041 nargs_constant = 1;
35042 break;
35043 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
35044 nargs = 3;
35045 rmode = V4DImode;
35046 nargs_constant = 1;
35047 break;
35048 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
35049 nargs = 3;
35050 rmode = V2DImode;
35051 nargs_constant = 1;
35052 break;
35053 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
35054 nargs = 3;
35055 rmode = DImode;
35056 nargs_constant = 1;
35057 break;
35058 case V2DI_FTYPE_V2DI_UINT_UINT:
35059 nargs = 3;
35060 nargs_constant = 2;
35061 break;
35062 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
35063 nargs = 3;
35064 rmode = V8DImode;
35065 nargs_constant = 1;
35066 break;
35067 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
35068 nargs = 5;
35069 rmode = V8DImode;
35070 mask_pos = 2;
35071 nargs_constant = 1;
35072 break;
35073 case QI_FTYPE_V8DF_INT_UQI:
35074 case QI_FTYPE_V4DF_INT_UQI:
35075 case QI_FTYPE_V2DF_INT_UQI:
35076 case HI_FTYPE_V16SF_INT_UHI:
35077 case QI_FTYPE_V8SF_INT_UQI:
35078 case QI_FTYPE_V4SF_INT_UQI:
35079 case V4SI_FTYPE_V4SI_V4SI_UHI:
35080 case V8SI_FTYPE_V8SI_V8SI_UHI:
35081 nargs = 3;
35082 mask_pos = 1;
35083 nargs_constant = 1;
35084 break;
35085 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
35086 nargs = 5;
35087 rmode = V4DImode;
35088 mask_pos = 2;
35089 nargs_constant = 1;
35090 break;
35091 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
35092 nargs = 5;
35093 rmode = V2DImode;
35094 mask_pos = 2;
35095 nargs_constant = 1;
35096 break;
35097 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
35098 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
35099 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
35100 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
35101 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
35102 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
35103 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
35104 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
35105 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
35106 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
35107 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
35108 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
35109 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
35110 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
35111 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
35112 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
35113 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
35114 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
35115 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
35116 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
35117 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
35118 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
35119 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
35120 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
35121 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
35122 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
35123 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
35124 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
35125 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
35126 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
35127 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
35128 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
35129 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
35130 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
35131 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
35132 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
35133 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
35134 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
35135 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
35136 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
35137 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
35138 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
35139 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
35140 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
35141 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
35142 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
35143 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
35144 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
35145 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
35146 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
35147 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
35148 nargs = 4;
35149 break;
35150 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
35151 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
35152 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
35153 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
35154 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
35155 nargs = 4;
35156 nargs_constant = 1;
35157 break;
35158 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
35159 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
35160 case QI_FTYPE_V4DF_V4DF_INT_UQI:
35161 case QI_FTYPE_V8SF_V8SF_INT_UQI:
35162 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
35163 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
35164 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
35165 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
35166 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
35167 case USI_FTYPE_V32QI_V32QI_INT_USI:
35168 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
35169 case USI_FTYPE_V32HI_V32HI_INT_USI:
35170 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
35171 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
35172 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
35173 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
35174 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
35175 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
35176 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
35177 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
35178 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
35179 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
35180 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
35181 nargs = 4;
35182 mask_pos = 1;
35183 nargs_constant = 1;
35184 break;
35185 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
35186 nargs = 4;
35187 nargs_constant = 2;
35188 break;
35189 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
35190 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
35191 nargs = 4;
35192 break;
35193 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
35194 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
35195 mask_pos = 1;
35196 nargs = 4;
35197 nargs_constant = 1;
35198 break;
35199 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
35200 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
35201 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
35202 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
35203 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
35204 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
35205 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
35206 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
35207 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
35208 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
35209 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
35210 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
35211 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
35212 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
35213 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
35214 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
35215 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
35216 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
35217 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
35218 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
35219 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
35220 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
35221 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
35222 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
35223 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
35224 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
35225 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
35226 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
35227 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
35228 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
35229 nargs = 4;
35230 mask_pos = 2;
35231 nargs_constant = 1;
35232 break;
35233 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
35234 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
35235 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
35236 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
35237 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
35238 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
35239 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
35240 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
35241 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
35242 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
35243 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
35244 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
35245 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
35246 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
35247 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
35248 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
35249 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
35250 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
35251 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
35252 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
35253 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
35254 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
35255 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
35256 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
35257 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
35258 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
35259 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
35260 nargs = 5;
35261 mask_pos = 2;
35262 nargs_constant = 1;
35263 break;
35264 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
35265 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
35266 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
35267 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
35268 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
35269 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
35270 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
35271 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
35272 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
35273 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
35274 nargs = 5;
35275 mask_pos = 1;
35276 nargs_constant = 1;
35277 break;
35278 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
35279 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
35280 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
35281 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
35282 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
35283 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
35284 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
35285 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
35286 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
35287 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
35288 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
35289 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
35290 nargs = 5;
35291 mask_pos = 1;
35292 nargs_constant = 2;
35293 break;
35295 default:
35296 gcc_unreachable ();
35299 gcc_assert (nargs <= ARRAY_SIZE (args));
35301 if (comparison != UNKNOWN)
35303 gcc_assert (nargs == 2);
35304 return ix86_expand_sse_compare (d, exp, target, swap);
35307 if (rmode == VOIDmode || rmode == tmode)
35309 if (optimize
35310 || target == 0
35311 || GET_MODE (target) != tmode
35312 || !insn_p->operand[0].predicate (target, tmode))
35313 target = gen_reg_rtx (tmode);
35314 else if (memory_operand (target, tmode))
35315 num_memory++;
35316 real_target = target;
35318 else
35320 real_target = gen_reg_rtx (tmode);
35321 target = lowpart_subreg (rmode, real_target, tmode);
35324 for (i = 0; i < nargs; i++)
35326 tree arg = CALL_EXPR_ARG (exp, i);
35327 rtx op = expand_normal (arg);
35328 machine_mode mode = insn_p->operand[i + 1].mode;
35329 bool match = insn_p->operand[i + 1].predicate (op, mode);
35331 if (second_arg_count && i == 1)
35333 /* SIMD shift insns take either an 8-bit immediate or a
35334 register as the count, but the builtin functions take an
35335 int as the count.  If the count doesn't match, put it in
35336 a register.  The instructions use a 64-bit count; if op
35337 is only 32 bits wide, zero-extend it, since negative
35338 shift counts are undefined behavior and zero-extension
35339 is more efficient. */
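/* Editorial illustration, a hypothetical builtin call rather than anything
   taken from the builtin tables: for a variable count such as

     __m128i r = __builtin_ia32_pslldi128 (v, n);   // n is a plain int

   the SImode value of n fails the count predicate, is widened here with
   convert_modes (mode, GET_MODE (op), op, 1) -- unsigned, hence
   zero-extended to the insn's wider count mode -- and is then forced into
   a register if the predicate still rejects it.  */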
35340 if (!match)
35342 if (SCALAR_INT_MODE_P (GET_MODE (op)))
35343 op = convert_modes (mode, GET_MODE (op), op, 1);
35344 else
35345 op = lowpart_subreg (mode, op, GET_MODE (op));
35346 if (!insn_p->operand[i + 1].predicate (op, mode))
35347 op = copy_to_reg (op);
35350 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35351 (!mask_pos && (nargs - i) <= nargs_constant))
35353 if (!match)
35354 switch (icode)
35356 case CODE_FOR_avx_vinsertf128v4di:
35357 case CODE_FOR_avx_vextractf128v4di:
35358 error ("the last argument must be an 1-bit immediate");
35359 return const0_rtx;
35361 case CODE_FOR_avx512f_cmpv8di3_mask:
35362 case CODE_FOR_avx512f_cmpv16si3_mask:
35363 case CODE_FOR_avx512f_ucmpv8di3_mask:
35364 case CODE_FOR_avx512f_ucmpv16si3_mask:
35365 case CODE_FOR_avx512vl_cmpv4di3_mask:
35366 case CODE_FOR_avx512vl_cmpv8si3_mask:
35367 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35368 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35369 case CODE_FOR_avx512vl_cmpv2di3_mask:
35370 case CODE_FOR_avx512vl_cmpv4si3_mask:
35371 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35372 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35373 error ("the last argument must be a 3-bit immediate");
35374 return const0_rtx;
35376 case CODE_FOR_sse4_1_roundsd:
35377 case CODE_FOR_sse4_1_roundss:
35379 case CODE_FOR_sse4_1_roundpd:
35380 case CODE_FOR_sse4_1_roundps:
35381 case CODE_FOR_avx_roundpd256:
35382 case CODE_FOR_avx_roundps256:
35384 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35385 case CODE_FOR_sse4_1_roundps_sfix:
35386 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35387 case CODE_FOR_avx_roundps_sfix256:
35389 case CODE_FOR_sse4_1_blendps:
35390 case CODE_FOR_avx_blendpd256:
35391 case CODE_FOR_avx_vpermilv4df:
35392 case CODE_FOR_avx_vpermilv4df_mask:
35393 case CODE_FOR_avx512f_getmantv8df_mask:
35394 case CODE_FOR_avx512f_getmantv16sf_mask:
35395 case CODE_FOR_avx512vl_getmantv8sf_mask:
35396 case CODE_FOR_avx512vl_getmantv4df_mask:
35397 case CODE_FOR_avx512vl_getmantv4sf_mask:
35398 case CODE_FOR_avx512vl_getmantv2df_mask:
35399 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35400 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35401 case CODE_FOR_avx512dq_rangepv4df_mask:
35402 case CODE_FOR_avx512dq_rangepv8sf_mask:
35403 case CODE_FOR_avx512dq_rangepv2df_mask:
35404 case CODE_FOR_avx512dq_rangepv4sf_mask:
35405 case CODE_FOR_avx_shufpd256_mask:
35406 error ("the last argument must be a 4-bit immediate");
35407 return const0_rtx;
35409 case CODE_FOR_sha1rnds4:
35410 case CODE_FOR_sse4_1_blendpd:
35411 case CODE_FOR_avx_vpermilv2df:
35412 case CODE_FOR_avx_vpermilv2df_mask:
35413 case CODE_FOR_xop_vpermil2v2df3:
35414 case CODE_FOR_xop_vpermil2v4sf3:
35415 case CODE_FOR_xop_vpermil2v4df3:
35416 case CODE_FOR_xop_vpermil2v8sf3:
35417 case CODE_FOR_avx512f_vinsertf32x4_mask:
35418 case CODE_FOR_avx512f_vinserti32x4_mask:
35419 case CODE_FOR_avx512f_vextractf32x4_mask:
35420 case CODE_FOR_avx512f_vextracti32x4_mask:
35421 case CODE_FOR_sse2_shufpd:
35422 case CODE_FOR_sse2_shufpd_mask:
35423 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35424 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35425 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35426 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35427 error ("the last argument must be a 2-bit immediate");
35428 return const0_rtx;
35430 case CODE_FOR_avx_vextractf128v4df:
35431 case CODE_FOR_avx_vextractf128v8sf:
35432 case CODE_FOR_avx_vextractf128v8si:
35433 case CODE_FOR_avx_vinsertf128v4df:
35434 case CODE_FOR_avx_vinsertf128v8sf:
35435 case CODE_FOR_avx_vinsertf128v8si:
35436 case CODE_FOR_avx512f_vinsertf64x4_mask:
35437 case CODE_FOR_avx512f_vinserti64x4_mask:
35438 case CODE_FOR_avx512f_vextractf64x4_mask:
35439 case CODE_FOR_avx512f_vextracti64x4_mask:
35440 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35441 case CODE_FOR_avx512dq_vinserti32x8_mask:
35442 case CODE_FOR_avx512vl_vinsertv4df:
35443 case CODE_FOR_avx512vl_vinsertv4di:
35444 case CODE_FOR_avx512vl_vinsertv8sf:
35445 case CODE_FOR_avx512vl_vinsertv8si:
35446 error ("the last argument must be a 1-bit immediate");
35447 return const0_rtx;
35449 case CODE_FOR_avx_vmcmpv2df3:
35450 case CODE_FOR_avx_vmcmpv4sf3:
35451 case CODE_FOR_avx_cmpv2df3:
35452 case CODE_FOR_avx_cmpv4sf3:
35453 case CODE_FOR_avx_cmpv4df3:
35454 case CODE_FOR_avx_cmpv8sf3:
35455 case CODE_FOR_avx512f_cmpv8df3_mask:
35456 case CODE_FOR_avx512f_cmpv16sf3_mask:
35457 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35458 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35459 error ("the last argument must be a 5-bit immediate");
35460 return const0_rtx;
35462 default:
35463 switch (nargs_constant)
35465 case 2:
35466 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35467 (!mask_pos && (nargs - i) == nargs_constant))
35469 error ("the next to last argument must be an 8-bit immediate");
35470 break;
35472 /* FALLTHRU */
35473 case 1:
35474 error ("the last argument must be an 8-bit immediate");
35475 break;
35476 default:
35477 gcc_unreachable ();
35479 return const0_rtx;
35482 else
35484 if (VECTOR_MODE_P (mode))
35485 op = safe_vector_operand (op, mode);
35487 /* If we aren't optimizing, only allow one memory operand to
35488 be generated. */
35489 if (memory_operand (op, mode))
35490 num_memory++;
35492 op = fixup_modeless_constant (op, mode);
35494 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35496 if (optimize || !match || num_memory > 1)
35497 op = copy_to_mode_reg (mode, op);
35499 else
35501 op = copy_to_reg (op);
35502 op = lowpart_subreg (mode, op, GET_MODE (op));
35506 args[i].op = op;
35507 args[i].mode = mode;
35510 switch (nargs)
35512 case 1:
35513 pat = GEN_FCN (icode) (real_target, args[0].op);
35514 break;
35515 case 2:
35516 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35517 break;
35518 case 3:
35519 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35520 args[2].op);
35521 break;
35522 case 4:
35523 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35524 args[2].op, args[3].op);
35525 break;
35526 case 5:
35527 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35528 args[2].op, args[3].op, args[4].op);
35529 break;
35530 case 6:
35531 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35532 args[2].op, args[3].op, args[4].op,
35533 args[5].op);
35534 break;
35535 default:
35536 gcc_unreachable ();
35539 if (! pat)
35540 return 0;
35542 emit_insn (pat);
35543 return target;
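/* Editorial note: a worked example of the classification above.  For a
   function type such as V8SF_FTYPE_V8SF_V8SF_INT the switch sets nargs = 3
   and nargs_constant = 1, so in the operand loop the last argument (i == 2,
   nargs - i == 1 <= nargs_constant) must already satisfy the insn's
   immediate predicate; if it does not, one of the "N-bit immediate" errors
   above is reported and const0_rtx is returned.  With mask_pos nonzero the
   immediate is instead located before the trailing mask operand(s), which
   is how the AVX-512 masked forms are handled.  */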
35546 /* Transform a pattern of the following layout:
35547 (set A
35548 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
35550 into:
35551 (set A B) */
35553 static rtx
35554 ix86_erase_embedded_rounding (rtx pat)
35556 if (GET_CODE (pat) == INSN)
35557 pat = PATTERN (pat);
35559 gcc_assert (GET_CODE (pat) == SET);
35560 rtx src = SET_SRC (pat);
35561 gcc_assert (XVECLEN (src, 0) == 2);
35562 rtx p0 = XVECEXP (src, 0, 0);
35563 gcc_assert (GET_CODE (src) == UNSPEC
35564 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
35565 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
35566 return res;
35569 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35570 with rounding. */
35571 static rtx
35572 ix86_expand_sse_comi_round (const struct builtin_description *d,
35573 tree exp, rtx target)
35575 rtx pat, set_dst;
35576 tree arg0 = CALL_EXPR_ARG (exp, 0);
35577 tree arg1 = CALL_EXPR_ARG (exp, 1);
35578 tree arg2 = CALL_EXPR_ARG (exp, 2);
35579 tree arg3 = CALL_EXPR_ARG (exp, 3);
35580 rtx op0 = expand_normal (arg0);
35581 rtx op1 = expand_normal (arg1);
35582 rtx op2 = expand_normal (arg2);
35583 rtx op3 = expand_normal (arg3);
35584 enum insn_code icode = d->icode;
35585 const struct insn_data_d *insn_p = &insn_data[icode];
35586 machine_mode mode0 = insn_p->operand[0].mode;
35587 machine_mode mode1 = insn_p->operand[1].mode;
35588 enum rtx_code comparison = UNEQ;
35589 bool need_ucomi = false;
35591 /* See avxintrin.h for values. */
35592 enum rtx_code comi_comparisons[32] =
35593 {
35594 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35595 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35596 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35597 };
35598 bool need_ucomi_values[32] =
35599 {
35600 true, false, false, true, true, false, false, true,
35601 true, false, false, true, true, false, false, true,
35602 false, true, true, false, false, true, true, false,
35603 false, true, true, false, false, true, true, false
35604 };
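/* Editorial note: op2 below is the _CMP_* predicate immediate documented in
   avxintrin.h, used directly as an index into the two tables above.  For
   instance (reading the tables, not the intrinsics headers): index 0
   (_CMP_EQ_OQ) yields UNEQ with need_ucomi set, i.e. the quiet UCOMI form,
   while index 14 (_CMP_GT_OS) yields GT with the signalling COMI form.  */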
35606 if (!CONST_INT_P (op2))
35608 error ("the third argument must be comparison constant");
35609 return const0_rtx;
35611 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35613 error ("incorrect comparison mode");
35614 return const0_rtx;
35617 if (!insn_p->operand[2].predicate (op3, SImode))
35619 error ("incorrect rounding operand");
35620 return const0_rtx;
35623 comparison = comi_comparisons[INTVAL (op2)];
35624 need_ucomi = need_ucomi_values[INTVAL (op2)];
35626 if (VECTOR_MODE_P (mode0))
35627 op0 = safe_vector_operand (op0, mode0);
35628 if (VECTOR_MODE_P (mode1))
35629 op1 = safe_vector_operand (op1, mode1);
35631 target = gen_reg_rtx (SImode);
35632 emit_move_insn (target, const0_rtx);
35633 target = gen_rtx_SUBREG (QImode, target, 0);
35635 if ((optimize && !register_operand (op0, mode0))
35636 || !insn_p->operand[0].predicate (op0, mode0))
35637 op0 = copy_to_mode_reg (mode0, op0);
35638 if ((optimize && !register_operand (op1, mode1))
35639 || !insn_p->operand[1].predicate (op1, mode1))
35640 op1 = copy_to_mode_reg (mode1, op1);
35642 if (need_ucomi)
35643 icode = icode == CODE_FOR_sse_comi_round
35644 ? CODE_FOR_sse_ucomi_round
35645 : CODE_FOR_sse2_ucomi_round;
35647 pat = GEN_FCN (icode) (op0, op1, op3);
35648 if (! pat)
35649 return 0;
35651 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35652 if (INTVAL (op3) == NO_ROUND)
35654 pat = ix86_erase_embedded_rounding (pat);
35655 if (! pat)
35656 return 0;
35658 set_dst = SET_DEST (pat);
35660 else
35662 gcc_assert (GET_CODE (pat) == SET);
35663 set_dst = SET_DEST (pat);
35666 emit_insn (pat);
35667 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35668 gen_rtx_fmt_ee (comparison, QImode,
35669 set_dst,
35670 const0_rtx)));
35672 return SUBREG_REG (target);
35675 static rtx
35676 ix86_expand_round_builtin (const struct builtin_description *d,
35677 tree exp, rtx target)
35679 rtx pat;
35680 unsigned int i, nargs;
35681 struct
35683 rtx op;
35684 machine_mode mode;
35685 } args[6];
35686 enum insn_code icode = d->icode;
35687 const struct insn_data_d *insn_p = &insn_data[icode];
35688 machine_mode tmode = insn_p->operand[0].mode;
35689 unsigned int nargs_constant = 0;
35690 unsigned int redundant_embed_rnd = 0;
35692 switch ((enum ix86_builtin_func_type) d->flag)
35694 case UINT64_FTYPE_V2DF_INT:
35695 case UINT64_FTYPE_V4SF_INT:
35696 case UINT_FTYPE_V2DF_INT:
35697 case UINT_FTYPE_V4SF_INT:
35698 case INT64_FTYPE_V2DF_INT:
35699 case INT64_FTYPE_V4SF_INT:
35700 case INT_FTYPE_V2DF_INT:
35701 case INT_FTYPE_V4SF_INT:
35702 nargs = 2;
35703 break;
35704 case V4SF_FTYPE_V4SF_UINT_INT:
35705 case V4SF_FTYPE_V4SF_UINT64_INT:
35706 case V2DF_FTYPE_V2DF_UINT64_INT:
35707 case V4SF_FTYPE_V4SF_INT_INT:
35708 case V4SF_FTYPE_V4SF_INT64_INT:
35709 case V2DF_FTYPE_V2DF_INT64_INT:
35710 case V4SF_FTYPE_V4SF_V4SF_INT:
35711 case V2DF_FTYPE_V2DF_V2DF_INT:
35712 case V4SF_FTYPE_V4SF_V2DF_INT:
35713 case V2DF_FTYPE_V2DF_V4SF_INT:
35714 nargs = 3;
35715 break;
35716 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35717 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35718 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35719 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35720 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35721 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35722 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
35723 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
35724 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
35725 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
35726 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
35727 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
35728 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
35729 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
35730 nargs = 4;
35731 break;
35732 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
35733 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
35734 nargs_constant = 2;
35735 nargs = 4;
35736 break;
35737 case INT_FTYPE_V4SF_V4SF_INT_INT:
35738 case INT_FTYPE_V2DF_V2DF_INT_INT:
35739 return ix86_expand_sse_comi_round (d, exp, target);
35740 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
35741 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
35742 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
35743 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
35744 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
35745 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
35746 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
35747 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
35748 nargs = 5;
35749 break;
35750 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
35751 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
35752 nargs_constant = 4;
35753 nargs = 5;
35754 break;
35755 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
35756 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
35757 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
35758 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
35759 nargs_constant = 3;
35760 nargs = 5;
35761 break;
35762 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
35763 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
35764 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
35765 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
35766 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
35767 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
35768 nargs = 6;
35769 nargs_constant = 4;
35770 break;
35771 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
35772 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
35773 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
35774 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
35775 nargs = 6;
35776 nargs_constant = 3;
35777 break;
35778 default:
35779 gcc_unreachable ();
35781 gcc_assert (nargs <= ARRAY_SIZE (args));
35783 if (optimize
35784 || target == 0
35785 || GET_MODE (target) != tmode
35786 || !insn_p->operand[0].predicate (target, tmode))
35787 target = gen_reg_rtx (tmode);
35789 for (i = 0; i < nargs; i++)
35791 tree arg = CALL_EXPR_ARG (exp, i);
35792 rtx op = expand_normal (arg);
35793 machine_mode mode = insn_p->operand[i + 1].mode;
35794 bool match = insn_p->operand[i + 1].predicate (op, mode);
35796 if (i == nargs - nargs_constant)
35798 if (!match)
35800 switch (icode)
35802 case CODE_FOR_avx512f_getmantv8df_mask_round:
35803 case CODE_FOR_avx512f_getmantv16sf_mask_round:
35804 case CODE_FOR_avx512f_vgetmantv2df_round:
35805 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
35806 case CODE_FOR_avx512f_vgetmantv4sf_round:
35807 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
35808 error ("the immediate argument must be a 4-bit immediate");
35809 return const0_rtx;
35810 case CODE_FOR_avx512f_cmpv8df3_mask_round:
35811 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
35812 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
35813 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
35814 error ("the immediate argument must be a 5-bit immediate");
35815 return const0_rtx;
35816 default:
35817 error ("the immediate argument must be an 8-bit immediate");
35818 return const0_rtx;
35822 else if (i == nargs-1)
35824 if (!insn_p->operand[nargs].predicate (op, SImode))
35826 error ("incorrect rounding operand");
35827 return const0_rtx;
35830 /* If there is no rounding, use the normal version of the pattern. */
35831 if (INTVAL (op) == NO_ROUND)
35832 redundant_embed_rnd = 1;
35834 else
35836 if (VECTOR_MODE_P (mode))
35837 op = safe_vector_operand (op, mode);
35839 op = fixup_modeless_constant (op, mode);
35841 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35843 if (optimize || !match)
35844 op = copy_to_mode_reg (mode, op);
35846 else
35848 op = copy_to_reg (op);
35849 op = lowpart_subreg (mode, op, GET_MODE (op));
35853 args[i].op = op;
35854 args[i].mode = mode;
35857 switch (nargs)
35859 case 1:
35860 pat = GEN_FCN (icode) (target, args[0].op);
35861 break;
35862 case 2:
35863 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35864 break;
35865 case 3:
35866 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35867 args[2].op);
35868 break;
35869 case 4:
35870 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35871 args[2].op, args[3].op);
35872 break;
35873 case 5:
35874 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35875 args[2].op, args[3].op, args[4].op);
35876 break;
35877 case 6:
35878 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35879 args[2].op, args[3].op, args[4].op,
35880 args[5].op);
35881 break;
35882 default:
35883 gcc_unreachable ();
35886 if (!pat)
35887 return 0;
35889 if (redundant_embed_rnd)
35890 pat = ix86_erase_embedded_rounding (pat);
35892 emit_insn (pat);
35893 return target;
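/* Editorial note on the round expanders above: the trailing INT operand of
   every *_INT round function type is the explicit rounding/SAE control, and
   it must satisfy the insn's SImode predicate.  When the caller passes the
   "no rounding" value (e.g. _MM_FROUND_CUR_DIRECTION in the intrinsics
   headers -- an assumption about the user-level spelling), INTVAL (op) ==
   NO_ROUND, redundant_embed_rnd is set, and ix86_erase_embedded_rounding
   strips the UNSPEC_EMBEDDED_ROUNDING wrapper so the ordinary non-rounding
   pattern is emitted.  */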
35896 /* Subroutine of ix86_expand_builtin to take care of special insns
35897 with a variable number of operands. */
35899 static rtx
35900 ix86_expand_special_args_builtin (const struct builtin_description *d,
35901 tree exp, rtx target)
35903 tree arg;
35904 rtx pat, op;
35905 unsigned int i, nargs, arg_adjust, memory;
35906 bool aligned_mem = false;
35907 struct
35909 rtx op;
35910 machine_mode mode;
35911 } args[3];
35912 enum insn_code icode = d->icode;
35913 bool last_arg_constant = false;
35914 const struct insn_data_d *insn_p = &insn_data[icode];
35915 machine_mode tmode = insn_p->operand[0].mode;
35916 enum { load, store } klass;
35918 switch ((enum ix86_builtin_func_type) d->flag)
35920 case VOID_FTYPE_VOID:
35921 emit_insn (GEN_FCN (icode) (target));
35922 return 0;
35923 case VOID_FTYPE_UINT64:
35924 case VOID_FTYPE_UNSIGNED:
35925 nargs = 0;
35926 klass = store;
35927 memory = 0;
35928 break;
35930 case INT_FTYPE_VOID:
35931 case USHORT_FTYPE_VOID:
35932 case UINT64_FTYPE_VOID:
35933 case UINT_FTYPE_VOID:
35934 case UNSIGNED_FTYPE_VOID:
35935 nargs = 0;
35936 klass = load;
35937 memory = 0;
35938 break;
35939 case UINT64_FTYPE_PUNSIGNED:
35940 case V2DI_FTYPE_PV2DI:
35941 case V4DI_FTYPE_PV4DI:
35942 case V32QI_FTYPE_PCCHAR:
35943 case V16QI_FTYPE_PCCHAR:
35944 case V8SF_FTYPE_PCV4SF:
35945 case V8SF_FTYPE_PCFLOAT:
35946 case V4SF_FTYPE_PCFLOAT:
35947 case V4DF_FTYPE_PCV2DF:
35948 case V4DF_FTYPE_PCDOUBLE:
35949 case V2DF_FTYPE_PCDOUBLE:
35950 case VOID_FTYPE_PVOID:
35951 case V8DI_FTYPE_PV8DI:
35952 nargs = 1;
35953 klass = load;
35954 memory = 0;
35955 switch (icode)
35957 case CODE_FOR_sse4_1_movntdqa:
35958 case CODE_FOR_avx2_movntdqa:
35959 case CODE_FOR_avx512f_movntdqa:
35960 aligned_mem = true;
35961 break;
35962 default:
35963 break;
35965 break;
35966 case VOID_FTYPE_PV2SF_V4SF:
35967 case VOID_FTYPE_PV8DI_V8DI:
35968 case VOID_FTYPE_PV4DI_V4DI:
35969 case VOID_FTYPE_PV2DI_V2DI:
35970 case VOID_FTYPE_PCHAR_V32QI:
35971 case VOID_FTYPE_PCHAR_V16QI:
35972 case VOID_FTYPE_PFLOAT_V16SF:
35973 case VOID_FTYPE_PFLOAT_V8SF:
35974 case VOID_FTYPE_PFLOAT_V4SF:
35975 case VOID_FTYPE_PDOUBLE_V8DF:
35976 case VOID_FTYPE_PDOUBLE_V4DF:
35977 case VOID_FTYPE_PDOUBLE_V2DF:
35978 case VOID_FTYPE_PLONGLONG_LONGLONG:
35979 case VOID_FTYPE_PULONGLONG_ULONGLONG:
35980 case VOID_FTYPE_PINT_INT:
35981 nargs = 1;
35982 klass = store;
35983 /* Reserve memory operand for target. */
35984 memory = ARRAY_SIZE (args);
35985 switch (icode)
35987 /* These builtins and instructions require the memory
35988 to be properly aligned. */
35989 case CODE_FOR_avx_movntv4di:
35990 case CODE_FOR_sse2_movntv2di:
35991 case CODE_FOR_avx_movntv8sf:
35992 case CODE_FOR_sse_movntv4sf:
35993 case CODE_FOR_sse4a_vmmovntv4sf:
35994 case CODE_FOR_avx_movntv4df:
35995 case CODE_FOR_sse2_movntv2df:
35996 case CODE_FOR_sse4a_vmmovntv2df:
35997 case CODE_FOR_sse2_movntidi:
35998 case CODE_FOR_sse_movntq:
35999 case CODE_FOR_sse2_movntisi:
36000 case CODE_FOR_avx512f_movntv16sf:
36001 case CODE_FOR_avx512f_movntv8df:
36002 case CODE_FOR_avx512f_movntv8di:
36003 aligned_mem = true;
36004 break;
36005 default:
36006 break;
36008 break;
36009 case V4SF_FTYPE_V4SF_PCV2SF:
36010 case V2DF_FTYPE_V2DF_PCDOUBLE:
36011 nargs = 2;
36012 klass = load;
36013 memory = 1;
36014 break;
36015 case V8SF_FTYPE_PCV8SF_V8SI:
36016 case V4DF_FTYPE_PCV4DF_V4DI:
36017 case V4SF_FTYPE_PCV4SF_V4SI:
36018 case V2DF_FTYPE_PCV2DF_V2DI:
36019 case V8SI_FTYPE_PCV8SI_V8SI:
36020 case V4DI_FTYPE_PCV4DI_V4DI:
36021 case V4SI_FTYPE_PCV4SI_V4SI:
36022 case V2DI_FTYPE_PCV2DI_V2DI:
36023 case VOID_FTYPE_INT_INT64:
36024 nargs = 2;
36025 klass = load;
36026 memory = 0;
36027 break;
36028 case VOID_FTYPE_PV8DF_V8DF_UQI:
36029 case VOID_FTYPE_PV4DF_V4DF_UQI:
36030 case VOID_FTYPE_PV2DF_V2DF_UQI:
36031 case VOID_FTYPE_PV16SF_V16SF_UHI:
36032 case VOID_FTYPE_PV8SF_V8SF_UQI:
36033 case VOID_FTYPE_PV4SF_V4SF_UQI:
36034 case VOID_FTYPE_PV8DI_V8DI_UQI:
36035 case VOID_FTYPE_PV4DI_V4DI_UQI:
36036 case VOID_FTYPE_PV2DI_V2DI_UQI:
36037 case VOID_FTYPE_PV16SI_V16SI_UHI:
36038 case VOID_FTYPE_PV8SI_V8SI_UQI:
36039 case VOID_FTYPE_PV4SI_V4SI_UQI:
36040 case VOID_FTYPE_PV64QI_V64QI_UDI:
36041 case VOID_FTYPE_PV32HI_V32HI_USI:
36042 case VOID_FTYPE_PV32QI_V32QI_USI:
36043 case VOID_FTYPE_PV16QI_V16QI_UHI:
36044 case VOID_FTYPE_PV16HI_V16HI_UHI:
36045 case VOID_FTYPE_PV8HI_V8HI_UQI:
36046 switch (icode)
36048 /* These builtins and instructions require the memory
36049 to be properly aligned. */
36050 case CODE_FOR_avx512f_storev16sf_mask:
36051 case CODE_FOR_avx512f_storev16si_mask:
36052 case CODE_FOR_avx512f_storev8df_mask:
36053 case CODE_FOR_avx512f_storev8di_mask:
36054 case CODE_FOR_avx512vl_storev8sf_mask:
36055 case CODE_FOR_avx512vl_storev8si_mask:
36056 case CODE_FOR_avx512vl_storev4df_mask:
36057 case CODE_FOR_avx512vl_storev4di_mask:
36058 case CODE_FOR_avx512vl_storev4sf_mask:
36059 case CODE_FOR_avx512vl_storev4si_mask:
36060 case CODE_FOR_avx512vl_storev2df_mask:
36061 case CODE_FOR_avx512vl_storev2di_mask:
36062 aligned_mem = true;
36063 break;
36064 default:
36065 break;
36067 /* FALLTHRU */
36068 case VOID_FTYPE_PV8SF_V8SI_V8SF:
36069 case VOID_FTYPE_PV4DF_V4DI_V4DF:
36070 case VOID_FTYPE_PV4SF_V4SI_V4SF:
36071 case VOID_FTYPE_PV2DF_V2DI_V2DF:
36072 case VOID_FTYPE_PV8SI_V8SI_V8SI:
36073 case VOID_FTYPE_PV4DI_V4DI_V4DI:
36074 case VOID_FTYPE_PV4SI_V4SI_V4SI:
36075 case VOID_FTYPE_PV2DI_V2DI_V2DI:
36076 case VOID_FTYPE_PV8SI_V8DI_UQI:
36077 case VOID_FTYPE_PV8HI_V8DI_UQI:
36078 case VOID_FTYPE_PV16HI_V16SI_UHI:
36079 case VOID_FTYPE_PV16QI_V8DI_UQI:
36080 case VOID_FTYPE_PV16QI_V16SI_UHI:
36081 case VOID_FTYPE_PV4SI_V4DI_UQI:
36082 case VOID_FTYPE_PV4SI_V2DI_UQI:
36083 case VOID_FTYPE_PV8HI_V4DI_UQI:
36084 case VOID_FTYPE_PV8HI_V2DI_UQI:
36085 case VOID_FTYPE_PV8HI_V8SI_UQI:
36086 case VOID_FTYPE_PV8HI_V4SI_UQI:
36087 case VOID_FTYPE_PV16QI_V4DI_UQI:
36088 case VOID_FTYPE_PV16QI_V2DI_UQI:
36089 case VOID_FTYPE_PV16QI_V8SI_UQI:
36090 case VOID_FTYPE_PV16QI_V4SI_UQI:
36091 case VOID_FTYPE_PCHAR_V64QI_UDI:
36092 case VOID_FTYPE_PCHAR_V32QI_USI:
36093 case VOID_FTYPE_PCHAR_V16QI_UHI:
36094 case VOID_FTYPE_PSHORT_V32HI_USI:
36095 case VOID_FTYPE_PSHORT_V16HI_UHI:
36096 case VOID_FTYPE_PSHORT_V8HI_UQI:
36097 case VOID_FTYPE_PINT_V16SI_UHI:
36098 case VOID_FTYPE_PINT_V8SI_UQI:
36099 case VOID_FTYPE_PINT_V4SI_UQI:
36100 case VOID_FTYPE_PINT64_V8DI_UQI:
36101 case VOID_FTYPE_PINT64_V4DI_UQI:
36102 case VOID_FTYPE_PINT64_V2DI_UQI:
36103 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
36104 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
36105 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
36106 case VOID_FTYPE_PFLOAT_V16SF_UHI:
36107 case VOID_FTYPE_PFLOAT_V8SF_UQI:
36108 case VOID_FTYPE_PFLOAT_V4SF_UQI:
36109 case VOID_FTYPE_PV32QI_V32HI_USI:
36110 case VOID_FTYPE_PV16QI_V16HI_UHI:
36111 case VOID_FTYPE_PV8QI_V8HI_UQI:
36112 nargs = 2;
36113 klass = store;
36114 /* Reserve memory operand for target. */
36115 memory = ARRAY_SIZE (args);
36116 break;
36117 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
36118 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
36119 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
36120 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
36121 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
36122 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
36123 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
36124 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
36125 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
36126 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
36127 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
36128 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
36129 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
36130 case V32HI_FTYPE_PCV32HI_V32HI_USI:
36131 case V32QI_FTYPE_PCV32QI_V32QI_USI:
36132 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
36133 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
36134 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
36135 switch (icode)
36137 /* These builtins and instructions require the memory
36138 to be properly aligned. */
36139 case CODE_FOR_avx512f_loadv16sf_mask:
36140 case CODE_FOR_avx512f_loadv16si_mask:
36141 case CODE_FOR_avx512f_loadv8df_mask:
36142 case CODE_FOR_avx512f_loadv8di_mask:
36143 case CODE_FOR_avx512vl_loadv8sf_mask:
36144 case CODE_FOR_avx512vl_loadv8si_mask:
36145 case CODE_FOR_avx512vl_loadv4df_mask:
36146 case CODE_FOR_avx512vl_loadv4di_mask:
36147 case CODE_FOR_avx512vl_loadv4sf_mask:
36148 case CODE_FOR_avx512vl_loadv4si_mask:
36149 case CODE_FOR_avx512vl_loadv2df_mask:
36150 case CODE_FOR_avx512vl_loadv2di_mask:
36151 case CODE_FOR_avx512bw_loadv64qi_mask:
36152 case CODE_FOR_avx512vl_loadv32qi_mask:
36153 case CODE_FOR_avx512vl_loadv16qi_mask:
36154 case CODE_FOR_avx512bw_loadv32hi_mask:
36155 case CODE_FOR_avx512vl_loadv16hi_mask:
36156 case CODE_FOR_avx512vl_loadv8hi_mask:
36157 aligned_mem = true;
36158 break;
36159 default:
36160 break;
36162 /* FALLTHRU */
36163 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
36164 case V32QI_FTYPE_PCCHAR_V32QI_USI:
36165 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
36166 case V32HI_FTYPE_PCSHORT_V32HI_USI:
36167 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
36168 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
36169 case V16SI_FTYPE_PCINT_V16SI_UHI:
36170 case V8SI_FTYPE_PCINT_V8SI_UQI:
36171 case V4SI_FTYPE_PCINT_V4SI_UQI:
36172 case V8DI_FTYPE_PCINT64_V8DI_UQI:
36173 case V4DI_FTYPE_PCINT64_V4DI_UQI:
36174 case V2DI_FTYPE_PCINT64_V2DI_UQI:
36175 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
36176 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
36177 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
36178 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
36179 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
36180 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
36181 nargs = 3;
36182 klass = load;
36183 memory = 0;
36184 break;
36185 case VOID_FTYPE_UINT_UINT_UINT:
36186 case VOID_FTYPE_UINT64_UINT_UINT:
36187 case UCHAR_FTYPE_UINT_UINT_UINT:
36188 case UCHAR_FTYPE_UINT64_UINT_UINT:
36189 nargs = 3;
36190 klass = load;
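/* None of the loop-handled arguments is a memory operand; point MEMORY past the ARGS array. */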
36191 memory = ARRAY_SIZE (args);
36192 last_arg_constant = true;
36193 break;
36194 default:
36195 gcc_unreachable ();
36198 gcc_assert (nargs <= ARRAY_SIZE (args));
36200 if (klass == store)
36202 arg = CALL_EXPR_ARG (exp, 0);
36203 op = expand_normal (arg);
36204 gcc_assert (target == 0);
36205 if (memory)
36207 op = ix86_zero_extend_to_Pmode (op);
36208 target = gen_rtx_MEM (tmode, op);
36209 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
36210 on it. Try to improve it using get_pointer_alignment,
36211 and if the special builtin is one that requires strict
36212 mode alignment, also from its GET_MODE_ALIGNMENT.
36213 Failure to do so could lead to ix86_legitimate_combined_insn
36214 rejecting all changes to such insns. */
36215 unsigned int align = get_pointer_alignment (arg);
36216 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
36217 align = GET_MODE_ALIGNMENT (tmode);
36218 if (MEM_ALIGN (target) < align)
36219 set_mem_align (target, align);
36221 else
36222 target = force_reg (tmode, op);
36223 arg_adjust = 1;
36225 else
36227 arg_adjust = 0;
36228 if (optimize
36229 || target == 0
36230 || !register_operand (target, tmode)
36231 || GET_MODE (target) != tmode)
36232 target = gen_reg_rtx (tmode);
36235 for (i = 0; i < nargs; i++)
36237 machine_mode mode = insn_p->operand[i + 1].mode;
36238 bool match;
36240 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
36241 op = expand_normal (arg);
36242 match = insn_p->operand[i + 1].predicate (op, mode);
36244 if (last_arg_constant && (i + 1) == nargs)
36246 if (!match)
36248 if (icode == CODE_FOR_lwp_lwpvalsi3
36249 || icode == CODE_FOR_lwp_lwpinssi3
36250 || icode == CODE_FOR_lwp_lwpvaldi3
36251 || icode == CODE_FOR_lwp_lwpinsdi3)
36252 error ("the last argument must be a 32-bit immediate");
36253 else
36254 error ("the last argument must be an 8-bit immediate");
36255 return const0_rtx;
36258 else
36260 if (i == memory)
36262 /* This must be the memory operand. */
36263 op = ix86_zero_extend_to_Pmode (op);
36264 op = gen_rtx_MEM (mode, op);
36265 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
36266 on it. Try to improve it using get_pointer_alignment,
36267 and if the special builtin is one that requires strict
36268 mode alignment, also from its GET_MODE_ALIGNMENT.
36269 Failure to do so could lead to ix86_legitimate_combined_insn
36270 rejecting all changes to such insns. */
36271 unsigned int align = get_pointer_alignment (arg);
36272 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
36273 align = GET_MODE_ALIGNMENT (mode);
36274 if (MEM_ALIGN (op) < align)
36275 set_mem_align (op, align);
36277 else
36279 /* This must be a register operand. */
36280 if (VECTOR_MODE_P (mode))
36281 op = safe_vector_operand (op, mode);
36283 op = fixup_modeless_constant (op, mode);
36285 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36286 op = copy_to_mode_reg (mode, op);
36287 else
36289 op = copy_to_reg (op);
36290 op = lowpart_subreg (mode, op, GET_MODE (op));
36295 args[i].op = op;
36296 args[i].mode = mode;
36299 switch (nargs)
36301 case 0:
36302 pat = GEN_FCN (icode) (target);
36303 break;
36304 case 1:
36305 pat = GEN_FCN (icode) (target, args[0].op);
36306 break;
36307 case 2:
36308 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36309 break;
36310 case 3:
36311 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
36312 break;
36313 default:
36314 gcc_unreachable ();
36317 if (! pat)
36318 return 0;
36319 emit_insn (pat);
36320 return klass == store ? 0 : target;
36323 /* Return the integer constant in ARG. Constrain it to be in the range
36324 of the subparts of VEC_TYPE; issue an error if not. */
36326 static int
36327 get_element_number (tree vec_type, tree arg)
36329 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
36331 if (!tree_fits_uhwi_p (arg)
36332 || (elt = tree_to_uhwi (arg), elt > max))
36334 error ("selector must be an integer constant in the range 0..%wi", max);
36335 return 0;
36338 return elt;
36341 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36342 ix86_expand_vector_init. We DO have language-level syntax for this, in
36343 the form of (type){ init-list }. Except that since we can't place emms
36344 instructions from inside the compiler, we can't allow the use of MMX
36345 registers unless the user explicitly asks for it. So we do *not* define
36346 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36347 we have builtins invoked by mmintrin.h that give us license to emit
36348 these sorts of instructions. */
36350 static rtx
36351 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36353 machine_mode tmode = TYPE_MODE (type);
36354 machine_mode inner_mode = GET_MODE_INNER (tmode);
36355 int i, n_elt = GET_MODE_NUNITS (tmode);
36356 rtvec v = rtvec_alloc (n_elt);
36358 gcc_assert (VECTOR_MODE_P (tmode));
36359 gcc_assert (call_expr_nargs (exp) == n_elt);
36361 for (i = 0; i < n_elt; ++i)
36363 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36364 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36367 if (!target || !register_operand (target, tmode))
36368 target = gen_reg_rtx (tmode);
36370 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36371 return target;
36374 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36375 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36376 had a language-level syntax for referencing vector elements. */
36378 static rtx
36379 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36381 machine_mode tmode, mode0;
36382 tree arg0, arg1;
36383 int elt;
36384 rtx op0;
36386 arg0 = CALL_EXPR_ARG (exp, 0);
36387 arg1 = CALL_EXPR_ARG (exp, 1);
36389 op0 = expand_normal (arg0);
36390 elt = get_element_number (TREE_TYPE (arg0), arg1);
36392 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36393 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36394 gcc_assert (VECTOR_MODE_P (mode0));
36396 op0 = force_reg (mode0, op0);
36398 if (optimize || !target || !register_operand (target, tmode))
36399 target = gen_reg_rtx (tmode);
36401 ix86_expand_vector_extract (true, target, op0, elt);
36403 return target;
36406 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36407 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36408 a language-level syntax for referencing vector elements. */
36410 static rtx
36411 ix86_expand_vec_set_builtin (tree exp)
36413 machine_mode tmode, mode1;
36414 tree arg0, arg1, arg2;
36415 int elt;
36416 rtx op0, op1, target;
36418 arg0 = CALL_EXPR_ARG (exp, 0);
36419 arg1 = CALL_EXPR_ARG (exp, 1);
36420 arg2 = CALL_EXPR_ARG (exp, 2);
36422 tmode = TYPE_MODE (TREE_TYPE (arg0));
36423 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36424 gcc_assert (VECTOR_MODE_P (tmode));
36426 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36427 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36428 elt = get_element_number (TREE_TYPE (arg0), arg2);
36430 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36431 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36433 op0 = force_reg (tmode, op0);
36434 op1 = force_reg (mode1, op1);
36436 /* OP0 is the source of these builtin functions and shouldn't be
36437 modified. Create a copy, use it and return it as target. */
36438 target = gen_reg_rtx (tmode);
36439 emit_move_insn (target, op0);
36440 ix86_expand_vector_set (true, target, op1, elt);
36442 return target;
36445 /* Emit conditional move of SRC to DST with condition
36446 OP1 CODE OP2. */
36447 static void
36448 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36450 rtx t;
36452 if (TARGET_CMOVE)
36454 t = ix86_expand_compare (code, op1, op2);
36455 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36456 src, dst)));
36458 else
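/* Without CMOV, compare and branch around the move when the condition does not hold. */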
36460 rtx_code_label *nomove = gen_label_rtx ();
36461 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36462 const0_rtx, GET_MODE (op1), 1, nomove);
36463 emit_move_insn (dst, src);
36464 emit_label (nomove);
36468 /* Choose the unsigned max of DST and SRC and put it in DST. */
36469 static void
36470 ix86_emit_move_max (rtx dst, rtx src)
36472 ix86_emit_cmove (dst, src, LTU, dst, src);
36475 /* Expand an expression EXP that calls a built-in function,
36476 with result going to TARGET if that's convenient
36477 (and in mode MODE if that's convenient).
36478 SUBTARGET may be used as the target for computing one of EXP's operands.
36479 IGNORE is nonzero if the value is to be ignored. */
36481 static rtx
36482 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36483 machine_mode mode, int ignore)
36485 size_t i;
36486 enum insn_code icode, icode2;
36487 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36488 tree arg0, arg1, arg2, arg3, arg4;
36489 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
36490 machine_mode mode0, mode1, mode2, mode3, mode4;
36491 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36493 /* For CPU builtins that can be folded, fold first and expand the fold. */
36494 switch (fcode)
36496 case IX86_BUILTIN_CPU_INIT:
36498 /* Make it call __cpu_indicator_init in libgcc. */
36499 tree call_expr, fndecl, type;
36500 type = build_function_type_list (integer_type_node, NULL_TREE);
36501 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36502 call_expr = build_call_expr (fndecl, 0);
36503 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36505 case IX86_BUILTIN_CPU_IS:
36506 case IX86_BUILTIN_CPU_SUPPORTS:
36508 tree arg0 = CALL_EXPR_ARG (exp, 0);
36509 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36510 gcc_assert (fold_expr != NULL_TREE);
36511 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36515 HOST_WIDE_INT isa = ix86_isa_flags;
36516 HOST_WIDE_INT isa2 = ix86_isa_flags2;
36517 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
36518 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
36519 /* The general case is we require all the ISAs specified in bisa{,2}
36520 to be enabled.
36521 The exceptions are:
36522 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
36523 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
36524 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
36525 where for each such pair it is sufficient if either of the ISAs is
36526 enabled; if the pair is ored with other options, those others must be enabled too. */
36527 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36528 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36529 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
36530 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
36531 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36532 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36533 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
36534 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
36535 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36536 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36537 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
36538 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
36539 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
36541 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
36542 (enum fpmath_unit) 0, false);
36543 if (!opts)
36544 error ("%qE needs unknown isa option", fndecl);
36545 else
36547 gcc_assert (opts != NULL);
36548 error ("%qE needs isa option %s", fndecl, opts);
36549 free (opts);
36551 return expand_call (exp, target, ignore);
36554 switch (fcode)
36556 case IX86_BUILTIN_BNDMK:
36557 if (!target
36558 || GET_MODE (target) != BNDmode
36559 || !register_operand (target, BNDmode))
36560 target = gen_reg_rtx (BNDmode);
36562 arg0 = CALL_EXPR_ARG (exp, 0);
36563 arg1 = CALL_EXPR_ARG (exp, 1);
36565 op0 = expand_normal (arg0);
36566 op1 = expand_normal (arg1);
36568 if (!register_operand (op0, Pmode))
36569 op0 = ix86_zero_extend_to_Pmode (op0);
36570 if (!register_operand (op1, Pmode))
36571 op1 = ix86_zero_extend_to_Pmode (op1);
36573 /* Builtin arg1 is the size of the block, but instruction op1 should
36574 be (size - 1). */
36575 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36576 NULL_RTX, 1, OPTAB_DIRECT);
36578 emit_insn (BNDmode == BND64mode
36579 ? gen_bnd64_mk (target, op0, op1)
36580 : gen_bnd32_mk (target, op0, op1));
36581 return target;
36583 case IX86_BUILTIN_BNDSTX:
36584 arg0 = CALL_EXPR_ARG (exp, 0);
36585 arg1 = CALL_EXPR_ARG (exp, 1);
36586 arg2 = CALL_EXPR_ARG (exp, 2);
36588 op0 = expand_normal (arg0);
36589 op1 = expand_normal (arg1);
36590 op2 = expand_normal (arg2);
36592 if (!register_operand (op0, Pmode))
36593 op0 = ix86_zero_extend_to_Pmode (op0);
36594 if (!register_operand (op1, BNDmode))
36595 op1 = copy_to_mode_reg (BNDmode, op1);
36596 if (!register_operand (op2, Pmode))
36597 op2 = ix86_zero_extend_to_Pmode (op2);
36599 emit_insn (BNDmode == BND64mode
36600 ? gen_bnd64_stx (op2, op0, op1)
36601 : gen_bnd32_stx (op2, op0, op1));
36602 return 0;
36604 case IX86_BUILTIN_BNDLDX:
36605 if (!target
36606 || GET_MODE (target) != BNDmode
36607 || !register_operand (target, BNDmode))
36608 target = gen_reg_rtx (BNDmode);
36610 arg0 = CALL_EXPR_ARG (exp, 0);
36611 arg1 = CALL_EXPR_ARG (exp, 1);
36613 op0 = expand_normal (arg0);
36614 op1 = expand_normal (arg1);
36616 if (!register_operand (op0, Pmode))
36617 op0 = ix86_zero_extend_to_Pmode (op0);
36618 if (!register_operand (op1, Pmode))
36619 op1 = ix86_zero_extend_to_Pmode (op1);
36621 emit_insn (BNDmode == BND64mode
36622 ? gen_bnd64_ldx (target, op0, op1)
36623 : gen_bnd32_ldx (target, op0, op1));
36624 return target;
36626 case IX86_BUILTIN_BNDCL:
36627 arg0 = CALL_EXPR_ARG (exp, 0);
36628 arg1 = CALL_EXPR_ARG (exp, 1);
36630 op0 = expand_normal (arg0);
36631 op1 = expand_normal (arg1);
36633 if (!register_operand (op0, Pmode))
36634 op0 = ix86_zero_extend_to_Pmode (op0);
36635 if (!register_operand (op1, BNDmode))
36636 op1 = copy_to_mode_reg (BNDmode, op1);
36638 emit_insn (BNDmode == BND64mode
36639 ? gen_bnd64_cl (op1, op0)
36640 : gen_bnd32_cl (op1, op0));
36641 return 0;
36643 case IX86_BUILTIN_BNDCU:
36644 arg0 = CALL_EXPR_ARG (exp, 0);
36645 arg1 = CALL_EXPR_ARG (exp, 1);
36647 op0 = expand_normal (arg0);
36648 op1 = expand_normal (arg1);
36650 if (!register_operand (op0, Pmode))
36651 op0 = ix86_zero_extend_to_Pmode (op0);
36652 if (!register_operand (op1, BNDmode))
36653 op1 = copy_to_mode_reg (BNDmode, op1);
36655 emit_insn (BNDmode == BND64mode
36656 ? gen_bnd64_cu (op1, op0)
36657 : gen_bnd32_cu (op1, op0));
36658 return 0;
36660 case IX86_BUILTIN_BNDRET:
36661 arg0 = CALL_EXPR_ARG (exp, 0);
36662 target = chkp_get_rtl_bounds (arg0);
36664 /* If no bounds were specified for the returned value,
36665 then use INIT bounds. This usually happens when
36666 some built-in function is expanded. */
36667 if (!target)
36669 rtx t1 = gen_reg_rtx (Pmode);
36670 rtx t2 = gen_reg_rtx (Pmode);
36671 target = gen_reg_rtx (BNDmode);
36672 emit_move_insn (t1, const0_rtx);
36673 emit_move_insn (t2, constm1_rtx);
36674 emit_insn (BNDmode == BND64mode
36675 ? gen_bnd64_mk (target, t1, t2)
36676 : gen_bnd32_mk (target, t1, t2));
36679 gcc_assert (target && REG_P (target));
36680 return target;
36682 case IX86_BUILTIN_BNDNARROW:
36684 rtx m1, m1h1, m1h2, lb, ub, t1;
36686 /* Return value and lb. */
36687 arg0 = CALL_EXPR_ARG (exp, 0);
36688 /* Bounds. */
36689 arg1 = CALL_EXPR_ARG (exp, 1);
36690 /* Size. */
36691 arg2 = CALL_EXPR_ARG (exp, 2);
36693 lb = expand_normal (arg0);
36694 op1 = expand_normal (arg1);
36695 op2 = expand_normal (arg2);
36697 /* The size was passed, but we need to use (size - 1), as for bndmk. */
36698 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36699 NULL_RTX, 1, OPTAB_DIRECT);
36701 /* Add LB to the size and invert to get UB. */
36702 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36703 op2, 1, OPTAB_DIRECT);
36704 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
36706 if (!register_operand (lb, Pmode))
36707 lb = ix86_zero_extend_to_Pmode (lb);
36708 if (!register_operand (ub, Pmode))
36709 ub = ix86_zero_extend_to_Pmode (ub);
36711 /* We need to move bounds to memory before any computations. */
36712 if (MEM_P (op1))
36713 m1 = op1;
36714 else
36716 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36717 emit_move_insn (m1, op1);
36720 /* Generate mem expression to be used for access to LB and UB. */
36721 m1h1 = adjust_address (m1, Pmode, 0);
36722 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36724 t1 = gen_reg_rtx (Pmode);
36726 /* Compute LB. */
36727 emit_move_insn (t1, m1h1);
36728 ix86_emit_move_max (t1, lb);
36729 emit_move_insn (m1h1, t1);
36731 /* Compute UB. UB is stored in 1's complement form. Therefore
36732 we also use max here. */
36733 emit_move_insn (t1, m1h2);
36734 ix86_emit_move_max (t1, ub);
36735 emit_move_insn (m1h2, t1);
36737 op2 = gen_reg_rtx (BNDmode);
36738 emit_move_insn (op2, m1);
36740 return chkp_join_splitted_slot (lb, op2);
36743 case IX86_BUILTIN_BNDINT:
36745 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36747 if (!target
36748 || GET_MODE (target) != BNDmode
36749 || !register_operand (target, BNDmode))
36750 target = gen_reg_rtx (BNDmode);
36752 arg0 = CALL_EXPR_ARG (exp, 0);
36753 arg1 = CALL_EXPR_ARG (exp, 1);
36755 op0 = expand_normal (arg0);
36756 op1 = expand_normal (arg1);
36758 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
36759 rh1 = adjust_address (res, Pmode, 0);
36760 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
36762 /* Put the first bounds into temporaries. */
36763 lb1 = gen_reg_rtx (Pmode);
36764 ub1 = gen_reg_rtx (Pmode);
36765 if (MEM_P (op0))
36767 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
36768 emit_move_insn (ub1, adjust_address (op0, Pmode,
36769 GET_MODE_SIZE (Pmode)));
36771 else
36773 emit_move_insn (res, op0);
36774 emit_move_insn (lb1, rh1);
36775 emit_move_insn (ub1, rh2);
36778 /* Put the second bounds into temporaries. */
36779 lb2 = gen_reg_rtx (Pmode);
36780 ub2 = gen_reg_rtx (Pmode);
36781 if (MEM_P (op1))
36783 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
36784 emit_move_insn (ub2, adjust_address (op1, Pmode,
36785 GET_MODE_SIZE (Pmode)));
36787 else
36789 emit_move_insn (res, op1);
36790 emit_move_insn (lb2, rh1);
36791 emit_move_insn (ub2, rh2);
36794 /* Compute LB. */
36795 ix86_emit_move_max (lb1, lb2);
36796 emit_move_insn (rh1, lb1);
36798 /* Compute UB. UB is stored in 1's complement form. Therefore
36799 we also use max here. */
36800 ix86_emit_move_max (ub1, ub2);
36801 emit_move_insn (rh2, ub1);
36803 emit_move_insn (target, res);
36805 return target;
36808 case IX86_BUILTIN_SIZEOF:
36810 tree name;
36811 rtx symbol;
36813 if (!target
36814 || GET_MODE (target) != Pmode
36815 || !register_operand (target, Pmode))
36816 target = gen_reg_rtx (Pmode);
36818 arg0 = CALL_EXPR_ARG (exp, 0);
36819 gcc_assert (VAR_P (arg0));
36821 name = DECL_ASSEMBLER_NAME (arg0);
36822 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
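/* Load the object's size via a size relocation on its assembler name (gen_move_size_reloc_{si,di}). */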
36824 emit_insn (Pmode == SImode
36825 ? gen_move_size_reloc_si (target, symbol)
36826 : gen_move_size_reloc_di (target, symbol));
36828 return target;
36831 case IX86_BUILTIN_BNDLOWER:
36833 rtx mem, hmem;
36835 if (!target
36836 || GET_MODE (target) != Pmode
36837 || !register_operand (target, Pmode))
36838 target = gen_reg_rtx (Pmode);
36840 arg0 = CALL_EXPR_ARG (exp, 0);
36841 op0 = expand_normal (arg0);
36843 /* We need to move bounds to memory first. */
36844 if (MEM_P (op0))
36845 mem = op0;
36846 else
36848 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36849 emit_move_insn (mem, op0);
36852 /* Generate mem expression to access LB and load it. */
36853 hmem = adjust_address (mem, Pmode, 0);
36854 emit_move_insn (target, hmem);
36856 return target;
36859 case IX86_BUILTIN_BNDUPPER:
36861 rtx mem, hmem, res;
36863 if (!target
36864 || GET_MODE (target) != Pmode
36865 || !register_operand (target, Pmode))
36866 target = gen_reg_rtx (Pmode);
36868 arg0 = CALL_EXPR_ARG (exp, 0);
36869 op0 = expand_normal (arg0);
36871 /* We need to move bounds to memory first. */
36872 if (MEM_P (op0))
36873 mem = op0;
36874 else
36876 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36877 emit_move_insn (mem, op0);
36880 /* Generate mem expression to access UB. */
36881 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
36883 /* We need to invert all bits of UB. */
36884 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
36886 if (res != target)
36887 emit_move_insn (target, res);
36889 return target;
36892 case IX86_BUILTIN_MASKMOVQ:
36893 case IX86_BUILTIN_MASKMOVDQU:
36894 icode = (fcode == IX86_BUILTIN_MASKMOVQ
36895 ? CODE_FOR_mmx_maskmovq
36896 : CODE_FOR_sse2_maskmovdqu);
36897 /* Note the arg order is different from the operand order. */
36898 arg1 = CALL_EXPR_ARG (exp, 0);
36899 arg2 = CALL_EXPR_ARG (exp, 1);
36900 arg0 = CALL_EXPR_ARG (exp, 2);
36901 op0 = expand_normal (arg0);
36902 op1 = expand_normal (arg1);
36903 op2 = expand_normal (arg2);
36904 mode0 = insn_data[icode].operand[0].mode;
36905 mode1 = insn_data[icode].operand[1].mode;
36906 mode2 = insn_data[icode].operand[2].mode;
36908 op0 = ix86_zero_extend_to_Pmode (op0);
36909 op0 = gen_rtx_MEM (mode1, op0);
36911 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36912 op0 = copy_to_mode_reg (mode0, op0);
36913 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36914 op1 = copy_to_mode_reg (mode1, op1);
36915 if (!insn_data[icode].operand[2].predicate (op2, mode2))
36916 op2 = copy_to_mode_reg (mode2, op2);
36917 pat = GEN_FCN (icode) (op0, op1, op2);
36918 if (! pat)
36919 return 0;
36920 emit_insn (pat);
36921 return 0;
36923 case IX86_BUILTIN_LDMXCSR:
36924 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
36925 target = assign_386_stack_local (SImode, SLOT_TEMP);
36926 emit_move_insn (target, op0);
36927 emit_insn (gen_sse_ldmxcsr (target));
36928 return 0;
36930 case IX86_BUILTIN_STMXCSR:
36931 target = assign_386_stack_local (SImode, SLOT_TEMP);
36932 emit_insn (gen_sse_stmxcsr (target));
36933 return copy_to_mode_reg (SImode, target);
36935 case IX86_BUILTIN_CLFLUSH:
36936 arg0 = CALL_EXPR_ARG (exp, 0);
36937 op0 = expand_normal (arg0);
36938 icode = CODE_FOR_sse2_clflush;
36939 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36940 op0 = ix86_zero_extend_to_Pmode (op0);
36942 emit_insn (gen_sse2_clflush (op0));
36943 return 0;
36945 case IX86_BUILTIN_CLWB:
36946 arg0 = CALL_EXPR_ARG (exp, 0);
36947 op0 = expand_normal (arg0);
36948 icode = CODE_FOR_clwb;
36949 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36950 op0 = ix86_zero_extend_to_Pmode (op0);
36952 emit_insn (gen_clwb (op0));
36953 return 0;
36955 case IX86_BUILTIN_CLFLUSHOPT:
36956 arg0 = CALL_EXPR_ARG (exp, 0);
36957 op0 = expand_normal (arg0);
36958 icode = CODE_FOR_clflushopt;
36959 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36960 op0 = ix86_zero_extend_to_Pmode (op0);
36962 emit_insn (gen_clflushopt (op0));
36963 return 0;
36965 case IX86_BUILTIN_MONITOR:
36966 case IX86_BUILTIN_MONITORX:
36967 arg0 = CALL_EXPR_ARG (exp, 0);
36968 arg1 = CALL_EXPR_ARG (exp, 1);
36969 arg2 = CALL_EXPR_ARG (exp, 2);
36970 op0 = expand_normal (arg0);
36971 op1 = expand_normal (arg1);
36972 op2 = expand_normal (arg2);
36973 if (!REG_P (op0))
36974 op0 = ix86_zero_extend_to_Pmode (op0);
36975 if (!REG_P (op1))
36976 op1 = copy_to_mode_reg (SImode, op1);
36977 if (!REG_P (op2))
36978 op2 = copy_to_mode_reg (SImode, op2);
36980 emit_insn (fcode == IX86_BUILTIN_MONITOR
36981 ? ix86_gen_monitor (op0, op1, op2)
36982 : ix86_gen_monitorx (op0, op1, op2));
36983 return 0;
36985 case IX86_BUILTIN_MWAIT:
36986 arg0 = CALL_EXPR_ARG (exp, 0);
36987 arg1 = CALL_EXPR_ARG (exp, 1);
36988 op0 = expand_normal (arg0);
36989 op1 = expand_normal (arg1);
36990 if (!REG_P (op0))
36991 op0 = copy_to_mode_reg (SImode, op0);
36992 if (!REG_P (op1))
36993 op1 = copy_to_mode_reg (SImode, op1);
36994 emit_insn (gen_sse3_mwait (op0, op1));
36995 return 0;
36997 case IX86_BUILTIN_MWAITX:
36998 arg0 = CALL_EXPR_ARG (exp, 0);
36999 arg1 = CALL_EXPR_ARG (exp, 1);
37000 arg2 = CALL_EXPR_ARG (exp, 2);
37001 op0 = expand_normal (arg0);
37002 op1 = expand_normal (arg1);
37003 op2 = expand_normal (arg2);
37004 if (!REG_P (op0))
37005 op0 = copy_to_mode_reg (SImode, op0);
37006 if (!REG_P (op1))
37007 op1 = copy_to_mode_reg (SImode, op1);
37008 if (!REG_P (op2))
37009 op2 = copy_to_mode_reg (SImode, op2);
37010 emit_insn (gen_mwaitx (op0, op1, op2));
37011 return 0;
37013 case IX86_BUILTIN_CLZERO:
37014 arg0 = CALL_EXPR_ARG (exp, 0);
37015 op0 = expand_normal (arg0);
37016 if (!REG_P (op0))
37017 op0 = ix86_zero_extend_to_Pmode (op0);
37018 emit_insn (ix86_gen_clzero (op0));
37019 return 0;
37021 case IX86_BUILTIN_VEC_INIT_V2SI:
37022 case IX86_BUILTIN_VEC_INIT_V4HI:
37023 case IX86_BUILTIN_VEC_INIT_V8QI:
37024 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
37026 case IX86_BUILTIN_VEC_EXT_V2DF:
37027 case IX86_BUILTIN_VEC_EXT_V2DI:
37028 case IX86_BUILTIN_VEC_EXT_V4SF:
37029 case IX86_BUILTIN_VEC_EXT_V4SI:
37030 case IX86_BUILTIN_VEC_EXT_V8HI:
37031 case IX86_BUILTIN_VEC_EXT_V2SI:
37032 case IX86_BUILTIN_VEC_EXT_V4HI:
37033 case IX86_BUILTIN_VEC_EXT_V16QI:
37034 return ix86_expand_vec_ext_builtin (exp, target);
37036 case IX86_BUILTIN_VEC_SET_V2DI:
37037 case IX86_BUILTIN_VEC_SET_V4SF:
37038 case IX86_BUILTIN_VEC_SET_V4SI:
37039 case IX86_BUILTIN_VEC_SET_V8HI:
37040 case IX86_BUILTIN_VEC_SET_V4HI:
37041 case IX86_BUILTIN_VEC_SET_V16QI:
37042 return ix86_expand_vec_set_builtin (exp);
37044 case IX86_BUILTIN_NANQ:
37045 case IX86_BUILTIN_NANSQ:
37046 return expand_call (exp, target, ignore);
37048 case IX86_BUILTIN_RDPID:
37050 op0 = gen_reg_rtx (TARGET_64BIT ? DImode : SImode);
37052 if (TARGET_64BIT)
37054 insn = gen_rdpid_rex64 (op0);
37055 op0 = convert_to_mode (SImode, op0, 1);
37057 else
37058 insn = gen_rdpid (op0);
37059 emit_insn (insn);
37061 if (target == 0)
37063 /* mode is VOIDmode if __builtin_rdpid has been called
37064 without lhs. */
37065 if (mode == VOIDmode)
37066 return target;
37067 target = gen_reg_rtx (mode);
37069 emit_move_insn (target, op0);
37070 return target;
37071 case IX86_BUILTIN_RDPMC:
37072 case IX86_BUILTIN_RDTSC:
37073 case IX86_BUILTIN_RDTSCP:
37074 case IX86_BUILTIN_XGETBV:
37076 op0 = gen_reg_rtx (DImode);
37077 op1 = gen_reg_rtx (DImode);
37079 if (fcode == IX86_BUILTIN_RDPMC)
37081 arg0 = CALL_EXPR_ARG (exp, 0);
37082 op2 = expand_normal (arg0);
37083 if (!register_operand (op2, SImode))
37084 op2 = copy_to_mode_reg (SImode, op2);
37086 insn = (TARGET_64BIT
37087 ? gen_rdpmc_rex64 (op0, op1, op2)
37088 : gen_rdpmc (op0, op2));
37089 emit_insn (insn);
37091 else if (fcode == IX86_BUILTIN_XGETBV)
37093 arg0 = CALL_EXPR_ARG (exp, 0);
37094 op2 = expand_normal (arg0);
37095 if (!register_operand (op2, SImode))
37096 op2 = copy_to_mode_reg (SImode, op2);
37098 insn = (TARGET_64BIT
37099 ? gen_xgetbv_rex64 (op0, op1, op2)
37100 : gen_xgetbv (op0, op2));
37101 emit_insn (insn);
37103 else if (fcode == IX86_BUILTIN_RDTSC)
37105 insn = (TARGET_64BIT
37106 ? gen_rdtsc_rex64 (op0, op1)
37107 : gen_rdtsc (op0));
37108 emit_insn (insn);
37110 else
37112 op2 = gen_reg_rtx (SImode);
37114 insn = (TARGET_64BIT
37115 ? gen_rdtscp_rex64 (op0, op1, op2)
37116 : gen_rdtscp (op0, op2));
37117 emit_insn (insn);
37119 arg0 = CALL_EXPR_ARG (exp, 0);
37120 op4 = expand_normal (arg0);
37121 if (!address_operand (op4, VOIDmode))
37123 op4 = convert_memory_address (Pmode, op4);
37124 op4 = copy_addr_to_reg (op4);
37126 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
37129 if (target == 0)
37131 /* mode is VOIDmode if __builtin_rd* has been called
37132 without lhs. */
37133 if (mode == VOIDmode)
37134 return target;
37135 target = gen_reg_rtx (mode);
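/* In 64-bit mode the result is returned as two 32-bit halves; merge them into one DImode value. */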
37138 if (TARGET_64BIT)
37140 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
37141 op1, 1, OPTAB_DIRECT);
37142 op0 = expand_simple_binop (DImode, IOR, op0, op1,
37143 op0, 1, OPTAB_DIRECT);
37146 emit_move_insn (target, op0);
37147 return target;
37149 case IX86_BUILTIN_FXSAVE:
37150 case IX86_BUILTIN_FXRSTOR:
37151 case IX86_BUILTIN_FXSAVE64:
37152 case IX86_BUILTIN_FXRSTOR64:
37153 case IX86_BUILTIN_FNSTENV:
37154 case IX86_BUILTIN_FLDENV:
37155 mode0 = BLKmode;
37156 switch (fcode)
37158 case IX86_BUILTIN_FXSAVE:
37159 icode = CODE_FOR_fxsave;
37160 break;
37161 case IX86_BUILTIN_FXRSTOR:
37162 icode = CODE_FOR_fxrstor;
37163 break;
37164 case IX86_BUILTIN_FXSAVE64:
37165 icode = CODE_FOR_fxsave64;
37166 break;
37167 case IX86_BUILTIN_FXRSTOR64:
37168 icode = CODE_FOR_fxrstor64;
37169 break;
37170 case IX86_BUILTIN_FNSTENV:
37171 icode = CODE_FOR_fnstenv;
37172 break;
37173 case IX86_BUILTIN_FLDENV:
37174 icode = CODE_FOR_fldenv;
37175 break;
37176 default:
37177 gcc_unreachable ();
37180 arg0 = CALL_EXPR_ARG (exp, 0);
37181 op0 = expand_normal (arg0);
37183 if (!address_operand (op0, VOIDmode))
37185 op0 = convert_memory_address (Pmode, op0);
37186 op0 = copy_addr_to_reg (op0);
37188 op0 = gen_rtx_MEM (mode0, op0);
37190 pat = GEN_FCN (icode) (op0);
37191 if (pat)
37192 emit_insn (pat);
37193 return 0;
37195 case IX86_BUILTIN_XSETBV:
37196 arg0 = CALL_EXPR_ARG (exp, 0);
37197 arg1 = CALL_EXPR_ARG (exp, 1);
37198 op0 = expand_normal (arg0);
37199 op1 = expand_normal (arg1);
37201 if (!REG_P (op0))
37202 op0 = copy_to_mode_reg (SImode, op0);
37204 if (TARGET_64BIT)
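/* In 64-bit mode split the new XCR value into its low and high 32-bit halves for the insn. */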
37206 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37207 NULL, 1, OPTAB_DIRECT);
37209 op2 = gen_lowpart (SImode, op2);
37210 op1 = gen_lowpart (SImode, op1);
37211 if (!REG_P (op1))
37212 op1 = copy_to_mode_reg (SImode, op1);
37213 if (!REG_P (op2))
37214 op2 = copy_to_mode_reg (SImode, op2);
37215 icode = CODE_FOR_xsetbv_rex64;
37216 pat = GEN_FCN (icode) (op0, op1, op2);
37218 else
37220 if (!REG_P (op1))
37221 op1 = copy_to_mode_reg (DImode, op1);
37222 icode = CODE_FOR_xsetbv;
37223 pat = GEN_FCN (icode) (op0, op1);
37225 if (pat)
37226 emit_insn (pat);
37227 return 0;
37229 case IX86_BUILTIN_XSAVE:
37230 case IX86_BUILTIN_XRSTOR:
37231 case IX86_BUILTIN_XSAVE64:
37232 case IX86_BUILTIN_XRSTOR64:
37233 case IX86_BUILTIN_XSAVEOPT:
37234 case IX86_BUILTIN_XSAVEOPT64:
37235 case IX86_BUILTIN_XSAVES:
37236 case IX86_BUILTIN_XRSTORS:
37237 case IX86_BUILTIN_XSAVES64:
37238 case IX86_BUILTIN_XRSTORS64:
37239 case IX86_BUILTIN_XSAVEC:
37240 case IX86_BUILTIN_XSAVEC64:
37241 arg0 = CALL_EXPR_ARG (exp, 0);
37242 arg1 = CALL_EXPR_ARG (exp, 1);
37243 op0 = expand_normal (arg0);
37244 op1 = expand_normal (arg1);
37246 if (!address_operand (op0, VOIDmode))
37248 op0 = convert_memory_address (Pmode, op0);
37249 op0 = copy_addr_to_reg (op0);
37251 op0 = gen_rtx_MEM (BLKmode, op0);
37253 op1 = force_reg (DImode, op1);
37255 if (TARGET_64BIT)
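/* In 64-bit mode the feature mask is handed to the *_rex64 patterns as its low and high 32-bit halves. */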
37257 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37258 NULL, 1, OPTAB_DIRECT);
37259 switch (fcode)
37261 case IX86_BUILTIN_XSAVE:
37262 icode = CODE_FOR_xsave_rex64;
37263 break;
37264 case IX86_BUILTIN_XRSTOR:
37265 icode = CODE_FOR_xrstor_rex64;
37266 break;
37267 case IX86_BUILTIN_XSAVE64:
37268 icode = CODE_FOR_xsave64;
37269 break;
37270 case IX86_BUILTIN_XRSTOR64:
37271 icode = CODE_FOR_xrstor64;
37272 break;
37273 case IX86_BUILTIN_XSAVEOPT:
37274 icode = CODE_FOR_xsaveopt_rex64;
37275 break;
37276 case IX86_BUILTIN_XSAVEOPT64:
37277 icode = CODE_FOR_xsaveopt64;
37278 break;
37279 case IX86_BUILTIN_XSAVES:
37280 icode = CODE_FOR_xsaves_rex64;
37281 break;
37282 case IX86_BUILTIN_XRSTORS:
37283 icode = CODE_FOR_xrstors_rex64;
37284 break;
37285 case IX86_BUILTIN_XSAVES64:
37286 icode = CODE_FOR_xsaves64;
37287 break;
37288 case IX86_BUILTIN_XRSTORS64:
37289 icode = CODE_FOR_xrstors64;
37290 break;
37291 case IX86_BUILTIN_XSAVEC:
37292 icode = CODE_FOR_xsavec_rex64;
37293 break;
37294 case IX86_BUILTIN_XSAVEC64:
37295 icode = CODE_FOR_xsavec64;
37296 break;
37297 default:
37298 gcc_unreachable ();
37301 op2 = gen_lowpart (SImode, op2);
37302 op1 = gen_lowpart (SImode, op1);
37303 pat = GEN_FCN (icode) (op0, op1, op2);
37305 else
37307 switch (fcode)
37309 case IX86_BUILTIN_XSAVE:
37310 icode = CODE_FOR_xsave;
37311 break;
37312 case IX86_BUILTIN_XRSTOR:
37313 icode = CODE_FOR_xrstor;
37314 break;
37315 case IX86_BUILTIN_XSAVEOPT:
37316 icode = CODE_FOR_xsaveopt;
37317 break;
37318 case IX86_BUILTIN_XSAVES:
37319 icode = CODE_FOR_xsaves;
37320 break;
37321 case IX86_BUILTIN_XRSTORS:
37322 icode = CODE_FOR_xrstors;
37323 break;
37324 case IX86_BUILTIN_XSAVEC:
37325 icode = CODE_FOR_xsavec;
37326 break;
37327 default:
37328 gcc_unreachable ();
37330 pat = GEN_FCN (icode) (op0, op1);
37333 if (pat)
37334 emit_insn (pat);
37335 return 0;
37337 case IX86_BUILTIN_LLWPCB:
37338 arg0 = CALL_EXPR_ARG (exp, 0);
37339 op0 = expand_normal (arg0);
37340 icode = CODE_FOR_lwp_llwpcb;
37341 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37342 op0 = ix86_zero_extend_to_Pmode (op0);
37343 emit_insn (gen_lwp_llwpcb (op0));
37344 return 0;
37346 case IX86_BUILTIN_SLWPCB:
37347 icode = CODE_FOR_lwp_slwpcb;
37348 if (!target
37349 || !insn_data[icode].operand[0].predicate (target, Pmode))
37350 target = gen_reg_rtx (Pmode);
37351 emit_insn (gen_lwp_slwpcb (target));
37352 return target;
37354 case IX86_BUILTIN_BEXTRI32:
37355 case IX86_BUILTIN_BEXTRI64:
37356 arg0 = CALL_EXPR_ARG (exp, 0);
37357 arg1 = CALL_EXPR_ARG (exp, 1);
37358 op0 = expand_normal (arg0);
37359 op1 = expand_normal (arg1);
37360 icode = (fcode == IX86_BUILTIN_BEXTRI32
37361 ? CODE_FOR_tbm_bextri_si
37362 : CODE_FOR_tbm_bextri_di);
37363 if (!CONST_INT_P (op1))
37365 error ("last argument must be an immediate");
37366 return const0_rtx;
37368 else
37370 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
37371 unsigned char lsb_index = INTVAL (op1) & 0xFF;
37372 op1 = GEN_INT (length);
37373 op2 = GEN_INT (lsb_index);
37374 pat = GEN_FCN (icode) (target, op0, op1, op2);
37375 if (pat)
37376 emit_insn (pat);
37377 return target;
37380 case IX86_BUILTIN_RDRAND16_STEP:
37381 icode = CODE_FOR_rdrandhi_1;
37382 mode0 = HImode;
37383 goto rdrand_step;
37385 case IX86_BUILTIN_RDRAND32_STEP:
37386 icode = CODE_FOR_rdrandsi_1;
37387 mode0 = SImode;
37388 goto rdrand_step;
37390 case IX86_BUILTIN_RDRAND64_STEP:
37391 icode = CODE_FOR_rdranddi_1;
37392 mode0 = DImode;
37394 rdrand_step:
37395 arg0 = CALL_EXPR_ARG (exp, 0);
37396 op1 = expand_normal (arg0);
37397 if (!address_operand (op1, VOIDmode))
37399 op1 = convert_memory_address (Pmode, op1);
37400 op1 = copy_addr_to_reg (op1);
37403 op0 = gen_reg_rtx (mode0);
37404 emit_insn (GEN_FCN (icode) (op0));
37406 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37408 op1 = gen_reg_rtx (SImode);
37409 emit_move_insn (op1, CONST1_RTX (SImode));
37411 /* Emit SImode conditional move. */
37412 if (mode0 == HImode)
37414 if (TARGET_ZERO_EXTEND_WITH_AND
37415 && optimize_function_for_speed_p (cfun))
37417 op2 = force_reg (SImode, const0_rtx);
37419 emit_insn (gen_movstricthi
37420 (gen_lowpart (HImode, op2), op0));
37422 else
37424 op2 = gen_reg_rtx (SImode);
37426 emit_insn (gen_zero_extendhisi2 (op2, op0));
37429 else if (mode0 == SImode)
37430 op2 = op0;
37431 else
37432 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37434 if (target == 0
37435 || !register_operand (target, SImode))
37436 target = gen_reg_rtx (SImode);
37438 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37439 const0_rtx);
37440 emit_insn (gen_rtx_SET (target,
37441 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37442 return target;
37444 case IX86_BUILTIN_RDSEED16_STEP:
37445 icode = CODE_FOR_rdseedhi_1;
37446 mode0 = HImode;
37447 goto rdseed_step;
37449 case IX86_BUILTIN_RDSEED32_STEP:
37450 icode = CODE_FOR_rdseedsi_1;
37451 mode0 = SImode;
37452 goto rdseed_step;
37454 case IX86_BUILTIN_RDSEED64_STEP:
37455 icode = CODE_FOR_rdseeddi_1;
37456 mode0 = DImode;
37458 rdseed_step:
37459 arg0 = CALL_EXPR_ARG (exp, 0);
37460 op1 = expand_normal (arg0);
37461 if (!address_operand (op1, VOIDmode))
37463 op1 = convert_memory_address (Pmode, op1);
37464 op1 = copy_addr_to_reg (op1);
37467 op0 = gen_reg_rtx (mode0);
37468 emit_insn (GEN_FCN (icode) (op0));
37470 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
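/* The carry flag indicates whether a random value was delivered; turn it into a 0/1 result. */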
37472 op2 = gen_reg_rtx (QImode);
37474 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37475 const0_rtx);
37476 emit_insn (gen_rtx_SET (op2, pat));
37478 if (target == 0
37479 || !register_operand (target, SImode))
37480 target = gen_reg_rtx (SImode);
37482 emit_insn (gen_zero_extendqisi2 (target, op2));
37483 return target;
37485 case IX86_BUILTIN_SBB32:
37486 icode = CODE_FOR_subborrowsi;
37487 icode2 = CODE_FOR_subborrowsi_0;
37488 mode0 = SImode;
37489 mode1 = DImode;
37490 mode2 = CCmode;
37491 goto handlecarry;
37493 case IX86_BUILTIN_SBB64:
37494 icode = CODE_FOR_subborrowdi;
37495 icode2 = CODE_FOR_subborrowdi_0;
37496 mode0 = DImode;
37497 mode1 = TImode;
37498 mode2 = CCmode;
37499 goto handlecarry;
37501 case IX86_BUILTIN_ADDCARRYX32:
37502 icode = CODE_FOR_addcarrysi;
37503 icode2 = CODE_FOR_addcarrysi_0;
37504 mode0 = SImode;
37505 mode1 = DImode;
37506 mode2 = CCCmode;
37507 goto handlecarry;
37509 case IX86_BUILTIN_ADDCARRYX64:
37510 icode = CODE_FOR_addcarrydi;
37511 icode2 = CODE_FOR_addcarrydi_0;
37512 mode0 = DImode;
37513 mode1 = TImode;
37514 mode2 = CCCmode;
37516 handlecarry:
37517 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37518 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37519 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37520 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
37522 op1 = expand_normal (arg0);
37523 if (!integer_zerop (arg0))
37524 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37526 op2 = expand_normal (arg1);
37527 if (!register_operand (op2, mode0))
37528 op2 = copy_to_mode_reg (mode0, op2);
37530 op3 = expand_normal (arg2);
37531 if (!register_operand (op3, mode0))
37532 op3 = copy_to_mode_reg (mode0, op3);
37534 op4 = expand_normal (arg3);
37535 if (!address_operand (op4, VOIDmode))
37537 op4 = convert_memory_address (Pmode, op4);
37538 op4 = copy_addr_to_reg (op4);
37541 op0 = gen_reg_rtx (mode0);
37542 if (integer_zerop (arg0))
37544 /* If arg0 is 0, optimize right away into an add or sub
37545 instruction that sets the CCCmode flags. */
37546 op1 = gen_rtx_REG (mode2, FLAGS_REG);
37547 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
37549 else
37551 /* Generate CF from input operand. */
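/* Adding -1 to the QImode carry-in sets CF exactly when the incoming carry is nonzero. */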
37552 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
37554 /* Generate instruction that consumes CF. */
37555 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37556 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
37557 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
37558 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
37561 /* Return current CF value. */
37562 if (target == 0)
37563 target = gen_reg_rtx (QImode);
37565 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
37566 emit_insn (gen_rtx_SET (target, pat));
37568 /* Store the result. */
37569 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37571 return target;
37573 case IX86_BUILTIN_READ_FLAGS:
37574 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37576 if (optimize
37577 || target == NULL_RTX
37578 || !nonimmediate_operand (target, word_mode)
37579 || GET_MODE (target) != word_mode)
37580 target = gen_reg_rtx (word_mode);
37582 emit_insn (gen_pop (target));
37583 return target;
37585 case IX86_BUILTIN_WRITE_FLAGS:
37587 arg0 = CALL_EXPR_ARG (exp, 0);
37588 op0 = expand_normal (arg0);
37589 if (!general_no_elim_operand (op0, word_mode))
37590 op0 = copy_to_mode_reg (word_mode, op0);
37592 emit_insn (gen_push (op0));
37593 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37594 return 0;
37596 case IX86_BUILTIN_KTESTC8:
37597 icode = CODE_FOR_ktestqi;
37598 mode3 = CCCmode;
37599 goto kortest;
37601 case IX86_BUILTIN_KTESTZ8:
37602 icode = CODE_FOR_ktestqi;
37603 mode3 = CCZmode;
37604 goto kortest;
37606 case IX86_BUILTIN_KTESTC16:
37607 icode = CODE_FOR_ktesthi;
37608 mode3 = CCCmode;
37609 goto kortest;
37611 case IX86_BUILTIN_KTESTZ16:
37612 icode = CODE_FOR_ktesthi;
37613 mode3 = CCZmode;
37614 goto kortest;
37616 case IX86_BUILTIN_KTESTC32:
37617 icode = CODE_FOR_ktestsi;
37618 mode3 = CCCmode;
37619 goto kortest;
37621 case IX86_BUILTIN_KTESTZ32:
37622 icode = CODE_FOR_ktestsi;
37623 mode3 = CCZmode;
37624 goto kortest;
37626 case IX86_BUILTIN_KTESTC64:
37627 icode = CODE_FOR_ktestdi;
37628 mode3 = CCCmode;
37629 goto kortest;
37631 case IX86_BUILTIN_KTESTZ64:
37632 icode = CODE_FOR_ktestdi;
37633 mode3 = CCZmode;
37634 goto kortest;
37636 case IX86_BUILTIN_KORTESTC8:
37637 icode = CODE_FOR_kortestqi;
37638 mode3 = CCCmode;
37639 goto kortest;
37641 case IX86_BUILTIN_KORTESTZ8:
37642 icode = CODE_FOR_kortestqi;
37643 mode3 = CCZmode;
37644 goto kortest;
37646 case IX86_BUILTIN_KORTESTC16:
37647 icode = CODE_FOR_kortesthi;
37648 mode3 = CCCmode;
37649 goto kortest;
37651 case IX86_BUILTIN_KORTESTZ16:
37652 icode = CODE_FOR_kortesthi;
37653 mode3 = CCZmode;
37654 goto kortest;
37656 case IX86_BUILTIN_KORTESTC32:
37657 icode = CODE_FOR_kortestsi;
37658 mode3 = CCCmode;
37659 goto kortest;
37661 case IX86_BUILTIN_KORTESTZ32:
37662 icode = CODE_FOR_kortestsi;
37663 mode3 = CCZmode;
37664 goto kortest;
37666 case IX86_BUILTIN_KORTESTC64:
37667 icode = CODE_FOR_kortestdi;
37668 mode3 = CCCmode;
37669 goto kortest;
37671 case IX86_BUILTIN_KORTESTZ64:
37672 icode = CODE_FOR_kortestdi;
37673 mode3 = CCZmode;
37675 kortest:
37676 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37677 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37678 op0 = expand_normal (arg0);
37679 op1 = expand_normal (arg1);
37681 mode0 = insn_data[icode].operand[0].mode;
37682 mode1 = insn_data[icode].operand[1].mode;
37684 if (GET_MODE (op0) != VOIDmode)
37685 op0 = force_reg (GET_MODE (op0), op0);
37687 op0 = gen_lowpart (mode0, op0);
37689 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37690 op0 = copy_to_mode_reg (mode0, op0);
37692 if (GET_MODE (op1) != VOIDmode)
37693 op1 = force_reg (GET_MODE (op1), op1);
37695 op1 = gen_lowpart (mode1, op1);
37697 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37698 op1 = copy_to_mode_reg (mode1, op1);
37700 target = gen_reg_rtx (QImode);
37702 /* Emit kortest. */
37703 emit_insn (GEN_FCN (icode) (op0, op1));
37704 /* And use setcc to return result from flags. */
37705 ix86_expand_setcc (target, EQ,
37706 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
37707 return target;
37709 case IX86_BUILTIN_GATHERSIV2DF:
37710 icode = CODE_FOR_avx2_gathersiv2df;
37711 goto gather_gen;
37712 case IX86_BUILTIN_GATHERSIV4DF:
37713 icode = CODE_FOR_avx2_gathersiv4df;
37714 goto gather_gen;
37715 case IX86_BUILTIN_GATHERDIV2DF:
37716 icode = CODE_FOR_avx2_gatherdiv2df;
37717 goto gather_gen;
37718 case IX86_BUILTIN_GATHERDIV4DF:
37719 icode = CODE_FOR_avx2_gatherdiv4df;
37720 goto gather_gen;
37721 case IX86_BUILTIN_GATHERSIV4SF:
37722 icode = CODE_FOR_avx2_gathersiv4sf;
37723 goto gather_gen;
37724 case IX86_BUILTIN_GATHERSIV8SF:
37725 icode = CODE_FOR_avx2_gathersiv8sf;
37726 goto gather_gen;
37727 case IX86_BUILTIN_GATHERDIV4SF:
37728 icode = CODE_FOR_avx2_gatherdiv4sf;
37729 goto gather_gen;
37730 case IX86_BUILTIN_GATHERDIV8SF:
37731 icode = CODE_FOR_avx2_gatherdiv8sf;
37732 goto gather_gen;
37733 case IX86_BUILTIN_GATHERSIV2DI:
37734 icode = CODE_FOR_avx2_gathersiv2di;
37735 goto gather_gen;
37736 case IX86_BUILTIN_GATHERSIV4DI:
37737 icode = CODE_FOR_avx2_gathersiv4di;
37738 goto gather_gen;
37739 case IX86_BUILTIN_GATHERDIV2DI:
37740 icode = CODE_FOR_avx2_gatherdiv2di;
37741 goto gather_gen;
37742 case IX86_BUILTIN_GATHERDIV4DI:
37743 icode = CODE_FOR_avx2_gatherdiv4di;
37744 goto gather_gen;
37745 case IX86_BUILTIN_GATHERSIV4SI:
37746 icode = CODE_FOR_avx2_gathersiv4si;
37747 goto gather_gen;
37748 case IX86_BUILTIN_GATHERSIV8SI:
37749 icode = CODE_FOR_avx2_gathersiv8si;
37750 goto gather_gen;
37751 case IX86_BUILTIN_GATHERDIV4SI:
37752 icode = CODE_FOR_avx2_gatherdiv4si;
37753 goto gather_gen;
37754 case IX86_BUILTIN_GATHERDIV8SI:
37755 icode = CODE_FOR_avx2_gatherdiv8si;
37756 goto gather_gen;
37757 case IX86_BUILTIN_GATHERALTSIV4DF:
37758 icode = CODE_FOR_avx2_gathersiv4df;
37759 goto gather_gen;
37760 case IX86_BUILTIN_GATHERALTDIV8SF:
37761 icode = CODE_FOR_avx2_gatherdiv8sf;
37762 goto gather_gen;
37763 case IX86_BUILTIN_GATHERALTSIV4DI:
37764 icode = CODE_FOR_avx2_gathersiv4di;
37765 goto gather_gen;
37766 case IX86_BUILTIN_GATHERALTDIV8SI:
37767 icode = CODE_FOR_avx2_gatherdiv8si;
37768 goto gather_gen;
37769 case IX86_BUILTIN_GATHER3SIV16SF:
37770 icode = CODE_FOR_avx512f_gathersiv16sf;
37771 goto gather_gen;
37772 case IX86_BUILTIN_GATHER3SIV8DF:
37773 icode = CODE_FOR_avx512f_gathersiv8df;
37774 goto gather_gen;
37775 case IX86_BUILTIN_GATHER3DIV16SF:
37776 icode = CODE_FOR_avx512f_gatherdiv16sf;
37777 goto gather_gen;
37778 case IX86_BUILTIN_GATHER3DIV8DF:
37779 icode = CODE_FOR_avx512f_gatherdiv8df;
37780 goto gather_gen;
37781 case IX86_BUILTIN_GATHER3SIV16SI:
37782 icode = CODE_FOR_avx512f_gathersiv16si;
37783 goto gather_gen;
37784 case IX86_BUILTIN_GATHER3SIV8DI:
37785 icode = CODE_FOR_avx512f_gathersiv8di;
37786 goto gather_gen;
37787 case IX86_BUILTIN_GATHER3DIV16SI:
37788 icode = CODE_FOR_avx512f_gatherdiv16si;
37789 goto gather_gen;
37790 case IX86_BUILTIN_GATHER3DIV8DI:
37791 icode = CODE_FOR_avx512f_gatherdiv8di;
37792 goto gather_gen;
37793 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37794 icode = CODE_FOR_avx512f_gathersiv8df;
37795 goto gather_gen;
37796 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37797 icode = CODE_FOR_avx512f_gatherdiv16sf;
37798 goto gather_gen;
37799 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37800 icode = CODE_FOR_avx512f_gathersiv8di;
37801 goto gather_gen;
37802 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37803 icode = CODE_FOR_avx512f_gatherdiv16si;
37804 goto gather_gen;
37805 case IX86_BUILTIN_GATHER3SIV2DF:
37806 icode = CODE_FOR_avx512vl_gathersiv2df;
37807 goto gather_gen;
37808 case IX86_BUILTIN_GATHER3SIV4DF:
37809 icode = CODE_FOR_avx512vl_gathersiv4df;
37810 goto gather_gen;
37811 case IX86_BUILTIN_GATHER3DIV2DF:
37812 icode = CODE_FOR_avx512vl_gatherdiv2df;
37813 goto gather_gen;
37814 case IX86_BUILTIN_GATHER3DIV4DF:
37815 icode = CODE_FOR_avx512vl_gatherdiv4df;
37816 goto gather_gen;
37817 case IX86_BUILTIN_GATHER3SIV4SF:
37818 icode = CODE_FOR_avx512vl_gathersiv4sf;
37819 goto gather_gen;
37820 case IX86_BUILTIN_GATHER3SIV8SF:
37821 icode = CODE_FOR_avx512vl_gathersiv8sf;
37822 goto gather_gen;
37823 case IX86_BUILTIN_GATHER3DIV4SF:
37824 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37825 goto gather_gen;
37826 case IX86_BUILTIN_GATHER3DIV8SF:
37827 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37828 goto gather_gen;
37829 case IX86_BUILTIN_GATHER3SIV2DI:
37830 icode = CODE_FOR_avx512vl_gathersiv2di;
37831 goto gather_gen;
37832 case IX86_BUILTIN_GATHER3SIV4DI:
37833 icode = CODE_FOR_avx512vl_gathersiv4di;
37834 goto gather_gen;
37835 case IX86_BUILTIN_GATHER3DIV2DI:
37836 icode = CODE_FOR_avx512vl_gatherdiv2di;
37837 goto gather_gen;
37838 case IX86_BUILTIN_GATHER3DIV4DI:
37839 icode = CODE_FOR_avx512vl_gatherdiv4di;
37840 goto gather_gen;
37841 case IX86_BUILTIN_GATHER3SIV4SI:
37842 icode = CODE_FOR_avx512vl_gathersiv4si;
37843 goto gather_gen;
37844 case IX86_BUILTIN_GATHER3SIV8SI:
37845 icode = CODE_FOR_avx512vl_gathersiv8si;
37846 goto gather_gen;
37847 case IX86_BUILTIN_GATHER3DIV4SI:
37848 icode = CODE_FOR_avx512vl_gatherdiv4si;
37849 goto gather_gen;
37850 case IX86_BUILTIN_GATHER3DIV8SI:
37851 icode = CODE_FOR_avx512vl_gatherdiv8si;
37852 goto gather_gen;
37853 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37854 icode = CODE_FOR_avx512vl_gathersiv4df;
37855 goto gather_gen;
37856 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37857 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37858 goto gather_gen;
37859 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37860 icode = CODE_FOR_avx512vl_gathersiv4di;
37861 goto gather_gen;
37862 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37863 icode = CODE_FOR_avx512vl_gatherdiv8si;
37864 goto gather_gen;
37865 case IX86_BUILTIN_SCATTERSIV16SF:
37866 icode = CODE_FOR_avx512f_scattersiv16sf;
37867 goto scatter_gen;
37868 case IX86_BUILTIN_SCATTERSIV8DF:
37869 icode = CODE_FOR_avx512f_scattersiv8df;
37870 goto scatter_gen;
37871 case IX86_BUILTIN_SCATTERDIV16SF:
37872 icode = CODE_FOR_avx512f_scatterdiv16sf;
37873 goto scatter_gen;
37874 case IX86_BUILTIN_SCATTERDIV8DF:
37875 icode = CODE_FOR_avx512f_scatterdiv8df;
37876 goto scatter_gen;
37877 case IX86_BUILTIN_SCATTERSIV16SI:
37878 icode = CODE_FOR_avx512f_scattersiv16si;
37879 goto scatter_gen;
37880 case IX86_BUILTIN_SCATTERSIV8DI:
37881 icode = CODE_FOR_avx512f_scattersiv8di;
37882 goto scatter_gen;
37883 case IX86_BUILTIN_SCATTERDIV16SI:
37884 icode = CODE_FOR_avx512f_scatterdiv16si;
37885 goto scatter_gen;
37886 case IX86_BUILTIN_SCATTERDIV8DI:
37887 icode = CODE_FOR_avx512f_scatterdiv8di;
37888 goto scatter_gen;
37889 case IX86_BUILTIN_SCATTERSIV8SF:
37890 icode = CODE_FOR_avx512vl_scattersiv8sf;
37891 goto scatter_gen;
37892 case IX86_BUILTIN_SCATTERSIV4SF:
37893 icode = CODE_FOR_avx512vl_scattersiv4sf;
37894 goto scatter_gen;
37895 case IX86_BUILTIN_SCATTERSIV4DF:
37896 icode = CODE_FOR_avx512vl_scattersiv4df;
37897 goto scatter_gen;
37898 case IX86_BUILTIN_SCATTERSIV2DF:
37899 icode = CODE_FOR_avx512vl_scattersiv2df;
37900 goto scatter_gen;
37901 case IX86_BUILTIN_SCATTERDIV8SF:
37902 icode = CODE_FOR_avx512vl_scatterdiv8sf;
37903 goto scatter_gen;
37904 case IX86_BUILTIN_SCATTERDIV4SF:
37905 icode = CODE_FOR_avx512vl_scatterdiv4sf;
37906 goto scatter_gen;
37907 case IX86_BUILTIN_SCATTERDIV4DF:
37908 icode = CODE_FOR_avx512vl_scatterdiv4df;
37909 goto scatter_gen;
37910 case IX86_BUILTIN_SCATTERDIV2DF:
37911 icode = CODE_FOR_avx512vl_scatterdiv2df;
37912 goto scatter_gen;
37913 case IX86_BUILTIN_SCATTERSIV8SI:
37914 icode = CODE_FOR_avx512vl_scattersiv8si;
37915 goto scatter_gen;
37916 case IX86_BUILTIN_SCATTERSIV4SI:
37917 icode = CODE_FOR_avx512vl_scattersiv4si;
37918 goto scatter_gen;
37919 case IX86_BUILTIN_SCATTERSIV4DI:
37920 icode = CODE_FOR_avx512vl_scattersiv4di;
37921 goto scatter_gen;
37922 case IX86_BUILTIN_SCATTERSIV2DI:
37923 icode = CODE_FOR_avx512vl_scattersiv2di;
37924 goto scatter_gen;
37925 case IX86_BUILTIN_SCATTERDIV8SI:
37926 icode = CODE_FOR_avx512vl_scatterdiv8si;
37927 goto scatter_gen;
37928 case IX86_BUILTIN_SCATTERDIV4SI:
37929 icode = CODE_FOR_avx512vl_scatterdiv4si;
37930 goto scatter_gen;
37931 case IX86_BUILTIN_SCATTERDIV4DI:
37932 icode = CODE_FOR_avx512vl_scatterdiv4di;
37933 goto scatter_gen;
37934 case IX86_BUILTIN_SCATTERDIV2DI:
37935 icode = CODE_FOR_avx512vl_scatterdiv2di;
37936 goto scatter_gen;
37937 case IX86_BUILTIN_GATHERPFDPD:
37938 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
37939 goto vec_prefetch_gen;
37940 case IX86_BUILTIN_SCATTERALTSIV8DF:
37941 icode = CODE_FOR_avx512f_scattersiv8df;
37942 goto scatter_gen;
37943 case IX86_BUILTIN_SCATTERALTDIV16SF:
37944 icode = CODE_FOR_avx512f_scatterdiv16sf;
37945 goto scatter_gen;
37946 case IX86_BUILTIN_SCATTERALTSIV8DI:
37947 icode = CODE_FOR_avx512f_scattersiv8di;
37948 goto scatter_gen;
37949 case IX86_BUILTIN_SCATTERALTDIV16SI:
37950 icode = CODE_FOR_avx512f_scatterdiv16si;
37951 goto scatter_gen;
37952 case IX86_BUILTIN_GATHERPFDPS:
37953 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
37954 goto vec_prefetch_gen;
37955 case IX86_BUILTIN_GATHERPFQPD:
37956 icode = CODE_FOR_avx512pf_gatherpfv8didf;
37957 goto vec_prefetch_gen;
37958 case IX86_BUILTIN_GATHERPFQPS:
37959 icode = CODE_FOR_avx512pf_gatherpfv8disf;
37960 goto vec_prefetch_gen;
37961 case IX86_BUILTIN_SCATTERPFDPD:
37962 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
37963 goto vec_prefetch_gen;
37964 case IX86_BUILTIN_SCATTERPFDPS:
37965 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
37966 goto vec_prefetch_gen;
37967 case IX86_BUILTIN_SCATTERPFQPD:
37968 icode = CODE_FOR_avx512pf_scatterpfv8didf;
37969 goto vec_prefetch_gen;
37970 case IX86_BUILTIN_SCATTERPFQPS:
37971 icode = CODE_FOR_avx512pf_scatterpfv8disf;
37972 goto vec_prefetch_gen;
37974 gather_gen:
37975 rtx half;
37976 rtx (*gen) (rtx, rtx);
37978 arg0 = CALL_EXPR_ARG (exp, 0);
37979 arg1 = CALL_EXPR_ARG (exp, 1);
37980 arg2 = CALL_EXPR_ARG (exp, 2);
37981 arg3 = CALL_EXPR_ARG (exp, 3);
37982 arg4 = CALL_EXPR_ARG (exp, 4);
37983 op0 = expand_normal (arg0);
37984 op1 = expand_normal (arg1);
37985 op2 = expand_normal (arg2);
37986 op3 = expand_normal (arg3);
37987 op4 = expand_normal (arg4);
37988 /* Note the arg order is different from the operand order. */
37989 mode0 = insn_data[icode].operand[1].mode;
37990 mode2 = insn_data[icode].operand[3].mode;
37991 mode3 = insn_data[icode].operand[4].mode;
37992 mode4 = insn_data[icode].operand[5].mode;
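/* Expand into a fresh pseudo (SUBTARGET) whenever the caller's TARGET
   does not match the pattern's destination mode; for the *DIV16SF,
   *DIV16SI, *DIV8SF and *DIV8SI gathers the pattern writes a full-width
   vector and only the interesting half is extracted into TARGET after
   the insn has been emitted (see the fcode switch further below).  */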
37994 if (target == NULL_RTX
37995 || GET_MODE (target) != insn_data[icode].operand[0].mode
37996 || !insn_data[icode].operand[0].predicate (target,
37997 GET_MODE (target)))
37998 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
37999 else
38000 subtarget = target;
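/* Note: the GATHER3ALT* and GATHERALT* variants mix a SImode index with
   DImode/DFmode data (or vice versa), so one operand carries twice as
   many elements as the other; the switch below extracts the low half of
   the over-wide operand before the gather pattern is used.  */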
38002 switch (fcode)
38004 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38005 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38006 half = gen_reg_rtx (V8SImode);
38007 if (!nonimmediate_operand (op2, V16SImode))
38008 op2 = copy_to_mode_reg (V16SImode, op2);
38009 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38010 op2 = half;
38011 break;
38012 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38013 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38014 case IX86_BUILTIN_GATHERALTSIV4DF:
38015 case IX86_BUILTIN_GATHERALTSIV4DI:
38016 half = gen_reg_rtx (V4SImode);
38017 if (!nonimmediate_operand (op2, V8SImode))
38018 op2 = copy_to_mode_reg (V8SImode, op2);
38019 emit_insn (gen_vec_extract_lo_v8si (half, op2));
38020 op2 = half;
38021 break;
38022 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38023 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38024 half = gen_reg_rtx (mode0);
38025 if (mode0 == V8SFmode)
38026 gen = gen_vec_extract_lo_v16sf;
38027 else
38028 gen = gen_vec_extract_lo_v16si;
38029 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38030 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38031 emit_insn (gen (half, op0));
38032 op0 = half;
38033 if (GET_MODE (op3) != VOIDmode)
38035 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38036 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38037 emit_insn (gen (half, op3));
38038 op3 = half;
38040 break;
38041 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38042 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38043 case IX86_BUILTIN_GATHERALTDIV8SF:
38044 case IX86_BUILTIN_GATHERALTDIV8SI:
38045 half = gen_reg_rtx (mode0);
38046 if (mode0 == V4SFmode)
38047 gen = gen_vec_extract_lo_v8sf;
38048 else
38049 gen = gen_vec_extract_lo_v8si;
38050 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38051 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38052 emit_insn (gen (half, op0));
38053 op0 = half;
38054 if (GET_MODE (op3) != VOIDmode)
38056 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38057 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38058 emit_insn (gen (half, op3));
38059 op3 = half;
38061 break;
38062 default:
38063 break;
38066 /* Force the memory operand to use only a base register here.  We
38067 don't want to do that for memory operands of other builtin
38068 functions. */
38069 op1 = ix86_zero_extend_to_Pmode (op1);
38071 if (!insn_data[icode].operand[1].predicate (op0, mode0))
38072 op0 = copy_to_mode_reg (mode0, op0);
38073 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
38074 op1 = copy_to_mode_reg (Pmode, op1);
38075 if (!insn_data[icode].operand[3].predicate (op2, mode2))
38076 op2 = copy_to_mode_reg (mode2, op2);
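/* The mask argument may arrive as a bare integer constant or in an
   integer mode that differs from the pattern's mask mode MODE3;
   fixup_modeless_constant and the lowpart_subreg path below reinterpret
   its bits in MODE3 rather than converting the value.  */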
38078 op3 = fixup_modeless_constant (op3, mode3);
38080 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
38082 if (!insn_data[icode].operand[4].predicate (op3, mode3))
38083 op3 = copy_to_mode_reg (mode3, op3);
38085 else
38087 op3 = copy_to_reg (op3);
38088 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
38090 if (!insn_data[icode].operand[5].predicate (op4, mode4))
38092 error ("the last argument must be scale 1, 2, 4, 8");
38093 return const0_rtx;
38096 /* Optimize. If mask is known to have all high bits set,
38097 replace op0 with pc_rtx to signal that the instruction
38098 overwrites the whole destination and doesn't use its
38099 previous contents. */
38100 if (optimize)
38102 if (TREE_CODE (arg3) == INTEGER_CST)
38104 if (integer_all_onesp (arg3))
38105 op0 = pc_rtx;
38107 else if (TREE_CODE (arg3) == VECTOR_CST)
38109 unsigned int negative = 0;
38110 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
38112 tree cst = VECTOR_CST_ELT (arg3, i);
38113 if (TREE_CODE (cst) == INTEGER_CST
38114 && tree_int_cst_sign_bit (cst))
38115 negative++;
38116 else if (TREE_CODE (cst) == REAL_CST
38117 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
38118 negative++;
38120 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
38121 op0 = pc_rtx;
38123 else if (TREE_CODE (arg3) == SSA_NAME
38124 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
38126 /* Recognize also when mask is like:
38127 __v2df src = _mm_setzero_pd ();
38128 __v2df mask = _mm_cmpeq_pd (src, src);
38130 __v8sf src = _mm256_setzero_ps ();
38131 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
38132 as that is a cheaper way to load all ones into
38133 a register than having to load a constant from
38134 memory. */
38135 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
38136 if (is_gimple_call (def_stmt))
38138 tree fndecl = gimple_call_fndecl (def_stmt);
38139 if (fndecl
38140 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
38141 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
38143 case IX86_BUILTIN_CMPPD:
38144 case IX86_BUILTIN_CMPPS:
38145 case IX86_BUILTIN_CMPPD256:
38146 case IX86_BUILTIN_CMPPS256:
38147 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
38148 break;
38149 /* FALLTHRU */
38150 case IX86_BUILTIN_CMPEQPD:
38151 case IX86_BUILTIN_CMPEQPS:
38152 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
38153 && initializer_zerop (gimple_call_arg (def_stmt,
38154 1)))
38155 op0 = pc_rtx;
38156 break;
38157 default:
38158 break;
38164 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
38165 if (! pat)
38166 return const0_rtx;
38167 emit_insn (pat);
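/* Gathers whose DImode index vector has fewer elements than the
   pattern's destination mode only write the low half of SUBTARGET, so
   extract that half into TARGET for the corresponding builtins.  */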
38169 switch (fcode)
38171 case IX86_BUILTIN_GATHER3DIV16SF:
38172 if (target == NULL_RTX)
38173 target = gen_reg_rtx (V8SFmode);
38174 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
38175 break;
38176 case IX86_BUILTIN_GATHER3DIV16SI:
38177 if (target == NULL_RTX)
38178 target = gen_reg_rtx (V8SImode);
38179 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
38180 break;
38181 case IX86_BUILTIN_GATHER3DIV8SF:
38182 case IX86_BUILTIN_GATHERDIV8SF:
38183 if (target == NULL_RTX)
38184 target = gen_reg_rtx (V4SFmode);
38185 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
38186 break;
38187 case IX86_BUILTIN_GATHER3DIV8SI:
38188 case IX86_BUILTIN_GATHERDIV8SI:
38189 if (target == NULL_RTX)
38190 target = gen_reg_rtx (V4SImode);
38191 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
38192 break;
38193 default:
38194 target = subtarget;
38195 break;
38197 return target;
38199 scatter_gen:
38200 arg0 = CALL_EXPR_ARG (exp, 0);
38201 arg1 = CALL_EXPR_ARG (exp, 1);
38202 arg2 = CALL_EXPR_ARG (exp, 2);
38203 arg3 = CALL_EXPR_ARG (exp, 3);
38204 arg4 = CALL_EXPR_ARG (exp, 4);
38205 op0 = expand_normal (arg0);
38206 op1 = expand_normal (arg1);
38207 op2 = expand_normal (arg2);
38208 op3 = expand_normal (arg3);
38209 op4 = expand_normal (arg4);
38210 mode1 = insn_data[icode].operand[1].mode;
38211 mode2 = insn_data[icode].operand[2].mode;
38212 mode3 = insn_data[icode].operand[3].mode;
38213 mode4 = insn_data[icode].operand[4].mode;
38215 /* Scatter instruction stores operand op3 to memory with
38216 indices from op2 and scale from op4 under writemask op1.
38217 If index operand op2 has more elements than source operand
38218 op3, only its low half needs to be used, and vice versa. */
38219 switch (fcode)
38221 case IX86_BUILTIN_SCATTERALTSIV8DF:
38222 case IX86_BUILTIN_SCATTERALTSIV8DI:
38223 half = gen_reg_rtx (V8SImode);
38224 if (!nonimmediate_operand (op2, V16SImode))
38225 op2 = copy_to_mode_reg (V16SImode, op2);
38226 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38227 op2 = half;
38228 break;
38229 case IX86_BUILTIN_SCATTERALTDIV16SF:
38230 case IX86_BUILTIN_SCATTERALTDIV16SI:
38231 half = gen_reg_rtx (mode3);
38232 if (mode3 == V8SFmode)
38233 gen = gen_vec_extract_lo_v16sf;
38234 else
38235 gen = gen_vec_extract_lo_v16si;
38236 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38237 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38238 emit_insn (gen (half, op3));
38239 op3 = half;
38240 break;
38241 default:
38242 break;
38245 /* Force the memory operand to use only a base register here.  We
38246 don't want to do that for memory operands of other builtin
38247 functions. */
38248 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
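/* convert_to_mode with UNSIGNEDP set zero-extends the base address to
   Pmode, matching ix86_zero_extend_to_Pmode in the gather path above.  */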
38250 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38251 op0 = copy_to_mode_reg (Pmode, op0);
38253 op1 = fixup_modeless_constant (op1, mode1);
38255 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
38257 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38258 op1 = copy_to_mode_reg (mode1, op1);
38260 else
38262 op1 = copy_to_reg (op1);
38263 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
38266 if (!insn_data[icode].operand[2].predicate (op2, mode2))
38267 op2 = copy_to_mode_reg (mode2, op2);
38269 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38270 op3 = copy_to_mode_reg (mode3, op3);
38272 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38274 error ("the last argument must be scale 1, 2, 4, 8");
38275 return const0_rtx;
38278 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38279 if (! pat)
38280 return const0_rtx;
38282 emit_insn (pat);
38283 return 0;
38285 vec_prefetch_gen:
38286 arg0 = CALL_EXPR_ARG (exp, 0);
38287 arg1 = CALL_EXPR_ARG (exp, 1);
38288 arg2 = CALL_EXPR_ARG (exp, 2);
38289 arg3 = CALL_EXPR_ARG (exp, 3);
38290 arg4 = CALL_EXPR_ARG (exp, 4);
38291 op0 = expand_normal (arg0);
38292 op1 = expand_normal (arg1);
38293 op2 = expand_normal (arg2);
38294 op3 = expand_normal (arg3);
38295 op4 = expand_normal (arg4);
38296 mode0 = insn_data[icode].operand[0].mode;
38297 mode1 = insn_data[icode].operand[1].mode;
38298 mode3 = insn_data[icode].operand[3].mode;
38299 mode4 = insn_data[icode].operand[4].mode;
38301 op0 = fixup_modeless_constant (op0, mode0);
38303 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
38305 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38306 op0 = copy_to_mode_reg (mode0, op0);
38308 else
38310 op0 = copy_to_reg (op0);
38311 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
38314 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38315 op1 = copy_to_mode_reg (mode1, op1);
38317 /* Force the memory operand to use only a base register here.  We
38318 don't want to do that for memory operands of other builtin
38319 functions. */
38320 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
38322 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
38323 op2 = copy_to_mode_reg (Pmode, op2);
38325 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38327 error ("the fourth argument must be scale 1, 2, 4, 8");
38328 return const0_rtx;
38331 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38333 error ("incorrect hint operand");
38334 return const0_rtx;
38337 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38338 if (! pat)
38339 return const0_rtx;
38341 emit_insn (pat);
38343 return 0;
38345 case IX86_BUILTIN_XABORT:
38346 icode = CODE_FOR_xabort;
38347 arg0 = CALL_EXPR_ARG (exp, 0);
38348 op0 = expand_normal (arg0);
38349 mode0 = insn_data[icode].operand[0].mode;
38350 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38352 error ("the xabort's argument must be an 8-bit immediate");
38353 return const0_rtx;
38355 emit_insn (gen_xabort (op0));
38356 return 0;
38358 case IX86_BUILTIN_RSTORSSP:
38359 case IX86_BUILTIN_CLRSSBSY:
38360 arg0 = CALL_EXPR_ARG (exp, 0);
38361 op0 = expand_normal (arg0);
38362 icode = (fcode == IX86_BUILTIN_RSTORSSP
38363 ? CODE_FOR_rstorssp
38364 : CODE_FOR_clrssbsy);
38365 if (!address_operand (op0, VOIDmode))
38367 op1 = convert_memory_address (Pmode, op0);
38368 op0 = copy_addr_to_reg (op1);
38370 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
38371 return 0;
38373 case IX86_BUILTIN_WRSSD:
38374 case IX86_BUILTIN_WRSSQ:
38375 case IX86_BUILTIN_WRUSSD:
38376 case IX86_BUILTIN_WRUSSQ:
38377 arg0 = CALL_EXPR_ARG (exp, 0);
38378 op0 = expand_normal (arg0);
38379 arg1 = CALL_EXPR_ARG (exp, 1);
38380 op1 = expand_normal (arg1);
38381 switch (fcode)
38383 case IX86_BUILTIN_WRSSD:
38384 icode = CODE_FOR_wrsssi;
38385 mode = SImode;
38386 break;
38387 case IX86_BUILTIN_WRSSQ:
38388 icode = CODE_FOR_wrssdi;
38389 mode = DImode;
38390 break;
38391 case IX86_BUILTIN_WRUSSD:
38392 icode = CODE_FOR_wrusssi;
38393 mode = SImode;
38394 break;
38395 case IX86_BUILTIN_WRUSSQ:
38396 icode = CODE_FOR_wrussdi;
38397 mode = DImode;
38398 break;
38400 op0 = force_reg (mode, op0);
38401 if (!address_operand (op1, VOIDmode))
38403 op2 = convert_memory_address (Pmode, op1);
38404 op1 = copy_addr_to_reg (op2);
38406 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
38407 return 0;
38409 default:
38410 break;
38413 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
38414 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
38416 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
38417 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
38418 target);
38421 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST
38422 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST)
38424 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST;
38425 return ix86_expand_special_args_builtin (bdesc_special_args2 + i, exp,
38426 target);
38429 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
38430 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
38432 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
38433 switch (fcode)
38435 case IX86_BUILTIN_FABSQ:
38436 case IX86_BUILTIN_COPYSIGNQ:
38437 if (!TARGET_SSE)
38438 /* Emit a normal call if SSE isn't available. */
38439 return expand_call (exp, target, ignore);
38440 /* FALLTHRU */
38441 default:
38442 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
38446 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
38447 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
38449 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
38450 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
38451 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
38452 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
38453 int masked = 1;
38454 machine_mode mode, wide_mode, nar_mode;
38456 nar_mode = V4SFmode;
38457 mode = V16SFmode;
38458 wide_mode = V64SFmode;
38459 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
38460 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
38462 switch (fcode)
38464 case IX86_BUILTIN_4FMAPS:
38465 fcn = gen_avx5124fmaddps_4fmaddps;
38466 masked = 0;
38467 goto v4fma_expand;
38469 case IX86_BUILTIN_4DPWSSD:
38470 nar_mode = V4SImode;
38471 mode = V16SImode;
38472 wide_mode = V64SImode;
38473 fcn = gen_avx5124vnniw_vp4dpwssd;
38474 masked = 0;
38475 goto v4fma_expand;
38477 case IX86_BUILTIN_4DPWSSDS:
38478 nar_mode = V4SImode;
38479 mode = V16SImode;
38480 wide_mode = V64SImode;
38481 fcn = gen_avx5124vnniw_vp4dpwssds;
38482 masked = 0;
38483 goto v4fma_expand;
38485 case IX86_BUILTIN_4FNMAPS:
38486 fcn = gen_avx5124fmaddps_4fnmaddps;
38487 masked = 0;
38488 goto v4fma_expand;
38490 case IX86_BUILTIN_4FNMAPS_MASK:
38491 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
38492 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
38493 goto v4fma_expand;
38495 case IX86_BUILTIN_4DPWSSD_MASK:
38496 nar_mode = V4SImode;
38497 mode = V16SImode;
38498 wide_mode = V64SImode;
38499 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
38500 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
38501 goto v4fma_expand;
38503 case IX86_BUILTIN_4DPWSSDS_MASK:
38504 nar_mode = V4SImode;
38505 mode = V16SImode;
38506 wide_mode = V64SImode;
38507 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
38508 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
38509 goto v4fma_expand;
38511 case IX86_BUILTIN_4FMAPS_MASK:
38513 tree args[4];
38514 rtx ops[4];
38515 rtx wide_reg;
38516 rtx accum;
38517 rtx addr;
38518 rtx mem;
38520 v4fma_expand:
38521 wide_reg = gen_reg_rtx (wide_mode);
38522 for (i = 0; i < 4; i++)
38524 args[i] = CALL_EXPR_ARG (exp, i);
38525 ops[i] = expand_normal (args[i]);
38527 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
38528 ops[i]);
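/* The four vector arguments are packed back to back into WIDE_REG (a
   V64SF or V64SI value), which the 4FMADDPS/4DPWSSD patterns consume as
   a single wide operand.  */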
38531 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38532 accum = force_reg (mode, accum);
38534 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38535 addr = force_reg (Pmode, addr);
38537 mem = gen_rtx_MEM (nar_mode, addr);
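/* Besides WIDE_REG, the patterns also read a narrow memory operand of
   NAR_MODE (four elements) at ADDR.  */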
38539 target = gen_reg_rtx (mode);
38541 emit_move_insn (target, accum);
38543 if (! masked)
38544 emit_insn (fcn (target, accum, wide_reg, mem));
38545 else
38547 rtx merge, mask;
38548 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38550 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38552 if (CONST_INT_P (mask))
38553 mask = fixup_modeless_constant (mask, HImode);
38555 mask = force_reg (HImode, mask);
38557 if (GET_MODE (mask) != HImode)
38558 mask = gen_rtx_SUBREG (HImode, mask, 0);
38560 /* If merge is 0 then we're about to emit z-masked variant. */
38561 if (const0_operand (merge, mode))
38562 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38563 /* If merge is the same as accum then emit merge-masked variant. */
38564 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38566 merge = force_reg (mode, merge);
38567 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38569 /* Merge with something unknown might happen if we z-mask w/ -O0. */
38570 else
38572 target = gen_reg_rtx (mode);
38573 emit_move_insn (target, merge);
38574 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38577 return target;
38580 case IX86_BUILTIN_4FNMASS:
38581 fcn = gen_avx5124fmaddps_4fnmaddss;
38582 masked = 0;
38583 goto s4fma_expand;
38585 case IX86_BUILTIN_4FMASS:
38586 fcn = gen_avx5124fmaddps_4fmaddss;
38587 masked = 0;
38588 goto s4fma_expand;
38590 case IX86_BUILTIN_4FNMASS_MASK:
38591 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
38592 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
38593 goto s4fma_expand;
38595 case IX86_BUILTIN_4FMASS_MASK:
38597 tree args[4];
38598 rtx ops[4];
38599 rtx wide_reg;
38600 rtx accum;
38601 rtx addr;
38602 rtx mem;
38604 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
38605 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
38607 s4fma_expand:
38608 mode = V4SFmode;
38609 wide_reg = gen_reg_rtx (V64SFmode);
38610 for (i = 0; i < 4; i++)
38612 rtx tmp;
38613 args[i] = CALL_EXPR_ARG (exp, i);
38614 ops[i] = expand_normal (args[i]);
38616 tmp = gen_reg_rtx (SFmode);
38617 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
38619 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
38620 gen_rtx_SUBREG (V16SFmode, tmp, 0));
38623 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38624 accum = force_reg (V4SFmode, accum);
38626 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38627 addr = force_reg (Pmode, addr);
38629 mem = gen_rtx_MEM (V4SFmode, addr);
38631 target = gen_reg_rtx (V4SFmode);
38633 emit_move_insn (target, accum);
38635 if (! masked)
38636 emit_insn (fcn (target, accum, wide_reg, mem));
38637 else
38639 rtx merge, mask;
38640 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38642 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38644 if (CONST_INT_P (mask))
38645 mask = fixup_modeless_constant (mask, QImode);
38647 mask = force_reg (QImode, mask);
38649 if (GET_MODE (mask) != QImode)
38650 mask = gen_rtx_SUBREG (QImode, mask, 0);
38652 /* If merge is 0 then we're about to emit z-masked variant. */
38653 if (const0_operand (merge, mode))
38654 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38655 /* If merge is the same as accum then emit merge-masked
38656 variant. */
38657 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38659 merge = force_reg (mode, merge);
38660 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38662 /* Merge with something unknown might happen if we z-mask
38663 w/ -O0. */
38664 else
38666 target = gen_reg_rtx (mode);
38667 emit_move_insn (target, merge);
38668 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38671 return target;
38673 case IX86_BUILTIN_RDPID:
38674 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
38675 target);
38676 default:
38677 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
38681 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
38682 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
38684 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
38685 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
38688 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
38689 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
38691 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
38692 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
38695 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
38696 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
38698 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
38699 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
38702 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
38703 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
38705 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
38706 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
38709 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
38710 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
38712 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
38713 const struct builtin_description *d = bdesc_multi_arg + i;
38714 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
38715 (enum ix86_builtin_func_type)
38716 d->flag, d->comparison);
38719 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
38720 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
38722 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
38723 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
38724 target);
38727 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
38728 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
38730 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
38731 return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
38732 target);
38735 gcc_unreachable ();
38738 /* This returns the target-specific builtin with code CODE if
38739 current_function_decl has visibility on this builtin, which is checked
38740 using isa flags. Returns NULL_TREE otherwise. */
38742 static tree ix86_get_builtin (enum ix86_builtins code)
38744 struct cl_target_option *opts;
38745 tree target_tree = NULL_TREE;
38747 /* Determine the isa flags of current_function_decl. */
38749 if (current_function_decl)
38750 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
38752 if (target_tree == NULL)
38753 target_tree = target_option_default_node;
38755 opts = TREE_TARGET_OPTION (target_tree);
38757 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
38758 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
38759 return ix86_builtin_decl (code, true);
38760 else
38761 return NULL_TREE;
38764 /* Return the function decl for the target-specific builtin
38765 corresponding to the MPX builtin passed in FCODE. */
38766 static tree
38767 ix86_builtin_mpx_function (unsigned fcode)
38769 switch (fcode)
38771 case BUILT_IN_CHKP_BNDMK:
38772 return ix86_builtins[IX86_BUILTIN_BNDMK];
38774 case BUILT_IN_CHKP_BNDSTX:
38775 return ix86_builtins[IX86_BUILTIN_BNDSTX];
38777 case BUILT_IN_CHKP_BNDLDX:
38778 return ix86_builtins[IX86_BUILTIN_BNDLDX];
38780 case BUILT_IN_CHKP_BNDCL:
38781 return ix86_builtins[IX86_BUILTIN_BNDCL];
38783 case BUILT_IN_CHKP_BNDCU:
38784 return ix86_builtins[IX86_BUILTIN_BNDCU];
38786 case BUILT_IN_CHKP_BNDRET:
38787 return ix86_builtins[IX86_BUILTIN_BNDRET];
38789 case BUILT_IN_CHKP_INTERSECT:
38790 return ix86_builtins[IX86_BUILTIN_BNDINT];
38792 case BUILT_IN_CHKP_NARROW:
38793 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
38795 case BUILT_IN_CHKP_SIZEOF:
38796 return ix86_builtins[IX86_BUILTIN_SIZEOF];
38798 case BUILT_IN_CHKP_EXTRACT_LOWER:
38799 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
38801 case BUILT_IN_CHKP_EXTRACT_UPPER:
38802 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
38804 default:
38805 return NULL_TREE;
38808 gcc_unreachable ();
38811 /* Helper function for ix86_load_bounds and ix86_store_bounds.
38813 Return an address to be used to load/store bounds for pointer
38814 passed in SLOT.
38816 SLOT_NO is an integer constant holding the number of a
38817 target-dependent special slot to be used in case SLOT is not a memory.
38819 SPECIAL_BASE is a pointer to be used as a base of fake address
38820 to access special slots in Bounds Table. SPECIAL_BASE[-1],
38821 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
38823 static rtx
38824 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
38826 rtx addr = NULL;
38828 /* A NULL slot means we pass bounds for a pointer not passed to the
38829 function at all.  A register slot means we pass the pointer in a
38830 register.  In both cases bounds are passed via the Bounds
38831 Table.  Since we do not have an actual pointer stored in memory,
38832 we have to use fake addresses to access the Bounds Table.  We
38833 start with (special_base - sizeof (void*)) and decrease this
38834 address by the pointer size to get the addresses for other slots. */
38835 if (!slot || REG_P (slot))
38837 gcc_assert (CONST_INT_P (slot_no));
38838 addr = plus_constant (Pmode, special_base,
38839 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
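/* So slot 0 is addressed at SPECIAL_BASE minus one pointer size, slot 1
   at SPECIAL_BASE minus two pointer sizes, and so on.  */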
38841 /* If pointer is passed in a memory then its address is used to
38842 access Bounds Table. */
38843 else if (MEM_P (slot))
38845 addr = XEXP (slot, 0);
38846 if (!register_operand (addr, Pmode))
38847 addr = copy_addr_to_reg (addr);
38849 else
38850 gcc_unreachable ();
38852 return addr;
38855 /* The expand pass uses this hook to load bounds for function parameter
38856 PTR passed in SLOT in case its bounds are not passed in a register.
38858 If SLOT is a memory, then bounds are loaded as for a regular pointer
38859 loaded from memory.  PTR may be NULL in case SLOT is a memory.
38860 In that case the value of PTR (if required) may be loaded from SLOT.
38862 If SLOT is NULL or a register then SLOT_NO is an integer constant
38863 holding number of the target dependent special slot which should be
38864 used to obtain bounds.
38866 Return loaded bounds. */
38868 static rtx
38869 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38871 rtx reg = gen_reg_rtx (BNDmode);
38872 rtx addr;
38874 /* Get the address to be used to access the Bounds Table.  Special slots
38875 start at the location of the return address of the current function. */
38876 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38878 /* Load pointer value from a memory if we don't have it. */
38879 if (!ptr)
38881 gcc_assert (MEM_P (slot));
38882 ptr = copy_addr_to_reg (slot);
38885 if (!register_operand (ptr, Pmode))
38886 ptr = ix86_zero_extend_to_Pmode (ptr);
38888 emit_insn (BNDmode == BND64mode
38889 ? gen_bnd64_ldx (reg, addr, ptr)
38890 : gen_bnd32_ldx (reg, addr, ptr));
38892 return reg;
38895 /* The expand pass uses this hook to store BOUNDS for call argument PTR
38896 passed in SLOT in case BOUNDS are not passed in a register.
38898 If SLOT is a memory, then BOUNDS are stored as for a regular pointer
38899 stored in memory.  PTR may be NULL in case SLOT is a memory.
38900 In that case the value of PTR (if required) may be loaded from SLOT.
38902 If SLOT is NULL or a register then SLOT_NO is an integer constant
38903 holding number of the target dependent special slot which should be
38904 used to store BOUNDS. */
38906 static void
38907 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38909 rtx addr;
38911 /* Get the address to be used to access the Bounds Table.  Special slots
38912 start at the location of the return address of a called function. */
38913 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38915 /* Load pointer value from a memory if we don't have it. */
38916 if (!ptr)
38918 gcc_assert (MEM_P (slot));
38919 ptr = copy_addr_to_reg (slot);
38922 if (!register_operand (ptr, Pmode))
38923 ptr = ix86_zero_extend_to_Pmode (ptr);
38925 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
38926 if (!register_operand (bounds, BNDmode))
38927 bounds = copy_to_mode_reg (BNDmode, bounds);
38929 emit_insn (BNDmode == BND64mode
38930 ? gen_bnd64_stx (addr, ptr, bounds)
38931 : gen_bnd32_stx (addr, ptr, bounds));
38934 /* Load and return bounds returned by function in SLOT. */
38936 static rtx
38937 ix86_load_returned_bounds (rtx slot)
38939 rtx res;
38941 gcc_assert (REG_P (slot));
38942 res = gen_reg_rtx (BNDmode);
38943 emit_move_insn (res, slot);
38945 return res;
38948 /* Store BOUNDS returned by function into SLOT. */
38950 static void
38951 ix86_store_returned_bounds (rtx slot, rtx bounds)
38953 gcc_assert (REG_P (slot));
38954 emit_move_insn (slot, bounds);
38957 /* Returns a function decl for a vectorized version of the combined function
38958 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
38959 if it is not available. */
38961 static tree
38962 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38963 tree type_in)
38965 machine_mode in_mode, out_mode;
38966 int in_n, out_n;
38968 if (TREE_CODE (type_out) != VECTOR_TYPE
38969 || TREE_CODE (type_in) != VECTOR_TYPE)
38970 return NULL_TREE;
38972 out_mode = TYPE_MODE (TREE_TYPE (type_out));
38973 out_n = TYPE_VECTOR_SUBPARTS (type_out);
38974 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38975 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38977 switch (fn)
38979 CASE_CFN_EXP2:
38980 if (out_mode == SFmode && in_mode == SFmode)
38982 if (out_n == 16 && in_n == 16)
38983 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
38985 break;
38987 CASE_CFN_IFLOOR:
38988 CASE_CFN_LFLOOR:
38989 CASE_CFN_LLFLOOR:
38990 /* The round insn does not trap on denormals. */
38991 if (flag_trapping_math || !TARGET_SSE4_1)
38992 break;
38994 if (out_mode == SImode && in_mode == DFmode)
38996 if (out_n == 4 && in_n == 2)
38997 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
38998 else if (out_n == 8 && in_n == 4)
38999 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
39000 else if (out_n == 16 && in_n == 8)
39001 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
39003 if (out_mode == SImode && in_mode == SFmode)
39005 if (out_n == 4 && in_n == 4)
39006 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
39007 else if (out_n == 8 && in_n == 8)
39008 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
39009 else if (out_n == 16 && in_n == 16)
39010 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
39012 break;
39014 CASE_CFN_ICEIL:
39015 CASE_CFN_LCEIL:
39016 CASE_CFN_LLCEIL:
39017 /* The round insn does not trap on denormals. */
39018 if (flag_trapping_math || !TARGET_SSE4_1)
39019 break;
39021 if (out_mode == SImode && in_mode == DFmode)
39023 if (out_n == 4 && in_n == 2)
39024 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
39025 else if (out_n == 8 && in_n == 4)
39026 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
39027 else if (out_n == 16 && in_n == 8)
39028 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
39030 if (out_mode == SImode && in_mode == SFmode)
39032 if (out_n == 4 && in_n == 4)
39033 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
39034 else if (out_n == 8 && in_n == 8)
39035 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
39036 else if (out_n == 16 && in_n == 16)
39037 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
39039 break;
39041 CASE_CFN_IRINT:
39042 CASE_CFN_LRINT:
39043 CASE_CFN_LLRINT:
39044 if (out_mode == SImode && in_mode == DFmode)
39046 if (out_n == 4 && in_n == 2)
39047 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
39048 else if (out_n == 8 && in_n == 4)
39049 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
39050 else if (out_n == 16 && in_n == 8)
39051 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
39053 if (out_mode == SImode && in_mode == SFmode)
39055 if (out_n == 4 && in_n == 4)
39056 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
39057 else if (out_n == 8 && in_n == 8)
39058 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
39059 else if (out_n == 16 && in_n == 16)
39060 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
39062 break;
39064 CASE_CFN_IROUND:
39065 CASE_CFN_LROUND:
39066 CASE_CFN_LLROUND:
39067 /* The round insn does not trap on denormals. */
39068 if (flag_trapping_math || !TARGET_SSE4_1)
39069 break;
39071 if (out_mode == SImode && in_mode == DFmode)
39073 if (out_n == 4 && in_n == 2)
39074 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
39075 else if (out_n == 8 && in_n == 4)
39076 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
39077 else if (out_n == 16 && in_n == 8)
39078 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
39080 if (out_mode == SImode && in_mode == SFmode)
39082 if (out_n == 4 && in_n == 4)
39083 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
39084 else if (out_n == 8 && in_n == 8)
39085 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
39086 else if (out_n == 16 && in_n == 16)
39087 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
39089 break;
39091 CASE_CFN_FLOOR:
39092 /* The round insn does not trap on denormals. */
39093 if (flag_trapping_math || !TARGET_SSE4_1)
39094 break;
39096 if (out_mode == DFmode && in_mode == DFmode)
39098 if (out_n == 2 && in_n == 2)
39099 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
39100 else if (out_n == 4 && in_n == 4)
39101 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
39102 else if (out_n == 8 && in_n == 8)
39103 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
39105 if (out_mode == SFmode && in_mode == SFmode)
39107 if (out_n == 4 && in_n == 4)
39108 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
39109 else if (out_n == 8 && in_n == 8)
39110 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
39111 else if (out_n == 16 && in_n == 16)
39112 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
39114 break;
39116 CASE_CFN_CEIL:
39117 /* The round insn does not trap on denormals. */
39118 if (flag_trapping_math || !TARGET_SSE4_1)
39119 break;
39121 if (out_mode == DFmode && in_mode == DFmode)
39123 if (out_n == 2 && in_n == 2)
39124 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
39125 else if (out_n == 4 && in_n == 4)
39126 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
39127 else if (out_n == 8 && in_n == 8)
39128 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
39130 if (out_mode == SFmode && in_mode == SFmode)
39132 if (out_n == 4 && in_n == 4)
39133 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
39134 else if (out_n == 8 && in_n == 8)
39135 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
39136 else if (out_n == 16 && in_n == 16)
39137 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
39139 break;
39141 CASE_CFN_TRUNC:
39142 /* The round insn does not trap on denormals. */
39143 if (flag_trapping_math || !TARGET_SSE4_1)
39144 break;
39146 if (out_mode == DFmode && in_mode == DFmode)
39148 if (out_n == 2 && in_n == 2)
39149 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
39150 else if (out_n == 4 && in_n == 4)
39151 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
39152 else if (out_n == 8 && in_n == 8)
39153 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
39155 if (out_mode == SFmode && in_mode == SFmode)
39157 if (out_n == 4 && in_n == 4)
39158 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
39159 else if (out_n == 8 && in_n == 8)
39160 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
39161 else if (out_n == 16 && in_n == 16)
39162 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
39164 break;
39166 CASE_CFN_RINT:
39167 /* The round insn does not trap on denormals. */
39168 if (flag_trapping_math || !TARGET_SSE4_1)
39169 break;
39171 if (out_mode == DFmode && in_mode == DFmode)
39173 if (out_n == 2 && in_n == 2)
39174 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
39175 else if (out_n == 4 && in_n == 4)
39176 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
39178 if (out_mode == SFmode && in_mode == SFmode)
39180 if (out_n == 4 && in_n == 4)
39181 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
39182 else if (out_n == 8 && in_n == 8)
39183 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
39185 break;
39187 CASE_CFN_FMA:
39188 if (out_mode == DFmode && in_mode == DFmode)
39190 if (out_n == 2 && in_n == 2)
39191 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
39192 if (out_n == 4 && in_n == 4)
39193 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
39195 if (out_mode == SFmode && in_mode == SFmode)
39197 if (out_n == 4 && in_n == 4)
39198 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
39199 if (out_n == 8 && in_n == 8)
39200 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
39202 break;
39204 default:
39205 break;
39208 /* Dispatch to a handler for a vectorization library. */
39209 if (ix86_veclib_handler)
39210 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
39212 return NULL_TREE;
39215 /* Handler for an SVML-style interface to
39216 a library with vectorized intrinsics. */
39218 static tree
39219 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
39221 char name[20];
39222 tree fntype, new_fndecl, args;
39223 unsigned arity;
39224 const char *bname;
39225 machine_mode el_mode, in_mode;
39226 int n, in_n;
39228 /* The SVML is suitable for unsafe math only. */
39229 if (!flag_unsafe_math_optimizations)
39230 return NULL_TREE;
39232 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39233 n = TYPE_VECTOR_SUBPARTS (type_out);
39234 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39235 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39236 if (el_mode != in_mode
39237 || n != in_n)
39238 return NULL_TREE;
39240 switch (fn)
39242 CASE_CFN_EXP:
39243 CASE_CFN_LOG:
39244 CASE_CFN_LOG10:
39245 CASE_CFN_POW:
39246 CASE_CFN_TANH:
39247 CASE_CFN_TAN:
39248 CASE_CFN_ATAN:
39249 CASE_CFN_ATAN2:
39250 CASE_CFN_ATANH:
39251 CASE_CFN_CBRT:
39252 CASE_CFN_SINH:
39253 CASE_CFN_SIN:
39254 CASE_CFN_ASINH:
39255 CASE_CFN_ASIN:
39256 CASE_CFN_COSH:
39257 CASE_CFN_COS:
39258 CASE_CFN_ACOSH:
39259 CASE_CFN_ACOS:
39260 if ((el_mode != DFmode || n != 2)
39261 && (el_mode != SFmode || n != 4))
39262 return NULL_TREE;
39263 break;
39265 default:
39266 return NULL_TREE;
39269 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39270 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39272 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
39273 strcpy (name, "vmlsLn4");
39274 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
39275 strcpy (name, "vmldLn2");
39276 else if (n == 4)
39278 sprintf (name, "vmls%s", bname+10);
39279 name[strlen (name)-1] = '4';
39281 else
39282 sprintf (name, "vmld%s2", bname+10);
39284 /* Convert to uppercase. */
39285 name[4] &= ~0x20;
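/* The resulting names follow the SVML naming scheme, e.g. sinf becomes
   vmlsSin4 and sin becomes vmldSin2 (logf and log were special-cased
   above as vmlsLn4 and vmldLn2).  */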
39287 arity = 0;
39288 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39289 arity++;
39291 if (arity == 1)
39292 fntype = build_function_type_list (type_out, type_in, NULL);
39293 else
39294 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39296 /* Build a function declaration for the vectorized function. */
39297 new_fndecl = build_decl (BUILTINS_LOCATION,
39298 FUNCTION_DECL, get_identifier (name), fntype);
39299 TREE_PUBLIC (new_fndecl) = 1;
39300 DECL_EXTERNAL (new_fndecl) = 1;
39301 DECL_IS_NOVOPS (new_fndecl) = 1;
39302 TREE_READONLY (new_fndecl) = 1;
39304 return new_fndecl;
39307 /* Handler for an ACML-style interface to
39308 a library with vectorized intrinsics. */
39310 static tree
39311 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
39313 char name[20] = "__vr.._";
39314 tree fntype, new_fndecl, args;
39315 unsigned arity;
39316 const char *bname;
39317 machine_mode el_mode, in_mode;
39318 int n, in_n;
39320 /* The ACML is 64-bit only and suitable for unsafe math only, as
39321 it does not correctly support parts of IEEE with the required
39322 precision, such as denormals. */
39323 if (!TARGET_64BIT
39324 || !flag_unsafe_math_optimizations)
39325 return NULL_TREE;
39327 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39328 n = TYPE_VECTOR_SUBPARTS (type_out);
39329 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39330 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39331 if (el_mode != in_mode
39332 || n != in_n)
39333 return NULL_TREE;
39335 switch (fn)
39337 CASE_CFN_SIN:
39338 CASE_CFN_COS:
39339 CASE_CFN_EXP:
39340 CASE_CFN_LOG:
39341 CASE_CFN_LOG2:
39342 CASE_CFN_LOG10:
39343 if (el_mode == DFmode && n == 2)
39345 name[4] = 'd';
39346 name[5] = '2';
39348 else if (el_mode == SFmode && n == 4)
39350 name[4] = 's';
39351 name[5] = '4';
39353 else
39354 return NULL_TREE;
39355 break;
39357 default:
39358 return NULL_TREE;
39361 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39362 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39363 sprintf (name + 7, "%s", bname+10);
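/* E.g. sin becomes __vrd2_sin and sinf becomes __vrs4_sinf.  */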
39365 arity = 0;
39366 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39367 arity++;
39369 if (arity == 1)
39370 fntype = build_function_type_list (type_out, type_in, NULL);
39371 else
39372 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39374 /* Build a function declaration for the vectorized function. */
39375 new_fndecl = build_decl (BUILTINS_LOCATION,
39376 FUNCTION_DECL, get_identifier (name), fntype);
39377 TREE_PUBLIC (new_fndecl) = 1;
39378 DECL_EXTERNAL (new_fndecl) = 1;
39379 DECL_IS_NOVOPS (new_fndecl) = 1;
39380 TREE_READONLY (new_fndecl) = 1;
39382 return new_fndecl;
39385 /* Returns a decl of a function that implements gather load with
39386 memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
39387 Return NULL_TREE if it is not available. */
39389 static tree
39390 ix86_vectorize_builtin_gather (const_tree mem_vectype,
39391 const_tree index_type, int scale)
39393 bool si;
39394 enum ix86_builtins code;
39396 if (! TARGET_AVX2 || !TARGET_USE_GATHER)
39397 return NULL_TREE;
39399 if ((TREE_CODE (index_type) != INTEGER_TYPE
39400 && !POINTER_TYPE_P (index_type))
39401 || (TYPE_MODE (index_type) != SImode
39402 && TYPE_MODE (index_type) != DImode))
39403 return NULL_TREE;
39405 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39406 return NULL_TREE;
39408 /* v*gather* insn sign extends index to pointer mode. */
39409 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39410 && TYPE_UNSIGNED (index_type))
39411 return NULL_TREE;
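/* Scale can be 1, 2, 4 or 8.  */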
39413 if (scale <= 0
39414 || scale > 8
39415 || (scale & (scale - 1)) != 0)
39416 return NULL_TREE;
39418 si = TYPE_MODE (index_type) == SImode;
39419 switch (TYPE_MODE (mem_vectype))
39421 case E_V2DFmode:
39422 if (TARGET_AVX512VL)
39423 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
39424 else
39425 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
39426 break;
39427 case E_V4DFmode:
39428 if (TARGET_AVX512VL)
39429 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
39430 else
39431 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
39432 break;
39433 case E_V2DImode:
39434 if (TARGET_AVX512VL)
39435 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
39436 else
39437 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
39438 break;
39439 case E_V4DImode:
39440 if (TARGET_AVX512VL)
39441 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
39442 else
39443 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
39444 break;
39445 case E_V4SFmode:
39446 if (TARGET_AVX512VL)
39447 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
39448 else
39449 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
39450 break;
39451 case E_V8SFmode:
39452 if (TARGET_AVX512VL)
39453 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
39454 else
39455 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
39456 break;
39457 case E_V4SImode:
39458 if (TARGET_AVX512VL)
39459 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
39460 else
39461 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
39462 break;
39463 case E_V8SImode:
39464 if (TARGET_AVX512VL)
39465 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
39466 else
39467 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
39468 break;
39469 case E_V8DFmode:
39470 if (TARGET_AVX512F)
39471 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
39472 else
39473 return NULL_TREE;
39474 break;
39475 case E_V8DImode:
39476 if (TARGET_AVX512F)
39477 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
39478 else
39479 return NULL_TREE;
39480 break;
39481 case E_V16SFmode:
39482 if (TARGET_AVX512F)
39483 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
39484 else
39485 return NULL_TREE;
39486 break;
39487 case E_V16SImode:
39488 if (TARGET_AVX512F)
39489 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
39490 else
39491 return NULL_TREE;
39492 break;
39493 default:
39494 return NULL_TREE;
39497 return ix86_get_builtin (code);
39500 /* Returns a decl of a function that implements scatter store with
39501 register type VECTYPE and index type INDEX_TYPE and SCALE.
39502 Return NULL_TREE if it is not available. */
39504 static tree
39505 ix86_vectorize_builtin_scatter (const_tree vectype,
39506 const_tree index_type, int scale)
39508 bool si;
39509 enum ix86_builtins code;
39511 if (!TARGET_AVX512F)
39512 return NULL_TREE;
39514 if ((TREE_CODE (index_type) != INTEGER_TYPE
39515 && !POINTER_TYPE_P (index_type))
39516 || (TYPE_MODE (index_type) != SImode
39517 && TYPE_MODE (index_type) != DImode))
39518 return NULL_TREE;
39520 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39521 return NULL_TREE;
39523 /* v*scatter* insn sign extends index to pointer mode. */
39524 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39525 && TYPE_UNSIGNED (index_type))
39526 return NULL_TREE;
39528 /* Scale can be 1, 2, 4 or 8. */
39529 if (scale <= 0
39530 || scale > 8
39531 || (scale & (scale - 1)) != 0)
39532 return NULL_TREE;
39534 si = TYPE_MODE (index_type) == SImode;
39535 switch (TYPE_MODE (vectype))
39537 case E_V8DFmode:
39538 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
39539 break;
39540 case E_V8DImode:
39541 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
39542 break;
39543 case E_V16SFmode:
39544 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
39545 break;
39546 case E_V16SImode:
39547 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
39548 break;
39549 default:
39550 return NULL_TREE;
39553 return ix86_builtins[code];
39556 /* Return true if it is safe to use the rsqrt optabs to optimize
39557 1.0/sqrt. */
39559 static bool
39560 use_rsqrt_p ()
39562 return (TARGET_SSE_MATH
39563 && flag_finite_math_only
39564 && !flag_trapping_math
39565 && flag_unsafe_math_optimizations);
39568 /* Returns a decl of a target-specific builtin that implements
39569 the reciprocal of the function, or NULL_TREE if not available. */
39571 static tree
39572 ix86_builtin_reciprocal (tree fndecl)
39574 switch (DECL_FUNCTION_CODE (fndecl))
39576 /* Vectorized version of sqrt to rsqrt conversion. */
39577 case IX86_BUILTIN_SQRTPS_NR:
39578 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
39580 case IX86_BUILTIN_SQRTPS_NR256:
39581 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
39583 default:
39584 return NULL_TREE;
39588 /* Helper for avx_vpermilps256_operand et al. This is also used by
39589 the expansion functions to turn the parallel back into a mask.
39590 The return value is 0 for no match and the imm8+1 for a match. */
39593 avx_vpermilp_parallel (rtx par, machine_mode mode)
39595 unsigned i, nelt = GET_MODE_NUNITS (mode);
39596 unsigned mask = 0;
39597 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
39599 if (XVECLEN (par, 0) != (int) nelt)
39600 return 0;
39602 /* Validate that all of the elements are constants, and not totally
39603 out of range. Copy the data into an integral array to make the
39604 subsequent checks easier. */
39605 for (i = 0; i < nelt; ++i)
39607 rtx er = XVECEXP (par, 0, i);
39608 unsigned HOST_WIDE_INT ei;
39610 if (!CONST_INT_P (er))
39611 return 0;
39612 ei = INTVAL (er);
39613 if (ei >= nelt)
39614 return 0;
39615 ipar[i] = ei;
39618 switch (mode)
39620 case E_V8DFmode:
39621 /* In the 512-bit DFmode case, we can only move elements within
39622 a 128-bit lane. First fill the second part of the mask,
39623 then fallthru. */
39624 for (i = 4; i < 6; ++i)
39626 if (ipar[i] < 4 || ipar[i] >= 6)
39627 return 0;
39628 mask |= (ipar[i] - 4) << i;
39630 for (i = 6; i < 8; ++i)
39632 if (ipar[i] < 6)
39633 return 0;
39634 mask |= (ipar[i] - 6) << i;
39636 /* FALLTHRU */
39638 case E_V4DFmode:
39639 /* In the 256-bit DFmode case, we can only move elements within
39640 a 128-bit lane. */
39641 for (i = 0; i < 2; ++i)
39643 if (ipar[i] >= 2)
39644 return 0;
39645 mask |= ipar[i] << i;
39647 for (i = 2; i < 4; ++i)
39649 if (ipar[i] < 2)
39650 return 0;
39651 mask |= (ipar[i] - 2) << i;
39653 break;
39655 case E_V16SFmode:
39656 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
39657 must mirror the permutation in the lower 256 bits. */
39658 for (i = 0; i < 8; ++i)
39659 if (ipar[i] + 8 != ipar[i + 8])
39660 return 0;
39661 /* FALLTHRU */
39663 case E_V8SFmode:
39664 /* In the 256-bit SFmode case, we have full freedom of
39665 movement within the low 128-bit lane, but the high 128-bit
39666 lane must mirror the exact same pattern. */
39667 for (i = 0; i < 4; ++i)
39668 if (ipar[i] + 4 != ipar[i + 4])
39669 return 0;
39670 nelt = 4;
39671 /* FALLTHRU */
39673 case E_V2DFmode:
39674 case E_V4SFmode:
39675 /* In the 128-bit case, we have full freedom in the placement of
39676 the elements from the source operand. */
39677 for (i = 0; i < nelt; ++i)
39678 mask |= ipar[i] << (i * (nelt / 2));
39679 break;
39681 default:
39682 gcc_unreachable ();
39685 /* Make sure success has a non-zero value by adding one. */
39686 return mask + 1;
39689 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
39690 the expansion functions to turn the parallel back into a mask.
39691 The return value is 0 for no match and the imm8+1 for a match. */
39694 avx_vperm2f128_parallel (rtx par, machine_mode mode)
39696 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
39697 unsigned mask = 0;
39698 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
39700 if (XVECLEN (par, 0) != (int) nelt)
39701 return 0;
39703 /* Validate that all of the elements are constants, and not totally
39704 out of range. Copy the data into an integral array to make the
39705 subsequent checks easier. */
39706 for (i = 0; i < nelt; ++i)
39708 rtx er = XVECEXP (par, 0, i);
39709 unsigned HOST_WIDE_INT ei;
39711 if (!CONST_INT_P (er))
39712 return 0;
39713 ei = INTVAL (er);
39714 if (ei >= 2 * nelt)
39715 return 0;
39716 ipar[i] = ei;
39719 /* Validate that each half of the permute is made of consecutive elements. */
39720 for (i = 0; i < nelt2 - 1; ++i)
39721 if (ipar[i] + 1 != ipar[i + 1])
39722 return 0;
39723 for (i = nelt2; i < nelt - 1; ++i)
39724 if (ipar[i] + 1 != ipar[i + 1])
39725 return 0;
39727 /* Reconstruct the mask. */
39728 for (i = 0; i < 2; ++i)
39730 unsigned e = ipar[i * nelt2];
39731 if (e % nelt2)
39732 return 0;
39733 e /= nelt2;
39734 mask |= e << (i * 4);
39737 /* Make sure success has a non-zero value by adding one. */
39738 return mask + 1;
39741 /* Return a register priority for hard reg REGNO. */
39742 static int
39743 ix86_register_priority (int hard_regno)
39745 /* ebp and r13 as the base always want a displacement, and r12 as the
39746 base always wants an index.  So discourage their use in an
39747 address. */
39748 if (hard_regno == R12_REG || hard_regno == R13_REG)
39749 return 0;
39750 if (hard_regno == BP_REG)
39751 return 1;
39752 /* New x86-64 int registers result in bigger code size. Discourage
39753 them. */
39754 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
39755 return 2;
39756 /* New x86-64 SSE registers result in bigger code size. Discourage
39757 them. */
39758 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
39759 return 2;
39760 /* Usage of AX register results in smaller code. Prefer it. */
39761 if (hard_regno == AX_REG)
39762 return 4;
39763 return 3;
39766 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
39768 Put float CONST_DOUBLE in the constant pool instead of fp regs.
39769 QImode must go into class Q_REGS.
39770 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
39771 movdf to do mem-to-mem moves through integer regs. */
39773 static reg_class_t
39774 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
39776 machine_mode mode = GET_MODE (x);
39778 /* We're only allowed to return a subclass of CLASS. Many of the
39779 following checks fail for NO_REGS, so eliminate that early. */
39780 if (regclass == NO_REGS)
39781 return NO_REGS;
39783 /* All classes can load zeros. */
39784 if (x == CONST0_RTX (mode))
39785 return regclass;
39787 /* Force constants into memory if we are loading a (nonzero) constant into
39788 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
39789 instructions to load from a constant. */
39790 if (CONSTANT_P (x)
39791 && (MAYBE_MMX_CLASS_P (regclass)
39792 || MAYBE_SSE_CLASS_P (regclass)
39793 || MAYBE_MASK_CLASS_P (regclass)))
39794 return NO_REGS;
39796 /* Floating-point constants need more complex checks. */
39797 if (CONST_DOUBLE_P (x))
39799 /* General regs can load everything. */
39800 if (INTEGER_CLASS_P (regclass))
39801 return regclass;
39803 /* Floats can load 0 and 1 plus some others. Note that we eliminated
39804 zero above. We only want to wind up preferring 80387 registers if
39805 we plan on doing computation with them. */
39806 if (IS_STACK_MODE (mode)
39807 && standard_80387_constant_p (x) > 0)
39809 /* Limit class to FP regs. */
39810 if (FLOAT_CLASS_P (regclass))
39811 return FLOAT_REGS;
39812 else if (regclass == FP_TOP_SSE_REGS)
39813 return FP_TOP_REG;
39814 else if (regclass == FP_SECOND_SSE_REGS)
39815 return FP_SECOND_REG;
39818 return NO_REGS;
39821 /* Prefer SSE regs only, if we can use them for math. */
39822 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39823 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
39825 /* Generally when we see PLUS here, it's the function invariant
39826 (plus soft-fp const_int), which can only be computed into general
39827 regs. */
39828 if (GET_CODE (x) == PLUS)
39829 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
39831 /* QImode constants are easy to load, but non-constant QImode data
39832 must go into Q_REGS. */
39833 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39835 if (Q_CLASS_P (regclass))
39836 return regclass;
39837 else if (reg_class_subset_p (Q_REGS, regclass))
39838 return Q_REGS;
39839 else
39840 return NO_REGS;
39843 return regclass;
39846 /* Discourage putting floating-point values in SSE registers unless
39847 SSE math is being used, and likewise for the 387 registers. */
39848 static reg_class_t
39849 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39851 machine_mode mode = GET_MODE (x);
39853 /* Restrict the output reload class to the register bank that we are doing
39854 math on. If the right bank is not a subset of REGCLASS, reject this
39855 alternative by returning NO_REGS; if reload cannot do that, it will still use its own choice. */
39857 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39858 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39860 if (IS_STACK_MODE (mode))
39862 if (regclass == FP_TOP_SSE_REGS)
39863 return FP_TOP_REG;
39864 else if (regclass == FP_SECOND_SSE_REGS)
39865 return FP_SECOND_REG;
39866 else
39867 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39870 return regclass;
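/* Implement TARGET_SECONDARY_RELOAD.  */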
39873 static reg_class_t
39874 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39875 machine_mode mode, secondary_reload_info *sri)
39877 /* Double-word spills from general registers to non-offsettable memory
39878 references (zero-extended addresses) require special handling. */
39879 if (TARGET_64BIT
39880 && MEM_P (x)
39881 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39882 && INTEGER_CLASS_P (rclass)
39883 && !offsettable_memref_p (x))
39885 sri->icode = (in_p
39886 ? CODE_FOR_reload_noff_load
39887 : CODE_FOR_reload_noff_store);
39888 /* Add the cost of moving address to a temporary. */
39889 sri->extra_cost = 1;
39891 return NO_REGS;
39894 /* QImode spills from non-QI registers require an
39895 intermediate register on 32-bit targets. */
39896 if (mode == QImode
39897 && ((!TARGET_64BIT && !in_p
39898 && INTEGER_CLASS_P (rclass)
39899 && MAYBE_NON_Q_CLASS_P (rclass))
39900 || (!TARGET_AVX512DQ
39901 && MAYBE_MASK_CLASS_P (rclass))))
39903 int regno = true_regnum (x);
39905 /* Return Q_REGS if the operand is in memory. */
39906 if (regno == -1)
39907 return Q_REGS;
39909 return NO_REGS;
39912 /* This condition handles the corner case where an expression involving
39913 pointers gets vectorized. We're trying to use the address of a
39914 stack slot as a vector initializer.
39916 (set (reg:V2DI 74 [ vect_cst_.2 ])
39917 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39919 Eventually frame gets turned into sp+offset like this:
39921 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39922 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39923 (const_int 392 [0x188]))))
39925 That later gets turned into:
39927 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39928 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39929 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39931 We'll have the following reload recorded:
39933 Reload 0: reload_in (DI) =
39934 (plus:DI (reg/f:DI 7 sp)
39935 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39936 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39937 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39938 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39939 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39940 reload_reg_rtx: (reg:V2DI 22 xmm1)
39942 This isn't going to work, since SSE instructions can't handle scalar
39943 additions. Returning GENERAL_REGS forces the addition into an integer
39944 register, and reload can then handle subsequent reloads without problems. */
39946 if (in_p && GET_CODE (x) == PLUS
39947 && SSE_CLASS_P (rclass)
39948 && SCALAR_INT_MODE_P (mode))
39949 return GENERAL_REGS;
39951 return NO_REGS;
39954 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
39956 static bool
39957 ix86_class_likely_spilled_p (reg_class_t rclass)
39959 switch (rclass)
39961 case AREG:
39962 case DREG:
39963 case CREG:
39964 case BREG:
39965 case AD_REGS:
39966 case SIREG:
39967 case DIREG:
39968 case SSE_FIRST_REG:
39969 case FP_TOP_REG:
39970 case FP_SECOND_REG:
39971 case BND_REGS:
39972 return true;
39974 default:
39975 break;
39978 return false;
39981 /* If we are copying between registers from different register sets
39982 (e.g. FP and integer), we may need a memory location.
39984 The function can't work reliably when one of the CLASSES is a class
39985 containing registers from multiple sets. We avoid this by never combining
39986 different sets in a single alternative in the machine description.
39987 Ensure that this constraint holds to avoid unexpected surprises.
39989 When STRICT is false, we are being called from REGISTER_MOVE_COST,
39990 so we do not enforce these sanity checks.
39992 To keep register_move_cost fast, an inline variant is defined here. */
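/* For example, an x87 <-> SSE copy always needs a memory temporary, and so
   does an SSE <-> general register copy wider than a word.  */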
39994 static inline bool
39995 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
39996 reg_class_t class2, int strict)
39998 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
39999 return false;
40001 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
40002 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
40003 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
40004 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
40005 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
40006 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
40007 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
40008 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
40010 gcc_assert (!strict || lra_in_progress);
40011 return true;
40014 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
40015 return true;
40017 /* Between mask and general, we have moves no larger than word size. */
40018 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
40019 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
40020 return true;
40022 /* ??? This is a lie. We do have moves between mmx and general registers,
40023 and between mmx and sse2. But by saying we need secondary memory we discourage the
40024 register allocator from using the mmx registers unless needed. */
40025 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
40026 return true;
40028 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40030 /* SSE1 doesn't have any direct moves from other classes. */
40031 if (!TARGET_SSE2)
40032 return true;
40034 /* If the target says that inter-unit moves are more expensive
40035 than moving through memory, then don't generate them. */
40036 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
40037 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
40038 return true;
40040 /* Between SSE and general, we have moves no larger than word size. */
40041 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40042 return true;
40045 return false;
40048 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
40050 static bool
40051 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
40052 reg_class_t class2)
40054 return inline_secondary_memory_needed (mode, class1, class2, true);
40057 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
40059 get_secondary_mem widens integral modes to BITS_PER_WORD.
40060 There is no need to emit a full 64-bit move on 64-bit targets
40061 for integral modes that can be moved using a 32-bit move. */
40063 static machine_mode
40064 ix86_secondary_memory_needed_mode (machine_mode mode)
40066 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
40067 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
40068 return mode;
40071 /* Implement the TARGET_CLASS_MAX_NREGS hook.
40073 On the 80386, this is the size of MODE in words,
40074 except in the FP regs, where a single reg is always enough. */
40076 static unsigned char
40077 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
40079 if (MAYBE_INTEGER_CLASS_P (rclass))
40081 if (mode == XFmode)
40082 return (TARGET_64BIT ? 2 : 3);
40083 else if (mode == XCmode)
40084 return (TARGET_64BIT ? 4 : 6);
40085 else
40086 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40088 else
40090 if (COMPLEX_MODE_P (mode))
40091 return 2;
40092 else
40093 return 1;
40097 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
40099 static bool
40100 ix86_can_change_mode_class (machine_mode from, machine_mode to,
40101 reg_class_t regclass)
40103 if (from == to)
40104 return true;
40106 /* x87 registers can't do subreg at all, as all values are reformatted
40107 to extended precision. */
40108 if (MAYBE_FLOAT_CLASS_P (regclass))
40109 return false;
40111 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
40113 /* Vector registers do not support QI or HImode loads. If we don't
40114 disallow a change to these modes, reload will assume it's ok to
40115 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
40116 the vec_dupv4hi pattern. */
40117 if (GET_MODE_SIZE (from) < 4)
40118 return false;
40121 return true;
40124 /* Return index of MODE in the sse load/store tables. */
40126 static inline int
40127 sse_store_index (machine_mode mode)
40129 switch (GET_MODE_SIZE (mode))
40131 case 4:
40132 return 0;
40133 case 8:
40134 return 1;
40135 case 16:
40136 return 2;
40137 case 32:
40138 return 3;
40139 case 64:
40140 return 4;
40141 default:
40142 return -1;
40146 /* Return the cost of moving data of mode MODE between a
40147 register and memory. A value of 2 is the default; this cost is
40148 relative to those in `REGISTER_MOVE_COST'.
40150 This function is used extensively by register_move_cost, which is used to
40151 build tables at startup, so it is declared inline.
40152 When IN is 2, return the maximum of the in and out move costs.
40154 If moving between registers and memory is more expensive than
40155 between two registers, the value returned here expresses that
40156 relative cost.
40158 Also model the increased cost of moving QImode registers in non-Q_REGS
40159 classes. */
40161 static inline int
40162 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
40163 int in)
40165 int cost;
40166 if (FLOAT_CLASS_P (regclass))
40168 int index;
40169 switch (mode)
40171 case E_SFmode:
40172 index = 0;
40173 break;
40174 case E_DFmode:
40175 index = 1;
40176 break;
40177 case E_XFmode:
40178 index = 2;
40179 break;
40180 default:
40181 return 100;
40183 if (in == 2)
40184 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
40185 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
40187 if (SSE_CLASS_P (regclass))
40189 int index = sse_store_index (mode);
40190 if (index == -1)
40191 return 100;
40192 if (in == 2)
40193 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
40194 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
40196 if (MMX_CLASS_P (regclass))
40198 int index;
40199 switch (GET_MODE_SIZE (mode))
40201 case 4:
40202 index = 0;
40203 break;
40204 case 8:
40205 index = 1;
40206 break;
40207 default:
40208 return 100;
40210 if (in == 2)
40211 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
40212 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
40214 switch (GET_MODE_SIZE (mode))
40216 case 1:
40217 if (Q_CLASS_P (regclass) || TARGET_64BIT)
40219 if (!in)
40220 return ix86_cost->int_store[0];
40221 if (TARGET_PARTIAL_REG_DEPENDENCY
40222 && optimize_function_for_speed_p (cfun))
40223 cost = ix86_cost->movzbl_load;
40224 else
40225 cost = ix86_cost->int_load[0];
40226 if (in == 2)
40227 return MAX (cost, ix86_cost->int_store[0]);
40228 return cost;
40230 else
40232 if (in == 2)
40233 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
40234 if (in)
40235 return ix86_cost->movzbl_load;
40236 else
40237 return ix86_cost->int_store[0] + 4;
40239 break;
40240 case 2:
40241 if (in == 2)
40242 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
40243 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
40244 default:
40245 /* Compute the number of 32-bit moves needed. TFmode is moved as XFmode. */
40246 if (mode == TFmode)
40247 mode = XFmode;
40248 if (in == 2)
40249 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
40250 else if (in)
40251 cost = ix86_cost->int_load[2];
40252 else
40253 cost = ix86_cost->int_store[2];
40254 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
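/* Implement TARGET_MEMORY_MOVE_COST.  */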
40258 static int
40259 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
40260 bool in)
40262 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
40266 /* Return the cost of moving data from a register in class CLASS1 to
40267 one in class CLASS2.
40269 It is not required that the cost always equal 2 when FROM is the same as TO;
40270 on some machines it is expensive to move between registers if they are not
40271 general registers. */
40273 static int
40274 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
40275 reg_class_t class2_i)
40277 enum reg_class class1 = (enum reg_class) class1_i;
40278 enum reg_class class2 = (enum reg_class) class2_i;
40280 /* In case we require secondary memory, compute the cost of the store followed
40281 by the load. In order to avoid bad register allocation choices, we need
40282 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
40284 if (inline_secondary_memory_needed (mode, class1, class2, false))
40286 int cost = 1;
40288 cost += inline_memory_move_cost (mode, class1, 2);
40289 cost += inline_memory_move_cost (mode, class2, 2);
40291 /* In case of copying from a general purpose register we may emit multiple
40292 stores followed by a single load, causing a memory-size-mismatch stall.
40293 Count this as an arbitrarily high cost of 20. */
40294 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
40295 && TARGET_MEMORY_MISMATCH_STALL
40296 && targetm.class_max_nregs (class1, mode)
40297 > targetm.class_max_nregs (class2, mode))
40298 cost += 20;
40300 /* In the case of FP/MMX moves, the registers actually overlap, and we
40301 have to switch modes in order to treat them differently. */
40302 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
40303 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
40304 cost += 20;
40306 return cost;
40309 /* Moves between SSE/MMX and integer unit are expensive. */
40310 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
40311 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40313 /* ??? By keeping the returned value relatively high, we limit the number
40314 of moves between integer and MMX/SSE registers for all targets.
40315 Additionally, a high value prevents problems with x86_modes_tieable_p (),
40316 where integer modes in MMX/SSE registers are not tieable
40317 because of missing QImode and HImode moves to, from or between
40318 MMX/SSE registers. */
40319 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
40320 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
40322 if (MAYBE_FLOAT_CLASS_P (class1))
40323 return ix86_cost->fp_move;
40324 if (MAYBE_SSE_CLASS_P (class1))
40326 if (GET_MODE_BITSIZE (mode) <= 128)
40327 return ix86_cost->xmm_move;
40328 if (GET_MODE_BITSIZE (mode) <= 256)
40329 return ix86_cost->ymm_move;
40330 return ix86_cost->zmm_move;
40332 if (MAYBE_MMX_CLASS_P (class1))
40333 return ix86_cost->mmx_move;
40334 return 2;
40337 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
40338 words of a value of mode MODE but can be less for certain modes in
40339 special long registers.
40341 Note that there are no two-word move instructions for consecutive
40342 registers, and only registers 0-3 may have mov-byte instructions
40343 applied to them. */
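/* For example, an XFmode value occupies three registers in the 32-bit
   general register file but only a single x87 register.  */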
40345 static unsigned int
40346 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
40348 if (GENERAL_REGNO_P (regno))
40350 if (mode == XFmode)
40351 return TARGET_64BIT ? 2 : 3;
40352 if (mode == XCmode)
40353 return TARGET_64BIT ? 4 : 6;
40354 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40356 if (COMPLEX_MODE_P (mode))
40357 return 2;
40358 if (mode == V64SFmode || mode == V64SImode)
40359 return 4;
40360 return 1;
40363 /* Implement TARGET_HARD_REGNO_MODE_OK. */
40365 static bool
40366 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
40368 /* Only the flags register can hold CCmode values, and it can hold nothing else. */
40369 if (CC_REGNO_P (regno))
40370 return GET_MODE_CLASS (mode) == MODE_CC;
40371 if (GET_MODE_CLASS (mode) == MODE_CC
40372 || GET_MODE_CLASS (mode) == MODE_RANDOM
40373 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
40374 return false;
40375 if (STACK_REGNO_P (regno))
40376 return VALID_FP_MODE_P (mode);
40377 if (MASK_REGNO_P (regno))
40378 return (VALID_MASK_REG_MODE (mode)
40379 || (TARGET_AVX512BW
40380 && VALID_MASK_AVX512BW_MODE (mode)));
40381 if (BND_REGNO_P (regno))
40382 return VALID_BND_REG_MODE (mode);
40383 if (SSE_REGNO_P (regno))
40385 /* We implement the move patterns for all vector modes into and
40386 out of SSE registers, even when no operation instructions
40387 are available. */
40389 /* For AVX-512 we allow, regardless of regno:
40390 - XI mode
40391 - any 512-bit wide vector mode
40392 - any scalar mode. */
40393 if (TARGET_AVX512F
40394 && (mode == XImode
40395 || VALID_AVX512F_REG_MODE (mode)
40396 || VALID_AVX512F_SCALAR_MODE (mode)))
40397 return true;
40399 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
40400 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40401 && MOD4_SSE_REGNO_P (regno)
40402 && mode == V64SFmode)
40403 return true;
40405 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
40406 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40407 && MOD4_SSE_REGNO_P (regno)
40408 && mode == V64SImode)
40409 return true;
40411 /* TODO check for QI/HI scalars. */
40412 /* AVX512VL allows SSE registers 16+ for 128/256-bit modes. */
40413 if (TARGET_AVX512VL
40414 && (mode == OImode
40415 || mode == TImode
40416 || VALID_AVX256_REG_MODE (mode)
40417 || VALID_AVX512VL_128_REG_MODE (mode)))
40418 return true;
40420 /* xmm16-xmm31 are only available for AVX-512. */
40421 if (EXT_REX_SSE_REGNO_P (regno))
40422 return false;
40424 /* OImode and AVX modes are available only when AVX is enabled. */
40425 return ((TARGET_AVX
40426 && VALID_AVX256_REG_OR_OI_MODE (mode))
40427 || VALID_SSE_REG_MODE (mode)
40428 || VALID_SSE2_REG_MODE (mode)
40429 || VALID_MMX_REG_MODE (mode)
40430 || VALID_MMX_REG_MODE_3DNOW (mode));
40432 if (MMX_REGNO_P (regno))
40434 /* We implement the move patterns for 3DNOW modes even in MMX mode,
40435 so if the register is available at all, then we can move data of
40436 the given mode into or out of it. */
40437 return (VALID_MMX_REG_MODE (mode)
40438 || VALID_MMX_REG_MODE_3DNOW (mode));
40441 if (mode == QImode)
40443 /* Take care with QImode values - they can be in non-QI regs,
40444 but then they do cause partial register stalls. */
40445 if (ANY_QI_REGNO_P (regno))
40446 return true;
40447 if (!TARGET_PARTIAL_REG_STALL)
40448 return true;
40449 /* LRA checks if the hard register is OK for the given mode.
40450 QImode values can live in non-QI regs, so we allow all
40451 registers here. */
40452 if (lra_in_progress)
40453 return true;
40454 return !can_create_pseudo_p ();
40456 /* We handle both integer and floating-point values in the general purpose registers. */
40457 else if (VALID_INT_MODE_P (mode))
40458 return true;
40459 else if (VALID_FP_MODE_P (mode))
40460 return true;
40461 else if (VALID_DFP_MODE_P (mode))
40462 return true;
40463 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
40464 on to use that value in smaller contexts, this can easily force a
40465 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
40466 supporting DImode, allow it. */
40467 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
40468 return true;
40470 return false;
40473 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
40474 saves SSE registers across calls is Win64 (thus no need to check the
40475 current ABI here), and with AVX enabled Win64 only guarantees that
40476 the low 16 bytes are saved. */
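/* For example, under the Win64 ABI only the low 16 bytes of the
   callee-saved registers xmm6-xmm15 are preserved, so a value wider than
   16 bytes living in an SSE register is treated as partly clobbered.  */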
40478 static bool
40479 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
40481 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
40484 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
40485 tieable integer mode. */
40487 static bool
40488 ix86_tieable_integer_mode_p (machine_mode mode)
40490 switch (mode)
40492 case E_HImode:
40493 case E_SImode:
40494 return true;
40496 case E_QImode:
40497 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
40499 case E_DImode:
40500 return TARGET_64BIT;
40502 default:
40503 return false;
40507 /* Implement TARGET_MODES_TIEABLE_P.
40509 Return true if MODE1 is accessible in a register that can hold MODE2
40510 without copying. That is, all register classes that can hold MODE2
40511 can also hold MODE1. */
40513 static bool
40514 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
40516 if (mode1 == mode2)
40517 return true;
40519 if (ix86_tieable_integer_mode_p (mode1)
40520 && ix86_tieable_integer_mode_p (mode2))
40521 return true;
40523 /* MODE2 being XFmode implies fp stack or general regs, which means we
40524 can tie any smaller floating point modes to it. Note that we do not
40525 tie this with TFmode. */
40526 if (mode2 == XFmode)
40527 return mode1 == SFmode || mode1 == DFmode;
40529 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
40530 that we can tie it with SFmode. */
40531 if (mode2 == DFmode)
40532 return mode1 == SFmode;
40534 /* If MODE2 is only appropriate for an SSE register, then tie with
40535 any other mode acceptable to SSE registers. */
40536 if (GET_MODE_SIZE (mode2) == 32
40537 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40538 return (GET_MODE_SIZE (mode1) == 32
40539 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40540 if (GET_MODE_SIZE (mode2) == 16
40541 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40542 return (GET_MODE_SIZE (mode1) == 16
40543 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40545 /* If MODE2 is appropriate for an MMX register, then tie
40546 with any other mode acceptable to MMX registers. */
40547 if (GET_MODE_SIZE (mode2) == 8
40548 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
40549 return (GET_MODE_SIZE (mode1) == 8
40550 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
40552 return false;
40555 /* Return the cost of moving between two registers of mode MODE. */
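/* For example, a DImode copy on a 32-bit target is done in two word-sized
   pieces and so costs two instructions.  */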
40557 static int
40558 ix86_set_reg_reg_cost (machine_mode mode)
40560 unsigned int units = UNITS_PER_WORD;
40562 switch (GET_MODE_CLASS (mode))
40564 default:
40565 break;
40567 case MODE_CC:
40568 units = GET_MODE_SIZE (CCmode);
40569 break;
40571 case MODE_FLOAT:
40572 if ((TARGET_SSE && mode == TFmode)
40573 || (TARGET_80387 && mode == XFmode)
40574 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
40575 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
40576 units = GET_MODE_SIZE (mode);
40577 break;
40579 case MODE_COMPLEX_FLOAT:
40580 if ((TARGET_SSE && mode == TCmode)
40581 || (TARGET_80387 && mode == XCmode)
40582 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
40583 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
40584 units = GET_MODE_SIZE (mode);
40585 break;
40587 case MODE_VECTOR_INT:
40588 case MODE_VECTOR_FLOAT:
40589 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
40590 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
40591 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
40592 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
40593 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
40594 units = GET_MODE_SIZE (mode);
40597 /* Return the cost of moving between two registers of mode MODE,
40598 assuming that the move will be in pieces of at most UNITS bytes. */
40599 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
40602 /* Return the cost of a vector operation in MODE given that the scalar version has
40603 cost COST. If PARALLEL is true, assume that the CPU has more than one unit
40604 performing the operation. */
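/* For example, when PARALLEL is false the scalar COST is charged once per
   vector element, and with TARGET_AVX128_OPTIMAL a 256-bit operation is
   charged as two 128-bit halves, i.e. twice COST.  */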
40606 static int
40607 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
40609 if (!VECTOR_MODE_P (mode))
40610 return cost;
40612 if (!parallel)
40613 return cost * GET_MODE_NUNITS (mode);
40614 if (GET_MODE_BITSIZE (mode) == 128
40615 && TARGET_SSE_SPLIT_REGS)
40616 return cost * 2;
40617 if (GET_MODE_BITSIZE (mode) > 128
40618 && TARGET_AVX128_OPTIMAL)
40619 return cost * GET_MODE_BITSIZE (mode) / 128;
40620 return cost;
40623 /* Return cost of multiplication in MODE. */
40625 static int
40626 ix86_multiplication_cost (const struct processor_costs *cost,
40627 enum machine_mode mode)
40629 machine_mode inner_mode = mode;
40630 if (VECTOR_MODE_P (mode))
40631 inner_mode = GET_MODE_INNER (mode);
40633 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40634 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
40635 else if (X87_FLOAT_MODE_P (mode))
40636 return cost->fmul;
40637 else if (FLOAT_MODE_P (mode))
40638 return ix86_vec_cost (mode,
40639 inner_mode == DFmode
40640 ? cost->mulsd : cost->mulss, true);
40641 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40643 /* vpmullq is used in this case. No emulation is needed. */
40644 if (TARGET_AVX512DQ)
40645 return ix86_vec_cost (mode, cost->mulss, true);
40647 /* V*QImode is emulated with 7-13 insns. */
40648 if (mode == V16QImode || mode == V32QImode)
40650 int extra = 11;
40651 if (TARGET_XOP && mode == V16QImode)
40652 extra = 5;
40653 else if (TARGET_SSSE3)
40654 extra = 6;
40655 return ix86_vec_cost (mode,
40656 cost->mulss * 2 + cost->sse_op * extra,
40657 true);
40659 /* V*DImode is emulated with 5-8 insns. */
40660 else if (mode == V2DImode || mode == V4DImode)
40662 if (TARGET_XOP && mode == V2DImode)
40663 return ix86_vec_cost (mode,
40664 cost->mulss * 2 + cost->sse_op * 3,
40665 true);
40666 else
40667 return ix86_vec_cost (mode,
40668 cost->mulss * 3 + cost->sse_op * 5,
40669 true);
40671 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
40672 insns, including two PMULUDQ. */
40673 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
40674 return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
40675 true);
40676 else
40677 return ix86_vec_cost (mode, cost->mulss, true);
40679 else
40680 return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
40683 /* Return cost of division in MODE. */
40685 static int
40686 ix86_division_cost (const struct processor_costs *cost,
40687 enum machine_mode mode)
40689 machine_mode inner_mode = mode;
40690 if (VECTOR_MODE_P (mode))
40691 inner_mode = GET_MODE_INNER (mode);
40693 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40694 return inner_mode == DFmode ? cost->divsd : cost->divss;
40695 else if (X87_FLOAT_MODE_P (mode))
40696 return cost->fdiv;
40697 else if (FLOAT_MODE_P (mode))
40698 return ix86_vec_cost (mode,
40699 inner_mode == DFmode ? cost->divsd : cost->divss,
40700 true);
40701 else
40702 return cost->divide[MODE_INDEX (mode)];
40705 /* Return cost of a shift or rotate in MODE.
40706 If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
40707 AND_IN_OP1 specifies whether op1 is the result of an AND, and
40708 SHIFT_AND_TRUNCATE whether op1 is a SUBREG of such an AND.
40710 SKIP_OP0/1 is set to true if the cost of OP0/1 should be ignored. */
40712 static int
40713 ix86_shift_rotate_cost (const struct processor_costs *cost,
40714 enum machine_mode mode, bool constant_op1,
40715 HOST_WIDE_INT op1_val,
40716 bool speed,
40717 bool and_in_op1,
40718 bool shift_and_truncate,
40719 bool *skip_op0, bool *skip_op1)
40721 if (skip_op0)
40722 *skip_op0 = *skip_op1 = false;
40723 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40725 /* V*QImode is emulated with 1-11 insns. */
40726 if (mode == V16QImode || mode == V32QImode)
40728 int count = 11;
40729 if (TARGET_XOP && mode == V16QImode)
40731 /* For XOP we use vpshab, which requires a broadcast of the
40732 value to the variable shift insn. For constants this
40733 means a V16QI constant in memory; even when we can perform the
40734 shift with one insn, set the cost to prefer paddb. */
40735 if (constant_op1)
40737 if (skip_op1)
40738 *skip_op1 = true;
40739 return ix86_vec_cost (mode,
40740 cost->sse_op
40741 + (speed
40743 : COSTS_N_BYTES
40744 (GET_MODE_UNIT_SIZE (mode))), true);
40746 count = 3;
40748 else if (TARGET_SSSE3)
40749 count = 7;
40750 return ix86_vec_cost (mode, cost->sse_op * count, true);
40752 else
40753 return ix86_vec_cost (mode, cost->sse_op, true);
40755 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40757 if (constant_op1)
40759 if (op1_val > 32)
40760 return cost->shift_const + COSTS_N_INSNS (2);
40761 else
40762 return cost->shift_const * 2;
40764 else
40766 if (and_in_op1)
40767 return cost->shift_var * 2;
40768 else
40769 return cost->shift_var * 6 + COSTS_N_INSNS (2);
40772 else
40774 if (constant_op1)
40775 return cost->shift_const;
40776 else if (shift_and_truncate)
40778 if (skip_op0)
40779 *skip_op0 = *skip_op1 = true;
40780 /* Return the cost after shift-and-truncate. */
40781 return cost->shift_var;
40783 else
40784 return cost->shift_var;
40786 return cost->shift_const;
40789 /* Compute a (partial) cost for rtx X. Return true if the complete
40790 cost has been computed, and false if subexpressions should be
40791 scanned. In either case, *TOTAL contains the cost result. */
40793 static bool
40794 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
40795 int *total, bool speed)
40797 rtx mask;
40798 enum rtx_code code = GET_CODE (x);
40799 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
40800 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
40801 int src_cost;
40803 switch (code)
40805 case SET:
40806 if (register_operand (SET_DEST (x), VOIDmode)
40807 && reg_or_0_operand (SET_SRC (x), VOIDmode))
40809 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
40810 return true;
40813 if (register_operand (SET_SRC (x), VOIDmode))
40814 /* Avoid potentially incorrect high cost from rtx_costs
40815 for non-tieable SUBREGs. */
40816 src_cost = 0;
40817 else
40819 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
40821 if (CONSTANT_P (SET_SRC (x)))
40822 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
40823 a small value, possibly zero for cheap constants. */
40824 src_cost += COSTS_N_INSNS (1);
40827 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
40828 return true;
40830 case CONST_INT:
40831 case CONST:
40832 case LABEL_REF:
40833 case SYMBOL_REF:
40834 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
40835 *total = 3;
40836 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
40837 *total = 2;
40838 else if (flag_pic && SYMBOLIC_CONST (x)
40839 && !(TARGET_64BIT
40840 && (GET_CODE (x) == LABEL_REF
40841 || (GET_CODE (x) == SYMBOL_REF
40842 && SYMBOL_REF_LOCAL_P (x))))
40843 /* Use 0 cost for CONST to improve its propagation. */
40844 && (TARGET_64BIT || GET_CODE (x) != CONST))
40845 *total = 1;
40846 else
40847 *total = 0;
40848 return true;
40850 case CONST_DOUBLE:
40851 if (IS_STACK_MODE (mode))
40852 switch (standard_80387_constant_p (x))
40854 case -1:
40855 case 0:
40856 break;
40857 case 1: /* 0.0 */
40858 *total = 1;
40859 return true;
40860 default: /* Other constants */
40861 *total = 2;
40862 return true;
40864 /* FALLTHRU */
40866 case CONST_VECTOR:
40867 switch (standard_sse_constant_p (x, mode))
40869 case 0:
40870 break;
40871 case 1: /* 0: xor eliminates false dependency */
40872 *total = 0;
40873 return true;
40874 default: /* -1: cmp contains false dependency */
40875 *total = 1;
40876 return true;
40878 /* FALLTHRU */
40880 case CONST_WIDE_INT:
40881 /* Fall back to (MEM (SYMBOL_REF)), since that's where
40882 it'll probably end up. Add a penalty for size. */
40883 *total = (COSTS_N_INSNS (1)
40884 + (!TARGET_64BIT && flag_pic)
40885 + (GET_MODE_SIZE (mode) <= 4
40886 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
40887 return true;
40889 case ZERO_EXTEND:
40890 /* The zero extension is often completely free on x86_64, so make
40891 it as cheap as possible. */
40892 if (TARGET_64BIT && mode == DImode
40893 && GET_MODE (XEXP (x, 0)) == SImode)
40894 *total = 1;
40895 else if (TARGET_ZERO_EXTEND_WITH_AND)
40896 *total = cost->add;
40897 else
40898 *total = cost->movzx;
40899 return false;
40901 case SIGN_EXTEND:
40902 *total = cost->movsx;
40903 return false;
40905 case ASHIFT:
40906 if (SCALAR_INT_MODE_P (mode)
40907 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
40908 && CONST_INT_P (XEXP (x, 1)))
40910 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40911 if (value == 1)
40913 *total = cost->add;
40914 return false;
40916 if ((value == 2 || value == 3)
40917 && cost->lea <= cost->shift_const)
40919 *total = cost->lea;
40920 return false;
40923 /* FALLTHRU */
40925 case ROTATE:
40926 case ASHIFTRT:
40927 case LSHIFTRT:
40928 case ROTATERT:
40929 bool skip_op0, skip_op1;
40930 *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
40931 CONST_INT_P (XEXP (x, 1))
40932 ? INTVAL (XEXP (x, 1)) : -1,
40933 speed,
40934 GET_CODE (XEXP (x, 1)) == AND,
40935 SUBREG_P (XEXP (x, 1))
40936 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
40937 &skip_op0, &skip_op1);
40938 if (skip_op0 || skip_op1)
40940 if (!skip_op0)
40941 *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
40942 if (!skip_op1)
40943 *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
40944 return true;
40946 return false;
40948 case FMA:
40950 rtx sub;
40952 gcc_assert (FLOAT_MODE_P (mode));
40953 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
40955 *total = ix86_vec_cost (mode,
40956 mode == SFmode ? cost->fmass : cost->fmasd,
40957 true);
40958 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
40960 /* A negation in op0 or op2 is free: FMS, FNMA, FNMS. */
40961 sub = XEXP (x, 0);
40962 if (GET_CODE (sub) == NEG)
40963 sub = XEXP (sub, 0);
40964 *total += rtx_cost (sub, mode, FMA, 0, speed);
40966 sub = XEXP (x, 2);
40967 if (GET_CODE (sub) == NEG)
40968 sub = XEXP (sub, 0);
40969 *total += rtx_cost (sub, mode, FMA, 2, speed);
40970 return true;
40973 case MULT:
40974 if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
40976 rtx op0 = XEXP (x, 0);
40977 rtx op1 = XEXP (x, 1);
40978 int nbits;
40979 if (CONST_INT_P (XEXP (x, 1)))
40981 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40982 for (nbits = 0; value != 0; value &= value - 1)
40983 nbits++;
40985 else
40986 /* This is arbitrary. */
40987 nbits = 7;
40989 /* Compute costs correctly for widening multiplication. */
40990 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
40991 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
40992 == GET_MODE_SIZE (mode))
40994 int is_mulwiden = 0;
40995 machine_mode inner_mode = GET_MODE (op0);
40997 if (GET_CODE (op0) == GET_CODE (op1))
40998 is_mulwiden = 1, op1 = XEXP (op1, 0);
40999 else if (CONST_INT_P (op1))
41001 if (GET_CODE (op0) == SIGN_EXTEND)
41002 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
41003 == INTVAL (op1);
41004 else
41005 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
41008 if (is_mulwiden)
41009 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
41012 *total = (cost->mult_init[MODE_INDEX (mode)]
41013 + nbits * cost->mult_bit
41014 + rtx_cost (op0, mode, outer_code, opno, speed)
41015 + rtx_cost (op1, mode, outer_code, opno, speed));
41017 return true;
41019 *total = ix86_multiplication_cost (cost, mode);
41020 return false;
41022 case DIV:
41023 case UDIV:
41024 case MOD:
41025 case UMOD:
41026 *total = ix86_division_cost (cost, mode);
41027 return false;
41029 case PLUS:
41030 if (GET_MODE_CLASS (mode) == MODE_INT
41031 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
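/* Expressions that match the lea addressing forms, e.g.
   (plus (plus (mult reg 4) reg) const) or (plus (mult reg 4) reg),
   are costed as a single lea plus their operands.  */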
41033 if (GET_CODE (XEXP (x, 0)) == PLUS
41034 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
41035 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
41036 && CONSTANT_P (XEXP (x, 1)))
41038 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
41039 if (val == 2 || val == 4 || val == 8)
41041 *total = cost->lea;
41042 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41043 outer_code, opno, speed);
41044 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
41045 outer_code, opno, speed);
41046 *total += rtx_cost (XEXP (x, 1), mode,
41047 outer_code, opno, speed);
41048 return true;
41051 else if (GET_CODE (XEXP (x, 0)) == MULT
41052 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
41054 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
41055 if (val == 2 || val == 4 || val == 8)
41057 *total = cost->lea;
41058 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41059 outer_code, opno, speed);
41060 *total += rtx_cost (XEXP (x, 1), mode,
41061 outer_code, opno, speed);
41062 return true;
41065 else if (GET_CODE (XEXP (x, 0)) == PLUS)
41067 /* Add with carry, ignore the cost of adding a carry flag. */
41068 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
41069 *total = cost->add;
41070 else
41072 *total = cost->lea;
41073 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41074 outer_code, opno, speed);
41077 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41078 outer_code, opno, speed);
41079 *total += rtx_cost (XEXP (x, 1), mode,
41080 outer_code, opno, speed);
41081 return true;
41084 /* FALLTHRU */
41086 case MINUS:
41087 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
41088 if (GET_MODE_CLASS (mode) == MODE_INT
41089 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
41090 && GET_CODE (XEXP (x, 0)) == MINUS
41091 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
41093 *total = cost->add;
41094 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41095 outer_code, opno, speed);
41096 *total += rtx_cost (XEXP (x, 1), mode,
41097 outer_code, opno, speed);
41098 return true;
41101 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41103 *total = cost->addss;
41104 return false;
41106 else if (X87_FLOAT_MODE_P (mode))
41108 *total = cost->fadd;
41109 return false;
41111 else if (FLOAT_MODE_P (mode))
41113 *total = ix86_vec_cost (mode, cost->addss, true);
41114 return false;
41116 /* FALLTHRU */
41118 case AND:
41119 case IOR:
41120 case XOR:
41121 if (GET_MODE_CLASS (mode) == MODE_INT
41122 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41124 *total = (cost->add * 2
41125 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
41126 << (GET_MODE (XEXP (x, 0)) != DImode))
41127 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
41128 << (GET_MODE (XEXP (x, 1)) != DImode)));
41129 return true;
41131 /* FALLTHRU */
41133 case NEG:
41134 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41136 *total = cost->sse_op;
41137 return false;
41139 else if (X87_FLOAT_MODE_P (mode))
41141 *total = cost->fchs;
41142 return false;
41144 else if (FLOAT_MODE_P (mode))
41146 *total = ix86_vec_cost (mode, cost->sse_op, true);
41147 return false;
41149 /* FALLTHRU */
41151 case NOT:
41152 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41153 *total = ix86_vec_cost (mode, cost->sse_op, true);
41154 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41155 *total = cost->add * 2;
41156 else
41157 *total = cost->add;
41158 return false;
41160 case COMPARE:
41161 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41162 && XEXP (XEXP (x, 0), 1) == const1_rtx
41163 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41164 && XEXP (x, 1) == const0_rtx)
41166 /* This kind of construct is implemented using test[bwl].
41167 Treat it as if we had an AND. */
41168 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41169 *total = (cost->add
41170 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41171 opno, speed)
41172 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41173 return true;
41176 /* The embedded comparison operand is completely free. */
41177 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
41178 && XEXP (x, 1) == const0_rtx)
41179 *total = 0;
41181 return false;
41183 case FLOAT_EXTEND:
41184 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41185 *total = 0;
41186 else
41187 *total = ix86_vec_cost (mode, cost->addss, true);
41188 return false;
41190 case FLOAT_TRUNCATE:
41191 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41192 *total = cost->fadd;
41193 else
41194 *total = ix86_vec_cost (mode, cost->addss, true);
41195 return false;
41197 case ABS:
41198 /* SSE requires a memory load for the constant operand. It may make
41199 sense to account for this. Of course, the constant operand may or
41200 may not be reused. */
41201 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41202 *total = cost->sse_op;
41203 else if (X87_FLOAT_MODE_P (mode))
41204 *total = cost->fabs;
41205 else if (FLOAT_MODE_P (mode))
41206 *total = ix86_vec_cost (mode, cost->sse_op, true);
41207 return false;
41209 case SQRT:
41210 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41211 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
41212 else if (X87_FLOAT_MODE_P (mode))
41213 *total = cost->fsqrt;
41214 else if (FLOAT_MODE_P (mode))
41215 *total = ix86_vec_cost (mode,
41216 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
41217 true);
41218 return false;
41220 case UNSPEC:
41221 if (XINT (x, 1) == UNSPEC_TP)
41222 *total = 0;
41223 return false;
41225 case VEC_SELECT:
41226 case VEC_CONCAT:
41227 case VEC_DUPLICATE:
41228 /* ??? Assume all of these vector manipulation patterns are
41229 recognizable, in which case they all pretty much have the
41230 same cost. */
41231 *total = cost->sse_op;
41232 return true;
41233 case VEC_MERGE:
41234 mask = XEXP (x, 2);
41235 /* This is a masked instruction; assume the same cost
41236 as the non-masked variant. */
41237 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
41238 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
41239 else
41240 *total = cost->sse_op;
41241 return true;
41243 default:
41244 return false;
41248 #if TARGET_MACHO
41250 static int current_machopic_label_num;
41252 /* Given a symbol name and its associated stub, write out the
41253 definition of the stub. */
41255 void
41256 machopic_output_stub (FILE *file, const char *symb, const char *stub)
41258 unsigned int length;
41259 char *binder_name, *symbol_name, lazy_ptr_name[32];
41260 int label = ++current_machopic_label_num;
41262 /* For 64-bit we shouldn't get here. */
41263 gcc_assert (!TARGET_64BIT);
41265 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
41266 symb = targetm.strip_name_encoding (symb);
41268 length = strlen (stub);
41269 binder_name = XALLOCAVEC (char, length + 32);
41270 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
41272 length = strlen (symb);
41273 symbol_name = XALLOCAVEC (char, length + 32);
41274 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
41276 sprintf (lazy_ptr_name, "L%d$lz", label);
41278 if (MACHOPIC_ATT_STUB)
41279 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
41280 else if (MACHOPIC_PURE)
41281 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
41282 else
41283 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
41285 fprintf (file, "%s:\n", stub);
41286 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41288 if (MACHOPIC_ATT_STUB)
41290 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
41292 else if (MACHOPIC_PURE)
41294 /* PIC stub. */
41295 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41296 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
41297 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
41298 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
41299 label, lazy_ptr_name, label);
41300 fprintf (file, "\tjmp\t*%%ecx\n");
41302 else
41303 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
41305 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
41306 it needs no stub-binding-helper. */
41307 if (MACHOPIC_ATT_STUB)
41308 return;
41310 fprintf (file, "%s:\n", binder_name);
41312 if (MACHOPIC_PURE)
41314 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
41315 fprintf (file, "\tpushl\t%%ecx\n");
41317 else
41318 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
41320 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
41322 /* N.B. Keep the correspondence of these
41323 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
41324 old-pic/new-pic/non-pic stubs; altering this will break
41325 compatibility with existing dylibs. */
41326 if (MACHOPIC_PURE)
41328 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41329 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
41331 else
41332 /* 16-byte -mdynamic-no-pic stub. */
41333 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
41335 fprintf (file, "%s:\n", lazy_ptr_name);
41336 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41337 fprintf (file, ASM_LONG "%s\n", binder_name);
41339 #endif /* TARGET_MACHO */
41341 /* Order the registers for register allocator. */
41343 void
41344 x86_order_regs_for_local_alloc (void)
41346 int pos = 0;
41347 int i;
41349 /* First allocate the local general purpose registers. */
41350 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41351 if (GENERAL_REGNO_P (i) && call_used_regs[i])
41352 reg_alloc_order [pos++] = i;
41354 /* Global general purpose registers. */
41355 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41356 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
41357 reg_alloc_order [pos++] = i;
41359 /* x87 registers come first in case we are doing FP math
41360 using them. */
41361 if (!TARGET_SSE_MATH)
41362 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41363 reg_alloc_order [pos++] = i;
41365 /* SSE registers. */
41366 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
41367 reg_alloc_order [pos++] = i;
41368 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
41369 reg_alloc_order [pos++] = i;
41371 /* Extended REX SSE registers. */
41372 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
41373 reg_alloc_order [pos++] = i;
41376 /* Mask registers. */
41376 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
41377 reg_alloc_order [pos++] = i;
41379 /* MPX bound registers. */
41380 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
41381 reg_alloc_order [pos++] = i;
41383 /* x87 registers. */
41384 if (TARGET_SSE_MATH)
41385 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41386 reg_alloc_order [pos++] = i;
41388 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
41389 reg_alloc_order [pos++] = i;
41391 /* Initialize the rest of the array, as some registers are never
41392 allocated at all. */
41393 while (pos < FIRST_PSEUDO_REGISTER)
41394 reg_alloc_order [pos++] = 0;
41397 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
41398 in struct attribute_spec.handler. */
41399 static tree
41400 ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int,
41401 bool *no_add_attrs)
41403 if (TREE_CODE (*node) != FUNCTION_TYPE
41404 && TREE_CODE (*node) != METHOD_TYPE
41405 && TREE_CODE (*node) != FIELD_DECL
41406 && TREE_CODE (*node) != TYPE_DECL)
41408 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41409 name);
41410 *no_add_attrs = true;
41411 return NULL_TREE;
41413 if (TARGET_64BIT)
41415 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
41416 name);
41417 *no_add_attrs = true;
41418 return NULL_TREE;
41420 if (is_attribute_p ("callee_pop_aggregate_return", name))
41422 tree cst;
41424 cst = TREE_VALUE (args);
41425 if (TREE_CODE (cst) != INTEGER_CST)
41427 warning (OPT_Wattributes,
41428 "%qE attribute requires an integer constant argument",
41429 name);
41430 *no_add_attrs = true;
41432 else if (compare_tree_int (cst, 0) != 0
41433 && compare_tree_int (cst, 1) != 0)
41435 warning (OPT_Wattributes,
41436 "argument to %qE attribute is neither zero, nor one",
41437 name);
41438 *no_add_attrs = true;
41441 return NULL_TREE;
41444 return NULL_TREE;
41447 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
41448 struct attribute_spec.handler. */
41449 static tree
41450 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
41451 bool *no_add_attrs)
41453 if (TREE_CODE (*node) != FUNCTION_TYPE
41454 && TREE_CODE (*node) != METHOD_TYPE
41455 && TREE_CODE (*node) != FIELD_DECL
41456 && TREE_CODE (*node) != TYPE_DECL)
41458 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41459 name);
41460 *no_add_attrs = true;
41461 return NULL_TREE;
41464 /* Can combine regparm with all attributes but fastcall. */
41465 if (is_attribute_p ("ms_abi", name))
41467 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
41469 error ("ms_abi and sysv_abi attributes are not compatible");
41472 return NULL_TREE;
41474 else if (is_attribute_p ("sysv_abi", name))
41476 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
41478 error ("ms_abi and sysv_abi attributes are not compatible");
41481 return NULL_TREE;
41484 return NULL_TREE;
41487 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
41488 struct attribute_spec.handler. */
41489 static tree
41490 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
41491 bool *no_add_attrs)
41493 tree *type = NULL;
41494 if (DECL_P (*node))
41496 if (TREE_CODE (*node) == TYPE_DECL)
41497 type = &TREE_TYPE (*node);
41499 else
41500 type = node;
41502 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
41504 warning (OPT_Wattributes, "%qE attribute ignored",
41505 name);
41506 *no_add_attrs = true;
41509 else if ((is_attribute_p ("ms_struct", name)
41510 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
41511 || ((is_attribute_p ("gcc_struct", name)
41512 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
41514 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
41515 name);
41516 *no_add_attrs = true;
41519 return NULL_TREE;
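/* Handle an attribute that applies only to function declarations, such as
   "indirect_branch" or "function_return"; arguments as in
   struct attribute_spec.handler.  */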
41522 static tree
41523 ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int,
41524 bool *no_add_attrs)
41526 if (TREE_CODE (*node) != FUNCTION_DECL)
41528 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41529 name);
41530 *no_add_attrs = true;
41533 if (is_attribute_p ("indirect_branch", name))
41535 tree cst = TREE_VALUE (args);
41536 if (TREE_CODE (cst) != STRING_CST)
41538 warning (OPT_Wattributes,
41539 "%qE attribute requires a string constant argument",
41540 name);
41541 *no_add_attrs = true;
41543 else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41544 && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41545 && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41546 && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41548 warning (OPT_Wattributes,
41549 "argument to %qE attribute is not "
41550 "(keep|thunk|thunk-inline|thunk-extern)", name);
41551 *no_add_attrs = true;
41555 if (is_attribute_p ("function_return", name))
41557 tree cst = TREE_VALUE (args);
41558 if (TREE_CODE (cst) != STRING_CST)
41560 warning (OPT_Wattributes,
41561 "%qE attribute requires a string constant argument",
41562 name);
41563 *no_add_attrs = true;
41565 else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41566 && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41567 && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41568 && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41570 warning (OPT_Wattributes,
41571 "argument to %qE attribute is not "
41572 "(keep|thunk|thunk-inline|thunk-extern)", name);
41573 *no_add_attrs = true;
41577 return NULL_TREE;
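/* Handle a "no_caller_saved_registers" attribute; arguments as in
   struct attribute_spec.handler.  The attribute needs no checking, so it is
   simply accepted.  */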
41580 static tree
41581 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
41582 int, bool *)
41584 return NULL_TREE;
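/* Handle an "interrupt" attribute; arguments as in
   struct attribute_spec.handler.  Check that the interrupt service routine
   has the expected prototype.  */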
41587 static tree
41588 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
41590 /* DECL_RESULT and DECL_ARGUMENTS do not exist here yet,
41591 but the function type contains the argument and return type data. */
41592 tree func_type = *node;
41593 tree return_type = TREE_TYPE (func_type);
41595 int nargs = 0;
41596 tree current_arg_type = TYPE_ARG_TYPES (func_type);
41597 while (current_arg_type
41598 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
41600 if (nargs == 0)
41602 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
41603 error ("interrupt service routine should have a pointer "
41604 "as the first argument");
41606 else if (nargs == 1)
41608 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
41609 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
41610 error ("interrupt service routine should have unsigned %s"
41611 "int as the second argument",
41612 TARGET_64BIT
41613 ? (TARGET_X32 ? "long long " : "long ")
41614 : "");
41616 nargs++;
41617 current_arg_type = TREE_CHAIN (current_arg_type);
41619 if (!nargs || nargs > 2)
41620 error ("interrupt service routine can only have a pointer argument "
41621 "and an optional integer argument");
41622 if (! VOID_TYPE_P (return_type))
41623 error ("interrupt service routine can't have non-void return value");
41625 return NULL_TREE;
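/* Implement TARGET_MS_BITFIELD_LAYOUT_P.  Return true if RECORD_TYPE should
   be laid out with the MS bitfield layout rules.  */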
41628 static bool
41629 ix86_ms_bitfield_layout_p (const_tree record_type)
41631 return ((TARGET_MS_BITFIELD_LAYOUT
41632 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
41633 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
41636 /* Returns an expression indicating where the this parameter is
41637 located on entry to the FUNCTION. */
41639 static rtx
41640 x86_this_parameter (tree function)
41642 tree type = TREE_TYPE (function);
41643 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
41644 int nregs;
41646 if (TARGET_64BIT)
41648 const int *parm_regs;
41650 if (ix86_function_type_abi (type) == MS_ABI)
41651 parm_regs = x86_64_ms_abi_int_parameter_registers;
41652 else
41653 parm_regs = x86_64_int_parameter_registers;
41654 return gen_rtx_REG (Pmode, parm_regs[aggr]);
41657 nregs = ix86_function_regparm (type, function);
41659 if (nregs > 0 && !stdarg_p (type))
41661 int regno;
41662 unsigned int ccvt = ix86_get_callcvt (type);
41664 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41665 regno = aggr ? DX_REG : CX_REG;
41666 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41668 regno = CX_REG;
41669 if (aggr)
41670 return gen_rtx_MEM (SImode,
41671 plus_constant (Pmode, stack_pointer_rtx, 4));
41673 else
41675 regno = AX_REG;
41676 if (aggr)
41678 regno = DX_REG;
41679 if (nregs == 1)
41680 return gen_rtx_MEM (SImode,
41681 plus_constant (Pmode,
41682 stack_pointer_rtx, 4));
41685 return gen_rtx_REG (SImode, regno);
41688 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
41689 aggr ? 8 : 4));
41692 /* Determine whether x86_output_mi_thunk can succeed. */
41694 static bool
41695 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
41696 const_tree function)
41698 /* 64-bit can handle anything. */
41699 if (TARGET_64BIT)
41700 return true;
41702 /* For 32-bit, everything's fine if we have one free register. */
41703 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
41704 return true;
41706 /* Need a free register for vcall_offset. */
41707 if (vcall_offset)
41708 return false;
41710 /* Need a free register for GOT references. */
41711 if (flag_pic && !targetm.binds_local_p (function))
41712 return false;
41714 /* Otherwise ok. */
41715 return true;
41718 /* Output the assembler code for a thunk function. THUNK_DECL is the
41719 declaration for the thunk function itself, FUNCTION is the decl for
41720 the target function. DELTA is an immediate constant offset to be
41721 added to THIS. If VCALL_OFFSET is nonzero, the word at
41722 *(*this + vcall_offset) should be added to THIS. */
41724 static void
41725 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
41726 HOST_WIDE_INT vcall_offset, tree function)
41728 rtx this_param = x86_this_parameter (function);
41729 rtx this_reg, tmp, fnaddr;
41730 unsigned int tmp_regno;
41731 rtx_insn *insn;
41733 if (TARGET_64BIT)
41734 tmp_regno = R10_REG;
41735 else
41737 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
41738 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41739 tmp_regno = AX_REG;
41740 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41741 tmp_regno = DX_REG;
41742 else
41743 tmp_regno = CX_REG;
41746 emit_note (NOTE_INSN_PROLOGUE_END);
41748 /* If CET branch protection is enabled, insert an ENDBR instruction. */
41749 if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
41750 emit_insn (gen_nop_endbr ());
41752 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
41753 pull it in now and let DELTA benefit. */
41754 if (REG_P (this_param))
41755 this_reg = this_param;
41756 else if (vcall_offset)
41758 /* Put the this parameter into %eax. */
41759 this_reg = gen_rtx_REG (Pmode, AX_REG);
41760 emit_move_insn (this_reg, this_param);
41762 else
41763 this_reg = NULL_RTX;
41765 /* Adjust the this parameter by a fixed constant. */
41766 if (delta)
41768 rtx delta_rtx = GEN_INT (delta);
41769 rtx delta_dst = this_reg ? this_reg : this_param;
41771 if (TARGET_64BIT)
41773 if (!x86_64_general_operand (delta_rtx, Pmode))
41775 tmp = gen_rtx_REG (Pmode, tmp_regno);
41776 emit_move_insn (tmp, delta_rtx);
41777 delta_rtx = tmp;
41781 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
41784 /* Adjust the this parameter by a value stored in the vtable. */
41785 if (vcall_offset)
41787 rtx vcall_addr, vcall_mem, this_mem;
41789 tmp = gen_rtx_REG (Pmode, tmp_regno);
41791 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
41792 if (Pmode != ptr_mode)
41793 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
41794 emit_move_insn (tmp, this_mem);
41796 /* Adjust the this parameter. */
41797 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
41798 if (TARGET_64BIT
41799 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
41801 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
41802 emit_move_insn (tmp2, GEN_INT (vcall_offset));
41803 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
41806 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
41807 if (Pmode != ptr_mode)
41808 emit_insn (gen_addsi_1_zext (this_reg,
41809 gen_rtx_REG (ptr_mode,
41810 REGNO (this_reg)),
41811 vcall_mem));
41812 else
41813 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
41816 /* If necessary, drop THIS back to its stack slot. */
41817 if (this_reg && this_reg != this_param)
41818 emit_move_insn (this_param, this_reg);
41820 fnaddr = XEXP (DECL_RTL (function), 0);
41821 if (TARGET_64BIT)
41823 if (!flag_pic || targetm.binds_local_p (function)
41824 || TARGET_PECOFF)
41826 else
41828 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
41829 tmp = gen_rtx_CONST (Pmode, tmp);
41830 fnaddr = gen_const_mem (Pmode, tmp);
41833 else
41835 if (!flag_pic || targetm.binds_local_p (function))
41837 #if TARGET_MACHO
41838 else if (TARGET_MACHO)
41840 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
41841 fnaddr = XEXP (fnaddr, 0);
41843 #endif /* TARGET_MACHO */
41844 else
41846 tmp = gen_rtx_REG (Pmode, CX_REG);
41847 output_set_got (tmp, NULL_RTX);
41849 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
41850 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
41851 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
41852 fnaddr = gen_const_mem (Pmode, fnaddr);
41856 /* Our sibling call patterns do not allow memories, because we have no
41857 predicate that can distinguish between frame and non-frame memory.
41858 For our purposes here, we can get away with (ab)using a jump pattern,
41859 because we're going to do no optimization. */
41860 if (MEM_P (fnaddr))
41862 if (sibcall_insn_operand (fnaddr, word_mode))
41864 fnaddr = XEXP (DECL_RTL (function), 0);
41865 tmp = gen_rtx_MEM (QImode, fnaddr);
41866 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41867 tmp = emit_call_insn (tmp);
41868 SIBLING_CALL_P (tmp) = 1;
41870 else
41871 emit_jump_insn (gen_indirect_jump (fnaddr));
41873 else
41875 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
41877 // CM_LARGE_PIC always uses a pseudo PIC register, which is
41878 // uninitialized. Since FUNCTION is local and calling it
41879 // doesn't go through the PLT, we use the scratch register %r11
41880 // as the PIC register and initialize it here.
41881 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
41882 ix86_init_large_pic_reg (tmp_regno);
41883 fnaddr = legitimize_pic_address (fnaddr,
41884 gen_rtx_REG (Pmode, tmp_regno));
41887 if (!sibcall_insn_operand (fnaddr, word_mode))
41889 tmp = gen_rtx_REG (word_mode, tmp_regno);
41890 if (GET_MODE (fnaddr) != word_mode)
41891 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
41892 emit_move_insn (tmp, fnaddr);
41893 fnaddr = tmp;
41896 tmp = gen_rtx_MEM (QImode, fnaddr);
41897 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41898 tmp = emit_call_insn (tmp);
41899 SIBLING_CALL_P (tmp) = 1;
41901 emit_barrier ();
41903 /* Emit just enough of rest_of_compilation to get the insns emitted.
41904 Note that use_thunk calls assemble_start_function et al. */
41905 insn = get_insns ();
41906 shorten_branches (insn);
41907 final_start_function (insn, file, 1);
41908 final (insn, file, 1);
41909 final_end_function ();
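/* As a rough illustration, for a 32-bit thunk with a small DELTA, no
   VCALL_OFFSET and a locally binding target, the code emitted above
   amounts to something like

	addl	$delta, 4(%esp)
	jmp	function

   while a vcall thunk additionally loads the vtable slot at
   *(*this + vcall_offset) and adds it to `this' before the tail call. */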
41912 static void
41913 x86_file_start (void)
41915 default_file_start ();
41916 if (TARGET_16BIT)
41917 fputs ("\t.code16gcc\n", asm_out_file);
41918 #if TARGET_MACHO
41919 darwin_file_start ();
41920 #endif
41921 if (X86_FILE_START_VERSION_DIRECTIVE)
41922 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
41923 if (X86_FILE_START_FLTUSED)
41924 fputs ("\t.global\t__fltused\n", asm_out_file);
41925 if (ix86_asm_dialect == ASM_INTEL)
41926 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
41930 x86_field_alignment (tree type, int computed)
41932 machine_mode mode;
41934 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
41935 return computed;
41936 if (TARGET_IAMCU)
41937 return iamcu_alignment (type, computed);
41938 mode = TYPE_MODE (strip_array_types (type));
41939 if (mode == DFmode || mode == DCmode
41940 || GET_MODE_CLASS (mode) == MODE_INT
41941 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
41942 return MIN (32, computed);
41943 return computed;
41946 /* Print call to TARGET to FILE. */
41948 static void
41949 x86_print_call_or_nop (FILE *file, const char *target)
41951 if (flag_nop_mcount)
41952 /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
41953 fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
41954 else
41955 fprintf (file, "1:\tcall\t%s\n", target);
41958 /* Output assembler code to FILE to increment profiler label # LABELNO
41959 for profiling a function entry. */
41960 void
41961 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
41963 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
41964 : MCOUNT_NAME);
41965 if (TARGET_64BIT)
41967 #ifndef NO_PROFILE_COUNTERS
41968 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
41969 #endif
41971 if (!TARGET_PECOFF && flag_pic)
41972 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
41973 else
41974 x86_print_call_or_nop (file, mcount_name);
41976 else if (flag_pic)
41978 #ifndef NO_PROFILE_COUNTERS
41979 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
41980 LPREFIX, labelno);
41981 #endif
41982 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
41984 else
41986 #ifndef NO_PROFILE_COUNTERS
41987 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
41988 LPREFIX, labelno);
41989 #endif
41990 x86_print_call_or_nop (file, mcount_name);
41993 if (flag_record_mcount)
41995 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
41996 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
41997 fprintf (file, "\t.previous\n");
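/* For example, on a 64-bit GNU/Linux target without PIC this emits
   "1:	call	mcount" (or a call to __fentry__ before the prologue when
   -mfentry is in effect, since that is typically what
   MCOUNT_NAME_BEFORE_PROLOGUE expands to there), and -mrecord-mcount
   additionally records the "1:" label in the __mcount_loc section. */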
42001 /* We don't have exact information about the insn sizes, but we may assume
42002 quite safely that we are informed about all 1 byte insns and memory
42003 address sizes. This is enough to eliminate unnecessary padding in
42004 99% of cases. */
42007 ix86_min_insn_size (rtx_insn *insn)
42009 int l = 0, len;
42011 if (!INSN_P (insn) || !active_insn_p (insn))
42012 return 0;
42014 /* Discard alignments we've emitted and jump instructions. */
42015 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
42016 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
42017 return 0;
42019 /* Important case - calls are always 5 bytes.
42020 It is common to have many calls in a row. */
42021 if (CALL_P (insn)
42022 && symbolic_reference_mentioned_p (PATTERN (insn))
42023 && !SIBLING_CALL_P (insn))
42024 return 5;
42025 len = get_attr_length (insn);
42026 if (len <= 1)
42027 return 1;
42029 /* For normal instructions we rely on get_attr_length being exact,
42030 with a few exceptions. */
42031 if (!JUMP_P (insn))
42033 enum attr_type type = get_attr_type (insn);
42035 switch (type)
42037 case TYPE_MULTI:
42038 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
42039 || asm_noperands (PATTERN (insn)) >= 0)
42040 return 0;
42041 break;
42042 case TYPE_OTHER:
42043 case TYPE_FCMP:
42044 break;
42045 default:
42046 /* Otherwise trust get_attr_length. */
42047 return len;
42050 l = get_attr_length_address (insn);
42051 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
42052 l = 4;
42054 if (l)
42055 return 1+l;
42056 else
42057 return 2;
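/* The 5 byte figure used above corresponds to the direct near call
   encoding: opcode 0xE8 followed by a 32-bit displacement. */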
42060 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42062 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16 byte
42063 window. */
42065 static void
42066 ix86_avoid_jump_mispredicts (void)
42068 rtx_insn *insn, *start = get_insns ();
42069 int nbytes = 0, njumps = 0;
42070 bool isjump = false;
42072 /* Look for all minimal intervals of instructions containing 4 jumps.
42073 The intervals are bounded by START and INSN. NBYTES is the total
42074 size of instructions in the interval including INSN and not including
42075 START. When NBYTES is smaller than 16 bytes, it is possible
42076 that the ends of START and INSN land in the same 16 byte page.
42078 The smallest offset in the page at which INSN can start is when START
42079 ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
42080 We add a p2align to the 16 byte window with maxskip 15 - NBYTES + sizeof (INSN).
42082 Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
42083 have to, since control transfer to its label(s) can be performed through other
42084 means; also, we estimate the minimum length of all asm stmts as 0. */
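/* Roughly, the loop below maintains a sliding window [START, INSN]: at an
   aligned label it shrinks the window so that the window plus the worst
   case alignment skip stays under 16 bytes, it drops instructions from
   the front once a fourth jump enters, and it pads before INSN when the
   remaining three jumps plus the jump just dropped from the front could
   still share one 16 byte window. */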
42085 for (insn = start; insn; insn = NEXT_INSN (insn))
42087 int min_size;
42089 if (LABEL_P (insn))
42091 int align = label_to_alignment (insn);
42092 int max_skip = label_to_max_skip (insn);
42094 if (max_skip > 15)
42095 max_skip = 15;
42096 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
42097 already in the current 16 byte page, because otherwise
42098 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
42099 bytes to reach a 16 byte boundary. */
42100 if (align <= 0
42101 || (align <= 3 && max_skip != (1 << align) - 1))
42102 max_skip = 0;
42103 if (dump_file)
42104 fprintf (dump_file, "Label %i with max_skip %i\n",
42105 INSN_UID (insn), max_skip);
42106 if (max_skip)
42108 while (nbytes + max_skip >= 16)
42110 start = NEXT_INSN (start);
42111 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42112 || CALL_P (start))
42113 njumps--, isjump = true;
42114 else
42115 isjump = false;
42116 nbytes -= ix86_min_insn_size (start);
42119 continue;
42122 min_size = ix86_min_insn_size (insn);
42123 nbytes += min_size;
42124 if (dump_file)
42125 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
42126 INSN_UID (insn), min_size);
42127 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
42128 || CALL_P (insn))
42129 njumps++;
42130 else
42131 continue;
42133 while (njumps > 3)
42135 start = NEXT_INSN (start);
42136 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42137 || CALL_P (start))
42138 njumps--, isjump = true;
42139 else
42140 isjump = false;
42141 nbytes -= ix86_min_insn_size (start);
42143 gcc_assert (njumps >= 0);
42144 if (dump_file)
42145 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
42146 INSN_UID (start), INSN_UID (insn), nbytes);
42148 if (njumps == 3 && isjump && nbytes < 16)
42150 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
42152 if (dump_file)
42153 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
42154 INSN_UID (insn), padsize);
42155 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
42159 #endif
42161 /* AMD Athlon works faster
42162 when RET is not the destination of a conditional jump or directly preceded
42163 by another jump instruction. We avoid the penalty by inserting a NOP just
42164 before the RET instruction in such cases. */
42165 static void
42166 ix86_pad_returns (void)
42168 edge e;
42169 edge_iterator ei;
42171 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42173 basic_block bb = e->src;
42174 rtx_insn *ret = BB_END (bb);
42175 rtx_insn *prev;
42176 bool replace = false;
42178 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
42179 || optimize_bb_for_size_p (bb))
42180 continue;
42181 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
42182 if (active_insn_p (prev) || LABEL_P (prev))
42183 break;
42184 if (prev && LABEL_P (prev))
42186 edge e;
42187 edge_iterator ei;
42189 FOR_EACH_EDGE (e, ei, bb->preds)
42190 if (EDGE_FREQUENCY (e) && e->src->index >= 0
42191 && !(e->flags & EDGE_FALLTHRU))
42193 replace = true;
42194 break;
42197 if (!replace)
42199 prev = prev_active_insn (ret);
42200 if (prev
42201 && ((JUMP_P (prev) && any_condjump_p (prev))
42202 || CALL_P (prev)))
42203 replace = true;
42204 /* Empty functions get a branch mispredict even when
42205 the jump destination is not visible to us. */
42206 if (!prev && !optimize_function_for_size_p (cfun))
42207 replace = true;
42209 if (replace)
42211 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42212 delete_insn (ret);
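/* The replacement emitted above is the "long" return pattern, commonly
   assembled as the two byte "rep ret" sequence, which sidesteps the
   branch predictor penalty described in the comment before this
   function. */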
42217 /* Count the minimum number of instructions in BB. Return 4 if the
42218 number of instructions >= 4. */
42220 static int
42221 ix86_count_insn_bb (basic_block bb)
42223 rtx_insn *insn;
42224 int insn_count = 0;
42226 /* Count number of instructions in this block. Return 4 if the number
42227 of instructions >= 4. */
42228 FOR_BB_INSNS (bb, insn)
42230 /* This only happens in exit blocks. */
42231 if (JUMP_P (insn)
42232 && ANY_RETURN_P (PATTERN (insn)))
42233 break;
42235 if (NONDEBUG_INSN_P (insn)
42236 && GET_CODE (PATTERN (insn)) != USE
42237 && GET_CODE (PATTERN (insn)) != CLOBBER)
42239 insn_count++;
42240 if (insn_count >= 4)
42241 return insn_count;
42245 return insn_count;
42249 /* Count the minimum number of instructions in code path in BB.
42250 Return 4 if the number of instructions >= 4. */
42252 static int
42253 ix86_count_insn (basic_block bb)
42255 edge e;
42256 edge_iterator ei;
42257 int min_prev_count;
42259 /* Only bother counting instructions along paths with no
42260 more than 2 basic blocks between entry and exit. Given
42261 that BB has an edge to exit, determine if a predecessor
42262 of BB has an edge from entry. If so, compute the number
42263 of instructions in the predecessor block. If there
42264 happen to be multiple such blocks, compute the minimum. */
42265 min_prev_count = 4;
42266 FOR_EACH_EDGE (e, ei, bb->preds)
42268 edge prev_e;
42269 edge_iterator prev_ei;
42271 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42273 min_prev_count = 0;
42274 break;
42276 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
42278 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42280 int count = ix86_count_insn_bb (e->src);
42281 if (count < min_prev_count)
42282 min_prev_count = count;
42283 break;
42288 if (min_prev_count < 4)
42289 min_prev_count += ix86_count_insn_bb (bb);
42291 return min_prev_count;
42294 /* Pad short function to 4 instructions. */
42296 static void
42297 ix86_pad_short_function (void)
42299 edge e;
42300 edge_iterator ei;
42302 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42304 rtx_insn *ret = BB_END (e->src);
42305 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
42307 int insn_count = ix86_count_insn (e->src);
42309 /* Pad short function. */
42310 if (insn_count < 4)
42312 rtx_insn *insn = ret;
42314 /* Find epilogue. */
42315 while (insn
42316 && (!NOTE_P (insn)
42317 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
42318 insn = PREV_INSN (insn);
42320 if (!insn)
42321 insn = ret;
42323 /* Two NOPs count as one instruction. */
42324 insn_count = 2 * (4 - insn_count);
42325 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
42331 /* Fix up a Windows system unwinder issue. If an EH region falls through into
42332 the epilogue, the Windows system unwinder will apply epilogue logic and
42333 produce incorrect offsets. This can be avoided by adding a nop between
42334 the last insn that can throw and the first insn of the epilogue. */
42336 static void
42337 ix86_seh_fixup_eh_fallthru (void)
42339 edge e;
42340 edge_iterator ei;
42342 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42344 rtx_insn *insn, *next;
42346 /* Find the beginning of the epilogue. */
42347 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42348 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42349 break;
42350 if (insn == NULL)
42351 continue;
42353 /* We only care about preceding insns that can throw. */
42354 insn = prev_active_insn (insn);
42355 if (insn == NULL || !can_throw_internal (insn))
42356 continue;
42358 /* Do not separate calls from their debug information. */
42359 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42360 if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION)
42361 insn = next;
42362 else
42363 break;
42365 emit_insn_after (gen_nops (const1_rtx), insn);
42369 /* Given a register number BASE, the lowest of a group of registers, update
42370 regsets IN and OUT with the registers that should be avoided in input
42371 and output operands respectively when trying to avoid generating a modr/m
42372 byte for -mmitigate-rop. */
42374 static void
42375 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
42377 SET_HARD_REG_BIT (out, base);
42378 SET_HARD_REG_BIT (out, base + 1);
42379 SET_HARD_REG_BIT (in, base + 2);
42380 SET_HARD_REG_BIT (in, base + 3);
42383 /* Called if -mmitigate-rop is in effect. Try to rewrite instructions so
42384 that certain encodings of modr/m bytes do not occur. */
42385 static void
42386 ix86_mitigate_rop (void)
42388 HARD_REG_SET input_risky;
42389 HARD_REG_SET output_risky;
42390 HARD_REG_SET inout_risky;
42392 CLEAR_HARD_REG_SET (output_risky);
42393 CLEAR_HARD_REG_SET (input_risky);
42394 SET_HARD_REG_BIT (output_risky, AX_REG);
42395 SET_HARD_REG_BIT (output_risky, CX_REG);
42396 SET_HARD_REG_BIT (input_risky, BX_REG);
42397 SET_HARD_REG_BIT (input_risky, DX_REG);
42398 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
42399 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
42400 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
42401 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
42402 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
42403 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
42404 COPY_HARD_REG_SET (inout_risky, input_risky);
42405 IOR_HARD_REG_SET (inout_risky, output_risky);
42407 df_note_add_problem ();
42408 /* Fix up what stack-regs did. */
42409 df_insn_rescan_all ();
42410 df_analyze ();
42412 regrename_init (true);
42413 regrename_analyze (NULL);
42415 auto_vec<du_head_p> cands;
42417 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
42419 if (!NONDEBUG_INSN_P (insn))
42420 continue;
42422 if (GET_CODE (PATTERN (insn)) == USE
42423 || GET_CODE (PATTERN (insn)) == CLOBBER)
42424 continue;
42426 extract_insn (insn);
42428 int opno0, opno1;
42429 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42430 recog_data.n_operands, &opno0,
42431 &opno1);
42433 if (!ix86_rop_should_change_byte_p (modrm))
42434 continue;
42436 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
42438 /* This happens when regrename has to fail a block. */
42439 if (!info->op_info)
42440 continue;
42442 if (info->op_info[opno0].n_chains != 0)
42444 gcc_assert (info->op_info[opno0].n_chains == 1);
42445 du_head_p op0c;
42446 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
42447 if (op0c->target_data_1 + op0c->target_data_2 == 0
42448 && !op0c->cannot_rename)
42449 cands.safe_push (op0c);
42451 op0c->target_data_1++;
42453 if (info->op_info[opno1].n_chains != 0)
42455 gcc_assert (info->op_info[opno1].n_chains == 1);
42456 du_head_p op1c;
42457 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
42458 if (op1c->target_data_1 + op1c->target_data_2 == 0
42459 && !op1c->cannot_rename)
42460 cands.safe_push (op1c);
42462 op1c->target_data_2++;
42466 int i;
42467 du_head_p head;
42468 FOR_EACH_VEC_ELT (cands, i, head)
42470 int old_reg, best_reg;
42471 HARD_REG_SET unavailable;
42473 CLEAR_HARD_REG_SET (unavailable);
42474 if (head->target_data_1)
42475 IOR_HARD_REG_SET (unavailable, output_risky);
42476 if (head->target_data_2)
42477 IOR_HARD_REG_SET (unavailable, input_risky);
42479 int n_uses;
42480 reg_class superclass = regrename_find_superclass (head, &n_uses,
42481 &unavailable);
42482 old_reg = head->regno;
42483 best_reg = find_rename_reg (head, superclass, &unavailable,
42484 old_reg, false);
42485 bool ok = regrename_do_replace (head, best_reg);
42486 gcc_assert (ok);
42487 if (dump_file)
42488 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
42489 reg_names[best_reg], reg_class_names[superclass]);
42493 regrename_finish ();
42495 df_analyze ();
42497 basic_block bb;
42498 regset_head live;
42500 INIT_REG_SET (&live);
42502 FOR_EACH_BB_FN (bb, cfun)
42504 rtx_insn *insn;
42506 COPY_REG_SET (&live, DF_LR_OUT (bb));
42507 df_simulate_initialize_backwards (bb, &live);
42509 FOR_BB_INSNS_REVERSE (bb, insn)
42511 if (!NONDEBUG_INSN_P (insn))
42512 continue;
42514 df_simulate_one_insn_backwards (bb, insn, &live);
42516 if (GET_CODE (PATTERN (insn)) == USE
42517 || GET_CODE (PATTERN (insn)) == CLOBBER)
42518 continue;
42520 extract_insn (insn);
42521 constrain_operands_cached (insn, reload_completed);
42522 int opno0, opno1;
42523 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42524 recog_data.n_operands, &opno0,
42525 &opno1);
42526 if (modrm < 0
42527 || !ix86_rop_should_change_byte_p (modrm)
42528 || opno0 == opno1)
42529 continue;
42531 rtx oldreg = recog_data.operand[opno1];
42532 preprocess_constraints (insn);
42533 const operand_alternative *alt = which_op_alt ();
42535 int i;
42536 for (i = 0; i < recog_data.n_operands; i++)
42537 if (i != opno1
42538 && alt[i].earlyclobber
42539 && reg_overlap_mentioned_p (recog_data.operand[i],
42540 oldreg))
42541 break;
42543 if (i < recog_data.n_operands)
42544 continue;
42546 if (dump_file)
42547 fprintf (dump_file,
42548 "attempting to fix modrm byte in insn %d:"
42549 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
42550 reg_class_names[alt[opno1].cl]);
42552 HARD_REG_SET unavailable;
42553 REG_SET_TO_HARD_REG_SET (unavailable, &live);
42554 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
42555 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
42556 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
42557 IOR_HARD_REG_SET (unavailable, output_risky);
42558 IOR_COMPL_HARD_REG_SET (unavailable,
42559 reg_class_contents[alt[opno1].cl]);
42561 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42562 if (!TEST_HARD_REG_BIT (unavailable, i))
42563 break;
42564 if (i == FIRST_PSEUDO_REGISTER)
42566 if (dump_file)
42567 fprintf (dump_file, ", none available\n");
42568 continue;
42570 if (dump_file)
42571 fprintf (dump_file, " -> %d\n", i);
42572 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
42573 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
42574 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
42579 /* Implement machine specific optimizations. We implement padding of returns
42580 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
42581 static void
42582 ix86_reorg (void)
42584 /* We are freeing block_for_insn in the toplev to keep compatibility
42585 with old MDEP_REORGS that are not CFG based. Recompute it now. */
42586 compute_bb_for_insn ();
42588 if (flag_mitigate_rop)
42589 ix86_mitigate_rop ();
42591 if (TARGET_SEH && current_function_has_exception_handlers ())
42592 ix86_seh_fixup_eh_fallthru ();
42594 if (optimize && optimize_function_for_speed_p (cfun))
42596 if (TARGET_PAD_SHORT_FUNCTION)
42597 ix86_pad_short_function ();
42598 else if (TARGET_PAD_RETURNS)
42599 ix86_pad_returns ();
42600 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42601 if (TARGET_FOUR_JUMP_LIMIT)
42602 ix86_avoid_jump_mispredicts ();
42603 #endif
42607 /* Return nonzero when a QImode register that must be represented via a REX prefix
42608 is used. */
42609 bool
42610 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
42612 int i;
42613 extract_insn_cached (insn);
42614 for (i = 0; i < recog_data.n_operands; i++)
42615 if (GENERAL_REG_P (recog_data.operand[i])
42616 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
42617 return true;
42618 return false;
42621 /* Return true when INSN mentions a register that must be encoded using a REX
42622 prefix. */
42623 bool
42624 x86_extended_reg_mentioned_p (rtx insn)
42626 subrtx_iterator::array_type array;
42627 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
42629 const_rtx x = *iter;
42630 if (REG_P (x)
42631 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
42632 return true;
42634 return false;
42637 /* If profitable, negate (without causing overflow) integer constant
42638 of mode MODE at location LOC. Return true in this case. */
42639 bool
42640 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
42642 HOST_WIDE_INT val;
42644 if (!CONST_INT_P (*loc))
42645 return false;
42647 switch (mode)
42649 case E_DImode:
42650 /* DImode x86_64 constants must fit in 32 bits. */
42651 gcc_assert (x86_64_immediate_operand (*loc, mode));
42653 mode = SImode;
42654 break;
42656 case E_SImode:
42657 case E_HImode:
42658 case E_QImode:
42659 break;
42661 default:
42662 gcc_unreachable ();
42665 /* Avoid overflows. */
42666 if (mode_signbit_p (mode, *loc))
42667 return false;
42669 val = INTVAL (*loc);
42671 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
42672 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
42673 if ((val < 0 && val != -128)
42674 || val == 128)
42676 *loc = GEN_INT (-val);
42677 return true;
42680 return false;
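/* For instance, "addl $-4, %eax" is canonicalized here into
   "subl $4, %eax". The -128/128 exception exists because 8-bit
   immediates are sign extended: -128 still fits the short imm8 encoding
   while +128 does not. */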
42683 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
42684 optabs would emit if we didn't have TFmode patterns. */
42686 void
42687 x86_emit_floatuns (rtx operands[2])
42689 rtx_code_label *neglab, *donelab;
42690 rtx i0, i1, f0, in, out;
42691 machine_mode mode, inmode;
42693 inmode = GET_MODE (operands[1]);
42694 gcc_assert (inmode == SImode || inmode == DImode);
42696 out = operands[0];
42697 in = force_reg (inmode, operands[1]);
42698 mode = GET_MODE (out);
42699 neglab = gen_label_rtx ();
42700 donelab = gen_label_rtx ();
42701 f0 = gen_reg_rtx (mode);
42703 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
42705 expand_float (out, in, 0);
42707 emit_jump_insn (gen_jump (donelab));
42708 emit_barrier ();
42710 emit_label (neglab);
42712 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
42713 1, OPTAB_DIRECT);
42714 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
42715 1, OPTAB_DIRECT);
42716 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
42718 expand_float (f0, i0, 0);
42720 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
42722 emit_label (donelab);
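/* The sequence above is the usual unsigned-to-float idiom: inputs with
   the sign bit clear are converted directly by the signed conversion,
   while for inputs with the sign bit set the value is halved with the
   low bit folded back in (i0 = (in >> 1) | (in & 1), so the final result
   still rounds correctly), converted, and then doubled via f0 + f0. */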
42725 static bool canonicalize_perm (struct expand_vec_perm_d *d);
42726 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
42727 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
42728 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
42730 /* Get a vector mode of the same size as the original but with elements
42731 twice as wide. This is only guaranteed to apply to integral vectors. */
42733 static inline machine_mode
42734 get_mode_wider_vector (machine_mode o)
42736 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
42737 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
42738 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
42739 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
42740 return n;
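/* For instance, V16QImode yields V8HImode and V8HImode yields V4SImode:
   the same vector size with half as many, twice as wide elements. */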
42743 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
42744 fill target with val via vec_duplicate. */
42746 static bool
42747 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
42749 bool ok;
42750 rtx_insn *insn;
42751 rtx dup;
42753 /* First attempt to recognize VAL as-is. */
42754 dup = gen_vec_duplicate (mode, val);
42755 insn = emit_insn (gen_rtx_SET (target, dup));
42756 if (recog_memoized (insn) < 0)
42758 rtx_insn *seq;
42759 machine_mode innermode = GET_MODE_INNER (mode);
42760 rtx reg;
42762 /* If that fails, force VAL into a register. */
42764 start_sequence ();
42765 reg = force_reg (innermode, val);
42766 if (GET_MODE (reg) != innermode)
42767 reg = gen_lowpart (innermode, reg);
42768 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
42769 seq = get_insns ();
42770 end_sequence ();
42771 if (seq)
42772 emit_insn_before (seq, insn);
42774 ok = recog_memoized (insn) >= 0;
42775 gcc_assert (ok);
42777 return true;
42780 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42781 with all elements equal to VAR. Return true if successful. */
42783 static bool
42784 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
42785 rtx target, rtx val)
42787 bool ok;
42789 switch (mode)
42791 case E_V2SImode:
42792 case E_V2SFmode:
42793 if (!mmx_ok)
42794 return false;
42795 /* FALLTHRU */
42797 case E_V4DFmode:
42798 case E_V4DImode:
42799 case E_V8SFmode:
42800 case E_V8SImode:
42801 case E_V2DFmode:
42802 case E_V2DImode:
42803 case E_V4SFmode:
42804 case E_V4SImode:
42805 case E_V16SImode:
42806 case E_V8DImode:
42807 case E_V16SFmode:
42808 case E_V8DFmode:
42809 return ix86_vector_duplicate_value (mode, target, val);
42811 case E_V4HImode:
42812 if (!mmx_ok)
42813 return false;
42814 if (TARGET_SSE || TARGET_3DNOW_A)
42816 rtx x;
42818 val = gen_lowpart (SImode, val);
42819 x = gen_rtx_TRUNCATE (HImode, val);
42820 x = gen_rtx_VEC_DUPLICATE (mode, x);
42821 emit_insn (gen_rtx_SET (target, x));
42822 return true;
42824 goto widen;
42826 case E_V8QImode:
42827 if (!mmx_ok)
42828 return false;
42829 goto widen;
42831 case E_V8HImode:
42832 if (TARGET_AVX2)
42833 return ix86_vector_duplicate_value (mode, target, val);
42835 if (TARGET_SSE2)
42837 struct expand_vec_perm_d dperm;
42838 rtx tmp1, tmp2;
42840 permute:
42841 memset (&dperm, 0, sizeof (dperm));
42842 dperm.target = target;
42843 dperm.vmode = mode;
42844 dperm.nelt = GET_MODE_NUNITS (mode);
42845 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
42846 dperm.one_operand_p = true;
42848 /* Extend to SImode using a paradoxical SUBREG. */
42849 tmp1 = gen_reg_rtx (SImode);
42850 emit_move_insn (tmp1, gen_lowpart (SImode, val));
42852 /* Insert the SImode value as low element of a V4SImode vector. */
42853 tmp2 = gen_reg_rtx (V4SImode);
42854 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
42855 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
42857 ok = (expand_vec_perm_1 (&dperm)
42858 || expand_vec_perm_broadcast_1 (&dperm));
42859 gcc_assert (ok);
42860 return ok;
42862 goto widen;
42864 case E_V16QImode:
42865 if (TARGET_AVX2)
42866 return ix86_vector_duplicate_value (mode, target, val);
42868 if (TARGET_SSE2)
42869 goto permute;
42870 goto widen;
42872 widen:
42873 /* Replicate the value once into the next wider mode and recurse. */
42875 machine_mode smode, wsmode, wvmode;
42876 rtx x;
42878 smode = GET_MODE_INNER (mode);
42879 wvmode = get_mode_wider_vector (mode);
42880 wsmode = GET_MODE_INNER (wvmode);
42882 val = convert_modes (wsmode, smode, val, true);
42883 x = expand_simple_binop (wsmode, ASHIFT, val,
42884 GEN_INT (GET_MODE_BITSIZE (smode)),
42885 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42886 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
42888 x = gen_reg_rtx (wvmode);
42889 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
42890 gcc_assert (ok);
42891 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
42892 return ok;
42895 case E_V16HImode:
42896 case E_V32QImode:
42897 if (TARGET_AVX2)
42898 return ix86_vector_duplicate_value (mode, target, val);
42899 else
42901 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
42902 rtx x = gen_reg_rtx (hvmode);
42904 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42905 gcc_assert (ok);
42907 x = gen_rtx_VEC_CONCAT (mode, x, x);
42908 emit_insn (gen_rtx_SET (target, x));
42910 return true;
42912 case E_V64QImode:
42913 case E_V32HImode:
42914 if (TARGET_AVX512BW)
42915 return ix86_vector_duplicate_value (mode, target, val);
42916 else
42918 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
42919 rtx x = gen_reg_rtx (hvmode);
42921 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42922 gcc_assert (ok);
42924 x = gen_rtx_VEC_CONCAT (mode, x, x);
42925 emit_insn (gen_rtx_SET (target, x));
42927 return true;
42929 default:
42930 return false;
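/* As an example of the "widen" strategy above: for V8QImode the byte is
   first replicated into a HImode value (val | (val << 8)), that value is
   broadcast across the twice-as-wide V4HImode vector, and the result is
   then reinterpreted in the original mode. */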
42934 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42935 whose ONE_VAR element is VAR, and other elements are zero. Return true
42936 if successful. */
42938 static bool
42939 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
42940 rtx target, rtx var, int one_var)
42942 machine_mode vsimode;
42943 rtx new_target;
42944 rtx x, tmp;
42945 bool use_vector_set = false;
42946 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
42948 switch (mode)
42950 case E_V2DImode:
42951 /* For SSE4.1, we normally use vector set. But if the second
42952 element is zero and inter-unit moves are OK, we use movq
42953 instead. */
42954 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
42955 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
42956 && one_var == 0));
42957 break;
42958 case E_V16QImode:
42959 case E_V4SImode:
42960 case E_V4SFmode:
42961 use_vector_set = TARGET_SSE4_1;
42962 break;
42963 case E_V8HImode:
42964 use_vector_set = TARGET_SSE2;
42965 break;
42966 case E_V4HImode:
42967 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
42968 break;
42969 case E_V32QImode:
42970 case E_V16HImode:
42971 use_vector_set = TARGET_AVX;
42972 break;
42973 case E_V8SImode:
42974 use_vector_set = TARGET_AVX;
42975 gen_vec_set_0 = gen_vec_setv8si_0;
42976 break;
42977 case E_V8SFmode:
42978 use_vector_set = TARGET_AVX;
42979 gen_vec_set_0 = gen_vec_setv8sf_0;
42980 break;
42981 case E_V4DFmode:
42982 use_vector_set = TARGET_AVX;
42983 gen_vec_set_0 = gen_vec_setv4df_0;
42984 break;
42985 case E_V4DImode:
42986 /* Use ix86_expand_vector_set in 64bit mode only. */
42987 use_vector_set = TARGET_AVX && TARGET_64BIT;
42988 gen_vec_set_0 = gen_vec_setv4di_0;
42989 break;
42990 case E_V16SImode:
42991 use_vector_set = TARGET_AVX512F && one_var == 0;
42992 gen_vec_set_0 = gen_vec_setv16si_0;
42993 break;
42994 case E_V16SFmode:
42995 use_vector_set = TARGET_AVX512F && one_var == 0;
42996 gen_vec_set_0 = gen_vec_setv16sf_0;
42997 break;
42998 case E_V8DFmode:
42999 use_vector_set = TARGET_AVX512F && one_var == 0;
43000 gen_vec_set_0 = gen_vec_setv8df_0;
43001 break;
43002 case E_V8DImode:
43003 /* Use ix86_expand_vector_set in 64bit mode only. */
43004 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
43005 gen_vec_set_0 = gen_vec_setv8di_0;
43006 break;
43007 default:
43008 break;
43011 if (use_vector_set)
43013 if (gen_vec_set_0 && one_var == 0)
43015 var = force_reg (GET_MODE_INNER (mode), var);
43016 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
43017 return true;
43019 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
43020 var = force_reg (GET_MODE_INNER (mode), var);
43021 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43022 return true;
43025 switch (mode)
43027 case E_V2SFmode:
43028 case E_V2SImode:
43029 if (!mmx_ok)
43030 return false;
43031 /* FALLTHRU */
43033 case E_V2DFmode:
43034 case E_V2DImode:
43035 if (one_var != 0)
43036 return false;
43037 var = force_reg (GET_MODE_INNER (mode), var);
43038 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
43039 emit_insn (gen_rtx_SET (target, x));
43040 return true;
43042 case E_V4SFmode:
43043 case E_V4SImode:
43044 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
43045 new_target = gen_reg_rtx (mode);
43046 else
43047 new_target = target;
43048 var = force_reg (GET_MODE_INNER (mode), var);
43049 x = gen_rtx_VEC_DUPLICATE (mode, var);
43050 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
43051 emit_insn (gen_rtx_SET (new_target, x));
43052 if (one_var != 0)
43054 /* We need to shuffle the value to the correct position, so
43055 create a new pseudo to store the intermediate result. */
43057 /* With SSE2, we can use the integer shuffle insns. */
43058 if (mode != V4SFmode && TARGET_SSE2)
43060 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
43061 const1_rtx,
43062 GEN_INT (one_var == 1 ? 0 : 1),
43063 GEN_INT (one_var == 2 ? 0 : 1),
43064 GEN_INT (one_var == 3 ? 0 : 1)));
43065 if (target != new_target)
43066 emit_move_insn (target, new_target);
43067 return true;
43070 /* Otherwise convert the intermediate result to V4SFmode and
43071 use the SSE1 shuffle instructions. */
43072 if (mode != V4SFmode)
43074 tmp = gen_reg_rtx (V4SFmode);
43075 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
43077 else
43078 tmp = new_target;
43080 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
43081 const1_rtx,
43082 GEN_INT (one_var == 1 ? 0 : 1),
43083 GEN_INT (one_var == 2 ? 0+4 : 1+4),
43084 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
43086 if (mode != V4SFmode)
43087 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
43088 else if (tmp != target)
43089 emit_move_insn (target, tmp);
43091 else if (target != new_target)
43092 emit_move_insn (target, new_target);
43093 return true;
43095 case E_V8HImode:
43096 case E_V16QImode:
43097 vsimode = V4SImode;
43098 goto widen;
43099 case E_V4HImode:
43100 case E_V8QImode:
43101 if (!mmx_ok)
43102 return false;
43103 vsimode = V2SImode;
43104 goto widen;
43105 widen:
43106 if (one_var != 0)
43107 return false;
43109 /* Zero extend the variable element to SImode and recurse. */
43110 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
43112 x = gen_reg_rtx (vsimode);
43113 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
43114 var, one_var))
43115 gcc_unreachable ();
43117 emit_move_insn (target, gen_lowpart (mode, x));
43118 return true;
43120 default:
43121 return false;
43125 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43126 consisting of the values in VALS. It is known that all elements
43127 except ONE_VAR are constants. Return true if successful. */
43129 static bool
43130 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
43131 rtx target, rtx vals, int one_var)
43133 rtx var = XVECEXP (vals, 0, one_var);
43134 machine_mode wmode;
43135 rtx const_vec, x;
43137 const_vec = copy_rtx (vals);
43138 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
43139 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
43141 switch (mode)
43143 case E_V2DFmode:
43144 case E_V2DImode:
43145 case E_V2SFmode:
43146 case E_V2SImode:
43147 /* For the two element vectors, it's just as easy to use
43148 the general case. */
43149 return false;
43151 case E_V4DImode:
43152 /* Use ix86_expand_vector_set in 64bit mode only. */
43153 if (!TARGET_64BIT)
43154 return false;
43155 /* FALLTHRU */
43156 case E_V4DFmode:
43157 case E_V8SFmode:
43158 case E_V8SImode:
43159 case E_V16HImode:
43160 case E_V32QImode:
43161 case E_V4SFmode:
43162 case E_V4SImode:
43163 case E_V8HImode:
43164 case E_V4HImode:
43165 break;
43167 case E_V16QImode:
43168 if (TARGET_SSE4_1)
43169 break;
43170 wmode = V8HImode;
43171 goto widen;
43172 case E_V8QImode:
43173 wmode = V4HImode;
43174 goto widen;
43175 widen:
43176 /* There's no way to set one QImode entry easily. Combine
43177 the variable value with its adjacent constant value, and
43178 promote to an HImode set. */
43179 x = XVECEXP (vals, 0, one_var ^ 1);
43180 if (one_var & 1)
43182 var = convert_modes (HImode, QImode, var, true);
43183 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
43184 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43185 x = GEN_INT (INTVAL (x) & 0xff);
43187 else
43189 var = convert_modes (HImode, QImode, var, true);
43190 x = gen_int_mode (INTVAL (x) << 8, HImode);
43192 if (x != const0_rtx)
43193 var = expand_simple_binop (HImode, IOR, var, x, var,
43194 1, OPTAB_LIB_WIDEN);
43196 x = gen_reg_rtx (wmode);
43197 emit_move_insn (x, gen_lowpart (wmode, const_vec));
43198 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
43200 emit_move_insn (target, gen_lowpart (mode, x));
43201 return true;
43203 default:
43204 return false;
43207 emit_move_insn (target, const_vec);
43208 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43209 return true;
43212 /* A subroutine of ix86_expand_vector_init_general. Use vector
43213 concatenate to handle the most general case: all values variable,
43214 and none identical. */
43216 static void
43217 ix86_expand_vector_init_concat (machine_mode mode,
43218 rtx target, rtx *ops, int n)
43220 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
43221 rtx first[16], second[8], third[4];
43222 rtvec v;
43223 int i, j;
43225 switch (n)
43227 case 2:
43228 switch (mode)
43230 case E_V16SImode:
43231 cmode = V8SImode;
43232 break;
43233 case E_V16SFmode:
43234 cmode = V8SFmode;
43235 break;
43236 case E_V8DImode:
43237 cmode = V4DImode;
43238 break;
43239 case E_V8DFmode:
43240 cmode = V4DFmode;
43241 break;
43242 case E_V8SImode:
43243 cmode = V4SImode;
43244 break;
43245 case E_V8SFmode:
43246 cmode = V4SFmode;
43247 break;
43248 case E_V4DImode:
43249 cmode = V2DImode;
43250 break;
43251 case E_V4DFmode:
43252 cmode = V2DFmode;
43253 break;
43254 case E_V4SImode:
43255 cmode = V2SImode;
43256 break;
43257 case E_V4SFmode:
43258 cmode = V2SFmode;
43259 break;
43260 case E_V2DImode:
43261 cmode = DImode;
43262 break;
43263 case E_V2SImode:
43264 cmode = SImode;
43265 break;
43266 case E_V2DFmode:
43267 cmode = DFmode;
43268 break;
43269 case E_V2SFmode:
43270 cmode = SFmode;
43271 break;
43272 default:
43273 gcc_unreachable ();
43276 if (!register_operand (ops[1], cmode))
43277 ops[1] = force_reg (cmode, ops[1]);
43278 if (!register_operand (ops[0], cmode))
43279 ops[0] = force_reg (cmode, ops[0]);
43280 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
43281 ops[1])));
43282 break;
43284 case 4:
43285 switch (mode)
43287 case E_V4DImode:
43288 cmode = V2DImode;
43289 break;
43290 case E_V4DFmode:
43291 cmode = V2DFmode;
43292 break;
43293 case E_V4SImode:
43294 cmode = V2SImode;
43295 break;
43296 case E_V4SFmode:
43297 cmode = V2SFmode;
43298 break;
43299 default:
43300 gcc_unreachable ();
43302 goto half;
43304 case 8:
43305 switch (mode)
43307 case E_V8DImode:
43308 cmode = V2DImode;
43309 hmode = V4DImode;
43310 break;
43311 case E_V8DFmode:
43312 cmode = V2DFmode;
43313 hmode = V4DFmode;
43314 break;
43315 case E_V8SImode:
43316 cmode = V2SImode;
43317 hmode = V4SImode;
43318 break;
43319 case E_V8SFmode:
43320 cmode = V2SFmode;
43321 hmode = V4SFmode;
43322 break;
43323 default:
43324 gcc_unreachable ();
43326 goto half;
43328 case 16:
43329 switch (mode)
43331 case E_V16SImode:
43332 cmode = V2SImode;
43333 hmode = V4SImode;
43334 gmode = V8SImode;
43335 break;
43336 case E_V16SFmode:
43337 cmode = V2SFmode;
43338 hmode = V4SFmode;
43339 gmode = V8SFmode;
43340 break;
43341 default:
43342 gcc_unreachable ();
43344 goto half;
43346 half:
43347 /* FIXME: We process inputs backward to help RA. PR 36222. */
43348 i = n - 1;
43349 j = (n >> 1) - 1;
43350 for (; i > 0; i -= 2, j--)
43352 first[j] = gen_reg_rtx (cmode);
43353 v = gen_rtvec (2, ops[i - 1], ops[i]);
43354 ix86_expand_vector_init (false, first[j],
43355 gen_rtx_PARALLEL (cmode, v));
43358 n >>= 1;
43359 if (n > 4)
43361 gcc_assert (hmode != VOIDmode);
43362 gcc_assert (gmode != VOIDmode);
43363 for (i = j = 0; i < n; i += 2, j++)
43365 second[j] = gen_reg_rtx (hmode);
43366 ix86_expand_vector_init_concat (hmode, second [j],
43367 &first [i], 2);
43369 n >>= 1;
43370 for (i = j = 0; i < n; i += 2, j++)
43372 third[j] = gen_reg_rtx (gmode);
43373 ix86_expand_vector_init_concat (gmode, third[j],
43374 &second[i], 2);
43376 n >>= 1;
43377 ix86_expand_vector_init_concat (mode, target, third, n);
43379 else if (n > 2)
43381 gcc_assert (hmode != VOIDmode);
43382 for (i = j = 0; i < n; i += 2, j++)
43384 second[j] = gen_reg_rtx (hmode);
43385 ix86_expand_vector_init_concat (hmode, second [j],
43386 &first [i], 2);
43388 n >>= 1;
43389 ix86_expand_vector_init_concat (mode, target, second, n);
43391 else
43392 ix86_expand_vector_init_concat (mode, target, first, n);
43393 break;
43395 default:
43396 gcc_unreachable ();
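/* For example, building a V8SFmode vector from eight scalars proceeds by
   concatenating pairs of scalars into four V2SFmode registers, those
   into two V4SFmode registers, and finally the two halves into the
   V8SFmode target. */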
43400 /* A subroutine of ix86_expand_vector_init_general. Use vector
43401 interleave to handle the most general case: all values variable,
43402 and none identical. */
43404 static void
43405 ix86_expand_vector_init_interleave (machine_mode mode,
43406 rtx target, rtx *ops, int n)
43408 machine_mode first_imode, second_imode, third_imode, inner_mode;
43409 int i, j;
43410 rtx op0, op1;
43411 rtx (*gen_load_even) (rtx, rtx, rtx);
43412 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
43413 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
43415 switch (mode)
43417 case E_V8HImode:
43418 gen_load_even = gen_vec_setv8hi;
43419 gen_interleave_first_low = gen_vec_interleave_lowv4si;
43420 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43421 inner_mode = HImode;
43422 first_imode = V4SImode;
43423 second_imode = V2DImode;
43424 third_imode = VOIDmode;
43425 break;
43426 case E_V16QImode:
43427 gen_load_even = gen_vec_setv16qi;
43428 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
43429 gen_interleave_second_low = gen_vec_interleave_lowv4si;
43430 inner_mode = QImode;
43431 first_imode = V8HImode;
43432 second_imode = V4SImode;
43433 third_imode = V2DImode;
43434 break;
43435 default:
43436 gcc_unreachable ();
43439 for (i = 0; i < n; i++)
43441 /* Extend the odd element to SImode using a paradoxical SUBREG. */
43442 op0 = gen_reg_rtx (SImode);
43443 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
43445 /* Insert the SImode value as low element of V4SImode vector. */
43446 op1 = gen_reg_rtx (V4SImode);
43447 op0 = gen_rtx_VEC_MERGE (V4SImode,
43448 gen_rtx_VEC_DUPLICATE (V4SImode,
43449 op0),
43450 CONST0_RTX (V4SImode),
43451 const1_rtx);
43452 emit_insn (gen_rtx_SET (op1, op0));
43454 /* Cast the V4SImode vector back to a vector in the original mode. */
43455 op0 = gen_reg_rtx (mode);
43456 emit_move_insn (op0, gen_lowpart (mode, op1));
43458 /* Load even elements into the second position. */
43459 emit_insn (gen_load_even (op0,
43460 force_reg (inner_mode,
43461 ops [i + i + 1]),
43462 const1_rtx));
43464 /* Cast vector to FIRST_IMODE vector. */
43465 ops[i] = gen_reg_rtx (first_imode);
43466 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
43469 /* Interleave low FIRST_IMODE vectors. */
43470 for (i = j = 0; i < n; i += 2, j++)
43472 op0 = gen_reg_rtx (first_imode);
43473 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
43475 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
43476 ops[j] = gen_reg_rtx (second_imode);
43477 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
43480 /* Interleave low SECOND_IMODE vectors. */
43481 switch (second_imode)
43483 case E_V4SImode:
43484 for (i = j = 0; i < n / 2; i += 2, j++)
43486 op0 = gen_reg_rtx (second_imode);
43487 emit_insn (gen_interleave_second_low (op0, ops[i],
43488 ops[i + 1]));
43490 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
43491 vector. */
43492 ops[j] = gen_reg_rtx (third_imode);
43493 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
43495 second_imode = V2DImode;
43496 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43497 /* FALLTHRU */
43499 case E_V2DImode:
43500 op0 = gen_reg_rtx (second_imode);
43501 emit_insn (gen_interleave_second_low (op0, ops[0],
43502 ops[1]));
43504 /* Cast the SECOND_IMODE vector back to a vector in the original
43505 mode. */
43506 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
43507 break;
43509 default:
43510 gcc_unreachable ();
43514 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
43515 all values variable, and none identical. */
43517 static void
43518 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
43519 rtx target, rtx vals)
43521 rtx ops[64], op0, op1, op2, op3, op4, op5;
43522 machine_mode half_mode = VOIDmode;
43523 machine_mode quarter_mode = VOIDmode;
43524 int n, i;
43526 switch (mode)
43528 case E_V2SFmode:
43529 case E_V2SImode:
43530 if (!mmx_ok && !TARGET_SSE)
43531 break;
43532 /* FALLTHRU */
43534 case E_V16SImode:
43535 case E_V16SFmode:
43536 case E_V8DFmode:
43537 case E_V8DImode:
43538 case E_V8SFmode:
43539 case E_V8SImode:
43540 case E_V4DFmode:
43541 case E_V4DImode:
43542 case E_V4SFmode:
43543 case E_V4SImode:
43544 case E_V2DFmode:
43545 case E_V2DImode:
43546 n = GET_MODE_NUNITS (mode);
43547 for (i = 0; i < n; i++)
43548 ops[i] = XVECEXP (vals, 0, i);
43549 ix86_expand_vector_init_concat (mode, target, ops, n);
43550 return;
43552 case E_V2TImode:
43553 for (i = 0; i < 2; i++)
43554 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43555 op0 = gen_reg_rtx (V4DImode);
43556 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
43557 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43558 return;
43560 case E_V4TImode:
43561 for (i = 0; i < 4; i++)
43562 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43563 ops[4] = gen_reg_rtx (V4DImode);
43564 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
43565 ops[5] = gen_reg_rtx (V4DImode);
43566 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
43567 op0 = gen_reg_rtx (V8DImode);
43568 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
43569 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43570 return;
43572 case E_V32QImode:
43573 half_mode = V16QImode;
43574 goto half;
43576 case E_V16HImode:
43577 half_mode = V8HImode;
43578 goto half;
43580 half:
43581 n = GET_MODE_NUNITS (mode);
43582 for (i = 0; i < n; i++)
43583 ops[i] = XVECEXP (vals, 0, i);
43584 op0 = gen_reg_rtx (half_mode);
43585 op1 = gen_reg_rtx (half_mode);
43586 ix86_expand_vector_init_interleave (half_mode, op0, ops,
43587 n >> 2);
43588 ix86_expand_vector_init_interleave (half_mode, op1,
43589 &ops [n >> 1], n >> 2);
43590 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
43591 return;
43593 case E_V64QImode:
43594 quarter_mode = V16QImode;
43595 half_mode = V32QImode;
43596 goto quarter;
43598 case E_V32HImode:
43599 quarter_mode = V8HImode;
43600 half_mode = V16HImode;
43601 goto quarter;
43603 quarter:
43604 n = GET_MODE_NUNITS (mode);
43605 for (i = 0; i < n; i++)
43606 ops[i] = XVECEXP (vals, 0, i);
43607 op0 = gen_reg_rtx (quarter_mode);
43608 op1 = gen_reg_rtx (quarter_mode);
43609 op2 = gen_reg_rtx (quarter_mode);
43610 op3 = gen_reg_rtx (quarter_mode);
43611 op4 = gen_reg_rtx (half_mode);
43612 op5 = gen_reg_rtx (half_mode);
43613 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
43614 n >> 3);
43615 ix86_expand_vector_init_interleave (quarter_mode, op1,
43616 &ops [n >> 2], n >> 3);
43617 ix86_expand_vector_init_interleave (quarter_mode, op2,
43618 &ops [n >> 1], n >> 3);
43619 ix86_expand_vector_init_interleave (quarter_mode, op3,
43620 &ops [(n >> 1) | (n >> 2)], n >> 3);
43621 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
43622 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
43623 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
43624 return;
43626 case E_V16QImode:
43627 if (!TARGET_SSE4_1)
43628 break;
43629 /* FALLTHRU */
43631 case E_V8HImode:
43632 if (!TARGET_SSE2)
43633 break;
43635 /* Don't use ix86_expand_vector_init_interleave if we can't
43636 move from GPR to SSE register directly. */
43637 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
43638 break;
43640 n = GET_MODE_NUNITS (mode);
43641 for (i = 0; i < n; i++)
43642 ops[i] = XVECEXP (vals, 0, i);
43643 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
43644 return;
43646 case E_V4HImode:
43647 case E_V8QImode:
43648 break;
43650 default:
43651 gcc_unreachable ();
43655 int i, j, n_elts, n_words, n_elt_per_word;
43656 machine_mode inner_mode;
43657 rtx words[4], shift;
43659 inner_mode = GET_MODE_INNER (mode);
43660 n_elts = GET_MODE_NUNITS (mode);
43661 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
43662 n_elt_per_word = n_elts / n_words;
43663 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
43665 for (i = 0; i < n_words; ++i)
43667 rtx word = NULL_RTX;
43669 for (j = 0; j < n_elt_per_word; ++j)
43671 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
43672 elt = convert_modes (word_mode, inner_mode, elt, true);
43674 if (j == 0)
43675 word = elt;
43676 else
43678 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
43679 word, 1, OPTAB_LIB_WIDEN);
43680 word = expand_simple_binop (word_mode, IOR, word, elt,
43681 word, 1, OPTAB_LIB_WIDEN);
43685 words[i] = word;
43688 if (n_words == 1)
43689 emit_move_insn (target, gen_lowpart (mode, words[0]));
43690 else if (n_words == 2)
43692 rtx tmp = gen_reg_rtx (mode);
43693 emit_clobber (tmp);
43694 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
43695 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
43696 emit_move_insn (target, tmp);
43698 else if (n_words == 4)
43700 rtx tmp = gen_reg_rtx (V4SImode);
43701 gcc_assert (word_mode == SImode);
43702 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
43703 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
43704 emit_move_insn (target, gen_lowpart (mode, tmp));
43706 else
43707 gcc_unreachable ();
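/* The fallback above (used e.g. for V4HImode and V8QImode MMX vectors)
   packs the scalar elements into word sized integers with shifts and
   IORs and then assembles the vector from those words via word_mode
   lowpart/highpart moves (or a V4SImode build when there are four
   words). */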
43711 /* Initialize vector TARGET via VALS. Suppress the use of MMX
43712 instructions unless MMX_OK is true. */
43714 void
43715 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
43717 machine_mode mode = GET_MODE (target);
43718 machine_mode inner_mode = GET_MODE_INNER (mode);
43719 int n_elts = GET_MODE_NUNITS (mode);
43720 int n_var = 0, one_var = -1;
43721 bool all_same = true, all_const_zero = true;
43722 int i;
43723 rtx x;
43725 /* Handle first initialization from vector elts. */
43726 if (n_elts != XVECLEN (vals, 0))
43728 rtx subtarget = target;
43729 x = XVECEXP (vals, 0, 0);
43730 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
43731 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
43733 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
43734 if (inner_mode == QImode || inner_mode == HImode)
43736 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
43737 mode = mode_for_vector (SImode, n_bits / 4).require ();
43738 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
43739 ops[0] = gen_lowpart (inner_mode, ops[0]);
43740 ops[1] = gen_lowpart (inner_mode, ops[1]);
43741 subtarget = gen_reg_rtx (mode);
43743 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
43744 if (subtarget != target)
43745 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
43746 return;
43748 gcc_unreachable ();
43751 for (i = 0; i < n_elts; ++i)
43753 x = XVECEXP (vals, 0, i);
43754 if (!(CONST_SCALAR_INT_P (x)
43755 || CONST_DOUBLE_P (x)
43756 || CONST_FIXED_P (x)))
43757 n_var++, one_var = i;
43758 else if (x != CONST0_RTX (inner_mode))
43759 all_const_zero = false;
43760 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
43761 all_same = false;
43764 /* Constants are best loaded from the constant pool. */
43765 if (n_var == 0)
43767 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
43768 return;
43771 /* If all values are identical, broadcast the value. */
43772 if (all_same
43773 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
43774 XVECEXP (vals, 0, 0)))
43775 return;
43777 /* Values where only one field is non-constant are best loaded from
43778 the pool and overwritten via move later. */
43779 if (n_var == 1)
43781 if (all_const_zero
43782 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
43783 XVECEXP (vals, 0, one_var),
43784 one_var))
43785 return;
43787 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
43788 return;
43791 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
43794 void
43795 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
43797 machine_mode mode = GET_MODE (target);
43798 machine_mode inner_mode = GET_MODE_INNER (mode);
43799 machine_mode half_mode;
43800 bool use_vec_merge = false;
43801 rtx tmp;
43802 static rtx (*gen_extract[6][2]) (rtx, rtx)
43804 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
43805 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
43806 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
43807 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
43808 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
43809 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
43811 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
43813 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
43814 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
43815 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
43816 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
43817 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
43818 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
43820 int i, j, n;
43821 machine_mode mmode = VOIDmode;
43822 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
43824 switch (mode)
43826 case E_V2SFmode:
43827 case E_V2SImode:
43828 if (mmx_ok)
43830 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43831 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
43832 if (elt == 0)
43833 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43834 else
43835 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43836 emit_insn (gen_rtx_SET (target, tmp));
43837 return;
43839 break;
43841 case E_V2DImode:
43842 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
43843 if (use_vec_merge)
43844 break;
43846 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43847 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
43848 if (elt == 0)
43849 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43850 else
43851 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43852 emit_insn (gen_rtx_SET (target, tmp));
43853 return;
43855 case E_V2DFmode:
43857 rtx op0, op1;
43859 /* For the two element vectors, we implement a VEC_CONCAT with
43860 the extraction of the other element. */
43862 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
43863 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
43865 if (elt == 0)
43866 op0 = val, op1 = tmp;
43867 else
43868 op0 = tmp, op1 = val;
43870 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
43871 emit_insn (gen_rtx_SET (target, tmp));
43873 return;
43875 case E_V4SFmode:
43876 use_vec_merge = TARGET_SSE4_1;
43877 if (use_vec_merge)
43878 break;
43880 switch (elt)
43882 case 0:
43883 use_vec_merge = true;
43884 break;
43886 case 1:
43887 /* tmp = target = A B C D */
43888 tmp = copy_to_reg (target);
43889 /* target = A A B B */
43890 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
43891 /* target = X A B B */
43892 ix86_expand_vector_set (false, target, val, 0);
43893 /* target = A X C D */
43894 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43895 const1_rtx, const0_rtx,
43896 GEN_INT (2+4), GEN_INT (3+4)));
43897 return;
43899 case 2:
43900 /* tmp = target = A B C D */
43901 tmp = copy_to_reg (target);
43902 /* tmp = X B C D */
43903 ix86_expand_vector_set (false, tmp, val, 0);
43904 /* target = A B X D */
43905 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43906 const0_rtx, const1_rtx,
43907 GEN_INT (0+4), GEN_INT (3+4)));
43908 return;
43910 case 3:
43911 /* tmp = target = A B C D */
43912 tmp = copy_to_reg (target);
43913 /* tmp = X B C D */
43914 ix86_expand_vector_set (false, tmp, val, 0);
43915 /* target = A B C X */
43916 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43917 const0_rtx, const1_rtx,
43918 GEN_INT (2+4), GEN_INT (0+4)));
43919 return;
43921 default:
43922 gcc_unreachable ();
43924 break;
43926 case E_V4SImode:
43927 use_vec_merge = TARGET_SSE4_1;
43928 if (use_vec_merge)
43929 break;
43931 /* Element 0 handled by vec_merge below. */
43932 if (elt == 0)
43934 use_vec_merge = true;
43935 break;
43938 if (TARGET_SSE2)
43940 /* With SSE2, use integer shuffles to swap element 0 and ELT,
43941 store into element 0, then shuffle them back. */
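/* A worked example (hypothetical ELT == 2): order becomes { 2, 1, 0, 3 },
   so the first pshufd swaps elements 0 and 2, the scalar store below
   lands in element 0, and the identical second pshufd swaps the two
   elements back into place.  */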
43943 rtx order[4];
43945 order[0] = GEN_INT (elt);
43946 order[1] = const1_rtx;
43947 order[2] = const2_rtx;
43948 order[3] = GEN_INT (3);
43949 order[elt] = const0_rtx;
43951 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43952 order[1], order[2], order[3]));
43954 ix86_expand_vector_set (false, target, val, 0);
43956 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43957 order[1], order[2], order[3]));
43959 else
43961 /* For SSE1, we have to reuse the V4SF code. */
43962 rtx t = gen_reg_rtx (V4SFmode);
43963 emit_move_insn (t, gen_lowpart (V4SFmode, target));
43964 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
43965 emit_move_insn (target, gen_lowpart (mode, t));
43967 return;
43969 case E_V8HImode:
43970 use_vec_merge = TARGET_SSE2;
43971 break;
43972 case E_V4HImode:
43973 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43974 break;
43976 case E_V16QImode:
43977 use_vec_merge = TARGET_SSE4_1;
43978 break;
43980 case E_V8QImode:
43981 break;
43983 case E_V32QImode:
43984 half_mode = V16QImode;
43985 j = 0;
43986 n = 16;
43987 goto half;
43989 case E_V16HImode:
43990 half_mode = V8HImode;
43991 j = 1;
43992 n = 8;
43993 goto half;
43995 case E_V8SImode:
43996 half_mode = V4SImode;
43997 j = 2;
43998 n = 4;
43999 goto half;
44001 case E_V4DImode:
44002 half_mode = V2DImode;
44003 j = 3;
44004 n = 2;
44005 goto half;
44007 case E_V8SFmode:
44008 half_mode = V4SFmode;
44009 j = 4;
44010 n = 4;
44011 goto half;
44013 case E_V4DFmode:
44014 half_mode = V2DFmode;
44015 j = 5;
44016 n = 2;
44017 goto half;
44019 half:
44020 /* Compute offset. */
44021 i = elt / n;
44022 elt %= n;
44024 gcc_assert (i <= 1);
44026 /* Extract the half. */
44027 tmp = gen_reg_rtx (half_mode);
44028 emit_insn (gen_extract[j][i] (tmp, target));
44030 /* Put val in tmp at elt. */
44031 ix86_expand_vector_set (false, tmp, val, elt);
44033 /* Put it back. */
44034 emit_insn (gen_insert[j][i] (target, target, tmp));
44035 return;
44037 case E_V8DFmode:
44038 if (TARGET_AVX512F)
44040 mmode = QImode;
44041 gen_blendm = gen_avx512f_blendmv8df;
44043 break;
44045 case E_V8DImode:
44046 if (TARGET_AVX512F)
44048 mmode = QImode;
44049 gen_blendm = gen_avx512f_blendmv8di;
44051 break;
44053 case E_V16SFmode:
44054 if (TARGET_AVX512F)
44056 mmode = HImode;
44057 gen_blendm = gen_avx512f_blendmv16sf;
44059 break;
44061 case E_V16SImode:
44062 if (TARGET_AVX512F)
44064 mmode = HImode;
44065 gen_blendm = gen_avx512f_blendmv16si;
44067 break;
44069 case E_V32HImode:
44070 if (TARGET_AVX512F && TARGET_AVX512BW)
44072 mmode = SImode;
44073 gen_blendm = gen_avx512bw_blendmv32hi;
44075 break;
44077 case E_V64QImode:
44078 if (TARGET_AVX512F && TARGET_AVX512BW)
44080 mmode = DImode;
44081 gen_blendm = gen_avx512bw_blendmv64qi;
44083 break;
44085 default:
44086 break;
44089 if (mmode != VOIDmode)
44091 tmp = gen_reg_rtx (mode);
44092 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
44093 /* The avx512*_blendm<mode> expanders have a different operand order
44094 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
44095 elements where the mask is set and the second input operand otherwise;
44096 in {sse,avx}*_*blend* the first input operand is used for elements
44097 where the mask is clear and the second input operand otherwise. */
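/* An illustrative sketch (hypothetical V8DFmode target, ELT == 2):
     VEC_MERGE:  dest[i] = mask bit i set ? first[i]  : second[i]
     blendm:     dest[i] = mask bit i set ? second[i] : first[i]
   so passing TARGET as the first input and the VAL duplicate as the
   second inserts VAL into element ELT only.  */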
44098 emit_insn (gen_blendm (target, target, tmp,
44099 force_reg (mmode,
44100 gen_int_mode (1 << elt, mmode))));
44102 else if (use_vec_merge)
44104 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
44105 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
44106 emit_insn (gen_rtx_SET (target, tmp));
44108 else
44110 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44112 emit_move_insn (mem, target);
44114 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44115 emit_move_insn (tmp, val);
44117 emit_move_insn (target, mem);
44121 void
44122 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
44124 machine_mode mode = GET_MODE (vec);
44125 machine_mode inner_mode = GET_MODE_INNER (mode);
44126 bool use_vec_extr = false;
44127 rtx tmp;
44129 switch (mode)
44131 case E_V2SImode:
44132 case E_V2SFmode:
44133 if (!mmx_ok)
44134 break;
44135 /* FALLTHRU */
44137 case E_V2DFmode:
44138 case E_V2DImode:
44139 case E_V2TImode:
44140 case E_V4TImode:
44141 use_vec_extr = true;
44142 break;
44144 case E_V4SFmode:
44145 use_vec_extr = TARGET_SSE4_1;
44146 if (use_vec_extr)
44147 break;
44149 switch (elt)
44151 case 0:
44152 tmp = vec;
44153 break;
44155 case 1:
44156 case 3:
44157 tmp = gen_reg_rtx (mode);
44158 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
44159 GEN_INT (elt), GEN_INT (elt),
44160 GEN_INT (elt+4), GEN_INT (elt+4)));
44161 break;
44163 case 2:
44164 tmp = gen_reg_rtx (mode);
44165 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
44166 break;
44168 default:
44169 gcc_unreachable ();
44171 vec = tmp;
44172 use_vec_extr = true;
44173 elt = 0;
44174 break;
44176 case E_V4SImode:
44177 use_vec_extr = TARGET_SSE4_1;
44178 if (use_vec_extr)
44179 break;
44181 if (TARGET_SSE2)
44183 switch (elt)
44185 case 0:
44186 tmp = vec;
44187 break;
44189 case 1:
44190 case 3:
44191 tmp = gen_reg_rtx (mode);
44192 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
44193 GEN_INT (elt), GEN_INT (elt),
44194 GEN_INT (elt), GEN_INT (elt)));
44195 break;
44197 case 2:
44198 tmp = gen_reg_rtx (mode);
44199 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
44200 break;
44202 default:
44203 gcc_unreachable ();
44205 vec = tmp;
44206 use_vec_extr = true;
44207 elt = 0;
44209 else
44211 /* For SSE1, we have to reuse the V4SF code. */
44212 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
44213 gen_lowpart (V4SFmode, vec), elt);
44214 return;
44216 break;
44218 case E_V8HImode:
44219 use_vec_extr = TARGET_SSE2;
44220 break;
44221 case E_V4HImode:
44222 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44223 break;
44225 case E_V16QImode:
44226 use_vec_extr = TARGET_SSE4_1;
44227 break;
44229 case E_V8SFmode:
44230 if (TARGET_AVX)
44232 tmp = gen_reg_rtx (V4SFmode);
44233 if (elt < 4)
44234 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
44235 else
44236 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
44237 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44238 return;
44240 break;
44242 case E_V4DFmode:
44243 if (TARGET_AVX)
44245 tmp = gen_reg_rtx (V2DFmode);
44246 if (elt < 2)
44247 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44248 else
44249 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44250 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44251 return;
44253 break;
44255 case E_V32QImode:
44256 if (TARGET_AVX)
44258 tmp = gen_reg_rtx (V16QImode);
44259 if (elt < 16)
44260 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44261 else
44262 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
44263 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44264 return;
44266 break;
44268 case E_V16HImode:
44269 if (TARGET_AVX)
44271 tmp = gen_reg_rtx (V8HImode);
44272 if (elt < 8)
44273 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
44274 else
44275 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
44276 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44277 return;
44279 break;
44281 case E_V8SImode:
44282 if (TARGET_AVX)
44284 tmp = gen_reg_rtx (V4SImode);
44285 if (elt < 4)
44286 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
44287 else
44288 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
44289 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44290 return;
44292 break;
44294 case E_V4DImode:
44295 if (TARGET_AVX)
44297 tmp = gen_reg_rtx (V2DImode);
44298 if (elt < 2)
44299 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
44300 else
44301 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
44302 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44303 return;
44305 break;
44307 case E_V32HImode:
44308 if (TARGET_AVX512BW)
44310 tmp = gen_reg_rtx (V16HImode);
44311 if (elt < 16)
44312 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
44313 else
44314 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
44315 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44316 return;
44318 break;
44320 case E_V64QImode:
44321 if (TARGET_AVX512BW)
44323 tmp = gen_reg_rtx (V32QImode);
44324 if (elt < 32)
44325 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
44326 else
44327 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
44328 ix86_expand_vector_extract (false, target, tmp, elt & 31);
44329 return;
44331 break;
44333 case E_V16SFmode:
44334 tmp = gen_reg_rtx (V8SFmode);
44335 if (elt < 8)
44336 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
44337 else
44338 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
44339 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44340 return;
44342 case E_V8DFmode:
44343 tmp = gen_reg_rtx (V4DFmode);
44344 if (elt < 4)
44345 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44346 else
44347 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44348 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44349 return;
44351 case E_V16SImode:
44352 tmp = gen_reg_rtx (V8SImode);
44353 if (elt < 8)
44354 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44355 else
44356 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44357 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44358 return;
44360 case E_V8DImode:
44361 tmp = gen_reg_rtx (V4DImode);
44362 if (elt < 4)
44363 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44364 else
44365 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44366 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44367 return;
44369 case E_V8QImode:
44370 /* ??? Could extract the appropriate HImode element and shift. */
44371 default:
44372 break;
44375 if (use_vec_extr)
44377 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44378 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44380 /* Let the rtl optimizers know about the zero extension performed. */
44381 if (inner_mode == QImode || inner_mode == HImode)
44383 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44384 target = gen_lowpart (SImode, target);
44387 emit_insn (gen_rtx_SET (target, tmp));
44389 else
44391 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44393 emit_move_insn (mem, vec);
44395 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44396 emit_move_insn (target, tmp);
44400 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
44401 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
44402 The upper bits of DEST are undefined, though they shouldn't cause
44403 exceptions (some bits from src or all zeros are ok). */
44405 static void
44406 emit_reduc_half (rtx dest, rtx src, int i)
44408 rtx tem, d = dest;
44409 switch (GET_MODE (src))
44411 case E_V4SFmode:
44412 if (i == 128)
44413 tem = gen_sse_movhlps (dest, src, src);
44414 else
44415 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
44416 GEN_INT (1 + 4), GEN_INT (1 + 4));
44417 break;
44418 case E_V2DFmode:
44419 tem = gen_vec_interleave_highv2df (dest, src, src);
44420 break;
44421 case E_V16QImode:
44422 case E_V8HImode:
44423 case E_V4SImode:
44424 case E_V2DImode:
44425 d = gen_reg_rtx (V1TImode);
44426 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
44427 GEN_INT (i / 2));
44428 break;
44429 case E_V8SFmode:
44430 if (i == 256)
44431 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
44432 else
44433 tem = gen_avx_shufps256 (dest, src, src,
44434 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
44435 break;
44436 case E_V4DFmode:
44437 if (i == 256)
44438 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
44439 else
44440 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
44441 break;
44442 case E_V32QImode:
44443 case E_V16HImode:
44444 case E_V8SImode:
44445 case E_V4DImode:
44446 if (i == 256)
44448 if (GET_MODE (dest) != V4DImode)
44449 d = gen_reg_rtx (V4DImode);
44450 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
44451 gen_lowpart (V4DImode, src),
44452 const1_rtx);
44454 else
44456 d = gen_reg_rtx (V2TImode);
44457 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
44458 GEN_INT (i / 2));
44460 break;
44461 case E_V64QImode:
44462 case E_V32HImode:
44463 case E_V16SImode:
44464 case E_V16SFmode:
44465 case E_V8DImode:
44466 case E_V8DFmode:
44467 if (i > 128)
44468 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
44469 gen_lowpart (V16SImode, src),
44470 gen_lowpart (V16SImode, src),
44471 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
44472 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
44473 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
44474 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
44475 GEN_INT (0xC), GEN_INT (0xD),
44476 GEN_INT (0xE), GEN_INT (0xF),
44477 GEN_INT (0x10), GEN_INT (0x11),
44478 GEN_INT (0x12), GEN_INT (0x13),
44479 GEN_INT (0x14), GEN_INT (0x15),
44480 GEN_INT (0x16), GEN_INT (0x17));
44481 else
44482 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
44483 gen_lowpart (V16SImode, src),
44484 GEN_INT (i == 128 ? 0x2 : 0x1),
44485 GEN_INT (0x3),
44486 GEN_INT (0x3),
44487 GEN_INT (0x3),
44488 GEN_INT (i == 128 ? 0x6 : 0x5),
44489 GEN_INT (0x7),
44490 GEN_INT (0x7),
44491 GEN_INT (0x7),
44492 GEN_INT (i == 128 ? 0xA : 0x9),
44493 GEN_INT (0xB),
44494 GEN_INT (0xB),
44495 GEN_INT (0xB),
44496 GEN_INT (i == 128 ? 0xE : 0xD),
44497 GEN_INT (0xF),
44498 GEN_INT (0xF),
44499 GEN_INT (0xF));
44500 break;
44501 default:
44502 gcc_unreachable ();
44504 emit_insn (tem);
44505 if (d != dest)
44506 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
44509 /* Expand a vector reduction. FN is the binary pattern to reduce;
44510 DEST is the destination; IN is the input vector. */
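/* An illustrative sketch of the halving loop below (hypothetical V8SImode
   input, FN = addition); only element 0 of DEST is meaningful afterwards:
     for (i = 256; i > 32; i >>= 1)
       vec = vec + (bits i/2 ... i-1 of vec, moved down to bit 0);  */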
44512 void
44513 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
44515 rtx half, dst, vec = in;
44516 machine_mode mode = GET_MODE (in);
44517 int i;
44519 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
44520 if (TARGET_SSE4_1
44521 && mode == V8HImode
44522 && fn == gen_uminv8hi3)
44524 emit_insn (gen_sse4_1_phminposuw (dest, in));
44525 return;
44528 for (i = GET_MODE_BITSIZE (mode);
44529 i > GET_MODE_UNIT_BITSIZE (mode);
44530 i >>= 1)
44532 half = gen_reg_rtx (mode);
44533 emit_reduc_half (half, vec, i);
44534 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
44535 dst = dest;
44536 else
44537 dst = gen_reg_rtx (mode);
44538 emit_insn (fn (dst, half, vec));
44539 vec = dst;
44543 /* Target hook for scalar_mode_supported_p. */
44544 static bool
44545 ix86_scalar_mode_supported_p (scalar_mode mode)
44547 if (DECIMAL_FLOAT_MODE_P (mode))
44548 return default_decimal_float_supported_p ();
44549 else if (mode == TFmode)
44550 return true;
44551 else
44552 return default_scalar_mode_supported_p (mode);
44555 /* Implements target hook vector_mode_supported_p. */
44556 static bool
44557 ix86_vector_mode_supported_p (machine_mode mode)
44559 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
44560 return true;
44561 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
44562 return true;
44563 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
44564 return true;
44565 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
44566 return true;
44567 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
44568 return true;
44569 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
44570 return true;
44571 return false;
44574 /* Target hook for c_mode_for_suffix. */
44575 static machine_mode
44576 ix86_c_mode_for_suffix (char suffix)
44578 if (suffix == 'q')
44579 return TFmode;
44580 if (suffix == 'w')
44581 return XFmode;
44583 return VOIDmode;
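/* For example (a sketch of the resulting front-end behavior, using
   hypothetical variables), these mappings let the C family accept
     __float128 q = 1.0q;    suffix 'q' -> TFmode
     __float80  w = 1.0w;    suffix 'w' -> XFmode  */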
44586 /* Worker function for TARGET_MD_ASM_ADJUST.
44588 We implement asm flag outputs, and maintain source compatibility
44589 with the old cc0-based compiler. */
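/* A minimal usage sketch (hypothetical user code) of the flag outputs
   recognized below:
     unsigned carry;
     asm ("addl %2, %1" : "=@ccc" (carry), "+r" (dst) : "r" (src));
   which makes CARRY the value of the carry flag produced by the addition,
   with no explicit setcc in the asm template.  */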
44591 static rtx_insn *
44592 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
44593 vec<const char *> &constraints,
44594 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
44596 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
44597 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
44599 bool saw_asm_flag = false;
44601 start_sequence ();
44602 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
44604 const char *con = constraints[i];
44605 if (strncmp (con, "=@cc", 4) != 0)
44606 continue;
44607 con += 4;
44608 if (strchr (con, ',') != NULL)
44610 error ("alternatives not allowed in asm flag output");
44611 continue;
44614 bool invert = false;
44615 if (con[0] == 'n')
44616 invert = true, con++;
44618 machine_mode mode = CCmode;
44619 rtx_code code = UNKNOWN;
44621 switch (con[0])
44623 case 'a':
44624 if (con[1] == 0)
44625 mode = CCAmode, code = EQ;
44626 else if (con[1] == 'e' && con[2] == 0)
44627 mode = CCCmode, code = NE;
44628 break;
44629 case 'b':
44630 if (con[1] == 0)
44631 mode = CCCmode, code = EQ;
44632 else if (con[1] == 'e' && con[2] == 0)
44633 mode = CCAmode, code = NE;
44634 break;
44635 case 'c':
44636 if (con[1] == 0)
44637 mode = CCCmode, code = EQ;
44638 break;
44639 case 'e':
44640 if (con[1] == 0)
44641 mode = CCZmode, code = EQ;
44642 break;
44643 case 'g':
44644 if (con[1] == 0)
44645 mode = CCGCmode, code = GT;
44646 else if (con[1] == 'e' && con[2] == 0)
44647 mode = CCGCmode, code = GE;
44648 break;
44649 case 'l':
44650 if (con[1] == 0)
44651 mode = CCGCmode, code = LT;
44652 else if (con[1] == 'e' && con[2] == 0)
44653 mode = CCGCmode, code = LE;
44654 break;
44655 case 'o':
44656 if (con[1] == 0)
44657 mode = CCOmode, code = EQ;
44658 break;
44659 case 'p':
44660 if (con[1] == 0)
44661 mode = CCPmode, code = EQ;
44662 break;
44663 case 's':
44664 if (con[1] == 0)
44665 mode = CCSmode, code = EQ;
44666 break;
44667 case 'z':
44668 if (con[1] == 0)
44669 mode = CCZmode, code = EQ;
44670 break;
44672 if (code == UNKNOWN)
44674 error ("unknown asm flag output %qs", constraints[i]);
44675 continue;
44677 if (invert)
44678 code = reverse_condition (code);
44680 rtx dest = outputs[i];
44681 if (!saw_asm_flag)
44683 /* This is the first asm flag output. Here we put the flags
44684 register in as the real output and adjust the condition to
44685 allow it. */
44686 constraints[i] = "=Bf";
44687 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
44688 saw_asm_flag = true;
44690 else
44692 /* We don't need the flags register as output twice. */
44693 constraints[i] = "=X";
44694 outputs[i] = gen_rtx_SCRATCH (SImode);
44697 rtx x = gen_rtx_REG (mode, FLAGS_REG);
44698 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
44700 machine_mode dest_mode = GET_MODE (dest);
44701 if (!SCALAR_INT_MODE_P (dest_mode))
44703 error ("invalid type for asm flag output");
44704 continue;
44707 if (dest_mode == DImode && !TARGET_64BIT)
44708 dest_mode = SImode;
44710 if (dest_mode != QImode)
44712 rtx destqi = gen_reg_rtx (QImode);
44713 emit_insn (gen_rtx_SET (destqi, x));
44715 if (TARGET_ZERO_EXTEND_WITH_AND
44716 && optimize_function_for_speed_p (cfun))
44718 x = force_reg (dest_mode, const0_rtx);
44720 emit_insn (gen_movstrictqi
44721 (gen_lowpart (QImode, x), destqi));
44723 else
44724 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
44727 if (dest_mode != GET_MODE (dest))
44729 rtx tmp = gen_reg_rtx (SImode);
44731 emit_insn (gen_rtx_SET (tmp, x));
44732 emit_insn (gen_zero_extendsidi2 (dest, tmp));
44734 else
44735 emit_insn (gen_rtx_SET (dest, x));
44737 rtx_insn *seq = get_insns ();
44738 end_sequence ();
44740 if (saw_asm_flag)
44741 return seq;
44742 else
44744 /* If we had no asm flag outputs, clobber the flags. */
44745 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
44746 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
44747 return NULL;
44751 /* Implements the targetm.asm.encode_section_info hook in the target vector. */
44753 static void ATTRIBUTE_UNUSED
44754 ix86_encode_section_info (tree decl, rtx rtl, int first)
44756 default_encode_section_info (decl, rtl, first);
44758 if (ix86_in_large_data_p (decl))
44759 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
44762 /* Worker function for REVERSE_CONDITION. */
44764 enum rtx_code
44765 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
44767 return (mode == CCFPmode
44768 ? reverse_condition_maybe_unordered (code)
44769 : reverse_condition (code));
44772 /* Output code to perform an x87 FP register move, from OPERANDS[1]
44773 to OPERANDS[0]. */
44775 const char *
44776 output_387_reg_move (rtx_insn *insn, rtx *operands)
44778 if (REG_P (operands[0]))
44780 if (REG_P (operands[1])
44781 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44783 if (REGNO (operands[0]) == FIRST_STACK_REG)
44784 return output_387_ffreep (operands, 0);
44785 return "fstp\t%y0";
44787 if (STACK_TOP_P (operands[0]))
44788 return "fld%Z1\t%y1";
44789 return "fst\t%y0";
44791 else if (MEM_P (operands[0]))
44793 gcc_assert (REG_P (operands[1]));
44794 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44795 return "fstp%Z0\t%y0";
44796 else
44798 /* There is no non-popping store to memory for XFmode.
44799 So if we need one, follow the store with a load. */
44800 if (GET_MODE (operands[0]) == XFmode)
44801 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
44802 else
44803 return "fst%Z0\t%y0";
44806 else
44807 gcc_unreachable();
44810 /* Output code to perform a conditional jump to LABEL if the C2 flag in
44811 the FP status register is set. */
44813 void
44814 ix86_emit_fp_unordered_jump (rtx label)
44816 rtx reg = gen_reg_rtx (HImode);
44817 rtx temp;
44819 emit_insn (gen_x86_fnstsw_1 (reg));
44821 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
44823 emit_insn (gen_x86_sahf_1 (reg));
44825 temp = gen_rtx_REG (CCmode, FLAGS_REG);
44826 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
44828 else
44830 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
44832 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
44833 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
44836 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
44837 gen_rtx_LABEL_REF (VOIDmode, label),
44838 pc_rtx);
44839 temp = gen_rtx_SET (pc_rtx, temp);
44841 emit_jump_insn (temp);
44842 predict_jump (REG_BR_PROB_BASE * 10 / 100);
44845 /* Output code to perform a log1p XFmode calculation. */
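/* An illustrative sketch of the expansion below: both paths compute
   ln (2) * log2 (1 + x) == log1p (x), but for fabs (x) below
   1 - sqrt (2) / 2 (about 0.29289) the fyl2xp1 instruction is used, which
   evaluates log2 (1 + x) directly and keeps full precision for small x,
   while larger arguments form 1.0 + x explicitly and use fyl2x.  */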
44847 void ix86_emit_i387_log1p (rtx op0, rtx op1)
44849 rtx_code_label *label1 = gen_label_rtx ();
44850 rtx_code_label *label2 = gen_label_rtx ();
44852 rtx tmp = gen_reg_rtx (XFmode);
44853 rtx tmp2 = gen_reg_rtx (XFmode);
44854 rtx test;
44856 emit_insn (gen_absxf2 (tmp, op1));
44857 test = gen_rtx_GE (VOIDmode, tmp,
44858 const_double_from_real_value (
44859 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
44860 XFmode));
44861 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
44863 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44864 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
44865 emit_jump (label2);
44867 emit_label (label1);
44868 emit_move_insn (tmp, CONST1_RTX (XFmode));
44869 emit_insn (gen_addxf3 (tmp, op1, tmp));
44870 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44871 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
44873 emit_label (label2);
44876 /* Emit code for round calculation. */
44877 void ix86_emit_i387_round (rtx op0, rtx op1)
44879 machine_mode inmode = GET_MODE (op1);
44880 machine_mode outmode = GET_MODE (op0);
44881 rtx e1, e2, res, tmp, tmp1, half;
44882 rtx scratch = gen_reg_rtx (HImode);
44883 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
44884 rtx_code_label *jump_label = gen_label_rtx ();
44885 rtx insn;
44886 rtx (*gen_abs) (rtx, rtx);
44887 rtx (*gen_neg) (rtx, rtx);
44889 switch (inmode)
44891 case E_SFmode:
44892 gen_abs = gen_abssf2;
44893 break;
44894 case E_DFmode:
44895 gen_abs = gen_absdf2;
44896 break;
44897 case E_XFmode:
44898 gen_abs = gen_absxf2;
44899 break;
44900 default:
44901 gcc_unreachable ();
44904 switch (outmode)
44906 case E_SFmode:
44907 gen_neg = gen_negsf2;
44908 break;
44909 case E_DFmode:
44910 gen_neg = gen_negdf2;
44911 break;
44912 case E_XFmode:
44913 gen_neg = gen_negxf2;
44914 break;
44915 case E_HImode:
44916 gen_neg = gen_neghi2;
44917 break;
44918 case E_SImode:
44919 gen_neg = gen_negsi2;
44920 break;
44921 case E_DImode:
44922 gen_neg = gen_negdi2;
44923 break;
44924 default:
44925 gcc_unreachable ();
44928 e1 = gen_reg_rtx (inmode);
44929 e2 = gen_reg_rtx (inmode);
44930 res = gen_reg_rtx (outmode);
44932 half = const_double_from_real_value (dconsthalf, inmode);
44934 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
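/* A worked example (hypothetical OP1 == -2.5): fabs gives 2.5,
   floor (2.5 + 0.5) == 3.0, and the sign bit recorded by fxam makes the
   final negation produce -3.0, i.e. halfway cases round away from
   zero.  */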
44936 /* scratch = fxam(op1) */
44937 emit_insn (gen_rtx_SET (scratch,
44938 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
44939 UNSPEC_FXAM)));
44940 /* e1 = fabs(op1) */
44941 emit_insn (gen_abs (e1, op1));
44943 /* e2 = e1 + 0.5 */
44944 half = force_reg (inmode, half);
44945 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
44947 /* res = floor(e2) */
44948 if (inmode != XFmode)
44950 tmp1 = gen_reg_rtx (XFmode);
44952 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
44954 else
44955 tmp1 = e2;
44957 switch (outmode)
44959 case E_SFmode:
44960 case E_DFmode:
44962 rtx tmp0 = gen_reg_rtx (XFmode);
44964 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
44966 emit_insn (gen_rtx_SET (res,
44967 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
44968 UNSPEC_TRUNC_NOOP)));
44970 break;
44971 case E_XFmode:
44972 emit_insn (gen_frndintxf2_floor (res, tmp1));
44973 break;
44974 case E_HImode:
44975 emit_insn (gen_lfloorxfhi2 (res, tmp1));
44976 break;
44977 case E_SImode:
44978 emit_insn (gen_lfloorxfsi2 (res, tmp1));
44979 break;
44980 case E_DImode:
44981 emit_insn (gen_lfloorxfdi2 (res, tmp1));
44982 break;
44983 default:
44984 gcc_unreachable ();
44987 /* flags = signbit(a) */
44988 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
44990 /* if (flags) then res = -res */
44991 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
44992 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
44993 gen_rtx_LABEL_REF (VOIDmode, jump_label),
44994 pc_rtx);
44995 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44996 predict_jump (REG_BR_PROB_BASE * 50 / 100);
44997 JUMP_LABEL (insn) = jump_label;
44999 emit_insn (gen_neg (res, res));
45001 emit_label (jump_label);
45002 LABEL_NUSES (jump_label) = 1;
45004 emit_move_insn (op0, res);
45007 /* Output code to perform a Newton-Raphson approximation of a single precision
45008 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
45010 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
45012 rtx x0, x1, e0, e1;
45014 x0 = gen_reg_rtx (mode);
45015 e0 = gen_reg_rtx (mode);
45016 e1 = gen_reg_rtx (mode);
45017 x1 = gen_reg_rtx (mode);
45019 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
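/* This is one Newton-Raphson step for the reciprocal: with x0 = rcp(b),
   the refinement x1 = x0 * (2 - b * x0) roughly squares the error of the
   estimate, and a / b is then approximated as a * x1.  The formula above
   just distributes that step as x1 = (x0 + x0) - (b * x0 * x0).  */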
45021 b = force_reg (mode, b);
45023 /* x0 = rcp(b) estimate */
45024 if (mode == V16SFmode || mode == V8DFmode)
45026 if (TARGET_AVX512ER)
45028 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45029 UNSPEC_RCP28)));
45030 /* res = a * x0 */
45031 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
45032 return;
45034 else
45035 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45036 UNSPEC_RCP14)));
45038 else
45039 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45040 UNSPEC_RCP)));
45042 /* e0 = x0 * b */
45043 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
45045 /* e0 = x0 * e0 */
45046 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
45048 /* e1 = x0 + x0 */
45049 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
45051 /* x1 = e1 - e0 */
45052 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
45054 /* res = a * x1 */
45055 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
45058 /* Output code to perform a Newton-Raphson approximation of a
45059 single precision floating point [reciprocal] square root. */
45061 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
45063 rtx x0, e0, e1, e2, e3, mthree, mhalf;
45064 REAL_VALUE_TYPE r;
45065 int unspec;
45067 x0 = gen_reg_rtx (mode);
45068 e0 = gen_reg_rtx (mode);
45069 e1 = gen_reg_rtx (mode);
45070 e2 = gen_reg_rtx (mode);
45071 e3 = gen_reg_rtx (mode);
45073 if (TARGET_AVX512ER && mode == V16SFmode)
45075 if (recip)
45076 /* res = rsqrt28(a) estimate */
45077 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45078 UNSPEC_RSQRT28)));
45079 else
45081 /* x0 = rsqrt28(a) estimate */
45082 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45083 UNSPEC_RSQRT28)));
45084 /* res = rcp28(x0) estimate */
45085 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
45086 UNSPEC_RCP28)));
45088 return;
45091 real_from_integer (&r, VOIDmode, -3, SIGNED);
45092 mthree = const_double_from_real_value (r, SFmode);
45094 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
45095 mhalf = const_double_from_real_value (r, SFmode);
45096 unspec = UNSPEC_RSQRT;
45098 if (VECTOR_MODE_P (mode))
45100 mthree = ix86_build_const_vector (mode, true, mthree);
45101 mhalf = ix86_build_const_vector (mode, true, mhalf);
45102 /* There is no 512-bit rsqrt. There is however rsqrt14. */
45103 if (GET_MODE_SIZE (mode) == 64)
45104 unspec = UNSPEC_RSQRT14;
45107 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
45108 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
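/* Both formulas are one Newton-Raphson step for 1/sqrt(a): with
   x0 = rsqrtss(a), the refinement is x1 = 0.5 * x0 * (3 - a * x0 * x0),
   written above as -0.5 * x0 * (a * x0 * x0 - 3.0); multiplying the
   refined reciprocal square root once more by a yields sqrt(a).  */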
45110 a = force_reg (mode, a);
45112 /* x0 = rsqrt(a) estimate */
45113 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45114 unspec)));
45116 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
45117 if (!recip)
45119 rtx zero = force_reg (mode, CONST0_RTX(mode));
45120 rtx mask;
45122 /* Handle masked compare. */
45123 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
45125 mask = gen_reg_rtx (HImode);
45126 /* Imm value 0x4 corresponds to not-equal comparison. */
45127 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
45128 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
45130 else
45132 mask = gen_reg_rtx (mode);
45133 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
45134 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
45138 /* e0 = x0 * a */
45139 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
45140 /* e1 = e0 * x0 */
45141 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
45143 /* e2 = e1 - 3. */
45144 mthree = force_reg (mode, mthree);
45145 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
45147 mhalf = force_reg (mode, mhalf);
45148 if (recip)
45149 /* e3 = -.5 * x0 */
45150 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
45151 else
45152 /* e3 = -.5 * e0 */
45153 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
45154 /* ret = e2 * e3 */
45155 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
45158 #ifdef TARGET_SOLARIS
45159 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
45161 static void
45162 i386_solaris_elf_named_section (const char *name, unsigned int flags,
45163 tree decl)
45165 /* With Binutils 2.15, the "@unwind" marker must be specified on
45166 every occurrence of the ".eh_frame" section, not just the first
45167 one. */
45168 if (TARGET_64BIT
45169 && strcmp (name, ".eh_frame") == 0)
45171 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
45172 flags & SECTION_WRITE ? "aw" : "a");
45173 return;
45176 #ifndef USE_GAS
45177 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
45179 solaris_elf_asm_comdat_section (name, flags, decl);
45180 return;
45182 #endif
45184 default_elf_asm_named_section (name, flags, decl);
45186 #endif /* TARGET_SOLARIS */
45188 /* Return the mangling of TYPE if it is an extended fundamental type. */
45190 static const char *
45191 ix86_mangle_type (const_tree type)
45193 type = TYPE_MAIN_VARIANT (type);
45195 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
45196 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
45197 return NULL;
45199 switch (TYPE_MODE (type))
45201 case E_TFmode:
45202 /* __float128 is "g". */
45203 return "g";
45204 case E_XFmode:
45205 /* "long double" or __float80 is "e". */
45206 return "e";
45207 default:
45208 return NULL;
45212 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
45214 static tree
45215 ix86_stack_protect_guard (void)
45217 if (TARGET_SSP_TLS_GUARD)
45219 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
45220 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
45221 tree type = build_qualified_type (type_node, qual);
45222 tree t;
45224 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
45226 t = ix86_tls_stack_chk_guard_decl;
45228 if (t == NULL)
45230 rtx x;
45232 t = build_decl
45233 (UNKNOWN_LOCATION, VAR_DECL,
45234 get_identifier (ix86_stack_protector_guard_symbol_str),
45235 type);
45236 TREE_STATIC (t) = 1;
45237 TREE_PUBLIC (t) = 1;
45238 DECL_EXTERNAL (t) = 1;
45239 TREE_USED (t) = 1;
45240 TREE_THIS_VOLATILE (t) = 1;
45241 DECL_ARTIFICIAL (t) = 1;
45242 DECL_IGNORED_P (t) = 1;
45244 /* Do not share RTL as the declaration is visible outside of
45245 the current function. */
45246 x = DECL_RTL (t);
45247 RTX_FLAG (x, used) = 1;
45249 ix86_tls_stack_chk_guard_decl = t;
45252 else
45254 tree asptrtype = build_pointer_type (type);
45256 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
45257 t = build2 (MEM_REF, asptrtype, t,
45258 build_int_cst (asptrtype, 0));
45261 return t;
45264 return default_stack_protect_guard ();
45267 /* For 32-bit code we can save PIC register setup by using
45268 __stack_chk_fail_local hidden function instead of calling
45269 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
45270 register, so it is better to call __stack_chk_fail directly. */
45272 static tree ATTRIBUTE_UNUSED
45273 ix86_stack_protect_fail (void)
45275 return TARGET_64BIT
45276 ? default_external_stack_protect_fail ()
45277 : default_hidden_stack_protect_fail ();
45280 /* Select a format to encode pointers in exception handling data. CODE
45281 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
45282 true if the symbol may be affected by dynamic relocations.
45284 ??? All x86 object file formats are capable of representing this.
45285 After all, the relocation needed is the same as for the call insn.
45286 Whether or not a particular assembler allows us to enter such, I
45287 guess we'll have to see. */
45288 int
45289 asm_preferred_eh_data_format (int code, int global)
45291 if (flag_pic)
45293 int type = DW_EH_PE_sdata8;
45294 if (!TARGET_64BIT
45295 || ix86_cmodel == CM_SMALL_PIC
45296 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
45297 type = DW_EH_PE_sdata4;
45298 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
45300 if (ix86_cmodel == CM_SMALL
45301 || (ix86_cmodel == CM_MEDIUM && code))
45302 return DW_EH_PE_udata4;
45303 return DW_EH_PE_absptr;
45306 /* Expand copysign from SIGN to the positive value ABS_VALUE
45307 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
45308 the sign-bit. */
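/* A sketch of the bit manipulation performed below, in scalar notation
   (SIGN_BIT stands for the sign-bit mask that is built or passed in):
     result = abs_value | (sign & SIGN_BIT);
   ABS_VALUE is assumed to be non-negative already, so OR-ing in the
   isolated sign bit of SIGN implements copysign without branches.  */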
45309 static void
45310 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
45312 machine_mode mode = GET_MODE (sign);
45313 rtx sgn = gen_reg_rtx (mode);
45314 if (mask == NULL_RTX)
45316 machine_mode vmode;
45318 if (mode == SFmode)
45319 vmode = V4SFmode;
45320 else if (mode == DFmode)
45321 vmode = V2DFmode;
45322 else
45323 vmode = mode;
45325 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
45326 if (!VECTOR_MODE_P (mode))
45328 /* We need to generate a scalar mode mask in this case. */
45329 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45330 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45331 mask = gen_reg_rtx (mode);
45332 emit_insn (gen_rtx_SET (mask, tmp));
45335 else
45336 mask = gen_rtx_NOT (mode, mask);
45337 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
45338 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
45341 /* Expand fabs (OP0) and return a new rtx that holds the result. The
45342 mask for masking out the sign-bit is stored in *SMASK, if that is
45343 non-null. */
45344 static rtx
45345 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45347 machine_mode vmode, mode = GET_MODE (op0);
45348 rtx xa, mask;
45350 xa = gen_reg_rtx (mode);
45351 if (mode == SFmode)
45352 vmode = V4SFmode;
45353 else if (mode == DFmode)
45354 vmode = V2DFmode;
45355 else
45356 vmode = mode;
45357 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45358 if (!VECTOR_MODE_P (mode))
45360 /* We need to generate a scalar mode mask in this case. */
45361 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45362 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45363 mask = gen_reg_rtx (mode);
45364 emit_insn (gen_rtx_SET (mask, tmp));
45366 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45368 if (smask)
45369 *smask = mask;
45371 return xa;
45374 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45375 swapping the operands if SWAP_OPERANDS is true. The expanded
45376 code is a forward jump to a newly created label in case the
45377 comparison is true. The generated label rtx is returned. */
45378 static rtx_code_label *
45379 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45380 bool swap_operands)
45382 bool unordered_compare = ix86_unordered_fp_compare (code);
45383 rtx_code_label *label;
45384 rtx tmp, reg;
45386 if (swap_operands)
45387 std::swap (op0, op1);
45389 label = gen_label_rtx ();
45390 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
45391 if (unordered_compare)
45392 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
45393 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
45394 emit_insn (gen_rtx_SET (reg, tmp));
45395 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
45396 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45397 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45398 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45399 JUMP_LABEL (tmp) = label;
45401 return label;
45404 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
45405 using comparison code CODE. Operands are swapped for the comparison if
45406 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
45407 static rtx
45408 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45409 bool swap_operands)
45411 rtx (*insn)(rtx, rtx, rtx, rtx);
45412 machine_mode mode = GET_MODE (op0);
45413 rtx mask = gen_reg_rtx (mode);
45415 if (swap_operands)
45416 std::swap (op0, op1);
45418 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45420 emit_insn (insn (mask, op0, op1,
45421 gen_rtx_fmt_ee (code, mode, op0, op1)));
45422 return mask;
45425 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45426 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
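/* The expanders below rely on the classic 2**52 trick (sketched here
   assuming the default round-to-nearest mode): for any 0 <= x < 2**52,
   the double x + 2**52 has a unit in the last place of exactly 1.0, so
     (x + 2**52) - 2**52  ==  x rounded to the nearest integer
   and likewise with 2**23 for SFmode.  */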
45427 static rtx
45428 ix86_gen_TWO52 (machine_mode mode)
45430 REAL_VALUE_TYPE TWO52r;
45431 rtx TWO52;
45433 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45434 TWO52 = const_double_from_real_value (TWO52r, mode);
45435 TWO52 = force_reg (mode, TWO52);
45437 return TWO52;
45440 /* Expand SSE sequence for computing lround from OP1 storing
45441 into OP0. */
45442 void
45443 ix86_expand_lround (rtx op0, rtx op1)
45445 /* C code for the stuff we're doing below:
45446 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
45447 return (long)tmp;
45449 machine_mode mode = GET_MODE (op1);
45450 const struct real_format *fmt;
45451 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45452 rtx adj;
45454 /* load nextafter (0.5, 0.0) */
45455 fmt = REAL_MODE_FORMAT (mode);
45456 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45457 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
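/* Using the value just below 0.5 instead of 0.5 itself avoids spurious
   rounding in the addition: for the largest double smaller than 0.5,
   adding exactly 0.5 would round the sum up to 1.0 and give lround == 1
   where the correct result is 0.  */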
45459 /* adj = copysign (0.5, op1) */
45460 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
45461 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
45463 /* adj = op1 + adj */
45464 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
45466 /* op0 = (imode)adj */
45467 expand_fix (op0, adj, 0);
45470 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
45471 into OP0. */
45472 void
45473 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
45475 /* C code for the stuff we're doing below (for do_floor):
45476 xi = (long)op1;
45477 xi -= (double)xi > op1 ? 1 : 0;
45478 return xi;
45480 machine_mode fmode = GET_MODE (op1);
45481 machine_mode imode = GET_MODE (op0);
45482 rtx ireg, freg, tmp;
45483 rtx_code_label *label;
45485 /* ireg = (long)op1 */
45486 ireg = gen_reg_rtx (imode);
45487 expand_fix (ireg, op1, 0);
45489 /* freg = (double)ireg */
45490 freg = gen_reg_rtx (fmode);
45491 expand_float (freg, ireg, 0);
45493 /* ireg = (freg > op1) ? ireg - 1 : ireg */
45494 label = ix86_expand_sse_compare_and_jump (UNLE,
45495 freg, op1, !do_floor);
45496 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
45497 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
45498 emit_move_insn (ireg, tmp);
45500 emit_label (label);
45501 LABEL_NUSES (label) = 1;
45503 emit_move_insn (op0, ireg);
45506 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
45507 void
45508 ix86_expand_rint (rtx operand0, rtx operand1)
45510 /* C code for the stuff we're doing below:
45511 xa = fabs (operand1);
45512 if (!isless (xa, 2**52))
45513 return operand1;
45514 two52 = 2**52;
45515 if (flag_rounding_math)
45517 two52 = copysign (two52, operand1);
45518 xa = operand1;
45520 xa = xa + two52 - two52;
45521 return copysign (xa, operand1);
45523 machine_mode mode = GET_MODE (operand0);
45524 rtx res, xa, TWO52, two52, mask;
45525 rtx_code_label *label;
45527 res = gen_reg_rtx (mode);
45528 emit_move_insn (res, operand1);
45530 /* xa = abs (operand1) */
45531 xa = ix86_expand_sse_fabs (res, &mask);
45533 /* if (!isless (xa, TWO52)) goto label; */
45534 TWO52 = ix86_gen_TWO52 (mode);
45535 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45537 two52 = TWO52;
45538 if (flag_rounding_math)
45540 two52 = gen_reg_rtx (mode);
45541 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
45542 xa = res;
45545 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
45546 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
45548 ix86_sse_copysign_to_positive (res, xa, res, mask);
45550 emit_label (label);
45551 LABEL_NUSES (label) = 1;
45553 emit_move_insn (operand0, res);
45556 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45557 into OPERAND0, without relying on DImode truncation (usable for DFmode on 32-bit targets). */
45558 void
45559 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
45561 /* C code for the stuff we expand below.
45562 double xa = fabs (x), x2;
45563 if (!isless (xa, TWO52))
45564 return x;
45565 xa = xa + TWO52 - TWO52;
45566 x2 = copysign (xa, x);
45567 Compensate. Floor:
45568 if (x2 > x)
45569 x2 -= 1;
45570 Compensate. Ceil:
45571 if (x2 < x)
45572 x2 -= -1;
45573 return x2;
45575 machine_mode mode = GET_MODE (operand0);
45576 rtx xa, TWO52, tmp, one, res, mask;
45577 rtx_code_label *label;
45579 TWO52 = ix86_gen_TWO52 (mode);
45581 /* Temporary for holding the result, initialized to the input
45582 operand to ease control flow. */
45583 res = gen_reg_rtx (mode);
45584 emit_move_insn (res, operand1);
45586 /* xa = abs (operand1) */
45587 xa = ix86_expand_sse_fabs (res, &mask);
45589 /* if (!isless (xa, TWO52)) goto label; */
45590 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45592 /* xa = xa + TWO52 - TWO52; */
45593 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45594 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45596 /* xa = copysign (xa, operand1) */
45597 ix86_sse_copysign_to_positive (xa, xa, res, mask);
45599 /* generate 1.0 or -1.0 */
45600 one = force_reg (mode,
45601 const_double_from_real_value (do_floor
45602 ? dconst1 : dconstm1, mode));
45604 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45605 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45606 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45607 /* We always need to subtract here to preserve signed zero. */
45608 tmp = expand_simple_binop (mode, MINUS,
45609 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45610 emit_move_insn (res, tmp);
45612 emit_label (label);
45613 LABEL_NUSES (label) = 1;
45615 emit_move_insn (operand0, res);
45618 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45619 into OPERAND0. */
45620 void
45621 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
45623 /* C code for the stuff we expand below.
45624 double xa = fabs (x), x2;
45625 if (!isless (xa, TWO52))
45626 return x;
45627 x2 = (double)(long)x;
45628 Compensate. Floor:
45629 if (x2 > x)
45630 x2 -= 1;
45631 Compensate. Ceil:
45632 if (x2 < x)
45633 x2 += 1;
45634 if (HONOR_SIGNED_ZEROS (mode))
45635 return copysign (x2, x);
45636 return x2;
45638 machine_mode mode = GET_MODE (operand0);
45639 rtx xa, xi, TWO52, tmp, one, res, mask;
45640 rtx_code_label *label;
45642 TWO52 = ix86_gen_TWO52 (mode);
45644 /* Temporary for holding the result, initialized to the input
45645 operand to ease control flow. */
45646 res = gen_reg_rtx (mode);
45647 emit_move_insn (res, operand1);
45649 /* xa = abs (operand1) */
45650 xa = ix86_expand_sse_fabs (res, &mask);
45652 /* if (!isless (xa, TWO52)) goto label; */
45653 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45655 /* xa = (double)(long)x */
45656 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45657 expand_fix (xi, res, 0);
45658 expand_float (xa, xi, 0);
45660 /* generate 1.0 */
45661 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45663 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45664 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45665 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45666 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
45667 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45668 emit_move_insn (res, tmp);
45670 if (HONOR_SIGNED_ZEROS (mode))
45671 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45673 emit_label (label);
45674 LABEL_NUSES (label) = 1;
45676 emit_move_insn (operand0, res);
45679 /* Expand SSE sequence for computing round from OPERAND1 storing
45680 into OPERAND0. This sequence works without relying on DImode truncation
45681 via cvttsd2siq, which is only available on 64-bit targets. */
45682 void
45683 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
45685 /* C code for the stuff we expand below.
45686 double xa = fabs (x), xa2, x2;
45687 if (!isless (xa, TWO52))
45688 return x;
45689 Using the absolute value and copying back sign makes
45690 -0.0 -> -0.0 correct.
45691 xa2 = xa + TWO52 - TWO52;
45692 Compensate.
45693 dxa = xa2 - xa;
45694 if (dxa <= -0.5)
45695 xa2 += 1;
45696 else if (dxa > 0.5)
45697 xa2 -= 1;
45698 x2 = copysign (xa2, x);
45699 return x2;
45701 machine_mode mode = GET_MODE (operand0);
45702 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
45703 rtx_code_label *label;
45705 TWO52 = ix86_gen_TWO52 (mode);
45707 /* Temporary for holding the result, initialized to the input
45708 operand to ease control flow. */
45709 res = gen_reg_rtx (mode);
45710 emit_move_insn (res, operand1);
45712 /* xa = abs (operand1) */
45713 xa = ix86_expand_sse_fabs (res, &mask);
45715 /* if (!isless (xa, TWO52)) goto label; */
45716 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45718 /* xa2 = xa + TWO52 - TWO52; */
45719 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45720 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
45722 /* dxa = xa2 - xa; */
45723 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
45725 /* generate 0.5, 1.0 and -0.5 */
45726 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
45727 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
45728 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
45729 0, OPTAB_DIRECT);
45731 /* Compensate. */
45732 tmp = gen_reg_rtx (mode);
45733 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
45734 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
45735 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45736 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45737 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
45738 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
45739 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45740 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45742 /* res = copysign (xa2, operand1) */
45743 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
45745 emit_label (label);
45746 LABEL_NUSES (label) = 1;
45748 emit_move_insn (operand0, res);
45751 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45752 into OPERAND0. */
45753 void
45754 ix86_expand_trunc (rtx operand0, rtx operand1)
45756 /* C code for SSE variant we expand below.
45757 double xa = fabs (x), x2;
45758 if (!isless (xa, TWO52))
45759 return x;
45760 x2 = (double)(long)x;
45761 if (HONOR_SIGNED_ZEROS (mode))
45762 return copysign (x2, x);
45763 return x2;
45765 machine_mode mode = GET_MODE (operand0);
45766 rtx xa, xi, TWO52, res, mask;
45767 rtx_code_label *label;
45769 TWO52 = ix86_gen_TWO52 (mode);
45771 /* Temporary for holding the result, initialized to the input
45772 operand to ease control flow. */
45773 res = gen_reg_rtx (mode);
45774 emit_move_insn (res, operand1);
45776 /* xa = abs (operand1) */
45777 xa = ix86_expand_sse_fabs (res, &mask);
45779 /* if (!isless (xa, TWO52)) goto label; */
45780 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45782 /* x = (double)(long)x */
45783 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45784 expand_fix (xi, res, 0);
45785 expand_float (res, xi, 0);
45787 if (HONOR_SIGNED_ZEROS (mode))
45788 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45790 emit_label (label);
45791 LABEL_NUSES (label) = 1;
45793 emit_move_insn (operand0, res);
45796 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45797 into OPERAND0, without relying on DImode truncation (usable for DFmode on 32-bit targets). */
45798 void
45799 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
45801 machine_mode mode = GET_MODE (operand0);
45802 rtx xa, mask, TWO52, one, res, smask, tmp;
45803 rtx_code_label *label;
45805 /* C code for SSE variant we expand below.
45806 double xa = fabs (x), x2;
45807 if (!isless (xa, TWO52))
45808 return x;
45809 xa2 = xa + TWO52 - TWO52;
45810 Compensate:
45811 if (xa2 > xa)
45812 xa2 -= 1.0;
45813 x2 = copysign (xa2, x);
45814 return x2;
45817 TWO52 = ix86_gen_TWO52 (mode);
45819 /* Temporary for holding the result, initialized to the input
45820 operand to ease control flow. */
45821 res = gen_reg_rtx (mode);
45822 emit_move_insn (res, operand1);
45824 /* xa = abs (operand1) */
45825 xa = ix86_expand_sse_fabs (res, &smask);
45827 /* if (!isless (xa, TWO52)) goto label; */
45828 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45830 /* res = xa + TWO52 - TWO52; */
45831 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45832 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
45833 emit_move_insn (res, tmp);
45835 /* generate 1.0 */
45836 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45838 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
45839 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
45840 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
45841 tmp = expand_simple_binop (mode, MINUS,
45842 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
45843 emit_move_insn (res, tmp);
45845 /* res = copysign (res, operand1) */
45846 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
45848 emit_label (label);
45849 LABEL_NUSES (label) = 1;
45851 emit_move_insn (operand0, res);
45854 /* Expand SSE sequence for computing round from OPERAND1 storing
45855 into OPERAND0. */
45856 void
45857 ix86_expand_round (rtx operand0, rtx operand1)
45859 /* C code for the stuff we're doing below:
45860 double xa = fabs (x);
45861 if (!isless (xa, TWO52))
45862 return x;
45863 xa = (double)(long)(xa + nextafter (0.5, 0.0));
45864 return copysign (xa, x);
45866 machine_mode mode = GET_MODE (operand0);
45867 rtx res, TWO52, xa, xi, half, mask;
45868 rtx_code_label *label;
45869 const struct real_format *fmt;
45870 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45872 /* Temporary for holding the result, initialized to the input
45873 operand to ease control flow. */
45874 res = gen_reg_rtx (mode);
45875 emit_move_insn (res, operand1);
45877 TWO52 = ix86_gen_TWO52 (mode);
45878 xa = ix86_expand_sse_fabs (res, &mask);
45879 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45881 /* load nextafter (0.5, 0.0) */
45882 fmt = REAL_MODE_FORMAT (mode);
45883 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45884 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
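/* pred_half = 0.5 - 2**(-p-1) is the largest representable value below
0.5, i.e. nextafter (0.5, 0.0).  Using it instead of 0.5 avoids rounding
up inputs just below one half: e.g. for x = 0.49999999999999994 (the
largest double below 0.5), x + 0.5 would round to 1.0 and truncate to 1,
whereas x + pred_half stays below 1.0 and truncates to 0.  */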
45886 /* xa = xa + 0.5 */
45887 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
45888 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
45890 /* xa = (double)(int64_t)xa */
45891 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45892 expand_fix (xi, xa, 0);
45893 expand_float (xa, xi, 0);
45895 /* res = copysign (xa, operand1) */
45896 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
45898 emit_label (label);
45899 LABEL_NUSES (label) = 1;
45901 emit_move_insn (operand0, res);
45904 /* Expand SSE sequence for computing round
45905 from OP1 storing into OP0 using sse4 round insn. */
45906 void
45907 ix86_expand_round_sse4 (rtx op0, rtx op1)
45909 machine_mode mode = GET_MODE (op0);
45910 rtx e1, e2, res, half;
45911 const struct real_format *fmt;
45912 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45913 rtx (*gen_copysign) (rtx, rtx, rtx);
45914 rtx (*gen_round) (rtx, rtx, rtx);
45916 switch (mode)
45918 case E_SFmode:
45919 gen_copysign = gen_copysignsf3;
45920 gen_round = gen_sse4_1_roundsf2;
45921 break;
45922 case E_DFmode:
45923 gen_copysign = gen_copysigndf3;
45924 gen_round = gen_sse4_1_rounddf2;
45925 break;
45926 default:
45927 gcc_unreachable ();
45930 /* round (a) = trunc (a + copysign (0.5, a)) */
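/* E.g. round (2.7) = trunc (2.7 + 0.5) = trunc (3.2) = 3 and
round (-2.7) = trunc (-2.7 - 0.5) = trunc (-3.2) = -3.  The 0.5
constant is actually nextafter (0.5, 0.0), see below.  */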
45932 /* load nextafter (0.5, 0.0) */
45933 fmt = REAL_MODE_FORMAT (mode);
45934 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45935 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45936 half = const_double_from_real_value (pred_half, mode);
45938 /* e1 = copysign (0.5, op1) */
45939 e1 = gen_reg_rtx (mode);
45940 emit_insn (gen_copysign (e1, half, op1));
45942 /* e2 = op1 + e1 */
45943 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
45945 /* res = trunc (e2) */
45946 res = gen_reg_rtx (mode);
45947 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
45949 emit_move_insn (op0, res);
45953 /* Table of valid machine attributes. */
45954 static const struct attribute_spec ix86_attribute_table[] =
45956 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
45957 affects_type_identity, handler, exclude } */
45958 /* Stdcall attribute says callee is responsible for popping arguments
45959 if they are not variable. */
45960 { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45961 NULL },
45962 /* Fastcall attribute says callee is responsible for popping arguments
45963 if they are not variable. */
45964 { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45965 NULL },
45966 /* Thiscall attribute says callee is responsible for popping arguments
45967 if they are not variable. */
45968 { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45969 NULL },
45970 /* Cdecl attribute says the callee is a normal C declaration */
45971 { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45972 NULL },
45973 /* Regparm attribute specifies how many integer arguments are to be
45974 passed in registers. */
45975 { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute,
45976 NULL },
45977 /* Sseregparm attribute says we are using x86_64 calling conventions
45978 for FP arguments. */
45979 { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45980 NULL },
45981 /* The transactional memory builtins are implicitly regparm or fastcall
45982 depending on the ABI. Override the generic do-nothing attribute that
45983 these builtins were declared with. */
45984 { "*tm regparm", 0, 0, false, true, true, true,
45985 ix86_handle_tm_regparm_attribute, NULL },
45986 /* force_align_arg_pointer says this function realigns the stack at entry. */
45987 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
45988 false, true, true, false, ix86_handle_force_align_arg_pointer_attribute,
45989 NULL },
45990 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
45991 { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute,
45992 NULL },
45993 { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute,
45994 NULL },
45995 { "shared", 0, 0, true, false, false, false,
45996 ix86_handle_shared_attribute, NULL },
45997 #endif
45998 { "ms_struct", 0, 0, false, false, false, false,
45999 ix86_handle_struct_attribute, NULL },
46000 { "gcc_struct", 0, 0, false, false, false, false,
46001 ix86_handle_struct_attribute, NULL },
46002 #ifdef SUBTARGET_ATTRIBUTE_TABLE
46003 SUBTARGET_ATTRIBUTE_TABLE,
46004 #endif
46005 /* ms_abi and sysv_abi calling convention function attributes. */
46006 { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL },
46007 { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute,
46008 NULL },
46009 { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
46010 { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
46011 { "ms_hook_prologue", 0, 0, true, false, false, false,
46012 ix86_handle_fndecl_attribute, NULL },
46013 { "callee_pop_aggregate_return", 1, 1, false, true, true, true,
46014 ix86_handle_callee_pop_aggregate_return, NULL },
46015 { "interrupt", 0, 0, false, true, true, false,
46016 ix86_handle_interrupt_attribute, NULL },
46017 { "no_caller_saved_registers", 0, 0, false, true, true, false,
46018 ix86_handle_no_caller_saved_registers_attribute, NULL },
46019 { "naked", 0, 0, true, false, false, false,
46020 ix86_handle_fndecl_attribute, NULL },
46021 { "indirect_branch", 1, 1, true, false, false, false,
46022 ix86_handle_fndecl_attribute, NULL },
46023 { "function_return", 1, 1, true, false, false, false,
46024 ix86_handle_fndecl_attribute, NULL },
46026 /* End element. */
46027 { NULL, 0, 0, false, false, false, false, NULL, NULL }
46030 /* Implement targetm.vectorize.builtin_vectorization_cost. */
46031 static int
46032 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
46033 tree vectype, int)
46035 bool fp = false;
46036 machine_mode mode = TImode;
46037 int index;
46038 if (vectype != NULL)
46040 fp = FLOAT_TYPE_P (vectype);
46041 mode = TYPE_MODE (vectype);
46044 switch (type_of_cost)
46046 case scalar_stmt:
46047 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
46049 case scalar_load:
46050 /* Load/store costs are relative to a register move, which costs 2. Recompute
46051 them to COSTS_N_INSNS so everything has the same base. */
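/* E.g. an int_load cost of 6 (three times the register-move cost of 2)
becomes COSTS_N_INSNS (6) / 2, i.e. COSTS_N_INSNS (3).  */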
46052 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
46053 : ix86_cost->int_load [2]) / 2;
46055 case scalar_store:
46056 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
46057 : ix86_cost->int_store [2]) / 2;
46059 case vector_stmt:
46060 return ix86_vec_cost (mode,
46061 fp ? ix86_cost->addss : ix86_cost->sse_op,
46062 true);
46064 case vector_load:
46065 index = sse_store_index (mode);
46066 /* See PR82713 - we may end up being called on a non-vector type. */
46067 if (index < 0)
46068 index = 2;
46069 return ix86_vec_cost (mode,
46070 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
46071 true);
46073 case vector_store:
46074 index = sse_store_index (mode);
46076 /* See PR82713 - we may end up being called on a non-vector type. */
46076 if (index < 0)
46077 index = 2;
46078 return ix86_vec_cost (mode,
46079 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
46080 true);
46082 case vec_to_scalar:
46083 case scalar_to_vec:
46084 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
46086 /* We should have separate costs for unaligned loads and gather/scatter.
46087 Do that incrementally. */
46088 case unaligned_load:
46089 index = sse_store_index (mode);
46091 /* See PR82713 - we may end up being called on a non-vector type. */
46091 if (index < 0)
46092 index = 2;
46093 return ix86_vec_cost (mode,
46094 COSTS_N_INSNS
46095 (ix86_cost->sse_unaligned_load[index]) / 2,
46096 true);
46098 case unaligned_store:
46099 index = sse_store_index (mode);
46101 /* See PR82713 - we may end up being called on a non-vector type. */
46101 if (index < 0)
46102 index = 2;
46103 return ix86_vec_cost (mode,
46104 COSTS_N_INSNS
46105 (ix86_cost->sse_unaligned_store[index]) / 2,
46106 true);
46108 case vector_gather_load:
46109 return ix86_vec_cost (mode,
46110 COSTS_N_INSNS
46111 (ix86_cost->gather_static
46112 + ix86_cost->gather_per_elt
46113 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
46114 true);
46116 case vector_scatter_store:
46117 return ix86_vec_cost (mode,
46118 COSTS_N_INSNS
46119 (ix86_cost->scatter_static
46120 + ix86_cost->scatter_per_elt
46121 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
46122 true);
46124 case cond_branch_taken:
46125 return ix86_cost->cond_taken_branch_cost;
46127 case cond_branch_not_taken:
46128 return ix86_cost->cond_not_taken_branch_cost;
46130 case vec_perm:
46131 case vec_promote_demote:
46132 return ix86_vec_cost (mode,
46133 ix86_cost->sse_op, true);
46135 case vec_construct:
46137 /* N element inserts. */
46138 int cost = ix86_vec_cost (mode, ix86_cost->sse_op, false);
46139 /* One vinserti128 for combining two SSE vectors for AVX256. */
46140 if (GET_MODE_BITSIZE (mode) == 256)
46141 cost += ix86_vec_cost (mode, ix86_cost->addss, true);
46142 /* One vinserti64x4 and two vinserti128 for combining SSE
46143 and AVX256 vectors to AVX512. */
46144 else if (GET_MODE_BITSIZE (mode) == 512)
46145 cost += 3 * ix86_vec_cost (mode, ix86_cost->addss, true);
46146 return cost;
46149 default:
46150 gcc_unreachable ();
46154 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
46155 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
46156 insn every time. */
46158 static GTY(()) rtx_insn *vselect_insn;
46160 /* Initialize vselect_insn. */
46162 static void
46163 init_vselect_insn (void)
46165 unsigned i;
46166 rtx x;
46168 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
46169 for (i = 0; i < MAX_VECT_LEN; ++i)
46170 XVECEXP (x, 0, i) = const0_rtx;
46171 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
46172 const0_rtx), x);
46173 x = gen_rtx_SET (const0_rtx, x);
46174 start_sequence ();
46175 vselect_insn = emit_insn (x);
46176 end_sequence ();
46179 /* Construct (set target (vec_select op0 (parallel perm))) and
46180 return true if that's a valid instruction in the active ISA. */
46182 static bool
46183 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
46184 unsigned nelt, bool testing_p)
46186 unsigned int i;
46187 rtx x, save_vconcat;
46188 int icode;
46190 if (vselect_insn == NULL_RTX)
46191 init_vselect_insn ();
46193 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
46194 PUT_NUM_ELEM (XVEC (x, 0), nelt);
46195 for (i = 0; i < nelt; ++i)
46196 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
46197 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46198 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
46199 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
46200 SET_DEST (PATTERN (vselect_insn)) = target;
46201 icode = recog_memoized (vselect_insn);
46203 if (icode >= 0 && !testing_p)
46204 emit_insn (copy_rtx (PATTERN (vselect_insn)));
46206 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
46207 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
46208 INSN_CODE (vselect_insn) = -1;
46210 return icode >= 0;
46213 /* Similar, but generate a vec_concat from op0 and op1 as well. */
46215 static bool
46216 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
46217 const unsigned char *perm, unsigned nelt,
46218 bool testing_p)
46220 machine_mode v2mode;
46221 rtx x;
46222 bool ok;
46224 if (vselect_insn == NULL_RTX)
46225 init_vselect_insn ();
46227 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
46228 return false;
46229 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46230 PUT_MODE (x, v2mode);
46231 XEXP (x, 0) = op0;
46232 XEXP (x, 1) = op1;
46233 ok = expand_vselect (target, x, perm, nelt, testing_p);
46234 XEXP (x, 0) = const0_rtx;
46235 XEXP (x, 1) = const0_rtx;
46236 return ok;
46239 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46240 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
46242 static bool
46243 expand_vec_perm_blend (struct expand_vec_perm_d *d)
46245 machine_mode mmode, vmode = d->vmode;
46246 unsigned i, mask, nelt = d->nelt;
46247 rtx target, op0, op1, maskop, x;
46248 rtx rperm[32], vperm;
46250 if (d->one_operand_p)
46251 return false;
46252 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
46253 && (TARGET_AVX512BW
46254 || GET_MODE_UNIT_SIZE (vmode) >= 4))
46256 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46258 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46260 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46262 else
46263 return false;
46265 /* This is a blend, not a permute. Elements must stay in their
46266 respective lanes. */
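/* For example, for V4SFmode the permutation {0, 5, 2, 7} is a blend
(element i is taken from lane i of either op0 or op1), whereas
{1, 5, 2, 7} is not, because element 0 would have to move across lanes.  */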
46267 for (i = 0; i < nelt; ++i)
46269 unsigned e = d->perm[i];
46270 if (!(e == i || e == i + nelt))
46271 return false;
46274 if (d->testing_p)
46275 return true;
46277 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
46278 decision should be extracted elsewhere, so that we only try that
46279 sequence once all budget==3 options have been tried. */
46280 target = d->target;
46281 op0 = d->op0;
46282 op1 = d->op1;
46283 mask = 0;
46285 switch (vmode)
46287 case E_V8DFmode:
46288 case E_V16SFmode:
46289 case E_V4DFmode:
46290 case E_V8SFmode:
46291 case E_V2DFmode:
46292 case E_V4SFmode:
46293 case E_V8HImode:
46294 case E_V8SImode:
46295 case E_V32HImode:
46296 case E_V64QImode:
46297 case E_V16SImode:
46298 case E_V8DImode:
46299 for (i = 0; i < nelt; ++i)
46300 mask |= (d->perm[i] >= nelt) << i;
46301 break;
46303 case E_V2DImode:
46304 for (i = 0; i < 2; ++i)
46305 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
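/* Each V2DImode element corresponds to four V8HImode elements, so the
selection bit is replicated into a 4-bit group: e.g. the permutation
{0, 3} (low element from op0, high element from op1) gives mask 0xf0.  */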
46306 vmode = V8HImode;
46307 goto do_subreg;
46309 case E_V4SImode:
46310 for (i = 0; i < 4; ++i)
46311 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46312 vmode = V8HImode;
46313 goto do_subreg;
46315 case E_V16QImode:
46316 /* See if bytes move in pairs so we can use pblendw with
46317 an immediate argument, rather than pblendvb with a vector
46318 argument. */
46319 for (i = 0; i < 16; i += 2)
46320 if (d->perm[i] + 1 != d->perm[i + 1])
46322 use_pblendvb:
46323 for (i = 0; i < nelt; ++i)
46324 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
46326 finish_pblendvb:
46327 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
46328 vperm = force_reg (vmode, vperm);
46330 if (GET_MODE_SIZE (vmode) == 16)
46331 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
46332 else
46333 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
46334 if (target != d->target)
46335 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46336 return true;
46339 for (i = 0; i < 8; ++i)
46340 mask |= (d->perm[i * 2] >= 16) << i;
46341 vmode = V8HImode;
46342 /* FALLTHRU */
46344 do_subreg:
46345 target = gen_reg_rtx (vmode);
46346 op0 = gen_lowpart (vmode, op0);
46347 op1 = gen_lowpart (vmode, op1);
46348 break;
46350 case E_V32QImode:
46351 /* See if bytes move in pairs. If not, vpblendvb must be used. */
46352 for (i = 0; i < 32; i += 2)
46353 if (d->perm[i] + 1 != d->perm[i + 1])
46354 goto use_pblendvb;
46355 /* See if bytes move in quadruplets. If yes, vpblendd
46356 with immediate can be used. */
46357 for (i = 0; i < 32; i += 4)
46358 if (d->perm[i] + 2 != d->perm[i + 2])
46359 break;
46360 if (i < 32)
46362 /* See if bytes move the same in both lanes. If yes,
46363 vpblendw with immediate can be used. */
46364 for (i = 0; i < 16; i += 2)
46365 if (d->perm[i] + 16 != d->perm[i + 16])
46366 goto use_pblendvb;
46368 /* Use vpblendw. */
46369 for (i = 0; i < 16; ++i)
46370 mask |= (d->perm[i * 2] >= 32) << i;
46371 vmode = V16HImode;
46372 goto do_subreg;
46375 /* Use vpblendd. */
46376 for (i = 0; i < 8; ++i)
46377 mask |= (d->perm[i * 4] >= 32) << i;
46378 vmode = V8SImode;
46379 goto do_subreg;
46381 case E_V16HImode:
46382 /* See if words move in pairs. If yes, vpblendd can be used. */
46383 for (i = 0; i < 16; i += 2)
46384 if (d->perm[i] + 1 != d->perm[i + 1])
46385 break;
46386 if (i < 16)
46388 /* See if words move the same in both lanes. If not,
46389 vpblendvb must be used. */
46390 for (i = 0; i < 8; i++)
46391 if (d->perm[i] + 8 != d->perm[i + 8])
46393 /* Use vpblendvb. */
46394 for (i = 0; i < 32; ++i)
46395 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
46397 vmode = V32QImode;
46398 nelt = 32;
46399 target = gen_reg_rtx (vmode);
46400 op0 = gen_lowpart (vmode, op0);
46401 op1 = gen_lowpart (vmode, op1);
46402 goto finish_pblendvb;
46405 /* Use vpblendw. */
46406 for (i = 0; i < 16; ++i)
46407 mask |= (d->perm[i] >= 16) << i;
46408 break;
46411 /* Use vpblendd. */
46412 for (i = 0; i < 8; ++i)
46413 mask |= (d->perm[i * 2] >= 16) << i;
46414 vmode = V8SImode;
46415 goto do_subreg;
46417 case E_V4DImode:
46418 /* Use vpblendd. */
46419 for (i = 0; i < 4; ++i)
46420 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46421 vmode = V8SImode;
46422 goto do_subreg;
46424 default:
46425 gcc_unreachable ();
46428 switch (vmode)
46430 case E_V8DFmode:
46431 case E_V8DImode:
46432 mmode = QImode;
46433 break;
46434 case E_V16SFmode:
46435 case E_V16SImode:
46436 mmode = HImode;
46437 break;
46438 case E_V32HImode:
46439 mmode = SImode;
46440 break;
46441 case E_V64QImode:
46442 mmode = DImode;
46443 break;
46444 default:
46445 mmode = VOIDmode;
46448 if (mmode != VOIDmode)
46449 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46450 else
46451 maskop = GEN_INT (mask);
46453 /* This matches five different patterns with the different modes. */
46454 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46455 x = gen_rtx_SET (target, x);
46456 emit_insn (x);
46457 if (target != d->target)
46458 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46460 return true;
46463 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46464 in terms of the variable form of vpermilps.
46466 Note that we will have already failed the immediate input vpermilps,
46467 which requires that the high and low part shuffle be identical; the
46468 variable form doesn't require that. */
46470 static bool
46471 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46473 rtx rperm[8], vperm;
46474 unsigned i;
46476 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46477 return false;
46479 /* We can only permute within the 128-bit lane. */
46480 for (i = 0; i < 8; ++i)
46482 unsigned e = d->perm[i];
46483 if (i < 4 ? e >= 4 : e < 4)
46484 return false;
46487 if (d->testing_p)
46488 return true;
46490 for (i = 0; i < 8; ++i)
46492 unsigned e = d->perm[i];
46494 /* Within each 128-bit lane, the elements of op0 are numbered
46495 from 0 and the elements of op1 are numbered from 4. */
46496 if (e >= 8 + 4)
46497 e -= 8;
46498 else if (e >= 4)
46499 e -= 4;
46501 rperm[i] = GEN_INT (e);
46504 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46505 vperm = force_reg (V8SImode, vperm);
46506 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46508 return true;
46511 /* Return true if permutation D can be performed as VMODE permutation
46512 instead. */
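/* For example, a V16QImode permutation in which bytes move in aligned
pairs, such as { 2, 3, 0, 1, ... }, is also expressible as a V8HImode
permutation ({ 1, 0, ... }), so the check succeeds for VMODE == V8HImode.  */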
46514 static bool
46515 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46517 unsigned int i, j, chunk;
46519 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46520 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46521 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46522 return false;
46524 if (GET_MODE_NUNITS (vmode) >= d->nelt)
46525 return true;
46527 chunk = d->nelt / GET_MODE_NUNITS (vmode);
46528 for (i = 0; i < d->nelt; i += chunk)
46529 if (d->perm[i] & (chunk - 1))
46530 return false;
46531 else
46532 for (j = 1; j < chunk; ++j)
46533 if (d->perm[i] + j != d->perm[i + j])
46534 return false;
46536 return true;
46539 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46540 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
46542 static bool
46543 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
46545 unsigned i, nelt, eltsz, mask;
46546 unsigned char perm[64];
46547 machine_mode vmode = V16QImode;
46548 rtx rperm[64], vperm, target, op0, op1;
46550 nelt = d->nelt;
46552 if (!d->one_operand_p)
46554 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
46556 if (TARGET_AVX2
46557 && valid_perm_using_mode_p (V2TImode, d))
46559 if (d->testing_p)
46560 return true;
46562 /* Use vperm2i128 insn. The pattern uses
46563 V4DImode instead of V2TImode. */
46564 target = d->target;
46565 if (d->vmode != V4DImode)
46566 target = gen_reg_rtx (V4DImode);
46567 op0 = gen_lowpart (V4DImode, d->op0);
46568 op1 = gen_lowpart (V4DImode, d->op1);
46569 rperm[0]
46570 = GEN_INT ((d->perm[0] / (nelt / 2))
46571 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
46572 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
46573 if (target != d->target)
46574 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46575 return true;
46577 return false;
46580 else
46582 if (GET_MODE_SIZE (d->vmode) == 16)
46584 if (!TARGET_SSSE3)
46585 return false;
46587 else if (GET_MODE_SIZE (d->vmode) == 32)
46589 if (!TARGET_AVX2)
46590 return false;
46592 /* V4DImode should be already handled through
46593 expand_vselect by vpermq instruction. */
46594 gcc_assert (d->vmode != V4DImode);
46596 vmode = V32QImode;
46597 if (d->vmode == V8SImode
46598 || d->vmode == V16HImode
46599 || d->vmode == V32QImode)
46601 /* First see if vpermq can be used for
46602 V8SImode/V16HImode/V32QImode. */
46603 if (valid_perm_using_mode_p (V4DImode, d))
46605 for (i = 0; i < 4; i++)
46606 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
46607 if (d->testing_p)
46608 return true;
46609 target = gen_reg_rtx (V4DImode);
46610 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
46611 perm, 4, false))
46613 emit_move_insn (d->target,
46614 gen_lowpart (d->vmode, target));
46615 return true;
46617 return false;
46620 /* Next see if vpermd can be used. */
46621 if (valid_perm_using_mode_p (V8SImode, d))
46622 vmode = V8SImode;
46624 /* Or if vpermps can be used. */
46625 else if (d->vmode == V8SFmode)
46626 vmode = V8SImode;
46628 if (vmode == V32QImode)
46630 /* vpshufb only works within lanes; it is not
46631 possible to shuffle bytes between the lanes. */
46632 for (i = 0; i < nelt; ++i)
46633 if ((d->perm[i] ^ i) & (nelt / 2))
46634 return false;
46637 else if (GET_MODE_SIZE (d->vmode) == 64)
46639 if (!TARGET_AVX512BW)
46640 return false;
46642 /* If vpermq didn't work, vpshufb won't work either. */
46643 if (d->vmode == V8DFmode || d->vmode == V8DImode)
46644 return false;
46646 vmode = V64QImode;
46647 if (d->vmode == V16SImode
46648 || d->vmode == V32HImode
46649 || d->vmode == V64QImode)
46651 /* First see if vpermq can be used for
46652 V16SImode/V32HImode/V64QImode. */
46653 if (valid_perm_using_mode_p (V8DImode, d))
46655 for (i = 0; i < 8; i++)
46656 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
46657 if (d->testing_p)
46658 return true;
46659 target = gen_reg_rtx (V8DImode);
46660 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
46661 perm, 8, false))
46663 emit_move_insn (d->target,
46664 gen_lowpart (d->vmode, target));
46665 return true;
46667 return false;
46670 /* Next see if vpermd can be used. */
46671 if (valid_perm_using_mode_p (V16SImode, d))
46672 vmode = V16SImode;
46674 /* Or if vpermps can be used. */
46675 else if (d->vmode == V16SFmode)
46676 vmode = V16SImode;
46677 if (vmode == V64QImode)
46679 /* vpshufb only works within lanes; it is not
46680 possible to shuffle bytes between the lanes. */
46681 for (i = 0; i < nelt; ++i)
46682 if ((d->perm[i] ^ i) & (nelt / 4))
46683 return false;
46686 else
46687 return false;
46690 if (d->testing_p)
46691 return true;
46693 if (vmode == V8SImode)
46694 for (i = 0; i < 8; ++i)
46695 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
46696 else if (vmode == V16SImode)
46697 for (i = 0; i < 16; ++i)
46698 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
46699 else
46701 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46702 if (!d->one_operand_p)
46703 mask = 2 * nelt - 1;
46704 else if (vmode == V16QImode)
46705 mask = nelt - 1;
46706 else if (vmode == V64QImode)
46707 mask = nelt / 4 - 1;
46708 else
46709 mask = nelt / 2 - 1;
46711 for (i = 0; i < nelt; ++i)
46713 unsigned j, e = d->perm[i] & mask;
46714 for (j = 0; j < eltsz; ++j)
46715 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
46719 vperm = gen_rtx_CONST_VECTOR (vmode,
46720 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
46721 vperm = force_reg (vmode, vperm);
46723 target = d->target;
46724 if (d->vmode != vmode)
46725 target = gen_reg_rtx (vmode);
46726 op0 = gen_lowpart (vmode, d->op0);
46727 if (d->one_operand_p)
46729 if (vmode == V16QImode)
46730 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
46731 else if (vmode == V32QImode)
46732 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
46733 else if (vmode == V64QImode)
46734 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
46735 else if (vmode == V8SFmode)
46736 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
46737 else if (vmode == V8SImode)
46738 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
46739 else if (vmode == V16SFmode)
46740 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
46741 else if (vmode == V16SImode)
46742 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
46743 else
46744 gcc_unreachable ();
46746 else
46748 op1 = gen_lowpart (vmode, d->op1);
46749 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
46751 if (target != d->target)
46752 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46754 return true;
46757 /* For V*[QHS]Imode permutations, check whether the same permutation
46758 can instead be performed in a 2x, 4x or 8x wider inner mode. */
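/* E.g. the V16QImode permutation { 2, 3, 0, 1, 6, 7, 4, 5, ... } moves
bytes in aligned pairs, so it is narrowed to the V8HImode permutation
{ 1, 0, 3, 2, ... }; the recursive call then tries to widen the mode
again as long as the elements keep moving in aligned pairs.  */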
46760 static bool
46761 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
46762 struct expand_vec_perm_d *nd)
46764 int i;
46765 machine_mode mode = VOIDmode;
46767 switch (d->vmode)
46769 case E_V16QImode: mode = V8HImode; break;
46770 case E_V32QImode: mode = V16HImode; break;
46771 case E_V64QImode: mode = V32HImode; break;
46772 case E_V8HImode: mode = V4SImode; break;
46773 case E_V16HImode: mode = V8SImode; break;
46774 case E_V32HImode: mode = V16SImode; break;
46775 case E_V4SImode: mode = V2DImode; break;
46776 case E_V8SImode: mode = V4DImode; break;
46777 case E_V16SImode: mode = V8DImode; break;
46778 default: return false;
46780 for (i = 0; i < d->nelt; i += 2)
46781 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
46782 return false;
46783 nd->vmode = mode;
46784 nd->nelt = d->nelt / 2;
46785 for (i = 0; i < nd->nelt; i++)
46786 nd->perm[i] = d->perm[2 * i] / 2;
46787 if (GET_MODE_INNER (mode) != DImode)
46788 canonicalize_vector_int_perm (nd, nd);
46789 if (nd != d)
46791 nd->one_operand_p = d->one_operand_p;
46792 nd->testing_p = d->testing_p;
46793 if (d->op0 == d->op1)
46794 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
46795 else
46797 nd->op0 = gen_lowpart (nd->vmode, d->op0);
46798 nd->op1 = gen_lowpart (nd->vmode, d->op1);
46800 if (d->testing_p)
46801 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
46802 else
46803 nd->target = gen_reg_rtx (nd->vmode);
46805 return true;
46808 /* Try to expand one-operand permutation with constant mask. */
46810 static bool
46811 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
46813 machine_mode mode = GET_MODE (d->op0);
46814 machine_mode maskmode = mode;
46815 rtx (*gen) (rtx, rtx, rtx) = NULL;
46816 rtx target, op0, mask;
46817 rtx vec[64];
46819 if (!rtx_equal_p (d->op0, d->op1))
46820 return false;
46822 if (!TARGET_AVX512F)
46823 return false;
46825 switch (mode)
46827 case E_V16SImode:
46828 gen = gen_avx512f_permvarv16si;
46829 break;
46830 case E_V16SFmode:
46831 gen = gen_avx512f_permvarv16sf;
46832 maskmode = V16SImode;
46833 break;
46834 case E_V8DImode:
46835 gen = gen_avx512f_permvarv8di;
46836 break;
46837 case E_V8DFmode:
46838 gen = gen_avx512f_permvarv8df;
46839 maskmode = V8DImode;
46840 break;
46841 default:
46842 return false;
46845 target = d->target;
46846 op0 = d->op0;
46847 for (int i = 0; i < d->nelt; ++i)
46848 vec[i] = GEN_INT (d->perm[i]);
46849 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
46850 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
46851 return true;
46854 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
46855 in a single instruction. */
46857 static bool
46858 expand_vec_perm_1 (struct expand_vec_perm_d *d)
46860 unsigned i, nelt = d->nelt;
46861 struct expand_vec_perm_d nd;
46863 /* Check plain VEC_SELECT first, because AVX has instructions that could
46864 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
46865 input where SEL+CONCAT may not. */
46866 if (d->one_operand_p)
46868 int mask = nelt - 1;
46869 bool identity_perm = true;
46870 bool broadcast_perm = true;
46872 for (i = 0; i < nelt; i++)
46874 nd.perm[i] = d->perm[i] & mask;
46875 if (nd.perm[i] != i)
46876 identity_perm = false;
46877 if (nd.perm[i])
46878 broadcast_perm = false;
46881 if (identity_perm)
46883 if (!d->testing_p)
46884 emit_move_insn (d->target, d->op0);
46885 return true;
46887 else if (broadcast_perm && TARGET_AVX2)
46889 /* Use vpbroadcast{b,w,d}. */
46890 rtx (*gen) (rtx, rtx) = NULL;
46891 switch (d->vmode)
46893 case E_V64QImode:
46894 if (TARGET_AVX512BW)
46895 gen = gen_avx512bw_vec_dupv64qi_1;
46896 break;
46897 case E_V32QImode:
46898 gen = gen_avx2_pbroadcastv32qi_1;
46899 break;
46900 case E_V32HImode:
46901 if (TARGET_AVX512BW)
46902 gen = gen_avx512bw_vec_dupv32hi_1;
46903 break;
46904 case E_V16HImode:
46905 gen = gen_avx2_pbroadcastv16hi_1;
46906 break;
46907 case E_V16SImode:
46908 if (TARGET_AVX512F)
46909 gen = gen_avx512f_vec_dupv16si_1;
46910 break;
46911 case E_V8SImode:
46912 gen = gen_avx2_pbroadcastv8si_1;
46913 break;
46914 case E_V16QImode:
46915 gen = gen_avx2_pbroadcastv16qi;
46916 break;
46917 case E_V8HImode:
46918 gen = gen_avx2_pbroadcastv8hi;
46919 break;
46920 case E_V16SFmode:
46921 if (TARGET_AVX512F)
46922 gen = gen_avx512f_vec_dupv16sf_1;
46923 break;
46924 case E_V8SFmode:
46925 gen = gen_avx2_vec_dupv8sf_1;
46926 break;
46927 case E_V8DFmode:
46928 if (TARGET_AVX512F)
46929 gen = gen_avx512f_vec_dupv8df_1;
46930 break;
46931 case E_V8DImode:
46932 if (TARGET_AVX512F)
46933 gen = gen_avx512f_vec_dupv8di_1;
46934 break;
46935 /* For other modes prefer other shuffles this function creates. */
46936 default: break;
46938 if (gen != NULL)
46940 if (!d->testing_p)
46941 emit_insn (gen (d->target, d->op0));
46942 return true;
46946 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
46947 return true;
46949 /* There are plenty of patterns in sse.md that are written for
46950 SEL+CONCAT and are not replicated for a single op. Perhaps
46951 that should be changed, to avoid the nastiness here. */
46953 /* Recognize interleave style patterns, which means incrementing
46954 every other permutation operand. */
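/* E.g. the one-operand V4SImode permutation { 0, 0, 1, 1 } becomes
nd.perm = { 0, 4, 1, 5 }, which is the interleave-low (punpckldq)
of the operand with itself.  */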
46955 for (i = 0; i < nelt; i += 2)
46957 nd.perm[i] = d->perm[i] & mask;
46958 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
46960 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46961 d->testing_p))
46962 return true;
46964 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
46965 if (nelt >= 4)
46967 for (i = 0; i < nelt; i += 4)
46969 nd.perm[i + 0] = d->perm[i + 0] & mask;
46970 nd.perm[i + 1] = d->perm[i + 1] & mask;
46971 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
46972 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
46975 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46976 d->testing_p))
46977 return true;
46981 /* Finally, try the fully general two operand permute. */
46982 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
46983 d->testing_p))
46984 return true;
46986 /* Recognize interleave style patterns with reversed operands. */
46987 if (!d->one_operand_p)
46989 for (i = 0; i < nelt; ++i)
46991 unsigned e = d->perm[i];
46992 if (e >= nelt)
46993 e -= nelt;
46994 else
46995 e += nelt;
46996 nd.perm[i] = e;
46999 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
47000 d->testing_p))
47001 return true;
47004 /* Try the SSE4.1 blend variable merge instructions. */
47005 if (expand_vec_perm_blend (d))
47006 return true;
47008 /* Try one of the AVX vpermil variable permutations. */
47009 if (expand_vec_perm_vpermil (d))
47010 return true;
47012 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
47013 vpshufb, vpermd, vpermps or vpermq variable permutation. */
47014 if (expand_vec_perm_pshufb (d))
47015 return true;
47017 /* Try the AVX2 vpalignr instruction. */
47018 if (expand_vec_perm_palignr (d, true))
47019 return true;
47021 /* Try the AVX512F vperm{s,d} instructions. */
47022 if (ix86_expand_vec_one_operand_perm_avx512 (d))
47023 return true;
47025 /* Try the AVX512F vpermt2/vpermi2 instructions. */
47026 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
47027 return true;
47029 /* See if we can get the same permutation in different vector integer
47030 mode. */
47031 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47033 if (!d->testing_p)
47034 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47035 return true;
47037 return false;
47040 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47041 in terms of a pair of pshuflw + pshufhw instructions. */
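/* E.g. the V8HImode permutation { 2, 0, 3, 1, 7, 5, 6, 4 } never moves
a word across the 64-bit halves, so it can be done as pshuflw with
selector { 2, 0, 3, 1 } followed by pshufhw with selector { 7, 5, 6, 4 }
(i.e. { 3, 1, 2, 0 } within the high half).  */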
47043 static bool
47044 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
47046 unsigned char perm2[MAX_VECT_LEN];
47047 unsigned i;
47048 bool ok;
47050 if (d->vmode != V8HImode || !d->one_operand_p)
47051 return false;
47053 /* The two permutations only operate in 64-bit lanes. */
47054 for (i = 0; i < 4; ++i)
47055 if (d->perm[i] >= 4)
47056 return false;
47057 for (i = 4; i < 8; ++i)
47058 if (d->perm[i] < 4)
47059 return false;
47061 if (d->testing_p)
47062 return true;
47064 /* Emit the pshuflw. */
47065 memcpy (perm2, d->perm, 4);
47066 for (i = 4; i < 8; ++i)
47067 perm2[i] = i;
47068 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
47069 gcc_assert (ok);
47071 /* Emit the pshufhw. */
47072 memcpy (perm2 + 4, d->perm + 4, 4);
47073 for (i = 0; i < 4; ++i)
47074 perm2[i] = i;
47075 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
47076 gcc_assert (ok);
47078 return true;
47081 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47082 the permutation using the SSSE3 palignr instruction. This succeeds
47083 when all of the elements in PERM fit within one vector and we merely
47084 need to shift them down so that a single vector permutation has a
47085 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
47086 the vpalignr instruction itself can perform the requested permutation. */
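/* E.g. the two-operand V16QImode permutation { 3, 4, ..., 18 } selects
the 16 consecutive elements 3..18 of the concatenated operands
(elements 0-15 from op0, 16-31 from op1), so a single palignr with a
byte shift of 3 performs it; less regular permutations still benefit
when the shift brings all selected elements into a single vector for a
follow-up one-operand shuffle.  */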
47088 static bool
47089 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
47091 unsigned i, nelt = d->nelt;
47092 unsigned min, max, minswap, maxswap;
47093 bool in_order, ok, swap = false;
47094 rtx shift, target;
47095 struct expand_vec_perm_d dcopy;
47097 /* Even with AVX, palignr only operates on 128-bit vectors;
47098 with AVX2, palignr operates on both 128-bit lanes. */
47099 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47100 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
47101 return false;
47103 min = 2 * nelt;
47104 max = 0;
47105 minswap = 2 * nelt;
47106 maxswap = 0;
47107 for (i = 0; i < nelt; ++i)
47109 unsigned e = d->perm[i];
47110 unsigned eswap = d->perm[i] ^ nelt;
47111 if (GET_MODE_SIZE (d->vmode) == 32)
47113 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
47114 eswap = e ^ (nelt / 2);
47116 if (e < min)
47117 min = e;
47118 if (e > max)
47119 max = e;
47120 if (eswap < minswap)
47121 minswap = eswap;
47122 if (eswap > maxswap)
47123 maxswap = eswap;
47125 if (min == 0
47126 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
47128 if (d->one_operand_p
47129 || minswap == 0
47130 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
47131 ? nelt / 2 : nelt))
47132 return false;
47133 swap = true;
47134 min = minswap;
47135 max = maxswap;
47138 /* Given that we have SSSE3, we know we'll be able to implement the
47139 single operand permutation after the palignr with pshufb for
47140 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
47141 first. */
47142 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
47143 return true;
47145 dcopy = *d;
47146 if (swap)
47148 dcopy.op0 = d->op1;
47149 dcopy.op1 = d->op0;
47150 for (i = 0; i < nelt; ++i)
47151 dcopy.perm[i] ^= nelt;
47154 in_order = true;
47155 for (i = 0; i < nelt; ++i)
47157 unsigned e = dcopy.perm[i];
47158 if (GET_MODE_SIZE (d->vmode) == 32
47159 && e >= nelt
47160 && (e & (nelt / 2 - 1)) < min)
47161 e = e - min - (nelt / 2);
47162 else
47163 e = e - min;
47164 if (e != i)
47165 in_order = false;
47166 dcopy.perm[i] = e;
47168 dcopy.one_operand_p = true;
47170 if (single_insn_only_p && !in_order)
47171 return false;
47173 /* For AVX2, test whether we can permute the result in one instruction. */
47174 if (d->testing_p)
47176 if (in_order)
47177 return true;
47178 dcopy.op1 = dcopy.op0;
47179 return expand_vec_perm_1 (&dcopy);
47182 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
47183 if (GET_MODE_SIZE (d->vmode) == 16)
47185 target = gen_reg_rtx (TImode);
47186 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
47187 gen_lowpart (TImode, dcopy.op0), shift));
47189 else
47191 target = gen_reg_rtx (V2TImode);
47192 emit_insn (gen_avx2_palignrv2ti (target,
47193 gen_lowpart (V2TImode, dcopy.op1),
47194 gen_lowpart (V2TImode, dcopy.op0),
47195 shift));
47198 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
47200 /* Test for the degenerate case where the alignment by itself
47201 produces the desired permutation. */
47202 if (in_order)
47204 emit_move_insn (d->target, dcopy.op0);
47205 return true;
47208 ok = expand_vec_perm_1 (&dcopy);
47209 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
47211 return ok;
47214 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47215 the permutation using the SSE4_1 pblendv instruction. Potentially
47216 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and a pblendv. */
47218 static bool
47219 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
47221 unsigned i, which, nelt = d->nelt;
47222 struct expand_vec_perm_d dcopy, dcopy1;
47223 machine_mode vmode = d->vmode;
47224 bool ok;
47226 /* Use the same checks as in expand_vec_perm_blend. */
47227 if (d->one_operand_p)
47228 return false;
47229 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47231 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47233 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47235 else
47236 return false;
47238 /* Figure out which permutation elements do not stay in their
47239 respective lanes. */
47240 for (i = 0, which = 0; i < nelt; ++i)
47242 unsigned e = d->perm[i];
47243 if (e != i)
47244 which |= (e < nelt ? 1 : 2);
47246 /* We can pblend the part where elements do not stay in their
47247 respective lanes only when these elements are all taken from
47248 one half (one operand) of the permutation.
47249 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
47250 lanes, but both 8 and 9 are >= 8.
47251 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
47252 respective lanes, and 8 >= 8 but 2 is not. */
47253 if (which != 1 && which != 2)
47254 return false;
47255 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
47256 return true;
47258 /* First we apply a one-operand permutation to the part whose
47259 elements do not stay in their respective lanes. */
47260 dcopy = *d;
47261 if (which == 2)
47262 dcopy.op0 = dcopy.op1 = d->op1;
47263 else
47264 dcopy.op0 = dcopy.op1 = d->op0;
47265 if (!d->testing_p)
47266 dcopy.target = gen_reg_rtx (vmode);
47267 dcopy.one_operand_p = true;
47269 for (i = 0; i < nelt; ++i)
47270 dcopy.perm[i] = d->perm[i] & (nelt - 1);
47272 ok = expand_vec_perm_1 (&dcopy);
47273 if (GET_MODE_SIZE (vmode) != 16 && !ok)
47274 return false;
47275 else
47276 gcc_assert (ok);
47277 if (d->testing_p)
47278 return true;
47280 /* Next we put permuted elements into their positions. */
47281 dcopy1 = *d;
47282 if (which == 2)
47283 dcopy1.op1 = dcopy.target;
47284 else
47285 dcopy1.op0 = dcopy.target;
47287 for (i = 0; i < nelt; ++i)
47288 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
47290 ok = expand_vec_perm_blend (&dcopy1);
47291 gcc_assert (ok);
47293 return true;
47296 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
47298 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47299 a two vector permutation into a single vector permutation by using
47300 an interleave operation to merge the vectors. */
47302 static bool
47303 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
47305 struct expand_vec_perm_d dremap, dfinal;
47306 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
47307 unsigned HOST_WIDE_INT contents;
47308 unsigned char remap[2 * MAX_VECT_LEN];
47309 rtx_insn *seq;
47310 bool ok, same_halves = false;
47312 if (GET_MODE_SIZE (d->vmode) == 16)
47314 if (d->one_operand_p)
47315 return false;
47317 else if (GET_MODE_SIZE (d->vmode) == 32)
47319 if (!TARGET_AVX)
47320 return false;
47321 /* For 32-byte modes allow even d->one_operand_p.
47322 The lack of cross-lane shuffling in some instructions
47323 might prevent a single insn shuffle. */
47324 dfinal = *d;
47325 dfinal.testing_p = true;
47326 /* If expand_vec_perm_interleave3 can expand this into
47327 a 3-insn sequence, give up and let it be expanded that
47328 way. While that is one insn longer, it doesn't need a
47329 memory operand, and in the common case where the
47330 interleave low and high permutations with the same
47331 operands are adjacent, only 4 insns are needed for both
47332 after CSE. */
47333 if (expand_vec_perm_interleave3 (&dfinal))
47334 return false;
47336 else
47337 return false;
47339 /* Examine from whence the elements come. */
47340 contents = 0;
47341 for (i = 0; i < nelt; ++i)
47342 contents |= HOST_WIDE_INT_1U << d->perm[i];
47344 memset (remap, 0xff, sizeof (remap));
47345 dremap = *d;
47347 if (GET_MODE_SIZE (d->vmode) == 16)
47349 unsigned HOST_WIDE_INT h1, h2, h3, h4;
47351 /* Split the two input vectors into 4 halves. */
47352 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47353 h2 = h1 << nelt2;
47354 h3 = h2 << nelt2;
47355 h4 = h3 << nelt2;
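/* For V8HImode (nelt == 8, nelt2 == 4) the masks are h1 = 0x000f,
h2 = 0x00f0, h3 = 0x0f00 and h4 = 0xf000; bit e of "contents" is set
whenever element e of the concatenated operands is used, so e.g.
(contents & (h1 | h3)) == contents means only elements from the low
half of op0 and the low half of op1 are referenced (punpckl*).  */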
47357 /* If the elements are all from the low halves, use interleave low;
47358 similarly for interleave high. If the elements come from mismatched
47359 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
47360 if ((contents & (h1 | h3)) == contents)
47362 /* punpckl* */
47363 for (i = 0; i < nelt2; ++i)
47365 remap[i] = i * 2;
47366 remap[i + nelt] = i * 2 + 1;
47367 dremap.perm[i * 2] = i;
47368 dremap.perm[i * 2 + 1] = i + nelt;
47370 if (!TARGET_SSE2 && d->vmode == V4SImode)
47371 dremap.vmode = V4SFmode;
47373 else if ((contents & (h2 | h4)) == contents)
47375 /* punpckh* */
47376 for (i = 0; i < nelt2; ++i)
47378 remap[i + nelt2] = i * 2;
47379 remap[i + nelt + nelt2] = i * 2 + 1;
47380 dremap.perm[i * 2] = i + nelt2;
47381 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
47383 if (!TARGET_SSE2 && d->vmode == V4SImode)
47384 dremap.vmode = V4SFmode;
47386 else if ((contents & (h1 | h4)) == contents)
47388 /* shufps */
47389 for (i = 0; i < nelt2; ++i)
47391 remap[i] = i;
47392 remap[i + nelt + nelt2] = i + nelt2;
47393 dremap.perm[i] = i;
47394 dremap.perm[i + nelt2] = i + nelt + nelt2;
47396 if (nelt != 4)
47398 /* shufpd */
47399 dremap.vmode = V2DImode;
47400 dremap.nelt = 2;
47401 dremap.perm[0] = 0;
47402 dremap.perm[1] = 3;
47405 else if ((contents & (h2 | h3)) == contents)
47407 /* shufps */
47408 for (i = 0; i < nelt2; ++i)
47410 remap[i + nelt2] = i;
47411 remap[i + nelt] = i + nelt2;
47412 dremap.perm[i] = i + nelt2;
47413 dremap.perm[i + nelt2] = i + nelt;
47415 if (nelt != 4)
47417 /* shufpd */
47418 dremap.vmode = V2DImode;
47419 dremap.nelt = 2;
47420 dremap.perm[0] = 1;
47421 dremap.perm[1] = 2;
47424 else
47425 return false;
47427 else
47429 unsigned int nelt4 = nelt / 4, nzcnt = 0;
47430 unsigned HOST_WIDE_INT q[8];
47431 unsigned int nonzero_halves[4];
47433 /* Split the two input vectors into 8 quarters. */
47434 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47435 for (i = 1; i < 8; ++i)
47436 q[i] = q[0] << (nelt4 * i);
47437 for (i = 0; i < 4; ++i)
47438 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47440 nonzero_halves[nzcnt] = i;
47441 ++nzcnt;
47444 if (nzcnt == 1)
47446 gcc_assert (d->one_operand_p);
47447 nonzero_halves[1] = nonzero_halves[0];
47448 same_halves = true;
47450 else if (d->one_operand_p)
47452 gcc_assert (nonzero_halves[0] == 0);
47453 gcc_assert (nonzero_halves[1] == 1);
47456 if (nzcnt <= 2)
47458 if (d->perm[0] / nelt2 == nonzero_halves[1])
47460 /* Attempt to increase the likelihood that dfinal
47461 shuffle will be intra-lane. */
47462 std::swap (nonzero_halves[0], nonzero_halves[1]);
47465 /* vperm2f128 or vperm2i128. */
47466 for (i = 0; i < nelt2; ++i)
47468 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47469 remap[i + nonzero_halves[0] * nelt2] = i;
47470 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47471 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47474 if (d->vmode != V8SFmode
47475 && d->vmode != V4DFmode
47476 && d->vmode != V8SImode)
47478 dremap.vmode = V8SImode;
47479 dremap.nelt = 8;
47480 for (i = 0; i < 4; ++i)
47482 dremap.perm[i] = i + nonzero_halves[0] * 4;
47483 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47487 else if (d->one_operand_p)
47488 return false;
47489 else if (TARGET_AVX2
47490 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47492 /* vpunpckl* */
47493 for (i = 0; i < nelt4; ++i)
47495 remap[i] = i * 2;
47496 remap[i + nelt] = i * 2 + 1;
47497 remap[i + nelt2] = i * 2 + nelt2;
47498 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47499 dremap.perm[i * 2] = i;
47500 dremap.perm[i * 2 + 1] = i + nelt;
47501 dremap.perm[i * 2 + nelt2] = i + nelt2;
47502 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47505 else if (TARGET_AVX2
47506 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47508 /* vpunpckh* */
47509 for (i = 0; i < nelt4; ++i)
47511 remap[i + nelt4] = i * 2;
47512 remap[i + nelt + nelt4] = i * 2 + 1;
47513 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47514 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47515 dremap.perm[i * 2] = i + nelt4;
47516 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47517 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47518 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47521 else
47522 return false;
47525 /* Use the remapping array set up above to move the elements from their
47526 swizzled locations into their final destinations. */
47527 dfinal = *d;
47528 for (i = 0; i < nelt; ++i)
47530 unsigned e = remap[d->perm[i]];
47531 gcc_assert (e < nelt);
47532 /* If same_halves is true, both halves of the remapped vector are the
47533 same. Avoid cross-lane accesses if possible. */
47534 if (same_halves && i >= nelt2)
47536 gcc_assert (e < nelt2);
47537 dfinal.perm[i] = e + nelt2;
47539 else
47540 dfinal.perm[i] = e;
47542 if (!d->testing_p)
47544 dremap.target = gen_reg_rtx (dremap.vmode);
47545 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47547 dfinal.op1 = dfinal.op0;
47548 dfinal.one_operand_p = true;
47550 /* Test if the final remap can be done with a single insn. For V4SFmode or
47551 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
47552 start_sequence ();
47553 ok = expand_vec_perm_1 (&dfinal);
47554 seq = get_insns ();
47555 end_sequence ();
47557 if (!ok)
47558 return false;
47560 if (d->testing_p)
47561 return true;
47563 if (dremap.vmode != dfinal.vmode)
47565 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
47566 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
47569 ok = expand_vec_perm_1 (&dremap);
47570 gcc_assert (ok);
47572 emit_insn (seq);
47573 return true;
47576 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47577 a single vector cross-lane permutation into vpermq followed
47578 by any of the single insn permutations. */
47580 static bool
47581 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
47583 struct expand_vec_perm_d dremap, dfinal;
47584 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
47585 unsigned contents[2];
47586 bool ok;
47588 if (!(TARGET_AVX2
47589 && (d->vmode == V32QImode || d->vmode == V16HImode)
47590 && d->one_operand_p))
47591 return false;
47593 contents[0] = 0;
47594 contents[1] = 0;
47595 for (i = 0; i < nelt2; ++i)
47597 contents[0] |= 1u << (d->perm[i] / nelt4);
47598 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
47601 for (i = 0; i < 2; ++i)
47603 unsigned int cnt = 0;
47604 for (j = 0; j < 4; ++j)
47605 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
47606 return false;
47609 if (d->testing_p)
47610 return true;
47612 dremap = *d;
47613 dremap.vmode = V4DImode;
47614 dremap.nelt = 4;
47615 dremap.target = gen_reg_rtx (V4DImode);
47616 dremap.op0 = gen_lowpart (V4DImode, d->op0);
47617 dremap.op1 = dremap.op0;
47618 dremap.one_operand_p = true;
47619 for (i = 0; i < 2; ++i)
47621 unsigned int cnt = 0;
47622 for (j = 0; j < 4; ++j)
47623 if ((contents[i] & (1u << j)) != 0)
47624 dremap.perm[2 * i + cnt++] = j;
47625 for (; cnt < 2; ++cnt)
47626 dremap.perm[2 * i + cnt] = 0;
47629 dfinal = *d;
47630 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47631 dfinal.op1 = dfinal.op0;
47632 dfinal.one_operand_p = true;
47633 for (i = 0, j = 0; i < nelt; ++i)
47635 if (i == nelt2)
47636 j = 2;
47637 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
47638 if ((d->perm[i] / nelt4) == dremap.perm[j])
47640 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
47641 dfinal.perm[i] |= nelt4;
47642 else
47643 gcc_unreachable ();
47646 ok = expand_vec_perm_1 (&dremap);
47647 gcc_assert (ok);
47649 ok = expand_vec_perm_1 (&dfinal);
47650 gcc_assert (ok);
47652 return true;
47655 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
47656 a vector permutation using two instructions, vperm2f128 resp.
47657 vperm2i128 followed by any single in-lane permutation. */
47659 static bool
47660 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
47662 struct expand_vec_perm_d dfirst, dsecond;
47663 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
47664 bool ok;
47666 if (!TARGET_AVX
47667 || GET_MODE_SIZE (d->vmode) != 32
47668 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
47669 return false;
47671 dsecond = *d;
47672 dsecond.one_operand_p = false;
47673 dsecond.testing_p = true;
47675 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
47676 immediate. For perm < 16 the second permutation uses
47677 d->op0 as first operand, for perm >= 16 it uses d->op1
47678 as first operand. The second operand is the result of
47679 vperm2[fi]128. */
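/* E.g. perm = 6 encodes "low result lane from lane 2 (op1 low), high
result lane from lane 1 (op0 high)" and expands to the vperm2[fi]128
immediate ((6 << 2) | 6) & 0x33 = 0x12.  */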
47680 for (perm = 0; perm < 32; perm++)
47682 /* Ignore permutations which do not move anything cross-lane. */
47683 if (perm < 16)
47685 /* The second shuffle for e.g. V4DFmode has
47686 0123 and ABCD operands.
47687 Ignore AB23, as 23 is already in the second lane
47688 of the first operand. */
47689 if ((perm & 0xc) == (1 << 2)) continue;
47690 /* And 01CD, as 01 is in the first lane of the first
47691 operand. */
47692 if ((perm & 3) == 0) continue;
47693 /* And 4567, as then the vperm2[fi]128 doesn't change
47694 anything on the original 4567 second operand. */
47695 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
47697 else
47699 /* The second shuffle for e.g. V4DFmode has
47700 4567 and ABCD operands.
47701 Ignore AB67, as 67 is already in the second lane
47702 of the first operand. */
47703 if ((perm & 0xc) == (3 << 2)) continue;
47704 /* And 45CD, as 45 is in the first lane of the first
47705 operand. */
47706 if ((perm & 3) == 2) continue;
47707 /* And 0123, as then the vperm2[fi]128 doesn't change
47708 anything on the original 0123 first operand. */
47709 if ((perm & 0xf) == (1 << 2)) continue;
47712 for (i = 0; i < nelt; i++)
47714 j = d->perm[i] / nelt2;
47715 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
47716 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
47717 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
47718 dsecond.perm[i] = d->perm[i] & (nelt - 1);
47719 else
47720 break;
47723 if (i == nelt)
47725 start_sequence ();
47726 ok = expand_vec_perm_1 (&dsecond);
47727 end_sequence ();
47729 else
47730 ok = false;
47732 if (ok)
47734 if (d->testing_p)
47735 return true;
47737 /* Found a usable second shuffle. dfirst will be
47738 vperm2f128 on d->op0 and d->op1. */
47739 dsecond.testing_p = false;
47740 dfirst = *d;
47741 dfirst.target = gen_reg_rtx (d->vmode);
47742 for (i = 0; i < nelt; i++)
47743 dfirst.perm[i] = (i & (nelt2 - 1))
47744 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
47746 canonicalize_perm (&dfirst);
47747 ok = expand_vec_perm_1 (&dfirst);
47748 gcc_assert (ok);
47750 /* And dsecond is some single insn shuffle, taking
47751 d->op0 and result of vperm2f128 (if perm < 16) or
47752 d->op1 and result of vperm2f128 (otherwise). */
47753 if (perm >= 16)
47754 dsecond.op0 = dsecond.op1;
47755 dsecond.op1 = dfirst.target;
47757 ok = expand_vec_perm_1 (&dsecond);
47758 gcc_assert (ok);
47760 return true;
47763 /* For one operand, the only useful vperm2f128 permutation is 0x01
47764 aka a lane swap. */
47765 if (d->one_operand_p)
47766 return false;
47769 return false;
47772 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47773 a two vector permutation using 2 intra-lane interleave insns
47774 and cross-lane shuffle for 32-byte vectors. */
47776 static bool
47777 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
47779 unsigned i, nelt;
47780 rtx (*gen) (rtx, rtx, rtx);
47782 if (d->one_operand_p)
47783 return false;
47784 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
47786 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
47788 else
47789 return false;
47791 nelt = d->nelt;
47792 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
47793 return false;
47794 for (i = 0; i < nelt; i += 2)
47795 if (d->perm[i] != d->perm[0] + i / 2
47796 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
47797 return false;
47799 if (d->testing_p)
47800 return true;
47802 switch (d->vmode)
47804 case E_V32QImode:
47805 if (d->perm[0])
47806 gen = gen_vec_interleave_highv32qi;
47807 else
47808 gen = gen_vec_interleave_lowv32qi;
47809 break;
47810 case E_V16HImode:
47811 if (d->perm[0])
47812 gen = gen_vec_interleave_highv16hi;
47813 else
47814 gen = gen_vec_interleave_lowv16hi;
47815 break;
47816 case E_V8SImode:
47817 if (d->perm[0])
47818 gen = gen_vec_interleave_highv8si;
47819 else
47820 gen = gen_vec_interleave_lowv8si;
47821 break;
47822 case E_V4DImode:
47823 if (d->perm[0])
47824 gen = gen_vec_interleave_highv4di;
47825 else
47826 gen = gen_vec_interleave_lowv4di;
47827 break;
47828 case E_V8SFmode:
47829 if (d->perm[0])
47830 gen = gen_vec_interleave_highv8sf;
47831 else
47832 gen = gen_vec_interleave_lowv8sf;
47833 break;
47834 case E_V4DFmode:
47835 if (d->perm[0])
47836 gen = gen_vec_interleave_highv4df;
47837 else
47838 gen = gen_vec_interleave_lowv4df;
47839 break;
47840 default:
47841 gcc_unreachable ();
47844 emit_insn (gen (d->target, d->op0, d->op1));
47845 return true;
47848 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
47849 a single vector permutation using a single intra-lane vector
47850 permutation, vperm2f128 swapping the lanes and vblend* insn blending
47851 the non-swapped and swapped vectors together. */
47853 static bool
47854 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
47856 struct expand_vec_perm_d dfirst, dsecond;
47857 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
47858 rtx_insn *seq;
47859 bool ok;
47860 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
47862 if (!TARGET_AVX
47863 || TARGET_AVX2
47864 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
47865 || !d->one_operand_p)
47866 return false;
47868 dfirst = *d;
47869 for (i = 0; i < nelt; i++)
47870 dfirst.perm[i] = 0xff;
47871 for (i = 0, msk = 0; i < nelt; i++)
47873 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
47874 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
47875 return false;
47876 dfirst.perm[j] = d->perm[i];
47877 if (j != i)
47878 msk |= (1 << i);
47880 for (i = 0; i < nelt; i++)
47881 if (dfirst.perm[i] == 0xff)
47882 dfirst.perm[i] = i;
47884 if (!d->testing_p)
47885 dfirst.target = gen_reg_rtx (dfirst.vmode);
47887 start_sequence ();
47888 ok = expand_vec_perm_1 (&dfirst);
47889 seq = get_insns ();
47890 end_sequence ();
47892 if (!ok)
47893 return false;
47895 if (d->testing_p)
47896 return true;
47898 emit_insn (seq);
47900 dsecond = *d;
47901 dsecond.op0 = dfirst.target;
47902 dsecond.op1 = dfirst.target;
47903 dsecond.one_operand_p = true;
47904 dsecond.target = gen_reg_rtx (dsecond.vmode);
47905 for (i = 0; i < nelt; i++)
47906 dsecond.perm[i] = i ^ nelt2;
47908 ok = expand_vec_perm_1 (&dsecond);
47909 gcc_assert (ok);
47911 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
47912 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
47913 return true;
47916 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
47917 permutation using two vperm2f128, followed by a vshufpd insn blending
47918 the two vectors together. */
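/* E.g. for d->perm { 2 7 1 4 } the two vperm2f128 select { 2 3 0 1 } and
{ 6 7 4 5 }, and the final vshufpd applies { 0 5 3 6 } to that pair,
yielding { 2 7 1 4 }.  */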
47920 static bool
47921 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
47923 struct expand_vec_perm_d dfirst, dsecond, dthird;
47924 bool ok;
47926 if (!TARGET_AVX || (d->vmode != V4DFmode))
47927 return false;
47929 if (d->testing_p)
47930 return true;
47932 dfirst = *d;
47933 dsecond = *d;
47934 dthird = *d;
47936 dfirst.perm[0] = (d->perm[0] & ~1);
47937 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
47938 dfirst.perm[2] = (d->perm[2] & ~1);
47939 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
47940 dsecond.perm[0] = (d->perm[1] & ~1);
47941 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
47942 dsecond.perm[2] = (d->perm[3] & ~1);
47943 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
47944 dthird.perm[0] = (d->perm[0] % 2);
47945 dthird.perm[1] = (d->perm[1] % 2) + 4;
47946 dthird.perm[2] = (d->perm[2] % 2) + 2;
47947 dthird.perm[3] = (d->perm[3] % 2) + 6;
47949 dfirst.target = gen_reg_rtx (dfirst.vmode);
47950 dsecond.target = gen_reg_rtx (dsecond.vmode);
47951 dthird.op0 = dfirst.target;
47952 dthird.op1 = dsecond.target;
47953 dthird.one_operand_p = false;
47955 canonicalize_perm (&dfirst);
47956 canonicalize_perm (&dsecond);
47958 ok = expand_vec_perm_1 (&dfirst)
47959 && expand_vec_perm_1 (&dsecond)
47960 && expand_vec_perm_1 (&dthird);
47962 gcc_assert (ok);
47964 return true;
47967 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
47968 permutation with two pshufb insns and an ior. We should have already
47969 failed all two instruction sequences. */
47971 static bool
47972 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
47974 rtx rperm[2][16], vperm, l, h, op, m128;
47975 unsigned int i, nelt, eltsz;
47977 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47978 return false;
47979 gcc_assert (!d->one_operand_p);
47981 if (d->testing_p)
47982 return true;
47984 nelt = d->nelt;
47985 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47987 /* Generate two permutation masks. If the required element is within
47988 the given vector it is shuffled into the proper lane. If the required
47989 element is in the other vector, force a zero into the lane by setting
47990 bit 7 in the permutation mask. */
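/* E.g. for V16QImode, if d->perm[3] is 20, the mask used with d->op1 gets
byte index 4 at position 3, while the mask used with d->op0 gets -128
there.  */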
47991 m128 = GEN_INT (-128);
47992 for (i = 0; i < nelt; ++i)
47994 unsigned j, e = d->perm[i];
47995 unsigned which = (e >= nelt);
47996 if (e >= nelt)
47997 e -= nelt;
47999 for (j = 0; j < eltsz; ++j)
48001 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
48002 rperm[1-which][i*eltsz + j] = m128;
48006 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
48007 vperm = force_reg (V16QImode, vperm);
48009 l = gen_reg_rtx (V16QImode);
48010 op = gen_lowpart (V16QImode, d->op0);
48011 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
48013 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
48014 vperm = force_reg (V16QImode, vperm);
48016 h = gen_reg_rtx (V16QImode);
48017 op = gen_lowpart (V16QImode, d->op1);
48018 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
48020 op = d->target;
48021 if (d->vmode != V16QImode)
48022 op = gen_reg_rtx (V16QImode);
48023 emit_insn (gen_iorv16qi3 (op, l, h));
48024 if (op != d->target)
48025 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48027 return true;
48030 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
48031 with two vpshufb insns, vpermq and vpor. We should have already failed
48032 all two or three instruction sequences. */
48034 static bool
48035 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
48037 rtx rperm[2][32], vperm, l, h, hp, op, m128;
48038 unsigned int i, nelt, eltsz;
48040 if (!TARGET_AVX2
48041 || !d->one_operand_p
48042 || (d->vmode != V32QImode && d->vmode != V16HImode))
48043 return false;
48045 if (d->testing_p)
48046 return true;
48048 nelt = d->nelt;
48049 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48051 /* Generate two permutation masks. If the required element is within
48052 the same lane, it is shuffled in. If the required element is from the
48053 other lane, force a zero into the lane by setting bit 7 in the
48054 permutation mask. The other mask has a non-negative element wherever
48055 an element is requested from the other lane, but that element is also
48056 moved to the other lane, so that the result of vpshufb can have its
48057 two V2TImode halves swapped. */
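/* E.g. for V32QImode, if byte 0 of the result wants byte 17 of the operand,
the cross-lane mask places index 1 at byte 16; vpshufb then fetches byte
17 into the high lane, and the subsequent vpermq lane swap moves it down
to byte 0, where it is combined with the in-lane result by vpor.  */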
48058 m128 = GEN_INT (-128);
48059 for (i = 0; i < nelt; ++i)
48061 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48062 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48064 for (j = 0; j < eltsz; ++j)
48066 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
48067 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
48071 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48072 vperm = force_reg (V32QImode, vperm);
48074 h = gen_reg_rtx (V32QImode);
48075 op = gen_lowpart (V32QImode, d->op0);
48076 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48078 /* Swap the 128-bit lanes of h into hp. */
48079 hp = gen_reg_rtx (V4DImode);
48080 op = gen_lowpart (V4DImode, h);
48081 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
48082 const1_rtx));
48084 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48085 vperm = force_reg (V32QImode, vperm);
48087 l = gen_reg_rtx (V32QImode);
48088 op = gen_lowpart (V32QImode, d->op0);
48089 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48091 op = d->target;
48092 if (d->vmode != V32QImode)
48093 op = gen_reg_rtx (V32QImode);
48094 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
48095 if (op != d->target)
48096 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48098 return true;
48101 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48102 and extract-odd permutations of two V32QImode or V16HImode operands
48103 with two vpshufb insns, vpor and vpermq. We should have already
48104 failed all two or three instruction sequences. */
48106 static bool
48107 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
48109 rtx rperm[2][32], vperm, l, h, ior, op, m128;
48110 unsigned int i, nelt, eltsz;
48112 if (!TARGET_AVX2
48113 || d->one_operand_p
48114 || (d->vmode != V32QImode && d->vmode != V16HImode))
48115 return false;
48117 for (i = 0; i < d->nelt; ++i)
48118 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
48119 return false;
48121 if (d->testing_p)
48122 return true;
48124 nelt = d->nelt;
48125 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48127 /* Generate two permutation masks. In the first permutation mask
48128 the first quarter contains indexes for the first half of op0,
48129 the second quarter has bit 7 set, the third quarter contains
48130 indexes for the second half of op0, and the last quarter has
48131 bit 7 set. In the second permutation mask the first quarter
48132 has bit 7 set, the second quarter contains indexes for the first
48133 half of op1, the third quarter has bit 7 set and the last
48134 quarter contains indexes for the second half of op1.
48135 I.e. the first mask e.g. for a V32QImode extract-even will be:
48136 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
48137 (all values masked with 0xf except for -128) and the second mask
48138 for extract-even will be
48139 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
48140 m128 = GEN_INT (-128);
48141 for (i = 0; i < nelt; ++i)
48143 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48144 unsigned which = d->perm[i] >= nelt;
48145 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
48147 for (j = 0; j < eltsz; ++j)
48149 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
48150 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
48154 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48155 vperm = force_reg (V32QImode, vperm);
48157 l = gen_reg_rtx (V32QImode);
48158 op = gen_lowpart (V32QImode, d->op0);
48159 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48161 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48162 vperm = force_reg (V32QImode, vperm);
48164 h = gen_reg_rtx (V32QImode);
48165 op = gen_lowpart (V32QImode, d->op1);
48166 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48168 ior = gen_reg_rtx (V32QImode);
48169 emit_insn (gen_iorv32qi3 (ior, l, h));
48171 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
48172 op = gen_reg_rtx (V4DImode);
48173 ior = gen_lowpart (V4DImode, ior);
48174 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
48175 const1_rtx, GEN_INT (3)));
48176 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48178 return true;
48181 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48182 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
48183 with two "and" and "pack" or two "shift" and "pack" insns. We should
48184 have already failed all two instruction sequences. */
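/* E.g. for V16QImode the even elements are obtained by masking each word
of both operands with 0x00ff and packing the results with packuswb; for
the odd elements each word is first shifted right by 8 instead.  */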
48186 static bool
48187 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
48189 rtx op, dop0, dop1, t;
48190 unsigned i, odd, c, s, nelt = d->nelt;
48191 bool end_perm = false;
48192 machine_mode half_mode;
48193 rtx (*gen_and) (rtx, rtx, rtx);
48194 rtx (*gen_pack) (rtx, rtx, rtx);
48195 rtx (*gen_shift) (rtx, rtx, rtx);
48197 if (d->one_operand_p)
48198 return false;
48200 switch (d->vmode)
48202 case E_V8HImode:
48203 /* Required for "pack". */
48204 if (!TARGET_SSE4_1)
48205 return false;
48206 c = 0xffff;
48207 s = 16;
48208 half_mode = V4SImode;
48209 gen_and = gen_andv4si3;
48210 gen_pack = gen_sse4_1_packusdw;
48211 gen_shift = gen_lshrv4si3;
48212 break;
48213 case E_V16QImode:
48214 /* No ISA check needed, as all the instructions used are SSE2. */
48215 c = 0xff;
48216 s = 8;
48217 half_mode = V8HImode;
48218 gen_and = gen_andv8hi3;
48219 gen_pack = gen_sse2_packuswb;
48220 gen_shift = gen_lshrv8hi3;
48221 break;
48222 case E_V16HImode:
48223 if (!TARGET_AVX2)
48224 return false;
48225 c = 0xffff;
48226 s = 16;
48227 half_mode = V8SImode;
48228 gen_and = gen_andv8si3;
48229 gen_pack = gen_avx2_packusdw;
48230 gen_shift = gen_lshrv8si3;
48231 end_perm = true;
48232 break;
48233 case E_V32QImode:
48234 if (!TARGET_AVX2)
48235 return false;
48236 c = 0xff;
48237 s = 8;
48238 half_mode = V16HImode;
48239 gen_and = gen_andv16hi3;
48240 gen_pack = gen_avx2_packuswb;
48241 gen_shift = gen_lshrv16hi3;
48242 end_perm = true;
48243 break;
48244 default:
48245 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
48246 general shuffles. */
48247 return false;
48250 /* Check that permutation is even or odd. */
48251 odd = d->perm[0];
48252 if (odd > 1)
48253 return false;
48255 for (i = 1; i < nelt; ++i)
48256 if (d->perm[i] != 2 * i + odd)
48257 return false;
48259 if (d->testing_p)
48260 return true;
48262 dop0 = gen_reg_rtx (half_mode);
48263 dop1 = gen_reg_rtx (half_mode);
48264 if (odd == 0)
48266 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
48267 t = force_reg (half_mode, t);
48268 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
48269 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
48271 else
48273 emit_insn (gen_shift (dop0,
48274 gen_lowpart (half_mode, d->op0),
48275 GEN_INT (s)));
48276 emit_insn (gen_shift (dop1,
48277 gen_lowpart (half_mode, d->op1),
48278 GEN_INT (s)));
48280 /* For the AVX2 256-bit case we need to permute the pack result. */
48281 if (TARGET_AVX2 && end_perm)
48283 op = gen_reg_rtx (d->vmode);
48284 t = gen_reg_rtx (V4DImode);
48285 emit_insn (gen_pack (op, dop0, dop1));
48286 emit_insn (gen_avx2_permv4di_1 (t,
48287 gen_lowpart (V4DImode, op),
48288 const0_rtx,
48289 const2_rtx,
48290 const1_rtx,
48291 GEN_INT (3)));
48292 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
48294 else
48295 emit_insn (gen_pack (d->target, dop0, dop1));
48297 return true;
48300 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48301 and extract-odd permutations of two V64QI operands
48302 with two "shift", two "trunc" and one "concat" insn for "odd"
48303 and two "trunc" and one "concat" insn for "even".
48304 We should have already failed all two instruction sequences. */
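/* I.e. the even bytes are just the low byte of each 16-bit word, so a
vpmovwb of each operand followed by a concat suffices; for the odd bytes
each word is first shifted right by 8.  */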
48306 static bool
48307 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
48309 rtx t1, t2, t3, t4;
48310 unsigned i, odd, nelt = d->nelt;
48312 if (!TARGET_AVX512BW
48313 || d->one_operand_p
48314 || d->vmode != V64QImode)
48315 return false;
48317 /* Check that permutation is even or odd. */
48318 odd = d->perm[0];
48319 if (odd > 1)
48320 return false;
48322 for (i = 1; i < nelt; ++i)
48323 if (d->perm[i] != 2 * i + odd)
48324 return false;
48326 if (d->testing_p)
48327 return true;
48330 if (odd)
48332 t1 = gen_reg_rtx (V32HImode);
48333 t2 = gen_reg_rtx (V32HImode);
48334 emit_insn (gen_lshrv32hi3 (t1,
48335 gen_lowpart (V32HImode, d->op0),
48336 GEN_INT (8)));
48337 emit_insn (gen_lshrv32hi3 (t2,
48338 gen_lowpart (V32HImode, d->op1),
48339 GEN_INT (8)));
48341 else
48343 t1 = gen_lowpart (V32HImode, d->op0);
48344 t2 = gen_lowpart (V32HImode, d->op1);
48347 t3 = gen_reg_rtx (V32QImode);
48348 t4 = gen_reg_rtx (V32QImode);
48349 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
48350 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
48351 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48353 return true;
48356 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
48357 and extract-odd permutations. */
48359 static bool
48360 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
48362 rtx t1, t2, t3, t4, t5;
48364 switch (d->vmode)
48366 case E_V4DFmode:
48367 if (d->testing_p)
48368 break;
48369 t1 = gen_reg_rtx (V4DFmode);
48370 t2 = gen_reg_rtx (V4DFmode);
48372 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48373 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
48374 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
48376 /* Now an unpck[lh]pd will produce the result required. */
48377 if (odd)
48378 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
48379 else
48380 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
48381 emit_insn (t3);
48382 break;
48384 case E_V8SFmode:
48386 int mask = odd ? 0xdd : 0x88;
48388 if (d->testing_p)
48389 break;
48390 t1 = gen_reg_rtx (V8SFmode);
48391 t2 = gen_reg_rtx (V8SFmode);
48392 t3 = gen_reg_rtx (V8SFmode);
48394 /* Shuffle within the 128-bit lanes to produce:
48395 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
48396 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
48397 GEN_INT (mask)));
48399 /* Shuffle the lanes around to produce:
48400 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
48401 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
48402 GEN_INT (0x3)));
48404 /* Shuffle within the 128-bit lanes to produce:
48405 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
48406 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
48408 /* Shuffle within the 128-bit lanes to produce:
48409 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
48410 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
48412 /* Shuffle the lanes around to produce:
48413 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
48414 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
48415 GEN_INT (0x20)));
48417 break;
48419 case E_V2DFmode:
48420 case E_V4SFmode:
48421 case E_V2DImode:
48422 case E_V4SImode:
48423 /* These are always directly implementable by expand_vec_perm_1. */
48424 gcc_unreachable ();
48426 case E_V8HImode:
48427 if (TARGET_SSE4_1)
48428 return expand_vec_perm_even_odd_pack (d);
48429 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48430 return expand_vec_perm_pshufb2 (d);
48431 else
48433 if (d->testing_p)
48434 break;
48435 /* We need 2*log2(N)-1 operations to achieve odd/even
48436 with interleave. */
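/* E.g. with op0 = { 0 1 2 3 4 5 6 7 } and op1 = { 8 9 10 11 12 13 14 15 }
the intermediate vectors are { 0 8 1 9 2 10 3 11 } and
{ 4 12 5 13 6 14 7 15 }, then { 0 4 8 12 1 5 9 13 } and
{ 2 6 10 14 3 7 11 15 }, and the final interleave yields
{ 0 2 4 6 8 10 12 14 } or { 1 3 5 7 9 11 13 15 }.  */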
48437 t1 = gen_reg_rtx (V8HImode);
48438 t2 = gen_reg_rtx (V8HImode);
48439 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48440 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48441 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48442 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
48443 if (odd)
48444 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48445 else
48446 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48447 emit_insn (t3);
48449 break;
48451 case E_V16QImode:
48452 return expand_vec_perm_even_odd_pack (d);
48454 case E_V16HImode:
48455 case E_V32QImode:
48456 return expand_vec_perm_even_odd_pack (d);
48458 case E_V64QImode:
48459 return expand_vec_perm_even_odd_trunc (d);
48461 case E_V4DImode:
48462 if (!TARGET_AVX2)
48464 struct expand_vec_perm_d d_copy = *d;
48465 d_copy.vmode = V4DFmode;
48466 if (d->testing_p)
48467 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48468 else
48469 d_copy.target = gen_reg_rtx (V4DFmode);
48470 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48471 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48472 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48474 if (!d->testing_p)
48475 emit_move_insn (d->target,
48476 gen_lowpart (V4DImode, d_copy.target));
48477 return true;
48479 return false;
48482 if (d->testing_p)
48483 break;
48485 t1 = gen_reg_rtx (V4DImode);
48486 t2 = gen_reg_rtx (V4DImode);
48488 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48489 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48490 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48492 /* Now a vpunpck[lh]qdq will produce the result required. */
48493 if (odd)
48494 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48495 else
48496 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48497 emit_insn (t3);
48498 break;
48500 case E_V8SImode:
48501 if (!TARGET_AVX2)
48503 struct expand_vec_perm_d d_copy = *d;
48504 d_copy.vmode = V8SFmode;
48505 if (d->testing_p)
48506 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48507 else
48508 d_copy.target = gen_reg_rtx (V8SFmode);
48509 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48510 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48511 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48513 if (!d->testing_p)
48514 emit_move_insn (d->target,
48515 gen_lowpart (V8SImode, d_copy.target));
48516 return true;
48518 return false;
48521 if (d->testing_p)
48522 break;
48524 t1 = gen_reg_rtx (V8SImode);
48525 t2 = gen_reg_rtx (V8SImode);
48526 t3 = gen_reg_rtx (V4DImode);
48527 t4 = gen_reg_rtx (V4DImode);
48528 t5 = gen_reg_rtx (V4DImode);
48530 /* Shuffle the lanes around into
48531 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
48532 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
48533 gen_lowpart (V4DImode, d->op1),
48534 GEN_INT (0x20)));
48535 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
48536 gen_lowpart (V4DImode, d->op1),
48537 GEN_INT (0x31)));
48539 /* Swap the 2nd and 3rd position in each lane into
48540 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
48541 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
48542 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48543 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
48544 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48546 /* Now a vpunpck[lh]qdq will produce
48547 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
48548 if (odd)
48549 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
48550 gen_lowpart (V4DImode, t2));
48551 else
48552 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
48553 gen_lowpart (V4DImode, t2));
48554 emit_insn (t3);
48555 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
48556 break;
48558 default:
48559 gcc_unreachable ();
48562 return true;
48565 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48566 extract-even and extract-odd permutations. */
48568 static bool
48569 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
48571 unsigned i, odd, nelt = d->nelt;
48573 odd = d->perm[0];
48574 if (odd != 0 && odd != 1)
48575 return false;
48577 for (i = 1; i < nelt; ++i)
48578 if (d->perm[i] != 2 * i + odd)
48579 return false;
48581 return expand_vec_perm_even_odd_1 (d, odd);
48584 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
48585 permutations. We assume that expand_vec_perm_1 has already failed. */
48587 static bool
48588 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
48590 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
48591 machine_mode vmode = d->vmode;
48592 unsigned char perm2[4];
48593 rtx op0 = d->op0, dest;
48594 bool ok;
48596 switch (vmode)
48598 case E_V4DFmode:
48599 case E_V8SFmode:
48600 /* These are special-cased in sse.md so that we can optionally
48601 use the vbroadcast instruction. They expand to two insns
48602 if the input happens to be in a register. */
48603 gcc_unreachable ();
48605 case E_V2DFmode:
48606 case E_V2DImode:
48607 case E_V4SFmode:
48608 case E_V4SImode:
48609 /* These are always implementable using standard shuffle patterns. */
48610 gcc_unreachable ();
48612 case E_V8HImode:
48613 case E_V16QImode:
48614 /* These can be implemented via interleave. We save one insn by
48615 stopping once we have promoted to V4SImode and then using pshufd. */
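/* E.g. broadcasting element 5 of a V8HImode vector: one interleave-high
gives elements { 4 4 5 5 6 6 7 7 }, which viewed as V4SImode holds the
wanted pair in element 1, and a pshufd of that element completes the
broadcast.  */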
48616 if (d->testing_p)
48617 return true;
48620 rtx dest;
48621 rtx (*gen) (rtx, rtx, rtx)
48622 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
48623 : gen_vec_interleave_lowv8hi;
48625 if (elt >= nelt2)
48627 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
48628 : gen_vec_interleave_highv8hi;
48629 elt -= nelt2;
48631 nelt2 /= 2;
48633 dest = gen_reg_rtx (vmode);
48634 emit_insn (gen (dest, op0, op0));
48635 vmode = get_mode_wider_vector (vmode);
48636 op0 = gen_lowpart (vmode, dest);
48638 while (vmode != V4SImode);
48640 memset (perm2, elt, 4);
48641 dest = gen_reg_rtx (V4SImode);
48642 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
48643 gcc_assert (ok);
48644 if (!d->testing_p)
48645 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
48646 return true;
48648 case E_V64QImode:
48649 case E_V32QImode:
48650 case E_V16HImode:
48651 case E_V8SImode:
48652 case E_V4DImode:
48653 /* For AVX2, broadcasts of the first element should already have been
48654 handled by expand_vec_perm_1 via vpbroadcast* or vpermq. */
48655 gcc_assert (!TARGET_AVX2 || d->perm[0]);
48656 return false;
48658 default:
48659 gcc_unreachable ();
48663 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48664 broadcast permutations. */
48666 static bool
48667 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
48669 unsigned i, elt, nelt = d->nelt;
48671 if (!d->one_operand_p)
48672 return false;
48674 elt = d->perm[0];
48675 for (i = 1; i < nelt; ++i)
48676 if (d->perm[i] != elt)
48677 return false;
48679 return expand_vec_perm_broadcast_1 (d);
48682 /* Implement arbitrary permutations of two V64QImode operands
48683 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
48684 static bool
48685 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
48687 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
48688 return false;
48690 if (d->testing_p)
48691 return true;
48693 struct expand_vec_perm_d ds[2];
48694 rtx rperm[128], vperm, target0, target1;
48695 unsigned int i, nelt;
48696 machine_mode vmode;
48698 nelt = d->nelt;
48699 vmode = V64QImode;
48701 for (i = 0; i < 2; i++)
48703 ds[i] = *d;
48704 ds[i].vmode = V32HImode;
48705 ds[i].nelt = 32;
48706 ds[i].target = gen_reg_rtx (V32HImode);
48707 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
48708 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
48711 /* Prepare permutations such that the first one (ds[0]) takes care
48712 of putting the even bytes into the right positions or one
48713 position higher, and the second one (ds[1]) takes care of
48714 putting the odd bytes into the right positions or one
48715 position lower. */
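/* Concretely, for result byte I, ds[I & 1] places source word
d->perm[I] / 2 at word position I / 2, and the vpshufb mask built below
then selects byte (I & 14) + (d->perm[I] & 1) of that lane, i.e. the
requested low or high byte of that word.  */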
48717 for (i = 0; i < nelt; i++)
48719 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
48720 if (i & 1)
48722 rperm[i] = constm1_rtx;
48723 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48725 else
48727 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48728 rperm[i + 64] = constm1_rtx;
48732 bool ok = expand_vec_perm_1 (&ds[0]);
48733 gcc_assert (ok);
48734 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
48736 ok = expand_vec_perm_1 (&ds[1]);
48737 gcc_assert (ok);
48738 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
48740 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
48741 vperm = force_reg (vmode, vperm);
48742 target0 = gen_reg_rtx (V64QImode);
48743 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
48745 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
48746 vperm = force_reg (vmode, vperm);
48747 target1 = gen_reg_rtx (V64QImode);
48748 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
48750 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
48751 return true;
48754 /* Implement an arbitrary permutation of two V32QImode or V16HImode operands
48755 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
48756 all the shorter instruction sequences. */
48758 static bool
48759 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
48761 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
48762 unsigned int i, nelt, eltsz;
48763 bool used[4];
48765 if (!TARGET_AVX2
48766 || d->one_operand_p
48767 || (d->vmode != V32QImode && d->vmode != V16HImode))
48768 return false;
48770 if (d->testing_p)
48771 return true;
48773 nelt = d->nelt;
48774 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48776 /* Generate 4 permutation masks. If the required element is within
48777 the same lane, it is shuffled in. If the required element is from the
48778 other lane, force a zero into the lane by setting bit 7 in the
48779 permutation mask. The other mask has a non-negative element wherever
48780 an element is requested from the other lane, but that element is also
48781 moved to the other lane, so that the result of vpshufb can have its
48782 two V2TImode halves swapped. */
48783 m128 = GEN_INT (-128);
48784 for (i = 0; i < 32; ++i)
48786 rperm[0][i] = m128;
48787 rperm[1][i] = m128;
48788 rperm[2][i] = m128;
48789 rperm[3][i] = m128;
48791 used[0] = false;
48792 used[1] = false;
48793 used[2] = false;
48794 used[3] = false;
48795 for (i = 0; i < nelt; ++i)
48797 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48798 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48799 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
48801 for (j = 0; j < eltsz; ++j)
48802 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
48803 used[which] = true;
48806 for (i = 0; i < 2; ++i)
48808 if (!used[2 * i + 1])
48810 h[i] = NULL_RTX;
48811 continue;
48813 vperm = gen_rtx_CONST_VECTOR (V32QImode,
48814 gen_rtvec_v (32, rperm[2 * i + 1]));
48815 vperm = force_reg (V32QImode, vperm);
48816 h[i] = gen_reg_rtx (V32QImode);
48817 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48818 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
48821 /* Swap the 128-bit lanes of h[X]. */
48822 for (i = 0; i < 2; ++i)
48824 if (h[i] == NULL_RTX)
48825 continue;
48826 op = gen_reg_rtx (V4DImode);
48827 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
48828 const2_rtx, GEN_INT (3), const0_rtx,
48829 const1_rtx));
48830 h[i] = gen_lowpart (V32QImode, op);
48833 for (i = 0; i < 2; ++i)
48835 if (!used[2 * i])
48837 l[i] = NULL_RTX;
48838 continue;
48840 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
48841 vperm = force_reg (V32QImode, vperm);
48842 l[i] = gen_reg_rtx (V32QImode);
48843 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48844 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
48847 for (i = 0; i < 2; ++i)
48849 if (h[i] && l[i])
48851 op = gen_reg_rtx (V32QImode);
48852 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
48853 l[i] = op;
48855 else if (h[i])
48856 l[i] = h[i];
48859 gcc_assert (l[0] && l[1]);
48860 op = d->target;
48861 if (d->vmode != V32QImode)
48862 op = gen_reg_rtx (V32QImode);
48863 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
48864 if (op != d->target)
48865 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48866 return true;
48869 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
48870 taken care of, perform the expansion in D and return true on success. */
48872 static bool
48873 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
48875 /* Try a single instruction expansion. */
48876 if (expand_vec_perm_1 (d))
48877 return true;
48879 /* Try sequences of two instructions. */
48881 if (expand_vec_perm_pshuflw_pshufhw (d))
48882 return true;
48884 if (expand_vec_perm_palignr (d, false))
48885 return true;
48887 if (expand_vec_perm_interleave2 (d))
48888 return true;
48890 if (expand_vec_perm_broadcast (d))
48891 return true;
48893 if (expand_vec_perm_vpermq_perm_1 (d))
48894 return true;
48896 if (expand_vec_perm_vperm2f128 (d))
48897 return true;
48899 if (expand_vec_perm_pblendv (d))
48900 return true;
48902 /* Try sequences of three instructions. */
48904 if (expand_vec_perm_even_odd_pack (d))
48905 return true;
48907 if (expand_vec_perm_2vperm2f128_vshuf (d))
48908 return true;
48910 if (expand_vec_perm_pshufb2 (d))
48911 return true;
48913 if (expand_vec_perm_interleave3 (d))
48914 return true;
48916 if (expand_vec_perm_vperm2f128_vblend (d))
48917 return true;
48919 /* Try sequences of four instructions. */
48921 if (expand_vec_perm_even_odd_trunc (d))
48922 return true;
48923 if (expand_vec_perm_vpshufb2_vpermq (d))
48924 return true;
48926 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
48927 return true;
48929 if (expand_vec_perm_vpermt2_vpshub2 (d))
48930 return true;
48932 /* ??? Look for narrow permutations whose element orderings would
48933 allow the promotion to a wider mode. */
48935 /* ??? Look for sequences of interleave or a wider permute that place
48936 the data into the correct lanes for a half-vector shuffle like
48937 pshuf[lh]w or vpermilps. */
48939 /* ??? Look for sequences of interleave that produce the desired results.
48940 The combinatorics of punpck[lh] get pretty ugly... */
48942 if (expand_vec_perm_even_odd (d))
48943 return true;
48945 /* Even longer sequences. */
48946 if (expand_vec_perm_vpshufb4_vpermq2 (d))
48947 return true;
48949 /* See if we can get the same permutation in different vector integer
48950 mode. */
48951 struct expand_vec_perm_d nd;
48952 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
48954 if (!d->testing_p)
48955 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
48956 return true;
48959 return false;
48962 /* If a permutation only uses one operand, make that explicit. Return true
48963 if the permutation references both operands. */
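/* E.g. a V4SImode selector { 4 5 6 7 }, which only reads the second
operand, is rewritten as { 0 1 2 3 } with op0 replaced by op1.  */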
48965 static bool
48966 canonicalize_perm (struct expand_vec_perm_d *d)
48968 int i, which, nelt = d->nelt;
48970 for (i = which = 0; i < nelt; ++i)
48971 which |= (d->perm[i] < nelt ? 1 : 2);
48973 d->one_operand_p = true;
48974 switch (which)
48976 default:
48977 gcc_unreachable ();
48979 case 3:
48980 if (!rtx_equal_p (d->op0, d->op1))
48982 d->one_operand_p = false;
48983 break;
48985 /* The elements of PERM do not suggest that only the first operand
48986 is used, but both operands are identical. Allow easier matching
48987 of the permutation by folding the permutation into the single
48988 input vector. */
48989 /* FALLTHRU */
48991 case 2:
48992 for (i = 0; i < nelt; ++i)
48993 d->perm[i] &= nelt - 1;
48994 d->op0 = d->op1;
48995 break;
48997 case 1:
48998 d->op1 = d->op0;
48999 break;
49002 return (which == 3);
49005 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
49007 static bool
49008 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
49009 rtx op1, const vec_perm_indices &sel)
49011 struct expand_vec_perm_d d;
49012 unsigned char perm[MAX_VECT_LEN];
49013 unsigned int i, nelt, which;
49014 bool two_args;
49016 d.target = target;
49017 d.op0 = op0;
49018 d.op1 = op1;
49020 d.vmode = vmode;
49021 gcc_assert (VECTOR_MODE_P (d.vmode));
49022 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49023 d.testing_p = !target;
49025 gcc_assert (sel.length () == nelt);
49026 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
49028 /* Given sufficient ISA support we can just return true here
49029 for selected vector modes. */
49030 switch (d.vmode)
49032 case E_V16SFmode:
49033 case E_V16SImode:
49034 case E_V8DImode:
49035 case E_V8DFmode:
49036 if (!TARGET_AVX512F)
49037 return false;
49038 /* All implementable with a single vperm[it]2 insn. */
49039 if (d.testing_p)
49040 return true;
49041 break;
49042 case E_V32HImode:
49043 if (!TARGET_AVX512BW)
49044 return false;
49045 if (d.testing_p)
49046 /* All implementable with a single vperm[it]2 insn. */
49047 return true;
49048 break;
49049 case E_V64QImode:
49050 if (!TARGET_AVX512BW)
49051 return false;
49052 if (d.testing_p)
49053 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
49054 return true;
49055 break;
49056 case E_V8SImode:
49057 case E_V8SFmode:
49058 case E_V4DFmode:
49059 case E_V4DImode:
49060 if (!TARGET_AVX)
49061 return false;
49062 if (d.testing_p && TARGET_AVX512VL)
49063 /* All implementable with a single vperm[it]2 insn. */
49064 return true;
49065 break;
49066 case E_V16HImode:
49067 if (!TARGET_SSE2)
49068 return false;
49069 if (d.testing_p && TARGET_AVX2)
49070 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49071 return true;
49072 break;
49073 case E_V32QImode:
49074 if (!TARGET_SSE2)
49075 return false;
49076 if (d.testing_p && TARGET_AVX2)
49077 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49078 return true;
49079 break;
49080 case E_V8HImode:
49081 case E_V16QImode:
49082 if (!TARGET_SSE2)
49083 return false;
49084 /* Fall through. */
49085 case E_V4SImode:
49086 case E_V4SFmode:
49087 if (!TARGET_SSE)
49088 return false;
49089 /* All implementable with a single vpperm insn. */
49090 if (d.testing_p && TARGET_XOP)
49091 return true;
49092 /* All implementable with 2 pshufb + 1 ior. */
49093 if (d.testing_p && TARGET_SSSE3)
49094 return true;
49095 break;
49096 case E_V2DImode:
49097 case E_V2DFmode:
49098 if (!TARGET_SSE)
49099 return false;
49100 /* All implementable with shufpd or unpck[lh]pd. */
49101 if (d.testing_p)
49102 return true;
49103 break;
49104 default:
49105 return false;
49108 for (i = which = 0; i < nelt; ++i)
49110 unsigned char e = sel[i];
49111 gcc_assert (e < 2 * nelt);
49112 d.perm[i] = e;
49113 perm[i] = e;
49114 which |= (e < nelt ? 1 : 2);
49117 if (d.testing_p)
49119 /* If all elements are from the second vector, fold them to the first. */
49120 if (which == 2)
49121 for (i = 0; i < nelt; ++i)
49122 d.perm[i] -= nelt;
49124 /* Check whether the mask can be applied to the vector type. */
49125 d.one_operand_p = (which != 3);
49127 /* Implementable with shufps or pshufd. */
49128 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
49129 return true;
49131 /* Otherwise we have to go through the motions and see if we can
49132 figure out how to generate the requested permutation. */
49133 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
49134 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
49135 if (!d.one_operand_p)
49136 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
49138 start_sequence ();
49139 bool ret = ix86_expand_vec_perm_const_1 (&d);
49140 end_sequence ();
49142 return ret;
49145 two_args = canonicalize_perm (&d);
49147 if (ix86_expand_vec_perm_const_1 (&d))
49148 return true;
49150 /* If the selector says both arguments are needed, but the operands are the
49151 same, the above tried to expand with one_operand_p and flattened selector.
49152 If that didn't work, retry without one_operand_p; we succeeded with that
49153 during testing. */
49154 if (two_args && d.one_operand_p)
49156 d.one_operand_p = false;
49157 memcpy (d.perm, perm, sizeof (perm));
49158 return ix86_expand_vec_perm_const_1 (&d);
49161 return false;
49164 void
49165 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
49167 struct expand_vec_perm_d d;
49168 unsigned i, nelt;
49170 d.target = targ;
49171 d.op0 = op0;
49172 d.op1 = op1;
49173 d.vmode = GET_MODE (targ);
49174 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49175 d.one_operand_p = false;
49176 d.testing_p = false;
49178 for (i = 0; i < nelt; ++i)
49179 d.perm[i] = i * 2 + odd;
49181 /* We'll either be able to implement the permutation directly... */
49182 if (expand_vec_perm_1 (&d))
49183 return;
49185 /* ... or we use the special-case patterns. */
49186 expand_vec_perm_even_odd_1 (&d, odd);
49189 static void
49190 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
49192 struct expand_vec_perm_d d;
49193 unsigned i, nelt, base;
49194 bool ok;
49196 d.target = targ;
49197 d.op0 = op0;
49198 d.op1 = op1;
49199 d.vmode = GET_MODE (targ);
49200 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49201 d.one_operand_p = false;
49202 d.testing_p = false;
49204 base = high_p ? nelt / 2 : 0;
49205 for (i = 0; i < nelt / 2; ++i)
49207 d.perm[i * 2] = i + base;
49208 d.perm[i * 2 + 1] = i + base + nelt;
49211 /* Note that for AVX this isn't one instruction. */
49212 ok = ix86_expand_vec_perm_const_1 (&d);
49213 gcc_assert (ok);
49217 /* Expand a vector operation CODE for a V*QImode in terms of the
49218 same operation on V*HImode. */
49220 void
49221 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
49223 machine_mode qimode = GET_MODE (dest);
49224 machine_mode himode;
49225 rtx (*gen_il) (rtx, rtx, rtx);
49226 rtx (*gen_ih) (rtx, rtx, rtx);
49227 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
49228 struct expand_vec_perm_d d;
49229 bool ok, full_interleave;
49230 bool uns_p = false;
49231 int i;
49233 switch (qimode)
49235 case E_V16QImode:
49236 himode = V8HImode;
49237 gen_il = gen_vec_interleave_lowv16qi;
49238 gen_ih = gen_vec_interleave_highv16qi;
49239 break;
49240 case E_V32QImode:
49241 himode = V16HImode;
49242 gen_il = gen_avx2_interleave_lowv32qi;
49243 gen_ih = gen_avx2_interleave_highv32qi;
49244 break;
49245 case E_V64QImode:
49246 himode = V32HImode;
49247 gen_il = gen_avx512bw_interleave_lowv64qi;
49248 gen_ih = gen_avx512bw_interleave_highv64qi;
49249 break;
49250 default:
49251 gcc_unreachable ();
49254 op2_l = op2_h = op2;
49255 switch (code)
49257 case MULT:
49258 /* Unpack data such that we've got a source byte in each low byte of
49259 each word. We don't care what goes into the high byte of each word.
49260 Rather than trying to get zero in there, it is most convenient to let
49261 it be a copy of the low byte. */
49262 op2_l = gen_reg_rtx (qimode);
49263 op2_h = gen_reg_rtx (qimode);
49264 emit_insn (gen_il (op2_l, op2, op2));
49265 emit_insn (gen_ih (op2_h, op2, op2));
49267 op1_l = gen_reg_rtx (qimode);
49268 op1_h = gen_reg_rtx (qimode);
49269 emit_insn (gen_il (op1_l, op1, op1));
49270 emit_insn (gen_ih (op1_h, op1, op1));
49271 full_interleave = qimode == V16QImode;
49272 break;
49274 case ASHIFT:
49275 case LSHIFTRT:
49276 uns_p = true;
49277 /* FALLTHRU */
49278 case ASHIFTRT:
49279 op1_l = gen_reg_rtx (himode);
49280 op1_h = gen_reg_rtx (himode);
49281 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
49282 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
49283 full_interleave = true;
49284 break;
49285 default:
49286 gcc_unreachable ();
49289 /* Perform the operation. */
49290 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
49291 1, OPTAB_DIRECT);
49292 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
49293 1, OPTAB_DIRECT);
49294 gcc_assert (res_l && res_h);
49296 /* Merge the data back into the right place. */
49297 d.target = dest;
49298 d.op0 = gen_lowpart (qimode, res_l);
49299 d.op1 = gen_lowpart (qimode, res_h);
49300 d.vmode = qimode;
49301 d.nelt = GET_MODE_NUNITS (qimode);
49302 d.one_operand_p = false;
49303 d.testing_p = false;
49305 if (full_interleave)
49307 /* For SSE2, we used a full interleave, so the desired
49308 results are in the even elements. */
49309 for (i = 0; i < d.nelt; ++i)
49310 d.perm[i] = i * 2;
49312 else
49314 /* For AVX, the interleave used above was not cross-lane. So the
49315 extraction is of the even elements, but with the second and third
49316 quarters swapped. Happily, that is even one insn shorter than even
49317 extraction. For AVX512BW we have 4 lanes. We extract evens from
49318 within a lane, always first from the first and then from the second
49319 source operand; the index bits above the low 4 bits remain the same.
49320 Thus, for d.nelt == 32 we want permutation
49321 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
49322 and for d.nelt == 64 we want permutation
49323 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
49324 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
49325 for (i = 0; i < d.nelt; ++i)
49326 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
49329 ok = ix86_expand_vec_perm_const_1 (&d);
49330 gcc_assert (ok);
49332 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49333 gen_rtx_fmt_ee (code, qimode, op1, op2));
49336 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
49337 if op is CONST_VECTOR with all odd elements equal to their
49338 preceding element. */
49340 static bool
49341 const_vector_equal_evenodd_p (rtx op)
49343 machine_mode mode = GET_MODE (op);
49344 int i, nunits = GET_MODE_NUNITS (mode);
49345 if (GET_CODE (op) != CONST_VECTOR
49346 || nunits != CONST_VECTOR_NUNITS (op))
49347 return false;
49348 for (i = 0; i < nunits; i += 2)
49349 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
49350 return false;
49351 return true;
49354 void
49355 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
49356 bool uns_p, bool odd_p)
49358 machine_mode mode = GET_MODE (op1);
49359 machine_mode wmode = GET_MODE (dest);
49360 rtx x;
49361 rtx orig_op1 = op1, orig_op2 = op2;
49363 if (!nonimmediate_operand (op1, mode))
49364 op1 = force_reg (mode, op1);
49365 if (!nonimmediate_operand (op2, mode))
49366 op2 = force_reg (mode, op2);
49368 /* We only play even/odd games with vectors of SImode. */
49369 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
49371 /* If we're looking for the odd results, shift those members down to
49372 the even slots. For some CPUs this is faster than a PSHUFD. */
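/* The shift is done in the wide mode, e.g. a V2DImode logical shift right
by 32 moves V4SImode elements 1 and 3 down into positions 0 and 2.  */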
49373 if (odd_p)
49375 /* For XOP use vpmacsdqh, but only for smult, as it is only
49376 signed. */
49377 if (TARGET_XOP && mode == V4SImode && !uns_p)
49379 x = force_reg (wmode, CONST0_RTX (wmode));
49380 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
49381 return;
49384 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
49385 if (!const_vector_equal_evenodd_p (orig_op1))
49386 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
49387 x, NULL, 1, OPTAB_DIRECT);
49388 if (!const_vector_equal_evenodd_p (orig_op2))
49389 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
49390 x, NULL, 1, OPTAB_DIRECT);
49391 op1 = gen_lowpart (mode, op1);
49392 op2 = gen_lowpart (mode, op2);
49395 if (mode == V16SImode)
49397 if (uns_p)
49398 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
49399 else
49400 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
49402 else if (mode == V8SImode)
49404 if (uns_p)
49405 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
49406 else
49407 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
49409 else if (uns_p)
49410 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
49411 else if (TARGET_SSE4_1)
49412 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
49413 else
49415 rtx s1, s2, t0, t1, t2;
49417 /* The easiest way to implement this without PMULDQ is to go through
49418 the motions as if we are performing a full 64-bit multiply, except
49419 that we need to do less shuffling of the elements. */
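/* For each even element pair the signed product satisfies
A*B = uA*uB - 2^32 * (uB * (A < 0) + uA * (B < 0)) (mod 2^64); adding the
products of the all-ones compare masks below, shifted left by 32, is
equivalent to subtracting those correction terms modulo 2^64.  */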
49421 /* Compute the sign-extension, aka highparts, of the two operands. */
49422 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49423 op1, pc_rtx, pc_rtx);
49424 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49425 op2, pc_rtx, pc_rtx);
49427 /* Multiply LO(A) * HI(B), and vice-versa. */
49428 t1 = gen_reg_rtx (wmode);
49429 t2 = gen_reg_rtx (wmode);
49430 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49431 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49433 /* Multiply LO(A) * LO(B). */
49434 t0 = gen_reg_rtx (wmode);
49435 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49437 /* Combine and shift the highparts into place. */
49438 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49439 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49440 1, OPTAB_DIRECT);
49442 /* Combine high and low parts. */
49443 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49444 return;
49446 emit_insn (x);
49449 void
49450 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49451 bool uns_p, bool high_p)
49453 machine_mode wmode = GET_MODE (dest);
49454 machine_mode mode = GET_MODE (op1);
49455 rtx t1, t2, t3, t4, mask;
49457 switch (mode)
49459 case E_V4SImode:
49460 t1 = gen_reg_rtx (mode);
49461 t2 = gen_reg_rtx (mode);
49462 if (TARGET_XOP && !uns_p)
49464 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
49465 shuffle the elements once so that all elements are in the right
49466 place for immediate use: { A C B D }. */
49467 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49468 const1_rtx, GEN_INT (3)));
49469 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49470 const1_rtx, GEN_INT (3)));
49472 else
49474 /* Put the elements into place for the multiply. */
49475 ix86_expand_vec_interleave (t1, op1, op1, high_p);
49476 ix86_expand_vec_interleave (t2, op2, op2, high_p);
49477 high_p = false;
49479 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49480 break;
49482 case E_V8SImode:
49483 /* Shuffle the elements between the lanes. After this we
49484 have { A B E F | C D G H } for each operand. */
49485 t1 = gen_reg_rtx (V4DImode);
49486 t2 = gen_reg_rtx (V4DImode);
49487 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49488 const0_rtx, const2_rtx,
49489 const1_rtx, GEN_INT (3)));
49490 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49491 const0_rtx, const2_rtx,
49492 const1_rtx, GEN_INT (3)));
49494 /* Shuffle the elements within the lanes. After this we
49495 have { A A B B | C C D D } or { E E F F | G G H H }. */
49496 t3 = gen_reg_rtx (V8SImode);
49497 t4 = gen_reg_rtx (V8SImode);
49498 mask = GEN_INT (high_p
49499 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49500 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
49501 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49502 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49504 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49505 break;
49507 case E_V8HImode:
49508 case E_V16HImode:
49509 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49510 uns_p, OPTAB_DIRECT);
49511 t2 = expand_binop (mode,
49512 uns_p ? umul_highpart_optab : smul_highpart_optab,
49513 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49514 gcc_assert (t1 && t2);
49516 t3 = gen_reg_rtx (mode);
49517 ix86_expand_vec_interleave (t3, t1, t2, high_p);
49518 emit_move_insn (dest, gen_lowpart (wmode, t3));
49519 break;
49521 case E_V16QImode:
49522 case E_V32QImode:
49523 case E_V32HImode:
49524 case E_V16SImode:
49525 case E_V64QImode:
49526 t1 = gen_reg_rtx (wmode);
49527 t2 = gen_reg_rtx (wmode);
49528 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
49529 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
49531 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
49532 break;
49534 default:
49535 gcc_unreachable ();
49539 void
49540 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
49542 rtx res_1, res_2, res_3, res_4;
49544 res_1 = gen_reg_rtx (V4SImode);
49545 res_2 = gen_reg_rtx (V4SImode);
49546 res_3 = gen_reg_rtx (V2DImode);
49547 res_4 = gen_reg_rtx (V2DImode);
49548 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
49549 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
49551 /* Move the results in element 2 down to element 1; we don't care
49552 what goes in elements 2 and 3. Then we can merge the parts
49553 back together with an interleave.
49555 Note that two other sequences were tried:
49556 (1) Use interleaves at the start instead of psrldq, which allows
49557 us to use a single shufps to merge things back at the end.
49558 (2) Use shufps here to combine the two vectors, then pshufd to
49559 put the elements in the correct order.
49560 In both cases the cost of the reformatting stall was too high
49561 and the overall sequence slower. */
49563 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
49564 const0_rtx, const2_rtx,
49565 const0_rtx, const0_rtx));
49566 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
49567 const0_rtx, const2_rtx,
49568 const0_rtx, const0_rtx));
49569 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
49571 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
49574 void
49575 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
49577 machine_mode mode = GET_MODE (op0);
49578 rtx t1, t2, t3, t4, t5, t6;
49580 if (TARGET_AVX512DQ && mode == V8DImode)
49581 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
49582 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
49583 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
49584 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
49585 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
49586 else if (TARGET_XOP && mode == V2DImode)
49588 /* op1: A,B,C,D, op2: E,F,G,H */
49589 op1 = gen_lowpart (V4SImode, op1);
49590 op2 = gen_lowpart (V4SImode, op2);
49592 t1 = gen_reg_rtx (V4SImode);
49593 t2 = gen_reg_rtx (V4SImode);
49594 t3 = gen_reg_rtx (V2DImode);
49595 t4 = gen_reg_rtx (V2DImode);
49597 /* t1: B,A,D,C */
49598 emit_insn (gen_sse2_pshufd_1 (t1, op1,
49599 GEN_INT (1),
49600 GEN_INT (0),
49601 GEN_INT (3),
49602 GEN_INT (2)));
49604 /* t2: (B*E),(A*F),(D*G),(C*H) */
49605 emit_insn (gen_mulv4si3 (t2, t1, op2));
49607 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
49608 emit_insn (gen_xop_phadddq (t3, t2));
49610 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
49611 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
49613 /* Multiply the lower parts and add everything together. */
49614 t5 = gen_reg_rtx (V2DImode);
49615 emit_insn (gen_vec_widen_umult_even_v4si (t5,
49616 gen_lowpart (V4SImode, op1),
49617 gen_lowpart (V4SImode, op2)));
49618 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
49621 else
49623 machine_mode nmode;
49624 rtx (*umul) (rtx, rtx, rtx);
49626 if (mode == V2DImode)
49628 umul = gen_vec_widen_umult_even_v4si;
49629 nmode = V4SImode;
49631 else if (mode == V4DImode)
49633 umul = gen_vec_widen_umult_even_v8si;
49634 nmode = V8SImode;
49636 else if (mode == V8DImode)
49638 umul = gen_vec_widen_umult_even_v16si;
49639 nmode = V16SImode;
49641 else
49642 gcc_unreachable ();
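/* Schoolbook multiplication: A*B = lo(A)*lo(B)
+ ((hi(A)*lo(B) + hi(B)*lo(A)) << 32) (mod 2^64); the hi*hi term only
affects bits above the low 64 and is dropped.  */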
49645 /* Multiply low parts. */
49646 t1 = gen_reg_rtx (mode);
49647 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
49649 /* Shift input vectors right 32 bits so we can multiply high parts. */
49650 t6 = GEN_INT (32);
49651 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
49652 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
49654 /* Multiply high parts by low parts. */
49655 t4 = gen_reg_rtx (mode);
49656 t5 = gen_reg_rtx (mode);
49657 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
49658 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
49660 /* Combine and shift the highparts back. */
49661 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
49662 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
49664 /* Combine high and low parts. */
49665 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
49668 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49669 gen_rtx_MULT (mode, op1, op2));
49672 /* Return true if control transfer instruction INSN
49673 should be encoded with the bnd prefix.
49674 If INSN is NULL then return true when control
49675 transfer instructions should be prefixed with
49676 bnd by default for the current function. */
49678 bool
49679 ix86_bnd_prefixed_insn_p (rtx insn)
49681 /* For call insns check special flag. */
49682 if (insn && CALL_P (insn))
49684 rtx call = get_call_rtx_from (insn);
49685 if (call)
49686 return CALL_EXPR_WITH_BOUNDS_P (call);
49689 /* All other insns are prefixed only if function is instrumented. */
49690 return chkp_function_instrumented_p (current_function_decl);
49693 /* Return true if control transfer instruction INSN
49694 should be encoded with the notrack prefix. */
49696 static bool
49697 ix86_notrack_prefixed_insn_p (rtx insn)
49699 if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
49700 return false;
49702 if (CALL_P (insn))
49704 rtx call = get_call_rtx_from (insn);
49705 gcc_assert (call != NULL_RTX);
49706 rtx addr = XEXP (call, 0);
49708 /* Do not emit 'notrack' if it's not an indirect call. */
49709 if (MEM_P (addr)
49710 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49711 return false;
49712 else
49713 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
49716 if (JUMP_P (insn) && !flag_cet_switch)
49718 rtx target = JUMP_LABEL (insn);
49719 if (target == NULL_RTX || ANY_RETURN_P (target))
49720 return false;
49722 /* Check whether the jump is a switch-table jump. */
49723 rtx_insn *label = as_a<rtx_insn *> (target);
49724 rtx_insn *table = next_insn (label);
49725 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
49726 return false;
49727 else
49728 return true;
49730 return false;
49733 /* Calculate integer abs() using only SSE2 instructions. */
49735 void
49736 ix86_expand_sse2_abs (rtx target, rtx input)
49738 machine_mode mode = GET_MODE (target);
49739 rtx tmp0, tmp1, x;
49741 switch (mode)
49743 /* For 32-bit signed integer X, the best way to calculate the absolute
49744 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
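/* E.g. X = -5: X >> 31 = -1, and (-1 ^ -5) - (-1) = 4 + 1 = 5.  */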
49745 case E_V4SImode:
49746 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
49747 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
49748 NULL, 0, OPTAB_DIRECT);
49749 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
49750 NULL, 0, OPTAB_DIRECT);
49751 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
49752 target, 0, OPTAB_DIRECT);
49753 break;
49755 /* For 16-bit signed integer X, the best way to calculate the absolute
49756 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
49757 case E_V8HImode:
49758 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49760 x = expand_simple_binop (mode, SMAX, tmp0, input,
49761 target, 0, OPTAB_DIRECT);
49762 break;
49764 /* For 8-bit signed integer X, the best way to calculate the absolute
49765 value of X is min ((unsigned char) X, (unsigned char) (-X)),
49766 as SSE2 provides the PMINUB insn. */
49767 case E_V16QImode:
49768 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49770 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
49771 target, 0, OPTAB_DIRECT);
49772 break;
49774 default:
49775 gcc_unreachable ();
49778 if (x != target)
49779 emit_move_insn (target, x);
49782 /* Expand an extract from a vector register through pextr insn.
49783 Return true if successful. */
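/* E.g. extracting the 16 bits at bit position 32 of a TImode register is
done as a pextrw of element 2 of the corresponding V8HImode vector.  */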
49785 bool
49786 ix86_expand_pextr (rtx *operands)
49788 rtx dst = operands[0];
49789 rtx src = operands[1];
49791 unsigned int size = INTVAL (operands[2]);
49792 unsigned int pos = INTVAL (operands[3]);
49794 if (SUBREG_P (dst))
49796 /* Reject non-lowpart subregs. */
49797 if (SUBREG_BYTE (dst) > 0)
49798 return false;
49799 dst = SUBREG_REG (dst);
49802 if (SUBREG_P (src))
49804 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
49805 src = SUBREG_REG (src);
49808 switch (GET_MODE (src))
49810 case E_V16QImode:
49811 case E_V8HImode:
49812 case E_V4SImode:
49813 case E_V2DImode:
49814 case E_V1TImode:
49815 case E_TImode:
49817 machine_mode srcmode, dstmode;
49818 rtx d, pat;
49820 if (!int_mode_for_size (size, 0).exists (&dstmode))
49821 return false;
49823 switch (dstmode)
49825 case E_QImode:
49826 if (!TARGET_SSE4_1)
49827 return false;
49828 srcmode = V16QImode;
49829 break;
49831 case E_HImode:
49832 if (!TARGET_SSE2)
49833 return false;
49834 srcmode = V8HImode;
49835 break;
49837 case E_SImode:
49838 if (!TARGET_SSE4_1)
49839 return false;
49840 srcmode = V4SImode;
49841 break;
49843 case E_DImode:
49844 gcc_assert (TARGET_64BIT);
49845 if (!TARGET_SSE4_1)
49846 return false;
49847 srcmode = V2DImode;
49848 break;
49850 default:
49851 return false;
49854 /* Reject extractions from misaligned positions. */
49855 if (pos & (size-1))
49856 return false;
49858 if (GET_MODE (dst) == dstmode)
49859 d = dst;
49860 else
49861 d = gen_reg_rtx (dstmode);
49863 /* Construct insn pattern. */
49864 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
49865 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
49867 /* Let the rtl optimizers know about the zero extension performed. */
49868 if (dstmode == QImode || dstmode == HImode)
49870 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
49871 d = gen_lowpart (SImode, d);
49874 emit_insn (gen_rtx_SET (d, pat));
49876 if (d != dst)
49877 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49878 return true;
49881 default:
49882 return false;
49886 /* Expand an insert into a vector register through pinsr insn.
49887 Return true if successful. */
49889 bool
49890 ix86_expand_pinsr (rtx *operands)
49892 rtx dst = operands[0];
49893 rtx src = operands[3];
49895 unsigned int size = INTVAL (operands[1]);
49896 unsigned int pos = INTVAL (operands[2]);
49898 if (SUBREG_P (dst))
49900 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
49901 dst = SUBREG_REG (dst);
49904 switch (GET_MODE (dst))
49906 case E_V16QImode:
49907 case E_V8HImode:
49908 case E_V4SImode:
49909 case E_V2DImode:
49910 case E_V1TImode:
49911 case E_TImode:
49913 machine_mode srcmode, dstmode;
49914 rtx (*pinsr)(rtx, rtx, rtx, rtx);
49915 rtx d;
49917 if (!int_mode_for_size (size, 0).exists (&srcmode))
49918 return false;
49920 switch (srcmode)
49922 case E_QImode:
49923 if (!TARGET_SSE4_1)
49924 return false;
49925 dstmode = V16QImode;
49926 pinsr = gen_sse4_1_pinsrb;
49927 break;
49929 case E_HImode:
49930 if (!TARGET_SSE2)
49931 return false;
49932 dstmode = V8HImode;
49933 pinsr = gen_sse2_pinsrw;
49934 break;
49936 case E_SImode:
49937 if (!TARGET_SSE4_1)
49938 return false;
49939 dstmode = V4SImode;
49940 pinsr = gen_sse4_1_pinsrd;
49941 break;
49943 case E_DImode:
49944 gcc_assert (TARGET_64BIT);
49945 if (!TARGET_SSE4_1)
49946 return false;
49947 dstmode = V2DImode;
49948 pinsr = gen_sse4_1_pinsrq;
49949 break;
49951 default:
49952 return false;
49955 /* Reject insertions to misaligned positions. */
49956 if (pos & (size-1))
49957 return false;
49959 if (SUBREG_P (src))
49961 unsigned int srcpos = SUBREG_BYTE (src);
49963 if (srcpos > 0)
49965 rtx extr_ops[4];
49967 extr_ops[0] = gen_reg_rtx (srcmode);
49968 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
49969 extr_ops[2] = GEN_INT (size);
49970 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
49972 if (!ix86_expand_pextr (extr_ops))
49973 return false;
49975 src = extr_ops[0];
49977 else
49978 src = gen_lowpart (srcmode, SUBREG_REG (src));
49981 if (GET_MODE (dst) == dstmode)
49982 d = dst;
49983 else
49984 d = gen_reg_rtx (dstmode);
49986 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
49987 gen_lowpart (srcmode, src),
49988 GEN_INT (1 << (pos / size))));
49989 if (d != dst)
49990 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49991 return true;
49994 default:
49995 return false;
49999 /* This function returns the calling abi specific va_list type node.
50000 It returns the FNDECL specific va_list type. */
50002 static tree
50003 ix86_fn_abi_va_list (tree fndecl)
50005 if (!TARGET_64BIT)
50006 return va_list_type_node;
50007 gcc_assert (fndecl != NULL_TREE);
50009 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
50010 return ms_va_list_type_node;
50011 else
50012 return sysv_va_list_type_node;
50015 /* Returns the canonical va_list type specified by TYPE. If there
50016 is no valid TYPE provided, it returns NULL_TREE. */
50018 static tree
50019 ix86_canonical_va_list_type (tree type)
50021 if (TARGET_64BIT)
50023 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
50024 return ms_va_list_type_node;
50026 if ((TREE_CODE (type) == ARRAY_TYPE
50027 && integer_zerop (array_type_nelts (type)))
50028 || POINTER_TYPE_P (type))
50030 tree elem_type = TREE_TYPE (type);
50031 if (TREE_CODE (elem_type) == RECORD_TYPE
50032 && lookup_attribute ("sysv_abi va_list",
50033 TYPE_ATTRIBUTES (elem_type)))
50034 return sysv_va_list_type_node;
50037 return NULL_TREE;
50040 return std_canonical_va_list_type (type);
50043 /* Iterate through the target-specific builtin types for va_list.
50044 IDX denotes the iterator, *PTREE is set to the result type of
50045 the va_list builtin, and *PNAME to its internal type.
50046 Returns zero if there is no element for this index, otherwise
50047 IDX should be increased upon the next call.
50048 Note, do not iterate a base builtin's name like __builtin_va_list.
50049 Used from c_common_nodes_and_builtins. */
50051 static int
50052 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
50054 if (TARGET_64BIT)
50056 switch (idx)
50058 default:
50059 break;
50061 case 0:
50062 *ptree = ms_va_list_type_node;
50063 *pname = "__builtin_ms_va_list";
50064 return 1;
50066 case 1:
50067 *ptree = sysv_va_list_type_node;
50068 *pname = "__builtin_sysv_va_list";
50069 return 1;
50073 return 0;
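
As a hedged usage sketch of the two builtin va_list flavors registered above, relying on the documented ms_abi attribute and __builtin_ms_va_* builtins (the function name is illustrative):

int __attribute__ ((ms_abi))
sum_ms (int n, ...)
{
  __builtin_ms_va_list ap;
  int i, s = 0;

  __builtin_ms_va_start (ap, n);
  for (i = 0; i < n; i++)
    s += __builtin_va_arg (ap, int);
  __builtin_ms_va_end (ap);
  return s;
}
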
50076 #undef TARGET_SCHED_DISPATCH
50077 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
50078 #undef TARGET_SCHED_DISPATCH_DO
50079 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
50080 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50081 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50082 #undef TARGET_SCHED_REORDER
50083 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
50084 #undef TARGET_SCHED_ADJUST_PRIORITY
50085 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50086 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50087 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50088 ix86_dependencies_evaluation_hook
50091 /* Implementation of reassociation_width target hook used by
50092 reassoc phase to identify parallelism level in reassociated
50093 tree. The statement's tree_code is passed in OP; the mode of its
50094 arguments is passed in MODE. */
50096 static int
50097 ix86_reassociation_width (unsigned int op, machine_mode mode)
50099 int width = 1;
50100 /* Vector part. */
50101 if (VECTOR_MODE_P (mode))
50103 int div = 1;
50104 if (INTEGRAL_MODE_P (mode))
50105 width = ix86_cost->reassoc_vec_int;
50106 else if (FLOAT_MODE_P (mode))
50107 width = ix86_cost->reassoc_vec_fp;
50109 if (width == 1)
50110 return 1;
50112 /* Integer vector instructions execute in FP unit
50113 and can execute 3 additions and one multiplication per cycle. */
50114 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
50115 && op != PLUS && op != MINUS)
50116 return 1;
50118 /* Account for targets that split wide vectors into multiple parts. */
50119 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
50120 div = GET_MODE_BITSIZE (mode) / 128;
50121 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
50122 div = GET_MODE_BITSIZE (mode) / 64;
50123 width = (width + div - 1) / div;
50125 /* Scalar part. */
50126 else if (INTEGRAL_MODE_P (mode))
50127 width = ix86_cost->reassoc_int;
50128 else if (FLOAT_MODE_P (mode))
50129 width = ix86_cost->reassoc_fp;
50131 /* Avoid using too many registers in 32bit mode. */
50132 if (!TARGET_64BIT && width > 2)
50133 width = 2;
50134 return width;
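
A worked instance of the scaling above, with the cost-table value assumed purely for illustration: if reassoc_vec_fp were 4 and the active tuning sets TARGET_AVX128_OPTIMAL, then for a 256-bit FP vector mode div = 256 / 128 = 2 and the hook returns (4 + 2 - 1) / 2 = 2; on a 64-bit target without that tuning the same input would return 4.
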
50137 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
50138 place emms and femms instructions. */
50140 static machine_mode
50141 ix86_preferred_simd_mode (scalar_mode mode)
50143 if (!TARGET_SSE)
50144 return word_mode;
50146 switch (mode)
50148 case E_QImode:
50149 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
50150 return V64QImode;
50151 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50152 return V32QImode;
50153 else
50154 return V16QImode;
50156 case E_HImode:
50157 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
50158 return V32HImode;
50159 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50160 return V16HImode;
50161 else
50162 return V8HImode;
50164 case E_SImode:
50165 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50166 return V16SImode;
50167 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50168 return V8SImode;
50169 else
50170 return V4SImode;
50172 case E_DImode:
50173 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50174 return V8DImode;
50175 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50176 return V4DImode;
50177 else
50178 return V2DImode;
50180 case E_SFmode:
50181 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50182 return V16SFmode;
50183 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50184 return V8SFmode;
50185 else
50186 return V4SFmode;
50188 case E_DFmode:
50189 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50190 return V8DFmode;
50191 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50192 return V4DFmode;
50193 else if (TARGET_SSE2)
50194 return V2DFmode;
50195 /* FALLTHRU */
50197 default:
50198 return word_mode;
50202 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
50203 of upper against lower halves up to SSE reg size. */
50205 static machine_mode
50206 ix86_split_reduction (machine_mode mode)
50208 /* Reduce lowpart against highpart until we reach SSE reg width to
50209 avoid cross-lane operations. */
50210 switch (mode)
50212 case E_V8DImode:
50213 case E_V4DImode:
50214 return V2DImode;
50215 case E_V16SImode:
50216 case E_V8SImode:
50217 return V4SImode;
50218 case E_V32HImode:
50219 case E_V16HImode:
50220 return V8HImode;
50221 case E_V64QImode:
50222 case E_V32QImode:
50223 return V16QImode;
50224 case E_V16SFmode:
50225 case E_V8SFmode:
50226 return V4SFmode;
50227 case E_V8DFmode:
50228 case E_V4DFmode:
50229 return V2DFmode;
50230 default:
50231 return mode;
50235 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
50236 vectors. If AVX512F is enabled then try vectorizing with 512bit,
50237 256bit and 128bit vectors. */
50239 static void
50240 ix86_autovectorize_vector_sizes (vector_sizes *sizes)
50242 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50244 sizes->safe_push (64);
50245 sizes->safe_push (32);
50246 sizes->safe_push (16);
50248 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50250 sizes->safe_push (32);
50251 sizes->safe_push (16);
50255 /* Implementation of targetm.vectorize.get_mask_mode. */
50257 static opt_machine_mode
50258 ix86_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size)
50260 unsigned elem_size = vector_size / nunits;
50262 /* Scalar mask case. */
50263 if ((TARGET_AVX512F && vector_size == 64)
50264 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
50266 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
50267 return smallest_int_mode_for_size (nunits);
50270 scalar_int_mode elem_mode
50271 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
50273 gcc_assert (elem_size * nunits == vector_size);
50275 return mode_for_vector (elem_mode, nunits);
50280 /* Return class of registers which could be used for pseudo of MODE
50281 and of class RCLASS for spilling instead of memory. Return NO_REGS
50282 if it is not possible or non-profitable. */
50284 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50286 static reg_class_t
50287 ix86_spill_class (reg_class_t rclass, machine_mode mode)
50289 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
50290 && TARGET_SSE2
50291 && TARGET_INTER_UNIT_MOVES_TO_VEC
50292 && TARGET_INTER_UNIT_MOVES_FROM_VEC
50293 && (mode == SImode || (TARGET_64BIT && mode == DImode))
50294 && INTEGER_CLASS_P (rclass))
50295 return ALL_SSE_REGS;
50296 return NO_REGS;
50299 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
50300 but returns a lower bound. */
50302 static unsigned int
50303 ix86_max_noce_ifcvt_seq_cost (edge e)
50305 bool predictable_p = predictable_edge_p (e);
50307 enum compiler_param param
50308 = (predictable_p
50309 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
50310 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
50312 /* If we have a parameter set, use that, otherwise take a guess using
50313 BRANCH_COST. */
50314 if (global_options_set.x_param_values[param])
50315 return PARAM_VALUE (param);
50316 else
50317 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
50320 /* Return true if SEQ is a good candidate as a replacement for the
50321 if-convertible sequence described in IF_INFO. */
50323 static bool
50324 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
50326 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
50328 int cmov_cnt = 0;
50329 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
50330 Maybe we should allow even more conditional moves as long as they
50331 are used far enough not to stall the CPU, or also consider
50332 IF_INFO->TEST_BB succ edge probabilities. */
50333 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
50335 rtx set = single_set (insn);
50336 if (!set)
50337 continue;
50338 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
50339 continue;
50340 rtx src = SET_SRC (set);
50341 machine_mode mode = GET_MODE (src);
50342 if (GET_MODE_CLASS (mode) != MODE_INT
50343 && GET_MODE_CLASS (mode) != MODE_FLOAT)
50344 continue;
50345 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
50346 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
50347 continue;
50348 /* insn is CMOV or FCMOV. */
50349 if (++cmov_cnt > 1)
50350 return false;
50353 return default_noce_conversion_profitable_p (seq, if_info);
50356 /* Implement targetm.vectorize.init_cost. */
50358 static void *
50359 ix86_init_cost (struct loop *)
50361 unsigned *cost = XNEWVEC (unsigned, 3);
50362 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
50363 return cost;
50366 /* Implement targetm.vectorize.add_stmt_cost. */
50368 static unsigned
50369 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
50370 struct _stmt_vec_info *stmt_info, int misalign,
50371 enum vect_cost_model_location where)
50373 unsigned *cost = (unsigned *) data;
50374 unsigned retval = 0;
50376 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
50377 int stmt_cost = -1;
50379 if ((kind == vector_stmt || kind == scalar_stmt)
50380 && stmt_info
50381 && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
50383 tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
50384 bool fp = false;
50385 machine_mode mode = TImode;
50387 if (vectype != NULL)
50389 fp = FLOAT_TYPE_P (vectype);
50390 mode = TYPE_MODE (vectype);
50392 /*machine_mode inner_mode = mode;
50393 if (VECTOR_MODE_P (mode))
50394 inner_mode = GET_MODE_INNER (mode);*/
50396 switch (subcode)
50398 case PLUS_EXPR:
50399 case POINTER_PLUS_EXPR:
50400 case MINUS_EXPR:
50401 if (kind == scalar_stmt)
50403 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50404 stmt_cost = ix86_cost->addss;
50405 else if (X87_FLOAT_MODE_P (mode))
50406 stmt_cost = ix86_cost->fadd;
50407 else
50408 stmt_cost = ix86_cost->add;
50410 else
50411 stmt_cost = ix86_vec_cost (mode,
50412 fp ? ix86_cost->addss
50413 : ix86_cost->sse_op,
50414 true);
50415 break;
50417 case MULT_EXPR:
50418 case WIDEN_MULT_EXPR:
50419 case MULT_HIGHPART_EXPR:
50420 stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
50421 break;
50422 case FMA_EXPR:
50423 stmt_cost = ix86_vec_cost (mode,
50424 mode == SFmode ? ix86_cost->fmass
50425 : ix86_cost->fmasd,
50426 true);
50427 break;
50428 case NEGATE_EXPR:
50429 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50430 stmt_cost = ix86_cost->sse_op;
50431 else if (X87_FLOAT_MODE_P (mode))
50432 stmt_cost = ix86_cost->fchs;
50433 else if (VECTOR_MODE_P (mode))
50434 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50435 else
50436 stmt_cost = ix86_cost->add;
50437 break;
50438 case TRUNC_DIV_EXPR:
50439 case CEIL_DIV_EXPR:
50440 case FLOOR_DIV_EXPR:
50441 case ROUND_DIV_EXPR:
50442 case TRUNC_MOD_EXPR:
50443 case CEIL_MOD_EXPR:
50444 case FLOOR_MOD_EXPR:
50445 case RDIV_EXPR:
50446 case ROUND_MOD_EXPR:
50447 case EXACT_DIV_EXPR:
50448 stmt_cost = ix86_division_cost (ix86_cost, mode);
50449 break;
50451 case RSHIFT_EXPR:
50452 case LSHIFT_EXPR:
50453 case LROTATE_EXPR:
50454 case RROTATE_EXPR:
50456 tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
50457 stmt_cost = ix86_shift_rotate_cost
50458 (ix86_cost, mode,
50459 TREE_CODE (op2) == INTEGER_CST,
50460 cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
50461 true, false, false, NULL, NULL);
50463 break;
50464 case NOP_EXPR:
50465 stmt_cost = 0;
50466 break;
50468 case BIT_IOR_EXPR:
50469 case ABS_EXPR:
50470 case MIN_EXPR:
50471 case MAX_EXPR:
50472 case BIT_XOR_EXPR:
50473 case BIT_AND_EXPR:
50474 case BIT_NOT_EXPR:
50475 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50476 stmt_cost = ix86_cost->sse_op;
50477 else if (VECTOR_MODE_P (mode))
50478 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50479 else
50480 stmt_cost = ix86_cost->add;
50481 break;
50482 default:
50483 break;
50486 /* If we do elementwise loads into a vector then we are bound by
50487 latency and execution resources for the many scalar loads
50488 (AGU and load ports). Try to account for this by scaling the
50489 construction cost by the number of elements involved. */
50490 if (kind == vec_construct
50491 && stmt_info
50492 && stmt_info->type == load_vec_info_type
50493 && stmt_info->memory_access_type == VMAT_ELEMENTWISE)
50495 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50496 stmt_cost *= TYPE_VECTOR_SUBPARTS (vectype);
50498 if (stmt_cost == -1)
50499 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50501 /* Penalize DFmode vector operations for Bonnell. */
50502 if (TARGET_BONNELL && kind == vector_stmt
50503 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
50504 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
50506 /* Statements in an inner loop relative to the loop being
50507 vectorized are weighted more heavily. The value here is
50508 arbitrary and could potentially be improved with analysis. */
50509 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
50510 count *= 50; /* FIXME. */
50512 retval = (unsigned) (count * stmt_cost);
50514 /* We need to multiply all vector stmt cost by 1.7 (estimated cost)
50515 for Silvermont as it has an out-of-order integer pipeline and can execute
50516 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
50517 if ((TARGET_SILVERMONT || TARGET_INTEL)
50518 && stmt_info && stmt_info->stmt)
50520 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
50521 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
50522 retval = (retval * 17) / 10;
50525 cost[where] += retval;
50527 return retval;
50530 /* Implement targetm.vectorize.finish_cost. */
50532 static void
50533 ix86_finish_cost (void *data, unsigned *prologue_cost,
50534 unsigned *body_cost, unsigned *epilogue_cost)
50536 unsigned *cost = (unsigned *) data;
50537 *prologue_cost = cost[vect_prologue];
50538 *body_cost = cost[vect_body];
50539 *epilogue_cost = cost[vect_epilogue];
50542 /* Implement targetm.vectorize.destroy_cost_data. */
50544 static void
50545 ix86_destroy_cost_data (void *data)
50547 free (data);
50550 /* Validate target specific memory model bits in VAL. */
50552 static unsigned HOST_WIDE_INT
50553 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
50555 enum memmodel model = memmodel_from_int (val);
50556 bool strong;
50558 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
50559 |MEMMODEL_MASK)
50560 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
50562 warning (OPT_Winvalid_memory_model,
50563 "unknown architecture specific memory model");
50564 return MEMMODEL_SEQ_CST;
50566 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
50567 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
50569 warning (OPT_Winvalid_memory_model,
50570 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
50571 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
50573 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
50575 warning (OPT_Winvalid_memory_model,
50576 "HLE_RELEASE not used with RELEASE or stronger memory model");
50577 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
50579 return val;
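
A hedged user-level sketch of the HLE bits being validated above, following the documented __ATOMIC_HLE_* macros and __atomic builtins (requires an option that enables HLE, e.g. -mhle):

static int lockvar;

void
hle_acquire_lock (void)
{
  /* Elided lock acquire: ACQUIRE model combined with the HLE_ACQUIRE bit.  */
  while (__atomic_exchange_n (&lockvar, 1,
			      __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
    ;
}

void
hle_release_lock (void)
{
  /* Elided lock release: RELEASE model combined with the HLE_RELEASE bit.  */
  __atomic_store_n (&lockvar, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
}
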
50582 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
50583 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
50584 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
50585 or number of vecsize_mangle variants that should be emitted. */
50587 static int
50588 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
50589 struct cgraph_simd_clone *clonei,
50590 tree base_type, int num)
50592 int ret = 1;
50594 if (clonei->simdlen
50595 && (clonei->simdlen < 2
50596 || clonei->simdlen > 1024
50597 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
50599 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50600 "unsupported simdlen %d", clonei->simdlen);
50601 return 0;
50604 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
50605 if (TREE_CODE (ret_type) != VOID_TYPE)
50606 switch (TYPE_MODE (ret_type))
50608 case E_QImode:
50609 case E_HImode:
50610 case E_SImode:
50611 case E_DImode:
50612 case E_SFmode:
50613 case E_DFmode:
50614 /* case E_SCmode: */
50615 /* case E_DCmode: */
50616 break;
50617 default:
50618 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50619 "unsupported return type %qT for simd", ret_type);
50620 return 0;
50623 tree t;
50624 int i;
50626 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
50627 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
50628 switch (TYPE_MODE (TREE_TYPE (t)))
50630 case E_QImode:
50631 case E_HImode:
50632 case E_SImode:
50633 case E_DImode:
50634 case E_SFmode:
50635 case E_DFmode:
50636 /* case E_SCmode: */
50637 /* case E_DCmode: */
50638 break;
50639 default:
50640 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50641 "unsupported argument type %qT for simd", TREE_TYPE (t));
50642 return 0;
50645 if (!TREE_PUBLIC (node->decl))
50647 /* If the function isn't exported, we can pick up just one ISA
50648 for the clones. */
50649 if (TARGET_AVX512F)
50650 clonei->vecsize_mangle = 'e';
50651 else if (TARGET_AVX2)
50652 clonei->vecsize_mangle = 'd';
50653 else if (TARGET_AVX)
50654 clonei->vecsize_mangle = 'c';
50655 else
50656 clonei->vecsize_mangle = 'b';
50657 ret = 1;
50659 else
50661 clonei->vecsize_mangle = "bcde"[num];
50662 ret = 4;
50664 clonei->mask_mode = VOIDmode;
50665 switch (clonei->vecsize_mangle)
50667 case 'b':
50668 clonei->vecsize_int = 128;
50669 clonei->vecsize_float = 128;
50670 break;
50671 case 'c':
50672 clonei->vecsize_int = 128;
50673 clonei->vecsize_float = 256;
50674 break;
50675 case 'd':
50676 clonei->vecsize_int = 256;
50677 clonei->vecsize_float = 256;
50678 break;
50679 case 'e':
50680 clonei->vecsize_int = 512;
50681 clonei->vecsize_float = 512;
50682 if (TYPE_MODE (base_type) == QImode)
50683 clonei->mask_mode = DImode;
50684 else
50685 clonei->mask_mode = SImode;
50686 break;
50688 if (clonei->simdlen == 0)
50690 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
50691 clonei->simdlen = clonei->vecsize_int;
50692 else
50693 clonei->simdlen = clonei->vecsize_float;
50694 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
50696 else if (clonei->simdlen > 16)
50698 /* For compatibility with ICC, use the same upper bounds
50699 for simdlen. In particular, for CTYPE below, use the return type,
50700 unless the function returns void, in which case use the characteristic
50701 type. If it is possible for the given SIMDLEN to pass the CTYPE value
50702 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
50703 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
50704 emit corresponding clone. */
50705 tree ctype = ret_type;
50706 if (TREE_CODE (ret_type) == VOID_TYPE)
50707 ctype = base_type;
50708 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
50709 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
50710 cnt /= clonei->vecsize_int;
50711 else
50712 cnt /= clonei->vecsize_float;
50713 if (cnt > (TARGET_64BIT ? 16 : 8))
50715 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50716 "unsupported simdlen %d", clonei->simdlen);
50717 return 0;
50720 return ret;
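
As a hedged sketch of what drives this hook: an OpenMP declare-simd function (compiled with -fopenmp or -fopenmp-simd) gets one clone per vecsize_mangle letter when it is exported, and the letter appears in the vector-ABI symbol name. Exact simdlen values and symbol names depend on the types involved; those in the comment below follow the computation above for an int argument and return value, and are illustrative.

/* Expected clones (illustrative): _ZGVbN4_addone, _ZGVcN4_addone,
   _ZGVdN8_addone and _ZGVeN16_addone, i.e. the 'b'/'c'/'d'/'e' variants.  */
#pragma omp declare simd
int
addone (int x)
{
  return x + 1;
}
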
50723 /* Add target attribute to SIMD clone NODE if needed. */
50725 static void
50726 ix86_simd_clone_adjust (struct cgraph_node *node)
50728 const char *str = NULL;
50729 gcc_assert (node->decl == cfun->decl);
50730 switch (node->simdclone->vecsize_mangle)
50732 case 'b':
50733 if (!TARGET_SSE2)
50734 str = "sse2";
50735 break;
50736 case 'c':
50737 if (!TARGET_AVX)
50738 str = "avx";
50739 break;
50740 case 'd':
50741 if (!TARGET_AVX2)
50742 str = "avx2";
50743 break;
50744 case 'e':
50745 if (!TARGET_AVX512F)
50746 str = "avx512f";
50747 break;
50748 default:
50749 gcc_unreachable ();
50751 if (str == NULL)
50752 return;
50753 push_cfun (NULL);
50754 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
50755 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
50756 gcc_assert (ok);
50757 pop_cfun ();
50758 ix86_reset_previous_fndecl ();
50759 ix86_set_current_function (node->decl);
50762 /* If SIMD clone NODE can't be used in a vectorized loop
50763 in current function, return -1, otherwise return a badness of using it
50764 (0 if it is most desirable from vecsize_mangle point of view, 1
50765 slightly less desirable, etc.). */
50767 static int
50768 ix86_simd_clone_usable (struct cgraph_node *node)
50770 switch (node->simdclone->vecsize_mangle)
50772 case 'b':
50773 if (!TARGET_SSE2)
50774 return -1;
50775 if (!TARGET_AVX)
50776 return 0;
50777 return TARGET_AVX2 ? 2 : 1;
50778 case 'c':
50779 if (!TARGET_AVX)
50780 return -1;
50781 return TARGET_AVX2 ? 1 : 0;
50782 case 'd':
50783 if (!TARGET_AVX2)
50784 return -1;
50785 return 0;
50786 case 'e':
50787 if (!TARGET_AVX512F)
50788 return -1;
50789 return 0;
50790 default:
50791 gcc_unreachable ();
50795 /* This function adjusts the unroll factor based on
50796 the hardware capabilities. For example, bdver3 has
50797 a loop buffer which makes unrolling of smaller
50798 loops less important. This function decides the
50799 unroll factor using the number of memory references
50800 (value 32 is used) as a heuristic. */
50802 static unsigned
50803 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
50805 basic_block *bbs;
50806 rtx_insn *insn;
50807 unsigned i;
50808 unsigned mem_count = 0;
50810 if (!TARGET_ADJUST_UNROLL)
50811 return nunroll;
50813 /* Count the number of memory references within the loop body.
50814 This value determines the unrolling factor for bdver3 and bdver4
50815 architectures. */
50816 subrtx_iterator::array_type array;
50817 bbs = get_loop_body (loop);
50818 for (i = 0; i < loop->num_nodes; i++)
50819 FOR_BB_INSNS (bbs[i], insn)
50820 if (NONDEBUG_INSN_P (insn))
50821 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
50822 if (const_rtx x = *iter)
50823 if (MEM_P (x))
50825 machine_mode mode = GET_MODE (x);
50826 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50827 if (n_words > 4)
50828 mem_count += 2;
50829 else
50830 mem_count += 1;
50832 free (bbs);
50834 if (mem_count && mem_count <= 32)
50835 return MIN (nunroll, 32 / mem_count);
50837 return nunroll;
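
A worked instance of the heuristic above: a bdver3/bdver4-tuned loop body with five single-word memory references gives mem_count = 5, so the unroll factor is capped at MIN (nunroll, 32 / 5) = MIN (nunroll, 6); a body with more than 32 counted references leaves nunroll unchanged.
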
50841 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50843 static bool
50844 ix86_float_exceptions_rounding_supported_p (void)
50846 /* For x87 floating point with standard excess precision handling,
50847 there is no adddf3 pattern (since x87 floating point only has
50848 XFmode operations) so the default hook implementation gets this
50849 wrong. */
50850 return TARGET_80387 || TARGET_SSE_MATH;
50853 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
50855 static void
50856 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50858 if (!TARGET_80387 && !TARGET_SSE_MATH)
50859 return;
50860 tree exceptions_var = create_tmp_var_raw (integer_type_node);
50861 if (TARGET_80387)
50863 tree fenv_index_type = build_index_type (size_int (6));
50864 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
50865 tree fenv_var = create_tmp_var_raw (fenv_type);
50866 TREE_ADDRESSABLE (fenv_var) = 1;
50867 tree fenv_ptr = build_pointer_type (fenv_type);
50868 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50869 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50870 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50871 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50872 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50873 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50874 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50875 tree hold_fnclex = build_call_expr (fnclex, 0);
50876 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50877 NULL_TREE, NULL_TREE);
50878 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50879 hold_fnclex);
50880 *clear = build_call_expr (fnclex, 0);
50881 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50882 tree fnstsw_call = build_call_expr (fnstsw, 0);
50883 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50884 sw_var, fnstsw_call);
50885 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50886 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50887 exceptions_var, exceptions_x87);
50888 *update = build2 (COMPOUND_EXPR, integer_type_node,
50889 sw_mod, update_mod);
50890 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50891 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50893 if (TARGET_SSE_MATH)
50895 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50896 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50897 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50898 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50899 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50900 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50901 mxcsr_orig_var, stmxcsr_hold_call);
50902 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50903 mxcsr_orig_var,
50904 build_int_cst (unsigned_type_node, 0x1f80));
50905 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50906 build_int_cst (unsigned_type_node, 0xffffffc0));
50907 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50908 mxcsr_mod_var, hold_mod_val);
50909 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50910 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
50911 hold_assign_orig, hold_assign_mod);
50912 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
50913 ldmxcsr_hold_call);
50914 if (*hold)
50915 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
50916 else
50917 *hold = hold_all;
50918 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50919 if (*clear)
50920 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
50921 ldmxcsr_clear_call);
50922 else
50923 *clear = ldmxcsr_clear_call;
50924 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
50925 tree exceptions_sse = fold_convert (integer_type_node,
50926 stxmcsr_update_call);
50927 if (*update)
50929 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
50930 exceptions_var, exceptions_sse);
50931 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
50932 exceptions_var, exceptions_mod);
50933 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
50934 exceptions_assign);
50936 else
50937 *update = build2 (MODIFY_EXPR, integer_type_node,
50938 exceptions_var, exceptions_sse);
50939 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
50940 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50941 ldmxcsr_update_call);
50943 tree atomic_feraiseexcept
50944 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
50945 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
50946 1, exceptions_var);
50947 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50948 atomic_feraiseexcept_call);
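
The hold/clear/update sequences constructed above implement the floating-point environment bookkeeping around a C11 atomic compound assignment; a minimal, hedged source-level trigger (names illustrative) looks like:

_Atomic float running_total;

void
add_sample (float x)
{
  /* Expanded via a compare-exchange loop; the fenv hold/clear/update
     trees keep FP exception flags consistent across retries.  */
  running_total += x;
}
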
50951 /* Return mode to be used for bounds or VOIDmode
50952 if bounds are not supported. */
50954 static machine_mode
50955 ix86_mpx_bound_mode ()
50957 /* Do not support pointer checker if MPX
50958 is not enabled. */
50959 if (!TARGET_MPX)
50961 if (flag_check_pointer_bounds)
50962 warning (0, "Pointer Checker requires MPX support on this target."
50963 " Use -mmpx options to enable MPX.");
50964 return VOIDmode;
50967 return BNDmode;
50970 /* Return constant used to statically initialize constant bounds.
50972 This function is used to create special bound values. For now
50973 only INIT bounds and NONE bounds are expected. More special
50974 values may be added later. */
50976 static tree
50977 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
50979 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
50980 : build_zero_cst (pointer_sized_int_node);
50981 tree high = ub ? build_zero_cst (pointer_sized_int_node)
50982 : build_minus_one_cst (pointer_sized_int_node);
50984 /* This function is supposed to be used to create INIT and
50985 NONE bounds only. */
50986 gcc_assert ((lb == 0 && ub == -1)
50987 || (lb == -1 && ub == 0));
50989 return build_complex (NULL, low, high);
50992 /* Generate a list of statements STMTS to initialize pointer bounds
50993 variable VAR with bounds LB and UB. Return the number of generated
50994 statements. */
50996 static int
50997 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
50999 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
51000 tree lhs, modify, var_p;
51002 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
51003 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
51005 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
51006 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
51007 append_to_statement_list (modify, stmts);
51009 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
51010 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
51011 TYPE_SIZE_UNIT (pointer_sized_int_node)));
51012 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
51013 append_to_statement_list (modify, stmts);
51015 return 2;
51018 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
51019 /* For i386, a common symbol is local only for non-PIE binaries. For
51020 x86-64, a common symbol is local only for non-PIE binaries or when the
51021 linker supports copy relocations in PIE binaries. */
51023 static bool
51024 ix86_binds_local_p (const_tree exp)
51026 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
51027 (!flag_pic
51028 || (TARGET_64BIT
51029 && HAVE_LD_PIE_COPYRELOC != 0)));
51031 #endif
51033 /* If MEM is in the form of [base+offset], extract the two parts
51034 of address and set to BASE and OFFSET, otherwise return false. */
51036 static bool
51037 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
51039 rtx addr;
51041 gcc_assert (MEM_P (mem));
51043 addr = XEXP (mem, 0);
51045 if (GET_CODE (addr) == CONST)
51046 addr = XEXP (addr, 0);
51048 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
51050 *base = addr;
51051 *offset = const0_rtx;
51052 return true;
51055 if (GET_CODE (addr) == PLUS
51056 && (REG_P (XEXP (addr, 0))
51057 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
51058 && CONST_INT_P (XEXP (addr, 1)))
51060 *base = XEXP (addr, 0);
51061 *offset = XEXP (addr, 1);
51062 return true;
51065 return false;
51068 /* Given OPERANDS of consecutive load/store, check if we can merge
51069 them into move multiple. LOAD is true if they are load instructions.
51070 MODE is the mode of memory operands. */
51072 bool
51073 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
51074 machine_mode mode)
51076 HOST_WIDE_INT offval_1, offval_2, msize;
51077 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
51079 if (load)
51081 mem_1 = operands[1];
51082 mem_2 = operands[3];
51083 reg_1 = operands[0];
51084 reg_2 = operands[2];
51086 else
51088 mem_1 = operands[0];
51089 mem_2 = operands[2];
51090 reg_1 = operands[1];
51091 reg_2 = operands[3];
51094 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
51096 if (REGNO (reg_1) != REGNO (reg_2))
51097 return false;
51099 /* Check if the addresses are in the form of [base+offset]. */
51100 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
51101 return false;
51102 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
51103 return false;
51105 /* Check if the bases are the same. */
51106 if (!rtx_equal_p (base_1, base_2))
51107 return false;
51109 offval_1 = INTVAL (offset_1);
51110 offval_2 = INTVAL (offset_2);
51111 msize = GET_MODE_SIZE (mode);
51112 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
51113 if (offval_1 + msize != offval_2)
51114 return false;
51116 return true;
51119 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
51121 static bool
51122 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
51123 optimization_type opt_type)
51125 switch (op)
51127 case asin_optab:
51128 case acos_optab:
51129 case log1p_optab:
51130 case exp_optab:
51131 case exp10_optab:
51132 case exp2_optab:
51133 case expm1_optab:
51134 case ldexp_optab:
51135 case scalb_optab:
51136 case round_optab:
51137 return opt_type == OPTIMIZE_FOR_SPEED;
51139 case rint_optab:
51140 if (SSE_FLOAT_MODE_P (mode1)
51141 && TARGET_SSE_MATH
51142 && !flag_trapping_math
51143 && !TARGET_SSE4_1)
51144 return opt_type == OPTIMIZE_FOR_SPEED;
51145 return true;
51147 case floor_optab:
51148 case ceil_optab:
51149 case btrunc_optab:
51150 if (SSE_FLOAT_MODE_P (mode1)
51151 && TARGET_SSE_MATH
51152 && !flag_trapping_math
51153 && TARGET_SSE4_1)
51154 return true;
51155 return opt_type == OPTIMIZE_FOR_SPEED;
51157 case rsqrt_optab:
51158 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
51160 default:
51161 return true;
51165 /* Address space support.
51167 This is not "far pointers" in the 16-bit sense, but an easy way
51168 to use %fs and %gs segment prefixes. Therefore:
51170 (a) All address spaces have the same modes,
51171 (b) All address spaces have the same address forms,
51172 (c) While %fs and %gs are technically subsets of the generic
51173 address space, they are probably not subsets of each other.
51174 (d) Since we have no access to the segment base register values
51175 without resorting to a system call, we cannot convert a
51176 non-default address space to a default address space.
51177 Therefore we do not claim %fs or %gs are subsets of generic.
51179 Therefore we can (mostly) use the default hooks. */
51181 /* All use of segmentation is assumed to make address 0 valid. */
51183 static bool
51184 ix86_addr_space_zero_address_valid (addr_space_t as)
51186 return as != ADDR_SPACE_GENERIC;
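
A hedged source-level view of the %fs/%gs address spaces described above, using the documented __seg_gs qualifier (struct and function names are illustrative):

typedef struct { int counter; } percpu_t;

int
read_percpu_counter (const __seg_gs percpu_t *p)
{
  /* The dereference is emitted with a %gs segment prefix; offset 0 from
     the segment base is a legitimate address in this space.  */
  return p->counter;
}
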
51189 static void
51190 ix86_init_libfuncs (void)
51192 if (TARGET_64BIT)
51194 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
51195 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
51197 else
51199 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
51200 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
51203 #if TARGET_MACHO
51204 darwin_rename_builtins ();
51205 #endif
51208 /* Generate call to __divmoddi4. */
51210 static void
51211 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
51212 rtx op0, rtx op1,
51213 rtx *quot_p, rtx *rem_p)
51215 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
51217 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
51218 mode,
51219 op0, GET_MODE (op0),
51220 op1, GET_MODE (op1),
51221 XEXP (rem, 0), Pmode);
51222 *quot_p = quot;
51223 *rem_p = rem;
51226 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
51227 FPU, assume that the fpcw is set to extended precision; when using
51228 only SSE, rounding is correct; when using both SSE and the FPU,
51229 the rounding precision is indeterminate, since either may be chosen
51230 apparently at random. */
51232 static enum flt_eval_method
51233 ix86_excess_precision (enum excess_precision_type type)
51235 switch (type)
51237 case EXCESS_PRECISION_TYPE_FAST:
51238 /* The fastest type to promote to will always be the native type,
51239 whether that occurs with implicit excess precision or
51240 otherwise. */
51241 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51242 case EXCESS_PRECISION_TYPE_STANDARD:
51243 case EXCESS_PRECISION_TYPE_IMPLICIT:
51244 /* Otherwise, the excess precision we want when we are
51245 in a standards compliant mode, and the implicit precision we
51246 provide would be identical were it not for the unpredictable
51247 cases. */
51248 if (!TARGET_80387)
51249 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51250 else if (!TARGET_MIX_SSE_I387)
51252 if (!TARGET_SSE_MATH)
51253 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
51254 else if (TARGET_SSE2)
51255 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51258 /* If we are in standards compliant mode, but we know we will
51259 calculate in unpredictable precision, return
51260 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
51261 excess precision if the target can't guarantee it will honor
51262 it. */
51263 return (type == EXCESS_PRECISION_TYPE_STANDARD
51264 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
51265 : FLT_EVAL_METHOD_UNPREDICTABLE);
51266 default:
51267 gcc_unreachable ();
51270 return FLT_EVAL_METHOD_UNPREDICTABLE;
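
As a hedged illustration of the promotion this hook reports: when only the x87 FPU is used for floating-point math (e.g. -mfpmath=387 with standard excess precision), the intermediate result below is kept in long double and only rounded to float at the return:

float
fma_like (float a, float b, float c)
{
  return a * b + c;   /* evaluated in x87 extended precision */
}
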
51273 /* Implement PUSH_ROUNDING. On the 386, the pushw instruction decrements
51274 the stack pointer by exactly 2 no matter what the position was; there is no pushb.
51276 But as the CIE data alignment factor on this arch is -4 for 32-bit targets
51277 and -8 for 64-bit targets, we need to make sure all stack pointer adjustments
51278 are in multiples of 4 for 32-bit targets and 8 for 64-bit targets. */
51280 poly_int64
51281 ix86_push_rounding (poly_int64 bytes)
51283 return ROUND_UP (bytes, UNITS_PER_WORD);
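
For instance, pushing a 2-byte (HImode) argument is rounded up to a 4-byte stack adjustment on 32-bit targets and to 8 bytes on 64-bit targets, keeping every push a multiple of the CIE data alignment factor noted above.
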
51286 /* Target-specific selftests. */
51288 #if CHECKING_P
51290 namespace selftest {
51292 /* Verify that hard regs are dumped as expected (in compact mode). */
51294 static void
51295 ix86_test_dumping_hard_regs ()
51297 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
51298 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
51301 /* Test dumping an insn with repeated references to the same SCRATCH,
51302 to verify the rtx_reuse code. */
51304 static void
51305 ix86_test_dumping_memory_blockage ()
51307 set_new_first_and_last_insn (NULL, NULL);
51309 rtx pat = gen_memory_blockage ();
51310 rtx_reuse_manager r;
51311 r.preprocess (pat);
51313 /* Verify that the repeated references to the SCRATCH show the use of
51314 reuse IDs. The first should be prefixed with a reuse ID,
51315 and the second should be dumped as a "reuse_rtx" of that ID.
51316 The expected string assumes Pmode == DImode. */
51317 if (Pmode == DImode)
51318 ASSERT_RTL_DUMP_EQ_WITH_REUSE
51319 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
51320 " (unspec:BLK [\n"
51321 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
51322 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
51325 /* Verify loading an RTL dump; specifically a dump of copying
51326 a param on x86_64 from a hard reg into the frame.
51327 This test is target-specific since the dump contains target-specific
51328 hard reg names. */
51330 static void
51331 ix86_test_loading_dump_fragment_1 ()
51333 rtl_dump_test t (SELFTEST_LOCATION,
51334 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
51336 rtx_insn *insn = get_insn_by_uid (1);
51338 /* The block structure and indentation here is purely for
51339 readability; it mirrors the structure of the rtx. */
51340 tree mem_expr;
51342 rtx pat = PATTERN (insn);
51343 ASSERT_EQ (SET, GET_CODE (pat));
51345 rtx dest = SET_DEST (pat);
51346 ASSERT_EQ (MEM, GET_CODE (dest));
51347 /* Verify the "/c" was parsed. */
51348 ASSERT_TRUE (RTX_FLAG (dest, call));
51349 ASSERT_EQ (SImode, GET_MODE (dest));
51351 rtx addr = XEXP (dest, 0);
51352 ASSERT_EQ (PLUS, GET_CODE (addr));
51353 ASSERT_EQ (DImode, GET_MODE (addr));
51355 rtx lhs = XEXP (addr, 0);
51356 /* Verify that the "frame" REG was consolidated. */
51357 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
51360 rtx rhs = XEXP (addr, 1);
51361 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
51362 ASSERT_EQ (-4, INTVAL (rhs));
51365 /* Verify the "[1 i+0 S4 A32]" was parsed. */
51366 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
51367 /* "i" should have been handled by synthesizing a global int
51368 variable named "i". */
51369 mem_expr = MEM_EXPR (dest);
51370 ASSERT_NE (mem_expr, NULL);
51371 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
51372 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
51373 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
51374 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
51375 /* "+0". */
51376 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
51377 ASSERT_EQ (0, MEM_OFFSET (dest));
51378 /* "S4". */
51379 ASSERT_EQ (4, MEM_SIZE (dest));
51380 /* "A32. */
51381 ASSERT_EQ (32, MEM_ALIGN (dest));
51384 rtx src = SET_SRC (pat);
51385 ASSERT_EQ (REG, GET_CODE (src));
51386 ASSERT_EQ (SImode, GET_MODE (src));
51387 ASSERT_EQ (5, REGNO (src));
51388 tree reg_expr = REG_EXPR (src);
51389 /* "i" here should point to the same var as for the MEM_EXPR. */
51390 ASSERT_EQ (reg_expr, mem_expr);
51395 /* Verify that the RTL loader copes with a call_insn dump.
51396 This test is target-specific since the dump contains a target-specific
51397 hard reg name. */
51399 static void
51400 ix86_test_loading_call_insn ()
51402 /* The test dump includes register "xmm0", where requires TARGET_SSE
51403 to exist. */
51404 if (!TARGET_SSE)
51405 return;
51407 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
51409 rtx_insn *insn = get_insns ();
51410 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
51412 /* "/j". */
51413 ASSERT_TRUE (RTX_FLAG (insn, jump));
51415 rtx pat = PATTERN (insn);
51416 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
51418 /* Verify REG_NOTES. */
51420 /* "(expr_list:REG_CALL_DECL". */
51421 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
51422 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
51423 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
51425 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
51426 rtx_expr_list *note1 = note0->next ();
51427 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
51429 ASSERT_EQ (NULL, note1->next ());
51432 /* Verify CALL_INSN_FUNCTION_USAGE. */
51434 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
51435 rtx_expr_list *usage
51436 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
51437 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
51438 ASSERT_EQ (DFmode, GET_MODE (usage));
51439 ASSERT_EQ (USE, GET_CODE (usage->element ()));
51440 ASSERT_EQ (NULL, usage->next ());
51444 /* Verify that the RTL loader copes a dump from print_rtx_function.
51445 This test is target-specific since the dump contains target-specific
51446 hard reg names. */
51448 static void
51449 ix86_test_loading_full_dump ()
51451 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
51453 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51455 rtx_insn *insn_1 = get_insn_by_uid (1);
51456 ASSERT_EQ (NOTE, GET_CODE (insn_1));
51458 rtx_insn *insn_7 = get_insn_by_uid (7);
51459 ASSERT_EQ (INSN, GET_CODE (insn_7));
51460 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
51462 rtx_insn *insn_15 = get_insn_by_uid (15);
51463 ASSERT_EQ (INSN, GET_CODE (insn_15));
51464 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
51466 /* Verify crtl->return_rtx. */
51467 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
51468 ASSERT_EQ (0, REGNO (crtl->return_rtx));
51469 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
51472 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
51473 In particular, verify that it correctly loads the 2nd operand.
51474 This test is target-specific since these are machine-specific
51475 operands (and enums). */
51477 static void
51478 ix86_test_loading_unspec ()
51480 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
51482 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51484 ASSERT_TRUE (cfun);
51486 /* Test of an UNSPEC. */
51487 rtx_insn *insn = get_insns ();
51488 ASSERT_EQ (INSN, GET_CODE (insn));
51489 rtx set = single_set (insn);
51490 ASSERT_NE (NULL, set);
51491 rtx dst = SET_DEST (set);
51492 ASSERT_EQ (MEM, GET_CODE (dst));
51493 rtx src = SET_SRC (set);
51494 ASSERT_EQ (UNSPEC, GET_CODE (src));
51495 ASSERT_EQ (BLKmode, GET_MODE (src));
51496 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
51498 rtx v0 = XVECEXP (src, 0, 0);
51500 /* Verify that the two uses of the first SCRATCH have pointer
51501 equality. */
51502 rtx scratch_a = XEXP (dst, 0);
51503 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
51505 rtx scratch_b = XEXP (v0, 0);
51506 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
51508 ASSERT_EQ (scratch_a, scratch_b);
51510 /* Verify that the two mems are thus treated as equal. */
51511 ASSERT_TRUE (rtx_equal_p (dst, v0));
51513 /* Verify that the insn is recognized. */
51514 ASSERT_NE (-1, recog_memoized (insn));
51516 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
51517 insn = NEXT_INSN (insn);
51518 ASSERT_EQ (INSN, GET_CODE (insn));
51520 set = single_set (insn);
51521 ASSERT_NE (NULL, set);
51523 src = SET_SRC (set);
51524 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
51525 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
51528 /* Run all target-specific selftests. */
51530 static void
51531 ix86_run_selftests (void)
51533 ix86_test_dumping_hard_regs ();
51534 ix86_test_dumping_memory_blockage ();
51536 /* Various tests of loading RTL dumps, here because they contain
51537 ix86-isms (e.g. names of hard regs). */
51538 ix86_test_loading_dump_fragment_1 ();
51539 ix86_test_loading_call_insn ();
51540 ix86_test_loading_full_dump ();
51541 ix86_test_loading_unspec ();
51544 } // namespace selftest
51546 #endif /* CHECKING_P */
51548 /* Initialize the GCC target structure. */
51549 #undef TARGET_RETURN_IN_MEMORY
51550 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
51552 #undef TARGET_LEGITIMIZE_ADDRESS
51553 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
51555 #undef TARGET_ATTRIBUTE_TABLE
51556 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
51557 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
51558 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
51559 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51560 # undef TARGET_MERGE_DECL_ATTRIBUTES
51561 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
51562 #endif
51564 #undef TARGET_COMP_TYPE_ATTRIBUTES
51565 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
51567 #undef TARGET_INIT_BUILTINS
51568 #define TARGET_INIT_BUILTINS ix86_init_builtins
51569 #undef TARGET_BUILTIN_DECL
51570 #define TARGET_BUILTIN_DECL ix86_builtin_decl
51571 #undef TARGET_EXPAND_BUILTIN
51572 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
51574 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
51575 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
51576 ix86_builtin_vectorized_function
51578 #undef TARGET_VECTORIZE_BUILTIN_GATHER
51579 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
51581 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
51582 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
51584 #undef TARGET_BUILTIN_RECIPROCAL
51585 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
51587 #undef TARGET_ASM_FUNCTION_EPILOGUE
51588 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
51590 #undef TARGET_ENCODE_SECTION_INFO
51591 #ifndef SUBTARGET_ENCODE_SECTION_INFO
51592 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
51593 #else
51594 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
51595 #endif
51597 #undef TARGET_ASM_OPEN_PAREN
51598 #define TARGET_ASM_OPEN_PAREN ""
51599 #undef TARGET_ASM_CLOSE_PAREN
51600 #define TARGET_ASM_CLOSE_PAREN ""
51602 #undef TARGET_ASM_BYTE_OP
51603 #define TARGET_ASM_BYTE_OP ASM_BYTE
51605 #undef TARGET_ASM_ALIGNED_HI_OP
51606 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
51607 #undef TARGET_ASM_ALIGNED_SI_OP
51608 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
51609 #ifdef ASM_QUAD
51610 #undef TARGET_ASM_ALIGNED_DI_OP
51611 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
51612 #endif
51614 #undef TARGET_PROFILE_BEFORE_PROLOGUE
51615 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
51617 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
51618 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
51620 #undef TARGET_ASM_UNALIGNED_HI_OP
51621 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
51622 #undef TARGET_ASM_UNALIGNED_SI_OP
51623 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
51624 #undef TARGET_ASM_UNALIGNED_DI_OP
51625 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
51627 #undef TARGET_PRINT_OPERAND
51628 #define TARGET_PRINT_OPERAND ix86_print_operand
51629 #undef TARGET_PRINT_OPERAND_ADDRESS
51630 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
51631 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
51632 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
51633 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
51634 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
#undef TARGET_SCHED_INIT_GLOBAL
#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  ia32_multipass_dfa_lookahead
#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall

#undef TARGET_MEMMODEL_CHECK
#define TARGET_MEMMODEL_CHECK ix86_memmodel_check

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS true
#endif
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true

#undef TARGET_DELEGITIMIZE_ADDRESS
#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address

#undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
#define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p

#undef TARGET_MS_BITFIELD_LAYOUT_P
#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p

#if TARGET_MACHO
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P darwin_binds_local_p
#else
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P ix86_binds_local_p
#endif
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
#endif

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START x86_file_start

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE ix86_option_override

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM FLAGS_REG
#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
#undef TARGET_CC_MODES_COMPATIBLE
#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg

#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN ix86_fold_builtin

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
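
/* Function multiversioning hooks.  */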
#undef TARGET_COMPARE_VERSION_PRIORITY
#define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority

#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
  ix86_generate_version_dispatcher_body

#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
  ix86_get_function_versions_dispatcher
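
/* va_list handling hooks.  */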
#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION ix86_excess_precision
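
/* Argument passing and calling-convention hooks.  */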
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
#define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_INIT_PIC_REG
#define TARGET_INIT_PIC_REG ix86_init_pic_reg
#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_WARN_FUNC_RETURN
#define TARGET_WARN_FUNC_RETURN ix86_warn_func_return

#undef TARGET_LEGITIMATE_COMBINED_INSN
#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type
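
/* Stack protector hooks.  */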
#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
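
/* Register class and reload hooks.  */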
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload
#undef TARGET_SECONDARY_MEMORY_NEEDED
#define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
#undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
#define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
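
/* Vectorizer cost model and SIMD hooks.  */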
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST ix86_vectorize_vec_perm_const
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_SPLIT_REDUCTION
#define TARGET_VECTORIZE_SPLIT_REDUCTION \
  ix86_split_reduction
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
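
/* Function-specific option handling hooks.  */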
#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_POST_STREAM_IN
#define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_REGISTER_USAGE_LEVELING_P
#define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_COMPUTE_FRAME_LAYOUT
#define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#undef TARGET_CANONICALIZE_COMPARISON
#define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison

#undef TARGET_LOOP_UNROLL_ADJUST
#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust

/* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657.  */
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class
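
/* SIMD clone support hooks.  */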
#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  ix86_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST \
  ix86_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE \
  ix86_simd_clone_usable

#undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
#define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
  ix86_float_exceptions_rounding_supported_p
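
/* Mode-switching hooks.  */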
#undef TARGET_MODE_EMIT
#define TARGET_MODE_EMIT ix86_emit_mode_set

#undef TARGET_MODE_NEEDED
#define TARGET_MODE_NEEDED ix86_mode_needed

#undef TARGET_MODE_AFTER
#define TARGET_MODE_AFTER ix86_mode_after

#undef TARGET_MODE_ENTRY
#define TARGET_MODE_ENTRY ix86_mode_entry

#undef TARGET_MODE_EXIT
#define TARGET_MODE_EXIT ix86_mode_exit

#undef TARGET_MODE_PRIORITY
#define TARGET_MODE_PRIORITY ix86_mode_priority

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
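
/* Pointer Bounds Checker (MPX) hooks.  */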
#undef TARGET_LOAD_BOUNDS_FOR_ARG
#define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds

#undef TARGET_STORE_BOUNDS_FOR_ARG
#define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds

#undef TARGET_LOAD_RETURNED_BOUNDS
#define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds

#undef TARGET_STORE_RETURNED_BOUNDS
#define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds

#undef TARGET_CHKP_BOUND_MODE
#define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode

#undef TARGET_BUILTIN_CHKP_FUNCTION
#define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function

#undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
#define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds

#undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
#define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant

#undef TARGET_CHKP_INITIALIZE_BOUNDS
#define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds

#undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
#define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds

#undef TARGET_OFFLOAD_OPTIONS
#define TARGET_OFFLOAD_OPTIONS \
  ix86_offload_options

#undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
#define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p

#undef TARGET_HARD_REGNO_SCRATCH_OK
#define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok

#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1

#undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
#define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p

#undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
#define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS ix86_init_libfuncs

#undef TARGET_EXPAND_DIVMOD_LIBFUNC
#define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc

#undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
#define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost

#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
#define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  ix86_hard_regno_call_part_clobbered

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class

#undef TARGET_STATIC_RTX_ALIGNMENT
#define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment

#undef TARGET_EMPTY_RECORD_P
#define TARGET_EMPTY_RECORD_P ix86_is_empty_record

#undef TARGET_WARN_PARAMETER_PASSING_ABI
#define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
#endif /* #if CHECKING_P */
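
/* Build the final target hook vector.  TARGET_INITIALIZER is provided by
   target-def.h (included last above) and expands to an initializer that
   picks up the TARGET_* macro overrides defined in this file, falling back
   to the documented defaults for any hook not overridden here.  */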
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-i386.h"