i386: Disallow naked attribute with interrupt attribute
[official-gcc.git] / gcc / config / i386 / i386.c
blob 587dbe61e8b34184bb90113edaa99b52e615b246
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
89 /* This file should be included last. */
90 #include "target-def.h"
92 static rtx legitimize_dllimport_symbol (rtx, bool);
93 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
94 static rtx legitimize_pe_coff_symbol (rtx, bool);
95 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
96 static bool ix86_save_reg (unsigned int, bool, bool);
97 static bool ix86_function_naked (const_tree);
99 #ifndef CHECK_STACK_LIMIT
100 #define CHECK_STACK_LIMIT (-1)
101 #endif
103 /* Return index of given mode in mult and division cost tables. */
104 #define MODE_INDEX(mode) \
105 ((mode) == QImode ? 0 \
106 : (mode) == HImode ? 1 \
107 : (mode) == SImode ? 2 \
108 : (mode) == DImode ? 3 \
109 : 4)
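/* A minimal usage sketch for MODE_INDEX (an illustration only, with the
   mult_init field name assumed from the processor_costs definition in
   i386.h): the five-entry cost arrays below are indexed with it.  */
static inline int
example_simode_multiply_cost (const struct processor_costs *cost)
{
  /* MODE_INDEX (SImode) evaluates to 2, the "SI" slot; any mode other
     than QImode/HImode/SImode/DImode falls into the trailing slot 4.  */
  return cost->mult_init[MODE_INDEX (SImode)];
}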
111 /* Processor costs (relative to an add) */
112 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
113 #define COSTS_N_BYTES(N) ((N) * 2)
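/* A quick worked check of the scale assumed in the comment above: with
   COSTS_N_INSNS (N) == (N) * 4, a two-byte add gives
   COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so byte costs used when
   tuning for size stay on the same relative scale as instruction costs
   used when tuning for speed.  */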
115 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
117 static stringop_algs ix86_size_memcpy[2] = {
118 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
119 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
120 static stringop_algs ix86_size_memset[2] = {
121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
122 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
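/* A minimal sketch of how these stringop tables are read (field meanings
   assumed from the stringop_algs definition in i386.h): the first member
   is the algorithm used when the block size is unknown at compile time,
   followed by {max_size, algorithm, noalign} strategies tried in order;
   element [0] of each two-element array is used for 32-bit code and
   element [1] for 64-bit code, with max_size == -1 meaning "no upper
   bound".  For example:  */
static stringop_algs example_memcpy_algs[2] = {
  /* 32-bit: inline loop up to 64 bytes, then rep movsl.  */
  {libcall, {{64, loop, false}, {-1, rep_prefix_4_byte, false}}},
  /* 64-bit: inline loop up to 64 bytes, then rep movsq.  */
  {libcall, {{64, loop, false}, {-1, rep_prefix_8_byte, false}}}};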
124 const
125 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
126 COSTS_N_BYTES (2), /* cost of an add instruction */
127 COSTS_N_BYTES (3), /* cost of a lea instruction */
128 COSTS_N_BYTES (2), /* variable shift costs */
129 COSTS_N_BYTES (3), /* constant shift costs */
130 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
131 COSTS_N_BYTES (3), /* HI */
132 COSTS_N_BYTES (3), /* SI */
133 COSTS_N_BYTES (3), /* DI */
134 COSTS_N_BYTES (5)}, /* other */
135 0, /* cost of multiply per each bit set */
136 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
137 COSTS_N_BYTES (3), /* HI */
138 COSTS_N_BYTES (3), /* SI */
139 COSTS_N_BYTES (3), /* DI */
140 COSTS_N_BYTES (5)}, /* other */
141 COSTS_N_BYTES (3), /* cost of movsx */
142 COSTS_N_BYTES (3), /* cost of movzx */
143 0, /* "large" insn */
144 2, /* MOVE_RATIO */
145 2, /* cost for loading QImode using movzbl */
146 {2, 2, 2}, /* cost of loading integer registers
147 in QImode, HImode and SImode.
148 Relative to reg-reg move (2). */
149 {2, 2, 2}, /* cost of storing integer registers */
150 2, /* cost of reg,reg fld/fst */
151 {2, 2, 2}, /* cost of loading fp registers
152 in SFmode, DFmode and XFmode */
153 {2, 2, 2}, /* cost of storing fp registers
154 in SFmode, DFmode and XFmode */
155 3, /* cost of moving MMX register */
156 {3, 3}, /* cost of loading MMX registers
157 in SImode and DImode */
158 {3, 3}, /* cost of storing MMX registers
159 in SImode and DImode */
160 3, /* cost of moving SSE register */
161 {3, 3, 3}, /* cost of loading SSE registers
162 in SImode, DImode and TImode */
163 {3, 3, 3}, /* cost of storing SSE registers
164 in SImode, DImode and TImode */
165 3, /* MMX or SSE register to integer */
166 0, /* size of l1 cache */
167 0, /* size of l2 cache */
168 0, /* size of prefetch block */
169 0, /* number of parallel prefetches */
170 2, /* Branch cost */
171 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
172 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
173 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
174 COSTS_N_BYTES (2), /* cost of FABS instruction. */
175 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
176 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
177 ix86_size_memcpy,
178 ix86_size_memset,
179 1, /* scalar_stmt_cost. */
180 1, /* scalar load_cost. */
181 1, /* scalar_store_cost. */
182 1, /* vec_stmt_cost. */
183 1, /* vec_to_scalar_cost. */
184 1, /* scalar_to_vec_cost. */
185 1, /* vec_align_load_cost. */
186 1, /* vec_unalign_load_cost. */
187 1, /* vec_store_cost. */
188 1, /* cond_taken_branch_cost. */
189 1, /* cond_not_taken_branch_cost. */
190 };
192 /* Processor costs (relative to an add) */
193 static stringop_algs i386_memcpy[2] = {
194 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
195 DUMMY_STRINGOP_ALGS};
196 static stringop_algs i386_memset[2] = {
197 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
198 DUMMY_STRINGOP_ALGS};
200 static const
201 struct processor_costs i386_cost = { /* 386 specific costs */
202 COSTS_N_INSNS (1), /* cost of an add instruction */
203 COSTS_N_INSNS (1), /* cost of a lea instruction */
204 COSTS_N_INSNS (3), /* variable shift costs */
205 COSTS_N_INSNS (2), /* constant shift costs */
206 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
207 COSTS_N_INSNS (6), /* HI */
208 COSTS_N_INSNS (6), /* SI */
209 COSTS_N_INSNS (6), /* DI */
210 COSTS_N_INSNS (6)}, /* other */
211 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
212 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
213 COSTS_N_INSNS (23), /* HI */
214 COSTS_N_INSNS (23), /* SI */
215 COSTS_N_INSNS (23), /* DI */
216 COSTS_N_INSNS (23)}, /* other */
217 COSTS_N_INSNS (3), /* cost of movsx */
218 COSTS_N_INSNS (2), /* cost of movzx */
219 15, /* "large" insn */
220 3, /* MOVE_RATIO */
221 4, /* cost for loading QImode using movzbl */
222 {2, 4, 2}, /* cost of loading integer registers
223 in QImode, HImode and SImode.
224 Relative to reg-reg move (2). */
225 {2, 4, 2}, /* cost of storing integer registers */
226 2, /* cost of reg,reg fld/fst */
227 {8, 8, 8}, /* cost of loading fp registers
228 in SFmode, DFmode and XFmode */
229 {8, 8, 8}, /* cost of storing fp registers
230 in SFmode, DFmode and XFmode */
231 2, /* cost of moving MMX register */
232 {4, 8}, /* cost of loading MMX registers
233 in SImode and DImode */
234 {4, 8}, /* cost of storing MMX registers
235 in SImode and DImode */
236 2, /* cost of moving SSE register */
237 {4, 8, 16}, /* cost of loading SSE registers
238 in SImode, DImode and TImode */
239 {4, 8, 16}, /* cost of storing SSE registers
240 in SImode, DImode and TImode */
241 3, /* MMX or SSE register to integer */
242 0, /* size of l1 cache */
243 0, /* size of l2 cache */
244 0, /* size of prefetch block */
245 0, /* number of parallel prefetches */
246 1, /* Branch cost */
247 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
248 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
249 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
250 COSTS_N_INSNS (22), /* cost of FABS instruction. */
251 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
252 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
253 i386_memcpy,
254 i386_memset,
255 1, /* scalar_stmt_cost. */
256 1, /* scalar load_cost. */
257 1, /* scalar_store_cost. */
258 1, /* vec_stmt_cost. */
259 1, /* vec_to_scalar_cost. */
260 1, /* scalar_to_vec_cost. */
261 1, /* vec_align_load_cost. */
262 2, /* vec_unalign_load_cost. */
263 1, /* vec_store_cost. */
264 3, /* cond_taken_branch_cost. */
265 1, /* cond_not_taken_branch_cost. */
266 };
268 static stringop_algs i486_memcpy[2] = {
269 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
270 DUMMY_STRINGOP_ALGS};
271 static stringop_algs i486_memset[2] = {
272 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
273 DUMMY_STRINGOP_ALGS};
275 static const
276 struct processor_costs i486_cost = { /* 486 specific costs */
277 COSTS_N_INSNS (1), /* cost of an add instruction */
278 COSTS_N_INSNS (1), /* cost of a lea instruction */
279 COSTS_N_INSNS (3), /* variable shift costs */
280 COSTS_N_INSNS (2), /* constant shift costs */
281 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
282 COSTS_N_INSNS (12), /* HI */
283 COSTS_N_INSNS (12), /* SI */
284 COSTS_N_INSNS (12), /* DI */
285 COSTS_N_INSNS (12)}, /* other */
286 1, /* cost of multiply per each bit set */
287 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
288 COSTS_N_INSNS (40), /* HI */
289 COSTS_N_INSNS (40), /* SI */
290 COSTS_N_INSNS (40), /* DI */
291 COSTS_N_INSNS (40)}, /* other */
292 COSTS_N_INSNS (3), /* cost of movsx */
293 COSTS_N_INSNS (2), /* cost of movzx */
294 15, /* "large" insn */
295 3, /* MOVE_RATIO */
296 4, /* cost for loading QImode using movzbl */
297 {2, 4, 2}, /* cost of loading integer registers
298 in QImode, HImode and SImode.
299 Relative to reg-reg move (2). */
300 {2, 4, 2}, /* cost of storing integer registers */
301 2, /* cost of reg,reg fld/fst */
302 {8, 8, 8}, /* cost of loading fp registers
303 in SFmode, DFmode and XFmode */
304 {8, 8, 8}, /* cost of storing fp registers
305 in SFmode, DFmode and XFmode */
306 2, /* cost of moving MMX register */
307 {4, 8}, /* cost of loading MMX registers
308 in SImode and DImode */
309 {4, 8}, /* cost of storing MMX registers
310 in SImode and DImode */
311 2, /* cost of moving SSE register */
312 {4, 8, 16}, /* cost of loading SSE registers
313 in SImode, DImode and TImode */
314 {4, 8, 16}, /* cost of storing SSE registers
315 in SImode, DImode and TImode */
316 3, /* MMX or SSE register to integer */
317 4, /* size of l1 cache. 486 has 8kB cache
318 shared for code and data, so 4kB is
319 not really precise. */
320 4, /* size of l2 cache */
321 0, /* size of prefetch block */
322 0, /* number of parallel prefetches */
323 1, /* Branch cost */
324 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
325 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
326 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
327 COSTS_N_INSNS (3), /* cost of FABS instruction. */
328 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
329 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
330 i486_memcpy,
331 i486_memset,
332 1, /* scalar_stmt_cost. */
333 1, /* scalar load_cost. */
334 1, /* scalar_store_cost. */
335 1, /* vec_stmt_cost. */
336 1, /* vec_to_scalar_cost. */
337 1, /* scalar_to_vec_cost. */
338 1, /* vec_align_load_cost. */
339 2, /* vec_unalign_load_cost. */
340 1, /* vec_store_cost. */
341 3, /* cond_taken_branch_cost. */
342 1, /* cond_not_taken_branch_cost. */
343 };
345 static stringop_algs pentium_memcpy[2] = {
346 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
347 DUMMY_STRINGOP_ALGS};
348 static stringop_algs pentium_memset[2] = {
349 {libcall, {{-1, rep_prefix_4_byte, false}}},
350 DUMMY_STRINGOP_ALGS};
352 static const
353 struct processor_costs pentium_cost = {
354 COSTS_N_INSNS (1), /* cost of an add instruction */
355 COSTS_N_INSNS (1), /* cost of a lea instruction */
356 COSTS_N_INSNS (4), /* variable shift costs */
357 COSTS_N_INSNS (1), /* constant shift costs */
358 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
359 COSTS_N_INSNS (11), /* HI */
360 COSTS_N_INSNS (11), /* SI */
361 COSTS_N_INSNS (11), /* DI */
362 COSTS_N_INSNS (11)}, /* other */
363 0, /* cost of multiply per each bit set */
364 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
365 COSTS_N_INSNS (25), /* HI */
366 COSTS_N_INSNS (25), /* SI */
367 COSTS_N_INSNS (25), /* DI */
368 COSTS_N_INSNS (25)}, /* other */
369 COSTS_N_INSNS (3), /* cost of movsx */
370 COSTS_N_INSNS (2), /* cost of movzx */
371 8, /* "large" insn */
372 6, /* MOVE_RATIO */
373 6, /* cost for loading QImode using movzbl */
374 {2, 4, 2}, /* cost of loading integer registers
375 in QImode, HImode and SImode.
376 Relative to reg-reg move (2). */
377 {2, 4, 2}, /* cost of storing integer registers */
378 2, /* cost of reg,reg fld/fst */
379 {2, 2, 6}, /* cost of loading fp registers
380 in SFmode, DFmode and XFmode */
381 {4, 4, 6}, /* cost of storing fp registers
382 in SFmode, DFmode and XFmode */
383 8, /* cost of moving MMX register */
384 {8, 8}, /* cost of loading MMX registers
385 in SImode and DImode */
386 {8, 8}, /* cost of storing MMX registers
387 in SImode and DImode */
388 2, /* cost of moving SSE register */
389 {4, 8, 16}, /* cost of loading SSE registers
390 in SImode, DImode and TImode */
391 {4, 8, 16}, /* cost of storing SSE registers
392 in SImode, DImode and TImode */
393 3, /* MMX or SSE register to integer */
394 8, /* size of l1 cache. */
395 8, /* size of l2 cache */
396 0, /* size of prefetch block */
397 0, /* number of parallel prefetches */
398 2, /* Branch cost */
399 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
400 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
401 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
402 COSTS_N_INSNS (1), /* cost of FABS instruction. */
403 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
404 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
405 pentium_memcpy,
406 pentium_memset,
407 1, /* scalar_stmt_cost. */
408 1, /* scalar load_cost. */
409 1, /* scalar_store_cost. */
410 1, /* vec_stmt_cost. */
411 1, /* vec_to_scalar_cost. */
412 1, /* scalar_to_vec_cost. */
413 1, /* vec_align_load_cost. */
414 2, /* vec_unalign_load_cost. */
415 1, /* vec_store_cost. */
416 3, /* cond_taken_branch_cost. */
417 1, /* cond_not_taken_branch_cost. */
418 };
420 static const
421 struct processor_costs lakemont_cost = {
422 COSTS_N_INSNS (1), /* cost of an add instruction */
423 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
424 COSTS_N_INSNS (1), /* variable shift costs */
425 COSTS_N_INSNS (1), /* constant shift costs */
426 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
427 COSTS_N_INSNS (11), /* HI */
428 COSTS_N_INSNS (11), /* SI */
429 COSTS_N_INSNS (11), /* DI */
430 COSTS_N_INSNS (11)}, /* other */
431 0, /* cost of multiply per each bit set */
432 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
433 COSTS_N_INSNS (25), /* HI */
434 COSTS_N_INSNS (25), /* SI */
435 COSTS_N_INSNS (25), /* DI */
436 COSTS_N_INSNS (25)}, /* other */
437 COSTS_N_INSNS (3), /* cost of movsx */
438 COSTS_N_INSNS (2), /* cost of movzx */
439 8, /* "large" insn */
440 17, /* MOVE_RATIO */
441 6, /* cost for loading QImode using movzbl */
442 {2, 4, 2}, /* cost of loading integer registers
443 in QImode, HImode and SImode.
444 Relative to reg-reg move (2). */
445 {2, 4, 2}, /* cost of storing integer registers */
446 2, /* cost of reg,reg fld/fst */
447 {2, 2, 6}, /* cost of loading fp registers
448 in SFmode, DFmode and XFmode */
449 {4, 4, 6}, /* cost of storing fp registers
450 in SFmode, DFmode and XFmode */
451 8, /* cost of moving MMX register */
452 {8, 8}, /* cost of loading MMX registers
453 in SImode and DImode */
454 {8, 8}, /* cost of storing MMX registers
455 in SImode and DImode */
456 2, /* cost of moving SSE register */
457 {4, 8, 16}, /* cost of loading SSE registers
458 in SImode, DImode and TImode */
459 {4, 8, 16}, /* cost of storing SSE registers
460 in SImode, DImode and TImode */
461 3, /* MMX or SSE register to integer */
462 8, /* size of l1 cache. */
463 8, /* size of l2 cache */
464 0, /* size of prefetch block */
465 0, /* number of parallel prefetches */
466 2, /* Branch cost */
467 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
468 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
469 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
470 COSTS_N_INSNS (1), /* cost of FABS instruction. */
471 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
472 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
473 pentium_memcpy,
474 pentium_memset,
475 1, /* scalar_stmt_cost. */
476 1, /* scalar load_cost. */
477 1, /* scalar_store_cost. */
478 1, /* vec_stmt_cost. */
479 1, /* vec_to_scalar_cost. */
480 1, /* scalar_to_vec_cost. */
481 1, /* vec_align_load_cost. */
482 2, /* vec_unalign_load_cost. */
483 1, /* vec_store_cost. */
484 3, /* cond_taken_branch_cost. */
485 1, /* cond_not_taken_branch_cost. */
486 };
488 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
489 (we ensure the alignment). For small blocks an inline loop is still a
490 noticeable win; for bigger blocks either rep movsl or rep movsb is the
491 way to go. Rep movsb apparently has a more expensive startup time in the
492 CPU, but after 4K the difference is down in the noise. */
493 static stringop_algs pentiumpro_memcpy[2] = {
494 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
495 {8192, rep_prefix_4_byte, false},
496 {-1, rep_prefix_1_byte, false}}},
497 DUMMY_STRINGOP_ALGS};
498 static stringop_algs pentiumpro_memset[2] = {
499 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
500 {8192, rep_prefix_4_byte, false},
501 {-1, libcall, false}}},
502 DUMMY_STRINGOP_ALGS};
503 static const
504 struct processor_costs pentiumpro_cost = {
505 COSTS_N_INSNS (1), /* cost of an add instruction */
506 COSTS_N_INSNS (1), /* cost of a lea instruction */
507 COSTS_N_INSNS (1), /* variable shift costs */
508 COSTS_N_INSNS (1), /* constant shift costs */
509 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
510 COSTS_N_INSNS (4), /* HI */
511 COSTS_N_INSNS (4), /* SI */
512 COSTS_N_INSNS (4), /* DI */
513 COSTS_N_INSNS (4)}, /* other */
514 0, /* cost of multiply per each bit set */
515 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
516 COSTS_N_INSNS (17), /* HI */
517 COSTS_N_INSNS (17), /* SI */
518 COSTS_N_INSNS (17), /* DI */
519 COSTS_N_INSNS (17)}, /* other */
520 COSTS_N_INSNS (1), /* cost of movsx */
521 COSTS_N_INSNS (1), /* cost of movzx */
522 8, /* "large" insn */
523 6, /* MOVE_RATIO */
524 2, /* cost for loading QImode using movzbl */
525 {4, 4, 4}, /* cost of loading integer registers
526 in QImode, HImode and SImode.
527 Relative to reg-reg move (2). */
528 {2, 2, 2}, /* cost of storing integer registers */
529 2, /* cost of reg,reg fld/fst */
530 {2, 2, 6}, /* cost of loading fp registers
531 in SFmode, DFmode and XFmode */
532 {4, 4, 6}, /* cost of storing fp registers
533 in SFmode, DFmode and XFmode */
534 2, /* cost of moving MMX register */
535 {2, 2}, /* cost of loading MMX registers
536 in SImode and DImode */
537 {2, 2}, /* cost of storing MMX registers
538 in SImode and DImode */
539 2, /* cost of moving SSE register */
540 {2, 2, 8}, /* cost of loading SSE registers
541 in SImode, DImode and TImode */
542 {2, 2, 8}, /* cost of storing SSE registers
543 in SImode, DImode and TImode */
544 3, /* MMX or SSE register to integer */
545 8, /* size of l1 cache. */
546 256, /* size of l2 cache */
547 32, /* size of prefetch block */
548 6, /* number of parallel prefetches */
549 2, /* Branch cost */
550 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
551 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
552 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
553 COSTS_N_INSNS (2), /* cost of FABS instruction. */
554 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
555 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
556 pentiumpro_memcpy,
557 pentiumpro_memset,
558 1, /* scalar_stmt_cost. */
559 1, /* scalar load_cost. */
560 1, /* scalar_store_cost. */
561 1, /* vec_stmt_cost. */
562 1, /* vec_to_scalar_cost. */
563 1, /* scalar_to_vec_cost. */
564 1, /* vec_align_load_cost. */
565 2, /* vec_unalign_load_cost. */
566 1, /* vec_store_cost. */
567 3, /* cond_taken_branch_cost. */
568 1, /* cond_not_taken_branch_cost. */
569 };
571 static stringop_algs geode_memcpy[2] = {
572 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
573 DUMMY_STRINGOP_ALGS};
574 static stringop_algs geode_memset[2] = {
575 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS};
577 static const
578 struct processor_costs geode_cost = {
579 COSTS_N_INSNS (1), /* cost of an add instruction */
580 COSTS_N_INSNS (1), /* cost of a lea instruction */
581 COSTS_N_INSNS (2), /* variable shift costs */
582 COSTS_N_INSNS (1), /* constant shift costs */
583 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
584 COSTS_N_INSNS (4), /* HI */
585 COSTS_N_INSNS (7), /* SI */
586 COSTS_N_INSNS (7), /* DI */
587 COSTS_N_INSNS (7)}, /* other */
588 0, /* cost of multiply per each bit set */
589 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
590 COSTS_N_INSNS (23), /* HI */
591 COSTS_N_INSNS (39), /* SI */
592 COSTS_N_INSNS (39), /* DI */
593 COSTS_N_INSNS (39)}, /* other */
594 COSTS_N_INSNS (1), /* cost of movsx */
595 COSTS_N_INSNS (1), /* cost of movzx */
596 8, /* "large" insn */
597 4, /* MOVE_RATIO */
598 1, /* cost for loading QImode using movzbl */
599 {1, 1, 1}, /* cost of loading integer registers
600 in QImode, HImode and SImode.
601 Relative to reg-reg move (2). */
602 {1, 1, 1}, /* cost of storing integer registers */
603 1, /* cost of reg,reg fld/fst */
604 {1, 1, 1}, /* cost of loading fp registers
605 in SFmode, DFmode and XFmode */
606 {4, 6, 6}, /* cost of storing fp registers
607 in SFmode, DFmode and XFmode */
609 2, /* cost of moving MMX register */
610 {2, 2}, /* cost of loading MMX registers
611 in SImode and DImode */
612 {2, 2}, /* cost of storing MMX registers
613 in SImode and DImode */
614 2, /* cost of moving SSE register */
615 {2, 2, 8}, /* cost of loading SSE registers
616 in SImode, DImode and TImode */
617 {2, 2, 8}, /* cost of storing SSE registers
618 in SImode, DImode and TImode */
619 3, /* MMX or SSE register to integer */
620 64, /* size of l1 cache. */
621 128, /* size of l2 cache. */
622 32, /* size of prefetch block */
623 1, /* number of parallel prefetches */
624 1, /* Branch cost */
625 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
626 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
627 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
628 COSTS_N_INSNS (1), /* cost of FABS instruction. */
629 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
630 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
631 geode_memcpy,
632 geode_memset,
633 1, /* scalar_stmt_cost. */
634 1, /* scalar load_cost. */
635 1, /* scalar_store_cost. */
636 1, /* vec_stmt_cost. */
637 1, /* vec_to_scalar_cost. */
638 1, /* scalar_to_vec_cost. */
639 1, /* vec_align_load_cost. */
640 2, /* vec_unalign_load_cost. */
641 1, /* vec_store_cost. */
642 3, /* cond_taken_branch_cost. */
643 1, /* cond_not_taken_branch_cost. */
644 };
646 static stringop_algs k6_memcpy[2] = {
647 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
648 DUMMY_STRINGOP_ALGS};
649 static stringop_algs k6_memset[2] = {
650 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS};
652 static const
653 struct processor_costs k6_cost = {
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (2), /* cost of a lea instruction */
656 COSTS_N_INSNS (1), /* variable shift costs */
657 COSTS_N_INSNS (1), /* constant shift costs */
658 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (3), /* HI */
660 COSTS_N_INSNS (3), /* SI */
661 COSTS_N_INSNS (3), /* DI */
662 COSTS_N_INSNS (3)}, /* other */
663 0, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (18), /* HI */
666 COSTS_N_INSNS (18), /* SI */
667 COSTS_N_INSNS (18), /* DI */
668 COSTS_N_INSNS (18)}, /* other */
669 COSTS_N_INSNS (2), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 8, /* "large" insn */
672 4, /* MOVE_RATIO */
673 3, /* cost for loading QImode using movzbl */
674 {4, 5, 4}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 3, 2}, /* cost of storing integer registers */
678 4, /* cost of reg,reg fld/fst */
679 {6, 6, 6}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {4, 4, 4}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {2, 2}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {2, 2}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {2, 2, 8}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {2, 2, 8}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 6, /* MMX or SSE register to integer */
694 32, /* size of l1 cache. */
695 32, /* size of l2 cache. Some models
696 have integrated l2 cache, but
697 optimizing for k6 is not important
698 enough to worry about that. */
699 32, /* size of prefetch block */
700 1, /* number of parallel prefetches */
701 1, /* Branch cost */
702 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
703 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
704 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
705 COSTS_N_INSNS (2), /* cost of FABS instruction. */
706 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
707 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
708 k6_memcpy,
709 k6_memset,
710 1, /* scalar_stmt_cost. */
711 1, /* scalar load_cost. */
712 1, /* scalar_store_cost. */
713 1, /* vec_stmt_cost. */
714 1, /* vec_to_scalar_cost. */
715 1, /* scalar_to_vec_cost. */
716 1, /* vec_align_load_cost. */
717 2, /* vec_unalign_load_cost. */
718 1, /* vec_store_cost. */
719 3, /* cond_taken_branch_cost. */
720 1, /* cond_not_taken_branch_cost. */
721 };
723 /* For some reason, Athlon deals better with the REP prefix (relative to
724 loops) than K8 does. Alignment becomes important after 8 bytes for memcpy
725 and 128 bytes for memset. */
726 static stringop_algs athlon_memcpy[2] = {
727 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
728 DUMMY_STRINGOP_ALGS};
729 static stringop_algs athlon_memset[2] = {
730 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
731 DUMMY_STRINGOP_ALGS};
732 static const
733 struct processor_costs athlon_cost = {
734 COSTS_N_INSNS (1), /* cost of an add instruction */
735 COSTS_N_INSNS (2), /* cost of a lea instruction */
736 COSTS_N_INSNS (1), /* variable shift costs */
737 COSTS_N_INSNS (1), /* constant shift costs */
738 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
739 COSTS_N_INSNS (5), /* HI */
740 COSTS_N_INSNS (5), /* SI */
741 COSTS_N_INSNS (5), /* DI */
742 COSTS_N_INSNS (5)}, /* other */
743 0, /* cost of multiply per each bit set */
744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
745 COSTS_N_INSNS (26), /* HI */
746 COSTS_N_INSNS (42), /* SI */
747 COSTS_N_INSNS (74), /* DI */
748 COSTS_N_INSNS (74)}, /* other */
749 COSTS_N_INSNS (1), /* cost of movsx */
750 COSTS_N_INSNS (1), /* cost of movzx */
751 8, /* "large" insn */
752 9, /* MOVE_RATIO */
753 4, /* cost for loading QImode using movzbl */
754 {3, 4, 3}, /* cost of loading integer registers
755 in QImode, HImode and SImode.
756 Relative to reg-reg move (2). */
757 {3, 4, 3}, /* cost of storing integer registers */
758 4, /* cost of reg,reg fld/fst */
759 {4, 4, 12}, /* cost of loading fp registers
760 in SFmode, DFmode and XFmode */
761 {6, 6, 8}, /* cost of storing fp registers
762 in SFmode, DFmode and XFmode */
763 2, /* cost of moving MMX register */
764 {4, 4}, /* cost of loading MMX registers
765 in SImode and DImode */
766 {4, 4}, /* cost of storing MMX registers
767 in SImode and DImode */
768 2, /* cost of moving SSE register */
769 {4, 4, 6}, /* cost of loading SSE registers
770 in SImode, DImode and TImode */
771 {4, 4, 5}, /* cost of storing SSE registers
772 in SImode, DImode and TImode */
773 5, /* MMX or SSE register to integer */
774 64, /* size of l1 cache. */
775 256, /* size of l2 cache. */
776 64, /* size of prefetch block */
777 6, /* number of parallel prefetches */
778 5, /* Branch cost */
779 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
780 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
781 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
782 COSTS_N_INSNS (2), /* cost of FABS instruction. */
783 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
784 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
785 athlon_memcpy,
786 athlon_memset,
787 1, /* scalar_stmt_cost. */
788 1, /* scalar load_cost. */
789 1, /* scalar_store_cost. */
790 1, /* vec_stmt_cost. */
791 1, /* vec_to_scalar_cost. */
792 1, /* scalar_to_vec_cost. */
793 1, /* vec_align_load_cost. */
794 2, /* vec_unalign_load_cost. */
795 1, /* vec_store_cost. */
796 3, /* cond_taken_branch_cost. */
797 1, /* cond_not_taken_branch_cost. */
798 };
800 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
801 small blocks it is better to use a loop. For large blocks, a libcall can
802 do nontemporal accesses and beat inline code considerably. */
803 static stringop_algs k8_memcpy[2] = {
804 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
805 {-1, rep_prefix_4_byte, false}}},
806 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
807 {-1, libcall, false}}}};
808 static stringop_algs k8_memset[2] = {
809 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
810 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
811 {libcall, {{48, unrolled_loop, false},
812 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
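/* Reading k8_memcpy in light of the comment above: 32-bit code uses an
   inline loop up to 6 bytes, an unrolled loop up to 14 bytes and rep
   movsl beyond that, while 64-bit code switches from rep movsq to a
   library call above 8192 bytes so the library can use nontemporal
   stores for large blocks.  */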
813 static const
814 struct processor_costs k8_cost = {
815 COSTS_N_INSNS (1), /* cost of an add instruction */
816 COSTS_N_INSNS (2), /* cost of a lea instruction */
817 COSTS_N_INSNS (1), /* variable shift costs */
818 COSTS_N_INSNS (1), /* constant shift costs */
819 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
820 COSTS_N_INSNS (4), /* HI */
821 COSTS_N_INSNS (3), /* SI */
822 COSTS_N_INSNS (4), /* DI */
823 COSTS_N_INSNS (5)}, /* other */
824 0, /* cost of multiply per each bit set */
825 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
826 COSTS_N_INSNS (26), /* HI */
827 COSTS_N_INSNS (42), /* SI */
828 COSTS_N_INSNS (74), /* DI */
829 COSTS_N_INSNS (74)}, /* other */
830 COSTS_N_INSNS (1), /* cost of movsx */
831 COSTS_N_INSNS (1), /* cost of movzx */
832 8, /* "large" insn */
833 9, /* MOVE_RATIO */
834 4, /* cost for loading QImode using movzbl */
835 {3, 4, 3}, /* cost of loading integer registers
836 in QImode, HImode and SImode.
837 Relative to reg-reg move (2). */
838 {3, 4, 3}, /* cost of storing integer registers */
839 4, /* cost of reg,reg fld/fst */
840 {4, 4, 12}, /* cost of loading fp registers
841 in SFmode, DFmode and XFmode */
842 {6, 6, 8}, /* cost of storing fp registers
843 in SFmode, DFmode and XFmode */
844 2, /* cost of moving MMX register */
845 {3, 3}, /* cost of loading MMX registers
846 in SImode and DImode */
847 {4, 4}, /* cost of storing MMX registers
848 in SImode and DImode */
849 2, /* cost of moving SSE register */
850 {4, 3, 6}, /* cost of loading SSE registers
851 in SImode, DImode and TImode */
852 {4, 4, 5}, /* cost of storing SSE registers
853 in SImode, DImode and TImode */
854 5, /* MMX or SSE register to integer */
855 64, /* size of l1 cache. */
856 512, /* size of l2 cache. */
857 64, /* size of prefetch block */
858 /* New AMD processors never drop prefetches; if they cannot be performed
859 immediately, they are queued. We set number of simultaneous prefetches
860 to a large constant to reflect this (it probably is not a good idea not
861 to limit number of prefetches at all, as their execution also takes some
862 time). */
863 100, /* number of parallel prefetches */
864 3, /* Branch cost */
865 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
866 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
867 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
868 COSTS_N_INSNS (2), /* cost of FABS instruction. */
869 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
870 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
872 k8_memcpy,
873 k8_memset,
874 4, /* scalar_stmt_cost. */
875 2, /* scalar load_cost. */
876 2, /* scalar_store_cost. */
877 5, /* vec_stmt_cost. */
878 0, /* vec_to_scalar_cost. */
879 2, /* scalar_to_vec_cost. */
880 2, /* vec_align_load_cost. */
881 3, /* vec_unalign_load_cost. */
882 3, /* vec_store_cost. */
883 3, /* cond_taken_branch_cost. */
884 2, /* cond_not_taken_branch_cost. */
885 };
887 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
888 very small blocks it is better to use a loop. For large blocks, a libcall can
889 do nontemporal accesses and beat inline code considerably. */
890 static stringop_algs amdfam10_memcpy[2] = {
891 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
892 {-1, rep_prefix_4_byte, false}}},
893 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
894 {-1, libcall, false}}}};
895 static stringop_algs amdfam10_memset[2] = {
896 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
897 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
898 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
899 {-1, libcall, false}}}};
900 struct processor_costs amdfam10_cost = {
901 COSTS_N_INSNS (1), /* cost of an add instruction */
902 COSTS_N_INSNS (2), /* cost of a lea instruction */
903 COSTS_N_INSNS (1), /* variable shift costs */
904 COSTS_N_INSNS (1), /* constant shift costs */
905 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
906 COSTS_N_INSNS (4), /* HI */
907 COSTS_N_INSNS (3), /* SI */
908 COSTS_N_INSNS (4), /* DI */
909 COSTS_N_INSNS (5)}, /* other */
910 0, /* cost of multiply per each bit set */
911 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
912 COSTS_N_INSNS (35), /* HI */
913 COSTS_N_INSNS (51), /* SI */
914 COSTS_N_INSNS (83), /* DI */
915 COSTS_N_INSNS (83)}, /* other */
916 COSTS_N_INSNS (1), /* cost of movsx */
917 COSTS_N_INSNS (1), /* cost of movzx */
918 8, /* "large" insn */
919 9, /* MOVE_RATIO */
920 4, /* cost for loading QImode using movzbl */
921 {3, 4, 3}, /* cost of loading integer registers
922 in QImode, HImode and SImode.
923 Relative to reg-reg move (2). */
924 {3, 4, 3}, /* cost of storing integer registers */
925 4, /* cost of reg,reg fld/fst */
926 {4, 4, 12}, /* cost of loading fp registers
927 in SFmode, DFmode and XFmode */
928 {6, 6, 8}, /* cost of storing fp registers
929 in SFmode, DFmode and XFmode */
930 2, /* cost of moving MMX register */
931 {3, 3}, /* cost of loading MMX registers
932 in SImode and DImode */
933 {4, 4}, /* cost of storing MMX registers
934 in SImode and DImode */
935 2, /* cost of moving SSE register */
936 {4, 4, 3}, /* cost of loading SSE registers
937 in SImode, DImode and TImode */
938 {4, 4, 5}, /* cost of storing SSE registers
939 in SImode, DImode and TImode */
940 3, /* MMX or SSE register to integer */
941 /* On K8:
942 MOVD reg64, xmmreg Double FSTORE 4
943 MOVD reg32, xmmreg Double FSTORE 4
944 On AMDFAM10:
945 MOVD reg64, xmmreg Double FADD 3
946 1/1 1/1
947 MOVD reg32, xmmreg Double FADD 3
948 1/1 1/1 */
949 64, /* size of l1 cache. */
950 512, /* size of l2 cache. */
951 64, /* size of prefetch block */
952 /* New AMD processors never drop prefetches; if they cannot be performed
953 immediately, they are queued. We set number of simultaneous prefetches
954 to a large constant to reflect this (it probably is not a good idea not
955 to limit number of prefetches at all, as their execution also takes some
956 time). */
957 100, /* number of parallel prefetches */
958 2, /* Branch cost */
959 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
960 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
961 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
962 COSTS_N_INSNS (2), /* cost of FABS instruction. */
963 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
964 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
966 amdfam10_memcpy,
967 amdfam10_memset,
968 4, /* scalar_stmt_cost. */
969 2, /* scalar load_cost. */
970 2, /* scalar_store_cost. */
971 6, /* vec_stmt_cost. */
972 0, /* vec_to_scalar_cost. */
973 2, /* scalar_to_vec_cost. */
974 2, /* vec_align_load_cost. */
975 2, /* vec_unalign_load_cost. */
976 2, /* vec_store_cost. */
977 2, /* cond_taken_branch_cost. */
978 1, /* cond_not_taken_branch_cost. */
979 };
981 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
982 very small blocks it is better to use a loop. For large blocks, a libcall
983 can do nontemporal accesses and beat inline code considerably. */
984 static stringop_algs bdver1_memcpy[2] = {
985 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
986 {-1, rep_prefix_4_byte, false}}},
987 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
988 {-1, libcall, false}}}};
989 static stringop_algs bdver1_memset[2] = {
990 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
991 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
992 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
993 {-1, libcall, false}}}};
995 const struct processor_costs bdver1_cost = {
996 COSTS_N_INSNS (1), /* cost of an add instruction */
997 COSTS_N_INSNS (1), /* cost of a lea instruction */
998 COSTS_N_INSNS (1), /* variable shift costs */
999 COSTS_N_INSNS (1), /* constant shift costs */
1000 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1001 COSTS_N_INSNS (4), /* HI */
1002 COSTS_N_INSNS (4), /* SI */
1003 COSTS_N_INSNS (6), /* DI */
1004 COSTS_N_INSNS (6)}, /* other */
1005 0, /* cost of multiply per each bit set */
1006 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1007 COSTS_N_INSNS (35), /* HI */
1008 COSTS_N_INSNS (51), /* SI */
1009 COSTS_N_INSNS (83), /* DI */
1010 COSTS_N_INSNS (83)}, /* other */
1011 COSTS_N_INSNS (1), /* cost of movsx */
1012 COSTS_N_INSNS (1), /* cost of movzx */
1013 8, /* "large" insn */
1014 9, /* MOVE_RATIO */
1015 4, /* cost for loading QImode using movzbl */
1016 {5, 5, 4}, /* cost of loading integer registers
1017 in QImode, HImode and SImode.
1018 Relative to reg-reg move (2). */
1019 {4, 4, 4}, /* cost of storing integer registers */
1020 2, /* cost of reg,reg fld/fst */
1021 {5, 5, 12}, /* cost of loading fp registers
1022 in SFmode, DFmode and XFmode */
1023 {4, 4, 8}, /* cost of storing fp registers
1024 in SFmode, DFmode and XFmode */
1025 2, /* cost of moving MMX register */
1026 {4, 4}, /* cost of loading MMX registers
1027 in SImode and DImode */
1028 {4, 4}, /* cost of storing MMX registers
1029 in SImode and DImode */
1030 2, /* cost of moving SSE register */
1031 {4, 4, 4}, /* cost of loading SSE registers
1032 in SImode, DImode and TImode */
1033 {4, 4, 4}, /* cost of storing SSE registers
1034 in SImode, DImode and TImode */
1035 2, /* MMX or SSE register to integer */
1036 /* On K8:
1037 MOVD reg64, xmmreg Double FSTORE 4
1038 MOVD reg32, xmmreg Double FSTORE 4
1039 On AMDFAM10:
1040 MOVD reg64, xmmreg Double FADD 3
1041 1/1 1/1
1042 MOVD reg32, xmmreg Double FADD 3
1043 1/1 1/1 */
1044 16, /* size of l1 cache. */
1045 2048, /* size of l2 cache. */
1046 64, /* size of prefetch block */
1047 /* New AMD processors never drop prefetches; if they cannot be performed
1048 immediately, they are queued. We set number of simultaneous prefetches
1049 to a large constant to reflect this (it probably is not a good idea not
1050 to limit number of prefetches at all, as their execution also takes some
1051 time). */
1052 100, /* number of parallel prefetches */
1053 2, /* Branch cost */
1054 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1055 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1056 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1057 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1058 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1059 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1061 bdver1_memcpy,
1062 bdver1_memset,
1063 6, /* scalar_stmt_cost. */
1064 4, /* scalar load_cost. */
1065 4, /* scalar_store_cost. */
1066 6, /* vec_stmt_cost. */
1067 0, /* vec_to_scalar_cost. */
1068 2, /* scalar_to_vec_cost. */
1069 4, /* vec_align_load_cost. */
1070 4, /* vec_unalign_load_cost. */
1071 4, /* vec_store_cost. */
1072 4, /* cond_taken_branch_cost. */
1073 2, /* cond_not_taken_branch_cost. */
1074 };
1076 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1077 very small blocks it is better to use a loop. For large blocks, a libcall
1078 can do nontemporal accesses and beat inline code considerably. */
1080 static stringop_algs bdver2_memcpy[2] = {
1081 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1082 {-1, rep_prefix_4_byte, false}}},
1083 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1084 {-1, libcall, false}}}};
1085 static stringop_algs bdver2_memset[2] = {
1086 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1087 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1088 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1089 {-1, libcall, false}}}};
1091 const struct processor_costs bdver2_cost = {
1092 COSTS_N_INSNS (1), /* cost of an add instruction */
1093 COSTS_N_INSNS (1), /* cost of a lea instruction */
1094 COSTS_N_INSNS (1), /* variable shift costs */
1095 COSTS_N_INSNS (1), /* constant shift costs */
1096 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1097 COSTS_N_INSNS (4), /* HI */
1098 COSTS_N_INSNS (4), /* SI */
1099 COSTS_N_INSNS (6), /* DI */
1100 COSTS_N_INSNS (6)}, /* other */
1101 0, /* cost of multiply per each bit set */
1102 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1103 COSTS_N_INSNS (35), /* HI */
1104 COSTS_N_INSNS (51), /* SI */
1105 COSTS_N_INSNS (83), /* DI */
1106 COSTS_N_INSNS (83)}, /* other */
1107 COSTS_N_INSNS (1), /* cost of movsx */
1108 COSTS_N_INSNS (1), /* cost of movzx */
1109 8, /* "large" insn */
1110 9, /* MOVE_RATIO */
1111 4, /* cost for loading QImode using movzbl */
1112 {5, 5, 4}, /* cost of loading integer registers
1113 in QImode, HImode and SImode.
1114 Relative to reg-reg move (2). */
1115 {4, 4, 4}, /* cost of storing integer registers */
1116 2, /* cost of reg,reg fld/fst */
1117 {5, 5, 12}, /* cost of loading fp registers
1118 in SFmode, DFmode and XFmode */
1119 {4, 4, 8}, /* cost of storing fp registers
1120 in SFmode, DFmode and XFmode */
1121 2, /* cost of moving MMX register */
1122 {4, 4}, /* cost of loading MMX registers
1123 in SImode and DImode */
1124 {4, 4}, /* cost of storing MMX registers
1125 in SImode and DImode */
1126 2, /* cost of moving SSE register */
1127 {4, 4, 4}, /* cost of loading SSE registers
1128 in SImode, DImode and TImode */
1129 {4, 4, 4}, /* cost of storing SSE registers
1130 in SImode, DImode and TImode */
1131 2, /* MMX or SSE register to integer */
1132 /* On K8:
1133 MOVD reg64, xmmreg Double FSTORE 4
1134 MOVD reg32, xmmreg Double FSTORE 4
1135 On AMDFAM10:
1136 MOVD reg64, xmmreg Double FADD 3
1137 1/1 1/1
1138 MOVD reg32, xmmreg Double FADD 3
1139 1/1 1/1 */
1140 16, /* size of l1 cache. */
1141 2048, /* size of l2 cache. */
1142 64, /* size of prefetch block */
1143 /* New AMD processors never drop prefetches; if they cannot be performed
1144 immediately, they are queued. We set number of simultaneous prefetches
1145 to a large constant to reflect this (it probably is not a good idea not
1146 to limit number of prefetches at all, as their execution also takes some
1147 time). */
1148 100, /* number of parallel prefetches */
1149 2, /* Branch cost */
1150 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1151 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1152 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1153 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1154 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1155 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1157 bdver2_memcpy,
1158 bdver2_memset,
1159 6, /* scalar_stmt_cost. */
1160 4, /* scalar load_cost. */
1161 4, /* scalar_store_cost. */
1162 6, /* vec_stmt_cost. */
1163 0, /* vec_to_scalar_cost. */
1164 2, /* scalar_to_vec_cost. */
1165 4, /* vec_align_load_cost. */
1166 4, /* vec_unalign_load_cost. */
1167 4, /* vec_store_cost. */
1168 4, /* cond_taken_branch_cost. */
1169 2, /* cond_not_taken_branch_cost. */
1170 };
1173 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1174 very small blocks it is better to use a loop. For large blocks, a libcall
1175 can do nontemporal accesses and beat inline code considerably. */
1176 static stringop_algs bdver3_memcpy[2] = {
1177 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1178 {-1, rep_prefix_4_byte, false}}},
1179 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1180 {-1, libcall, false}}}};
1181 static stringop_algs bdver3_memset[2] = {
1182 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1183 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1184 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1185 {-1, libcall, false}}}};
1186 struct processor_costs bdver3_cost = {
1187 COSTS_N_INSNS (1), /* cost of an add instruction */
1188 COSTS_N_INSNS (1), /* cost of a lea instruction */
1189 COSTS_N_INSNS (1), /* variable shift costs */
1190 COSTS_N_INSNS (1), /* constant shift costs */
1191 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1192 COSTS_N_INSNS (4), /* HI */
1193 COSTS_N_INSNS (4), /* SI */
1194 COSTS_N_INSNS (6), /* DI */
1195 COSTS_N_INSNS (6)}, /* other */
1196 0, /* cost of multiply per each bit set */
1197 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1198 COSTS_N_INSNS (35), /* HI */
1199 COSTS_N_INSNS (51), /* SI */
1200 COSTS_N_INSNS (83), /* DI */
1201 COSTS_N_INSNS (83)}, /* other */
1202 COSTS_N_INSNS (1), /* cost of movsx */
1203 COSTS_N_INSNS (1), /* cost of movzx */
1204 8, /* "large" insn */
1205 9, /* MOVE_RATIO */
1206 4, /* cost for loading QImode using movzbl */
1207 {5, 5, 4}, /* cost of loading integer registers
1208 in QImode, HImode and SImode.
1209 Relative to reg-reg move (2). */
1210 {4, 4, 4}, /* cost of storing integer registers */
1211 2, /* cost of reg,reg fld/fst */
1212 {5, 5, 12}, /* cost of loading fp registers
1213 in SFmode, DFmode and XFmode */
1214 {4, 4, 8}, /* cost of storing fp registers
1215 in SFmode, DFmode and XFmode */
1216 2, /* cost of moving MMX register */
1217 {4, 4}, /* cost of loading MMX registers
1218 in SImode and DImode */
1219 {4, 4}, /* cost of storing MMX registers
1220 in SImode and DImode */
1221 2, /* cost of moving SSE register */
1222 {4, 4, 4}, /* cost of loading SSE registers
1223 in SImode, DImode and TImode */
1224 {4, 4, 4}, /* cost of storing SSE registers
1225 in SImode, DImode and TImode */
1226 2, /* MMX or SSE register to integer */
1227 16, /* size of l1 cache. */
1228 2048, /* size of l2 cache. */
1229 64, /* size of prefetch block */
1230 /* New AMD processors never drop prefetches; if they cannot be performed
1231 immediately, they are queued. We set number of simultaneous prefetches
1232 to a large constant to reflect this (it probably is not a good idea not
1233 to limit number of prefetches at all, as their execution also takes some
1234 time). */
1235 100, /* number of parallel prefetches */
1236 2, /* Branch cost */
1237 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1238 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1239 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1240 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1241 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1242 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1244 bdver3_memcpy,
1245 bdver3_memset,
1246 6, /* scalar_stmt_cost. */
1247 4, /* scalar load_cost. */
1248 4, /* scalar_store_cost. */
1249 6, /* vec_stmt_cost. */
1250 0, /* vec_to_scalar_cost. */
1251 2, /* scalar_to_vec_cost. */
1252 4, /* vec_align_load_cost. */
1253 4, /* vec_unalign_load_cost. */
1254 4, /* vec_store_cost. */
1255 4, /* cond_taken_branch_cost. */
1256 2, /* cond_not_taken_branch_cost. */
1257 };
1259 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1260 very small blocks it is better to use a loop. For large blocks, a libcall
1261 can do nontemporal accesses and beat inline code considerably. */
1262 static stringop_algs bdver4_memcpy[2] = {
1263 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1264 {-1, rep_prefix_4_byte, false}}},
1265 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1266 {-1, libcall, false}}}};
1267 static stringop_algs bdver4_memset[2] = {
1268 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1269 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1270 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1271 {-1, libcall, false}}}};
1272 struct processor_costs bdver4_cost = {
1273 COSTS_N_INSNS (1), /* cost of an add instruction */
1274 COSTS_N_INSNS (1), /* cost of a lea instruction */
1275 COSTS_N_INSNS (1), /* variable shift costs */
1276 COSTS_N_INSNS (1), /* constant shift costs */
1277 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1278 COSTS_N_INSNS (4), /* HI */
1279 COSTS_N_INSNS (4), /* SI */
1280 COSTS_N_INSNS (6), /* DI */
1281 COSTS_N_INSNS (6)}, /* other */
1282 0, /* cost of multiply per each bit set */
1283 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1284 COSTS_N_INSNS (35), /* HI */
1285 COSTS_N_INSNS (51), /* SI */
1286 COSTS_N_INSNS (83), /* DI */
1287 COSTS_N_INSNS (83)}, /* other */
1288 COSTS_N_INSNS (1), /* cost of movsx */
1289 COSTS_N_INSNS (1), /* cost of movzx */
1290 8, /* "large" insn */
1291 9, /* MOVE_RATIO */
1292 4, /* cost for loading QImode using movzbl */
1293 {5, 5, 4}, /* cost of loading integer registers
1294 in QImode, HImode and SImode.
1295 Relative to reg-reg move (2). */
1296 {4, 4, 4}, /* cost of storing integer registers */
1297 2, /* cost of reg,reg fld/fst */
1298 {5, 5, 12}, /* cost of loading fp registers
1299 in SFmode, DFmode and XFmode */
1300 {4, 4, 8}, /* cost of storing fp registers
1301 in SFmode, DFmode and XFmode */
1302 2, /* cost of moving MMX register */
1303 {4, 4}, /* cost of loading MMX registers
1304 in SImode and DImode */
1305 {4, 4}, /* cost of storing MMX registers
1306 in SImode and DImode */
1307 2, /* cost of moving SSE register */
1308 {4, 4, 4}, /* cost of loading SSE registers
1309 in SImode, DImode and TImode */
1310 {4, 4, 4}, /* cost of storing SSE registers
1311 in SImode, DImode and TImode */
1312 2, /* MMX or SSE register to integer */
1313 16, /* size of l1 cache. */
1314 2048, /* size of l2 cache. */
1315 64, /* size of prefetch block */
1316 /* New AMD processors never drop prefetches; if they cannot be performed
1317 immediately, they are queued. We set number of simultaneous prefetches
1318 to a large constant to reflect this (it probably is not a good idea not
1319 to limit number of prefetches at all, as their execution also takes some
1320 time). */
1321 100, /* number of parallel prefetches */
1322 2, /* Branch cost */
1323 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1324 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1325 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1326 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1327 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1328 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1330 bdver4_memcpy,
1331 bdver4_memset,
1332 6, /* scalar_stmt_cost. */
1333 4, /* scalar load_cost. */
1334 4, /* scalar_store_cost. */
1335 6, /* vec_stmt_cost. */
1336 0, /* vec_to_scalar_cost. */
1337 2, /* scalar_to_vec_cost. */
1338 4, /* vec_align_load_cost. */
1339 4, /* vec_unalign_load_cost. */
1340 4, /* vec_store_cost. */
1341 4, /* cond_taken_branch_cost. */
1342 2, /* cond_not_taken_branch_cost. */
1343 };
1346 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1347 very small blocks it is better to use a loop. For large blocks, a libcall
1348 can do nontemporal accesses and beat inline code considerably. */
1349 static stringop_algs znver1_memcpy[2] = {
1350 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1351 {-1, rep_prefix_4_byte, false}}},
1352 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1353 {-1, libcall, false}}}};
1354 static stringop_algs znver1_memset[2] = {
1355 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1356 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1357 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1358 {-1, libcall, false}}}};
1359 struct processor_costs znver1_cost = {
1360 COSTS_N_INSNS (1), /* cost of an add instruction. */
1361 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1362 COSTS_N_INSNS (1), /* variable shift costs. */
1363 COSTS_N_INSNS (1), /* constant shift costs. */
1364 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1365 COSTS_N_INSNS (3), /* HI. */
1366 COSTS_N_INSNS (3), /* SI. */
1367 COSTS_N_INSNS (4), /* DI. */
1368 COSTS_N_INSNS (4)}, /* other. */
1369 0, /* cost of multiply per each bit
1370 set. */
1371 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1372 COSTS_N_INSNS (35), /* HI. */
1373 COSTS_N_INSNS (51), /* SI. */
1374 COSTS_N_INSNS (83), /* DI. */
1375 COSTS_N_INSNS (83)}, /* other. */
1376 COSTS_N_INSNS (1), /* cost of movsx. */
1377 COSTS_N_INSNS (1), /* cost of movzx. */
1378 8, /* "large" insn. */
1379 9, /* MOVE_RATIO. */
1380 4, /* cost for loading QImode using
1381 movzbl. */
1382 {5, 5, 4}, /* cost of loading integer registers
1383 in QImode, HImode and SImode.
1384 Relative to reg-reg move (2). */
1385 {4, 4, 4}, /* cost of storing integer
1386 registers. */
1387 2, /* cost of reg,reg fld/fst. */
1388 {5, 5, 12}, /* cost of loading fp registers
1389 in SFmode, DFmode and XFmode. */
1390 {4, 4, 8}, /* cost of storing fp registers
1391 in SFmode, DFmode and XFmode. */
1392 2, /* cost of moving MMX register. */
1393 {4, 4}, /* cost of loading MMX registers
1394 in SImode and DImode. */
1395 {4, 4}, /* cost of storing MMX registers
1396 in SImode and DImode. */
1397 2, /* cost of moving SSE register. */
1398 {4, 4, 4}, /* cost of loading SSE registers
1399 in SImode, DImode and TImode. */
1400 {4, 4, 4}, /* cost of storing SSE registers
1401 in SImode, DImode and TImode. */
1402 2, /* MMX or SSE register to integer. */
1403 32, /* size of l1 cache. */
1404 512, /* size of l2 cache. */
1405 64, /* size of prefetch block. */
1406 /* New AMD processors never drop prefetches; if they cannot be performed
1407 immediately, they are queued. We set number of simultaneous prefetches
1408 to a large constant to reflect this (it probably is not a good idea not
1409 to limit number of prefetches at all, as their execution also takes some
1410 time). */
1411 100, /* number of parallel prefetches. */
1412 2, /* Branch cost. */
1413 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1414 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1415 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1416 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1417 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1418 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1420 znver1_memcpy,
1421 znver1_memset,
1422 6, /* scalar_stmt_cost. */
1423 4, /* scalar load_cost. */
1424 4, /* scalar_store_cost. */
1425 6, /* vec_stmt_cost. */
1426 0, /* vec_to_scalar_cost. */
1427 2, /* scalar_to_vec_cost. */
1428 4, /* vec_align_load_cost. */
1429 4, /* vec_unalign_load_cost. */
1430 4, /* vec_store_cost. */
1431 4, /* cond_taken_branch_cost. */
1432 2, /* cond_not_taken_branch_cost. */
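/* Added reading note for the table above: the per-mode load/store entries use
   the scale described in the field comments, where a reg-reg move costs 2.
   So the {5, 5, 4} integer-load row rates a QImode or HImode load at 5
   (2.5 times a register move) and an SImode load at 4 (twice a register
   move).  */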
1435 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1436 very small blocks it is better to use a loop.  For large blocks, a libcall can
1437 do non-temporal accesses and beat inlined code considerably.  */
1438 static stringop_algs btver1_memcpy[2] = {
1439 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1440 {-1, rep_prefix_4_byte, false}}},
1441 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1442 {-1, libcall, false}}}};
1443 static stringop_algs btver1_memset[2] = {
1444 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1445 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1446 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1447 {-1, libcall, false}}}};
1448 const struct processor_costs btver1_cost = {
1449 COSTS_N_INSNS (1), /* cost of an add instruction */
1450 COSTS_N_INSNS (2), /* cost of a lea instruction */
1451 COSTS_N_INSNS (1), /* variable shift costs */
1452 COSTS_N_INSNS (1), /* constant shift costs */
1453 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1454 COSTS_N_INSNS (4), /* HI */
1455 COSTS_N_INSNS (3), /* SI */
1456 COSTS_N_INSNS (4), /* DI */
1457 COSTS_N_INSNS (5)}, /* other */
1458 0, /* cost of multiply per each bit set */
1459 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1460 COSTS_N_INSNS (35), /* HI */
1461 COSTS_N_INSNS (51), /* SI */
1462 COSTS_N_INSNS (83), /* DI */
1463 COSTS_N_INSNS (83)}, /* other */
1464 COSTS_N_INSNS (1), /* cost of movsx */
1465 COSTS_N_INSNS (1), /* cost of movzx */
1466 8, /* "large" insn */
1467 9, /* MOVE_RATIO */
1468 4, /* cost for loading QImode using movzbl */
1469 {3, 4, 3}, /* cost of loading integer registers
1470 in QImode, HImode and SImode.
1471 Relative to reg-reg move (2). */
1472 {3, 4, 3}, /* cost of storing integer registers */
1473 4, /* cost of reg,reg fld/fst */
1474 {4, 4, 12}, /* cost of loading fp registers
1475 in SFmode, DFmode and XFmode */
1476 {6, 6, 8}, /* cost of storing fp registers
1477 in SFmode, DFmode and XFmode */
1478 2, /* cost of moving MMX register */
1479 {3, 3}, /* cost of loading MMX registers
1480 in SImode and DImode */
1481 {4, 4}, /* cost of storing MMX registers
1482 in SImode and DImode */
1483 2, /* cost of moving SSE register */
1484 {4, 4, 3}, /* cost of loading SSE registers
1485 in SImode, DImode and TImode */
1486 {4, 4, 5}, /* cost of storing SSE registers
1487 in SImode, DImode and TImode */
1488 3, /* MMX or SSE register to integer */
1489 /* On K8:
1490 MOVD reg64, xmmreg Double FSTORE 4
1491 MOVD reg32, xmmreg Double FSTORE 4
1492 On AMDFAM10:
1493 MOVD reg64, xmmreg Double FADD 3
1494 1/1 1/1
1495 MOVD reg32, xmmreg Double FADD 3
1496 1/1 1/1 */
1497 32, /* size of l1 cache. */
1498 512, /* size of l2 cache. */
1499 64, /* size of prefetch block */
1500 100, /* number of parallel prefetches */
1501 2, /* Branch cost */
1502 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1503 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1504 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1505 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1506 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1507 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1509 btver1_memcpy,
1510 btver1_memset,
1511 4, /* scalar_stmt_cost. */
1512 2, /* scalar load_cost. */
1513 2, /* scalar_store_cost. */
1514 6, /* vec_stmt_cost. */
1515 0, /* vec_to_scalar_cost. */
1516 2, /* scalar_to_vec_cost. */
1517 2, /* vec_align_load_cost. */
1518 2, /* vec_unalign_load_cost. */
1519 2, /* vec_store_cost. */
1520 2, /* cond_taken_branch_cost. */
1521 1, /* cond_not_taken_branch_cost. */
1524 static stringop_algs btver2_memcpy[2] = {
1525 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1526 {-1, rep_prefix_4_byte, false}}},
1527 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1528 {-1, libcall, false}}}};
1529 static stringop_algs btver2_memset[2] = {
1530 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1531 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1532 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1533 {-1, libcall, false}}}};
1534 const struct processor_costs btver2_cost = {
1535 COSTS_N_INSNS (1), /* cost of an add instruction */
1536 COSTS_N_INSNS (2), /* cost of a lea instruction */
1537 COSTS_N_INSNS (1), /* variable shift costs */
1538 COSTS_N_INSNS (1), /* constant shift costs */
1539 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1540 COSTS_N_INSNS (4), /* HI */
1541 COSTS_N_INSNS (3), /* SI */
1542 COSTS_N_INSNS (4), /* DI */
1543 COSTS_N_INSNS (5)}, /* other */
1544 0, /* cost of multiply per each bit set */
1545 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1546 COSTS_N_INSNS (35), /* HI */
1547 COSTS_N_INSNS (51), /* SI */
1548 COSTS_N_INSNS (83), /* DI */
1549 COSTS_N_INSNS (83)}, /* other */
1550 COSTS_N_INSNS (1), /* cost of movsx */
1551 COSTS_N_INSNS (1), /* cost of movzx */
1552 8, /* "large" insn */
1553 9, /* MOVE_RATIO */
1554 4, /* cost for loading QImode using movzbl */
1555 {3, 4, 3}, /* cost of loading integer registers
1556 in QImode, HImode and SImode.
1557 Relative to reg-reg move (2). */
1558 {3, 4, 3}, /* cost of storing integer registers */
1559 4, /* cost of reg,reg fld/fst */
1560 {4, 4, 12}, /* cost of loading fp registers
1561 in SFmode, DFmode and XFmode */
1562 {6, 6, 8}, /* cost of storing fp registers
1563 in SFmode, DFmode and XFmode */
1564 2, /* cost of moving MMX register */
1565 {3, 3}, /* cost of loading MMX registers
1566 in SImode and DImode */
1567 {4, 4}, /* cost of storing MMX registers
1568 in SImode and DImode */
1569 2, /* cost of moving SSE register */
1570 {4, 4, 3}, /* cost of loading SSE registers
1571 in SImode, DImode and TImode */
1572 {4, 4, 5}, /* cost of storing SSE registers
1573 in SImode, DImode and TImode */
1574 3, /* MMX or SSE register to integer */
1575 /* On K8:
1576 MOVD reg64, xmmreg Double FSTORE 4
1577 MOVD reg32, xmmreg Double FSTORE 4
1578 On AMDFAM10:
1579 MOVD reg64, xmmreg Double FADD 3
1580 1/1 1/1
1581 MOVD reg32, xmmreg Double FADD 3
1582 1/1 1/1 */
1583 32, /* size of l1 cache. */
1584 2048, /* size of l2 cache. */
1585 64, /* size of prefetch block */
1586 100, /* number of parallel prefetches */
1587 2, /* Branch cost */
1588 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1589 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1590 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1591 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1592 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1593 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1594 btver2_memcpy,
1595 btver2_memset,
1596 4, /* scalar_stmt_cost. */
1597 2, /* scalar load_cost. */
1598 2, /* scalar_store_cost. */
1599 6, /* vec_stmt_cost. */
1600 0, /* vec_to_scalar_cost. */
1601 2, /* scalar_to_vec_cost. */
1602 2, /* vec_align_load_cost. */
1603 2, /* vec_unalign_load_cost. */
1604 2, /* vec_store_cost. */
1605 2, /* cond_taken_branch_cost. */
1606 1, /* cond_not_taken_branch_cost. */
1609 static stringop_algs pentium4_memcpy[2] = {
1610 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1611 DUMMY_STRINGOP_ALGS};
1612 static stringop_algs pentium4_memset[2] = {
1613 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1614 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1615 DUMMY_STRINGOP_ALGS};
1617 static const
1618 struct processor_costs pentium4_cost = {
1619 COSTS_N_INSNS (1), /* cost of an add instruction */
1620 COSTS_N_INSNS (3), /* cost of a lea instruction */
1621 COSTS_N_INSNS (4), /* variable shift costs */
1622 COSTS_N_INSNS (4), /* constant shift costs */
1623 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1624 COSTS_N_INSNS (15), /* HI */
1625 COSTS_N_INSNS (15), /* SI */
1626 COSTS_N_INSNS (15), /* DI */
1627 COSTS_N_INSNS (15)}, /* other */
1628 0, /* cost of multiply per each bit set */
1629 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1630 COSTS_N_INSNS (56), /* HI */
1631 COSTS_N_INSNS (56), /* SI */
1632 COSTS_N_INSNS (56), /* DI */
1633 COSTS_N_INSNS (56)}, /* other */
1634 COSTS_N_INSNS (1), /* cost of movsx */
1635 COSTS_N_INSNS (1), /* cost of movzx */
1636 16, /* "large" insn */
1637 6, /* MOVE_RATIO */
1638 2, /* cost for loading QImode using movzbl */
1639 {4, 5, 4}, /* cost of loading integer registers
1640 in QImode, HImode and SImode.
1641 Relative to reg-reg move (2). */
1642 {2, 3, 2}, /* cost of storing integer registers */
1643 2, /* cost of reg,reg fld/fst */
1644 {2, 2, 6}, /* cost of loading fp registers
1645 in SFmode, DFmode and XFmode */
1646 {4, 4, 6}, /* cost of storing fp registers
1647 in SFmode, DFmode and XFmode */
1648 2, /* cost of moving MMX register */
1649 {2, 2}, /* cost of loading MMX registers
1650 in SImode and DImode */
1651 {2, 2}, /* cost of storing MMX registers
1652 in SImode and DImode */
1653 12, /* cost of moving SSE register */
1654 {12, 12, 12}, /* cost of loading SSE registers
1655 in SImode, DImode and TImode */
1656 {2, 2, 8}, /* cost of storing SSE registers
1657 in SImode, DImode and TImode */
1658 10, /* MMX or SSE register to integer */
1659 8, /* size of l1 cache. */
1660 256, /* size of l2 cache. */
1661 64, /* size of prefetch block */
1662 6, /* number of parallel prefetches */
1663 2, /* Branch cost */
1664 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1665 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1666 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1667 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1668 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1669 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1670 pentium4_memcpy,
1671 pentium4_memset,
1672 1, /* scalar_stmt_cost. */
1673 1, /* scalar load_cost. */
1674 1, /* scalar_store_cost. */
1675 1, /* vec_stmt_cost. */
1676 1, /* vec_to_scalar_cost. */
1677 1, /* scalar_to_vec_cost. */
1678 1, /* vec_align_load_cost. */
1679 2, /* vec_unalign_load_cost. */
1680 1, /* vec_store_cost. */
1681 3, /* cond_taken_branch_cost. */
1682 1, /* cond_not_taken_branch_cost. */
1685 static stringop_algs nocona_memcpy[2] = {
1686 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1687 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1688 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1690 static stringop_algs nocona_memset[2] = {
1691 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1692 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1693 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1694 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1696 static const
1697 struct processor_costs nocona_cost = {
1698 COSTS_N_INSNS (1), /* cost of an add instruction */
1699 COSTS_N_INSNS (1), /* cost of a lea instruction */
1700 COSTS_N_INSNS (1), /* variable shift costs */
1701 COSTS_N_INSNS (1), /* constant shift costs */
1702 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1703 COSTS_N_INSNS (10), /* HI */
1704 COSTS_N_INSNS (10), /* SI */
1705 COSTS_N_INSNS (10), /* DI */
1706 COSTS_N_INSNS (10)}, /* other */
1707 0, /* cost of multiply per each bit set */
1708 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1709 COSTS_N_INSNS (66), /* HI */
1710 COSTS_N_INSNS (66), /* SI */
1711 COSTS_N_INSNS (66), /* DI */
1712 COSTS_N_INSNS (66)}, /* other */
1713 COSTS_N_INSNS (1), /* cost of movsx */
1714 COSTS_N_INSNS (1), /* cost of movzx */
1715 16, /* "large" insn */
1716 17, /* MOVE_RATIO */
1717 4, /* cost for loading QImode using movzbl */
1718 {4, 4, 4}, /* cost of loading integer registers
1719 in QImode, HImode and SImode.
1720 Relative to reg-reg move (2). */
1721 {4, 4, 4}, /* cost of storing integer registers */
1722 3, /* cost of reg,reg fld/fst */
1723 {12, 12, 12}, /* cost of loading fp registers
1724 in SFmode, DFmode and XFmode */
1725 {4, 4, 4}, /* cost of storing fp registers
1726 in SFmode, DFmode and XFmode */
1727 6, /* cost of moving MMX register */
1728 {12, 12}, /* cost of loading MMX registers
1729 in SImode and DImode */
1730 {12, 12}, /* cost of storing MMX registers
1731 in SImode and DImode */
1732 6, /* cost of moving SSE register */
1733 {12, 12, 12}, /* cost of loading SSE registers
1734 in SImode, DImode and TImode */
1735 {12, 12, 12}, /* cost of storing SSE registers
1736 in SImode, DImode and TImode */
1737 8, /* MMX or SSE register to integer */
1738 8, /* size of l1 cache. */
1739 1024, /* size of l2 cache. */
1740 64, /* size of prefetch block */
1741 8, /* number of parallel prefetches */
1742 1, /* Branch cost */
1743 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1744 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1745 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1746 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1747 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1748 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1749 nocona_memcpy,
1750 nocona_memset,
1751 1, /* scalar_stmt_cost. */
1752 1, /* scalar load_cost. */
1753 1, /* scalar_store_cost. */
1754 1, /* vec_stmt_cost. */
1755 1, /* vec_to_scalar_cost. */
1756 1, /* scalar_to_vec_cost. */
1757 1, /* vec_align_load_cost. */
1758 2, /* vec_unalign_load_cost. */
1759 1, /* vec_store_cost. */
1760 3, /* cond_taken_branch_cost. */
1761 1, /* cond_not_taken_branch_cost. */
1764 static stringop_algs atom_memcpy[2] = {
1765 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1766 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1767 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1768 static stringop_algs atom_memset[2] = {
1769 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1770 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1771 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1772 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1773 static const
1774 struct processor_costs atom_cost = {
1775 COSTS_N_INSNS (1), /* cost of an add instruction */
1776 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1777 COSTS_N_INSNS (1), /* variable shift costs */
1778 COSTS_N_INSNS (1), /* constant shift costs */
1779 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1780 COSTS_N_INSNS (4), /* HI */
1781 COSTS_N_INSNS (3), /* SI */
1782 COSTS_N_INSNS (4), /* DI */
1783 COSTS_N_INSNS (2)}, /* other */
1784 0, /* cost of multiply per each bit set */
1785 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1786 COSTS_N_INSNS (26), /* HI */
1787 COSTS_N_INSNS (42), /* SI */
1788 COSTS_N_INSNS (74), /* DI */
1789 COSTS_N_INSNS (74)}, /* other */
1790 COSTS_N_INSNS (1), /* cost of movsx */
1791 COSTS_N_INSNS (1), /* cost of movzx */
1792 8, /* "large" insn */
1793 17, /* MOVE_RATIO */
1794 4, /* cost for loading QImode using movzbl */
1795 {4, 4, 4}, /* cost of loading integer registers
1796 in QImode, HImode and SImode.
1797 Relative to reg-reg move (2). */
1798 {4, 4, 4}, /* cost of storing integer registers */
1799 4, /* cost of reg,reg fld/fst */
1800 {12, 12, 12}, /* cost of loading fp registers
1801 in SFmode, DFmode and XFmode */
1802 {6, 6, 8}, /* cost of storing fp registers
1803 in SFmode, DFmode and XFmode */
1804 2, /* cost of moving MMX register */
1805 {8, 8}, /* cost of loading MMX registers
1806 in SImode and DImode */
1807 {8, 8}, /* cost of storing MMX registers
1808 in SImode and DImode */
1809 2, /* cost of moving SSE register */
1810 {8, 8, 8}, /* cost of loading SSE registers
1811 in SImode, DImode and TImode */
1812 {8, 8, 8}, /* cost of storing SSE registers
1813 in SImode, DImode and TImode */
1814 5, /* MMX or SSE register to integer */
1815 32, /* size of l1 cache. */
1816 256, /* size of l2 cache. */
1817 64, /* size of prefetch block */
1818 6, /* number of parallel prefetches */
1819 3, /* Branch cost */
1820 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1821 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1822 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1823 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1824 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1825 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1826 atom_memcpy,
1827 atom_memset,
1828 1, /* scalar_stmt_cost. */
1829 1, /* scalar load_cost. */
1830 1, /* scalar_store_cost. */
1831 1, /* vec_stmt_cost. */
1832 1, /* vec_to_scalar_cost. */
1833 1, /* scalar_to_vec_cost. */
1834 1, /* vec_align_load_cost. */
1835 2, /* vec_unalign_load_cost. */
1836 1, /* vec_store_cost. */
1837 3, /* cond_taken_branch_cost. */
1838 1, /* cond_not_taken_branch_cost. */
1841 static stringop_algs slm_memcpy[2] = {
1842 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1843 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1844 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1845 static stringop_algs slm_memset[2] = {
1846 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1847 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1848 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1849 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1850 static const
1851 struct processor_costs slm_cost = {
1852 COSTS_N_INSNS (1), /* cost of an add instruction */
1853 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1854 COSTS_N_INSNS (1), /* variable shift costs */
1855 COSTS_N_INSNS (1), /* constant shift costs */
1856 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1857 COSTS_N_INSNS (3), /* HI */
1858 COSTS_N_INSNS (3), /* SI */
1859 COSTS_N_INSNS (4), /* DI */
1860 COSTS_N_INSNS (2)}, /* other */
1861 0, /* cost of multiply per each bit set */
1862 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1863 COSTS_N_INSNS (26), /* HI */
1864 COSTS_N_INSNS (42), /* SI */
1865 COSTS_N_INSNS (74), /* DI */
1866 COSTS_N_INSNS (74)}, /* other */
1867 COSTS_N_INSNS (1), /* cost of movsx */
1868 COSTS_N_INSNS (1), /* cost of movzx */
1869 8, /* "large" insn */
1870 17, /* MOVE_RATIO */
1871 4, /* cost for loading QImode using movzbl */
1872 {4, 4, 4}, /* cost of loading integer registers
1873 in QImode, HImode and SImode.
1874 Relative to reg-reg move (2). */
1875 {4, 4, 4}, /* cost of storing integer registers */
1876 4, /* cost of reg,reg fld/fst */
1877 {12, 12, 12}, /* cost of loading fp registers
1878 in SFmode, DFmode and XFmode */
1879 {6, 6, 8}, /* cost of storing fp registers
1880 in SFmode, DFmode and XFmode */
1881 2, /* cost of moving MMX register */
1882 {8, 8}, /* cost of loading MMX registers
1883 in SImode and DImode */
1884 {8, 8}, /* cost of storing MMX registers
1885 in SImode and DImode */
1886 2, /* cost of moving SSE register */
1887 {8, 8, 8}, /* cost of loading SSE registers
1888 in SImode, DImode and TImode */
1889 {8, 8, 8}, /* cost of storing SSE registers
1890 in SImode, DImode and TImode */
1891 5, /* MMX or SSE register to integer */
1892 32, /* size of l1 cache. */
1893 256, /* size of l2 cache. */
1894 64, /* size of prefetch block */
1895 6, /* number of parallel prefetches */
1896 3, /* Branch cost */
1897 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1898 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1899 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1900 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1901 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1902 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1903 slm_memcpy,
1904 slm_memset,
1905 1, /* scalar_stmt_cost. */
1906 1, /* scalar load_cost. */
1907 1, /* scalar_store_cost. */
1908 1, /* vec_stmt_cost. */
1909 4, /* vec_to_scalar_cost. */
1910 1, /* scalar_to_vec_cost. */
1911 1, /* vec_align_load_cost. */
1912 2, /* vec_unalign_load_cost. */
1913 1, /* vec_store_cost. */
1914 3, /* cond_taken_branch_cost. */
1915 1, /* cond_not_taken_branch_cost. */
1918 static stringop_algs intel_memcpy[2] = {
1919 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1920 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1921 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1922 static stringop_algs intel_memset[2] = {
1923 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1924 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1925 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1926 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1927 static const
1928 struct processor_costs intel_cost = {
1929 COSTS_N_INSNS (1), /* cost of an add instruction */
1930 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1931 COSTS_N_INSNS (1), /* variable shift costs */
1932 COSTS_N_INSNS (1), /* constant shift costs */
1933 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1934 COSTS_N_INSNS (3), /* HI */
1935 COSTS_N_INSNS (3), /* SI */
1936 COSTS_N_INSNS (4), /* DI */
1937 COSTS_N_INSNS (2)}, /* other */
1938 0, /* cost of multiply per each bit set */
1939 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1940 COSTS_N_INSNS (26), /* HI */
1941 COSTS_N_INSNS (42), /* SI */
1942 COSTS_N_INSNS (74), /* DI */
1943 COSTS_N_INSNS (74)}, /* other */
1944 COSTS_N_INSNS (1), /* cost of movsx */
1945 COSTS_N_INSNS (1), /* cost of movzx */
1946 8, /* "large" insn */
1947 17, /* MOVE_RATIO */
1948 4, /* cost for loading QImode using movzbl */
1949 {4, 4, 4}, /* cost of loading integer registers
1950 in QImode, HImode and SImode.
1951 Relative to reg-reg move (2). */
1952 {4, 4, 4}, /* cost of storing integer registers */
1953 4, /* cost of reg,reg fld/fst */
1954 {12, 12, 12}, /* cost of loading fp registers
1955 in SFmode, DFmode and XFmode */
1956 {6, 6, 8}, /* cost of storing fp registers
1957 in SFmode, DFmode and XFmode */
1958 2, /* cost of moving MMX register */
1959 {8, 8}, /* cost of loading MMX registers
1960 in SImode and DImode */
1961 {8, 8}, /* cost of storing MMX registers
1962 in SImode and DImode */
1963 2, /* cost of moving SSE register */
1964 {8, 8, 8}, /* cost of loading SSE registers
1965 in SImode, DImode and TImode */
1966 {8, 8, 8}, /* cost of storing SSE registers
1967 in SImode, DImode and TImode */
1968 5, /* MMX or SSE register to integer */
1969 32, /* size of l1 cache. */
1970 256, /* size of l2 cache. */
1971 64, /* size of prefetch block */
1972 6, /* number of parallel prefetches */
1973 3, /* Branch cost */
1974 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1975 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1976 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1977 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1978 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1979 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1980 intel_memcpy,
1981 intel_memset,
1982 1, /* scalar_stmt_cost. */
1983 1, /* scalar load_cost. */
1984 1, /* scalar_store_cost. */
1985 1, /* vec_stmt_cost. */
1986 4, /* vec_to_scalar_cost. */
1987 1, /* scalar_to_vec_cost. */
1988 1, /* vec_align_load_cost. */
1989 2, /* vec_unalign_load_cost. */
1990 1, /* vec_store_cost. */
1991 3, /* cond_taken_branch_cost. */
1992 1, /* cond_not_taken_branch_cost. */
1995 /* Generic should produce code tuned for Core i7 (and newer) and btver1
1996 (and newer) chips.  */
1998 static stringop_algs generic_memcpy[2] = {
1999 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2000 {-1, libcall, false}}},
2001 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2002 {-1, libcall, false}}}};
2003 static stringop_algs generic_memset[2] = {
2004 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2005 {-1, libcall, false}}},
2006 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2007 {-1, libcall, false}}}};
2008 static const
2009 struct processor_costs generic_cost = {
2010 COSTS_N_INSNS (1), /* cost of an add instruction */
2011 /* On all chips taken into consideration, lea is 2 cycles or more.  With
2012 this cost, however, our current implementation of synth_mult results in
2013 the use of unnecessary temporary registers, causing regressions on several
2014 SPECfp benchmarks.  */
2015 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2016 COSTS_N_INSNS (1), /* variable shift costs */
2017 COSTS_N_INSNS (1), /* constant shift costs */
2018 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2019 COSTS_N_INSNS (4), /* HI */
2020 COSTS_N_INSNS (3), /* SI */
2021 COSTS_N_INSNS (4), /* DI */
2022 COSTS_N_INSNS (2)}, /* other */
2023 0, /* cost of multiply per each bit set */
2024 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2025 COSTS_N_INSNS (26), /* HI */
2026 COSTS_N_INSNS (42), /* SI */
2027 COSTS_N_INSNS (74), /* DI */
2028 COSTS_N_INSNS (74)}, /* other */
2029 COSTS_N_INSNS (1), /* cost of movsx */
2030 COSTS_N_INSNS (1), /* cost of movzx */
2031 8, /* "large" insn */
2032 17, /* MOVE_RATIO */
2033 4, /* cost for loading QImode using movzbl */
2034 {4, 4, 4}, /* cost of loading integer registers
2035 in QImode, HImode and SImode.
2036 Relative to reg-reg move (2). */
2037 {4, 4, 4}, /* cost of storing integer registers */
2038 4, /* cost of reg,reg fld/fst */
2039 {12, 12, 12}, /* cost of loading fp registers
2040 in SFmode, DFmode and XFmode */
2041 {6, 6, 8}, /* cost of storing fp registers
2042 in SFmode, DFmode and XFmode */
2043 2, /* cost of moving MMX register */
2044 {8, 8}, /* cost of loading MMX registers
2045 in SImode and DImode */
2046 {8, 8}, /* cost of storing MMX registers
2047 in SImode and DImode */
2048 2, /* cost of moving SSE register */
2049 {8, 8, 8}, /* cost of loading SSE registers
2050 in SImode, DImode and TImode */
2051 {8, 8, 8}, /* cost of storing SSE registers
2052 in SImode, DImode and TImode */
2053 5, /* MMX or SSE register to integer */
2054 32, /* size of l1 cache. */
2055 512, /* size of l2 cache. */
2056 64, /* size of prefetch block */
2057 6, /* number of parallel prefetches */
2058 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2059 value is increased to the perhaps more appropriate value of 5.  */
2060 3, /* Branch cost */
2061 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2062 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2063 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2064 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2065 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2066 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2067 generic_memcpy,
2068 generic_memset,
2069 1, /* scalar_stmt_cost. */
2070 1, /* scalar load_cost. */
2071 1, /* scalar_store_cost. */
2072 1, /* vec_stmt_cost. */
2073 1, /* vec_to_scalar_cost. */
2074 1, /* scalar_to_vec_cost. */
2075 1, /* vec_align_load_cost. */
2076 2, /* vec_unalign_load_cost. */
2077 1, /* vec_store_cost. */
2078 3, /* cond_taken_branch_cost. */
2079 1, /* cond_not_taken_branch_cost. */
2082 /* core_cost should produce code tuned for the Core family of CPUs.  */
2083 static stringop_algs core_memcpy[2] = {
2084 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2085 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2086 {-1, libcall, false}}}};
2087 static stringop_algs core_memset[2] = {
2088 {libcall, {{6, loop_1_byte, true},
2089 {24, loop, true},
2090 {8192, rep_prefix_4_byte, true},
2091 {-1, libcall, false}}},
2092 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2093 {-1, libcall, false}}}};
2095 static const
2096 struct processor_costs core_cost = {
2097 COSTS_N_INSNS (1), /* cost of an add instruction */
2098 /* On all chips taken into consideration, lea is 2 cycles or more.  With
2099 this cost, however, our current implementation of synth_mult results in
2100 the use of unnecessary temporary registers, causing regressions on several
2101 SPECfp benchmarks.  */
2102 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2103 COSTS_N_INSNS (1), /* variable shift costs */
2104 COSTS_N_INSNS (1), /* constant shift costs */
2105 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2106 COSTS_N_INSNS (4), /* HI */
2107 COSTS_N_INSNS (3), /* SI */
2108 COSTS_N_INSNS (4), /* DI */
2109 COSTS_N_INSNS (2)}, /* other */
2110 0, /* cost of multiply per each bit set */
2111 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2112 COSTS_N_INSNS (26), /* HI */
2113 COSTS_N_INSNS (42), /* SI */
2114 COSTS_N_INSNS (74), /* DI */
2115 COSTS_N_INSNS (74)}, /* other */
2116 COSTS_N_INSNS (1), /* cost of movsx */
2117 COSTS_N_INSNS (1), /* cost of movzx */
2118 8, /* "large" insn */
2119 17, /* MOVE_RATIO */
2120 4, /* cost for loading QImode using movzbl */
2121 {4, 4, 4}, /* cost of loading integer registers
2122 in QImode, HImode and SImode.
2123 Relative to reg-reg move (2). */
2124 {4, 4, 4}, /* cost of storing integer registers */
2125 4, /* cost of reg,reg fld/fst */
2126 {12, 12, 12}, /* cost of loading fp registers
2127 in SFmode, DFmode and XFmode */
2128 {6, 6, 8}, /* cost of storing fp registers
2129 in SFmode, DFmode and XFmode */
2130 2, /* cost of moving MMX register */
2131 {8, 8}, /* cost of loading MMX registers
2132 in SImode and DImode */
2133 {8, 8}, /* cost of storing MMX registers
2134 in SImode and DImode */
2135 2, /* cost of moving SSE register */
2136 {8, 8, 8}, /* cost of loading SSE registers
2137 in SImode, DImode and TImode */
2138 {8, 8, 8}, /* cost of storing SSE registers
2139 in SImode, DImode and TImode */
2140 5, /* MMX or SSE register to integer */
2141 64, /* size of l1 cache. */
2142 512, /* size of l2 cache. */
2143 64, /* size of prefetch block */
2144 6, /* number of parallel prefetches */
2145 /* FIXME perhaps more appropriate value is 5. */
2146 3, /* Branch cost */
2147 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2148 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2149 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2150 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2151 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2152 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2153 core_memcpy,
2154 core_memset,
2155 1, /* scalar_stmt_cost. */
2156 1, /* scalar load_cost. */
2157 1, /* scalar_store_cost. */
2158 1, /* vec_stmt_cost. */
2159 1, /* vec_to_scalar_cost. */
2160 1, /* scalar_to_vec_cost. */
2161 1, /* vec_align_load_cost. */
2162 2, /* vec_unalign_load_cost. */
2163 1, /* vec_store_cost. */
2164 3, /* cond_taken_branch_cost. */
2165 1, /* cond_not_taken_branch_cost. */
2169 /* Set by -mtune. */
2170 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2172 /* Set by -mtune or -Os. */
2173 const struct processor_costs *ix86_cost = &pentium_cost;
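/* Added note: both pointers start out at &pentium_cost only as a safe default;
   option handling later repoints them, roughly as sketched below (the exact
   code is in the option-override logic further down in this file):

     ix86_tune_cost = processor_target_table[ix86_tune].cost;
     ix86_cost = optimize_size ? &ix86_size_cost : ix86_tune_cost;  */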
2175 /* Processor feature/optimization bitmasks. */
2176 #define m_386 (1U<<PROCESSOR_I386)
2177 #define m_486 (1U<<PROCESSOR_I486)
2178 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2179 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2180 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2181 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2182 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2183 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2184 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2185 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2186 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2187 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2188 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2189 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2190 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2191 #define m_KNL (1U<<PROCESSOR_KNL)
2192 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2193 #define m_INTEL (1U<<PROCESSOR_INTEL)
2195 #define m_GEODE (1U<<PROCESSOR_GEODE)
2196 #define m_K6 (1U<<PROCESSOR_K6)
2197 #define m_K6_GEODE (m_K6 | m_GEODE)
2198 #define m_K8 (1U<<PROCESSOR_K8)
2199 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2200 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2201 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2202 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2203 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2204 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2205 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2206 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2207 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2208 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2209 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2210 #define m_BTVER (m_BTVER1 | m_BTVER2)
2211 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2212 | m_ZNVER1)
2214 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
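/* Added note: these masks are consumed by x86-tune.def, where each DEF_TUNE
   entry names the processors a tuning flag applies to.  A schematic
   (hypothetical) entry looks like:

     DEF_TUNE (X86_TUNE_EXAMPLE, "example_flag",
               m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)

   so checking whether a flag applies to the active -mtune processor is a
   single bit test of the selector against 1U << ix86_tune.  */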
2216 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2217 #undef DEF_TUNE
2218 #define DEF_TUNE(tune, name, selector) name,
2219 #include "x86-tune.def"
2220 #undef DEF_TUNE
2223 /* Feature tests against the various tunings. */
2224 unsigned char ix86_tune_features[X86_TUNE_LAST];
2226 /* Feature tests against the various tunings used to create ix86_tune_features
2227 based on the processor mask. */
2228 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2229 #undef DEF_TUNE
2230 #define DEF_TUNE(tune, name, selector) selector,
2231 #include "x86-tune.def"
2232 #undef DEF_TUNE
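/* Added note: ix86_tune_features[] is filled in during option handling by
   testing each selector above against the bit for the selected processor,
   approximately:

     unsigned ix86_tune_mask = 1U << ix86_tune;
     for (unsigned i = 0; i < X86_TUNE_LAST; i++)
       ix86_tune_features[i]
         = !!(initial_ix86_tune_features[i] & ix86_tune_mask);  */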
2235 /* Feature tests against the various architecture variations. */
2236 unsigned char ix86_arch_features[X86_ARCH_LAST];
2238 /* Feature tests against the various architecture variations, used to create
2239 ix86_arch_features based on the processor mask. */
2240 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2241 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2242 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2244 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2245 ~m_386,
2247 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2248 ~(m_386 | m_486),
2250 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2251 ~m_386,
2253 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2254 ~m_386,
2257 /* In case the average insn count for a single function invocation is
2258 lower than this constant, emit fast (but longer) prologue and
2259 epilogue code.  */
2260 #define FAST_PROLOGUE_INSN_COUNT 20
2262 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively.  */
2263 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2264 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2265 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2267 /* Array of the smallest class containing reg number REGNO, indexed by
2268 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2270 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2272 /* ax, dx, cx, bx */
2273 AREG, DREG, CREG, BREG,
2274 /* si, di, bp, sp */
2275 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2276 /* FP registers */
2277 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2278 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2279 /* arg pointer */
2280 NON_Q_REGS,
2281 /* flags, fpsr, fpcr, frame */
2282 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2283 /* SSE registers */
2284 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2285 SSE_REGS, SSE_REGS,
2286 /* MMX registers */
2287 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2288 MMX_REGS, MMX_REGS,
2289 /* REX registers */
2290 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2291 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2292 /* SSE REX registers */
2293 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2294 SSE_REGS, SSE_REGS,
2295 /* AVX-512 SSE registers */
2296 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2297 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2298 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2299 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2300 /* Mask registers. */
2301 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2302 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2303 /* MPX bound registers */
2304 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2307 /* The "default" register map used in 32bit mode. */
2309 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2311 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2312 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2313 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2314 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2315 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2316 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2317 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2318 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2319 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2320 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2321 101, 102, 103, 104, /* bound registers */
2324 /* The "default" register map used in 64bit mode. */
2326 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2328 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2329 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2330 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2331 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2332 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2333 8,9,10,11,12,13,14,15, /* extended integer registers */
2334 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2335 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2336 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2337 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2338 126, 127, 128, 129, /* bound registers */
2341 /* Define the register numbers to be used in Dwarf debugging information.
2342 The SVR4 reference port C compiler uses the following register numbers
2343 in its Dwarf output code:
2344 0 for %eax (gcc regno = 0)
2345 1 for %ecx (gcc regno = 2)
2346 2 for %edx (gcc regno = 1)
2347 3 for %ebx (gcc regno = 3)
2348 4 for %esp (gcc regno = 7)
2349 5 for %ebp (gcc regno = 6)
2350 6 for %esi (gcc regno = 4)
2351 7 for %edi (gcc regno = 5)
2352 The following three DWARF register numbers are never generated by
2353 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2354 believes these numbers have these meanings.
2355 8 for %eip (no gcc equivalent)
2356 9 for %eflags (gcc regno = 17)
2357 10 for %trapno (no gcc equivalent)
2358 It is not at all clear how we should number the FP stack registers
2359 for the x86 architecture. If the version of SDB on x86/svr4 were
2360 a bit less brain dead with respect to floating-point then we would
2361 have a precedent to follow with respect to DWARF register numbers
2362 for x86 FP registers, but the SDB on x86/svr4 is so completely
2363 broken with respect to FP registers that it is hardly worth thinking
2364 of it as something to strive for compatibility with.
2365 The version of x86/svr4 SDB I have at the moment does (partially)
2366 seem to believe that DWARF register number 11 is associated with
2367 the x86 register %st(0), but that's about all. Higher DWARF
2368 register numbers don't seem to be associated with anything in
2369 particular, and even for DWARF regno 11, SDB only seems to under-
2370 stand that it should say that a variable lives in %st(0) (when
2371 asked via an `=' command) if we said it was in DWARF regno 11,
2372 but SDB still prints garbage when asked for the value of the
2373 variable in question (via a `/' command).
2374 (Also note that the labels SDB prints for various FP stack regs
2375 when doing an `x' command are all wrong.)
2376 Note that these problems generally don't affect the native SVR4
2377 C compiler because it doesn't allow the use of -O with -g and
2378 because when it is *not* optimizing, it allocates a memory
2379 location for each floating-point variable, and the memory
2380 location is what gets described in the DWARF AT_location
2381 attribute for the variable in question.
2382 Regardless of the severe mental illness of the x86/svr4 SDB, we
2383 do something sensible here and we use the following DWARF
2384 register numbers. Note that these are all stack-top-relative
2385 numbers.
2386 11 for %st(0) (gcc regno = 8)
2387 12 for %st(1) (gcc regno = 9)
2388 13 for %st(2) (gcc regno = 10)
2389 14 for %st(3) (gcc regno = 11)
2390 15 for %st(4) (gcc regno = 12)
2391 16 for %st(5) (gcc regno = 13)
2392 17 for %st(6) (gcc regno = 14)
2393 18 for %st(7) (gcc regno = 15)
2395 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2397 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2398 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2399 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2400 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2401 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2402 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2403 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2404 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2405 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2406 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2407 101, 102, 103, 104, /* bound registers */
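/* Added examples of reading the map above: gcc regno 1 (%edx) maps to DWARF
   register 2 and gcc regno 2 (%ecx) to DWARF register 1, matching the
   numbering described in the comment before the table; %st(0) (gcc regno 8)
   maps to DWARF register 11, and the flags register (gcc regno 17) to DWARF
   register 9.  */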
2410 /* Define parameter passing and return registers. */
2412 static int const x86_64_int_parameter_registers[6] =
2414 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2417 static int const x86_64_ms_abi_int_parameter_registers[4] =
2419 CX_REG, DX_REG, R8_REG, R9_REG
2422 static int const x86_64_int_return_registers[4] =
2424 AX_REG, DX_REG, DI_REG, SI_REG
2427 /* Additional registers that are clobbered by SYSV calls. */
2429 #define NUM_X86_64_MS_CLOBBERED_REGS 12
2430 static int const x86_64_ms_sysv_extra_clobbered_registers
2431 [NUM_X86_64_MS_CLOBBERED_REGS] =
2433 SI_REG, DI_REG,
2434 XMM6_REG, XMM7_REG,
2435 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2436 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
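/* Added note: these are the registers (rsi, rdi and xmm6-xmm15) that are
   call-saved under the Microsoft 64-bit ABI but call-clobbered under the
   System V ABI, so an ms_abi function calling a sysv_abi function has to
   preserve them around the call; the out-of-line save/restore stubs described
   below exist for exactly that purpose.  */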
2439 enum xlogue_stub {
2440 XLOGUE_STUB_SAVE,
2441 XLOGUE_STUB_RESTORE,
2442 XLOGUE_STUB_RESTORE_TAIL,
2443 XLOGUE_STUB_SAVE_HFP,
2444 XLOGUE_STUB_RESTORE_HFP,
2445 XLOGUE_STUB_RESTORE_HFP_TAIL,
2447 XLOGUE_STUB_COUNT
2450 enum xlogue_stub_sets {
2451 XLOGUE_SET_ALIGNED,
2452 XLOGUE_SET_ALIGNED_PLUS_8,
2453 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
2454 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
2456 XLOGUE_SET_COUNT
2459 /* Register save/restore layout used by out-of-line stubs. */
2460 class xlogue_layout {
2461 public:
2462 struct reginfo
2464 unsigned regno;
2465 HOST_WIDE_INT offset; /* Offset from the stub's base pointer (rax or
2466 rsi) to where this register is stored.  */
2469 unsigned get_nregs () const {return m_nregs;}
2470 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
2472 const reginfo &get_reginfo (unsigned reg) const
2474 gcc_assert (reg < m_nregs);
2475 return m_regs[reg];
2478 static const char *get_stub_name (enum xlogue_stub stub,
2479 unsigned n_extra_args);
2481 /* Returns an rtx for the stub's symbol based upon
2482 1.) the specified stub (save, restore or restore_ret),
2483 2.) the value of cfun->machine->call_ms2sysv_extra_regs, and
2484 3.) whether or not stack alignment is being performed.  */
2485 static rtx get_stub_rtx (enum xlogue_stub stub);
2487 /* Returns the amount of stack space (including padding) that the stub
2488 needs to store registers based upon data in the machine_function. */
2489 HOST_WIDE_INT get_stack_space_used () const
2491 const struct machine_function *m = cfun->machine;
2492 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
2494 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
2495 return m_regs[last_reg].offset
2496 + (m->call_ms2sysv_pad_out ? 8 : 0)
2497 + STUB_INDEX_OFFSET;
2500 /* Returns the offset for the base pointer used by the stub. */
2501 HOST_WIDE_INT get_stub_ptr_offset () const
2503 return STUB_INDEX_OFFSET + m_stack_align_off_in;
2506 static const struct xlogue_layout &get_instance ();
2507 static unsigned count_stub_managed_regs ();
2508 static bool is_stub_managed_reg (unsigned regno, unsigned count);
2510 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
2511 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
2512 static const unsigned MAX_REGS = 18;
2513 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
2514 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
2515 static const unsigned STUB_NAME_MAX_LEN = 16;
2516 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
2517 static const unsigned REG_ORDER[MAX_REGS];
2518 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
2520 private:
2521 xlogue_layout ();
2522 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
2523 xlogue_layout (const xlogue_layout &);
2525 /* True if hard frame pointer is used. */
2526 bool m_hfp;
2528 /* Max number of registers this layout manages.  */
2529 unsigned m_nregs;
2531 /* Incoming offset from 16-byte alignment. */
2532 HOST_WIDE_INT m_stack_align_off_in;
2534 /* Register order and offsets. */
2535 struct reginfo m_regs[MAX_REGS];
2537 /* Lazily initialized cache of symbol names for stubs.  */
2538 static char s_stub_names[XLOGUE_STUB_COUNT][VARIANT_COUNT]
2539 [STUB_NAME_MAX_LEN];
2541 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
2544 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
2545 "savms64",
2546 "resms64",
2547 "resms64x",
2548 "savms64f",
2549 "resms64f",
2550 "resms64fx"
2553 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
2554 /* The offset values below are where each register is stored, for each layout,
2555 relative to the incoming stack pointer.  The value of each m_regs[].offset will
2556 be relative to the incoming base pointer (rax or rsi) used by the stub.
2558 s_instances: 0 1 2 3
2559 Offset: realigned or aligned + 8
2560 Register aligned aligned + 8 aligned w/HFP w/HFP */
2561 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
2562 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
2563 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
2564 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
2565 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
2566 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
2567 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
2568 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
2569 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
2570 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
2571 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
2572 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
2573 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
2574 BP_REG, /* 0xc0 0xc8 N/A N/A */
2575 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
2576 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
2577 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
2578 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
2581 /* Instantiate static const values. */
2582 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
2583 const unsigned xlogue_layout::MIN_REGS;
2584 const unsigned xlogue_layout::MAX_REGS;
2585 const unsigned xlogue_layout::MAX_EXTRA_REGS;
2586 const unsigned xlogue_layout::VARIANT_COUNT;
2587 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
2589 /* Initialize xlogue_layout::s_stub_names to zero. */
2590 char xlogue_layout::s_stub_names[XLOGUE_STUB_COUNT][VARIANT_COUNT]
2591 [STUB_NAME_MAX_LEN];
2593 /* Instantiates all xlogue_layout instances. */
2594 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
2595 xlogue_layout (0, false),
2596 xlogue_layout (8, false),
2597 xlogue_layout (0, true),
2598 xlogue_layout (8, true)
2601 /* Return an appropriate const instance of xlogue_layout based upon values
2602 in cfun->machine and crtl. */
2603 const struct xlogue_layout &
2604 xlogue_layout::get_instance ()
2606 enum xlogue_stub_sets stub_set;
2607 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
2609 if (stack_realign_fp)
2610 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2611 else if (frame_pointer_needed)
2612 stub_set = aligned_plus_8
2613 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
2614 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2615 else
2616 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
2618 return s_instances[stub_set];
2621 /* Determine how many clobbered registers can be saved by the stub.
2622 Returns the count of registers the stub will save and restore. */
2623 unsigned
2624 xlogue_layout::count_stub_managed_regs ()
2626 bool hfp = frame_pointer_needed || stack_realign_fp;
2627 unsigned i, count;
2628 unsigned regno;
2630 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
2632 regno = REG_ORDER[i];
2633 if (regno == BP_REG && hfp)
2634 continue;
2635 if (!ix86_save_reg (regno, false, false))
2636 break;
2637 ++count;
2639 return count;
2642 /* Determine if register REGNO is a stub managed register given the
2643 total COUNT of stub managed registers. */
2644 bool
2645 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
2647 bool hfp = frame_pointer_needed || stack_realign_fp;
2648 unsigned i;
2650 for (i = 0; i < count; ++i)
2652 gcc_assert (i < MAX_REGS);
2653 if (REG_ORDER[i] == BP_REG && hfp)
2654 ++count;
2655 else if (REG_ORDER[i] == regno)
2656 return true;
2658 return false;
2661 /* Constructor for xlogue_layout. */
2662 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
2663 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
2664 m_stack_align_off_in (stack_align_off_in)
2666 HOST_WIDE_INT offset = stack_align_off_in;
2667 unsigned i, j;
2669 for (i = j = 0; i < MAX_REGS; ++i)
2671 unsigned regno = REG_ORDER[i];
2673 if (regno == BP_REG && hfp)
2674 continue;
2675 if (SSE_REGNO_P (regno))
2677 offset += 16;
2678 /* Verify that SSE regs are always aligned. */
2679 gcc_assert (!((stack_align_off_in + offset) & 15));
2681 else
2682 offset += 8;
2684 m_regs[j].regno = regno;
2685 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
2687 gcc_assert (j == m_nregs);
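/* Added worked example: for the fully aligned layout (stack_align_off_in == 0,
   no hard frame pointer) the first entry is XMM15, stored 16 bytes above the
   incoming stack pointer, so m_regs[0].offset becomes 16 - STUB_INDEX_OFFSET
   = 0x10 - 0x70 = -0x60 relative to the stub's base pointer, matching the
   offset table in the REG_ORDER comment above.  */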
2690 const char *
2691 xlogue_layout::get_stub_name (enum xlogue_stub stub,
2692 unsigned n_extra_regs)
2694 char *name = s_stub_names[stub][n_extra_regs];
2696 /* Lazy init */
2697 if (!*name)
2699 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%u",
2700 STUB_BASE_NAMES[stub], MIN_REGS + n_extra_regs);
2701 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
2704 return name;
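/* Added example: with MIN_REGS == 12, the generated names range from
   "__savms64_12" (no extra registers) up to "__savms64_18", and likewise
   "__resms64_N", "__resms64x_N", etc. for the other stub kinds.  */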
2707 /* Return rtx of a symbol ref for the entry point (based upon
2708 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
2710 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
2712 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
2713 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
2714 gcc_assert (stub < XLOGUE_STUB_COUNT);
2715 gcc_assert (crtl->stack_realign_finalized);
2717 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
2720 /* Define the structure for the machine field in struct function. */
2722 struct GTY(()) stack_local_entry {
2723 unsigned short mode;
2724 unsigned short n;
2725 rtx rtl;
2726 struct stack_local_entry *next;
2729 /* Which CPU we are scheduling for.  */
2730 enum attr_cpu ix86_schedule;
2732 /* Which CPU we are optimizing for.  */
2733 enum processor_type ix86_tune;
2735 /* Which instruction set architecture to use. */
2736 enum processor_type ix86_arch;
2738 /* True if processor has SSE prefetch instruction. */
2739 unsigned char x86_prefetch_sse;
2741 /* -mstackrealign option */
2742 static const char ix86_force_align_arg_pointer_string[]
2743 = "force_align_arg_pointer";
2745 static rtx (*ix86_gen_leave) (void);
2746 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2747 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2748 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2749 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2750 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2751 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2752 static rtx (*ix86_gen_clzero) (rtx);
2753 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2754 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2755 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2756 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2757 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2758 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2760 /* Preferred alignment for stack boundary in bits. */
2761 unsigned int ix86_preferred_stack_boundary;
2763 /* Alignment for incoming stack boundary in bits specified at
2764 command line. */
2765 static unsigned int ix86_user_incoming_stack_boundary;
2767 /* Default alignment for incoming stack boundary in bits. */
2768 static unsigned int ix86_default_incoming_stack_boundary;
2770 /* Alignment for incoming stack boundary in bits. */
2771 unsigned int ix86_incoming_stack_boundary;
2773 /* Calling-ABI-specific va_list type nodes.  */
2774 static GTY(()) tree sysv_va_list_type_node;
2775 static GTY(()) tree ms_va_list_type_node;
2777 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2778 char internal_label_prefix[16];
2779 int internal_label_prefix_len;
2781 /* Fence to use after loop using movnt. */
2782 tree x86_mfence;
2784 /* Register class used for passing a given 64-bit part of the argument.
2785 These represent classes as documented by the psABI, with the exception
2786 of the SSESF and SSEDF classes, which are basically the SSE class; GCC will
2787 just use an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2789 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2790 whenever possible (the upper half does contain padding).  */
2791 enum x86_64_reg_class
2793 X86_64_NO_CLASS,
2794 X86_64_INTEGER_CLASS,
2795 X86_64_INTEGERSI_CLASS,
2796 X86_64_SSE_CLASS,
2797 X86_64_SSESF_CLASS,
2798 X86_64_SSEDF_CLASS,
2799 X86_64_SSEUP_CLASS,
2800 X86_64_X87_CLASS,
2801 X86_64_X87UP_CLASS,
2802 X86_64_COMPLEX_X87_CLASS,
2803 X86_64_MEMORY_CLASS
2806 #define MAX_CLASSES 8
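/* Added informal summary (see classify_argument later in this file for the
   authoritative logic): roughly, a 32-bit int argument is classified
   X86_64_INTEGERSI_CLASS, a 64-bit integer or pointer X86_64_INTEGER_CLASS,
   a float X86_64_SSESF_CLASS, a double X86_64_SSEDF_CLASS, a long double
   X86_64_X87_CLASS plus X86_64_X87UP_CLASS, and aggregates that cannot be
   passed in registers end up as X86_64_MEMORY_CLASS.  */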
2808 /* Table of constants used by fldpi, fldln2, etc.... */
2809 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2810 static bool ext_80387_constants_init;
2813 static struct machine_function * ix86_init_machine_status (void);
2814 static rtx ix86_function_value (const_tree, const_tree, bool);
2815 static bool ix86_function_value_regno_p (const unsigned int);
2816 static unsigned int ix86_function_arg_boundary (machine_mode,
2817 const_tree);
2818 static rtx ix86_static_chain (const_tree, bool);
2819 static int ix86_function_regparm (const_tree, const_tree);
2820 static void ix86_compute_frame_layout (void);
2821 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2822 rtx, rtx, int);
2823 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
2824 static tree ix86_canonical_va_list_type (tree);
2825 static void predict_jump (int);
2826 static unsigned int split_stack_prologue_scratch_regno (void);
2827 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2829 enum ix86_function_specific_strings
2831 IX86_FUNCTION_SPECIFIC_ARCH,
2832 IX86_FUNCTION_SPECIFIC_TUNE,
2833 IX86_FUNCTION_SPECIFIC_MAX
2836 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
2837 const char *, const char *, enum fpmath_unit,
2838 bool);
2839 static void ix86_function_specific_save (struct cl_target_option *,
2840 struct gcc_options *opts);
2841 static void ix86_function_specific_restore (struct gcc_options *opts,
2842 struct cl_target_option *);
2843 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2844 static void ix86_function_specific_print (FILE *, int,
2845 struct cl_target_option *);
2846 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2847 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2848 struct gcc_options *,
2849 struct gcc_options *,
2850 struct gcc_options *);
2851 static bool ix86_can_inline_p (tree, tree);
2852 static void ix86_set_current_function (tree);
2853 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2855 static enum calling_abi ix86_function_abi (const_tree);
2858 #ifndef SUBTARGET32_DEFAULT_CPU
2859 #define SUBTARGET32_DEFAULT_CPU "i386"
2860 #endif
2862 /* Whether -mtune= or -march= were specified.  */
2863 static int ix86_tune_defaulted;
2864 static int ix86_arch_specified;
2866 /* Vectorization library interface and handlers. */
2867 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2869 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2870 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2872 /* Processor target table, indexed by processor number */
2873 struct ptt
2875 const char *const name; /* processor name */
2876 const struct processor_costs *cost; /* Processor costs */
2877 const int align_loop; /* Default alignments. */
2878 const int align_loop_max_skip;
2879 const int align_jump;
2880 const int align_jump_max_skip;
2881 const int align_func;
2884 /* This table must be in sync with enum processor_type in i386.h. */
2885 static const struct ptt processor_target_table[PROCESSOR_max] =
2887 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2888 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2889 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2890 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2891 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2892 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2893 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2894 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2895 {"core2", &core_cost, 16, 10, 16, 10, 16},
2896 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2897 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2898 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2899 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2900 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2901 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2902 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2903 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2904 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2905 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2906 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2907 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2908 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2909 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2910 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2911 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2912 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2913 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2914 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2915 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
2918 static unsigned int
2919 rest_of_handle_insert_vzeroupper (void)
2921 int i;
2923 /* vzeroupper instructions are inserted immediately after reload to
2924 account for possible spills from 256-bit registers.  The pass
2925 reuses the mode switching infrastructure by re-running the mode
2926 insertion pass, so disable entities that have already been processed. */
2927 for (i = 0; i < MAX_386_ENTITIES; i++)
2928 ix86_optimize_mode_switching[i] = 0;
2930 ix86_optimize_mode_switching[AVX_U128] = 1;
2932 /* Call optimize_mode_switching. */
2933 g->get_passes ()->execute_pass_mode_switching ();
2934 return 0;
2937 /* Return true if INSN uses or defines a hard register.
2938 Hard register uses in a memory address are ignored.
2939 Clobbers and flags definitions are ignored. */
2941 static bool
2942 has_non_address_hard_reg (rtx_insn *insn)
2944 df_ref ref;
2945 FOR_EACH_INSN_DEF (ref, insn)
2946 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2947 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2948 && DF_REF_REGNO (ref) != FLAGS_REG)
2949 return true;
2951 FOR_EACH_INSN_USE (ref, insn)
2952 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2953 return true;
2955 return false;
2958 /* Check if comparison INSN may be transformed
2959 into a vector comparison.  Currently we only transform
2960 zero checks that look like:
2962 (set (reg:CCZ 17 flags)
2963 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2964 (subreg:SI (reg:DI x) 0))
2965 (const_int 0 [0]))) */
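/* For illustration only (a hypothetical example, not taken from this file):
   on a 32-bit target such a pattern typically arises from source code like

     unsigned long long x;
     if (x == 0) ...

   where the zero test of the 64-bit value is done by IORing the two
   32-bit halves (the SImode subregs above) and comparing the result
   with zero.  */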
2967 static bool
2968 convertible_comparison_p (rtx_insn *insn)
2970 if (!TARGET_SSE4_1)
2971 return false;
2973 rtx def_set = single_set (insn);
2975 gcc_assert (def_set);
2977 rtx src = SET_SRC (def_set);
2978 rtx dst = SET_DEST (def_set);
2980 gcc_assert (GET_CODE (src) == COMPARE);
2982 if (GET_CODE (dst) != REG
2983 || REGNO (dst) != FLAGS_REG
2984 || GET_MODE (dst) != CCZmode)
2985 return false;
2987 rtx op1 = XEXP (src, 0);
2988 rtx op2 = XEXP (src, 1);
2990 if (op2 != CONST0_RTX (GET_MODE (op2)))
2991 return false;
2993 if (GET_CODE (op1) != IOR)
2994 return false;
2996 op2 = XEXP (op1, 1);
2997 op1 = XEXP (op1, 0);
2999 if (!SUBREG_P (op1)
3000 || !SUBREG_P (op2)
3001 || GET_MODE (op1) != SImode
3002 || GET_MODE (op2) != SImode
3003 || ((SUBREG_BYTE (op1) != 0
3004 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
3005 && (SUBREG_BYTE (op2) != 0
3006 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
3007 return false;
3009 op1 = SUBREG_REG (op1);
3010 op2 = SUBREG_REG (op2);
3012 if (op1 != op2
3013 || !REG_P (op1)
3014 || GET_MODE (op1) != DImode)
3015 return false;
3017 return true;
3020 /* The DImode version of scalar_to_vector_candidate_p. */
3022 static bool
3023 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
3025 rtx def_set = single_set (insn);
3027 if (!def_set)
3028 return false;
3030 if (has_non_address_hard_reg (insn))
3031 return false;
3033 rtx src = SET_SRC (def_set);
3034 rtx dst = SET_DEST (def_set);
3036 if (GET_CODE (src) == COMPARE)
3037 return convertible_comparison_p (insn);
3039 /* We are interested in DImode promotion only. */
3040 if ((GET_MODE (src) != DImode
3041 && !CONST_INT_P (src))
3042 || GET_MODE (dst) != DImode)
3043 return false;
3045 if (!REG_P (dst) && !MEM_P (dst))
3046 return false;
3048 switch (GET_CODE (src))
3050 case ASHIFTRT:
3051 if (!TARGET_AVX512VL)
3052 return false;
3053 /* FALLTHRU */
3055 case ASHIFT:
3056 case LSHIFTRT:
3057 if (!REG_P (XEXP (src, 1))
3058 && (!SUBREG_P (XEXP (src, 1))
3059 || SUBREG_BYTE (XEXP (src, 1)) != 0
3060 || !REG_P (SUBREG_REG (XEXP (src, 1))))
3061 && (!CONST_INT_P (XEXP (src, 1))
3062 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
3063 return false;
3065 if (GET_MODE (XEXP (src, 1)) != QImode
3066 && !CONST_INT_P (XEXP (src, 1)))
3067 return false;
3068 break;
3070 case PLUS:
3071 case MINUS:
3072 case IOR:
3073 case XOR:
3074 case AND:
3075 if (!REG_P (XEXP (src, 1))
3076 && !MEM_P (XEXP (src, 1))
3077 && !CONST_INT_P (XEXP (src, 1)))
3078 return false;
3080 if (GET_MODE (XEXP (src, 1)) != DImode
3081 && !CONST_INT_P (XEXP (src, 1)))
3082 return false;
3083 break;
3085 case NEG:
3086 case NOT:
3087 break;
3089 case REG:
3090 return true;
3092 case MEM:
3093 case CONST_INT:
3094 return REG_P (dst);
3096 default:
3097 return false;
3100 if (!REG_P (XEXP (src, 0))
3101 && !MEM_P (XEXP (src, 0))
3102 && !CONST_INT_P (XEXP (src, 0))
3103 /* Check for andnot case. */
3104 && (GET_CODE (src) != AND
3105 || GET_CODE (XEXP (src, 0)) != NOT
3106 || !REG_P (XEXP (XEXP (src, 0), 0))))
3107 return false;
3109 if (GET_MODE (XEXP (src, 0)) != DImode
3110 && !CONST_INT_P (XEXP (src, 0)))
3111 return false;
3113 return true;
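/* Illustrative note (not part of the original sources): for the shift cases
   above the count operand must be either a QImode register, a QImode subreg
   at offset 0 of a register, or a constant in [0, 63], and arithmetic right
   shifts (ASHIFTRT) additionally require AVX512VL.  At the source level this
   matches, e.g.

     unsigned long long x;  int n;
     x <<= n;    which takes the count from the low QImode part of n
     x >>= 3;    which uses a constant count.  */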
3116 /* The TImode version of scalar_to_vector_candidate_p. */
3118 static bool
3119 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
3121 rtx def_set = single_set (insn);
3123 if (!def_set)
3124 return false;
3126 if (has_non_address_hard_reg (insn))
3127 return false;
3129 rtx src = SET_SRC (def_set);
3130 rtx dst = SET_DEST (def_set);
3132 /* Only TImode load and store are allowed. */
3133 if (GET_MODE (dst) != TImode)
3134 return false;
3136 if (MEM_P (dst))
3138 /* Check for a store.  The memory must be aligned, or unaligned stores
3139 must be optimal.  Only support stores from a register, a standard SSE
3140 constant or a CONST_WIDE_INT generated from a piecewise store.
3142 ??? Verify performance impact before enabling CONST_INT for
3143 __int128 stores. */
3144 if (misaligned_operand (dst, TImode)
3145 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
3146 return false;
3148 switch (GET_CODE (src))
3150 default:
3151 return false;
3153 case REG:
3154 case CONST_WIDE_INT:
3155 return true;
3157 case CONST_INT:
3158 return standard_sse_constant_p (src, TImode);
3161 else if (MEM_P (src))
3163 /* Check for a load.  The memory must be aligned, or unaligned loads
3164 must be optimal. */
3165 return (REG_P (dst)
3166 && (!misaligned_operand (src, TImode)
3167 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
3170 return false;
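/* Illustrative sketch (hypothetical example, not from the original sources):
   on a 64-bit target the candidates accepted above are TImode (__int128)
   loads and stores, e.g.

     __int128 *p, *q;
     *q = *p;

   which after conversion are carried out as single V1TImode (128-bit SSE)
   moves instead of pairs of 64-bit moves.  */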
3173 /* Return true if INSN may be converted into a vector
3174 instruction. */
3176 static bool
3177 scalar_to_vector_candidate_p (rtx_insn *insn)
3179 if (TARGET_64BIT)
3180 return timode_scalar_to_vector_candidate_p (insn);
3181 else
3182 return dimode_scalar_to_vector_candidate_p (insn);
3185 /* The DImode version of remove_non_convertible_regs. */
3187 static void
3188 dimode_remove_non_convertible_regs (bitmap candidates)
3190 bitmap_iterator bi;
3191 unsigned id;
3192 bitmap regs = BITMAP_ALLOC (NULL);
3194 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3196 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3197 rtx reg = SET_DEST (def_set);
3199 if (!REG_P (reg)
3200 || bitmap_bit_p (regs, REGNO (reg))
3201 || HARD_REGISTER_P (reg))
3202 continue;
3204 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
3205 def;
3206 def = DF_REF_NEXT_REG (def))
3208 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3210 if (dump_file)
3211 fprintf (dump_file,
3212 "r%d has non convertible definition in insn %d\n",
3213 REGNO (reg), DF_REF_INSN_UID (def));
3215 bitmap_set_bit (regs, REGNO (reg));
3216 break;
3221 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3223 for (df_ref def = DF_REG_DEF_CHAIN (id);
3224 def;
3225 def = DF_REF_NEXT_REG (def))
3226 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3228 if (dump_file)
3229 fprintf (dump_file, "Removing insn %d from candidates list\n",
3230 DF_REF_INSN_UID (def));
3232 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3236 BITMAP_FREE (regs);
3239 /* For a register REGNO, scan instructions for its defs and uses.
3240 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
3242 static void
3243 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
3244 unsigned int regno)
3246 for (df_ref def = DF_REG_DEF_CHAIN (regno);
3247 def;
3248 def = DF_REF_NEXT_REG (def))
3250 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3252 if (dump_file)
3253 fprintf (dump_file,
3254 "r%d has non convertible def in insn %d\n",
3255 regno, DF_REF_INSN_UID (def));
3257 bitmap_set_bit (regs, regno);
3258 break;
3262 for (df_ref ref = DF_REG_USE_CHAIN (regno);
3263 ref;
3264 ref = DF_REF_NEXT_REG (ref))
3266 /* Debug instructions are skipped. */
3267 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
3268 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3270 if (dump_file)
3271 fprintf (dump_file,
3272 "r%d has non convertible use in insn %d\n",
3273 regno, DF_REF_INSN_UID (ref));
3275 bitmap_set_bit (regs, regno);
3276 break;
3281 /* The TImode version of remove_non_convertible_regs. */
3283 static void
3284 timode_remove_non_convertible_regs (bitmap candidates)
3286 bitmap_iterator bi;
3287 unsigned id;
3288 bitmap regs = BITMAP_ALLOC (NULL);
3290 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3292 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3293 rtx dest = SET_DEST (def_set);
3294 rtx src = SET_SRC (def_set);
3296 if ((!REG_P (dest)
3297 || bitmap_bit_p (regs, REGNO (dest))
3298 || HARD_REGISTER_P (dest))
3299 && (!REG_P (src)
3300 || bitmap_bit_p (regs, REGNO (src))
3301 || HARD_REGISTER_P (src)))
3302 continue;
3304 if (REG_P (dest))
3305 timode_check_non_convertible_regs (candidates, regs,
3306 REGNO (dest));
3308 if (REG_P (src))
3309 timode_check_non_convertible_regs (candidates, regs,
3310 REGNO (src));
3313 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3315 for (df_ref def = DF_REG_DEF_CHAIN (id);
3316 def;
3317 def = DF_REF_NEXT_REG (def))
3318 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3320 if (dump_file)
3321 fprintf (dump_file, "Removing insn %d from candidates list\n",
3322 DF_REF_INSN_UID (def));
3324 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3327 for (df_ref ref = DF_REG_USE_CHAIN (id);
3328 ref;
3329 ref = DF_REF_NEXT_REG (ref))
3330 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3332 if (dump_file)
3333 fprintf (dump_file, "Removing insn %d from candidates list\n",
3334 DF_REF_INSN_UID (ref));
3336 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3340 BITMAP_FREE (regs);
3343 /* For a given bitmap of insn UIDs, scan all instructions and
3344 remove an insn from CANDIDATES if it has both convertible
3345 and non-convertible definitions.
3347 All insns in the bitmap are conversion candidates according to
3348 scalar_to_vector_candidate_p.  Currently this implies all insns
3349 are single_set.
3351 static void
3352 remove_non_convertible_regs (bitmap candidates)
3354 if (TARGET_64BIT)
3355 timode_remove_non_convertible_regs (candidates);
3356 else
3357 dimode_remove_non_convertible_regs (candidates);
3360 class scalar_chain
3362 public:
3363 scalar_chain ();
3364 virtual ~scalar_chain ();
3366 static unsigned max_id;
3368 /* ID of a chain. */
3369 unsigned int chain_id;
3370 /* A queue of instructions to be included into a chain. */
3371 bitmap queue;
3372 /* Instructions included into a chain. */
3373 bitmap insns;
3374 /* All registers defined by a chain. */
3375 bitmap defs;
3376 /* Registers used in both vector and scalar modes. */
3377 bitmap defs_conv;
3379 void build (bitmap candidates, unsigned insn_uid);
3380 virtual int compute_convert_gain () = 0;
3381 int convert ();
3383 protected:
3384 void add_to_queue (unsigned insn_uid);
3385 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3387 private:
3388 void add_insn (bitmap candidates, unsigned insn_uid);
3389 void analyze_register_chain (bitmap candidates, df_ref ref);
3390 virtual void mark_dual_mode_def (df_ref def) = 0;
3391 virtual void convert_insn (rtx_insn *insn) = 0;
3392 virtual void convert_registers () = 0;
3395 class dimode_scalar_chain : public scalar_chain
3397 public:
3398 int compute_convert_gain ();
3399 private:
3400 void mark_dual_mode_def (df_ref def);
3401 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3402 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3403 void convert_insn (rtx_insn *insn);
3404 void convert_op (rtx *op, rtx_insn *insn);
3405 void convert_reg (unsigned regno);
3406 void make_vector_copies (unsigned regno);
3407 void convert_registers ();
3408 int vector_const_cost (rtx exp);
3411 class timode_scalar_chain : public scalar_chain
3413 public:
3414 /* Converting from TImode to V1TImode is always faster. */
3415 int compute_convert_gain () { return 1; }
3417 private:
3418 void mark_dual_mode_def (df_ref def);
3419 void fix_debug_reg_uses (rtx reg);
3420 void convert_insn (rtx_insn *insn);
3421 /* We don't convert registers to a different size. */
3422 void convert_registers () {}
3425 unsigned scalar_chain::max_id = 0;
3427 /* Initialize new chain. */
3429 scalar_chain::scalar_chain ()
3431 chain_id = ++max_id;
3433 if (dump_file)
3434 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3436 bitmap_obstack_initialize (NULL);
3437 insns = BITMAP_ALLOC (NULL);
3438 defs = BITMAP_ALLOC (NULL);
3439 defs_conv = BITMAP_ALLOC (NULL);
3440 queue = NULL;
3443 /* Free chain's data. */
3445 scalar_chain::~scalar_chain ()
3447 BITMAP_FREE (insns);
3448 BITMAP_FREE (defs);
3449 BITMAP_FREE (defs_conv);
3450 bitmap_obstack_release (NULL);
3453 /* Add an instruction to the chain's queue. */
3455 void
3456 scalar_chain::add_to_queue (unsigned insn_uid)
3458 if (bitmap_bit_p (insns, insn_uid)
3459 || bitmap_bit_p (queue, insn_uid))
3460 return;
3462 if (dump_file)
3463 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3464 insn_uid, chain_id);
3465 bitmap_set_bit (queue, insn_uid);
3468 /* For DImode conversion, mark register defined by DEF as requiring
3469 conversion. */
3471 void
3472 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3474 gcc_assert (DF_REF_REG_DEF_P (def));
3476 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3477 return;
3479 if (dump_file)
3480 fprintf (dump_file,
3481 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3482 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3484 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3487 /* For TImode conversion, it is unused. */
3489 void
3490 timode_scalar_chain::mark_dual_mode_def (df_ref)
3492 gcc_unreachable ();
3495 /* Check REF's chain to add new insns into a queue
3496 and find registers requiring conversion. */
3498 void
3499 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3501 df_link *chain;
3503 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3504 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3505 add_to_queue (DF_REF_INSN_UID (ref));
3507 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3509 unsigned uid = DF_REF_INSN_UID (chain->ref);
3511 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3512 continue;
3514 if (!DF_REF_REG_MEM_P (chain->ref))
3516 if (bitmap_bit_p (insns, uid))
3517 continue;
3519 if (bitmap_bit_p (candidates, uid))
3521 add_to_queue (uid);
3522 continue;
3526 if (DF_REF_REG_DEF_P (chain->ref))
3528 if (dump_file)
3529 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3530 DF_REF_REGNO (chain->ref), uid);
3531 mark_dual_mode_def (chain->ref);
3533 else
3535 if (dump_file)
3536 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3537 DF_REF_REGNO (chain->ref), uid);
3538 mark_dual_mode_def (ref);
3543 /* Add an instruction to the chain. */
3545 void
3546 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3548 if (bitmap_bit_p (insns, insn_uid))
3549 return;
3551 if (dump_file)
3552 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3554 bitmap_set_bit (insns, insn_uid);
3556 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3557 rtx def_set = single_set (insn);
3558 if (def_set && REG_P (SET_DEST (def_set))
3559 && !HARD_REGISTER_P (SET_DEST (def_set)))
3560 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3562 df_ref ref;
3563 df_ref def;
3564 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3565 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3566 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3567 def;
3568 def = DF_REF_NEXT_REG (def))
3569 analyze_register_chain (candidates, def);
3570 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3571 if (!DF_REF_REG_MEM_P (ref))
3572 analyze_register_chain (candidates, ref);
3575 /* Build a new chain starting from insn INSN_UID, recursively
3576 adding all dependent uses and definitions. */
3578 void
3579 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3581 queue = BITMAP_ALLOC (NULL);
3582 bitmap_set_bit (queue, insn_uid);
3584 if (dump_file)
3585 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3587 while (!bitmap_empty_p (queue))
3589 insn_uid = bitmap_first_set_bit (queue);
3590 bitmap_clear_bit (queue, insn_uid);
3591 bitmap_clear_bit (candidates, insn_uid);
3592 add_insn (candidates, insn_uid);
3595 if (dump_file)
3597 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3598 fprintf (dump_file, " insns: ");
3599 dump_bitmap (dump_file, insns);
3600 if (!bitmap_empty_p (defs_conv))
3602 bitmap_iterator bi;
3603 unsigned id;
3604 const char *comma = "";
3605 fprintf (dump_file, " defs to convert: ");
3606 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3608 fprintf (dump_file, "%sr%d", comma, id);
3609 comma = ", ";
3611 fprintf (dump_file, "\n");
3615 BITMAP_FREE (queue);
3618 /* Return the cost of building a vector constant
3619 instead of using a scalar one. */
3622 dimode_scalar_chain::vector_const_cost (rtx exp)
3624 gcc_assert (CONST_INT_P (exp));
3626 if (standard_sse_constant_p (exp, V2DImode))
3627 return COSTS_N_INSNS (1);
3628 return ix86_cost->sse_load[1];
3631 /* Compute a gain for chain conversion. */
3634 dimode_scalar_chain::compute_convert_gain ()
3636 bitmap_iterator bi;
3637 unsigned insn_uid;
3638 int gain = 0;
3639 int cost = 0;
3641 if (dump_file)
3642 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3644 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3646 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3647 rtx def_set = single_set (insn);
3648 rtx src = SET_SRC (def_set);
3649 rtx dst = SET_DEST (def_set);
3651 if (REG_P (src) && REG_P (dst))
3652 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3653 else if (REG_P (src) && MEM_P (dst))
3654 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3655 else if (MEM_P (src) && REG_P (dst))
3656 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3657 else if (GET_CODE (src) == ASHIFT
3658 || GET_CODE (src) == ASHIFTRT
3659 || GET_CODE (src) == LSHIFTRT)
3661 if (CONST_INT_P (XEXP (src, 0)))
3662 gain -= vector_const_cost (XEXP (src, 0));
3663 if (CONST_INT_P (XEXP (src, 1)))
3665 gain += ix86_cost->shift_const;
3666 if (INTVAL (XEXP (src, 1)) >= 32)
3667 gain -= COSTS_N_INSNS (1);
3669 else
3670 /* Additional gain for omitting two CMOVs. */
3671 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
3673 else if (GET_CODE (src) == PLUS
3674 || GET_CODE (src) == MINUS
3675 || GET_CODE (src) == IOR
3676 || GET_CODE (src) == XOR
3677 || GET_CODE (src) == AND)
3679 gain += ix86_cost->add;
3680 /* Additional gain for andnot for targets without BMI. */
3681 if (GET_CODE (XEXP (src, 0)) == NOT
3682 && !TARGET_BMI)
3683 gain += 2 * ix86_cost->add;
3685 if (CONST_INT_P (XEXP (src, 0)))
3686 gain -= vector_const_cost (XEXP (src, 0));
3687 if (CONST_INT_P (XEXP (src, 1)))
3688 gain -= vector_const_cost (XEXP (src, 1));
3690 else if (GET_CODE (src) == NEG
3691 || GET_CODE (src) == NOT)
3692 gain += ix86_cost->add - COSTS_N_INSNS (1);
3693 else if (GET_CODE (src) == COMPARE)
3695 /* Assume comparison cost is the same. */
3697 else if (CONST_INT_P (src))
3699 if (REG_P (dst))
3700 gain += COSTS_N_INSNS (2);
3701 else if (MEM_P (dst))
3702 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3703 gain -= vector_const_cost (src);
3705 else
3706 gcc_unreachable ();
3709 if (dump_file)
3710 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3712 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3713 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3715 if (dump_file)
3716 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3718 gain -= cost;
3720 if (dump_file)
3721 fprintf (dump_file, " Total gain: %d\n", gain);
3723 return gain;
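/* A sketch of the arithmetic above (illustrative only; the real numbers come
   from the active cost table): for a chain consisting of one DImode
   register-to-register move and one DImode addition, the loop accumulates

     gain = (COSTS_N_INSNS (2) - sse_move)    for the move
          + add                               for the PLUS

   and then subtracts mmxsse_to_integer once per definition of every
   register in defs_conv (registers live in both scalar and vector mode).
   The chain is converted only when the resulting total is positive
   (see convert_scalars_to_vector below).  */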
3726 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3729 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3731 if (x == reg)
3732 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3734 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3735 int i, j;
3736 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3738 if (fmt[i] == 'e')
3739 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3740 else if (fmt[i] == 'E')
3741 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3742 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3743 reg, new_reg);
3746 return x;
3749 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3751 void
3752 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3753 rtx reg, rtx new_reg)
3755 replace_with_subreg (single_set (insn), reg, new_reg);
3758 /* Insert generated conversion instruction sequence INSNS
3759 after instruction AFTER.  A new BB may be required if the
3760 instruction has an EH region attached. */
3762 void
3763 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3765 if (!control_flow_insn_p (after))
3767 emit_insn_after (insns, after);
3768 return;
3771 basic_block bb = BLOCK_FOR_INSN (after);
3772 edge e = find_fallthru_edge (bb->succs);
3773 gcc_assert (e);
3775 basic_block new_bb = split_edge (e);
3776 emit_insn_after (insns, BB_HEAD (new_bb));
3779 /* Make vector copies for all definitions of register REGNO
3780 and replace its uses in the chain. */
3782 void
3783 dimode_scalar_chain::make_vector_copies (unsigned regno)
3785 rtx reg = regno_reg_rtx[regno];
3786 rtx vreg = gen_reg_rtx (DImode);
3787 bool count_reg = false;
3788 df_ref ref;
3790 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3791 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3793 df_ref use;
3795 /* Detect the count register of a shift instruction. */
3796 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
3797 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
3799 rtx_insn *insn = DF_REF_INSN (use);
3800 rtx def_set = single_set (insn);
3802 gcc_assert (def_set);
3804 rtx src = SET_SRC (def_set);
3806 if ((GET_CODE (src) == ASHIFT
3807 || GET_CODE (src) == ASHIFTRT
3808 || GET_CODE (src) == LSHIFTRT)
3809 && !CONST_INT_P (XEXP (src, 1))
3810 && reg_or_subregno (XEXP (src, 1)) == regno)
3811 count_reg = true;
3814 start_sequence ();
3815 if (count_reg)
3817 rtx qreg = gen_lowpart (QImode, reg);
3818 rtx tmp = gen_reg_rtx (SImode);
3820 if (TARGET_ZERO_EXTEND_WITH_AND
3821 && optimize_function_for_speed_p (cfun))
3823 emit_move_insn (tmp, const0_rtx);
3824 emit_insn (gen_movstrictqi
3825 (gen_lowpart (QImode, tmp), qreg));
3827 else
3828 emit_insn (gen_rtx_SET
3829 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
3831 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3833 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
3834 emit_move_insn (slot, tmp);
3835 tmp = copy_rtx (slot);
3838 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
3840 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3842 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3843 emit_move_insn (adjust_address (tmp, SImode, 0),
3844 gen_rtx_SUBREG (SImode, reg, 0));
3845 emit_move_insn (adjust_address (tmp, SImode, 4),
3846 gen_rtx_SUBREG (SImode, reg, 4));
3847 emit_move_insn (vreg, tmp);
3849 else if (TARGET_SSE4_1)
3851 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3852 CONST0_RTX (V4SImode),
3853 gen_rtx_SUBREG (SImode, reg, 0)));
3854 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3855 gen_rtx_SUBREG (V4SImode, vreg, 0),
3856 gen_rtx_SUBREG (SImode, reg, 4),
3857 GEN_INT (2)));
3859 else
3861 rtx tmp = gen_reg_rtx (DImode);
3862 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3863 CONST0_RTX (V4SImode),
3864 gen_rtx_SUBREG (SImode, reg, 0)));
3865 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3866 CONST0_RTX (V4SImode),
3867 gen_rtx_SUBREG (SImode, reg, 4)));
3868 emit_insn (gen_vec_interleave_lowv4si
3869 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3870 gen_rtx_SUBREG (V4SImode, vreg, 0),
3871 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3873 rtx_insn *seq = get_insns ();
3874 end_sequence ();
3875 rtx_insn *insn = DF_REF_INSN (ref);
3876 emit_conversion_insns (seq, insn);
3878 if (dump_file)
3879 fprintf (dump_file,
3880 " Copied r%d to a vector register r%d for insn %d\n",
3881 regno, REGNO (vreg), INSN_UID (insn));
3884 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3885 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3887 rtx_insn *insn = DF_REF_INSN (ref);
3888 if (count_reg)
3890 rtx def_set = single_set (insn);
3891 gcc_assert (def_set);
3893 rtx src = SET_SRC (def_set);
3895 if ((GET_CODE (src) == ASHIFT
3896 || GET_CODE (src) == ASHIFTRT
3897 || GET_CODE (src) == LSHIFTRT)
3898 && !CONST_INT_P (XEXP (src, 1))
3899 && reg_or_subregno (XEXP (src, 1)) == regno)
3900 XEXP (src, 1) = vreg;
3902 else
3903 replace_with_subreg_in_insn (insn, reg, vreg);
3905 if (dump_file)
3906 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3907 regno, REGNO (vreg), INSN_UID (insn));
3911 /* Convert all definitions of register REGNO
3912 and fix its uses.  Scalar copies may be created
3913 if the register is used in a non-convertible insn. */
3915 void
3916 dimode_scalar_chain::convert_reg (unsigned regno)
3918 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3919 rtx reg = regno_reg_rtx[regno];
3920 rtx scopy = NULL_RTX;
3921 df_ref ref;
3922 bitmap conv;
3924 conv = BITMAP_ALLOC (NULL);
3925 bitmap_copy (conv, insns);
3927 if (scalar_copy)
3928 scopy = gen_reg_rtx (DImode);
3930 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3932 rtx_insn *insn = DF_REF_INSN (ref);
3933 rtx def_set = single_set (insn);
3934 rtx src = SET_SRC (def_set);
3935 rtx reg = DF_REF_REG (ref);
3937 if (!MEM_P (src))
3939 replace_with_subreg_in_insn (insn, reg, reg);
3940 bitmap_clear_bit (conv, INSN_UID (insn));
3943 if (scalar_copy)
3945 start_sequence ();
3946 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
3948 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3949 emit_move_insn (tmp, reg);
3950 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3951 adjust_address (tmp, SImode, 0));
3952 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3953 adjust_address (tmp, SImode, 4));
3955 else if (TARGET_SSE4_1)
3957 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
3958 emit_insn
3959 (gen_rtx_SET
3960 (gen_rtx_SUBREG (SImode, scopy, 0),
3961 gen_rtx_VEC_SELECT (SImode,
3962 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3964 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
3965 emit_insn
3966 (gen_rtx_SET
3967 (gen_rtx_SUBREG (SImode, scopy, 4),
3968 gen_rtx_VEC_SELECT (SImode,
3969 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3971 else
3973 rtx vcopy = gen_reg_rtx (V2DImode);
3974 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3975 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3976 gen_rtx_SUBREG (SImode, vcopy, 0));
3977 emit_move_insn (vcopy,
3978 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3979 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3980 gen_rtx_SUBREG (SImode, vcopy, 0));
3982 rtx_insn *seq = get_insns ();
3983 end_sequence ();
3984 emit_conversion_insns (seq, insn);
3986 if (dump_file)
3987 fprintf (dump_file,
3988 " Copied r%d to a scalar register r%d for insn %d\n",
3989 regno, REGNO (scopy), INSN_UID (insn));
3993 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3994 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3996 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3998 rtx_insn *insn = DF_REF_INSN (ref);
4000 rtx def_set = single_set (insn);
4001 gcc_assert (def_set);
4003 rtx src = SET_SRC (def_set);
4004 rtx dst = SET_DEST (def_set);
4006 if ((GET_CODE (src) == ASHIFT
4007 || GET_CODE (src) == ASHIFTRT
4008 || GET_CODE (src) == LSHIFTRT)
4009 && !CONST_INT_P (XEXP (src, 1))
4010 && reg_or_subregno (XEXP (src, 1)) == regno)
4012 rtx tmp2 = gen_reg_rtx (V2DImode);
4014 start_sequence ();
4016 if (TARGET_SSE4_1)
4017 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
4018 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
4019 else
4021 rtx vec_cst
4022 = gen_rtx_CONST_VECTOR (V2DImode,
4023 gen_rtvec (2, GEN_INT (0xff),
4024 const0_rtx));
4025 vec_cst
4026 = validize_mem (force_const_mem (V2DImode, vec_cst));
4028 emit_insn (gen_rtx_SET
4029 (tmp2,
4030 gen_rtx_AND (V2DImode,
4031 gen_rtx_SUBREG (V2DImode, reg, 0),
4032 vec_cst)));
4034 rtx_insn *seq = get_insns ();
4035 end_sequence ();
4037 emit_insn_before (seq, insn);
4039 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
4041 else if (!MEM_P (dst) || !REG_P (src))
4042 replace_with_subreg_in_insn (insn, reg, reg);
4044 bitmap_clear_bit (conv, INSN_UID (insn));
4047 /* Skip debug insns and uninitialized uses. */
4048 else if (DF_REF_CHAIN (ref)
4049 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
4051 gcc_assert (scopy);
4052 replace_rtx (DF_REF_INSN (ref), reg, scopy);
4053 df_insn_rescan (DF_REF_INSN (ref));
4056 BITMAP_FREE (conv);
4059 /* Convert operand OP in INSN.  We should handle
4060 memory operands and uninitialized registers.
4061 All other register uses are converted during
4062 register conversion. */
4064 void
4065 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
4067 *op = copy_rtx_if_shared (*op);
4069 if (GET_CODE (*op) == NOT)
4071 convert_op (&XEXP (*op, 0), insn);
4072 PUT_MODE (*op, V2DImode);
4074 else if (MEM_P (*op))
4076 rtx tmp = gen_reg_rtx (DImode);
4078 emit_insn_before (gen_move_insn (tmp, *op), insn);
4079 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
4081 if (dump_file)
4082 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
4083 INSN_UID (insn), REGNO (tmp));
4085 else if (REG_P (*op))
4087 /* The register use may not have been converted because
4088 this register has no definition.  Otherwise it
4089 should have been converted in convert_reg. */
4090 df_ref ref;
4091 FOR_EACH_INSN_USE (ref, insn)
4092 if (DF_REF_REGNO (ref) == REGNO (*op))
4094 gcc_assert (!DF_REF_CHAIN (ref));
4095 break;
4097 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
4099 else if (CONST_INT_P (*op))
4101 rtx vec_cst;
4102 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
4104 /* Prefer all ones vector in case of -1. */
4105 if (constm1_operand (*op, GET_MODE (*op)))
4106 vec_cst = CONSTM1_RTX (V2DImode);
4107 else
4108 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
4109 gen_rtvec (2, *op, const0_rtx));
4111 if (!standard_sse_constant_p (vec_cst, V2DImode))
4113 start_sequence ();
4114 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
4115 rtx_insn *seq = get_insns ();
4116 end_sequence ();
4117 emit_insn_before (seq, insn);
4120 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
4121 *op = tmp;
4123 else
4125 gcc_assert (SUBREG_P (*op));
4126 gcc_assert (GET_MODE (*op) == V2DImode);
4130 /* Convert INSN to vector mode. */
4132 void
4133 dimode_scalar_chain::convert_insn (rtx_insn *insn)
4135 rtx def_set = single_set (insn);
4136 rtx src = SET_SRC (def_set);
4137 rtx dst = SET_DEST (def_set);
4138 rtx subreg;
4140 if (MEM_P (dst) && !REG_P (src))
4142 /* There are no scalar integer instructions and therefore
4143 temporary register usage is required. */
4144 rtx tmp = gen_reg_rtx (DImode);
4145 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
4146 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
4149 switch (GET_CODE (src))
4151 case ASHIFT:
4152 case ASHIFTRT:
4153 case LSHIFTRT:
4154 convert_op (&XEXP (src, 0), insn);
4155 PUT_MODE (src, V2DImode);
4156 break;
4158 case PLUS:
4159 case MINUS:
4160 case IOR:
4161 case XOR:
4162 case AND:
4163 convert_op (&XEXP (src, 0), insn);
4164 convert_op (&XEXP (src, 1), insn);
4165 PUT_MODE (src, V2DImode);
4166 break;
4168 case NEG:
4169 src = XEXP (src, 0);
4170 convert_op (&src, insn);
4171 subreg = gen_reg_rtx (V2DImode);
4172 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
4173 src = gen_rtx_MINUS (V2DImode, subreg, src);
4174 break;
4176 case NOT:
4177 src = XEXP (src, 0);
4178 convert_op (&src, insn);
4179 subreg = gen_reg_rtx (V2DImode);
4180 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
4181 src = gen_rtx_XOR (V2DImode, src, subreg);
4182 break;
4184 case MEM:
4185 if (!REG_P (dst))
4186 convert_op (&src, insn);
4187 break;
4189 case REG:
4190 if (!MEM_P (dst))
4191 convert_op (&src, insn);
4192 break;
4194 case SUBREG:
4195 gcc_assert (GET_MODE (src) == V2DImode);
4196 break;
4198 case COMPARE:
4199 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
4201 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
4202 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
4204 if (REG_P (src))
4205 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
4206 else
4207 subreg = copy_rtx_if_shared (src);
4208 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
4209 copy_rtx_if_shared (subreg),
4210 copy_rtx_if_shared (subreg)),
4211 insn);
4212 dst = gen_rtx_REG (CCmode, FLAGS_REG);
4213 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
4214 copy_rtx_if_shared (src)),
4215 UNSPEC_PTEST);
4216 break;
4218 case CONST_INT:
4219 convert_op (&src, insn);
4220 break;
4222 default:
4223 gcc_unreachable ();
4226 SET_SRC (def_set) = src;
4227 SET_DEST (def_set) = dst;
4229 /* Drop possible dead definitions. */
4230 PATTERN (insn) = def_set;
4232 INSN_CODE (insn) = -1;
4233 recog_memoized (insn);
4234 df_insn_rescan (insn);
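/* Illustrative RTL sketch (register numbers are made up, not emitted
   verbatim by this file): after register conversion and convert_insn,
   a 32-bit DImode AND such as

     (set (reg:DI 90) (and:DI (reg:DI 91) (reg:DI 92)))

   ends up as

     (set (subreg:V2DI (reg:DI 90) 0)
          (and:V2DI (subreg:V2DI (reg:DI 91) 0)
                    (subreg:V2DI (reg:DI 92) 0)))

   i.e. a single pand on the low half of an SSE register, with the upper
   64 bits left unused.  */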
4237 /* Fix uses of converted REG in debug insns. */
4239 void
4240 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
4242 if (!flag_var_tracking)
4243 return;
4245 df_ref ref, next;
4246 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
4248 rtx_insn *insn = DF_REF_INSN (ref);
4249 /* Make sure the next ref is for a different instruction,
4250 so that we're not affected by the rescan. */
4251 next = DF_REF_NEXT_REG (ref);
4252 while (next && DF_REF_INSN (next) == insn)
4253 next = DF_REF_NEXT_REG (next);
4255 if (DEBUG_INSN_P (insn))
4257 /* It may be a debug insn with a TImode variable in
4258 a register. */
4259 bool changed = false;
4260 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
4262 rtx *loc = DF_REF_LOC (ref);
4263 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
4265 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
4266 changed = true;
4269 if (changed)
4270 df_insn_rescan (insn);
4275 /* Convert INSN from TImode to V1TImode. */
4277 void
4278 timode_scalar_chain::convert_insn (rtx_insn *insn)
4280 rtx def_set = single_set (insn);
4281 rtx src = SET_SRC (def_set);
4282 rtx dst = SET_DEST (def_set);
4284 switch (GET_CODE (dst))
4286 case REG:
4288 rtx tmp = find_reg_equal_equiv_note (insn);
4289 if (tmp)
4290 PUT_MODE (XEXP (tmp, 0), V1TImode);
4291 PUT_MODE (dst, V1TImode);
4292 fix_debug_reg_uses (dst);
4294 break;
4295 case MEM:
4296 PUT_MODE (dst, V1TImode);
4297 break;
4299 default:
4300 gcc_unreachable ();
4303 switch (GET_CODE (src))
4305 case REG:
4306 PUT_MODE (src, V1TImode);
4307 /* Call fix_debug_reg_uses only if SRC is never defined. */
4308 if (!DF_REG_DEF_CHAIN (REGNO (src)))
4309 fix_debug_reg_uses (src);
4310 break;
4312 case MEM:
4313 PUT_MODE (src, V1TImode);
4314 break;
4316 case CONST_WIDE_INT:
4317 if (NONDEBUG_INSN_P (insn))
4319 /* Since there are no instructions to store a 128-bit constant,
4320 a temporary register is required. */
4321 rtx tmp = gen_reg_rtx (V1TImode);
4322 start_sequence ();
4323 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
4324 src = validize_mem (force_const_mem (V1TImode, src));
4325 rtx_insn *seq = get_insns ();
4326 end_sequence ();
4327 if (seq)
4328 emit_insn_before (seq, insn);
4329 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4330 dst = tmp;
4332 break;
4334 case CONST_INT:
4335 switch (standard_sse_constant_p (src, TImode))
4337 case 1:
4338 src = CONST0_RTX (GET_MODE (dst));
4339 break;
4340 case 2:
4341 src = CONSTM1_RTX (GET_MODE (dst));
4342 break;
4343 default:
4344 gcc_unreachable ();
4346 if (NONDEBUG_INSN_P (insn))
4348 rtx tmp = gen_reg_rtx (V1TImode);
4349 /* Since there are no instructions to store a standard SSE
4350 constant, a temporary register is required. */
4351 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4352 dst = tmp;
4354 break;
4356 default:
4357 gcc_unreachable ();
4360 SET_SRC (def_set) = src;
4361 SET_DEST (def_set) = dst;
4363 /* Drop possible dead definitions. */
4364 PATTERN (insn) = def_set;
4366 INSN_CODE (insn) = -1;
4367 recog_memoized (insn);
4368 df_insn_rescan (insn);
4371 void
4372 dimode_scalar_chain::convert_registers ()
4374 bitmap_iterator bi;
4375 unsigned id;
4377 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
4378 convert_reg (id);
4380 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
4381 make_vector_copies (id);
4384 /* Convert the whole chain, creating the required register
4385 conversions and copies. */
4388 scalar_chain::convert ()
4390 bitmap_iterator bi;
4391 unsigned id;
4392 int converted_insns = 0;
4394 if (!dbg_cnt (stv_conversion))
4395 return 0;
4397 if (dump_file)
4398 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
4400 convert_registers ();
4402 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
4404 convert_insn (DF_INSN_UID_GET (id)->insn);
4405 converted_insns++;
4408 return converted_insns;
4411 /* Main STV pass function. Find and convert scalar
4412 instructions into vector mode when profitable. */
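/* Illustrative source-level example (hypothetical, not from this file):
   with -m32 -msse2 -mstv -O2, code such as

     unsigned long long a, b, c;
     c = a & b;

   is a conversion candidate: instead of two 32-bit AND instructions the
   chain can be performed as a single pand in an SSE register, provided
   compute_convert_gain considers it profitable.  */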
4414 static unsigned int
4415 convert_scalars_to_vector ()
4417 basic_block bb;
4418 bitmap candidates;
4419 int converted_insns = 0;
4421 bitmap_obstack_initialize (NULL);
4422 candidates = BITMAP_ALLOC (NULL);
4424 calculate_dominance_info (CDI_DOMINATORS);
4425 df_set_flags (DF_DEFER_INSN_RESCAN);
4426 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
4427 df_md_add_problem ();
4428 df_analyze ();
4430 /* Find all instructions we want to convert into vector mode. */
4431 if (dump_file)
4432 fprintf (dump_file, "Searching for mode conversion candidates...\n");
4434 FOR_EACH_BB_FN (bb, cfun)
4436 rtx_insn *insn;
4437 FOR_BB_INSNS (bb, insn)
4438 if (scalar_to_vector_candidate_p (insn))
4440 if (dump_file)
4441 fprintf (dump_file, " insn %d is marked as a candidate\n",
4442 INSN_UID (insn));
4444 bitmap_set_bit (candidates, INSN_UID (insn));
4448 remove_non_convertible_regs (candidates);
4450 if (bitmap_empty_p (candidates))
4451 if (dump_file)
4452 fprintf (dump_file, "There are no candidates for optimization.\n");
4454 while (!bitmap_empty_p (candidates))
4456 unsigned uid = bitmap_first_set_bit (candidates);
4457 scalar_chain *chain;
4459 if (TARGET_64BIT)
4460 chain = new timode_scalar_chain;
4461 else
4462 chain = new dimode_scalar_chain;
4464 /* Find the instruction chain we want to convert to vector mode.
4465 Check all uses and definitions to estimate all required
4466 conversions. */
4467 chain->build (candidates, uid);
4469 if (chain->compute_convert_gain () > 0)
4470 converted_insns += chain->convert ();
4471 else
4472 if (dump_file)
4473 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4474 chain->chain_id);
4476 delete chain;
4479 if (dump_file)
4480 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4482 BITMAP_FREE (candidates);
4483 bitmap_obstack_release (NULL);
4484 df_process_deferred_rescans ();
4486 /* Conversion means we may have 128-bit register spills/fills
4487 which require an aligned stack. */
4488 if (converted_insns)
4490 if (crtl->stack_alignment_needed < 128)
4491 crtl->stack_alignment_needed = 128;
4492 if (crtl->stack_alignment_estimated < 128)
4493 crtl->stack_alignment_estimated = 128;
4494 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
4495 if (TARGET_64BIT)
4496 for (tree parm = DECL_ARGUMENTS (current_function_decl);
4497 parm; parm = DECL_CHAIN (parm))
4499 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
4500 continue;
4501 if (DECL_RTL_SET_P (parm)
4502 && GET_MODE (DECL_RTL (parm)) == V1TImode)
4504 rtx r = DECL_RTL (parm);
4505 if (REG_P (r))
4506 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
4508 if (DECL_INCOMING_RTL (parm)
4509 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
4511 rtx r = DECL_INCOMING_RTL (parm);
4512 if (REG_P (r))
4513 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
4518 return 0;
4521 namespace {
4523 const pass_data pass_data_insert_vzeroupper =
4525 RTL_PASS, /* type */
4526 "vzeroupper", /* name */
4527 OPTGROUP_NONE, /* optinfo_flags */
4528 TV_MACH_DEP, /* tv_id */
4529 0, /* properties_required */
4530 0, /* properties_provided */
4531 0, /* properties_destroyed */
4532 0, /* todo_flags_start */
4533 TODO_df_finish, /* todo_flags_finish */
4536 class pass_insert_vzeroupper : public rtl_opt_pass
4538 public:
4539 pass_insert_vzeroupper(gcc::context *ctxt)
4540 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4543 /* opt_pass methods: */
4544 virtual bool gate (function *)
4546 return TARGET_AVX && !TARGET_AVX512F
4547 && TARGET_VZEROUPPER && flag_expensive_optimizations
4548 && !optimize_size;
4551 virtual unsigned int execute (function *)
4553 return rest_of_handle_insert_vzeroupper ();
4556 }; // class pass_insert_vzeroupper
4558 const pass_data pass_data_stv =
4560 RTL_PASS, /* type */
4561 "stv", /* name */
4562 OPTGROUP_NONE, /* optinfo_flags */
4563 TV_MACH_DEP, /* tv_id */
4564 0, /* properties_required */
4565 0, /* properties_provided */
4566 0, /* properties_destroyed */
4567 0, /* todo_flags_start */
4568 TODO_df_finish, /* todo_flags_finish */
4571 class pass_stv : public rtl_opt_pass
4573 public:
4574 pass_stv (gcc::context *ctxt)
4575 : rtl_opt_pass (pass_data_stv, ctxt),
4576 timode_p (false)
4579 /* opt_pass methods: */
4580 virtual bool gate (function *)
4582 return (timode_p == !!TARGET_64BIT
4583 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4586 virtual unsigned int execute (function *)
4588 return convert_scalars_to_vector ();
4591 opt_pass *clone ()
4593 return new pass_stv (m_ctxt);
4596 void set_pass_param (unsigned int n, bool param)
4598 gcc_assert (n == 0);
4599 timode_p = param;
4602 private:
4603 bool timode_p;
4604 }; // class pass_stv
4606 } // anon namespace
4608 rtl_opt_pass *
4609 make_pass_insert_vzeroupper (gcc::context *ctxt)
4611 return new pass_insert_vzeroupper (ctxt);
4614 rtl_opt_pass *
4615 make_pass_stv (gcc::context *ctxt)
4617 return new pass_stv (ctxt);
4620 /* Return true if a red-zone is in use. */
4622 bool
4623 ix86_using_red_zone (void)
4625 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4628 /* Return a string that documents the current -m options. The caller is
4629 responsible for freeing the string. */
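/* Illustrative output only (the exact contents depend on the option tables
   below): for a compilation with -m64 -march=haswell -mfpmath=sse the
   returned string might look like

     "-march=haswell -mtune=haswell -m64 -mavx2 ... -msse2 -msse -mfpmath=sse"

   i.e. -march=/-mtune= first, then the ABI option, then the individual ISA
   and flag options, and -mfpmath= last.  */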
4631 static char *
4632 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
4633 int flags, int flags2,
4634 const char *arch, const char *tune,
4635 enum fpmath_unit fpmath, bool add_nl_p)
4637 struct ix86_target_opts
4639 const char *option; /* option string */
4640 HOST_WIDE_INT mask; /* isa mask options */
4643 /* This table is ordered so that options like -msse4.2 that imply other
4644 ISAs come first.  The target string will be displayed in the same order. */
4645 static struct ix86_target_opts isa2_opts[] =
4647 { "-mrdpid", OPTION_MASK_ISA_RDPID },
4648 { "-msgx", OPTION_MASK_ISA_SGX },
4649 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
4650 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
4651 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }
4653 static struct ix86_target_opts isa_opts[] =
4655 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4656 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4657 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4658 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4659 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4660 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4661 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4662 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4663 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4664 { "-mavx2", OPTION_MASK_ISA_AVX2 },
4665 { "-mfma", OPTION_MASK_ISA_FMA },
4666 { "-mxop", OPTION_MASK_ISA_XOP },
4667 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4668 { "-mf16c", OPTION_MASK_ISA_F16C },
4669 { "-mavx", OPTION_MASK_ISA_AVX },
4670 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
4671 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4672 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4673 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4674 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4675 { "-msse3", OPTION_MASK_ISA_SSE3 },
4676 { "-maes", OPTION_MASK_ISA_AES },
4677 { "-msha", OPTION_MASK_ISA_SHA },
4678 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4679 { "-msse2", OPTION_MASK_ISA_SSE2 },
4680 { "-msse", OPTION_MASK_ISA_SSE },
4681 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4682 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4683 { "-mmmx", OPTION_MASK_ISA_MMX },
4684 { "-mrtm", OPTION_MASK_ISA_RTM },
4685 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4686 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4687 { "-madx", OPTION_MASK_ISA_ADX },
4688 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4689 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4690 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4691 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4692 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4693 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4694 { "-mabm", OPTION_MASK_ISA_ABM },
4695 { "-mbmi", OPTION_MASK_ISA_BMI },
4696 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4697 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4698 { "-mtbm", OPTION_MASK_ISA_TBM },
4699 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4700 { "-mcx16", OPTION_MASK_ISA_CX16 },
4701 { "-msahf", OPTION_MASK_ISA_SAHF },
4702 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4703 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4704 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4705 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4706 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4707 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4708 { "-mpku", OPTION_MASK_ISA_PKU },
4709 { "-mlwp", OPTION_MASK_ISA_LWP },
4710 { "-mhle", OPTION_MASK_ISA_HLE },
4711 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4712 { "-mmpx", OPTION_MASK_ISA_MPX },
4713 { "-mclwb", OPTION_MASK_ISA_CLWB }
4716 /* Flag options. */
4717 static struct ix86_target_opts flag_opts[] =
4719 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4720 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4721 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4722 { "-m80387", MASK_80387 },
4723 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4724 { "-malign-double", MASK_ALIGN_DOUBLE },
4725 { "-mcld", MASK_CLD },
4726 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4727 { "-mieee-fp", MASK_IEEE_FP },
4728 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4729 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4730 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4731 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4732 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4733 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4734 { "-mno-red-zone", MASK_NO_RED_ZONE },
4735 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4736 { "-mrecip", MASK_RECIP },
4737 { "-mrtd", MASK_RTD },
4738 { "-msseregparm", MASK_SSEREGPARM },
4739 { "-mstack-arg-probe", MASK_STACK_PROBE },
4740 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4741 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4742 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4743 { "-mvzeroupper", MASK_VZEROUPPER },
4744 { "-mstv", MASK_STV },
4745 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
4746 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
4747 { "-mprefer-avx128", MASK_PREFER_AVX128 },
4748 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
4751 /* Additional flag options. */
4752 static struct ix86_target_opts flag2_opts[] =
4754 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4757 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
4758 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
4760 char isa_other[40];
4761 char isa2_other[40];
4762 char flags_other[40];
4763 char flags2_other[40];
4764 unsigned num = 0;
4765 unsigned i, j;
4766 char *ret;
4767 char *ptr;
4768 size_t len;
4769 size_t line_len;
4770 size_t sep_len;
4771 const char *abi;
4773 memset (opts, '\0', sizeof (opts));
4775 /* Add -march= option. */
4776 if (arch)
4778 opts[num][0] = "-march=";
4779 opts[num++][1] = arch;
4782 /* Add -mtune= option. */
4783 if (tune)
4785 opts[num][0] = "-mtune=";
4786 opts[num++][1] = tune;
4789 /* Add -m32/-m64/-mx32. */
4790 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4792 if ((isa & OPTION_MASK_ABI_64) != 0)
4793 abi = "-m64";
4794 else
4795 abi = "-mx32";
4796 isa &= ~ (OPTION_MASK_ISA_64BIT
4797 | OPTION_MASK_ABI_64
4798 | OPTION_MASK_ABI_X32);
4800 else
4801 abi = "-m32";
4802 opts[num++][0] = abi;
4804 /* Pick out the options in isa2 options. */
4805 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
4807 if ((isa2 & isa2_opts[i].mask) != 0)
4809 opts[num++][0] = isa2_opts[i].option;
4810 isa2 &= ~ isa2_opts[i].mask;
4814 if (isa2 && add_nl_p)
4816 opts[num++][0] = isa2_other;
4817 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
4820 /* Pick out the options in isa options. */
4821 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4823 if ((isa & isa_opts[i].mask) != 0)
4825 opts[num++][0] = isa_opts[i].option;
4826 isa &= ~ isa_opts[i].mask;
4830 if (isa && add_nl_p)
4832 opts[num++][0] = isa_other;
4833 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
4836 /* Add flag options. */
4837 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4839 if ((flags & flag_opts[i].mask) != 0)
4841 opts[num++][0] = flag_opts[i].option;
4842 flags &= ~ flag_opts[i].mask;
4846 if (flags && add_nl_p)
4848 opts[num++][0] = flags_other;
4849 sprintf (flags_other, "(other flags: %#x)", flags);
4852 /* Add additional flag options. */
4853 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
4855 if ((flags2 & flag2_opts[i].mask) != 0)
4857 opts[num++][0] = flag2_opts[i].option;
4858 flags2 &= ~ flag2_opts[i].mask;
4862 if (flags2 && add_nl_p)
4864 opts[num++][0] = flags2_other;
4865 sprintf (flags2_other, "(other flags2: %#x)", flags2);
4868 /* Add -fpmath= option. */
4869 if (fpmath)
4871 opts[num][0] = "-mfpmath=";
4872 switch ((int) fpmath)
4874 case FPMATH_387:
4875 opts[num++][1] = "387";
4876 break;
4878 case FPMATH_SSE:
4879 opts[num++][1] = "sse";
4880 break;
4882 case FPMATH_387 | FPMATH_SSE:
4883 opts[num++][1] = "sse+387";
4884 break;
4886 default:
4887 gcc_unreachable ();
4891 /* Any options? */
4892 if (num == 0)
4893 return NULL;
4895 gcc_assert (num < ARRAY_SIZE (opts));
4897 /* Size the string. */
4898 len = 0;
4899 sep_len = (add_nl_p) ? 3 : 1;
4900 for (i = 0; i < num; i++)
4902 len += sep_len;
4903 for (j = 0; j < 2; j++)
4904 if (opts[i][j])
4905 len += strlen (opts[i][j]);
4908 /* Build the string. */
4909 ret = ptr = (char *) xmalloc (len);
4910 line_len = 0;
4912 for (i = 0; i < num; i++)
4914 size_t len2[2];
4916 for (j = 0; j < 2; j++)
4917 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4919 if (i != 0)
4921 *ptr++ = ' ';
4922 line_len++;
4924 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4926 *ptr++ = '\\';
4927 *ptr++ = '\n';
4928 line_len = 0;
4932 for (j = 0; j < 2; j++)
4933 if (opts[i][j])
4935 memcpy (ptr, opts[i][j], len2[j]);
4936 ptr += len2[j];
4937 line_len += len2[j];
4941 *ptr = '\0';
4942 gcc_assert (ret + len >= ptr);
4944 return ret;
4947 /* Return true if profiling code should be emitted before
4948 the prologue; otherwise return false.
4949 Note: for x86 with "hotfix" this case is sorried, i.e. not supported. */
4950 static bool
4951 ix86_profile_before_prologue (void)
4953 return flag_fentry != 0;
4956 /* Function that is callable from the debugger to print the current
4957 options. */
4958 void ATTRIBUTE_UNUSED
4959 ix86_debug_options (void)
4961 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
4962 target_flags, ix86_target_flags,
4963 ix86_arch_string,ix86_tune_string,
4964 ix86_fpmath, true);
4966 if (opts)
4968 fprintf (stderr, "%s\n\n", opts);
4969 free (opts);
4971 else
4972 fputs ("<no options>\n\n", stderr);
4974 return;
4977 /* Return true if T is one of the bytes we should avoid with
4978 -fmitigate-rop. */
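/* (These byte values are the opcodes of the x86 return instructions:
   0xc3 ret, 0xc2 ret imm16, 0xcb retf, 0xca retf imm16 -- the bytes a
   ROP gadget typically ends with.)  */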
4980 static bool
4981 ix86_rop_should_change_byte_p (int t)
4983 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
4986 static const char *stringop_alg_names[] = {
4987 #define DEF_ENUM
4988 #define DEF_ALG(alg, name) #name,
4989 #include "stringop.def"
4990 #undef DEF_ENUM
4991 #undef DEF_ALG
4994 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4995 The string is of the following form (or a comma-separated list of such entries):
4997 strategy_alg:max_size:[align|noalign]
4999 where the full size range for the strategy is either [0, max_size] or
5000 [min_size, max_size], in which min_size is the max_size + 1 of the
5001 preceding range. The last size range must have max_size == -1.
5003 Examples:
5006 -mmemcpy-strategy=libcall:-1:noalign
5008 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
5012 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
5014 This tells the compiler to use the following strategy for memset:
5015 1) when the expected size is between [1, 16], use rep_8byte strategy;
5016 2) when the size is between [17, 2048], use vector_loop;
5017 3) when the size is > 2048, use libcall. */
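/* A sketch of how the last example above is parsed (illustrative values):
   ix86_parse_stringop_strategy_string below fills three
   stringop_size_range entries, roughly

     { 16,   rep_8byte,   true  }    noalign
     { 2048, vector_loop, false }    align
     { -1,   libcall,     true  }    noalign

   which then overwrite the default_algs size table for memset.  */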
5019 struct stringop_size_range
5021 int max;
5022 stringop_alg alg;
5023 bool noalign;
5026 static void
5027 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
5029 const struct stringop_algs *default_algs;
5030 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
5031 char *curr_range_str, *next_range_str;
5032 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
5033 int i = 0, n = 0;
5035 if (is_memset)
5036 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
5037 else
5038 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
5040 curr_range_str = strategy_str;
5044 int maxs;
5045 char alg_name[128];
5046 char align[16];
5047 next_range_str = strchr (curr_range_str, ',');
5048 if (next_range_str)
5049 *next_range_str++ = '\0';
5051 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
5052 alg_name, &maxs, align))
5054 error ("wrong argument %qs to option %qs", curr_range_str, opt);
5055 return;
5058 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
5060 error ("size ranges of option %qs should be increasing", opt);
5061 return;
5064 for (i = 0; i < last_alg; i++)
5065 if (!strcmp (alg_name, stringop_alg_names[i]))
5066 break;
5068 if (i == last_alg)
5070 error ("wrong strategy name %qs specified for option %qs",
5071 alg_name, opt);
5073 auto_vec <const char *> candidates;
5074 for (i = 0; i < last_alg; i++)
5075 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
5076 candidates.safe_push (stringop_alg_names[i]);
5078 char *s;
5079 const char *hint
5080 = candidates_list_and_hint (alg_name, s, candidates);
5081 if (hint)
5082 inform (input_location,
5083 "valid arguments to %qs are: %s; did you mean %qs?",
5084 opt, s, hint);
5085 else
5086 inform (input_location, "valid arguments to %qs are: %s",
5087 opt, s);
5088 XDELETEVEC (s);
5089 return;
5092 if ((stringop_alg) i == rep_prefix_8_byte
5093 && !TARGET_64BIT)
5095 /* rep; movq isn't available in 32-bit code. */
5096 error ("strategy name %qs specified for option %qs "
5097 "not supported for 32-bit code", alg_name, opt);
5098 return;
5101 input_ranges[n].max = maxs;
5102 input_ranges[n].alg = (stringop_alg) i;
5103 if (!strcmp (align, "align"))
5104 input_ranges[n].noalign = false;
5105 else if (!strcmp (align, "noalign"))
5106 input_ranges[n].noalign = true;
5107 else
5109 error ("unknown alignment %qs specified for option %qs", align, opt);
5110 return;
5112 n++;
5113 curr_range_str = next_range_str;
5115 while (curr_range_str);
5117 if (input_ranges[n - 1].max != -1)
5119 error ("the max value for the last size range should be -1"
5120 " for option %qs", opt);
5121 return;
5124 if (n > MAX_STRINGOP_ALGS)
5126 error ("too many size ranges specified in option %qs", opt);
5127 return;
5130 /* Now override the default algs array. */
5131 for (i = 0; i < n; i++)
5133 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
5134 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
5135 = input_ranges[i].alg;
5136 *const_cast<int *>(&default_algs->size[i].noalign)
5137 = input_ranges[i].noalign;
5142 /* Parse the -mtune-ctrl= option.  When DUMP is true,
5143 print the features that are explicitly set. */
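/* For illustration (feature names below are placeholders): the option value is
   a comma-separated list of names from ix86_tune_feature_names, each optionally
   prefixed with '^' to clear the feature instead of setting it, e.g.
       -mtune-ctrl=some_feature,^another_feature
   would set some_feature and clear another_feature.  */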
5145 static void
5146 parse_mtune_ctrl_str (bool dump)
5148 if (!ix86_tune_ctrl_string)
5149 return;
5151 char *next_feature_string = NULL;
5152 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
5153 char *orig = curr_feature_string;
5154 int i;
5157 bool clear = false;
5159 next_feature_string = strchr (curr_feature_string, ',');
5160 if (next_feature_string)
5161 *next_feature_string++ = '\0';
5162 if (*curr_feature_string == '^')
5164 curr_feature_string++;
5165 clear = true;
5167 for (i = 0; i < X86_TUNE_LAST; i++)
5169 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
5171 ix86_tune_features[i] = !clear;
5172 if (dump)
5173 fprintf (stderr, "Explicitly %s feature %s\n",
5174 clear ? "clear" : "set", ix86_tune_feature_names[i]);
5175 break;
5178 if (i == X86_TUNE_LAST)
5179 error ("Unknown parameter to option -mtune-ctrl: %s",
5180 clear ? curr_feature_string - 1 : curr_feature_string);
5181 curr_feature_string = next_feature_string;
5183 while (curr_feature_string);
5184 free (orig);
5187 /* Helper function to set ix86_tune_features. IX86_TUNE is the
5188 processor type. */
5190 static void
5191 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
5193 unsigned int ix86_tune_mask = 1u << ix86_tune;
5194 int i;
5196 for (i = 0; i < X86_TUNE_LAST; ++i)
5198 if (ix86_tune_no_default)
5199 ix86_tune_features[i] = 0;
5200 else
5201 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
5204 if (dump)
5206 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
5207 for (i = 0; i < X86_TUNE_LAST; i++)
5208 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
5209 ix86_tune_features[i] ? "on" : "off");
5212 parse_mtune_ctrl_str (dump);
5216 /* Default align_* from the processor table. */
5218 static void
5219 ix86_default_align (struct gcc_options *opts)
5221 if (opts->x_align_loops == 0)
5223 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
5224 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
5226 if (opts->x_align_jumps == 0)
5228 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
5229 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
5231 if (opts->x_align_functions == 0)
5233 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
5237 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
5239 static void
5240 ix86_override_options_after_change (void)
5242 ix86_default_align (&global_options);
5245 /* Override various settings based on options. If MAIN_ARGS_P, the
5246 options are from the command line, otherwise they are from
5247 attributes. Return true if there's an error related to march
5248 option. */
5250 static bool
5251 ix86_option_override_internal (bool main_args_p,
5252 struct gcc_options *opts,
5253 struct gcc_options *opts_set)
5255 int i;
5256 unsigned int ix86_arch_mask;
5257 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
5259 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
5260 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
5261 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
5262 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
5263 #define PTA_AES (HOST_WIDE_INT_1 << 4)
5264 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
5265 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
5266 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
5267 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
5268 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
5269 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
5270 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
5271 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
5272 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
5273 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
5274 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
5275 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
5276 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
5277 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
5278 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
5279 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
5280 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
5281 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
5282 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
5283 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
5284 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
5285 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
5286 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
5287 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
5288 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
5289 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
5290 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
5291 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
5292 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
5293 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
5294 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
5295 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
5296 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
5297 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
5298 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
5299 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
5300 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
5301 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
5302 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
5303 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
5304 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
5305 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
5306 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
5307 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
5308 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
5309 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
5310 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
5311 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
5312 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
5313 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
5314 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
5315 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
5316 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
5317 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
5318 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
5319 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
5320 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
5321 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
5322 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
5324 #define PTA_CORE2 \
5325 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
5326 | PTA_CX16 | PTA_FXSR)
5327 #define PTA_NEHALEM \
5328 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
5329 #define PTA_WESTMERE \
5330 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
5331 #define PTA_SANDYBRIDGE \
5332 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
5333 #define PTA_IVYBRIDGE \
5334 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
5335 #define PTA_HASWELL \
5336 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
5337 | PTA_FMA | PTA_MOVBE | PTA_HLE)
5338 #define PTA_BROADWELL \
5339 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
5340 #define PTA_SKYLAKE \
5341 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
5342 #define PTA_SKYLAKE_AVX512 \
5343 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
5344 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
5345 #define PTA_KNL \
5346 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
5347 #define PTA_BONNELL \
5348 (PTA_CORE2 | PTA_MOVBE)
5349 #define PTA_SILVERMONT \
5350 (PTA_WESTMERE | PTA_MOVBE)
5352 /* If this reaches 64, we need to widen the struct pta flags below. */
5354 static struct pta
5356 const char *const name; /* processor name or nickname. */
5357 const enum processor_type processor;
5358 const enum attr_cpu schedule;
5359 const unsigned HOST_WIDE_INT flags;
5361 const processor_alias_table[] =
5363 {"i386", PROCESSOR_I386, CPU_NONE, 0},
5364 {"i486", PROCESSOR_I486, CPU_NONE, 0},
5365 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5366 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5367 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
5368 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
5369 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
5370 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5371 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5372 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5373 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5374 PTA_MMX | PTA_SSE | PTA_FXSR},
5375 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5376 PTA_MMX | PTA_SSE | PTA_FXSR},
5377 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5378 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5379 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5380 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5381 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5382 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5383 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
5384 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5385 PTA_MMX | PTA_SSE | PTA_FXSR},
5386 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5387 PTA_MMX | PTA_SSE | PTA_FXSR},
5388 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5389 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5390 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
5391 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
5392 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
5393 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5394 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
5395 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5396 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
5397 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5398 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
5399 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
5400 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5401 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5402 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
5403 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5404 PTA_SANDYBRIDGE},
5405 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5406 PTA_SANDYBRIDGE},
5407 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5408 PTA_IVYBRIDGE},
5409 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5410 PTA_IVYBRIDGE},
5411 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5412 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5413 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
5414 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
5415 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
5416 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5417 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5418 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5419 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5420 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
5421 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
5422 {"geode", PROCESSOR_GEODE, CPU_GEODE,
5423 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5424 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
5425 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5426 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5427 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
5428 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5429 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
5430 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5431 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
5432 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5433 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
5434 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5435 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
5436 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5437 {"x86-64", PROCESSOR_K8, CPU_K8,
5438 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5439 {"eden-x2", PROCESSOR_K8, CPU_K8,
5440 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5441 {"nano", PROCESSOR_K8, CPU_K8,
5442 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5443 | PTA_SSSE3 | PTA_FXSR},
5444 {"nano-1000", PROCESSOR_K8, CPU_K8,
5445 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5446 | PTA_SSSE3 | PTA_FXSR},
5447 {"nano-2000", PROCESSOR_K8, CPU_K8,
5448 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5449 | PTA_SSSE3 | PTA_FXSR},
5450 {"nano-3000", PROCESSOR_K8, CPU_K8,
5451 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5452 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5453 {"nano-x2", PROCESSOR_K8, CPU_K8,
5454 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5455 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5456 {"eden-x4", PROCESSOR_K8, CPU_K8,
5457 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5458 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5459 {"nano-x4", PROCESSOR_K8, CPU_K8,
5460 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5461 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5462 {"k8", PROCESSOR_K8, CPU_K8,
5463 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5464 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5465 {"k8-sse3", PROCESSOR_K8, CPU_K8,
5466 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5467 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5468 {"opteron", PROCESSOR_K8, CPU_K8,
5469 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5470 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5471 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
5472 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5473 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5474 {"athlon64", PROCESSOR_K8, CPU_K8,
5475 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5476 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5477 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
5478 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5479 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5480 {"athlon-fx", PROCESSOR_K8, CPU_K8,
5481 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5482 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5483 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5484 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5485 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5486 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5487 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5488 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5489 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
5490 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5491 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5492 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5493 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5494 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
5495 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5496 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5497 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5498 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5499 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5500 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
5501 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5502 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5503 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5504 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5505 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5506 | PTA_XSAVEOPT | PTA_FSGSBASE},
5507 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5508 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5509 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5510 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5511 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5512 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5513 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5514 | PTA_MOVBE | PTA_MWAITX},
5515 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5516 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5517 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5518 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5519 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5520 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5521 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5522 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5523 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5524 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5525 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5526 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
5527 | PTA_FXSR | PTA_XSAVE},
5528 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5529 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5530 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
5531 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5532 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5533 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5535 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5536 PTA_64BIT
5537 | PTA_HLE /* flags are only used for -march switch. */ },
5540 /* -mrecip options. */
5541 static struct
5543 const char *string; /* option name */
5544 unsigned int mask; /* mask bits to set */
5546 const recip_options[] =
5548 { "all", RECIP_MASK_ALL },
5549 { "none", RECIP_MASK_NONE },
5550 { "div", RECIP_MASK_DIV },
5551 { "sqrt", RECIP_MASK_SQRT },
5552 { "vec-div", RECIP_MASK_VEC_DIV },
5553 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5556 int const pta_size = ARRAY_SIZE (processor_alias_table);
5558 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5559 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5560 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5561 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5562 #ifdef TARGET_BI_ARCH
5563 else
5565 #if TARGET_BI_ARCH == 1
5566 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5567 is on and OPTION_MASK_ABI_X32 is off. We turn off
5568 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5569 -mx32. */
5570 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5571 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5572 #else
5573 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5574 on and OPTION_MASK_ABI_64 is off. We turn off
5575 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5576 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5577 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5578 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5579 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5580 #endif
5581 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5582 && TARGET_IAMCU_P (opts->x_target_flags))
5583 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5584 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5586 #endif
5588 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5590 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5591 OPTION_MASK_ABI_64 for TARGET_X32. */
5592 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5593 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5595 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5596 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5597 | OPTION_MASK_ABI_X32
5598 | OPTION_MASK_ABI_64);
5599 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5601 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5602 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5603 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5604 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5607 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5608 SUBTARGET_OVERRIDE_OPTIONS;
5609 #endif
5611 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5612 SUBSUBTARGET_OVERRIDE_OPTIONS;
5613 #endif
5615 /* On Darwin (Mach-O), -fPIC is the default for x86_64. */
5616 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5617 opts->x_flag_pic = 2;
5619 /* Need to check -mtune=generic first. */
5620 if (opts->x_ix86_tune_string)
5622 /* As special support for cross compilers we read -mtune=native
5623 as -mtune=generic. With native compilers we won't see the
5624 -mtune=native, as it was changed by the driver. */
5625 if (!strcmp (opts->x_ix86_tune_string, "native"))
5627 opts->x_ix86_tune_string = "generic";
5629 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5630 warning (OPT_Wdeprecated,
5631 main_args_p
5632 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5633 "or %<-mtune=generic%> instead as appropriate")
5634 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
5635 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
5636 " instead as appropriate"));
5638 else
5640 if (opts->x_ix86_arch_string)
5641 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5642 if (!opts->x_ix86_tune_string)
5644 opts->x_ix86_tune_string
5645 = processor_target_table[TARGET_CPU_DEFAULT].name;
5646 ix86_tune_defaulted = 1;
5649 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5650 or defaulted. We need to use a sensible tune option. */
5651 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5653 opts->x_ix86_tune_string = "generic";
5657 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5658 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5660 /* rep; movq isn't available in 32-bit code. */
5661 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5662 opts->x_ix86_stringop_alg = no_stringop;
5665 if (!opts->x_ix86_arch_string)
5666 opts->x_ix86_arch_string
5667 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5668 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5669 else
5670 ix86_arch_specified = 1;
5672 if (opts_set->x_ix86_pmode)
5674 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5675 && opts->x_ix86_pmode == PMODE_SI)
5676 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5677 && opts->x_ix86_pmode == PMODE_DI))
5678 error ("address mode %qs not supported in the %s bit mode",
5679 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5680 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5682 else
5683 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5684 ? PMODE_DI : PMODE_SI;
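/* In other words, unless the address mode (-maddress-mode=short|long) was given
   explicitly, pointers default to DImode for LP64 and to SImode otherwise
   (including x32).  */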
5686 if (!opts_set->x_ix86_abi)
5687 opts->x_ix86_abi = DEFAULT_ABI;
5689 /* For targets using the MS ABI, enable MS extensions if not
5690 explicitly turned off. For non-MS ABIs we turn this
5691 option off. */
5692 if (!opts_set->x_flag_ms_extensions)
5693 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
5695 if (opts_set->x_ix86_cmodel)
5697 switch (opts->x_ix86_cmodel)
5699 case CM_SMALL:
5700 case CM_SMALL_PIC:
5701 if (opts->x_flag_pic)
5702 opts->x_ix86_cmodel = CM_SMALL_PIC;
5703 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5704 error ("code model %qs not supported in the %s bit mode",
5705 "small", "32");
5706 break;
5708 case CM_MEDIUM:
5709 case CM_MEDIUM_PIC:
5710 if (opts->x_flag_pic)
5711 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5712 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5713 error ("code model %qs not supported in the %s bit mode",
5714 "medium", "32");
5715 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5716 error ("code model %qs not supported in x32 mode",
5717 "medium");
5718 break;
5720 case CM_LARGE:
5721 case CM_LARGE_PIC:
5722 if (opts->x_flag_pic)
5723 opts->x_ix86_cmodel = CM_LARGE_PIC;
5724 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5725 error ("code model %qs not supported in the %s bit mode",
5726 "large", "32");
5727 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5728 error ("code model %qs not supported in x32 mode",
5729 "large");
5730 break;
5732 case CM_32:
5733 if (opts->x_flag_pic)
5734 error ("code model %s does not support PIC mode", "32");
5735 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5736 error ("code model %qs not supported in the %s bit mode",
5737 "32", "64");
5738 break;
5740 case CM_KERNEL:
5741 if (opts->x_flag_pic)
5743 error ("code model %s does not support PIC mode", "kernel");
5744 opts->x_ix86_cmodel = CM_32;
5746 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5747 error ("code model %qs not supported in the %s bit mode",
5748 "kernel", "32");
5749 break;
5751 default:
5752 gcc_unreachable ();
5755 else
5757 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5758 use of rip-relative addressing. This eliminates fixups that
5759 would otherwise be needed if this object is to be placed in a
5760 DLL, and is essentially just as efficient as direct addressing. */
5761 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5762 && (TARGET_RDOS || TARGET_PECOFF))
5763 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5764 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5765 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5766 else
5767 opts->x_ix86_cmodel = CM_32;
5769 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5771 error ("-masm=intel not supported in this configuration");
5772 opts->x_ix86_asm_dialect = ASM_ATT;
5774 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5775 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5776 sorry ("%i-bit mode not compiled in",
5777 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5779 for (i = 0; i < pta_size; i++)
5780 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5782 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5784 error (main_args_p
5785 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
5786 "switch")
5787 : G_("%<generic%> CPU can be used only for "
5788 "%<target(\"tune=\")%> attribute"));
5789 return false;
5791 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5793 error (main_args_p
5794 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
5795 "switch")
5796 : G_("%<intel%> CPU can be used only for "
5797 "%<target(\"tune=\")%> attribute"));
5798 return false;
5801 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5802 && !(processor_alias_table[i].flags & PTA_64BIT))
5804 error ("CPU you selected does not support x86-64 "
5805 "instruction set");
5806 return false;
5809 ix86_schedule = processor_alias_table[i].schedule;
5810 ix86_arch = processor_alias_table[i].processor;
5811 /* Default cpu tuning to the architecture. */
5812 ix86_tune = ix86_arch;
5814 if (processor_alias_table[i].flags & PTA_MMX
5815 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5816 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5817 if (processor_alias_table[i].flags & PTA_3DNOW
5818 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5819 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5820 if (processor_alias_table[i].flags & PTA_3DNOW_A
5821 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5822 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5823 if (processor_alias_table[i].flags & PTA_SSE
5824 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5825 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5826 if (processor_alias_table[i].flags & PTA_SSE2
5827 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5828 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5829 if (processor_alias_table[i].flags & PTA_SSE3
5830 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5831 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5832 if (processor_alias_table[i].flags & PTA_SSSE3
5833 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5834 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5835 if (processor_alias_table[i].flags & PTA_SSE4_1
5836 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5837 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5838 if (processor_alias_table[i].flags & PTA_SSE4_2
5839 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5840 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5841 if (processor_alias_table[i].flags & PTA_AVX
5842 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5843 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5844 if (processor_alias_table[i].flags & PTA_AVX2
5845 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5846 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5847 if (processor_alias_table[i].flags & PTA_FMA
5848 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5849 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5850 if (processor_alias_table[i].flags & PTA_SSE4A
5851 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5852 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5853 if (processor_alias_table[i].flags & PTA_FMA4
5854 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5855 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5856 if (processor_alias_table[i].flags & PTA_XOP
5857 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5858 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5859 if (processor_alias_table[i].flags & PTA_LWP
5860 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5861 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5862 if (processor_alias_table[i].flags & PTA_ABM
5863 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5864 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5865 if (processor_alias_table[i].flags & PTA_BMI
5866 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5867 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5868 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5869 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5870 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5871 if (processor_alias_table[i].flags & PTA_TBM
5872 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5873 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5874 if (processor_alias_table[i].flags & PTA_BMI2
5875 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5876 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5877 if (processor_alias_table[i].flags & PTA_CX16
5878 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5879 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5880 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5881 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5882 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5883 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5884 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5885 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5886 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5887 if (processor_alias_table[i].flags & PTA_MOVBE
5888 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5889 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5890 if (processor_alias_table[i].flags & PTA_AES
5891 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5892 ix86_isa_flags |= OPTION_MASK_ISA_AES;
5893 if (processor_alias_table[i].flags & PTA_SHA
5894 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5895 ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5896 if (processor_alias_table[i].flags & PTA_PCLMUL
5897 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5898 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5899 if (processor_alias_table[i].flags & PTA_FSGSBASE
5900 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5901 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5902 if (processor_alias_table[i].flags & PTA_RDRND
5903 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5904 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5905 if (processor_alias_table[i].flags & PTA_F16C
5906 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5907 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5908 if (processor_alias_table[i].flags & PTA_RTM
5909 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5910 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5911 if (processor_alias_table[i].flags & PTA_HLE
5912 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5913 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5914 if (processor_alias_table[i].flags & PTA_PRFCHW
5915 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5916 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5917 if (processor_alias_table[i].flags & PTA_RDSEED
5918 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5919 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5920 if (processor_alias_table[i].flags & PTA_ADX
5921 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5922 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5923 if (processor_alias_table[i].flags & PTA_FXSR
5924 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5925 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5926 if (processor_alias_table[i].flags & PTA_XSAVE
5927 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5928 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5929 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5930 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5931 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5932 if (processor_alias_table[i].flags & PTA_AVX512F
5933 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5934 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5935 if (processor_alias_table[i].flags & PTA_AVX512ER
5936 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5937 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5938 if (processor_alias_table[i].flags & PTA_AVX512PF
5939 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5940 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5941 if (processor_alias_table[i].flags & PTA_AVX512CD
5942 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5943 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5944 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5945 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5946 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5947 if (processor_alias_table[i].flags & PTA_CLWB
5948 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5949 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5950 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5951 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5952 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5953 if (processor_alias_table[i].flags & PTA_CLZERO
5954 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5955 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5956 if (processor_alias_table[i].flags & PTA_XSAVEC
5957 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5958 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5959 if (processor_alias_table[i].flags & PTA_XSAVES
5960 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5961 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5962 if (processor_alias_table[i].flags & PTA_AVX512DQ
5963 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5964 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5965 if (processor_alias_table[i].flags & PTA_AVX512BW
5966 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5967 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5968 if (processor_alias_table[i].flags & PTA_AVX512VL
5969 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5970 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5971 if (processor_alias_table[i].flags & PTA_MPX
5972 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5973 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5974 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5975 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5976 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5977 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5978 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5979 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5981 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
5982 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
5983 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
5984 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
5985 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
5986 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
5987 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
5988 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
5989 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
5990 if (processor_alias_table[i].flags & PTA_SGX
5991 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
5992 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
5994 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
5995 x86_prefetch_sse = true;
5996 if (processor_alias_table[i].flags & PTA_MWAITX
5997 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
5998 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
5999 if (processor_alias_table[i].flags & PTA_PKU
6000 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
6001 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
6003 /* Don't enable x87 instructions if only
6004 general registers are allowed. */
6005 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
6006 && !(opts_set->x_target_flags & MASK_80387))
6008 if (processor_alias_table[i].flags & PTA_NO_80387)
6009 opts->x_target_flags &= ~MASK_80387;
6010 else
6011 opts->x_target_flags |= MASK_80387;
6013 break;
6016 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
6017 error ("Intel MPX does not support x32");
6019 if (TARGET_X32 && (ix86_isa_flags & OPTION_MASK_ISA_MPX))
6020 error ("Intel MPX does not support x32");
6022 if (i == pta_size)
6024 error (main_args_p
6025 ? G_("bad value (%qs) for %<-march=%> switch")
6026 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
6027 opts->x_ix86_arch_string);
6029 auto_vec <const char *> candidates;
6030 for (i = 0; i < pta_size; i++)
6031 if (strcmp (processor_alias_table[i].name, "generic")
6032 && strcmp (processor_alias_table[i].name, "intel")
6033 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6034 || (processor_alias_table[i].flags & PTA_64BIT)))
6035 candidates.safe_push (processor_alias_table[i].name);
6037 char *s;
6038 const char *hint
6039 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
6040 if (hint)
6041 inform (input_location,
6042 main_args_p
6043 ? G_("valid arguments to %<-march=%> switch are: "
6044 "%s; did you mean %qs?")
6045 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
6046 "%s; did you mean %qs?"), s, hint);
6047 else
6048 inform (input_location,
6049 main_args_p
6050 ? G_("valid arguments to %<-march=%> switch are: %s")
6051 : G_("valid arguments to %<target(\"arch=\")%> attribute "
6052 "are: %s"), s);
6053 XDELETEVEC (s);
6056 ix86_arch_mask = 1u << ix86_arch;
6057 for (i = 0; i < X86_ARCH_LAST; ++i)
6058 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6060 for (i = 0; i < pta_size; i++)
6061 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
6063 ix86_schedule = processor_alias_table[i].schedule;
6064 ix86_tune = processor_alias_table[i].processor;
6065 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6067 if (!(processor_alias_table[i].flags & PTA_64BIT))
6069 if (ix86_tune_defaulted)
6071 opts->x_ix86_tune_string = "x86-64";
6072 for (i = 0; i < pta_size; i++)
6073 if (! strcmp (opts->x_ix86_tune_string,
6074 processor_alias_table[i].name))
6075 break;
6076 ix86_schedule = processor_alias_table[i].schedule;
6077 ix86_tune = processor_alias_table[i].processor;
6079 else
6080 error ("CPU you selected does not support x86-64 "
6081 "instruction set");
6084 /* Intel CPUs have always interpreted SSE prefetch instructions as
6085 NOPs; so, we can enable SSE prefetch instructions even when
6086 -mtune (rather than -march) points us to a processor that has them.
6087 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
6088 higher processors. */
6089 if (TARGET_CMOV
6090 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
6091 x86_prefetch_sse = true;
6092 break;
6095 if (ix86_tune_specified && i == pta_size)
6097 error (main_args_p
6098 ? G_("bad value (%qs) for %<-mtune=%> switch")
6099 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
6100 opts->x_ix86_tune_string);
6102 auto_vec <const char *> candidates;
6103 for (i = 0; i < pta_size; i++)
6104 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6105 || (processor_alias_table[i].flags & PTA_64BIT))
6106 candidates.safe_push (processor_alias_table[i].name);
6108 char *s;
6109 const char *hint
6110 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
6111 if (hint)
6112 inform (input_location,
6113 main_args_p
6114 ? G_("valid arguments to %<-mtune=%> switch are: "
6115 "%s; did you mean %qs?")
6116 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
6117 "%s; did you mean %qs?"), s, hint);
6118 else
6119 inform (input_location,
6120 main_args_p
6121 ? G_("valid arguments to %<-mtune=%> switch are: %s")
6122 : G_("valid arguments to %<target(\"tune=\")%> attribute "
6123 "are: %s"), s);
6124 XDELETEVEC (s);
6127 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
6129 #ifndef USE_IX86_FRAME_POINTER
6130 #define USE_IX86_FRAME_POINTER 0
6131 #endif
6133 #ifndef USE_X86_64_FRAME_POINTER
6134 #define USE_X86_64_FRAME_POINTER 0
6135 #endif
6137 /* Set the default values for switches whose default depends on TARGET_64BIT
6138 in case they weren't overwritten by command line options. */
6139 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6141 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6142 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
6143 if (opts->x_flag_asynchronous_unwind_tables
6144 && !opts_set->x_flag_unwind_tables
6145 && TARGET_64BIT_MS_ABI)
6146 opts->x_flag_unwind_tables = 1;
6147 if (opts->x_flag_asynchronous_unwind_tables == 2)
6148 opts->x_flag_unwind_tables
6149 = opts->x_flag_asynchronous_unwind_tables = 1;
6150 if (opts->x_flag_pcc_struct_return == 2)
6151 opts->x_flag_pcc_struct_return = 0;
6153 else
6155 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6156 opts->x_flag_omit_frame_pointer
6157 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
6158 if (opts->x_flag_asynchronous_unwind_tables == 2)
6159 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
6160 if (opts->x_flag_pcc_struct_return == 2)
6162 /* Intel MCU psABI specifies that -freg-struct-return should
6163 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
6164 we check -miamcu so that -freg-struct-return is always
6165 turned on if -miamcu is used. */
6166 if (TARGET_IAMCU_P (opts->x_target_flags))
6167 opts->x_flag_pcc_struct_return = 0;
6168 else
6169 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
6173 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6174 /* TODO: ix86_cost should be chosen at instruction or function granularity,
6175 so that for cold code we use size_cost even in !optimize_size compilation. */
6176 if (opts->x_optimize_size)
6177 ix86_cost = &ix86_size_cost;
6178 else
6179 ix86_cost = ix86_tune_cost;
6181 /* Arrange to set up i386_stack_locals for all functions. */
6182 init_machine_status = ix86_init_machine_status;
6184 /* Validate -mregparm= value. */
6185 if (opts_set->x_ix86_regparm)
6187 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6188 warning (0, "-mregparm is ignored in 64-bit mode");
6189 else if (TARGET_IAMCU_P (opts->x_target_flags))
6190 warning (0, "-mregparm is ignored for Intel MCU psABI");
6191 if (opts->x_ix86_regparm > REGPARM_MAX)
6193 error ("-mregparm=%d is not between 0 and %d",
6194 opts->x_ix86_regparm, REGPARM_MAX);
6195 opts->x_ix86_regparm = 0;
6198 if (TARGET_IAMCU_P (opts->x_target_flags)
6199 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
6200 opts->x_ix86_regparm = REGPARM_MAX;
6202 /* Default align_* from the processor table. */
6203 ix86_default_align (opts);
6205 /* Provide default for -mbranch-cost= value. */
6206 if (!opts_set->x_ix86_branch_cost)
6207 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
6209 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6211 opts->x_target_flags
6212 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
6214 /* Enable by default the SSE and MMX builtins. Do allow the user to
6215 explicitly disable any of these. In particular, disabling SSE and
6216 MMX for kernel code is extremely useful. */
6217 if (!ix86_arch_specified)
6218 opts->x_ix86_isa_flags
6219 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
6220 | TARGET_SUBTARGET64_ISA_DEFAULT)
6221 & ~opts->x_ix86_isa_flags_explicit);
6223 if (TARGET_RTD_P (opts->x_target_flags))
6224 warning (0,
6225 main_args_p
6226 ? G_("%<-mrtd%> is ignored in 64-bit mode")
6227 : G_("%<target(\"rtd\")%> is ignored in 64-bit mode"));
6229 else
6231 opts->x_target_flags
6232 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
6234 if (!ix86_arch_specified)
6235 opts->x_ix86_isa_flags
6236 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
6238 /* The i386 ABI does not specify a red zone. It still makes sense to use one
6239 when the programmer takes care to keep the stack from being destroyed. */
6240 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
6241 opts->x_target_flags |= MASK_NO_RED_ZONE;
6244 /* Keep nonleaf frame pointers. */
6245 if (opts->x_flag_omit_frame_pointer)
6246 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
6247 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
6248 opts->x_flag_omit_frame_pointer = 1;
6250 /* If we're doing fast math, we don't care about comparison order
6251 wrt NaNs. This lets us use a shorter comparison sequence. */
6252 if (opts->x_flag_finite_math_only)
6253 opts->x_target_flags &= ~MASK_IEEE_FP;
6255 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
6256 since the insns won't need emulation. */
6257 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
6258 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
6260 /* Likewise, if the target doesn't have a 387, or we've specified
6261 software floating point, don't use 387 inline intrinsics. */
6262 if (!TARGET_80387_P (opts->x_target_flags))
6263 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
6265 /* Turn on MMX builtins for -msse. */
6266 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
6267 opts->x_ix86_isa_flags
6268 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
6270 /* Enable SSE prefetch. */
6271 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
6272 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
6273 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
6274 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
6275 x86_prefetch_sse = true;
6277 /* Enable popcnt instruction for -msse4.2 or -mabm. */
6278 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
6279 || TARGET_ABM_P (opts->x_ix86_isa_flags))
6280 opts->x_ix86_isa_flags
6281 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
6283 /* Enable lzcnt instruction for -mabm. */
6284 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
6285 opts->x_ix86_isa_flags
6286 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
6288 /* Disable BMI, BMI2 and TBM instructions for -m16. */
6289 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
6290 opts->x_ix86_isa_flags
6291 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
6292 & ~opts->x_ix86_isa_flags_explicit);
6294 /* Validate -mpreferred-stack-boundary= value or default it to
6295 PREFERRED_STACK_BOUNDARY_DEFAULT. */
6296 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
6297 if (opts_set->x_ix86_preferred_stack_boundary_arg)
6299 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
6300 int max = TARGET_SEH ? 4 : 12;
6302 if (opts->x_ix86_preferred_stack_boundary_arg < min
6303 || opts->x_ix86_preferred_stack_boundary_arg > max)
6305 if (min == max)
6306 error ("-mpreferred-stack-boundary is not supported "
6307 "for this target");
6308 else
6309 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
6310 opts->x_ix86_preferred_stack_boundary_arg, min, max);
6312 else
6313 ix86_preferred_stack_boundary
6314 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
6317 /* Set the default value for -mstackrealign. */
6318 if (opts->x_ix86_force_align_arg_pointer == -1)
6319 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
6321 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
6323 /* Validate -mincoming-stack-boundary= value or default it to
6324 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
6325 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
6326 if (opts_set->x_ix86_incoming_stack_boundary_arg)
6328 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
6330 if (opts->x_ix86_incoming_stack_boundary_arg < min
6331 || opts->x_ix86_incoming_stack_boundary_arg > 12)
6332 error ("-mincoming-stack-boundary=%d is not between %d and 12",
6333 opts->x_ix86_incoming_stack_boundary_arg, min);
6334 else
6336 ix86_user_incoming_stack_boundary
6337 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
6338 ix86_incoming_stack_boundary
6339 = ix86_user_incoming_stack_boundary;
6343 #ifndef NO_PROFILE_COUNTERS
6344 if (flag_nop_mcount)
6345 error ("-mnop-mcount is not compatible with this target");
6346 #endif
6347 if (flag_nop_mcount && flag_pic)
6348 error ("-mnop-mcount is not implemented for -fPIC");
6350 /* Accept -msseregparm only if at least SSE support is enabled. */
6351 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
6352 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
6353 error (main_args_p
6354 ? G_("%<-msseregparm%> used without SSE enabled")
6355 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
6357 if (opts_set->x_ix86_fpmath)
6359 if (opts->x_ix86_fpmath & FPMATH_SSE)
6361 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
6363 if (TARGET_80387_P (opts->x_target_flags))
6365 warning (0, "SSE instruction set disabled, using 387 arithmetics");
6366 opts->x_ix86_fpmath = FPMATH_387;
6369 else if ((opts->x_ix86_fpmath & FPMATH_387)
6370 && !TARGET_80387_P (opts->x_target_flags))
6372 warning (0, "387 instruction set disabled, using SSE arithmetics");
6373 opts->x_ix86_fpmath = FPMATH_SSE;
6377 /* For all chips supporting SSE2, -mfpmath=sse performs better than
6378 fpmath=387.  The latter is nevertheless the default on many targets, since the
6379 extra 80-bit precision of temporaries is considered to be part of the ABI.
6380 Overwrite the default at least for -ffast-math.
6381 TODO: -mfpmath=both seems to produce similarly performing code with slightly
6382 smaller binaries.  It is however not clear whether register allocation is
6383 ready for this setting.
6384 Also, -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
6385 codegen.  We may switch to 387 with -ffast-math for size-optimized
6386 functions. */
6387 else if (fast_math_flags_set_p (&global_options)
6388 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
6389 opts->x_ix86_fpmath = FPMATH_SSE;
6390 else
6391 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
6393 /* Use external vectorized library in vectorizing intrinsics. */
6394 if (opts_set->x_ix86_veclibabi_type)
6395 switch (opts->x_ix86_veclibabi_type)
6397 case ix86_veclibabi_type_svml:
6398 ix86_veclib_handler = ix86_veclibabi_svml;
6399 break;
6401 case ix86_veclibabi_type_acml:
6402 ix86_veclib_handler = ix86_veclibabi_acml;
6403 break;
6405 default:
6406 gcc_unreachable ();
6409 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
6410 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6411 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6413 /* If stack probes are required, the space used for large function
6414 arguments on the stack must also be probed, so enable
6415 -maccumulate-outgoing-args so this happens in the prologue. */
6416 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
6417 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6419 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6420 warning (0,
6421 main_args_p
6422 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
6423 "for correctness")
6424 : G_("stack probing requires "
6425 "%<target(\"accumulate-outgoing-args\")%> for "
6426 "correctness"));
6427 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6430 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
6431 so enable -maccumulate-outgoing-args when %ebp is fixed. */
6432 if (fixed_regs[BP_REG]
6433 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6435 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6436 warning (0,
6437 main_args_p
6438 ? G_("fixed ebp register requires "
6439 "%<-maccumulate-outgoing-args%>")
6440 : G_("fixed ebp register requires "
6441 "%<target(\"accumulate-outgoing-args\")%>"));
6442 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6445 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
6447 char *p;
6448 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
6449 p = strchr (internal_label_prefix, 'X');
6450 internal_label_prefix_len = p - internal_label_prefix;
6451 *p = '\0';
6454 /* When no scheduling description is available, disable the scheduler passes
6455 so they won't slow down compilation or make x87 code slower. */
6456 if (!TARGET_SCHEDULE)
6457 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
6459 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
6460 ix86_tune_cost->simultaneous_prefetches,
6461 opts->x_param_values,
6462 opts_set->x_param_values);
6463 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
6464 ix86_tune_cost->prefetch_block,
6465 opts->x_param_values,
6466 opts_set->x_param_values);
6467 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
6468 ix86_tune_cost->l1_cache_size,
6469 opts->x_param_values,
6470 opts_set->x_param_values);
6471 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
6472 ix86_tune_cost->l2_cache_size,
6473 opts->x_param_values,
6474 opts_set->x_param_values);
6476 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
6477 if (opts->x_flag_prefetch_loop_arrays < 0
6478 && HAVE_prefetch
6479 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
6480 && !opts->x_optimize_size
6481 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
6482 opts->x_flag_prefetch_loop_arrays = 1;
6484 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
6485 can be optimized to ap = __builtin_next_arg (0). */
6486 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
6487 targetm.expand_builtin_va_start = NULL;
6489 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6491 ix86_gen_leave = gen_leave_rex64;
6492 if (Pmode == DImode)
6494 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
6495 ix86_gen_tls_local_dynamic_base_64
6496 = gen_tls_local_dynamic_base_64_di;
6498 else
6500 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
6501 ix86_gen_tls_local_dynamic_base_64
6502 = gen_tls_local_dynamic_base_64_si;
6505 else
6506 ix86_gen_leave = gen_leave;
6508 if (Pmode == DImode)
6510 ix86_gen_add3 = gen_adddi3;
6511 ix86_gen_sub3 = gen_subdi3;
6512 ix86_gen_sub3_carry = gen_subdi3_carry;
6513 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
6514 ix86_gen_andsp = gen_anddi3;
6515 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
6516 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
6517 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
6518 ix86_gen_monitor = gen_sse3_monitor_di;
6519 ix86_gen_monitorx = gen_monitorx_di;
6520 ix86_gen_clzero = gen_clzero_di;
6522 else
6524 ix86_gen_add3 = gen_addsi3;
6525 ix86_gen_sub3 = gen_subsi3;
6526 ix86_gen_sub3_carry = gen_subsi3_carry;
6527 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6528 ix86_gen_andsp = gen_andsi3;
6529 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6530 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6531 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6532 ix86_gen_monitor = gen_sse3_monitor_si;
6533 ix86_gen_monitorx = gen_monitorx_si;
6534 ix86_gen_clzero = gen_clzero_si;
6537 #ifdef USE_IX86_CLD
6538 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6539 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6540 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6541 #endif
6543 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
6545 if (opts->x_flag_fentry > 0)
6546 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6547 "with -fpic");
6548 opts->x_flag_fentry = 0;
6550 else if (TARGET_SEH)
6552 if (opts->x_flag_fentry == 0)
6553 sorry ("-mno-fentry isn%'t compatible with SEH");
6554 opts->x_flag_fentry = 1;
6556 else if (opts->x_flag_fentry < 0)
6558 #if defined(PROFILE_BEFORE_PROLOGUE)
6559 opts->x_flag_fentry = 1;
6560 #else
6561 opts->x_flag_fentry = 0;
6562 #endif
6565 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
6566 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
6568 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6569 opts->x_target_flags |= MASK_VZEROUPPER;
6570 if (!(opts_set->x_target_flags & MASK_STV))
6571 opts->x_target_flags |= MASK_STV;
6572 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6573 -mincoming-stack-boundary={2,3} or -mstackrealign - the needed
6574 stack realignment would be an extra cost the pass doesn't take into
6575 account, and the pass can't realign the stack. */
6576 if (ix86_preferred_stack_boundary < 128
6577 || ix86_incoming_stack_boundary < 128
6578 || opts->x_ix86_force_align_arg_pointer)
6579 opts->x_target_flags &= ~MASK_STV;
6580 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6581 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6582 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6583 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6584 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6585 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6586 /* Enable 128-bit AVX instruction generation
6587 for the auto-vectorizer. */
6588 if (TARGET_AVX128_OPTIMAL
6589 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6590 opts->x_target_flags |= MASK_PREFER_AVX128;
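/* Parse -mrecip=.  The argument is a comma-separated list of names from the
   recip_options[] table ("default" selects everything); an entry may be
   prefixed with '!' to disable it, e.g. something like "all,!sqrt" (an
   illustrative value; the accepted names are whatever recip_options
   defines).  */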
6592 if (opts->x_ix86_recip_name)
6594 char *p = ASTRDUP (opts->x_ix86_recip_name);
6595 char *q;
6596 unsigned int mask, i;
6597 bool invert;
6599 while ((q = strtok (p, ",")) != NULL)
6601 p = NULL;
6602 if (*q == '!')
6604 invert = true;
6605 q++;
6607 else
6608 invert = false;
6610 if (!strcmp (q, "default"))
6611 mask = RECIP_MASK_ALL;
6612 else
6614 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6615 if (!strcmp (q, recip_options[i].string))
6617 mask = recip_options[i].mask;
6618 break;
6621 if (i == ARRAY_SIZE (recip_options))
6623 error ("unknown option for -mrecip=%s", q);
6624 invert = false;
6625 mask = RECIP_MASK_NONE;
6629 opts->x_recip_mask_explicit |= mask;
6630 if (invert)
6631 opts->x_recip_mask &= ~mask;
6632 else
6633 opts->x_recip_mask |= mask;
6637 if (TARGET_RECIP_P (opts->x_target_flags))
6638 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6639 else if (opts_set->x_target_flags & MASK_RECIP)
6640 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
6642 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6643 for 64-bit Bionic. Also default long double to 64-bit for Intel
6644 MCU psABI. */
6645 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6646 && !(opts_set->x_target_flags
6647 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6648 opts->x_target_flags |= (TARGET_64BIT
6649 ? MASK_LONG_DOUBLE_128
6650 : MASK_LONG_DOUBLE_64);
6652 /* Only one of them can be active. */
6653 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6654 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6656 /* Save the initial options in case the user uses function-specific
6657 options. */
6658 if (main_args_p)
6659 target_option_default_node = target_option_current_node
6660 = build_target_option_node (opts);
6662 /* Handle stack protector */
6663 if (!opts_set->x_ix86_stack_protector_guard)
6664 opts->x_ix86_stack_protector_guard
6665 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6667 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
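/* Both strategy strings are comma-separated "alg:max_size:dest_align"
   triplets, e.g. something like "libcall:-1:align" (an illustrative value;
   the accepted algorithm and alignment keywords are the ones recognized by
   ix86_parse_stringop_strategy_string).  */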
6668 if (opts->x_ix86_tune_memcpy_strategy)
6670 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6671 ix86_parse_stringop_strategy_string (str, false);
6672 free (str);
6675 if (opts->x_ix86_tune_memset_strategy)
6677 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6678 ix86_parse_stringop_strategy_string (str, true);
6679 free (str);
6682 return true;
6685 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6687 static void
6688 ix86_option_override (void)
6690 ix86_option_override_internal (true, &global_options, &global_options_set);
6693 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6694 static char *
6695 ix86_offload_options (void)
6697 if (TARGET_LP64)
6698 return xstrdup ("-foffload-abi=lp64");
6699 return xstrdup ("-foffload-abi=ilp32");
6702 /* Update register usage after having seen the compiler flags. */
6704 static void
6705 ix86_conditional_register_usage (void)
6707 int i, c_mask;
6709 /* If there are no caller-saved registers, preserve all registers
6710 except fixed_regs and registers used for the function return value,
6711 since aggregate_value_p checks call_used_regs[regno] on the return
6712 value. */
6713 if (cfun && cfun->machine->no_caller_saved_registers)
6714 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6715 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6716 call_used_regs[i] = 0;
6718 /* For 32-bit targets, squash the REX registers. */
6719 if (! TARGET_64BIT)
6721 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6722 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6723 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6724 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6725 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6726 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6729 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6730 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6732 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6734 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6736 /* Set/reset conditionally defined registers from
6737 CALL_USED_REGISTERS initializer. */
6738 if (call_used_regs[i] > 1)
6739 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6741 /* Calculate the CLOBBERED_REGS register set as the call-used
6742 registers from the GENERAL_REGS register set. */
6743 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6744 && call_used_regs[i])
6745 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6748 /* If MMX is disabled, squash the registers. */
6749 if (! TARGET_MMX)
6750 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6751 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6752 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6754 /* If SSE is disabled, squash the registers. */
6755 if (! TARGET_SSE)
6756 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6757 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6758 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6760 /* If the FPU is disabled, squash the registers. */
6761 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6762 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6763 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6764 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6766 /* If AVX512F is disabled, squash the registers. */
6767 if (! TARGET_AVX512F)
6769 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6770 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6772 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6773 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6776 /* If MPX is disabled, squash the registers. */
6777 if (! TARGET_MPX)
6778 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6779 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6783 /* Save the current options */
6785 static void
6786 ix86_function_specific_save (struct cl_target_option *ptr,
6787 struct gcc_options *opts)
6789 ptr->arch = ix86_arch;
6790 ptr->schedule = ix86_schedule;
6791 ptr->prefetch_sse = x86_prefetch_sse;
6792 ptr->tune = ix86_tune;
6793 ptr->branch_cost = ix86_branch_cost;
6794 ptr->tune_defaulted = ix86_tune_defaulted;
6795 ptr->arch_specified = ix86_arch_specified;
6796 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6797 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
6798 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6799 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6800 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6801 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6802 ptr->x_ix86_abi = opts->x_ix86_abi;
6803 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6804 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6805 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6806 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6807 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6808 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6809 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6810 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6811 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6812 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6813 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6814 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6815 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6816 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6817 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6818 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6819 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6820 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6821 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6822 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6824 /* The fields are char but the variables are not; make sure the
6825 values fit in the fields. */
6826 gcc_assert (ptr->arch == ix86_arch);
6827 gcc_assert (ptr->schedule == ix86_schedule);
6828 gcc_assert (ptr->tune == ix86_tune);
6829 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6832 /* Restore the current options */
6834 static void
6835 ix86_function_specific_restore (struct gcc_options *opts,
6836 struct cl_target_option *ptr)
6838 enum processor_type old_tune = ix86_tune;
6839 enum processor_type old_arch = ix86_arch;
6840 unsigned int ix86_arch_mask;
6841 int i;
6843 /* We don't change -fPIC. */
6844 opts->x_flag_pic = flag_pic;
6846 ix86_arch = (enum processor_type) ptr->arch;
6847 ix86_schedule = (enum attr_cpu) ptr->schedule;
6848 ix86_tune = (enum processor_type) ptr->tune;
6849 x86_prefetch_sse = ptr->prefetch_sse;
6850 opts->x_ix86_branch_cost = ptr->branch_cost;
6851 ix86_tune_defaulted = ptr->tune_defaulted;
6852 ix86_arch_specified = ptr->arch_specified;
6853 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6854 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
6855 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6856 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6857 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6858 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6859 opts->x_ix86_abi = ptr->x_ix86_abi;
6860 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6861 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6862 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6863 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6864 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6865 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6866 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6867 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6868 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6869 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6870 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6871 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6872 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6873 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6874 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6875 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6876 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6877 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6878 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6879 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6880 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6881 /* TODO: ix86_cost should be chosen at instruction or function granularity,
6882 so that for cold code we use size_cost even in !optimize_size compilation. */
6883 if (opts->x_optimize_size)
6884 ix86_cost = &ix86_size_cost;
6885 else
6886 ix86_cost = ix86_tune_cost;
6888 /* Recreate the arch feature tests if the arch changed */
6889 if (old_arch != ix86_arch)
6891 ix86_arch_mask = 1u << ix86_arch;
6892 for (i = 0; i < X86_ARCH_LAST; ++i)
6893 ix86_arch_features[i]
6894 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6897 /* Recreate the tune optimization tests */
6898 if (old_tune != ix86_tune)
6899 set_ix86_tune_features (ix86_tune, false);
6902 /* Adjust target options after streaming them in. This is mainly about
6903 reconciling them with global options. */
6905 static void
6906 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6908 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
6909 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6910 for PIC, or error out. */
6911 if (flag_pic)
6912 switch (ptr->x_ix86_cmodel)
6914 case CM_SMALL:
6915 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6916 break;
6918 case CM_MEDIUM:
6919 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6920 break;
6922 case CM_LARGE:
6923 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6924 break;
6926 case CM_KERNEL:
6927 error ("code model %s does not support PIC mode", "kernel");
6928 break;
6930 default:
6931 break;
6933 else
6934 switch (ptr->x_ix86_cmodel)
6936 case CM_SMALL_PIC:
6937 ptr->x_ix86_cmodel = CM_SMALL;
6938 break;
6940 case CM_MEDIUM_PIC:
6941 ptr->x_ix86_cmodel = CM_MEDIUM;
6942 break;
6944 case CM_LARGE_PIC:
6945 ptr->x_ix86_cmodel = CM_LARGE;
6946 break;
6948 default:
6949 break;
6953 /* Print the current options */
6955 static void
6956 ix86_function_specific_print (FILE *file, int indent,
6957 struct cl_target_option *ptr)
6959 char *target_string
6960 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
6961 ptr->x_target_flags, ptr->x_ix86_target_flags,
6962 NULL, NULL, ptr->x_ix86_fpmath, false);
6964 gcc_assert (ptr->arch < PROCESSOR_max);
6965 fprintf (file, "%*sarch = %d (%s)\n",
6966 indent, "",
6967 ptr->arch, processor_target_table[ptr->arch].name);
6969 gcc_assert (ptr->tune < PROCESSOR_max);
6970 fprintf (file, "%*stune = %d (%s)\n",
6971 indent, "",
6972 ptr->tune, processor_target_table[ptr->tune].name);
6974 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
6976 if (target_string)
6978 fprintf (file, "%*s%s\n", indent, "", target_string);
6979 free (target_string);
6984 /* Inner function to process the attribute((target(...))), take an argument and
6985 set the current options from the argument. If we have a list, recursively go
6986 over the list. */
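/* For example, __attribute__((target ("avx2,no-fma"))) enables AVX2 and
   disables FMA for just the annotated function (an illustrative use; the
   accepted names are the entries of the attrs[] table below, optionally
   prefixed with "no-").  */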
6988 static bool
6989 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
6990 struct gcc_options *opts,
6991 struct gcc_options *opts_set,
6992 struct gcc_options *enum_opts_set)
6994 char *next_optstr;
6995 bool ret = true;
6997 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
6998 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
6999 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
7000 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
7001 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
7003 enum ix86_opt_type
7005 ix86_opt_unknown,
7006 ix86_opt_yes,
7007 ix86_opt_no,
7008 ix86_opt_str,
7009 ix86_opt_enum,
7010 ix86_opt_isa
7013 static const struct
7015 const char *string;
7016 size_t len;
7017 enum ix86_opt_type type;
7018 int opt;
7019 int mask;
7020 } attrs[] = {
7021 /* isa options */
7022 IX86_ATTR_ISA ("sgx", OPT_msgx),
7023 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
7024 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
7025 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
7027 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
7028 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
7029 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
7030 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
7031 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
7032 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
7033 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
7034 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
7035 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
7036 IX86_ATTR_ISA ("avx2", OPT_mavx2),
7037 IX86_ATTR_ISA ("fma", OPT_mfma),
7038 IX86_ATTR_ISA ("xop", OPT_mxop),
7039 IX86_ATTR_ISA ("fma4", OPT_mfma4),
7040 IX86_ATTR_ISA ("f16c", OPT_mf16c),
7041 IX86_ATTR_ISA ("avx", OPT_mavx),
7042 IX86_ATTR_ISA ("sse4", OPT_msse4),
7043 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
7044 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
7045 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
7046 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
7047 IX86_ATTR_ISA ("sse3", OPT_msse3),
7048 IX86_ATTR_ISA ("aes", OPT_maes),
7049 IX86_ATTR_ISA ("sha", OPT_msha),
7050 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
7051 IX86_ATTR_ISA ("sse2", OPT_msse2),
7052 IX86_ATTR_ISA ("sse", OPT_msse),
7053 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
7054 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
7055 IX86_ATTR_ISA ("mmx", OPT_mmmx),
7056 IX86_ATTR_ISA ("rtm", OPT_mrtm),
7057 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
7058 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
7059 IX86_ATTR_ISA ("adx", OPT_madx),
7060 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
7061 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
7062 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
7063 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
7064 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
7065 IX86_ATTR_ISA ("xsave", OPT_mxsave),
7066 IX86_ATTR_ISA ("abm", OPT_mabm),
7067 IX86_ATTR_ISA ("bmi", OPT_mbmi),
7068 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
7069 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
7070 IX86_ATTR_ISA ("tbm", OPT_mtbm),
7071 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
7072 IX86_ATTR_ISA ("cx16", OPT_mcx16),
7073 IX86_ATTR_ISA ("sahf", OPT_msahf),
7074 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
7075 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
7076 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
7077 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
7078 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
7079 IX86_ATTR_ISA ("clzero", OPT_mclzero),
7080 IX86_ATTR_ISA ("pku", OPT_mpku),
7081 IX86_ATTR_ISA ("lwp", OPT_mlwp),
7082 IX86_ATTR_ISA ("hle", OPT_mhle),
7083 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
7084 IX86_ATTR_ISA ("mpx", OPT_mmpx),
7085 IX86_ATTR_ISA ("clwb", OPT_mclwb),
7086 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
7088 /* enum options */
7089 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
7091 /* string options */
7092 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
7093 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
7095 /* flag options */
7096 IX86_ATTR_YES ("cld",
7097 OPT_mcld,
7098 MASK_CLD),
7100 IX86_ATTR_NO ("fancy-math-387",
7101 OPT_mfancy_math_387,
7102 MASK_NO_FANCY_MATH_387),
7104 IX86_ATTR_YES ("ieee-fp",
7105 OPT_mieee_fp,
7106 MASK_IEEE_FP),
7108 IX86_ATTR_YES ("inline-all-stringops",
7109 OPT_minline_all_stringops,
7110 MASK_INLINE_ALL_STRINGOPS),
7112 IX86_ATTR_YES ("inline-stringops-dynamically",
7113 OPT_minline_stringops_dynamically,
7114 MASK_INLINE_STRINGOPS_DYNAMICALLY),
7116 IX86_ATTR_NO ("align-stringops",
7117 OPT_mno_align_stringops,
7118 MASK_NO_ALIGN_STRINGOPS),
7120 IX86_ATTR_YES ("recip",
7121 OPT_mrecip,
7122 MASK_RECIP),
7126 /* If this is a list, recurse to get the options. */
7127 if (TREE_CODE (args) == TREE_LIST)
7129 bool ret = true;
7131 for (; args; args = TREE_CHAIN (args))
7132 if (TREE_VALUE (args)
7133 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
7134 p_strings, opts, opts_set,
7135 enum_opts_set))
7136 ret = false;
7138 return ret;
7141 else if (TREE_CODE (args) != STRING_CST)
7143 error ("attribute %<target%> argument not a string");
7144 return false;
7147 /* Handle multiple arguments separated by commas. */
7148 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
7150 while (next_optstr && *next_optstr != '\0')
7152 char *p = next_optstr;
7153 char *orig_p = p;
7154 char *comma = strchr (next_optstr, ',');
7155 const char *opt_string;
7156 size_t len, opt_len;
7157 int opt;
7158 bool opt_set_p;
7159 char ch;
7160 unsigned i;
7161 enum ix86_opt_type type = ix86_opt_unknown;
7162 int mask = 0;
7164 if (comma)
7166 *comma = '\0';
7167 len = comma - next_optstr;
7168 next_optstr = comma + 1;
7170 else
7172 len = strlen (p);
7173 next_optstr = NULL;
7176 /* Recognize no-xxx. */
7177 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
7179 opt_set_p = false;
7180 p += 3;
7181 len -= 3;
7183 else
7184 opt_set_p = true;
7186 /* Find the option. */
7187 ch = *p;
7188 opt = N_OPTS;
7189 for (i = 0; i < ARRAY_SIZE (attrs); i++)
7191 type = attrs[i].type;
7192 opt_len = attrs[i].len;
7193 if (ch == attrs[i].string[0]
7194 && ((type != ix86_opt_str && type != ix86_opt_enum)
7195 ? len == opt_len
7196 : len > opt_len)
7197 && memcmp (p, attrs[i].string, opt_len) == 0)
7199 opt = attrs[i].opt;
7200 mask = attrs[i].mask;
7201 opt_string = attrs[i].string;
7202 break;
7206 /* Process the option. */
7207 if (opt == N_OPTS)
7209 error ("attribute(target(\"%s\")) is unknown", orig_p);
7210 ret = false;
7213 else if (type == ix86_opt_isa)
7215 struct cl_decoded_option decoded;
7217 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
7218 ix86_handle_option (opts, opts_set,
7219 &decoded, input_location);
7222 else if (type == ix86_opt_yes || type == ix86_opt_no)
7224 if (type == ix86_opt_no)
7225 opt_set_p = !opt_set_p;
7227 if (opt_set_p)
7228 opts->x_target_flags |= mask;
7229 else
7230 opts->x_target_flags &= ~mask;
7233 else if (type == ix86_opt_str)
7235 if (p_strings[opt])
7237 error ("option(\"%s\") was already specified", opt_string);
7238 ret = false;
7240 else
7241 p_strings[opt] = xstrdup (p + opt_len);
7244 else if (type == ix86_opt_enum)
7246 bool arg_ok;
7247 int value;
7249 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
7250 if (arg_ok)
7251 set_option (opts, enum_opts_set, opt, value,
7252 p + opt_len, DK_UNSPECIFIED, input_location,
7253 global_dc);
7254 else
7256 error ("attribute(target(\"%s\")) is unknown", orig_p);
7257 ret = false;
7261 else
7262 gcc_unreachable ();
7265 return ret;
7268 /* Release allocated strings. */
7269 static void
7270 release_options_strings (char **option_strings)
7272 /* Free up memory allocated to hold the strings */
7273 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
7274 free (option_strings[i]);
7277 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
7279 tree
7280 ix86_valid_target_attribute_tree (tree args,
7281 struct gcc_options *opts,
7282 struct gcc_options *opts_set)
7284 const char *orig_arch_string = opts->x_ix86_arch_string;
7285 const char *orig_tune_string = opts->x_ix86_tune_string;
7286 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
7287 int orig_tune_defaulted = ix86_tune_defaulted;
7288 int orig_arch_specified = ix86_arch_specified;
7289 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
7290 tree t = NULL_TREE;
7291 struct cl_target_option *def
7292 = TREE_TARGET_OPTION (target_option_default_node);
7293 struct gcc_options enum_opts_set;
7295 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
7297 /* Process each of the options on the chain. */
7298 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
7299 opts_set, &enum_opts_set))
7300 return error_mark_node;
7302 /* If the changed options are different from the default, rerun
7303 ix86_option_override_internal, and then save the options away.
7304 The string options are attribute options, and will be undone
7305 when we copy the save structure. */
7306 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
7307 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
7308 || opts->x_target_flags != def->x_target_flags
7309 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
7310 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
7311 || enum_opts_set.x_ix86_fpmath)
7313 /* If we are using the default tune= or arch=, undo the string assigned,
7314 and use the default. */
7315 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
7317 opts->x_ix86_arch_string
7318 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
7320 /* If arch= is set, clear all bits in x_ix86_isa_flags,
7321 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
7322 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
7323 | OPTION_MASK_ABI_64
7324 | OPTION_MASK_ABI_X32
7325 | OPTION_MASK_CODE16);
7326 opts->x_ix86_isa_flags2 = 0;
7328 else if (!orig_arch_specified)
7329 opts->x_ix86_arch_string = NULL;
7331 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
7332 opts->x_ix86_tune_string
7333 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
7334 else if (orig_tune_defaulted)
7335 opts->x_ix86_tune_string = NULL;
7337 /* If fpmath= is not set, and we now have SSE on 32-bit, use it. */
7338 if (enum_opts_set.x_ix86_fpmath)
7339 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
7340 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
7341 && TARGET_SSE_P (opts->x_ix86_isa_flags))
7343 if (TARGET_80387_P (opts->x_target_flags))
7344 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE
7345 | FPMATH_387);
7346 else
7347 opts->x_ix86_fpmath = (enum fpmath_unit) FPMATH_SSE;
7348 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
7351 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
7352 bool r = ix86_option_override_internal (false, opts, opts_set);
7353 if (!r)
7355 release_options_strings (option_strings);
7356 return error_mark_node;
7359 /* Add any builtin functions with the new isa if any. */
7360 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
7362 /* Save the current options unless we are validating options for
7363 #pragma. */
7364 t = build_target_option_node (opts);
7366 opts->x_ix86_arch_string = orig_arch_string;
7367 opts->x_ix86_tune_string = orig_tune_string;
7368 opts_set->x_ix86_fpmath = orig_fpmath_set;
7370 release_options_strings (option_strings);
7373 return t;
7376 /* Hook to validate attribute((target("string"))). */
7378 static bool
7379 ix86_valid_target_attribute_p (tree fndecl,
7380 tree ARG_UNUSED (name),
7381 tree args,
7382 int ARG_UNUSED (flags))
7384 struct gcc_options func_options;
7385 tree new_target, new_optimize;
7386 bool ret = true;
7388 /* attribute((target("default"))) does nothing, beyond
7389 affecting multi-versioning. */
7390 if (TREE_VALUE (args)
7391 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
7392 && TREE_CHAIN (args) == NULL_TREE
7393 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
7394 return true;
7396 tree old_optimize = build_optimization_node (&global_options);
7398 /* Get the optimization options of the current function. */
7399 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
7401 if (!func_optimize)
7402 func_optimize = old_optimize;
7404 /* Init func_options. */
7405 memset (&func_options, 0, sizeof (func_options));
7406 init_options_struct (&func_options, NULL);
7407 lang_hooks.init_options_struct (&func_options);
7409 cl_optimization_restore (&func_options,
7410 TREE_OPTIMIZATION (func_optimize));
7412 /* Initialize func_options to the default before its target options can
7413 be set. */
7414 cl_target_option_restore (&func_options,
7415 TREE_TARGET_OPTION (target_option_default_node));
7417 new_target = ix86_valid_target_attribute_tree (args, &func_options,
7418 &global_options_set);
7420 new_optimize = build_optimization_node (&func_options);
7422 if (new_target == error_mark_node)
7423 ret = false;
7425 else if (fndecl && new_target)
7427 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
7429 if (old_optimize != new_optimize)
7430 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
7433 finalize_options_struct (&func_options);
7435 return ret;
7439 /* Hook to determine if one function can safely inline another. */
7441 static bool
7442 ix86_can_inline_p (tree caller, tree callee)
7444 bool ret = false;
7445 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
7446 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
7448 /* If callee has no option attributes, then it is ok to inline. */
7449 if (!callee_tree)
7450 ret = true;
7452 /* If caller has no option attributes, but callee does then it is not ok to
7453 inline. */
7454 else if (!caller_tree)
7455 ret = false;
7457 else
7459 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
7460 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
7462 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
7463 function can inline an SSE2 function, but an SSE2 function can't inline
7464 an SSE4 function. */
7465 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
7466 != callee_opts->x_ix86_isa_flags)
7467 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
7468 != callee_opts->x_ix86_isa_flags2))
7469 ret = false;
7471 /* See if we have the same non-isa options. */
7472 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
7473 ret = false;
7475 /* See if arch, tune, etc. are the same. */
7476 else if (caller_opts->arch != callee_opts->arch)
7477 ret = false;
7479 else if (caller_opts->tune != callee_opts->tune)
7480 ret = false;
7482 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
7483 ret = false;
7485 else if (caller_opts->branch_cost != callee_opts->branch_cost)
7486 ret = false;
7488 else
7489 ret = true;
7492 return ret;
7496 /* Remember the last target of ix86_set_current_function. */
7497 static GTY(()) tree ix86_previous_fndecl;
7499 /* Set target globals to the default (or to the current #pragma GCC target
7500 if active). Invalidate the ix86_previous_fndecl cache. */
7502 void
7503 ix86_reset_previous_fndecl (void)
7505 tree new_tree = target_option_current_node;
7506 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7507 if (TREE_TARGET_GLOBALS (new_tree))
7508 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7509 else if (new_tree == target_option_default_node)
7510 restore_target_globals (&default_target_globals);
7511 else
7512 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7513 ix86_previous_fndecl = NULL_TREE;
7516 /* Set the func_type field from the function FNDECL. */
7518 static void
7519 ix86_set_func_type (tree fndecl)
7521 if (cfun->machine->func_type == TYPE_UNKNOWN)
7523 if (lookup_attribute ("interrupt",
7524 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7526 if (ix86_function_naked (fndecl))
7527 error_at (DECL_SOURCE_LOCATION (fndecl),
7528 "interrupt and naked attributes are not compatible");
7530 int nargs = 0;
7531 for (tree arg = DECL_ARGUMENTS (fndecl);
7532 arg;
7533 arg = TREE_CHAIN (arg))
7534 nargs++;
7535 cfun->machine->no_caller_saved_registers = true;
7536 cfun->machine->func_type
7537 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7539 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7541 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7542 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7543 sorry ("Only DWARF debug format is supported for interrupt "
7544 "service routine.");
7546 else
7548 cfun->machine->func_type = TYPE_NORMAL;
7549 if (lookup_attribute ("no_caller_saved_registers",
7550 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7551 cfun->machine->no_caller_saved_registers = true;
7556 /* Establish appropriate back-end context for processing the function
7557 FNDECL. The argument might be NULL to indicate processing at top
7558 level, outside of any function scope. */
7559 static void
7560 ix86_set_current_function (tree fndecl)
7562 /* Only change the context if the function changes. This hook is called
7563 several times in the course of compiling a function, and we don't want to
7564 slow things down too much or call target_reinit when it isn't safe. */
7565 if (fndecl == ix86_previous_fndecl)
7567 /* There may be 2 function bodies for the same function FNDECL,
7568 one is extern inline and one isn't. Call ix86_set_func_type
7569 to set the func_type field. */
7570 if (fndecl != NULL_TREE)
7571 ix86_set_func_type (fndecl);
7572 return;
7575 tree old_tree;
7576 if (ix86_previous_fndecl == NULL_TREE)
7577 old_tree = target_option_current_node;
7578 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7579 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7580 else
7581 old_tree = target_option_default_node;
7583 if (fndecl == NULL_TREE)
7585 if (old_tree != target_option_current_node)
7586 ix86_reset_previous_fndecl ();
7587 return;
7590 ix86_set_func_type (fndecl);
7592 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7593 if (new_tree == NULL_TREE)
7594 new_tree = target_option_default_node;
7596 if (old_tree != new_tree)
7598 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7599 if (TREE_TARGET_GLOBALS (new_tree))
7600 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7601 else if (new_tree == target_option_default_node)
7602 restore_target_globals (&default_target_globals);
7603 else
7604 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7606 ix86_previous_fndecl = fndecl;
7608 static bool prev_no_caller_saved_registers;
7610 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7611 Avoid expensive re-initialization of init_regs each time we switch
7612 function context. */
7613 if (TARGET_64BIT
7614 && (call_used_regs[SI_REG]
7615 == (cfun->machine->call_abi == MS_ABI)))
7616 reinit_regs ();
7617 /* Need to re-initialize init_regs if caller-saved registers are
7618 changed. */
7619 else if (prev_no_caller_saved_registers
7620 != cfun->machine->no_caller_saved_registers)
7621 reinit_regs ();
7623 if (cfun->machine->func_type != TYPE_NORMAL
7624 || cfun->machine->no_caller_saved_registers)
7626 /* Don't allow MPX, SSE, MMX or x87 instructions since they
7627 may change the processor state. */
7628 const char *isa;
7629 if (TARGET_MPX)
7630 isa = "MPX";
7631 else if (TARGET_SSE)
7632 isa = "SSE";
7633 else if (TARGET_MMX)
7634 isa = "MMX/3Dnow";
7635 else if (TARGET_80387)
7636 isa = "80387";
7637 else
7638 isa = NULL;
7639 if (isa != NULL)
7641 if (cfun->machine->func_type != TYPE_NORMAL)
7642 sorry ("%s instructions aren't allowed in %s service routine",
7643 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7644 ? "exception" : "interrupt"));
7645 else
7646 sorry ("%s instructions aren't allowed in function with "
7647 "no_caller_saved_registers attribute", isa);
7648 /* Don't issue the same error twice. */
7649 cfun->machine->func_type = TYPE_NORMAL;
7650 cfun->machine->no_caller_saved_registers = false;
7654 prev_no_caller_saved_registers
7655 = cfun->machine->no_caller_saved_registers;
7659 /* Return true if this goes in large data/bss. */
7661 static bool
7662 ix86_in_large_data_p (tree exp)
7664 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7665 return false;
7667 if (exp == NULL_TREE)
7668 return false;
7670 /* Functions are never large data. */
7671 if (TREE_CODE (exp) == FUNCTION_DECL)
7672 return false;
7674 /* Automatic variables are never large data. */
7675 if (VAR_P (exp) && !is_global_var (exp))
7676 return false;
7678 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
7680 const char *section = DECL_SECTION_NAME (exp);
7681 if (strcmp (section, ".ldata") == 0
7682 || strcmp (section, ".lbss") == 0)
7683 return true;
7684 return false;
7686 else
7688 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7690 /* If this is an incomplete type with size 0, then we can't put it
7691 in data because it might be too big when completed. Also,
7692 int_size_in_bytes returns -1 if the size can vary or is larger than
7693 an integer, in which case it is also safer to assume that it goes in
7694 large data. */
7695 if (size <= 0 || size > ix86_section_threshold)
7696 return true;
7699 return false;
7702 /* i386-specific section flag to mark large sections. */
7703 #define SECTION_LARGE SECTION_MACH_DEP
7705 /* Switch to the appropriate section for output of DECL.
7706 DECL is either a `VAR_DECL' node or a constant of some sort.
7707 RELOC indicates whether forming the initial value of DECL requires
7708 link-time relocations. */
7710 ATTRIBUTE_UNUSED static section *
7711 x86_64_elf_select_section (tree decl, int reloc,
7712 unsigned HOST_WIDE_INT align)
7714 if (ix86_in_large_data_p (decl))
7716 const char *sname = NULL;
7717 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7718 switch (categorize_decl_for_section (decl, reloc))
7720 case SECCAT_DATA:
7721 sname = ".ldata";
7722 break;
7723 case SECCAT_DATA_REL:
7724 sname = ".ldata.rel";
7725 break;
7726 case SECCAT_DATA_REL_LOCAL:
7727 sname = ".ldata.rel.local";
7728 break;
7729 case SECCAT_DATA_REL_RO:
7730 sname = ".ldata.rel.ro";
7731 break;
7732 case SECCAT_DATA_REL_RO_LOCAL:
7733 sname = ".ldata.rel.ro.local";
7734 break;
7735 case SECCAT_BSS:
7736 sname = ".lbss";
7737 flags |= SECTION_BSS;
7738 break;
7739 case SECCAT_RODATA:
7740 case SECCAT_RODATA_MERGE_STR:
7741 case SECCAT_RODATA_MERGE_STR_INIT:
7742 case SECCAT_RODATA_MERGE_CONST:
7743 sname = ".lrodata";
7744 flags &= ~SECTION_WRITE;
7745 break;
7746 case SECCAT_SRODATA:
7747 case SECCAT_SDATA:
7748 case SECCAT_SBSS:
7749 gcc_unreachable ();
7750 case SECCAT_TEXT:
7751 case SECCAT_TDATA:
7752 case SECCAT_TBSS:
7753 /* We don't split these for the medium model. Place them into
7754 default sections and hope for the best. */
7755 break;
7757 if (sname)
7759 /* We might get called with string constants, but get_named_section
7760 doesn't like them as they are not DECLs. Also, we need to set
7761 flags in that case. */
7762 if (!DECL_P (decl))
7763 return get_section (sname, flags, NULL);
7764 return get_named_section (decl, sname, reloc);
7767 return default_elf_select_section (decl, reloc, align);
7770 /* Select a set of attributes for section NAME based on the properties
7771 of DECL and whether or not RELOC indicates that DECL's initializer
7772 might contain runtime relocations. */
7774 static unsigned int ATTRIBUTE_UNUSED
7775 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7777 unsigned int flags = default_section_type_flags (decl, name, reloc);
7779 if (ix86_in_large_data_p (decl))
7780 flags |= SECTION_LARGE;
7782 if (decl == NULL_TREE
7783 && (strcmp (name, ".ldata.rel.ro") == 0
7784 || strcmp (name, ".ldata.rel.ro.local") == 0))
7785 flags |= SECTION_RELRO;
7787 if (strcmp (name, ".lbss") == 0
7788 || strncmp (name, ".lbss.", 6) == 0
7789 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
7790 flags |= SECTION_BSS;
7792 return flags;
7795 /* Build up a unique section name, expressed as a
7796 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7797 RELOC indicates whether the initial value of EXP requires
7798 link-time relocations. */
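/* For instance, a large-model data object "foo" would normally get a
   section named ".ldata.foo", or ".gnu.linkonce.ld.foo" when it must be
   one-only without COMDAT group support (illustrative names derived from
   the prefix choices below).  */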
7800 static void ATTRIBUTE_UNUSED
7801 x86_64_elf_unique_section (tree decl, int reloc)
7803 if (ix86_in_large_data_p (decl))
7805 const char *prefix = NULL;
7806 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7807 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7809 switch (categorize_decl_for_section (decl, reloc))
7811 case SECCAT_DATA:
7812 case SECCAT_DATA_REL:
7813 case SECCAT_DATA_REL_LOCAL:
7814 case SECCAT_DATA_REL_RO:
7815 case SECCAT_DATA_REL_RO_LOCAL:
7816 prefix = one_only ? ".ld" : ".ldata";
7817 break;
7818 case SECCAT_BSS:
7819 prefix = one_only ? ".lb" : ".lbss";
7820 break;
7821 case SECCAT_RODATA:
7822 case SECCAT_RODATA_MERGE_STR:
7823 case SECCAT_RODATA_MERGE_STR_INIT:
7824 case SECCAT_RODATA_MERGE_CONST:
7825 prefix = one_only ? ".lr" : ".lrodata";
7826 break;
7827 case SECCAT_SRODATA:
7828 case SECCAT_SDATA:
7829 case SECCAT_SBSS:
7830 gcc_unreachable ();
7831 case SECCAT_TEXT:
7832 case SECCAT_TDATA:
7833 case SECCAT_TBSS:
7834 /* We don't split these for the medium model. Place them into
7835 default sections and hope for the best. */
7836 break;
7838 if (prefix)
7840 const char *name, *linkonce;
7841 char *string;
7843 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7844 name = targetm.strip_name_encoding (name);
7846 /* If we're using one_only, then there needs to be a .gnu.linkonce
7847 prefix to the section name. */
7848 linkonce = one_only ? ".gnu.linkonce" : "";
7850 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7852 set_decl_section_name (decl, string);
7853 return;
7856 default_unique_section (decl, reloc);
7859 #ifdef COMMON_ASM_OP
7861 #ifndef LARGECOMM_SECTION_ASM_OP
7862 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7863 #endif
7865 /* This says how to output assembler code to declare an
7866 uninitialized external linkage data object.
7868 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP
7869 directive for large objects. */
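/* A sketch of the output for a large object: a line such as
   "\t.largecomm\tfoo,4194304,32" instead of the usual ".comm" form
   (the exact directive text comes from LARGECOMM_SECTION_ASM_OP and the
   fprintf below; "foo" is a hypothetical symbol).  */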
7870 void
7871 x86_elf_aligned_decl_common (FILE *file, tree decl,
7872 const char *name, unsigned HOST_WIDE_INT size,
7873 int align)
7875 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7876 && size > (unsigned int)ix86_section_threshold)
7878 switch_to_section (get_named_section (decl, ".lbss", 0));
7879 fputs (LARGECOMM_SECTION_ASM_OP, file);
7881 else
7882 fputs (COMMON_ASM_OP, file);
7883 assemble_name (file, name);
7884 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7885 size, align / BITS_PER_UNIT);
7887 #endif
7889 /* Utility function for targets to use in implementing
7890 ASM_OUTPUT_ALIGNED_BSS. */
7892 void
7893 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7894 unsigned HOST_WIDE_INT size, int align)
7896 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7897 && size > (unsigned int)ix86_section_threshold)
7898 switch_to_section (get_named_section (decl, ".lbss", 0));
7899 else
7900 switch_to_section (bss_section);
7901 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7902 #ifdef ASM_DECLARE_OBJECT_NAME
7903 last_assemble_variable_decl = decl;
7904 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7905 #else
7906 /* The standard thing is to just output a label for the object. */
7907 ASM_OUTPUT_LABEL (file, name);
7908 #endif /* ASM_DECLARE_OBJECT_NAME */
7909 ASM_OUTPUT_SKIP (file, size ? size : 1);
7912 /* Decide whether we must probe the stack before any space allocation
7913 on this target. It's essentially TARGET_STACK_PROBE except when
7914 -fstack-check causes the stack to be already probed differently. */
7916 bool
7917 ix86_target_stack_probe (void)
7919 /* Do not probe the stack twice if static stack checking is enabled. */
7920 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7921 return false;
7923 return TARGET_STACK_PROBE;
7926 /* Decide whether we can make a sibling call to a function. DECL is the
7927 declaration of the function being targeted by the call and EXP is the
7928 CALL_EXPR representing the call. */
7930 static bool
7931 ix86_function_ok_for_sibcall (tree decl, tree exp)
7933 tree type, decl_or_type;
7934 rtx a, b;
7935 bool bind_global = decl && !targetm.binds_local_p (decl);
7937 if (ix86_function_naked (current_function_decl))
7938 return false;
7940 /* Sibling call isn't OK if there are no caller-saved registers
7941 since all registers must be preserved before return. */
7942 if (cfun->machine->no_caller_saved_registers)
7943 return false;
7945 /* If we are generating position-independent code, we cannot sibcall
7946 optimize direct calls to global functions, as the PLT requires
7947 %ebx be live. (Darwin does not have a PLT.) */
7948 if (!TARGET_MACHO
7949 && !TARGET_64BIT
7950 && flag_pic
7951 && flag_plt
7952 && bind_global)
7953 return false;
7955 /* If we need to align the outgoing stack, then sibcalling would
7956 unalign the stack, which may break the called function. */
7957 if (ix86_minimum_incoming_stack_boundary (true)
7958 < PREFERRED_STACK_BOUNDARY)
7959 return false;
7961 if (decl)
7963 decl_or_type = decl;
7964 type = TREE_TYPE (decl);
7966 else
7968 /* We're looking at the CALL_EXPR, we need the type of the function. */
7969 type = CALL_EXPR_FN (exp); /* pointer expression */
7970 type = TREE_TYPE (type); /* pointer type */
7971 type = TREE_TYPE (type); /* function type */
7972 decl_or_type = type;
7975 /* Check that the return value locations are the same. For example,
7976 if we are returning floats on the 80387 register stack, we cannot
7977 make a sibcall from a function that doesn't return a float to a
7978 function that does or, conversely, from a function that does return
7979 a float to a function that doesn't; the necessary stack adjustment
7980 would not be executed. This is also the place we notice
7981 differences in the return value ABI. Note that it is ok for one
7982 of the functions to have void return type as long as the return
7983 value of the other is passed in a register. */
7984 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
7985 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
7986 cfun->decl, false);
7987 if (STACK_REG_P (a) || STACK_REG_P (b))
7989 if (!rtx_equal_p (a, b))
7990 return false;
7992 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
7994 else if (!rtx_equal_p (a, b))
7995 return false;
7997 if (TARGET_64BIT)
7999 /* The SYSV ABI has more call-clobbered registers;
8000 disallow sibcalls from MS to SYSV. */
8001 if (cfun->machine->call_abi == MS_ABI
8002 && ix86_function_type_abi (type) == SYSV_ABI)
8003 return false;
8005 else
8007 /* If this call is indirect, we'll need to be able to use a
8008 call-clobbered register for the address of the target function.
8009 Make sure that all such registers are not used for passing
8010 parameters. Note that DLLIMPORT functions and calls to global
8011 functions via the GOT slot are indirect. */
8012 if (!decl
8013 || (bind_global && flag_pic && !flag_plt)
8014 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
8016 /* Check if regparm >= 3 since arg_reg_available is set to
8017 false if regparm == 0. If regparm is 1 or 2, there is
8018 always a call-clobbered register available.
8020 ??? The symbol indirect call doesn't need a call-clobbered
8021 register. But we don't know if this is a symbol indirect
8022 call or not here. */
8023 if (ix86_function_regparm (type, NULL) >= 3
8024 && !cfun->machine->arg_reg_available)
8025 return false;
8029 /* Otherwise okay. That also includes certain types of indirect calls. */
8030 return true;
8033 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
8034 and "sseregparm" calling convention attributes;
8035 arguments as in struct attribute_spec.handler. */
8037 static tree
8038 ix86_handle_cconv_attribute (tree *node, tree name,
8039 tree args,
8040 int,
8041 bool *no_add_attrs)
8043 if (TREE_CODE (*node) != FUNCTION_TYPE
8044 && TREE_CODE (*node) != METHOD_TYPE
8045 && TREE_CODE (*node) != FIELD_DECL
8046 && TREE_CODE (*node) != TYPE_DECL)
8048 warning (OPT_Wattributes, "%qE attribute only applies to functions",
8049 name);
8050 *no_add_attrs = true;
8051 return NULL_TREE;
8054 /* Can combine regparm with all attributes but fastcall and thiscall. */
8055 if (is_attribute_p ("regparm", name))
8057 tree cst;
8059 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8061 error ("fastcall and regparm attributes are not compatible");
8064 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8066 error ("regparam and thiscall attributes are not compatible");
8069 cst = TREE_VALUE (args);
8070 if (TREE_CODE (cst) != INTEGER_CST)
8072 warning (OPT_Wattributes,
8073 "%qE attribute requires an integer constant argument",
8074 name);
8075 *no_add_attrs = true;
8077 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
8079 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
8080 name, REGPARM_MAX);
8081 *no_add_attrs = true;
8084 return NULL_TREE;
8087 if (TARGET_64BIT)
8089 /* Do not warn when emulating the MS ABI. */
8090 if ((TREE_CODE (*node) != FUNCTION_TYPE
8091 && TREE_CODE (*node) != METHOD_TYPE)
8092 || ix86_function_type_abi (*node) != MS_ABI)
8093 warning (OPT_Wattributes, "%qE attribute ignored",
8094 name);
8095 *no_add_attrs = true;
8096 return NULL_TREE;
8099 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
8100 if (is_attribute_p ("fastcall", name))
8102 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8104 error ("fastcall and cdecl attributes are not compatible");
8106 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8108 error ("fastcall and stdcall attributes are not compatible");
8110 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
8112 error ("fastcall and regparm attributes are not compatible");
8114 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8116 error ("fastcall and thiscall attributes are not compatible");
8120 /* Can combine stdcall with fastcall (redundant), regparm and
8121 sseregparm. */
8122 else if (is_attribute_p ("stdcall", name))
8124 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8126 error ("stdcall and cdecl attributes are not compatible");
8128 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8130 error ("stdcall and fastcall attributes are not compatible");
8132 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8134 error ("stdcall and thiscall attributes are not compatible");
8138 /* Can combine cdecl with regparm and sseregparm. */
8139 else if (is_attribute_p ("cdecl", name))
8141 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8143 error ("stdcall and cdecl attributes are not compatible");
8145 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8147 error ("fastcall and cdecl attributes are not compatible");
8149 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8151 error ("cdecl and thiscall attributes are not compatible");
8154 else if (is_attribute_p ("thiscall", name))
8156 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
8157 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
8158 name);
8159 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8161 error ("stdcall and thiscall attributes are not compatible");
8163 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8165 error ("fastcall and thiscall attributes are not compatible");
8167 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8169 error ("cdecl and thiscall attributes are not compatible");
8173 /* Can combine sseregparm with all attributes. */
8175 return NULL_TREE;
8178 /* The transactional memory builtins are implicitly regparm or fastcall
8179 depending on the ABI. Override the generic do-nothing attribute that
8180 these builtins were declared with, and replace it with one of the two
8181 attributes that we expect elsewhere. */
8183 static tree
8184 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
8185 int flags, bool *no_add_attrs)
8187 tree alt;
8189 /* In no case do we want to add the placeholder attribute. */
8190 *no_add_attrs = true;
8192 /* The 64-bit ABI is unchanged for transactional memory. */
8193 if (TARGET_64BIT)
8194 return NULL_TREE;
8196 /* ??? Is there a better way to validate 32-bit Windows? We have
8197 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
8198 if (CHECK_STACK_LIMIT > 0)
8199 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
8200 else
8202 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
8203 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
8205 decl_attributes (node, alt, flags);
8207 return NULL_TREE;
8212 /* This function determines the calling convention from TYPE. */
8212 unsigned int
8213 ix86_get_callcvt (const_tree type)
8215 unsigned int ret = 0;
8216 bool is_stdarg;
8217 tree attrs;
8219 if (TARGET_64BIT)
8220 return IX86_CALLCVT_CDECL;
8222 attrs = TYPE_ATTRIBUTES (type);
8223 if (attrs != NULL_TREE)
8225 if (lookup_attribute ("cdecl", attrs))
8226 ret |= IX86_CALLCVT_CDECL;
8227 else if (lookup_attribute ("stdcall", attrs))
8228 ret |= IX86_CALLCVT_STDCALL;
8229 else if (lookup_attribute ("fastcall", attrs))
8230 ret |= IX86_CALLCVT_FASTCALL;
8231 else if (lookup_attribute ("thiscall", attrs))
8232 ret |= IX86_CALLCVT_THISCALL;
8234 /* Regparm isn't allowed for thiscall and fastcall. */
8235 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
8237 if (lookup_attribute ("regparm", attrs))
8238 ret |= IX86_CALLCVT_REGPARM;
8239 if (lookup_attribute ("sseregparm", attrs))
8240 ret |= IX86_CALLCVT_SSEREGPARM;
8243 if (IX86_BASE_CALLCVT(ret) != 0)
8244 return ret;
8247 is_stdarg = stdarg_p (type);
8248 if (TARGET_RTD && !is_stdarg)
8249 return IX86_CALLCVT_STDCALL | ret;
8251 if (ret != 0
8252 || is_stdarg
8253 || TREE_CODE (type) != METHOD_TYPE
8254 || ix86_function_type_abi (type) != MS_ABI)
8255 return IX86_CALLCVT_CDECL | ret;
8257 return IX86_CALLCVT_THISCALL;
8260 /* Return 0 if the attributes for two types are incompatible, 1 if they
8261 are compatible, and 2 if they are nearly compatible (which causes a
8262 warning to be generated). */
8264 static int
8265 ix86_comp_type_attributes (const_tree type1, const_tree type2)
8267 unsigned int ccvt1, ccvt2;
8269 if (TREE_CODE (type1) != FUNCTION_TYPE
8270 && TREE_CODE (type1) != METHOD_TYPE)
8271 return 1;
8273 ccvt1 = ix86_get_callcvt (type1);
8274 ccvt2 = ix86_get_callcvt (type2);
8275 if (ccvt1 != ccvt2)
8276 return 0;
8277 if (ix86_function_regparm (type1, NULL)
8278 != ix86_function_regparm (type2, NULL))
8279 return 0;
8281 return 1;
8284 /* Return the regparm value for a function with the indicated TYPE and DECL.
8285 DECL may be NULL when calling a function indirectly
8286 or considering a libcall. */
8288 static int
8289 ix86_function_regparm (const_tree type, const_tree decl)
8291 tree attr;
8292 int regparm;
8293 unsigned int ccvt;
8295 if (TARGET_64BIT)
8296 return (ix86_function_type_abi (type) == SYSV_ABI
8297 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
8298 ccvt = ix86_get_callcvt (type);
8299 regparm = ix86_regparm;
8301 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
8303 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
8304 if (attr)
8306 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
8307 return regparm;
8310 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8311 return 2;
8312 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8313 return 1;
8315 /* Use register calling convention for local functions when possible. */
8316 if (decl
8317 && TREE_CODE (decl) == FUNCTION_DECL)
8319 cgraph_node *target = cgraph_node::get (decl);
8320 if (target)
8321 target = target->function_symbol ();
8323 /* Caller and callee must agree on the calling convention, so
8324 checking just `optimize' here would mean that with
8325 __attribute__((optimize (...))) the caller could use the regparm
8326 convention and the callee not, or vice versa. Instead look at
8327 whether the callee is optimized or not. */
8328 if (target && opt_for_fn (target->decl, optimize)
8329 && !(profile_flag && !flag_fentry))
8331 cgraph_local_info *i = &target->local;
8332 if (i && i->local && i->can_change_signature)
8334 int local_regparm, globals = 0, regno;
8336 /* Make sure no regparm register is taken by a
8337 fixed register variable. */
8338 for (local_regparm = 0; local_regparm < REGPARM_MAX;
8339 local_regparm++)
8340 if (fixed_regs[local_regparm])
8341 break;
8343 /* We don't want to use regparm(3) for nested functions as
8344 these use a static chain pointer in the third argument. */
8345 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
8346 local_regparm = 2;
8348 /* Save a register for the split stack. */
8349 if (flag_split_stack)
8351 if (local_regparm == 3)
8352 local_regparm = 2;
8353 else if (local_regparm == 2
8354 && DECL_STATIC_CHAIN (target->decl))
8355 local_regparm = 1;
8358 /* Each fixed register usage increases register pressure,
8359 so fewer registers should be used for argument passing.
8360 This functionality can be overridden by an explicit
8361 regparm value. */
8362 for (regno = AX_REG; regno <= DI_REG; regno++)
8363 if (fixed_regs[regno])
8364 globals++;
8366 local_regparm
8367 = globals < local_regparm ? local_regparm - globals : 0;
8369 if (local_regparm > regparm)
8370 regparm = local_regparm;
8375 return regparm;
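/* Illustrative example (editor's addition, not part of the original source):
   with -m32, a declaration such as

     int __attribute__((regparm(3))) sum3 (int a, int b, int c);

   makes ix86_function_regparm return 3, so A, B and C are expected in
   EAX, EDX and ECX instead of on the stack.  As described above, local
   static functions compiled with optimization may be promoted to a
   regparm convention automatically, minus any fixed registers.  */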
8378 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
8379 DFmode (2) arguments in SSE registers for a function with the
8380 indicated TYPE and DECL. DECL may be NULL when calling a function
8381 indirectly or considering a libcall. Return -1 if any FP parameter
8382 should be rejected with an error. This is used in situations where we
8383 imply the SSE calling convention but the function is called from another
8384 function with SSE disabled. Otherwise return 0. */
8386 static int
8387 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
8389 gcc_assert (!TARGET_64BIT);
8391 /* Use SSE registers to pass SFmode and DFmode arguments if requested
8392 by the sseregparm attribute. */
8393 if (TARGET_SSEREGPARM
8394 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
8396 if (!TARGET_SSE)
8398 if (warn)
8400 if (decl)
8401 error ("calling %qD with attribute sseregparm without "
8402 "SSE/SSE2 enabled", decl);
8403 else
8404 error ("calling %qT with attribute sseregparm without "
8405 "SSE/SSE2 enabled", type);
8407 return 0;
8410 return 2;
8413 if (!decl)
8414 return 0;
8416 cgraph_node *target = cgraph_node::get (decl);
8417 if (target)
8418 target = target->function_symbol ();
8420 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
8421 (and DFmode for SSE2) arguments in SSE registers. */
8422 if (target
8423 /* TARGET_SSE_MATH */
8424 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
8425 && opt_for_fn (target->decl, optimize)
8426 && !(profile_flag && !flag_fentry))
8428 cgraph_local_info *i = &target->local;
8429 if (i && i->local && i->can_change_signature)
8431 /* Refuse to produce wrong code when a local function with SSE enabled
8432 is called from an SSE-disabled function.
8433 FIXME: We need a way to detect these cases across ltrans partitions
8434 and avoid using SSE calling conventions on local functions called
8435 from a function with SSE disabled. For now at least delay the
8436 warning until we know we are going to produce wrong code.
8437 See PR66047. */
8438 if (!TARGET_SSE && warn)
8439 return -1;
8440 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
8441 ->x_ix86_isa_flags) ? 2 : 1;
8445 return 0;
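/* Illustrative example (editor's addition, not part of the original source):
   with -m32 -msse2, a declaration such as

     double __attribute__((sseregparm)) scale (double x, double y);

   makes this function return 2, so X and Y are passed in %xmm0 and
   %xmm1 rather than on the stack; without SSE/SSE2 enabled the error
   above is issued instead.  */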
8448 /* Return true if EAX is live at the start of the function. Used by
8449 ix86_expand_prologue to determine if we need special help before
8450 calling allocate_stack_worker. */
8452 static bool
8453 ix86_eax_live_at_start_p (void)
8455 /* Cheat. Don't bother working forward from ix86_function_regparm
8456 to the function type to whether an actual argument is located in
8457 eax. Instead just look at cfg info, which is still close enough
8458 to correct at this point. This gives false positives for broken
8459 functions that might use uninitialized data that happens to be
8460 allocated in eax, but who cares? */
8461 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
8464 static bool
8465 ix86_keep_aggregate_return_pointer (tree fntype)
8467 tree attr;
8469 if (!TARGET_64BIT)
8471 attr = lookup_attribute ("callee_pop_aggregate_return",
8472 TYPE_ATTRIBUTES (fntype));
8473 if (attr)
8474 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
8476 /* For 32-bit MS-ABI the default is to keep aggregate
8477 return pointer. */
8478 if (ix86_function_type_abi (fntype) == MS_ABI)
8479 return true;
8481 return KEEP_AGGREGATE_RETURN_POINTER != 0;
8484 /* Value is the number of bytes of arguments automatically
8485 popped when returning from a subroutine call.
8486 FUNDECL is the declaration node of the function (as a tree),
8487 FUNTYPE is the data type of the function (as a tree),
8488 or for a library call it is an identifier node for the subroutine name.
8489 SIZE is the number of bytes of arguments passed on the stack.
8491 On the 80386, the RTD insn may be used to pop them if the number
8492 of args is fixed, but if the number is variable then the caller
8493 must pop them all. RTD can't be used for library calls now
8494 because the library is compiled with the Unix compiler.
8495 Use of RTD is a selectable option, since it is incompatible with
8496 standard Unix calling sequences. If the option is not selected,
8497 the caller must always pop the args.
8499 The attribute stdcall is equivalent to RTD on a per module basis. */
8501 static int
8502 ix86_return_pops_args (tree fundecl, tree funtype, int size)
8504 unsigned int ccvt;
8506 /* None of the 64-bit ABIs pop arguments. */
8507 if (TARGET_64BIT)
8508 return 0;
8510 ccvt = ix86_get_callcvt (funtype);
8512 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
8513 | IX86_CALLCVT_THISCALL)) != 0
8514 && ! stdarg_p (funtype))
8515 return size;
8517 /* Lose any fake structure return argument if it is passed on the stack. */
8518 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
8519 && !ix86_keep_aggregate_return_pointer (funtype))
8521 int nregs = ix86_function_regparm (funtype, fundecl);
8522 if (nregs == 0)
8523 return GET_MODE_SIZE (Pmode);
8526 return 0;
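/* Illustrative example (editor's addition, not part of the original source):
   on ia32, for

     void __attribute__((stdcall)) f (int a, int b);

   SIZE is 8 and F is not variadic, so 8 is returned and the callee pops
   its own arguments (e.g. with "ret $8"); a plain cdecl function returns
   0 here and the caller adjusts %esp after the call instead.  */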
8529 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
8531 static bool
8532 ix86_legitimate_combined_insn (rtx_insn *insn)
8534 int i;
8536 /* Check operand constraints in case hard registers were propagated
8537 into insn pattern. This check prevents combine pass from
8538 generating insn patterns with invalid hard register operands.
8539 These invalid insns can eventually confuse reload to error out
8540 with a spill failure. See also PRs 46829 and 46843. */
8542 gcc_assert (INSN_CODE (insn) >= 0);
8544 extract_insn (insn);
8545 preprocess_constraints (insn);
8547 int n_operands = recog_data.n_operands;
8548 int n_alternatives = recog_data.n_alternatives;
8549 for (i = 0; i < n_operands; i++)
8551 rtx op = recog_data.operand[i];
8552 machine_mode mode = GET_MODE (op);
8553 const operand_alternative *op_alt;
8554 int offset = 0;
8555 bool win;
8556 int j;
8558 /* A unary operator may be accepted by the predicate, but it
8559 is irrelevant for matching constraints. */
8560 if (UNARY_P (op))
8561 op = XEXP (op, 0);
8563 if (SUBREG_P (op))
8565 if (REG_P (SUBREG_REG (op))
8566 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8567 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8568 GET_MODE (SUBREG_REG (op)),
8569 SUBREG_BYTE (op),
8570 GET_MODE (op));
8571 op = SUBREG_REG (op);
8574 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8575 continue;
8577 op_alt = recog_op_alt;
8579 /* Operand has no constraints, anything is OK. */
8580 win = !n_alternatives;
8582 alternative_mask preferred = get_preferred_alternatives (insn);
8583 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8585 if (!TEST_BIT (preferred, j))
8586 continue;
8587 if (op_alt[i].anything_ok
8588 || (op_alt[i].matches != -1
8589 && operands_match_p
8590 (recog_data.operand[i],
8591 recog_data.operand[op_alt[i].matches]))
8592 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8594 win = true;
8595 break;
8599 if (!win)
8600 return false;
8603 return true;
8606 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
8608 static unsigned HOST_WIDE_INT
8609 ix86_asan_shadow_offset (void)
8611 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8612 : HOST_WIDE_INT_C (0x7fff8000))
8613 : (HOST_WIDE_INT_1 << 29);
8616 /* Argument support functions. */
8618 /* Return true when REGNO may be used to pass function parameters. */
8619 bool
8620 ix86_function_arg_regno_p (int regno)
8622 int i;
8623 enum calling_abi call_abi;
8624 const int *parm_regs;
8626 if (TARGET_MPX && BND_REGNO_P (regno))
8627 return true;
8629 if (!TARGET_64BIT)
8631 if (TARGET_MACHO)
8632 return (regno < REGPARM_MAX
8633 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8634 else
8635 return (regno < REGPARM_MAX
8636 || (TARGET_MMX && MMX_REGNO_P (regno)
8637 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8638 || (TARGET_SSE && SSE_REGNO_P (regno)
8639 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8642 if (TARGET_SSE && SSE_REGNO_P (regno)
8643 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8644 return true;
8646 /* TODO: The function should depend on the current function's ABI, but
8647 builtins.c would then need updating. Therefore we use the
8648 default ABI. */
8649 call_abi = ix86_cfun_abi ();
8651 /* RAX is used as hidden argument to va_arg functions. */
8652 if (call_abi == SYSV_ABI && regno == AX_REG)
8653 return true;
8655 if (call_abi == MS_ABI)
8656 parm_regs = x86_64_ms_abi_int_parameter_registers;
8657 else
8658 parm_regs = x86_64_int_parameter_registers;
8660 for (i = 0; i < (call_abi == MS_ABI
8661 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8662 if (regno == parm_regs[i])
8663 return true;
8664 return false;
8667 /* Return true if we do not know how to pass TYPE solely in registers. */
8669 static bool
8670 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8672 if (must_pass_in_stack_var_size_or_pad (mode, type))
8673 return true;
8675 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8676 The layout_type routine is crafty and tries to trick us into passing
8677 currently unsupported vector types on the stack by using TImode. */
8678 return (!TARGET_64BIT && mode == TImode
8679 && type && TREE_CODE (type) != VECTOR_TYPE);
8682 /* Return the size, in bytes, of the area reserved for arguments passed
8683 in registers for the function represented by FNDECL, depending on
8684 the ABI used. */
8685 int
8686 ix86_reg_parm_stack_space (const_tree fndecl)
8688 enum calling_abi call_abi = SYSV_ABI;
8689 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8690 call_abi = ix86_function_abi (fndecl);
8691 else
8692 call_abi = ix86_function_type_abi (fndecl);
8693 if (TARGET_64BIT && call_abi == MS_ABI)
8694 return 32;
8695 return 0;
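/* Illustrative note (editor's addition, not part of the original source):
   for an x86-64 MS-ABI function such as

     void __attribute__((ms_abi)) g (int a, int b);

   the caller reserves the 32 bytes of "shadow space" reflected by the
   value 32 above, even though A and B travel in registers; SysV-ABI
   calls reserve no such area.  */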
8698 /* We add this as a workaround in order to use the libc_has_function
8699 hook in i386.md. */
8700 bool
8701 ix86_libc_has_function (enum function_class fn_class)
8703 return targetm.libc_has_function (fn_class);
8706 /* Return SYSV_ABI or MS_ABI depending on FNTYPE,
8707 specifying the calling ABI used. */
8708 enum calling_abi
8709 ix86_function_type_abi (const_tree fntype)
8711 enum calling_abi abi = ix86_abi;
8713 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8714 return abi;
8716 if (abi == SYSV_ABI
8717 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8719 if (TARGET_X32)
8720 error ("X32 does not support ms_abi attribute");
8722 abi = MS_ABI;
8724 else if (abi == MS_ABI
8725 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8726 abi = SYSV_ABI;
8728 return abi;
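/* Illustrative example (editor's addition, not part of the original source):
   on an x86-64 SysV target, declaring

     long __attribute__((ms_abi)) wincall (long a, long b, long c, long d);

   makes this function return MS_ABI, so the first four integer arguments
   are passed in RCX, RDX, R8 and R9 instead of RDI, RSI, RDX and RCX.  */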
8731 static enum calling_abi
8732 ix86_function_abi (const_tree fndecl)
8734 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8737 /* Return SYSV_ABI or MS_ABI depending on cfun,
8738 specifying the calling ABI used. */
8739 enum calling_abi
8740 ix86_cfun_abi (void)
8742 return cfun ? cfun->machine->call_abi : ix86_abi;
8745 static bool
8746 ix86_function_ms_hook_prologue (const_tree fn)
8748 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8750 if (decl_function_context (fn) != NULL_TREE)
8751 error_at (DECL_SOURCE_LOCATION (fn),
8752 "ms_hook_prologue is not compatible with nested function");
8753 else
8754 return true;
8756 return false;
8759 static bool
8760 ix86_function_naked (const_tree fn)
8762 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
8763 return true;
8765 return false;
8768 /* Write the extra assembler code needed to declare a function properly. */
8770 void
8771 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8772 tree decl)
8774 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8776 if (is_ms_hook)
8778 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8779 unsigned int filler_cc = 0xcccccccc;
8781 for (i = 0; i < filler_count; i += 4)
8782 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8785 #ifdef SUBTARGET_ASM_UNWIND_INIT
8786 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8787 #endif
8789 ASM_OUTPUT_LABEL (asm_out_file, fname);
8791 /* Output magic byte marker, if hot-patch attribute is set. */
8792 if (is_ms_hook)
8794 if (TARGET_64BIT)
8796 /* leaq [%rsp + 0], %rsp */
8797 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
8798 asm_out_file);
8800 else
8802 /* movl.s %edi, %edi
8803 push %ebp
8804 movl.s %esp, %ebp */
8805 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
8810 /* Implementation of the call ABI switching target hook. The call
8811 register sets specific to FNDECL are selected. See also
8812 ix86_conditional_register_usage for more details. */
8813 void
8814 ix86_call_abi_override (const_tree fndecl)
8816 cfun->machine->call_abi = ix86_function_abi (fndecl);
8819 /* Return true if a pseudo register should be created and used to hold
8820 the GOT address for PIC code. */
8821 bool
8822 ix86_use_pseudo_pic_reg (void)
8824 if ((TARGET_64BIT
8825 && (ix86_cmodel == CM_SMALL_PIC
8826 || TARGET_PECOFF))
8827 || !flag_pic)
8828 return false;
8829 return true;
8832 /* Initialize large model PIC register. */
8834 static void
8835 ix86_init_large_pic_reg (unsigned int tmp_regno)
8837 rtx_code_label *label;
8838 rtx tmp_reg;
8840 gcc_assert (Pmode == DImode);
8841 label = gen_label_rtx ();
8842 emit_label (label);
8843 LABEL_PRESERVE_P (label) = 1;
8844 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8845 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8846 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8847 label));
8848 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8849 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8850 pic_offset_table_rtx, tmp_reg));
8853 /* Create and initialize PIC register if required. */
8854 static void
8855 ix86_init_pic_reg (void)
8857 edge entry_edge;
8858 rtx_insn *seq;
8860 if (!ix86_use_pseudo_pic_reg ())
8861 return;
8863 start_sequence ();
8865 if (TARGET_64BIT)
8867 if (ix86_cmodel == CM_LARGE_PIC)
8868 ix86_init_large_pic_reg (R11_REG);
8869 else
8870 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8872 else
8874 /* If there is a future mcount call in the function, it is more profitable
8875 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8876 rtx reg = crtl->profile
8877 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8878 : pic_offset_table_rtx;
8879 rtx_insn *insn = emit_insn (gen_set_got (reg));
8880 RTX_FRAME_RELATED_P (insn) = 1;
8881 if (crtl->profile)
8882 emit_move_insn (pic_offset_table_rtx, reg);
8883 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8886 seq = get_insns ();
8887 end_sequence ();
8889 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8890 insert_insn_on_edge (seq, entry_edge);
8891 commit_one_edge_insertion (entry_edge);
8894 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8895 for a call to a function whose data type is FNTYPE.
8896 For a library call, FNTYPE is 0. */
8898 void
8899 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8900 tree fntype, /* tree ptr for function decl */
8901 rtx libname, /* SYMBOL_REF of library name or 0 */
8902 tree fndecl,
8903 int caller)
8905 struct cgraph_local_info *i = NULL;
8906 struct cgraph_node *target = NULL;
8908 memset (cum, 0, sizeof (*cum));
8910 if (fndecl)
8912 target = cgraph_node::get (fndecl);
8913 if (target)
8915 target = target->function_symbol ();
8916 i = cgraph_node::local_info (target->decl);
8917 cum->call_abi = ix86_function_abi (target->decl);
8919 else
8920 cum->call_abi = ix86_function_abi (fndecl);
8922 else
8923 cum->call_abi = ix86_function_type_abi (fntype);
8925 cum->caller = caller;
8927 /* Set up the number of registers to use for passing arguments. */
8928 cum->nregs = ix86_regparm;
8929 if (TARGET_64BIT)
8931 cum->nregs = (cum->call_abi == SYSV_ABI
8932 ? X86_64_REGPARM_MAX
8933 : X86_64_MS_REGPARM_MAX);
8935 if (TARGET_SSE)
8937 cum->sse_nregs = SSE_REGPARM_MAX;
8938 if (TARGET_64BIT)
8940 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8941 ? X86_64_SSE_REGPARM_MAX
8942 : X86_64_MS_SSE_REGPARM_MAX);
8945 if (TARGET_MMX)
8946 cum->mmx_nregs = MMX_REGPARM_MAX;
8947 cum->warn_avx512f = true;
8948 cum->warn_avx = true;
8949 cum->warn_sse = true;
8950 cum->warn_mmx = true;
8952 /* Because the type might mismatch between caller and callee, we need to
8953 use the actual type of the function for local calls.
8954 FIXME: cgraph_analyze can be told to actually record whether a function
8955 uses va_start, so for local functions maybe_vaarg can be made more
8956 aggressive, helping K&R code.
8957 FIXME: once the type system is fixed, we won't need this code anymore. */
8958 if (i && i->local && i->can_change_signature)
8959 fntype = TREE_TYPE (target->decl);
8960 cum->stdarg = stdarg_p (fntype);
8961 cum->maybe_vaarg = (fntype
8962 ? (!prototype_p (fntype) || stdarg_p (fntype))
8963 : !libname);
8965 cum->bnd_regno = FIRST_BND_REG;
8966 cum->bnds_in_bt = 0;
8967 cum->force_bnd_pass = 0;
8968 cum->decl = fndecl;
8970 if (!TARGET_64BIT)
8972 /* If there are variable arguments, then we won't pass anything
8973 in registers in 32-bit mode. */
8974 if (stdarg_p (fntype))
8976 cum->nregs = 0;
8977 /* Since in 32-bit mode variable arguments are always passed on
8978 the stack, a scratch register is available for an indirect
8979 sibcall. */
8980 cfun->machine->arg_reg_available = true;
8981 cum->sse_nregs = 0;
8982 cum->mmx_nregs = 0;
8983 cum->warn_avx512f = false;
8984 cum->warn_avx = false;
8985 cum->warn_sse = false;
8986 cum->warn_mmx = false;
8987 return;
8990 /* Use ecx and edx registers if function has fastcall attribute,
8991 else look for regparm information. */
8992 if (fntype)
8994 unsigned int ccvt = ix86_get_callcvt (fntype);
8995 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8997 cum->nregs = 1;
8998 cum->fastcall = 1; /* Same first register as in fastcall. */
9000 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
9002 cum->nregs = 2;
9003 cum->fastcall = 1;
9005 else
9006 cum->nregs = ix86_function_regparm (fntype, fndecl);
9009 /* Set up the number of SSE registers used for passing SFmode
9010 and DFmode arguments. Warn for mismatching ABI. */
9011 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
9014 cfun->machine->arg_reg_available = (cum->nregs > 0);
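/* Illustrative example (editor's addition, not part of the original source):
   for a 32-bit call to

     int __attribute__((fastcall)) f (int a, int b, int c);

   CUM is initialized with nregs == 2 and fastcall == 1, so function_arg_32
   later assigns A and B to ECX and EDX while C is pushed on the stack.  */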
9017 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
9018 But in the case of vector types, it is some vector mode.
9020 When we have only some of our vector isa extensions enabled, then there
9021 are some modes for which vector_mode_supported_p is false. For these
9022 modes, the generic vector support in gcc will choose some non-vector mode
9023 in order to implement the type. By computing the natural mode, we'll
9024 select the proper ABI location for the operand and not depend on whatever
9025 the middle-end decides to do with these vector types.
9027 The middle-end can't deal with vector types > 16 bytes. In this
9028 case, we return the original mode and warn about the ABI change if
9029 CUM isn't NULL.
9031 If IN_RETURN is true, warn about the ABI change if the vector mode
9032 isn't available for the function return value. */
9034 static machine_mode
9035 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
9036 bool in_return)
9038 machine_mode mode = TYPE_MODE (type);
9040 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
9042 HOST_WIDE_INT size = int_size_in_bytes (type);
9043 if ((size == 8 || size == 16 || size == 32 || size == 64)
9044 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
9045 && TYPE_VECTOR_SUBPARTS (type) > 1)
9047 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
9049 /* There are no XFmode vector modes. */
9050 if (innermode == XFmode)
9051 return mode;
9053 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
9054 mode = MIN_MODE_VECTOR_FLOAT;
9055 else
9056 mode = MIN_MODE_VECTOR_INT;
9058 /* Get the mode which has this inner mode and number of units. */
9059 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
9060 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
9061 && GET_MODE_INNER (mode) == innermode)
9063 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
9065 static bool warnedavx512f;
9066 static bool warnedavx512f_ret;
9068 if (cum && cum->warn_avx512f && !warnedavx512f)
9070 if (warning (OPT_Wpsabi, "AVX512F vector argument "
9071 "without AVX512F enabled changes the ABI"))
9072 warnedavx512f = true;
9074 else if (in_return && !warnedavx512f_ret)
9076 if (warning (OPT_Wpsabi, "AVX512F vector return "
9077 "without AVX512F enabled changes the ABI"))
9078 warnedavx512f_ret = true;
9081 return TYPE_MODE (type);
9083 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
9085 static bool warnedavx;
9086 static bool warnedavx_ret;
9088 if (cum && cum->warn_avx && !warnedavx)
9090 if (warning (OPT_Wpsabi, "AVX vector argument "
9091 "without AVX enabled changes the ABI"))
9092 warnedavx = true;
9094 else if (in_return && !warnedavx_ret)
9096 if (warning (OPT_Wpsabi, "AVX vector return "
9097 "without AVX enabled changes the ABI"))
9098 warnedavx_ret = true;
9101 return TYPE_MODE (type);
9103 else if (((size == 8 && TARGET_64BIT) || size == 16)
9104 && !TARGET_SSE
9105 && !TARGET_IAMCU)
9107 static bool warnedsse;
9108 static bool warnedsse_ret;
9110 if (cum && cum->warn_sse && !warnedsse)
9112 if (warning (OPT_Wpsabi, "SSE vector argument "
9113 "without SSE enabled changes the ABI"))
9114 warnedsse = true;
9116 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
9118 if (warning (OPT_Wpsabi, "SSE vector return "
9119 "without SSE enabled changes the ABI"))
9120 warnedsse_ret = true;
9123 else if ((size == 8 && !TARGET_64BIT)
9124 && (!cfun
9125 || cfun->machine->func_type == TYPE_NORMAL)
9126 && !TARGET_MMX
9127 && !TARGET_IAMCU)
9129 static bool warnedmmx;
9130 static bool warnedmmx_ret;
9132 if (cum && cum->warn_mmx && !warnedmmx)
9134 if (warning (OPT_Wpsabi, "MMX vector argument "
9135 "without MMX enabled changes the ABI"))
9136 warnedmmx = true;
9138 else if (in_return && !warnedmmx_ret)
9140 if (warning (OPT_Wpsabi, "MMX vector return "
9141 "without MMX enabled changes the ABI"))
9142 warnedmmx_ret = true;
9145 return mode;
9148 gcc_unreachable ();
9152 return mode;
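/* Illustrative example (editor's addition, not part of the original source):
   compiling

     typedef int v8si __attribute__((vector_size (32)));
     v8si add8 (v8si a, v8si b);

   without -mavx takes the size == 32 path above: the original (non-vector)
   mode is returned, the argument ends up being passed in memory, and the
   "AVX vector argument without AVX enabled changes the ABI" warning is
   emitted (controlled by -Wpsabi).  */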
9155 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
9156 this may not agree with the mode that the type system has chosen for the
9157 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
9158 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
9160 static rtx
9161 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
9162 unsigned int regno)
9164 rtx tmp;
9166 if (orig_mode != BLKmode)
9167 tmp = gen_rtx_REG (orig_mode, regno);
9168 else
9170 tmp = gen_rtx_REG (mode, regno);
9171 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
9172 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
9175 return tmp;
9178 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
9179 of this code is to classify each 8bytes of incoming argument by the register
9180 class and assign registers accordingly. */
9182 /* Return the union class of CLASS1 and CLASS2.
9183 See the x86-64 PS ABI for details. */
9185 static enum x86_64_reg_class
9186 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
9188 /* Rule #1: If both classes are equal, this is the resulting class. */
9189 if (class1 == class2)
9190 return class1;
9192 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
9193 the other class. */
9194 if (class1 == X86_64_NO_CLASS)
9195 return class2;
9196 if (class2 == X86_64_NO_CLASS)
9197 return class1;
9199 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
9200 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
9201 return X86_64_MEMORY_CLASS;
9203 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
9204 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
9205 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
9206 return X86_64_INTEGERSI_CLASS;
9207 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
9208 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
9209 return X86_64_INTEGER_CLASS;
9211 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
9212 MEMORY is used. */
9213 if (class1 == X86_64_X87_CLASS
9214 || class1 == X86_64_X87UP_CLASS
9215 || class1 == X86_64_COMPLEX_X87_CLASS
9216 || class2 == X86_64_X87_CLASS
9217 || class2 == X86_64_X87UP_CLASS
9218 || class2 == X86_64_COMPLEX_X87_CLASS)
9219 return X86_64_MEMORY_CLASS;
9221 /* Rule #6: Otherwise class SSE is used. */
9222 return X86_64_SSE_CLASS;
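/* Illustrative example (editor's addition, not part of the original source):
   for

     struct s { float f; int i; };

   both fields land in the same eightbyte; the float classifies as
   X86_64_SSESF_CLASS and the int as X86_64_INTEGERSI_CLASS, and rule #4
   above merges them to X86_64_INTEGERSI_CLASS, so the whole struct is
   passed in a single integer register.  */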
9225 /* Classify the argument of type TYPE and mode MODE.
9226 CLASSES will be filled by the register class used to pass each word
9227 of the operand. The number of words is returned. In case the parameter
9228 should be passed in memory, 0 is returned. As a special case for zero
9229 sized containers, classes[0] will be NO_CLASS and 1 is returned.
9231 BIT_OFFSET is used internally for handling records and specifies the
9232 offset in bits modulo 512 to avoid overflow cases.
9234 See the x86-64 PS ABI for details. */
9237 static int
9238 classify_argument (machine_mode mode, const_tree type,
9239 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
9241 HOST_WIDE_INT bytes =
9242 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9243 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
9245 /* Variable sized entities are always passed/returned in memory. */
9246 if (bytes < 0)
9247 return 0;
9249 if (mode != VOIDmode
9250 && targetm.calls.must_pass_in_stack (mode, type))
9251 return 0;
9253 if (type && AGGREGATE_TYPE_P (type))
9255 int i;
9256 tree field;
9257 enum x86_64_reg_class subclasses[MAX_CLASSES];
9259 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
9260 if (bytes > 64)
9261 return 0;
9263 for (i = 0; i < words; i++)
9264 classes[i] = X86_64_NO_CLASS;
9266 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
9267 signal the memory class, so handle this as a special case. */
9268 if (!words)
9270 classes[0] = X86_64_NO_CLASS;
9271 return 1;
9274 /* Classify each field of record and merge classes. */
9275 switch (TREE_CODE (type))
9277 case RECORD_TYPE:
9278 /* And now merge the fields of structure. */
9279 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9281 if (TREE_CODE (field) == FIELD_DECL)
9283 int num;
9285 if (TREE_TYPE (field) == error_mark_node)
9286 continue;
9288 /* Bitfields are always classified as integer. Handle them
9289 early, since later code would consider them to be
9290 misaligned integers. */
9291 if (DECL_BIT_FIELD (field))
9293 for (i = (int_bit_position (field)
9294 + (bit_offset % 64)) / 8 / 8;
9295 i < ((int_bit_position (field) + (bit_offset % 64))
9296 + tree_to_shwi (DECL_SIZE (field))
9297 + 63) / 8 / 8; i++)
9298 classes[i] =
9299 merge_classes (X86_64_INTEGER_CLASS,
9300 classes[i]);
9302 else
9304 int pos;
9306 type = TREE_TYPE (field);
9308 /* Flexible array member is ignored. */
9309 if (TYPE_MODE (type) == BLKmode
9310 && TREE_CODE (type) == ARRAY_TYPE
9311 && TYPE_SIZE (type) == NULL_TREE
9312 && TYPE_DOMAIN (type) != NULL_TREE
9313 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
9314 == NULL_TREE))
9316 static bool warned;
9318 if (!warned && warn_psabi)
9320 warned = true;
9321 inform (input_location,
9322 "the ABI of passing struct with"
9323 " a flexible array member has"
9324 " changed in GCC 4.4");
9326 continue;
9328 num = classify_argument (TYPE_MODE (type), type,
9329 subclasses,
9330 (int_bit_position (field)
9331 + bit_offset) % 512);
9332 if (!num)
9333 return 0;
9334 pos = (int_bit_position (field)
9335 + (bit_offset % 64)) / 8 / 8;
9336 for (i = 0; i < num && (i + pos) < words; i++)
9337 classes[i + pos] =
9338 merge_classes (subclasses[i], classes[i + pos]);
9342 break;
9344 case ARRAY_TYPE:
9345 /* Arrays are handled as small records. */
9347 int num;
9348 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
9349 TREE_TYPE (type), subclasses, bit_offset);
9350 if (!num)
9351 return 0;
9353 /* The partial classes are now full classes. */
9354 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
9355 subclasses[0] = X86_64_SSE_CLASS;
9356 if (subclasses[0] == X86_64_INTEGERSI_CLASS
9357 && !((bit_offset % 64) == 0 && bytes == 4))
9358 subclasses[0] = X86_64_INTEGER_CLASS;
9360 for (i = 0; i < words; i++)
9361 classes[i] = subclasses[i % num];
9363 break;
9365 case UNION_TYPE:
9366 case QUAL_UNION_TYPE:
9367 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
9369 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9371 if (TREE_CODE (field) == FIELD_DECL)
9373 int num;
9375 if (TREE_TYPE (field) == error_mark_node)
9376 continue;
9378 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
9379 TREE_TYPE (field), subclasses,
9380 bit_offset);
9381 if (!num)
9382 return 0;
9383 for (i = 0; i < num && i < words; i++)
9384 classes[i] = merge_classes (subclasses[i], classes[i]);
9387 break;
9389 default:
9390 gcc_unreachable ();
9393 if (words > 2)
9395 /* When the size is > 16 bytes, if the first class isn't
9396 X86_64_SSE_CLASS or any of the remaining classes isn't
9397 X86_64_SSEUP_CLASS, everything should be passed in
9398 memory. */
9399 if (classes[0] != X86_64_SSE_CLASS)
9400 return 0;
9402 for (i = 1; i < words; i++)
9403 if (classes[i] != X86_64_SSEUP_CLASS)
9404 return 0;
9407 /* Final merger cleanup. */
9408 for (i = 0; i < words; i++)
9410 /* If one class is MEMORY, everything should be passed in
9411 memory. */
9412 if (classes[i] == X86_64_MEMORY_CLASS)
9413 return 0;
9415 /* X86_64_SSEUP_CLASS should always be preceded by
9416 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
9417 if (classes[i] == X86_64_SSEUP_CLASS
9418 && classes[i - 1] != X86_64_SSE_CLASS
9419 && classes[i - 1] != X86_64_SSEUP_CLASS)
9421 /* The first one should never be X86_64_SSEUP_CLASS. */
9422 gcc_assert (i != 0);
9423 classes[i] = X86_64_SSE_CLASS;
9426 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
9427 everything should be passed in memory. */
9428 if (classes[i] == X86_64_X87UP_CLASS
9429 && (classes[i - 1] != X86_64_X87_CLASS))
9431 static bool warned;
9433 /* The first one should never be X86_64_X87UP_CLASS. */
9434 gcc_assert (i != 0);
9435 if (!warned && warn_psabi)
9437 warned = true;
9438 inform (input_location,
9439 "the ABI of passing union with long double"
9440 " has changed in GCC 4.4");
9442 return 0;
9445 return words;
9448 /* Compute the alignment needed. We align all types to their natural
9449 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
9450 if (mode != VOIDmode && mode != BLKmode)
9452 int mode_alignment = GET_MODE_BITSIZE (mode);
9454 if (mode == XFmode)
9455 mode_alignment = 128;
9456 else if (mode == XCmode)
9457 mode_alignment = 256;
9458 if (COMPLEX_MODE_P (mode))
9459 mode_alignment /= 2;
9460 /* Misaligned fields are always returned in memory. */
9461 if (bit_offset % mode_alignment)
9462 return 0;
9465 /* For V1xx modes, just use the base mode. */
9466 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
9467 && GET_MODE_UNIT_SIZE (mode) == bytes)
9468 mode = GET_MODE_INNER (mode);
9470 /* Classification of atomic types. */
9471 switch (mode)
9473 case SDmode:
9474 case DDmode:
9475 classes[0] = X86_64_SSE_CLASS;
9476 return 1;
9477 case TDmode:
9478 classes[0] = X86_64_SSE_CLASS;
9479 classes[1] = X86_64_SSEUP_CLASS;
9480 return 2;
9481 case DImode:
9482 case SImode:
9483 case HImode:
9484 case QImode:
9485 case CSImode:
9486 case CHImode:
9487 case CQImode:
9489 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
9491 /* Analyze last 128 bits only. */
9492 size = (size - 1) & 0x7f;
9494 if (size < 32)
9496 classes[0] = X86_64_INTEGERSI_CLASS;
9497 return 1;
9499 else if (size < 64)
9501 classes[0] = X86_64_INTEGER_CLASS;
9502 return 1;
9504 else if (size < 64+32)
9506 classes[0] = X86_64_INTEGER_CLASS;
9507 classes[1] = X86_64_INTEGERSI_CLASS;
9508 return 2;
9510 else if (size < 64+64)
9512 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9513 return 2;
9515 else
9516 gcc_unreachable ();
9518 case CDImode:
9519 case TImode:
9520 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9521 return 2;
9522 case COImode:
9523 case OImode:
9524 /* OImode shouldn't be used directly. */
9525 gcc_unreachable ();
9526 case CTImode:
9527 return 0;
9528 case SFmode:
9529 if (!(bit_offset % 64))
9530 classes[0] = X86_64_SSESF_CLASS;
9531 else
9532 classes[0] = X86_64_SSE_CLASS;
9533 return 1;
9534 case DFmode:
9535 classes[0] = X86_64_SSEDF_CLASS;
9536 return 1;
9537 case XFmode:
9538 classes[0] = X86_64_X87_CLASS;
9539 classes[1] = X86_64_X87UP_CLASS;
9540 return 2;
9541 case TFmode:
9542 classes[0] = X86_64_SSE_CLASS;
9543 classes[1] = X86_64_SSEUP_CLASS;
9544 return 2;
9545 case SCmode:
9546 classes[0] = X86_64_SSE_CLASS;
9547 if (!(bit_offset % 64))
9548 return 1;
9549 else
9551 static bool warned;
9553 if (!warned && warn_psabi)
9555 warned = true;
9556 inform (input_location,
9557 "the ABI of passing structure with complex float"
9558 " member has changed in GCC 4.4");
9560 classes[1] = X86_64_SSESF_CLASS;
9561 return 2;
9563 case DCmode:
9564 classes[0] = X86_64_SSEDF_CLASS;
9565 classes[1] = X86_64_SSEDF_CLASS;
9566 return 2;
9567 case XCmode:
9568 classes[0] = X86_64_COMPLEX_X87_CLASS;
9569 return 1;
9570 case TCmode:
9571 /* This mode is larger than 16 bytes. */
9572 return 0;
9573 case V8SFmode:
9574 case V8SImode:
9575 case V32QImode:
9576 case V16HImode:
9577 case V4DFmode:
9578 case V4DImode:
9579 classes[0] = X86_64_SSE_CLASS;
9580 classes[1] = X86_64_SSEUP_CLASS;
9581 classes[2] = X86_64_SSEUP_CLASS;
9582 classes[3] = X86_64_SSEUP_CLASS;
9583 return 4;
9584 case V8DFmode:
9585 case V16SFmode:
9586 case V8DImode:
9587 case V16SImode:
9588 case V32HImode:
9589 case V64QImode:
9590 classes[0] = X86_64_SSE_CLASS;
9591 classes[1] = X86_64_SSEUP_CLASS;
9592 classes[2] = X86_64_SSEUP_CLASS;
9593 classes[3] = X86_64_SSEUP_CLASS;
9594 classes[4] = X86_64_SSEUP_CLASS;
9595 classes[5] = X86_64_SSEUP_CLASS;
9596 classes[6] = X86_64_SSEUP_CLASS;
9597 classes[7] = X86_64_SSEUP_CLASS;
9598 return 8;
9599 case V4SFmode:
9600 case V4SImode:
9601 case V16QImode:
9602 case V8HImode:
9603 case V2DFmode:
9604 case V2DImode:
9605 classes[0] = X86_64_SSE_CLASS;
9606 classes[1] = X86_64_SSEUP_CLASS;
9607 return 2;
9608 case V1TImode:
9609 case V1DImode:
9610 case V2SFmode:
9611 case V2SImode:
9612 case V4HImode:
9613 case V8QImode:
9614 classes[0] = X86_64_SSE_CLASS;
9615 return 1;
9616 case BLKmode:
9617 case VOIDmode:
9618 return 0;
9619 default:
9620 gcc_assert (VECTOR_MODE_P (mode));
9622 if (bytes > 16)
9623 return 0;
9625 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9627 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9628 classes[0] = X86_64_INTEGERSI_CLASS;
9629 else
9630 classes[0] = X86_64_INTEGER_CLASS;
9631 classes[1] = X86_64_INTEGER_CLASS;
9632 return 1 + (bytes > 8);
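/* Illustrative example (editor's addition, not part of the original source):
   classifying

     struct p { double x; long y; };

   on x86-64 produces two eightbytes with classes[0] == X86_64_SSEDF_CLASS
   and classes[1] == X86_64_INTEGER_CLASS, and the function returns 2, so
   the struct is split between one SSE register and one integer register.  */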
9636 /* Examine the argument and set the number of registers required in each
9637 class. Return true iff the parameter should be passed in memory. */
9639 static bool
9640 examine_argument (machine_mode mode, const_tree type, int in_return,
9641 int *int_nregs, int *sse_nregs)
9643 enum x86_64_reg_class regclass[MAX_CLASSES];
9644 int n = classify_argument (mode, type, regclass, 0);
9646 *int_nregs = 0;
9647 *sse_nregs = 0;
9649 if (!n)
9650 return true;
9651 for (n--; n >= 0; n--)
9652 switch (regclass[n])
9654 case X86_64_INTEGER_CLASS:
9655 case X86_64_INTEGERSI_CLASS:
9656 (*int_nregs)++;
9657 break;
9658 case X86_64_SSE_CLASS:
9659 case X86_64_SSESF_CLASS:
9660 case X86_64_SSEDF_CLASS:
9661 (*sse_nregs)++;
9662 break;
9663 case X86_64_NO_CLASS:
9664 case X86_64_SSEUP_CLASS:
9665 break;
9666 case X86_64_X87_CLASS:
9667 case X86_64_X87UP_CLASS:
9668 case X86_64_COMPLEX_X87_CLASS:
9669 if (!in_return)
9670 return true;
9671 break;
9672 case X86_64_MEMORY_CLASS:
9673 gcc_unreachable ();
9676 return false;
9679 /* Construct container for the argument used by GCC interface. See
9680 FUNCTION_ARG for the detailed description. */
9682 static rtx
9683 construct_container (machine_mode mode, machine_mode orig_mode,
9684 const_tree type, int in_return, int nintregs, int nsseregs,
9685 const int *intreg, int sse_regno)
9687 /* The following variables hold the static issued_error state. */
9688 static bool issued_sse_arg_error;
9689 static bool issued_sse_ret_error;
9690 static bool issued_x87_ret_error;
9692 machine_mode tmpmode;
9693 int bytes =
9694 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9695 enum x86_64_reg_class regclass[MAX_CLASSES];
9696 int n;
9697 int i;
9698 int nexps = 0;
9699 int needed_sseregs, needed_intregs;
9700 rtx exp[MAX_CLASSES];
9701 rtx ret;
9703 n = classify_argument (mode, type, regclass, 0);
9704 if (!n)
9705 return NULL;
9706 if (examine_argument (mode, type, in_return, &needed_intregs,
9707 &needed_sseregs))
9708 return NULL;
9709 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9710 return NULL;
9712 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9713 some less clueful developer tries to use floating-point anyway. */
9714 if (needed_sseregs && !TARGET_SSE)
9716 if (in_return)
9718 if (!issued_sse_ret_error)
9720 error ("SSE register return with SSE disabled");
9721 issued_sse_ret_error = true;
9724 else if (!issued_sse_arg_error)
9726 error ("SSE register argument with SSE disabled");
9727 issued_sse_arg_error = true;
9729 return NULL;
9732 /* Likewise, error if the ABI requires us to return values in the
9733 x87 registers and the user specified -mno-80387. */
9734 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9735 for (i = 0; i < n; i++)
9736 if (regclass[i] == X86_64_X87_CLASS
9737 || regclass[i] == X86_64_X87UP_CLASS
9738 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9740 if (!issued_x87_ret_error)
9742 error ("x87 register return with x87 disabled");
9743 issued_x87_ret_error = true;
9745 return NULL;
9748 /* First construct simple cases. Avoid SCmode, since we want to use
9749 a single register to pass this type. */
9750 if (n == 1 && mode != SCmode)
9751 switch (regclass[0])
9753 case X86_64_INTEGER_CLASS:
9754 case X86_64_INTEGERSI_CLASS:
9755 return gen_rtx_REG (mode, intreg[0]);
9756 case X86_64_SSE_CLASS:
9757 case X86_64_SSESF_CLASS:
9758 case X86_64_SSEDF_CLASS:
9759 if (mode != BLKmode)
9760 return gen_reg_or_parallel (mode, orig_mode,
9761 SSE_REGNO (sse_regno));
9762 break;
9763 case X86_64_X87_CLASS:
9764 case X86_64_COMPLEX_X87_CLASS:
9765 return gen_rtx_REG (mode, FIRST_STACK_REG);
9766 case X86_64_NO_CLASS:
9767 /* Zero sized array, struct or class. */
9768 return NULL;
9769 default:
9770 gcc_unreachable ();
9772 if (n == 2
9773 && regclass[0] == X86_64_SSE_CLASS
9774 && regclass[1] == X86_64_SSEUP_CLASS
9775 && mode != BLKmode)
9776 return gen_reg_or_parallel (mode, orig_mode,
9777 SSE_REGNO (sse_regno));
9778 if (n == 4
9779 && regclass[0] == X86_64_SSE_CLASS
9780 && regclass[1] == X86_64_SSEUP_CLASS
9781 && regclass[2] == X86_64_SSEUP_CLASS
9782 && regclass[3] == X86_64_SSEUP_CLASS
9783 && mode != BLKmode)
9784 return gen_reg_or_parallel (mode, orig_mode,
9785 SSE_REGNO (sse_regno));
9786 if (n == 8
9787 && regclass[0] == X86_64_SSE_CLASS
9788 && regclass[1] == X86_64_SSEUP_CLASS
9789 && regclass[2] == X86_64_SSEUP_CLASS
9790 && regclass[3] == X86_64_SSEUP_CLASS
9791 && regclass[4] == X86_64_SSEUP_CLASS
9792 && regclass[5] == X86_64_SSEUP_CLASS
9793 && regclass[6] == X86_64_SSEUP_CLASS
9794 && regclass[7] == X86_64_SSEUP_CLASS
9795 && mode != BLKmode)
9796 return gen_reg_or_parallel (mode, orig_mode,
9797 SSE_REGNO (sse_regno));
9798 if (n == 2
9799 && regclass[0] == X86_64_X87_CLASS
9800 && regclass[1] == X86_64_X87UP_CLASS)
9801 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9803 if (n == 2
9804 && regclass[0] == X86_64_INTEGER_CLASS
9805 && regclass[1] == X86_64_INTEGER_CLASS
9806 && (mode == CDImode || mode == TImode)
9807 && intreg[0] + 1 == intreg[1])
9808 return gen_rtx_REG (mode, intreg[0]);
9810 /* Otherwise figure out the entries of the PARALLEL. */
9811 for (i = 0; i < n; i++)
9813 int pos;
9815 switch (regclass[i])
9817 case X86_64_NO_CLASS:
9818 break;
9819 case X86_64_INTEGER_CLASS:
9820 case X86_64_INTEGERSI_CLASS:
9821 /* Merge TImodes on aligned occasions here too. */
9822 if (i * 8 + 8 > bytes)
9823 tmpmode
9824 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9825 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9826 tmpmode = SImode;
9827 else
9828 tmpmode = DImode;
9829 /* We've requested 24 bytes for which we
9830 don't have a mode. Use DImode. */
9831 if (tmpmode == BLKmode)
9832 tmpmode = DImode;
9833 exp [nexps++]
9834 = gen_rtx_EXPR_LIST (VOIDmode,
9835 gen_rtx_REG (tmpmode, *intreg),
9836 GEN_INT (i*8));
9837 intreg++;
9838 break;
9839 case X86_64_SSESF_CLASS:
9840 exp [nexps++]
9841 = gen_rtx_EXPR_LIST (VOIDmode,
9842 gen_rtx_REG (SFmode,
9843 SSE_REGNO (sse_regno)),
9844 GEN_INT (i*8));
9845 sse_regno++;
9846 break;
9847 case X86_64_SSEDF_CLASS:
9848 exp [nexps++]
9849 = gen_rtx_EXPR_LIST (VOIDmode,
9850 gen_rtx_REG (DFmode,
9851 SSE_REGNO (sse_regno)),
9852 GEN_INT (i*8));
9853 sse_regno++;
9854 break;
9855 case X86_64_SSE_CLASS:
9856 pos = i;
9857 switch (n)
9859 case 1:
9860 tmpmode = DImode;
9861 break;
9862 case 2:
9863 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9865 tmpmode = TImode;
9866 i++;
9868 else
9869 tmpmode = DImode;
9870 break;
9871 case 4:
9872 gcc_assert (i == 0
9873 && regclass[1] == X86_64_SSEUP_CLASS
9874 && regclass[2] == X86_64_SSEUP_CLASS
9875 && regclass[3] == X86_64_SSEUP_CLASS);
9876 tmpmode = OImode;
9877 i += 3;
9878 break;
9879 case 8:
9880 gcc_assert (i == 0
9881 && regclass[1] == X86_64_SSEUP_CLASS
9882 && regclass[2] == X86_64_SSEUP_CLASS
9883 && regclass[3] == X86_64_SSEUP_CLASS
9884 && regclass[4] == X86_64_SSEUP_CLASS
9885 && regclass[5] == X86_64_SSEUP_CLASS
9886 && regclass[6] == X86_64_SSEUP_CLASS
9887 && regclass[7] == X86_64_SSEUP_CLASS);
9888 tmpmode = XImode;
9889 i += 7;
9890 break;
9891 default:
9892 gcc_unreachable ();
9894 exp [nexps++]
9895 = gen_rtx_EXPR_LIST (VOIDmode,
9896 gen_rtx_REG (tmpmode,
9897 SSE_REGNO (sse_regno)),
9898 GEN_INT (pos*8));
9899 sse_regno++;
9900 break;
9901 default:
9902 gcc_unreachable ();
9906 /* Empty aligned struct, union or class. */
9907 if (nexps == 0)
9908 return NULL;
9910 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9911 for (i = 0; i < nexps; i++)
9912 XVECEXP (ret, 0, i) = exp [i];
9913 return ret;
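/* Illustrative example (editor's addition, not part of the original source):
   for a first argument of type

     struct p { double x; long y; };

   classified as SSEDF + INTEGER, the PARALLEL built above places a DFmode
   reference to %xmm0 at byte offset 0 and a DImode reference to %rdi at
   byte offset 8, telling the middle end how to split the value between
   the two register files.  */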
9916 /* Update the data in CUM to advance over an argument of mode MODE
9917 and data type TYPE. (TYPE is null for libcalls where that information
9918 may not be available.)
9920 Return the number of integer registers advanced over. */
9922 static int
9923 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9924 const_tree type, HOST_WIDE_INT bytes,
9925 HOST_WIDE_INT words)
9927 int res = 0;
9928 bool error_p = false;
9930 if (TARGET_IAMCU)
9932 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9933 bytes in registers. */
9934 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9935 goto pass_in_reg;
9936 return res;
9939 switch (mode)
9941 default:
9942 break;
9944 case BLKmode:
9945 if (bytes < 0)
9946 break;
9947 /* FALLTHRU */
9949 case DImode:
9950 case SImode:
9951 case HImode:
9952 case QImode:
9953 pass_in_reg:
9954 cum->words += words;
9955 cum->nregs -= words;
9956 cum->regno += words;
9957 if (cum->nregs >= 0)
9958 res = words;
9959 if (cum->nregs <= 0)
9961 cum->nregs = 0;
9962 cfun->machine->arg_reg_available = false;
9963 cum->regno = 0;
9965 break;
9967 case OImode:
9968 /* OImode shouldn't be used directly. */
9969 gcc_unreachable ();
9971 case DFmode:
9972 if (cum->float_in_sse == -1)
9973 error_p = true;
9974 if (cum->float_in_sse < 2)
9975 break;
9976 /* FALLTHRU */
9977 case SFmode:
9978 if (cum->float_in_sse == -1)
9979 error_p = true;
9980 if (cum->float_in_sse < 1)
9981 break;
9982 /* FALLTHRU */
9984 case V8SFmode:
9985 case V8SImode:
9986 case V64QImode:
9987 case V32HImode:
9988 case V16SImode:
9989 case V8DImode:
9990 case V16SFmode:
9991 case V8DFmode:
9992 case V32QImode:
9993 case V16HImode:
9994 case V4DFmode:
9995 case V4DImode:
9996 case TImode:
9997 case V16QImode:
9998 case V8HImode:
9999 case V4SImode:
10000 case V2DImode:
10001 case V4SFmode:
10002 case V2DFmode:
10003 if (!type || !AGGREGATE_TYPE_P (type))
10005 cum->sse_words += words;
10006 cum->sse_nregs -= 1;
10007 cum->sse_regno += 1;
10008 if (cum->sse_nregs <= 0)
10010 cum->sse_nregs = 0;
10011 cum->sse_regno = 0;
10014 break;
10016 case V8QImode:
10017 case V4HImode:
10018 case V2SImode:
10019 case V2SFmode:
10020 case V1TImode:
10021 case V1DImode:
10022 if (!type || !AGGREGATE_TYPE_P (type))
10024 cum->mmx_words += words;
10025 cum->mmx_nregs -= 1;
10026 cum->mmx_regno += 1;
10027 if (cum->mmx_nregs <= 0)
10029 cum->mmx_nregs = 0;
10030 cum->mmx_regno = 0;
10033 break;
10035 if (error_p)
10037 cum->float_in_sse = 0;
10038 error ("calling %qD with SSE calling convention without "
10039 "SSE/SSE2 enabled", cum->decl);
10040 sorry ("this is a GCC bug that can be worked around by adding "
10041 "attribute used to function called");
10044 return res;
10047 static int
10048 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
10049 const_tree type, HOST_WIDE_INT words, bool named)
10051 int int_nregs, sse_nregs;
10053 /* Unnamed 512-bit and 256-bit vector mode parameters are passed on the stack. */
10054 if (!named && (VALID_AVX512F_REG_MODE (mode)
10055 || VALID_AVX256_REG_MODE (mode)))
10056 return 0;
10058 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
10059 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
10061 cum->nregs -= int_nregs;
10062 cum->sse_nregs -= sse_nregs;
10063 cum->regno += int_nregs;
10064 cum->sse_regno += sse_nregs;
10065 return int_nregs;
10067 else
10069 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
10070 cum->words = ROUND_UP (cum->words, align);
10071 cum->words += words;
10072 return 0;
10076 static int
10077 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
10078 HOST_WIDE_INT words)
10080 /* Otherwise, this should have been passed indirectly. */
10081 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
10083 cum->words += words;
10084 if (cum->nregs > 0)
10086 cum->nregs -= 1;
10087 cum->regno += 1;
10088 return 1;
10090 return 0;
10093 /* Update the data in CUM to advance over an argument of mode MODE and
10094 data type TYPE. (TYPE is null for libcalls where that information
10095 may not be available.) */
10097 static void
10098 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
10099 const_tree type, bool named)
10101 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10102 HOST_WIDE_INT bytes, words;
10103 int nregs;
10105 /* The argument of an interrupt handler is a special case and is
10106 handled in ix86_function_arg. */
10107 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10108 return;
10110 if (mode == BLKmode)
10111 bytes = int_size_in_bytes (type);
10112 else
10113 bytes = GET_MODE_SIZE (mode);
10114 words = CEIL (bytes, UNITS_PER_WORD);
10116 if (type)
10117 mode = type_natural_mode (type, NULL, false);
10119 if ((type && POINTER_BOUNDS_TYPE_P (type))
10120 || POINTER_BOUNDS_MODE_P (mode))
10122 /* If we pass bounds in BT then just update the remaining bounds count. */
10123 if (cum->bnds_in_bt)
10125 cum->bnds_in_bt--;
10126 return;
10129 /* Update the remaining number of bounds to force. */
10130 if (cum->force_bnd_pass)
10131 cum->force_bnd_pass--;
10133 cum->bnd_regno++;
10135 return;
10138 /* The first arg not going to Bounds Tables resets this counter. */
10139 cum->bnds_in_bt = 0;
10140 /* For unnamed args we always pass bounds to avoid a bounds mess when
10141 the passed and received types do not match. If bounds do not follow an
10142 unnamed arg, still pretend the required number of bounds were passed. */
10143 if (cum->force_bnd_pass)
10145 cum->bnd_regno += cum->force_bnd_pass;
10146 cum->force_bnd_pass = 0;
10149 if (TARGET_64BIT)
10151 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10153 if (call_abi == MS_ABI)
10154 nregs = function_arg_advance_ms_64 (cum, bytes, words);
10155 else
10156 nregs = function_arg_advance_64 (cum, mode, type, words, named);
10158 else
10159 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
10161 /* For stdarg we expect bounds to be passed for each value passed
10162 in a register. */
10163 if (cum->stdarg)
10164 cum->force_bnd_pass = nregs;
10165 /* For pointers passed in memory we expect bounds to be passed in the
10166 Bounds Table. */
10167 if (!nregs)
10169 /* Track if there are outgoing arguments on stack. */
10170 if (cum->caller)
10171 cfun->machine->outgoing_args_on_stack = true;
10173 cum->bnds_in_bt = chkp_type_bounds_count (type);
10177 /* Define where to put the arguments to a function.
10178 Value is zero to push the argument on the stack,
10179 or a hard register in which to store the argument.
10181 MODE is the argument's machine mode.
10182 TYPE is the data type of the argument (as a tree).
10183 This is null for libcalls where that information may
10184 not be available.
10185 CUM is a variable of type CUMULATIVE_ARGS which gives info about
10186 the preceding args and about the function being called.
10187 NAMED is nonzero if this argument is a named parameter
10188 (otherwise it is an extra parameter matching an ellipsis). */
10190 static rtx
10191 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
10192 machine_mode orig_mode, const_tree type,
10193 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
10195 bool error_p = false;
10197 /* Avoid the AL settings for the Unix64 ABI. */
10198 if (mode == VOIDmode)
10199 return constm1_rtx;
10201 if (TARGET_IAMCU)
10203 /* Intel MCU psABI passes scalars and aggregates no larger than 8
10204 bytes in registers. */
10205 if (!VECTOR_MODE_P (mode) && bytes <= 8)
10206 goto pass_in_reg;
10207 return NULL_RTX;
10210 switch (mode)
10212 default:
10213 break;
10215 case BLKmode:
10216 if (bytes < 0)
10217 break;
10218 /* FALLTHRU */
10219 case DImode:
10220 case SImode:
10221 case HImode:
10222 case QImode:
10223 pass_in_reg:
10224 if (words <= cum->nregs)
10226 int regno = cum->regno;
10228 /* Fastcall allocates the first two DWORD (SImode) or
10229 smaller arguments to ECX and EDX if the argument isn't an
10230 aggregate type. */
10231 if (cum->fastcall)
10233 if (mode == BLKmode
10234 || mode == DImode
10235 || (type && AGGREGATE_TYPE_P (type)))
10236 break;
10238 /* ECX, not EAX, is the first allocated register. */
10239 if (regno == AX_REG)
10240 regno = CX_REG;
10242 return gen_rtx_REG (mode, regno);
10244 break;
10246 case DFmode:
10247 if (cum->float_in_sse == -1)
10248 error_p = true;
10249 if (cum->float_in_sse < 2)
10250 break;
10251 /* FALLTHRU */
10252 case SFmode:
10253 if (cum->float_in_sse == -1)
10254 error_p = true;
10255 if (cum->float_in_sse < 1)
10256 break;
10257 /* FALLTHRU */
10258 case TImode:
10259 /* In 32-bit mode, we pass TImode in xmm registers. */
10260 case V16QImode:
10261 case V8HImode:
10262 case V4SImode:
10263 case V2DImode:
10264 case V4SFmode:
10265 case V2DFmode:
10266 if (!type || !AGGREGATE_TYPE_P (type))
10268 if (cum->sse_nregs)
10269 return gen_reg_or_parallel (mode, orig_mode,
10270 cum->sse_regno + FIRST_SSE_REG);
10272 break;
10274 case OImode:
10275 case XImode:
10276 /* OImode and XImode shouldn't be used directly. */
10277 gcc_unreachable ();
10279 case V64QImode:
10280 case V32HImode:
10281 case V16SImode:
10282 case V8DImode:
10283 case V16SFmode:
10284 case V8DFmode:
10285 case V8SFmode:
10286 case V8SImode:
10287 case V32QImode:
10288 case V16HImode:
10289 case V4DFmode:
10290 case V4DImode:
10291 if (!type || !AGGREGATE_TYPE_P (type))
10293 if (cum->sse_nregs)
10294 return gen_reg_or_parallel (mode, orig_mode,
10295 cum->sse_regno + FIRST_SSE_REG);
10297 break;
10299 case V8QImode:
10300 case V4HImode:
10301 case V2SImode:
10302 case V2SFmode:
10303 case V1TImode:
10304 case V1DImode:
10305 if (!type || !AGGREGATE_TYPE_P (type))
10307 if (cum->mmx_nregs)
10308 return gen_reg_or_parallel (mode, orig_mode,
10309 cum->mmx_regno + FIRST_MMX_REG);
10311 break;
10313 if (error_p)
10315 cum->float_in_sse = 0;
10316 error ("calling %qD with SSE calling convention without "
10317 "SSE/SSE2 enabled", cum->decl);
10318 sorry ("this is a GCC bug that can be worked around by adding "
10319 "attribute used to function called");
10322 return NULL_RTX;
10325 static rtx
10326 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10327 machine_mode orig_mode, const_tree type, bool named)
10329 /* Handle a hidden AL argument containing number of registers
10330 for varargs x86-64 functions. */
10331 if (mode == VOIDmode)
10332 return GEN_INT (cum->maybe_vaarg
10333 ? (cum->sse_nregs < 0
10334 ? X86_64_SSE_REGPARM_MAX
10335 : cum->sse_regno)
10336 : -1);
10338 switch (mode)
10340 default:
10341 break;
10343 case V8SFmode:
10344 case V8SImode:
10345 case V32QImode:
10346 case V16HImode:
10347 case V4DFmode:
10348 case V4DImode:
10349 case V16SFmode:
10350 case V16SImode:
10351 case V64QImode:
10352 case V32HImode:
10353 case V8DFmode:
10354 case V8DImode:
10355 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
10356 if (!named)
10357 return NULL;
10358 break;
10361 return construct_container (mode, orig_mode, type, 0, cum->nregs,
10362 cum->sse_nregs,
10363 &x86_64_int_parameter_registers [cum->regno],
10364 cum->sse_regno);
10367 static rtx
10368 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10369 machine_mode orig_mode, bool named,
10370 HOST_WIDE_INT bytes)
10372 unsigned int regno;
10374 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
10375 We use the value -2 to specify that the current function call is MS_ABI. */
10376 if (mode == VOIDmode)
10377 return GEN_INT (-2);
10379 /* If we've run out of registers, it goes on the stack. */
10380 if (cum->nregs == 0)
10381 return NULL_RTX;
10383 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
10385 /* Only floating point modes are passed in anything but integer regs. */
10386 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
10388 if (named)
10389 regno = cum->regno + FIRST_SSE_REG;
10390 else
10392 rtx t1, t2;
10394 /* Unnamed floating parameters are passed in both the
10395 SSE and integer registers. */
10396 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
10397 t2 = gen_rtx_REG (mode, regno);
10398 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
10399 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
10400 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
10403 /* Handle aggregate types passed in registers. */
10404 if (orig_mode == BLKmode)
10406 if (bytes > 0 && bytes <= 8)
10407 mode = (bytes > 4 ? DImode : SImode);
10408 if (mode == BLKmode)
10409 mode = DImode;
10412 return gen_reg_or_parallel (mode, orig_mode, regno);
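/* Illustrative example (editor's addition, not part of the original source):
   for a variadic MS-ABI callee such as

     void __attribute__((ms_abi)) logmsg (const char *fmt, ...);

   an unnamed double passed in the second argument slot is returned here
   as a PARALLEL naming both %xmm1 and %rdx, matching the Windows x64
   convention that varargs floating values are made available in both the
   SSE and the integer register.  */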
10415 /* Return where to put the arguments to a function.
10416 Return zero to push the argument on the stack, or a hard register in which to store the argument.
10418 MODE is the argument's machine mode. TYPE is the data type of the
10419 argument. It is null for libcalls where that information may not be
10420 available. CUM gives information about the preceding args and about
10421 the function being called. NAMED is nonzero if this argument is a
10422 named parameter (otherwise it is an extra parameter matching an
10423 ellipsis). */
10425 static rtx
10426 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
10427 const_tree type, bool named)
10429 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10430 machine_mode mode = omode;
10431 HOST_WIDE_INT bytes, words;
10432 rtx arg;
10434 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10436 gcc_assert (type != NULL_TREE);
10437 if (POINTER_TYPE_P (type))
10439 /* This is the pointer argument. */
10440 gcc_assert (TYPE_MODE (type) == Pmode);
10441 /* It is at -WORD(AP) in the current frame in interrupt and
10442 exception handlers. */
10443 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
10445 else
10447 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
10448 && TREE_CODE (type) == INTEGER_TYPE
10449 && TYPE_MODE (type) == word_mode);
10450 /* The error code is the word-mode integer argument at
10451 -2 * WORD(AP) in the current frame of the exception
10452 handler. */
10453 arg = gen_rtx_MEM (word_mode,
10454 plus_constant (Pmode,
10455 arg_pointer_rtx,
10456 -2 * UNITS_PER_WORD));
10458 return arg;
10461 /* All pointer bounds arguments are handled separately here. */
10462 if ((type && POINTER_BOUNDS_TYPE_P (type))
10463 || POINTER_BOUNDS_MODE_P (mode))
10465 /* Return NULL if bounds are forced to go in Bounds Table. */
10466 if (cum->bnds_in_bt)
10467 arg = NULL;
10468 /* Return the next available bound reg if any. */
10469 else if (cum->bnd_regno <= LAST_BND_REG)
10470 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
10471 /* Return the next special slot number otherwise. */
10472 else
10473 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
10475 return arg;
10478 if (mode == BLKmode)
10479 bytes = int_size_in_bytes (type);
10480 else
10481 bytes = GET_MODE_SIZE (mode);
10482 words = CEIL (bytes, UNITS_PER_WORD);
10484 /* To simplify the code below, represent vector types with a vector mode
10485 even if MMX/SSE are not active. */
10486 if (type && TREE_CODE (type) == VECTOR_TYPE)
10487 mode = type_natural_mode (type, cum, false);
10489 if (TARGET_64BIT)
10491 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10493 if (call_abi == MS_ABI)
10494 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
10495 else
10496 arg = function_arg_64 (cum, mode, omode, type, named);
10498 else
10499 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
10501 /* Track if there are outgoing arguments on stack. */
10502 if (arg == NULL_RTX && cum->caller)
10503 cfun->machine->outgoing_args_on_stack = true;
10505 return arg;
10508 /* A C expression that indicates when an argument must be passed by
10509 reference. If nonzero for an argument, a copy of that argument is
10510 made in memory and a pointer to the argument is passed instead of
10511 the argument itself. The pointer is passed in whatever way is
10512 appropriate for passing a pointer to that type. */
10514 static bool
10515 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
10516 const_tree type, bool)
10518 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10520 /* Bounds are never passed by reference. */
10521 if ((type && POINTER_BOUNDS_TYPE_P (type))
10522 || POINTER_BOUNDS_MODE_P (mode))
10523 return false;
10525 if (TARGET_64BIT)
10527 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10529 /* See Windows x64 Software Convention. */
10530 if (call_abi == MS_ABI)
10532 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
10534 if (type)
10536 /* Arrays are passed by reference. */
10537 if (TREE_CODE (type) == ARRAY_TYPE)
10538 return true;
10540 if (RECORD_OR_UNION_TYPE_P (type))
10542 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
10543 are passed by reference. */
10544 msize = int_size_in_bytes (type);
10548 /* __m128 is passed by reference. */
10549 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
10551 else if (type && int_size_in_bytes (type) == -1)
10552 return true;
10555 return false;
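/* A rough source-level illustration of the MS_ABI rule above (the type and
   function names are made up for the example):

     struct by_value { int a, b; };      // 8 bytes
     struct by_ref   { int a, b, c; };   // 12 bytes

     void callee (struct by_value x, struct by_ref y);

   X, being exactly 8 bytes, travels by value in a register, while for Y the
   caller makes a copy in memory and passes its address, exactly as __m128
   arguments are handled.  */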
10558 /* Return true when TYPE should be 128bit aligned for 32bit argument
10559 passing ABI. XXX: This function is obsolete and is only used for
10560 checking psABI compatibility with previous versions of GCC. */
10562 static bool
10563 ix86_compat_aligned_value_p (const_tree type)
10565 machine_mode mode = TYPE_MODE (type);
10566 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10567 || mode == TDmode
10568 || mode == TFmode
10569 || mode == TCmode)
10570 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10571 return true;
10572 if (TYPE_ALIGN (type) < 128)
10573 return false;
10575 if (AGGREGATE_TYPE_P (type))
10577 /* Walk the aggregates recursively. */
10578 switch (TREE_CODE (type))
10580 case RECORD_TYPE:
10581 case UNION_TYPE:
10582 case QUAL_UNION_TYPE:
10584 tree field;
10586 /* Walk all the structure fields. */
10587 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10589 if (TREE_CODE (field) == FIELD_DECL
10590 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10591 return true;
10593 break;
10596 case ARRAY_TYPE:
10597 /* Just in case some languages pass arrays by value. */
10598 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10599 return true;
10600 break;
10602 default:
10603 gcc_unreachable ();
10606 return false;
10609 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10610 XXX: This function is obsolete and is only used for checking psABI
10611 compatibility with previous versions of GCC. */
10613 static unsigned int
10614 ix86_compat_function_arg_boundary (machine_mode mode,
10615 const_tree type, unsigned int align)
10617 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10618 natural boundaries. */
10619 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10621 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10622 make an exception for SSE modes since these require 128bit
10623 alignment.
10625 The handling here differs from field_alignment. ICC aligns MMX
10626 arguments to 4 byte boundaries, while structure fields are aligned
10627 to 8 byte boundaries. */
10628 if (!type)
10630 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10631 align = PARM_BOUNDARY;
10633 else
10635 if (!ix86_compat_aligned_value_p (type))
10636 align = PARM_BOUNDARY;
10639 if (align > BIGGEST_ALIGNMENT)
10640 align = BIGGEST_ALIGNMENT;
10641 return align;
10644 /* Return true when TYPE should be 128bit aligned for 32bit argument
10645 passing ABI. */
10647 static bool
10648 ix86_contains_aligned_value_p (const_tree type)
10650 machine_mode mode = TYPE_MODE (type);
10652 if (mode == XFmode || mode == XCmode)
10653 return false;
10655 if (TYPE_ALIGN (type) < 128)
10656 return false;
10658 if (AGGREGATE_TYPE_P (type))
10660 /* Walk the aggregates recursively. */
10661 switch (TREE_CODE (type))
10663 case RECORD_TYPE:
10664 case UNION_TYPE:
10665 case QUAL_UNION_TYPE:
10667 tree field;
10669 /* Walk all the structure fields. */
10670 for (field = TYPE_FIELDS (type);
10671 field;
10672 field = DECL_CHAIN (field))
10674 if (TREE_CODE (field) == FIELD_DECL
10675 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10676 return true;
10678 break;
10681 case ARRAY_TYPE:
10682 /* Just in case some languages pass arrays by value. */
10683 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10684 return true;
10685 break;
10687 default:
10688 gcc_unreachable ();
10691 else
10692 return TYPE_ALIGN (type) >= 128;
10694 return false;
10697 /* Gives the alignment boundary, in bits, of an argument with the
10698 specified mode and type. */
10700 static unsigned int
10701 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10703 unsigned int align;
10704 if (type)
10706 /* Since the main variant type is used for the call, convert
10707 TYPE to its main variant. */
10708 type = TYPE_MAIN_VARIANT (type);
10709 align = TYPE_ALIGN (type);
10711 else
10712 align = GET_MODE_ALIGNMENT (mode);
10713 if (align < PARM_BOUNDARY)
10714 align = PARM_BOUNDARY;
10715 else
10717 static bool warned;
10718 unsigned int saved_align = align;
10720 if (!TARGET_64BIT)
10722 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10723 if (!type)
10725 if (mode == XFmode || mode == XCmode)
10726 align = PARM_BOUNDARY;
10728 else if (!ix86_contains_aligned_value_p (type))
10729 align = PARM_BOUNDARY;
10731 if (align < 128)
10732 align = PARM_BOUNDARY;
10735 if (warn_psabi
10736 && !warned
10737 && align != ix86_compat_function_arg_boundary (mode, type,
10738 saved_align))
10740 warned = true;
10741 inform (input_location,
10742 "The ABI for passing parameters with %d-byte"
10743 " alignment has changed in GCC 4.6",
10744 align / BITS_PER_UNIT);
10748 return align;
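/* A worked example of the 32-bit rules above, assuming SSE is enabled
   (the function name is made up):

     #include <xmmintrin.h>
     void f (int a, __m128 b);

   A is passed at the default 4-byte (PARM_BOUNDARY) alignment, while B,
   whose type carries 128-bit alignment, is placed on a 16-byte boundary.
   The inform call above fires only when this result differs from what the
   pre-4.6 code (ix86_compat_function_arg_boundary) would have chosen.  */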
10751 /* Return true if N is a possible register number of function value. */
10753 static bool
10754 ix86_function_value_regno_p (const unsigned int regno)
10756 switch (regno)
10758 case AX_REG:
10759 return true;
10760 case DX_REG:
10761 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10762 case DI_REG:
10763 case SI_REG:
10764 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10766 case BND0_REG:
10767 case BND1_REG:
10768 return chkp_function_instrumented_p (current_function_decl);
10770 /* Complex values are returned in %st(0)/%st(1) pair. */
10771 case ST0_REG:
10772 case ST1_REG:
10773 /* TODO: The function should depend on current function ABI but
10774 builtins.c would need updating then. Therefore we use the
10775 default ABI. */
10776 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10777 return false;
10778 return TARGET_FLOAT_RETURNS_IN_80387;
10780 /* Complex values are returned in %xmm0/%xmm1 pair. */
10781 case XMM0_REG:
10782 case XMM1_REG:
10783 return TARGET_SSE;
10785 case MM0_REG:
10786 if (TARGET_MACHO || TARGET_64BIT)
10787 return false;
10788 return TARGET_MMX;
10791 return false;
10794 /* Define how to find the value returned by a function.
10795 VALTYPE is the data type of the value (as a tree).
10796 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10797 otherwise, FUNC is 0. */
10799 static rtx
10800 function_value_32 (machine_mode orig_mode, machine_mode mode,
10801 const_tree fntype, const_tree fn)
10803 unsigned int regno;
10805 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10806 we normally prevent this case when mmx is not available. However
10807 some ABIs may require the result to be returned like DImode. */
10808 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10809 regno = FIRST_MMX_REG;
10811 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10812 we prevent this case when sse is not available. However some ABIs
10813 may require the result to be returned like integer TImode. */
10814 else if (mode == TImode
10815 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10816 regno = FIRST_SSE_REG;
10818 /* 32-byte vector modes in %ymm0. */
10819 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10820 regno = FIRST_SSE_REG;
10822 /* 64-byte vector modes in %zmm0. */
10823 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10824 regno = FIRST_SSE_REG;
10826 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10827 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10828 regno = FIRST_FLOAT_REG;
10829 else
10830 /* Most things go in %eax. */
10831 regno = AX_REG;
10833 /* Override FP return register with %xmm0 for local functions when
10834 SSE math is enabled or for functions with sseregparm attribute. */
10835 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10837 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10838 if (sse_level == -1)
10840 error ("calling %qD with SSE calling convention without "
10841 "SSE/SSE2 enabled", fn);
10842 sorry ("this is a GCC bug that can be worked around by adding "
10843 "attribute used to function called");
10845 else if ((sse_level >= 1 && mode == SFmode)
10846 || (sse_level == 2 && mode == DFmode))
10847 regno = FIRST_SSE_REG;
10850 /* OImode shouldn't be used directly. */
10851 gcc_assert (mode != OImode);
10853 return gen_rtx_REG (orig_mode, regno);
10856 static rtx
10857 function_value_64 (machine_mode orig_mode, machine_mode mode,
10858 const_tree valtype)
10860 rtx ret;
10862 /* Handle libcalls, which don't provide a type node. */
10863 if (valtype == NULL)
10865 unsigned int regno;
10867 switch (mode)
10869 case SFmode:
10870 case SCmode:
10871 case DFmode:
10872 case DCmode:
10873 case TFmode:
10874 case SDmode:
10875 case DDmode:
10876 case TDmode:
10877 regno = FIRST_SSE_REG;
10878 break;
10879 case XFmode:
10880 case XCmode:
10881 regno = FIRST_FLOAT_REG;
10882 break;
10883 case TCmode:
10884 return NULL;
10885 default:
10886 regno = AX_REG;
10889 return gen_rtx_REG (mode, regno);
10891 else if (POINTER_TYPE_P (valtype))
10893 /* Pointers are always returned in word_mode. */
10894 mode = word_mode;
10897 ret = construct_container (mode, orig_mode, valtype, 1,
10898 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10899 x86_64_int_return_registers, 0);
10901 /* For zero-sized structures, construct_container returns NULL, but we
10902 need to keep the rest of the compiler happy by returning a meaningful value. */
10903 if (!ret)
10904 ret = gen_rtx_REG (orig_mode, AX_REG);
10906 return ret;
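/* A rough sketch of what the SysV classification above produces for a few
   common cases (illustrative declarations, not an exhaustive statement of
   the psABI):

     struct two   { long a, b; };      // 16 bytes: returned in RAX:RDX
     struct three { long a, b, c; };   // 24 bytes: returned in memory
     double      f (void);             // returned in XMM0
     long double g (void);             // returned in ST(0)

   Aggregates too large for two registers are flagged by
   ix86_return_in_memory and come back through a hidden pointer supplied
   by the caller.  */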
10909 static rtx
10910 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10911 const_tree valtype)
10913 unsigned int regno = AX_REG;
10915 if (TARGET_SSE)
10917 switch (GET_MODE_SIZE (mode))
10919 case 16:
10920 if (valtype != NULL_TREE
10921 && !VECTOR_INTEGER_TYPE_P (valtype)
10923 && !INTEGRAL_TYPE_P (valtype)
10924 && !VECTOR_FLOAT_TYPE_P (valtype))
10925 break;
10926 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10927 && !COMPLEX_MODE_P (mode))
10928 regno = FIRST_SSE_REG;
10929 break;
10930 case 8:
10931 case 4:
10932 if (mode == SFmode || mode == DFmode)
10933 regno = FIRST_SSE_REG;
10934 break;
10935 default:
10936 break;
10939 return gen_rtx_REG (orig_mode, regno);
10942 static rtx
10943 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10944 machine_mode orig_mode, machine_mode mode)
10946 const_tree fn, fntype;
10948 fn = NULL_TREE;
10949 if (fntype_or_decl && DECL_P (fntype_or_decl))
10950 fn = fntype_or_decl;
10951 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
10953 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
10954 || POINTER_BOUNDS_MODE_P (mode))
10955 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
10956 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
10957 return function_value_ms_64 (orig_mode, mode, valtype);
10958 else if (TARGET_64BIT)
10959 return function_value_64 (orig_mode, mode, valtype);
10960 else
10961 return function_value_32 (orig_mode, mode, fntype, fn);
10964 static rtx
10965 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
10967 machine_mode mode, orig_mode;
10969 orig_mode = TYPE_MODE (valtype);
10970 mode = type_natural_mode (valtype, NULL, true);
10971 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
10974 /* Return an RTX representing a place where a function returns
10975 or receives pointer bounds, or NULL if no bounds are returned.
10977 VALTYPE is a data type of a value returned by the function.
10979 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
10980 or FUNCTION_TYPE of the function.
10982 If OUTGOING is false, return a place in which the caller will
10983 see the return value. Otherwise, return a place where a
10984 function returns a value. */
10986 static rtx
10987 ix86_function_value_bounds (const_tree valtype,
10988 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
10989 bool outgoing ATTRIBUTE_UNUSED)
10991 rtx res = NULL_RTX;
10993 if (BOUNDED_TYPE_P (valtype))
10994 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
10995 else if (chkp_type_has_pointer (valtype))
10997 bitmap slots;
10998 rtx bounds[2];
10999 bitmap_iterator bi;
11000 unsigned i, bnd_no = 0;
11002 bitmap_obstack_initialize (NULL);
11003 slots = BITMAP_ALLOC (NULL);
11004 chkp_find_bound_slots (valtype, slots);
11006 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
11008 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
11009 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
11010 gcc_assert (bnd_no < 2);
11011 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
11014 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
11016 BITMAP_FREE (slots);
11017 bitmap_obstack_release (NULL);
11019 else
11020 res = NULL_RTX;
11022 return res;
11025 /* Pointer function arguments and return values are promoted to
11026 word_mode for normal functions. */
11028 static machine_mode
11029 ix86_promote_function_mode (const_tree type, machine_mode mode,
11030 int *punsignedp, const_tree fntype,
11031 int for_return)
11033 if (cfun->machine->func_type == TYPE_NORMAL
11034 && type != NULL_TREE
11035 && POINTER_TYPE_P (type))
11037 *punsignedp = POINTERS_EXTEND_UNSIGNED;
11038 return word_mode;
11040 return default_promote_function_mode (type, mode, punsignedp, fntype,
11041 for_return);
11044 /* Return true if a structure, union or array with MODE containing FIELD
11045 should be accessed using BLKmode. */
11047 static bool
11048 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
11050 /* Union with XFmode must be in BLKmode. */
11051 return (mode == XFmode
11052 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
11053 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
11057 ix86_libcall_value (machine_mode mode)
11059 return ix86_function_value_1 (NULL, NULL, mode, mode);
11062 /* Return true iff type is returned in memory. */
11064 static bool
11065 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
11067 #ifdef SUBTARGET_RETURN_IN_MEMORY
11068 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
11069 #else
11070 const machine_mode mode = type_natural_mode (type, NULL, true);
11071 HOST_WIDE_INT size;
11073 if (POINTER_BOUNDS_TYPE_P (type))
11074 return false;
11076 if (TARGET_64BIT)
11078 if (ix86_function_type_abi (fntype) == MS_ABI)
11080 size = int_size_in_bytes (type);
11082 /* __m128 is returned in xmm0. */
11083 if ((!type || VECTOR_INTEGER_TYPE_P (type)
11084 || INTEGRAL_TYPE_P (type)
11085 || VECTOR_FLOAT_TYPE_P (type))
11086 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
11087 && !COMPLEX_MODE_P (mode)
11088 && (GET_MODE_SIZE (mode) == 16 || size == 16))
11089 return false;
11091 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
11092 return size != 1 && size != 2 && size != 4 && size != 8;
11094 else
11096 int needed_intregs, needed_sseregs;
11098 return examine_argument (mode, type, 1,
11099 &needed_intregs, &needed_sseregs);
11102 else
11104 size = int_size_in_bytes (type);
11106 /* Intel MCU psABI returns scalars and aggregates no larger than 8
11107 bytes in registers. */
11108 if (TARGET_IAMCU)
11109 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
11111 if (mode == BLKmode)
11112 return true;
11114 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
11115 return false;
11117 if (VECTOR_MODE_P (mode) || mode == TImode)
11119 /* User-created vectors small enough to fit in EAX. */
11120 if (size < 8)
11121 return false;
11123 /* Unless the ABI prescribes otherwise,
11124 MMX/3dNow values are returned in MM0 if available. */
11126 if (size == 8)
11127 return TARGET_VECT8_RETURNS || !TARGET_MMX;
11129 /* SSE values are returned in XMM0 if available. */
11130 if (size == 16)
11131 return !TARGET_SSE;
11133 /* AVX values are returned in YMM0 if available. */
11134 if (size == 32)
11135 return !TARGET_AVX;
11137 /* AVX512F values are returned in ZMM0 if available. */
11138 if (size == 64)
11139 return !TARGET_AVX512F;
11142 if (mode == XFmode)
11143 return false;
11145 if (size > 12)
11146 return true;
11148 /* OImode shouldn't be used directly. */
11149 gcc_assert (mode != OImode);
11151 return false;
11153 #endif
11157 /* Create the va_list data type. */
11159 static tree
11160 ix86_build_builtin_va_list_64 (void)
11162 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
11164 record = lang_hooks.types.make_type (RECORD_TYPE);
11165 type_decl = build_decl (BUILTINS_LOCATION,
11166 TYPE_DECL, get_identifier ("__va_list_tag"), record);
11168 f_gpr = build_decl (BUILTINS_LOCATION,
11169 FIELD_DECL, get_identifier ("gp_offset"),
11170 unsigned_type_node);
11171 f_fpr = build_decl (BUILTINS_LOCATION,
11172 FIELD_DECL, get_identifier ("fp_offset"),
11173 unsigned_type_node);
11174 f_ovf = build_decl (BUILTINS_LOCATION,
11175 FIELD_DECL, get_identifier ("overflow_arg_area"),
11176 ptr_type_node);
11177 f_sav = build_decl (BUILTINS_LOCATION,
11178 FIELD_DECL, get_identifier ("reg_save_area"),
11179 ptr_type_node);
11181 va_list_gpr_counter_field = f_gpr;
11182 va_list_fpr_counter_field = f_fpr;
11184 DECL_FIELD_CONTEXT (f_gpr) = record;
11185 DECL_FIELD_CONTEXT (f_fpr) = record;
11186 DECL_FIELD_CONTEXT (f_ovf) = record;
11187 DECL_FIELD_CONTEXT (f_sav) = record;
11189 TYPE_STUB_DECL (record) = type_decl;
11190 TYPE_NAME (record) = type_decl;
11191 TYPE_FIELDS (record) = f_gpr;
11192 DECL_CHAIN (f_gpr) = f_fpr;
11193 DECL_CHAIN (f_fpr) = f_ovf;
11194 DECL_CHAIN (f_ovf) = f_sav;
11196 layout_type (record);
11198 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
11199 NULL_TREE, TYPE_ATTRIBUTES (record));
11201 /* The correct type is an array type of one element. */
11202 return build_array_type (record, build_index_type (size_zero_node));
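/* As a sketch, the record built above corresponds to the familiar SysV
   psABI va_list (field names as created here; the exact typedef belongs
   to the ABI, not to any particular header):

     typedef struct {
       unsigned int gp_offset;        // 0 .. 48, advanced in steps of 8
       unsigned int fp_offset;        // 48 .. 176, advanced in steps of 16
       void *overflow_arg_area;       // next argument passed on the stack
       void *reg_save_area;           // base of the register save area
     } __va_list_tag;

     typedef __va_list_tag va_list[1];

   which is why the function returns an array type of one element.  */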
11205 /* Set up the builtin va_list data type and, for 64-bit, the additional
11206 calling-convention-specific va_list data types. */
11208 static tree
11209 ix86_build_builtin_va_list (void)
11211 if (TARGET_64BIT)
11213 /* Initialize ABI specific va_list builtin types.
11215 In lto1, we can encounter two va_list types:
11216 - one as a result of the type-merge across TUs, and
11217 - the one constructed here.
11218 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
11219 a type identity check in canonical_va_list_type based on
11220 TYPE_MAIN_VARIANT (which we used to have) will not work.
11221 Instead, we tag each va_list_type_node with its unique attribute, and
11222 look for the attribute in the type identity check in
11223 canonical_va_list_type.
11225 Tagging sysv_va_list_type_node directly with the attribute is
11226 problematic since it's an array of one record, which will decay into a
11227 pointer to record when used as parameter (see build_va_arg comments for
11228 an example), dropping the attribute in the process. So we tag the
11229 record instead. */
11231 /* For SYSV_ABI we use an array of one record. */
11232 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
11234 /* For MS_ABI we use plain pointer to argument area. */
11235 tree char_ptr_type = build_pointer_type (char_type_node);
11236 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
11237 TYPE_ATTRIBUTES (char_ptr_type));
11238 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
11240 return ((ix86_abi == MS_ABI)
11241 ? ms_va_list_type_node
11242 : sysv_va_list_type_node);
11244 else
11246 /* For i386 we use plain pointer to argument area. */
11247 return build_pointer_type (char_type_node);
11251 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
11253 static void
11254 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
11256 rtx save_area, mem;
11257 alias_set_type set;
11258 int i, max;
11260 /* GPR size of varargs save area. */
11261 if (cfun->va_list_gpr_size)
11262 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
11263 else
11264 ix86_varargs_gpr_size = 0;
11266 /* FPR size of varargs save area. We don't need it if we don't pass
11267 anything in SSE registers. */
11268 if (TARGET_SSE && cfun->va_list_fpr_size)
11269 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
11270 else
11271 ix86_varargs_fpr_size = 0;
11273 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
11274 return;
11276 save_area = frame_pointer_rtx;
11277 set = get_varargs_alias_set ();
11279 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11280 if (max > X86_64_REGPARM_MAX)
11281 max = X86_64_REGPARM_MAX;
11283 for (i = cum->regno; i < max; i++)
11285 mem = gen_rtx_MEM (word_mode,
11286 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
11287 MEM_NOTRAP_P (mem) = 1;
11288 set_mem_alias_set (mem, set);
11289 emit_move_insn (mem,
11290 gen_rtx_REG (word_mode,
11291 x86_64_int_parameter_registers[i]));
11294 if (ix86_varargs_fpr_size)
11296 machine_mode smode;
11297 rtx_code_label *label;
11298 rtx test;
11300 /* Now emit code to save SSE registers. The AX parameter contains number
11301 of SSE parameter registers used to call this function, though all we
11302 actually check here is the zero/non-zero status. */
11304 label = gen_label_rtx ();
11305 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
11306 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
11307 label));
11309 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
11310 we used movdqa (i.e. TImode) instead? Perhaps even better would
11311 be if we could determine the real mode of the data, via a hook
11312 into pass_stdarg. Ignore all that for now. */
11313 smode = V4SFmode;
11314 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
11315 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
11317 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
11318 if (max > X86_64_SSE_REGPARM_MAX)
11319 max = X86_64_SSE_REGPARM_MAX;
11321 for (i = cum->sse_regno; i < max; ++i)
11323 mem = plus_constant (Pmode, save_area,
11324 i * 16 + ix86_varargs_gpr_size);
11325 mem = gen_rtx_MEM (smode, mem);
11326 MEM_NOTRAP_P (mem) = 1;
11327 set_mem_alias_set (mem, set);
11328 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
11330 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
11333 emit_label (label);
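/* The resulting register save area looks roughly like this (offsets are
   relative to the reg_save_area pointer later stored by va_start):

       0 ..  47    rdi, rsi, rdx, rcx, r8, r9   (8 bytes each)
      48 .. 175    xmm0 .. xmm7                 (16 bytes each)

   The SSE half is skipped at run time when AL is zero; per the psABI the
   caller of a varargs function sets AL to (an upper bound on) the number of
   vector registers used, e.g. printf ("%f\n", x) is called with AL = 1.
   That is what the cbranch on AX_REG above tests.  */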
11337 static void
11338 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
11340 alias_set_type set = get_varargs_alias_set ();
11341 int i;
11343 /* Reset to zero, as a SysV va_arg may have been used
11344 before. */
11345 ix86_varargs_gpr_size = 0;
11346 ix86_varargs_fpr_size = 0;
11348 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
11350 rtx reg, mem;
11352 mem = gen_rtx_MEM (Pmode,
11353 plus_constant (Pmode, virtual_incoming_args_rtx,
11354 i * UNITS_PER_WORD));
11355 MEM_NOTRAP_P (mem) = 1;
11356 set_mem_alias_set (mem, set);
11358 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
11359 emit_move_insn (mem, reg);
11363 static void
11364 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
11365 tree type, int *, int no_rtl)
11367 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11368 CUMULATIVE_ARGS next_cum;
11369 tree fntype;
11371 /* This argument doesn't appear to be used anymore, which is good,
11372 because the old code here didn't suppress rtl generation. */
11373 gcc_assert (!no_rtl);
11375 if (!TARGET_64BIT)
11376 return;
11378 fntype = TREE_TYPE (current_function_decl);
11380 /* For varargs, we do not want to skip the dummy va_dcl argument.
11381 For stdargs, we do want to skip the last named argument. */
11382 next_cum = *cum;
11383 if (stdarg_p (fntype))
11384 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11385 true);
11387 if (cum->call_abi == MS_ABI)
11388 setup_incoming_varargs_ms_64 (&next_cum);
11389 else
11390 setup_incoming_varargs_64 (&next_cum);
11393 static void
11394 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
11395 machine_mode mode,
11396 tree type,
11397 int *pretend_size ATTRIBUTE_UNUSED,
11398 int no_rtl)
11400 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11401 CUMULATIVE_ARGS next_cum;
11402 tree fntype;
11403 rtx save_area;
11404 int bnd_reg, i, max;
11406 gcc_assert (!no_rtl);
11408 /* Do nothing if we use plain pointer to argument area. */
11409 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
11410 return;
11412 fntype = TREE_TYPE (current_function_decl);
11414 /* For varargs, we do not want to skip the dummy va_dcl argument.
11415 For stdargs, we do want to skip the last named argument. */
11416 next_cum = *cum;
11417 if (stdarg_p (fntype))
11418 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11419 true);
11420 save_area = frame_pointer_rtx;
11422 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11423 if (max > X86_64_REGPARM_MAX)
11424 max = X86_64_REGPARM_MAX;
11426 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
11427 if (chkp_function_instrumented_p (current_function_decl))
11428 for (i = cum->regno; i < max; i++)
11430 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
11431 rtx ptr = gen_rtx_REG (Pmode,
11432 x86_64_int_parameter_registers[i]);
11433 rtx bounds;
11435 if (bnd_reg <= LAST_BND_REG)
11436 bounds = gen_rtx_REG (BNDmode, bnd_reg);
11437 else
11439 rtx ldx_addr =
11440 plus_constant (Pmode, arg_pointer_rtx,
11441 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
11442 bounds = gen_reg_rtx (BNDmode);
11443 emit_insn (BNDmode == BND64mode
11444 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
11445 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
11448 emit_insn (BNDmode == BND64mode
11449 ? gen_bnd64_stx (addr, ptr, bounds)
11450 : gen_bnd32_stx (addr, ptr, bounds));
11452 bnd_reg++;
11457 /* Return true if TYPE is a va_list that is represented as a plain char pointer. */
11459 static bool
11460 is_va_list_char_pointer (tree type)
11462 tree canonic;
11464 /* For 32-bit it is always true. */
11465 if (!TARGET_64BIT)
11466 return true;
11467 canonic = ix86_canonical_va_list_type (type);
11468 return (canonic == ms_va_list_type_node
11469 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
11472 /* Implement va_start. */
11474 static void
11475 ix86_va_start (tree valist, rtx nextarg)
11477 HOST_WIDE_INT words, n_gpr, n_fpr;
11478 tree f_gpr, f_fpr, f_ovf, f_sav;
11479 tree gpr, fpr, ovf, sav, t;
11480 tree type;
11481 rtx ovf_rtx;
11483 if (flag_split_stack
11484 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11486 unsigned int scratch_regno;
11488 /* When we are splitting the stack, we can't refer to the stack
11489 arguments using internal_arg_pointer, because they may be on
11490 the old stack. The split stack prologue will arrange to
11491 leave a pointer to the old stack arguments in a scratch
11492 register, which we here copy to a pseudo-register. The split
11493 stack prologue can't set the pseudo-register directly because
11494 it (the prologue) runs before any registers have been saved. */
11496 scratch_regno = split_stack_prologue_scratch_regno ();
11497 if (scratch_regno != INVALID_REGNUM)
11499 rtx reg;
11500 rtx_insn *seq;
11502 reg = gen_reg_rtx (Pmode);
11503 cfun->machine->split_stack_varargs_pointer = reg;
11505 start_sequence ();
11506 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
11507 seq = get_insns ();
11508 end_sequence ();
11510 push_topmost_sequence ();
11511 emit_insn_after (seq, entry_of_function ());
11512 pop_topmost_sequence ();
11516 /* Only 64-bit targets need something special. */
11517 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11519 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11520 std_expand_builtin_va_start (valist, nextarg);
11521 else
11523 rtx va_r, next;
11525 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
11526 next = expand_binop (ptr_mode, add_optab,
11527 cfun->machine->split_stack_varargs_pointer,
11528 crtl->args.arg_offset_rtx,
11529 NULL_RTX, 0, OPTAB_LIB_WIDEN);
11530 convert_move (va_r, next, 0);
11532 /* Store zero bounds for va_list. */
11533 if (chkp_function_instrumented_p (current_function_decl))
11534 chkp_expand_bounds_reset_for_mem (valist,
11535 make_tree (TREE_TYPE (valist),
11536 next));
11539 return;
11542 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11543 f_fpr = DECL_CHAIN (f_gpr);
11544 f_ovf = DECL_CHAIN (f_fpr);
11545 f_sav = DECL_CHAIN (f_ovf);
11547 valist = build_simple_mem_ref (valist);
11548 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
11549 /* The following should be folded into the MEM_REF offset. */
11550 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11551 f_gpr, NULL_TREE);
11552 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11553 f_fpr, NULL_TREE);
11554 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11555 f_ovf, NULL_TREE);
11556 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11557 f_sav, NULL_TREE);
11559 /* Count number of gp and fp argument registers used. */
11560 words = crtl->args.info.words;
11561 n_gpr = crtl->args.info.regno;
11562 n_fpr = crtl->args.info.sse_regno;
11564 if (cfun->va_list_gpr_size)
11566 type = TREE_TYPE (gpr);
11567 t = build2 (MODIFY_EXPR, type,
11568 gpr, build_int_cst (type, n_gpr * 8));
11569 TREE_SIDE_EFFECTS (t) = 1;
11570 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11573 if (TARGET_SSE && cfun->va_list_fpr_size)
11575 type = TREE_TYPE (fpr);
11576 t = build2 (MODIFY_EXPR, type, fpr,
11577 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11578 TREE_SIDE_EFFECTS (t) = 1;
11579 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11582 /* Find the overflow area. */
11583 type = TREE_TYPE (ovf);
11584 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11585 ovf_rtx = crtl->args.internal_arg_pointer;
11586 else
11587 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11588 t = make_tree (type, ovf_rtx);
11589 if (words != 0)
11590 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11592 /* Store zero bounds for overflow area pointer. */
11593 if (chkp_function_instrumented_p (current_function_decl))
11594 chkp_expand_bounds_reset_for_mem (ovf, t);
11596 t = build2 (MODIFY_EXPR, type, ovf, t);
11597 TREE_SIDE_EFFECTS (t) = 1;
11598 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11600 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11602 /* Find the register save area.
11603 The function prologue saves it right above the stack frame. */
11604 type = TREE_TYPE (sav);
11605 t = make_tree (type, frame_pointer_rtx);
11606 if (!ix86_varargs_gpr_size)
11607 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11609 /* Store zero bounds for save area pointer. */
11610 if (chkp_function_instrumented_p (current_function_decl))
11611 chkp_expand_bounds_reset_for_mem (sav, t);
11613 t = build2 (MODIFY_EXPR, type, sav, t);
11614 TREE_SIDE_EFFECTS (t) = 1;
11615 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
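/* A worked example of the assignments above: for a function declared as

     void f (int a, double b, ...);

   the named arguments consume one integer and one SSE register, so
   va_start leaves

     gp_offset         = 1 * 8           =  8
     fp_offset         = 1 * 16 + 8 * 6  = 64
     overflow_arg_area = first stack-passed argument (words == 0 here)
     reg_save_area     = base of the area filled by setup_incoming_varargs_64.  */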
11619 /* Implement va_arg. */
11621 static tree
11622 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11623 gimple_seq *post_p)
11625 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11626 tree f_gpr, f_fpr, f_ovf, f_sav;
11627 tree gpr, fpr, ovf, sav, t;
11628 int size, rsize;
11629 tree lab_false, lab_over = NULL_TREE;
11630 tree addr, t2;
11631 rtx container;
11632 int indirect_p = 0;
11633 tree ptrtype;
11634 machine_mode nat_mode;
11635 unsigned int arg_boundary;
11637 /* Only 64-bit targets need something special. */
11638 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11639 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11641 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11642 f_fpr = DECL_CHAIN (f_gpr);
11643 f_ovf = DECL_CHAIN (f_fpr);
11644 f_sav = DECL_CHAIN (f_ovf);
11646 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11647 valist, f_gpr, NULL_TREE);
11649 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11650 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11651 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11653 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11654 if (indirect_p)
11655 type = build_pointer_type (type);
11656 size = int_size_in_bytes (type);
11657 rsize = CEIL (size, UNITS_PER_WORD);
11659 nat_mode = type_natural_mode (type, NULL, false);
11660 switch (nat_mode)
11662 case V8SFmode:
11663 case V8SImode:
11664 case V32QImode:
11665 case V16HImode:
11666 case V4DFmode:
11667 case V4DImode:
11668 case V16SFmode:
11669 case V16SImode:
11670 case V64QImode:
11671 case V32HImode:
11672 case V8DFmode:
11673 case V8DImode:
11674 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
11675 if (!TARGET_64BIT_MS_ABI)
11677 container = NULL;
11678 break;
11680 /* FALLTHRU */
11682 default:
11683 container = construct_container (nat_mode, TYPE_MODE (type),
11684 type, 0, X86_64_REGPARM_MAX,
11685 X86_64_SSE_REGPARM_MAX, intreg,
11687 break;
11690 /* Pull the value out of the saved registers. */
11692 addr = create_tmp_var (ptr_type_node, "addr");
11694 if (container)
11696 int needed_intregs, needed_sseregs;
11697 bool need_temp;
11698 tree int_addr, sse_addr;
11700 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11701 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11703 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11705 need_temp = (!REG_P (container)
11706 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11707 || TYPE_ALIGN (type) > 128));
11709 /* If we are passing a structure, verify that it is a consecutive block
11710 in the register save area. If not, we need to do moves. */
11711 if (!need_temp && !REG_P (container))
11713 /* Verify that all registers are strictly consecutive */
11714 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11716 int i;
11718 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11720 rtx slot = XVECEXP (container, 0, i);
11721 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11722 || INTVAL (XEXP (slot, 1)) != i * 16)
11723 need_temp = true;
11726 else
11728 int i;
11730 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11732 rtx slot = XVECEXP (container, 0, i);
11733 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11734 || INTVAL (XEXP (slot, 1)) != i * 8)
11735 need_temp = true;
11739 if (!need_temp)
11741 int_addr = addr;
11742 sse_addr = addr;
11744 else
11746 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11747 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11750 /* First ensure that we fit completely in registers. */
11751 if (needed_intregs)
11753 t = build_int_cst (TREE_TYPE (gpr),
11754 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11755 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11756 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11757 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11758 gimplify_and_add (t, pre_p);
11760 if (needed_sseregs)
11762 t = build_int_cst (TREE_TYPE (fpr),
11763 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11764 + X86_64_REGPARM_MAX * 8);
11765 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11766 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11767 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11768 gimplify_and_add (t, pre_p);
11771 /* Compute index to start of area used for integer regs. */
11772 if (needed_intregs)
11774 /* int_addr = gpr + sav; */
11775 t = fold_build_pointer_plus (sav, gpr);
11776 gimplify_assign (int_addr, t, pre_p);
11778 if (needed_sseregs)
11780 /* sse_addr = fpr + sav; */
11781 t = fold_build_pointer_plus (sav, fpr);
11782 gimplify_assign (sse_addr, t, pre_p);
11784 if (need_temp)
11786 int i, prev_size = 0;
11787 tree temp = create_tmp_var (type, "va_arg_tmp");
11789 /* addr = &temp; */
11790 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11791 gimplify_assign (addr, t, pre_p);
11793 for (i = 0; i < XVECLEN (container, 0); i++)
11795 rtx slot = XVECEXP (container, 0, i);
11796 rtx reg = XEXP (slot, 0);
11797 machine_mode mode = GET_MODE (reg);
11798 tree piece_type;
11799 tree addr_type;
11800 tree daddr_type;
11801 tree src_addr, src;
11802 int src_offset;
11803 tree dest_addr, dest;
11804 int cur_size = GET_MODE_SIZE (mode);
11806 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11807 prev_size = INTVAL (XEXP (slot, 1));
11808 if (prev_size + cur_size > size)
11810 cur_size = size - prev_size;
11811 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11812 if (mode == BLKmode)
11813 mode = QImode;
11815 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11816 if (mode == GET_MODE (reg))
11817 addr_type = build_pointer_type (piece_type);
11818 else
11819 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11820 true);
11821 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11822 true);
11824 if (SSE_REGNO_P (REGNO (reg)))
11826 src_addr = sse_addr;
11827 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11829 else
11831 src_addr = int_addr;
11832 src_offset = REGNO (reg) * 8;
11834 src_addr = fold_convert (addr_type, src_addr);
11835 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11837 dest_addr = fold_convert (daddr_type, addr);
11838 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11839 if (cur_size == GET_MODE_SIZE (mode))
11841 src = build_va_arg_indirect_ref (src_addr);
11842 dest = build_va_arg_indirect_ref (dest_addr);
11844 gimplify_assign (dest, src, pre_p);
11846 else
11848 tree copy
11849 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11850 3, dest_addr, src_addr,
11851 size_int (cur_size));
11852 gimplify_and_add (copy, pre_p);
11854 prev_size += cur_size;
11858 if (needed_intregs)
11860 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11861 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11862 gimplify_assign (gpr, t, pre_p);
11865 if (needed_sseregs)
11867 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11868 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11869 gimplify_assign (unshare_expr (fpr), t, pre_p);
11872 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11874 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11877 /* ... otherwise out of the overflow area. */
11879 /* When we align a parameter on the stack for the caller, if its
11880 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
11881 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee
11882 here with the caller. */
11883 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11884 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11885 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11887 /* Care for on-stack alignment if needed. */
11888 if (arg_boundary <= 64 || size == 0)
11889 t = ovf;
11890 else
11892 HOST_WIDE_INT align = arg_boundary / 8;
11893 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11894 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11895 build_int_cst (TREE_TYPE (t), -align));
11898 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11899 gimplify_assign (addr, t, pre_p);
11901 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11902 gimplify_assign (unshare_expr (ovf), t, pre_p);
11904 if (container)
11905 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11907 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11908 addr = fold_convert (ptrtype, addr);
11910 if (indirect_p)
11911 addr = build_va_arg_indirect_ref (addr);
11912 return build_va_arg_indirect_ref (addr);
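/* For a single integer argument the GIMPLE built above boils down to the
   following pseudo-C (a sketch; 48 is X86_64_REGPARM_MAX * 8 and AP is
   the va_list):

     if (ap->gp_offset >= 48)
       goto stack;
     addr = ap->reg_save_area + ap->gp_offset;
     ap->gp_offset += 8;
     goto done;
   stack:
     addr = ap->overflow_arg_area;     // aligned first if the type needs it
     ap->overflow_arg_area += 8;
   done:
     result = *(int *) addr;

   SSE-class and mixed-class arguments follow the same shape, testing
   fp_offset against 176 and copying through a temporary when the pieces
   are not contiguous in the save area.  */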
11915 /* Return true if OPNUM's MEM should be matched
11916 in movabs* patterns. */
11918 bool
11919 ix86_check_movabs (rtx insn, int opnum)
11921 rtx set, mem;
11923 set = PATTERN (insn);
11924 if (GET_CODE (set) == PARALLEL)
11925 set = XVECEXP (set, 0, 0);
11926 gcc_assert (GET_CODE (set) == SET);
11927 mem = XEXP (set, opnum);
11928 while (SUBREG_P (mem))
11929 mem = SUBREG_REG (mem);
11930 gcc_assert (MEM_P (mem));
11931 return volatile_ok || !MEM_VOLATILE_P (mem);
11934 /* Return false if INSN contains a MEM with a non-default address space. */
11935 bool
11936 ix86_check_no_addr_space (rtx insn)
11938 subrtx_var_iterator::array_type array;
11939 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11941 rtx x = *iter;
11942 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11943 return false;
11945 return true;
11948 /* Initialize the table of extra 80387 mathematical constants. */
11950 static void
11951 init_ext_80387_constants (void)
11953 static const char * cst[5] =
11955 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
11956 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
11957 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
11958 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
11959 "3.1415926535897932385128089594061862044", /* 4: fldpi */
11961 int i;
11963 for (i = 0; i < 5; i++)
11965 real_from_string (&ext_80387_constants_table[i], cst[i]);
11966 /* Ensure each constant is rounded to XFmode precision. */
11967 real_convert (&ext_80387_constants_table[i],
11968 XFmode, &ext_80387_constants_table[i]);
11971 ext_80387_constants_init = 1;
11974 /* Return non-zero if the constant is something that
11975 can be loaded with a special instruction. */
11978 standard_80387_constant_p (rtx x)
11980 machine_mode mode = GET_MODE (x);
11982 const REAL_VALUE_TYPE *r;
11984 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
11985 return -1;
11987 if (x == CONST0_RTX (mode))
11988 return 1;
11989 if (x == CONST1_RTX (mode))
11990 return 2;
11992 r = CONST_DOUBLE_REAL_VALUE (x);
11994 /* For XFmode constants, try to find a special 80387 instruction when
11995 optimizing for size or on those CPUs that benefit from them. */
11996 if (mode == XFmode
11997 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
11999 int i;
12001 if (! ext_80387_constants_init)
12002 init_ext_80387_constants ();
12004 for (i = 0; i < 5; i++)
12005 if (real_identical (r, &ext_80387_constants_table[i]))
12006 return i + 3;
12009 /* Load of the constant -0.0 or -1.0 will be split as
12010 fldz;fchs or fld1;fchs sequence. */
12011 if (real_isnegzero (r))
12012 return 8;
12013 if (real_identical (r, &dconstm1))
12014 return 9;
12016 return 0;
12019 /* Return the opcode of the special instruction to be used to load
12020 the constant X. */
12022 const char *
12023 standard_80387_constant_opcode (rtx x)
12025 switch (standard_80387_constant_p (x))
12027 case 1:
12028 return "fldz";
12029 case 2:
12030 return "fld1";
12031 case 3:
12032 return "fldlg2";
12033 case 4:
12034 return "fldln2";
12035 case 5:
12036 return "fldl2e";
12037 case 6:
12038 return "fldl2t";
12039 case 7:
12040 return "fldpi";
12041 case 8:
12042 case 9:
12043 return "#";
12044 default:
12045 gcc_unreachable ();
12049 /* Return the CONST_DOUBLE representing the 80387 constant that is
12050 loaded by the specified special instruction. The argument IDX
12051 matches the return value from standard_80387_constant_p. */
12054 standard_80387_constant_rtx (int idx)
12056 int i;
12058 if (! ext_80387_constants_init)
12059 init_ext_80387_constants ();
12061 switch (idx)
12063 case 3:
12064 case 4:
12065 case 5:
12066 case 6:
12067 case 7:
12068 i = idx - 3;
12069 break;
12071 default:
12072 gcc_unreachable ();
12075 return const_double_from_real_value (ext_80387_constants_table[i],
12076 XFmode);
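/* As a usage sketch, for x87 code these mappings mean that

     long double z = 0.0L;    // loaded with fldz
     long double o = 1.0L;    // loaded with fld1
     long double m = -1.0L;   // split into fld1; fchs

   and an XFmode constant that is bit-for-bit equal to one of the table
   entries above (log10(2), ln(2), log2(e), log2(10), pi) can be loaded with
   the corresponding fldlg2/fldln2/fldl2e/fldl2t/fldpi instruction, but only
   when optimizing for size or when TARGET_EXT_80387_CONSTANTS holds, as
   standard_80387_constant_p checks.  */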
12079 /* Return 1 if X is all bits 0 and 2 if X is all bits 1
12080 in a supported SSE/AVX vector mode. */
12083 standard_sse_constant_p (rtx x, machine_mode pred_mode)
12085 machine_mode mode;
12087 if (!TARGET_SSE)
12088 return 0;
12090 mode = GET_MODE (x);
12092 if (x == const0_rtx || const0_operand (x, mode))
12093 return 1;
12095 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12097 /* VOIDmode integer constant, get mode from the predicate. */
12098 if (mode == VOIDmode)
12099 mode = pred_mode;
12101 switch (GET_MODE_SIZE (mode))
12103 case 64:
12104 if (TARGET_AVX512F)
12105 return 2;
12106 break;
12107 case 32:
12108 if (TARGET_AVX2)
12109 return 2;
12110 break;
12111 case 16:
12112 if (TARGET_SSE2)
12113 return 2;
12114 break;
12115 case 0:
12116 /* VOIDmode */
12117 gcc_unreachable ();
12118 default:
12119 break;
12123 return 0;
12126 /* Return the opcode of the special instruction to be used to load
12127 the constant X. */
12129 const char *
12130 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
12132 machine_mode mode;
12134 gcc_assert (TARGET_SSE);
12136 mode = GET_MODE (x);
12138 if (x == const0_rtx || const0_operand (x, mode))
12140 switch (get_attr_mode (insn))
12142 case MODE_XI:
12143 return "vpxord\t%g0, %g0, %g0";
12144 case MODE_OI:
12145 return (TARGET_AVX512VL
12146 ? "vpxord\t%x0, %x0, %x0"
12147 : "vpxor\t%x0, %x0, %x0");
12148 case MODE_TI:
12149 return (TARGET_AVX512VL
12150 ? "vpxord\t%t0, %t0, %t0"
12151 : "%vpxor\t%0, %d0");
12153 case MODE_V8DF:
12154 return (TARGET_AVX512DQ
12155 ? "vxorpd\t%g0, %g0, %g0"
12156 : "vpxorq\t%g0, %g0, %g0");
12157 case MODE_V4DF:
12158 return "vxorpd\t%x0, %x0, %x0";
12159 case MODE_V2DF:
12160 return "%vxorpd\t%0, %d0";
12162 case MODE_V16SF:
12163 return (TARGET_AVX512DQ
12164 ? "vxorps\t%g0, %g0, %g0"
12165 : "vpxord\t%g0, %g0, %g0");
12166 case MODE_V8SF:
12167 return "vxorps\t%x0, %x0, %x0";
12168 case MODE_V4SF:
12169 return "%vxorps\t%0, %d0";
12171 default:
12172 gcc_unreachable ();
12175 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12177 enum attr_mode insn_mode = get_attr_mode (insn);
12179 switch (insn_mode)
12181 case MODE_XI:
12182 case MODE_V8DF:
12183 case MODE_V16SF:
12184 gcc_assert (TARGET_AVX512F);
12185 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
12187 case MODE_OI:
12188 case MODE_V4DF:
12189 case MODE_V8SF:
12190 gcc_assert (TARGET_AVX2);
12191 /* FALLTHRU */
12192 case MODE_TI:
12193 case MODE_V2DF:
12194 case MODE_V4SF:
12195 gcc_assert (TARGET_SSE2);
12196 return (TARGET_AVX
12197 ? "vpcmpeqd\t%0, %0, %0"
12198 : "pcmpeqd\t%0, %0");
12200 default:
12201 gcc_unreachable ();
12205 gcc_unreachable ();
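/* At the source level the two constants handled here typically come from
   idioms like the following (a sketch using the usual intrinsics; the
   register choice is illustrative):

     #include <emmintrin.h>
     __m128i zero = _mm_setzero_si128 ();   // pxor    %xmm0, %xmm0
     __m128i ones = _mm_set1_epi32 (-1);    // pcmpeqd %xmm0, %xmm0

   i.e. neither all-zeros nor all-ones needs a constant-pool load; the AVX
   and AVX-512 variants above are the widened forms of the same trick.  */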
12208 /* Returns true if INSN can be transformed from a memory load
12209 to a supported FP constant load. */
12211 bool
12212 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
12214 rtx src = find_constant_src (insn);
12216 gcc_assert (REG_P (dst));
12218 if (src == NULL
12219 || (SSE_REGNO_P (REGNO (dst))
12220 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
12221 || (STACK_REGNO_P (REGNO (dst))
12222 && standard_80387_constant_p (src) < 1))
12223 return false;
12225 return true;
12228 /* Returns true if OP contains a symbol reference */
12230 bool
12231 symbolic_reference_mentioned_p (rtx op)
12233 const char *fmt;
12234 int i;
12236 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
12237 return true;
12239 fmt = GET_RTX_FORMAT (GET_CODE (op));
12240 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
12242 if (fmt[i] == 'E')
12244 int j;
12246 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
12247 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
12248 return true;
12251 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
12252 return true;
12255 return false;
12258 /* Return true if it is appropriate to emit `ret' instructions in the
12259 body of a function. Do this only if the epilogue is simple, needing a
12260 couple of insns. Prior to reloading, we can't tell how many registers
12261 must be saved, so return false then. Return false if there is no frame
12262 marker to de-allocate. */
12264 bool
12265 ix86_can_use_return_insn_p (void)
12267 struct ix86_frame frame;
12269 if (ix86_function_naked (current_function_decl))
12270 return false;
12272 /* Don't use `ret' instruction in interrupt handler. */
12273 if (! reload_completed
12274 || frame_pointer_needed
12275 || cfun->machine->func_type != TYPE_NORMAL)
12276 return 0;
12278 /* Don't allow more than 32k pop, since that's all we can do
12279 with one instruction. */
12280 if (crtl->args.pops_args && crtl->args.size >= 32768)
12281 return 0;
12283 frame = cfun->machine->frame;
12284 return (frame.stack_pointer_offset == UNITS_PER_WORD
12285 && (frame.nregs + frame.nsseregs) == 0);
12288 /* Value should be nonzero if functions must have frame pointers.
12289 Zero means the frame pointer need not be set up (and parms may
12290 be accessed via the stack pointer) in functions that seem suitable. */
12292 static bool
12293 ix86_frame_pointer_required (void)
12295 /* If we accessed previous frames, then the generated code expects
12296 to be able to access the saved ebp value in our frame. */
12297 if (cfun->machine->accesses_prev_frame)
12298 return true;
12300 /* Several x86 OSes need a frame pointer for other reasons,
12301 usually pertaining to setjmp. */
12302 if (SUBTARGET_FRAME_POINTER_REQUIRED)
12303 return true;
12305 /* For older 32-bit runtimes setjmp requires a valid frame pointer. */
12306 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
12307 return true;
12309 /* With Win64 SEH, very large frames need a frame pointer since the maximum
12310 stack allocation is 4GB. */
12311 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
12312 return true;
12314 /* SSE saves require a frame pointer when the stack is misaligned. */
12315 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
12316 return true;
12318 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
12319 turns off the frame pointer by default. Turn it back on now if
12320 we've not got a leaf function. */
12321 if (TARGET_OMIT_LEAF_FRAME_POINTER
12322 && (!crtl->is_leaf
12323 || ix86_current_function_calls_tls_descriptor))
12324 return true;
12326 if (crtl->profile && !flag_fentry)
12327 return true;
12329 return false;
12332 /* Record that the current function accesses previous call frames. */
12334 void
12335 ix86_setup_frame_addresses (void)
12337 cfun->machine->accesses_prev_frame = 1;
12340 #ifndef USE_HIDDEN_LINKONCE
12341 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
12342 # define USE_HIDDEN_LINKONCE 1
12343 # else
12344 # define USE_HIDDEN_LINKONCE 0
12345 # endif
12346 #endif
12348 static int pic_labels_used;
12350 /* Fills in the label name that should be used for a pc thunk for
12351 the given register. */
12353 static void
12354 get_pc_thunk_name (char name[32], unsigned int regno)
12356 gcc_assert (!TARGET_64BIT);
12358 if (USE_HIDDEN_LINKONCE)
12359 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
12360 else
12361 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
12365 /* This function generates the pc thunks used for -fpic: code that loads
12366 a register with the return address of the caller and then returns. */
12368 static void
12369 ix86_code_end (void)
12371 rtx xops[2];
12372 int regno;
12374 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
12376 char name[32];
12377 tree decl;
12379 if (!(pic_labels_used & (1 << regno)))
12380 continue;
12382 get_pc_thunk_name (name, regno);
12384 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
12385 get_identifier (name),
12386 build_function_type_list (void_type_node, NULL_TREE));
12387 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
12388 NULL_TREE, void_type_node);
12389 TREE_PUBLIC (decl) = 1;
12390 TREE_STATIC (decl) = 1;
12391 DECL_IGNORED_P (decl) = 1;
12393 #if TARGET_MACHO
12394 if (TARGET_MACHO)
12396 switch_to_section (darwin_sections[picbase_thunk_section]);
12397 fputs ("\t.weak_definition\t", asm_out_file);
12398 assemble_name (asm_out_file, name);
12399 fputs ("\n\t.private_extern\t", asm_out_file);
12400 assemble_name (asm_out_file, name);
12401 putc ('\n', asm_out_file);
12402 ASM_OUTPUT_LABEL (asm_out_file, name);
12403 DECL_WEAK (decl) = 1;
12405 else
12406 #endif
12407 if (USE_HIDDEN_LINKONCE)
12409 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
12411 targetm.asm_out.unique_section (decl, 0);
12412 switch_to_section (get_named_section (decl, NULL, 0));
12414 targetm.asm_out.globalize_label (asm_out_file, name);
12415 fputs ("\t.hidden\t", asm_out_file);
12416 assemble_name (asm_out_file, name);
12417 putc ('\n', asm_out_file);
12418 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
12420 else
12422 switch_to_section (text_section);
12423 ASM_OUTPUT_LABEL (asm_out_file, name);
12426 DECL_INITIAL (decl) = make_node (BLOCK);
12427 current_function_decl = decl;
12428 allocate_struct_function (decl, false);
12429 init_function_start (decl);
12430 /* We're about to hide the function body from callees of final_* by
12431 emitting it directly; tell them we're a thunk, if they care. */
12432 cfun->is_thunk = true;
12433 first_function_block_is_cold = false;
12434 /* Make sure unwind info is emitted for the thunk if needed. */
12435 final_start_function (emit_barrier (), asm_out_file, 1);
12437 /* Pad stack IP move with 4 instructions (two NOPs count
12438 as one instruction). */
12439 if (TARGET_PAD_SHORT_FUNCTION)
12441 int i = 8;
12443 while (i--)
12444 fputs ("\tnop\n", asm_out_file);
12447 xops[0] = gen_rtx_REG (Pmode, regno);
12448 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
12449 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
12450 output_asm_insn ("%!ret", NULL);
12451 final_end_function ();
12452 init_insn_lengths ();
12453 free_after_compilation (cfun);
12454 set_cfun (NULL);
12455 current_function_decl = NULL;
12458 if (flag_split_stack)
12459 file_end_indicate_split_stack ();
12462 /* Emit code for the SET_GOT patterns. */
12464 const char *
12465 output_set_got (rtx dest, rtx label)
12467 rtx xops[3];
12469 xops[0] = dest;
12471 if (TARGET_VXWORKS_RTP && flag_pic)
12473 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
12474 xops[2] = gen_rtx_MEM (Pmode,
12475 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
12476 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
12478 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
12479 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
12480 an unadorned address. */
12481 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
12482 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
12483 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
12484 return "";
12487 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
12489 if (flag_pic)
12491 char name[32];
12492 get_pc_thunk_name (name, REGNO (dest));
12493 pic_labels_used |= 1 << REGNO (dest);
12495 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
12496 xops[2] = gen_rtx_MEM (QImode, xops[2]);
12497 output_asm_insn ("%!call\t%X2", xops);
12499 #if TARGET_MACHO
12500 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
12501 This is what will be referenced by the Mach-O PIC subsystem. */
12502 if (machopic_should_output_picbase_label () || !label)
12503 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
12505 /* When we are restoring the pic base at the site of a nonlocal label,
12506 and we decided to emit the pic base above, we will still output a
12507 local label used for calculating the correction offset (even though
12508 the offset will be 0 in that case). */
12509 if (label)
12510 targetm.asm_out.internal_label (asm_out_file, "L",
12511 CODE_LABEL_NUMBER (label));
12512 #endif
12514 else
12516 if (TARGET_MACHO)
12517 /* We don't need a pic base, we're not producing pic. */
12518 gcc_unreachable ();
12520 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
12521 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
12522 targetm.asm_out.internal_label (asm_out_file, "L",
12523 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
12526 if (!TARGET_MACHO)
12527 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
12529 return "";
12532 /* Generate an "push" pattern for input ARG. */
12534 static rtx
12535 gen_push (rtx arg)
12537 struct machine_function *m = cfun->machine;
12539 if (m->fs.cfa_reg == stack_pointer_rtx)
12540 m->fs.cfa_offset += UNITS_PER_WORD;
12541 m->fs.sp_offset += UNITS_PER_WORD;
12543 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12544 arg = gen_rtx_REG (word_mode, REGNO (arg));
12546 return gen_rtx_SET (gen_rtx_MEM (word_mode,
12547 gen_rtx_PRE_DEC (Pmode,
12548 stack_pointer_rtx)),
12549 arg);
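/* For illustration, on x86-64 the pattern built above is

     (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI <arg>))

   which is matched as a single push instruction.  */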
12552 /* Generate an "pop" pattern for input ARG. */
12554 static rtx
12555 gen_pop (rtx arg)
12557 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12558 arg = gen_rtx_REG (word_mode, REGNO (arg));
12560 return gen_rtx_SET (arg,
12561 gen_rtx_MEM (word_mode,
12562 gen_rtx_POST_INC (Pmode,
12563 stack_pointer_rtx)));
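/* The counterpart of gen_push: (set <arg> (mem (post_inc sp))), i.e. a
   single pop instruction.  */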
12566 /* Return >= 0 if there is an unused call-clobbered register available
12567 for the entire function. */
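/* The candidates scanned below are hard registers 2 down to 0, i.e. %ecx,
   %edx and %eax.  */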
12569 static unsigned int
12570 ix86_select_alt_pic_regnum (void)
12572 if (ix86_use_pseudo_pic_reg ())
12573 return INVALID_REGNUM;
12575 if (crtl->is_leaf
12576 && !crtl->profile
12577 && !ix86_current_function_calls_tls_descriptor)
12579 int i, drap;
12580 /* Can't use the same register for both PIC and DRAP. */
12581 if (crtl->drap_reg)
12582 drap = REGNO (crtl->drap_reg);
12583 else
12584 drap = -1;
12585 for (i = 2; i >= 0; --i)
12586 if (i != drap && !df_regs_ever_live_p (i))
12587 return i;
12590 return INVALID_REGNUM;
12593 /* Return true if REGNO is used by the epilogue. */
12595 bool
12596 ix86_epilogue_uses (int regno)
12598 /* If there are no caller-saved registers, we preserve all registers,
12599 except for MMX and x87 registers which aren't supported when saving
12600 and restoring registers. Don't explicitly save SP register since
12601 it is always preserved. */
12602 return (epilogue_completed
12603 && cfun->machine->no_caller_saved_registers
12604 && !fixed_regs[regno]
12605 && !STACK_REGNO_P (regno)
12606 && !MMX_REGNO_P (regno));
12609 /* Return nonzero if register REGNO can be used as a scratch register
12610 in peephole2. */
12612 static bool
12613 ix86_hard_regno_scratch_ok (unsigned int regno)
12615 /* If there are no caller-saved registers, we can't use any register
12616 as a scratch register after epilogue and use REGNO as scratch
12617 register only if it has been used before to avoid saving and
12618 restoring it. */
12619 return (!cfun->machine->no_caller_saved_registers
12620 || (!epilogue_completed
12621 && df_regs_ever_live_p (regno)));
12624 /* Return true if register class CL should be an additional allocno
12625 class. */
12627 static bool
12628 ix86_additional_allocno_class_p (reg_class_t cl)
12630 return cl == MOD4_SSE_REGS;
12633 /* Return TRUE if we need to save REGNO. */
12635 static bool
12636 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
12638 /* If there are no caller-saved registers, we preserve all registers,
12639 except for MMX and x87 registers which aren't supported when saving
12640 and restoring registers. Don't explicitly save SP register since
12641 it is always preserved. */
12642 if (cfun->machine->no_caller_saved_registers)
12644 /* Don't preserve registers used for function return value. */
12645 rtx reg = crtl->return_rtx;
12646 if (reg)
12648 unsigned int i = REGNO (reg);
12649 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12650 while (nregs-- > 0)
12651 if ((i + nregs) == regno)
12652 return false;
12654 reg = crtl->return_bnd;
12655 if (reg)
12657 i = REGNO (reg);
12658 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12659 while (nregs-- > 0)
12660 if ((i + nregs) == regno)
12661 return false;
12665 return (df_regs_ever_live_p (regno)
12666 && !fixed_regs[regno]
12667 && !STACK_REGNO_P (regno)
12668 && !MMX_REGNO_P (regno)
12669 && (regno != HARD_FRAME_POINTER_REGNUM
12670 || !frame_pointer_needed));
12673 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12674 && pic_offset_table_rtx)
12676 if (ix86_use_pseudo_pic_reg ())
12678 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12679 _mcount in prologue. */
12680 if (!TARGET_64BIT && flag_pic && crtl->profile)
12681 return true;
12683 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12684 || crtl->profile
12685 || crtl->calls_eh_return
12686 || crtl->uses_const_pool
12687 || cfun->has_nonlocal_label)
12688 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12691 if (crtl->calls_eh_return && maybe_eh_return)
12693 unsigned i;
12694 for (i = 0; ; i++)
12696 unsigned test = EH_RETURN_DATA_REGNO (i);
12697 if (test == INVALID_REGNUM)
12698 break;
12699 if (test == regno)
12700 return true;
12704 if (ignore_outlined && cfun->machine->call_ms2sysv)
12706 unsigned count = cfun->machine->call_ms2sysv_extra_regs
12707 + xlogue_layout::MIN_REGS;
12708 if (xlogue_layout::is_stub_managed_reg (regno, count))
12709 return false;
12712 if (crtl->drap_reg
12713 && regno == REGNO (crtl->drap_reg)
12714 && !cfun->machine->no_drap_save_restore)
12715 return true;
12717 return (df_regs_ever_live_p (regno)
12718 && !call_used_regs[regno]
12719 && !fixed_regs[regno]
12720 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12723 /* Return the number of saved general purpose registers. */
12725 static int
12726 ix86_nsaved_regs (void)
12728 int nregs = 0;
12729 int regno;
12731 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12732 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12733 nregs ++;
12734 return nregs;
12737 /* Return number of saved SSE registers. */
12739 static int
12740 ix86_nsaved_sseregs (void)
12742 int nregs = 0;
12743 int regno;
12745 if (!TARGET_64BIT_MS_ABI)
12746 return 0;
12747 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12748 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12749 nregs ++;
12750 return nregs;
12753 /* Given FROM and TO register numbers, say whether this elimination is
12754 allowed. If stack alignment is needed, we can only replace argument
12755 pointer with hard frame pointer, or replace frame pointer with stack
12756 pointer. Otherwise, frame pointer elimination is automatically
12757 handled and all other eliminations are valid. */
12759 static bool
12760 ix86_can_eliminate (const int from, const int to)
12762 if (stack_realign_fp)
12763 return ((from == ARG_POINTER_REGNUM
12764 && to == HARD_FRAME_POINTER_REGNUM)
12765 || (from == FRAME_POINTER_REGNUM
12766 && to == STACK_POINTER_REGNUM));
12767 else
12768 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12771 /* Return the offset between two registers, one to be eliminated, and the other
12772 its replacement, at the start of a routine. */
12774 HOST_WIDE_INT
12775 ix86_initial_elimination_offset (int from, int to)
12777 struct ix86_frame frame = cfun->machine->frame;
12779 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12780 return frame.hard_frame_pointer_offset;
12781 else if (from == FRAME_POINTER_REGNUM
12782 && to == HARD_FRAME_POINTER_REGNUM)
12783 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12784 else
12786 gcc_assert (to == STACK_POINTER_REGNUM);
12788 if (from == ARG_POINTER_REGNUM)
12789 return frame.stack_pointer_offset;
12791 gcc_assert (from == FRAME_POINTER_REGNUM);
12792 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12796 /* In a dynamically-aligned function, we can't know the offset from
12797 stack pointer to frame pointer, so we must ensure that setjmp
12798 eliminates fp against the hard fp (%ebp) rather than trying to
12799 index from %esp up to the top of the frame across a gap that is
12800 of unknown (at compile-time) size. */
12801 static rtx
12802 ix86_builtin_setjmp_frame_value (void)
12804 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12807 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
12808 static void warn_once_call_ms2sysv_xlogues (const char *feature)
12810 static bool warned_once = false;
12811 if (!warned_once)
12813 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
12814 feature);
12815 warned_once = true;
12819 /* When using -fsplit-stack, the allocation routines set a field in
12820 the TCB to the bottom of the stack plus this much space, measured
12821 in bytes. */
12823 #define SPLIT_STACK_AVAILABLE 256
12825 /* Fill the ix86_frame structure describing the frame of the function being compiled. */
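/* As a rough orientation only, the layout computed below runs from the CFA
   towards lower addresses:

	return address (and error code for interrupt/exception handlers)
	pushed static chain, if any
	saved frame pointer, if needed        (hfp_save_offset)
	general register save area            (reg_save_offset)
	re-alignment padding, if realigning   (stack_realign_offset)
	SSE register / outlined stub save area (sse_reg_save_offset)
	va-arg register save area
	local variables                       (frame_pointer_offset)
	outgoing arguments                    (stack_pointer_offset)

   The authoritative values are the ix86_frame fields set below.  */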
12827 static void
12828 ix86_compute_frame_layout (void)
12830 struct ix86_frame *frame = &cfun->machine->frame;
12831 struct machine_function *m = cfun->machine;
12832 unsigned HOST_WIDE_INT stack_alignment_needed;
12833 HOST_WIDE_INT offset;
12834 unsigned HOST_WIDE_INT preferred_alignment;
12835 HOST_WIDE_INT size = get_frame_size ();
12836 HOST_WIDE_INT to_allocate;
12838 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
12839 * ms_abi functions that call a sysv function. We now need to prune away
12840 * cases where it should be disabled. */
12841 if (TARGET_64BIT && m->call_ms2sysv)
12843 gcc_assert (TARGET_64BIT_MS_ABI);
12844 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
12845 gcc_assert (!TARGET_SEH);
12846 gcc_assert (TARGET_SSE);
12847 gcc_assert (!ix86_using_red_zone ());
12849 if (crtl->calls_eh_return)
12851 gcc_assert (!reload_completed);
12852 m->call_ms2sysv = false;
12853 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
12856 else if (ix86_static_chain_on_stack)
12858 gcc_assert (!reload_completed);
12859 m->call_ms2sysv = false;
12860 warn_once_call_ms2sysv_xlogues ("static call chains");
12863 /* Finally, compute which registers the stub will manage. */
12864 else
12866 unsigned count = xlogue_layout::count_stub_managed_regs ();
12867 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
12871 frame->nregs = ix86_nsaved_regs ();
12872 frame->nsseregs = ix86_nsaved_sseregs ();
12873 m->call_ms2sysv_pad_in = 0;
12874 m->call_ms2sysv_pad_out = 0;
12876 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
12877 except for function prologues, leaf functions and when the default
12878 incoming stack boundary is overridden at the command line or via the
12879 force_align_arg_pointer attribute. */
12880 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12881 && (!crtl->is_leaf || cfun->calls_alloca != 0
12882 || ix86_current_function_calls_tls_descriptor
12883 || ix86_incoming_stack_boundary < 128))
12885 crtl->preferred_stack_boundary = 128;
12886 crtl->stack_alignment_needed = 128;
12889 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12890 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12892 gcc_assert (!size || stack_alignment_needed);
12893 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12894 gcc_assert (preferred_alignment <= stack_alignment_needed);
12896 /* For SEH we have to limit the amount of code movement into the prologue.
12897 At present we do this via a BLOCKAGE, at which point there's very little
12898 scheduling that can be done, which means that there's very little point
12899 in doing anything except PUSHs. */
12900 if (TARGET_SEH)
12901 m->use_fast_prologue_epilogue = false;
12902 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
12904 int count = frame->nregs;
12905 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12907 /* The fast prologue uses moves instead of pushes to save registers. This
12908 is significantly longer, but it also executes faster, as modern hardware
12909 can execute the moves in parallel but cannot do so for push/pop.
12911 Be careful about choosing which prologue to emit: when the function takes
12912 many instructions to execute, we may use the slow version; the same holds
12913 when the function is known to be outside a hot spot (which is known only
12914 with profile feedback). Weight the size of the function by the number of
12915 registers to save, as it is cheap to use one or two push instructions but
12916 very slow to use many of them. */
12917 if (count)
12918 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12919 if (node->frequency < NODE_FREQUENCY_NORMAL
12920 || (flag_branch_probabilities
12921 && node->frequency < NODE_FREQUENCY_HOT))
12922 m->use_fast_prologue_epilogue = false;
12923 else
12924 m->use_fast_prologue_epilogue
12925 = !expensive_function_p (count);
12928 frame->save_regs_using_mov
12929 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
12930 /* If static stack checking is enabled and done with probes,
12931 the registers need to be saved before allocating the frame. */
12932 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
12934 /* Skip return address and error code in exception handler. */
12935 offset = INCOMING_FRAME_SP_OFFSET;
12937 /* Skip pushed static chain. */
12938 if (ix86_static_chain_on_stack)
12939 offset += UNITS_PER_WORD;
12941 /* Skip saved base pointer. */
12942 if (frame_pointer_needed)
12943 offset += UNITS_PER_WORD;
12944 frame->hfp_save_offset = offset;
12946 /* The traditional frame pointer location is at the top of the frame. */
12947 frame->hard_frame_pointer_offset = offset;
12949 /* Register save area */
12950 offset += frame->nregs * UNITS_PER_WORD;
12951 frame->reg_save_offset = offset;
12953 /* On SEH target, registers are pushed just before the frame pointer
12954 location. */
12955 if (TARGET_SEH)
12956 frame->hard_frame_pointer_offset = offset;
12958 /* When re-aligning the stack frame, but not saving SSE registers, this
12959 is the offset we want to adjust the stack pointer to. */
12960 frame->stack_realign_allocate_offset = offset;
12962 /* The re-aligned stack starts here. Values before this point are not
12963 directly comparable with values below this point. Use sp_valid_at
12964 to determine if the stack pointer is valid for a given offset and
12965 fp_valid_at for the frame pointer. */
12966 if (stack_realign_fp)
12967 offset = ROUND_UP (offset, stack_alignment_needed);
12968 frame->stack_realign_offset = offset;
12970 if (TARGET_64BIT && m->call_ms2sysv)
12972 gcc_assert (stack_alignment_needed >= 16);
12973 gcc_assert (!frame->nsseregs);
12975 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
12977 /* Select an appropriate layout for incoming stack offset. */
12978 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
12980 if ((offset + xlogue.get_stack_space_used ()) & UNITS_PER_WORD)
12981 m->call_ms2sysv_pad_out = 1;
12983 offset += xlogue.get_stack_space_used ();
12984 gcc_assert (!(offset & 0xf));
12985 frame->outlined_save_offset = offset;
12988 /* Align and set SSE register save area. */
12989 else if (frame->nsseregs)
12991 /* The only ABI that has saved SSE registers (Win64) also has a
12992 16-byte aligned default stack. However, many programs violate
12993 the ABI, and Wine64 forces stack realignment to compensate.
12995 If the incoming stack boundary is at least 16 bytes, or DRAP is
12996 required and the DRAP re-alignment boundary is at least 16 bytes,
12997 then we want the SSE register save area properly aligned. */
12998 if (ix86_incoming_stack_boundary >= 128
12999 || (stack_realign_drap && stack_alignment_needed >= 16))
13000 offset = ROUND_UP (offset, 16);
13001 offset += frame->nsseregs * 16;
13002 frame->stack_realign_allocate_offset = offset;
13005 frame->sse_reg_save_offset = offset;
13007 /* Va-arg area */
13008 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
13009 offset += frame->va_arg_size;
13011 /* Align start of frame for local function. */
13012 if (stack_realign_fp
13013 || offset != frame->sse_reg_save_offset
13014 || size != 0
13015 || !crtl->is_leaf
13016 || cfun->calls_alloca
13017 || ix86_current_function_calls_tls_descriptor)
13018 offset = ROUND_UP (offset, stack_alignment_needed);
13020 /* Frame pointer points here. */
13021 frame->frame_pointer_offset = offset;
13023 offset += size;
13025 /* Add the outgoing arguments area. It can be skipped if we eliminated
13026 all the function calls as dead code.
13027 Skipping is, however, impossible when the function calls alloca: the
13028 alloca expander assumes that the last crtl->outgoing_args_size bytes
13029 of the stack frame are unused. */
13030 if (ACCUMULATE_OUTGOING_ARGS
13031 && (!crtl->is_leaf || cfun->calls_alloca
13032 || ix86_current_function_calls_tls_descriptor))
13034 offset += crtl->outgoing_args_size;
13035 frame->outgoing_arguments_size = crtl->outgoing_args_size;
13037 else
13038 frame->outgoing_arguments_size = 0;
13040 /* Align stack boundary. Only needed if we're calling another function
13041 or using alloca. */
13042 if (!crtl->is_leaf || cfun->calls_alloca
13043 || ix86_current_function_calls_tls_descriptor)
13044 offset = ROUND_UP (offset, preferred_alignment);
13046 /* We've reached end of stack frame. */
13047 frame->stack_pointer_offset = offset;
13049 /* Size prologue needs to allocate. */
13050 to_allocate = offset - frame->sse_reg_save_offset;
13052 if ((!to_allocate && frame->nregs <= 1)
13053 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
13054 frame->save_regs_using_mov = false;
13056 if (ix86_using_red_zone ()
13057 && crtl->sp_is_unchanging
13058 && crtl->is_leaf
13059 && !ix86_pc_thunk_call_expanded
13060 && !ix86_current_function_calls_tls_descriptor)
13062 frame->red_zone_size = to_allocate;
13063 if (frame->save_regs_using_mov)
13064 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
13065 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
13066 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
13068 else
13069 frame->red_zone_size = 0;
13070 frame->stack_pointer_offset -= frame->red_zone_size;
13072 /* The SEH frame pointer location is near the bottom of the frame.
13073 This is enforced by the fact that the difference between the
13074 stack pointer and the frame pointer is limited to 240 bytes in
13075 the unwind data structure. */
13076 if (TARGET_SEH)
13078 HOST_WIDE_INT diff;
13080 /* If we can leave the frame pointer where it is, do so; this also returns
13081 the establisher frame for __builtin_frame_address (0). */
13082 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
13083 if (diff <= SEH_MAX_FRAME_SIZE
13084 && (diff > 240 || (diff & 15) != 0)
13085 && !crtl->accesses_prior_frames)
13087 /* Ideally we'd determine what portion of the local stack frame
13088 (within the constraint of the lowest 240) is most heavily used.
13089 But without that complication, simply bias the frame pointer
13090 by 128 bytes so as to maximize the amount of the local stack
13091 frame that is addressable with 8-bit offsets. */
13092 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
13097 /* This is semi-inlined memory_address_length, but simplified
13098 since we know that we're always dealing with reg+offset, and
13099 to avoid having to create and discard all that rtl. */
13101 static inline int
13102 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
13104 int len = 4;
13106 if (offset == 0)
13108 /* EBP and R13 cannot be encoded without an offset. */
13109 len = (regno == BP_REG || regno == R13_REG);
13111 else if (IN_RANGE (offset, -128, 127))
13112 len = 1;
13114 /* ESP and R12 must be encoded with a SIB byte. */
13115 if (regno == SP_REG || regno == R12_REG)
13116 len++;
13118 return len;
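/* Worked examples: 8(%ebx) needs a one-byte displacement, so len is 1;
   0(%ebp) still needs an explicit disp8 of zero, so it is also 1; (%esp)
   needs no displacement but does need a SIB byte, again 1; 512(%esp)
   needs a four-byte displacement plus the SIB byte, so len is 5.  */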
13121 /* Determine if the stack pointer is valid for accessing the cfa_offset.
13122 The register is saved at CFA - CFA_OFFSET. */
13124 static inline bool
13125 sp_valid_at (HOST_WIDE_INT cfa_offset)
13127 const struct machine_frame_state &fs = cfun->machine->fs;
13128 return fs.sp_valid && !(fs.sp_realigned
13129 && cfa_offset <= fs.sp_realigned_offset);
13132 /* Determine if the frame pointer is valid for accessing the cfa_offset.
13133 The register is saved at CFA - CFA_OFFSET. */
13135 static inline bool
13136 fp_valid_at (HOST_WIDE_INT cfa_offset)
13138 const struct machine_frame_state &fs = cfun->machine->fs;
13139 return fs.fp_valid && !(fs.sp_valid && fs.sp_realigned
13140 && cfa_offset > fs.sp_realigned_offset);
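/* In other words, once the stack pointer has been re-aligned the frame is
   split at sp_realigned_offset: slots whose CFA offset is larger than that
   (the re-aligned area) are addressed via SP, while slots at or below that
   offset must be reached through the frame pointer.  */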
13143 /* Choose a base register based upon alignment requested, speed and/or
13144 size. */
13146 static void
13147 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
13148 HOST_WIDE_INT &base_offset,
13149 unsigned int align_requested, unsigned int *align)
13151 const struct machine_function *m = cfun->machine;
13152 unsigned int hfp_align;
13153 unsigned int drap_align;
13154 unsigned int sp_align;
13155 bool hfp_ok = fp_valid_at (cfa_offset);
13156 bool drap_ok = m->fs.drap_valid;
13157 bool sp_ok = sp_valid_at (cfa_offset);
13159 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
13161 /* Filter out any registers that don't meet the requested alignment
13162 criteria. */
13163 if (align_requested)
13165 if (m->fs.realigned)
13166 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
13167 /* The SEH unwind code does not currently support REG_CFA_EXPRESSION
13168 notes (which we would need in order to use a realigned stack pointer),
13169 so this is disabled on SEH targets. */
13170 else if (m->fs.sp_realigned)
13171 sp_align = crtl->stack_alignment_needed;
13173 hfp_ok = hfp_ok && hfp_align >= align_requested;
13174 drap_ok = drap_ok && drap_align >= align_requested;
13175 sp_ok = sp_ok && sp_align >= align_requested;
13178 if (m->use_fast_prologue_epilogue)
13180 /* Choose the base register most likely to allow the most scheduling
13181 opportunities. Generally FP is valid throughout the function,
13182 while DRAP must be reloaded within the epilogue. But choose either
13183 over the SP due to increased encoding size. */
13185 if (hfp_ok)
13187 base_reg = hard_frame_pointer_rtx;
13188 base_offset = m->fs.fp_offset - cfa_offset;
13190 else if (drap_ok)
13192 base_reg = crtl->drap_reg;
13193 base_offset = 0 - cfa_offset;
13195 else if (sp_ok)
13197 base_reg = stack_pointer_rtx;
13198 base_offset = m->fs.sp_offset - cfa_offset;
13201 else
13203 HOST_WIDE_INT toffset;
13204 int len = 16, tlen;
13206 /* Choose the base register with the smallest address encoding.
13207 With a tie, choose FP > DRAP > SP. */
13208 if (sp_ok)
13210 base_reg = stack_pointer_rtx;
13211 base_offset = m->fs.sp_offset - cfa_offset;
13212 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
13214 if (drap_ok)
13216 toffset = 0 - cfa_offset;
13217 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
13218 if (tlen <= len)
13220 base_reg = crtl->drap_reg;
13221 base_offset = toffset;
13222 len = tlen;
13225 if (hfp_ok)
13227 toffset = m->fs.fp_offset - cfa_offset;
13228 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
13229 if (tlen <= len)
13231 base_reg = hard_frame_pointer_rtx;
13232 base_offset = toffset;
13233 len = tlen;
13238 /* Set the align return value. */
13239 if (align)
13241 if (base_reg == stack_pointer_rtx)
13242 *align = sp_align;
13243 else if (base_reg == crtl->drap_reg)
13244 *align = drap_align;
13245 else if (base_reg == hard_frame_pointer_rtx)
13246 *align = hfp_align;
13250 /* Return an RTX that points to CFA_OFFSET within the stack frame and
13251 the alignment of the address. If ALIGN is non-null, it should point to
13252 an alignment value (in bits) that is preferred or zero and will
13253 receive the alignment of the base register that was selected. The
13254 valid base registers are taken from CFUN->MACHINE->FS. */
13256 static rtx
13257 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align)
13259 rtx base_reg = NULL;
13260 HOST_WIDE_INT base_offset = 0;
13262 /* If a specific alignment is requested, try to get a base register
13263 with that alignment first. */
13264 if (align && *align)
13265 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
13267 if (!base_reg)
13268 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
13270 gcc_assert (base_reg != NULL);
13271 return plus_constant (Pmode, base_reg, base_offset);
13274 /* Emit code to save registers in the prologue. */
13276 static void
13277 ix86_emit_save_regs (void)
13279 unsigned int regno;
13280 rtx_insn *insn;
13282 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
13283 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13285 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
13286 RTX_FRAME_RELATED_P (insn) = 1;
13290 /* Emit a single register save at CFA - CFA_OFFSET. */
13292 static void
13293 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
13294 HOST_WIDE_INT cfa_offset)
13296 struct machine_function *m = cfun->machine;
13297 rtx reg = gen_rtx_REG (mode, regno);
13298 rtx mem, addr, base, insn;
13299 unsigned int align = GET_MODE_ALIGNMENT (mode);
13301 addr = choose_baseaddr (cfa_offset, &align);
13302 mem = gen_frame_mem (mode, addr);
13304 /* The location alignment depends upon the base register. */
13305 align = MIN (GET_MODE_ALIGNMENT (mode), align);
13306 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13307 set_mem_align (mem, align);
13309 insn = emit_insn (gen_rtx_SET (mem, reg));
13310 RTX_FRAME_RELATED_P (insn) = 1;
13312 base = addr;
13313 if (GET_CODE (base) == PLUS)
13314 base = XEXP (base, 0);
13315 gcc_checking_assert (REG_P (base));
13317 /* When saving registers into a re-aligned local stack frame, avoid
13318 any tricky guessing by dwarf2out. */
13319 if (m->fs.realigned)
13321 gcc_checking_assert (stack_realign_drap);
13323 if (regno == REGNO (crtl->drap_reg))
13325 /* A bit of a hack. We force the DRAP register to be saved in
13326 the re-aligned stack frame, which provides us with a copy
13327 of the CFA that will last past the prologue. Install it. */
13328 gcc_checking_assert (cfun->machine->fs.fp_valid);
13329 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13330 cfun->machine->fs.fp_offset - cfa_offset);
13331 mem = gen_rtx_MEM (mode, addr);
13332 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
13334 else
13336 /* The frame pointer is a stable reference within the
13337 aligned frame. Use it. */
13338 gcc_checking_assert (cfun->machine->fs.fp_valid);
13339 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13340 cfun->machine->fs.fp_offset - cfa_offset);
13341 mem = gen_rtx_MEM (mode, addr);
13342 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13346 else if (base == stack_pointer_rtx && m->fs.sp_realigned
13347 && cfa_offset >= m->fs.sp_realigned_offset)
13349 gcc_checking_assert (stack_realign_fp);
13350 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13353 /* The memory may not be relative to the current CFA register,
13354 which means that we may need to generate a new pattern for
13355 use by the unwind info. */
13356 else if (base != m->fs.cfa_reg)
13358 addr = plus_constant (Pmode, m->fs.cfa_reg,
13359 m->fs.cfa_offset - cfa_offset);
13360 mem = gen_rtx_MEM (mode, addr);
13361 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
13365 /* Emit code to save registers using MOV insns.
13366 First register is stored at CFA - CFA_OFFSET. */
13367 static void
13368 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
13370 unsigned int regno;
13372 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13373 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13375 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
13376 cfa_offset -= UNITS_PER_WORD;
13380 /* Emit code to save SSE registers using MOV insns.
13381 First register is stored at CFA - CFA_OFFSET. */
13382 static void
13383 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
13385 unsigned int regno;
13387 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13388 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13390 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
13391 cfa_offset -= GET_MODE_SIZE (V4SFmode);
13395 static GTY(()) rtx queued_cfa_restores;
13397 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
13398 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
13399 Don't add the note if the previously saved value will be left untouched
13400 within the stack red-zone until return, as unwinders can find the same value
13401 in the register and on the stack. */
13403 static void
13404 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
13406 if (!crtl->shrink_wrapped
13407 && cfa_offset <= cfun->machine->fs.red_zone_offset)
13408 return;
13410 if (insn)
13412 add_reg_note (insn, REG_CFA_RESTORE, reg);
13413 RTX_FRAME_RELATED_P (insn) = 1;
13415 else
13416 queued_cfa_restores
13417 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
13420 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
13422 static void
13423 ix86_add_queued_cfa_restore_notes (rtx insn)
13425 rtx last;
13426 if (!queued_cfa_restores)
13427 return;
13428 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
13430 XEXP (last, 1) = REG_NOTES (insn);
13431 REG_NOTES (insn) = queued_cfa_restores;
13432 queued_cfa_restores = NULL_RTX;
13433 RTX_FRAME_RELATED_P (insn) = 1;
13436 /* Expand prologue or epilogue stack adjustment.
13437 The pattern exists to put a dependency on all ebp-based memory accesses.
13438 STYLE should be negative if instructions should be marked as frame-related,
13439 zero if the %r11 register is live and cannot be freely used, and positive
13440 otherwise. */
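/* For example, a call of the shape

     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				GEN_INT (-16), -1,
				m->fs.cfa_reg == stack_pointer_rtx);

   allocates 16 bytes of stack, marks the insn as frame-related (STYLE < 0)
   and keeps m->fs.sp_offset (and, if requested, the CFA tracking) in sync;
   the ms2sysv save code later in this file uses exactly this shape.  */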
13442 static void
13443 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
13444 int style, bool set_cfa)
13446 struct machine_function *m = cfun->machine;
13447 rtx insn;
13448 bool add_frame_related_expr = false;
13450 if (Pmode == SImode)
13451 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
13452 else if (x86_64_immediate_operand (offset, DImode))
13453 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
13454 else
13456 rtx tmp;
13457 /* r11 is used by indirect sibcall return as well, set before the
13458 epilogue and used after the epilogue. */
13459 if (style)
13460 tmp = gen_rtx_REG (DImode, R11_REG);
13461 else
13463 gcc_assert (src != hard_frame_pointer_rtx
13464 && dest != hard_frame_pointer_rtx);
13465 tmp = hard_frame_pointer_rtx;
13467 insn = emit_insn (gen_rtx_SET (tmp, offset));
13468 if (style < 0)
13469 add_frame_related_expr = true;
13471 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
13474 insn = emit_insn (insn);
13475 if (style >= 0)
13476 ix86_add_queued_cfa_restore_notes (insn);
13478 if (set_cfa)
13480 rtx r;
13482 gcc_assert (m->fs.cfa_reg == src);
13483 m->fs.cfa_offset += INTVAL (offset);
13484 m->fs.cfa_reg = dest;
13486 r = gen_rtx_PLUS (Pmode, src, offset);
13487 r = gen_rtx_SET (dest, r);
13488 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
13489 RTX_FRAME_RELATED_P (insn) = 1;
13491 else if (style < 0)
13493 RTX_FRAME_RELATED_P (insn) = 1;
13494 if (add_frame_related_expr)
13496 rtx r = gen_rtx_PLUS (Pmode, src, offset);
13497 r = gen_rtx_SET (dest, r);
13498 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
13502 if (dest == stack_pointer_rtx)
13504 HOST_WIDE_INT ooffset = m->fs.sp_offset;
13505 bool valid = m->fs.sp_valid;
13506 bool realigned = m->fs.sp_realigned;
13508 if (src == hard_frame_pointer_rtx)
13510 valid = m->fs.fp_valid;
13511 realigned = false;
13512 ooffset = m->fs.fp_offset;
13514 else if (src == crtl->drap_reg)
13516 valid = m->fs.drap_valid;
13517 realigned = false;
13518 ooffset = 0;
13520 else
13522 /* Otherwise there are two possibilities: SP itself, which we set
13523 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
13524 taken care of by hand along the eh_return path. */
13525 gcc_checking_assert (src == stack_pointer_rtx
13526 || offset == const0_rtx);
13529 m->fs.sp_offset = ooffset - INTVAL (offset);
13530 m->fs.sp_valid = valid;
13531 m->fs.sp_realigned = realigned;
13535 /* Find an available register to be used as the dynamic realign argument
13536 pointer register. Such a register will be written in the prologue and
13537 used at the beginning of the body, so it must not be
13538 1. a parameter passing register.
13539 2. the GOT pointer.
13540 We reuse the static-chain register if it is available. Otherwise, we
13541 use DI for i386 and R13 for x86-64. We chose R13 since it has a
13542 shorter encoding.
13544 Return: the regno of the chosen register. */
13546 static unsigned int
13547 find_drap_reg (void)
13549 tree decl = cfun->decl;
13551 /* Always use callee-saved register if there are no caller-saved
13552 registers. */
13553 if (TARGET_64BIT)
13555 /* Use R13 for nested functions or functions that need a static chain.
13556 Since a function with a tail call may use any caller-saved
13557 register in the epilogue, the DRAP must not use a caller-saved
13558 register in that case. */
13559 if (DECL_STATIC_CHAIN (decl)
13560 || cfun->machine->no_caller_saved_registers
13561 || crtl->tail_call_emit)
13562 return R13_REG;
13564 return R10_REG;
13566 else
13568 /* Use DI for nested functions or functions that need a static chain.
13569 Since a function with a tail call may use any caller-saved
13570 register in the epilogue, the DRAP must not use a caller-saved
13571 register in that case. */
13572 if (DECL_STATIC_CHAIN (decl)
13573 || cfun->machine->no_caller_saved_registers
13574 || crtl->tail_call_emit)
13575 return DI_REG;
13577 /* Reuse static chain register if it isn't used for parameter
13578 passing. */
13579 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
13581 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
13582 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
13583 return CX_REG;
13585 return DI_REG;
13589 /* Handle a "force_align_arg_pointer" attribute. */
13591 static tree
13592 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
13593 tree, int, bool *no_add_attrs)
13595 if (TREE_CODE (*node) != FUNCTION_TYPE
13596 && TREE_CODE (*node) != METHOD_TYPE
13597 && TREE_CODE (*node) != FIELD_DECL
13598 && TREE_CODE (*node) != TYPE_DECL)
13600 warning (OPT_Wattributes, "%qE attribute only applies to functions",
13601 name);
13602 *no_add_attrs = true;
13605 return NULL_TREE;
13608 /* Return minimum incoming stack alignment. */
13610 static unsigned int
13611 ix86_minimum_incoming_stack_boundary (bool sibcall)
13613 unsigned int incoming_stack_boundary;
13615 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
13616 if (cfun->machine->func_type != TYPE_NORMAL)
13617 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
13618 /* Prefer the one specified at command line. */
13619 else if (ix86_user_incoming_stack_boundary)
13620 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
13621 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
13622 if -mstackrealign is used, this is not a sibcall check, and the
13623 estimated stack alignment is 128 bits. */
13624 else if (!sibcall
13625 && ix86_force_align_arg_pointer
13626 && crtl->stack_alignment_estimated == 128)
13627 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13628 else
13629 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
13631 /* Incoming stack alignment can be changed on individual functions
13632 via force_align_arg_pointer attribute. We use the smallest
13633 incoming stack boundary. */
13634 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
13635 && lookup_attribute (ix86_force_align_arg_pointer_string,
13636 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
13637 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13639 /* The incoming stack frame has to be aligned at least at
13640 parm_stack_boundary. */
13641 if (incoming_stack_boundary < crtl->parm_stack_boundary)
13642 incoming_stack_boundary = crtl->parm_stack_boundary;
13644 /* Stack at entrance of main is aligned by runtime. We use the
13645 smallest incoming stack boundary. */
13646 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
13647 && DECL_NAME (current_function_decl)
13648 && MAIN_NAME_P (DECL_NAME (current_function_decl))
13649 && DECL_FILE_SCOPE_P (current_function_decl))
13650 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
13652 return incoming_stack_boundary;
13655 /* Update incoming stack boundary and estimated stack alignment. */
13657 static void
13658 ix86_update_stack_boundary (void)
13660 ix86_incoming_stack_boundary
13661 = ix86_minimum_incoming_stack_boundary (false);
13663 /* x86_64 vararg needs 16byte stack alignment for register save
13664 area. */
13665 if (TARGET_64BIT
13666 && cfun->stdarg
13667 && crtl->stack_alignment_estimated < 128)
13668 crtl->stack_alignment_estimated = 128;
13670 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
13671 if (ix86_tls_descriptor_calls_expanded_in_cfun
13672 && crtl->preferred_stack_boundary < 128)
13673 crtl->preferred_stack_boundary = 128;
13676 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
13677 needed or an rtx for DRAP otherwise. */
13679 static rtx
13680 ix86_get_drap_rtx (void)
13682 /* We must use DRAP if there are outgoing arguments on stack and
13683 ACCUMULATE_OUTGOING_ARGS is false. */
13684 if (ix86_force_drap
13685 || (cfun->machine->outgoing_args_on_stack
13686 && !ACCUMULATE_OUTGOING_ARGS))
13687 crtl->need_drap = true;
13689 if (stack_realign_drap)
13691 /* Assign DRAP to vDRAP and return vDRAP. */
13692 unsigned int regno = find_drap_reg ();
13693 rtx drap_vreg;
13694 rtx arg_ptr;
13695 rtx_insn *seq, *insn;
13697 arg_ptr = gen_rtx_REG (Pmode, regno);
13698 crtl->drap_reg = arg_ptr;
13700 start_sequence ();
13701 drap_vreg = copy_to_reg (arg_ptr);
13702 seq = get_insns ();
13703 end_sequence ();
13705 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
13706 if (!optimize)
13708 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
13709 RTX_FRAME_RELATED_P (insn) = 1;
13711 return drap_vreg;
13713 else
13714 return NULL;
13717 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
13719 static rtx
13720 ix86_internal_arg_pointer (void)
13722 return virtual_incoming_args_rtx;
13725 struct scratch_reg {
13726 rtx reg;
13727 bool saved;
13730 /* Return a short-lived scratch register for use on function entry.
13731 In 32-bit mode, it is valid only after the registers are saved
13732 in the prologue. This register must be released by means of
13733 release_scratch_register_on_entry once it is dead. */
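/* In short: 64-bit code always uses %r11; 32-bit code tries %eax, %edx and
   %ecx (skipping any that carry arguments, the static chain or the DRAP),
   then a callee-saved register that is saved by the prologue anyway, and as
   a last resort pushes a register here and pops it again on release.  */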
13735 static void
13736 get_scratch_register_on_entry (struct scratch_reg *sr)
13738 int regno;
13740 sr->saved = false;
13742 if (TARGET_64BIT)
13744 /* We always use R11 in 64-bit mode. */
13745 regno = R11_REG;
13747 else
13749 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13750 bool fastcall_p
13751 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13752 bool thiscall_p
13753 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13754 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13755 int regparm = ix86_function_regparm (fntype, decl);
13756 int drap_regno
13757 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13759 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13760 for the static chain register. */
13761 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13762 && drap_regno != AX_REG)
13763 regno = AX_REG;
13764 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13765 for the static chain register. */
13766 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13767 regno = AX_REG;
13768 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13769 regno = DX_REG;
13770 /* ecx is the static chain register. */
13771 else if (regparm < 3 && !fastcall_p && !thiscall_p
13772 && !static_chain_p
13773 && drap_regno != CX_REG)
13774 regno = CX_REG;
13775 else if (ix86_save_reg (BX_REG, true, false))
13776 regno = BX_REG;
13777 /* esi is the static chain register. */
13778 else if (!(regparm == 3 && static_chain_p)
13779 && ix86_save_reg (SI_REG, true, false))
13780 regno = SI_REG;
13781 else if (ix86_save_reg (DI_REG, true, false))
13782 regno = DI_REG;
13783 else
13785 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13786 sr->saved = true;
13790 sr->reg = gen_rtx_REG (Pmode, regno);
13791 if (sr->saved)
13793 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13794 RTX_FRAME_RELATED_P (insn) = 1;
13798 /* Release a scratch register obtained from the preceding function. */
13800 static void
13801 release_scratch_register_on_entry (struct scratch_reg *sr)
13803 if (sr->saved)
13805 struct machine_function *m = cfun->machine;
13806 rtx x, insn = emit_insn (gen_pop (sr->reg));
13808 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13809 RTX_FRAME_RELATED_P (insn) = 1;
13810 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13811 x = gen_rtx_SET (stack_pointer_rtx, x);
13812 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13813 m->fs.sp_offset -= UNITS_PER_WORD;
13817 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
13819 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13821 static void
13822 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13824 /* We skip the probe for the first interval + a small dope of 4 words and
13825 probe that many bytes past the specified size to maintain a protection
13826 area at the bottom of the stack. */
13827 const int dope = 4 * UNITS_PER_WORD;
13828 rtx size_rtx = GEN_INT (size), last;
13830 /* See if we have a constant small number of probes to generate. If so,
13831 that's the easy case. The run-time loop is made up of 9 insns in the
13832 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13833 for n # of intervals. */
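/* Worked example for the first branch, assuming a 4 KiB probe interval:
   for SIZE = 10 KiB the code adjusts SP by 8 KiB + dope and probes, adjusts
   by another 4 KiB and probes, adjusts by the remaining 2 KiB and probes,
   and finally adds back 4 KiB + dope, for a net adjustment of exactly
   10 KiB.  */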
13834 if (size <= 4 * PROBE_INTERVAL)
13836 HOST_WIDE_INT i, adjust;
13837 bool first_probe = true;
13839 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13840 values of N from 1 until it exceeds SIZE. If only one probe is
13841 needed, this will not generate any code. Then adjust and probe
13842 to PROBE_INTERVAL + SIZE. */
13843 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13845 if (first_probe)
13847 adjust = 2 * PROBE_INTERVAL + dope;
13848 first_probe = false;
13850 else
13851 adjust = PROBE_INTERVAL;
13853 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13854 plus_constant (Pmode, stack_pointer_rtx,
13855 -adjust)));
13856 emit_stack_probe (stack_pointer_rtx);
13859 if (first_probe)
13860 adjust = size + PROBE_INTERVAL + dope;
13861 else
13862 adjust = size + PROBE_INTERVAL - i;
13864 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13865 plus_constant (Pmode, stack_pointer_rtx,
13866 -adjust)));
13867 emit_stack_probe (stack_pointer_rtx);
13869 /* Adjust back to account for the additional first interval. */
13870 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13871 plus_constant (Pmode, stack_pointer_rtx,
13872 PROBE_INTERVAL + dope)));
13875 /* Otherwise, do the same as above, but in a loop. Note that we must be
13876 extra careful with variables wrapping around because we might be at
13877 the very top (or the very bottom) of the address space and we have
13878 to be able to handle this case properly; in particular, we use an
13879 equality test for the loop condition. */
13880 else
13882 HOST_WIDE_INT rounded_size;
13883 struct scratch_reg sr;
13885 get_scratch_register_on_entry (&sr);
13888 /* Step 1: round SIZE to the previous multiple of the interval. */
13890 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13893 /* Step 2: compute initial and final value of the loop counter. */
13895 /* SP = SP_0 + PROBE_INTERVAL. */
13896 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13897 plus_constant (Pmode, stack_pointer_rtx,
13898 - (PROBE_INTERVAL + dope))));
13900 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13901 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13902 emit_insn (gen_rtx_SET (sr.reg,
13903 plus_constant (Pmode, stack_pointer_rtx,
13904 -rounded_size)));
13905 else
13907 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13908 emit_insn (gen_rtx_SET (sr.reg,
13909 gen_rtx_PLUS (Pmode, sr.reg,
13910 stack_pointer_rtx)));
13914 /* Step 3: the loop
13918 SP = SP + PROBE_INTERVAL
13919 probe at SP
13921 while (SP != LAST_ADDR)
13923 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13924 values of N from 1 until it is equal to ROUNDED_SIZE. */
13926 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13929 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13930 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13932 if (size != rounded_size)
13934 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13935 plus_constant (Pmode, stack_pointer_rtx,
13936 rounded_size - size)));
13937 emit_stack_probe (stack_pointer_rtx);
13940 /* Adjust back to account for the additional first interval. */
13941 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13942 plus_constant (Pmode, stack_pointer_rtx,
13943 PROBE_INTERVAL + dope)));
13945 release_scratch_register_on_entry (&sr);
13948 /* Even if the stack pointer isn't the CFA register, we need to correctly
13949 describe the adjustments made to it, in particular differentiate the
13950 frame-related ones from the frame-unrelated ones. */
13951 if (size > 0)
13953 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13954 XVECEXP (expr, 0, 0)
13955 = gen_rtx_SET (stack_pointer_rtx,
13956 plus_constant (Pmode, stack_pointer_rtx, -size));
13957 XVECEXP (expr, 0, 1)
13958 = gen_rtx_SET (stack_pointer_rtx,
13959 plus_constant (Pmode, stack_pointer_rtx,
13960 PROBE_INTERVAL + dope + size));
13961 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13962 RTX_FRAME_RELATED_P (last) = 1;
13964 cfun->machine->fs.sp_offset += size;
13967 /* Make sure nothing is scheduled before we are done. */
13968 emit_insn (gen_blockage ());
13971 /* Adjust the stack pointer up to REG while probing it. */
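/* The emitted loop looks roughly like this (AT&T syntax, 64-bit, assuming a
   4 KiB probe interval and %r11 as the scratch register holding LAST_ADDR):

	.LPSRL0: subq	$4096, %rsp
		 orq	$0, (%rsp)
		 cmpq	%r11, %rsp
		 jne	.LPSRL0
   */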
13973 const char *
13974 output_adjust_stack_and_probe (rtx reg)
13976 static int labelno = 0;
13977 char loop_lab[32];
13978 rtx xops[2];
13980 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13982 /* Loop. */
13983 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13985 /* SP = SP + PROBE_INTERVAL. */
13986 xops[0] = stack_pointer_rtx;
13987 xops[1] = GEN_INT (PROBE_INTERVAL);
13988 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13990 /* Probe at SP. */
13991 xops[1] = const0_rtx;
13992 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13994 /* Test if SP == LAST_ADDR. */
13995 xops[0] = stack_pointer_rtx;
13996 xops[1] = reg;
13997 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13999 /* Branch. */
14000 fputs ("\tjne\t", asm_out_file);
14001 assemble_name_raw (asm_out_file, loop_lab);
14002 fputc ('\n', asm_out_file);
14004 return "";
14007 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
14008 inclusive. These are offsets from the current stack pointer. */
14010 static void
14011 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
14013 /* See if we have a constant small number of probes to generate. If so,
14014 that's the easy case. The run-time loop is made up of 6 insns in the
14015 generic case while the compile-time loop is made up of n insns for n #
14016 of intervals. */
14017 if (size <= 6 * PROBE_INTERVAL)
14019 HOST_WIDE_INT i;
14021 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
14022 it exceeds SIZE. If only one probe is needed, this will not
14023 generate any code. Then probe at FIRST + SIZE. */
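/* For example, assuming a 4 KiB probe interval, FIRST = 0x1000 and
   SIZE = 0x2800 yield probes at SP-0x2000 and SP-0x3000 from the loop,
   plus the final probe at SP-0x3800.  */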
14024 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
14025 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14026 -(first + i)));
14028 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14029 -(first + size)));
14032 /* Otherwise, do the same as above, but in a loop. Note that we must be
14033 extra careful with variables wrapping around because we might be at
14034 the very top (or the very bottom) of the address space and we have
14035 to be able to handle this case properly; in particular, we use an
14036 equality test for the loop condition. */
14037 else
14039 HOST_WIDE_INT rounded_size, last;
14040 struct scratch_reg sr;
14042 get_scratch_register_on_entry (&sr);
14045 /* Step 1: round SIZE to the previous multiple of the interval. */
14047 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
14050 /* Step 2: compute initial and final value of the loop counter. */
14052 /* TEST_OFFSET = FIRST. */
14053 emit_move_insn (sr.reg, GEN_INT (-first));
14055 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
14056 last = first + rounded_size;
14059 /* Step 3: the loop
14063 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
14064 probe at TEST_ADDR
14066 while (TEST_ADDR != LAST_ADDR)
14068 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
14069 until it is equal to ROUNDED_SIZE. */
14071 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
14074 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
14075 that SIZE is equal to ROUNDED_SIZE. */
14077 if (size != rounded_size)
14078 emit_stack_probe (plus_constant (Pmode,
14079 gen_rtx_PLUS (Pmode,
14080 stack_pointer_rtx,
14081 sr.reg),
14082 rounded_size - size));
14084 release_scratch_register_on_entry (&sr);
14087 /* Make sure nothing is scheduled before we are done. */
14088 emit_insn (gen_blockage ());
14091 /* Probe a range of stack addresses from REG to END, inclusive. These are
14092 offsets from the current stack pointer. */
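/* The emitted loop mirrors output_adjust_stack_and_probe above: decrement
   REG by the probe interval, or a zero into (SP + REG) to touch the page,
   then compare REG against END and loop while they differ.  */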
14094 const char *
14095 output_probe_stack_range (rtx reg, rtx end)
14097 static int labelno = 0;
14098 char loop_lab[32];
14099 rtx xops[3];
14101 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
14103 /* Loop. */
14104 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
14106 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
14107 xops[0] = reg;
14108 xops[1] = GEN_INT (PROBE_INTERVAL);
14109 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
14111 /* Probe at TEST_ADDR. */
14112 xops[0] = stack_pointer_rtx;
14113 xops[1] = reg;
14114 xops[2] = const0_rtx;
14115 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
14117 /* Test if TEST_ADDR == LAST_ADDR. */
14118 xops[0] = reg;
14119 xops[1] = end;
14120 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
14122 /* Branch. */
14123 fputs ("\tjne\t", asm_out_file);
14124 assemble_name_raw (asm_out_file, loop_lab);
14125 fputc ('\n', asm_out_file);
14127 return "";
14130 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
14131 to be generated in correct form. */
14132 static void
14133 ix86_finalize_stack_realign_flags (void)
14135 /* Check if stack realignment is really needed after reload, and
14136 store the result in cfun. */
14137 unsigned int incoming_stack_boundary
14138 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
14139 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
14140 unsigned int stack_realign
14141 = (incoming_stack_boundary
14142 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
14143 ? crtl->max_used_stack_slot_alignment
14144 : crtl->stack_alignment_needed));
14145 bool recompute_frame_layout_p = false;
14147 if (crtl->stack_realign_finalized)
14149 /* After stack_realign_needed is finalized, we can no longer
14150 change it. */
14151 gcc_assert (crtl->stack_realign_needed == stack_realign);
14152 return;
14155 /* If the only reason for frame_pointer_needed is that we conservatively
14156 assumed stack realignment might be needed, but in the end nothing that
14157 needed the stack alignment had been spilled, clear frame_pointer_needed
14158 and say we don't need stack realignment. */
14159 if (stack_realign
14160 && frame_pointer_needed
14161 && crtl->is_leaf
14162 && flag_omit_frame_pointer
14163 && crtl->sp_is_unchanging
14164 && !ix86_current_function_calls_tls_descriptor
14165 && !crtl->accesses_prior_frames
14166 && !cfun->calls_alloca
14167 && !crtl->calls_eh_return
14168 /* See ira_setup_eliminable_regset for the rationale. */
14169 && !(STACK_CHECK_MOVING_SP
14170 && flag_stack_check
14171 && flag_exceptions
14172 && cfun->can_throw_non_call_exceptions)
14173 && !ix86_frame_pointer_required ()
14174 && get_frame_size () == 0
14175 && ix86_nsaved_sseregs () == 0
14176 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
14178 HARD_REG_SET set_up_by_prologue, prologue_used;
14179 basic_block bb;
14181 CLEAR_HARD_REG_SET (prologue_used);
14182 CLEAR_HARD_REG_SET (set_up_by_prologue);
14183 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
14184 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
14185 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
14186 HARD_FRAME_POINTER_REGNUM);
14187 FOR_EACH_BB_FN (bb, cfun)
14189 rtx_insn *insn;
14190 FOR_BB_INSNS (bb, insn)
14191 if (NONDEBUG_INSN_P (insn)
14192 && requires_stack_frame_p (insn, prologue_used,
14193 set_up_by_prologue))
14195 if (crtl->stack_realign_needed != stack_realign)
14196 recompute_frame_layout_p = true;
14197 crtl->stack_realign_needed = stack_realign;
14198 crtl->stack_realign_finalized = true;
14199 if (recompute_frame_layout_p)
14200 ix86_compute_frame_layout ();
14201 return;
14205 /* If drap has been set, but it actually isn't live at the start
14206 of the function, there is no reason to set it up. */
14207 if (crtl->drap_reg)
14209 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14210 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
14212 crtl->drap_reg = NULL_RTX;
14213 crtl->need_drap = false;
14216 else
14217 cfun->machine->no_drap_save_restore = true;
14219 frame_pointer_needed = false;
14220 stack_realign = false;
14221 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
14222 crtl->stack_alignment_needed = incoming_stack_boundary;
14223 crtl->stack_alignment_estimated = incoming_stack_boundary;
14224 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
14225 crtl->preferred_stack_boundary = incoming_stack_boundary;
14226 df_finish_pass (true);
14227 df_scan_alloc (NULL);
14228 df_scan_blocks ();
14229 df_compute_regs_ever_live (true);
14230 df_analyze ();
14231 recompute_frame_layout_p = true;
14234 if (crtl->stack_realign_needed != stack_realign)
14235 recompute_frame_layout_p = true;
14236 crtl->stack_realign_needed = stack_realign;
14237 crtl->stack_realign_finalized = true;
14238 if (recompute_frame_layout_p)
14239 ix86_compute_frame_layout ();
14242 /* Delete SET_GOT right after entry block if it is allocated to reg. */
14244 static void
14245 ix86_elim_entry_set_got (rtx reg)
14247 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14248 rtx_insn *c_insn = BB_HEAD (bb);
14249 if (!NONDEBUG_INSN_P (c_insn))
14250 c_insn = next_nonnote_nondebug_insn (c_insn);
14251 if (c_insn && NONJUMP_INSN_P (c_insn))
14253 rtx pat = PATTERN (c_insn);
14254 if (GET_CODE (pat) == PARALLEL)
14256 rtx vec = XVECEXP (pat, 0, 0);
14257 if (GET_CODE (vec) == SET
14258 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
14259 && REGNO (XEXP (vec, 0)) == REGNO (reg))
14260 delete_insn (c_insn);
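/* Helpers for building frame save/restore RTL.  gen_frame_store produces
   (set (mem (plus FRAME_REG OFFSET)) REG) and gen_frame_load the converse
   (set REG (mem ...)); a zero OFFSET omits the PLUS.  */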
14265 static rtx
14266 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
14268 rtx addr, mem;
14270 if (offset)
14271 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
14272 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
14273 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
14276 static inline rtx
14277 gen_frame_load (rtx reg, rtx frame_reg, int offset)
14279 return gen_frame_set (reg, frame_reg, offset, false);
14282 static inline rtx
14283 gen_frame_store (rtx reg, rtx frame_reg, int offset)
14285 return gen_frame_set (reg, frame_reg, offset, true);
14288 static void
14289 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
14291 struct machine_function *m = cfun->machine;
14292 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14293 + m->call_ms2sysv_extra_regs;
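/* One vector element for each clobbered register to be saved, plus one
   for the USE of the stub symbol that is emitted first.  */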
14294 rtvec v = rtvec_alloc (ncregs + 1);
14295 unsigned int align, i, vi = 0;
14296 rtx_insn *insn;
14297 rtx sym, addr;
14298 rtx rax = gen_rtx_REG (word_mode, AX_REG);
14299 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14300 HOST_WIDE_INT rax_offset = xlogue.get_stub_ptr_offset () + m->fs.sp_offset;
14301 HOST_WIDE_INT stack_alloc_size = frame.stack_pointer_offset - m->fs.sp_offset;
14302 HOST_WIDE_INT stack_align_off_in = xlogue.get_stack_align_off_in ();
14304 /* Verify that the incoming stack 16-byte alignment offset matches the
14305 layout we're using. */
14306 gcc_assert (stack_align_off_in == (m->fs.sp_offset & UNITS_PER_WORD));
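/* The ms2sysv stubs exist only for 64-bit code, so UNITS_PER_WORD is 8
   here and the expression tests whether the current SP offset is a
   multiple of 16 or is off by one word; the stub layout chosen above
   must assume the same misalignment.  */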
14308 /* Get the stub symbol. */
14309 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
14310 : XLOGUE_STUB_SAVE);
14311 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14313 /* Set up RAX as the stub's base pointer. */
14314 align = GET_MODE_ALIGNMENT (V4SFmode);
14315 addr = choose_baseaddr (rax_offset, &align);
14316 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14317 insn = emit_insn (gen_rtx_SET (rax, addr));
14319 gcc_assert (stack_alloc_size >= xlogue.get_stack_space_used ());
14320 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14321 GEN_INT (-stack_alloc_size), -1,
14322 m->fs.cfa_reg == stack_pointer_rtx);
14323 for (i = 0; i < ncregs; ++i)
14325 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14326 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
14327 r.regno);
14328 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
14331 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
14333 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
14334 RTX_FRAME_RELATED_P (insn) = true;
14337 /* Expand the prologue into a bunch of separate insns. */
14339 void
14340 ix86_expand_prologue (void)
14342 struct machine_function *m = cfun->machine;
14343 rtx insn, t;
14344 struct ix86_frame frame;
14345 HOST_WIDE_INT allocate;
14346 bool int_registers_saved;
14347 bool sse_registers_saved;
14348 rtx static_chain = NULL_RTX;
14350 if (ix86_function_naked (current_function_decl))
14351 return;
14353 ix86_finalize_stack_realign_flags ();
14355 /* DRAP should not coexist with stack_realign_fp */
14356 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
14358 memset (&m->fs, 0, sizeof (m->fs));
14360 /* Initialize CFA state for before the prologue. */
14361 m->fs.cfa_reg = stack_pointer_rtx;
14362 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
14364 /* Track SP offset to the CFA. We continue tracking this after we've
14365 swapped the CFA register away from SP. In the case of re-alignment
14366 this is fudged; we're interested in offsets within the local frame. */
14367 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14368 m->fs.sp_valid = true;
14369 m->fs.sp_realigned = false;
14371 frame = m->frame;
14373 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
14375 /* We should have already generated an error for any use of
14376 ms_hook on a nested function. */
14377 gcc_checking_assert (!ix86_static_chain_on_stack);
14379 /* Check whether profiling is active and we shall use the
14380 profiling-before-prologue variant. If so, issue a sorry. */
14381 if (crtl->profile && flag_fentry != 0)
14382 sorry ("ms_hook_prologue attribute isn%'t compatible "
14383 "with -mfentry for 32-bit");
14385 /* In ix86_asm_output_function_label we emitted:
14386 8b ff movl.s %edi,%edi
14387 55 push %ebp
14388 8b ec movl.s %esp,%ebp
14390 This matches the hookable function prologue in Win32 API
14391 functions in Microsoft Windows XP Service Pack 2 and newer.
14392 Wine uses this to enable Windows apps to hook the Win32 API
14393 functions provided by Wine.
14395 What that means is that we've already set up the frame pointer. */
14397 if (frame_pointer_needed
14398 && !(crtl->drap_reg && crtl->stack_realign_needed))
14400 rtx push, mov;
14402 /* We've decided to use the frame pointer already set up.
14403 Describe this to the unwinder by pretending that both
14404 push and mov insns happen right here.
14406 Putting the unwind info here at the end of the ms_hook
14407 is done so that we can make absolutely certain we get
14408 the required byte sequence at the start of the function,
14409 rather than relying on an assembler that can produce
14410 the exact encoding required.
14412 However it does mean (in the unpatched case) that we have
14413 a 1 insn window where the asynchronous unwind info is
14414 incorrect. However, if we placed the unwind info at
14415 its correct location we would have incorrect unwind info
14416 in the patched case. Which is probably all moot since
14417 I don't expect Wine generates dwarf2 unwind info for the
14418 system libraries that use this feature. */
14420 insn = emit_insn (gen_blockage ());
14422 push = gen_push (hard_frame_pointer_rtx);
14423 mov = gen_rtx_SET (hard_frame_pointer_rtx,
14424 stack_pointer_rtx);
14425 RTX_FRAME_RELATED_P (push) = 1;
14426 RTX_FRAME_RELATED_P (mov) = 1;
14428 RTX_FRAME_RELATED_P (insn) = 1;
14429 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14430 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
14432 /* Note that gen_push incremented m->fs.cfa_offset, even
14433 though we didn't emit the push insn here. */
14434 m->fs.cfa_reg = hard_frame_pointer_rtx;
14435 m->fs.fp_offset = m->fs.cfa_offset;
14436 m->fs.fp_valid = true;
14438 else
14440 /* The frame pointer is not needed so pop %ebp again.
14441 This leaves us with a pristine state. */
14442 emit_insn (gen_pop (hard_frame_pointer_rtx));
14446 /* The first insn of a function that accepts its static chain on the
14447 stack is to push the register that would be filled in by a direct
14448 call. This insn will be skipped by the trampoline. */
14449 else if (ix86_static_chain_on_stack)
14451 static_chain = ix86_static_chain (cfun->decl, false);
14452 insn = emit_insn (gen_push (static_chain));
14453 emit_insn (gen_blockage ());
14455 /* We don't want to interpret this push insn as a register save,
14456 only as a stack adjustment. The real copy of the register as
14457 a save will be done later, if needed. */
14458 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
14459 t = gen_rtx_SET (stack_pointer_rtx, t);
14460 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
14461 RTX_FRAME_RELATED_P (insn) = 1;
14464 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
14465 DRAP is needed and stack realignment is really needed after reload. */
14466 if (stack_realign_drap)
14468 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14470 /* Can't use DRAP in interrupt function. */
14471 if (cfun->machine->func_type != TYPE_NORMAL)
14472 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
14473 "in interrupt service routine. This may be worked "
14474 "around by avoiding functions with aggregate return.");
14476 /* Only need to push parameter pointer reg if it is caller saved. */
14477 if (!call_used_regs[REGNO (crtl->drap_reg)])
14479 /* Push arg pointer reg */
14480 insn = emit_insn (gen_push (crtl->drap_reg));
14481 RTX_FRAME_RELATED_P (insn) = 1;
14484 /* Grab the argument pointer. */
14485 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
14486 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14487 RTX_FRAME_RELATED_P (insn) = 1;
14488 m->fs.cfa_reg = crtl->drap_reg;
14489 m->fs.cfa_offset = 0;
14491 /* Align the stack. */
14492 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14493 stack_pointer_rtx,
14494 GEN_INT (-align_bytes)));
14495 RTX_FRAME_RELATED_P (insn) = 1;
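/* For example, for a 32-byte alignment requirement this emits
   and $-32, %rsp (or %esp in 32-bit code), clearing the low bits of
   the stack pointer.  */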
14497 /* Replicate the return address on the stack so that return
14498 address can be reached via (argp - 1) slot. This is needed
14499 to implement macro RETURN_ADDR_RTX and intrinsic function
14500 expand_builtin_return_addr etc. */
14501 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
14502 t = gen_frame_mem (word_mode, t);
14503 insn = emit_insn (gen_push (t));
14504 RTX_FRAME_RELATED_P (insn) = 1;
14506 /* For the purposes of frame and register save area addressing,
14507 we've started over with a new frame. */
14508 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14509 m->fs.realigned = true;
14511 if (static_chain)
14513 /* Replicate static chain on the stack so that static chain
14514 can be reached via (argp - 2) slot. This is needed for
14515 nested function with stack realignment. */
14516 insn = emit_insn (gen_push (static_chain));
14517 RTX_FRAME_RELATED_P (insn) = 1;
14521 int_registers_saved = (frame.nregs == 0);
14522 sse_registers_saved = (frame.nsseregs == 0);
14524 if (frame_pointer_needed && !m->fs.fp_valid)
14526 /* Note: AT&T enter does NOT have reversed args. Enter is probably
14527 slower on all targets. Also sdb doesn't like it. */
14528 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
14529 RTX_FRAME_RELATED_P (insn) = 1;
14531 /* Push registers now, before setting the frame pointer
14532 on SEH target. */
14533 if (!int_registers_saved
14534 && TARGET_SEH
14535 && !frame.save_regs_using_mov)
14537 ix86_emit_save_regs ();
14538 int_registers_saved = true;
14539 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14542 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
14544 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
14545 RTX_FRAME_RELATED_P (insn) = 1;
14547 if (m->fs.cfa_reg == stack_pointer_rtx)
14548 m->fs.cfa_reg = hard_frame_pointer_rtx;
14549 m->fs.fp_offset = m->fs.sp_offset;
14550 m->fs.fp_valid = true;
14554 if (!int_registers_saved)
14556 /* If saving registers via PUSH, do so now. */
14557 if (!frame.save_regs_using_mov)
14559 ix86_emit_save_regs ();
14560 int_registers_saved = true;
14561 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14564 /* When using red zone we may start register saving before allocating
14565 the stack frame saving one cycle of the prologue. However, avoid
14566 doing this if we have to probe the stack; at least on x86_64 the
14567 stack probe can turn into a call that clobbers a red zone location. */
14568 else if (ix86_using_red_zone ()
14569 && (! TARGET_STACK_PROBE
14570 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
14572 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14573 int_registers_saved = true;
14577 if (stack_realign_fp)
14579 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14580 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
14582 /* The computation of the size of the re-aligned stack frame means
14583 that we must allocate the size of the register save area before
14584 performing the actual alignment. Otherwise we cannot guarantee
14585 that there's enough storage above the realignment point. */
14586 allocate = frame.stack_realign_allocate_offset - m->fs.sp_offset;
14587 if (allocate && !m->call_ms2sysv)
14588 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14589 GEN_INT (-allocate), -1, false);
14591 /* Align the stack. */
14592 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14593 stack_pointer_rtx,
14594 GEN_INT (-align_bytes)));
14595 /* For the purposes of register save area addressing, the stack
14596 pointer can no longer be used to access anything in the frame
14597 below m->fs.sp_realigned_offset and the frame pointer cannot be
14598 used for anything at or above. */
14599 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
14600 m->fs.sp_realigned = true;
14601 m->fs.sp_realigned_offset = m->fs.sp_offset - frame.nsseregs * 16;
14602 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
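/* Each SSE register save slot is 16 bytes (a V4SFmode store), so
   sp_realigned_offset marks the bottom of the SSE save area, which
   stays addressable through the realigned stack pointer.  */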
14603 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
14604 is needed to describe where a register is saved using a realigned
14605 stack pointer, so we need to invalidate the stack pointer for that
14606 target. */
14607 if (TARGET_SEH)
14608 m->fs.sp_valid = false;
14611 if (m->call_ms2sysv)
14612 ix86_emit_outlined_ms2sysv_save (frame);
14614 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
14616 if (flag_stack_usage_info)
14618 /* We start to count from ARG_POINTER. */
14619 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
14621 /* If it was realigned, take into account the fake frame. */
14622 if (stack_realign_drap)
14624 if (ix86_static_chain_on_stack)
14625 stack_size += UNITS_PER_WORD;
14627 if (!call_used_regs[REGNO (crtl->drap_reg)])
14628 stack_size += UNITS_PER_WORD;
14630 /* This over-estimates by 1 minimal-stack-alignment-unit but
14631 mitigates that by counting in the new return address slot. */
14632 current_function_dynamic_stack_size
14633 += crtl->stack_alignment_needed / BITS_PER_UNIT;
14636 current_function_static_stack_size = stack_size;
14639 /* On SEH target with very large frame size, allocate an area to save
14640 SSE registers (as the very large allocation won't be described). */
14641 if (TARGET_SEH
14642 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
14643 && !sse_registers_saved)
14645 HOST_WIDE_INT sse_size =
14646 frame.sse_reg_save_offset - frame.reg_save_offset;
14648 gcc_assert (int_registers_saved);
14650 /* No need to do stack checking as the area will be immediately
14651 written. */
14652 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14653 GEN_INT (-sse_size), -1,
14654 m->fs.cfa_reg == stack_pointer_rtx);
14655 allocate -= sse_size;
14656 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14657 sse_registers_saved = true;
14660 /* The stack has already been decremented by the instruction calling us
14661 so probe if the size is non-negative to preserve the protection area. */
14662 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
14664 /* We expect the registers to be saved when probes are used. */
14665 gcc_assert (int_registers_saved);
14667 if (STACK_CHECK_MOVING_SP)
14669 if (!(crtl->is_leaf && !cfun->calls_alloca
14670 && allocate <= PROBE_INTERVAL))
14672 ix86_adjust_stack_and_probe (allocate);
14673 allocate = 0;
14676 else
14678 HOST_WIDE_INT size = allocate;
14680 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
14681 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
14683 if (TARGET_STACK_PROBE)
14685 if (crtl->is_leaf && !cfun->calls_alloca)
14687 if (size > PROBE_INTERVAL)
14688 ix86_emit_probe_stack_range (0, size);
14690 else
14691 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
14693 else
14695 if (crtl->is_leaf && !cfun->calls_alloca)
14697 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
14698 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
14699 size - STACK_CHECK_PROTECT);
14701 else
14702 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
14707 if (allocate == 0)
14709 else if (!ix86_target_stack_probe ()
14710 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
14712 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14713 GEN_INT (-allocate), -1,
14714 m->fs.cfa_reg == stack_pointer_rtx);
14716 else
14718 rtx eax = gen_rtx_REG (Pmode, AX_REG);
14719 rtx r10 = NULL;
14720 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
14721 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
14722 bool eax_live = ix86_eax_live_at_start_p ();
14723 bool r10_live = false;
14725 if (TARGET_64BIT)
14726 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
14728 if (eax_live)
14730 insn = emit_insn (gen_push (eax));
14731 allocate -= UNITS_PER_WORD;
14732 /* Note that SEH directives need to continue tracking the stack
14733 pointer even after the frame pointer has been set up. */
14734 if (sp_is_cfa_reg || TARGET_SEH)
14736 if (sp_is_cfa_reg)
14737 m->fs.cfa_offset += UNITS_PER_WORD;
14738 RTX_FRAME_RELATED_P (insn) = 1;
14739 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14740 gen_rtx_SET (stack_pointer_rtx,
14741 plus_constant (Pmode, stack_pointer_rtx,
14742 -UNITS_PER_WORD)));
14746 if (r10_live)
14748 r10 = gen_rtx_REG (Pmode, R10_REG);
14749 insn = emit_insn (gen_push (r10));
14750 allocate -= UNITS_PER_WORD;
14751 if (sp_is_cfa_reg || TARGET_SEH)
14753 if (sp_is_cfa_reg)
14754 m->fs.cfa_offset += UNITS_PER_WORD;
14755 RTX_FRAME_RELATED_P (insn) = 1;
14756 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14757 gen_rtx_SET (stack_pointer_rtx,
14758 plus_constant (Pmode, stack_pointer_rtx,
14759 -UNITS_PER_WORD)));
14763 emit_move_insn (eax, GEN_INT (allocate));
14764 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
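/* The allocate_stack_worker pattern is responsible for probing the
   newly allocated pages; on Windows targets this typically expands to
   a call to the chkstk helper.  */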
14766 /* Use the fact that AX still contains ALLOCATE. */
14767 adjust_stack_insn = (Pmode == DImode
14768 ? gen_pro_epilogue_adjust_stack_di_sub
14769 : gen_pro_epilogue_adjust_stack_si_sub);
14771 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14772 stack_pointer_rtx, eax));
14774 if (sp_is_cfa_reg || TARGET_SEH)
14776 if (sp_is_cfa_reg)
14777 m->fs.cfa_offset += allocate;
14778 RTX_FRAME_RELATED_P (insn) = 1;
14779 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14780 gen_rtx_SET (stack_pointer_rtx,
14781 plus_constant (Pmode, stack_pointer_rtx,
14782 -allocate)));
14784 m->fs.sp_offset += allocate;
14786 /* Use stack_pointer_rtx for relative addressing so that code
14787 works for realigned stack, too. */
14788 if (r10_live && eax_live)
14790 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14791 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14792 gen_frame_mem (word_mode, t));
14793 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14794 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14795 gen_frame_mem (word_mode, t));
14797 else if (eax_live || r10_live)
14799 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14800 emit_move_insn (gen_rtx_REG (word_mode,
14801 (eax_live ? AX_REG : R10_REG)),
14802 gen_frame_mem (word_mode, t));
14805 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
14807 /* If we haven't already set up the frame pointer, do so now. */
14808 if (frame_pointer_needed && !m->fs.fp_valid)
14810 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14811 GEN_INT (frame.stack_pointer_offset
14812 - frame.hard_frame_pointer_offset));
14813 insn = emit_insn (insn);
14814 RTX_FRAME_RELATED_P (insn) = 1;
14815 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14817 if (m->fs.cfa_reg == stack_pointer_rtx)
14818 m->fs.cfa_reg = hard_frame_pointer_rtx;
14819 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14820 m->fs.fp_valid = true;
14823 if (!int_registers_saved)
14824 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14825 if (!sse_registers_saved)
14826 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14828 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
14829 in the prologue. */
14830 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14832 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14833 insn = emit_insn (gen_set_got (pic));
14834 RTX_FRAME_RELATED_P (insn) = 1;
14835 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14836 emit_insn (gen_prologue_use (pic));
14837 /* Delete an already emitted SET_GOT if it exists and is allocated to
14838 REAL_PIC_OFFSET_TABLE_REGNUM. */
14839 ix86_elim_entry_set_got (pic);
14842 if (crtl->drap_reg && !crtl->stack_realign_needed)
14844 /* vDRAP is set up, but after reload it turns out stack realignment
14845 isn't necessary; here we emit the prologue to set up DRAP
14846 without the stack realignment adjustment. */
14847 t = choose_baseaddr (0, NULL);
14848 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14851 /* Prevent instructions from being scheduled into register save push
14852 sequence when access to the redzone area is done through frame pointer.
14853 The offset between the frame pointer and the stack pointer is calculated
14854 relative to the value of the stack pointer at the end of the function
14855 prologue, and moving instructions that access redzone area via frame
14856 pointer inside push sequence violates this assumption. */
14857 if (frame_pointer_needed && frame.red_zone_size)
14858 emit_insn (gen_memory_blockage ());
14860 /* SEH requires that the prologue end within 256 bytes of the start of
14861 the function. Prevent instruction schedules that would extend that.
14862 Further, prevent alloca modifications to the stack pointer from being
14863 combined with prologue modifications. */
14864 if (TARGET_SEH)
14865 emit_insn (gen_prologue_use (stack_pointer_rtx));
14868 /* Emit code to restore REG using a POP insn. */
14870 static void
14871 ix86_emit_restore_reg_using_pop (rtx reg)
14873 struct machine_function *m = cfun->machine;
14874 rtx_insn *insn = emit_insn (gen_pop (reg));
14876 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14877 m->fs.sp_offset -= UNITS_PER_WORD;
14879 if (m->fs.cfa_reg == crtl->drap_reg
14880 && REGNO (reg) == REGNO (crtl->drap_reg))
14882 /* Previously we'd represented the CFA as an expression
14883 like *(%ebp - 8). We've just popped that value from
14884 the stack, which means we need to reset the CFA to
14885 the drap register. This will remain until we restore
14886 the stack pointer. */
14887 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14888 RTX_FRAME_RELATED_P (insn) = 1;
14890 /* This means that the DRAP register is valid for addressing too. */
14891 m->fs.drap_valid = true;
14892 return;
14895 if (m->fs.cfa_reg == stack_pointer_rtx)
14897 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14898 x = gen_rtx_SET (stack_pointer_rtx, x);
14899 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14900 RTX_FRAME_RELATED_P (insn) = 1;
14902 m->fs.cfa_offset -= UNITS_PER_WORD;
14905 /* When the frame pointer is the CFA, and we pop it, we are
14906 swapping back to the stack pointer as the CFA. This happens
14907 for stack frames that don't allocate other data, so we assume
14908 the stack pointer is now pointing at the return address, i.e.
14909 the function entry state, which makes the offset be 1 word. */
14910 if (reg == hard_frame_pointer_rtx)
14912 m->fs.fp_valid = false;
14913 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14915 m->fs.cfa_reg = stack_pointer_rtx;
14916 m->fs.cfa_offset -= UNITS_PER_WORD;
14918 add_reg_note (insn, REG_CFA_DEF_CFA,
14919 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14920 GEN_INT (m->fs.cfa_offset)));
14921 RTX_FRAME_RELATED_P (insn) = 1;
14926 /* Emit code to restore saved registers using POP insns. */
14928 static void
14929 ix86_emit_restore_regs_using_pop (void)
14931 unsigned int regno;
14933 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14934 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
14935 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14938 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
14939 omits the emit and only attaches the notes. */
14941 static void
14942 ix86_emit_leave (rtx_insn *insn)
14944 struct machine_function *m = cfun->machine;
14945 if (!insn)
14946 insn = emit_insn (ix86_gen_leave ());
14948 ix86_add_queued_cfa_restore_notes (insn);
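/* A leave insn sets the stack pointer from the frame pointer and pops
   the saved frame pointer, so afterwards SP is valid again, one word
   above the frame pointer's old value, and FP no longer is.  */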
14950 gcc_assert (m->fs.fp_valid);
14951 m->fs.sp_valid = true;
14952 m->fs.sp_realigned = false;
14953 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14954 m->fs.fp_valid = false;
14956 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14958 m->fs.cfa_reg = stack_pointer_rtx;
14959 m->fs.cfa_offset = m->fs.sp_offset;
14961 add_reg_note (insn, REG_CFA_DEF_CFA,
14962 plus_constant (Pmode, stack_pointer_rtx,
14963 m->fs.sp_offset));
14964 RTX_FRAME_RELATED_P (insn) = 1;
14966 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14967 m->fs.fp_offset);
14970 /* Emit code to restore saved registers using MOV insns.
14971 First register is restored from CFA - CFA_OFFSET. */
14972 static void
14973 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14974 bool maybe_eh_return)
14976 struct machine_function *m = cfun->machine;
14977 unsigned int regno;
14979 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14980 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14982 rtx reg = gen_rtx_REG (word_mode, regno);
14983 rtx mem;
14984 rtx_insn *insn;
14986 mem = choose_baseaddr (cfa_offset, NULL);
14987 mem = gen_frame_mem (word_mode, mem);
14988 insn = emit_move_insn (reg, mem);
14990 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14992 /* Previously we'd represented the CFA as an expression
14993 like *(%ebp - 8). We've just popped that value from
14994 the stack, which means we need to reset the CFA to
14995 the drap register. This will remain until we restore
14996 the stack pointer. */
14997 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14998 RTX_FRAME_RELATED_P (insn) = 1;
15000 /* This means that the DRAP register is valid for addressing. */
15001 m->fs.drap_valid = true;
15003 else
15004 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
15006 cfa_offset -= UNITS_PER_WORD;
15010 /* Emit code to restore saved registers using MOV insns.
15011 First register is restored from CFA - CFA_OFFSET. */
15012 static void
15013 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
15014 bool maybe_eh_return)
15016 unsigned int regno;
15018 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15019 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
15021 rtx reg = gen_rtx_REG (V4SFmode, regno);
15022 rtx mem;
15023 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
15025 mem = choose_baseaddr (cfa_offset, &align);
15026 mem = gen_rtx_MEM (V4SFmode, mem);
15028 /* The location aligment depends upon the base register. */
15029 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
15030 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
15031 set_mem_align (mem, align);
15032 emit_insn (gen_rtx_SET (reg, mem));
15034 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
15036 cfa_offset -= GET_MODE_SIZE (V4SFmode);
15040 static void
15041 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
15042 bool use_call, int style)
15044 struct machine_function *m = cfun->machine;
15045 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
15046 + m->call_ms2sysv_extra_regs;
15047 rtvec v;
15048 unsigned int elems_needed, align, i, vi = 0;
15049 rtx_insn *insn;
15050 rtx sym, tmp;
15051 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
15052 rtx r10 = NULL_RTX;
15053 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
15054 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
15055 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
15056 rtx rsi_frame_load = NULL_RTX;
15057 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
15058 enum xlogue_stub stub;
15060 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
15062 /* If using a realigned stack, we should never start with padding. */
15063 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
15065 /* Set up RSI as the stub's base pointer. */
15066 align = GET_MODE_ALIGNMENT (V4SFmode);
15067 tmp = choose_baseaddr (rsi_offset, &align);
15068 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
15069 emit_insn (gen_rtx_SET (rsi, tmp));
15071 /* Get a symbol for the stub. */
15072 if (frame_pointer_needed)
15073 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
15074 : XLOGUE_STUB_RESTORE_HFP_TAIL;
15075 else
15076 stub = use_call ? XLOGUE_STUB_RESTORE
15077 : XLOGUE_STUB_RESTORE_TAIL;
15078 sym = xlogue.get_stub_rtx (stub);
15080 elems_needed = ncregs;
15081 if (use_call)
15082 elems_needed += 1;
15083 else
15084 elems_needed += frame_pointer_needed ? 5 : 3;
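/* Besides the frame loads, a call only needs the USE of the stub
   symbol.  A tail call needs the return, the USE, and either the SP
   restore, the RBP reload and a memory clobber (hard frame pointer
   case), or a single SP-from-R10 set.  */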
15085 v = rtvec_alloc (elems_needed);
15087 /* We call the epilogue stub when we need to pop incoming args or when a
15088 sibling call will be the tail call. Otherwise, we emit a jmp to the
15089 epilogue stub, and that jmp is the tail call. */
15090 if (use_call)
15091 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15092 else
15094 RTVEC_ELT (v, vi++) = ret_rtx;
15095 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15096 if (frame_pointer_needed)
15098 rtx rbp = gen_rtx_REG (DImode, BP_REG);
15099 gcc_assert (m->fs.fp_valid);
15100 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
15102 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
15103 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
15104 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
15105 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
15106 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
15108 else
15110 /* If no hard frame pointer, we set R10 to the SP restore value. */
15111 gcc_assert (!m->fs.fp_valid);
15112 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15113 gcc_assert (m->fs.sp_valid);
15115 r10 = gen_rtx_REG (DImode, R10_REG);
15116 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
15117 emit_insn (gen_rtx_SET (r10, tmp));
15119 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
15123 /* Generate frame load insns and restore notes. */
15124 for (i = 0; i < ncregs; ++i)
15126 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
15127 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
15128 rtx reg, frame_load;
15130 reg = gen_rtx_REG (mode, r.regno);
15131 frame_load = gen_frame_load (reg, rsi, r.offset);
15133 /* Save RSI frame load insn & note to add last. */
15134 if (r.regno == SI_REG)
15136 gcc_assert (!rsi_frame_load);
15137 rsi_frame_load = frame_load;
15138 rsi_restore_offset = r.offset;
15140 else
15142 RTVEC_ELT (v, vi++) = frame_load;
15143 ix86_add_cfa_restore_note (NULL, reg, r.offset);
15147 /* Add RSI frame load & restore note at the end. */
15148 gcc_assert (rsi_frame_load);
15149 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
15150 RTVEC_ELT (v, vi++) = rsi_frame_load;
15151 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
15152 rsi_restore_offset);
15154 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
15155 if (!use_call && !frame_pointer_needed)
15157 gcc_assert (m->fs.sp_valid);
15158 gcc_assert (!m->fs.sp_realigned);
15160 /* At this point, R10 should point to frame.stack_realign_offset. */
15161 if (m->fs.cfa_reg == stack_pointer_rtx)
15162 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
15163 m->fs.sp_offset = frame.stack_realign_offset;
15166 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
15167 tmp = gen_rtx_PARALLEL (VOIDmode, v);
15168 if (use_call)
15169 insn = emit_insn (tmp);
15170 else
15172 insn = emit_jump_insn (tmp);
15173 JUMP_LABEL (insn) = ret_rtx;
15175 if (frame_pointer_needed)
15176 ix86_emit_leave (insn);
15177 else
15179 /* Need CFA adjust note. */
15180 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
15181 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
15185 RTX_FRAME_RELATED_P (insn) = true;
15186 ix86_add_queued_cfa_restore_notes (insn);
15188 /* If we're not doing a tail-call, we need to adjust the stack. */
15189 if (use_call && m->fs.sp_valid)
15191 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
15192 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15193 GEN_INT (dealloc), style,
15194 m->fs.cfa_reg == stack_pointer_rtx);
15198 /* Restore function stack, frame, and registers. */
15200 void
15201 ix86_expand_epilogue (int style)
15203 struct machine_function *m = cfun->machine;
15204 struct machine_frame_state frame_state_save = m->fs;
15205 struct ix86_frame frame;
15206 bool restore_regs_via_mov;
15207 bool using_drap;
15208 bool restore_stub_is_tail = false;
15210 if (ix86_function_naked (current_function_decl))
15212 /* The program should not reach this point. */
15213 emit_insn (gen_trap ());
15214 return;
15217 ix86_finalize_stack_realign_flags ();
15218 frame = m->frame;
15220 m->fs.sp_realigned = stack_realign_fp;
15221 m->fs.sp_valid = stack_realign_fp
15222 || !frame_pointer_needed
15223 || crtl->sp_is_unchanging;
15224 gcc_assert (!m->fs.sp_valid
15225 || m->fs.sp_offset == frame.stack_pointer_offset);
15227 /* The FP must be valid if the frame pointer is present. */
15228 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
15229 gcc_assert (!m->fs.fp_valid
15230 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
15232 /* We must have *some* valid pointer to the stack frame. */
15233 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
15235 /* The DRAP is never valid at this point. */
15236 gcc_assert (!m->fs.drap_valid);
15238 /* See the comment about red zone and frame
15239 pointer usage in ix86_expand_prologue. */
15240 if (frame_pointer_needed && frame.red_zone_size)
15241 emit_insn (gen_memory_blockage ());
15243 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
15244 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
15246 /* Determine the CFA offset of the end of the red-zone. */
15247 m->fs.red_zone_offset = 0;
15248 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
15250 /* The red-zone begins below return address and error code in
15251 exception handler. */
15252 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
15254 /* When the register save area is in the aligned portion of
15255 the stack, determine the maximum runtime displacement that
15256 matches up with the aligned frame. */
15257 if (stack_realign_drap)
15258 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
15259 + UNITS_PER_WORD);
15262 /* Special care must be taken for the normal return case of a function
15263 using eh_return: the eax and edx registers are marked as saved, but
15264 not restored along this path. Adjust the save location to match. */
15265 if (crtl->calls_eh_return && style != 2)
15266 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
15268 /* EH_RETURN requires the use of moves to function properly. */
15269 if (crtl->calls_eh_return)
15270 restore_regs_via_mov = true;
15271 /* SEH requires the use of pops to identify the epilogue. */
15272 else if (TARGET_SEH)
15273 restore_regs_via_mov = false;
15274 /* If we're only restoring one register and sp cannot be used, then
15275 use a move instruction to restore the register, since it's
15276 less work than reloading sp and popping the register. */
15277 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
15278 restore_regs_via_mov = true;
15279 else if (TARGET_EPILOGUE_USING_MOVE
15280 && cfun->machine->use_fast_prologue_epilogue
15281 && (frame.nregs > 1
15282 || m->fs.sp_offset != frame.reg_save_offset))
15283 restore_regs_via_mov = true;
15284 else if (frame_pointer_needed
15285 && !frame.nregs
15286 && m->fs.sp_offset != frame.reg_save_offset)
15287 restore_regs_via_mov = true;
15288 else if (frame_pointer_needed
15289 && TARGET_USE_LEAVE
15290 && cfun->machine->use_fast_prologue_epilogue
15291 && frame.nregs == 1)
15292 restore_regs_via_mov = true;
15293 else
15294 restore_regs_via_mov = false;
15296 if (restore_regs_via_mov || frame.nsseregs)
15298 /* Ensure that the entire register save area is addressable via
15299 the stack pointer, if we will restore via sp. */
15300 if (TARGET_64BIT
15301 && m->fs.sp_offset > 0x7fffffff
15302 && !(fp_valid_at (frame.stack_realign_offset) || m->fs.drap_valid)
15303 && (frame.nsseregs + frame.nregs) != 0)
15305 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15306 GEN_INT (m->fs.sp_offset
15307 - frame.sse_reg_save_offset),
15308 style,
15309 m->fs.cfa_reg == stack_pointer_rtx);
15313 /* If there are any SSE registers to restore, then we have to do it
15314 via moves, since there's obviously no pop for SSE regs. */
15315 if (frame.nsseregs)
15316 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
15317 style == 2);
15319 if (m->call_ms2sysv)
15321 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
15323 /* We cannot use a tail-call for the stub if:
15324 1. We have to pop incoming args,
15325 2. We have additional int regs to restore, or
15326 3. A sibling call will be the tail-call, or
15327 4. We are emitting an eh_return_internal epilogue.
15329 TODO: Item 4 has not yet been tested!
15331 If any of the above are true, we will call the stub rather than
15332 jump to it. */
15333 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
15334 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
15337 /* If using an out-of-line stub that is a tail call, then... */
15338 if (m->call_ms2sysv && restore_stub_is_tail)
15340 /* TODO: paranoid tests. (remove eventually) */
15341 gcc_assert (m->fs.sp_valid);
15342 gcc_assert (!m->fs.sp_realigned);
15343 gcc_assert (!m->fs.fp_valid);
15344 gcc_assert (!m->fs.realigned);
15345 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
15346 gcc_assert (!crtl->drap_reg);
15347 gcc_assert (!frame.nregs);
15349 else if (restore_regs_via_mov)
15351 rtx t;
15353 if (frame.nregs)
15354 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
15356 /* eh_return epilogues need %ecx added to the stack pointer. */
15357 if (style == 2)
15359 rtx sa = EH_RETURN_STACKADJ_RTX;
15360 rtx_insn *insn;
15362 /* %ecx can't be used for both DRAP register and eh_return. */
15363 if (crtl->drap_reg)
15364 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
15366 /* regparm nested functions don't work with eh_return. */
15367 gcc_assert (!ix86_static_chain_on_stack);
15369 if (frame_pointer_needed)
15371 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
15372 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
15373 emit_insn (gen_rtx_SET (sa, t));
15375 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
15376 insn = emit_move_insn (hard_frame_pointer_rtx, t);
15378 /* Note that we use SA as a temporary CFA, as the return
15379 address is at the proper place relative to it. We
15380 pretend this happens at the FP restore insn because
15381 prior to this insn the FP would be stored at the wrong
15382 offset relative to SA, and after this insn we have no
15383 other reasonable register to use for the CFA. We don't
15384 bother resetting the CFA to the SP for the duration of
15385 the return insn. */
15386 add_reg_note (insn, REG_CFA_DEF_CFA,
15387 plus_constant (Pmode, sa, UNITS_PER_WORD));
15388 ix86_add_queued_cfa_restore_notes (insn);
15389 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
15390 RTX_FRAME_RELATED_P (insn) = 1;
15392 m->fs.cfa_reg = sa;
15393 m->fs.cfa_offset = UNITS_PER_WORD;
15394 m->fs.fp_valid = false;
15396 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
15397 const0_rtx, style, false);
15399 else
15401 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
15402 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
15403 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
15404 ix86_add_queued_cfa_restore_notes (insn);
15406 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15407 if (m->fs.cfa_offset != UNITS_PER_WORD)
15409 m->fs.cfa_offset = UNITS_PER_WORD;
15410 add_reg_note (insn, REG_CFA_DEF_CFA,
15411 plus_constant (Pmode, stack_pointer_rtx,
15412 UNITS_PER_WORD));
15413 RTX_FRAME_RELATED_P (insn) = 1;
15416 m->fs.sp_offset = UNITS_PER_WORD;
15417 m->fs.sp_valid = true;
15418 m->fs.sp_realigned = false;
15421 else
15423 /* SEH requires that the function end with (1) a stack adjustment
15424 if necessary, (2) a sequence of pops, and (3) a return or
15425 jump instruction. Prevent insns from the function body from
15426 being scheduled into this sequence. */
15427 if (TARGET_SEH)
15429 /* Prevent a catch region from being adjacent to the standard
15430 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
15431 several other flags that would be interesting to test are
15432 set up yet.
15433 if (flag_non_call_exceptions)
15434 emit_insn (gen_nops (const1_rtx));
15435 else
15436 emit_insn (gen_blockage ());
15439 /* First step is to deallocate the stack frame so that we can
15440 pop the registers. If the stack pointer was realigned, it needs
15441 to be restored now. Also do it on SEH target for very large
15442 frame as the emitted instructions aren't allowed by the ABI
15443 in epilogues. */
15444 if (!m->fs.sp_valid || m->fs.sp_realigned
15445 || (TARGET_SEH
15446 && (m->fs.sp_offset - frame.reg_save_offset
15447 >= SEH_MAX_FRAME_SIZE)))
15449 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
15450 GEN_INT (m->fs.fp_offset
15451 - frame.reg_save_offset),
15452 style, false);
15454 else if (m->fs.sp_offset != frame.reg_save_offset)
15456 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15457 GEN_INT (m->fs.sp_offset
15458 - frame.reg_save_offset),
15459 style,
15460 m->fs.cfa_reg == stack_pointer_rtx);
15463 ix86_emit_restore_regs_using_pop ();
15466 /* If we used a frame pointer and haven't already got rid of it,
15467 then do so now. */
15468 if (m->fs.fp_valid)
15470 /* If the stack pointer is valid and pointing at the frame
15471 pointer store address, then we only need a pop. */
15472 if (sp_valid_at (frame.hfp_save_offset)
15473 && m->fs.sp_offset == frame.hfp_save_offset)
15474 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15475 /* Leave results in shorter dependency chains on CPUs that are
15476 able to grok it fast. */
15477 else if (TARGET_USE_LEAVE
15478 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
15479 || !cfun->machine->use_fast_prologue_epilogue)
15480 ix86_emit_leave (NULL);
15481 else
15483 pro_epilogue_adjust_stack (stack_pointer_rtx,
15484 hard_frame_pointer_rtx,
15485 const0_rtx, style, !using_drap);
15486 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15490 if (using_drap)
15492 int param_ptr_offset = UNITS_PER_WORD;
15493 rtx_insn *insn;
15495 gcc_assert (stack_realign_drap);
15497 if (ix86_static_chain_on_stack)
15498 param_ptr_offset += UNITS_PER_WORD;
15499 if (!call_used_regs[REGNO (crtl->drap_reg)])
15500 param_ptr_offset += UNITS_PER_WORD;
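/* SP is placed param_ptr_offset bytes below the argument pointer held
   in the DRAP register: one word for the return address slot, plus one
   word each for a static chain saved on the stack and for a call-saved
   DRAP register, which are deallocated or popped below.  */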
15502 insn = emit_insn (gen_rtx_SET
15503 (stack_pointer_rtx,
15504 gen_rtx_PLUS (Pmode,
15505 crtl->drap_reg,
15506 GEN_INT (-param_ptr_offset))));
15507 m->fs.cfa_reg = stack_pointer_rtx;
15508 m->fs.cfa_offset = param_ptr_offset;
15509 m->fs.sp_offset = param_ptr_offset;
15510 m->fs.realigned = false;
15512 add_reg_note (insn, REG_CFA_DEF_CFA,
15513 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15514 GEN_INT (param_ptr_offset)));
15515 RTX_FRAME_RELATED_P (insn) = 1;
15517 if (!call_used_regs[REGNO (crtl->drap_reg)])
15518 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
15521 /* At this point the stack pointer must be valid, and we must have
15522 restored all of the registers. We may not have deallocated the
15523 entire stack frame. We've delayed this until now because it may
15524 be possible to merge the local stack deallocation with the
15525 deallocation forced by ix86_static_chain_on_stack. */
15526 gcc_assert (m->fs.sp_valid);
15527 gcc_assert (!m->fs.sp_realigned);
15528 gcc_assert (!m->fs.fp_valid);
15529 gcc_assert (!m->fs.realigned);
15530 if (m->fs.sp_offset != UNITS_PER_WORD)
15532 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15533 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
15534 style, true);
15536 else
15537 ix86_add_queued_cfa_restore_notes (get_last_insn ());
15539 /* Sibcall epilogues don't want a return instruction. */
15540 if (style == 0)
15542 m->fs = frame_state_save;
15543 return;
15546 if (cfun->machine->func_type != TYPE_NORMAL)
15547 emit_jump_insn (gen_interrupt_return ());
15548 else if (crtl->args.pops_args && crtl->args.size)
15550 rtx popc = GEN_INT (crtl->args.pops_args);
15552 /* i386 can only pop 64K bytes. If asked to pop more, pop return
15553 address, do explicit add, and jump indirectly to the caller. */
15555 if (crtl->args.pops_args >= 65536)
15557 rtx ecx = gen_rtx_REG (SImode, CX_REG);
15558 rtx_insn *insn;
15560 /* There is no "pascal" calling convention in any 64bit ABI. */
15561 gcc_assert (!TARGET_64BIT);
15563 insn = emit_insn (gen_pop (ecx));
15564 m->fs.cfa_offset -= UNITS_PER_WORD;
15565 m->fs.sp_offset -= UNITS_PER_WORD;
15567 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
15568 x = gen_rtx_SET (stack_pointer_rtx, x);
15569 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
15570 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
15571 RTX_FRAME_RELATED_P (insn) = 1;
15573 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15574 popc, -1, true);
15575 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
15577 else
15578 emit_jump_insn (gen_simple_return_pop_internal (popc));
15580 else if (!m->call_ms2sysv || !restore_stub_is_tail)
15581 emit_jump_insn (gen_simple_return_internal ());
15583 /* Restore the state back to the state from the prologue,
15584 so that it's correct for the next epilogue. */
15585 m->fs = frame_state_save;
15588 /* Reset from the function's potential modifications. */
15590 static void
15591 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, HOST_WIDE_INT)
15593 if (pic_offset_table_rtx
15594 && !ix86_use_pseudo_pic_reg ())
15595 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
15597 if (TARGET_MACHO)
15599 rtx_insn *insn = get_last_insn ();
15600 rtx_insn *deleted_debug_label = NULL;
15602 /* Mach-O doesn't support labels at the end of objects, so if
15603 it looks like we might want one, take special action.
15604 First, collect any sequence of deleted debug labels. */
15605 while (insn
15606 && NOTE_P (insn)
15607 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
15609 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
15610 notes only, instead set their CODE_LABEL_NUMBER to -1,
15611 otherwise there would be code generation differences
15612 in between -g and -g0. */
15613 if (NOTE_P (insn) && NOTE_KIND (insn)
15614 == NOTE_INSN_DELETED_DEBUG_LABEL)
15615 deleted_debug_label = insn;
15616 insn = PREV_INSN (insn);
15619 /* If we have:
15620 label:
15621 barrier
15622 then this needs to be detected, so skip past the barrier. */
15624 if (insn && BARRIER_P (insn))
15625 insn = PREV_INSN (insn);
15627 /* Up to now we've only seen notes or barriers. */
15628 if (insn)
15630 if (LABEL_P (insn)
15631 || (NOTE_P (insn)
15632 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
15633 /* Trailing label. */
15634 fputs ("\tnop\n", file);
15635 else if (cfun && ! cfun->is_thunk)
15637 /* See if we have a completely empty function body, skipping
15638 the special case of the picbase thunk emitted as asm. */
15639 while (insn && ! INSN_P (insn))
15640 insn = PREV_INSN (insn);
15641 /* If we don't find any insns, we've got an empty function body;
15642 i.e. completely empty, without a return or branch. This is
15643 taken as the case where a function body has been removed
15644 because it contains an inline __builtin_unreachable(). GCC
15645 declares that reaching __builtin_unreachable() means UB so
15646 we're not obliged to do anything special; however, we want
15647 non-zero-sized function bodies. To meet this, and help the
15648 user out, let's trap the case. */
15649 if (insn == NULL)
15650 fputs ("\tud2\n", file);
15653 else if (deleted_debug_label)
15654 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
15655 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
15656 CODE_LABEL_NUMBER (insn) = -1;
15660 /* Return a scratch register to use in the split stack prologue. The
15661 split stack prologue is used for -fsplit-stack. It is the first
15662 instructions in the function, even before the regular prologue.
15663 The scratch register can be any caller-saved register which is not
15664 used for parameters or for the static chain. */
15666 static unsigned int
15667 split_stack_prologue_scratch_regno (void)
15669 if (TARGET_64BIT)
15670 return R11_REG;
15671 else
15673 bool is_fastcall, is_thiscall;
15674 int regparm;
15676 is_fastcall = (lookup_attribute ("fastcall",
15677 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15678 != NULL);
15679 is_thiscall = (lookup_attribute ("thiscall",
15680 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15681 != NULL);
15682 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
15684 if (is_fastcall)
15686 if (DECL_STATIC_CHAIN (cfun->decl))
15688 sorry ("-fsplit-stack does not support fastcall with "
15689 "nested function");
15690 return INVALID_REGNUM;
15692 return AX_REG;
15694 else if (is_thiscall)
15696 if (!DECL_STATIC_CHAIN (cfun->decl))
15697 return DX_REG;
15698 return AX_REG;
15700 else if (regparm < 3)
15702 if (!DECL_STATIC_CHAIN (cfun->decl))
15703 return CX_REG;
15704 else
15706 if (regparm >= 2)
15708 sorry ("-fsplit-stack does not support 2 register "
15709 "parameters for a nested function");
15710 return INVALID_REGNUM;
15712 return DX_REG;
15715 else
15717 /* FIXME: We could make this work by pushing a register
15718 around the addition and comparison. */
15719 sorry ("-fsplit-stack does not support 3 register parameters");
15720 return INVALID_REGNUM;
15725 /* A SYMBOL_REF for the function which allocates new stackspace for
15726 -fsplit-stack. */
15728 static GTY(()) rtx split_stack_fn;
15730 /* A SYMBOL_REF for the more stack function when using the large
15731 model. */
15733 static GTY(()) rtx split_stack_fn_large;
15735 /* Handle -fsplit-stack. These are the first instructions in the
15736 function, even before the regular prologue. */
15738 void
15739 ix86_expand_split_stack_prologue (void)
15741 struct ix86_frame frame;
15742 HOST_WIDE_INT allocate;
15743 unsigned HOST_WIDE_INT args_size;
15744 rtx_code_label *label;
15745 rtx limit, current, allocate_rtx, call_insn, call_fusage;
15746 rtx scratch_reg = NULL_RTX;
15747 rtx_code_label *varargs_label = NULL;
15748 rtx fn;
15750 gcc_assert (flag_split_stack && reload_completed);
15752 ix86_finalize_stack_realign_flags ();
15753 frame = cfun->machine->frame;
15754 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
15756 /* This is the label we will branch to if we have enough stack
15757 space. We expect the basic block reordering pass to reverse this
15758 branch if optimizing, so that we branch in the unlikely case. */
15759 label = gen_label_rtx ();
15761 /* We need to compare the stack pointer minus the frame size with
15762 the stack boundary in the TCB. The stack boundary always gives
15763 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
15764 can compare directly. Otherwise we need to do an addition. */
15766 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
15767 UNSPEC_STACK_CHECK);
15768 limit = gen_rtx_CONST (Pmode, limit);
15769 limit = gen_rtx_MEM (Pmode, limit);
15770 if (allocate < SPLIT_STACK_AVAILABLE)
15771 current = stack_pointer_rtx;
15772 else
15774 unsigned int scratch_regno;
15775 rtx offset;
15777 /* We need a scratch register to hold the stack pointer minus
15778 the required frame size. Since this is the very start of the
15779 function, the scratch register can be any caller-saved
15780 register which is not used for parameters. */
15781 offset = GEN_INT (- allocate);
15782 scratch_regno = split_stack_prologue_scratch_regno ();
15783 if (scratch_regno == INVALID_REGNUM)
15784 return;
15785 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15786 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
15788 /* We don't use ix86_gen_add3 in this case because it will
15789 want to split to lea, but when not optimizing the insn
15790 will not be split after this point. */
15791 emit_insn (gen_rtx_SET (scratch_reg,
15792 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15793 offset)));
15795 else
15797 emit_move_insn (scratch_reg, offset);
15798 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
15799 stack_pointer_rtx));
15801 current = scratch_reg;
15804 ix86_expand_branch (GEU, current, limit, label);
15805 rtx_insn *jump_insn = get_last_insn ();
15806 JUMP_LABEL (jump_insn) = label;
15808 /* Mark the jump as very likely to be taken. */
15809 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
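/* That is, branch to LABEL when CURRENT compares unsigned
   greater-than-or-equal to the TCB stack boundary, i.e. when enough
   stack remains; only the unlikely fall-through path calls
   __morestack.  */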
15811 if (split_stack_fn == NULL_RTX)
15813 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
15814 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
15816 fn = split_stack_fn;
15818 /* Get more stack space. We pass in the desired stack space and the
15819 size of the arguments to copy to the new stack. In 32-bit mode
15820 we push the parameters; __morestack will return on a new stack
15821 anyhow. In 64-bit mode we pass the parameters in r10 and
15822 r11. */
15823 allocate_rtx = GEN_INT (allocate);
15824 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
15825 call_fusage = NULL_RTX;
15826 rtx pop = NULL_RTX;
15827 if (TARGET_64BIT)
15829 rtx reg10, reg11;
15831 reg10 = gen_rtx_REG (Pmode, R10_REG);
15832 reg11 = gen_rtx_REG (Pmode, R11_REG);
15834 /* If this function uses a static chain, it will be in %r10.
15835 Preserve it across the call to __morestack. */
15836 if (DECL_STATIC_CHAIN (cfun->decl))
15838 rtx rax;
15840 rax = gen_rtx_REG (word_mode, AX_REG);
15841 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
15842 use_reg (&call_fusage, rax);
15845 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
15846 && !TARGET_PECOFF)
15848 HOST_WIDE_INT argval;
15850 gcc_assert (Pmode == DImode);
15851 /* When using the large model we need to load the address
15852 into a register, and we've run out of registers. So we
15853 switch to a different calling convention, and we call a
15854 different function: __morestack_large. We pass the
15855 argument size in the upper 32 bits of r10 and pass the
15856 frame size in the lower 32 bits. */
15857 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15858 gcc_assert ((args_size & 0xffffffff) == args_size);
15860 if (split_stack_fn_large == NULL_RTX)
15862 split_stack_fn_large =
15863 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15864 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15866 if (ix86_cmodel == CM_LARGE_PIC)
15868 rtx_code_label *label;
15869 rtx x;
15871 label = gen_label_rtx ();
15872 emit_label (label);
15873 LABEL_PRESERVE_P (label) = 1;
15874 emit_insn (gen_set_rip_rex64 (reg10, label));
15875 emit_insn (gen_set_got_offset_rex64 (reg11, label));
15876 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15877 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15878 UNSPEC_GOT);
15879 x = gen_rtx_CONST (Pmode, x);
15880 emit_move_insn (reg11, x);
15881 x = gen_rtx_PLUS (Pmode, reg10, reg11);
15882 x = gen_const_mem (Pmode, x);
15883 emit_move_insn (reg11, x);
15885 else
15886 emit_move_insn (reg11, split_stack_fn_large);
15888 fn = reg11;
15890 argval = ((args_size << 16) << 16) + allocate;
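/* The shift is split into two 16-bit steps, presumably to avoid a
   shift count equal to the width of a 32-bit int.  The result has
   args_size in the upper 32 bits and the frame allocation in the
   lower 32 bits, matching the __morestack_large_model convention
   described above.  */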
15891 emit_move_insn (reg10, GEN_INT (argval));
15893 else
15895 emit_move_insn (reg10, allocate_rtx);
15896 emit_move_insn (reg11, GEN_INT (args_size));
15897 use_reg (&call_fusage, reg11);
15900 use_reg (&call_fusage, reg10);
15902 else
15904 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15905 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15906 insn = emit_insn (gen_push (allocate_rtx));
15907 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15908 pop = GEN_INT (2 * UNITS_PER_WORD);
15910 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15911 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15912 pop, false);
15913 add_function_usage_to (call_insn, call_fusage);
15914 if (!TARGET_64BIT)
15915 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15916 /* Indicate that this function can't jump to non-local gotos. */
15917 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
15919 /* In order to make call/return prediction work right, we now need
15920 to execute a return instruction. See
15921 libgcc/config/i386/morestack.S for the details on how this works.
15923 For flow purposes gcc must not see this as a return
15924 instruction--we need control flow to continue at the subsequent
15925 label. Therefore, we use an unspec. */
15926 gcc_assert (crtl->args.pops_args < 65536);
15927 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15929 /* If we are in 64-bit mode and this function uses a static chain,
15930 we saved %r10 in %rax before calling __morestack. */
15931 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15932 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15933 gen_rtx_REG (word_mode, AX_REG));
15935 /* If this function calls va_start, we need to store a pointer to
15936 the arguments on the old stack, because they may not have been
15937 all copied to the new stack. At this point the old stack can be
15938 found at the frame pointer value used by __morestack, because
15939 __morestack has set that up before calling back to us. Here we
15940 store that pointer in a scratch register, and in
15941 ix86_expand_prologue we store the scratch register in a stack
15942 slot. */
15943 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15945 unsigned int scratch_regno;
15946 rtx frame_reg;
15947 int words;
15949 scratch_regno = split_stack_prologue_scratch_regno ();
15950 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15951 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15953 /* 64-bit:
15954 fp -> old fp value
15955 return address within this function
15956 return address of caller of this function
15957 stack arguments
15958 So we add three words to get to the stack arguments.
15960 32-bit:
15961 fp -> old fp value
15962 return address within this function
15963 first argument to __morestack
15964 second argument to __morestack
15965 return address of caller of this function
15966 stack arguments
15967 So we add five words to get to the stack arguments. */
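/* Illustrative arithmetic only: with 8-byte words the computation below
   yields fp + 24 in 64-bit mode, and with 4-byte words it yields fp + 20
   in 32-bit mode.  */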
15969 words = TARGET_64BIT ? 3 : 5;
15970 emit_insn (gen_rtx_SET (scratch_reg,
15971 gen_rtx_PLUS (Pmode, frame_reg,
15972 GEN_INT (words * UNITS_PER_WORD))));
15974 varargs_label = gen_label_rtx ();
15975 emit_jump_insn (gen_jump (varargs_label));
15976 JUMP_LABEL (get_last_insn ()) = varargs_label;
15978 emit_barrier ();
15981 emit_label (label);
15982 LABEL_NUSES (label) = 1;
15984 /* If this function calls va_start, we now have to set the scratch
15985 register for the case where we do not call __morestack. In this
15986 case we need to set it based on the stack pointer. */
15987 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15989 emit_insn (gen_rtx_SET (scratch_reg,
15990 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15991 GEN_INT (UNITS_PER_WORD))));
15993 emit_label (varargs_label);
15994 LABEL_NUSES (varargs_label) = 1;
15998 /* We may have to tell the dataflow pass that the split stack prologue
15999 is initializing a scratch register. */
16001 static void
16002 ix86_live_on_entry (bitmap regs)
16004 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16006 gcc_assert (flag_split_stack);
16007 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
16011 /* Extract the parts of an RTL expression that is a valid memory address
16012 for an instruction. Return 0 if the structure of the address is
16013 grossly off. Return -1 if the address contains ASHIFT, so it is not
16014 strictly valid, but is still used for computing the length of an lea instruction. */
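/* An illustrative decomposition (example only): the address
     (plus:SI (plus:SI (mult:SI (reg:SI bx) (const_int 4))
                       (reg:SI si))
              (const_int 12))
   yields base = %esi, index = %ebx, scale = 4, disp = 12, i.e. the
   operand 12(%esi,%ebx,4) in AT&T syntax.  */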
16017 ix86_decompose_address (rtx addr, struct ix86_address *out)
16019 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
16020 rtx base_reg, index_reg;
16021 HOST_WIDE_INT scale = 1;
16022 rtx scale_rtx = NULL_RTX;
16023 rtx tmp;
16024 int retval = 1;
16025 addr_space_t seg = ADDR_SPACE_GENERIC;
16027 /* Allow zero-extended SImode addresses;
16028 they will be emitted with the addr32 prefix. */
16029 if (TARGET_64BIT && GET_MODE (addr) == DImode)
16031 if (GET_CODE (addr) == ZERO_EXTEND
16032 && GET_MODE (XEXP (addr, 0)) == SImode)
16034 addr = XEXP (addr, 0);
16035 if (CONST_INT_P (addr))
16036 return 0;
16038 else if (GET_CODE (addr) == AND
16039 && const_32bit_mask (XEXP (addr, 1), DImode))
16041 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
16042 if (addr == NULL_RTX)
16043 return 0;
16045 if (CONST_INT_P (addr))
16046 return 0;
16050 /* Allow SImode subregs of DImode addresses;
16051 they will be emitted with the addr32 prefix. */
16052 if (TARGET_64BIT && GET_MODE (addr) == SImode)
16054 if (SUBREG_P (addr)
16055 && GET_MODE (SUBREG_REG (addr)) == DImode)
16057 addr = SUBREG_REG (addr);
16058 if (CONST_INT_P (addr))
16059 return 0;
16063 if (REG_P (addr))
16064 base = addr;
16065 else if (SUBREG_P (addr))
16067 if (REG_P (SUBREG_REG (addr)))
16068 base = addr;
16069 else
16070 return 0;
16072 else if (GET_CODE (addr) == PLUS)
16074 rtx addends[4], op;
16075 int n = 0, i;
16077 op = addr;
16080 if (n >= 4)
16081 return 0;
16082 addends[n++] = XEXP (op, 1);
16083 op = XEXP (op, 0);
16085 while (GET_CODE (op) == PLUS);
16086 if (n >= 4)
16087 return 0;
16088 addends[n] = op;
16090 for (i = n; i >= 0; --i)
16092 op = addends[i];
16093 switch (GET_CODE (op))
16095 case MULT:
16096 if (index)
16097 return 0;
16098 index = XEXP (op, 0);
16099 scale_rtx = XEXP (op, 1);
16100 break;
16102 case ASHIFT:
16103 if (index)
16104 return 0;
16105 index = XEXP (op, 0);
16106 tmp = XEXP (op, 1);
16107 if (!CONST_INT_P (tmp))
16108 return 0;
16109 scale = INTVAL (tmp);
16110 if ((unsigned HOST_WIDE_INT) scale > 3)
16111 return 0;
16112 scale = 1 << scale;
16113 break;
16115 case ZERO_EXTEND:
16116 op = XEXP (op, 0);
16117 if (GET_CODE (op) != UNSPEC)
16118 return 0;
16119 /* FALLTHRU */
16121 case UNSPEC:
16122 if (XINT (op, 1) == UNSPEC_TP
16123 && TARGET_TLS_DIRECT_SEG_REFS
16124 && seg == ADDR_SPACE_GENERIC)
16125 seg = DEFAULT_TLS_SEG_REG;
16126 else
16127 return 0;
16128 break;
16130 case SUBREG:
16131 if (!REG_P (SUBREG_REG (op)))
16132 return 0;
16133 /* FALLTHRU */
16135 case REG:
16136 if (!base)
16137 base = op;
16138 else if (!index)
16139 index = op;
16140 else
16141 return 0;
16142 break;
16144 case CONST:
16145 case CONST_INT:
16146 case SYMBOL_REF:
16147 case LABEL_REF:
16148 if (disp)
16149 return 0;
16150 disp = op;
16151 break;
16153 default:
16154 return 0;
16158 else if (GET_CODE (addr) == MULT)
16160 index = XEXP (addr, 0); /* index*scale */
16161 scale_rtx = XEXP (addr, 1);
16163 else if (GET_CODE (addr) == ASHIFT)
16165 /* We're called for lea too, which implements ashift on occasion. */
16166 index = XEXP (addr, 0);
16167 tmp = XEXP (addr, 1);
16168 if (!CONST_INT_P (tmp))
16169 return 0;
16170 scale = INTVAL (tmp);
16171 if ((unsigned HOST_WIDE_INT) scale > 3)
16172 return 0;
16173 scale = 1 << scale;
16174 retval = -1;
16176 else
16177 disp = addr; /* displacement */
16179 if (index)
16181 if (REG_P (index))
16183 else if (SUBREG_P (index)
16184 && REG_P (SUBREG_REG (index)))
16186 else
16187 return 0;
16190 /* Extract the integral value of scale. */
16191 if (scale_rtx)
16193 if (!CONST_INT_P (scale_rtx))
16194 return 0;
16195 scale = INTVAL (scale_rtx);
16198 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
16199 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
16201 /* Avoid useless 0 displacement. */
16202 if (disp == const0_rtx && (base || index))
16203 disp = NULL_RTX;
16205 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
16206 if (base_reg && index_reg && scale == 1
16207 && (REGNO (index_reg) == ARG_POINTER_REGNUM
16208 || REGNO (index_reg) == FRAME_POINTER_REGNUM
16209 || REGNO (index_reg) == SP_REG))
16211 std::swap (base, index);
16212 std::swap (base_reg, index_reg);
16215 /* Special case: %ebp cannot be encoded as a base without a displacement.
16216 Similarly %r13. */
16217 if (!disp && base_reg
16218 && (REGNO (base_reg) == ARG_POINTER_REGNUM
16219 || REGNO (base_reg) == FRAME_POINTER_REGNUM
16220 || REGNO (base_reg) == BP_REG
16221 || REGNO (base_reg) == R13_REG))
16222 disp = const0_rtx;
16224 /* Special case: on K6, [%esi] makes the instruction vector decoded.
16225 Avoid this by transforming to [%esi+0].
16226 Reload calls address legitimization without cfun defined, so we need
16227 to test cfun for being non-NULL. */
16228 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
16229 && base_reg && !index_reg && !disp
16230 && REGNO (base_reg) == SI_REG)
16231 disp = const0_rtx;
16233 /* Special case: encode reg+reg instead of reg*2. */
16234 if (!base && index && scale == 2)
16235 base = index, base_reg = index_reg, scale = 1;
16237 /* Special case: scaling cannot be encoded without base or displacement. */
16238 if (!base && !disp && index && scale != 1)
16239 disp = const0_rtx;
16241 out->base = base;
16242 out->index = index;
16243 out->disp = disp;
16244 out->scale = scale;
16245 out->seg = seg;
16247 return retval;
16250 /* Return the cost of the memory address X.
16251 For i386, it is better to use a complex address than let gcc copy
16252 the address into a reg and make a new pseudo. But not if the address
16253 requires two regs - that would mean more pseudos with longer
16254 lifetimes. */
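/* Rough illustration (the exact value depends on the checks below): a plain
   hard-register address like (%esp) costs 1, a single pseudo-register
   address costs 2, and base-plus-index with two pseudos costs 3, which
   biases optimizers towards addresses that tie up fewer registers.  */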
16255 static int
16256 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
16258 struct ix86_address parts;
16259 int cost = 1;
16260 int ok = ix86_decompose_address (x, &parts);
16262 gcc_assert (ok);
16264 if (parts.base && SUBREG_P (parts.base))
16265 parts.base = SUBREG_REG (parts.base);
16266 if (parts.index && SUBREG_P (parts.index))
16267 parts.index = SUBREG_REG (parts.index);
16269 /* Attempt to minimize the number of registers in the address by increasing
16270 the address cost for each register used. We don't increase the address cost
16271 for "pic_offset_table_rtx". When a memory operand with "pic_offset_table_rtx"
16272 is not invariant itself, it most likely means that the base or index is not
16273 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
16274 which is not profitable for x86. */
16275 if (parts.base
16276 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
16277 && (current_pass->type == GIMPLE_PASS
16278 || !pic_offset_table_rtx
16279 || !REG_P (parts.base)
16280 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
16281 cost++;
16283 if (parts.index
16284 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
16285 && (current_pass->type == GIMPLE_PASS
16286 || !pic_offset_table_rtx
16287 || !REG_P (parts.index)
16288 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
16289 cost++;
16291 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
16292 since its predecode logic can't detect the length of instructions
16293 and it degenerates to vector decoding. Increase the cost of such
16294 addresses here. The penalty is at least 2 cycles. It may be worthwhile
16295 to split such addresses or even refuse them altogether.
16297 The following addressing modes are affected:
16298 [base+scale*index]
16299 [scale*index+disp]
16300 [base+index]
16302 The first and last cases may be avoidable by explicitly coding the zero in
16303 the memory address, but I don't have an AMD-K6 machine handy to check this
16304 theory. */
16306 if (TARGET_K6
16307 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
16308 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
16309 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
16310 cost += 10;
16312 return cost;
16315 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
16316 this is used to form addresses to local data when -fPIC is in
16317 use. */
16319 static bool
16320 darwin_local_data_pic (rtx disp)
16322 return (GET_CODE (disp) == UNSPEC
16323 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
16326 /* True if operand X should be loaded from GOT. */
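/* For illustration: with -fno-plt -fno-pic on x86-64, a call to an
   external function foo can be emitted as call *foo@GOTPCREL(%rip)
   instead of going through the PLT (example only).  */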
16328 bool
16329 ix86_force_load_from_GOT_p (rtx x)
16331 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
16332 && !TARGET_PECOFF && !TARGET_MACHO
16333 && !flag_plt && !flag_pic
16334 && ix86_cmodel != CM_LARGE
16335 && GET_CODE (x) == SYMBOL_REF
16336 && SYMBOL_REF_FUNCTION_P (x)
16337 && !SYMBOL_REF_LOCAL_P (x));
16340 /* Determine if a given RTX is a valid constant. We already know this
16341 satisfies CONSTANT_P. */
16343 static bool
16344 ix86_legitimate_constant_p (machine_mode mode, rtx x)
16346 /* Pointer bounds constants are not valid. */
16347 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
16348 return false;
16350 switch (GET_CODE (x))
16352 case CONST:
16353 x = XEXP (x, 0);
16355 if (GET_CODE (x) == PLUS)
16357 if (!CONST_INT_P (XEXP (x, 1)))
16358 return false;
16359 x = XEXP (x, 0);
16362 if (TARGET_MACHO && darwin_local_data_pic (x))
16363 return true;
16365 /* Only some unspecs are valid as "constants". */
16366 if (GET_CODE (x) == UNSPEC)
16367 switch (XINT (x, 1))
16369 case UNSPEC_GOT:
16370 case UNSPEC_GOTOFF:
16371 case UNSPEC_PLTOFF:
16372 return TARGET_64BIT;
16373 case UNSPEC_TPOFF:
16374 case UNSPEC_NTPOFF:
16375 x = XVECEXP (x, 0, 0);
16376 return (GET_CODE (x) == SYMBOL_REF
16377 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16378 case UNSPEC_DTPOFF:
16379 x = XVECEXP (x, 0, 0);
16380 return (GET_CODE (x) == SYMBOL_REF
16381 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
16382 default:
16383 return false;
16386 /* We must have drilled down to a symbol. */
16387 if (GET_CODE (x) == LABEL_REF)
16388 return true;
16389 if (GET_CODE (x) != SYMBOL_REF)
16390 return false;
16391 /* FALLTHRU */
16393 case SYMBOL_REF:
16394 /* TLS symbols are never valid. */
16395 if (SYMBOL_REF_TLS_MODEL (x))
16396 return false;
16398 /* DLLIMPORT symbols are never valid. */
16399 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
16400 && SYMBOL_REF_DLLIMPORT_P (x))
16401 return false;
16403 #if TARGET_MACHO
16404 /* mdynamic-no-pic */
16405 if (MACHO_DYNAMIC_NO_PIC_P)
16406 return machopic_symbol_defined_p (x);
16407 #endif
16409 /* An external function address should be loaded
16410 via the GOT slot to avoid the PLT. */
16411 if (ix86_force_load_from_GOT_p (x))
16412 return false;
16414 break;
16416 CASE_CONST_SCALAR_INT:
16417 switch (mode)
16419 case TImode:
16420 if (TARGET_64BIT)
16421 return true;
16422 /* FALLTHRU */
16423 case OImode:
16424 case XImode:
16425 if (!standard_sse_constant_p (x, mode))
16426 return false;
16427 default:
16428 break;
16430 break;
16432 case CONST_VECTOR:
16433 if (!standard_sse_constant_p (x, mode))
16434 return false;
16436 default:
16437 break;
16440 /* Otherwise we handle everything else in the move patterns. */
16441 return true;
16444 /* Determine if it's legal to put X into the constant pool. This
16445 is not possible for the address of thread-local symbols, which
16446 is checked above. */
16448 static bool
16449 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
16451 /* We can put any immediate constant in memory. */
16452 switch (GET_CODE (x))
16454 CASE_CONST_ANY:
16455 return false;
16457 default:
16458 break;
16461 return !ix86_legitimate_constant_p (mode, x);
16464 /* Return true if the symbol is marked as dllimport or as a stub-variable,
16465 false otherwise. */
16467 static bool
16468 is_imported_p (rtx x)
16470 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
16471 || GET_CODE (x) != SYMBOL_REF)
16472 return false;
16474 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
16478 /* Nonzero if the constant value X is a legitimate general operand
16479 when generating PIC code. It is given that flag_pic is on and
16480 that X satisfies CONSTANT_P. */
16482 bool
16483 legitimate_pic_operand_p (rtx x)
16485 rtx inner;
16487 switch (GET_CODE (x))
16489 case CONST:
16490 inner = XEXP (x, 0);
16491 if (GET_CODE (inner) == PLUS
16492 && CONST_INT_P (XEXP (inner, 1)))
16493 inner = XEXP (inner, 0);
16495 /* Only some unspecs are valid as "constants". */
16496 if (GET_CODE (inner) == UNSPEC)
16497 switch (XINT (inner, 1))
16499 case UNSPEC_GOT:
16500 case UNSPEC_GOTOFF:
16501 case UNSPEC_PLTOFF:
16502 return TARGET_64BIT;
16503 case UNSPEC_TPOFF:
16504 x = XVECEXP (inner, 0, 0);
16505 return (GET_CODE (x) == SYMBOL_REF
16506 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16507 case UNSPEC_MACHOPIC_OFFSET:
16508 return legitimate_pic_address_disp_p (x);
16509 default:
16510 return false;
16512 /* FALLTHRU */
16514 case SYMBOL_REF:
16515 case LABEL_REF:
16516 return legitimate_pic_address_disp_p (x);
16518 default:
16519 return true;
16523 /* Determine if a given CONST RTX is a valid memory displacement
16524 in PIC mode. */
16526 bool
16527 legitimate_pic_address_disp_p (rtx disp)
16529 bool saw_plus;
16531 /* In 64bit mode we can allow direct addresses of symbols and labels
16532 when they are not dynamic symbols. */
16533 if (TARGET_64BIT)
16535 rtx op0 = disp, op1;
16537 switch (GET_CODE (disp))
16539 case LABEL_REF:
16540 return true;
16542 case CONST:
16543 if (GET_CODE (XEXP (disp, 0)) != PLUS)
16544 break;
16545 op0 = XEXP (XEXP (disp, 0), 0);
16546 op1 = XEXP (XEXP (disp, 0), 1);
16547 if (!CONST_INT_P (op1)
16548 || INTVAL (op1) >= 16*1024*1024
16549 || INTVAL (op1) < -16*1024*1024)
16550 break;
16551 if (GET_CODE (op0) == LABEL_REF)
16552 return true;
16553 if (GET_CODE (op0) == CONST
16554 && GET_CODE (XEXP (op0, 0)) == UNSPEC
16555 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
16556 return true;
16557 if (GET_CODE (op0) == UNSPEC
16558 && XINT (op0, 1) == UNSPEC_PCREL)
16559 return true;
16560 if (GET_CODE (op0) != SYMBOL_REF)
16561 break;
16562 /* FALLTHRU */
16564 case SYMBOL_REF:
16565 /* TLS references should always be enclosed in UNSPEC.
16566 A dllimported symbol always needs to be resolved. */
16567 if (SYMBOL_REF_TLS_MODEL (op0)
16568 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
16569 return false;
16571 if (TARGET_PECOFF)
16573 if (is_imported_p (op0))
16574 return true;
16576 if (SYMBOL_REF_FAR_ADDR_P (op0)
16577 || !SYMBOL_REF_LOCAL_P (op0))
16578 break;
16580 /* Function symbols need to be resolved only for
16581 the large model.
16582 For the small model we don't need to resolve anything
16583 here. */
16584 if ((ix86_cmodel != CM_LARGE_PIC
16585 && SYMBOL_REF_FUNCTION_P (op0))
16586 || ix86_cmodel == CM_SMALL_PIC)
16587 return true;
16588 /* Non-external symbols don't need to be resolved for
16589 the large and medium models. */
16590 if ((ix86_cmodel == CM_LARGE_PIC
16591 || ix86_cmodel == CM_MEDIUM_PIC)
16592 && !SYMBOL_REF_EXTERNAL_P (op0))
16593 return true;
16595 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
16596 && (SYMBOL_REF_LOCAL_P (op0)
16597 || (HAVE_LD_PIE_COPYRELOC
16598 && flag_pie
16599 && !SYMBOL_REF_WEAK (op0)
16600 && !SYMBOL_REF_FUNCTION_P (op0)))
16601 && ix86_cmodel != CM_LARGE_PIC)
16602 return true;
16603 break;
16605 default:
16606 break;
16609 if (GET_CODE (disp) != CONST)
16610 return false;
16611 disp = XEXP (disp, 0);
16613 if (TARGET_64BIT)
16615 /* It is not safe to allow PLUS expressions; this limits the allowed distance
16616 of GOT table entries. We should not need these anyway. */
16617 if (GET_CODE (disp) != UNSPEC
16618 || (XINT (disp, 1) != UNSPEC_GOTPCREL
16619 && XINT (disp, 1) != UNSPEC_GOTOFF
16620 && XINT (disp, 1) != UNSPEC_PCREL
16621 && XINT (disp, 1) != UNSPEC_PLTOFF))
16622 return false;
16624 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
16625 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
16626 return false;
16627 return true;
16630 saw_plus = false;
16631 if (GET_CODE (disp) == PLUS)
16633 if (!CONST_INT_P (XEXP (disp, 1)))
16634 return false;
16635 disp = XEXP (disp, 0);
16636 saw_plus = true;
16639 if (TARGET_MACHO && darwin_local_data_pic (disp))
16640 return true;
16642 if (GET_CODE (disp) != UNSPEC)
16643 return false;
16645 switch (XINT (disp, 1))
16647 case UNSPEC_GOT:
16648 if (saw_plus)
16649 return false;
16650 /* We need to check for both symbols and labels because VxWorks loads
16651 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
16652 details. */
16653 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16654 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
16655 case UNSPEC_GOTOFF:
16656 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
16657 While the ABI also specifies a 32bit relocation, we don't produce it in
16658 the small PIC model at all. */
16659 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16660 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
16661 && !TARGET_64BIT)
16662 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
16663 return false;
16664 case UNSPEC_GOTTPOFF:
16665 case UNSPEC_GOTNTPOFF:
16666 case UNSPEC_INDNTPOFF:
16667 if (saw_plus)
16668 return false;
16669 disp = XVECEXP (disp, 0, 0);
16670 return (GET_CODE (disp) == SYMBOL_REF
16671 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
16672 case UNSPEC_NTPOFF:
16673 disp = XVECEXP (disp, 0, 0);
16674 return (GET_CODE (disp) == SYMBOL_REF
16675 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
16676 case UNSPEC_DTPOFF:
16677 disp = XVECEXP (disp, 0, 0);
16678 return (GET_CODE (disp) == SYMBOL_REF
16679 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
16682 return false;
16685 /* Determine if op is suitable RTX for an address register.
16686 Return naked register if a register or a register subreg is
16687 found, otherwise return NULL_RTX. */
16689 static rtx
16690 ix86_validate_address_register (rtx op)
16692 machine_mode mode = GET_MODE (op);
16694 /* Only SImode or DImode registers can form the address. */
16695 if (mode != SImode && mode != DImode)
16696 return NULL_RTX;
16698 if (REG_P (op))
16699 return op;
16700 else if (SUBREG_P (op))
16702 rtx reg = SUBREG_REG (op);
16704 if (!REG_P (reg))
16705 return NULL_RTX;
16707 mode = GET_MODE (reg);
16709 /* Don't allow SUBREGs that span more than a word. They can
16710 lead to spill failures when the register is one word out
16711 of a two-word structure. */
16712 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
16713 return NULL_RTX;
16715 /* Allow only SUBREGs of non-eliminable hard registers. */
16716 if (register_no_elim_operand (reg, mode))
16717 return reg;
16720 /* Op is not a register. */
16721 return NULL_RTX;
16724 /* Recognizes RTL expressions that are valid memory addresses for an
16725 instruction. The MODE argument is the machine mode for the MEM
16726 expression that wants to use this address.
16728 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
16729 convert common non-canonical forms to canonical form so that they will
16730 be recognized. */
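/* Informally, the canonical form accepted below is
     seg:[base + index*scale + disp]
   with scale restricted to 1, 2, 4 or 8; each part is then validated
   individually (a sketch of the checks that follow).  */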
16732 static bool
16733 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
16735 struct ix86_address parts;
16736 rtx base, index, disp;
16737 HOST_WIDE_INT scale;
16738 addr_space_t seg;
16740 if (ix86_decompose_address (addr, &parts) <= 0)
16741 /* Decomposition failed. */
16742 return false;
16744 base = parts.base;
16745 index = parts.index;
16746 disp = parts.disp;
16747 scale = parts.scale;
16748 seg = parts.seg;
16750 /* Validate base register. */
16751 if (base)
16753 rtx reg = ix86_validate_address_register (base);
16755 if (reg == NULL_RTX)
16756 return false;
16758 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
16759 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
16760 /* Base is not valid. */
16761 return false;
16764 /* Validate index register. */
16765 if (index)
16767 rtx reg = ix86_validate_address_register (index);
16769 if (reg == NULL_RTX)
16770 return false;
16772 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
16773 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
16774 /* Index is not valid. */
16775 return false;
16778 /* Index and base should have the same mode. */
16779 if (base && index
16780 && GET_MODE (base) != GET_MODE (index))
16781 return false;
16783 /* Address override works only on the (%reg) part of %fs:(%reg). */
16784 if (seg != ADDR_SPACE_GENERIC
16785 && ((base && GET_MODE (base) != word_mode)
16786 || (index && GET_MODE (index) != word_mode)))
16787 return false;
16789 /* Validate scale factor. */
16790 if (scale != 1)
16792 if (!index)
16793 /* Scale without index. */
16794 return false;
16796 if (scale != 2 && scale != 4 && scale != 8)
16797 /* Scale is not a valid multiplier. */
16798 return false;
16801 /* Validate displacement. */
16802 if (disp)
16804 if (GET_CODE (disp) == CONST
16805 && GET_CODE (XEXP (disp, 0)) == UNSPEC
16806 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
16807 switch (XINT (XEXP (disp, 0), 1))
16809 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit
16810 when used. While the ABI also specifies 32bit relocations, we
16811 don't produce them at all and use IP-relative addressing instead.
16812 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
16813 should be loaded via the GOT. */
16814 case UNSPEC_GOT:
16815 if (!TARGET_64BIT
16816 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16817 goto is_legitimate_pic;
16818 /* FALLTHRU */
16819 case UNSPEC_GOTOFF:
16820 gcc_assert (flag_pic);
16821 if (!TARGET_64BIT)
16822 goto is_legitimate_pic;
16824 /* 64bit address unspec. */
16825 return false;
16827 case UNSPEC_GOTPCREL:
16828 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16829 goto is_legitimate_pic;
16830 /* FALLTHRU */
16831 case UNSPEC_PCREL:
16832 gcc_assert (flag_pic);
16833 goto is_legitimate_pic;
16835 case UNSPEC_GOTTPOFF:
16836 case UNSPEC_GOTNTPOFF:
16837 case UNSPEC_INDNTPOFF:
16838 case UNSPEC_NTPOFF:
16839 case UNSPEC_DTPOFF:
16840 break;
16842 case UNSPEC_STACK_CHECK:
16843 gcc_assert (flag_split_stack);
16844 break;
16846 default:
16847 /* Invalid address unspec. */
16848 return false;
16851 else if (SYMBOLIC_CONST (disp)
16852 && (flag_pic
16853 || (TARGET_MACHO
16854 #if TARGET_MACHO
16855 && MACHOPIC_INDIRECT
16856 && !machopic_operand_p (disp)
16857 #endif
16861 is_legitimate_pic:
16862 if (TARGET_64BIT && (index || base))
16864 /* foo@dtpoff(%rX) is ok. */
16865 if (GET_CODE (disp) != CONST
16866 || GET_CODE (XEXP (disp, 0)) != PLUS
16867 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16868 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16869 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16870 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16871 /* Non-constant pic memory reference. */
16872 return false;
16874 else if ((!TARGET_MACHO || flag_pic)
16875 && ! legitimate_pic_address_disp_p (disp))
16876 /* Displacement is an invalid pic construct. */
16877 return false;
16878 #if TARGET_MACHO
16879 else if (MACHO_DYNAMIC_NO_PIC_P
16880 && !ix86_legitimate_constant_p (Pmode, disp))
16881 /* displacement must be referenced via non_lazy_pointer */
16882 return false;
16883 #endif
16885 /* This code used to verify that a symbolic pic displacement
16886 includes the pic_offset_table_rtx register.
16888 While this is a good idea, unfortunately these constructs may
16889 be created by the "adds using lea" optimization for incorrect
16890 code like:
16892 int a;
16893 int foo(int i)
16895 return *(&a+i);
16898 This code is nonsensical, but it results in addressing the
16899 GOT table with a pic_offset_table_rtx base. We can't
16900 just refuse it easily, since it gets matched by the
16901 "addsi3" pattern, which later gets split to lea when the
16902 output register differs from the input. While this
16903 could be handled by a separate addsi pattern for this case
16904 that never results in lea, disabling this test seems to be
16905 the easier and correct fix for the crash. */
16907 else if (GET_CODE (disp) != LABEL_REF
16908 && !CONST_INT_P (disp)
16909 && (GET_CODE (disp) != CONST
16910 || !ix86_legitimate_constant_p (Pmode, disp))
16911 && (GET_CODE (disp) != SYMBOL_REF
16912 || !ix86_legitimate_constant_p (Pmode, disp)))
16913 /* Displacement is not constant. */
16914 return false;
16915 else if (TARGET_64BIT
16916 && !x86_64_immediate_operand (disp, VOIDmode))
16917 /* Displacement is out of range. */
16918 return false;
16919 /* In x32 mode, constant addresses are sign-extended to 64 bits, so
16920 we have to reject addresses from 0x80000000 to 0xffffffff. */
16921 else if (TARGET_X32 && !(index || base)
16922 && CONST_INT_P (disp)
16923 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16924 return false;
16927 /* Everything looks valid. */
16928 return true;
16931 /* Determine if a given RTX is a valid constant address. */
16933 bool
16934 constant_address_p (rtx x)
16936 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16939 /* Return a unique alias set for the GOT. */
16941 static alias_set_type
16942 ix86_GOT_alias_set (void)
16944 static alias_set_type set = -1;
16945 if (set == -1)
16946 set = new_alias_set ();
16947 return set;
16950 /* Return a legitimate reference for ORIG (an address) using the
16951 register REG. If REG is 0, a new pseudo is generated.
16953 There are two types of references that must be handled:
16955 1. Global data references must load the address from the GOT, via
16956 the PIC reg. An insn is emitted to do this load, and the reg is
16957 returned.
16959 2. Static data references, constant pool addresses, and code labels
16960 compute the address as an offset from the GOT, whose base is in
16961 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16962 differentiate them from global data objects. The returned
16963 address is the PIC reg + an unspec constant.
16965 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16966 reg also appears in the address. */
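/* A sketch of the 32-bit results (illustrative, assuming %ebx holds the
   PIC register):
     global data:  (mem (plus %ebx (const (unspec [sym] UNSPEC_GOT))))
     local data:   (plus %ebx (const (unspec [sym] UNSPEC_GOTOFF)))  */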
16968 static rtx
16969 legitimize_pic_address (rtx orig, rtx reg)
16971 rtx addr = orig;
16972 rtx new_rtx = orig;
16974 #if TARGET_MACHO
16975 if (TARGET_MACHO && !TARGET_64BIT)
16977 if (reg == 0)
16978 reg = gen_reg_rtx (Pmode);
16979 /* Use the generic Mach-O PIC machinery. */
16980 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16982 #endif
16984 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16986 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16987 if (tmp)
16988 return tmp;
16991 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16992 new_rtx = addr;
16993 else if ((!TARGET_64BIT
16994 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16995 && !TARGET_PECOFF
16996 && gotoff_operand (addr, Pmode))
16998 /* This symbol may be referenced via a displacement
16999 from the PIC base address (@GOTOFF). */
17000 if (GET_CODE (addr) == CONST)
17001 addr = XEXP (addr, 0);
17003 if (GET_CODE (addr) == PLUS)
17005 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
17006 UNSPEC_GOTOFF);
17007 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
17009 else
17010 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
17012 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17014 if (TARGET_64BIT)
17015 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17017 if (reg != 0)
17019 gcc_assert (REG_P (reg));
17020 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
17021 new_rtx, reg, 1, OPTAB_DIRECT);
17023 else
17024 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17026 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
17027 /* We can't use @GOTOFF for text labels
17028 on VxWorks, see gotoff_operand. */
17029 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
17031 rtx tmp = legitimize_pe_coff_symbol (addr, true);
17032 if (tmp)
17033 return tmp;
17035 /* For x64 PE-COFF there is no GOT table,
17036 so we use the address directly. */
17037 if (TARGET_64BIT && TARGET_PECOFF)
17039 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
17040 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17042 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
17044 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
17045 UNSPEC_GOTPCREL);
17046 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17047 new_rtx = gen_const_mem (Pmode, new_rtx);
17048 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17050 else
17052 /* This symbol must be referenced via a load
17053 from the Global Offset Table (@GOT). */
17054 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
17055 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17056 if (TARGET_64BIT)
17057 new_rtx = force_reg (Pmode, new_rtx);
17058 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17059 new_rtx = gen_const_mem (Pmode, new_rtx);
17060 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17063 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17065 else
17067 if (CONST_INT_P (addr)
17068 && !x86_64_immediate_operand (addr, VOIDmode))
17069 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
17070 else if (GET_CODE (addr) == CONST)
17072 addr = XEXP (addr, 0);
17074 /* We must match stuff we generate before. Assume the only
17075 unspecs that can get here are ours. Not that we could do
17076 anything with them anyway.... */
17077 if (GET_CODE (addr) == UNSPEC
17078 || (GET_CODE (addr) == PLUS
17079 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
17080 return orig;
17081 gcc_assert (GET_CODE (addr) == PLUS);
17084 if (GET_CODE (addr) == PLUS)
17086 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
17088 /* Check first to see if this is a constant
17089 offset from a @GOTOFF symbol reference. */
17090 if (!TARGET_PECOFF
17091 && gotoff_operand (op0, Pmode)
17092 && CONST_INT_P (op1))
17094 if (!TARGET_64BIT)
17096 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
17097 UNSPEC_GOTOFF);
17098 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
17099 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17101 if (reg != 0)
17103 gcc_assert (REG_P (reg));
17104 new_rtx = expand_simple_binop (Pmode, PLUS,
17105 pic_offset_table_rtx,
17106 new_rtx, reg, 1,
17107 OPTAB_DIRECT);
17109 else
17110 new_rtx
17111 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17113 else
17115 if (INTVAL (op1) < -16*1024*1024
17116 || INTVAL (op1) >= 16*1024*1024)
17118 if (!x86_64_immediate_operand (op1, Pmode))
17119 op1 = force_reg (Pmode, op1);
17121 new_rtx
17122 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
17126 else
17128 rtx base = legitimize_pic_address (op0, reg);
17129 machine_mode mode = GET_MODE (base);
17130 new_rtx
17131 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
17133 if (CONST_INT_P (new_rtx))
17135 if (INTVAL (new_rtx) < -16*1024*1024
17136 || INTVAL (new_rtx) >= 16*1024*1024)
17138 if (!x86_64_immediate_operand (new_rtx, mode))
17139 new_rtx = force_reg (mode, new_rtx);
17141 new_rtx
17142 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
17144 else
17145 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
17147 else
17149 /* For %rip addressing, we have to use
17150 just disp32, not base or index. */
17151 if (TARGET_64BIT
17152 && (GET_CODE (base) == SYMBOL_REF
17153 || GET_CODE (base) == LABEL_REF))
17154 base = force_reg (mode, base);
17155 if (GET_CODE (new_rtx) == PLUS
17156 && CONSTANT_P (XEXP (new_rtx, 1)))
17158 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
17159 new_rtx = XEXP (new_rtx, 1);
17161 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
17166 return new_rtx;
17169 /* Load the thread pointer. If TO_REG is true, force it into a register. */
17171 static rtx
17172 get_thread_pointer (machine_mode tp_mode, bool to_reg)
17174 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
17176 if (GET_MODE (tp) != tp_mode)
17178 gcc_assert (GET_MODE (tp) == SImode);
17179 gcc_assert (tp_mode == DImode);
17181 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
17184 if (to_reg)
17185 tp = copy_to_mode_reg (tp_mode, tp);
17187 return tp;
17190 /* Construct the SYMBOL_REF for the tls_get_addr function. */
17192 static GTY(()) rtx ix86_tls_symbol;
17194 static rtx
17195 ix86_tls_get_addr (void)
17197 if (!ix86_tls_symbol)
17199 const char *sym
17200 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
17201 ? "___tls_get_addr" : "__tls_get_addr");
17203 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
17206 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
17208 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
17209 UNSPEC_PLTOFF);
17210 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
17211 gen_rtx_CONST (Pmode, unspec));
17214 return ix86_tls_symbol;
17217 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
17219 static GTY(()) rtx ix86_tls_module_base_symbol;
17222 ix86_tls_module_base (void)
17224 if (!ix86_tls_module_base_symbol)
17226 ix86_tls_module_base_symbol
17227 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
17229 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
17230 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
17233 return ix86_tls_module_base_symbol;
17236 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
17237 false if we expect this to be used for a memory address and true if
17238 we expect to load the address into a register. */
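/* Rough summary of the models handled below (illustrative, for the common
   GNU/Linux configurations): global and local dynamic call __tls_get_addr
   (or use the GNU2 TLS descriptor scheme), initial exec loads the
   variable's offset from the GOT and adds the thread pointer, and local
   exec adds a link-time constant offset to the thread pointer.  */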
17240 static rtx
17241 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
17243 rtx dest, base, off;
17244 rtx pic = NULL_RTX, tp = NULL_RTX;
17245 machine_mode tp_mode = Pmode;
17246 int type;
17248 /* Fall back to the global dynamic model if the toolchain cannot support local
17249 dynamic. */
17250 if (TARGET_SUN_TLS && !TARGET_64BIT
17251 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
17252 && model == TLS_MODEL_LOCAL_DYNAMIC)
17253 model = TLS_MODEL_GLOBAL_DYNAMIC;
17255 switch (model)
17257 case TLS_MODEL_GLOBAL_DYNAMIC:
17258 dest = gen_reg_rtx (Pmode);
17260 if (!TARGET_64BIT)
17262 if (flag_pic && !TARGET_PECOFF)
17263 pic = pic_offset_table_rtx;
17264 else
17266 pic = gen_reg_rtx (Pmode);
17267 emit_insn (gen_set_got (pic));
17271 if (TARGET_GNU2_TLS)
17273 if (TARGET_64BIT)
17274 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
17275 else
17276 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
17278 tp = get_thread_pointer (Pmode, true);
17279 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
17281 if (GET_MODE (x) != Pmode)
17282 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17284 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17286 else
17288 rtx caddr = ix86_tls_get_addr ();
17290 if (TARGET_64BIT)
17292 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17293 rtx_insn *insns;
17295 start_sequence ();
17296 emit_call_insn
17297 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
17298 insns = get_insns ();
17299 end_sequence ();
17301 if (GET_MODE (x) != Pmode)
17302 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17304 RTL_CONST_CALL_P (insns) = 1;
17305 emit_libcall_block (insns, dest, rax, x);
17307 else
17308 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
17310 break;
17312 case TLS_MODEL_LOCAL_DYNAMIC:
17313 base = gen_reg_rtx (Pmode);
17315 if (!TARGET_64BIT)
17317 if (flag_pic)
17318 pic = pic_offset_table_rtx;
17319 else
17321 pic = gen_reg_rtx (Pmode);
17322 emit_insn (gen_set_got (pic));
17326 if (TARGET_GNU2_TLS)
17328 rtx tmp = ix86_tls_module_base ();
17330 if (TARGET_64BIT)
17331 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
17332 else
17333 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
17335 tp = get_thread_pointer (Pmode, true);
17336 set_unique_reg_note (get_last_insn (), REG_EQUAL,
17337 gen_rtx_MINUS (Pmode, tmp, tp));
17339 else
17341 rtx caddr = ix86_tls_get_addr ();
17343 if (TARGET_64BIT)
17345 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17346 rtx_insn *insns;
17347 rtx eqv;
17349 start_sequence ();
17350 emit_call_insn
17351 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
17352 insns = get_insns ();
17353 end_sequence ();
17355 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
17356 share the LD_BASE result with other LD model accesses. */
17357 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
17358 UNSPEC_TLS_LD_BASE);
17360 RTL_CONST_CALL_P (insns) = 1;
17361 emit_libcall_block (insns, base, rax, eqv);
17363 else
17364 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
17367 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
17368 off = gen_rtx_CONST (Pmode, off);
17370 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
17372 if (TARGET_GNU2_TLS)
17374 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
17376 if (GET_MODE (x) != Pmode)
17377 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17379 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17381 break;
17383 case TLS_MODEL_INITIAL_EXEC:
17384 if (TARGET_64BIT)
17386 if (TARGET_SUN_TLS && !TARGET_X32)
17388 /* The Sun linker took the AMD64 TLS spec literally
17389 and can only handle %rax as destination of the
17390 initial executable code sequence. */
17392 dest = gen_reg_rtx (DImode);
17393 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
17394 return dest;
17397 /* Generate DImode references to avoid %fs:(%reg32)
17398 problems and the linker IE->LE relaxation bug. */
17399 tp_mode = DImode;
17400 pic = NULL;
17401 type = UNSPEC_GOTNTPOFF;
17403 else if (flag_pic)
17405 pic = pic_offset_table_rtx;
17406 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
17408 else if (!TARGET_ANY_GNU_TLS)
17410 pic = gen_reg_rtx (Pmode);
17411 emit_insn (gen_set_got (pic));
17412 type = UNSPEC_GOTTPOFF;
17414 else
17416 pic = NULL;
17417 type = UNSPEC_INDNTPOFF;
17420 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
17421 off = gen_rtx_CONST (tp_mode, off);
17422 if (pic)
17423 off = gen_rtx_PLUS (tp_mode, pic, off);
17424 off = gen_const_mem (tp_mode, off);
17425 set_mem_alias_set (off, ix86_GOT_alias_set ());
17427 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17429 base = get_thread_pointer (tp_mode,
17430 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17431 off = force_reg (tp_mode, off);
17432 dest = gen_rtx_PLUS (tp_mode, base, off);
17433 if (tp_mode != Pmode)
17434 dest = convert_to_mode (Pmode, dest, 1);
17436 else
17438 base = get_thread_pointer (Pmode, true);
17439 dest = gen_reg_rtx (Pmode);
17440 emit_insn (ix86_gen_sub3 (dest, base, off));
17442 break;
17444 case TLS_MODEL_LOCAL_EXEC:
17445 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
17446 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17447 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
17448 off = gen_rtx_CONST (Pmode, off);
17450 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17452 base = get_thread_pointer (Pmode,
17453 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17454 return gen_rtx_PLUS (Pmode, base, off);
17456 else
17458 base = get_thread_pointer (Pmode, true);
17459 dest = gen_reg_rtx (Pmode);
17460 emit_insn (ix86_gen_sub3 (dest, base, off));
17462 break;
17464 default:
17465 gcc_unreachable ();
17468 return dest;
17471 /* Create or return the unique __imp_DECL dllimport symbol corresponding
17472 to symbol DECL if BEIMPORT is true. Otherwise create or return the
17473 unique refptr-DECL symbol corresponding to symbol DECL. */
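/* For example (illustrative): for an imported function foo, this yields a
   read-only pointer variable whose DECL_RTL is a memory reference to the
   __imp_foo import-table slot; loading that slot gives foo's real
   address.  */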
17475 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
17477 static inline hashval_t hash (tree_map *m) { return m->hash; }
17478 static inline bool
17479 equal (tree_map *a, tree_map *b)
17481 return a->base.from == b->base.from;
17484 static int
17485 keep_cache_entry (tree_map *&m)
17487 return ggc_marked_p (m->base.from);
17491 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
17493 static tree
17494 get_dllimport_decl (tree decl, bool beimport)
17496 struct tree_map *h, in;
17497 const char *name;
17498 const char *prefix;
17499 size_t namelen, prefixlen;
17500 char *imp_name;
17501 tree to;
17502 rtx rtl;
17504 if (!dllimport_map)
17505 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
17507 in.hash = htab_hash_pointer (decl);
17508 in.base.from = decl;
17509 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
17510 h = *loc;
17511 if (h)
17512 return h->to;
17514 *loc = h = ggc_alloc<tree_map> ();
17515 h->hash = in.hash;
17516 h->base.from = decl;
17517 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
17518 VAR_DECL, NULL, ptr_type_node);
17519 DECL_ARTIFICIAL (to) = 1;
17520 DECL_IGNORED_P (to) = 1;
17521 DECL_EXTERNAL (to) = 1;
17522 TREE_READONLY (to) = 1;
17524 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
17525 name = targetm.strip_name_encoding (name);
17526 if (beimport)
17527 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
17528 ? "*__imp_" : "*__imp__";
17529 else
17530 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
17531 namelen = strlen (name);
17532 prefixlen = strlen (prefix);
17533 imp_name = (char *) alloca (namelen + prefixlen + 1);
17534 memcpy (imp_name, prefix, prefixlen);
17535 memcpy (imp_name + prefixlen, name, namelen + 1);
17537 name = ggc_alloc_string (imp_name, namelen + prefixlen);
17538 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
17539 SET_SYMBOL_REF_DECL (rtl, to);
17540 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
17541 if (!beimport)
17543 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
17544 #ifdef SUB_TARGET_RECORD_STUB
17545 SUB_TARGET_RECORD_STUB (name);
17546 #endif
17549 rtl = gen_const_mem (Pmode, rtl);
17550 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
17552 SET_DECL_RTL (to, rtl);
17553 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
17555 return to;
17558 /* Expand SYMBOL into its corresponding far-address symbol.
17559 WANT_REG is true if we require the result be a register. */
17561 static rtx
17562 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
17564 tree imp_decl;
17565 rtx x;
17567 gcc_assert (SYMBOL_REF_DECL (symbol));
17568 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
17570 x = DECL_RTL (imp_decl);
17571 if (want_reg)
17572 x = force_reg (Pmode, x);
17573 return x;
17576 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
17577 true if we require the result be a register. */
17579 static rtx
17580 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
17582 tree imp_decl;
17583 rtx x;
17585 gcc_assert (SYMBOL_REF_DECL (symbol));
17586 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
17588 x = DECL_RTL (imp_decl);
17589 if (want_reg)
17590 x = force_reg (Pmode, x);
17591 return x;
17594 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
17595 is true if we require the result be a register. */
17597 static rtx
17598 legitimize_pe_coff_symbol (rtx addr, bool inreg)
17600 if (!TARGET_PECOFF)
17601 return NULL_RTX;
17603 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17605 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
17606 return legitimize_dllimport_symbol (addr, inreg);
17607 if (GET_CODE (addr) == CONST
17608 && GET_CODE (XEXP (addr, 0)) == PLUS
17609 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17610 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
17612 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
17613 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17617 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
17618 return NULL_RTX;
17619 if (GET_CODE (addr) == SYMBOL_REF
17620 && !is_imported_p (addr)
17621 && SYMBOL_REF_EXTERNAL_P (addr)
17622 && SYMBOL_REF_DECL (addr))
17623 return legitimize_pe_coff_extern_decl (addr, inreg);
17625 if (GET_CODE (addr) == CONST
17626 && GET_CODE (XEXP (addr, 0)) == PLUS
17627 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17628 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
17629 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
17630 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
17632 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
17633 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17635 return NULL_RTX;
17638 /* Try machine-dependent ways of modifying an illegitimate address
17639 to be legitimate. If we find one, return the new, valid address.
17640 This macro is used in only one place: `memory_address' in explow.c.
17642 OLDX is the address as it was before break_out_memory_refs was called.
17643 In some cases it is useful to look at this to decide what needs to be done.
17645 It is always safe for this macro to do nothing. It exists to recognize
17646 opportunities to optimize the output.
17648 For the 80386, we handle X+REG by loading X into a register R and
17649 using R+REG. R will go in a general reg and indexing will be used.
17650 However, if REG is a broken-out memory address or multiplication,
17651 nothing needs to be done because REG can certainly go in a general reg.
17653 When -fpic is used, special handling is needed for symbolic references.
17654 See comments by legitimize_pic_address in i386.c for details. */
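/* One illustrative transformation performed below: a shifted index such as
   (plus (ashift (reg) (const_int 2)) (reg)) is rewritten as
   (plus (mult (reg) (const_int 4)) (reg)) so that it matches the canonical
   form expected by ix86_decompose_address (example only).  */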
17656 static rtx
17657 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
17659 bool changed = false;
17660 unsigned log;
17662 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
17663 if (log)
17664 return legitimize_tls_address (x, (enum tls_model) log, false);
17665 if (GET_CODE (x) == CONST
17666 && GET_CODE (XEXP (x, 0)) == PLUS
17667 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
17668 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
17670 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
17671 (enum tls_model) log, false);
17672 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
17675 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17677 rtx tmp = legitimize_pe_coff_symbol (x, true);
17678 if (tmp)
17679 return tmp;
17682 if (flag_pic && SYMBOLIC_CONST (x))
17683 return legitimize_pic_address (x, 0);
17685 #if TARGET_MACHO
17686 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
17687 return machopic_indirect_data_reference (x, 0);
17688 #endif
17690 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
17691 if (GET_CODE (x) == ASHIFT
17692 && CONST_INT_P (XEXP (x, 1))
17693 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
17695 changed = true;
17696 log = INTVAL (XEXP (x, 1));
17697 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
17698 GEN_INT (1 << log));
17701 if (GET_CODE (x) == PLUS)
17703 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17705 if (GET_CODE (XEXP (x, 0)) == ASHIFT
17706 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17707 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
17709 changed = true;
17710 log = INTVAL (XEXP (XEXP (x, 0), 1));
17711 XEXP (x, 0) = gen_rtx_MULT (Pmode,
17712 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
17713 GEN_INT (1 << log));
17716 if (GET_CODE (XEXP (x, 1)) == ASHIFT
17717 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
17718 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
17720 changed = true;
17721 log = INTVAL (XEXP (XEXP (x, 1), 1));
17722 XEXP (x, 1) = gen_rtx_MULT (Pmode,
17723 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
17724 GEN_INT (1 << log));
17727 /* Put multiply first if it isn't already. */
17728 if (GET_CODE (XEXP (x, 1)) == MULT)
17730 std::swap (XEXP (x, 0), XEXP (x, 1));
17731 changed = true;
17734 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
17735 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
17736 created by virtual register instantiation, register elimination, and
17737 similar optimizations. */
17738 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
17740 changed = true;
17741 x = gen_rtx_PLUS (Pmode,
17742 gen_rtx_PLUS (Pmode, XEXP (x, 0),
17743 XEXP (XEXP (x, 1), 0)),
17744 XEXP (XEXP (x, 1), 1));
17747 /* Canonicalize
17748 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
17749 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
17750 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
17751 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
17752 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
17753 && CONSTANT_P (XEXP (x, 1)))
17755 rtx constant;
17756 rtx other = NULL_RTX;
17758 if (CONST_INT_P (XEXP (x, 1)))
17760 constant = XEXP (x, 1);
17761 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17763 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17765 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17766 other = XEXP (x, 1);
17768 else
17769 constant = 0;
17771 if (constant)
17773 changed = true;
17774 x = gen_rtx_PLUS (Pmode,
17775 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17776 XEXP (XEXP (XEXP (x, 0), 1), 0)),
17777 plus_constant (Pmode, other,
17778 INTVAL (constant)));
17782 if (changed && ix86_legitimate_address_p (mode, x, false))
17783 return x;
17785 if (GET_CODE (XEXP (x, 0)) == MULT)
17787 changed = true;
17788 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17791 if (GET_CODE (XEXP (x, 1)) == MULT)
17793 changed = true;
17794 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17797 if (changed
17798 && REG_P (XEXP (x, 1))
17799 && REG_P (XEXP (x, 0)))
17800 return x;
17802 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17804 changed = true;
17805 x = legitimize_pic_address (x, 0);
17808 if (changed && ix86_legitimate_address_p (mode, x, false))
17809 return x;
17811 if (REG_P (XEXP (x, 0)))
17813 rtx temp = gen_reg_rtx (Pmode);
17814 rtx val = force_operand (XEXP (x, 1), temp);
17815 if (val != temp)
17817 val = convert_to_mode (Pmode, val, 1);
17818 emit_move_insn (temp, val);
17821 XEXP (x, 1) = temp;
17822 return x;
17825 else if (REG_P (XEXP (x, 1)))
17827 rtx temp = gen_reg_rtx (Pmode);
17828 rtx val = force_operand (XEXP (x, 0), temp);
17829 if (val != temp)
17831 val = convert_to_mode (Pmode, val, 1);
17832 emit_move_insn (temp, val);
17835 XEXP (x, 0) = temp;
17836 return x;
17840 return x;
17843 /* Print an integer constant expression in assembler syntax. Addition
17844 and subtraction are the only arithmetic that may appear in these
17845 expressions. FILE is the stdio stream to write to, X is the rtx, and
17846 CODE is the operand print code from the output string. */
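/* For illustration: (const:SI (unspec:SI [(symbol_ref "foo")] UNSPEC_GOTOFF))
   is printed as foo@GOTOFF, and a 64-bit UNSPEC_GOTPCREL operand as
   foo@GOTPCREL(%rip) in AT&T syntax (examples only).  */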
17848 static void
17849 output_pic_addr_const (FILE *file, rtx x, int code)
17851 char buf[256];
17853 switch (GET_CODE (x))
17855 case PC:
17856 gcc_assert (flag_pic);
17857 putc ('.', file);
17858 break;
17860 case SYMBOL_REF:
17861 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17862 output_addr_const (file, x);
17863 else
17865 const char *name = XSTR (x, 0);
17867 /* Mark the decl as referenced so that cgraph will
17868 output the function. */
17869 if (SYMBOL_REF_DECL (x))
17870 mark_decl_referenced (SYMBOL_REF_DECL (x));
17872 #if TARGET_MACHO
17873 if (MACHOPIC_INDIRECT
17874 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17875 name = machopic_indirection_name (x, /*stub_p=*/true);
17876 #endif
17877 assemble_name (file, name);
17879 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17880 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17881 fputs ("@PLT", file);
17882 break;
17884 case LABEL_REF:
17885 x = XEXP (x, 0);
17886 /* FALLTHRU */
17887 case CODE_LABEL:
17888 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17889 assemble_name (asm_out_file, buf);
17890 break;
17892 case CONST_INT:
17893 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17894 break;
17896 case CONST:
17897 /* This used to output parentheses around the expression,
17898 but that does not work on the 386 (either ATT or BSD assembler). */
17899 output_pic_addr_const (file, XEXP (x, 0), code);
17900 break;
17902 case CONST_DOUBLE:
17903 /* We can't handle floating point constants;
17904 TARGET_PRINT_OPERAND must handle them. */
17905 output_operand_lossage ("floating constant misused");
17906 break;
17908 case PLUS:
17909 /* Some assemblers need integer constants to appear first. */
17910 if (CONST_INT_P (XEXP (x, 0)))
17912 output_pic_addr_const (file, XEXP (x, 0), code);
17913 putc ('+', file);
17914 output_pic_addr_const (file, XEXP (x, 1), code);
17916 else
17918 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17919 output_pic_addr_const (file, XEXP (x, 1), code);
17920 putc ('+', file);
17921 output_pic_addr_const (file, XEXP (x, 0), code);
17923 break;
17925 case MINUS:
17926 if (!TARGET_MACHO)
17927 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17928 output_pic_addr_const (file, XEXP (x, 0), code);
17929 putc ('-', file);
17930 output_pic_addr_const (file, XEXP (x, 1), code);
17931 if (!TARGET_MACHO)
17932 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17933 break;
17935 case UNSPEC:
17936 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
17938 bool f = i386_asm_output_addr_const_extra (file, x);
17939 gcc_assert (f);
17940 break;
17943 gcc_assert (XVECLEN (x, 0) == 1);
17944 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17945 switch (XINT (x, 1))
17947 case UNSPEC_GOT:
17948 fputs ("@GOT", file);
17949 break;
17950 case UNSPEC_GOTOFF:
17951 fputs ("@GOTOFF", file);
17952 break;
17953 case UNSPEC_PLTOFF:
17954 fputs ("@PLTOFF", file);
17955 break;
17956 case UNSPEC_PCREL:
17957 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17958 "(%rip)" : "[rip]", file);
17959 break;
17960 case UNSPEC_GOTPCREL:
17961 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17962 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17963 break;
17964 case UNSPEC_GOTTPOFF:
17965 /* FIXME: This might be @TPOFF in Sun ld too. */
17966 fputs ("@gottpoff", file);
17967 break;
17968 case UNSPEC_TPOFF:
17969 fputs ("@tpoff", file);
17970 break;
17971 case UNSPEC_NTPOFF:
17972 if (TARGET_64BIT)
17973 fputs ("@tpoff", file);
17974 else
17975 fputs ("@ntpoff", file);
17976 break;
17977 case UNSPEC_DTPOFF:
17978 fputs ("@dtpoff", file);
17979 break;
17980 case UNSPEC_GOTNTPOFF:
17981 if (TARGET_64BIT)
17982 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17983 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17984 else
17985 fputs ("@gotntpoff", file);
17986 break;
17987 case UNSPEC_INDNTPOFF:
17988 fputs ("@indntpoff", file);
17989 break;
17990 #if TARGET_MACHO
17991 case UNSPEC_MACHOPIC_OFFSET:
17992 putc ('-', file);
17993 machopic_output_function_base_name (file);
17994 break;
17995 #endif
17996 default:
17997 output_operand_lossage ("invalid UNSPEC as operand");
17998 break;
18000 break;
18002 default:
18003 output_operand_lossage ("invalid expression as operand");
18007 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
18008 We need to emit DTP-relative relocations. */
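/* For example (illustrative): for a 4-byte request on an ELF target this
   emits something like ".long foo@dtpoff", and the 8-byte case appends
   ", 0" for the upper half.  */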
18010 static void ATTRIBUTE_UNUSED
18011 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
18013 fputs (ASM_LONG, file);
18014 output_addr_const (file, x);
18015 fputs ("@dtpoff", file);
18016 switch (size)
18018 case 4:
18019 break;
18020 case 8:
18021 fputs (", 0", file);
18022 break;
18023 default:
18024 gcc_unreachable ();
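/* Illustrative output (assuming the usual ".long" definition of ASM_LONG
   and a symbol named "x"): a 4-byte DTP-relative reference is emitted as
       .long	x@dtpoff
   and an 8-byte one, whose relocation only covers the low word, as
       .long	x@dtpoff, 0
   The symbol name is made up for the example.  */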
18028 /* Return true if X is a representation of the PIC register. This copes
18029 with calls from ix86_find_base_term, where the register might have
18030 been replaced by a cselib value. */
18032 static bool
18033 ix86_pic_register_p (rtx x)
18035 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
18036 return (pic_offset_table_rtx
18037 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
18038 else if (!REG_P (x))
18039 return false;
18040 else if (pic_offset_table_rtx)
18042 if (REGNO (x) == REGNO (pic_offset_table_rtx))
18043 return true;
18044 if (HARD_REGISTER_P (x)
18045 && !HARD_REGISTER_P (pic_offset_table_rtx)
18046 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
18047 return true;
18048 return false;
18050 else
18051 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
18054 /* Helper function for ix86_delegitimize_address.
18055 Attempt to delegitimize TLS local-exec accesses. */
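/* Illustrative sketch (the exact RTL shape can vary): an IA-32 local-exec
   access such as "movl %gs:x@ntpoff, %eax" reaches this point as a MEM
   whose address decomposes with seg == DEFAULT_TLS_SEG_REG and a
   displacement of (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF));
   the function below strips that wrapping and returns the plain
   SYMBOL_REF plus any base, index and constant offset.  The symbol
   name "x" is made up for the example.  */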
18057 static rtx
18058 ix86_delegitimize_tls_address (rtx orig_x)
18060 rtx x = orig_x, unspec;
18061 struct ix86_address addr;
18063 if (!TARGET_TLS_DIRECT_SEG_REFS)
18064 return orig_x;
18065 if (MEM_P (x))
18066 x = XEXP (x, 0);
18067 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
18068 return orig_x;
18069 if (ix86_decompose_address (x, &addr) == 0
18070 || addr.seg != DEFAULT_TLS_SEG_REG
18071 || addr.disp == NULL_RTX
18072 || GET_CODE (addr.disp) != CONST)
18073 return orig_x;
18074 unspec = XEXP (addr.disp, 0);
18075 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
18076 unspec = XEXP (unspec, 0);
18077 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
18078 return orig_x;
18079 x = XVECEXP (unspec, 0, 0);
18080 gcc_assert (GET_CODE (x) == SYMBOL_REF);
18081 if (unspec != XEXP (addr.disp, 0))
18082 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
18083 if (addr.index)
18085 rtx idx = addr.index;
18086 if (addr.scale != 1)
18087 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
18088 x = gen_rtx_PLUS (Pmode, idx, x);
18090 if (addr.base)
18091 x = gen_rtx_PLUS (Pmode, addr.base, x);
18092 if (MEM_P (orig_x))
18093 x = replace_equiv_address_nv (orig_x, x);
18094 return x;
18097 /* In the name of slightly smaller debug output, and to cater to
18098 general assembler lossage, recognize PIC+GOTOFF and turn it back
18099 into a direct symbol reference.
18101 On Darwin, this is necessary to avoid a crash, because Darwin
18102 has a different PIC label for each routine but the DWARF debugging
18103 information is not associated with any particular routine, so it's
18104 necessary to remove references to the PIC label from RTL stored by
18105 the DWARF output code.
18107 This helper is used in the normal ix86_delegitimize_address
18108 entrypoint (e.g. used in the target delegitimization hook) and
18109 in ix86_find_base_term. As a compile-time memory optimization, we
18110 avoid allocating rtxes that will not change the outcome for the
18111 callers (find_base_value and find_base_term). */
18113 static inline rtx
18114 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
18116 rtx orig_x = delegitimize_mem_from_attrs (x);
18117 /* addend is NULL or some rtx if x is something+GOTOFF where
18118 something doesn't include the PIC register. */
18119 rtx addend = NULL_RTX;
18120 /* reg_addend is NULL or a multiple of some register. */
18121 rtx reg_addend = NULL_RTX;
18122 /* const_addend is NULL or a const_int. */
18123 rtx const_addend = NULL_RTX;
18124 /* This is the result, or NULL. */
18125 rtx result = NULL_RTX;
18127 x = orig_x;
18129 if (MEM_P (x))
18130 x = XEXP (x, 0);
18132 if (TARGET_64BIT)
18134 if (GET_CODE (x) == CONST
18135 && GET_CODE (XEXP (x, 0)) == PLUS
18136 && GET_MODE (XEXP (x, 0)) == Pmode
18137 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
18138 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
18139 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
18141 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
18142 base. A CONST can't be arg_pointer_rtx based. */
18143 if (base_term_p && MEM_P (orig_x))
18144 return orig_x;
18145 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
18146 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
18147 if (MEM_P (orig_x))
18148 x = replace_equiv_address_nv (orig_x, x);
18149 return x;
18152 if (GET_CODE (x) == CONST
18153 && GET_CODE (XEXP (x, 0)) == UNSPEC
18154 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
18155 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
18156 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
18158 x = XVECEXP (XEXP (x, 0), 0, 0);
18159 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
18161 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
18162 if (x == NULL_RTX)
18163 return orig_x;
18165 return x;
18168 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
18169 return ix86_delegitimize_tls_address (orig_x);
18171 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
18172 and -mcmodel=medium -fpic. */
18175 if (GET_CODE (x) != PLUS
18176 || GET_CODE (XEXP (x, 1)) != CONST)
18177 return ix86_delegitimize_tls_address (orig_x);
18179 if (ix86_pic_register_p (XEXP (x, 0)))
18180 /* %ebx + GOT/GOTOFF */
18182 else if (GET_CODE (XEXP (x, 0)) == PLUS)
18184 /* %ebx + %reg * scale + GOT/GOTOFF */
18185 reg_addend = XEXP (x, 0);
18186 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
18187 reg_addend = XEXP (reg_addend, 1);
18188 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
18189 reg_addend = XEXP (reg_addend, 0);
18190 else
18192 reg_addend = NULL_RTX;
18193 addend = XEXP (x, 0);
18196 else
18197 addend = XEXP (x, 0);
18199 x = XEXP (XEXP (x, 1), 0);
18200 if (GET_CODE (x) == PLUS
18201 && CONST_INT_P (XEXP (x, 1)))
18203 const_addend = XEXP (x, 1);
18204 x = XEXP (x, 0);
18207 if (GET_CODE (x) == UNSPEC
18208 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
18209 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
18210 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
18211 && !MEM_P (orig_x) && !addend)))
18212 result = XVECEXP (x, 0, 0);
18214 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
18215 && !MEM_P (orig_x))
18216 result = XVECEXP (x, 0, 0);
18218 if (! result)
18219 return ix86_delegitimize_tls_address (orig_x);
18221 /* For (PLUS something CONST_INT) both find_base_{value,term} just
18222 recurse on the first operand. */
18223 if (const_addend && !base_term_p)
18224 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
18225 if (reg_addend)
18226 result = gen_rtx_PLUS (Pmode, reg_addend, result);
18227 if (addend)
18229 /* If the rest of original X doesn't involve the PIC register, add
18230 addend and subtract pic_offset_table_rtx. This can happen e.g.
18231 for code like:
18232 leal (%ebx, %ecx, 4), %ecx
18234 movl foo@GOTOFF(%ecx), %edx
18235 in which case we return (%ecx - %ebx) + foo
18236 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
18237 and reload has completed. */
18238 if (pic_offset_table_rtx
18239 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
18240 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
18241 pic_offset_table_rtx),
18242 result);
18243 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
18245 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
18246 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
18247 result = gen_rtx_PLUS (Pmode, tmp, result);
18249 else
18250 return orig_x;
18252 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
18254 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
18255 if (result == NULL_RTX)
18256 return orig_x;
18258 return result;
18261 /* The normal instantiation of the above template. */
18263 static rtx
18264 ix86_delegitimize_address (rtx x)
18266 return ix86_delegitimize_address_1 (x, false);
18269 /* If X is a machine specific address (i.e. a symbol or label being
18270 referenced as a displacement from the GOT implemented using an
18271 UNSPEC), then return the base term. Otherwise return X. */
18274 ix86_find_base_term (rtx x)
18276 rtx term;
18278 if (TARGET_64BIT)
18280 if (GET_CODE (x) != CONST)
18281 return x;
18282 term = XEXP (x, 0);
18283 if (GET_CODE (term) == PLUS
18284 && CONST_INT_P (XEXP (term, 1)))
18285 term = XEXP (term, 0);
18286 if (GET_CODE (term) != UNSPEC
18287 || (XINT (term, 1) != UNSPEC_GOTPCREL
18288 && XINT (term, 1) != UNSPEC_PCREL))
18289 return x;
18291 return XVECEXP (term, 0, 0);
18294 return ix86_delegitimize_address_1 (x, true);
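/* Illustrative mapping (not exhaustive) for put_condition_code below:
   an RTL comparison is turned into the suffix used by setCC/jCC/cmovCC,
   e.g. (gt ...) in CCmode prints "g" and, with REVERSE set, "le"; the
   unsigned (gtu ...) prints "a" ("nbe" in the fcmov case, see the
   comment in the GTU arm).  */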
18297 static void
18298 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
18299 bool fp, FILE *file)
18301 const char *suffix;
18303 if (mode == CCFPmode || mode == CCFPUmode)
18305 code = ix86_fp_compare_code_to_integer (code);
18306 mode = CCmode;
18308 if (reverse)
18309 code = reverse_condition (code);
18311 switch (code)
18313 case EQ:
18314 switch (mode)
18316 case CCAmode:
18317 suffix = "a";
18318 break;
18319 case CCCmode:
18320 suffix = "c";
18321 break;
18322 case CCOmode:
18323 suffix = "o";
18324 break;
18325 case CCPmode:
18326 suffix = "p";
18327 break;
18328 case CCSmode:
18329 suffix = "s";
18330 break;
18331 default:
18332 suffix = "e";
18333 break;
18335 break;
18336 case NE:
18337 switch (mode)
18339 case CCAmode:
18340 suffix = "na";
18341 break;
18342 case CCCmode:
18343 suffix = "nc";
18344 break;
18345 case CCOmode:
18346 suffix = "no";
18347 break;
18348 case CCPmode:
18349 suffix = "np";
18350 break;
18351 case CCSmode:
18352 suffix = "ns";
18353 break;
18354 default:
18355 suffix = "ne";
18356 break;
18358 break;
18359 case GT:
18360 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
18361 suffix = "g";
18362 break;
18363 case GTU:
18364 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
18365 Those same assemblers have the same but opposite lossage on cmov. */
18366 if (mode == CCmode)
18367 suffix = fp ? "nbe" : "a";
18368 else
18369 gcc_unreachable ();
18370 break;
18371 case LT:
18372 switch (mode)
18374 case CCNOmode:
18375 case CCGOCmode:
18376 suffix = "s";
18377 break;
18379 case CCmode:
18380 case CCGCmode:
18381 suffix = "l";
18382 break;
18384 default:
18385 gcc_unreachable ();
18387 break;
18388 case LTU:
18389 if (mode == CCmode)
18390 suffix = "b";
18391 else if (mode == CCCmode)
18392 suffix = fp ? "b" : "c";
18393 else
18394 gcc_unreachable ();
18395 break;
18396 case GE:
18397 switch (mode)
18399 case CCNOmode:
18400 case CCGOCmode:
18401 suffix = "ns";
18402 break;
18404 case CCmode:
18405 case CCGCmode:
18406 suffix = "ge";
18407 break;
18409 default:
18410 gcc_unreachable ();
18412 break;
18413 case GEU:
18414 if (mode == CCmode)
18415 suffix = "nb";
18416 else if (mode == CCCmode)
18417 suffix = fp ? "nb" : "nc";
18418 else
18419 gcc_unreachable ();
18420 break;
18421 case LE:
18422 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
18423 suffix = "le";
18424 break;
18425 case LEU:
18426 if (mode == CCmode)
18427 suffix = "be";
18428 else
18429 gcc_unreachable ();
18430 break;
18431 case UNORDERED:
18432 suffix = fp ? "u" : "p";
18433 break;
18434 case ORDERED:
18435 suffix = fp ? "nu" : "np";
18436 break;
18437 default:
18438 gcc_unreachable ();
18440 fputs (suffix, file);
18443 /* Print the name of register X to FILE based on its machine mode and number.
18444 If CODE is 'w', pretend the mode is HImode.
18445 If CODE is 'b', pretend the mode is QImode.
18446 If CODE is 'k', pretend the mode is SImode.
18447 If CODE is 'q', pretend the mode is DImode.
18448 If CODE is 'x', pretend the mode is V4SFmode.
18449 If CODE is 't', pretend the mode is V8SFmode.
18450 If CODE is 'g', pretend the mode is V16SFmode.
18451 If CODE is 'h', pretend the reg is the 'high' byte register.
18452 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
18453 If CODE is 'd', duplicate the operand for AVX instruction.
18456 void
18457 print_reg (rtx x, int code, FILE *file)
18459 const char *reg;
18460 int msize;
18461 unsigned int regno;
18462 bool duplicated;
18464 if (ASSEMBLER_DIALECT == ASM_ATT)
18465 putc ('%', file);
18467 if (x == pc_rtx)
18469 gcc_assert (TARGET_64BIT);
18470 fputs ("rip", file);
18471 return;
18474 if (code == 'y' && STACK_TOP_P (x))
18476 fputs ("st(0)", file);
18477 return;
18480 if (code == 'w')
18481 msize = 2;
18482 else if (code == 'b')
18483 msize = 1;
18484 else if (code == 'k')
18485 msize = 4;
18486 else if (code == 'q')
18487 msize = 8;
18488 else if (code == 'h')
18489 msize = 0;
18490 else if (code == 'x')
18491 msize = 16;
18492 else if (code == 't')
18493 msize = 32;
18494 else if (code == 'g')
18495 msize = 64;
18496 else
18497 msize = GET_MODE_SIZE (GET_MODE (x));
18499 regno = REGNO (x);
18501 if (regno == ARG_POINTER_REGNUM
18502 || regno == FRAME_POINTER_REGNUM
18503 || regno == FPSR_REG
18504 || regno == FPCR_REG)
18506 output_operand_lossage
18507 ("invalid use of register '%s'", reg_names[regno]);
18508 return;
18510 else if (regno == FLAGS_REG)
18512 output_operand_lossage ("invalid use of asm flag output");
18513 return;
18516 duplicated = code == 'd' && TARGET_AVX;
18518 switch (msize)
18520 case 16:
18521 case 12:
18522 case 8:
18523 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
18524 warning (0, "unsupported size for integer register");
18525 /* FALLTHRU */
18526 case 4:
18527 if (LEGACY_INT_REGNO_P (regno))
18528 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
18529 /* FALLTHRU */
18530 case 2:
18531 normal:
18532 reg = hi_reg_name[regno];
18533 break;
18534 case 1:
18535 if (regno >= ARRAY_SIZE (qi_reg_name))
18536 goto normal;
18537 if (!ANY_QI_REGNO_P (regno))
18538 error ("unsupported size for integer register");
18539 reg = qi_reg_name[regno];
18540 break;
18541 case 0:
18542 if (regno >= ARRAY_SIZE (qi_high_reg_name))
18543 goto normal;
18544 reg = qi_high_reg_name[regno];
18545 break;
18546 case 32:
18547 case 64:
18548 if (SSE_REGNO_P (regno))
18550 gcc_assert (!duplicated);
18551 putc (msize == 32 ? 'y' : 'z', file);
18552 reg = hi_reg_name[regno] + 1;
18553 break;
18555 goto normal;
18556 default:
18557 gcc_unreachable ();
18560 fputs (reg, file);
18562 /* Irritatingly, AMD extended registers use a
18563 different naming convention: "r%d[bwd]" */
18564 if (REX_INT_REGNO_P (regno))
18566 gcc_assert (TARGET_64BIT);
18567 switch (msize)
18569 case 0:
18570 error ("extended registers have no high halves");
18571 break;
18572 case 1:
18573 putc ('b', file);
18574 break;
18575 case 2:
18576 putc ('w', file);
18577 break;
18578 case 4:
18579 putc ('d', file);
18580 break;
18581 case 8:
18582 /* no suffix */
18583 break;
18584 default:
18585 error ("unsupported operand size for extended register");
18586 break;
18588 return;
18591 if (duplicated)
18593 if (ASSEMBLER_DIALECT == ASM_ATT)
18594 fprintf (file, ", %%%s", reg);
18595 else
18596 fprintf (file, ", %s", reg);
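/* A few illustrative print_reg results in AT&T dialect: for (reg:SI ax),
   code 'b' prints "%al", 'w' prints "%ax", 'k' prints "%eax" and (on a
   64-bit target) 'q' prints "%rax"; for an SSE register used in a
   32-byte mode the leading 'x' of "%xmm0" is rewritten, giving "%ymm0".  */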
18600 /* Meaning of CODE:
18601 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
18602 C -- print opcode suffix for set/cmov insn.
18603 c -- like C, but print reversed condition
18604 F,f -- likewise, but for floating-point.
18605 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
18606 otherwise nothing
18607 R -- print embedded rounding and sae.
18608 r -- print only sae.
18609 z -- print the opcode suffix for the size of the current operand.
18610 Z -- likewise, with special suffixes for x87 instructions.
18611 * -- print a star (in certain assembler syntax)
18612 A -- print an absolute memory reference.
18613 E -- print address with DImode register names if TARGET_64BIT.
18614 w -- print the operand as if it's a "word" (HImode) even if it isn't.
18615 s -- print a shift double count, followed by the assembler's argument
18616 delimiter.
18617 b -- print the QImode name of the register for the indicated operand.
18618 %b0 would print %al if operands[0] is reg 0.
18619 w -- likewise, print the HImode name of the register.
18620 k -- likewise, print the SImode name of the register.
18621 q -- likewise, print the DImode name of the register.
18622 x -- likewise, print the V4SFmode name of the register.
18623 t -- likewise, print the V8SFmode name of the register.
18624 g -- likewise, print the V16SFmode name of the register.
18625 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
18626 y -- print "st(0)" instead of "st" as a register.
18627 d -- print duplicated register operand for AVX instruction.
18628 D -- print condition for SSE cmp instruction.
18629 P -- if PIC, print an @PLT suffix.
18630 p -- print raw symbol name.
18631 X -- don't print any sort of PIC '@' suffix for a symbol.
18632 & -- print some in-use local-dynamic symbol name.
18633 H -- print a memory address offset by 8; used for sse high-parts
18634 Y -- print condition for XOP pcom* instruction.
18635 + -- print a branch hint as 'cs' or 'ds' prefix
18636 ; -- print a semicolon (after prefixes, due to a bug in older gas).
18637 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
18638 @ -- print the segment register of the thread base pointer load
18639 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
18640 ! -- print MPX prefix for jxx/call/ret instructions if required.
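/* For illustration, a hypothetical output template such as
   "mov%z0\t{%1, %0|%0, %1}" uses 'z' to derive the b/w/l/q suffix from
   operand 0's mode and the {att|intel} braces to pick the dialect
   specific operand order, while "%b0" would print the QImode name of
   operand 0 (e.g. "%al").  The template is made up for the example and
   is not quoted from i386.md.  */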
18643 void
18644 ix86_print_operand (FILE *file, rtx x, int code)
18646 if (code)
18648 switch (code)
18650 case 'A':
18651 switch (ASSEMBLER_DIALECT)
18653 case ASM_ATT:
18654 putc ('*', file);
18655 break;
18657 case ASM_INTEL:
18658 /* Intel syntax. For absolute addresses, registers should not
18659 be surrounded by brackets. */
18660 if (!REG_P (x))
18662 putc ('[', file);
18663 ix86_print_operand (file, x, 0);
18664 putc (']', file);
18665 return;
18667 break;
18669 default:
18670 gcc_unreachable ();
18673 ix86_print_operand (file, x, 0);
18674 return;
18676 case 'E':
18677 /* Wrap address in an UNSPEC to declare special handling. */
18678 if (TARGET_64BIT)
18679 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
18681 output_address (VOIDmode, x);
18682 return;
18684 case 'L':
18685 if (ASSEMBLER_DIALECT == ASM_ATT)
18686 putc ('l', file);
18687 return;
18689 case 'W':
18690 if (ASSEMBLER_DIALECT == ASM_ATT)
18691 putc ('w', file);
18692 return;
18694 case 'B':
18695 if (ASSEMBLER_DIALECT == ASM_ATT)
18696 putc ('b', file);
18697 return;
18699 case 'Q':
18700 if (ASSEMBLER_DIALECT == ASM_ATT)
18701 putc ('l', file);
18702 return;
18704 case 'S':
18705 if (ASSEMBLER_DIALECT == ASM_ATT)
18706 putc ('s', file);
18707 return;
18709 case 'T':
18710 if (ASSEMBLER_DIALECT == ASM_ATT)
18711 putc ('t', file);
18712 return;
18714 case 'O':
18715 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18716 if (ASSEMBLER_DIALECT != ASM_ATT)
18717 return;
18719 switch (GET_MODE_SIZE (GET_MODE (x)))
18721 case 2:
18722 putc ('w', file);
18723 break;
18725 case 4:
18726 putc ('l', file);
18727 break;
18729 case 8:
18730 putc ('q', file);
18731 break;
18733 default:
18734 output_operand_lossage ("invalid operand size for operand "
18735 "code 'O'");
18736 return;
18739 putc ('.', file);
18740 #endif
18741 return;
18743 case 'z':
18744 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18746 /* Opcodes don't get size suffixes if using Intel syntax. */
18747 if (ASSEMBLER_DIALECT == ASM_INTEL)
18748 return;
18750 switch (GET_MODE_SIZE (GET_MODE (x)))
18752 case 1:
18753 putc ('b', file);
18754 return;
18756 case 2:
18757 putc ('w', file);
18758 return;
18760 case 4:
18761 putc ('l', file);
18762 return;
18764 case 8:
18765 putc ('q', file);
18766 return;
18768 default:
18769 output_operand_lossage ("invalid operand size for operand "
18770 "code 'z'");
18771 return;
18775 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18776 warning (0, "non-integer operand used with operand code 'z'");
18777 /* FALLTHRU */
18779 case 'Z':
18780 /* 387 opcodes don't get size suffixes if using Intel syntax. */
18781 if (ASSEMBLER_DIALECT == ASM_INTEL)
18782 return;
18784 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18786 switch (GET_MODE_SIZE (GET_MODE (x)))
18788 case 2:
18789 #ifdef HAVE_AS_IX86_FILDS
18790 putc ('s', file);
18791 #endif
18792 return;
18794 case 4:
18795 putc ('l', file);
18796 return;
18798 case 8:
18799 #ifdef HAVE_AS_IX86_FILDQ
18800 putc ('q', file);
18801 #else
18802 fputs ("ll", file);
18803 #endif
18804 return;
18806 default:
18807 break;
18810 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18812 /* 387 opcodes don't get size suffixes
18813 if the operands are registers. */
18814 if (STACK_REG_P (x))
18815 return;
18817 switch (GET_MODE_SIZE (GET_MODE (x)))
18819 case 4:
18820 putc ('s', file);
18821 return;
18823 case 8:
18824 putc ('l', file);
18825 return;
18827 case 12:
18828 case 16:
18829 putc ('t', file);
18830 return;
18832 default:
18833 break;
18836 else
18838 output_operand_lossage ("invalid operand type used with "
18839 "operand code 'Z'");
18840 return;
18843 output_operand_lossage ("invalid operand size for operand code 'Z'");
18844 return;
18846 case 'd':
18847 case 'b':
18848 case 'w':
18849 case 'k':
18850 case 'q':
18851 case 'h':
18852 case 't':
18853 case 'g':
18854 case 'y':
18855 case 'x':
18856 case 'X':
18857 case 'P':
18858 case 'p':
18859 break;
18861 case 's':
18862 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
18864 ix86_print_operand (file, x, 0);
18865 fputs (", ", file);
18867 return;
18869 case 'Y':
18870 switch (GET_CODE (x))
18872 case NE:
18873 fputs ("neq", file);
18874 break;
18875 case EQ:
18876 fputs ("eq", file);
18877 break;
18878 case GE:
18879 case GEU:
18880 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
18881 break;
18882 case GT:
18883 case GTU:
18884 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18885 break;
18886 case LE:
18887 case LEU:
18888 fputs ("le", file);
18889 break;
18890 case LT:
18891 case LTU:
18892 fputs ("lt", file);
18893 break;
18894 case UNORDERED:
18895 fputs ("unord", file);
18896 break;
18897 case ORDERED:
18898 fputs ("ord", file);
18899 break;
18900 case UNEQ:
18901 fputs ("ueq", file);
18902 break;
18903 case UNGE:
18904 fputs ("nlt", file);
18905 break;
18906 case UNGT:
18907 fputs ("nle", file);
18908 break;
18909 case UNLE:
18910 fputs ("ule", file);
18911 break;
18912 case UNLT:
18913 fputs ("ult", file);
18914 break;
18915 case LTGT:
18916 fputs ("une", file);
18917 break;
18918 default:
18919 output_operand_lossage ("operand is not a condition code, "
18920 "invalid operand code 'Y'");
18921 return;
18923 return;
18925 case 'D':
18926 /* A little bit of brain damage here. The SSE compare instructions
18927 use completely different names for the comparisons than the
18928 fp conditional moves do. */
18929 switch (GET_CODE (x))
18931 case UNEQ:
18932 if (TARGET_AVX)
18934 fputs ("eq_us", file);
18935 break;
18937 /* FALLTHRU */
18938 case EQ:
18939 fputs ("eq", file);
18940 break;
18941 case UNLT:
18942 if (TARGET_AVX)
18944 fputs ("nge", file);
18945 break;
18947 /* FALLTHRU */
18948 case LT:
18949 fputs ("lt", file);
18950 break;
18951 case UNLE:
18952 if (TARGET_AVX)
18954 fputs ("ngt", file);
18955 break;
18957 /* FALLTHRU */
18958 case LE:
18959 fputs ("le", file);
18960 break;
18961 case UNORDERED:
18962 fputs ("unord", file);
18963 break;
18964 case LTGT:
18965 if (TARGET_AVX)
18967 fputs ("neq_oq", file);
18968 break;
18970 /* FALLTHRU */
18971 case NE:
18972 fputs ("neq", file);
18973 break;
18974 case GE:
18975 if (TARGET_AVX)
18977 fputs ("ge", file);
18978 break;
18980 /* FALLTHRU */
18981 case UNGE:
18982 fputs ("nlt", file);
18983 break;
18984 case GT:
18985 if (TARGET_AVX)
18987 fputs ("gt", file);
18988 break;
18990 /* FALLTHRU */
18991 case UNGT:
18992 fputs ("nle", file);
18993 break;
18994 case ORDERED:
18995 fputs ("ord", file);
18996 break;
18997 default:
18998 output_operand_lossage ("operand is not a condition code, "
18999 "invalid operand code 'D'");
19000 return;
19002 return;
19004 case 'F':
19005 case 'f':
19006 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
19007 if (ASSEMBLER_DIALECT == ASM_ATT)
19008 putc ('.', file);
19009 gcc_fallthrough ();
19010 #endif
19012 case 'C':
19013 case 'c':
19014 if (!COMPARISON_P (x))
19016 output_operand_lossage ("operand is not a condition code, "
19017 "invalid operand code '%c'", code);
19018 return;
19020 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
19021 code == 'c' || code == 'f',
19022 code == 'F' || code == 'f',
19023 file);
19024 return;
19026 case 'H':
19027 if (!offsettable_memref_p (x))
19029 output_operand_lossage ("operand is not an offsettable memory "
19030 "reference, invalid operand code 'H'");
19031 return;
19033 /* It doesn't actually matter what mode we use here, as we're
19034 only going to use this for printing. */
19035 x = adjust_address_nv (x, DImode, 8);
19036 /* Output 'qword ptr' for intel assembler dialect. */
19037 if (ASSEMBLER_DIALECT == ASM_INTEL)
19038 code = 'q';
19039 break;
19041 case 'K':
19042 if (!CONST_INT_P (x))
19044 output_operand_lossage ("operand is not an integer, invalid "
19045 "operand code 'K'");
19046 return;
19049 if (INTVAL (x) & IX86_HLE_ACQUIRE)
19050 #ifdef HAVE_AS_IX86_HLE
19051 fputs ("xacquire ", file);
19052 #else
19053 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
19054 #endif
19055 else if (INTVAL (x) & IX86_HLE_RELEASE)
19056 #ifdef HAVE_AS_IX86_HLE
19057 fputs ("xrelease ", file);
19058 #else
19059 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
19060 #endif
19061 /* We do not want to print the value of the operand. */
19062 return;
19064 case 'N':
19065 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
19066 fputs ("{z}", file);
19067 return;
19069 case 'r':
19070 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
19072 output_operand_lossage ("operand is not a specific integer, "
19073 "invalid operand code 'r'");
19074 return;
19077 if (ASSEMBLER_DIALECT == ASM_INTEL)
19078 fputs (", ", file);
19080 fputs ("{sae}", file);
19082 if (ASSEMBLER_DIALECT == ASM_ATT)
19083 fputs (", ", file);
19085 return;
19087 case 'R':
19088 if (!CONST_INT_P (x))
19090 output_operand_lossage ("operand is not an integer, invalid "
19091 "operand code 'R'");
19092 return;
19095 if (ASSEMBLER_DIALECT == ASM_INTEL)
19096 fputs (", ", file);
19098 switch (INTVAL (x))
19100 case ROUND_NEAREST_INT | ROUND_SAE:
19101 fputs ("{rn-sae}", file);
19102 break;
19103 case ROUND_NEG_INF | ROUND_SAE:
19104 fputs ("{rd-sae}", file);
19105 break;
19106 case ROUND_POS_INF | ROUND_SAE:
19107 fputs ("{ru-sae}", file);
19108 break;
19109 case ROUND_ZERO | ROUND_SAE:
19110 fputs ("{rz-sae}", file);
19111 break;
19112 default:
19113 output_operand_lossage ("operand is not a specific integer, "
19114 "invalid operand code 'R'");
19117 if (ASSEMBLER_DIALECT == ASM_ATT)
19118 fputs (", ", file);
19120 return;
19122 case '*':
19123 if (ASSEMBLER_DIALECT == ASM_ATT)
19124 putc ('*', file);
19125 return;
19127 case '&':
19129 const char *name = get_some_local_dynamic_name ();
19130 if (name == NULL)
19131 output_operand_lossage ("'%%&' used without any "
19132 "local dynamic TLS references");
19133 else
19134 assemble_name (file, name);
19135 return;
19138 case '+':
19140 rtx x;
19142 if (!optimize
19143 || optimize_function_for_size_p (cfun)
19144 || !TARGET_BRANCH_PREDICTION_HINTS)
19145 return;
19147 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
19148 if (x)
19150 int pred_val = profile_probability::from_reg_br_prob_note
19151 (XINT (x, 0)).to_reg_br_prob_base ();
19153 if (pred_val < REG_BR_PROB_BASE * 45 / 100
19154 || pred_val > REG_BR_PROB_BASE * 55 / 100)
19156 bool taken = pred_val > REG_BR_PROB_BASE / 2;
19157 bool cputaken
19158 = final_forward_branch_p (current_output_insn) == 0;
19160 /* Emit hints only in the case where the default branch prediction
19161 heuristics would fail. */
19162 if (taken != cputaken)
19164 /* We use 3e (DS) prefix for taken branches and
19165 2e (CS) prefix for not taken branches. */
19166 if (taken)
19167 fputs ("ds ; ", file);
19168 else
19169 fputs ("cs ; ", file);
19173 return;
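/* Illustrative effect of '+': a conditional jump whose profile
   contradicts the static forward-not-taken heuristic is printed as,
   e.g., "ds ; jne .L3" (hint taken) or "cs ; jne .L3" (hint not
   taken).  The label is made up for the example.  */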
19176 case ';':
19177 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
19178 putc (';', file);
19179 #endif
19180 return;
19182 case '@':
19183 if (ASSEMBLER_DIALECT == ASM_ATT)
19184 putc ('%', file);
19186 /* The kernel uses a different segment register for performance
19187 reasons; a system call would not have to trash the userspace
19188 segment register, which would be expensive. */
19189 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
19190 fputs ("fs", file);
19191 else
19192 fputs ("gs", file);
19193 return;
19195 case '~':
19196 putc (TARGET_AVX2 ? 'i' : 'f', file);
19197 return;
19199 case '^':
19200 if (TARGET_64BIT && Pmode != word_mode)
19201 fputs ("addr32 ", file);
19202 return;
19204 case '!':
19205 if (ix86_bnd_prefixed_insn_p (current_output_insn))
19206 fputs ("bnd ", file);
19207 return;
19209 default:
19210 output_operand_lossage ("invalid operand code '%c'", code);
19214 if (REG_P (x))
19215 print_reg (x, code, file);
19217 else if (MEM_P (x))
19219 rtx addr = XEXP (x, 0);
19221 /* No `byte ptr' prefix for call instructions ... */
19222 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
19224 machine_mode mode = GET_MODE (x);
19225 const char *size;
19227 /* Check for explicit size override codes. */
19228 if (code == 'b')
19229 size = "BYTE";
19230 else if (code == 'w')
19231 size = "WORD";
19232 else if (code == 'k')
19233 size = "DWORD";
19234 else if (code == 'q')
19235 size = "QWORD";
19236 else if (code == 'x')
19237 size = "XMMWORD";
19238 else if (code == 't')
19239 size = "YMMWORD";
19240 else if (code == 'g')
19241 size = "ZMMWORD";
19242 else if (mode == BLKmode)
19243 /* ... or BLKmode operands, when not overridden. */
19244 size = NULL;
19245 else
19246 switch (GET_MODE_SIZE (mode))
19248 case 1: size = "BYTE"; break;
19249 case 2: size = "WORD"; break;
19250 case 4: size = "DWORD"; break;
19251 case 8: size = "QWORD"; break;
19252 case 12: size = "TBYTE"; break;
19253 case 16:
19254 if (mode == XFmode)
19255 size = "TBYTE";
19256 else
19257 size = "XMMWORD";
19258 break;
19259 case 32: size = "YMMWORD"; break;
19260 case 64: size = "ZMMWORD"; break;
19261 default:
19262 gcc_unreachable ();
19264 if (size)
19266 fputs (size, file);
19267 fputs (" PTR ", file);
19271 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
19272 output_operand_lossage ("invalid constraints for operand");
19273 else
19274 ix86_print_operand_address_as
19275 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
19278 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
19280 long l;
19282 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19284 if (ASSEMBLER_DIALECT == ASM_ATT)
19285 putc ('$', file);
19286 /* Sign extend 32bit SFmode immediate to 8 bytes. */
19287 if (code == 'q')
19288 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
19289 (unsigned long long) (int) l);
19290 else
19291 fprintf (file, "0x%08x", (unsigned int) l);
19294 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
19296 long l[2];
19298 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19300 if (ASSEMBLER_DIALECT == ASM_ATT)
19301 putc ('$', file);
19302 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
19305 /* These float cases don't actually occur as immediate operands. */
19306 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
19308 char dstr[30];
19310 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
19311 fputs (dstr, file);
19314 else
19316 /* We have patterns that allow zero sets of memory, for instance.
19317 In 64-bit mode, we should probably support all 8-byte vectors,
19318 since we can in fact encode that into an immediate. */
19319 if (GET_CODE (x) == CONST_VECTOR)
19321 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
19322 x = const0_rtx;
19325 if (code != 'P' && code != 'p')
19327 if (CONST_INT_P (x))
19329 if (ASSEMBLER_DIALECT == ASM_ATT)
19330 putc ('$', file);
19332 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
19333 || GET_CODE (x) == LABEL_REF)
19335 if (ASSEMBLER_DIALECT == ASM_ATT)
19336 putc ('$', file);
19337 else
19338 fputs ("OFFSET FLAT:", file);
19341 if (CONST_INT_P (x))
19342 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
19343 else if (flag_pic || MACHOPIC_INDIRECT)
19344 output_pic_addr_const (file, x, code);
19345 else
19346 output_addr_const (file, x);
19350 static bool
19351 ix86_print_operand_punct_valid_p (unsigned char code)
19353 return (code == '@' || code == '*' || code == '+' || code == '&'
19354 || code == ';' || code == '~' || code == '^' || code == '!');
19357 /* Print a memory operand whose address is ADDR. */
19359 static void
19360 ix86_print_operand_address_as (FILE *file, rtx addr,
19361 addr_space_t as, bool no_rip)
19363 struct ix86_address parts;
19364 rtx base, index, disp;
19365 int scale;
19366 int ok;
19367 bool vsib = false;
19368 int code = 0;
19370 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
19372 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19373 gcc_assert (parts.index == NULL_RTX);
19374 parts.index = XVECEXP (addr, 0, 1);
19375 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
19376 addr = XVECEXP (addr, 0, 0);
19377 vsib = true;
19379 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
19381 gcc_assert (TARGET_64BIT);
19382 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19383 code = 'q';
19385 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
19387 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
19388 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
19389 if (parts.base != NULL_RTX)
19391 parts.index = parts.base;
19392 parts.scale = 1;
19394 parts.base = XVECEXP (addr, 0, 0);
19395 addr = XVECEXP (addr, 0, 0);
19397 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
19399 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19400 gcc_assert (parts.index == NULL_RTX);
19401 parts.index = XVECEXP (addr, 0, 1);
19402 addr = XVECEXP (addr, 0, 0);
19404 else
19405 ok = ix86_decompose_address (addr, &parts);
19407 gcc_assert (ok);
19409 base = parts.base;
19410 index = parts.index;
19411 disp = parts.disp;
19412 scale = parts.scale;
19414 if (ADDR_SPACE_GENERIC_P (as))
19415 as = parts.seg;
19416 else
19417 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
19419 if (!ADDR_SPACE_GENERIC_P (as))
19421 const char *string;
19423 if (as == ADDR_SPACE_SEG_FS)
19424 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
19425 else if (as == ADDR_SPACE_SEG_GS)
19426 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
19427 else
19428 gcc_unreachable ();
19429 fputs (string, file);
19432 /* Use one-byte-shorter RIP-relative addressing for 64bit mode. */
19433 if (TARGET_64BIT && !base && !index && !no_rip)
19435 rtx symbol = disp;
19437 if (GET_CODE (disp) == CONST
19438 && GET_CODE (XEXP (disp, 0)) == PLUS
19439 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19440 symbol = XEXP (XEXP (disp, 0), 0);
19442 if (GET_CODE (symbol) == LABEL_REF
19443 || (GET_CODE (symbol) == SYMBOL_REF
19444 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
19445 base = pc_rtx;
19448 if (!base && !index)
19450 /* A displacement-only address requires special attention. */
19451 if (CONST_INT_P (disp))
19453 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
19454 fputs ("ds:", file);
19455 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
19457 /* Load the external function address via the GOT slot to avoid PLT. */
19458 else if (GET_CODE (disp) == CONST
19459 && GET_CODE (XEXP (disp, 0)) == UNSPEC
19460 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
19461 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
19462 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
19463 output_pic_addr_const (file, disp, 0);
19464 else if (flag_pic)
19465 output_pic_addr_const (file, disp, 0);
19466 else
19467 output_addr_const (file, disp);
19469 else
19471 /* Print SImode register names to force addr32 prefix. */
19472 if (SImode_address_operand (addr, VOIDmode))
19474 if (flag_checking)
19476 gcc_assert (TARGET_64BIT);
19477 switch (GET_CODE (addr))
19479 case SUBREG:
19480 gcc_assert (GET_MODE (addr) == SImode);
19481 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
19482 break;
19483 case ZERO_EXTEND:
19484 case AND:
19485 gcc_assert (GET_MODE (addr) == DImode);
19486 break;
19487 default:
19488 gcc_unreachable ();
19491 gcc_assert (!code);
19492 code = 'k';
19494 else if (code == 0
19495 && TARGET_X32
19496 && disp
19497 && CONST_INT_P (disp)
19498 && INTVAL (disp) < -16*1024*1024)
19500 /* X32 runs in 64-bit mode, where displacement, DISP, in
19501 address DISP(%r64), is encoded as 32-bit immediate sign-
19502 extended from 32-bit to 64-bit. For -0x40000300(%r64),
19503 address is %r64 + 0xffffffffbffffd00. When %r64 <
19504 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
19505 which is invalid for x32. The correct address is %r64
19506 - 0x40000300 == 0xf7ffdd64. To properly encode
19507 -0x40000300(%r64) for x32, we zero-extend negative
19508 displacement by forcing addr32 prefix which truncates
19509 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
19510 zero-extend all negative displacements, including -1(%rsp).
19511 However, for small negative displacements, sign-extension
19512 won't cause overflow. We only zero-extend negative
19513 displacements if they < -16*1024*1024, which is also used
19514 to check legitimate address displacements for PIC. */
19515 code = 'k';
19518 if (ASSEMBLER_DIALECT == ASM_ATT)
19520 if (disp)
19522 if (flag_pic)
19523 output_pic_addr_const (file, disp, 0);
19524 else if (GET_CODE (disp) == LABEL_REF)
19525 output_asm_label (disp);
19526 else
19527 output_addr_const (file, disp);
19530 putc ('(', file);
19531 if (base)
19532 print_reg (base, code, file);
19533 if (index)
19535 putc (',', file);
19536 print_reg (index, vsib ? 0 : code, file);
19537 if (scale != 1 || vsib)
19538 fprintf (file, ",%d", scale);
19540 putc (')', file);
19542 else
19544 rtx offset = NULL_RTX;
19546 if (disp)
19548 /* Pull out the offset of a symbol; print any symbol itself. */
19549 if (GET_CODE (disp) == CONST
19550 && GET_CODE (XEXP (disp, 0)) == PLUS
19551 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19553 offset = XEXP (XEXP (disp, 0), 1);
19554 disp = gen_rtx_CONST (VOIDmode,
19555 XEXP (XEXP (disp, 0), 0));
19558 if (flag_pic)
19559 output_pic_addr_const (file, disp, 0);
19560 else if (GET_CODE (disp) == LABEL_REF)
19561 output_asm_label (disp);
19562 else if (CONST_INT_P (disp))
19563 offset = disp;
19564 else
19565 output_addr_const (file, disp);
19568 putc ('[', file);
19569 if (base)
19571 print_reg (base, code, file);
19572 if (offset)
19574 if (INTVAL (offset) >= 0)
19575 putc ('+', file);
19576 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19579 else if (offset)
19580 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19581 else
19582 putc ('0', file);
19584 if (index)
19586 putc ('+', file);
19587 print_reg (index, vsib ? 0 : code, file);
19588 if (scale != 1 || vsib)
19589 fprintf (file, "*%d", scale);
19591 putc (']', file);
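/* Illustrative sketch of the two dialects for the same address with
   base %ebx, index %ecx, scale 4 and displacement "x+8":
     AT&T:   x+8(%ebx,%ecx,4)
     Intel:  x[ebx+8+ecx*4]
   The symbol name is made up; PIC, segment overrides and vSIB add the
   extra decorations handled above.  */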
19596 static void
19597 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
19599 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
19602 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
19604 static bool
19605 i386_asm_output_addr_const_extra (FILE *file, rtx x)
19607 rtx op;
19609 if (GET_CODE (x) != UNSPEC)
19610 return false;
19612 op = XVECEXP (x, 0, 0);
19613 switch (XINT (x, 1))
19615 case UNSPEC_GOTTPOFF:
19616 output_addr_const (file, op);
19617 /* FIXME: This might be @TPOFF in Sun ld. */
19618 fputs ("@gottpoff", file);
19619 break;
19620 case UNSPEC_TPOFF:
19621 output_addr_const (file, op);
19622 fputs ("@tpoff", file);
19623 break;
19624 case UNSPEC_NTPOFF:
19625 output_addr_const (file, op);
19626 if (TARGET_64BIT)
19627 fputs ("@tpoff", file);
19628 else
19629 fputs ("@ntpoff", file);
19630 break;
19631 case UNSPEC_DTPOFF:
19632 output_addr_const (file, op);
19633 fputs ("@dtpoff", file);
19634 break;
19635 case UNSPEC_GOTNTPOFF:
19636 output_addr_const (file, op);
19637 if (TARGET_64BIT)
19638 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
19639 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
19640 else
19641 fputs ("@gotntpoff", file);
19642 break;
19643 case UNSPEC_INDNTPOFF:
19644 output_addr_const (file, op);
19645 fputs ("@indntpoff", file);
19646 break;
19647 #if TARGET_MACHO
19648 case UNSPEC_MACHOPIC_OFFSET:
19649 output_addr_const (file, op);
19650 putc ('-', file);
19651 machopic_output_function_base_name (file);
19652 break;
19653 #endif
19655 case UNSPEC_STACK_CHECK:
19657 int offset;
19659 gcc_assert (flag_split_stack);
19661 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
19662 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
19663 #else
19664 gcc_unreachable ();
19665 #endif
19667 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
19669 break;
19671 default:
19672 return false;
19675 return true;
19678 /* Split one or more double-mode RTL references into pairs of half-mode
19679 references. The RTL can be REG, offsettable MEM, integer constant, or
19680 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
19681 split and "num" is its length. lo_half and hi_half are output arrays
19682 that parallel "operands". */
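/* For example (illustrative): splitting a DImode pseudo on 32-bit x86
   produces SImode subregs at bytes 0 and 4 for the low and high halves,
   while an offsettable MEM is split with adjust_address into two SImode
   MEMs at offsets 0 and 4.  */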
19684 void
19685 split_double_mode (machine_mode mode, rtx operands[],
19686 int num, rtx lo_half[], rtx hi_half[])
19688 machine_mode half_mode;
19689 unsigned int byte;
19691 switch (mode)
19693 case TImode:
19694 half_mode = DImode;
19695 break;
19696 case DImode:
19697 half_mode = SImode;
19698 break;
19699 default:
19700 gcc_unreachable ();
19703 byte = GET_MODE_SIZE (half_mode);
19705 while (num--)
19707 rtx op = operands[num];
19709 /* simplify_subreg refuses to split volatile memory addresses,
19710 but we still have to handle them. */
19711 if (MEM_P (op))
19713 lo_half[num] = adjust_address (op, half_mode, 0);
19714 hi_half[num] = adjust_address (op, half_mode, byte);
19716 else
19718 lo_half[num] = simplify_gen_subreg (half_mode, op,
19719 GET_MODE (op) == VOIDmode
19720 ? mode : GET_MODE (op), 0);
19721 hi_half[num] = simplify_gen_subreg (half_mode, op,
19722 GET_MODE (op) == VOIDmode
19723 ? mode : GET_MODE (op), byte);
19728 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
19729 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
19730 is the expression of the binary operation. The output may either be
19731 emitted here, or returned to the caller, like all output_* functions.
19733 There is no guarantee that the operands are the same mode, as they
19734 might be within FLOAT or FLOAT_EXTEND expressions. */
19736 #ifndef SYSV386_COMPAT
19737 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
19738 wants to fix the assemblers because that causes incompatibility
19739 with gcc. No-one wants to fix gcc because that causes
19740 incompatibility with assemblers... You can use the option of
19741 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19742 #define SYSV386_COMPAT 1
19743 #endif
19745 const char *
19746 output_387_binary_op (rtx_insn *insn, rtx *operands)
19748 static char buf[40];
19749 const char *p;
19750 const char *ssep;
19751 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
19753 /* Even if we do not want to check the inputs, this documents the input
19754 constraints, which helps in understanding the following code. */
19755 if (flag_checking)
19757 if (STACK_REG_P (operands[0])
19758 && ((REG_P (operands[1])
19759 && REGNO (operands[0]) == REGNO (operands[1])
19760 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19761 || (REG_P (operands[2])
19762 && REGNO (operands[0]) == REGNO (operands[2])
19763 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19764 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19765 ; /* ok */
19766 else
19767 gcc_assert (is_sse);
19770 switch (GET_CODE (operands[3]))
19772 case PLUS:
19773 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19774 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19775 p = "fiadd";
19776 else
19777 p = "fadd";
19778 ssep = "vadd";
19779 break;
19781 case MINUS:
19782 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19783 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19784 p = "fisub";
19785 else
19786 p = "fsub";
19787 ssep = "vsub";
19788 break;
19790 case MULT:
19791 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19792 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19793 p = "fimul";
19794 else
19795 p = "fmul";
19796 ssep = "vmul";
19797 break;
19799 case DIV:
19800 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19801 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19802 p = "fidiv";
19803 else
19804 p = "fdiv";
19805 ssep = "vdiv";
19806 break;
19808 default:
19809 gcc_unreachable ();
19812 if (is_sse)
19814 if (TARGET_AVX)
19816 strcpy (buf, ssep);
19817 if (GET_MODE (operands[0]) == SFmode)
19818 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
19819 else
19820 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
19822 else
19824 strcpy (buf, ssep + 1);
19825 if (GET_MODE (operands[0]) == SFmode)
19826 strcat (buf, "ss\t{%2, %0|%0, %2}");
19827 else
19828 strcat (buf, "sd\t{%2, %0|%0, %2}");
19830 return buf;
19832 strcpy (buf, p);
19834 switch (GET_CODE (operands[3]))
19836 case MULT:
19837 case PLUS:
19838 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19839 std::swap (operands[1], operands[2]);
19841 /* We know operands[0] == operands[1]. */
19843 if (MEM_P (operands[2]))
19845 p = "%Z2\t%2";
19846 break;
19849 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19851 if (STACK_TOP_P (operands[0]))
19852 /* How is it that we are storing to a dead operand[2]?
19853 Well, presumably operands[1] is dead too. We can't
19854 store the result to st(0) as st(0) gets popped on this
19855 instruction. Instead store to operands[2] (which I
19856 think has to be st(1)). st(1) will be popped later.
19857 gcc <= 2.8.1 didn't have this check and generated
19858 assembly code that the Unixware assembler rejected. */
19859 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19860 else
19861 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19862 break;
19865 if (STACK_TOP_P (operands[0]))
19866 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19867 else
19868 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19869 break;
19871 case MINUS:
19872 case DIV:
19873 if (MEM_P (operands[1]))
19875 p = "r%Z1\t%1";
19876 break;
19879 if (MEM_P (operands[2]))
19881 p = "%Z2\t%2";
19882 break;
19885 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19887 #if SYSV386_COMPAT
19888 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19889 derived assemblers, confusingly reverse the direction of
19890 the operation for fsub{r} and fdiv{r} when the
19891 destination register is not st(0). The Intel assembler
19892 doesn't have this brain damage. Read !SYSV386_COMPAT to
19893 figure out what the hardware really does. */
19894 if (STACK_TOP_P (operands[0]))
19895 p = "{p\t%0, %2|rp\t%2, %0}";
19896 else
19897 p = "{rp\t%2, %0|p\t%0, %2}";
19898 #else
19899 if (STACK_TOP_P (operands[0]))
19900 /* As above for fmul/fadd, we can't store to st(0). */
19901 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19902 else
19903 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19904 #endif
19905 break;
19908 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19910 #if SYSV386_COMPAT
19911 if (STACK_TOP_P (operands[0]))
19912 p = "{rp\t%0, %1|p\t%1, %0}";
19913 else
19914 p = "{p\t%1, %0|rp\t%0, %1}";
19915 #else
19916 if (STACK_TOP_P (operands[0]))
19917 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
19918 else
19919 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
19920 #endif
19921 break;
19924 if (STACK_TOP_P (operands[0]))
19926 if (STACK_TOP_P (operands[1]))
19927 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19928 else
19929 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
19930 break;
19932 else if (STACK_TOP_P (operands[1]))
19934 #if SYSV386_COMPAT
19935 p = "{\t%1, %0|r\t%0, %1}";
19936 #else
19937 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
19938 #endif
19940 else
19942 #if SYSV386_COMPAT
19943 p = "{r\t%2, %0|\t%0, %2}";
19944 #else
19945 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19946 #endif
19948 break;
19950 default:
19951 gcc_unreachable ();
19954 strcat (buf, p);
19955 return buf;
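/* Illustrative results of the above: an AVX DFmode add yields
   "vaddsd\t{%2, %1, %0|%0, %1, %2}", the non-AVX SSE form is
   "addsd\t{%2, %0|%0, %2}", and a plain 387 add with a memory
   operands[2] becomes "fadd%Z2\t%2".  */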
19958 /* Return needed mode for entity in optimize_mode_switching pass. */
19960 static int
19961 ix86_dirflag_mode_needed (rtx_insn *insn)
19963 if (CALL_P (insn))
19965 if (cfun->machine->func_type == TYPE_NORMAL)
19966 return X86_DIRFLAG_ANY;
19967 else
19968 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
19969 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19972 if (recog_memoized (insn) < 0)
19973 return X86_DIRFLAG_ANY;
19975 if (get_attr_type (insn) == TYPE_STR)
19977 /* Emit cld instruction if stringops are used in the function. */
19978 if (cfun->machine->func_type == TYPE_NORMAL)
19979 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19980 else
19981 return X86_DIRFLAG_RESET;
19984 return X86_DIRFLAG_ANY;
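/* Sketch of the intent above: string insns such as "rep movs" assume
   DF = 0.  A normal function may rely on the psABI guarantee that the
   direction flag is clear on entry, so X86_DIRFLAG_RESET is only
   requested under -mcld; an interrupt handler cannot rely on the
   interrupted context, so it always requests a reset before string
   insns and, without -mcld, before calls.  */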
19987 /* Check if a 256bit AVX register is referenced inside of EXP. */
19989 static bool
19990 ix86_check_avx256_register (const_rtx exp)
19992 if (SUBREG_P (exp))
19993 exp = SUBREG_REG (exp);
19995 return (REG_P (exp)
19996 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
19999 /* Return needed mode for entity in optimize_mode_switching pass. */
20001 static int
20002 ix86_avx_u128_mode_needed (rtx_insn *insn)
20004 if (CALL_P (insn))
20006 rtx link;
20008 /* Needed mode is set to AVX_U128_CLEAN if there are
20009 no 256bit modes used in function arguments. */
20010 for (link = CALL_INSN_FUNCTION_USAGE (insn);
20011 link;
20012 link = XEXP (link, 1))
20014 if (GET_CODE (XEXP (link, 0)) == USE)
20016 rtx arg = XEXP (XEXP (link, 0), 0);
20018 if (ix86_check_avx256_register (arg))
20019 return AVX_U128_DIRTY;
20023 return AVX_U128_CLEAN;
20026 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
20027 changes state only when a 256bit register is written to, but we need
20028 to prevent the compiler from moving the optimal insertion point above
20029 an eventual read from a 256bit register. */
20030 subrtx_iterator::array_type array;
20031 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
20032 if (ix86_check_avx256_register (*iter))
20033 return AVX_U128_DIRTY;
20035 return AVX_U128_ANY;
20038 /* Return mode that i387 must be switched into
20039 prior to the execution of insn. */
20041 static int
20042 ix86_i387_mode_needed (int entity, rtx_insn *insn)
20044 enum attr_i387_cw mode;
20046 /* The mode UNINITIALIZED is used to store the control word after a
20047 function call or ASM pattern. The mode ANY specifies that the function
20048 has no requirements on the control word and makes no changes in the
20049 bits we are interested in. */
20051 if (CALL_P (insn)
20052 || (NONJUMP_INSN_P (insn)
20053 && (asm_noperands (PATTERN (insn)) >= 0
20054 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
20055 return I387_CW_UNINITIALIZED;
20057 if (recog_memoized (insn) < 0)
20058 return I387_CW_ANY;
20060 mode = get_attr_i387_cw (insn);
20062 switch (entity)
20064 case I387_TRUNC:
20065 if (mode == I387_CW_TRUNC)
20066 return mode;
20067 break;
20069 case I387_FLOOR:
20070 if (mode == I387_CW_FLOOR)
20071 return mode;
20072 break;
20074 case I387_CEIL:
20075 if (mode == I387_CW_CEIL)
20076 return mode;
20077 break;
20079 case I387_MASK_PM:
20080 if (mode == I387_CW_MASK_PM)
20081 return mode;
20082 break;
20084 default:
20085 gcc_unreachable ();
20088 return I387_CW_ANY;
20091 /* Return mode that entity must be switched into
20092 prior to the execution of insn. */
20094 static int
20095 ix86_mode_needed (int entity, rtx_insn *insn)
20097 switch (entity)
20099 case X86_DIRFLAG:
20100 return ix86_dirflag_mode_needed (insn);
20101 case AVX_U128:
20102 return ix86_avx_u128_mode_needed (insn);
20103 case I387_TRUNC:
20104 case I387_FLOOR:
20105 case I387_CEIL:
20106 case I387_MASK_PM:
20107 return ix86_i387_mode_needed (entity, insn);
20108 default:
20109 gcc_unreachable ();
20111 return 0;
20114 /* Check if a 256bit AVX register is referenced in stores. */
20116 static void
20117 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
20119 if (ix86_check_avx256_register (dest))
20121 bool *used = (bool *) data;
20122 *used = true;
20126 /* Calculate mode of upper 128bit AVX registers after the insn. */
20128 static int
20129 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
20131 rtx pat = PATTERN (insn);
20133 if (vzeroupper_operation (pat, VOIDmode)
20134 || vzeroall_operation (pat, VOIDmode))
20135 return AVX_U128_CLEAN;
20137 /* We know that the state is clean after a CALL insn if no 256bit
20138 register is used for the function return value. */
20139 if (CALL_P (insn))
20141 bool avx_reg256_found = false;
20142 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
20144 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
20147 /* Otherwise, return the current mode. Remember that if the insn
20148 references AVX 256bit registers, the mode was already changed
20149 to DIRTY by MODE_NEEDED. */
20150 return mode;
20153 /* Return the mode that an insn results in. */
20155 static int
20156 ix86_mode_after (int entity, int mode, rtx_insn *insn)
20158 switch (entity)
20160 case X86_DIRFLAG:
20161 return mode;
20162 case AVX_U128:
20163 return ix86_avx_u128_mode_after (mode, insn);
20164 case I387_TRUNC:
20165 case I387_FLOOR:
20166 case I387_CEIL:
20167 case I387_MASK_PM:
20168 return mode;
20169 default:
20170 gcc_unreachable ();
20174 static int
20175 ix86_dirflag_mode_entry (void)
20177 /* For TARGET_CLD or in the interrupt handler we can't assume the
20178 direction flag state at function entry. */
20179 if (TARGET_CLD
20180 || cfun->machine->func_type != TYPE_NORMAL)
20181 return X86_DIRFLAG_ANY;
20183 return X86_DIRFLAG_RESET;
20186 static int
20187 ix86_avx_u128_mode_entry (void)
20189 tree arg;
20191 /* Entry mode is set to AVX_U128_DIRTY if there are
20192 256bit modes used in function arguments. */
20193 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
20194 arg = TREE_CHAIN (arg))
20196 rtx incoming = DECL_INCOMING_RTL (arg);
20198 if (incoming && ix86_check_avx256_register (incoming))
20199 return AVX_U128_DIRTY;
20202 return AVX_U128_CLEAN;
20205 /* Return a mode that ENTITY is assumed to be
20206 switched to at function entry. */
20208 static int
20209 ix86_mode_entry (int entity)
20211 switch (entity)
20213 case X86_DIRFLAG:
20214 return ix86_dirflag_mode_entry ();
20215 case AVX_U128:
20216 return ix86_avx_u128_mode_entry ();
20217 case I387_TRUNC:
20218 case I387_FLOOR:
20219 case I387_CEIL:
20220 case I387_MASK_PM:
20221 return I387_CW_ANY;
20222 default:
20223 gcc_unreachable ();
20227 static int
20228 ix86_avx_u128_mode_exit (void)
20230 rtx reg = crtl->return_rtx;
20232 /* Exit mode is set to AVX_U128_DIRTY if there are
20233 256bit modes used in the function return register. */
20234 if (reg && ix86_check_avx256_register (reg))
20235 return AVX_U128_DIRTY;
20237 return AVX_U128_CLEAN;
20240 /* Return a mode that ENTITY is assumed to be
20241 switched to at function exit. */
20243 static int
20244 ix86_mode_exit (int entity)
20246 switch (entity)
20248 case X86_DIRFLAG:
20249 return X86_DIRFLAG_ANY;
20250 case AVX_U128:
20251 return ix86_avx_u128_mode_exit ();
20252 case I387_TRUNC:
20253 case I387_FLOOR:
20254 case I387_CEIL:
20255 case I387_MASK_PM:
20256 return I387_CW_ANY;
20257 default:
20258 gcc_unreachable ();
20262 static int
20263 ix86_mode_priority (int, int n)
20265 return n;
20268 /* Output code to initialize the control word copies used by the trunc?f?i
20269 and rounding patterns. The current control word is saved in one stack
20270 slot, and a copy modified for MODE is stored in another slot. */
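/* As a concrete illustration: the x87 rounding-control field occupies
   bits 10-11 of the control word, so OR-ing in 0x0c00 selects round
   toward zero (truncation), 0x0400 selects round down and 0x0800 round
   up, which is exactly what the sequences below write into the stack
   slot that the rounding patterns later load (with an fldcw).  */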
20272 static void
20273 emit_i387_cw_initialization (int mode)
20275 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
20276 rtx new_mode;
20278 enum ix86_stack_slot slot;
20280 rtx reg = gen_reg_rtx (HImode);
20282 emit_insn (gen_x86_fnstcw_1 (stored_mode));
20283 emit_move_insn (reg, copy_rtx (stored_mode));
20285 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
20286 || optimize_insn_for_size_p ())
20288 switch (mode)
20290 case I387_CW_TRUNC:
20291 /* round toward zero (truncate) */
20292 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
20293 slot = SLOT_CW_TRUNC;
20294 break;
20296 case I387_CW_FLOOR:
20297 /* round down toward -oo */
20298 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20299 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
20300 slot = SLOT_CW_FLOOR;
20301 break;
20303 case I387_CW_CEIL:
20304 /* round up toward +oo */
20305 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20306 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
20307 slot = SLOT_CW_CEIL;
20308 break;
20310 case I387_CW_MASK_PM:
20311 /* mask precision exception for nearbyint() */
20312 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20313 slot = SLOT_CW_MASK_PM;
20314 break;
20316 default:
20317 gcc_unreachable ();
20320 else
20322 switch (mode)
20324 case I387_CW_TRUNC:
20325 /* round toward zero (truncate) */
20326 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
20327 slot = SLOT_CW_TRUNC;
20328 break;
20330 case I387_CW_FLOOR:
20331 /* round down toward -oo */
20332 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
20333 slot = SLOT_CW_FLOOR;
20334 break;
20336 case I387_CW_CEIL:
20337 /* round up toward +oo */
20338 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
20339 slot = SLOT_CW_CEIL;
20340 break;
20342 case I387_CW_MASK_PM:
20343 /* mask precision exception for nearbyint() */
20344 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20345 slot = SLOT_CW_MASK_PM;
20346 break;
20348 default:
20349 gcc_unreachable ();
20353 gcc_assert (slot < MAX_386_STACK_LOCALS);
20355 new_mode = assign_386_stack_local (HImode, slot);
20356 emit_move_insn (new_mode, reg);
20359 /* Emit vzeroupper. */
20361 void
20362 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
20364 int i;
20366 /* Cancel automatic vzeroupper insertion if there are
20367 live call-saved SSE registers at the insertion point. */
20369 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
20370 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20371 return;
20373 if (TARGET_64BIT)
20374 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
20375 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20376 return;
20378 emit_insn (gen_avx_vzeroupper ());
20383 /* Generate one or more insns to set ENTITY to MODE.  REGS_LIVE
20384 is the set of hard registers live at the point where the insn(s)
20385 are to be inserted. */
20387 static void
20388 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
20389 HARD_REG_SET regs_live)
20391 switch (entity)
20393 case X86_DIRFLAG:
20394 if (mode == X86_DIRFLAG_RESET)
20395 emit_insn (gen_cld ());
20396 break;
20397 case AVX_U128:
20398 if (mode == AVX_U128_CLEAN)
20399 ix86_avx_emit_vzeroupper (regs_live);
20400 break;
20401 case I387_TRUNC:
20402 case I387_FLOOR:
20403 case I387_CEIL:
20404 case I387_MASK_PM:
20405 if (mode != I387_CW_ANY
20406 && mode != I387_CW_UNINITIALIZED)
20407 emit_i387_cw_initialization (mode);
20408 break;
20409 default:
20410 gcc_unreachable ();
20414 /* Output code for INSN to convert a float to a signed int. OPERANDS
20415 are the insn operands. The output may be [HSD]Imode and the input
20416 operand may be [SDX]Fmode. */
20418 const char *
20419 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
20421 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20422 int dimode_p = GET_MODE (operands[0]) == DImode;
20423 int round_mode = get_attr_i387_cw (insn);
20425 /* Jump through a hoop or two for DImode, since the hardware has no
20426 non-popping instruction. We used to do this a different way, but
20427 that was somewhat fragile and broke with post-reload splitters. */
20428 if ((dimode_p || fisttp) && !stack_top_dies)
20429 output_asm_insn ("fld\t%y1", operands);
20431 gcc_assert (STACK_TOP_P (operands[1]));
20432 gcc_assert (MEM_P (operands[0]));
20433 gcc_assert (GET_MODE (operands[1]) != TFmode);
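/* If a specific rounding mode is required, the control word in
   operand 3 is loaded before the conversion and the one in operand 2
   is reloaded afterwards (presumably the replacement and the original
   control words, respectively); see the fldcw pair below.  */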
20435 if (fisttp)
20436 output_asm_insn ("fisttp%Z0\t%0", operands);
20437 else
20439 if (round_mode != I387_CW_ANY)
20440 output_asm_insn ("fldcw\t%3", operands);
20441 if (stack_top_dies || dimode_p)
20442 output_asm_insn ("fistp%Z0\t%0", operands);
20443 else
20444 output_asm_insn ("fist%Z0\t%0", operands);
20445 if (round_mode != I387_CW_ANY)
20446 output_asm_insn ("fldcw\t%2", operands);
20449 return "";
20452 /* Output code for x87 ffreep insn. The OPNO argument, which may only
20453 have the values zero or one, indicates the ffreep insn's operand
20454 from the OPERANDS array. */
20456 static const char *
20457 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
20459 if (TARGET_USE_FFREEP)
20460 #ifdef HAVE_AS_IX86_FFREEP
20461 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
20462 #else
20464 static char retval[32];
20465 int regno = REGNO (operands[opno]);
20467 gcc_assert (STACK_REGNO_P (regno));
20469 regno -= FIRST_STACK_REG;
20471 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
20472 return retval;
20474 #endif
20476 return opno ? "fstp\t%y1" : "fstp\t%y0";
20480 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
20481 should be used. UNORDERED_P is true when fucom should be used. */
20483 const char *
20484 output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p)
20486 int stack_top_dies;
20487 rtx cmp_op0, cmp_op1;
20488 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
20490 if (eflags_p)
20492 cmp_op0 = operands[0];
20493 cmp_op1 = operands[1];
20495 else
20497 cmp_op0 = operands[1];
20498 cmp_op1 = operands[2];
20501 if (is_sse)
20503 if (GET_MODE (operands[0]) == SFmode)
20504 if (unordered_p)
20505 return "%vucomiss\t{%1, %0|%0, %1}";
20506 else
20507 return "%vcomiss\t{%1, %0|%0, %1}";
20508 else
20509 if (unordered_p)
20510 return "%vucomisd\t{%1, %0|%0, %1}";
20511 else
20512 return "%vcomisd\t{%1, %0|%0, %1}";
20515 gcc_assert (STACK_TOP_P (cmp_op0));
20517 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20519 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
20521 if (stack_top_dies)
20523 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
20524 return output_387_ffreep (operands, 1);
20526 else
20527 return "ftst\n\tfnstsw\t%0";
20530 if (STACK_REG_P (cmp_op1)
20531 && stack_top_dies
20532 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
20533 && REGNO (cmp_op1) != FIRST_STACK_REG)
20535 /* If the top of the 387 stack dies, and the other operand
20536 is also a stack register that dies, then this must be a
20537 `fcompp' float compare.  */
20539 if (eflags_p)
20541 /* There is no double popping fcomi variant. Fortunately,
20542 eflags is immune from the fstp's cc clobbering. */
20543 if (unordered_p)
20544 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
20545 else
20546 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
20547 return output_387_ffreep (operands, 0);
20549 else
20551 if (unordered_p)
20552 return "fucompp\n\tfnstsw\t%0";
20553 else
20554 return "fcompp\n\tfnstsw\t%0";
20557 else
20559 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
20561 static const char * const alt[16] =
20563 "fcom%Z2\t%y2\n\tfnstsw\t%0",
20564 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
20565 "fucom%Z2\t%y2\n\tfnstsw\t%0",
20566 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
20568 "ficom%Z2\t%y2\n\tfnstsw\t%0",
20569 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
20570 NULL,
20571 NULL,
20573 "fcomi\t{%y1, %0|%0, %y1}",
20574 "fcomip\t{%y1, %0|%0, %y1}",
20575 "fucomi\t{%y1, %0|%0, %y1}",
20576 "fucomip\t{%y1, %0|%0, %y1}",
20578 NULL,
20579 NULL,
20580 NULL,
20581 NULL
20584 int mask;
20585 const char *ret;
20587 mask = eflags_p << 3;
20588 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
20589 mask |= unordered_p << 1;
20590 mask |= stack_top_dies;
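/* For example, an unordered compare of two FP stack registers through
   the flags where the stack top dies gives
   mask = (1 << 3) | (0 << 2) | (1 << 1) | 1 = 11, selecting the
   "fucomip" entry of the table above.  */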
20592 gcc_assert (mask < 16);
20593 ret = alt[mask];
20594 gcc_assert (ret);
20596 return ret;
20600 void
20601 ix86_output_addr_vec_elt (FILE *file, int value)
20603 const char *directive = ASM_LONG;
20605 #ifdef ASM_QUAD
20606 if (TARGET_LP64)
20607 directive = ASM_QUAD;
20608 #else
20609 gcc_assert (!TARGET_64BIT);
20610 #endif
20612 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
20615 void
20616 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
20618 const char *directive = ASM_LONG;
20620 #ifdef ASM_QUAD
20621 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
20622 directive = ASM_QUAD;
20623 #else
20624 gcc_assert (!TARGET_64BIT);
20625 #endif
20626 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
20627 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
20628 fprintf (file, "%s%s%d-%s%d\n",
20629 directive, LPREFIX, value, LPREFIX, rel);
20630 else if (HAVE_AS_GOTOFF_IN_DATA)
20631 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
20632 #if TARGET_MACHO
20633 else if (TARGET_MACHO)
20635 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
20636 machopic_output_function_base_name (file);
20637 putc ('\n', file);
20639 #endif
20640 else
20641 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
20642 GOT_SYMBOL_NAME, LPREFIX, value);
20645 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
20646 for the target. */
20648 void
20649 ix86_expand_clear (rtx dest)
20651 rtx tmp;
20653 /* We play register width games, which are only valid after reload. */
20654 gcc_assert (reload_completed);
20656 /* Avoid HImode and its attendant prefix byte. */
20657 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
20658 dest = gen_rtx_REG (SImode, REGNO (dest));
20659 tmp = gen_rtx_SET (dest, const0_rtx);
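/* The "xor reg, reg" form clobbers the flags, so unless TARGET_USE_MOV0
   asks for a plain "mov $0, reg" (and we are not optimizing for size)
   the SET is wrapped in a PARALLEL with an explicit FLAGS_REG clobber.  */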
20661 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
20663 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20664 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
20667 emit_insn (tmp);
20670 /* X is an unchanging MEM. If it is a constant pool reference, return
20671 the constant pool rtx, else NULL. */
20674 maybe_get_pool_constant (rtx x)
20676 x = ix86_delegitimize_address (XEXP (x, 0));
20678 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
20679 return get_pool_constant (x);
20681 return NULL_RTX;
20684 void
20685 ix86_expand_move (machine_mode mode, rtx operands[])
20687 rtx op0, op1;
20688 rtx tmp, addend = NULL_RTX;
20689 enum tls_model model;
20691 op0 = operands[0];
20692 op1 = operands[1];
20694 switch (GET_CODE (op1))
20696 case CONST:
20697 tmp = XEXP (op1, 0);
20699 if (GET_CODE (tmp) != PLUS
20700 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
20701 break;
20703 op1 = XEXP (tmp, 0);
20704 addend = XEXP (tmp, 1);
20705 /* FALLTHRU */
20707 case SYMBOL_REF:
20708 model = SYMBOL_REF_TLS_MODEL (op1);
20710 if (model)
20711 op1 = legitimize_tls_address (op1, model, true);
20712 else if (ix86_force_load_from_GOT_p (op1))
20714 /* Load the external function address via GOT slot to avoid PLT. */
20715 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
20716 (TARGET_64BIT
20717 ? UNSPEC_GOTPCREL
20718 : UNSPEC_GOT));
20719 op1 = gen_rtx_CONST (Pmode, op1);
20720 op1 = gen_const_mem (Pmode, op1);
20721 set_mem_alias_set (op1, ix86_GOT_alias_set ());
20723 else
20725 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
20726 if (tmp)
20728 op1 = tmp;
20729 if (!addend)
20730 break;
20732 else
20734 op1 = operands[1];
20735 break;
20739 if (addend)
20741 op1 = force_operand (op1, NULL_RTX);
20742 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
20743 op0, 1, OPTAB_DIRECT);
20745 else
20746 op1 = force_operand (op1, op0);
20748 if (op1 == op0)
20749 return;
20751 op1 = convert_to_mode (mode, op1, 1);
20753 default:
20754 break;
20757 if ((flag_pic || MACHOPIC_INDIRECT)
20758 && symbolic_operand (op1, mode))
20760 if (TARGET_MACHO && !TARGET_64BIT)
20762 #if TARGET_MACHO
20763 /* dynamic-no-pic */
20764 if (MACHOPIC_INDIRECT)
20766 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
20767 ? op0 : gen_reg_rtx (Pmode);
20768 op1 = machopic_indirect_data_reference (op1, temp);
20769 if (MACHOPIC_PURE)
20770 op1 = machopic_legitimize_pic_address (op1, mode,
20771 temp == op1 ? 0 : temp);
20773 if (op0 != op1 && GET_CODE (op0) != MEM)
20775 rtx insn = gen_rtx_SET (op0, op1);
20776 emit_insn (insn);
20777 return;
20779 if (GET_CODE (op0) == MEM)
20780 op1 = force_reg (Pmode, op1);
20781 else
20783 rtx temp = op0;
20784 if (GET_CODE (temp) != REG)
20785 temp = gen_reg_rtx (Pmode);
20786 temp = legitimize_pic_address (op1, temp);
20787 if (temp == op0)
20788 return;
20789 op1 = temp;
20791 /* dynamic-no-pic */
20792 #endif
20794 else
20796 if (MEM_P (op0))
20797 op1 = force_reg (mode, op1);
20798 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
20800 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
20801 op1 = legitimize_pic_address (op1, reg);
20802 if (op0 == op1)
20803 return;
20804 op1 = convert_to_mode (mode, op1, 1);
20808 else
20810 if (MEM_P (op0)
20811 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
20812 || !push_operand (op0, mode))
20813 && MEM_P (op1))
20814 op1 = force_reg (mode, op1);
20816 if (push_operand (op0, mode)
20817 && ! general_no_elim_operand (op1, mode))
20818 op1 = copy_to_mode_reg (mode, op1);
20820 /* Force large constants in 64bit compilation into register
20821 to get them CSEed. */
20822 if (can_create_pseudo_p ()
20823 && (mode == DImode) && TARGET_64BIT
20824 && immediate_operand (op1, mode)
20825 && !x86_64_zext_immediate_operand (op1, VOIDmode)
20826 && !register_operand (op0, mode)
20827 && optimize)
20828 op1 = copy_to_mode_reg (mode, op1);
20830 if (can_create_pseudo_p ()
20831 && CONST_DOUBLE_P (op1))
20833 /* If we are loading a floating point constant to a register,
20834 force the value to memory now, since we'll get better code
20835 out the back end. */
20837 op1 = validize_mem (force_const_mem (mode, op1));
20838 if (!register_operand (op0, mode))
20840 rtx temp = gen_reg_rtx (mode);
20841 emit_insn (gen_rtx_SET (temp, op1));
20842 emit_move_insn (op0, temp);
20843 return;
20848 emit_insn (gen_rtx_SET (op0, op1));
20851 void
20852 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20854 rtx op0 = operands[0], op1 = operands[1];
20855 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
20856 psABI, since its largest alignment is only 4 bytes.  */
20857 unsigned int align = (TARGET_IAMCU
20858 ? GET_MODE_BITSIZE (mode)
20859 : GET_MODE_ALIGNMENT (mode));
20861 if (push_operand (op0, VOIDmode))
20862 op0 = emit_move_resolve_push (mode, op0);
20864 /* Force constants other than zero into memory. We do not know how
20865 the instructions used to build constants modify the upper 64 bits
20866 of the register; once we have that information we may be able
20867 to handle some of them more efficiently. */
20868 if (can_create_pseudo_p ()
20869 && (CONSTANT_P (op1)
20870 || (SUBREG_P (op1)
20871 && CONSTANT_P (SUBREG_REG (op1))))
20872 && ((register_operand (op0, mode)
20873 && !standard_sse_constant_p (op1, mode))
20874 /* ix86_expand_vector_move_misalign() does not like constants. */
20875 || (SSE_REG_MODE_P (mode)
20876 && MEM_P (op0)
20877 && MEM_ALIGN (op0) < align)))
20879 if (SUBREG_P (op1))
20881 machine_mode imode = GET_MODE (SUBREG_REG (op1));
20882 rtx r = force_const_mem (imode, SUBREG_REG (op1));
20883 if (r)
20884 r = validize_mem (r);
20885 else
20886 r = force_reg (imode, SUBREG_REG (op1));
20887 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20889 else
20890 op1 = validize_mem (force_const_mem (mode, op1));
20893 /* We need to check memory alignment for SSE modes since an attribute
20894 can make operands unaligned. */
20895 if (can_create_pseudo_p ()
20896 && SSE_REG_MODE_P (mode)
20897 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20898 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20900 rtx tmp[2];
20902 /* ix86_expand_vector_move_misalign() does not like both
20903 arguments in memory. */
20904 if (!register_operand (op0, mode)
20905 && !register_operand (op1, mode))
20906 op1 = force_reg (mode, op1);
20908 tmp[0] = op0; tmp[1] = op1;
20909 ix86_expand_vector_move_misalign (mode, tmp);
20910 return;
20913 /* Make operand1 a register if it isn't already. */
20914 if (can_create_pseudo_p ()
20915 && !register_operand (op0, mode)
20916 && !register_operand (op1, mode))
20918 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20919 return;
20922 emit_insn (gen_rtx_SET (op0, op1));
20925 /* Split 32-byte AVX unaligned load and store if needed. */
20927 static void
20928 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20930 rtx m;
20931 rtx (*extract) (rtx, rtx, rtx);
20932 machine_mode mode;
20934 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20935 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20937 emit_insn (gen_rtx_SET (op0, op1));
20938 return;
20941 rtx orig_op0 = NULL_RTX;
20942 mode = GET_MODE (op0);
20943 switch (GET_MODE_CLASS (mode))
20945 case MODE_VECTOR_INT:
20946 case MODE_INT:
20947 if (mode != V32QImode)
20949 if (!MEM_P (op0))
20951 orig_op0 = op0;
20952 op0 = gen_reg_rtx (V32QImode);
20954 else
20955 op0 = gen_lowpart (V32QImode, op0);
20956 op1 = gen_lowpart (V32QImode, op1);
20957 mode = V32QImode;
20959 break;
20960 case MODE_VECTOR_FLOAT:
20961 break;
20962 default:
20963 gcc_unreachable ();
20966 switch (mode)
20968 default:
20969 gcc_unreachable ();
20970 case V32QImode:
20971 extract = gen_avx_vextractf128v32qi;
20972 mode = V16QImode;
20973 break;
20974 case V8SFmode:
20975 extract = gen_avx_vextractf128v8sf;
20976 mode = V4SFmode;
20977 break;
20978 case V4DFmode:
20979 extract = gen_avx_vextractf128v4df;
20980 mode = V2DFmode;
20981 break;
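/* From this point MODE is the 128-bit half mode.  A misaligned load
   is expanded as two 128-bit loads whose VEC_CONCAT forms the 256-bit
   result; a misaligned store uses two vextractf128 stores of the low
   and high halves.  */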
20984 if (MEM_P (op1))
20986 rtx r = gen_reg_rtx (mode);
20987 m = adjust_address (op1, mode, 0);
20988 emit_move_insn (r, m);
20989 m = adjust_address (op1, mode, 16);
20990 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20991 emit_move_insn (op0, r);
20993 else if (MEM_P (op0))
20995 m = adjust_address (op0, mode, 0);
20996 emit_insn (extract (m, op1, const0_rtx));
20997 m = adjust_address (op0, mode, 16);
20998 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
21000 else
21001 gcc_unreachable ();
21003 if (orig_op0)
21004 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
21007 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
21008 straight to ix86_expand_vector_move. */
21009 /* Code generation for scalar reg-reg moves of single and double precision data:
21010 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
21011 movaps reg, reg
21012 else
21013 movss reg, reg
21014 if (x86_sse_partial_reg_dependency == true)
21015 movapd reg, reg
21016 else
21017 movsd reg, reg
21019 Code generation for scalar loads of double precision data:
21020 if (x86_sse_split_regs == true)
21021 movlpd mem, reg (gas syntax)
21022 else
21023 movsd mem, reg
21025 Code generation for unaligned packed loads of single precision data
21026 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
21027 if (x86_sse_unaligned_move_optimal)
21028 movups mem, reg
21030 if (x86_sse_partial_reg_dependency == true)
21032 xorps reg, reg
21033 movlps mem, reg
21034 movhps mem+8, reg
21036 else
21038 movlps mem, reg
21039 movhps mem+8, reg
21042 Code generation for unaligned packed loads of double precision data
21043 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
21044 if (x86_sse_unaligned_move_optimal)
21045 movupd mem, reg
21047 if (x86_sse_split_regs == true)
21049 movlpd mem, reg
21050 movhpd mem+8, reg
21052 else
21054 movsd mem, reg
21055 movhpd mem+8, reg
21059 void
21060 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
21062 rtx op0, op1, m;
21064 op0 = operands[0];
21065 op1 = operands[1];
21067 /* Use unaligned load/store for AVX512 or when optimizing for size. */
21068 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
21070 emit_insn (gen_rtx_SET (op0, op1));
21071 return;
21074 if (TARGET_AVX)
21076 if (GET_MODE_SIZE (mode) == 32)
21077 ix86_avx256_split_vector_move_misalign (op0, op1);
21078 else
21079 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
21080 emit_insn (gen_rtx_SET (op0, op1));
21081 return;
21084 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
21085 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
21087 emit_insn (gen_rtx_SET (op0, op1));
21088 return;
21091 /* ??? If we have typed data, then it would appear that using
21092 movdqu is the only way to get unaligned data loaded with
21093 integer type. */
21094 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
21096 emit_insn (gen_rtx_SET (op0, op1));
21097 return;
21100 if (MEM_P (op1))
21102 if (TARGET_SSE2 && mode == V2DFmode)
21104 rtx zero;
21106 /* When SSE registers are split into halves, we can avoid
21107 writing to the top half twice. */
21108 if (TARGET_SSE_SPLIT_REGS)
21110 emit_clobber (op0);
21111 zero = op0;
21113 else
21115 /* ??? Not sure about the best option for the Intel chips.
21116 The following would seem to satisfy; the register is
21117 entirely cleared, breaking the dependency chain. We
21118 then store to the upper half, with a dependency depth
21119 of one. A rumor has it that Intel recommends two movsd
21120 followed by an unpacklpd, but this is unconfirmed. And
21121 given that the dependency depth of the unpacklpd would
21122 still be one, I'm not sure why this would be better. */
21123 zero = CONST0_RTX (V2DFmode);
21126 m = adjust_address (op1, DFmode, 0);
21127 emit_insn (gen_sse2_loadlpd (op0, zero, m));
21128 m = adjust_address (op1, DFmode, 8);
21129 emit_insn (gen_sse2_loadhpd (op0, op0, m));
21131 else
21133 rtx t;
21135 if (mode != V4SFmode)
21136 t = gen_reg_rtx (V4SFmode);
21137 else
21138 t = op0;
21140 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
21141 emit_move_insn (t, CONST0_RTX (V4SFmode));
21142 else
21143 emit_clobber (t);
21145 m = adjust_address (op1, V2SFmode, 0);
21146 emit_insn (gen_sse_loadlps (t, t, m));
21147 m = adjust_address (op1, V2SFmode, 8);
21148 emit_insn (gen_sse_loadhps (t, t, m));
21149 if (mode != V4SFmode)
21150 emit_move_insn (op0, gen_lowpart (mode, t));
21153 else if (MEM_P (op0))
21155 if (TARGET_SSE2 && mode == V2DFmode)
21157 m = adjust_address (op0, DFmode, 0);
21158 emit_insn (gen_sse2_storelpd (m, op1));
21159 m = adjust_address (op0, DFmode, 8);
21160 emit_insn (gen_sse2_storehpd (m, op1));
21162 else
21164 if (mode != V4SFmode)
21165 op1 = gen_lowpart (V4SFmode, op1);
21167 m = adjust_address (op0, V2SFmode, 0);
21168 emit_insn (gen_sse_storelps (m, op1));
21169 m = adjust_address (op0, V2SFmode, 8);
21170 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
21173 else
21174 gcc_unreachable ();
21177 /* Helper function of ix86_fixup_binary_operands to canonicalize
21178 operand order. Returns true if the operands should be swapped. */
21180 static bool
21181 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
21182 rtx operands[])
21184 rtx dst = operands[0];
21185 rtx src1 = operands[1];
21186 rtx src2 = operands[2];
21188 /* If the operation is not commutative, we can't do anything. */
21189 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
21190 return false;
21192 /* Highest priority is that src1 should match dst. */
21193 if (rtx_equal_p (dst, src1))
21194 return false;
21195 if (rtx_equal_p (dst, src2))
21196 return true;
21198 /* Next highest priority is that immediate constants come second. */
21199 if (immediate_operand (src2, mode))
21200 return false;
21201 if (immediate_operand (src1, mode))
21202 return true;
21204 /* Lowest priority is that memory references should come second. */
21205 if (MEM_P (src2))
21206 return false;
21207 if (MEM_P (src1))
21208 return true;
21210 return false;
21214 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
21215 destination to use for the operation. If different from the true
21216 destination in operands[0], a copy operation will be required. */
21219 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
21220 rtx operands[])
21222 rtx dst = operands[0];
21223 rtx src1 = operands[1];
21224 rtx src2 = operands[2];
21226 /* Canonicalize operand order. */
21227 if (ix86_swap_binary_operands_p (code, mode, operands))
21229 /* It is invalid to swap operands of different modes. */
21230 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
21232 std::swap (src1, src2);
21235 /* Both source operands cannot be in memory. */
21236 if (MEM_P (src1) && MEM_P (src2))
21238 /* Optimization: Only read from memory once. */
21239 if (rtx_equal_p (src1, src2))
21241 src2 = force_reg (mode, src2);
21242 src1 = src2;
21244 else if (rtx_equal_p (dst, src1))
21245 src2 = force_reg (mode, src2);
21246 else
21247 src1 = force_reg (mode, src1);
21250 /* If the destination is memory, and we do not have matching source
21251 operands, do things in registers. */
21252 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21253 dst = gen_reg_rtx (mode);
21255 /* Source 1 cannot be a constant. */
21256 if (CONSTANT_P (src1))
21257 src1 = force_reg (mode, src1);
21259 /* Source 1 cannot be a non-matching memory. */
21260 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21261 src1 = force_reg (mode, src1);
21263 /* Improve address combine. */
21264 if (code == PLUS
21265 && GET_MODE_CLASS (mode) == MODE_INT
21266 && MEM_P (src2))
21267 src2 = force_reg (mode, src2);
21269 operands[1] = src1;
21270 operands[2] = src2;
21271 return dst;
21274 /* Similarly, but assume that the destination has already been
21275 set up properly. */
21277 void
21278 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
21279 machine_mode mode, rtx operands[])
21281 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
21282 gcc_assert (dst == operands[0]);
21285 /* Attempt to expand a binary operator. Make the expansion closer to the
21286 actual machine than just general_operand, which would allow 3 separate
21287 memory references (one output, two input) in a single insn. */
21289 void
21290 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
21291 rtx operands[])
21293 rtx src1, src2, dst, op, clob;
21295 dst = ix86_fixup_binary_operands (code, mode, operands);
21296 src1 = operands[1];
21297 src2 = operands[2];
21299 /* Emit the instruction. */
21301 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
21303 if (reload_completed
21304 && code == PLUS
21305 && !rtx_equal_p (dst, src1))
21307 /* This is going to be an LEA; avoid splitting it later. */
21308 emit_insn (op);
21310 else
21312 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21313 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21316 /* Fix up the destination if needed. */
21317 if (dst != operands[0])
21318 emit_move_insn (operands[0], dst);
21321 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
21322 the given OPERANDS. */
21324 void
21325 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
21326 rtx operands[])
21328 rtx op1 = NULL_RTX, op2 = NULL_RTX;
21329 if (SUBREG_P (operands[1]))
21331 op1 = operands[1];
21332 op2 = operands[2];
21334 else if (SUBREG_P (operands[2]))
21336 op1 = operands[2];
21337 op2 = operands[1];
21339 /* Optimize (__m128i) d | (__m128i) e and similar code
21340 when d and e are float vectors into float vector logical
21341 insn. In C/C++ without using intrinsics there is no other way
21342 to express vector logical operation on float vectors than
21343 to cast them temporarily to integer vectors. */
21344 if (op1
21345 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
21346 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
21347 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
21348 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
21349 && SUBREG_BYTE (op1) == 0
21350 && (GET_CODE (op2) == CONST_VECTOR
21351 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
21352 && SUBREG_BYTE (op2) == 0))
21353 && can_create_pseudo_p ())
21355 rtx dst;
21356 switch (GET_MODE (SUBREG_REG (op1)))
21358 case V4SFmode:
21359 case V8SFmode:
21360 case V16SFmode:
21361 case V2DFmode:
21362 case V4DFmode:
21363 case V8DFmode:
21364 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
21365 if (GET_CODE (op2) == CONST_VECTOR)
21367 op2 = gen_lowpart (GET_MODE (dst), op2);
21368 op2 = force_reg (GET_MODE (dst), op2);
21370 else
21372 op1 = operands[1];
21373 op2 = SUBREG_REG (operands[2]);
21374 if (!vector_operand (op2, GET_MODE (dst)))
21375 op2 = force_reg (GET_MODE (dst), op2);
21377 op1 = SUBREG_REG (op1);
21378 if (!vector_operand (op1, GET_MODE (dst)))
21379 op1 = force_reg (GET_MODE (dst), op1);
21380 emit_insn (gen_rtx_SET (dst,
21381 gen_rtx_fmt_ee (code, GET_MODE (dst),
21382 op1, op2)));
21383 emit_move_insn (operands[0], gen_lowpart (mode, dst));
21384 return;
21385 default:
21386 break;
21389 if (!vector_operand (operands[1], mode))
21390 operands[1] = force_reg (mode, operands[1]);
21391 if (!vector_operand (operands[2], mode))
21392 operands[2] = force_reg (mode, operands[2]);
21393 ix86_fixup_binary_operands_no_copy (code, mode, operands);
21394 emit_insn (gen_rtx_SET (operands[0],
21395 gen_rtx_fmt_ee (code, mode, operands[1],
21396 operands[2])));
21399 /* Return TRUE or FALSE depending on whether the binary operator meets the
21400 appropriate constraints. */
21402 bool
21403 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
21404 rtx operands[3])
21406 rtx dst = operands[0];
21407 rtx src1 = operands[1];
21408 rtx src2 = operands[2];
21410 /* Both source operands cannot be in memory. */
21411 if (MEM_P (src1) && MEM_P (src2))
21412 return false;
21414 /* Canonicalize operand order for commutative operators. */
21415 if (ix86_swap_binary_operands_p (code, mode, operands))
21416 std::swap (src1, src2);
21418 /* If the destination is memory, we must have a matching source operand. */
21419 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21420 return false;
21422 /* Source 1 cannot be a constant. */
21423 if (CONSTANT_P (src1))
21424 return false;
21426 /* Source 1 cannot be a non-matching memory. */
21427 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21428 /* Support "andhi/andsi/anddi" as a zero-extending move. */
21429 return (code == AND
21430 && (mode == HImode
21431 || mode == SImode
21432 || (TARGET_64BIT && mode == DImode))
21433 && satisfies_constraint_L (src2));
21435 return true;
21438 /* Attempt to expand a unary operator. Make the expansion closer to the
21439 actual machine than just general_operand, which would allow 2 separate
21440 memory references (one output, one input) in a single insn. */
21442 void
21443 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
21444 rtx operands[])
21446 bool matching_memory = false;
21447 rtx src, dst, op, clob;
21449 dst = operands[0];
21450 src = operands[1];
21452 /* If the destination is memory, and we do not have matching source
21453 operands, do things in registers. */
21454 if (MEM_P (dst))
21456 if (rtx_equal_p (dst, src))
21457 matching_memory = true;
21458 else
21459 dst = gen_reg_rtx (mode);
21462 /* When source operand is memory, destination must match. */
21463 if (MEM_P (src) && !matching_memory)
21464 src = force_reg (mode, src);
21466 /* Emit the instruction. */
21468 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
21470 if (code == NOT)
21471 emit_insn (op);
21472 else
21474 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21475 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21478 /* Fix up the destination if needed. */
21479 if (dst != operands[0])
21480 emit_move_insn (operands[0], dst);
21483 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
21484 divisor are within the range [0-255]. */
21486 void
21487 ix86_split_idivmod (machine_mode mode, rtx operands[],
21488 bool signed_p)
21490 rtx_code_label *end_label, *qimode_label;
21491 rtx div, mod;
21492 rtx_insn *insn;
21493 rtx scratch, tmp0, tmp1, tmp2;
21494 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
21495 rtx (*gen_zero_extend) (rtx, rtx);
21496 rtx (*gen_test_ccno_1) (rtx, rtx);
21498 switch (mode)
21500 case SImode:
21501 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
21502 gen_test_ccno_1 = gen_testsi_ccno_1;
21503 gen_zero_extend = gen_zero_extendqisi2;
21504 break;
21505 case DImode:
21506 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
21507 gen_test_ccno_1 = gen_testdi_ccno_1;
21508 gen_zero_extend = gen_zero_extendqidi2;
21509 break;
21510 default:
21511 gcc_unreachable ();
21514 end_label = gen_label_rtx ();
21515 qimode_label = gen_label_rtx ();
21517 scratch = gen_reg_rtx (mode);
21519 /* Use 8bit unsigned divmod if dividend and divisor are within
21520 the range [0-255]. */
21521 emit_move_insn (scratch, operands[2]);
21522 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
21523 scratch, 1, OPTAB_DIRECT);
21524 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
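/* To illustrate the test above: OR-ing dividend and divisor and
   testing the result against -0x100 (~0xff) sets ZF only when every
   bit above bit 7 is clear in both operands, i.e. both values lie in
   [0, 255].  */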
21525 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
21526 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
21527 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
21528 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
21529 pc_rtx);
21530 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
21531 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21532 JUMP_LABEL (insn) = qimode_label;
21534 /* Generate the original signed/unsigned divmod. */
21535 div = gen_divmod4_1 (operands[0], operands[1],
21536 operands[2], operands[3]);
21537 emit_insn (div);
21539 /* Branch to the end. */
21540 emit_jump_insn (gen_jump (end_label));
21541 emit_barrier ();
21543 /* Generate 8bit unsigned divide. */
21544 emit_label (qimode_label);
21545 /* Don't use operands[0] for result of 8bit divide since not all
21546 registers support QImode ZERO_EXTRACT. */
21547 tmp0 = lowpart_subreg (HImode, scratch, mode);
21548 tmp1 = lowpart_subreg (HImode, operands[2], mode);
21549 tmp2 = lowpart_subreg (QImode, operands[3], mode);
21550 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
21552 if (signed_p)
21554 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
21555 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
21557 else
21559 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
21560 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
21563 /* Extract remainder from AH. */
21564 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
21565 if (REG_P (operands[1]))
21566 insn = emit_move_insn (operands[1], tmp1);
21567 else
21569 /* Need a new scratch register since the old one has result
21570 of 8bit divide. */
21571 scratch = gen_reg_rtx (mode);
21572 emit_move_insn (scratch, tmp1);
21573 insn = emit_move_insn (operands[1], scratch);
21575 set_unique_reg_note (insn, REG_EQUAL, mod);
21577 /* Zero extend quotient from AL. */
21578 tmp1 = gen_lowpart (QImode, tmp0);
21579 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
21580 set_unique_reg_note (insn, REG_EQUAL, div);
21582 emit_label (end_label);
21585 #define LEA_MAX_STALL (3)
21586 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
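/* The distances computed by the helpers below are measured in
   half-cycles (see increase_distance) and are converted back to full
   cycles with a final ">> 1" before being compared against
   LEA_MAX_STALL.  */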
21588 /* Increase given DISTANCE in half-cycles according to
21589 dependencies between PREV and NEXT instructions.
21590 Add 1 half-cycle if there is no dependency and
21591 go to the next cycle if there is some dependency.  */
21593 static unsigned int
21594 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
21596 df_ref def, use;
21598 if (!prev || !next)
21599 return distance + (distance & 1) + 2;
21601 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
21602 return distance + 1;
21604 FOR_EACH_INSN_USE (use, next)
21605 FOR_EACH_INSN_DEF (def, prev)
21606 if (!DF_REF_IS_ARTIFICIAL (def)
21607 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
21608 return distance + (distance & 1) + 2;
21610 return distance + 1;
21613 /* Function checks if instruction INSN defines register number
21614 REGNO1 or REGNO2. */
21616 static bool
21617 insn_defines_reg (unsigned int regno1, unsigned int regno2,
21618 rtx_insn *insn)
21620 df_ref def;
21622 FOR_EACH_INSN_DEF (def, insn)
21623 if (DF_REF_REG_DEF_P (def)
21624 && !DF_REF_IS_ARTIFICIAL (def)
21625 && (regno1 == DF_REF_REGNO (def)
21626 || regno2 == DF_REF_REGNO (def)))
21627 return true;
21629 return false;
21632 /* Function checks if instruction INSN uses register number
21633 REGNO as a part of address expression. */
21635 static bool
21636 insn_uses_reg_mem (unsigned int regno, rtx insn)
21638 df_ref use;
21640 FOR_EACH_INSN_USE (use, insn)
21641 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
21642 return true;
21644 return false;
21647 /* Search backward for non-agu definition of register number REGNO1
21648 or register number REGNO2 in basic block starting from instruction
21649 START up to head of basic block or instruction INSN.
21651 Function puts true value into *FOUND var if definition was found
21652 and false otherwise.
21654 Distance in half-cycles between START and found instruction or head
21655 of BB is added to DISTANCE and returned. */
21657 static int
21658 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
21659 rtx_insn *insn, int distance,
21660 rtx_insn *start, bool *found)
21662 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
21663 rtx_insn *prev = start;
21664 rtx_insn *next = NULL;
21666 *found = false;
21668 while (prev
21669 && prev != insn
21670 && distance < LEA_SEARCH_THRESHOLD)
21672 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
21674 distance = increase_distance (prev, next, distance);
21675 if (insn_defines_reg (regno1, regno2, prev))
21677 if (recog_memoized (prev) < 0
21678 || get_attr_type (prev) != TYPE_LEA)
21680 *found = true;
21681 return distance;
21685 next = prev;
21687 if (prev == BB_HEAD (bb))
21688 break;
21690 prev = PREV_INSN (prev);
21693 return distance;
21696 /* Search backward for non-agu definition of register number REGNO1
21697 or register number REGNO2 in INSN's basic block until
21698 1. Pass LEA_SEARCH_THRESHOLD instructions, or
21699 2. Reach neighbor BBs boundary, or
21700 3. Reach agu definition.
21701 Returns the distance between the non-agu definition point and INSN.
21702 If no definition point, returns -1. */
21704 static int
21705 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
21706 rtx_insn *insn)
21708 basic_block bb = BLOCK_FOR_INSN (insn);
21709 int distance = 0;
21710 bool found = false;
21712 if (insn != BB_HEAD (bb))
21713 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
21714 distance, PREV_INSN (insn),
21715 &found);
21717 if (!found && distance < LEA_SEARCH_THRESHOLD)
21719 edge e;
21720 edge_iterator ei;
21721 bool simple_loop = false;
21723 FOR_EACH_EDGE (e, ei, bb->preds)
21724 if (e->src == bb)
21726 simple_loop = true;
21727 break;
21730 if (simple_loop)
21731 distance = distance_non_agu_define_in_bb (regno1, regno2,
21732 insn, distance,
21733 BB_END (bb), &found);
21734 else
21736 int shortest_dist = -1;
21737 bool found_in_bb = false;
21739 FOR_EACH_EDGE (e, ei, bb->preds)
21741 int bb_dist
21742 = distance_non_agu_define_in_bb (regno1, regno2,
21743 insn, distance,
21744 BB_END (e->src),
21745 &found_in_bb);
21746 if (found_in_bb)
21748 if (shortest_dist < 0)
21749 shortest_dist = bb_dist;
21750 else if (bb_dist > 0)
21751 shortest_dist = MIN (bb_dist, shortest_dist);
21753 found = true;
21757 distance = shortest_dist;
21761 /* get_attr_type may modify recog data. We want to make sure
21762 that recog data is valid for instruction INSN, on which
21763 distance_non_agu_define is called. INSN is unchanged here. */
21764 extract_insn_cached (insn);
21766 if (!found)
21767 return -1;
21769 return distance >> 1;
21772 /* Return the distance in half-cycles between INSN and the next
21773 insn that uses register number REGNO in a memory address, added
21774 to DISTANCE.  Return -1 if REGNO is set.
21776 Put true value into *FOUND if register usage was found and
21777 false otherwise.
21778 Put true value into *REDEFINED if register redefinition was
21779 found and false otherwise. */
21781 static int
21782 distance_agu_use_in_bb (unsigned int regno,
21783 rtx_insn *insn, int distance, rtx_insn *start,
21784 bool *found, bool *redefined)
21786 basic_block bb = NULL;
21787 rtx_insn *next = start;
21788 rtx_insn *prev = NULL;
21790 *found = false;
21791 *redefined = false;
21793 if (start != NULL_RTX)
21795 bb = BLOCK_FOR_INSN (start);
21796 if (start != BB_HEAD (bb))
21797 /* If insn and start belong to the same bb, set prev to insn,
21798 so the call to increase_distance will increase the distance
21799 between insns by 1. */
21800 prev = insn;
21803 while (next
21804 && next != insn
21805 && distance < LEA_SEARCH_THRESHOLD)
21807 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
21809 distance = increase_distance (prev, next, distance);
21810 if (insn_uses_reg_mem (regno, next))
21812 /* Return DISTANCE if OP0 is used in memory
21813 address in NEXT. */
21814 *found = true;
21815 return distance;
21818 if (insn_defines_reg (regno, INVALID_REGNUM, next))
21820 /* Return -1 if OP0 is set in NEXT. */
21821 *redefined = true;
21822 return -1;
21825 prev = next;
21828 if (next == BB_END (bb))
21829 break;
21831 next = NEXT_INSN (next);
21834 return distance;
21837 /* Return the distance between INSN and the next insn that uses
21838 register number REGNO0 in a memory address.  Return -1 if no such
21839 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
21841 static int
21842 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21844 basic_block bb = BLOCK_FOR_INSN (insn);
21845 int distance = 0;
21846 bool found = false;
21847 bool redefined = false;
21849 if (insn != BB_END (bb))
21850 distance = distance_agu_use_in_bb (regno0, insn, distance,
21851 NEXT_INSN (insn),
21852 &found, &redefined);
21854 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21856 edge e;
21857 edge_iterator ei;
21858 bool simple_loop = false;
21860 FOR_EACH_EDGE (e, ei, bb->succs)
21861 if (e->dest == bb)
21863 simple_loop = true;
21864 break;
21867 if (simple_loop)
21868 distance = distance_agu_use_in_bb (regno0, insn,
21869 distance, BB_HEAD (bb),
21870 &found, &redefined);
21871 else
21873 int shortest_dist = -1;
21874 bool found_in_bb = false;
21875 bool redefined_in_bb = false;
21877 FOR_EACH_EDGE (e, ei, bb->succs)
21879 int bb_dist
21880 = distance_agu_use_in_bb (regno0, insn,
21881 distance, BB_HEAD (e->dest),
21882 &found_in_bb, &redefined_in_bb);
21883 if (found_in_bb)
21885 if (shortest_dist < 0)
21886 shortest_dist = bb_dist;
21887 else if (bb_dist > 0)
21888 shortest_dist = MIN (bb_dist, shortest_dist);
21890 found = true;
21894 distance = shortest_dist;
21898 if (!found || redefined)
21899 return -1;
21901 return distance >> 1;
21904 /* Define this macro to tune LEA priority vs ADD; it takes effect when
21905 there is a dilemma of choosing LEA or ADD.
21906 Negative value: ADD is preferred over LEA
21907 Zero: neutral
21908 Positive value: LEA is preferred over ADD  */
21909 #define IX86_LEA_PRIORITY 0
21911 /* Return true if use of the lea INSN has a performance advantage
21912 over a sequence of instructions.  The instruction sequence has
21913 SPLIT_COST cycles higher latency than the lea latency.  */
21915 static bool
21916 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21917 unsigned int regno2, int split_cost, bool has_scale)
21919 int dist_define, dist_use;
21921 /* For Silvermont, if a 2-source or 3-source LEA is used for a
21922 non-destructive destination, or because the ability to use
21923 SCALE is wanted, the use of LEA is justified.  */
21924 if (TARGET_SILVERMONT || TARGET_INTEL)
21926 if (has_scale)
21927 return true;
21928 if (split_cost < 1)
21929 return false;
21930 if (regno0 == regno1 || regno0 == regno2)
21931 return false;
21932 return true;
21935 dist_define = distance_non_agu_define (regno1, regno2, insn);
21936 dist_use = distance_agu_use (regno0, insn);
21938 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21940 /* If there is no non-AGU operand definition, no AGU
21941 operand use, and the split cost is 0, then both the lea
21942 and non-lea variants have the same priority.  Currently
21943 we prefer lea for 64-bit code and non-lea for 32-bit
21944 code.  */
21945 if (dist_use < 0 && split_cost == 0)
21946 return TARGET_64BIT || IX86_LEA_PRIORITY;
21947 else
21948 return true;
21951 /* With a longer definition distance, lea is more preferable.
21952 Here we adjust it to take into account the splitting cost and
21953 the lea priority.  */
21954 dist_define += split_cost + IX86_LEA_PRIORITY;
21956 /* If there is no use in a memory address then we just check
21957 that split cost exceeds AGU stall. */
21958 if (dist_use < 0)
21959 return dist_define > LEA_MAX_STALL;
21961 /* If this insn has both backward non-agu dependence and forward
21962 agu dependence, the one with short distance takes effect. */
21963 return dist_define >= dist_use;
21966 /* Return true if it is legal to clobber flags by INSN and
21967 false otherwise. */
21969 static bool
21970 ix86_ok_to_clobber_flags (rtx_insn *insn)
21972 basic_block bb = BLOCK_FOR_INSN (insn);
21973 df_ref use;
21974 bitmap live;
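/* Scan forward from INSN to the end of its basic block: a use of the
   flags register before any new definition means the flags are live
   and must not be clobbered, while a definition makes them dead at
   INSN.  If the block end is reached, fall back to the live-out set.  */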
21976 while (insn)
21978 if (NONDEBUG_INSN_P (insn))
21980 FOR_EACH_INSN_USE (use, insn)
21981 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21982 return false;
21984 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21985 return true;
21988 if (insn == BB_END (bb))
21989 break;
21991 insn = NEXT_INSN (insn);
21994 live = df_get_live_out (bb);
21995 return !REGNO_REG_SET_P (live, FLAGS_REG);
21998 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21999 move and add to avoid AGU stalls. */
22001 bool
22002 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
22004 unsigned int regno0, regno1, regno2;
22006 /* Check if we need to optimize. */
22007 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22008 return false;
22010 /* Check it is correct to split here. */
22011 if (!ix86_ok_to_clobber_flags (insn))
22012 return false;
22014 regno0 = true_regnum (operands[0]);
22015 regno1 = true_regnum (operands[1]);
22016 regno2 = true_regnum (operands[2]);
22018 /* We need to split only adds with a non-destructive
22019 destination operand. */
22020 if (regno0 == regno1 || regno0 == regno2)
22021 return false;
22022 else
22023 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
22026 /* Return true if we should emit lea instruction instead of mov
22027 instruction. */
22029 bool
22030 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
22032 unsigned int regno0, regno1;
22034 /* Check if we need to optimize. */
22035 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22036 return false;
22038 /* Use lea for reg to reg moves only. */
22039 if (!REG_P (operands[0]) || !REG_P (operands[1]))
22040 return false;
22042 regno0 = true_regnum (operands[0]);
22043 regno1 = true_regnum (operands[1]);
22045 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
22048 /* Return true if we need to split lea into a sequence of
22049 instructions to avoid AGU stalls. */
22051 bool
22052 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
22054 unsigned int regno0, regno1, regno2;
22055 int split_cost;
22056 struct ix86_address parts;
22057 int ok;
22059 /* Check we need to optimize. */
22060 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
22061 return false;
22063 /* The "at least two components" test below might not catch simple
22064 move or zero extension insns if parts.base is non-NULL and parts.disp
22065 is const0_rtx as the only components in the address, e.g. if the
22066 register is %rbp or %r13. As this test is much cheaper and moves or
22067 zero extensions are the common case, do this check first. */
22068 if (REG_P (operands[1])
22069 || (SImode_address_operand (operands[1], VOIDmode)
22070 && REG_P (XEXP (operands[1], 0))))
22071 return false;
22073 /* Check if it is OK to split here. */
22074 if (!ix86_ok_to_clobber_flags (insn))
22075 return false;
22077 ok = ix86_decompose_address (operands[1], &parts);
22078 gcc_assert (ok);
22080 /* There should be at least two components in the address. */
22081 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
22082 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
22083 return false;
22085 /* We should not split into add if a non-legitimate PIC
22086 operand is used as the displacement. */
22087 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
22088 return false;
22090 regno0 = true_regnum (operands[0]);
22091 regno1 = INVALID_REGNUM;
22092 regno2 = INVALID_REGNUM;
22094 if (parts.base)
22095 regno1 = true_regnum (parts.base);
22096 if (parts.index)
22097 regno2 = true_regnum (parts.index);
22099 split_cost = 0;
22101 /* Compute how many cycles we will add to execution time
22102 if split lea into a sequence of instructions. */
22103 if (parts.base || parts.index)
22105 /* Have to use a mov instruction if the non-destructive
22106 destination form is used. */
22107 if (regno1 != regno0 && regno2 != regno0)
22108 split_cost += 1;
22110 /* Have to add index to base if both exist. */
22111 if (parts.base && parts.index)
22112 split_cost += 1;
22114 /* Have to use shift and adds if scale is 2 or greater. */
22115 if (parts.scale > 1)
22117 if (regno0 != regno1)
22118 split_cost += 1;
22119 else if (regno2 == regno0)
22120 split_cost += 4;
22121 else
22122 split_cost += parts.scale;
22125 /* Have to use add instruction with immediate if
22126 disp is non zero. */
22127 if (parts.disp && parts.disp != const0_rtx)
22128 split_cost += 1;
22130 /* Subtract the price of lea. */
22131 split_cost -= 1;
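/* A worked example: splitting "lea 4(%rbx,%rcx,2), %rax" needs a mov
   of the index, a shift for the scale, an add of the base and an add
   of the displacement, so split_cost works out to
   1 + 1 + 1 + 1 - 1 = 3 relative to the single lea.  */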
22134 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
22135 parts.scale > 1);
22138 /* Emit x86 binary operand CODE in mode MODE, where the first operand
22139 matches destination. RTX includes clobber of FLAGS_REG. */
22141 static void
22142 ix86_emit_binop (enum rtx_code code, machine_mode mode,
22143 rtx dst, rtx src)
22145 rtx op, clob;
22147 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
22148 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22150 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
22153 /* Return true if regno1 def is nearest to the insn. */
22155 static bool
22156 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
22158 rtx_insn *prev = insn;
22159 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
22161 if (insn == start)
22162 return false;
22163 while (prev && prev != start)
22165 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
22167 prev = PREV_INSN (prev);
22168 continue;
22170 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
22171 return true;
22172 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
22173 return false;
22174 prev = PREV_INSN (prev);
22177 /* None of the regs is defined in the bb. */
22178 return false;
22181 /* Split lea instructions into a sequence of instructions
22182 which are executed on ALU to avoid AGU stalls.
22183 It is assumed that it is allowed to clobber flags register
22184 at lea position. */
22186 void
22187 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
22189 unsigned int regno0, regno1, regno2;
22190 struct ix86_address parts;
22191 rtx target, tmp;
22192 int ok, adds;
22194 ok = ix86_decompose_address (operands[1], &parts);
22195 gcc_assert (ok);
22197 target = gen_lowpart (mode, operands[0]);
22199 regno0 = true_regnum (target);
22200 regno1 = INVALID_REGNUM;
22201 regno2 = INVALID_REGNUM;
22203 if (parts.base)
22205 parts.base = gen_lowpart (mode, parts.base);
22206 regno1 = true_regnum (parts.base);
22209 if (parts.index)
22211 parts.index = gen_lowpart (mode, parts.index);
22212 regno2 = true_regnum (parts.index);
22215 if (parts.disp)
22216 parts.disp = gen_lowpart (mode, parts.disp);
22218 if (parts.scale > 1)
22220 /* Case r1 = r1 + ... */
22221 if (regno1 == regno0)
22223 /* If we have the case r1 = r1 + C * r2 then we
22224 would have to use multiplication, which is very
22225 expensive.  Assume the cost model is wrong if we
22226 reach such a case here.  */
22227 gcc_assert (regno2 != regno0);
22229 for (adds = parts.scale; adds > 0; adds--)
22230 ix86_emit_binop (PLUS, mode, target, parts.index);
22232 else
22234 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
22235 if (regno0 != regno2)
22236 emit_insn (gen_rtx_SET (target, parts.index));
22238 /* Use shift for scaling. */
22239 ix86_emit_binop (ASHIFT, mode, target,
22240 GEN_INT (exact_log2 (parts.scale)));
22242 if (parts.base)
22243 ix86_emit_binop (PLUS, mode, target, parts.base);
22245 if (parts.disp && parts.disp != const0_rtx)
22246 ix86_emit_binop (PLUS, mode, target, parts.disp);
22249 else if (!parts.base && !parts.index)
22251 gcc_assert (parts.disp);
22252 emit_insn (gen_rtx_SET (target, parts.disp));
22254 else
22256 if (!parts.base)
22258 if (regno0 != regno2)
22259 emit_insn (gen_rtx_SET (target, parts.index));
22261 else if (!parts.index)
22263 if (regno0 != regno1)
22264 emit_insn (gen_rtx_SET (target, parts.base));
22266 else
22268 if (regno0 == regno1)
22269 tmp = parts.index;
22270 else if (regno0 == regno2)
22271 tmp = parts.base;
22272 else
22274 rtx tmp1;
22276 /* Find better operand for SET instruction, depending
22277 on which definition is farther from the insn. */
22278 if (find_nearest_reg_def (insn, regno1, regno2))
22279 tmp = parts.index, tmp1 = parts.base;
22280 else
22281 tmp = parts.base, tmp1 = parts.index;
22283 emit_insn (gen_rtx_SET (target, tmp));
22285 if (parts.disp && parts.disp != const0_rtx)
22286 ix86_emit_binop (PLUS, mode, target, parts.disp);
22288 ix86_emit_binop (PLUS, mode, target, tmp1);
22289 return;
22292 ix86_emit_binop (PLUS, mode, target, tmp);
22295 if (parts.disp && parts.disp != const0_rtx)
22296 ix86_emit_binop (PLUS, mode, target, parts.disp);
22300 /* Return true if it is ok to optimize an ADD operation to LEA
22301 operation to avoid flag register consumation. For most processors,
22302 ADD is faster than LEA. For the processors like BONNELL, if the
22303 destination register of LEA holds an actual address which will be
22304 used soon, LEA is better and otherwise ADD is better. */
22306 bool
22307 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
22309 unsigned int regno0 = true_regnum (operands[0]);
22310 unsigned int regno1 = true_regnum (operands[1]);
22311 unsigned int regno2 = true_regnum (operands[2]);
22313 /* If a = b + c, (a!=b && a!=c), must use lea form. */
22314 if (regno0 != regno1 && regno0 != regno2)
22315 return true;
22317 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22318 return false;
22320 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
22323 /* Return true if destination reg of SET_BODY is shift count of
22324 USE_BODY. */
22326 static bool
22327 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
22329 rtx set_dest;
22330 rtx shift_rtx;
22331 int i;
22333 /* Retrieve destination of SET_BODY. */
22334 switch (GET_CODE (set_body))
22336 case SET:
22337 set_dest = SET_DEST (set_body);
22338 if (!set_dest || !REG_P (set_dest))
22339 return false;
22340 break;
22341 case PARALLEL:
22342 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
22343 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
22344 use_body))
22345 return true;
22346 /* FALLTHROUGH */
22347 default:
22348 return false;
22351 /* Retrieve shift count of USE_BODY. */
22352 switch (GET_CODE (use_body))
22354 case SET:
22355 shift_rtx = XEXP (use_body, 1);
22356 break;
22357 case PARALLEL:
22358 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
22359 if (ix86_dep_by_shift_count_body (set_body,
22360 XVECEXP (use_body, 0, i)))
22361 return true;
22362 /* FALLTHROUGH */
22363 default:
22364 return false;
22367 if (shift_rtx
22368 && (GET_CODE (shift_rtx) == ASHIFT
22369 || GET_CODE (shift_rtx) == LSHIFTRT
22370 || GET_CODE (shift_rtx) == ASHIFTRT
22371 || GET_CODE (shift_rtx) == ROTATE
22372 || GET_CODE (shift_rtx) == ROTATERT))
22374 rtx shift_count = XEXP (shift_rtx, 1);
22376 /* Return true if shift count is dest of SET_BODY. */
22377 if (REG_P (shift_count))
22379 /* Add check since it can be invoked before register
22380 allocation in pre-reload schedule. */
22381 if (reload_completed
22382 && true_regnum (set_dest) == true_regnum (shift_count))
22383 return true;
22384 else if (REGNO (set_dest) == REGNO (shift_count))
22385 return true;
22389 return false;
22392 /* Return true if destination reg of SET_INSN is shift count of
22393 USE_INSN. */
22395 bool
22396 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
22398 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
22399 PATTERN (use_insn));
22402 /* Return TRUE or FALSE depending on whether the unary operator meets the
22403 appropriate constraints. */
22405 bool
22406 ix86_unary_operator_ok (enum rtx_code,
22407 machine_mode,
22408 rtx operands[2])
22410 /* If one of operands is memory, source and destination must match. */
22411 if ((MEM_P (operands[0])
22412 || MEM_P (operands[1]))
22413 && ! rtx_equal_p (operands[0], operands[1]))
22414 return false;
22415 return true;
22418 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
22419 are ok, keeping in mind the possible movddup alternative. */
22421 bool
22422 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
22424 if (MEM_P (operands[0]))
22425 return rtx_equal_p (operands[0], operands[1 + high]);
22426 if (MEM_P (operands[1]) && MEM_P (operands[2]))
22427 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
22428 return true;
22431 /* Post-reload splitter for converting an SF or DFmode value in an
22432 SSE register into an unsigned SImode. */
22434 void
22435 ix86_split_convert_uns_si_sse (rtx operands[])
22437 machine_mode vecmode;
22438 rtx value, large, zero_or_two31, input, two31, x;
22440 large = operands[1];
22441 zero_or_two31 = operands[2];
22442 input = operands[3];
22443 two31 = operands[4];
22444 vecmode = GET_MODE (large);
22445 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
22447 /* Load up the value into the low element. We must ensure that the other
22448 elements are valid floats -- zero is the easiest such value. */
22449 if (MEM_P (input))
22451 if (vecmode == V4SFmode)
22452 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
22453 else
22454 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
22456 else
22458 input = gen_rtx_REG (vecmode, REGNO (input));
22459 emit_move_insn (value, CONST0_RTX (vecmode));
22460 if (vecmode == V4SFmode)
22461 emit_insn (gen_sse_movss (value, value, input));
22462 else
22463 emit_insn (gen_sse2_movsd (value, value, input));
22466 emit_move_insn (large, two31);
22467 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
22469 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
22470 emit_insn (gen_rtx_SET (large, x));
22472 x = gen_rtx_AND (vecmode, zero_or_two31, large);
22473 emit_insn (gen_rtx_SET (zero_or_two31, x));
22475 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
22476 emit_insn (gen_rtx_SET (value, x));
22478 large = gen_rtx_REG (V4SImode, REGNO (large));
22479 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
22481 x = gen_rtx_REG (V4SImode, REGNO (value));
22482 if (vecmode == V4SFmode)
22483 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
22484 else
22485 emit_insn (gen_sse2_cvttpd2dq (x, value));
22486 value = x;
22488 emit_insn (gen_xorv4si3 (value, value, large));
22491 /* Convert an unsigned DImode value into a DFmode, using only SSE.
22492 Expects the 64-bit DImode to be supplied in a pair of integral
22493 registers. Requires SSE2; will use SSE3 if available. For x86_32,
22494 -mfpmath=sse, !optimize_size only. */
22496 void
22497 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
22499 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
22500 rtx int_xmm, fp_xmm;
22501 rtx biases, exponents;
22502 rtx x;
22504 int_xmm = gen_reg_rtx (V4SImode);
22505 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
22506 emit_insn (gen_movdi_to_sse (int_xmm, input));
22507 else if (TARGET_SSE_SPLIT_REGS)
22509 emit_clobber (int_xmm);
22510 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
22512 else
22514 x = gen_reg_rtx (V2DImode);
22515 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
22516 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
22519 x = gen_rtx_CONST_VECTOR (V4SImode,
22520 gen_rtvec (4, GEN_INT (0x43300000UL),
22521 GEN_INT (0x45300000UL),
22522 const0_rtx, const0_rtx));
22523 exponents = validize_mem (force_const_mem (V4SImode, x));
22525 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
22526 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
22528 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
22529 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
22530 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
22531 (0x1.0p84 + double(fp_value_hi_xmm)).
22532 Note these exponents differ by 32. */
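     /* For example, for the input 0x0000000500000007 the low half 7 becomes
	0x1.0p52 + 7.0 and the high half 5 becomes 0x1.0p84 + 5.0 * 0x1.0p32;
	subtracting the two biases below and summing the halves reproduces
	the original value 5 * 2**32 + 7 as a double.  */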
22534 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
22536 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
22537 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
22538 real_ldexp (&bias_lo_rvt, &dconst1, 52);
22539 real_ldexp (&bias_hi_rvt, &dconst1, 84);
22540 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
22541 x = const_double_from_real_value (bias_hi_rvt, DFmode);
22542 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
22543 biases = validize_mem (force_const_mem (V2DFmode, biases));
22544 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
22546 /* Add the upper and lower DFmode values together. */
22547 if (TARGET_SSE3)
22548 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
22549 else
22551 x = copy_to_mode_reg (V2DFmode, fp_xmm);
22552 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
22553 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
22556 ix86_expand_vector_extract (false, target, fp_xmm, 0);
22559 /* Not used, but eases macroization of patterns. */
22560 void
22561 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
22563 gcc_unreachable ();
22566 /* Convert an unsigned SImode value into a DFmode. Only currently used
22567 for SSE, but applicable anywhere. */
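/* The method: add -2**31 to the unsigned input (with wraparound) so it lands
   in the signed SImode range, perform a signed SImode->DFmode conversion,
   and then add 2**31 back as a DFmode constant, which DFmode represents
   exactly.  */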
22569 void
22570 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
22572 REAL_VALUE_TYPE TWO31r;
22573 rtx x, fp;
22575 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
22576 NULL, 1, OPTAB_DIRECT);
22578 fp = gen_reg_rtx (DFmode);
22579 emit_insn (gen_floatsidf2 (fp, x));
22581 real_ldexp (&TWO31r, &dconst1, 31);
22582 x = const_double_from_real_value (TWO31r, DFmode);
22584 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
22585 if (x != target)
22586 emit_move_insn (target, x);
22589 /* Convert a signed DImode value into a DFmode. Only used for SSE in
22590 32-bit mode; otherwise we have a direct convert instruction. */
22592 void
22593 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
22595 REAL_VALUE_TYPE TWO32r;
22596 rtx fp_lo, fp_hi, x;
22598 fp_lo = gen_reg_rtx (DFmode);
22599 fp_hi = gen_reg_rtx (DFmode);
22601 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
22603 real_ldexp (&TWO32r, &dconst1, 32);
22604 x = const_double_from_real_value (TWO32r, DFmode);
22605 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
22607 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
22609 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
22610 0, OPTAB_DIRECT);
22611 if (x != target)
22612 emit_move_insn (target, x);
22615 /* Convert an unsigned SImode value into a SFmode, using only SSE.
22616 For x86_32, -mfpmath=sse, !optimize_size only. */
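/* The input is split into its low and high 16-bit halves; each half is
   exactly representable in SFmode, so both conversions below are exact
   and the value is reassembled as hi * 2**16 + lo.  */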
22617 void
22618 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
22620 REAL_VALUE_TYPE ONE16r;
22621 rtx fp_hi, fp_lo, int_hi, int_lo, x;
22623 real_ldexp (&ONE16r, &dconst1, 16);
22624 x = const_double_from_real_value (ONE16r, SFmode);
22625   int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
22626 				NULL, 0, OPTAB_DIRECT);
22627   int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
22628 NULL, 0, OPTAB_DIRECT);
22629 fp_hi = gen_reg_rtx (SFmode);
22630 fp_lo = gen_reg_rtx (SFmode);
22631 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
22632 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
22633 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
22634 0, OPTAB_DIRECT);
22635 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
22636 0, OPTAB_DIRECT);
22637 if (!rtx_equal_p (target, fp_hi))
22638 emit_move_insn (target, fp_hi);
22641 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
22642 a vector of unsigned ints VAL to vector of floats TARGET. */
22644 void
22645 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
22647 rtx tmp[8];
22648 REAL_VALUE_TYPE TWO16r;
22649 machine_mode intmode = GET_MODE (val);
22650 machine_mode fltmode = GET_MODE (target);
22651 rtx (*cvt) (rtx, rtx);
22653 if (intmode == V4SImode)
22654 cvt = gen_floatv4siv4sf2;
22655 else
22656 cvt = gen_floatv8siv8sf2;
22657 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
22658 tmp[0] = force_reg (intmode, tmp[0]);
22659 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
22660 OPTAB_DIRECT);
22661 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
22662 NULL_RTX, 1, OPTAB_DIRECT);
22663 tmp[3] = gen_reg_rtx (fltmode);
22664 emit_insn (cvt (tmp[3], tmp[1]));
22665 tmp[4] = gen_reg_rtx (fltmode);
22666 emit_insn (cvt (tmp[4], tmp[2]));
22667 real_ldexp (&TWO16r, &dconst1, 16);
22668 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
22669 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
22670 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
22671 OPTAB_DIRECT);
22672 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
22673 OPTAB_DIRECT);
22674 if (tmp[7] != target)
22675 emit_move_insn (target, tmp[7]);
22678 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
22679 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
22680 This is done by doing just signed conversion if < 0x1p31, and otherwise by
22681 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
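/* For example, a lane holding 0x1.8p31 (3 * 2**30) is reduced to 0x1.0p30
   before the signed conversion, and the final xor of 0x80000000 restores
   the high bit, giving the integer 0xC0000000.  */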
22684 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
22686 REAL_VALUE_TYPE TWO31r;
22687 rtx two31r, tmp[4];
22688 machine_mode mode = GET_MODE (val);
22689 machine_mode scalarmode = GET_MODE_INNER (mode);
22690 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
22691 rtx (*cmp) (rtx, rtx, rtx, rtx);
22692 int i;
22694 for (i = 0; i < 3; i++)
22695 tmp[i] = gen_reg_rtx (mode);
22696 real_ldexp (&TWO31r, &dconst1, 31);
22697 two31r = const_double_from_real_value (TWO31r, scalarmode);
22698 two31r = ix86_build_const_vector (mode, 1, two31r);
22699 two31r = force_reg (mode, two31r);
22700 switch (mode)
22702 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
22703 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
22704 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
22705 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
22706 default: gcc_unreachable ();
22708 tmp[3] = gen_rtx_LE (mode, two31r, val);
22709 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
22710 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
22711 0, OPTAB_DIRECT);
22712 if (intmode == V4SImode || TARGET_AVX2)
22713 *xorp = expand_simple_binop (intmode, ASHIFT,
22714 gen_lowpart (intmode, tmp[0]),
22715 GEN_INT (31), NULL_RTX, 0,
22716 OPTAB_DIRECT);
22717 else
22719 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
22720 two31 = ix86_build_const_vector (intmode, 1, two31);
22721 *xorp = expand_simple_binop (intmode, AND,
22722 gen_lowpart (intmode, tmp[0]),
22723 two31, NULL_RTX, 0,
22724 OPTAB_DIRECT);
22726 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
22727 0, OPTAB_DIRECT);
22730 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
22731 then replicate the value for all elements of the vector
22732 register. */
22735 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
22737 int i, n_elt;
22738 rtvec v;
22739 machine_mode scalar_mode;
22741 switch (mode)
22743 case V64QImode:
22744 case V32QImode:
22745 case V16QImode:
22746 case V32HImode:
22747 case V16HImode:
22748 case V8HImode:
22749 case V16SImode:
22750 case V8SImode:
22751 case V4SImode:
22752 case V8DImode:
22753 case V4DImode:
22754 case V2DImode:
22755 gcc_assert (vect);
22756 /* FALLTHRU */
22757 case V16SFmode:
22758 case V8SFmode:
22759 case V4SFmode:
22760 case V8DFmode:
22761 case V4DFmode:
22762 case V2DFmode:
22763 n_elt = GET_MODE_NUNITS (mode);
22764 v = rtvec_alloc (n_elt);
22765 scalar_mode = GET_MODE_INNER (mode);
22767 RTVEC_ELT (v, 0) = value;
22769 for (i = 1; i < n_elt; ++i)
22770 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
22772 return gen_rtx_CONST_VECTOR (mode, v);
22774 default:
22775 gcc_unreachable ();
22779 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
22780 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
22781 for an SSE register. If VECT is true, then replicate the mask for
22782 all elements of the vector register. If INVERT is true, then create
22783 a mask excluding the sign bit. */
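/* For DFmode elements the mask is 0x8000000000000000 (or its complement
   0x7fffffffffffffff when INVERT); for SFmode elements it is 0x80000000
   (resp. 0x7fffffff).  */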
22786 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
22788 machine_mode vec_mode, imode;
22789 wide_int w;
22790 rtx mask, v;
22792 switch (mode)
22794 case V16SImode:
22795 case V16SFmode:
22796 case V8SImode:
22797 case V4SImode:
22798 case V8SFmode:
22799 case V4SFmode:
22800 vec_mode = mode;
22801 imode = SImode;
22802 break;
22804 case V8DImode:
22805 case V4DImode:
22806 case V2DImode:
22807 case V8DFmode:
22808 case V4DFmode:
22809 case V2DFmode:
22810 vec_mode = mode;
22811 imode = DImode;
22812 break;
22814 case TImode:
22815 case TFmode:
22816 vec_mode = VOIDmode;
22817 imode = TImode;
22818 break;
22820 default:
22821 gcc_unreachable ();
22824 machine_mode inner_mode = GET_MODE_INNER (mode);
22825 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
22826 GET_MODE_BITSIZE (inner_mode));
22827 if (invert)
22828 w = wi::bit_not (w);
22830 /* Force this value into the low part of a fp vector constant. */
22831 mask = immed_wide_int_const (w, imode);
22832 mask = gen_lowpart (inner_mode, mask);
22834 if (vec_mode == VOIDmode)
22835 return force_reg (inner_mode, mask);
22837 v = ix86_build_const_vector (vec_mode, vect, mask);
22838 return force_reg (vec_mode, v);
22841 /* Generate code for floating point ABS or NEG. */
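/* With SSE, NEG flips the sign bit by xoring with a sign-bit mask, while
   ABS clears it by anding with the complement of that mask.  */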
22843 void
22844 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22845 rtx operands[])
22847 rtx mask, set, dst, src;
22848 bool use_sse = false;
22849 bool vector_mode = VECTOR_MODE_P (mode);
22850 machine_mode vmode = mode;
22852 if (vector_mode)
22853 use_sse = true;
22854 else if (mode == TFmode)
22855 use_sse = true;
22856 else if (TARGET_SSE_MATH)
22858 use_sse = SSE_FLOAT_MODE_P (mode);
22859 if (mode == SFmode)
22860 vmode = V4SFmode;
22861 else if (mode == DFmode)
22862 vmode = V2DFmode;
22865 /* NEG and ABS performed with SSE use bitwise mask operations.
22866 Create the appropriate mask now. */
22867 if (use_sse)
22868 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22869 else
22870 mask = NULL_RTX;
22872 dst = operands[0];
22873 src = operands[1];
22875 set = gen_rtx_fmt_e (code, mode, src);
22876 set = gen_rtx_SET (dst, set);
22878 if (mask)
22880 rtx use, clob;
22881 rtvec par;
22883 use = gen_rtx_USE (VOIDmode, mask);
22884 if (vector_mode)
22885 par = gen_rtvec (2, set, use);
22886 else
22888 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22889 par = gen_rtvec (3, set, use, clob);
22891 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22893 else
22894 emit_insn (set);
22897 /* Expand a copysign operation. Special case operand 0 being a constant. */
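/* The result combines the magnitude bits of OP0 with the sign bit of OP1:
   the sign-bit mask selects the sign from OP1 and the inverted mask selects
   the magnitude bits from OP0, and the two are then ored together.  */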
22899 void
22900 ix86_expand_copysign (rtx operands[])
22902 machine_mode mode, vmode;
22903 rtx dest, op0, op1, mask, nmask;
22905 dest = operands[0];
22906 op0 = operands[1];
22907 op1 = operands[2];
22909 mode = GET_MODE (dest);
22911 if (mode == SFmode)
22912 vmode = V4SFmode;
22913 else if (mode == DFmode)
22914 vmode = V2DFmode;
22915 else
22916 vmode = mode;
22918 if (CONST_DOUBLE_P (op0))
22920 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22922 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22923 op0 = simplify_unary_operation (ABS, mode, op0, mode);
22925 if (mode == SFmode || mode == DFmode)
22927 if (op0 == CONST0_RTX (mode))
22928 op0 = CONST0_RTX (vmode);
22929 else
22931 rtx v = ix86_build_const_vector (vmode, false, op0);
22933 op0 = force_reg (vmode, v);
22936 else if (op0 != CONST0_RTX (mode))
22937 op0 = force_reg (mode, op0);
22939 mask = ix86_build_signbit_mask (vmode, 0, 0);
22941 if (mode == SFmode)
22942 copysign_insn = gen_copysignsf3_const;
22943 else if (mode == DFmode)
22944 copysign_insn = gen_copysigndf3_const;
22945 else
22946 copysign_insn = gen_copysigntf3_const;
22948 emit_insn (copysign_insn (dest, op0, op1, mask));
22950 else
22952 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22954 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22955 mask = ix86_build_signbit_mask (vmode, 0, 0);
22957 if (mode == SFmode)
22958 copysign_insn = gen_copysignsf3_var;
22959 else if (mode == DFmode)
22960 copysign_insn = gen_copysigndf3_var;
22961 else
22962 copysign_insn = gen_copysigntf3_var;
22964 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22968 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22969 be a constant, and so has already been expanded into a vector constant. */
22971 void
22972 ix86_split_copysign_const (rtx operands[])
22974 machine_mode mode, vmode;
22975 rtx dest, op0, mask, x;
22977 dest = operands[0];
22978 op0 = operands[1];
22979 mask = operands[3];
22981 mode = GET_MODE (dest);
22982 vmode = GET_MODE (mask);
22984 dest = lowpart_subreg (vmode, dest, mode);
22985 x = gen_rtx_AND (vmode, dest, mask);
22986 emit_insn (gen_rtx_SET (dest, x));
22988 if (op0 != CONST0_RTX (vmode))
22990 x = gen_rtx_IOR (vmode, dest, op0);
22991 emit_insn (gen_rtx_SET (dest, x));
22995 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22996 so we have to do two masks. */
22998 void
22999 ix86_split_copysign_var (rtx operands[])
23001 machine_mode mode, vmode;
23002 rtx dest, scratch, op0, op1, mask, nmask, x;
23004 dest = operands[0];
23005 scratch = operands[1];
23006 op0 = operands[2];
23007 op1 = operands[3];
23008 nmask = operands[4];
23009 mask = operands[5];
23011 mode = GET_MODE (dest);
23012 vmode = GET_MODE (mask);
23014 if (rtx_equal_p (op0, op1))
23016 /* Shouldn't happen often (it's useless, obviously), but when it does
23017 we'd generate incorrect code if we continue below. */
23018 emit_move_insn (dest, op0);
23019 return;
23022 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
23024 gcc_assert (REGNO (op1) == REGNO (scratch));
23026 x = gen_rtx_AND (vmode, scratch, mask);
23027 emit_insn (gen_rtx_SET (scratch, x));
23029 dest = mask;
23030 op0 = lowpart_subreg (vmode, op0, mode);
23031 x = gen_rtx_NOT (vmode, dest);
23032 x = gen_rtx_AND (vmode, x, op0);
23033 emit_insn (gen_rtx_SET (dest, x));
23035 else
23037 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
23039 x = gen_rtx_AND (vmode, scratch, mask);
23041 else /* alternative 2,4 */
23043 gcc_assert (REGNO (mask) == REGNO (scratch));
23044 op1 = lowpart_subreg (vmode, op1, mode);
23045 x = gen_rtx_AND (vmode, scratch, op1);
23047 emit_insn (gen_rtx_SET (scratch, x));
23049 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
23051 dest = lowpart_subreg (vmode, op0, mode);
23052 x = gen_rtx_AND (vmode, dest, nmask);
23054 else /* alternative 3,4 */
23056 gcc_assert (REGNO (nmask) == REGNO (dest));
23057 dest = nmask;
23058 op0 = lowpart_subreg (vmode, op0, mode);
23059 x = gen_rtx_AND (vmode, dest, op0);
23061 emit_insn (gen_rtx_SET (dest, x));
23064 x = gen_rtx_IOR (vmode, dest, scratch);
23065 emit_insn (gen_rtx_SET (dest, x));
23068 /* Return TRUE or FALSE depending on whether the first SET in INSN
23069 has source and destination with matching CC modes, and that the
23070 CC mode is at least as constrained as REQ_MODE. */
23072 bool
23073 ix86_match_ccmode (rtx insn, machine_mode req_mode)
23075 rtx set;
23076 machine_mode set_mode;
23078 set = PATTERN (insn);
23079 if (GET_CODE (set) == PARALLEL)
23080 set = XVECEXP (set, 0, 0);
23081 gcc_assert (GET_CODE (set) == SET);
23082 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
23084 set_mode = GET_MODE (SET_DEST (set));
23085 switch (set_mode)
23087 case CCNOmode:
23088 if (req_mode != CCNOmode
23089 && (req_mode != CCmode
23090 || XEXP (SET_SRC (set), 1) != const0_rtx))
23091 return false;
23092 break;
23093 case CCmode:
23094 if (req_mode == CCGCmode)
23095 return false;
23096 /* FALLTHRU */
23097 case CCGCmode:
23098 if (req_mode == CCGOCmode || req_mode == CCNOmode)
23099 return false;
23100 /* FALLTHRU */
23101 case CCGOCmode:
23102 if (req_mode == CCZmode)
23103 return false;
23104 /* FALLTHRU */
23105 case CCZmode:
23106 break;
23108 case CCAmode:
23109 case CCCmode:
23110 case CCOmode:
23111 case CCPmode:
23112 case CCSmode:
23113 if (set_mode != req_mode)
23114 return false;
23115 break;
23117 default:
23118 gcc_unreachable ();
23121 return GET_MODE (SET_SRC (set)) == set_mode;
23124 /* Generate insn patterns to do an integer compare of OPERANDS. */
23126 static rtx
23127 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
23129 machine_mode cmpmode;
23130 rtx tmp, flags;
23132 cmpmode = SELECT_CC_MODE (code, op0, op1);
23133 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
23135 /* This is very simple, but making the interface the same as in the
23136 FP case makes the rest of the code easier. */
23137 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
23138 emit_insn (gen_rtx_SET (flags, tmp));
23140 /* Return the test that should be put into the flags user, i.e.
23141 the bcc, scc, or cmov instruction. */
23142 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
23145 /* Figure out whether to use ordered or unordered fp comparisons.
23146 Return the appropriate mode to use. */
23148 machine_mode
23149 ix86_fp_compare_mode (enum rtx_code)
23151 /* ??? In order to make all comparisons reversible, we do all comparisons
23152 non-trapping when compiling for IEEE. Once gcc is able to distinguish
23153    between all forms of trapping and nontrapping comparisons, we can make
23154    inequality comparisons trapping again, since that results in better code
23155    when using FCOM-based compares.  */
23156 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
23159 machine_mode
23160 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
23162 machine_mode mode = GET_MODE (op0);
23164 if (SCALAR_FLOAT_MODE_P (mode))
23166 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23167 return ix86_fp_compare_mode (code);
23170 switch (code)
23172 /* Only zero flag is needed. */
23173 case EQ: /* ZF=0 */
23174 case NE: /* ZF!=0 */
23175 return CCZmode;
23176 /* Codes needing carry flag. */
23177 case GEU: /* CF=0 */
23178 case LTU: /* CF=1 */
23179 /* Detect overflow checks. They need just the carry flag. */
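      /* E.g. the unsigned test "a + b < a" (or "a + b < b") is true exactly
	 when the addition wraps around, i.e. when it sets the carry flag.  */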
23180 if (GET_CODE (op0) == PLUS
23181 && (rtx_equal_p (op1, XEXP (op0, 0))
23182 || rtx_equal_p (op1, XEXP (op0, 1))))
23183 return CCCmode;
23184 else
23185 return CCmode;
23186 case GTU: /* CF=0 & ZF=0 */
23187 case LEU: /* CF=1 | ZF=1 */
23188 return CCmode;
23189 /* Codes possibly doable only with sign flag when
23190 comparing against zero. */
23191 case GE: /* SF=OF or SF=0 */
23192 case LT: /* SF<>OF or SF=1 */
23193 if (op1 == const0_rtx)
23194 return CCGOCmode;
23195 else
23196 /* For other cases Carry flag is not required. */
23197 return CCGCmode;
23198    /* Codes doable only with the sign flag when comparing
23199       against zero, but we lack a jump instruction for it,
23200       so we need to use relational tests against overflow,
23201       which thus needs to be zero.  */
23202 case GT: /* ZF=0 & SF=OF */
23203 case LE: /* ZF=1 | SF<>OF */
23204 if (op1 == const0_rtx)
23205 return CCNOmode;
23206 else
23207 return CCGCmode;
23208    /* The strcmp pattern does (use flags), and combine may ask us
23209       for the proper mode.  */
23210 case USE:
23211 return CCmode;
23212 default:
23213 gcc_unreachable ();
23217 /* Return the fixed registers used for condition codes. */
23219 static bool
23220 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
23222 *p1 = FLAGS_REG;
23223 *p2 = FPSR_REG;
23224 return true;
23227 /* If two condition code modes are compatible, return a condition code
23228 mode which is compatible with both. Otherwise, return
23229 VOIDmode. */
23231 static machine_mode
23232 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
23234 if (m1 == m2)
23235 return m1;
23237 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
23238 return VOIDmode;
23240 if ((m1 == CCGCmode && m2 == CCGOCmode)
23241 || (m1 == CCGOCmode && m2 == CCGCmode))
23242 return CCGCmode;
23244 if ((m1 == CCNOmode && m2 == CCGOCmode)
23245 || (m1 == CCGOCmode && m2 == CCNOmode))
23246 return CCNOmode;
23248 if (m1 == CCZmode
23249 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
23250 return m2;
23251 else if (m2 == CCZmode
23252 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
23253 return m1;
23255 switch (m1)
23257 default:
23258 gcc_unreachable ();
23260 case CCmode:
23261 case CCGCmode:
23262 case CCGOCmode:
23263 case CCNOmode:
23264 case CCAmode:
23265 case CCCmode:
23266 case CCOmode:
23267 case CCPmode:
23268 case CCSmode:
23269 case CCZmode:
23270 switch (m2)
23272 default:
23273 return VOIDmode;
23275 case CCmode:
23276 case CCGCmode:
23277 case CCGOCmode:
23278 case CCNOmode:
23279 case CCAmode:
23280 case CCCmode:
23281 case CCOmode:
23282 case CCPmode:
23283 case CCSmode:
23284 case CCZmode:
23285 return CCmode;
23288 case CCFPmode:
23289 case CCFPUmode:
23290 /* These are only compatible with themselves, which we already
23291 checked above. */
23292 return VOIDmode;
23297 /* Return a comparison we can do that is equivalent to
23298    swap_condition (code), apart possibly from orderedness.
23299    But never change orderedness if TARGET_IEEE_FP, returning
23300    UNKNOWN in that case if necessary.  */
23302 static enum rtx_code
23303 ix86_fp_swap_condition (enum rtx_code code)
23305 switch (code)
23307 case GT: /* GTU - CF=0 & ZF=0 */
23308 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
23309 case GE: /* GEU - CF=0 */
23310 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
23311 case UNLT: /* LTU - CF=1 */
23312 return TARGET_IEEE_FP ? UNKNOWN : GT;
23313 case UNLE: /* LEU - CF=1 | ZF=1 */
23314 return TARGET_IEEE_FP ? UNKNOWN : GE;
23315 default:
23316 return swap_condition (code);
23320 /* Return cost of comparison CODE using the best strategy for performance.
23321    All following functions use the number of instructions as a cost metric.
23322    In the future this should be tweaked to compute bytes for optimize_size and
23323    take into account the performance of various instructions on various CPUs.  */
23325 static int
23326 ix86_fp_comparison_cost (enum rtx_code code)
23328 int arith_cost;
23330 /* The cost of code using bit-twiddling on %ah. */
23331 switch (code)
23333 case UNLE:
23334 case UNLT:
23335 case LTGT:
23336 case GT:
23337 case GE:
23338 case UNORDERED:
23339 case ORDERED:
23340 case UNEQ:
23341 arith_cost = 4;
23342 break;
23343 case LT:
23344 case NE:
23345 case EQ:
23346 case UNGE:
23347 arith_cost = TARGET_IEEE_FP ? 5 : 4;
23348 break;
23349 case LE:
23350 case UNGT:
23351 arith_cost = TARGET_IEEE_FP ? 6 : 4;
23352 break;
23353 default:
23354 gcc_unreachable ();
23357 switch (ix86_fp_comparison_strategy (code))
23359 case IX86_FPCMP_COMI:
23360 return arith_cost > 4 ? 3 : 2;
23361 case IX86_FPCMP_SAHF:
23362 return arith_cost > 4 ? 4 : 3;
23363 default:
23364 return arith_cost;
23368 /* Return strategy to use for floating-point. We assume that fcomi is always
23369    preferable where available, since that is also true when looking at size
23370 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
23372 enum ix86_fpcmp_strategy
23373 ix86_fp_comparison_strategy (enum rtx_code)
23375 /* Do fcomi/sahf based test when profitable. */
23377 if (TARGET_CMOVE)
23378 return IX86_FPCMP_COMI;
23380 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
23381 return IX86_FPCMP_SAHF;
23383 return IX86_FPCMP_ARITH;
23386 /* Swap, force into registers, or otherwise massage the two operands
23387 to a fp comparison. The operands are updated in place; the new
23388 comparison code is returned. */
23390 static enum rtx_code
23391 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
23393 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
23394 rtx op0 = *pop0, op1 = *pop1;
23395 machine_mode op_mode = GET_MODE (op0);
23396 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
23398 /* All of the unordered compare instructions only work on registers.
23399 The same is true of the fcomi compare instructions. The XFmode
23400 compare instructions require registers except when comparing
23401 against zero or when converting operand 1 from fixed point to
23402 floating point. */
23404 if (!is_sse
23405 && (fpcmp_mode == CCFPUmode
23406 || (op_mode == XFmode
23407 && ! (standard_80387_constant_p (op0) == 1
23408 || standard_80387_constant_p (op1) == 1)
23409 && GET_CODE (op1) != FLOAT)
23410 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
23412 op0 = force_reg (op_mode, op0);
23413 op1 = force_reg (op_mode, op1);
23415 else
23417 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
23418 things around if they appear profitable, otherwise force op0
23419 into a register. */
23421 if (standard_80387_constant_p (op0) == 0
23422 || (MEM_P (op0)
23423 && ! (standard_80387_constant_p (op1) == 0
23424 || MEM_P (op1))))
23426 enum rtx_code new_code = ix86_fp_swap_condition (code);
23427 if (new_code != UNKNOWN)
23429 std::swap (op0, op1);
23430 code = new_code;
23434 if (!REG_P (op0))
23435 op0 = force_reg (op_mode, op0);
23437 if (CONSTANT_P (op1))
23439 int tmp = standard_80387_constant_p (op1);
23440 if (tmp == 0)
23441 op1 = validize_mem (force_const_mem (op_mode, op1));
23442 else if (tmp == 1)
23444 if (TARGET_CMOVE)
23445 op1 = force_reg (op_mode, op1);
23447 else
23448 op1 = force_reg (op_mode, op1);
23452 /* Try to rearrange the comparison to make it cheaper. */
23453 if (ix86_fp_comparison_cost (code)
23454 > ix86_fp_comparison_cost (swap_condition (code))
23455 && (REG_P (op1) || can_create_pseudo_p ()))
23457 std::swap (op0, op1);
23458 code = swap_condition (code);
23459 if (!REG_P (op0))
23460 op0 = force_reg (op_mode, op0);
23463 *pop0 = op0;
23464 *pop1 = op1;
23465 return code;
23468 /* Convert comparison codes we use to represent FP comparison to integer
23469 code that will result in proper branch. Return UNKNOWN if no such code
23470 is available. */
23472 enum rtx_code
23473 ix86_fp_compare_code_to_integer (enum rtx_code code)
23475 switch (code)
23477 case GT:
23478 return GTU;
23479 case GE:
23480 return GEU;
23481 case ORDERED:
23482 case UNORDERED:
23483 return code;
23484 case UNEQ:
23485 return EQ;
23486 case UNLT:
23487 return LTU;
23488 case UNLE:
23489 return LEU;
23490 case LTGT:
23491 return NE;
23492 default:
23493 return UNKNOWN;
23497 /* Generate insn patterns to do a floating point compare of OPERANDS. */
23499 static rtx
23500 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
23502 machine_mode fpcmp_mode, intcmp_mode;
23503 rtx tmp, tmp2;
23505 fpcmp_mode = ix86_fp_compare_mode (code);
23506 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
23508 /* Do fcomi/sahf based test when profitable. */
23509 switch (ix86_fp_comparison_strategy (code))
23511 case IX86_FPCMP_COMI:
23512 intcmp_mode = fpcmp_mode;
23513 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23514 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23515 emit_insn (tmp);
23516 break;
23518 case IX86_FPCMP_SAHF:
23519 intcmp_mode = fpcmp_mode;
23520 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23521 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23523 if (!scratch)
23524 scratch = gen_reg_rtx (HImode);
23525 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
23526 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
23527 break;
23529 case IX86_FPCMP_ARITH:
23530 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
23531 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23532 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
23533 if (!scratch)
23534 scratch = gen_reg_rtx (HImode);
23535 emit_insn (gen_rtx_SET (scratch, tmp2));
23537       /* In the unordered case, we have to check C2 for NaNs, which
23538 doesn't happen to work out to anything nice combination-wise.
23539 So do some bit twiddling on the value we've got in AH to come
23540 up with an appropriate set of condition codes. */
23542 intcmp_mode = CCNOmode;
23543 switch (code)
23545 case GT:
23546 case UNGT:
23547 if (code == GT || !TARGET_IEEE_FP)
23549 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23550 code = EQ;
23552 else
23554 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23555 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23556 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
23557 intcmp_mode = CCmode;
23558 code = GEU;
23560 break;
23561 case LT:
23562 case UNLT:
23563 if (code == LT && TARGET_IEEE_FP)
23565 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23566 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
23567 intcmp_mode = CCmode;
23568 code = EQ;
23570 else
23572 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
23573 code = NE;
23575 break;
23576 case GE:
23577 case UNGE:
23578 if (code == GE || !TARGET_IEEE_FP)
23580 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
23581 code = EQ;
23583 else
23585 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23586 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
23587 code = NE;
23589 break;
23590 case LE:
23591 case UNLE:
23592 if (code == LE && TARGET_IEEE_FP)
23594 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23595 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23596 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23597 intcmp_mode = CCmode;
23598 code = LTU;
23600 else
23602 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23603 code = NE;
23605 break;
23606 case EQ:
23607 case UNEQ:
23608 if (code == EQ && TARGET_IEEE_FP)
23610 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23611 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23612 intcmp_mode = CCmode;
23613 code = EQ;
23615 else
23617 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23618 code = NE;
23620 break;
23621 case NE:
23622 case LTGT:
23623 if (code == NE && TARGET_IEEE_FP)
23625 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23626 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
23627 GEN_INT (0x40)));
23628 code = NE;
23630 else
23632 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23633 code = EQ;
23635 break;
23637 case UNORDERED:
23638 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23639 code = NE;
23640 break;
23641 case ORDERED:
23642 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23643 code = EQ;
23644 break;
23646 default:
23647 gcc_unreachable ();
23649 break;
23651 default:
23652       gcc_unreachable ();
23655 /* Return the test that should be put into the flags user, i.e.
23656 the bcc, scc, or cmov instruction. */
23657 return gen_rtx_fmt_ee (code, VOIDmode,
23658 gen_rtx_REG (intcmp_mode, FLAGS_REG),
23659 const0_rtx);
23662 static rtx
23663 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
23665 rtx ret;
23667 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
23668 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
23670 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
23672 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
23673 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23675 else
23676 ret = ix86_expand_int_compare (code, op0, op1);
23678 return ret;
23681 void
23682 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
23684 machine_mode mode = GET_MODE (op0);
23685 rtx tmp;
23687   /* Handle the special case of a vector comparison with a boolean result;
23688      transform it using the ptest instruction.  */
23689 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
23691 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
23692 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
23694 gcc_assert (code == EQ || code == NE);
23695 /* Generate XOR since we can't check that one operand is zero vector. */
23696 tmp = gen_reg_rtx (mode);
23697 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
23698 tmp = gen_lowpart (p_mode, tmp);
23699 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
23700 gen_rtx_UNSPEC (CCmode,
23701 gen_rtvec (2, tmp, tmp),
23702 UNSPEC_PTEST)));
23703 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
23704 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23705 gen_rtx_LABEL_REF (VOIDmode, label),
23706 pc_rtx);
23707 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23708 return;
23711 switch (mode)
23713 case SFmode:
23714 case DFmode:
23715 case XFmode:
23716 case QImode:
23717 case HImode:
23718 case SImode:
23719 simple:
23720 tmp = ix86_expand_compare (code, op0, op1);
23721 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23722 gen_rtx_LABEL_REF (VOIDmode, label),
23723 pc_rtx);
23724 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23725 return;
23727 case DImode:
23728 if (TARGET_64BIT)
23729 goto simple;
23730       /* For a 32-bit target, a DImode comparison may be performed
23731	 in SSE registers.  To allow this we avoid splitting to
23732	 SImode, which is achieved by doing the xor in DImode and
23733	 then comparing with zero (a form recognized by the STV
23734	 pass).  We don't compare using xor when optimizing for
23735	 size.  */
23736 if (!optimize_insn_for_size_p ()
23737 && TARGET_STV
23738 && (code == EQ || code == NE))
23740 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
23741 op1 = const0_rtx;
23743 /* FALLTHRU */
23744 case TImode:
23745 /* Expand DImode branch into multiple compare+branch. */
23747 rtx lo[2], hi[2];
23748 rtx_code_label *label2;
23749 enum rtx_code code1, code2, code3;
23750 machine_mode submode;
23752 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
23754 std::swap (op0, op1);
23755 code = swap_condition (code);
23758 split_double_mode (mode, &op0, 1, lo+0, hi+0);
23759 split_double_mode (mode, &op1, 1, lo+1, hi+1);
23761 submode = mode == DImode ? SImode : DImode;
23763 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
23764 avoid two branches. This costs one extra insn, so disable when
23765 optimizing for size. */
23767 if ((code == EQ || code == NE)
23768 && (!optimize_insn_for_size_p ()
23769 || hi[1] == const0_rtx || lo[1] == const0_rtx))
23771 rtx xor0, xor1;
23773 xor1 = hi[0];
23774 if (hi[1] != const0_rtx)
23775 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
23776 NULL_RTX, 0, OPTAB_WIDEN);
23778 xor0 = lo[0];
23779 if (lo[1] != const0_rtx)
23780 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
23781 NULL_RTX, 0, OPTAB_WIDEN);
23783 tmp = expand_binop (submode, ior_optab, xor1, xor0,
23784 NULL_RTX, 0, OPTAB_WIDEN);
23786 ix86_expand_branch (code, tmp, const0_rtx, label);
23787 return;
23790 /* Otherwise, if we are doing less-than or greater-or-equal-than,
23791 op1 is a constant and the low word is zero, then we can just
23792 examine the high word. Similarly for low word -1 and
23793 less-or-equal-than or greater-than. */
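      /* E.g. for unsigned (hi0:lo0) < (hi1:0) the comparison reduces to
	 hi0 < hi1, because when the high words are equal the low word
	 can never be below zero.  */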
23795 if (CONST_INT_P (hi[1]))
23796 switch (code)
23798 case LT: case LTU: case GE: case GEU:
23799 if (lo[1] == const0_rtx)
23801 ix86_expand_branch (code, hi[0], hi[1], label);
23802 return;
23804 break;
23805 case LE: case LEU: case GT: case GTU:
23806 if (lo[1] == constm1_rtx)
23808 ix86_expand_branch (code, hi[0], hi[1], label);
23809 return;
23811 break;
23812 default:
23813 break;
23816 /* Otherwise, we need two or three jumps. */
23818 label2 = gen_label_rtx ();
23820 code1 = code;
23821 code2 = swap_condition (code);
23822 code3 = unsigned_condition (code);
23824 switch (code)
23826 case LT: case GT: case LTU: case GTU:
23827 break;
23829 case LE: code1 = LT; code2 = GT; break;
23830 case GE: code1 = GT; code2 = LT; break;
23831 case LEU: code1 = LTU; code2 = GTU; break;
23832 case GEU: code1 = GTU; code2 = LTU; break;
23834 case EQ: code1 = UNKNOWN; code2 = NE; break;
23835 case NE: code2 = UNKNOWN; break;
23837 default:
23838 gcc_unreachable ();
23842 * a < b =>
23843 * if (hi(a) < hi(b)) goto true;
23844 * if (hi(a) > hi(b)) goto false;
23845 * if (lo(a) < lo(b)) goto true;
23846 * false:
23849 if (code1 != UNKNOWN)
23850 ix86_expand_branch (code1, hi[0], hi[1], label);
23851 if (code2 != UNKNOWN)
23852 ix86_expand_branch (code2, hi[0], hi[1], label2);
23854 ix86_expand_branch (code3, lo[0], lo[1], label);
23856 if (code2 != UNKNOWN)
23857 emit_label (label2);
23858 return;
23861 default:
23862 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23863 goto simple;
23867 /* Split branch based on floating point condition. */
23868 void
23869 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
23870 rtx target1, rtx target2, rtx tmp)
23872 rtx condition;
23873 rtx_insn *i;
23875 if (target2 != pc_rtx)
23877 std::swap (target1, target2);
23878 code = reverse_condition_maybe_unordered (code);
23881 condition = ix86_expand_fp_compare (code, op1, op2,
23882 tmp);
23884 i = emit_jump_insn (gen_rtx_SET
23885 (pc_rtx,
23886 gen_rtx_IF_THEN_ELSE (VOIDmode,
23887 condition, target1, target2)));
23888 if (split_branch_probability.initialized_p ())
23889 add_reg_br_prob_note (i, split_branch_probability);
23892 void
23893 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23895 rtx ret;
23897 gcc_assert (GET_MODE (dest) == QImode);
23899 ret = ix86_expand_compare (code, op0, op1);
23900 PUT_MODE (ret, QImode);
23901 emit_insn (gen_rtx_SET (dest, ret));
23904 /* Expand comparison setting or clearing carry flag. Return true when
23905 successful and set pop for the operation. */
23906 static bool
23907 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23909 machine_mode mode =
23910 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23912   /* Do not handle double-mode compares that go through the special path.  */
23913 if (mode == (TARGET_64BIT ? TImode : DImode))
23914 return false;
23916 if (SCALAR_FLOAT_MODE_P (mode))
23918 rtx compare_op;
23919 rtx_insn *compare_seq;
23921 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23923       /* Shortcut: the following common codes never translate
23924	 into carry flag compares.  */
23925 if (code == EQ || code == NE || code == UNEQ || code == LTGT
23926 || code == ORDERED || code == UNORDERED)
23927 return false;
23929       /* These comparisons require the zero flag; swap the operands so they won't need it.  */
23930 if ((code == GT || code == UNLE || code == LE || code == UNGT)
23931 && !TARGET_IEEE_FP)
23933 std::swap (op0, op1);
23934 code = swap_condition (code);
23937       /* Try to expand the comparison and verify that we end up with
23938	 a carry-flag-based comparison.  This fails only when we decide
23939	 to expand the comparison using arithmetic, which is not a
23940	 common scenario.  */
23941 start_sequence ();
23942 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23943 compare_seq = get_insns ();
23944 end_sequence ();
23946 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
23947 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
23948 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23949 else
23950 code = GET_CODE (compare_op);
23952 if (code != LTU && code != GEU)
23953 return false;
23955 emit_insn (compare_seq);
23956 *pop = compare_op;
23957 return true;
23960 if (!INTEGRAL_MODE_P (mode))
23961 return false;
23963 switch (code)
23965 case LTU:
23966 case GEU:
23967 break;
23969 /* Convert a==0 into (unsigned)a<1. */
23970 case EQ:
23971 case NE:
23972 if (op1 != const0_rtx)
23973 return false;
23974 op1 = const1_rtx;
23975 code = (code == EQ ? LTU : GEU);
23976 break;
23978 /* Convert a>b into b<a or a>=b-1. */
23979 case GTU:
23980 case LEU:
23981 if (CONST_INT_P (op1))
23983 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23984 	  /* Bail out on overflow.  We could still swap the operands, but
23985	     that would force loading the constant into a register.  */
23986 if (op1 == const0_rtx
23987 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23988 return false;
23989 code = (code == GTU ? GEU : LTU);
23991 else
23993 std::swap (op0, op1);
23994 code = (code == GTU ? LTU : GEU);
23996 break;
23998 /* Convert a>=0 into (unsigned)a<0x80000000. */
23999 case LT:
24000 case GE:
24001 if (mode == DImode || op1 != const0_rtx)
24002 return false;
24003 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
24004 code = (code == LT ? GEU : LTU);
24005 break;
24006 case LE:
24007 case GT:
24008 if (mode == DImode || op1 != constm1_rtx)
24009 return false;
24010 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
24011 code = (code == LE ? GEU : LTU);
24012 break;
24014 default:
24015 return false;
24017   /* Swapping operands may cause a constant to appear as the first operand.  */
24018 if (!nonimmediate_operand (op0, VOIDmode))
24020 if (!can_create_pseudo_p ())
24021 return false;
24022 op0 = force_reg (mode, op0);
24024 *pop = ix86_expand_compare (code, op0, op1);
24025 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
24026 return true;
24029 bool
24030 ix86_expand_int_movcc (rtx operands[])
24032 enum rtx_code code = GET_CODE (operands[1]), compare_code;
24033 rtx_insn *compare_seq;
24034 rtx compare_op;
24035 machine_mode mode = GET_MODE (operands[0]);
24036 bool sign_bit_compare_p = false;
24037 rtx op0 = XEXP (operands[1], 0);
24038 rtx op1 = XEXP (operands[1], 1);
24040 if (GET_MODE (op0) == TImode
24041 || (GET_MODE (op0) == DImode
24042 && !TARGET_64BIT))
24043 return false;
24045 start_sequence ();
24046 compare_op = ix86_expand_compare (code, op0, op1);
24047 compare_seq = get_insns ();
24048 end_sequence ();
24050 compare_code = GET_CODE (compare_op);
24052 if ((op1 == const0_rtx && (code == GE || code == LT))
24053 || (op1 == constm1_rtx && (code == GT || code == LE)))
24054 sign_bit_compare_p = true;
24056 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
24057 HImode insns, we'd be swallowed in word prefix ops. */
24059 if ((mode != HImode || TARGET_FAST_PREFIX)
24060 && (mode != (TARGET_64BIT ? TImode : DImode))
24061 && CONST_INT_P (operands[2])
24062 && CONST_INT_P (operands[3]))
24064 rtx out = operands[0];
24065 HOST_WIDE_INT ct = INTVAL (operands[2]);
24066 HOST_WIDE_INT cf = INTVAL (operands[3]);
24067 HOST_WIDE_INT diff;
24069 diff = ct - cf;
24070       /* Sign-bit compares are better done using shifts than using
24071	 sbb.  */
24072 if (sign_bit_compare_p
24073 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24075 /* Detect overlap between destination and compare sources. */
24076 rtx tmp = out;
24078 if (!sign_bit_compare_p)
24080 rtx flags;
24081 bool fpcmp = false;
24083 compare_code = GET_CODE (compare_op);
24085 flags = XEXP (compare_op, 0);
24087 if (GET_MODE (flags) == CCFPmode
24088 || GET_MODE (flags) == CCFPUmode)
24090 fpcmp = true;
24091 compare_code
24092 = ix86_fp_compare_code_to_integer (compare_code);
24095 	      /* To simplify the rest of the code, restrict to the GEU case.  */
24096 if (compare_code == LTU)
24098 std::swap (ct, cf);
24099 compare_code = reverse_condition (compare_code);
24100 code = reverse_condition (code);
24102 else
24104 if (fpcmp)
24105 PUT_CODE (compare_op,
24106 reverse_condition_maybe_unordered
24107 (GET_CODE (compare_op)));
24108 else
24109 PUT_CODE (compare_op,
24110 reverse_condition (GET_CODE (compare_op)));
24112 diff = ct - cf;
24114 if (reg_overlap_mentioned_p (out, op0)
24115 || reg_overlap_mentioned_p (out, op1))
24116 tmp = gen_reg_rtx (mode);
24118 if (mode == DImode)
24119 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
24120 else
24121 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
24122 flags, compare_op));
24124 else
24126 if (code == GT || code == GE)
24127 code = reverse_condition (code);
24128 else
24130 std::swap (ct, cf);
24131 diff = ct - cf;
24133 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
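	  /* TMP now holds an all-ones or all-zeros mask derived from the
	     comparison (via sbb or an arithmetic shift of the sign bit);
	     the arithmetic below turns that mask into CT or CF.  */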
24136 if (diff == 1)
24139 * cmpl op0,op1
24140 * sbbl dest,dest
24141 * [addl dest, ct]
24143 * Size 5 - 8.
24145 if (ct)
24146 tmp = expand_simple_binop (mode, PLUS,
24147 tmp, GEN_INT (ct),
24148 copy_rtx (tmp), 1, OPTAB_DIRECT);
24150 else if (cf == -1)
24153 * cmpl op0,op1
24154 * sbbl dest,dest
24155 * orl $ct, dest
24157 * Size 8.
24159 tmp = expand_simple_binop (mode, IOR,
24160 tmp, GEN_INT (ct),
24161 copy_rtx (tmp), 1, OPTAB_DIRECT);
24163 else if (diff == -1 && ct)
24166 * cmpl op0,op1
24167 * sbbl dest,dest
24168 * notl dest
24169 * [addl dest, cf]
24171 * Size 8 - 11.
24173 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24174 if (cf)
24175 tmp = expand_simple_binop (mode, PLUS,
24176 copy_rtx (tmp), GEN_INT (cf),
24177 copy_rtx (tmp), 1, OPTAB_DIRECT);
24179 else
24182 * cmpl op0,op1
24183 * sbbl dest,dest
24184 * [notl dest]
24185 * andl cf - ct, dest
24186 * [addl dest, ct]
24188 * Size 8 - 11.
24191 if (cf == 0)
24193 cf = ct;
24194 ct = 0;
24195 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24198 tmp = expand_simple_binop (mode, AND,
24199 copy_rtx (tmp),
24200 gen_int_mode (cf - ct, mode),
24201 copy_rtx (tmp), 1, OPTAB_DIRECT);
24202 if (ct)
24203 tmp = expand_simple_binop (mode, PLUS,
24204 copy_rtx (tmp), GEN_INT (ct),
24205 copy_rtx (tmp), 1, OPTAB_DIRECT);
24208 if (!rtx_equal_p (tmp, out))
24209 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
24211 return true;
24214 if (diff < 0)
24216 machine_mode cmp_mode = GET_MODE (op0);
24217 enum rtx_code new_code;
24219 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24221 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24223 	      /* We may be reversing an unordered compare to a normal compare, which
24224		 is not valid in general (we may convert a non-trapping condition
24225		 into a trapping one); however, on i386 we currently emit all
24226		 comparisons unordered.  */
24227 new_code = reverse_condition_maybe_unordered (code);
24229 else
24230 new_code = ix86_reverse_condition (code, cmp_mode);
24231 if (new_code != UNKNOWN)
24233 std::swap (ct, cf);
24234 diff = -diff;
24235 code = new_code;
24239 compare_code = UNKNOWN;
24240 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
24241 && CONST_INT_P (op1))
24243 if (op1 == const0_rtx
24244 && (code == LT || code == GE))
24245 compare_code = code;
24246 else if (op1 == constm1_rtx)
24248 if (code == LE)
24249 compare_code = LT;
24250 else if (code == GT)
24251 compare_code = GE;
24255 /* Optimize dest = (op0 < 0) ? -1 : cf. */
24256 if (compare_code != UNKNOWN
24257 && GET_MODE (op0) == GET_MODE (out)
24258 && (cf == -1 || ct == -1))
24260 /* If lea code below could be used, only optimize
24261 if it results in a 2 insn sequence. */
24263 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
24264 || diff == 3 || diff == 5 || diff == 9)
24265 || (compare_code == LT && ct == -1)
24266 || (compare_code == GE && cf == -1))
24269 * notl op1 (if necessary)
24270 * sarl $31, op1
24271 * orl cf, op1
24273 if (ct != -1)
24275 cf = ct;
24276 ct = -1;
24277 code = reverse_condition (code);
24280 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24282 out = expand_simple_binop (mode, IOR,
24283 out, GEN_INT (cf),
24284 out, 1, OPTAB_DIRECT);
24285 if (out != operands[0])
24286 emit_move_insn (operands[0], out);
24288 return true;
24293 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
24294 || diff == 3 || diff == 5 || diff == 9)
24295 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
24296 && (mode != DImode
24297 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
24300 * xorl dest,dest
24301 * cmpl op1,op2
24302 * setcc dest
24303 * lea cf(dest*(ct-cf)),dest
24305 * Size 14.
24307 * This also catches the degenerate setcc-only case.
24310 rtx tmp;
24311 int nops;
24313 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24315 nops = 0;
24316 	  /* On x86_64 the lea instruction operates on Pmode, so we need
24317	     to get the arithmetic done in the proper mode to match.  */
24318 if (diff == 1)
24319 tmp = copy_rtx (out);
24320 else
24322 rtx out1;
24323 out1 = copy_rtx (out);
24324 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
24325 nops++;
24326 if (diff & 1)
24328 tmp = gen_rtx_PLUS (mode, tmp, out1);
24329 nops++;
24332 if (cf != 0)
24334 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
24335 nops++;
24337 if (!rtx_equal_p (tmp, out))
24339 if (nops == 1)
24340 out = force_operand (tmp, copy_rtx (out));
24341 else
24342 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
24344 if (!rtx_equal_p (out, operands[0]))
24345 emit_move_insn (operands[0], copy_rtx (out));
24347 return true;
24351 * General case: Jumpful:
24352 * xorl dest,dest cmpl op1, op2
24353 * cmpl op1, op2 movl ct, dest
24354 * setcc dest jcc 1f
24355 * decl dest movl cf, dest
24356 * andl (cf-ct),dest 1:
24357 * addl ct,dest
24359 * Size 20. Size 14.
24361    * This is reasonably steep, but branch mispredict costs are
24362    * high on modern CPUs, so only consider failing when optimizing
24363    * for space.
24366 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24367 && BRANCH_COST (optimize_insn_for_speed_p (),
24368 false) >= 2)
24370 if (cf == 0)
24372 machine_mode cmp_mode = GET_MODE (op0);
24373 enum rtx_code new_code;
24375 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24377 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24379 	      /* We may be reversing an unordered compare to a normal compare,
24380		 which is not valid in general (we may convert a non-trapping
24381		 condition into a trapping one); however, on i386 we currently
24382		 emit all comparisons unordered.  */
24383 new_code = reverse_condition_maybe_unordered (code);
24385 else
24387 new_code = ix86_reverse_condition (code, cmp_mode);
24388 if (compare_code != UNKNOWN && new_code != UNKNOWN)
24389 compare_code = reverse_condition (compare_code);
24392 if (new_code != UNKNOWN)
24394 cf = ct;
24395 ct = 0;
24396 code = new_code;
24400 if (compare_code != UNKNOWN)
24402 /* notl op1 (if needed)
24403 sarl $31, op1
24404 andl (cf-ct), op1
24405 addl ct, op1
24407 For x < 0 (resp. x <= -1) there will be no notl,
24408 so if possible swap the constants to get rid of the
24409 complement.
24410 True/false will be -1/0 while code below (store flag
24411 followed by decrement) is 0/-1, so the constants need
24412 to be exchanged once more. */
24414 if (compare_code == GE || !cf)
24416 code = reverse_condition (code);
24417 compare_code = LT;
24419 else
24420 std::swap (ct, cf);
24422 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24424 else
24426 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24428 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
24429 constm1_rtx,
24430 copy_rtx (out), 1, OPTAB_DIRECT);
24433 out = expand_simple_binop (mode, AND, copy_rtx (out),
24434 gen_int_mode (cf - ct, mode),
24435 copy_rtx (out), 1, OPTAB_DIRECT);
24436 if (ct)
24437 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
24438 copy_rtx (out), 1, OPTAB_DIRECT);
24439 if (!rtx_equal_p (out, operands[0]))
24440 emit_move_insn (operands[0], copy_rtx (out));
24442 return true;
24446 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24448 /* Try a few things more with specific constants and a variable. */
24450 optab op;
24451 rtx var, orig_out, out, tmp;
24453 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
24454 return false;
24456 /* If one of the two operands is an interesting constant, load a
24457 constant with the above and mask it in with a logical operation. */
24459 if (CONST_INT_P (operands[2]))
24461 var = operands[3];
24462 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
24463 operands[3] = constm1_rtx, op = and_optab;
24464 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
24465 operands[3] = const0_rtx, op = ior_optab;
24466 else
24467 return false;
24469 else if (CONST_INT_P (operands[3]))
24471 var = operands[2];
24472 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
24473 operands[2] = constm1_rtx, op = and_optab;
24474 	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
24475 operands[2] = const0_rtx, op = ior_optab;
24476 else
24477 return false;
24479 else
24480 return false;
24482 orig_out = operands[0];
24483 tmp = gen_reg_rtx (mode);
24484 operands[0] = tmp;
24486 /* Recurse to get the constant loaded. */
24487 if (!ix86_expand_int_movcc (operands))
24488 return false;
24490 /* Mask in the interesting variable. */
24491 out = expand_binop (mode, op, var, tmp, orig_out, 0,
24492 OPTAB_WIDEN);
24493 if (!rtx_equal_p (out, orig_out))
24494 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
24496 return true;
24500 * For comparison with above,
24502 * movl cf,dest
24503 * movl ct,tmp
24504 * cmpl op1,op2
24505 * cmovcc tmp,dest
24507 * Size 15.
24510 if (! nonimmediate_operand (operands[2], mode))
24511 operands[2] = force_reg (mode, operands[2]);
24512 if (! nonimmediate_operand (operands[3], mode))
24513 operands[3] = force_reg (mode, operands[3]);
24515 if (! register_operand (operands[2], VOIDmode)
24516 && (mode == QImode
24517 || ! register_operand (operands[3], VOIDmode)))
24518 operands[2] = force_reg (mode, operands[2]);
24520 if (mode == QImode
24521 && ! register_operand (operands[3], VOIDmode))
24522 operands[3] = force_reg (mode, operands[3]);
24524 emit_insn (compare_seq);
24525 emit_insn (gen_rtx_SET (operands[0],
24526 gen_rtx_IF_THEN_ELSE (mode,
24527 compare_op, operands[2],
24528 operands[3])));
24529 return true;
24532 /* Swap, force into registers, or otherwise massage the two operands
24533 to an sse comparison with a mask result. Thus we differ a bit from
24534 ix86_prepare_fp_compare_args which expects to produce a flags result.
24536 The DEST operand exists to help determine whether to commute commutative
24537 operators. The POP0/POP1 operands are updated in place. The new
24538 comparison code is returned, or UNKNOWN if not implementable. */
24540 static enum rtx_code
24541 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
24542 rtx *pop0, rtx *pop1)
24544 switch (code)
24546 case LTGT:
24547 case UNEQ:
24548 /* AVX supports all the needed comparisons. */
24549 if (TARGET_AVX)
24550 break;
24551 /* We have no LTGT as an operator. We could implement it with
24552 NE & ORDERED, but this requires an extra temporary. It's
24553 not clear that it's worth it. */
24554 return UNKNOWN;
24556 case LT:
24557 case LE:
24558 case UNGT:
24559 case UNGE:
24560 /* These are supported directly. */
24561 break;
24563 case EQ:
24564 case NE:
24565 case UNORDERED:
24566 case ORDERED:
24567 /* AVX has 3 operand comparisons, no need to swap anything. */
24568 if (TARGET_AVX)
24569 break;
24570 /* For commutative operators, try to canonicalize the destination
24571 operand to be first in the comparison - this helps reload to
24572 avoid extra moves. */
24573 if (!dest || !rtx_equal_p (dest, *pop1))
24574 break;
24575 /* FALLTHRU */
24577 case GE:
24578 case GT:
24579 case UNLE:
24580 case UNLT:
24581 /* These are not supported directly before AVX, and furthermore
24582 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
24583 comparison operands to transform into something that is
24584 supported. */
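/* Illustrative example: a GE b is rewritten as b LE a (swap_condition of GE
   is LE), and a UNLT b as b UNGT a, both of which map directly onto the
   available cmpps/cmppd predicates.  */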
24585 std::swap (*pop0, *pop1);
24586 code = swap_condition (code);
24587 break;
24589 default:
24590 gcc_unreachable ();
24593 return code;
24596 /* Detect conditional moves that exactly match min/max operational
24597 semantics. Note that this is IEEE safe, as long as we don't
24598 interchange the operands.
24600 Returns FALSE if this conditional move doesn't match a MIN/MAX,
24601 and TRUE if the operation is successful and instructions are emitted. */
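/* Illustrative shape being matched: for
     dest = (a < b) ? a : b
   the compare operands pair up with the move arms, so the whole conditional
   move can become a single IEEE-aware min; for UNGE the arms are swapped
   first and the same pairing applies.  */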
24603 static bool
24604 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
24605 rtx cmp_op1, rtx if_true, rtx if_false)
24607 machine_mode mode;
24608 bool is_min;
24609 rtx tmp;
24611 if (code == LT)
24613 else if (code == UNGE)
24614 std::swap (if_true, if_false);
24615 else
24616 return false;
24618 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
24619 is_min = true;
24620 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
24621 is_min = false;
24622 else
24623 return false;
24625 mode = GET_MODE (dest);
24627 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
24628 but MODE may be a vector mode and thus not appropriate. */
24629 if (!flag_finite_math_only || flag_signed_zeros)
24631 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
24632 rtvec v;
24634 if_true = force_reg (mode, if_true);
24635 v = gen_rtvec (2, if_true, if_false);
24636 tmp = gen_rtx_UNSPEC (mode, v, u);
24638 else
24640 code = is_min ? SMIN : SMAX;
24641 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
24644 emit_insn (gen_rtx_SET (dest, tmp));
24645 return true;
24648 /* Expand an sse vector comparison. Return the register with the result. */
24650 static rtx
24651 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
24652 rtx op_true, rtx op_false)
24654 machine_mode mode = GET_MODE (dest);
24655 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
24657 /* In the general case the result of the comparison can have a different type from that of the operands. */
24658 machine_mode cmp_mode;
24660 /* In AVX512F the result of comparison is an integer mask. */
24661 bool maskcmp = false;
24662 rtx x;
24664 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
24666 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
24667 gcc_assert (cmp_mode != BLKmode);
24669 maskcmp = true;
24671 else
24672 cmp_mode = cmp_ops_mode;
24675 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
24676 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
24677 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
24679 if (optimize
24680 || (maskcmp && cmp_mode != mode)
24681 || (op_true && reg_overlap_mentioned_p (dest, op_true))
24682 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
24683 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
24685 /* Compare patterns for int modes are unspec in AVX512F only. */
24686 if (maskcmp && (code == GT || code == EQ))
24688 rtx (*gen)(rtx, rtx, rtx);
24690 switch (cmp_ops_mode)
24692 case V64QImode:
24693 gcc_assert (TARGET_AVX512BW);
24694 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
24695 break;
24696 case V32HImode:
24697 gcc_assert (TARGET_AVX512BW);
24698 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
24699 break;
24700 case V16SImode:
24701 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
24702 break;
24703 case V8DImode:
24704 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
24705 break;
24706 default:
24707 gen = NULL;
24710 if (gen)
24712 emit_insn (gen (dest, cmp_op0, cmp_op1));
24713 return dest;
24716 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
24718 if (cmp_mode != mode && !maskcmp)
24720 x = force_reg (cmp_ops_mode, x);
24721 convert_move (dest, x, false);
24723 else
24724 emit_insn (gen_rtx_SET (dest, x));
24726 return dest;
24729 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
24730 operations. This is used for both scalar and vector conditional moves. */
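/* In the fallback at the bottom this is the classic mask-blend identity
   (illustrative): dest = (cmp & op_true) | (~cmp & op_false), where CMP is
   an all-ones/all-zeros element mask produced by the SSE compare.  */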
24732 void
24733 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
24735 machine_mode mode = GET_MODE (dest);
24736 machine_mode cmpmode = GET_MODE (cmp);
24738 /* In AVX512F the result of comparison is an integer mask. */
24739 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
24741 rtx t2, t3, x;
24743 /* If we have an integer mask and an FP value then we need
24744 to cast the mask to the FP mode. */
24745 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
24747 cmp = force_reg (cmpmode, cmp);
24748 cmp = gen_rtx_SUBREG (mode, cmp, 0);
24751 if (vector_all_ones_operand (op_true, mode)
24752 && rtx_equal_p (op_false, CONST0_RTX (mode))
24753 && !maskcmp)
24755 emit_insn (gen_rtx_SET (dest, cmp));
24757 else if (op_false == CONST0_RTX (mode)
24758 && !maskcmp)
24760 op_true = force_reg (mode, op_true);
24761 x = gen_rtx_AND (mode, cmp, op_true);
24762 emit_insn (gen_rtx_SET (dest, x));
24764 else if (op_true == CONST0_RTX (mode)
24765 && !maskcmp)
24767 op_false = force_reg (mode, op_false);
24768 x = gen_rtx_NOT (mode, cmp);
24769 x = gen_rtx_AND (mode, x, op_false);
24770 emit_insn (gen_rtx_SET (dest, x));
24772 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
24773 && !maskcmp)
24775 op_false = force_reg (mode, op_false);
24776 x = gen_rtx_IOR (mode, cmp, op_false);
24777 emit_insn (gen_rtx_SET (dest, x));
24779 else if (TARGET_XOP
24780 && !maskcmp)
24782 op_true = force_reg (mode, op_true);
24784 if (!nonimmediate_operand (op_false, mode))
24785 op_false = force_reg (mode, op_false);
24787 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24788 op_true,
24789 op_false)));
24791 else
24793 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24794 rtx d = dest;
24796 if (!nonimmediate_operand (op_true, mode))
24797 op_true = force_reg (mode, op_true);
24799 op_false = force_reg (mode, op_false);
24801 switch (mode)
24803 case V4SFmode:
24804 if (TARGET_SSE4_1)
24805 gen = gen_sse4_1_blendvps;
24806 break;
24807 case V2DFmode:
24808 if (TARGET_SSE4_1)
24809 gen = gen_sse4_1_blendvpd;
24810 break;
24811 case V16QImode:
24812 case V8HImode:
24813 case V4SImode:
24814 case V2DImode:
24815 if (TARGET_SSE4_1)
24817 gen = gen_sse4_1_pblendvb;
24818 if (mode != V16QImode)
24819 d = gen_reg_rtx (V16QImode);
24820 op_false = gen_lowpart (V16QImode, op_false);
24821 op_true = gen_lowpart (V16QImode, op_true);
24822 cmp = gen_lowpart (V16QImode, cmp);
24824 break;
24825 case V8SFmode:
24826 if (TARGET_AVX)
24827 gen = gen_avx_blendvps256;
24828 break;
24829 case V4DFmode:
24830 if (TARGET_AVX)
24831 gen = gen_avx_blendvpd256;
24832 break;
24833 case V32QImode:
24834 case V16HImode:
24835 case V8SImode:
24836 case V4DImode:
24837 if (TARGET_AVX2)
24839 gen = gen_avx2_pblendvb;
24840 if (mode != V32QImode)
24841 d = gen_reg_rtx (V32QImode);
24842 op_false = gen_lowpart (V32QImode, op_false);
24843 op_true = gen_lowpart (V32QImode, op_true);
24844 cmp = gen_lowpart (V32QImode, cmp);
24846 break;
24848 case V64QImode:
24849 gen = gen_avx512bw_blendmv64qi;
24850 break;
24851 case V32HImode:
24852 gen = gen_avx512bw_blendmv32hi;
24853 break;
24854 case V16SImode:
24855 gen = gen_avx512f_blendmv16si;
24856 break;
24857 case V8DImode:
24858 gen = gen_avx512f_blendmv8di;
24859 break;
24860 case V8DFmode:
24861 gen = gen_avx512f_blendmv8df;
24862 break;
24863 case V16SFmode:
24864 gen = gen_avx512f_blendmv16sf;
24865 break;
24867 default:
24868 break;
24871 if (gen != NULL)
24873 emit_insn (gen (d, op_false, op_true, cmp));
24874 if (d != dest)
24875 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
24877 else
24879 op_true = force_reg (mode, op_true);
24881 t2 = gen_reg_rtx (mode);
24882 if (optimize)
24883 t3 = gen_reg_rtx (mode);
24884 else
24885 t3 = dest;
24887 x = gen_rtx_AND (mode, op_true, cmp);
24888 emit_insn (gen_rtx_SET (t2, x));
24890 x = gen_rtx_NOT (mode, cmp);
24891 x = gen_rtx_AND (mode, x, op_false);
24892 emit_insn (gen_rtx_SET (t3, x));
24894 x = gen_rtx_IOR (mode, t3, t2);
24895 emit_insn (gen_rtx_SET (dest, x));
24900 /* Expand a floating-point conditional move. Return true if successful. */
24902 bool
24903 ix86_expand_fp_movcc (rtx operands[])
24905 machine_mode mode = GET_MODE (operands[0]);
24906 enum rtx_code code = GET_CODE (operands[1]);
24907 rtx tmp, compare_op;
24908 rtx op0 = XEXP (operands[1], 0);
24909 rtx op1 = XEXP (operands[1], 1);
24911 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24913 machine_mode cmode;
24915 /* Since we've no cmove for sse registers, don't force bad register
24916 allocation just to gain access to it. Deny movcc when the
24917 comparison mode doesn't match the move mode. */
24918 cmode = GET_MODE (op0);
24919 if (cmode == VOIDmode)
24920 cmode = GET_MODE (op1);
24921 if (cmode != mode)
24922 return false;
24924 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24925 if (code == UNKNOWN)
24926 return false;
24928 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24929 operands[2], operands[3]))
24930 return true;
24932 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24933 operands[2], operands[3]);
24934 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24935 return true;
24938 if (GET_MODE (op0) == TImode
24939 || (GET_MODE (op0) == DImode
24940 && !TARGET_64BIT))
24941 return false;
24943 /* The floating point conditional move instructions don't directly
24944 support conditions resulting from a signed integer comparison. */
24946 compare_op = ix86_expand_compare (code, op0, op1);
24947 if (!fcmov_comparison_operator (compare_op, VOIDmode))
24949 tmp = gen_reg_rtx (QImode);
24950 ix86_expand_setcc (tmp, code, op0, op1);
24952 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24955 emit_insn (gen_rtx_SET (operands[0],
24956 gen_rtx_IF_THEN_ELSE (mode, compare_op,
24957 operands[2], operands[3])));
24959 return true;
24962 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24964 static int
24965 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24967 switch (code)
24969 case EQ:
24970 return 0;
24971 case LT:
24972 case LTU:
24973 return 1;
24974 case LE:
24975 case LEU:
24976 return 2;
24977 case NE:
24978 return 4;
24979 case GE:
24980 case GEU:
24981 return 5;
24982 case GT:
24983 case GTU:
24984 return 6;
24985 default:
24986 gcc_unreachable ();
24990 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
24992 static int
24993 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24995 switch (code)
24997 case EQ:
24998 return 0x00;
24999 case NE:
25000 return 0x04;
25001 case GT:
25002 return 0x0e;
25003 case LE:
25004 return 0x02;
25005 case GE:
25006 return 0x0d;
25007 case LT:
25008 return 0x01;
25009 case UNLE:
25010 return 0x0a;
25011 case UNLT:
25012 return 0x09;
25013 case UNGE:
25014 return 0x05;
25015 case UNGT:
25016 return 0x06;
25017 case UNEQ:
25018 return 0x18;
25019 case LTGT:
25020 return 0x0c;
25021 case ORDERED:
25022 return 0x07;
25023 case UNORDERED:
25024 return 0x03;
25025 default:
25026 gcc_unreachable ();
25030 /* Return immediate value to be used in UNSPEC_PCMP
25031 for comparison CODE in MODE. */
25033 static int
25034 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
25036 if (FLOAT_MODE_P (mode))
25037 return ix86_fp_cmp_code_to_pcmp_immediate (code);
25038 return ix86_int_cmp_code_to_pcmp_immediate (code);
25041 /* Expand AVX-512 vector comparison. */
25043 bool
25044 ix86_expand_mask_vec_cmp (rtx operands[])
25046 machine_mode mask_mode = GET_MODE (operands[0]);
25047 machine_mode cmp_mode = GET_MODE (operands[2]);
25048 enum rtx_code code = GET_CODE (operands[1]);
25049 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
25050 int unspec_code;
25051 rtx unspec;
25053 switch (code)
25055 case LEU:
25056 case GTU:
25057 case GEU:
25058 case LTU:
25059 unspec_code = UNSPEC_UNSIGNED_PCMP;
25060 break;
25062 default:
25063 unspec_code = UNSPEC_PCMP;
25066 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
25067 operands[3], imm),
25068 unspec_code);
25069 emit_insn (gen_rtx_SET (operands[0], unspec));
25071 return true;
25074 /* Expand fp vector comparison. */
25076 bool
25077 ix86_expand_fp_vec_cmp (rtx operands[])
25079 enum rtx_code code = GET_CODE (operands[1]);
25080 rtx cmp;
25082 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25083 &operands[2], &operands[3]);
25084 if (code == UNKNOWN)
25086 rtx temp;
25087 switch (GET_CODE (operands[1]))
25089 case LTGT:
25090 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
25091 operands[3], NULL, NULL);
25092 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
25093 operands[3], NULL, NULL);
25094 code = AND;
25095 break;
25096 case UNEQ:
25097 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
25098 operands[3], NULL, NULL);
25099 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
25100 operands[3], NULL, NULL);
25101 code = IOR;
25102 break;
25103 default:
25104 gcc_unreachable ();
25106 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25107 OPTAB_DIRECT);
25109 else
25110 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
25111 operands[1], operands[2]);
25113 if (operands[0] != cmp)
25114 emit_move_insn (operands[0], cmp);
25116 return true;
25119 static rtx
25120 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
25121 rtx op_true, rtx op_false, bool *negate)
25123 machine_mode data_mode = GET_MODE (dest);
25124 machine_mode mode = GET_MODE (cop0);
25125 rtx x;
25127 *negate = false;
25129 /* XOP supports all of the comparisons on all 128-bit vector int types. */
25130 if (TARGET_XOP
25131 && (mode == V16QImode || mode == V8HImode
25132 || mode == V4SImode || mode == V2DImode))
25134 else
25136 /* Canonicalize the comparison to EQ, GT, GTU. */
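/* Illustrative examples: LE becomes GT with the result negated, GE becomes
   LT negated and is then operand-swapped into GT, and LTU is operand-swapped
   into GTU.  */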
25137 switch (code)
25139 case EQ:
25140 case GT:
25141 case GTU:
25142 break;
25144 case NE:
25145 case LE:
25146 case LEU:
25147 code = reverse_condition (code);
25148 *negate = true;
25149 break;
25151 case GE:
25152 case GEU:
25153 code = reverse_condition (code);
25154 *negate = true;
25155 /* FALLTHRU */
25157 case LT:
25158 case LTU:
25159 std::swap (cop0, cop1);
25160 code = swap_condition (code);
25161 break;
25163 default:
25164 gcc_unreachable ();
25167 /* Only SSE4.1/SSE4.2 supports V2DImode. */
25168 if (mode == V2DImode)
25170 switch (code)
25172 case EQ:
25173 /* SSE4.1 supports EQ. */
25174 if (!TARGET_SSE4_1)
25175 return NULL;
25176 break;
25178 case GT:
25179 case GTU:
25180 /* SSE4.2 supports GT/GTU. */
25181 if (!TARGET_SSE4_2)
25182 return NULL;
25183 break;
25185 default:
25186 gcc_unreachable ();
25190 /* Unsigned parallel compare is not supported by the hardware.
25191 Play some tricks to turn this into a signed comparison
25192 against 0. */
25193 if (code == GTU)
25195 cop0 = force_reg (mode, cop0);
25197 switch (mode)
25199 case V16SImode:
25200 case V8DImode:
25201 case V8SImode:
25202 case V4DImode:
25203 case V4SImode:
25204 case V2DImode:
25206 rtx t1, t2, mask;
25207 rtx (*gen_sub3) (rtx, rtx, rtx);
25209 switch (mode)
25211 case V16SImode: gen_sub3 = gen_subv16si3; break;
25212 case V8DImode: gen_sub3 = gen_subv8di3; break;
25213 case V8SImode: gen_sub3 = gen_subv8si3; break;
25214 case V4DImode: gen_sub3 = gen_subv4di3; break;
25215 case V4SImode: gen_sub3 = gen_subv4si3; break;
25216 case V2DImode: gen_sub3 = gen_subv2di3; break;
25217 default:
25218 gcc_unreachable ();
25220 /* Subtract (-(INT MAX) - 1) from both operands to make
25221 them signed. */
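/* Illustrative: x >u y holds iff (x - 0x80...0) >s (y - 0x80...0), i.e.
   biasing both operands by INT_MIN turns the unsigned comparison into the
   signed GT that the hardware provides.  */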
25222 mask = ix86_build_signbit_mask (mode, true, false);
25223 t1 = gen_reg_rtx (mode);
25224 emit_insn (gen_sub3 (t1, cop0, mask));
25226 t2 = gen_reg_rtx (mode);
25227 emit_insn (gen_sub3 (t2, cop1, mask));
25229 cop0 = t1;
25230 cop1 = t2;
25231 code = GT;
25233 break;
25235 case V64QImode:
25236 case V32HImode:
25237 case V32QImode:
25238 case V16HImode:
25239 case V16QImode:
25240 case V8HImode:
25241 /* Perform a parallel unsigned saturating subtraction. */
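/* Illustrative: x >u y holds iff the saturating difference (x -us y) is
   nonzero, so the test becomes an EQ against zero with the NEGATE flag
   toggled.  */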
25242 x = gen_reg_rtx (mode);
25243 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
25244 cop1)));
25246 cop0 = x;
25247 cop1 = CONST0_RTX (mode);
25248 code = EQ;
25249 *negate = !*negate;
25250 break;
25252 default:
25253 gcc_unreachable ();
25258 if (*negate)
25259 std::swap (op_true, op_false);
25261 /* Allow the comparison to be done in one mode, but the movcc to
25262 happen in another mode. */
25263 if (data_mode == mode)
25265 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
25266 op_true, op_false);
25268 else
25270 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
25271 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
25272 op_true, op_false);
25273 if (GET_MODE (x) == mode)
25274 x = gen_lowpart (data_mode, x);
25277 return x;
25280 /* Expand integer vector comparison. */
25282 bool
25283 ix86_expand_int_vec_cmp (rtx operands[])
25285 rtx_code code = GET_CODE (operands[1]);
25286 bool negate = false;
25287 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
25288 operands[3], NULL, NULL, &negate);
25290 if (!cmp)
25291 return false;
25293 if (negate)
25294 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
25295 CONST0_RTX (GET_MODE (cmp)),
25296 NULL, NULL, &negate);
25298 gcc_assert (!negate);
25300 if (operands[0] != cmp)
25301 emit_move_insn (operands[0], cmp);
25303 return true;
25306 /* Expand a floating-point vector conditional move; a vcond operation
25307 rather than a movcc operation. */
25309 bool
25310 ix86_expand_fp_vcond (rtx operands[])
25312 enum rtx_code code = GET_CODE (operands[3]);
25313 rtx cmp;
25315 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25316 &operands[4], &operands[5]);
25317 if (code == UNKNOWN)
25319 rtx temp;
25320 switch (GET_CODE (operands[3]))
25322 case LTGT:
25323 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
25324 operands[5], operands[0], operands[0]);
25325 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
25326 operands[5], operands[1], operands[2]);
25327 code = AND;
25328 break;
25329 case UNEQ:
25330 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
25331 operands[5], operands[0], operands[0]);
25332 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
25333 operands[5], operands[1], operands[2]);
25334 code = IOR;
25335 break;
25336 default:
25337 gcc_unreachable ();
25339 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25340 OPTAB_DIRECT);
25341 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25342 return true;
25345 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
25346 operands[5], operands[1], operands[2]))
25347 return true;
25349 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
25350 operands[1], operands[2]);
25351 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25352 return true;
25355 /* Expand a signed/unsigned integral vector conditional move. */
25357 bool
25358 ix86_expand_int_vcond (rtx operands[])
25360 machine_mode data_mode = GET_MODE (operands[0]);
25361 machine_mode mode = GET_MODE (operands[4]);
25362 enum rtx_code code = GET_CODE (operands[3]);
25363 bool negate = false;
25364 rtx x, cop0, cop1;
25366 cop0 = operands[4];
25367 cop1 = operands[5];
25369 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
25370 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
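/* Illustrative: an arithmetic right shift by the element width minus one
   smears the sign bit across the element (-1 when negative, 0 otherwise),
   while the logical shift leaves just 1 or 0, so no compare is needed.  */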
25371 if ((code == LT || code == GE)
25372 && data_mode == mode
25373 && cop1 == CONST0_RTX (mode)
25374 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
25375 && GET_MODE_UNIT_SIZE (data_mode) > 1
25376 && GET_MODE_UNIT_SIZE (data_mode) <= 8
25377 && (GET_MODE_SIZE (data_mode) == 16
25378 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
25380 rtx negop = operands[2 - (code == LT)];
25381 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
25382 if (negop == CONST1_RTX (data_mode))
25384 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
25385 operands[0], 1, OPTAB_DIRECT);
25386 if (res != operands[0])
25387 emit_move_insn (operands[0], res);
25388 return true;
25390 else if (GET_MODE_INNER (data_mode) != DImode
25391 && vector_all_ones_operand (negop, data_mode))
25393 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
25394 operands[0], 0, OPTAB_DIRECT);
25395 if (res != operands[0])
25396 emit_move_insn (operands[0], res);
25397 return true;
25401 if (!nonimmediate_operand (cop1, mode))
25402 cop1 = force_reg (mode, cop1);
25403 if (!general_operand (operands[1], data_mode))
25404 operands[1] = force_reg (data_mode, operands[1]);
25405 if (!general_operand (operands[2], data_mode))
25406 operands[2] = force_reg (data_mode, operands[2]);
25408 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
25409 operands[1], operands[2], &negate);
25411 if (!x)
25412 return false;
25414 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
25415 operands[2-negate]);
25416 return true;
25419 /* AVX512F does support 64-byte integer vector operations,
25420 thus the longest vector we are faced with is V64QImode. */
25421 #define MAX_VECT_LEN 64
25423 struct expand_vec_perm_d
25425 rtx target, op0, op1;
25426 unsigned char perm[MAX_VECT_LEN];
25427 machine_mode vmode;
25428 unsigned char nelt;
25429 bool one_operand_p;
25430 bool testing_p;
25433 static bool
25434 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
25435 struct expand_vec_perm_d *d)
25437 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
25438 expander, so args are either in d, or in op0, op1 etc. */
25439 machine_mode mode = GET_MODE (d ? d->op0 : op0);
25440 machine_mode maskmode = mode;
25441 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
25443 switch (mode)
25445 case V8HImode:
25446 if (TARGET_AVX512VL && TARGET_AVX512BW)
25447 gen = gen_avx512vl_vpermi2varv8hi3;
25448 break;
25449 case V16HImode:
25450 if (TARGET_AVX512VL && TARGET_AVX512BW)
25451 gen = gen_avx512vl_vpermi2varv16hi3;
25452 break;
25453 case V64QImode:
25454 if (TARGET_AVX512VBMI)
25455 gen = gen_avx512bw_vpermi2varv64qi3;
25456 break;
25457 case V32HImode:
25458 if (TARGET_AVX512BW)
25459 gen = gen_avx512bw_vpermi2varv32hi3;
25460 break;
25461 case V4SImode:
25462 if (TARGET_AVX512VL)
25463 gen = gen_avx512vl_vpermi2varv4si3;
25464 break;
25465 case V8SImode:
25466 if (TARGET_AVX512VL)
25467 gen = gen_avx512vl_vpermi2varv8si3;
25468 break;
25469 case V16SImode:
25470 if (TARGET_AVX512F)
25471 gen = gen_avx512f_vpermi2varv16si3;
25472 break;
25473 case V4SFmode:
25474 if (TARGET_AVX512VL)
25476 gen = gen_avx512vl_vpermi2varv4sf3;
25477 maskmode = V4SImode;
25479 break;
25480 case V8SFmode:
25481 if (TARGET_AVX512VL)
25483 gen = gen_avx512vl_vpermi2varv8sf3;
25484 maskmode = V8SImode;
25486 break;
25487 case V16SFmode:
25488 if (TARGET_AVX512F)
25490 gen = gen_avx512f_vpermi2varv16sf3;
25491 maskmode = V16SImode;
25493 break;
25494 case V2DImode:
25495 if (TARGET_AVX512VL)
25496 gen = gen_avx512vl_vpermi2varv2di3;
25497 break;
25498 case V4DImode:
25499 if (TARGET_AVX512VL)
25500 gen = gen_avx512vl_vpermi2varv4di3;
25501 break;
25502 case V8DImode:
25503 if (TARGET_AVX512F)
25504 gen = gen_avx512f_vpermi2varv8di3;
25505 break;
25506 case V2DFmode:
25507 if (TARGET_AVX512VL)
25509 gen = gen_avx512vl_vpermi2varv2df3;
25510 maskmode = V2DImode;
25512 break;
25513 case V4DFmode:
25514 if (TARGET_AVX512VL)
25516 gen = gen_avx512vl_vpermi2varv4df3;
25517 maskmode = V4DImode;
25519 break;
25520 case V8DFmode:
25521 if (TARGET_AVX512F)
25523 gen = gen_avx512f_vpermi2varv8df3;
25524 maskmode = V8DImode;
25526 break;
25527 default:
25528 break;
25531 if (gen == NULL)
25532 return false;
25534 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
25535 expander, so args are either in d, or in op0, op1 etc. */
25536 if (d)
25538 rtx vec[64];
25539 target = d->target;
25540 op0 = d->op0;
25541 op1 = d->op1;
25542 for (int i = 0; i < d->nelt; ++i)
25543 vec[i] = GEN_INT (d->perm[i]);
25544 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
25547 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
25548 return true;
25551 /* Expand a variable vector permutation. */
25553 void
25554 ix86_expand_vec_perm (rtx operands[])
25556 rtx target = operands[0];
25557 rtx op0 = operands[1];
25558 rtx op1 = operands[2];
25559 rtx mask = operands[3];
25560 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
25561 machine_mode mode = GET_MODE (op0);
25562 machine_mode maskmode = GET_MODE (mask);
25563 int w, e, i;
25564 bool one_operand_shuffle = rtx_equal_p (op0, op1);
25566 /* Number of elements in the vector. */
25567 w = GET_MODE_NUNITS (mode);
25568 e = GET_MODE_UNIT_SIZE (mode);
25569 gcc_assert (w <= 64);
25571 if (TARGET_AVX512F && one_operand_shuffle)
25573 rtx (*gen) (rtx, rtx, rtx) = NULL;
25574 switch (mode)
25576 case V16SImode:
25577 gen = gen_avx512f_permvarv16si;
25578 break;
25579 case V16SFmode:
25580 gen = gen_avx512f_permvarv16sf;
25581 break;
25582 case V8DImode:
25583 gen = gen_avx512f_permvarv8di;
25584 break;
25585 case V8DFmode:
25586 gen = gen_avx512f_permvarv8df;
25587 break;
25588 default:
25589 break;
25591 if (gen != NULL)
25593 emit_insn (gen (target, op0, mask));
25594 return;
25598 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
25599 return;
25601 if (TARGET_AVX2)
25603 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
25605 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
25606 a constant shuffle operand. With a tiny bit of effort we can
25607 use VPERMD instead. A re-interpretation stall for V4DFmode is
25608 unfortunate but there's no avoiding it.
25609 Similarly, for V16HImode we don't have instructions for variable
25610 shuffling, while for V32QImode we can, after preparing suitable
25611 masks, use vpshufb; vpshufb; vpermq; vpor. */
25613 if (mode == V16HImode)
25615 maskmode = mode = V32QImode;
25616 w = 32;
25617 e = 1;
25619 else
25621 maskmode = mode = V8SImode;
25622 w = 8;
25623 e = 4;
25625 t1 = gen_reg_rtx (maskmode);
25627 /* Replicate the low bits of the V4DImode mask into V8SImode:
25628 mask = { A B C D }
25629 t1 = { A A B B C C D D }. */
25630 for (i = 0; i < w / 2; ++i)
25631 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
25632 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25633 vt = force_reg (maskmode, vt);
25634 mask = gen_lowpart (maskmode, mask);
25635 if (maskmode == V8SImode)
25636 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
25637 else
25638 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
25640 /* Multiply the shuffle indices by two. */
25641 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
25642 OPTAB_DIRECT);
25644 /* Add one to the odd shuffle indices:
25645 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
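/* Worked example (illustrative): a V4DI mask { 1 3 0 2 } is first replicated
   to { 1 1 3 3 0 0 2 2 }, doubled to { 2 2 6 6 0 0 4 4 } and then becomes
   { 2 3 6 7 0 1 4 5 }, i.e. the V8SI indices of the dword pairs that make up
   each requested qword.  */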
25646 for (i = 0; i < w / 2; ++i)
25648 vec[i * 2] = const0_rtx;
25649 vec[i * 2 + 1] = const1_rtx;
25651 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25652 vt = validize_mem (force_const_mem (maskmode, vt));
25653 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
25654 OPTAB_DIRECT);
25656 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
25657 operands[3] = mask = t1;
25658 target = gen_reg_rtx (mode);
25659 op0 = gen_lowpart (mode, op0);
25660 op1 = gen_lowpart (mode, op1);
25663 switch (mode)
25665 case V8SImode:
25666 /* The VPERMD and VPERMPS instructions already properly ignore
25667 the high bits of the shuffle elements. No need for us to
25668 perform an AND ourselves. */
25669 if (one_operand_shuffle)
25671 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
25672 if (target != operands[0])
25673 emit_move_insn (operands[0],
25674 gen_lowpart (GET_MODE (operands[0]), target));
25676 else
25678 t1 = gen_reg_rtx (V8SImode);
25679 t2 = gen_reg_rtx (V8SImode);
25680 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
25681 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
25682 goto merge_two;
25684 return;
25686 case V8SFmode:
25687 mask = gen_lowpart (V8SImode, mask);
25688 if (one_operand_shuffle)
25689 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
25690 else
25692 t1 = gen_reg_rtx (V8SFmode);
25693 t2 = gen_reg_rtx (V8SFmode);
25694 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
25695 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
25696 goto merge_two;
25698 return;
25700 case V4SImode:
25701 /* By combining the two 128-bit input vectors into one 256-bit
25702 input vector, we can use VPERMD and VPERMPS for the full
25703 two-operand shuffle. */
25704 t1 = gen_reg_rtx (V8SImode);
25705 t2 = gen_reg_rtx (V8SImode);
25706 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
25707 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25708 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
25709 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
25710 return;
25712 case V4SFmode:
25713 t1 = gen_reg_rtx (V8SFmode);
25714 t2 = gen_reg_rtx (V8SImode);
25715 mask = gen_lowpart (V4SImode, mask);
25716 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
25717 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25718 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
25719 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
25720 return;
25722 case V32QImode:
25723 t1 = gen_reg_rtx (V32QImode);
25724 t2 = gen_reg_rtx (V32QImode);
25725 t3 = gen_reg_rtx (V32QImode);
25726 vt2 = GEN_INT (-128);
25727 for (i = 0; i < 32; i++)
25728 vec[i] = vt2;
25729 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25730 vt = force_reg (V32QImode, vt);
25731 for (i = 0; i < 32; i++)
25732 vec[i] = i < 16 ? vt2 : const0_rtx;
25733 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25734 vt2 = force_reg (V32QImode, vt2);
25735 /* From mask create two adjusted masks, which contain the same
25736 bits as mask in the low 7 bits of each vector element.
25737 The first mask will have the most significant bit clear
25738 if it requests element from the same 128-bit lane
25739 and MSB set if it requests element from the other 128-bit lane.
25740 The second mask will have the opposite values of the MSB,
25741 and additionally will have its 128-bit lanes swapped.
25742 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
25743 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
25744 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
25745 stands for the other 12 bytes. */
25746 /* The bit that tells whether an element comes from the same lane or the
25747 other lane is bit 4, so shift it up by 3 to the MSB position. */
25748 t5 = gen_reg_rtx (V4DImode);
25749 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
25750 GEN_INT (3)));
25751 /* Clear MSB bits from the mask just in case it had them set. */
25752 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
25753 /* After this t1 will have MSB set for elements from other lane. */
25754 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
25755 /* Clear bits other than MSB. */
25756 emit_insn (gen_andv32qi3 (t1, t1, vt));
25757 /* Or in the lower bits from mask into t3. */
25758 emit_insn (gen_iorv32qi3 (t3, t1, t2));
25759 /* And invert MSB bits in t1, so MSB is set for elements from the same
25760 lane. */
25761 emit_insn (gen_xorv32qi3 (t1, t1, vt));
25762 /* Swap 128-bit lanes in t3. */
25763 t6 = gen_reg_rtx (V4DImode);
25764 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
25765 const2_rtx, GEN_INT (3),
25766 const0_rtx, const1_rtx));
25767 /* And or in the lower bits from mask into t1. */
25768 emit_insn (gen_iorv32qi3 (t1, t1, t2));
25769 if (one_operand_shuffle)
25771 /* Each of these shuffles will put 0s in places where
25772 element from the other 128-bit lane is needed, otherwise
25773 will shuffle in the requested value. */
25774 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
25775 gen_lowpart (V32QImode, t6)));
25776 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
25777 /* For t3 the 128-bit lanes are swapped again. */
25778 t7 = gen_reg_rtx (V4DImode);
25779 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
25780 const2_rtx, GEN_INT (3),
25781 const0_rtx, const1_rtx));
25782 /* And oring both together leads to the result. */
25783 emit_insn (gen_iorv32qi3 (target, t1,
25784 gen_lowpart (V32QImode, t7)));
25785 if (target != operands[0])
25786 emit_move_insn (operands[0],
25787 gen_lowpart (GET_MODE (operands[0]), target));
25788 return;
25791 t4 = gen_reg_rtx (V32QImode);
25792 /* Similar to the one_operand_shuffle code above, just repeated
25793 twice, once for each operand. The code at merge_two: below
25794 will combine the two results. */
25795 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25796 gen_lowpart (V32QImode, t6)));
25797 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25798 gen_lowpart (V32QImode, t6)));
25799 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25800 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25801 t7 = gen_reg_rtx (V4DImode);
25802 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25803 const2_rtx, GEN_INT (3),
25804 const0_rtx, const1_rtx));
25805 t8 = gen_reg_rtx (V4DImode);
25806 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25807 const2_rtx, GEN_INT (3),
25808 const0_rtx, const1_rtx));
25809 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25810 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25811 t1 = t4;
25812 t2 = t3;
25813 goto merge_two;
25815 default:
25816 gcc_assert (GET_MODE_SIZE (mode) <= 16);
25817 break;
25821 if (TARGET_XOP)
25823 /* The XOP VPPERM insn supports three inputs. By ignoring the
25824 one_operand_shuffle special case, we avoid creating another
25825 set of constant vectors in memory. */
25826 one_operand_shuffle = false;
25828 /* mask = mask & {2*w-1, ...} */
25829 vt = GEN_INT (2*w - 1);
25831 else
25833 /* mask = mask & {w-1, ...} */
25834 vt = GEN_INT (w - 1);
25837 for (i = 0; i < w; i++)
25838 vec[i] = vt;
25839 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25840 mask = expand_simple_binop (maskmode, AND, mask, vt,
25841 NULL_RTX, 0, OPTAB_DIRECT);
25843 /* For non-QImode operations, convert the word permutation control
25844 into a byte permutation control. */
25845 if (mode != V16QImode)
25847 mask = expand_simple_binop (maskmode, ASHIFT, mask,
25848 GEN_INT (exact_log2 (e)),
25849 NULL_RTX, 0, OPTAB_DIRECT);
25851 /* Convert mask to vector of chars. */
25852 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25854 /* Replicate each of the input bytes into byte positions:
25855 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25856 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25857 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25858 for (i = 0; i < 16; ++i)
25859 vec[i] = GEN_INT (i/e * e);
25860 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25861 vt = validize_mem (force_const_mem (V16QImode, vt));
25862 if (TARGET_XOP)
25863 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25864 else
25865 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25867 /* Convert it into the byte positions by doing
25868 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
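/* Worked example (illustrative, V4SImode, e == 4): a word index of 2 has
   already been scaled and replicated to { 8 8 8 8 }; adding { 0 1 2 3 }
   gives { 8 9 10 11 }, the byte positions of element 2.  */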
25869 for (i = 0; i < 16; ++i)
25870 vec[i] = GEN_INT (i % e);
25871 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25872 vt = validize_mem (force_const_mem (V16QImode, vt));
25873 emit_insn (gen_addv16qi3 (mask, mask, vt));
25876 /* The actual shuffle operations all operate on V16QImode. */
25877 op0 = gen_lowpart (V16QImode, op0);
25878 op1 = gen_lowpart (V16QImode, op1);
25880 if (TARGET_XOP)
25882 if (GET_MODE (target) != V16QImode)
25883 target = gen_reg_rtx (V16QImode);
25884 emit_insn (gen_xop_pperm (target, op0, op1, mask));
25885 if (target != operands[0])
25886 emit_move_insn (operands[0],
25887 gen_lowpart (GET_MODE (operands[0]), target));
25889 else if (one_operand_shuffle)
25891 if (GET_MODE (target) != V16QImode)
25892 target = gen_reg_rtx (V16QImode);
25893 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25894 if (target != operands[0])
25895 emit_move_insn (operands[0],
25896 gen_lowpart (GET_MODE (operands[0]), target));
25898 else
25900 rtx xops[6];
25901 bool ok;
25903 /* Shuffle the two input vectors independently. */
25904 t1 = gen_reg_rtx (V16QImode);
25905 t2 = gen_reg_rtx (V16QImode);
25906 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25907 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25909 merge_two:
25910 /* Then merge them together. The key is whether any given control
25911 element contained a bit set that indicates the second word. */
25912 mask = operands[3];
25913 vt = GEN_INT (w);
25914 if (maskmode == V2DImode && !TARGET_SSE4_1)
25916 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
25917 more shuffle to convert the V2DI input mask into a V4SI
25918 input mask, at which point the masking that expand_int_vcond
25919 performs will work as desired. */
25920 rtx t3 = gen_reg_rtx (V4SImode);
25921 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25922 const0_rtx, const0_rtx,
25923 const2_rtx, const2_rtx));
25924 mask = t3;
25925 maskmode = V4SImode;
25926 e = w = 4;
25929 for (i = 0; i < w; i++)
25930 vec[i] = vt;
25931 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25932 vt = force_reg (maskmode, vt);
25933 mask = expand_simple_binop (maskmode, AND, mask, vt,
25934 NULL_RTX, 0, OPTAB_DIRECT);
25936 if (GET_MODE (target) != mode)
25937 target = gen_reg_rtx (mode);
25938 xops[0] = target;
25939 xops[1] = gen_lowpart (mode, t2);
25940 xops[2] = gen_lowpart (mode, t1);
25941 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25942 xops[4] = mask;
25943 xops[5] = vt;
25944 ok = ix86_expand_int_vcond (xops);
25945 gcc_assert (ok);
25946 if (target != operands[0])
25947 emit_move_insn (operands[0],
25948 gen_lowpart (GET_MODE (operands[0]), target));
25952 /* Unpack SRC into DEST, which has the next wider integer vector type.
25953 UNSIGNED_P is true if we should do zero extension, else sign extension.
25954 HIGH_P is true if we want the N/2 high elements, else the low elements. */
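/* Illustrative: for a V8HImode source with SSE4.1, the low half is widened
   directly (e.g. pmovsxwd / pmovzxwd); for the high half the source is first
   shifted right by 8 bytes.  Without SSE4.1 the same widening falls out of
   interleaving with either zeros or a computed sign mask.  */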
25956 void
25957 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25959 machine_mode imode = GET_MODE (src);
25960 rtx tmp;
25962 if (TARGET_SSE4_1)
25964 rtx (*unpack)(rtx, rtx);
25965 rtx (*extract)(rtx, rtx) = NULL;
25966 machine_mode halfmode = BLKmode;
25968 switch (imode)
25970 case V64QImode:
25971 if (unsigned_p)
25972 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25973 else
25974 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25975 halfmode = V32QImode;
25976 extract
25977 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25978 break;
25979 case V32QImode:
25980 if (unsigned_p)
25981 unpack = gen_avx2_zero_extendv16qiv16hi2;
25982 else
25983 unpack = gen_avx2_sign_extendv16qiv16hi2;
25984 halfmode = V16QImode;
25985 extract
25986 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25987 break;
25988 case V32HImode:
25989 if (unsigned_p)
25990 unpack = gen_avx512f_zero_extendv16hiv16si2;
25991 else
25992 unpack = gen_avx512f_sign_extendv16hiv16si2;
25993 halfmode = V16HImode;
25994 extract
25995 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25996 break;
25997 case V16HImode:
25998 if (unsigned_p)
25999 unpack = gen_avx2_zero_extendv8hiv8si2;
26000 else
26001 unpack = gen_avx2_sign_extendv8hiv8si2;
26002 halfmode = V8HImode;
26003 extract
26004 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
26005 break;
26006 case V16SImode:
26007 if (unsigned_p)
26008 unpack = gen_avx512f_zero_extendv8siv8di2;
26009 else
26010 unpack = gen_avx512f_sign_extendv8siv8di2;
26011 halfmode = V8SImode;
26012 extract
26013 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
26014 break;
26015 case V8SImode:
26016 if (unsigned_p)
26017 unpack = gen_avx2_zero_extendv4siv4di2;
26018 else
26019 unpack = gen_avx2_sign_extendv4siv4di2;
26020 halfmode = V4SImode;
26021 extract
26022 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
26023 break;
26024 case V16QImode:
26025 if (unsigned_p)
26026 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
26027 else
26028 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
26029 break;
26030 case V8HImode:
26031 if (unsigned_p)
26032 unpack = gen_sse4_1_zero_extendv4hiv4si2;
26033 else
26034 unpack = gen_sse4_1_sign_extendv4hiv4si2;
26035 break;
26036 case V4SImode:
26037 if (unsigned_p)
26038 unpack = gen_sse4_1_zero_extendv2siv2di2;
26039 else
26040 unpack = gen_sse4_1_sign_extendv2siv2di2;
26041 break;
26042 default:
26043 gcc_unreachable ();
26046 if (GET_MODE_SIZE (imode) >= 32)
26048 tmp = gen_reg_rtx (halfmode);
26049 emit_insn (extract (tmp, src));
26051 else if (high_p)
26053 /* Shift higher 8 bytes to lower 8 bytes. */
26054 tmp = gen_reg_rtx (V1TImode);
26055 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
26056 GEN_INT (64)));
26057 tmp = gen_lowpart (imode, tmp);
26059 else
26060 tmp = src;
26062 emit_insn (unpack (dest, tmp));
26064 else
26066 rtx (*unpack)(rtx, rtx, rtx);
26068 switch (imode)
26070 case V16QImode:
26071 if (high_p)
26072 unpack = gen_vec_interleave_highv16qi;
26073 else
26074 unpack = gen_vec_interleave_lowv16qi;
26075 break;
26076 case V8HImode:
26077 if (high_p)
26078 unpack = gen_vec_interleave_highv8hi;
26079 else
26080 unpack = gen_vec_interleave_lowv8hi;
26081 break;
26082 case V4SImode:
26083 if (high_p)
26084 unpack = gen_vec_interleave_highv4si;
26085 else
26086 unpack = gen_vec_interleave_lowv4si;
26087 break;
26088 default:
26089 gcc_unreachable ();
26092 if (unsigned_p)
26093 tmp = force_reg (imode, CONST0_RTX (imode));
26094 else
26095 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
26096 src, pc_rtx, pc_rtx);
26098 rtx tmp2 = gen_reg_rtx (imode);
26099 emit_insn (unpack (tmp2, src, tmp));
26100 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
26104 /* Expand conditional increment or decrement using adc/sbb instructions.
26105 The default case using setcc followed by the conditional move can be
26106 done by generic code. */
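/* Illustrative example: for dest = src + (a <u b ? 1 : 0) the compare leaves
   the carry flag set exactly when a <u b, so a single adc with a zero
   immediate adds the carry into the sum; the remaining cases are handled
   with sbb and/or a -1 addend built the same way.  */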
26107 bool
26108 ix86_expand_int_addcc (rtx operands[])
26110 enum rtx_code code = GET_CODE (operands[1]);
26111 rtx flags;
26112 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
26113 rtx compare_op;
26114 rtx val = const0_rtx;
26115 bool fpcmp = false;
26116 machine_mode mode;
26117 rtx op0 = XEXP (operands[1], 0);
26118 rtx op1 = XEXP (operands[1], 1);
26120 if (operands[3] != const1_rtx
26121 && operands[3] != constm1_rtx)
26122 return false;
26123 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
26124 return false;
26125 code = GET_CODE (compare_op);
26127 flags = XEXP (compare_op, 0);
26129 if (GET_MODE (flags) == CCFPmode
26130 || GET_MODE (flags) == CCFPUmode)
26132 fpcmp = true;
26133 code = ix86_fp_compare_code_to_integer (code);
26136 if (code != LTU)
26138 val = constm1_rtx;
26139 if (fpcmp)
26140 PUT_CODE (compare_op,
26141 reverse_condition_maybe_unordered
26142 (GET_CODE (compare_op)));
26143 else
26144 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
26147 mode = GET_MODE (operands[0]);
26149 /* Construct either adc or sbb insn. */
26150 if ((code == LTU) == (operands[3] == constm1_rtx))
26152 switch (mode)
26154 case QImode:
26155 insn = gen_subqi3_carry;
26156 break;
26157 case HImode:
26158 insn = gen_subhi3_carry;
26159 break;
26160 case SImode:
26161 insn = gen_subsi3_carry;
26162 break;
26163 case DImode:
26164 insn = gen_subdi3_carry;
26165 break;
26166 default:
26167 gcc_unreachable ();
26170 else
26172 switch (mode)
26174 case QImode:
26175 insn = gen_addqi3_carry;
26176 break;
26177 case HImode:
26178 insn = gen_addhi3_carry;
26179 break;
26180 case SImode:
26181 insn = gen_addsi3_carry;
26182 break;
26183 case DImode:
26184 insn = gen_adddi3_carry;
26185 break;
26186 default:
26187 gcc_unreachable ();
26190 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
26192 return true;
26196 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
26197 but works for floating point parameters and non-offsettable memories.
26198 For pushes, it returns just stack offsets; the values will be saved
26199 in the right order. At most four parts are generated. */
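/* Illustrative: on a 32-bit target DFmode splits into two SImode parts,
   XFmode into three and TFmode into four; on a 64-bit target XFmode and
   TFmode split into a DImode part plus an SImode or DImode upper part
   respectively.  */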
26201 static int
26202 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
26204 int size;
26206 if (!TARGET_64BIT)
26207 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
26208 else
26209 size = (GET_MODE_SIZE (mode) + 4) / 8;
26211 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
26212 gcc_assert (size >= 2 && size <= 4);
26214 /* Optimize constant pool reference to immediates. This is used by fp
26215 moves, that force all constants to memory to allow combining. */
26216 if (MEM_P (operand) && MEM_READONLY_P (operand))
26218 rtx tmp = maybe_get_pool_constant (operand);
26219 if (tmp)
26220 operand = tmp;
26223 if (MEM_P (operand) && !offsettable_memref_p (operand))
26225 /* The only non-offsettable memories we handle are pushes. */
26226 int ok = push_operand (operand, VOIDmode);
26228 gcc_assert (ok);
26230 operand = copy_rtx (operand);
26231 PUT_MODE (operand, word_mode);
26232 parts[0] = parts[1] = parts[2] = parts[3] = operand;
26233 return size;
26236 if (GET_CODE (operand) == CONST_VECTOR)
26238 machine_mode imode = int_mode_for_mode (mode);
26239 /* Caution: if we looked through a constant pool memory above,
26240 the operand may actually have a different mode now. That's
26241 ok, since we want to pun this all the way back to an integer. */
26242 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
26243 gcc_assert (operand != NULL);
26244 mode = imode;
26247 if (!TARGET_64BIT)
26249 if (mode == DImode)
26250 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26251 else
26253 int i;
26255 if (REG_P (operand))
26257 gcc_assert (reload_completed);
26258 for (i = 0; i < size; i++)
26259 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
26261 else if (offsettable_memref_p (operand))
26263 operand = adjust_address (operand, SImode, 0);
26264 parts[0] = operand;
26265 for (i = 1; i < size; i++)
26266 parts[i] = adjust_address (operand, SImode, 4 * i);
26268 else if (CONST_DOUBLE_P (operand))
26270 const REAL_VALUE_TYPE *r;
26271 long l[4];
26273 r = CONST_DOUBLE_REAL_VALUE (operand);
26274 switch (mode)
26276 case TFmode:
26277 real_to_target (l, r, mode);
26278 parts[3] = gen_int_mode (l[3], SImode);
26279 parts[2] = gen_int_mode (l[2], SImode);
26280 break;
26281 case XFmode:
26282 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
26283 long double may not be 80-bit. */
26284 real_to_target (l, r, mode);
26285 parts[2] = gen_int_mode (l[2], SImode);
26286 break;
26287 case DFmode:
26288 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
26289 break;
26290 default:
26291 gcc_unreachable ();
26293 parts[1] = gen_int_mode (l[1], SImode);
26294 parts[0] = gen_int_mode (l[0], SImode);
26296 else
26297 gcc_unreachable ();
26300 else
26302 if (mode == TImode)
26303 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26304 if (mode == XFmode || mode == TFmode)
26306 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
26307 if (REG_P (operand))
26309 gcc_assert (reload_completed);
26310 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
26311 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
26313 else if (offsettable_memref_p (operand))
26315 operand = adjust_address (operand, DImode, 0);
26316 parts[0] = operand;
26317 parts[1] = adjust_address (operand, upper_mode, 8);
26319 else if (CONST_DOUBLE_P (operand))
26321 long l[4];
26323 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
26325 /* real_to_target puts 32-bit pieces in each long. */
26326 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
26327 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
26328 << 32), DImode);
26330 if (upper_mode == SImode)
26331 parts[1] = gen_int_mode (l[2], SImode);
26332 else
26333 parts[1]
26334 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
26335 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
26336 << 32), DImode);
26338 else
26339 gcc_unreachable ();
26343 return size;
26346 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
26347 Return false when normal moves are needed; true when all required
26348 insns have been emitted. Operands 2-4 contain the input values
26349 in the correct order; operands 5-7 contain the output values. */
26351 void
26352 ix86_split_long_move (rtx operands[])
26354 rtx part[2][4];
26355 int nparts, i, j;
26356 int push = 0;
26357 int collisions = 0;
26358 machine_mode mode = GET_MODE (operands[0]);
26359 bool collisionparts[4];
26361 /* The DFmode expanders may ask us to move a double.
26362 For a 64-bit target this is a single move. By hiding that fact
26363 here we simplify the i386.md splitters. */
26364 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
26366 /* Optimize constant pool reference to immediates. This is used by
26367 fp moves, that force all constants to memory to allow combining. */
26369 if (MEM_P (operands[1])
26370 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
26371 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
26372 operands[1] = get_pool_constant (XEXP (operands[1], 0));
26373 if (push_operand (operands[0], VOIDmode))
26375 operands[0] = copy_rtx (operands[0]);
26376 PUT_MODE (operands[0], word_mode);
26378 else
26379 operands[0] = gen_lowpart (DImode, operands[0]);
26380 operands[1] = gen_lowpart (DImode, operands[1]);
26381 emit_move_insn (operands[0], operands[1]);
26382 return;
26385 /* The only non-offsettable memory we handle is push. */
26386 if (push_operand (operands[0], VOIDmode))
26387 push = 1;
26388 else
26389 gcc_assert (!MEM_P (operands[0])
26390 || offsettable_memref_p (operands[0]));
26392 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
26393 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
26395 /* When emitting a push, take care of source operands on the stack. */
26396 if (push && MEM_P (operands[1])
26397 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
26399 rtx src_base = XEXP (part[1][nparts - 1], 0);
26401 /* Compensate for the stack decrement by 4. */
26402 if (!TARGET_64BIT && nparts == 3
26403 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
26404 src_base = plus_constant (Pmode, src_base, 4);
26406 /* src_base refers to the stack pointer and is
26407 automatically decreased by emitted push. */
26408 for (i = 0; i < nparts; i++)
26409 part[1][i] = change_address (part[1][i],
26410 GET_MODE (part[1][i]), src_base);
26413 /* We need to do copy in the right order in case an address register
26414 of the source overlaps the destination. */
26415 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
26417 rtx tmp;
26419 for (i = 0; i < nparts; i++)
26421 collisionparts[i]
26422 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
26423 if (collisionparts[i])
26424 collisions++;
26427 /* Collision in the middle part can be handled by reordering. */
26428 if (collisions == 1 && nparts == 3 && collisionparts [1])
26430 std::swap (part[0][1], part[0][2]);
26431 std::swap (part[1][1], part[1][2]);
26433 else if (collisions == 1
26434 && nparts == 4
26435 && (collisionparts [1] || collisionparts [2]))
26437 if (collisionparts [1])
26439 std::swap (part[0][1], part[0][2]);
26440 std::swap (part[1][1], part[1][2]);
26442 else
26444 std::swap (part[0][2], part[0][3]);
26445 std::swap (part[1][2], part[1][3]);
26449 /* If there are more collisions, we can't handle it by reordering.
26450 Do an lea to the last part and use only one colliding move. */
26451 else if (collisions > 1)
26453 rtx base, addr, tls_base = NULL_RTX;
26455 collisions = 1;
26457 base = part[0][nparts - 1];
26459 /* Handle the case when the last part isn't valid for lea.
26460 Happens in 64-bit mode storing the 12-byte XFmode. */
26461 if (GET_MODE (base) != Pmode)
26462 base = gen_rtx_REG (Pmode, REGNO (base));
26464 addr = XEXP (part[1][0], 0);
26465 if (TARGET_TLS_DIRECT_SEG_REFS)
26467 struct ix86_address parts;
26468 int ok = ix86_decompose_address (addr, &parts);
26469 gcc_assert (ok);
26470 if (parts.seg == DEFAULT_TLS_SEG_REG)
26472 /* It is not valid to use %gs: or %fs: in
26473 lea though, so we need to remove it from the
26474 address used for lea and add it to each individual
26475 memory load instead. */
26476 addr = copy_rtx (addr);
26477 rtx *x = &addr;
26478 while (GET_CODE (*x) == PLUS)
26480 for (i = 0; i < 2; i++)
26482 rtx u = XEXP (*x, i);
26483 if (GET_CODE (u) == ZERO_EXTEND)
26484 u = XEXP (u, 0);
26485 if (GET_CODE (u) == UNSPEC
26486 && XINT (u, 1) == UNSPEC_TP)
26488 tls_base = XEXP (*x, i);
26489 *x = XEXP (*x, 1 - i);
26490 break;
26493 if (tls_base)
26494 break;
26495 x = &XEXP (*x, 0);
26497 gcc_assert (tls_base);
26500 emit_insn (gen_rtx_SET (base, addr));
26501 if (tls_base)
26502 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
26503 part[1][0] = replace_equiv_address (part[1][0], base);
26504 for (i = 1; i < nparts; i++)
26506 if (tls_base)
26507 base = copy_rtx (base);
26508 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
26509 part[1][i] = replace_equiv_address (part[1][i], tmp);
26514 if (push)
26516 if (!TARGET_64BIT)
26518 if (nparts == 3)
26520 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
26521 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
26522 stack_pointer_rtx, GEN_INT (-4)));
26523 emit_move_insn (part[0][2], part[1][2]);
26525 else if (nparts == 4)
26527 emit_move_insn (part[0][3], part[1][3]);
26528 emit_move_insn (part[0][2], part[1][2]);
26531 else
26533 /* In 64-bit mode we don't have a 32-bit push available. If this is a
26534 register, that is OK - we will just use the larger counterpart. We also
26535 retype memory - this comes from an attempt to avoid a REX prefix on
26536 the move of the second half of a TFmode value. */
26537 if (GET_MODE (part[1][1]) == SImode)
26539 switch (GET_CODE (part[1][1]))
26541 case MEM:
26542 part[1][1] = adjust_address (part[1][1], DImode, 0);
26543 break;
26545 case REG:
26546 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
26547 break;
26549 default:
26550 gcc_unreachable ();
26553 if (GET_MODE (part[1][0]) == SImode)
26554 part[1][0] = part[1][1];
26557 emit_move_insn (part[0][1], part[1][1]);
26558 emit_move_insn (part[0][0], part[1][0]);
26559 return;
26562 /* Choose correct order to not overwrite the source before it is copied. */
26563 if ((REG_P (part[0][0])
26564 && REG_P (part[1][1])
26565 && (REGNO (part[0][0]) == REGNO (part[1][1])
26566 || (nparts == 3
26567 && REGNO (part[0][0]) == REGNO (part[1][2]))
26568 || (nparts == 4
26569 && REGNO (part[0][0]) == REGNO (part[1][3]))))
26570 || (collisions > 0
26571 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
26573 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
26575 operands[2 + i] = part[0][j];
26576 operands[6 + i] = part[1][j];
26579 else
26581 for (i = 0; i < nparts; i++)
26583 operands[2 + i] = part[0][i];
26584 operands[6 + i] = part[1][i];
26588 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
26589 if (optimize_insn_for_size_p ())
26591 for (j = 0; j < nparts - 1; j++)
26592 if (CONST_INT_P (operands[6 + j])
26593 && operands[6 + j] != const0_rtx
26594 && REG_P (operands[2 + j]))
26595 for (i = j; i < nparts - 1; i++)
26596 if (CONST_INT_P (operands[7 + i])
26597 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
26598 operands[7 + i] = operands[2 + j];
26601 for (i = 0; i < nparts; i++)
26602 emit_move_insn (operands[2 + i], operands[6 + i]);
26604 return;
26607 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
26608 left shift by a constant, either using a single shift or
26609 a sequence of add instructions. */
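/* For example, with a shift count of 2 this emits two self-adds (operand = operand + operand, twice) when two adds are cheaper than a constant shift for the current tuning and we are not optimizing for size; otherwise it emits a single shift by the constant. */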
26611 static void
26612 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
26614 rtx (*insn)(rtx, rtx, rtx);
26616 if (count == 1
26617 || (count * ix86_cost->add <= ix86_cost->shift_const
26618 && !optimize_insn_for_size_p ()))
26620 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
26621 while (count-- > 0)
26622 emit_insn (insn (operand, operand, operand));
26624 else
26626 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26627 emit_insn (insn (operand, operand, GEN_INT (count)));
26631 void
26632 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
26634 rtx (*gen_ashl3)(rtx, rtx, rtx);
26635 rtx (*gen_shld)(rtx, rtx, rtx);
26636 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26638 rtx low[2], high[2];
26639 int count;
26641 if (CONST_INT_P (operands[2]))
26643 split_double_mode (mode, operands, 2, low, high);
26644 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26646 if (count >= half_width)
26648 emit_move_insn (high[0], low[1]);
26649 emit_move_insn (low[0], const0_rtx);
26651 if (count > half_width)
26652 ix86_expand_ashl_const (high[0], count - half_width, mode);
26654 else
26656 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26658 if (!rtx_equal_p (operands[0], operands[1]))
26659 emit_move_insn (operands[0], operands[1]);
26661 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
26662 ix86_expand_ashl_const (low[0], count, mode);
26664 return;
26667 split_double_mode (mode, operands, 1, low, high);
26669 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26671 if (operands[1] == const1_rtx)
26673 /* Assuming we've chosen QImode-capable registers, 1 << N
26674 can be done with two 32/64-bit shifts, no branches, no cmoves. */
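/* Roughly: clear both halves, test the half-width bit of the count (bit value 32 for DImode on ia32), set the low byte of low/high from the resulting flags with sete/setne, and then shift both halves by the count, which the hardware masks to the half width. */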
26675 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
26677 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
26679 ix86_expand_clear (low[0]);
26680 ix86_expand_clear (high[0]);
26681 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
26683 d = gen_lowpart (QImode, low[0]);
26684 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26685 s = gen_rtx_EQ (QImode, flags, const0_rtx);
26686 emit_insn (gen_rtx_SET (d, s));
26688 d = gen_lowpart (QImode, high[0]);
26689 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26690 s = gen_rtx_NE (QImode, flags, const0_rtx);
26691 emit_insn (gen_rtx_SET (d, s));
26694 /* Otherwise, we can get the same results by manually performing
26695 a bit extract operation on bit 5/6, and then performing the two
26696 shifts. The two methods of getting 0/1 into low/high are exactly
26697 the same size. Avoiding the shift in the bit extract case helps
26698 pentium4 a bit; no one else seems to care much either way. */
26699 else
26701 machine_mode half_mode;
26702 rtx (*gen_lshr3)(rtx, rtx, rtx);
26703 rtx (*gen_and3)(rtx, rtx, rtx);
26704 rtx (*gen_xor3)(rtx, rtx, rtx);
26705 HOST_WIDE_INT bits;
26706 rtx x;
26708 if (mode == DImode)
26710 half_mode = SImode;
26711 gen_lshr3 = gen_lshrsi3;
26712 gen_and3 = gen_andsi3;
26713 gen_xor3 = gen_xorsi3;
26714 bits = 5;
26716 else
26718 half_mode = DImode;
26719 gen_lshr3 = gen_lshrdi3;
26720 gen_and3 = gen_anddi3;
26721 gen_xor3 = gen_xordi3;
26722 bits = 6;
26725 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
26726 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
26727 else
26728 x = gen_lowpart (half_mode, operands[2]);
26729 emit_insn (gen_rtx_SET (high[0], x));
26731 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
26732 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
26733 emit_move_insn (low[0], high[0]);
26734 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
26737 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26738 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
26739 return;
26742 if (operands[1] == constm1_rtx)
26744 /* For -1 << N, we can avoid the shld instruction, because we
26745 know that we're shifting 0...31/63 ones into a -1. */
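/* For instance, for DImode -1 << N on ia32 only the low half needs a real shift: shifting ones out of the low half into an all-ones high half leaves the high half unchanged, and the adjustment step below handles counts of 32 and more. */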
26746 emit_move_insn (low[0], constm1_rtx);
26747 if (optimize_insn_for_size_p ())
26748 emit_move_insn (high[0], low[0]);
26749 else
26750 emit_move_insn (high[0], constm1_rtx);
26752 else
26754 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26756 if (!rtx_equal_p (operands[0], operands[1]))
26757 emit_move_insn (operands[0], operands[1]);
26759 split_double_mode (mode, operands, 1, low, high);
26760 emit_insn (gen_shld (high[0], low[0], operands[2]));
26763 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26765 if (TARGET_CMOVE && scratch)
26767 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26768 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26770 ix86_expand_clear (scratch);
26771 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
26773 else
26775 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26776 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26778 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
26782 void
26783 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
26785 rtx (*gen_ashr3)(rtx, rtx, rtx)
26786 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
26787 rtx (*gen_shrd)(rtx, rtx, rtx);
26788 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26790 rtx low[2], high[2];
26791 int count;
26793 if (CONST_INT_P (operands[2]))
26795 split_double_mode (mode, operands, 2, low, high);
26796 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26798 if (count == GET_MODE_BITSIZE (mode) - 1)
26800 emit_move_insn (high[0], high[1]);
26801 emit_insn (gen_ashr3 (high[0], high[0],
26802 GEN_INT (half_width - 1)));
26803 emit_move_insn (low[0], high[0]);
26806 else if (count >= half_width)
26808 emit_move_insn (low[0], high[1]);
26809 emit_move_insn (high[0], low[0]);
26810 emit_insn (gen_ashr3 (high[0], high[0],
26811 GEN_INT (half_width - 1)));
26813 if (count > half_width)
26814 emit_insn (gen_ashr3 (low[0], low[0],
26815 GEN_INT (count - half_width)));
26817 else
26819 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26821 if (!rtx_equal_p (operands[0], operands[1]))
26822 emit_move_insn (operands[0], operands[1]);
26824 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26825 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
26828 else
26830 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26832 if (!rtx_equal_p (operands[0], operands[1]))
26833 emit_move_insn (operands[0], operands[1]);
26835 split_double_mode (mode, operands, 1, low, high);
26837 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26838 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
26840 if (TARGET_CMOVE && scratch)
26842 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26843 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26845 emit_move_insn (scratch, high[0]);
26846 emit_insn (gen_ashr3 (scratch, scratch,
26847 GEN_INT (half_width - 1)));
26848 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26849 scratch));
26851 else
26853 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26854 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26856 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
26861 void
26862 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26864 rtx (*gen_lshr3)(rtx, rtx, rtx)
26865 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26866 rtx (*gen_shrd)(rtx, rtx, rtx);
26867 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26869 rtx low[2], high[2];
26870 int count;
26872 if (CONST_INT_P (operands[2]))
26874 split_double_mode (mode, operands, 2, low, high);
26875 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26877 if (count >= half_width)
26879 emit_move_insn (low[0], high[1]);
26880 ix86_expand_clear (high[0]);
26882 if (count > half_width)
26883 emit_insn (gen_lshr3 (low[0], low[0],
26884 GEN_INT (count - half_width)));
26886 else
26888 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26890 if (!rtx_equal_p (operands[0], operands[1]))
26891 emit_move_insn (operands[0], operands[1]);
26893 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26894 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26897 else
26899 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26901 if (!rtx_equal_p (operands[0], operands[1]))
26902 emit_move_insn (operands[0], operands[1]);
26904 split_double_mode (mode, operands, 1, low, high);
26906 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26907 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26909 if (TARGET_CMOVE && scratch)
26911 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26912 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26914 ix86_expand_clear (scratch);
26915 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26916 scratch));
26918 else
26920 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26921 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26923 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26928 /* Predict just emitted jump instruction to be taken with probability PROB. */
26929 static void
26930 predict_jump (int prob)
26932 rtx_insn *insn = get_last_insn ();
26933 gcc_assert (JUMP_P (insn));
26934 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
26937 /* Helper function for the string operations below. Test VARIABLE whether
26938 it is aligned to VALUE bytes. If true, jump to the label. */
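/* Callers emit the VALUE-byte operation between this call and emit_label, so that operation is skipped whenever the VALUE bit of VARIABLE is clear. */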
26939 static rtx_code_label *
26940 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26942 rtx_code_label *label = gen_label_rtx ();
26943 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26944 if (GET_MODE (variable) == DImode)
26945 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26946 else
26947 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26948 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26949 1, label);
26950 if (epilogue)
26951 predict_jump (REG_BR_PROB_BASE * 50 / 100);
26952 else
26953 predict_jump (REG_BR_PROB_BASE * 90 / 100);
26954 return label;
26957 /* Adjust COUNTER by the VALUE. */
26958 static void
26959 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26961 rtx (*gen_add)(rtx, rtx, rtx)
26962 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26964 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26967 /* Zero extend possibly SImode EXP to Pmode register. */
26969 ix86_zero_extend_to_Pmode (rtx exp)
26971 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26974 /* Divide COUNTREG by SCALE. */
26975 static rtx
26976 scale_counter (rtx countreg, int scale)
26978 rtx sc;
26980 if (scale == 1)
26981 return countreg;
26982 if (CONST_INT_P (countreg))
26983 return GEN_INT (INTVAL (countreg) / scale);
26984 gcc_assert (REG_P (countreg));
26986 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26987 GEN_INT (exact_log2 (scale)),
26988 NULL, 1, OPTAB_DIRECT);
26989 return sc;
26992 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26993 DImode for constant loop counts. */
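/* For example, a constant count that fits in 32 bits yields SImode, a larger constant on a 64-bit target yields DImode, and a count already held in a register keeps that register's mode. */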
26995 static machine_mode
26996 counter_mode (rtx count_exp)
26998 if (GET_MODE (count_exp) != VOIDmode)
26999 return GET_MODE (count_exp);
27000 if (!CONST_INT_P (count_exp))
27001 return Pmode;
27002 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
27003 return DImode;
27004 return SImode;
27007 /* Copy the address to a Pmode register. This is used for x32 to
27008 truncate DImode TLS address to a SImode register. */
27010 static rtx
27011 ix86_copy_addr_to_reg (rtx addr)
27013 rtx reg;
27014 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
27016 reg = copy_addr_to_reg (addr);
27017 REG_POINTER (reg) = 1;
27018 return reg;
27020 else
27022 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
27023 reg = copy_to_mode_reg (DImode, addr);
27024 REG_POINTER (reg) = 1;
27025 return gen_rtx_SUBREG (SImode, reg, 0);
27029 /* When ISSETMEM is FALSE, output a simple loop to copy memory pointed to by
27030 SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size
27031 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent loop
27032 to set memory by VALUE (supposed to be in MODE).
27034 The size is rounded down to a whole number of chunks moved at once.
27035 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
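/* The emitted loop is roughly:
iter = 0; size = count rounded down to a multiple of the chunk size;
do { copy or set one unrolled group of chunks; iter += chunk size; } while (iter < size);
destptr += iter; and, for the copy case, srcptr += iter. */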
27038 static void
27039 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
27040 rtx destptr, rtx srcptr, rtx value,
27041 rtx count, machine_mode mode, int unroll,
27042 int expected_size, bool issetmem)
27044 rtx_code_label *out_label, *top_label;
27045 rtx iter, tmp;
27046 machine_mode iter_mode = counter_mode (count);
27047 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
27048 rtx piece_size = GEN_INT (piece_size_n);
27049 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
27050 rtx size;
27051 int i;
27053 top_label = gen_label_rtx ();
27054 out_label = gen_label_rtx ();
27055 iter = gen_reg_rtx (iter_mode);
27057 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
27058 NULL, 1, OPTAB_DIRECT);
27059 /* Those two should combine. */
27060 if (piece_size == const1_rtx)
27062 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
27063 true, out_label);
27064 predict_jump (REG_BR_PROB_BASE * 10 / 100);
27066 emit_move_insn (iter, const0_rtx);
27068 emit_label (top_label);
27070 tmp = convert_modes (Pmode, iter_mode, iter, true);
27072 /* This assert could be relaxed - in this case we'll need to compute
27073 the smallest power of two containing PIECE_SIZE_N and pass it to
27074 offset_address. */
27075 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
27076 destmem = offset_address (destmem, tmp, piece_size_n);
27077 destmem = adjust_address (destmem, mode, 0);
27079 if (!issetmem)
27081 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
27082 srcmem = adjust_address (srcmem, mode, 0);
27084 /* When unrolling for chips that reorder memory reads and writes,
27085 we can save registers by using a single temporary.
27086 Also, using 4 temporaries is overkill in 32bit mode. */
27087 if (!TARGET_64BIT && 0)
27089 for (i = 0; i < unroll; i++)
27091 if (i)
27093 destmem =
27094 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27095 srcmem =
27096 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27098 emit_move_insn (destmem, srcmem);
27101 else
27103 rtx tmpreg[4];
27104 gcc_assert (unroll <= 4);
27105 for (i = 0; i < unroll; i++)
27107 tmpreg[i] = gen_reg_rtx (mode);
27108 if (i)
27110 srcmem =
27111 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27113 emit_move_insn (tmpreg[i], srcmem);
27115 for (i = 0; i < unroll; i++)
27117 if (i)
27119 destmem =
27120 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27122 emit_move_insn (destmem, tmpreg[i]);
27126 else
27127 for (i = 0; i < unroll; i++)
27129 if (i)
27130 destmem =
27131 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27132 emit_move_insn (destmem, value);
27135 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
27136 true, OPTAB_LIB_WIDEN);
27137 if (tmp != iter)
27138 emit_move_insn (iter, tmp);
27140 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
27141 true, top_label);
27142 if (expected_size != -1)
27144 expected_size /= GET_MODE_SIZE (mode) * unroll;
27145 if (expected_size == 0)
27146 predict_jump (0);
27147 else if (expected_size > REG_BR_PROB_BASE)
27148 predict_jump (REG_BR_PROB_BASE - 1);
27149 else
27150 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
27152 else
27153 predict_jump (REG_BR_PROB_BASE * 80 / 100);
27154 iter = ix86_zero_extend_to_Pmode (iter);
27155 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
27156 true, OPTAB_LIB_WIDEN);
27157 if (tmp != destptr)
27158 emit_move_insn (destptr, tmp);
27159 if (!issetmem)
27161 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
27162 true, OPTAB_LIB_WIDEN);
27163 if (tmp != srcptr)
27164 emit_move_insn (srcptr, tmp);
27166 emit_label (out_label);
27169 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
27170 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
27171 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
27172 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
27173 ORIG_VALUE is the original value passed to memset to fill the memory with.
27174 Other arguments have the same meaning as for the previous function. */
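/* For example, a zero memset of a constant 32 bytes is emitted as rep stos of SImode elements: the count register is set to 8 and DESTEXP describes destptr plus (countreg << 2), exposing the length to the alias machinery. */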
27176 static void
27177 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
27178 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
27179 rtx count,
27180 machine_mode mode, bool issetmem)
27182 rtx destexp;
27183 rtx srcexp;
27184 rtx countreg;
27185 HOST_WIDE_INT rounded_count;
27187 /* If possible, it is shorter to use rep movs.
27188 TODO: Maybe it is better to move this logic to decide_alg. */
27189 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
27190 && (!issetmem || orig_value == const0_rtx))
27191 mode = SImode;
27193 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
27194 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
27196 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
27197 GET_MODE_SIZE (mode)));
27198 if (mode != QImode)
27200 destexp = gen_rtx_ASHIFT (Pmode, countreg,
27201 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27202 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
27204 else
27205 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
27206 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
27208 rounded_count
27209 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27210 destmem = shallow_copy_rtx (destmem);
27211 set_mem_size (destmem, rounded_count);
27213 else if (MEM_SIZE_KNOWN_P (destmem))
27214 clear_mem_size (destmem);
27216 if (issetmem)
27218 value = force_reg (mode, gen_lowpart (mode, value));
27219 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
27221 else
27223 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
27224 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
27225 if (mode != QImode)
27227 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
27228 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27229 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
27231 else
27232 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
27233 if (CONST_INT_P (count))
27235 rounded_count
27236 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27237 srcmem = shallow_copy_rtx (srcmem);
27238 set_mem_size (srcmem, rounded_count);
27240 else
27242 if (MEM_SIZE_KNOWN_P (srcmem))
27243 clear_mem_size (srcmem);
27245 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
27246 destexp, srcexp));
27250 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
27251 DESTMEM.
27252 SRCMEM is passed by pointer to be updated on return.
27253 The return value is the updated DESTMEM. */
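/* For instance, on a 64-bit SSE target a 16-byte SIZE_TO_MOVE is presumably handled as a single 16-byte vector load and store through a temporary register; if no suitable vector move exists, the code falls back to word_mode pieces. */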
27254 static rtx
27255 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
27256 HOST_WIDE_INT size_to_move)
27258 rtx dst = destmem, src = *srcmem, adjust, tempreg;
27259 enum insn_code code;
27260 machine_mode move_mode;
27261 int piece_size, i;
27263 /* Find the widest mode in which we could perform moves.
27264 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
27265 it until a move of that size is supported. */
27266 piece_size = 1 << floor_log2 (size_to_move);
27267 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
27268 code = optab_handler (mov_optab, move_mode);
27269 while (code == CODE_FOR_nothing && piece_size > 1)
27271 piece_size >>= 1;
27272 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
27273 code = optab_handler (mov_optab, move_mode);
27276 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27277 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27278 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27280 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27281 move_mode = mode_for_vector (word_mode, nunits);
27282 code = optab_handler (mov_optab, move_mode);
27283 if (code == CODE_FOR_nothing)
27285 move_mode = word_mode;
27286 piece_size = GET_MODE_SIZE (move_mode);
27287 code = optab_handler (mov_optab, move_mode);
27290 gcc_assert (code != CODE_FOR_nothing);
27292 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27293 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
27295 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
27296 gcc_assert (size_to_move % piece_size == 0);
27297 adjust = GEN_INT (piece_size);
27298 for (i = 0; i < size_to_move; i += piece_size)
27300 /* We move from memory to memory, so we'll need to do it via
27301 a temporary register. */
27302 tempreg = gen_reg_rtx (move_mode);
27303 emit_insn (GEN_FCN (code) (tempreg, src));
27304 emit_insn (GEN_FCN (code) (dst, tempreg));
27306 emit_move_insn (destptr,
27307 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27308 emit_move_insn (srcptr,
27309 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
27311 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27312 piece_size);
27313 src = adjust_automodify_address_nv (src, move_mode, srcptr,
27314 piece_size);
27317 /* Update DST and SRC rtx. */
27318 *srcmem = src;
27319 return dst;
27322 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
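/* With a constant count and MAX_SIZE of 16, for example, a remainder of 13 bytes (count % 16) is copied as an 8-byte, then a 4-byte, then a 1-byte move. */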
27323 static void
27324 expand_movmem_epilogue (rtx destmem, rtx srcmem,
27325 rtx destptr, rtx srcptr, rtx count, int max_size)
27327 rtx src, dest;
27328 if (CONST_INT_P (count))
27330 HOST_WIDE_INT countval = INTVAL (count);
27331 HOST_WIDE_INT epilogue_size = countval % max_size;
27332 int i;
27334 /* For now MAX_SIZE should be a power of 2. This assert could be
27335 relaxed, but it'll require a bit more complicated epilogue
27336 expanding. */
27337 gcc_assert ((max_size & (max_size - 1)) == 0);
27338 for (i = max_size; i >= 1; i >>= 1)
27340 if (epilogue_size & i)
27341 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27343 return;
27345 if (max_size > 8)
27347 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
27348 count, 1, OPTAB_DIRECT);
27349 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
27350 count, QImode, 1, 4, false);
27351 return;
27354 /* When there are stringops, we can cheaply increase dest and src pointers.
27355 Otherwise we save code size by maintaining offset (zero is readily
27356 available from preceding rep operation) and using x86 addressing modes. */
27358 if (TARGET_SINGLE_STRINGOP)
27360 if (max_size > 4)
27362 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27363 src = change_address (srcmem, SImode, srcptr);
27364 dest = change_address (destmem, SImode, destptr);
27365 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27366 emit_label (label);
27367 LABEL_NUSES (label) = 1;
27369 if (max_size > 2)
27371 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27372 src = change_address (srcmem, HImode, srcptr);
27373 dest = change_address (destmem, HImode, destptr);
27374 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27375 emit_label (label);
27376 LABEL_NUSES (label) = 1;
27378 if (max_size > 1)
27380 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27381 src = change_address (srcmem, QImode, srcptr);
27382 dest = change_address (destmem, QImode, destptr);
27383 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27384 emit_label (label);
27385 LABEL_NUSES (label) = 1;
27388 else
27390 rtx offset = force_reg (Pmode, const0_rtx);
27391 rtx tmp;
27393 if (max_size > 4)
27395 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27396 src = change_address (srcmem, SImode, srcptr);
27397 dest = change_address (destmem, SImode, destptr);
27398 emit_move_insn (dest, src);
27399 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
27400 true, OPTAB_LIB_WIDEN);
27401 if (tmp != offset)
27402 emit_move_insn (offset, tmp);
27403 emit_label (label);
27404 LABEL_NUSES (label) = 1;
27406 if (max_size > 2)
27408 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27409 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27410 src = change_address (srcmem, HImode, tmp);
27411 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27412 dest = change_address (destmem, HImode, tmp);
27413 emit_move_insn (dest, src);
27414 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
27415 true, OPTAB_LIB_WIDEN);
27416 if (tmp != offset)
27417 emit_move_insn (offset, tmp);
27418 emit_label (label);
27419 LABEL_NUSES (label) = 1;
27421 if (max_size > 1)
27423 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27424 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27425 src = change_address (srcmem, QImode, tmp);
27426 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27427 dest = change_address (destmem, QImode, tmp);
27428 emit_move_insn (dest, src);
27429 emit_label (label);
27430 LABEL_NUSES (label) = 1;
27435 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
27436 with value PROMOTED_VAL.
27437 There is no source memory to update, so nothing is passed by pointer.
27438 The return value is the updated DESTMEM. */
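/* For example, a SIZE_TO_MOVE of 8 with a DImode PROMOTED_VAL emits a single strset store on a 64-bit target, with the pointer update folded into the strset pattern; chunks wider than word_mode use a plain store followed by an explicit pointer adjustment. */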
27439 static rtx
27440 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
27441 HOST_WIDE_INT size_to_move)
27443 rtx dst = destmem, adjust;
27444 enum insn_code code;
27445 machine_mode move_mode;
27446 int piece_size, i;
27448 /* Find the widest mode in which we could perform moves.
27449 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
27450 it until move of such size is supported. */
27451 move_mode = GET_MODE (promoted_val);
27452 if (move_mode == VOIDmode)
27453 move_mode = QImode;
27454 if (size_to_move < GET_MODE_SIZE (move_mode))
27456 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
27457 promoted_val = gen_lowpart (move_mode, promoted_val);
27459 piece_size = GET_MODE_SIZE (move_mode);
27460 code = optab_handler (mov_optab, move_mode);
27461 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
27463 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27465 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
27466 gcc_assert (size_to_move % piece_size == 0);
27467 adjust = GEN_INT (piece_size);
27468 for (i = 0; i < size_to_move; i += piece_size)
27470 if (piece_size <= GET_MODE_SIZE (word_mode))
27472 emit_insn (gen_strset (destptr, dst, promoted_val));
27473 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27474 piece_size);
27475 continue;
27478 emit_insn (GEN_FCN (code) (dst, promoted_val));
27480 emit_move_insn (destptr,
27481 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27483 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27484 piece_size);
27487 /* Update DST rtx. */
27488 return dst;
27490 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
27491 static void
27492 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
27493 rtx count, int max_size)
27495 count =
27496 expand_simple_binop (counter_mode (count), AND, count,
27497 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
27498 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
27499 gen_lowpart (QImode, value), count, QImode,
27500 1, max_size / 2, true);
27503 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
27504 static void
27505 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
27506 rtx count, int max_size)
27508 rtx dest;
27510 if (CONST_INT_P (count))
27512 HOST_WIDE_INT countval = INTVAL (count);
27513 HOST_WIDE_INT epilogue_size = countval % max_size;
27514 int i;
27516 /* For now MAX_SIZE should be a power of 2. This assert could be
27517 relaxed, but it'll require a bit more complicated epilogue
27518 expanding. */
27519 gcc_assert ((max_size & (max_size - 1)) == 0);
27520 for (i = max_size; i >= 1; i >>= 1)
27522 if (epilogue_size & i)
27524 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27525 destmem = emit_memset (destmem, destptr, vec_value, i);
27526 else
27527 destmem = emit_memset (destmem, destptr, value, i);
27530 return;
27532 if (max_size > 32)
27534 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
27535 return;
27537 if (max_size > 16)
27539 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
27540 if (TARGET_64BIT)
27542 dest = change_address (destmem, DImode, destptr);
27543 emit_insn (gen_strset (destptr, dest, value));
27544 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
27545 emit_insn (gen_strset (destptr, dest, value));
27547 else
27549 dest = change_address (destmem, SImode, destptr);
27550 emit_insn (gen_strset (destptr, dest, value));
27551 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27552 emit_insn (gen_strset (destptr, dest, value));
27553 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
27554 emit_insn (gen_strset (destptr, dest, value));
27555 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
27556 emit_insn (gen_strset (destptr, dest, value));
27558 emit_label (label);
27559 LABEL_NUSES (label) = 1;
27561 if (max_size > 8)
27563 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
27564 if (TARGET_64BIT)
27566 dest = change_address (destmem, DImode, destptr);
27567 emit_insn (gen_strset (destptr, dest, value));
27569 else
27571 dest = change_address (destmem, SImode, destptr);
27572 emit_insn (gen_strset (destptr, dest, value));
27573 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27574 emit_insn (gen_strset (destptr, dest, value));
27576 emit_label (label);
27577 LABEL_NUSES (label) = 1;
27579 if (max_size > 4)
27581 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27582 dest = change_address (destmem, SImode, destptr);
27583 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
27584 emit_label (label);
27585 LABEL_NUSES (label) = 1;
27587 if (max_size > 2)
27589 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27590 dest = change_address (destmem, HImode, destptr);
27591 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
27592 emit_label (label);
27593 LABEL_NUSES (label) = 1;
27595 if (max_size > 1)
27597 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27598 dest = change_address (destmem, QImode, destptr);
27599 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
27600 emit_label (label);
27601 LABEL_NUSES (label) = 1;
27605 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
27606 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
27607 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
27608 ignored.
27609 Return value is updated DESTMEM. */
27610 static rtx
27611 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
27612 rtx destptr, rtx srcptr, rtx value,
27613 rtx vec_value, rtx count, int align,
27614 int desired_alignment, bool issetmem)
27616 int i;
27617 for (i = 1; i < desired_alignment; i <<= 1)
27619 if (align <= i)
27621 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
27622 if (issetmem)
27624 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27625 destmem = emit_memset (destmem, destptr, vec_value, i);
27626 else
27627 destmem = emit_memset (destmem, destptr, value, i);
27629 else
27630 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27631 ix86_adjust_counter (count, i);
27632 emit_label (label);
27633 LABEL_NUSES (label) = 1;
27634 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
27637 return destmem;
27640 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
27641 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
27642 and jump to DONE_LABEL. */
27643 static void
27644 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
27645 rtx destptr, rtx srcptr,
27646 rtx value, rtx vec_value,
27647 rtx count, int size,
27648 rtx done_label, bool issetmem)
27650 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
27651 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
27652 rtx modesize;
27653 int n;
27655 /* If we do not have a vector value to copy, we must reduce the size. */
27656 if (issetmem)
27658 if (!vec_value)
27660 if (GET_MODE (value) == VOIDmode && size > 8)
27661 mode = Pmode;
27662 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
27663 mode = GET_MODE (value);
27665 else
27666 mode = GET_MODE (vec_value), value = vec_value;
27668 else
27670 /* Choose appropriate vector mode. */
27671 if (size >= 32)
27672 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
27673 else if (size >= 16)
27674 mode = TARGET_SSE ? V16QImode : DImode;
27675 srcmem = change_address (srcmem, mode, srcptr);
27677 destmem = change_address (destmem, mode, destptr);
27678 modesize = GEN_INT (GET_MODE_SIZE (mode));
27679 gcc_assert (GET_MODE_SIZE (mode) <= size);
27680 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27682 if (issetmem)
27683 emit_move_insn (destmem, gen_lowpart (mode, value));
27684 else
27686 emit_move_insn (destmem, srcmem);
27687 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27689 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27692 destmem = offset_address (destmem, count, 1);
27693 destmem = offset_address (destmem, GEN_INT (-2 * size),
27694 GET_MODE_SIZE (mode));
27695 if (!issetmem)
27697 srcmem = offset_address (srcmem, count, 1);
27698 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
27699 GET_MODE_SIZE (mode));
27701 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27703 if (issetmem)
27704 emit_move_insn (destmem, gen_lowpart (mode, value));
27705 else
27707 emit_move_insn (destmem, srcmem);
27708 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27710 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27712 emit_jump_insn (gen_jump (done_label));
27713 emit_barrier ();
27715 emit_label (label);
27716 LABEL_NUSES (label) = 1;
27719 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
27720 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
27721 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
27722 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
27723 DONE_LABEL is a label after the whole copying sequence. The label is created
27724 on demand if *DONE_LABEL is NULL.
27725 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
27726 bounds after the initial copies.
27728 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
27729 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
27730 we will dispatch to a library call for large blocks.
27732 In pseudocode we do:
27734 if (COUNT < SIZE)
27736 Assume that SIZE is 4. Bigger sizes are handled analogously
27737 if (COUNT & 4)
27739 copy 4 bytes from SRCPTR to DESTPTR
27740 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
27741 goto done_label
27743 if (!COUNT)
27744 goto done_label;
27745 copy 1 byte from SRCPTR to DESTPTR
27746 if (COUNT & 2)
27748 copy 2 bytes from SRCPTR to DESTPTR
27749 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
27752 else
27754 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
27755 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
27757 OLD_DESTPTR = DESTPTR;
27758 Align DESTPTR up to DESIRED_ALIGN
27759 SRCPTR += DESTPTR - OLD_DESTPTR
27760 COUNT -= DESTPTR - OLD_DESTPTR
27761 if (DYNAMIC_CHECK)
27762 Round COUNT down to multiple of SIZE
27763 << optional caller supplied zero size guard is here >>
27764 << optional caller supplied dynamic check is here >>
27765 << caller supplied main copy loop is here >>
27767 done_label: */
27769 static void
27770 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
27771 rtx *destptr, rtx *srcptr,
27772 machine_mode mode,
27773 rtx value, rtx vec_value,
27774 rtx *count,
27775 rtx_code_label **done_label,
27776 int size,
27777 int desired_align,
27778 int align,
27779 unsigned HOST_WIDE_INT *min_size,
27780 bool dynamic_check,
27781 bool issetmem)
27783 rtx_code_label *loop_label = NULL, *label;
27784 int n;
27785 rtx modesize;
27786 int prolog_size = 0;
27787 rtx mode_value;
27789 /* Choose the proper value to copy. */
27790 if (issetmem && VECTOR_MODE_P (mode))
27791 mode_value = vec_value;
27792 else
27793 mode_value = value;
27794 gcc_assert (GET_MODE_SIZE (mode) <= size);
27796 /* See if block is big or small, handle small blocks. */
27797 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
27799 int size2 = size;
27800 loop_label = gen_label_rtx ();
27802 if (!*done_label)
27803 *done_label = gen_label_rtx ();
27805 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
27806 1, loop_label);
27807 size2 >>= 1;
27809 /* Handle sizes > 3. */
27810 for (;size2 > 2; size2 >>= 1)
27811 expand_small_movmem_or_setmem (destmem, srcmem,
27812 *destptr, *srcptr,
27813 value, vec_value,
27814 *count,
27815 size2, *done_label, issetmem);
27816 /* Nothing to copy? Jump to DONE_LABEL if so */
27817 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
27818 1, *done_label);
27820 /* Do a byte copy. */
27821 destmem = change_address (destmem, QImode, *destptr);
27822 if (issetmem)
27823 emit_move_insn (destmem, gen_lowpart (QImode, value));
27824 else
27826 srcmem = change_address (srcmem, QImode, *srcptr);
27827 emit_move_insn (destmem, srcmem);
27830 /* Handle sizes 2 and 3. */
27831 label = ix86_expand_aligntest (*count, 2, false);
27832 destmem = change_address (destmem, HImode, *destptr);
27833 destmem = offset_address (destmem, *count, 1);
27834 destmem = offset_address (destmem, GEN_INT (-2), 2);
27835 if (issetmem)
27836 emit_move_insn (destmem, gen_lowpart (HImode, value));
27837 else
27839 srcmem = change_address (srcmem, HImode, *srcptr);
27840 srcmem = offset_address (srcmem, *count, 1);
27841 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27842 emit_move_insn (destmem, srcmem);
27845 emit_label (label);
27846 LABEL_NUSES (label) = 1;
27847 emit_jump_insn (gen_jump (*done_label));
27848 emit_barrier ();
27850 else
27851 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27852 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27854 /* Start memcpy for COUNT >= SIZE. */
27855 if (loop_label)
27857 emit_label (loop_label);
27858 LABEL_NUSES (loop_label) = 1;
27861 /* Copy first desired_align bytes. */
27862 if (!issetmem)
27863 srcmem = change_address (srcmem, mode, *srcptr);
27864 destmem = change_address (destmem, mode, *destptr);
27865 modesize = GEN_INT (GET_MODE_SIZE (mode));
27866 for (n = 0; prolog_size < desired_align - align; n++)
27868 if (issetmem)
27869 emit_move_insn (destmem, mode_value);
27870 else
27872 emit_move_insn (destmem, srcmem);
27873 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27875 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27876 prolog_size += GET_MODE_SIZE (mode);
27880 /* Copy last SIZE bytes. */
27881 destmem = offset_address (destmem, *count, 1);
27882 destmem = offset_address (destmem,
27883 GEN_INT (-size - prolog_size),
27885 if (issetmem)
27886 emit_move_insn (destmem, mode_value);
27887 else
27889 srcmem = offset_address (srcmem, *count, 1);
27890 srcmem = offset_address (srcmem,
27891 GEN_INT (-size - prolog_size),
27893 emit_move_insn (destmem, srcmem);
27895 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27897 destmem = offset_address (destmem, modesize, 1);
27898 if (issetmem)
27899 emit_move_insn (destmem, mode_value);
27900 else
27902 srcmem = offset_address (srcmem, modesize, 1);
27903 emit_move_insn (destmem, srcmem);
27907 /* Align destination. */
27908 if (desired_align > 1 && desired_align > align)
27910 rtx saveddest = *destptr;
27912 gcc_assert (desired_align <= size);
27913 /* Align destptr up, place it to new register. */
27914 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27915 GEN_INT (prolog_size),
27916 NULL_RTX, 1, OPTAB_DIRECT);
27917 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27918 REG_POINTER (*destptr) = 1;
27919 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27920 GEN_INT (-desired_align),
27921 *destptr, 1, OPTAB_DIRECT);
27922 /* See how many bytes we skipped. */
27923 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27924 *destptr,
27925 saveddest, 1, OPTAB_DIRECT);
27926 /* Adjust srcptr and count. */
27927 if (!issetmem)
27928 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27929 saveddest, *srcptr, 1, OPTAB_DIRECT);
27930 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27931 saveddest, *count, 1, OPTAB_DIRECT);
27932 /* We copied at most size + prolog_size. */
27933 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27934 *min_size
27935 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27936 else
27937 *min_size = 0;
27939 /* Our loops always round down the block size, but for dispatch to
27940 the library we need the precise value. */
27941 if (dynamic_check)
27942 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
27943 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27945 else
27947 gcc_assert (prolog_size == 0);
27948 /* Decrease count, so we won't end up copying last word twice. */
27949 if (!CONST_INT_P (*count))
27950 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27951 constm1_rtx, *count, 1, OPTAB_DIRECT);
27952 else
27953 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27954 (unsigned HOST_WIDE_INT)size));
27955 if (*min_size)
27956 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
27961 /* This function is like the previous one, except here we know how many bytes
27962 need to be copied. That allows us to update alignment not only of DST, which
27963 is returned, but also of SRC, which is passed as a pointer for that
27964 reason. */
27965 static rtx
27966 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27967 rtx srcreg, rtx value, rtx vec_value,
27968 int desired_align, int align_bytes,
27969 bool issetmem)
27971 rtx src = NULL;
27972 rtx orig_dst = dst;
27973 rtx orig_src = NULL;
27974 int piece_size = 1;
27975 int copied_bytes = 0;
27977 if (!issetmem)
27979 gcc_assert (srcp != NULL);
27980 src = *srcp;
27981 orig_src = src;
27984 for (piece_size = 1;
27985 piece_size <= desired_align && copied_bytes < align_bytes;
27986 piece_size <<= 1)
27988 if (align_bytes & piece_size)
27990 if (issetmem)
27992 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27993 dst = emit_memset (dst, destreg, vec_value, piece_size);
27994 else
27995 dst = emit_memset (dst, destreg, value, piece_size);
27997 else
27998 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27999 copied_bytes += piece_size;
28002 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
28003 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28004 if (MEM_SIZE_KNOWN_P (orig_dst))
28005 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
28007 if (!issetmem)
28009 int src_align_bytes = get_mem_align_offset (src, desired_align
28010 * BITS_PER_UNIT);
28011 if (src_align_bytes >= 0)
28012 src_align_bytes = desired_align - src_align_bytes;
28013 if (src_align_bytes >= 0)
28015 unsigned int src_align;
28016 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
28018 if ((src_align_bytes & (src_align - 1))
28019 == (align_bytes & (src_align - 1)))
28020 break;
28022 if (src_align > (unsigned int) desired_align)
28023 src_align = desired_align;
28024 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
28025 set_mem_align (src, src_align * BITS_PER_UNIT);
28027 if (MEM_SIZE_KNOWN_P (orig_src))
28028 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
28029 *srcp = src;
28032 return dst;
28035 /* Return true if ALG can be used in current context.
28036 Assume we expand memset if MEMSET is true. */
28037 static bool
28038 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
28040 if (alg == no_stringop)
28041 return false;
28042 if (alg == vector_loop)
28043 return TARGET_SSE || TARGET_AVX;
28044 /* Algorithms using the rep prefix want at least edi and ecx;
28045 additionally, memset wants eax and memcpy wants esi. Don't
28046 consider such algorithms if the user has appropriated those
28047 registers for their own purposes, or if we have a non-default
28048 address space, since some string insns cannot override the segment. */
28049 if (alg == rep_prefix_1_byte
28050 || alg == rep_prefix_4_byte
28051 || alg == rep_prefix_8_byte)
28053 if (have_as)
28054 return false;
28055 if (fixed_regs[CX_REG]
28056 || fixed_regs[DI_REG]
28057 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
28058 return false;
28060 return true;
28063 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
28064 static enum stringop_alg
28065 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
28066 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
28067 bool memset, bool zero_memset, bool have_as,
28068 int *dynamic_check, bool *noalign, bool recur)
28070 const struct stringop_algs *algs;
28071 bool optimize_for_speed;
28072 int max = 0;
28073 const struct processor_costs *cost;
28074 int i;
28075 bool any_alg_usable_p = false;
28077 *noalign = false;
28078 *dynamic_check = -1;
28080 /* Even if the string operation call is cold, we still might spend a lot
28081 of time processing large blocks. */
28082 if (optimize_function_for_size_p (cfun)
28083 || (optimize_insn_for_size_p ()
28084 && (max_size < 256
28085 || (expected_size != -1 && expected_size < 256))))
28086 optimize_for_speed = false;
28087 else
28088 optimize_for_speed = true;
28090 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
28091 if (memset)
28092 algs = &cost->memset[TARGET_64BIT != 0];
28093 else
28094 algs = &cost->memcpy[TARGET_64BIT != 0];
28096 /* See maximal size for user defined algorithm. */
28097 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28099 enum stringop_alg candidate = algs->size[i].alg;
28100 bool usable = alg_usable_p (candidate, memset, have_as);
28101 any_alg_usable_p |= usable;
28103 if (candidate != libcall && candidate && usable)
28104 max = algs->size[i].max;
28107 /* If the expected size is not known but the max size is small enough
28108 that the inline version is a win, set the expected size into
28109 the range. */
28110 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
28111 && expected_size == -1)
28112 expected_size = min_size / 2 + max_size / 2;
28114 /* If user specified the algorithm, honor it if possible. */
28115 if (ix86_stringop_alg != no_stringop
28116 && alg_usable_p (ix86_stringop_alg, memset, have_as))
28117 return ix86_stringop_alg;
28118 /* rep; movq or rep; movl is the smallest variant. */
28119 else if (!optimize_for_speed)
28121 *noalign = true;
28122 if (!count || (count & 3) || (memset && !zero_memset))
28123 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
28124 ? rep_prefix_1_byte : loop_1_byte;
28125 else
28126 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
28127 ? rep_prefix_4_byte : loop;
28129 /* Very tiny blocks are best handled via the loop; REP is expensive to
28130 set up. */
28131 else if (expected_size != -1 && expected_size < 4)
28132 return loop_1_byte;
28133 else if (expected_size != -1)
28135 enum stringop_alg alg = libcall;
28136 bool alg_noalign = false;
28137 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28139 /* We get here if the algorithms that were not libcall-based
28140 were rep-prefix based and we are unable to use rep prefixes
28141 based on global register usage. Break out of the loop and
28142 use the heuristic below. */
28143 if (algs->size[i].max == 0)
28144 break;
28145 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
28147 enum stringop_alg candidate = algs->size[i].alg;
28149 if (candidate != libcall
28150 && alg_usable_p (candidate, memset, have_as))
28152 alg = candidate;
28153 alg_noalign = algs->size[i].noalign;
28155 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
28156 last non-libcall inline algorithm. */
28157 if (TARGET_INLINE_ALL_STRINGOPS)
28159 /* When the current size is best to be copied by a libcall,
28160 but we are still forced to inline, run the heuristic below
28161 that will pick code for medium sized blocks. */
28162 if (alg != libcall)
28164 *noalign = alg_noalign;
28165 return alg;
28167 else if (!any_alg_usable_p)
28168 break;
28170 else if (alg_usable_p (candidate, memset, have_as))
28172 *noalign = algs->size[i].noalign;
28173 return candidate;
28178 /* When asked to inline the call anyway, try to pick a meaningful choice.
28179 We look for the maximal size of block that is faster to copy by hand and
28180 take blocks of at most that size, guessing that the average size will
28181 be roughly half of the block.
28183 If this turns out to be bad, we might simply specify the preferred
28184 choice in ix86_costs. */
28185 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28186 && (algs->unknown_size == libcall
28187 || !alg_usable_p (algs->unknown_size, memset, have_as)))
28189 enum stringop_alg alg;
28190 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
28192 /* If there aren't any usable algorithms or if recursing already,
28193 then recursing on smaller sizes or same size isn't going to
28194 find anything. Just return the simple byte-at-a-time copy loop. */
28195 if (!any_alg_usable_p || recur)
28197 /* Pick something reasonable. */
28198 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
28199 *dynamic_check = 128;
28200 return loop_1_byte;
28202 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
28203 zero_memset, have_as, dynamic_check, noalign, true);
28204 gcc_assert (*dynamic_check == -1);
28205 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28206 *dynamic_check = max;
28207 else
28208 gcc_assert (alg != libcall);
28209 return alg;
28211 return (alg_usable_p (algs->unknown_size, memset, have_as)
28212 ? algs->unknown_size : libcall);
28215 /* Decide on alignment. We know that the operand is already aligned to ALIGN
28216 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
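/* For example, a vector_loop moving 16-byte vector chunks asks for 16-byte destination alignment, while optimizing for size drops the request back down to the alignment we already have. */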
28217 static int
28218 decide_alignment (int align,
28219 enum stringop_alg alg,
28220 int expected_size,
28221 machine_mode move_mode)
28223 int desired_align = 0;
28225 gcc_assert (alg != no_stringop);
28227 if (alg == libcall)
28228 return 0;
28229 if (move_mode == VOIDmode)
28230 return 0;
28232 desired_align = GET_MODE_SIZE (move_mode);
28233 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
28234 copying a whole cacheline at once. */
28235 if (TARGET_PENTIUMPRO
28236 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
28237 desired_align = 8;
28239 if (optimize_size)
28240 desired_align = 1;
28241 if (desired_align < align)
28242 desired_align = align;
28243 if (expected_size != -1 && expected_size < 4)
28244 desired_align = align;
28246 return desired_align;
28250 /* Helper function for memset. For QImode value 0xXY produce
28251 0xXYXYXYXY of the width specified by MODE. This is essentially
28252 a * 0x10101010, but we can do slightly better than
28253 synth_mult by unwinding the sequence by hand on CPUs with
28254 slow multiply. */
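/* E.g. for VAL 0x41 this produces a register holding 0x41414141 in SImode, or 0x4141414141414141 in DImode. */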
28255 static rtx
28256 promote_duplicated_reg (machine_mode mode, rtx val)
28258 machine_mode valmode = GET_MODE (val);
28259 rtx tmp;
28260 int nops = mode == DImode ? 3 : 2;
28262 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
28263 if (val == const0_rtx)
28264 return copy_to_mode_reg (mode, CONST0_RTX (mode));
28265 if (CONST_INT_P (val))
28267 HOST_WIDE_INT v = INTVAL (val) & 255;
28269 v |= v << 8;
28270 v |= v << 16;
28271 if (mode == DImode)
28272 v |= (v << 16) << 16;
28273 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
28276 if (valmode == VOIDmode)
28277 valmode = QImode;
28278 if (valmode != QImode)
28279 val = gen_lowpart (QImode, val);
28280 if (mode == QImode)
28281 return val;
28282 if (!TARGET_PARTIAL_REG_STALL)
28283 nops--;
28284 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
28285 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
28286 <= (ix86_cost->shift_const + ix86_cost->add) * nops
28287 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
28289 rtx reg = convert_modes (mode, QImode, val, true);
28290 tmp = promote_duplicated_reg (mode, const1_rtx);
28291 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
28292 OPTAB_DIRECT);
28294 else
28296 rtx reg = convert_modes (mode, QImode, val, true);
28298 if (!TARGET_PARTIAL_REG_STALL)
28299 if (mode == SImode)
28300 emit_insn (gen_insvsi_1 (reg, reg));
28301 else
28302 emit_insn (gen_insvdi_1 (reg, reg));
28303 else
28305 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
28306 NULL, 1, OPTAB_DIRECT);
28307 reg =
28308 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28310 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
28311 NULL, 1, OPTAB_DIRECT);
28312 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28313 if (mode == SImode)
28314 return reg;
28315 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
28316 NULL, 1, OPTAB_DIRECT);
28317 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28318 return reg;
28322 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
28323 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
28324 alignment from ALIGN to DESIRED_ALIGN. */
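/* For instance, SIZE_NEEDED of 8 on a 64-bit target duplicates VAL into DImode, SIZE_NEEDED of 4 into SImode, and SIZE_NEEDED of 1 leaves VAL as is unless the alignment prologue needs a wider value. */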
28325 static rtx
28326 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
28327 int align)
28329 rtx promoted_val;
28331 if (TARGET_64BIT
28332 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
28333 promoted_val = promote_duplicated_reg (DImode, val);
28334 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
28335 promoted_val = promote_duplicated_reg (SImode, val);
28336 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
28337 promoted_val = promote_duplicated_reg (HImode, val);
28338 else
28339 promoted_val = val;
28341 return promoted_val;
28344 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
28345 operations when profitable. The code depends upon architecture, block size
28346 and alignment, but always has one of the following overall structures:
28348 Aligned move sequence:
28350 1) Prologue guard: Conditional that jumps up to epilogues for small
28351 blocks that can be handled by epilogue alone. This is faster
28352 but is also needed for correctness, since the prologue assumes the block
28353 is larger than the desired alignment.
28355 Optional dynamic check for size and libcall for large
28356 blocks is emitted here too, with -minline-stringops-dynamically.
28358 2) Prologue: copy first few bytes in order to get destination
28359 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
28360 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
28361 copied. We emit either a jump tree on power of two sized
28362 blocks, or a byte loop.
28364 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28365 with specified algorithm.
28367 4) Epilogue: code copying tail of the block that is too small to be
28368 handled by main body (or up to size guarded by prologue guard).
28370 Misaligned move sequence
28372 1) missaligned move prologue/epilogue containing:
28373 a) Prologue handling small memory blocks and jumping to done_label
28374 (skipped if blocks are known to be large enough)
28375 b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
28376 needed by single possibly misaligned move
28377 (skipped if alignment is not needed)
28378 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
28380 2) Zero size guard dispatching to done_label, if needed
28382 3) dispatch to library call, if needed,
28384 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28385 with specified algorithm. */
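/* As a rough illustration, a memset of a block of unknown size using the
   rep_prefix_8_byte algorithm follows the aligned structure above: a guard
   jump to the epilogue for tiny blocks, a short prologue aligning the
   destination, a "rep stosq" main body, and an epilogue storing the
   remaining tail bytes.  */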
28386 bool
28387 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
28388 rtx align_exp, rtx expected_align_exp,
28389 rtx expected_size_exp, rtx min_size_exp,
28390 rtx max_size_exp, rtx probable_max_size_exp,
28391 bool issetmem)
28393 rtx destreg;
28394 rtx srcreg = NULL;
28395 rtx_code_label *label = NULL;
28396 rtx tmp;
28397 rtx_code_label *jump_around_label = NULL;
28398 HOST_WIDE_INT align = 1;
28399 unsigned HOST_WIDE_INT count = 0;
28400 HOST_WIDE_INT expected_size = -1;
28401 int size_needed = 0, epilogue_size_needed;
28402 int desired_align = 0, align_bytes = 0;
28403 enum stringop_alg alg;
28404 rtx promoted_val = NULL;
28405 rtx vec_promoted_val = NULL;
28406 bool force_loopy_epilogue = false;
28407 int dynamic_check;
28408 bool need_zero_guard = false;
28409 bool noalign;
28410 machine_mode move_mode = VOIDmode;
28411 int unroll_factor = 1;
28412 /* TODO: Once value ranges are available, fill in proper data. */
28413 unsigned HOST_WIDE_INT min_size = 0;
28414 unsigned HOST_WIDE_INT max_size = -1;
28415 unsigned HOST_WIDE_INT probable_max_size = -1;
28416 bool misaligned_prologue_used = false;
28417 bool have_as;
28419 if (CONST_INT_P (align_exp))
28420 align = INTVAL (align_exp);
28421 /* i386 can do misaligned access at a reasonably increased cost. */
28422 if (CONST_INT_P (expected_align_exp)
28423 && INTVAL (expected_align_exp) > align)
28424 align = INTVAL (expected_align_exp);
28425 /* ALIGN is the minimum of destination and source alignment, but we care here
28426 just about destination alignment. */
28427 else if (!issetmem
28428 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
28429 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
28431 if (CONST_INT_P (count_exp))
28433 min_size = max_size = probable_max_size = count = expected_size
28434 = INTVAL (count_exp);
28435 /* When COUNT is 0, there is nothing to do. */
28436 if (!count)
28437 return true;
28439 else
28441 if (min_size_exp)
28442 min_size = INTVAL (min_size_exp);
28443 if (max_size_exp)
28444 max_size = INTVAL (max_size_exp);
28445 if (probable_max_size_exp)
28446 probable_max_size = INTVAL (probable_max_size_exp);
28447 if (CONST_INT_P (expected_size_exp))
28448 expected_size = INTVAL (expected_size_exp);
28451 /* Make sure we don't need to care about overflow later on. */
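/* Blocks with a known size above 1 GiB are simply left to the generic code. */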
28452 if (count > (HOST_WIDE_INT_1U << 30))
28453 return false;
28455 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
28456 if (!issetmem)
28457 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
28459 /* Step 0: Decide on preferred algorithm, desired alignment and
28460 size of chunks to be copied by main loop. */
28461 alg = decide_alg (count, expected_size, min_size, probable_max_size,
28462 issetmem,
28463 issetmem && val_exp == const0_rtx, have_as,
28464 &dynamic_check, &noalign, false);
28465 if (alg == libcall)
28466 return false;
28467 gcc_assert (alg != no_stringop);
28469 /* For now the vector version of memset is generated only for memory zeroing, as
28470 creating the promoted vector value is very cheap in this case. */
28471 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
28472 alg = unrolled_loop;
28474 if (!count)
28475 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
28476 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
28477 if (!issetmem)
28478 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
28480 unroll_factor = 1;
28481 move_mode = word_mode;
28482 switch (alg)
28484 case libcall:
28485 case no_stringop:
28486 case last_alg:
28487 gcc_unreachable ();
28488 case loop_1_byte:
28489 need_zero_guard = true;
28490 move_mode = QImode;
28491 break;
28492 case loop:
28493 need_zero_guard = true;
28494 break;
28495 case unrolled_loop:
28496 need_zero_guard = true;
28497 unroll_factor = (TARGET_64BIT ? 4 : 2);
28498 break;
28499 case vector_loop:
28500 need_zero_guard = true;
28501 unroll_factor = 4;
28502 /* Find the widest supported mode. */
28503 move_mode = word_mode;
28504 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
28505 != CODE_FOR_nothing)
28506 move_mode = GET_MODE_WIDER_MODE (move_mode);
28508 /* Find the corresponding vector mode with the same size as MOVE_MODE.
28509 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
28510 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
28512 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
28513 move_mode = mode_for_vector (word_mode, nunits);
28514 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
28515 move_mode = word_mode;
28517 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
28518 break;
28519 case rep_prefix_8_byte:
28520 move_mode = DImode;
28521 break;
28522 case rep_prefix_4_byte:
28523 move_mode = SImode;
28524 break;
28525 case rep_prefix_1_byte:
28526 move_mode = QImode;
28527 break;
28529 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
28530 epilogue_size_needed = size_needed;
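/* E.g. unrolled_loop on a 64-bit target moves 4 * 8 = 32 bytes per
   iteration, while rep_prefix_4_byte moves 4 bytes at a time.  */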
28532 /* If we are going to emit any library calls conditionally, make sure any
28533 pending stack adjustments happen before the first conditional branch,
28534 otherwise they will be emitted only before the library call and won't
28535 happen on the other branches. */
28536 if (dynamic_check != -1)
28537 do_pending_stack_adjust ();
28539 desired_align = decide_alignment (align, alg, expected_size, move_mode);
28540 if (!TARGET_ALIGN_STRINGOPS || noalign)
28541 align = desired_align;
28543 /* Step 1: Prologue guard. */
28545 /* Alignment code needs count to be in register. */
28546 if (CONST_INT_P (count_exp) && desired_align > align)
28548 if (INTVAL (count_exp) > desired_align
28549 && INTVAL (count_exp) > size_needed)
28551 align_bytes
28552 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
28553 if (align_bytes <= 0)
28554 align_bytes = 0;
28555 else
28556 align_bytes = desired_align - align_bytes;
28558 if (align_bytes == 0)
28559 count_exp = force_reg (counter_mode (count_exp), count_exp);
28561 gcc_assert (desired_align >= 1 && align >= 1);
28563 /* Misaligned move sequences handle both prologue and epilogue at once.
28564 Default code generation results in smaller code for large alignments
28565 and also avoids redundant work when sizes are known precisely. */
28566 misaligned_prologue_used
28567 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
28568 && MAX (desired_align, epilogue_size_needed) <= 32
28569 && desired_align <= epilogue_size_needed
28570 && ((desired_align > align && !align_bytes)
28571 || (!count && epilogue_size_needed > 1)));
28573 /* Do the cheap promotion to allow better CSE across the
28574 main loop and epilogue (i.e. one load of the big constant in
28575 front of all code).
28576 For now the misaligned move sequences do not have a fast path
28577 without broadcasting. */
28578 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
28580 if (alg == vector_loop)
28582 gcc_assert (val_exp == const0_rtx);
28583 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
28584 promoted_val = promote_duplicated_reg_to_size (val_exp,
28585 GET_MODE_SIZE (word_mode),
28586 desired_align, align);
28588 else
28590 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28591 desired_align, align);
28594 /* Misaligned move sequences handle both prologues and epilogues at once.
28595 Default code generation results in smaller code for large alignments and
28596 also avoids redundant work when sizes are known precisely. */
28597 if (misaligned_prologue_used)
28599 /* The misaligned move prologue handles small blocks by itself. */
28600 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
28601 (dst, src, &destreg, &srcreg,
28602 move_mode, promoted_val, vec_promoted_val,
28603 &count_exp,
28604 &jump_around_label,
28605 desired_align < align
28606 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
28607 desired_align, align, &min_size, dynamic_check, issetmem);
28608 if (!issetmem)
28609 src = change_address (src, BLKmode, srcreg);
28610 dst = change_address (dst, BLKmode, destreg);
28611 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28612 epilogue_size_needed = 0;
28613 if (need_zero_guard
28614 && min_size < (unsigned HOST_WIDE_INT) size_needed)
28616 /* It is possible that we copied enough so the main loop will not
28617 execute. */
28618 gcc_assert (size_needed > 1);
28619 if (jump_around_label == NULL_RTX)
28620 jump_around_label = gen_label_rtx ();
28621 emit_cmp_and_jump_insns (count_exp,
28622 GEN_INT (size_needed),
28623 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
28624 if (expected_size == -1
28625 || expected_size < (desired_align - align) / 2 + size_needed)
28626 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28627 else
28628 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28631 /* Ensure that alignment prologue won't copy past end of block. */
28632 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
28634 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
28635 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
28636 Make sure it is power of 2. */
28637 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
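/* E.g. with size_needed == 32 the MAX above is typically 31, which this
   turns into 32, the smallest power of two greater than it.  */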
28639 /* To improve performance of small blocks, we jump around the VAL
28640 promoting code. This means that if the promoted VAL is not constant,
28641 we might not use it in the epilogue and have to use the byte
28642 loop variant. */
28643 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
28644 force_loopy_epilogue = true;
28645 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28646 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28648 /* If main algorithm works on QImode, no epilogue is needed.
28649 For small sizes just don't align anything. */
28650 if (size_needed == 1)
28651 desired_align = align;
28652 else
28653 goto epilogue;
28655 else if (!count
28656 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28658 label = gen_label_rtx ();
28659 emit_cmp_and_jump_insns (count_exp,
28660 GEN_INT (epilogue_size_needed),
28661 LTU, 0, counter_mode (count_exp), 1, label);
28662 if (expected_size == -1 || expected_size < epilogue_size_needed)
28663 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28664 else
28665 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28669 /* Emit code to decide on runtime whether library call or inline should be
28670 used. */
28671 if (dynamic_check != -1)
28673 if (!issetmem && CONST_INT_P (count_exp))
28675 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
28677 emit_block_copy_via_libcall (dst, src, count_exp);
28678 count_exp = const0_rtx;
28679 goto epilogue;
28682 else
28684 rtx_code_label *hot_label = gen_label_rtx ();
28685 if (jump_around_label == NULL_RTX)
28686 jump_around_label = gen_label_rtx ();
28687 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
28688 LEU, 0, counter_mode (count_exp),
28689 1, hot_label);
28690 predict_jump (REG_BR_PROB_BASE * 90 / 100);
28691 if (issetmem)
28692 set_storage_via_libcall (dst, count_exp, val_exp);
28693 else
28694 emit_block_copy_via_libcall (dst, src, count_exp);
28695 emit_jump (jump_around_label);
28696 emit_label (hot_label);
28700 /* Step 2: Alignment prologue. */
28701 /* Do the expensive promotion once we branched off the small blocks. */
28702 if (issetmem && !promoted_val)
28703 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28704 desired_align, align);
28706 if (desired_align > align && !misaligned_prologue_used)
28708 if (align_bytes == 0)
28710 /* Except for the first move in the prologue, we no longer know
28711 the constant offset in aliasing info. It doesn't seem worth
28712 the pain to maintain it for the first move, so throw away
28713 the info early. */
28714 dst = change_address (dst, BLKmode, destreg);
28715 if (!issetmem)
28716 src = change_address (src, BLKmode, srcreg);
28717 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
28718 promoted_val, vec_promoted_val,
28719 count_exp, align, desired_align,
28720 issetmem);
28721 /* At most desired_align - align bytes are copied. */
28722 if (min_size < (unsigned)(desired_align - align))
28723 min_size = 0;
28724 else
28725 min_size -= desired_align - align;
28727 else
28729 /* If we know how many bytes need to be stored before dst is
28730 sufficiently aligned, maintain aliasing info accurately. */
28731 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
28732 srcreg,
28733 promoted_val,
28734 vec_promoted_val,
28735 desired_align,
28736 align_bytes,
28737 issetmem);
28739 count_exp = plus_constant (counter_mode (count_exp),
28740 count_exp, -align_bytes);
28741 count -= align_bytes;
28742 min_size -= align_bytes;
28743 max_size -= align_bytes;
28745 if (need_zero_guard
28746 && min_size < (unsigned HOST_WIDE_INT) size_needed
28747 && (count < (unsigned HOST_WIDE_INT) size_needed
28748 || (align_bytes == 0
28749 && count < ((unsigned HOST_WIDE_INT) size_needed
28750 + desired_align - align))))
28752 /* It is possible that we copied enough so the main loop will not
28753 execute. */
28754 gcc_assert (size_needed > 1);
28755 if (label == NULL_RTX)
28756 label = gen_label_rtx ();
28757 emit_cmp_and_jump_insns (count_exp,
28758 GEN_INT (size_needed),
28759 LTU, 0, counter_mode (count_exp), 1, label);
28760 if (expected_size == -1
28761 || expected_size < (desired_align - align) / 2 + size_needed)
28762 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28763 else
28764 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28767 if (label && size_needed == 1)
28769 emit_label (label);
28770 LABEL_NUSES (label) = 1;
28771 label = NULL;
28772 epilogue_size_needed = 1;
28773 if (issetmem)
28774 promoted_val = val_exp;
28776 else if (label == NULL_RTX && !misaligned_prologue_used)
28777 epilogue_size_needed = size_needed;
28779 /* Step 3: Main loop. */
28781 switch (alg)
28783 case libcall:
28784 case no_stringop:
28785 case last_alg:
28786 gcc_unreachable ();
28787 case loop_1_byte:
28788 case loop:
28789 case unrolled_loop:
28790 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
28791 count_exp, move_mode, unroll_factor,
28792 expected_size, issetmem);
28793 break;
28794 case vector_loop:
28795 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
28796 vec_promoted_val, count_exp, move_mode,
28797 unroll_factor, expected_size, issetmem);
28798 break;
28799 case rep_prefix_8_byte:
28800 case rep_prefix_4_byte:
28801 case rep_prefix_1_byte:
28802 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
28803 val_exp, count_exp, move_mode, issetmem);
28804 break;
28806 /* Adjust properly the offset of src and dest memory for aliasing. */
28807 if (CONST_INT_P (count_exp))
28809 if (!issetmem)
28810 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
28811 (count / size_needed) * size_needed);
28812 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
28813 (count / size_needed) * size_needed);
28815 else
28817 if (!issetmem)
28818 src = change_address (src, BLKmode, srcreg);
28819 dst = change_address (dst, BLKmode, destreg);
28822 /* Step 4: Epilogue to copy the remaining bytes. */
28823 epilogue:
28824 if (label)
28826 /* When the main loop is done, COUNT_EXP might hold original count,
28827 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
28828 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
28829 bytes. Compensate if needed. */
28831 if (size_needed < epilogue_size_needed)
28833 tmp =
28834 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28835 GEN_INT (size_needed - 1), count_exp, 1,
28836 OPTAB_DIRECT);
28837 if (tmp != count_exp)
28838 emit_move_insn (count_exp, tmp);
28840 emit_label (label);
28841 LABEL_NUSES (label) = 1;
28844 if (count_exp != const0_rtx && epilogue_size_needed > 1)
28846 if (force_loopy_epilogue)
28847 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28848 epilogue_size_needed);
28849 else
28851 if (issetmem)
28852 expand_setmem_epilogue (dst, destreg, promoted_val,
28853 vec_promoted_val, count_exp,
28854 epilogue_size_needed);
28855 else
28856 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28857 epilogue_size_needed);
28860 if (jump_around_label)
28861 emit_label (jump_around_label);
28862 return true;
28866 /* Expand the appropriate insns for doing strlen if not just doing
28867 repnz; scasb
28869 out = result, initialized with the start address
28870 align_rtx = alignment of the address.
28871 scratch = scratch register, initialized with the start address when
28872 not aligned, otherwise undefined
28874 This is just the body. It needs the initializations mentioned above and
28875 some address computation at the end. These things are done in i386.md. */
28877 static void
28878 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
28880 int align;
28881 rtx tmp;
28882 rtx_code_label *align_2_label = NULL;
28883 rtx_code_label *align_3_label = NULL;
28884 rtx_code_label *align_4_label = gen_label_rtx ();
28885 rtx_code_label *end_0_label = gen_label_rtx ();
28886 rtx mem;
28887 rtx tmpreg = gen_reg_rtx (SImode);
28888 rtx scratch = gen_reg_rtx (SImode);
28889 rtx cmp;
28891 align = 0;
28892 if (CONST_INT_P (align_rtx))
28893 align = INTVAL (align_rtx);
28895 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
28897 /* Is there a known alignment and is it less than 4? */
28898 if (align < 4)
28900 rtx scratch1 = gen_reg_rtx (Pmode);
28901 emit_move_insn (scratch1, out);
28902 /* Is there a known alignment and is it not 2? */
28903 if (align != 2)
28905 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28906 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28908 /* Leave just the 3 lower bits. */
28909 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28910 NULL_RTX, 0, OPTAB_WIDEN);
28912 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28913 Pmode, 1, align_4_label);
28914 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28915 Pmode, 1, align_2_label);
28916 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28917 Pmode, 1, align_3_label);
28919 else
28921 /* Since the alignment is 2, we have to check 2 or 0 bytes;
28922 check if it is aligned to a 4-byte boundary. */
28924 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28925 NULL_RTX, 0, OPTAB_WIDEN);
28927 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28928 Pmode, 1, align_4_label);
28931 mem = change_address (src, QImode, out);
28933 /* Now compare the bytes. */
28935 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
28936 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28937 QImode, 1, end_0_label);
28939 /* Increment the address. */
28940 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28942 /* Not needed with an alignment of 2 */
28943 if (align != 2)
28945 emit_label (align_2_label);
28947 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28948 end_0_label);
28950 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28952 emit_label (align_3_label);
28955 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28956 end_0_label);
28958 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28961 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
28962 align this loop; it only makes the program larger and does not help
28963 to speed it up. */
28964 emit_label (align_4_label);
28966 mem = change_address (src, SImode, out);
28967 emit_move_insn (scratch, mem);
28968 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28970 /* This formula yields a nonzero result iff one of the bytes is zero.
28971 This saves three branches inside the loop and many cycles. */
28973 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28974 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28975 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28976 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28977 gen_int_mode (0x80808080, SImode)));
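/* The insns above compute (x - 0x01010101) & ~x & 0x80808080, the classic
   "word contains a zero byte" test: e.g. x == 0x11220033 yields 0x00008000,
   while any word without a zero byte yields 0.  */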
28978 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28979 align_4_label);
28981 if (TARGET_CMOVE)
28983 rtx reg = gen_reg_rtx (SImode);
28984 rtx reg2 = gen_reg_rtx (Pmode);
28985 emit_move_insn (reg, tmpreg);
28986 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28988 /* If zero is not in the first two bytes, move two bytes forward. */
28989 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28990 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28991 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28992 emit_insn (gen_rtx_SET (tmpreg,
28993 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28994 reg,
28995 tmpreg)));
28996 /* Emit lea manually to avoid clobbering of flags. */
28997 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28999 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29000 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
29001 emit_insn (gen_rtx_SET (out,
29002 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
29003 reg2,
29004 out)));
29006 else
29008 rtx_code_label *end_2_label = gen_label_rtx ();
29009 /* Is zero in the first two bytes? */
29011 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
29012 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29013 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
29014 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
29015 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
29016 pc_rtx);
29017 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
29018 JUMP_LABEL (tmp) = end_2_label;
29020 /* Not in the first two. Move two bytes forward. */
29021 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
29022 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
29024 emit_label (end_2_label);
29028 /* Avoid branch in fixing the byte. */
29029 tmpreg = gen_lowpart (QImode, tmpreg);
29030 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
29031 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
29032 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
29033 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
29035 emit_label (end_0_label);
29038 /* Expand strlen. */
29040 bool
29041 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
29043 rtx addr, scratch1, scratch2, scratch3, scratch4;
29045 /* The generic case of the strlen expander is long. Avoid its
29046 expansion unless TARGET_INLINE_ALL_STRINGOPS. */
29048 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29049 && !TARGET_INLINE_ALL_STRINGOPS
29050 && !optimize_insn_for_size_p ()
29051 && (!CONST_INT_P (align) || INTVAL (align) < 4))
29052 return false;
29054 addr = force_reg (Pmode, XEXP (src, 0));
29055 scratch1 = gen_reg_rtx (Pmode);
29057 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29058 && !optimize_insn_for_size_p ())
29060 /* Well it seems that some optimizer does not combine a call like
29061 foo(strlen(bar), strlen(bar));
29062 when the move and the subtraction are done here. It does calculate
29063 the length just once when these instructions are done inside of
29064 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
29065 often used and I use one fewer register for the lifetime of
29066 output_strlen_unroll(), this is better. */
29068 emit_move_insn (out, addr);
29070 ix86_expand_strlensi_unroll_1 (out, src, align);
29072 /* strlensi_unroll_1 returns the address of the zero at the end of
29073 the string, like memchr(), so compute the length by subtracting
29074 the start address. */
29075 emit_insn (ix86_gen_sub3 (out, out, addr));
29077 else
29079 rtx unspec;
29081 /* Can't use this if the user has appropriated eax, ecx, or edi. */
29082 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
29083 return false;
29084 /* Can't use this for non-default address spaces. */
29085 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
29086 return false;
29088 scratch2 = gen_reg_rtx (Pmode);
29089 scratch3 = gen_reg_rtx (Pmode);
29090 scratch4 = force_reg (Pmode, constm1_rtx);
29092 emit_move_insn (scratch3, addr);
29093 eoschar = force_reg (QImode, eoschar);
29095 src = replace_equiv_address_nv (src, scratch3);
29097 /* If .md starts supporting :P, this can be done in .md. */
29098 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
29099 scratch4), UNSPEC_SCAS);
29100 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
29101 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
29102 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
29104 return true;
29107 /* For a given symbol (function), construct code to compute the address of its
29108 PLT entry in the large x86-64 PIC model. */
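/* Roughly, the emitted sequence loads symbol@PLTOFF into a fresh register
   and adds the PIC register to it, giving the address of the PLT entry.  */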
29109 static rtx
29110 construct_plt_address (rtx symbol)
29112 rtx tmp, unspec;
29114 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
29115 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
29116 gcc_assert (Pmode == DImode);
29118 tmp = gen_reg_rtx (Pmode);
29119 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
29121 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
29122 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
29123 return tmp;
29127 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
29128 rtx callarg2,
29129 rtx pop, bool sibcall)
29131 rtx vec[3];
29132 rtx use = NULL, call;
29133 unsigned int vec_len = 0;
29134 tree fndecl;
29136 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29138 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
29139 if (fndecl
29140 && (lookup_attribute ("interrupt",
29141 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
29142 error ("interrupt service routine can't be called directly");
29144 else
29145 fndecl = NULL_TREE;
29147 if (pop == const0_rtx)
29148 pop = NULL;
29149 gcc_assert (!TARGET_64BIT || !pop);
29151 if (TARGET_MACHO && !TARGET_64BIT)
29153 #if TARGET_MACHO
29154 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29155 fnaddr = machopic_indirect_call_target (fnaddr);
29156 #endif
29158 else
29160 /* Static functions and indirect calls don't need the PIC register. Also,
29161 check if the PLT was explicitly avoided via -fno-plt or the "noplt" attribute,
29162 making it an indirect call. */
29163 rtx addr = XEXP (fnaddr, 0);
29164 if (flag_pic
29165 && GET_CODE (addr) == SYMBOL_REF
29166 && !SYMBOL_REF_LOCAL_P (addr))
29168 if (flag_plt
29169 && (SYMBOL_REF_DECL (addr) == NULL_TREE
29170 || !lookup_attribute ("noplt",
29171 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
29173 if (!TARGET_64BIT
29174 || (ix86_cmodel == CM_LARGE_PIC
29175 && DEFAULT_ABI != MS_ABI))
29177 use_reg (&use, gen_rtx_REG (Pmode,
29178 REAL_PIC_OFFSET_TABLE_REGNUM));
29179 if (ix86_use_pseudo_pic_reg ())
29180 emit_move_insn (gen_rtx_REG (Pmode,
29181 REAL_PIC_OFFSET_TABLE_REGNUM),
29182 pic_offset_table_rtx);
29185 else if (!TARGET_PECOFF && !TARGET_MACHO)
29187 if (TARGET_64BIT)
29189 fnaddr = gen_rtx_UNSPEC (Pmode,
29190 gen_rtvec (1, addr),
29191 UNSPEC_GOTPCREL);
29192 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29194 else
29196 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
29197 UNSPEC_GOT);
29198 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29199 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
29200 fnaddr);
29202 fnaddr = gen_const_mem (Pmode, fnaddr);
29203 /* Pmode may not be the same as word_mode for x32, which
29204 doesn't support indirect branch via 32-bit memory slot.
29205 Since x32 GOT slot is 64 bit with zero upper 32 bits,
29206 indirect branch via x32 GOT slot is OK. */
29207 if (GET_MODE (fnaddr) != word_mode)
29208 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
29209 fnaddr = gen_rtx_MEM (QImode, fnaddr);
29214 /* Skip setting up RAX register for -mskip-rax-setup when there are no
29215 parameters passed in vector registers. */
29216 if (TARGET_64BIT
29217 && (INTVAL (callarg2) > 0
29218 || (INTVAL (callarg2) == 0
29219 && (TARGET_SSE || !flag_skip_rax_setup))))
29221 rtx al = gen_rtx_REG (QImode, AX_REG);
29222 emit_move_insn (al, callarg2);
29223 use_reg (&use, al);
29226 if (ix86_cmodel == CM_LARGE_PIC
29227 && !TARGET_PECOFF
29228 && MEM_P (fnaddr)
29229 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
29230 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
29231 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
29232 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
29233 branch via x32 GOT slot is OK. */
29234 else if (!(TARGET_X32
29235 && MEM_P (fnaddr)
29236 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
29237 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
29238 && (sibcall
29239 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
29240 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
29242 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
29243 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
29246 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
29248 if (retval)
29250 /* We should add bound registers as destinations in case
29251 a pointer with bounds may be returned. */
29252 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
29254 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
29255 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
29256 if (GET_CODE (retval) == PARALLEL)
29258 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
29259 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
29260 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
29261 retval = chkp_join_splitted_slot (retval, par);
29263 else
29265 retval = gen_rtx_PARALLEL (VOIDmode,
29266 gen_rtvec (3, retval, b0, b1));
29267 chkp_put_regs_to_expr_list (retval);
29271 call = gen_rtx_SET (retval, call);
29273 vec[vec_len++] = call;
29275 if (pop)
29277 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
29278 pop = gen_rtx_SET (stack_pointer_rtx, pop);
29279 vec[vec_len++] = pop;
29282 if (cfun->machine->no_caller_saved_registers
29283 && (!fndecl
29284 || (!TREE_THIS_VOLATILE (fndecl)
29285 && !lookup_attribute ("no_caller_saved_registers",
29286 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
29288 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
29289 bool is_64bit_ms_abi = (TARGET_64BIT
29290 && ix86_function_abi (fndecl) == MS_ABI);
29291 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
29293 /* If there are no caller-saved registers, add all registers
29294 that are clobbered by the call which returns. */
29295 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
29296 if (!fixed_regs[i]
29297 && (ix86_call_used_regs[i] == 1
29298 || (ix86_call_used_regs[i] & c_mask))
29299 && !STACK_REGNO_P (i)
29300 && !MMX_REGNO_P (i))
29301 clobber_reg (&use,
29302 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
29304 else if (TARGET_64BIT_MS_ABI
29305 && (!callarg2 || INTVAL (callarg2) != -2))
29307 unsigned i;
29309 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
29311 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
29312 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
29314 clobber_reg (&use, gen_rtx_REG (mode, regno));
29317 /* Set here, but it may get cleared later. */
29318 if (TARGET_CALL_MS2SYSV_XLOGUES)
29320 if (!TARGET_SSE)
29323 /* Don't break hot-patched functions. */
29324 else if (ix86_function_ms_hook_prologue (current_function_decl))
29327 /* TODO: Cases not yet examined. */
29328 else if (flag_split_stack)
29329 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
29331 else
29333 gcc_assert (!reload_completed);
29334 cfun->machine->call_ms2sysv = true;
29339 if (vec_len > 1)
29340 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
29341 call = emit_call_insn (call);
29342 if (use)
29343 CALL_INSN_FUNCTION_USAGE (call) = use;
29345 return call;
29348 /* Return true if the function being called was marked with attribute
29349 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
29350 to handle the non-PIC case in the backend because there is no easy
29351 interface for the front-end to force non-PLT calls to use the GOT.
29352 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
29353 to call the function marked "noplt" indirectly. */
29355 static bool
29356 ix86_nopic_noplt_attribute_p (rtx call_op)
29358 if (flag_pic || ix86_cmodel == CM_LARGE
29359 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
29360 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
29361 || SYMBOL_REF_LOCAL_P (call_op))
29362 return false;
29364 tree symbol_decl = SYMBOL_REF_DECL (call_op);
29366 if (!flag_plt
29367 || (symbol_decl != NULL_TREE
29368 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
29369 return true;
29371 return false;
29374 /* Output the assembly for a call instruction. */
29376 const char *
29377 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
29379 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
29380 bool seh_nop_p = false;
29381 const char *xasm;
29383 if (SIBLING_CALL_P (insn))
29385 if (direct_p)
29387 if (ix86_nopic_noplt_attribute_p (call_op))
29389 if (TARGET_64BIT)
29390 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29391 else
29392 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29394 else
29395 xasm = "%!jmp\t%P0";
29397 /* SEH epilogue detection requires the indirect branch case
29398 to include REX.W. */
29399 else if (TARGET_SEH)
29400 xasm = "%!rex.W jmp\t%A0";
29401 else
29402 xasm = "%!jmp\t%A0";
29404 output_asm_insn (xasm, &call_op);
29405 return "";
29408 /* SEH unwinding can require an extra nop to be emitted in several
29409 circumstances. Determine if we have one of those. */
29410 if (TARGET_SEH)
29412 rtx_insn *i;
29414 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29416 /* If we get to another real insn, we don't need the nop. */
29417 if (INSN_P (i))
29418 break;
29420 /* If we get to the epilogue note, prevent a catch region from
29421 being adjacent to the standard epilogue sequence. If non-
29422 call-exceptions, we'll have done this during epilogue emission. */
29423 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29424 && !flag_non_call_exceptions
29425 && !can_throw_internal (insn))
29427 seh_nop_p = true;
29428 break;
29432 /* If we didn't find a real insn following the call, prevent the
29433 unwinder from looking into the next function. */
29434 if (i == NULL)
29435 seh_nop_p = true;
29438 if (direct_p)
29440 if (ix86_nopic_noplt_attribute_p (call_op))
29442 if (TARGET_64BIT)
29443 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29444 else
29445 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29447 else
29448 xasm = "%!call\t%P0";
29450 else
29451 xasm = "%!call\t%A0";
29453 output_asm_insn (xasm, &call_op);
29455 if (seh_nop_p)
29456 return "nop";
29458 return "";
29461 /* Clear stack slot assignments remembered from previous functions.
29462 This is called from INIT_EXPANDERS once before RTL is emitted for each
29463 function. */
29465 static struct machine_function *
29466 ix86_init_machine_status (void)
29468 struct machine_function *f;
29470 f = ggc_cleared_alloc<machine_function> ();
29471 f->call_abi = ix86_abi;
29473 return f;
29476 /* Return a MEM corresponding to a stack slot with mode MODE.
29477 Allocate a new slot if necessary.
29479 The RTL for a function can have several slots available: N is
29480 which slot to use. */
29483 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29485 struct stack_local_entry *s;
29487 gcc_assert (n < MAX_386_STACK_LOCALS);
29489 for (s = ix86_stack_locals; s; s = s->next)
29490 if (s->mode == mode && s->n == n)
29491 return validize_mem (copy_rtx (s->rtl));
29493 s = ggc_alloc<stack_local_entry> ();
29494 s->n = n;
29495 s->mode = mode;
29496 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29498 s->next = ix86_stack_locals;
29499 ix86_stack_locals = s;
29500 return validize_mem (copy_rtx (s->rtl));
29503 static void
29504 ix86_instantiate_decls (void)
29506 struct stack_local_entry *s;
29508 for (s = ix86_stack_locals; s; s = s->next)
29509 if (s->rtl != NULL_RTX)
29510 instantiate_decl_rtl (s->rtl);
29513 /* Return the number used for encoding REG, in the range 0..7. */
29515 static int
29516 reg_encoded_number (rtx reg)
29518 unsigned regno = REGNO (reg);
29519 switch (regno)
29521 case AX_REG:
29522 return 0;
29523 case CX_REG:
29524 return 1;
29525 case DX_REG:
29526 return 2;
29527 case BX_REG:
29528 return 3;
29529 case SP_REG:
29530 return 4;
29531 case BP_REG:
29532 return 5;
29533 case SI_REG:
29534 return 6;
29535 case DI_REG:
29536 return 7;
29537 default:
29538 break;
29540 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29541 return regno - FIRST_STACK_REG;
29542 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29543 return regno - FIRST_SSE_REG;
29544 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29545 return regno - FIRST_MMX_REG;
29546 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29547 return regno - FIRST_REX_SSE_REG;
29548 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29549 return regno - FIRST_REX_INT_REG;
29550 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29551 return regno - FIRST_MASK_REG;
29552 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29553 return regno - FIRST_BND_REG;
29554 return -1;
29557 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
29558 in its encoding if it could be relevant for ROP mitigation, otherwise
29559 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29560 used for calculating it into them. */
29562 static int
29563 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29564 int *popno0 = 0, int *popno1 = 0)
29566 if (asm_noperands (PATTERN (insn)) >= 0)
29567 return -1;
29568 int has_modrm = get_attr_modrm (insn);
29569 if (!has_modrm)
29570 return -1;
29571 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29572 rtx op0, op1;
29573 switch (cls)
29575 case MODRM_CLASS_OP02:
29576 gcc_assert (noperands >= 3);
29577 if (popno0)
29579 *popno0 = 0;
29580 *popno1 = 2;
29582 op0 = operands[0];
29583 op1 = operands[2];
29584 break;
29585 case MODRM_CLASS_OP01:
29586 gcc_assert (noperands >= 2);
29587 if (popno0)
29589 *popno0 = 0;
29590 *popno1 = 1;
29592 op0 = operands[0];
29593 op1 = operands[1];
29594 break;
29595 default:
29596 return -1;
29598 if (REG_P (op0) && REG_P (op1))
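/* Register-to-register form: mod = 11, OP1 goes in the reg field and OP0
   in r/m; e.g. OP0 == %eax (0) and OP1 == %ecx (1) give modrm 0xc8.  */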
29600 int enc0 = reg_encoded_number (op0);
29601 int enc1 = reg_encoded_number (op1);
29602 return 0xc0 + (enc1 << 3) + enc0;
29604 return -1;
29607 /* Check whether x86 address PARTS is a pc-relative address. */
29609 static bool
29610 rip_relative_addr_p (struct ix86_address *parts)
29612 rtx base, index, disp;
29614 base = parts->base;
29615 index = parts->index;
29616 disp = parts->disp;
29618 if (disp && !base && !index)
29620 if (TARGET_64BIT)
29622 rtx symbol = disp;
29624 if (GET_CODE (disp) == CONST)
29625 symbol = XEXP (disp, 0);
29626 if (GET_CODE (symbol) == PLUS
29627 && CONST_INT_P (XEXP (symbol, 1)))
29628 symbol = XEXP (symbol, 0);
29630 if (GET_CODE (symbol) == LABEL_REF
29631 || (GET_CODE (symbol) == SYMBOL_REF
29632 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29633 || (GET_CODE (symbol) == UNSPEC
29634 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29635 || XINT (symbol, 1) == UNSPEC_PCREL
29636 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29637 return true;
29640 return false;
29643 /* Calculate the length of the memory address in the instruction encoding.
29644 Includes addr32 prefix, does not include the one-byte modrm, opcode,
29645 or other prefixes. We never generate addr32 prefix for LEA insn. */
29648 memory_address_length (rtx addr, bool lea)
29650 struct ix86_address parts;
29651 rtx base, index, disp;
29652 int len;
29653 int ok;
29655 if (GET_CODE (addr) == PRE_DEC
29656 || GET_CODE (addr) == POST_INC
29657 || GET_CODE (addr) == PRE_MODIFY
29658 || GET_CODE (addr) == POST_MODIFY)
29659 return 0;
29661 ok = ix86_decompose_address (addr, &parts);
29662 gcc_assert (ok);
29664 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29666 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
29667 if (TARGET_64BIT && !lea
29668 && (SImode_address_operand (addr, VOIDmode)
29669 || (parts.base && GET_MODE (parts.base) == SImode)
29670 || (parts.index && GET_MODE (parts.index) == SImode)))
29671 len++;
29673 base = parts.base;
29674 index = parts.index;
29675 disp = parts.disp;
29677 if (base && SUBREG_P (base))
29678 base = SUBREG_REG (base);
29679 if (index && SUBREG_P (index))
29680 index = SUBREG_REG (index);
29682 gcc_assert (base == NULL_RTX || REG_P (base));
29683 gcc_assert (index == NULL_RTX || REG_P (index));
29685 /* Rule of thumb:
29686 - esp as the base always wants an index,
29687 - ebp as the base always wants a displacement,
29688 - r12 as the base always wants an index,
29689 - r13 as the base always wants a displacement. */
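/* For instance, a bare (%ebp) is really encoded as 0(%ebp) with a one-byte
   displacement, and (%esp) needs a SIB byte even without an index.  */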
29691 /* Register Indirect. */
29692 if (base && !index && !disp)
29694 /* esp (for its index) and ebp (for its displacement) need
29695 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29696 code. */
29697 if (base == arg_pointer_rtx
29698 || base == frame_pointer_rtx
29699 || REGNO (base) == SP_REG
29700 || REGNO (base) == BP_REG
29701 || REGNO (base) == R12_REG
29702 || REGNO (base) == R13_REG)
29703 len++;
29706 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29707 is not disp32, but disp32(%rip), so for disp32
29708 SIB byte is needed, unless print_operand_address
29709 optimizes it into disp32(%rip) or (%rip) is implied
29710 by UNSPEC. */
29711 else if (disp && !base && !index)
29713 len += 4;
29714 if (!rip_relative_addr_p (&parts))
29715 len++;
29717 else
29719 /* Find the length of the displacement constant. */
29720 if (disp)
29722 if (base && satisfies_constraint_K (disp))
29723 len += 1;
29724 else
29725 len += 4;
29727 /* ebp always wants a displacement. Similarly r13. */
29728 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29729 len++;
29731 /* An index requires the two-byte modrm form.... */
29732 if (index
29733 /* ...like esp (or r12), which always wants an index. */
29734 || base == arg_pointer_rtx
29735 || base == frame_pointer_rtx
29736 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29737 len++;
29740 return len;
29743 /* Compute default value for "length_immediate" attribute. When SHORTFORM
29744 is set, expect that the insn has an 8-bit immediate alternative. */
29746 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29748 int len = 0;
29749 int i;
29750 extract_insn_cached (insn);
29751 for (i = recog_data.n_operands - 1; i >= 0; --i)
29752 if (CONSTANT_P (recog_data.operand[i]))
29754 enum attr_mode mode = get_attr_mode (insn);
29756 gcc_assert (!len);
29757 if (shortform && CONST_INT_P (recog_data.operand[i]))
29759 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29760 switch (mode)
29762 case MODE_QI:
29763 len = 1;
29764 continue;
29765 case MODE_HI:
29766 ival = trunc_int_for_mode (ival, HImode);
29767 break;
29768 case MODE_SI:
29769 ival = trunc_int_for_mode (ival, SImode);
29770 break;
29771 default:
29772 break;
29774 if (IN_RANGE (ival, -128, 127))
29776 len = 1;
29777 continue;
29780 switch (mode)
29782 case MODE_QI:
29783 len = 1;
29784 break;
29785 case MODE_HI:
29786 len = 2;
29787 break;
29788 case MODE_SI:
29789 len = 4;
29790 break;
29791 /* Immediates for DImode instructions are encoded
29792 as 32-bit sign-extended values. */
29793 case MODE_DI:
29794 len = 4;
29795 break;
29796 default:
29797 fatal_insn ("unknown insn mode", insn);
29800 return len;
29803 /* Compute default value for "length_address" attribute. */
29805 ix86_attr_length_address_default (rtx_insn *insn)
29807 int i;
29809 if (get_attr_type (insn) == TYPE_LEA)
29811 rtx set = PATTERN (insn), addr;
29813 if (GET_CODE (set) == PARALLEL)
29814 set = XVECEXP (set, 0, 0);
29816 gcc_assert (GET_CODE (set) == SET);
29818 addr = SET_SRC (set);
29820 return memory_address_length (addr, true);
29823 extract_insn_cached (insn);
29824 for (i = recog_data.n_operands - 1; i >= 0; --i)
29826 rtx op = recog_data.operand[i];
29827 if (MEM_P (op))
29829 constrain_operands_cached (insn, reload_completed);
29830 if (which_alternative != -1)
29832 const char *constraints = recog_data.constraints[i];
29833 int alt = which_alternative;
29835 while (*constraints == '=' || *constraints == '+')
29836 constraints++;
29837 while (alt-- > 0)
29838 while (*constraints++ != ',')
29840 /* Skip ignored operands. */
29841 if (*constraints == 'X')
29842 continue;
29845 int len = memory_address_length (XEXP (op, 0), false);
29847 /* Account for segment prefix for non-default addr spaces. */
29848 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29849 len++;
29851 return len;
29854 return 0;
29857 /* Compute default value for "length_vex" attribute. It includes
29858 2 or 3 byte VEX prefix and 1 opcode byte. */
29861 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29862 bool has_vex_w)
29864 int i;
29866 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
29867 requires the 3-byte VEX prefix. */
29868 if (!has_0f_opcode || has_vex_w)
29869 return 3 + 1;
29871 /* We can always use 2 byte VEX prefix in 32bit. */
29872 if (!TARGET_64BIT)
29873 return 2 + 1;
29875 extract_insn_cached (insn);
29877 for (i = recog_data.n_operands - 1; i >= 0; --i)
29878 if (REG_P (recog_data.operand[i]))
29880 /* REX.W bit uses 3 byte VEX prefix. */
29881 if (GET_MODE (recog_data.operand[i]) == DImode
29882 && GENERAL_REG_P (recog_data.operand[i]))
29883 return 3 + 1;
29885 else
29887 /* REX.X or REX.B bits use 3 byte VEX prefix. */
29888 if (MEM_P (recog_data.operand[i])
29889 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
29890 return 3 + 1;
29893 return 2 + 1;
29896 /* Return the maximum number of instructions a CPU can issue per cycle. */
29898 static int
29899 ix86_issue_rate (void)
29901 switch (ix86_tune)
29903 case PROCESSOR_PENTIUM:
29904 case PROCESSOR_LAKEMONT:
29905 case PROCESSOR_BONNELL:
29906 case PROCESSOR_SILVERMONT:
29907 case PROCESSOR_KNL:
29908 case PROCESSOR_INTEL:
29909 case PROCESSOR_K6:
29910 case PROCESSOR_BTVER2:
29911 case PROCESSOR_PENTIUM4:
29912 case PROCESSOR_NOCONA:
29913 return 2;
29915 case PROCESSOR_PENTIUMPRO:
29916 case PROCESSOR_ATHLON:
29917 case PROCESSOR_K8:
29918 case PROCESSOR_AMDFAM10:
29919 case PROCESSOR_GENERIC:
29920 case PROCESSOR_BTVER1:
29921 return 3;
29923 case PROCESSOR_BDVER1:
29924 case PROCESSOR_BDVER2:
29925 case PROCESSOR_BDVER3:
29926 case PROCESSOR_BDVER4:
29927 case PROCESSOR_ZNVER1:
29928 case PROCESSOR_CORE2:
29929 case PROCESSOR_NEHALEM:
29930 case PROCESSOR_SANDYBRIDGE:
29931 case PROCESSOR_HASWELL:
29932 return 4;
29934 default:
29935 return 1;
29939 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
29940 by DEP_INSN and nothing else set by DEP_INSN. */
29942 static bool
29943 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
29945 rtx set, set2;
29947 /* Simplify the test for uninteresting insns. */
29948 if (insn_type != TYPE_SETCC
29949 && insn_type != TYPE_ICMOV
29950 && insn_type != TYPE_FCMOV
29951 && insn_type != TYPE_IBR)
29952 return false;
29954 if ((set = single_set (dep_insn)) != 0)
29956 set = SET_DEST (set);
29957 set2 = NULL_RTX;
29959 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
29960 && XVECLEN (PATTERN (dep_insn), 0) == 2
29961 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
29962 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
29964 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
29965 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
29967 else
29968 return false;
29970 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
29971 return false;
29973 /* This test is true if the dependent insn reads the flags but
29974 not any other potentially set register. */
29975 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
29976 return false;
29978 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
29979 return false;
29981 return true;
29984 /* Return true iff USE_INSN has a memory address with operands set by
29985 SET_INSN. */
29987 bool
29988 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
29990 int i;
29991 extract_insn_cached (use_insn);
29992 for (i = recog_data.n_operands - 1; i >= 0; --i)
29993 if (MEM_P (recog_data.operand[i]))
29995 rtx addr = XEXP (recog_data.operand[i], 0);
29996 if (modified_in_p (addr, set_insn) != 0)
29998 /* No AGI stall if SET_INSN is a push or pop and USE_INSN
29999 has SP based memory (unless index reg is modified in a pop). */
30000 rtx set = single_set (set_insn);
30001 if (set
30002 && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
30003 || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
30005 struct ix86_address parts;
30006 if (ix86_decompose_address (addr, &parts)
30007 && parts.base == stack_pointer_rtx
30008 && (parts.index == NULL_RTX
30009 || MEM_P (SET_DEST (set))
30010 || !modified_in_p (parts.index, set_insn)))
30011 return false;
30013 return true;
30015 return false;
30017 return false;
30020 /* Helper function for exact_store_load_dependency.
30021 Return true if addr is found in insn. */
30022 static bool
30023 exact_dependency_1 (rtx addr, rtx insn)
30025 enum rtx_code code;
30026 const char *format_ptr;
30027 int i, j;
30029 code = GET_CODE (insn);
30030 switch (code)
30032 case MEM:
30033 if (rtx_equal_p (addr, insn))
30034 return true;
30035 break;
30036 case REG:
30037 CASE_CONST_ANY:
30038 case SYMBOL_REF:
30039 case CODE_LABEL:
30040 case PC:
30041 case CC0:
30042 case EXPR_LIST:
30043 return false;
30044 default:
30045 break;
30048 format_ptr = GET_RTX_FORMAT (code);
30049 for (i = 0; i < GET_RTX_LENGTH (code); i++)
30051 switch (*format_ptr++)
30053 case 'e':
30054 if (exact_dependency_1 (addr, XEXP (insn, i)))
30055 return true;
30056 break;
30057 case 'E':
30058 for (j = 0; j < XVECLEN (insn, i); j++)
30059 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
30060 return true;
30061 break;
30064 return false;
30067 /* Return true if there exists an exact dependency between a store and a load,
30068 i.e. the same memory address is used in them. */
30069 static bool
30070 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
30072 rtx set1, set2;
30074 set1 = single_set (store);
30075 if (!set1)
30076 return false;
30077 if (!MEM_P (SET_DEST (set1)))
30078 return false;
30079 set2 = single_set (load);
30080 if (!set2)
30081 return false;
30082 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
30083 return true;
30084 return false;
30087 static int
30088 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
30089 unsigned int)
30091 enum attr_type insn_type, dep_insn_type;
30092 enum attr_memory memory;
30093 rtx set, set2;
30094 int dep_insn_code_number;
30096 /* Anti and output dependencies have zero cost on all CPUs. */
30097 if (dep_type != 0)
30098 return 0;
30100 dep_insn_code_number = recog_memoized (dep_insn);
30102 /* If we can't recognize the insns, we can't really do anything. */
30103 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
30104 return cost;
30106 insn_type = get_attr_type (insn);
30107 dep_insn_type = get_attr_type (dep_insn);
30109 switch (ix86_tune)
30111 case PROCESSOR_PENTIUM:
30112 case PROCESSOR_LAKEMONT:
30113 /* Address Generation Interlock adds a cycle of latency. */
30114 if (insn_type == TYPE_LEA)
30116 rtx addr = PATTERN (insn);
30118 if (GET_CODE (addr) == PARALLEL)
30119 addr = XVECEXP (addr, 0, 0);
30121 gcc_assert (GET_CODE (addr) == SET);
30123 addr = SET_SRC (addr);
30124 if (modified_in_p (addr, dep_insn))
30125 cost += 1;
30127 else if (ix86_agi_dependent (dep_insn, insn))
30128 cost += 1;
30130 /* ??? Compares pair with jump/setcc. */
30131 if (ix86_flags_dependent (insn, dep_insn, insn_type))
30132 cost = 0;
30134 /* Floating point stores require value to be ready one cycle earlier. */
30135 if (insn_type == TYPE_FMOV
30136 && get_attr_memory (insn) == MEMORY_STORE
30137 && !ix86_agi_dependent (dep_insn, insn))
30138 cost += 1;
30139 break;
30141 case PROCESSOR_PENTIUMPRO:
30142 /* INT->FP conversion is expensive. */
30143 if (get_attr_fp_int_src (dep_insn))
30144 cost += 5;
30146 /* There is one cycle extra latency between an FP op and a store. */
30147 if (insn_type == TYPE_FMOV
30148 && (set = single_set (dep_insn)) != NULL_RTX
30149 && (set2 = single_set (insn)) != NULL_RTX
30150 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
30151 && MEM_P (SET_DEST (set2)))
30152 cost += 1;
30154 memory = get_attr_memory (insn);
30156 /* Show the ability of the reorder buffer to hide the latency of a load by
30157 executing it in parallel with the previous instruction when the
30158 previous instruction is not needed to compute the address. */
30159 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30160 && !ix86_agi_dependent (dep_insn, insn))
30162 /* Claim moves take one cycle, as the core can issue one load
30163 at a time and the next load can start a cycle later. */
30164 if (dep_insn_type == TYPE_IMOV
30165 || dep_insn_type == TYPE_FMOV)
30166 cost = 1;
30167 else if (cost > 1)
30168 cost--;
30170 break;
30172 case PROCESSOR_K6:
30173 /* The esp dependency is resolved before
30174 the instruction is really finished. */
30175 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30176 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30177 return 1;
30179 /* INT->FP conversion is expensive. */
30180 if (get_attr_fp_int_src (dep_insn))
30181 cost += 5;
30183 memory = get_attr_memory (insn);
30185 /* Show the ability of the reorder buffer to hide the latency of a load by
30186 executing it in parallel with the previous instruction when the
30187 previous instruction is not needed to compute the address. */
30188 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30189 && !ix86_agi_dependent (dep_insn, insn))
30191 /* Claim moves take one cycle, as the core can issue one load
30192 at a time and the next load can start a cycle later. */
30193 if (dep_insn_type == TYPE_IMOV
30194 || dep_insn_type == TYPE_FMOV)
30195 cost = 1;
30196 else if (cost > 2)
30197 cost -= 2;
30198 else
30199 cost = 1;
30201 break;
30203 case PROCESSOR_AMDFAM10:
30204 case PROCESSOR_BDVER1:
30205 case PROCESSOR_BDVER2:
30206 case PROCESSOR_BDVER3:
30207 case PROCESSOR_BDVER4:
30208 case PROCESSOR_ZNVER1:
30209 case PROCESSOR_BTVER1:
30210 case PROCESSOR_BTVER2:
30211 case PROCESSOR_GENERIC:
30212 /* The stack engine allows push&pop instructions to execute in parallel. */
30213 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30214 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30215 return 0;
30216 /* FALLTHRU */
30218 case PROCESSOR_ATHLON:
30219 case PROCESSOR_K8:
30220 memory = get_attr_memory (insn);
30222 /* Show the ability of the reorder buffer to hide the latency of a load by
30223 executing it in parallel with the previous instruction when the
30224 previous instruction is not needed to compute the address. */
30225 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30226 && !ix86_agi_dependent (dep_insn, insn))
30228 enum attr_unit unit = get_attr_unit (insn);
30229 int loadcost = 3;
30231 /* Because of the difference between the length of integer and
30232 floating unit pipeline preparation stages, the memory operands
30233 for floating point are cheaper.
30235 ??? For Athlon the difference is most probably 2. */
30236 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
30237 loadcost = 3;
30238 else
30239 loadcost = TARGET_ATHLON ? 2 : 0;
30241 if (cost >= loadcost)
30242 cost -= loadcost;
30243 else
30244 cost = 0;
30246 break;
30248 case PROCESSOR_CORE2:
30249 case PROCESSOR_NEHALEM:
30250 case PROCESSOR_SANDYBRIDGE:
30251 case PROCESSOR_HASWELL:
30252 /* The stack engine allows push&pop instructions to execute in parallel. */
30253 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30254 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30255 return 0;
30257 memory = get_attr_memory (insn);
30259 /* Show the ability of the reorder buffer to hide the latency of a load by
30260 executing it in parallel with the previous instruction when the
30261 previous instruction is not needed to compute the address. */
30262 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30263 && !ix86_agi_dependent (dep_insn, insn))
30265 if (cost >= 4)
30266 cost -= 4;
30267 else
30268 cost = 0;
30270 break;
30272 case PROCESSOR_SILVERMONT:
30273 case PROCESSOR_KNL:
30274 case PROCESSOR_INTEL:
30275 if (!reload_completed)
30276 return cost;
30278 /* Increase cost of integer loads. */
30279 memory = get_attr_memory (dep_insn);
30280 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30282 enum attr_unit unit = get_attr_unit (dep_insn);
30283 if (unit == UNIT_INTEGER && cost == 1)
30285 if (memory == MEMORY_LOAD)
30286 cost = 3;
30287 else
30289 /* Increase cost of ld/st for short int types only
30290 because of store forwarding issue. */
30291 rtx set = single_set (dep_insn);
30292 if (set && (GET_MODE (SET_DEST (set)) == QImode
30293 || GET_MODE (SET_DEST (set)) == HImode))
30295 /* Increase cost of store/load insn if exact
30296 dependence exists and it is load insn. */
30297 enum attr_memory insn_memory = get_attr_memory (insn);
30298 if (insn_memory == MEMORY_LOAD
30299 && exact_store_load_dependency (dep_insn, insn))
30300 cost = 3;
30306 default:
30307 break;
30310 return cost;
30313 /* How many alternative schedules to try. This should be as wide as the
30314 scheduling freedom in the DFA, but no wider. Making this value too
30315 large results in extra work for the scheduler. */
30317 static int
30318 ia32_multipass_dfa_lookahead (void)
30320 switch (ix86_tune)
30322 case PROCESSOR_PENTIUM:
30323 case PROCESSOR_LAKEMONT:
30324 return 2;
30326 case PROCESSOR_PENTIUMPRO:
30327 case PROCESSOR_K6:
30328 return 1;
30330 case PROCESSOR_BDVER1:
30331 case PROCESSOR_BDVER2:
30332 case PROCESSOR_BDVER3:
30333 case PROCESSOR_BDVER4:
30334 /* We use lookahead value 4 for BD both before and after reload
30335 schedules. Plan is to have value 8 included for O3. */
30336 return 4;
30338 case PROCESSOR_CORE2:
30339 case PROCESSOR_NEHALEM:
30340 case PROCESSOR_SANDYBRIDGE:
30341 case PROCESSOR_HASWELL:
30342 case PROCESSOR_BONNELL:
30343 case PROCESSOR_SILVERMONT:
30344 case PROCESSOR_KNL:
30345 case PROCESSOR_INTEL:
30346 /* Generally, we want haifa-sched:max_issue() to look ahead as far
30347 as the number of instructions that can be executed in a cycle, i.e.,
30348 issue_rate. I wonder why tuning for many CPUs does not do this. */
30349 if (reload_completed)
30350 return ix86_issue_rate ();
30351 /* Don't use lookahead for pre-reload schedule to save compile time. */
30352 return 0;
30354 default:
30355 return 0;
30359 /* Return true if target platform supports macro-fusion. */
30361 static bool
30362 ix86_macro_fusion_p ()
30364 return TARGET_FUSE_CMP_AND_BRANCH;
30367 /* Check whether the current microarchitecture supports macro fusion
30368 for insn pair "CONDGEN + CONDJMP". Refer to
30369 "Intel Architectures Optimization Reference Manual". */
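/* Illustrative examples of the rules checked below (a sketch, not an
   exhaustive list from the manual):
     cmp %rsi, %rdi ; jne .L1     - reg-reg compare + jcc, may fuse
     test %eax, %eax ; je .L2     - test + jcc, may fuse
     cmpl $1, (%rdi) ; je .L3     - MEM-IMM compare, never fused
     cmpl $1, x(%rip) ; je .L4    - RIP-relative operand, never fused
     incl %eax ; jb .L5           - inc/dec + unsigned jcc, never fused
   Which of the fusible pairs actually fuse is CPU-specific and is further
   gated by the TARGET_FUSE_* tuning flags tested in the function body.  */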
30371 static bool
30372 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
30374 rtx src, dest;
30375 enum rtx_code ccode;
30376 rtx compare_set = NULL_RTX, test_if, cond;
30377 rtx alu_set = NULL_RTX, addr = NULL_RTX;
30379 if (!any_condjump_p (condjmp))
30380 return false;
30382 unsigned int condreg1, condreg2;
30383 rtx cc_reg_1;
30384 ix86_fixed_condition_code_regs (&condreg1, &condreg2);
30385 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
30386 if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
30387 || !condgen
30388 || !modified_in_p (cc_reg_1, condgen))
30389 return false;
30391 if (get_attr_type (condgen) != TYPE_TEST
30392 && get_attr_type (condgen) != TYPE_ICMP
30393 && get_attr_type (condgen) != TYPE_INCDEC
30394 && get_attr_type (condgen) != TYPE_ALU)
30395 return false;
30397 compare_set = single_set (condgen);
30398 if (compare_set == NULL_RTX
30399 && !TARGET_FUSE_ALU_AND_BRANCH)
30400 return false;
30402 if (compare_set == NULL_RTX)
30404 int i;
30405 rtx pat = PATTERN (condgen);
30406 for (i = 0; i < XVECLEN (pat, 0); i++)
30407 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
30409 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
30410 if (GET_CODE (set_src) == COMPARE)
30411 compare_set = XVECEXP (pat, 0, i);
30412 else
30413 alu_set = XVECEXP (pat, 0, i);
30416 if (compare_set == NULL_RTX)
30417 return false;
30418 src = SET_SRC (compare_set);
30419 if (GET_CODE (src) != COMPARE)
30420 return false;
30422 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
30423 supported. */
30424 if ((MEM_P (XEXP (src, 0))
30425 && CONST_INT_P (XEXP (src, 1)))
30426 || (MEM_P (XEXP (src, 1))
30427 && CONST_INT_P (XEXP (src, 0))))
30428 return false;
30430 /* No fusion for RIP-relative address. */
30431 if (MEM_P (XEXP (src, 0)))
30432 addr = XEXP (XEXP (src, 0), 0);
30433 else if (MEM_P (XEXP (src, 1)))
30434 addr = XEXP (XEXP (src, 1), 0);
30436 if (addr) {
30437 ix86_address parts;
30438 int ok = ix86_decompose_address (addr, &parts);
30439 gcc_assert (ok);
30441 if (rip_relative_addr_p (&parts))
30442 return false;
30445 test_if = SET_SRC (pc_set (condjmp));
30446 cond = XEXP (test_if, 0);
30447 ccode = GET_CODE (cond);
30448 /* Check whether the conditional jump uses the Sign or Overflow flags. */
30449 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
30450 && (ccode == GE
30451 || ccode == GT
30452 || ccode == LE
30453 || ccode == LT))
30454 return false;
30456 /* Return true for TYPE_TEST and TYPE_ICMP. */
30457 if (get_attr_type (condgen) == TYPE_TEST
30458 || get_attr_type (condgen) == TYPE_ICMP)
30459 return true;
30461 /* The following handles the macro-fusion case for alu + jmp. */
30462 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
30463 return false;
30465 /* No fusion for alu op with memory destination operand. */
30466 dest = SET_DEST (alu_set);
30467 if (MEM_P (dest))
30468 return false;
30470 /* Macro-fusion for inc/dec + unsigned conditional jump is not
30471 supported. */
30472 if (get_attr_type (condgen) == TYPE_INCDEC
30473 && (ccode == GEU
30474 || ccode == GTU
30475 || ccode == LEU
30476 || ccode == LTU))
30477 return false;
30479 return true;
30482 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
30483 execution. It is applied if
30484 (1) an IMUL instruction is at the top of the list;
30485 (2) there exists exactly one producer of an independent IMUL
30486 instruction in the ready list.
30487 Return the index of the IMUL producer if it was found and -1 otherwise. */
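/* A hypothetical ready list { ..., B, ..., IMUL } (top on the right)
   illustrates the intent: if B is the single producer of another,
   independent IMUL, returning B's index lets ix86_sched_reorder move B to
   the top so the dependent IMUL becomes ready soon after and Atom's
   pipelined multiplier is kept busy.  */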
30488 static int
30489 do_reorder_for_imul (rtx_insn **ready, int n_ready)
30491 rtx_insn *insn;
30492 rtx set, insn1, insn2;
30493 sd_iterator_def sd_it;
30494 dep_t dep;
30495 int index = -1;
30496 int i;
30498 if (!TARGET_BONNELL)
30499 return index;
30501 /* Check that an IMUL instruction is at the top of the ready list. */
30502 insn = ready[n_ready - 1];
30503 set = single_set (insn);
30504 if (!set)
30505 return index;
30506 if (!(GET_CODE (SET_SRC (set)) == MULT
30507 && GET_MODE (SET_SRC (set)) == SImode))
30508 return index;
30510 /* Search for producer of independent IMUL instruction. */
30511 for (i = n_ready - 2; i >= 0; i--)
30513 insn = ready[i];
30514 if (!NONDEBUG_INSN_P (insn))
30515 continue;
30516 /* Skip IMUL instruction. */
30517 insn2 = PATTERN (insn);
30518 if (GET_CODE (insn2) == PARALLEL)
30519 insn2 = XVECEXP (insn2, 0, 0);
30520 if (GET_CODE (insn2) == SET
30521 && GET_CODE (SET_SRC (insn2)) == MULT
30522 && GET_MODE (SET_SRC (insn2)) == SImode)
30523 continue;
30525 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
30527 rtx con;
30528 con = DEP_CON (dep);
30529 if (!NONDEBUG_INSN_P (con))
30530 continue;
30531 insn1 = PATTERN (con);
30532 if (GET_CODE (insn1) == PARALLEL)
30533 insn1 = XVECEXP (insn1, 0, 0);
30535 if (GET_CODE (insn1) == SET
30536 && GET_CODE (SET_SRC (insn1)) == MULT
30537 && GET_MODE (SET_SRC (insn1)) == SImode)
30539 sd_iterator_def sd_it1;
30540 dep_t dep1;
30541 /* Check that there is no other producer for the IMUL. */
30542 index = i;
30543 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
30545 rtx pro;
30546 pro = DEP_PRO (dep1);
30547 if (!NONDEBUG_INSN_P (pro))
30548 continue;
30549 if (pro != insn)
30550 index = -1;
30552 if (index >= 0)
30553 break;
30556 if (index >= 0)
30557 break;
30559 return index;
30562 /* Try to find the best candidate at the top of the ready list if two insns
30563 have the same priority - the candidate is best if its producers were
30564 scheduled earlier. Applied for Silvermont only.
30565 Return true if the top 2 insns must be interchanged. */
30566 static bool
30567 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
30569 rtx_insn *top = ready[n_ready - 1];
30570 rtx_insn *next = ready[n_ready - 2];
30571 rtx set;
30572 sd_iterator_def sd_it;
30573 dep_t dep;
30574 int clock1 = -1;
30575 int clock2 = -1;
30576 #define INSN_TICK(INSN) (HID (INSN)->tick)
30578 if (!TARGET_SILVERMONT && !TARGET_INTEL)
30579 return false;
30581 if (!NONDEBUG_INSN_P (top))
30582 return false;
30583 if (!NONJUMP_INSN_P (top))
30584 return false;
30585 if (!NONDEBUG_INSN_P (next))
30586 return false;
30587 if (!NONJUMP_INSN_P (next))
30588 return false;
30589 set = single_set (top);
30590 if (!set)
30591 return false;
30592 set = single_set (next);
30593 if (!set)
30594 return false;
30596 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
30598 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
30599 return false;
30600 /* Determine the winner more precisely. */
30601 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
30603 rtx pro;
30604 pro = DEP_PRO (dep);
30605 if (!NONDEBUG_INSN_P (pro))
30606 continue;
30607 if (INSN_TICK (pro) > clock1)
30608 clock1 = INSN_TICK (pro);
30610 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
30612 rtx pro;
30613 pro = DEP_PRO (dep);
30614 if (!NONDEBUG_INSN_P (pro))
30615 continue;
30616 if (INSN_TICK (pro) > clock2)
30617 clock2 = INSN_TICK (pro);
30620 if (clock1 == clock2)
30622 /* Determine winner - load must win. */
30623 enum attr_memory memory1, memory2;
30624 memory1 = get_attr_memory (top);
30625 memory2 = get_attr_memory (next);
30626 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
30627 return true;
30629 return (bool) (clock2 < clock1);
30631 return false;
30632 #undef INSN_TICK
30635 /* Perform possible reordering of the ready list for Atom/Silvermont only.
30636 Return issue rate. */
30637 static int
30638 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
30639 int *pn_ready, int clock_var)
30641 int issue_rate = -1;
30642 int n_ready = *pn_ready;
30643 int i;
30644 rtx_insn *insn;
30645 int index = -1;
30647 /* Set up issue rate. */
30648 issue_rate = ix86_issue_rate ();
30650 /* Do reordering for BONNELL/SILVERMONT only. */
30651 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
30652 return issue_rate;
30654 /* Nothing to do if ready list contains only 1 instruction. */
30655 if (n_ready <= 1)
30656 return issue_rate;
30658 /* Do reordering for the post-reload scheduler only. */
30659 if (!reload_completed)
30660 return issue_rate;
30662 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
30664 if (sched_verbose > 1)
30665 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
30666 INSN_UID (ready[index]));
30668 /* Put IMUL producer (ready[index]) at the top of ready list. */
30669 insn = ready[index];
30670 for (i = index; i < n_ready - 1; i++)
30671 ready[i] = ready[i + 1];
30672 ready[n_ready - 1] = insn;
30673 return issue_rate;
30676 /* Skip selective scheduling since HID is not populated in it. */
30677 if (clock_var != 0
30678 && !sel_sched_p ()
30679 && swap_top_of_ready_list (ready, n_ready))
30681 if (sched_verbose > 1)
30682 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
30683 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
30684 /* Swap 2 top elements of ready list. */
30685 insn = ready[n_ready - 1];
30686 ready[n_ready - 1] = ready[n_ready - 2];
30687 ready[n_ready - 2] = insn;
30689 return issue_rate;
30692 static bool
30693 ix86_class_likely_spilled_p (reg_class_t);
30695 /* Return true if the lhs of INSN is a HW function argument register, and
30696 set *IS_SPILLED to true if it is a likely-spilled HW register. */
30697 static bool
30698 insn_is_function_arg (rtx insn, bool* is_spilled)
30700 rtx dst;
30702 if (!NONDEBUG_INSN_P (insn))
30703 return false;
30704 /* Call instructions are not movable, ignore them. */
30705 if (CALL_P (insn))
30706 return false;
30707 insn = PATTERN (insn);
30708 if (GET_CODE (insn) == PARALLEL)
30709 insn = XVECEXP (insn, 0, 0);
30710 if (GET_CODE (insn) != SET)
30711 return false;
30712 dst = SET_DEST (insn);
30713 if (REG_P (dst) && HARD_REGISTER_P (dst)
30714 && ix86_function_arg_regno_p (REGNO (dst)))
30716 /* Is it a likely-spilled HW register? */
30717 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
30718 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
30719 *is_spilled = true;
30720 return true;
30722 return false;
30725 /* Add output dependencies for a chain of adjacent function arguments, but
30726 only if there is a move to a likely-spilled HW register. Return the first
30727 argument if at least one dependence was added, or NULL otherwise. */
30728 static rtx_insn *
30729 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
30731 rtx_insn *insn;
30732 rtx_insn *last = call;
30733 rtx_insn *first_arg = NULL;
30734 bool is_spilled = false;
30736 head = PREV_INSN (head);
30738 /* Find the argument-passing instruction nearest to the call. */
30739 while (true)
30741 last = PREV_INSN (last);
30742 if (last == head)
30743 return NULL;
30744 if (!NONDEBUG_INSN_P (last))
30745 continue;
30746 if (insn_is_function_arg (last, &is_spilled))
30747 break;
30748 return NULL;
30751 first_arg = last;
30752 while (true)
30754 insn = PREV_INSN (last);
30755 if (!INSN_P (insn))
30756 break;
30757 if (insn == head)
30758 break;
30759 if (!NONDEBUG_INSN_P (insn))
30761 last = insn;
30762 continue;
30764 if (insn_is_function_arg (insn, &is_spilled))
30766 /* Add an output dependence between two function arguments if the chain
30767 of output arguments contains likely-spilled HW registers. */
30768 if (is_spilled)
30769 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
30770 first_arg = last = insn;
30772 else
30773 break;
30775 if (!is_spilled)
30776 return NULL;
30777 return first_arg;
30780 /* Add output or anti dependency from insn to first_arg to restrict its code
30781 motion. */
30782 static void
30783 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
30785 rtx set;
30786 rtx tmp;
30788 /* Add anti dependencies for bounds stores. */
30789 if (INSN_P (insn)
30790 && GET_CODE (PATTERN (insn)) == PARALLEL
30791 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
30792 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
30794 add_dependence (first_arg, insn, REG_DEP_ANTI);
30795 return;
30798 set = single_set (insn);
30799 if (!set)
30800 return;
30801 tmp = SET_DEST (set);
30802 if (REG_P (tmp))
30804 /* Add output dependency to the first function argument. */
30805 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
30806 return;
30808 /* Add anti dependency. */
30809 add_dependence (first_arg, insn, REG_DEP_ANTI);
30812 /* Avoid cross-block motion of a function argument by adding a dependency
30813 from the first non-jump instruction in bb. */
30814 static void
30815 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
30817 rtx_insn *insn = BB_END (bb);
30819 while (insn)
30821 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
30823 rtx set = single_set (insn);
30824 if (set)
30826 avoid_func_arg_motion (arg, insn);
30827 return;
30830 if (insn == BB_HEAD (bb))
30831 return;
30832 insn = PREV_INSN (insn);
30836 /* Hook for pre-reload schedule - avoid motion of function arguments
30837 passed in likely spilled HW registers. */
30838 static void
30839 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
30841 rtx_insn *insn;
30842 rtx_insn *first_arg = NULL;
30843 if (reload_completed)
30844 return;
30845 while (head != tail && DEBUG_INSN_P (head))
30846 head = NEXT_INSN (head);
30847 for (insn = tail; insn != head; insn = PREV_INSN (insn))
30848 if (INSN_P (insn) && CALL_P (insn))
30850 first_arg = add_parameter_dependencies (insn, head);
30851 if (first_arg)
30853 /* Add a dependee for the first argument to predecessors, but only
30854 if the region contains more than one block. */
30855 basic_block bb = BLOCK_FOR_INSN (insn);
30856 int rgn = CONTAINING_RGN (bb->index);
30857 int nr_blks = RGN_NR_BLOCKS (rgn);
30858 /* Skip trivial regions and region head blocks that can have
30859 predecessors outside of region. */
30860 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
30862 edge e;
30863 edge_iterator ei;
30865 /* Regions are SCCs with the exception of selective
30866 scheduling with pipelining of outer blocks enabled.
30867 So also check that immediate predecessors of a non-head
30868 block are in the same region. */
30869 FOR_EACH_EDGE (e, ei, bb->preds)
30871 /* Avoid creating loop-carried dependencies by using
30872 the topological ordering in the region. */
30873 if (rgn == CONTAINING_RGN (e->src->index)
30874 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
30875 add_dependee_for_func_arg (first_arg, e->src);
30878 insn = first_arg;
30879 if (insn == head)
30880 break;
30883 else if (first_arg)
30884 avoid_func_arg_motion (first_arg, insn);
30887 /* Hook for pre-reload schedule - set priority of moves from likely spilled
30888 HW registers to maximum, to schedule them as soon as possible. These are
30889 moves from function argument registers at the top of the function entry
30890 and moves from function return value registers after call. */
30891 static int
30892 ix86_adjust_priority (rtx_insn *insn, int priority)
30894 rtx set;
30896 if (reload_completed)
30897 return priority;
30899 if (!NONDEBUG_INSN_P (insn))
30900 return priority;
30902 set = single_set (insn);
30903 if (set)
30905 rtx tmp = SET_SRC (set);
30906 if (REG_P (tmp)
30907 && HARD_REGISTER_P (tmp)
30908 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
30909 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
30910 return current_sched_info->sched_max_insns_priority;
30913 return priority;
30916 /* Model decoder of Core 2/i7.
30917 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
30918 track the instruction fetch block boundaries and make sure that long
30919 (9+ bytes) instructions are assigned to D0. */
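/* A rough worked example of the model, using the Core 2/i7 parameters set
   in ix86_sched_init_global rather than measured data: with a 16-byte
   ifetch block, insns of 7 and 10 bytes cannot be decoded in the same
   cycle, because 7 + 10 > 16; and the 10-byte insn can only be handled by
   decoder D0, since it exceeds the 8-byte secondary decoder limit.  */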
30921 /* Maximum length of an insn that can be handled by
30922 a secondary decoder unit. '8' for Core 2/i7. */
30923 static int core2i7_secondary_decoder_max_insn_size;
30925 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
30926 '16' for Core 2/i7. */
30927 static int core2i7_ifetch_block_size;
30929 /* Maximum number of instructions decoder can handle per cycle.
30930 '6' for Core 2/i7. */
30931 static int core2i7_ifetch_block_max_insns;
30933 typedef struct ix86_first_cycle_multipass_data_ *
30934 ix86_first_cycle_multipass_data_t;
30935 typedef const struct ix86_first_cycle_multipass_data_ *
30936 const_ix86_first_cycle_multipass_data_t;
30938 /* A variable to store target state across calls to max_issue within
30939 one cycle. */
30940 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
30941 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
30943 /* Initialize DATA. */
30944 static void
30945 core2i7_first_cycle_multipass_init (void *_data)
30947 ix86_first_cycle_multipass_data_t data
30948 = (ix86_first_cycle_multipass_data_t) _data;
30950 data->ifetch_block_len = 0;
30951 data->ifetch_block_n_insns = 0;
30952 data->ready_try_change = NULL;
30953 data->ready_try_change_size = 0;
30956 /* Advancing the cycle; reset ifetch block counts. */
30957 static void
30958 core2i7_dfa_post_advance_cycle (void)
30960 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
30962 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
30964 data->ifetch_block_len = 0;
30965 data->ifetch_block_n_insns = 0;
30968 static int min_insn_size (rtx_insn *);
30970 /* Filter out insns from ready_try that the core will not be able to issue
30971 on current cycle due to decoder. */
30972 static void
30973 core2i7_first_cycle_multipass_filter_ready_try
30974 (const_ix86_first_cycle_multipass_data_t data,
30975 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
30977 while (n_ready--)
30979 rtx_insn *insn;
30980 int insn_size;
30982 if (ready_try[n_ready])
30983 continue;
30985 insn = get_ready_element (n_ready);
30986 insn_size = min_insn_size (insn);
30988 if (/* If this insn is too long for a secondary decoder ... */
30989 (!first_cycle_insn_p
30990 && insn_size > core2i7_secondary_decoder_max_insn_size)
30991 /* ... or it would not fit into the ifetch block ... */
30992 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
30993 /* ... or the decoder is full already ... */
30994 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
30995 /* ... mask the insn out. */
30997 ready_try[n_ready] = 1;
30999 if (data->ready_try_change)
31000 bitmap_set_bit (data->ready_try_change, n_ready);
31005 /* Prepare for a new round of multipass lookahead scheduling. */
31006 static void
31007 core2i7_first_cycle_multipass_begin (void *_data,
31008 signed char *ready_try, int n_ready,
31009 bool first_cycle_insn_p)
31011 ix86_first_cycle_multipass_data_t data
31012 = (ix86_first_cycle_multipass_data_t) _data;
31013 const_ix86_first_cycle_multipass_data_t prev_data
31014 = ix86_first_cycle_multipass_data;
31016 /* Restore the state from the end of the previous round. */
31017 data->ifetch_block_len = prev_data->ifetch_block_len;
31018 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
31020 /* Filter instructions that cannot be issued on current cycle due to
31021 decoder restrictions. */
31022 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31023 first_cycle_insn_p);
31026 /* INSN is being issued in current solution. Account for its impact on
31027 the decoder model. */
31028 static void
31029 core2i7_first_cycle_multipass_issue (void *_data,
31030 signed char *ready_try, int n_ready,
31031 rtx_insn *insn, const void *_prev_data)
31033 ix86_first_cycle_multipass_data_t data
31034 = (ix86_first_cycle_multipass_data_t) _data;
31035 const_ix86_first_cycle_multipass_data_t prev_data
31036 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
31038 int insn_size = min_insn_size (insn);
31040 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
31041 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
31042 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
31043 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
31045 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
31046 if (!data->ready_try_change)
31048 data->ready_try_change = sbitmap_alloc (n_ready);
31049 data->ready_try_change_size = n_ready;
31051 else if (data->ready_try_change_size < n_ready)
31053 data->ready_try_change = sbitmap_resize (data->ready_try_change,
31054 n_ready, 0);
31055 data->ready_try_change_size = n_ready;
31057 bitmap_clear (data->ready_try_change);
31059 /* Filter out insns from ready_try that the core will not be able to issue
31060 on current cycle due to decoder. */
31061 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31062 false);
31065 /* Revert the effect on ready_try. */
31066 static void
31067 core2i7_first_cycle_multipass_backtrack (const void *_data,
31068 signed char *ready_try,
31069 int n_ready ATTRIBUTE_UNUSED)
31071 const_ix86_first_cycle_multipass_data_t data
31072 = (const_ix86_first_cycle_multipass_data_t) _data;
31073 unsigned int i = 0;
31074 sbitmap_iterator sbi;
31076 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
31077 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
31079 ready_try[i] = 0;
31083 /* Save the result of multipass lookahead scheduling for the next round. */
31084 static void
31085 core2i7_first_cycle_multipass_end (const void *_data)
31087 const_ix86_first_cycle_multipass_data_t data
31088 = (const_ix86_first_cycle_multipass_data_t) _data;
31089 ix86_first_cycle_multipass_data_t next_data
31090 = ix86_first_cycle_multipass_data;
31092 if (data != NULL)
31094 next_data->ifetch_block_len = data->ifetch_block_len;
31095 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
31099 /* Deallocate target data. */
31100 static void
31101 core2i7_first_cycle_multipass_fini (void *_data)
31103 ix86_first_cycle_multipass_data_t data
31104 = (ix86_first_cycle_multipass_data_t) _data;
31106 if (data->ready_try_change)
31108 sbitmap_free (data->ready_try_change);
31109 data->ready_try_change = NULL;
31110 data->ready_try_change_size = 0;
31114 /* Prepare for scheduling pass. */
31115 static void
31116 ix86_sched_init_global (FILE *, int, int)
31118 /* Install scheduling hooks for current CPU. Some of these hooks are used
31119 in time-critical parts of the scheduler, so we only set them up when
31120 they are actually used. */
31121 switch (ix86_tune)
31123 case PROCESSOR_CORE2:
31124 case PROCESSOR_NEHALEM:
31125 case PROCESSOR_SANDYBRIDGE:
31126 case PROCESSOR_HASWELL:
31127 /* Do not perform multipass scheduling for pre-reload schedule
31128 to save compile time. */
31129 if (reload_completed)
31131 targetm.sched.dfa_post_advance_cycle
31132 = core2i7_dfa_post_advance_cycle;
31133 targetm.sched.first_cycle_multipass_init
31134 = core2i7_first_cycle_multipass_init;
31135 targetm.sched.first_cycle_multipass_begin
31136 = core2i7_first_cycle_multipass_begin;
31137 targetm.sched.first_cycle_multipass_issue
31138 = core2i7_first_cycle_multipass_issue;
31139 targetm.sched.first_cycle_multipass_backtrack
31140 = core2i7_first_cycle_multipass_backtrack;
31141 targetm.sched.first_cycle_multipass_end
31142 = core2i7_first_cycle_multipass_end;
31143 targetm.sched.first_cycle_multipass_fini
31144 = core2i7_first_cycle_multipass_fini;
31146 /* Set decoder parameters. */
31147 core2i7_secondary_decoder_max_insn_size = 8;
31148 core2i7_ifetch_block_size = 16;
31149 core2i7_ifetch_block_max_insns = 6;
31150 break;
31152 /* Fall through. */
31153 default:
31154 targetm.sched.dfa_post_advance_cycle = NULL;
31155 targetm.sched.first_cycle_multipass_init = NULL;
31156 targetm.sched.first_cycle_multipass_begin = NULL;
31157 targetm.sched.first_cycle_multipass_issue = NULL;
31158 targetm.sched.first_cycle_multipass_backtrack = NULL;
31159 targetm.sched.first_cycle_multipass_end = NULL;
31160 targetm.sched.first_cycle_multipass_fini = NULL;
31161 break;
31166 /* Compute the alignment given to a constant that is being placed in memory.
31167 EXP is the constant and ALIGN is the alignment that the object would
31168 ordinarily have.
31169 The value of this function is used instead of that alignment to align
31170 the object. */
31173 ix86_constant_alignment (tree exp, int align)
31175 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
31176 || TREE_CODE (exp) == INTEGER_CST)
31178 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
31179 return 64;
31180 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
31181 return 128;
31183 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
31184 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
31185 return BITS_PER_WORD;
31187 return align;
31190 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
31191 the data type, and ALIGN is the alignment that the object would
31192 ordinarily have. */
31194 static int
31195 iamcu_alignment (tree type, int align)
31197 machine_mode mode;
31199 if (align < 32 || TYPE_USER_ALIGN (type))
31200 return align;
31202 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
31203 bytes. */
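/* For example (illustrative), a scalar double or long long, which would
   ordinarily be 64-bit aligned, is capped at 32-bit alignment by the
   switch below, while types with user-specified alignment were already
   returned unchanged above.  */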
31204 mode = TYPE_MODE (strip_array_types (type));
31205 switch (GET_MODE_CLASS (mode))
31207 case MODE_INT:
31208 case MODE_COMPLEX_INT:
31209 case MODE_COMPLEX_FLOAT:
31210 case MODE_FLOAT:
31211 case MODE_DECIMAL_FLOAT:
31212 return 32;
31213 default:
31214 return align;
31218 /* Compute the alignment for a static variable.
31219 TYPE is the data type, and ALIGN is the alignment that
31220 the object would ordinarily have. The value of this function is used
31221 instead of that alignment to align the object. */
31224 ix86_data_alignment (tree type, int align, bool opt)
31226 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
31227 for symbols from other compilation units or symbols that don't need
31228 to bind locally. In order to preserve some ABI compatibility with
31229 those compilers, ensure we don't decrease alignment from what we
31230 used to assume. */
31232 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
31234 /* A data structure equal to or greater than the size of a cache line
31235 (64 bytes in the Pentium 4 and other recent Intel processors, including
31236 processors based on the Intel Core microarchitecture) should be aligned
31237 so that its base address is a multiple of the cache line size. */
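/* As a worked example, assuming the common 64-byte prefetch_block and a
   sufficiently large MAX_OFILE_ALIGNMENT: max_align below becomes
   64 * 8 = 512 bits, so an aggregate of 64 bytes or more is aligned to a
   cache line when optimizing, and the max_align_compat clamp keeps at
   least 256-bit alignment for 32-byte and larger aggregates.  */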
31239 int max_align
31240 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
31242 if (max_align < BITS_PER_WORD)
31243 max_align = BITS_PER_WORD;
31245 switch (ix86_align_data_type)
31247 case ix86_align_data_type_abi: opt = false; break;
31248 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
31249 case ix86_align_data_type_cacheline: break;
31252 if (TARGET_IAMCU)
31253 align = iamcu_alignment (type, align);
31255 if (opt
31256 && AGGREGATE_TYPE_P (type)
31257 && TYPE_SIZE (type)
31258 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
31260 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
31261 && align < max_align_compat)
31262 align = max_align_compat;
31263 if (wi::geu_p (TYPE_SIZE (type), max_align)
31264 && align < max_align)
31265 align = max_align;
31268 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
31269 to a 16-byte boundary. */
31270 if (TARGET_64BIT)
31272 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
31273 && TYPE_SIZE (type)
31274 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31275 && wi::geu_p (TYPE_SIZE (type), 128)
31276 && align < 128)
31277 return 128;
31280 if (!opt)
31281 return align;
31283 if (TREE_CODE (type) == ARRAY_TYPE)
31285 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31286 return 64;
31287 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31288 return 128;
31290 else if (TREE_CODE (type) == COMPLEX_TYPE)
31293 if (TYPE_MODE (type) == DCmode && align < 64)
31294 return 64;
31295 if ((TYPE_MODE (type) == XCmode
31296 || TYPE_MODE (type) == TCmode) && align < 128)
31297 return 128;
31299 else if ((TREE_CODE (type) == RECORD_TYPE
31300 || TREE_CODE (type) == UNION_TYPE
31301 || TREE_CODE (type) == QUAL_UNION_TYPE)
31302 && TYPE_FIELDS (type))
31304 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31305 return 64;
31306 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31307 return 128;
31309 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31310 || TREE_CODE (type) == INTEGER_TYPE)
31312 if (TYPE_MODE (type) == DFmode && align < 64)
31313 return 64;
31314 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31315 return 128;
31318 return align;
31321 /* Compute the alignment for a local variable or a stack slot. EXP is
31322 the data type or decl itself, MODE is the widest mode available and
31323 ALIGN is the alignment that the object would ordinarily have. The
31324 value of this macro is used instead of that alignment to align the
31325 object. */
31327 unsigned int
31328 ix86_local_alignment (tree exp, machine_mode mode,
31329 unsigned int align)
31331 tree type, decl;
31333 if (exp && DECL_P (exp))
31335 type = TREE_TYPE (exp);
31336 decl = exp;
31338 else
31340 type = exp;
31341 decl = NULL;
31344 /* Don't do dynamic stack realignment for long long objects with
31345 -mpreferred-stack-boundary=2. */
31346 if (!TARGET_64BIT
31347 && align == 64
31348 && ix86_preferred_stack_boundary < 64
31349 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
31350 && (!type || !TYPE_USER_ALIGN (type))
31351 && (!decl || !DECL_USER_ALIGN (decl)))
31352 align = 32;
31354 /* If TYPE is NULL, we are allocating a stack slot for caller-save
31355 register in MODE. We will return the largest alignment of XF
31356 and DF. */
31357 if (!type)
31359 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
31360 align = GET_MODE_ALIGNMENT (DFmode);
31361 return align;
31364 /* Don't increase alignment for Intel MCU psABI. */
31365 if (TARGET_IAMCU)
31366 return align;
31368 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
31369 to a 16-byte boundary. The exact wording is:
31371 An array uses the same alignment as its elements, except that a local or
31372 global array variable of length at least 16 bytes or
31373 a C99 variable-length array variable always has alignment of at least 16 bytes.
31375 This was added to allow use of aligned SSE instructions on arrays. This
31376 rule is meant for static storage (where the compiler cannot do the analysis
31377 by itself). We follow it for automatic variables only when convenient.
31378 We fully control everything in the function being compiled, and functions
31379 from other units cannot rely on the alignment.
31381 Exclude the va_list type. It is the common case of a local array where
31382 we cannot benefit from the alignment.
31384 TODO: Probably one should optimize for size only when the variable does not escape. */
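/* For example (illustrative), a local "char buf[32]" in a 64-bit function
   optimized for speed with SSE enabled is given 128-bit alignment by the
   check below, while a local va_list array is deliberately left at its
   default alignment.  */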
31385 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
31386 && TARGET_SSE)
31388 if (AGGREGATE_TYPE_P (type)
31389 && (va_list_type_node == NULL_TREE
31390 || (TYPE_MAIN_VARIANT (type)
31391 != TYPE_MAIN_VARIANT (va_list_type_node)))
31392 && TYPE_SIZE (type)
31393 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31394 && wi::geu_p (TYPE_SIZE (type), 128)
31395 && align < 128)
31396 return 128;
31398 if (TREE_CODE (type) == ARRAY_TYPE)
31400 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31401 return 64;
31402 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31403 return 128;
31405 else if (TREE_CODE (type) == COMPLEX_TYPE)
31407 if (TYPE_MODE (type) == DCmode && align < 64)
31408 return 64;
31409 if ((TYPE_MODE (type) == XCmode
31410 || TYPE_MODE (type) == TCmode) && align < 128)
31411 return 128;
31413 else if ((TREE_CODE (type) == RECORD_TYPE
31414 || TREE_CODE (type) == UNION_TYPE
31415 || TREE_CODE (type) == QUAL_UNION_TYPE)
31416 && TYPE_FIELDS (type))
31418 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31419 return 64;
31420 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31421 return 128;
31423 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31424 || TREE_CODE (type) == INTEGER_TYPE)
31427 if (TYPE_MODE (type) == DFmode && align < 64)
31428 return 64;
31429 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31430 return 128;
31432 return align;
31435 /* Compute the minimum required alignment for dynamic stack realignment
31436 purposes for a local variable, parameter or a stack slot. EXP is
31437 the data type or decl itself, MODE is its mode and ALIGN is the
31438 alignment that the object would ordinarily have. */
31440 unsigned int
31441 ix86_minimum_alignment (tree exp, machine_mode mode,
31442 unsigned int align)
31444 tree type, decl;
31446 if (exp && DECL_P (exp))
31448 type = TREE_TYPE (exp);
31449 decl = exp;
31451 else
31453 type = exp;
31454 decl = NULL;
31457 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
31458 return align;
31460 /* Don't do dynamic stack realignment for long long objects with
31461 -mpreferred-stack-boundary=2. */
31462 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
31463 && (!type || !TYPE_USER_ALIGN (type))
31464 && (!decl || !DECL_USER_ALIGN (decl)))
31466 gcc_checking_assert (!TARGET_STV);
31467 return 32;
31470 return align;
31473 /* Find a location for the static chain incoming to a nested function.
31474 This is a register, unless all free registers are used by arguments. */
31476 static rtx
31477 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
31479 unsigned regno;
31481 /* While this function won't be called by the middle-end when a static
31482 chain isn't needed, it's also used throughout the backend so it's
31483 easiest to keep this check centralized. */
31484 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
31485 return NULL;
31487 if (TARGET_64BIT)
31489 /* We always use R10 in 64-bit mode. */
31490 regno = R10_REG;
31492 else
31494 const_tree fntype, fndecl;
31495 unsigned int ccvt;
31497 /* By default in 32-bit mode we use ECX to pass the static chain. */
31498 regno = CX_REG;
31500 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
31502 fntype = TREE_TYPE (fndecl_or_type);
31503 fndecl = fndecl_or_type;
31505 else
31507 fntype = fndecl_or_type;
31508 fndecl = NULL;
31511 ccvt = ix86_get_callcvt (fntype);
31512 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31514 /* Fastcall functions use ecx/edx for arguments, which leaves
31515 us with EAX for the static chain.
31516 Thiscall functions use ecx for arguments, which also
31517 leaves us with EAX for the static chain. */
31518 regno = AX_REG;
31520 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31522 /* Thiscall functions use ecx for arguments, which leaves
31523 us with EAX and EDX for the static chain.
31524 We use EAX for ABI compatibility. */
31525 regno = AX_REG;
31527 else if (ix86_function_regparm (fntype, fndecl) == 3)
31529 /* For regparm 3, we have no free call-clobbered registers in
31530 which to store the static chain. In order to implement this,
31531 we have the trampoline push the static chain to the stack.
31532 However, we can't push a value below the return address when
31533 we call the nested function directly, so we have to use an
31534 alternate entry point. For this we use ESI, and have the
31535 alternate entry point push ESI, so that things appear the
31536 same once we're executing the nested function. */
31537 if (incoming_p)
31539 if (fndecl == current_function_decl
31540 && !ix86_static_chain_on_stack)
31542 gcc_assert (!reload_completed);
31543 ix86_static_chain_on_stack = true;
31545 return gen_frame_mem (SImode,
31546 plus_constant (Pmode,
31547 arg_pointer_rtx, -8));
31549 regno = SI_REG;
31553 return gen_rtx_REG (Pmode, regno);
31556 /* Emit RTL insns to initialize the variable parts of a trampoline.
31557 FNDECL is the decl of the target address; M_TRAMP is a MEM for
31558 the trampoline, and CHAIN_VALUE is an RTX for the static chain
31559 to be passed to the target function. */
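/* A sketch of the 64-bit trampoline emitted below, shown in its movabs
   form (the byte values match the opcodes stored via the HImode/SImode
   moves):
     49 bb <imm64>   movabs $fnaddr, %r11
     49 ba <imm64>   movabs $chain, %r10
     49 ff e3        jmp *%r11
     90              nop (pads the write to a full 32-bit store)
   When the shorter movl encodings apply, the immediates are 32 bits and
   the opcode bytes are 41 bb / 41 ba instead.  */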
31561 static void
31562 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
31564 rtx mem, fnaddr;
31565 int opcode;
31566 int offset = 0;
31568 fnaddr = XEXP (DECL_RTL (fndecl), 0);
31570 if (TARGET_64BIT)
31572 int size;
31574 /* Load the function address to r11. Try to load address using
31575 the shorter movl instead of movabs. We may want to support
31576 movq for kernel mode, but the kernel does not use trampolines at
31577 the moment. FNADDR is a 32-bit address and may not be in
31578 DImode when ptr_mode == SImode. Always use movl in this
31579 case. */
31580 if (ptr_mode == SImode
31581 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
31583 fnaddr = copy_addr_to_reg (fnaddr);
31585 mem = adjust_address (m_tramp, HImode, offset);
31586 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
31588 mem = adjust_address (m_tramp, SImode, offset + 2);
31589 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
31590 offset += 6;
31592 else
31594 mem = adjust_address (m_tramp, HImode, offset);
31595 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
31597 mem = adjust_address (m_tramp, DImode, offset + 2);
31598 emit_move_insn (mem, fnaddr);
31599 offset += 10;
31602 /* Load static chain using movabs to r10. Use the shorter movl
31603 instead of movabs when ptr_mode == SImode. */
31604 if (ptr_mode == SImode)
31606 opcode = 0xba41;
31607 size = 6;
31609 else
31611 opcode = 0xba49;
31612 size = 10;
31615 mem = adjust_address (m_tramp, HImode, offset);
31616 emit_move_insn (mem, gen_int_mode (opcode, HImode));
31618 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
31619 emit_move_insn (mem, chain_value);
31620 offset += size;
31622 /* Jump to r11; the last (unused) byte is a nop, only there to
31623 pad the write out to a single 32-bit store. */
31624 mem = adjust_address (m_tramp, SImode, offset);
31625 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
31626 offset += 4;
31628 else
31630 rtx disp, chain;
31632 /* Depending on the static chain location, either load a register
31633 with a constant, or push the constant to the stack. All of the
31634 instructions are the same size. */
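/* Illustrative 32-bit layout (a sketch; sizes match the offsets computed
   below):
     b9 <imm32>   movl $chain, %ecx   (b8 for %eax, or 68 <imm32> push)
     e9 <rel32>   jmp fnaddr          (displacement from the end of the jmp)  */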
31635 chain = ix86_static_chain (fndecl, true);
31636 if (REG_P (chain))
31638 switch (REGNO (chain))
31640 case AX_REG:
31641 opcode = 0xb8; break;
31642 case CX_REG:
31643 opcode = 0xb9; break;
31644 default:
31645 gcc_unreachable ();
31648 else
31649 opcode = 0x68;
31651 mem = adjust_address (m_tramp, QImode, offset);
31652 emit_move_insn (mem, gen_int_mode (opcode, QImode));
31654 mem = adjust_address (m_tramp, SImode, offset + 1);
31655 emit_move_insn (mem, chain_value);
31656 offset += 5;
31658 mem = adjust_address (m_tramp, QImode, offset);
31659 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
31661 mem = adjust_address (m_tramp, SImode, offset + 1);
31663 /* Compute offset from the end of the jmp to the target function.
31664 In the case in which the trampoline stores the static chain on
31665 the stack, we need to skip the first insn which pushes the
31666 (call-saved) register static chain; this push is 1 byte. */
31667 offset += 5;
31668 disp = expand_binop (SImode, sub_optab, fnaddr,
31669 plus_constant (Pmode, XEXP (m_tramp, 0),
31670 offset - (MEM_P (chain) ? 1 : 0)),
31671 NULL_RTX, 1, OPTAB_DIRECT);
31672 emit_move_insn (mem, disp);
31675 gcc_assert (offset <= TRAMPOLINE_SIZE);
31677 #ifdef HAVE_ENABLE_EXECUTE_STACK
31678 #ifdef CHECK_EXECUTE_STACK_ENABLED
31679 if (CHECK_EXECUTE_STACK_ENABLED)
31680 #endif
31681 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
31682 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
31683 #endif
31686 static bool
31687 ix86_allocate_stack_slots_for_args (void)
31689 /* Naked functions should not allocate stack slots for arguments. */
31690 return !ix86_function_naked (current_function_decl);
31693 static bool
31694 ix86_warn_func_return (tree decl)
31696 /* Naked functions are implemented entirely in assembly, including the
31697 return sequence, so suppress warnings about this. */
31698 return !ix86_function_naked (decl);
31701 /* The following file contains several enumerations and data structures
31702 built from the definitions in i386-builtin-types.def. */
31704 #include "i386-builtin-types.inc"
31706 /* Table for the ix86 builtin non-function types. */
31707 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
31709 /* Retrieve an element from the above table, building some of
31710 the types lazily. */
31712 static tree
31713 ix86_get_builtin_type (enum ix86_builtin_type tcode)
31715 unsigned int index;
31716 tree type, itype;
31718 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
31720 type = ix86_builtin_type_tab[(int) tcode];
31721 if (type != NULL)
31722 return type;
31724 gcc_assert (tcode > IX86_BT_LAST_PRIM);
31725 if (tcode <= IX86_BT_LAST_VECT)
31727 machine_mode mode;
31729 index = tcode - IX86_BT_LAST_PRIM - 1;
31730 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
31731 mode = ix86_builtin_type_vect_mode[index];
31733 type = build_vector_type_for_mode (itype, mode);
31735 else
31737 int quals;
31739 index = tcode - IX86_BT_LAST_VECT - 1;
31740 if (tcode <= IX86_BT_LAST_PTR)
31741 quals = TYPE_UNQUALIFIED;
31742 else
31743 quals = TYPE_QUAL_CONST;
31745 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
31746 if (quals != TYPE_UNQUALIFIED)
31747 itype = build_qualified_type (itype, quals);
31749 type = build_pointer_type (itype);
31752 ix86_builtin_type_tab[(int) tcode] = type;
31753 return type;
31756 /* Table for the ix86 builtin function types. */
31757 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
31759 /* Retrieve an element from the above table, building some of
31760 the types lazily. */
31762 static tree
31763 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
31765 tree type;
31767 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
31769 type = ix86_builtin_func_type_tab[(int) tcode];
31770 if (type != NULL)
31771 return type;
31773 if (tcode <= IX86_BT_LAST_FUNC)
31775 unsigned start = ix86_builtin_func_start[(int) tcode];
31776 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
31777 tree rtype, atype, args = void_list_node;
31778 unsigned i;
31780 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
31781 for (i = after - 1; i > start; --i)
31783 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
31784 args = tree_cons (NULL, atype, args);
31787 type = build_function_type (rtype, args);
31789 else
31791 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
31792 enum ix86_builtin_func_type icode;
31794 icode = ix86_builtin_func_alias_base[index];
31795 type = ix86_get_builtin_func_type (icode);
31798 ix86_builtin_func_type_tab[(int) tcode] = type;
31799 return type;
31803 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
31804 bdesc_* arrays below should come first, then builtins for each bdesc_*
31805 array in ascending order, so that we can use direct array accesses. */
31806 enum ix86_builtins
31808 IX86_BUILTIN_MASKMOVQ,
31809 IX86_BUILTIN_LDMXCSR,
31810 IX86_BUILTIN_STMXCSR,
31811 IX86_BUILTIN_MASKMOVDQU,
31812 IX86_BUILTIN_PSLLDQ128,
31813 IX86_BUILTIN_CLFLUSH,
31814 IX86_BUILTIN_MONITOR,
31815 IX86_BUILTIN_MWAIT,
31816 IX86_BUILTIN_CLZERO,
31817 IX86_BUILTIN_VEC_INIT_V2SI,
31818 IX86_BUILTIN_VEC_INIT_V4HI,
31819 IX86_BUILTIN_VEC_INIT_V8QI,
31820 IX86_BUILTIN_VEC_EXT_V2DF,
31821 IX86_BUILTIN_VEC_EXT_V2DI,
31822 IX86_BUILTIN_VEC_EXT_V4SF,
31823 IX86_BUILTIN_VEC_EXT_V4SI,
31824 IX86_BUILTIN_VEC_EXT_V8HI,
31825 IX86_BUILTIN_VEC_EXT_V2SI,
31826 IX86_BUILTIN_VEC_EXT_V4HI,
31827 IX86_BUILTIN_VEC_EXT_V16QI,
31828 IX86_BUILTIN_VEC_SET_V2DI,
31829 IX86_BUILTIN_VEC_SET_V4SF,
31830 IX86_BUILTIN_VEC_SET_V4SI,
31831 IX86_BUILTIN_VEC_SET_V8HI,
31832 IX86_BUILTIN_VEC_SET_V4HI,
31833 IX86_BUILTIN_VEC_SET_V16QI,
31834 IX86_BUILTIN_GATHERSIV2DF,
31835 IX86_BUILTIN_GATHERSIV4DF,
31836 IX86_BUILTIN_GATHERDIV2DF,
31837 IX86_BUILTIN_GATHERDIV4DF,
31838 IX86_BUILTIN_GATHERSIV4SF,
31839 IX86_BUILTIN_GATHERSIV8SF,
31840 IX86_BUILTIN_GATHERDIV4SF,
31841 IX86_BUILTIN_GATHERDIV8SF,
31842 IX86_BUILTIN_GATHERSIV2DI,
31843 IX86_BUILTIN_GATHERSIV4DI,
31844 IX86_BUILTIN_GATHERDIV2DI,
31845 IX86_BUILTIN_GATHERDIV4DI,
31846 IX86_BUILTIN_GATHERSIV4SI,
31847 IX86_BUILTIN_GATHERSIV8SI,
31848 IX86_BUILTIN_GATHERDIV4SI,
31849 IX86_BUILTIN_GATHERDIV8SI,
31850 IX86_BUILTIN_VFMSUBSD3_MASK3,
31851 IX86_BUILTIN_VFMSUBSS3_MASK3,
31852 IX86_BUILTIN_GATHER3SIV8SF,
31853 IX86_BUILTIN_GATHER3SIV4SF,
31854 IX86_BUILTIN_GATHER3SIV4DF,
31855 IX86_BUILTIN_GATHER3SIV2DF,
31856 IX86_BUILTIN_GATHER3DIV8SF,
31857 IX86_BUILTIN_GATHER3DIV4SF,
31858 IX86_BUILTIN_GATHER3DIV4DF,
31859 IX86_BUILTIN_GATHER3DIV2DF,
31860 IX86_BUILTIN_GATHER3SIV8SI,
31861 IX86_BUILTIN_GATHER3SIV4SI,
31862 IX86_BUILTIN_GATHER3SIV4DI,
31863 IX86_BUILTIN_GATHER3SIV2DI,
31864 IX86_BUILTIN_GATHER3DIV8SI,
31865 IX86_BUILTIN_GATHER3DIV4SI,
31866 IX86_BUILTIN_GATHER3DIV4DI,
31867 IX86_BUILTIN_GATHER3DIV2DI,
31868 IX86_BUILTIN_SCATTERSIV8SF,
31869 IX86_BUILTIN_SCATTERSIV4SF,
31870 IX86_BUILTIN_SCATTERSIV4DF,
31871 IX86_BUILTIN_SCATTERSIV2DF,
31872 IX86_BUILTIN_SCATTERDIV8SF,
31873 IX86_BUILTIN_SCATTERDIV4SF,
31874 IX86_BUILTIN_SCATTERDIV4DF,
31875 IX86_BUILTIN_SCATTERDIV2DF,
31876 IX86_BUILTIN_SCATTERSIV8SI,
31877 IX86_BUILTIN_SCATTERSIV4SI,
31878 IX86_BUILTIN_SCATTERSIV4DI,
31879 IX86_BUILTIN_SCATTERSIV2DI,
31880 IX86_BUILTIN_SCATTERDIV8SI,
31881 IX86_BUILTIN_SCATTERDIV4SI,
31882 IX86_BUILTIN_SCATTERDIV4DI,
31883 IX86_BUILTIN_SCATTERDIV2DI,
31884 /* Alternate 4 and 8 element gather/scatter for the vectorizer
31885 where all operands are 32-byte or 64-byte wide respectively. */
31886 IX86_BUILTIN_GATHERALTSIV4DF,
31887 IX86_BUILTIN_GATHERALTDIV8SF,
31888 IX86_BUILTIN_GATHERALTSIV4DI,
31889 IX86_BUILTIN_GATHERALTDIV8SI,
31890 IX86_BUILTIN_GATHER3ALTDIV16SF,
31891 IX86_BUILTIN_GATHER3ALTDIV16SI,
31892 IX86_BUILTIN_GATHER3ALTSIV4DF,
31893 IX86_BUILTIN_GATHER3ALTDIV8SF,
31894 IX86_BUILTIN_GATHER3ALTSIV4DI,
31895 IX86_BUILTIN_GATHER3ALTDIV8SI,
31896 IX86_BUILTIN_GATHER3ALTSIV8DF,
31897 IX86_BUILTIN_GATHER3ALTSIV8DI,
31898 IX86_BUILTIN_GATHER3DIV16SF,
31899 IX86_BUILTIN_GATHER3DIV16SI,
31900 IX86_BUILTIN_GATHER3DIV8DF,
31901 IX86_BUILTIN_GATHER3DIV8DI,
31902 IX86_BUILTIN_GATHER3SIV16SF,
31903 IX86_BUILTIN_GATHER3SIV16SI,
31904 IX86_BUILTIN_GATHER3SIV8DF,
31905 IX86_BUILTIN_GATHER3SIV8DI,
31906 IX86_BUILTIN_SCATTERALTSIV8DF,
31907 IX86_BUILTIN_SCATTERALTDIV16SF,
31908 IX86_BUILTIN_SCATTERALTSIV8DI,
31909 IX86_BUILTIN_SCATTERALTDIV16SI,
31910 IX86_BUILTIN_SCATTERDIV16SF,
31911 IX86_BUILTIN_SCATTERDIV16SI,
31912 IX86_BUILTIN_SCATTERDIV8DF,
31913 IX86_BUILTIN_SCATTERDIV8DI,
31914 IX86_BUILTIN_SCATTERSIV16SF,
31915 IX86_BUILTIN_SCATTERSIV16SI,
31916 IX86_BUILTIN_SCATTERSIV8DF,
31917 IX86_BUILTIN_SCATTERSIV8DI,
31918 IX86_BUILTIN_GATHERPFQPD,
31919 IX86_BUILTIN_GATHERPFDPS,
31920 IX86_BUILTIN_GATHERPFDPD,
31921 IX86_BUILTIN_GATHERPFQPS,
31922 IX86_BUILTIN_SCATTERPFDPD,
31923 IX86_BUILTIN_SCATTERPFDPS,
31924 IX86_BUILTIN_SCATTERPFQPD,
31925 IX86_BUILTIN_SCATTERPFQPS,
31926 IX86_BUILTIN_CLWB,
31927 IX86_BUILTIN_CLFLUSHOPT,
31928 IX86_BUILTIN_INFQ,
31929 IX86_BUILTIN_HUGE_VALQ,
31930 IX86_BUILTIN_NANQ,
31931 IX86_BUILTIN_NANSQ,
31932 IX86_BUILTIN_XABORT,
31933 IX86_BUILTIN_ADDCARRYX32,
31934 IX86_BUILTIN_ADDCARRYX64,
31935 IX86_BUILTIN_SBB32,
31936 IX86_BUILTIN_SBB64,
31937 IX86_BUILTIN_RDRAND16_STEP,
31938 IX86_BUILTIN_RDRAND32_STEP,
31939 IX86_BUILTIN_RDRAND64_STEP,
31940 IX86_BUILTIN_RDSEED16_STEP,
31941 IX86_BUILTIN_RDSEED32_STEP,
31942 IX86_BUILTIN_RDSEED64_STEP,
31943 IX86_BUILTIN_MONITORX,
31944 IX86_BUILTIN_MWAITX,
31945 IX86_BUILTIN_CFSTRING,
31946 IX86_BUILTIN_CPU_INIT,
31947 IX86_BUILTIN_CPU_IS,
31948 IX86_BUILTIN_CPU_SUPPORTS,
31949 IX86_BUILTIN_READ_FLAGS,
31950 IX86_BUILTIN_WRITE_FLAGS,
31952 /* All the remaining builtins are tracked in bdesc_* arrays in
31953 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
31954 this point. */
31955 #define BDESC(mask, icode, name, code, comparison, flag) \
31956 code,
31957 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31958 code, \
31959 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
31960 #define BDESC_END(kind, next_kind)
31962 #include "i386-builtin.def"
31964 #undef BDESC
31965 #undef BDESC_FIRST
31966 #undef BDESC_END
31968 IX86_BUILTIN_MAX,
31970 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
31972 /* Now just the aliases for bdesc_* start/end. */
31973 #define BDESC(mask, icode, name, code, comparison, flag)
31974 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
31975 #define BDESC_END(kind, next_kind) \
31976 IX86_BUILTIN__BDESC_##kind##_LAST \
31977 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
31979 #include "i386-builtin.def"
31981 #undef BDESC
31982 #undef BDESC_FIRST
31983 #undef BDESC_END
31985 /* Just to make sure there is no comma after the last enumerator. */
31986 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
31989 /* Table for the ix86 builtin decls. */
31990 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
31992 /* Table of all of the builtin functions that are possible with different ISAs
31993 but are waiting to be built until a function is declared to use that
31994 ISA. */
31995 struct builtin_isa {
31996 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
31997 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
31998 const char *name; /* function name */
31999 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
32000 unsigned char const_p:1; /* true if the declaration is constant */
32001 unsigned char pure_p:1; /* true if the declaration has pure attribute */
32002 bool leaf_p; /* true if the declaration has leaf attribute */
32003 bool nothrow_p; /* true if the declaration has nothrow attribute */
32004 bool set_and_not_built_p;
32007 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
32009 /* Bits that can still enable any inclusion of a builtin. */
32010 static HOST_WIDE_INT deferred_isa_values = 0;
32011 static HOST_WIDE_INT deferred_isa_values2 = 0;
32013 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
32014 of which isa_flags to use in the ix86_builtins_isa array. Stores the
32015 function decl in the ix86_builtins array. Returns the function decl or
32016 NULL_TREE, if the builtin was not added.
32018 If the front end has a special hook for builtin functions, delay adding
32019 builtin functions that aren't in the current ISA until the ISA is changed
32020 with function-specific optimization. Doing so can save about 300K for the
32021 default compiler. When the builtin is expanded, check at that time whether
32022 it is valid.
32024 If the front end doesn't have a special hook, record all builtins, even if
32025 a builtin isn't in the current ISA, in case the user uses
32026 function-specific options for a different ISA, so that we don't get scope
32027 errors if a builtin is added in the middle of a function scope. */
32029 static inline tree
32030 def_builtin (HOST_WIDE_INT mask, const char *name,
32031 enum ix86_builtin_func_type tcode,
32032 enum ix86_builtins code)
32034 tree decl = NULL_TREE;
32036 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
32038 ix86_builtins_isa[(int) code].isa = mask;
32040 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
32041 where any bit set means that the built-in is enabled, this bit must be *and-ed*
32042 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
32043 means that *both* cpuid bits must be set for the built-in to be available.
32044 Handle this here. */
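/* For instance (illustrative), a builtin registered with
   OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL requires both CPUID
   features; once AVX512VL is known to be enabled, its bit is cleared from
   MASK so that the remaining bits go through the usual any-bit-set test
   below.  */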
32045 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32046 mask &= ~OPTION_MASK_ISA_AVX512VL;
32048 mask &= ~OPTION_MASK_ISA_64BIT;
32049 if (mask == 0
32050 || (mask & ix86_isa_flags) != 0
32051 || (lang_hooks.builtin_function
32052 == lang_hooks.builtin_function_ext_scope))
32055 tree type = ix86_get_builtin_func_type (tcode);
32056 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32057 NULL, NULL_TREE);
32058 ix86_builtins[(int) code] = decl;
32059 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32061 else
32063 /* Just a MASK where set_and_not_built_p == true can potentially
32064 include a builtin. */
32065 deferred_isa_values |= mask;
32066 ix86_builtins[(int) code] = NULL_TREE;
32067 ix86_builtins_isa[(int) code].tcode = tcode;
32068 ix86_builtins_isa[(int) code].name = name;
32069 ix86_builtins_isa[(int) code].leaf_p = false;
32070 ix86_builtins_isa[(int) code].nothrow_p = false;
32071 ix86_builtins_isa[(int) code].const_p = false;
32072 ix86_builtins_isa[(int) code].pure_p = false;
32073 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32077 return decl;
32080 /* Like def_builtin, but also marks the function decl "const". */
32082 static inline tree
32083 def_builtin_const (HOST_WIDE_INT mask, const char *name,
32084 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32086 tree decl = def_builtin (mask, name, tcode, code);
32087 if (decl)
32088 TREE_READONLY (decl) = 1;
32089 else
32090 ix86_builtins_isa[(int) code].const_p = true;
32092 return decl;
32095 /* Like def_builtin, but also marks the function decl "pure". */
32097 static inline tree
32098 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
32099 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32101 tree decl = def_builtin (mask, name, tcode, code);
32102 if (decl)
32103 DECL_PURE_P (decl) = 1;
32104 else
32105 ix86_builtins_isa[(int) code].pure_p = true;
32107 return decl;
32110 /* Like def_builtin, but for additional isa2 flags. */
32112 static inline tree
32113 def_builtin2 (HOST_WIDE_INT mask, const char *name,
32114 enum ix86_builtin_func_type tcode,
32115 enum ix86_builtins code)
32117 tree decl = NULL_TREE;
32119 ix86_builtins_isa[(int) code].isa2 = mask;
32121 if (mask == 0
32122 || (mask & ix86_isa_flags2) != 0
32123 || (lang_hooks.builtin_function
32124 == lang_hooks.builtin_function_ext_scope))
32127 tree type = ix86_get_builtin_func_type (tcode);
32128 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32129 NULL, NULL_TREE);
32130 ix86_builtins[(int) code] = decl;
32131 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32133 else
32135 /* Just a MASK where set_and_not_built_p == true can potentially
32136 include a builtin. */
32137 deferred_isa_values2 |= mask;
32138 ix86_builtins[(int) code] = NULL_TREE;
32139 ix86_builtins_isa[(int) code].tcode = tcode;
32140 ix86_builtins_isa[(int) code].name = name;
32141 ix86_builtins_isa[(int) code].leaf_p = false;
32142 ix86_builtins_isa[(int) code].nothrow_p = false;
32143 ix86_builtins_isa[(int) code].const_p = false;
32144 ix86_builtins_isa[(int) code].pure_p = false;
32145 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32148 return decl;
32151 /* Like def_builtin2, but also marks the function decl "const". */
32153 static inline tree
32154 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
32155 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32157 tree decl = def_builtin2 (mask, name, tcode, code);
32158 if (decl)
32159 TREE_READONLY (decl) = 1;
32160 else
32161 ix86_builtins_isa[(int) code].const_p = true;
32163 return decl;
32166 /* Like def_builtin2, but also marks the function decl "pure". */
32168 static inline tree
32169 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
32170 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32172 tree decl = def_builtin2 (mask, name, tcode, code);
32173 if (decl)
32174 DECL_PURE_P (decl) = 1;
32175 else
32176 ix86_builtins_isa[(int) code].pure_p = true;
32178 return decl;
32181 /* Add any new builtin functions for a given ISA that may not have been
32182 declared. This saves a bit of space compared to adding all of the
32183 declarations to the tree, even if we didn't use them. */
32185 static void
32186 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
32188 if ((isa & deferred_isa_values) == 0
32189 && (isa2 & deferred_isa_values2) == 0)
32190 return;
32192 /* Bits in ISA value can be removed from potential isa values. */
32193 deferred_isa_values &= ~isa;
32194 deferred_isa_values2 &= ~isa2;
32196 int i;
32197 tree saved_current_target_pragma = current_target_pragma;
32198 current_target_pragma = NULL_TREE;
32200 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
32202 if (((ix86_builtins_isa[i].isa & isa) != 0
32203 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
32204 && ix86_builtins_isa[i].set_and_not_built_p)
32206 tree decl, type;
32208 /* Don't define the builtin again. */
32209 ix86_builtins_isa[i].set_and_not_built_p = false;
32211 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
32212 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
32213 type, i, BUILT_IN_MD, NULL,
32214 NULL_TREE);
32216 ix86_builtins[i] = decl;
32217 if (ix86_builtins_isa[i].const_p)
32218 TREE_READONLY (decl) = 1;
32219 if (ix86_builtins_isa[i].pure_p)
32220 DECL_PURE_P (decl) = 1;
32221 if (ix86_builtins_isa[i].leaf_p)
32222 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32223 NULL_TREE);
32224 if (ix86_builtins_isa[i].nothrow_p)
32225 TREE_NOTHROW (decl) = 1;
32229 current_target_pragma = saved_current_target_pragma;
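/* A minimal user-level sketch of what makes ix86_add_new_builtins run:
   compiled with plain -O2 (no -mavx2), the AVX2 builtins start out
   deferred; switching the target in mid-file registers them on demand,
   without scope errors.  The function below is illustrative only.  */
#if 0
#include <immintrin.h>

#pragma GCC push_options
#pragma GCC target ("avx2")
__m256i
use_deferred_builtin (__m256i x, __m256i y)
{
  /* _mm256_add_epi32 expands to an AVX2 builtin that was deferred at
     start-up and only added once the avx2 ISA bit became active.  */
  return _mm256_add_epi32 (x, y);
}
#pragma GCC pop_options
#endif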
32232 /* Bits for builtin_description.flag. */
32234 /* Set when we don't support the comparison natively, and should
32235 swap_comparison in order to support it. */
32236 #define BUILTIN_DESC_SWAP_OPERANDS 1
32238 struct builtin_description
32240 const HOST_WIDE_INT mask;
32241 const enum insn_code icode;
32242 const char *const name;
32243 const enum ix86_builtins code;
32244 const enum rtx_code comparison;
32245 const int flag;
32248 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
32249 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
32250 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
32251 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
32252 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
32253 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
32254 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
32255 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
32256 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
32257 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
32258 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
32259 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
32260 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
32261 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
32262 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
32263 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
32264 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
32265 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
32266 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
32267 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
32268 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
32269 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
32270 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
32271 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
32272 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
32273 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
32274 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
32275 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
32276 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
32277 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
32278 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
32279 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
32280 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
32281 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
32282 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
32283 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
32284 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
32285 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
32286 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
32287 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
32288 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
32289 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
32290 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
32291 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
32292 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
32293 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
32294 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
32295 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
32296 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
32297 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
32298 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
32299 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
32301 #define BDESC(mask, icode, name, code, comparison, flag) \
32302 { mask, icode, name, code, comparison, flag },
32303 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
32304 static const struct builtin_description bdesc_##kind[] = \
32306 BDESC (mask, icode, name, code, comparison, flag)
32307 #define BDESC_END(kind, next_kind) \
32310 #include "i386-builtin.def"
32312 #undef BDESC
32313 #undef BDESC_FIRST
32314 #undef BDESC_END
32316 /* TM vector builtins. */
32318 /* Reuse the existing x86-specific `struct builtin_description' because
32319 we're lazy. Add casts to make them fit. */
32320 static const struct builtin_description bdesc_tm[] =
32322 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32323 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32324 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32325 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32326 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32327 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32328 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32330 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32331 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32332 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32333 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32334 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32335 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32336 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32338 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32339 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32340 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32341 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32342 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32343 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32344 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32346 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
32347 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
32348 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
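/* User-level sketch of what the vector TM entries above are for: with
   -fgnu-tm -msse, a 16-byte vector store inside a transaction is the
   kind of access the _ITM_WM128 entry exists to instrument (hedged
   illustration; the global and the function name are hypothetical).  */
#if 0
#include <xmmintrin.h>

__m128 shared_vec;

void
store_in_transaction (__m128 v)
{
  __transaction_atomic { shared_vec = v; }  /* instrumented via _ITM_WM128  */
}
#endif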
32351 /* Initialize the transactional memory vector load/store builtins. */
32353 static void
32354 ix86_init_tm_builtins (void)
32356 enum ix86_builtin_func_type ftype;
32357 const struct builtin_description *d;
32358 size_t i;
32359 tree decl;
32360 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
32361 tree attrs_log, attrs_type_log;
32363 if (!flag_tm)
32364 return;
32366 /* If there are no builtins defined, we must be compiling in a
32367 language without trans-mem support. */
32368 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
32369 return;
32371 /* Use whatever attributes a normal TM load has. */
32372 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
32373 attrs_load = DECL_ATTRIBUTES (decl);
32374 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32375 /* Use whatever attributes a normal TM store has. */
32376 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
32377 attrs_store = DECL_ATTRIBUTES (decl);
32378 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32379 /* Use whatever attributes a normal TM log has. */
32380 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
32381 attrs_log = DECL_ATTRIBUTES (decl);
32382 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32384 for (i = 0, d = bdesc_tm;
32385 i < ARRAY_SIZE (bdesc_tm);
32386 i++, d++)
32388 if ((d->mask & ix86_isa_flags) != 0
32389 || (lang_hooks.builtin_function
32390 == lang_hooks.builtin_function_ext_scope))
32392 tree type, attrs, attrs_type;
32393 enum built_in_function code = (enum built_in_function) d->code;
32395 ftype = (enum ix86_builtin_func_type) d->flag;
32396 type = ix86_get_builtin_func_type (ftype);
32398 if (BUILTIN_TM_LOAD_P (code))
32400 attrs = attrs_load;
32401 attrs_type = attrs_type_load;
32403 else if (BUILTIN_TM_STORE_P (code))
32405 attrs = attrs_store;
32406 attrs_type = attrs_type_store;
32408 else
32410 attrs = attrs_log;
32411 attrs_type = attrs_type_log;
32413 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
32414 /* The builtin without the prefix for
32415 calling it directly. */
32416 d->name + strlen ("__builtin_"),
32417 attrs);
32418 /* add_builtin_function () will set the DECL_ATTRIBUTES; now
32419 set the TYPE_ATTRIBUTES. */
32420 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
32422 set_builtin_decl (code, decl, false);
32427 /* Macros for verification of enum ix86_builtins order. */
32428 #define BDESC_VERIFY(x, y, z) \
32429 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
32430 #define BDESC_VERIFYS(x, y, z) \
32431 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
32433 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32434 IX86_BUILTIN__BDESC_COMI_LAST, 1);
32435 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32436 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
32437 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32438 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
32439 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
32440 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
32441 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32442 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
32443 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
32444 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
32445 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
32446 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
32447 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32448 IX86_BUILTIN__BDESC_MPX_LAST, 1);
32449 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32450 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
32451 BDESC_VERIFYS (IX86_BUILTIN_MAX,
32452 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
32454 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
32455 in the current target ISA, so that the user can compile particular modules
32456 with target specific options that differ from the command line
32457 options. */
32458 static void
32459 ix86_init_mmx_sse_builtins (void)
32461 const struct builtin_description * d;
32462 enum ix86_builtin_func_type ftype;
32463 size_t i;
32465 /* Add all special builtins with variable number of operands. */
32466 for (i = 0, d = bdesc_special_args;
32467 i < ARRAY_SIZE (bdesc_special_args);
32468 i++, d++)
32470 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
32471 if (d->name == 0)
32472 continue;
32474 ftype = (enum ix86_builtin_func_type) d->flag;
32475 def_builtin (d->mask, d->name, ftype, d->code);
32477 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
32478 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32479 ARRAY_SIZE (bdesc_special_args) - 1);
32481 /* Add all builtins with variable number of operands. */
32482 for (i = 0, d = bdesc_args;
32483 i < ARRAY_SIZE (bdesc_args);
32484 i++, d++)
32486 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
32487 if (d->name == 0)
32488 continue;
32490 ftype = (enum ix86_builtin_func_type) d->flag;
32491 def_builtin_const (d->mask, d->name, ftype, d->code);
32493 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
32494 IX86_BUILTIN__BDESC_ARGS_FIRST,
32495 ARRAY_SIZE (bdesc_args) - 1);
32497 /* Add all builtins with variable number of operands, using the second ISA flag word. */
32498 for (i = 0, d = bdesc_args2;
32499 i < ARRAY_SIZE (bdesc_args2);
32500 i++, d++)
32502 if (d->name == 0)
32503 continue;
32505 ftype = (enum ix86_builtin_func_type) d->flag;
32506 def_builtin_const2 (d->mask, d->name, ftype, d->code);
32509 /* Add all builtins with rounding. */
32510 for (i = 0, d = bdesc_round_args;
32511 i < ARRAY_SIZE (bdesc_round_args);
32512 i++, d++)
32514 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
32515 if (d->name == 0)
32516 continue;
32518 ftype = (enum ix86_builtin_func_type) d->flag;
32519 def_builtin_const (d->mask, d->name, ftype, d->code);
32521 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
32522 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32523 ARRAY_SIZE (bdesc_round_args) - 1);
32525 /* pcmpestr[im] insns. */
32526 for (i = 0, d = bdesc_pcmpestr;
32527 i < ARRAY_SIZE (bdesc_pcmpestr);
32528 i++, d++)
32530 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
32531 if (d->code == IX86_BUILTIN_PCMPESTRM128)
32532 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
32533 else
32534 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
32535 def_builtin_const (d->mask, d->name, ftype, d->code);
32537 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
32538 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32539 ARRAY_SIZE (bdesc_pcmpestr) - 1);
32541 /* pcmpistr[im] insns. */
32542 for (i = 0, d = bdesc_pcmpistr;
32543 i < ARRAY_SIZE (bdesc_pcmpistr);
32544 i++, d++)
32546 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
32547 if (d->code == IX86_BUILTIN_PCMPISTRM128)
32548 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
32549 else
32550 ftype = INT_FTYPE_V16QI_V16QI_INT;
32551 def_builtin_const (d->mask, d->name, ftype, d->code);
32553 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
32554 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32555 ARRAY_SIZE (bdesc_pcmpistr) - 1);
32557 /* comi/ucomi insns. */
32558 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32560 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
32561 if (d->mask == OPTION_MASK_ISA_SSE2)
32562 ftype = INT_FTYPE_V2DF_V2DF;
32563 else
32564 ftype = INT_FTYPE_V4SF_V4SF;
32565 def_builtin_const (d->mask, d->name, ftype, d->code);
32567 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
32568 IX86_BUILTIN__BDESC_COMI_FIRST,
32569 ARRAY_SIZE (bdesc_comi) - 1);
32571 /* SSE */
32572 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
32573 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
32574 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
32575 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
32577 /* SSE or 3DNow!A */
32578 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32579 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
32580 IX86_BUILTIN_MASKMOVQ);
32582 /* SSE2 */
32583 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
32584 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
32586 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
32587 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
32588 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
32589 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
32591 /* SSE3. */
32592 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
32593 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
32594 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
32595 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
32597 /* AES */
32598 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
32599 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
32600 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
32601 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
32602 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
32603 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
32604 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
32605 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
32606 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
32607 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
32608 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
32609 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
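/* These AES builtins back the <wmmintrin.h> intrinsics; a hedged sketch
   of user code (compiled with -maes -msse2) that reaches
   __builtin_ia32_aesenc128 through its intrinsic wrapper.  */
#if 0
#include <wmmintrin.h>

static __m128i
aes_round (__m128i state, __m128i round_key)
{
  /* _mm_aesenc_si128 is a thin wrapper around __builtin_ia32_aesenc128.  */
  return _mm_aesenc_si128 (state, round_key);
}
#endif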
32611 /* PCLMUL */
32612 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
32613 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
32615 /* RDRND */
32616 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
32617 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
32618 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
32619 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
32620 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
32621 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
32622 IX86_BUILTIN_RDRAND64_STEP);
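/* Usage sketch for the RDRND step builtins: each returns nonzero on
   success and stores the random value through the pointer, so callers
   are expected to retry (illustration only; the function name is made up).  */
#if 0
static unsigned int
get_random_u32 (void)
{
  unsigned int val;
  while (!__builtin_ia32_rdrand32_step (&val))
    ;  /* the hardware may transiently fail; retry  */
  return val;
}
#endif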
32624 /* AVX2 */
32625 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
32626 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
32627 IX86_BUILTIN_GATHERSIV2DF);
32629 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
32630 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
32631 IX86_BUILTIN_GATHERSIV4DF);
32633 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
32634 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
32635 IX86_BUILTIN_GATHERDIV2DF);
32637 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
32638 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
32639 IX86_BUILTIN_GATHERDIV4DF);
32641 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
32642 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
32643 IX86_BUILTIN_GATHERSIV4SF);
32645 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
32646 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
32647 IX86_BUILTIN_GATHERSIV8SF);
32649 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
32650 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
32651 IX86_BUILTIN_GATHERDIV4SF);
32653 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
32654 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
32655 IX86_BUILTIN_GATHERDIV8SF);
32657 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
32658 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
32659 IX86_BUILTIN_GATHERSIV2DI);
32661 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
32662 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
32663 IX86_BUILTIN_GATHERSIV4DI);
32665 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
32666 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
32667 IX86_BUILTIN_GATHERDIV2DI);
32669 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
32670 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
32671 IX86_BUILTIN_GATHERDIV4DI);
32673 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
32674 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
32675 IX86_BUILTIN_GATHERSIV4SI);
32677 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
32678 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
32679 IX86_BUILTIN_GATHERSIV8SI);
32681 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
32682 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
32683 IX86_BUILTIN_GATHERDIV4SI);
32685 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
32686 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
32687 IX86_BUILTIN_GATHERDIV8SI);
32689 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
32690 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
32691 IX86_BUILTIN_GATHERALTSIV4DF);
32693 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
32694 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
32695 IX86_BUILTIN_GATHERALTDIV8SF);
32697 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
32698 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
32699 IX86_BUILTIN_GATHERALTSIV4DI);
32701 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
32702 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
32703 IX86_BUILTIN_GATHERALTDIV8SI);
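/* The gather builtins take (src, base, index, mask, scale); e.g. the
   <avx2intrin.h> intrinsic _mm256_i32gather_epi32 is built on
   __builtin_ia32_gathersiv8si.  A hedged sketch of a direct call, with
   types as in V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT above (-mavx2).  */
#if 0
#include <immintrin.h>

static __m256i
gather_eight_ints (const int *base, __m256i index)
{
  __m256i src  = _mm256_setzero_si256 ();
  __m256i mask = _mm256_set1_epi32 (-1);   /* gather every element  */
  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si) src, base,
						(__v8si) index, (__v8si) mask,
						4 /* scale in bytes  */);
}
#endif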
32705 /* AVX512F */
32706 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
32707 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
32708 IX86_BUILTIN_GATHER3SIV16SF);
32710 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
32711 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
32712 IX86_BUILTIN_GATHER3SIV8DF);
32714 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
32715 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
32716 IX86_BUILTIN_GATHER3DIV16SF);
32718 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
32719 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
32720 IX86_BUILTIN_GATHER3DIV8DF);
32722 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
32723 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
32724 IX86_BUILTIN_GATHER3SIV16SI);
32726 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
32727 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
32728 IX86_BUILTIN_GATHER3SIV8DI);
32730 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
32731 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
32732 IX86_BUILTIN_GATHER3DIV16SI);
32734 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
32735 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
32736 IX86_BUILTIN_GATHER3DIV8DI);
32738 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
32739 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
32740 IX86_BUILTIN_GATHER3ALTSIV8DF);
32742 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
32743 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
32744 IX86_BUILTIN_GATHER3ALTDIV16SF);
32746 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
32747 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
32748 IX86_BUILTIN_GATHER3ALTSIV8DI);
32750 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
32751 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
32752 IX86_BUILTIN_GATHER3ALTDIV16SI);
32754 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
32755 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
32756 IX86_BUILTIN_SCATTERSIV16SF);
32758 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
32759 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
32760 IX86_BUILTIN_SCATTERSIV8DF);
32762 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
32763 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
32764 IX86_BUILTIN_SCATTERDIV16SF);
32766 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
32767 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
32768 IX86_BUILTIN_SCATTERDIV8DF);
32770 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
32771 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
32772 IX86_BUILTIN_SCATTERSIV16SI);
32774 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
32775 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
32776 IX86_BUILTIN_SCATTERSIV8DI);
32778 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
32779 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
32780 IX86_BUILTIN_SCATTERDIV16SI);
32782 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
32783 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
32784 IX86_BUILTIN_SCATTERDIV8DI);
32786 /* AVX512VL */
32787 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
32788 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
32789 IX86_BUILTIN_GATHER3SIV2DF);
32791 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
32792 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
32793 IX86_BUILTIN_GATHER3SIV4DF);
32795 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
32796 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
32797 IX86_BUILTIN_GATHER3DIV2DF);
32799 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
32800 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
32801 IX86_BUILTIN_GATHER3DIV4DF);
32803 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
32804 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
32805 IX86_BUILTIN_GATHER3SIV4SF);
32807 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
32808 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
32809 IX86_BUILTIN_GATHER3SIV8SF);
32811 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
32812 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
32813 IX86_BUILTIN_GATHER3DIV4SF);
32815 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
32816 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
32817 IX86_BUILTIN_GATHER3DIV8SF);
32819 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
32820 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
32821 IX86_BUILTIN_GATHER3SIV2DI);
32823 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
32824 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
32825 IX86_BUILTIN_GATHER3SIV4DI);
32827 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
32828 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
32829 IX86_BUILTIN_GATHER3DIV2DI);
32831 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
32832 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
32833 IX86_BUILTIN_GATHER3DIV4DI);
32835 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
32836 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
32837 IX86_BUILTIN_GATHER3SIV4SI);
32839 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
32840 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
32841 IX86_BUILTIN_GATHER3SIV8SI);
32843 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
32844 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
32845 IX86_BUILTIN_GATHER3DIV4SI);
32847 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
32848 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
32849 IX86_BUILTIN_GATHER3DIV8SI);
32851 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
32852 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
32853 IX86_BUILTIN_GATHER3ALTSIV4DF);
32855 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
32856 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
32857 IX86_BUILTIN_GATHER3ALTDIV8SF);
32859 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
32860 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
32861 IX86_BUILTIN_GATHER3ALTSIV4DI);
32863 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
32864 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
32865 IX86_BUILTIN_GATHER3ALTDIV8SI);
32867 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
32868 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
32869 IX86_BUILTIN_SCATTERSIV8SF);
32871 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
32872 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
32873 IX86_BUILTIN_SCATTERSIV4SF);
32875 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
32876 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
32877 IX86_BUILTIN_SCATTERSIV4DF);
32879 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
32880 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
32881 IX86_BUILTIN_SCATTERSIV2DF);
32883 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
32884 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
32885 IX86_BUILTIN_SCATTERDIV8SF);
32887 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
32888 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
32889 IX86_BUILTIN_SCATTERDIV4SF);
32891 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
32892 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
32893 IX86_BUILTIN_SCATTERDIV4DF);
32895 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
32896 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
32897 IX86_BUILTIN_SCATTERDIV2DF);
32899 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
32900 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
32901 IX86_BUILTIN_SCATTERSIV8SI);
32903 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
32904 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
32905 IX86_BUILTIN_SCATTERSIV4SI);
32907 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
32908 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
32909 IX86_BUILTIN_SCATTERSIV4DI);
32911 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
32912 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
32913 IX86_BUILTIN_SCATTERSIV2DI);
32915 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
32916 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
32917 IX86_BUILTIN_SCATTERDIV8SI);
32919 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
32920 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
32921 IX86_BUILTIN_SCATTERDIV4SI);
32923 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
32924 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
32925 IX86_BUILTIN_SCATTERDIV4DI);
32927 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
32928 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
32929 IX86_BUILTIN_SCATTERDIV2DI);
32930 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
32931 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
32932 IX86_BUILTIN_SCATTERALTSIV8DF);
32934 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
32935 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
32936 IX86_BUILTIN_SCATTERALTDIV16SF);
32938 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
32939 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
32940 IX86_BUILTIN_SCATTERALTSIV8DI);
32942 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
32943 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
32944 IX86_BUILTIN_SCATTERALTDIV16SI);
32946 /* AVX512PF */
32947 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
32948 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
32949 IX86_BUILTIN_GATHERPFDPD);
32950 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
32951 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
32952 IX86_BUILTIN_GATHERPFDPS);
32953 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
32954 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32955 IX86_BUILTIN_GATHERPFQPD);
32956 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
32957 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32958 IX86_BUILTIN_GATHERPFQPS);
32959 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
32960 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
32961 IX86_BUILTIN_SCATTERPFDPD);
32962 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
32963 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
32964 IX86_BUILTIN_SCATTERPFDPS);
32965 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
32966 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32967 IX86_BUILTIN_SCATTERPFQPD);
32968 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
32969 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32970 IX86_BUILTIN_SCATTERPFQPS);
32972 /* SHA */
32973 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
32974 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
32975 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
32976 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
32977 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
32978 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
32979 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
32980 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
32981 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
32982 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
32983 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
32984 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
32985 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
32986 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
32988 /* RTM. */
32989 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
32990 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
32992 /* MMX access to the vec_init patterns. */
32993 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
32994 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
32996 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
32997 V4HI_FTYPE_HI_HI_HI_HI,
32998 IX86_BUILTIN_VEC_INIT_V4HI);
33000 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
33001 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
33002 IX86_BUILTIN_VEC_INIT_V8QI);
33004 /* Access to the vec_extract patterns. */
33005 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
33006 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
33007 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
33008 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
33009 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
33010 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
33011 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
33012 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
33013 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
33014 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
33016 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33017 "__builtin_ia32_vec_ext_v4hi",
33018 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
33020 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
33021 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
33023 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
33024 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
33026 /* Access to the vec_set patterns. */
33027 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
33028 "__builtin_ia32_vec_set_v2di",
33029 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
33031 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
33032 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
33034 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
33035 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
33037 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
33038 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
33040 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33041 "__builtin_ia32_vec_set_v4hi",
33042 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
33044 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
33045 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
33047 /* RDSEED */
33048 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
33049 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
33050 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
33051 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
33052 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
33053 "__builtin_ia32_rdseed_di_step",
33054 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
33056 /* ADCX */
33057 def_builtin (0, "__builtin_ia32_addcarryx_u32",
33058 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
33059 def_builtin (OPTION_MASK_ISA_64BIT,
33060 "__builtin_ia32_addcarryx_u64",
33061 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33062 IX86_BUILTIN_ADDCARRYX64);
33064 /* SBB */
33065 def_builtin (0, "__builtin_ia32_sbb_u32",
33066 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
33067 def_builtin (OPTION_MASK_ISA_64BIT,
33068 "__builtin_ia32_sbb_u64",
33069 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33070 IX86_BUILTIN_SBB64);
33072 /* Read/write FLAGS. */
33073 def_builtin (0, "__builtin_ia32_readeflags_u32",
33074 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33075 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
33076 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33077 def_builtin (0, "__builtin_ia32_writeeflags_u32",
33078 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
33079 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
33080 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
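/* Usage sketch for the FLAGS builtins: they read or write the raw
   EFLAGS/RFLAGS register, e.g. to test the carry flag (CF is bit 0 of
   the flags word).  Illustration only; the 64-bit variant is shown.  */
#if 0
static int
carry_flag_set_p (void)
{
  unsigned long long flags = __builtin_ia32_readeflags_u64 ();
  return (flags & 1) != 0;   /* bit 0 is CF  */
}
#endif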
33082 /* CLFLUSHOPT. */
33083 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
33084 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
33086 /* CLWB. */
33087 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
33088 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
33090 /* MONITORX and MWAITX. */
33091 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
33092 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
33093 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
33094 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
33096 /* CLZERO. */
33097 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
33098 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
33100 /* Add FMA4 multi-arg instructions. */
33101 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33103 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
33104 if (d->name == 0)
33105 continue;
33107 ftype = (enum ix86_builtin_func_type) d->flag;
33108 def_builtin_const (d->mask, d->name, ftype, d->code);
33110 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
33111 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
33112 ARRAY_SIZE (bdesc_multi_arg) - 1);
33115 static void
33116 ix86_init_mpx_builtins ()
33118 const struct builtin_description * d;
33119 enum ix86_builtin_func_type ftype;
33120 tree decl;
33121 size_t i;
33123 for (i = 0, d = bdesc_mpx;
33124 i < ARRAY_SIZE (bdesc_mpx);
33125 i++, d++)
33127 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
33128 if (d->name == 0)
33129 continue;
33131 ftype = (enum ix86_builtin_func_type) d->flag;
33132 decl = def_builtin (d->mask, d->name, ftype, d->code);
33134 /* Without the leaf and nothrow flags, abnormal edges may
33135 follow calls to MPX builtins when setjmp is present in
33136 the function. Since there may be many MPX builtin calls,
33137 this causes lots of useless edges and enormous PHI
33138 nodes. To avoid this we mark MPX builtins as leaf and
33139 nothrow. */
33140 if (decl)
33142 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33143 NULL_TREE);
33144 TREE_NOTHROW (decl) = 1;
33146 else
33148 ix86_builtins_isa[(int)d->code].leaf_p = true;
33149 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33152 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
33153 IX86_BUILTIN__BDESC_MPX_FIRST,
33154 ARRAY_SIZE (bdesc_mpx) - 1);
33156 for (i = 0, d = bdesc_mpx_const;
33157 i < ARRAY_SIZE (bdesc_mpx_const);
33158 i++, d++)
33160 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
33161 if (d->name == 0)
33162 continue;
33164 ftype = (enum ix86_builtin_func_type) d->flag;
33165 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
33167 if (decl)
33169 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33170 NULL_TREE);
33171 TREE_NOTHROW (decl) = 1;
33173 else
33175 ix86_builtins_isa[(int)d->code].leaf_p = true;
33176 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33179 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
33180 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
33181 ARRAY_SIZE (bdesc_mpx_const) - 1);
33183 #undef BDESC_VERIFY
33184 #undef BDESC_VERIFYS
33186 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
33187 to return a pointer to VERSION_DECL if the outcome of the expression
33188 formed by PREDICATE_CHAIN is true. This function will be called during
33189 version dispatch to decide which function version to execute. It returns
33190 the basic block at the end, to which more conditions can be added. */
33192 static basic_block
33193 add_condition_to_bb (tree function_decl, tree version_decl,
33194 tree predicate_chain, basic_block new_bb)
33196 gimple *return_stmt;
33197 tree convert_expr, result_var;
33198 gimple *convert_stmt;
33199 gimple *call_cond_stmt;
33200 gimple *if_else_stmt;
33202 basic_block bb1, bb2, bb3;
33203 edge e12, e23;
33205 tree cond_var, and_expr_var = NULL_TREE;
33206 gimple_seq gseq;
33208 tree predicate_decl, predicate_arg;
33210 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
33212 gcc_assert (new_bb != NULL);
33213 gseq = bb_seq (new_bb);
33216 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
33217 build_fold_addr_expr (version_decl));
33218 result_var = create_tmp_var (ptr_type_node);
33219 convert_stmt = gimple_build_assign (result_var, convert_expr);
33220 return_stmt = gimple_build_return (result_var);
33222 if (predicate_chain == NULL_TREE)
33224 gimple_seq_add_stmt (&gseq, convert_stmt);
33225 gimple_seq_add_stmt (&gseq, return_stmt);
33226 set_bb_seq (new_bb, gseq);
33227 gimple_set_bb (convert_stmt, new_bb);
33228 gimple_set_bb (return_stmt, new_bb);
33229 pop_cfun ();
33230 return new_bb;
33233 while (predicate_chain != NULL)
33235 cond_var = create_tmp_var (integer_type_node);
33236 predicate_decl = TREE_PURPOSE (predicate_chain);
33237 predicate_arg = TREE_VALUE (predicate_chain);
33238 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
33239 gimple_call_set_lhs (call_cond_stmt, cond_var);
33241 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
33242 gimple_set_bb (call_cond_stmt, new_bb);
33243 gimple_seq_add_stmt (&gseq, call_cond_stmt);
33245 predicate_chain = TREE_CHAIN (predicate_chain);
33247 if (and_expr_var == NULL)
33248 and_expr_var = cond_var;
33249 else
33251 gimple *assign_stmt;
33252 /* Use MIN_EXPR to check whether any integer is zero; the chain stays
33253 nonzero only if every predicate did: and_expr_var = MIN_EXPR <cond_var, and_expr_var>. */
33254 assign_stmt = gimple_build_assign (and_expr_var,
33255 build2 (MIN_EXPR, integer_type_node,
33256 cond_var, and_expr_var));
33258 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
33259 gimple_set_bb (assign_stmt, new_bb);
33260 gimple_seq_add_stmt (&gseq, assign_stmt);
33264 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
33265 integer_zero_node,
33266 NULL_TREE, NULL_TREE);
33267 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
33268 gimple_set_bb (if_else_stmt, new_bb);
33269 gimple_seq_add_stmt (&gseq, if_else_stmt);
33271 gimple_seq_add_stmt (&gseq, convert_stmt);
33272 gimple_seq_add_stmt (&gseq, return_stmt);
33273 set_bb_seq (new_bb, gseq);
33275 bb1 = new_bb;
33276 e12 = split_block (bb1, if_else_stmt);
33277 bb2 = e12->dest;
33278 e12->flags &= ~EDGE_FALLTHRU;
33279 e12->flags |= EDGE_TRUE_VALUE;
33281 e23 = split_block (bb2, return_stmt);
33283 gimple_set_bb (convert_stmt, bb2);
33284 gimple_set_bb (return_stmt, bb2);
33286 bb3 = e23->dest;
33287 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
33289 remove_edge (e23);
33290 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
33292 pop_cfun ();
33294 return bb3;
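/* For intuition, one call to add_condition_to_bb appends a block whose
   effect is roughly the following C (a sketch: the real code is GIMPLE,
   the version symbol is hypothetical, and the predicates come from
   PREDICATE_CHAIN; MIN_EXPR implements the logical AND).  */
#if 0
  {
    int c1 = __builtin_cpu_is ("haswell");
    int c2 = __builtin_cpu_supports ("avx2");
    if ((c1 < c2 ? c1 : c2) > 0)       /* MIN (c1, c2) > 0  */
      return (void *) foo_arch_haswell_avx2;
    /* otherwise fall through to the block returned to the caller,
       where the next condition will be appended  */
  }
#endif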
33297 /* This parses the attribute arguments to target in DECL and determines
33298 the right builtin to use to match the platform specification.
33299 It returns the priority value for this version decl. If PREDICATE_LIST
33300 is not NULL, it stores the list of cpu features that need to be checked
33301 before dispatching this function. */
33303 static unsigned int
33304 get_builtin_code_for_version (tree decl, tree *predicate_list)
33306 tree attrs;
33307 struct cl_target_option cur_target;
33308 tree target_node;
33309 struct cl_target_option *new_target;
33310 const char *arg_str = NULL;
33311 const char *attrs_str = NULL;
33312 char *tok_str = NULL;
33313 char *token;
33315 /* Priority of i386 features; a greater value means higher priority. This is
33316 used to decide the order in which function dispatch must happen. For
33317 instance, a version specialized for SSE4.2 should be checked for dispatch
33318 before a version for SSE3, as SSE4.2 implies SSE3. */
33319 enum feature_priority
33321 P_ZERO = 0,
33322 P_MMX,
33323 P_SSE,
33324 P_SSE2,
33325 P_SSE3,
33326 P_SSSE3,
33327 P_PROC_SSSE3,
33328 P_SSE4_A,
33329 P_PROC_SSE4_A,
33330 P_SSE4_1,
33331 P_SSE4_2,
33332 P_PROC_SSE4_2,
33333 P_POPCNT,
33334 P_AES,
33335 P_PCLMUL,
33336 P_AVX,
33337 P_PROC_AVX,
33338 P_BMI,
33339 P_PROC_BMI,
33340 P_FMA4,
33341 P_XOP,
33342 P_PROC_XOP,
33343 P_FMA,
33344 P_PROC_FMA,
33345 P_BMI2,
33346 P_AVX2,
33347 P_PROC_AVX2,
33348 P_AVX512F,
33349 P_PROC_AVX512F
33352 enum feature_priority priority = P_ZERO;
33354 /* These are the target attribute strings for which a dispatcher is
33355 available, from fold_builtin_cpu. */
33357 static struct _feature_list
33359 const char *const name;
33360 const enum feature_priority priority;
33362 const feature_list[] =
33364 {"mmx", P_MMX},
33365 {"sse", P_SSE},
33366 {"sse2", P_SSE2},
33367 {"sse3", P_SSE3},
33368 {"sse4a", P_SSE4_A},
33369 {"ssse3", P_SSSE3},
33370 {"sse4.1", P_SSE4_1},
33371 {"sse4.2", P_SSE4_2},
33372 {"popcnt", P_POPCNT},
33373 {"aes", P_AES},
33374 {"pclmul", P_PCLMUL},
33375 {"avx", P_AVX},
33376 {"bmi", P_BMI},
33377 {"fma4", P_FMA4},
33378 {"xop", P_XOP},
33379 {"fma", P_FMA},
33380 {"bmi2", P_BMI2},
33381 {"avx2", P_AVX2},
33382 {"avx512f", P_AVX512F}
33386 static unsigned int NUM_FEATURES
33387 = sizeof (feature_list) / sizeof (struct _feature_list);
33389 unsigned int i;
33391 tree predicate_chain = NULL_TREE;
33392 tree predicate_decl, predicate_arg;
33394 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33395 gcc_assert (attrs != NULL);
33397 attrs = TREE_VALUE (TREE_VALUE (attrs));
33399 gcc_assert (TREE_CODE (attrs) == STRING_CST);
33400 attrs_str = TREE_STRING_POINTER (attrs);
33402 /* Return priority zero for default function. */
33403 if (strcmp (attrs_str, "default") == 0)
33404 return 0;
33406 /* Handle arch= if specified. For priority, set it to be 1 more than
33407 the best instruction set the processor can handle. For instance, if
33408 there is a version for atom and a version for ssse3 (the highest ISA
33409 priority for atom), the atom version must be checked for dispatch
33410 before the ssse3 version. */
33411 if (strstr (attrs_str, "arch=") != NULL)
33413 cl_target_option_save (&cur_target, &global_options);
33414 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
33415 &global_options_set);
33417 gcc_assert (target_node);
33418 new_target = TREE_TARGET_OPTION (target_node);
33419 gcc_assert (new_target);
33421 if (new_target->arch_specified && new_target->arch > 0)
33423 switch (new_target->arch)
33425 case PROCESSOR_CORE2:
33426 arg_str = "core2";
33427 priority = P_PROC_SSSE3;
33428 break;
33429 case PROCESSOR_NEHALEM:
33430 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
33431 arg_str = "westmere";
33432 else
33433 /* We translate "arch=corei7" and "arch=nehalem" to
33434 "corei7" so that it will be mapped to M_INTEL_COREI7
33435 as cpu type to cover all M_INTEL_COREI7_XXXs. */
33436 arg_str = "corei7";
33437 priority = P_PROC_SSE4_2;
33438 break;
33439 case PROCESSOR_SANDYBRIDGE:
33440 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
33441 arg_str = "ivybridge";
33442 else
33443 arg_str = "sandybridge";
33444 priority = P_PROC_AVX;
33445 break;
33446 case PROCESSOR_HASWELL:
33447 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
33448 arg_str = "skylake-avx512";
33449 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
33450 arg_str = "skylake";
33451 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
33452 arg_str = "broadwell";
33453 else
33454 arg_str = "haswell";
33455 priority = P_PROC_AVX2;
33456 break;
33457 case PROCESSOR_BONNELL:
33458 arg_str = "bonnell";
33459 priority = P_PROC_SSSE3;
33460 break;
33461 case PROCESSOR_KNL:
33462 arg_str = "knl";
33463 priority = P_PROC_AVX512F;
33464 break;
33465 case PROCESSOR_SILVERMONT:
33466 arg_str = "silvermont";
33467 priority = P_PROC_SSE4_2;
33468 break;
33469 case PROCESSOR_AMDFAM10:
33470 arg_str = "amdfam10h";
33471 priority = P_PROC_SSE4_A;
33472 break;
33473 case PROCESSOR_BTVER1:
33474 arg_str = "btver1";
33475 priority = P_PROC_SSE4_A;
33476 break;
33477 case PROCESSOR_BTVER2:
33478 arg_str = "btver2";
33479 priority = P_PROC_BMI;
33480 break;
33481 case PROCESSOR_BDVER1:
33482 arg_str = "bdver1";
33483 priority = P_PROC_XOP;
33484 break;
33485 case PROCESSOR_BDVER2:
33486 arg_str = "bdver2";
33487 priority = P_PROC_FMA;
33488 break;
33489 case PROCESSOR_BDVER3:
33490 arg_str = "bdver3";
33491 priority = P_PROC_FMA;
33492 break;
33493 case PROCESSOR_BDVER4:
33494 arg_str = "bdver4";
33495 priority = P_PROC_AVX2;
33496 break;
33497 case PROCESSOR_ZNVER1:
33498 arg_str = "znver1";
33499 priority = P_PROC_AVX2;
33500 break;
33504 cl_target_option_restore (&global_options, &cur_target);
33506 if (predicate_list && arg_str == NULL)
33508 error_at (DECL_SOURCE_LOCATION (decl),
33509 "No dispatcher found for the versioning attributes");
33510 return 0;
33513 if (predicate_list)
33515 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
33516 /* For a C string literal the length includes the trailing NUL. */
33517 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
33518 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33519 predicate_chain);
33523 /* Process feature name. */
33524 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
33525 strcpy (tok_str, attrs_str);
33526 token = strtok (tok_str, ",");
33527 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
33529 while (token != NULL)
33531 /* Do not process "arch=" */
33532 if (strncmp (token, "arch=", 5) == 0)
33534 token = strtok (NULL, ",");
33535 continue;
33537 for (i = 0; i < NUM_FEATURES; ++i)
33539 if (strcmp (token, feature_list[i].name) == 0)
33541 if (predicate_list)
33543 predicate_arg = build_string_literal (
33544 strlen (feature_list[i].name) + 1,
33545 feature_list[i].name);
33546 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33547 predicate_chain);
33549 /* Find the maximum priority feature. */
33550 if (feature_list[i].priority > priority)
33551 priority = feature_list[i].priority;
33553 break;
33556 if (predicate_list && i == NUM_FEATURES)
33558 error_at (DECL_SOURCE_LOCATION (decl),
33559 "No dispatcher found for %s", token);
33560 return 0;
33562 token = strtok (NULL, ",");
33564 free (tok_str);
33566 if (predicate_list && predicate_chain == NULL_TREE)
33568 error_at (DECL_SOURCE_LOCATION (decl),
33569 "No dispatcher found for the versioning attributes : %s",
33570 attrs_str);
33571 return 0;
33573 else if (predicate_list)
33575 predicate_chain = nreverse (predicate_chain);
33576 *predicate_list = predicate_chain;
33579 return priority;
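/* Worked example (a sketch): in a C++ translation unit using function
   multiversioning, the declarations below would yield the priorities
   noted in the comments; "arch=haswell" additionally records a
   __builtin_cpu_is ("haswell") predicate for the dispatcher.  */
#if 0
__attribute__ ((target ("default")))      int foo (void);  /* priority 0  */
__attribute__ ((target ("sse4.2")))       int foo (void);  /* P_SSE4_2  */
__attribute__ ((target ("arch=haswell"))) int foo (void);  /* P_PROC_AVX2  */
#endif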
33582 /* This compares the priority of target features in function DECL1
33583 and DECL2. It returns positive value if DECL1 is higher priority,
33584 negative value if DECL2 is higher priority and 0 if they are the
33585 same. */
33587 static int
33588 ix86_compare_version_priority (tree decl1, tree decl2)
33590 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
33591 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
33593 return (int)priority1 - (int)priority2;
33596 /* V1 and V2 point to function versions with different priorities
33597 based on the target ISA. This function compares their priorities. */
33599 static int
33600 feature_compare (const void *v1, const void *v2)
33602 typedef struct _function_version_info
33604 tree version_decl;
33605 tree predicate_chain;
33606 unsigned int dispatch_priority;
33607 } function_version_info;
33609 const function_version_info c1 = *(const function_version_info *)v1;
33610 const function_version_info c2 = *(const function_version_info *)v2;
33611 return (c2.dispatch_priority - c1.dispatch_priority);
33614 /* This function generates the dispatch function for
33615 multi-versioned functions. DISPATCH_DECL is the function which will
33616 contain the dispatch logic. FNDECLS are the function choices for
33617 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
33618 in DISPATCH_DECL in which the dispatch code is generated. */
33620 static int
33621 dispatch_function_versions (tree dispatch_decl,
33622 void *fndecls_p,
33623 basic_block *empty_bb)
33625 tree default_decl;
33626 gimple *ifunc_cpu_init_stmt;
33627 gimple_seq gseq;
33628 int ix;
33629 tree ele;
33630 vec<tree> *fndecls;
33631 unsigned int num_versions = 0;
33632 unsigned int actual_versions = 0;
33633 unsigned int i;
33635 struct _function_version_info
33637 tree version_decl;
33638 tree predicate_chain;
33639 unsigned int dispatch_priority;
33640 }*function_version_info;
33642 gcc_assert (dispatch_decl != NULL
33643 && fndecls_p != NULL
33644 && empty_bb != NULL);
33646 /* fndecls_p is actually a vector. */
33647 fndecls = static_cast<vec<tree> *> (fndecls_p);
33649 /* At least one more version other than the default. */
33650 num_versions = fndecls->length ();
33651 gcc_assert (num_versions >= 2);
33653 function_version_info = (struct _function_version_info *)
33654 XNEWVEC (struct _function_version_info, (num_versions - 1));
33656 /* The first version in the vector is the default decl. */
33657 default_decl = (*fndecls)[0];
33659 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
33661 gseq = bb_seq (*empty_bb);
33662 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
33663 constructors, so explicitly call __builtin_cpu_init here. */
33664 ifunc_cpu_init_stmt = gimple_build_call_vec (
33665 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
33666 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
33667 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
33668 set_bb_seq (*empty_bb, gseq);
33670 pop_cfun ();
33673 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
33675 tree version_decl = ele;
33676 tree predicate_chain = NULL_TREE;
33677 unsigned int priority;
33678 /* Get attribute string, parse it and find the right predicate decl.
33679 The predicate function could be a lengthy combination of many
33680 features, like arch-type and various isa-variants. */
33681 priority = get_builtin_code_for_version (version_decl,
33682 &predicate_chain);
33684 if (predicate_chain == NULL_TREE)
33685 continue;
33687 function_version_info [actual_versions].version_decl = version_decl;
33688 function_version_info [actual_versions].predicate_chain
33689 = predicate_chain;
33690 function_version_info [actual_versions].dispatch_priority = priority;
33691 actual_versions++;
33694 /* Sort the versions according to descending order of dispatch priority. The
33695 priority is based on the ISA. This is not a perfect solution. There
33696 could still be ambiguity. If more than one function version is suitable
33697 to execute, which one should be dispatched? In future, allow the user
33698 to specify a dispatch priority next to the version. */
33699 qsort (function_version_info, actual_versions,
33700 sizeof (struct _function_version_info), feature_compare);
33702 for (i = 0; i < actual_versions; ++i)
33703 *empty_bb = add_condition_to_bb (dispatch_decl,
33704 function_version_info[i].version_decl,
33705 function_version_info[i].predicate_chain,
33706 *empty_bb);
33708 /* Dispatch the default version at the end.  */
33709 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
33710 NULL, *empty_bb);
33712 free (function_version_info);
33713 return 0;
33716 /* This function changes the assembler name for functions that are
33717 versions. If DECL is a function version and has a "target"
33718 attribute, it appends the attribute string to its assembler name. */
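/* For example (illustrative): a version of foo declared with
   __attribute__ ((target ("avx2"))) ends up with the assembler name
   "foo.avx2", built from the sorted attribute string, while the version
   marked "default" keeps its original assembler name.  */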
33720 static tree
33721 ix86_mangle_function_version_assembler_name (tree decl, tree id)
33723 tree version_attr;
33724 const char *orig_name, *version_string;
33725 char *attr_str, *assembler_name;
33727 if (DECL_DECLARED_INLINE_P (decl)
33728 && lookup_attribute ("gnu_inline",
33729 DECL_ATTRIBUTES (decl)))
33730 error_at (DECL_SOURCE_LOCATION (decl),
33731 "Function versions cannot be marked as gnu_inline,"
33732 " bodies have to be generated");
33734 if (DECL_VIRTUAL_P (decl)
33735 || DECL_VINDEX (decl))
33736 sorry ("Virtual function multiversioning not supported");
33738 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33740 /* The target attribute string cannot be NULL.  */
33741 gcc_assert (version_attr != NULL_TREE);
33743 orig_name = IDENTIFIER_POINTER (id);
33744 version_string
33745 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
33747 if (strcmp (version_string, "default") == 0)
33748 return id;
33750 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
33751 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
33753 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
33755 /* Allow assembler name to be modified if already set. */
33756 if (DECL_ASSEMBLER_NAME_SET_P (decl))
33757 SET_DECL_RTL (decl, NULL);
33759 tree ret = get_identifier (assembler_name);
33760 XDELETEVEC (attr_str);
33761 XDELETEVEC (assembler_name);
33762 return ret;
33766 static tree
33767 ix86_mangle_decl_assembler_name (tree decl, tree id)
33769 /* For function version, add the target suffix to the assembler name. */
33770 if (TREE_CODE (decl) == FUNCTION_DECL
33771 && DECL_FUNCTION_VERSIONED (decl))
33772 id = ix86_mangle_function_version_assembler_name (decl, id);
33773 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
33774 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
33775 #endif
33777 return id;
33780 /* Make a dispatcher declaration for the multi-versioned function DECL.
33781 Calls to the DECL function will be replaced with calls to the dispatcher
33782 by the front-end. Returns the decl of the dispatcher function. */
33784 static tree
33785 ix86_get_function_versions_dispatcher (void *decl)
33787 tree fn = (tree) decl;
33788 struct cgraph_node *node = NULL;
33789 struct cgraph_node *default_node = NULL;
33790 struct cgraph_function_version_info *node_v = NULL;
33791 struct cgraph_function_version_info *first_v = NULL;
33793 tree dispatch_decl = NULL;
33795 struct cgraph_function_version_info *default_version_info = NULL;
33797 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
33799 node = cgraph_node::get (fn);
33800 gcc_assert (node != NULL);
33802 node_v = node->function_version ();
33803 gcc_assert (node_v != NULL);
33805 if (node_v->dispatcher_resolver != NULL)
33806 return node_v->dispatcher_resolver;
33808 /* Find the default version and make it the first node. */
33809 first_v = node_v;
33810 /* Go to the beginning of the chain. */
33811 while (first_v->prev != NULL)
33812 first_v = first_v->prev;
33813 default_version_info = first_v;
33814 while (default_version_info != NULL)
33816 if (is_function_default_version
33817 (default_version_info->this_node->decl))
33818 break;
33819 default_version_info = default_version_info->next;
33822 /* If there is no default node, just return NULL. */
33823 if (default_version_info == NULL)
33824 return NULL;
33826 /* Make default info the first node. */
33827 if (first_v != default_version_info)
33829 default_version_info->prev->next = default_version_info->next;
33830 if (default_version_info->next)
33831 default_version_info->next->prev = default_version_info->prev;
33832 first_v->prev = default_version_info;
33833 default_version_info->next = first_v;
33834 default_version_info->prev = NULL;
33837 default_node = default_version_info->this_node;
33839 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
33840 if (targetm.has_ifunc_p ())
33842 struct cgraph_function_version_info *it_v = NULL;
33843 struct cgraph_node *dispatcher_node = NULL;
33844 struct cgraph_function_version_info *dispatcher_version_info = NULL;
33846 /* Right now, the dispatching is done via ifunc. */
33847 dispatch_decl = make_dispatcher_decl (default_node->decl);
33849 dispatcher_node = cgraph_node::get_create (dispatch_decl);
33850 gcc_assert (dispatcher_node != NULL);
33851 dispatcher_node->dispatcher_function = 1;
33852 dispatcher_version_info
33853 = dispatcher_node->insert_new_function_version ();
33854 dispatcher_version_info->next = default_version_info;
33855 dispatcher_node->definition = 1;
33857 /* Set the dispatcher for all the versions. */
33858 it_v = default_version_info;
33859 while (it_v != NULL)
33861 it_v->dispatcher_resolver = dispatch_decl;
33862 it_v = it_v->next;
33865 else
33866 #endif
33868 error_at (DECL_SOURCE_LOCATION (default_node->decl),
33869 "multiversioning needs ifunc which is not supported "
33870 "on this target");
33873 return dispatch_decl;
33876 /* Make the resolver function decl to dispatch the versions of
33877 a multi-versioned function, DEFAULT_DECL. Create an
33878 empty basic block in the resolver and store the pointer in
33879 EMPTY_BB. Return the decl of the resolver function. */
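/* Roughly, the resolver created here (and later filled in by
   dispatch_function_versions) behaves like the following sketch; the
   names are illustrative only:

     void *foo_resolver (void)
     {
       __builtin_cpu_init ();
       if (<predicate for highest-priority version>)
	 return <that version>;
       ...
       return <default version>;
     }

   The dispatcher decl is then marked with the "ifunc" attribute naming
   this resolver.  */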
33881 static tree
33882 make_resolver_func (const tree default_decl,
33883 const tree dispatch_decl,
33884 basic_block *empty_bb)
33886 char *resolver_name;
33887 tree decl, type, decl_name, t;
33888 bool is_uniq = false;
33890 /* IFUNCs have to be globally visible.  So, if the default_decl is
33891 not, then the name of the IFUNC should be made unique. */
33892 if (TREE_PUBLIC (default_decl) == 0)
33893 is_uniq = true;
33895 /* Append the filename to the resolver function's name if the versions are
33896 not externally visible. This is because the resolver function has
33897 to be externally visible for the loader to find it. So, appending
33898 the filename will prevent conflicts with a resolver function from
33899 another module which is based on the same version name. */
33900 resolver_name = make_unique_name (default_decl, "resolver", is_uniq);
33902 /* The resolver function should return a (void *). */
33903 type = build_function_type_list (ptr_type_node, NULL_TREE);
33905 decl = build_fn_decl (resolver_name, type);
33906 decl_name = get_identifier (resolver_name);
33907 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
33909 DECL_NAME (decl) = decl_name;
33910 TREE_USED (decl) = 1;
33911 DECL_ARTIFICIAL (decl) = 1;
33912 DECL_IGNORED_P (decl) = 0;
33913 /* IFUNC resolvers have to be externally visible. */
33914 TREE_PUBLIC (decl) = 1;
33915 DECL_UNINLINABLE (decl) = 1;
33917 /* Resolver is not external, body is generated. */
33918 DECL_EXTERNAL (decl) = 0;
33919 DECL_EXTERNAL (dispatch_decl) = 0;
33921 DECL_CONTEXT (decl) = NULL_TREE;
33922 DECL_INITIAL (decl) = make_node (BLOCK);
33923 DECL_STATIC_CONSTRUCTOR (decl) = 0;
33925 if (DECL_COMDAT_GROUP (default_decl)
33926 || TREE_PUBLIC (default_decl))
33928 /* In this case, each translation unit with a call to this
33929 versioned function will put out a resolver. Ensure it
33930 is comdat to keep just one copy. */
33931 DECL_COMDAT (decl) = 1;
33932 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
33934 /* Build result decl and add to function_decl. */
33935 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
33936 DECL_ARTIFICIAL (t) = 1;
33937 DECL_IGNORED_P (t) = 1;
33938 DECL_RESULT (decl) = t;
33940 gimplify_function_tree (decl);
33941 push_cfun (DECL_STRUCT_FUNCTION (decl));
33942 *empty_bb = init_lowered_empty_function (decl, false,
33943 profile_count::uninitialized ());
33945 cgraph_node::add_new_function (decl, true);
33946 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
33948 pop_cfun ();
33950 gcc_assert (dispatch_decl != NULL);
33951 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
33952 DECL_ATTRIBUTES (dispatch_decl)
33953 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
33955 /* Create the alias for dispatch to resolver here. */
33956 /*cgraph_create_function_alias (dispatch_decl, decl);*/
33957 cgraph_node::create_same_body_alias (dispatch_decl, decl);
33958 XDELETEVEC (resolver_name);
33959 return decl;
33962 /* Generate the dispatching code body to dispatch multi-versioned function
33963 DECL. The target hook is called to process the "target" attributes and
33964 provide the code to dispatch the right function at run-time. NODE points
33965 to the dispatcher decl whose body will be created. */
33967 static tree
33968 ix86_generate_version_dispatcher_body (void *node_p)
33970 tree resolver_decl;
33971 basic_block empty_bb;
33972 tree default_ver_decl;
33973 struct cgraph_node *versn;
33974 struct cgraph_node *node;
33976 struct cgraph_function_version_info *node_version_info = NULL;
33977 struct cgraph_function_version_info *versn_info = NULL;
33979 node = (cgraph_node *)node_p;
33981 node_version_info = node->function_version ();
33982 gcc_assert (node->dispatcher_function
33983 && node_version_info != NULL);
33985 if (node_version_info->dispatcher_resolver)
33986 return node_version_info->dispatcher_resolver;
33988 /* The first version in the chain corresponds to the default version. */
33989 default_ver_decl = node_version_info->next->this_node->decl;
33991 /* node is going to be an alias, so remove the finalized bit. */
33992 node->definition = false;
33994 resolver_decl = make_resolver_func (default_ver_decl,
33995 node->decl, &empty_bb);
33997 node_version_info->dispatcher_resolver = resolver_decl;
33999 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
34001 auto_vec<tree, 2> fn_ver_vec;
34003 for (versn_info = node_version_info->next; versn_info;
34004 versn_info = versn_info->next)
34006 versn = versn_info->this_node;
34007 /* Check for virtual functions here again, as by this time it should
34008 have been determined if this function needs a vtable index or
34009 not. This happens for methods in derived classes that override
34010 virtual methods in base classes but are not explicitly marked as
34011 virtual. */
34012 if (DECL_VINDEX (versn->decl))
34013 sorry ("Virtual function multiversioning not supported");
34015 fn_ver_vec.safe_push (versn->decl);
34018 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
34019 cgraph_edge::rebuild_edges ();
34020 pop_cfun ();
34021 return resolver_decl;
34023 /* This builds the processor_model struct type defined in
34024 libgcc/config/i386/cpuinfo.c.  */
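/* For reference, the libgcc side declares approximately:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };

   The type built below is meant to stay layout-compatible with it.  */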
34026 static tree
34027 build_processor_model_struct (void)
34029 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
34030 "__cpu_features"};
34031 tree field = NULL_TREE, field_chain = NULL_TREE;
34032 int i;
34033 tree type = make_node (RECORD_TYPE);
34035 /* The first 3 fields are unsigned int. */
34036 for (i = 0; i < 3; ++i)
34038 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34039 get_identifier (field_name[i]), unsigned_type_node);
34040 if (field_chain != NULL_TREE)
34041 DECL_CHAIN (field) = field_chain;
34042 field_chain = field;
34045 /* The last field is an array of unsigned integers of size one. */
34046 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34047 get_identifier (field_name[3]),
34048 build_array_type (unsigned_type_node,
34049 build_index_type (size_one_node)));
34050 if (field_chain != NULL_TREE)
34051 DECL_CHAIN (field) = field_chain;
34052 field_chain = field;
34054 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
34055 return type;
34058 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME.  */
34060 static tree
34061 make_var_decl (tree type, const char *name)
34063 tree new_decl;
34065 new_decl = build_decl (UNKNOWN_LOCATION,
34066 VAR_DECL,
34067 get_identifier (name),
34068 type);
34070 DECL_EXTERNAL (new_decl) = 1;
34071 TREE_STATIC (new_decl) = 1;
34072 TREE_PUBLIC (new_decl) = 1;
34073 DECL_INITIAL (new_decl) = 0;
34074 DECL_ARTIFICIAL (new_decl) = 0;
34075 DECL_PRESERVE_P (new_decl) = 1;
34077 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
34078 assemble_variable (new_decl, 0, 0, 0);
34080 return new_decl;
34083 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
34084 into an integer test of the __cpu_model data defined in libgcc/config/i386/cpuinfo.c.  */
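/* For instance (illustrative), __builtin_cpu_is ("amd") folds to a test of
   __cpu_model.__cpu_vendor against M_AMD, and __builtin_cpu_supports ("avx2")
   folds to __cpu_model.__cpu_features[0] & (1 << F_AVX2), each converted to
   an int result.  */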
34086 static tree
34087 fold_builtin_cpu (tree fndecl, tree *args)
34089 unsigned int i;
34090 enum ix86_builtins fn_code = (enum ix86_builtins)
34091 DECL_FUNCTION_CODE (fndecl);
34092 tree param_string_cst = NULL;
34094 /* This is the order of bit-fields in __processor_features in cpuinfo.c.  */
34095 enum processor_features
34097 F_CMOV = 0,
34098 F_MMX,
34099 F_POPCNT,
34100 F_SSE,
34101 F_SSE2,
34102 F_SSE3,
34103 F_SSSE3,
34104 F_SSE4_1,
34105 F_SSE4_2,
34106 F_AVX,
34107 F_AVX2,
34108 F_SSE4_A,
34109 F_FMA4,
34110 F_XOP,
34111 F_FMA,
34112 F_AVX512F,
34113 F_BMI,
34114 F_BMI2,
34115 F_AES,
34116 F_PCLMUL,
34117 F_AVX512VL,
34118 F_AVX512BW,
34119 F_AVX512DQ,
34120 F_AVX512CD,
34121 F_AVX512ER,
34122 F_AVX512PF,
34123 F_AVX512VBMI,
34124 F_AVX512IFMA,
34125 F_AVX5124VNNIW,
34126 F_AVX5124FMAPS,
34127 F_AVX512VPOPCNTDQ,
34128 F_MAX
34131 /* These are the values for vendor types and CPU types and subtypes
34132 in cpuinfo.c.  CPU types and subtypes must have the corresponding
34133 start value subtracted from them.  */
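/* For example, the code below compares __cpu_model.__cpu_type against
   M_INTEL_BONNELL - M_CPU_TYPE_START for __builtin_cpu_is ("bonnell"), and
   __cpu_model.__cpu_subtype against M_INTEL_COREI7_NEHALEM - M_CPU_SUBTYPE_START
   for __builtin_cpu_is ("nehalem").  */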
34134 enum processor_model
34136 M_INTEL = 1,
34137 M_AMD,
34138 M_CPU_TYPE_START,
34139 M_INTEL_BONNELL,
34140 M_INTEL_CORE2,
34141 M_INTEL_COREI7,
34142 M_AMDFAM10H,
34143 M_AMDFAM15H,
34144 M_INTEL_SILVERMONT,
34145 M_INTEL_KNL,
34146 M_AMD_BTVER1,
34147 M_AMD_BTVER2,
34148 M_CPU_SUBTYPE_START,
34149 M_INTEL_COREI7_NEHALEM,
34150 M_INTEL_COREI7_WESTMERE,
34151 M_INTEL_COREI7_SANDYBRIDGE,
34152 M_AMDFAM10H_BARCELONA,
34153 M_AMDFAM10H_SHANGHAI,
34154 M_AMDFAM10H_ISTANBUL,
34155 M_AMDFAM15H_BDVER1,
34156 M_AMDFAM15H_BDVER2,
34157 M_AMDFAM15H_BDVER3,
34158 M_AMDFAM15H_BDVER4,
34159 M_AMDFAM17H_ZNVER1,
34160 M_INTEL_COREI7_IVYBRIDGE,
34161 M_INTEL_COREI7_HASWELL,
34162 M_INTEL_COREI7_BROADWELL,
34163 M_INTEL_COREI7_SKYLAKE,
34164 M_INTEL_COREI7_SKYLAKE_AVX512
34167 static struct _arch_names_table
34169 const char *const name;
34170 const enum processor_model model;
34172 const arch_names_table[] =
34174 {"amd", M_AMD},
34175 {"intel", M_INTEL},
34176 {"atom", M_INTEL_BONNELL},
34177 {"slm", M_INTEL_SILVERMONT},
34178 {"core2", M_INTEL_CORE2},
34179 {"corei7", M_INTEL_COREI7},
34180 {"nehalem", M_INTEL_COREI7_NEHALEM},
34181 {"westmere", M_INTEL_COREI7_WESTMERE},
34182 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
34183 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
34184 {"haswell", M_INTEL_COREI7_HASWELL},
34185 {"broadwell", M_INTEL_COREI7_BROADWELL},
34186 {"skylake", M_INTEL_COREI7_SKYLAKE},
34187 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
34188 {"bonnell", M_INTEL_BONNELL},
34189 {"silvermont", M_INTEL_SILVERMONT},
34190 {"knl", M_INTEL_KNL},
34191 {"amdfam10h", M_AMDFAM10H},
34192 {"barcelona", M_AMDFAM10H_BARCELONA},
34193 {"shanghai", M_AMDFAM10H_SHANGHAI},
34194 {"istanbul", M_AMDFAM10H_ISTANBUL},
34195 {"btver1", M_AMD_BTVER1},
34196 {"amdfam15h", M_AMDFAM15H},
34197 {"bdver1", M_AMDFAM15H_BDVER1},
34198 {"bdver2", M_AMDFAM15H_BDVER2},
34199 {"bdver3", M_AMDFAM15H_BDVER3},
34200 {"bdver4", M_AMDFAM15H_BDVER4},
34201 {"btver2", M_AMD_BTVER2},
34202 {"znver1", M_AMDFAM17H_ZNVER1},
34205 static struct _isa_names_table
34207 const char *const name;
34208 const enum processor_features feature;
34210 const isa_names_table[] =
34212 {"cmov", F_CMOV},
34213 {"mmx", F_MMX},
34214 {"popcnt", F_POPCNT},
34215 {"sse", F_SSE},
34216 {"sse2", F_SSE2},
34217 {"sse3", F_SSE3},
34218 {"ssse3", F_SSSE3},
34219 {"sse4a", F_SSE4_A},
34220 {"sse4.1", F_SSE4_1},
34221 {"sse4.2", F_SSE4_2},
34222 {"avx", F_AVX},
34223 {"fma4", F_FMA4},
34224 {"xop", F_XOP},
34225 {"fma", F_FMA},
34226 {"avx2", F_AVX2},
34227 {"avx512f", F_AVX512F},
34228 {"bmi", F_BMI},
34229 {"bmi2", F_BMI2},
34230 {"aes", F_AES},
34231 {"pclmul", F_PCLMUL},
34232 {"avx512vl",F_AVX512VL},
34233 {"avx512bw",F_AVX512BW},
34234 {"avx512dq",F_AVX512DQ},
34235 {"avx512cd",F_AVX512CD},
34236 {"avx512er",F_AVX512ER},
34237 {"avx512pf",F_AVX512PF},
34238 {"avx512vbmi",F_AVX512VBMI},
34239 {"avx512ifma",F_AVX512IFMA},
34240 {"avx5124vnniw",F_AVX5124VNNIW},
34241 {"avx5124fmaps",F_AVX5124FMAPS},
34242 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
34245 tree __processor_model_type = build_processor_model_struct ();
34246 tree __cpu_model_var = make_var_decl (__processor_model_type,
34247 "__cpu_model");
34250 varpool_node::add (__cpu_model_var);
34252 gcc_assert ((args != NULL) && (*args != NULL));
34254 param_string_cst = *args;
34255 while (param_string_cst
34256 && TREE_CODE (param_string_cst) != STRING_CST)
34258 /* *args must be an expr that can contain other EXPRs leading to a
34259 STRING_CST. */
34260 if (!EXPR_P (param_string_cst))
34262 error ("Parameter to builtin must be a string constant or literal");
34263 return integer_zero_node;
34265 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
34268 gcc_assert (param_string_cst);
34270 if (fn_code == IX86_BUILTIN_CPU_IS)
34272 tree ref;
34273 tree field;
34274 tree final;
34276 unsigned int field_val = 0;
34277 unsigned int NUM_ARCH_NAMES
34278 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
34280 for (i = 0; i < NUM_ARCH_NAMES; i++)
34281 if (strcmp (arch_names_table[i].name,
34282 TREE_STRING_POINTER (param_string_cst)) == 0)
34283 break;
34285 if (i == NUM_ARCH_NAMES)
34287 error ("Parameter to builtin not valid: %s",
34288 TREE_STRING_POINTER (param_string_cst));
34289 return integer_zero_node;
34292 field = TYPE_FIELDS (__processor_model_type);
34293 field_val = arch_names_table[i].model;
34295 /* CPU types are stored in the next field. */
34296 if (field_val > M_CPU_TYPE_START
34297 && field_val < M_CPU_SUBTYPE_START)
34299 field = DECL_CHAIN (field);
34300 field_val -= M_CPU_TYPE_START;
34303 /* CPU subtypes are stored in the next field. */
34304 if (field_val > M_CPU_SUBTYPE_START)
34306 field = DECL_CHAIN (DECL_CHAIN (field));
34307 field_val -= M_CPU_SUBTYPE_START;
34310 /* Get the appropriate field in __cpu_model. */
34311 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34312 field, NULL_TREE);
34314 /* Check the value. */
34315 final = build2 (EQ_EXPR, unsigned_type_node, ref,
34316 build_int_cstu (unsigned_type_node, field_val));
34317 return build1 (CONVERT_EXPR, integer_type_node, final);
34319 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
34321 tree ref;
34322 tree array_elt;
34323 tree field;
34324 tree final;
34326 unsigned int field_val = 0;
34327 unsigned int NUM_ISA_NAMES
34328 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
34330 for (i = 0; i < NUM_ISA_NAMES; i++)
34331 if (strcmp (isa_names_table[i].name,
34332 TREE_STRING_POINTER (param_string_cst)) == 0)
34333 break;
34335 if (i == NUM_ISA_NAMES)
34337 error ("Parameter to builtin not valid: %s",
34338 TREE_STRING_POINTER (param_string_cst));
34339 return integer_zero_node;
34342 field = TYPE_FIELDS (__processor_model_type);
34343 /* Get the last field, which is __cpu_features. */
34344 while (DECL_CHAIN (field))
34345 field = DECL_CHAIN (field);
34347 /* Get the appropriate field: __cpu_model.__cpu_features */
34348 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34349 field, NULL_TREE);
34351 /* Access the 0th element of __cpu_features array. */
34352 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
34353 integer_zero_node, NULL_TREE, NULL_TREE);
34355 field_val = (1 << isa_names_table[i].feature);
34356 /* Return __cpu_model.__cpu_features[0] & field_val */
34357 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
34358 build_int_cstu (unsigned_type_node, field_val));
34359 return build1 (CONVERT_EXPR, integer_type_node, final);
34361 gcc_unreachable ();
34364 static tree
34365 ix86_fold_builtin (tree fndecl, int n_args,
34366 tree *args, bool ignore ATTRIBUTE_UNUSED)
34368 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
34370 enum ix86_builtins fn_code = (enum ix86_builtins)
34371 DECL_FUNCTION_CODE (fndecl);
34372 switch (fn_code)
34374 case IX86_BUILTIN_CPU_IS:
34375 case IX86_BUILTIN_CPU_SUPPORTS:
34376 gcc_assert (n_args == 1);
34377 return fold_builtin_cpu (fndecl, args);
34379 case IX86_BUILTIN_NANQ:
34380 case IX86_BUILTIN_NANSQ:
34382 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34383 const char *str = c_getstr (*args);
34384 int quiet = fn_code == IX86_BUILTIN_NANQ;
34385 REAL_VALUE_TYPE real;
34387 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
34388 return build_real (type, real);
34389 return NULL_TREE;
34392 case IX86_BUILTIN_INFQ:
34393 case IX86_BUILTIN_HUGE_VALQ:
34395 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34396 REAL_VALUE_TYPE inf;
34397 real_inf (&inf);
34398 return build_real (type, inf);
34401 case IX86_BUILTIN_TZCNT16:
34402 case IX86_BUILTIN_CTZS:
34403 case IX86_BUILTIN_TZCNT32:
34404 case IX86_BUILTIN_TZCNT64:
34405 gcc_assert (n_args == 1);
34406 if (TREE_CODE (args[0]) == INTEGER_CST)
34408 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34409 tree arg = args[0];
34410 if (fn_code == IX86_BUILTIN_TZCNT16
34411 || fn_code == IX86_BUILTIN_CTZS)
34412 arg = fold_convert (short_unsigned_type_node, arg);
34413 if (integer_zerop (arg))
34414 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34415 else
34416 return fold_const_call (CFN_CTZ, type, arg);
34418 break;
34420 case IX86_BUILTIN_LZCNT16:
34421 case IX86_BUILTIN_CLZS:
34422 case IX86_BUILTIN_LZCNT32:
34423 case IX86_BUILTIN_LZCNT64:
34424 gcc_assert (n_args == 1);
34425 if (TREE_CODE (args[0]) == INTEGER_CST)
34427 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34428 tree arg = args[0];
34429 if (fn_code == IX86_BUILTIN_LZCNT16
34430 || fn_code == IX86_BUILTIN_CLZS)
34431 arg = fold_convert (short_unsigned_type_node, arg);
34432 if (integer_zerop (arg))
34433 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34434 else
34435 return fold_const_call (CFN_CLZ, type, arg);
34437 break;
34439 case IX86_BUILTIN_BEXTR32:
34440 case IX86_BUILTIN_BEXTR64:
34441 case IX86_BUILTIN_BEXTRI32:
34442 case IX86_BUILTIN_BEXTRI64:
34443 gcc_assert (n_args == 2);
34444 if (tree_fits_uhwi_p (args[1]))
34446 unsigned HOST_WIDE_INT res = 0;
34447 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
34448 unsigned int start = tree_to_uhwi (args[1]);
34449 unsigned int len = (start & 0xff00) >> 8;
34450 start &= 0xff;
34451 if (start >= prec || len == 0)
34452 res = 0;
34453 else if (!tree_fits_uhwi_p (args[0]))
34454 break;
34455 else
34456 res = tree_to_uhwi (args[0]) >> start;
34457 if (len > prec)
34458 len = prec;
34459 if (len < HOST_BITS_PER_WIDE_INT)
34460 res &= (HOST_WIDE_INT_1U << len) - 1;
34461 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34463 break;
34465 case IX86_BUILTIN_BZHI32:
34466 case IX86_BUILTIN_BZHI64:
34467 gcc_assert (n_args == 2);
34468 if (tree_fits_uhwi_p (args[1]))
34470 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
34471 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
34472 return args[0];
34473 if (!tree_fits_uhwi_p (args[0]))
34474 break;
34475 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
34476 res &= ~(HOST_WIDE_INT_M1U << idx);
34477 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34479 break;
34481 case IX86_BUILTIN_PDEP32:
34482 case IX86_BUILTIN_PDEP64:
34483 gcc_assert (n_args == 2);
34484 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34486 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34487 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34488 unsigned HOST_WIDE_INT res = 0;
34489 unsigned HOST_WIDE_INT m, k = 1;
34490 for (m = 1; m; m <<= 1)
34491 if ((mask & m) != 0)
34493 if ((src & k) != 0)
34494 res |= m;
34495 k <<= 1;
34497 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34499 break;
34501 case IX86_BUILTIN_PEXT32:
34502 case IX86_BUILTIN_PEXT64:
34503 gcc_assert (n_args == 2);
34504 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34506 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34507 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34508 unsigned HOST_WIDE_INT res = 0;
34509 unsigned HOST_WIDE_INT m, k = 1;
34510 for (m = 1; m; m <<= 1)
34511 if ((mask & m) != 0)
34513 if ((src & m) != 0)
34514 res |= k;
34515 k <<= 1;
34517 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34519 break;
34521 default:
34522 break;
34526 #ifdef SUBTARGET_FOLD_BUILTIN
34527 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
34528 #endif
34530 return NULL_TREE;
34533 /* Fold an MD builtin (use ix86_fold_builtin for folding into
34534 constant) in GIMPLE. */
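/* As an illustration, when its argument is known to be nonzero, a call to
   the tzcnt builtin is rewritten below into the generic __builtin_ctz,
   e.g. (sketch):

     x = __builtin_ia32_tzcnt_u32 (y);   -->   x = (int) __builtin_ctz (y);

   which the middle-end understands and can optimize further.  */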
34536 bool
34537 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
34539 gimple *stmt = gsi_stmt (*gsi);
34540 tree fndecl = gimple_call_fndecl (stmt);
34541 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
34542 int n_args = gimple_call_num_args (stmt);
34543 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
34544 tree decl = NULL_TREE;
34545 tree arg0, arg1;
34547 switch (fn_code)
34549 case IX86_BUILTIN_TZCNT32:
34550 decl = builtin_decl_implicit (BUILT_IN_CTZ);
34551 goto fold_tzcnt_lzcnt;
34553 case IX86_BUILTIN_TZCNT64:
34554 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
34555 goto fold_tzcnt_lzcnt;
34557 case IX86_BUILTIN_LZCNT32:
34558 decl = builtin_decl_implicit (BUILT_IN_CLZ);
34559 goto fold_tzcnt_lzcnt;
34561 case IX86_BUILTIN_LZCNT64:
34562 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
34563 goto fold_tzcnt_lzcnt;
34565 fold_tzcnt_lzcnt:
34566 gcc_assert (n_args == 1);
34567 arg0 = gimple_call_arg (stmt, 0);
34568 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
34570 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
34571 /* If arg0 is provably non-zero, optimize into the generic
34572 __builtin_c[tl]z{,ll} function, which the middle-end handles
34573 better. */
34574 if (!expr_not_equal_to (arg0, wi::zero (prec)))
34575 return false;
34577 location_t loc = gimple_location (stmt);
34578 gimple *g = gimple_build_call (decl, 1, arg0);
34579 gimple_set_location (g, loc);
34580 tree lhs = make_ssa_name (integer_type_node);
34581 gimple_call_set_lhs (g, lhs);
34582 gsi_insert_before (gsi, g, GSI_SAME_STMT);
34583 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
34584 gimple_set_location (g, loc);
34585 gsi_replace (gsi, g, false);
34586 return true;
34588 break;
34590 case IX86_BUILTIN_BZHI32:
34591 case IX86_BUILTIN_BZHI64:
34592 gcc_assert (n_args == 2);
34593 arg1 = gimple_call_arg (stmt, 1);
34594 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
34596 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
34597 arg0 = gimple_call_arg (stmt, 0);
34598 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
34599 break;
34600 location_t loc = gimple_location (stmt);
34601 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34602 gimple_set_location (g, loc);
34603 gsi_replace (gsi, g, false);
34604 return true;
34606 break;
34608 case IX86_BUILTIN_PDEP32:
34609 case IX86_BUILTIN_PDEP64:
34610 case IX86_BUILTIN_PEXT32:
34611 case IX86_BUILTIN_PEXT64:
34612 gcc_assert (n_args == 2);
34613 arg1 = gimple_call_arg (stmt, 1);
34614 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
34616 location_t loc = gimple_location (stmt);
34617 arg0 = gimple_call_arg (stmt, 0);
34618 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34619 gimple_set_location (g, loc);
34620 gsi_replace (gsi, g, false);
34621 return true;
34623 break;
34625 default:
34626 break;
34629 return false;
34632 /* Make builtins to detect cpu type and features supported. NAME is
34633 the builtin name, CODE is the builtin code, and FTYPE is the function
34634 type of the builtin. */
34636 static void
34637 make_cpu_type_builtin (const char* name, int code,
34638 enum ix86_builtin_func_type ftype, bool is_const)
34640 tree decl;
34641 tree type;
34643 type = ix86_get_builtin_func_type (ftype);
34644 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
34645 NULL, NULL_TREE);
34646 gcc_assert (decl != NULL_TREE);
34647 ix86_builtins[(int) code] = decl;
34648 TREE_READONLY (decl) = is_const;
34651 /* Make builtins to get CPU type and features supported. The created
34652 builtins are:
34654 __builtin_cpu_init (), to detect cpu type and features,
34655 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
34656 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
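/* Typical use in user code (illustrative; run_*_path are hypothetical
   user functions):

     if (__builtin_cpu_supports ("avx2"))
       run_avx2_path ();
     else if (__builtin_cpu_is ("bonnell"))
       run_atom_path ();
     else
       run_generic_path ();

   libgcc normally initializes __cpu_model from a constructor, but code
   running before constructors (or inside one) should call
   __builtin_cpu_init () first.  */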
34659 static void
34660 ix86_init_platform_type_builtins (void)
34662 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
34663 INT_FTYPE_VOID, false);
34664 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
34665 INT_FTYPE_PCCHAR, true);
34666 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
34667 INT_FTYPE_PCCHAR, true);
34670 /* Internal method for ix86_init_builtins. */
34672 static void
34673 ix86_init_builtins_va_builtins_abi (void)
34675 tree ms_va_ref, sysv_va_ref;
34676 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
34677 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
34678 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
34679 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
34681 if (!TARGET_64BIT)
34682 return;
34683 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
34684 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
34685 ms_va_ref = build_reference_type (ms_va_list_type_node);
34686 sysv_va_ref =
34687 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
34689 fnvoid_va_end_ms =
34690 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34691 fnvoid_va_start_ms =
34692 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34693 fnvoid_va_end_sysv =
34694 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
34695 fnvoid_va_start_sysv =
34696 build_varargs_function_type_list (void_type_node, sysv_va_ref,
34697 NULL_TREE);
34698 fnvoid_va_copy_ms =
34699 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
34700 NULL_TREE);
34701 fnvoid_va_copy_sysv =
34702 build_function_type_list (void_type_node, sysv_va_ref,
34703 sysv_va_ref, NULL_TREE);
34705 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
34706 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
34707 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
34708 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
34709 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
34710 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
34711 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
34712 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34713 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
34714 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34715 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
34716 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34719 static void
34720 ix86_init_builtin_types (void)
34722 tree float80_type_node, const_string_type_node;
34724 /* The __float80 type. */
34725 float80_type_node = long_double_type_node;
34726 if (TYPE_MODE (float80_type_node) != XFmode)
34728 if (float64x_type_node != NULL_TREE
34729 && TYPE_MODE (float64x_type_node) == XFmode)
34730 float80_type_node = float64x_type_node;
34731 else
34733 /* The __float80 type. */
34734 float80_type_node = make_node (REAL_TYPE);
34736 TYPE_PRECISION (float80_type_node) = 80;
34737 layout_type (float80_type_node);
34740 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
34742 /* The __float128 type. The node has already been created as
34743 _Float128, so we only need to register the __float128 name for
34744 it. */
34745 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
34747 const_string_type_node
34748 = build_pointer_type (build_qualified_type
34749 (char_type_node, TYPE_QUAL_CONST));
34751 /* This macro is built by i386-builtin-types.awk. */
34752 DEFINE_BUILTIN_PRIMITIVE_TYPES;
34755 static void
34756 ix86_init_builtins (void)
34758 tree ftype, decl;
34760 ix86_init_builtin_types ();
34762 /* Builtins to get CPU type and features. */
34763 ix86_init_platform_type_builtins ();
34765 /* TFmode support builtins. */
34766 def_builtin_const (0, "__builtin_infq",
34767 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
34768 def_builtin_const (0, "__builtin_huge_valq",
34769 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
34771 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
34772 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
34773 BUILT_IN_MD, "nanq", NULL_TREE);
34774 TREE_READONLY (decl) = 1;
34775 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
34777 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
34778 BUILT_IN_MD, "nansq", NULL_TREE);
34779 TREE_READONLY (decl) = 1;
34780 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
34782 /* We will expand them to a normal call if SSE isn't available since
34783 they are used by libgcc. */
34784 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
34785 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
34786 BUILT_IN_MD, "__fabstf2", NULL_TREE);
34787 TREE_READONLY (decl) = 1;
34788 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
34790 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
34791 decl = add_builtin_function ("__builtin_copysignq", ftype,
34792 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
34793 "__copysigntf3", NULL_TREE);
34794 TREE_READONLY (decl) = 1;
34795 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
34797 ix86_init_tm_builtins ();
34798 ix86_init_mmx_sse_builtins ();
34799 ix86_init_mpx_builtins ();
34801 if (TARGET_LP64)
34802 ix86_init_builtins_va_builtins_abi ();
34804 #ifdef SUBTARGET_INIT_BUILTINS
34805 SUBTARGET_INIT_BUILTINS;
34806 #endif
34809 /* Return the ix86 builtin for CODE. */
34811 static tree
34812 ix86_builtin_decl (unsigned code, bool)
34814 if (code >= IX86_BUILTIN_MAX)
34815 return error_mark_node;
34817 return ix86_builtins[code];
34820 /* Errors in the source file can cause expand_expr to return const0_rtx
34821 where we expect a vector. To avoid crashing, use one of the vector
34822 clear instructions. */
34823 static rtx
34824 safe_vector_operand (rtx x, machine_mode mode)
34826 if (x == const0_rtx)
34827 x = CONST0_RTX (mode);
34828 return x;
34831 /* Fix up modeless constants to fit the required mode.  */
34832 static rtx
34833 fixup_modeless_constant (rtx x, machine_mode mode)
34835 if (GET_MODE (x) == VOIDmode)
34836 x = convert_to_mode (mode, x, 1);
34837 return x;
34840 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
34842 static rtx
34843 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
34845 rtx pat;
34846 tree arg0 = CALL_EXPR_ARG (exp, 0);
34847 tree arg1 = CALL_EXPR_ARG (exp, 1);
34848 rtx op0 = expand_normal (arg0);
34849 rtx op1 = expand_normal (arg1);
34850 machine_mode tmode = insn_data[icode].operand[0].mode;
34851 machine_mode mode0 = insn_data[icode].operand[1].mode;
34852 machine_mode mode1 = insn_data[icode].operand[2].mode;
34854 if (VECTOR_MODE_P (mode0))
34855 op0 = safe_vector_operand (op0, mode0);
34856 if (VECTOR_MODE_P (mode1))
34857 op1 = safe_vector_operand (op1, mode1);
34859 if (optimize || !target
34860 || GET_MODE (target) != tmode
34861 || !insn_data[icode].operand[0].predicate (target, tmode))
34862 target = gen_reg_rtx (tmode);
34864 if (GET_MODE (op1) == SImode && mode1 == TImode)
34866 rtx x = gen_reg_rtx (V4SImode);
34867 emit_insn (gen_sse2_loadd (x, op1));
34868 op1 = gen_lowpart (TImode, x);
34871 if (!insn_data[icode].operand[1].predicate (op0, mode0))
34872 op0 = copy_to_mode_reg (mode0, op0);
34873 if (!insn_data[icode].operand[2].predicate (op1, mode1))
34874 op1 = copy_to_mode_reg (mode1, op1);
34876 pat = GEN_FCN (icode) (target, op0, op1);
34877 if (! pat)
34878 return 0;
34880 emit_insn (pat);
34882 return target;
34885 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
34887 static rtx
34888 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
34889 enum ix86_builtin_func_type m_type,
34890 enum rtx_code sub_code)
34892 rtx pat;
34893 int i;
34894 int nargs;
34895 bool comparison_p = false;
34896 bool tf_p = false;
34897 bool last_arg_constant = false;
34898 int num_memory = 0;
34899 struct {
34900 rtx op;
34901 machine_mode mode;
34902 } args[4];
34904 machine_mode tmode = insn_data[icode].operand[0].mode;
34906 switch (m_type)
34908 case MULTI_ARG_4_DF2_DI_I:
34909 case MULTI_ARG_4_DF2_DI_I1:
34910 case MULTI_ARG_4_SF2_SI_I:
34911 case MULTI_ARG_4_SF2_SI_I1:
34912 nargs = 4;
34913 last_arg_constant = true;
34914 break;
34916 case MULTI_ARG_3_SF:
34917 case MULTI_ARG_3_DF:
34918 case MULTI_ARG_3_SF2:
34919 case MULTI_ARG_3_DF2:
34920 case MULTI_ARG_3_DI:
34921 case MULTI_ARG_3_SI:
34922 case MULTI_ARG_3_SI_DI:
34923 case MULTI_ARG_3_HI:
34924 case MULTI_ARG_3_HI_SI:
34925 case MULTI_ARG_3_QI:
34926 case MULTI_ARG_3_DI2:
34927 case MULTI_ARG_3_SI2:
34928 case MULTI_ARG_3_HI2:
34929 case MULTI_ARG_3_QI2:
34930 nargs = 3;
34931 break;
34933 case MULTI_ARG_2_SF:
34934 case MULTI_ARG_2_DF:
34935 case MULTI_ARG_2_DI:
34936 case MULTI_ARG_2_SI:
34937 case MULTI_ARG_2_HI:
34938 case MULTI_ARG_2_QI:
34939 nargs = 2;
34940 break;
34942 case MULTI_ARG_2_DI_IMM:
34943 case MULTI_ARG_2_SI_IMM:
34944 case MULTI_ARG_2_HI_IMM:
34945 case MULTI_ARG_2_QI_IMM:
34946 nargs = 2;
34947 last_arg_constant = true;
34948 break;
34950 case MULTI_ARG_1_SF:
34951 case MULTI_ARG_1_DF:
34952 case MULTI_ARG_1_SF2:
34953 case MULTI_ARG_1_DF2:
34954 case MULTI_ARG_1_DI:
34955 case MULTI_ARG_1_SI:
34956 case MULTI_ARG_1_HI:
34957 case MULTI_ARG_1_QI:
34958 case MULTI_ARG_1_SI_DI:
34959 case MULTI_ARG_1_HI_DI:
34960 case MULTI_ARG_1_HI_SI:
34961 case MULTI_ARG_1_QI_DI:
34962 case MULTI_ARG_1_QI_SI:
34963 case MULTI_ARG_1_QI_HI:
34964 nargs = 1;
34965 break;
34967 case MULTI_ARG_2_DI_CMP:
34968 case MULTI_ARG_2_SI_CMP:
34969 case MULTI_ARG_2_HI_CMP:
34970 case MULTI_ARG_2_QI_CMP:
34971 nargs = 2;
34972 comparison_p = true;
34973 break;
34975 case MULTI_ARG_2_SF_TF:
34976 case MULTI_ARG_2_DF_TF:
34977 case MULTI_ARG_2_DI_TF:
34978 case MULTI_ARG_2_SI_TF:
34979 case MULTI_ARG_2_HI_TF:
34980 case MULTI_ARG_2_QI_TF:
34981 nargs = 2;
34982 tf_p = true;
34983 break;
34985 default:
34986 gcc_unreachable ();
34989 if (optimize || !target
34990 || GET_MODE (target) != tmode
34991 || !insn_data[icode].operand[0].predicate (target, tmode))
34992 target = gen_reg_rtx (tmode);
34993 else if (memory_operand (target, tmode))
34994 num_memory++;
34996 gcc_assert (nargs <= 4);
34998 for (i = 0; i < nargs; i++)
35000 tree arg = CALL_EXPR_ARG (exp, i);
35001 rtx op = expand_normal (arg);
35002 int adjust = (comparison_p) ? 1 : 0;
35003 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
35005 if (last_arg_constant && i == nargs - 1)
35007 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
35009 enum insn_code new_icode = icode;
35010 switch (icode)
35012 case CODE_FOR_xop_vpermil2v2df3:
35013 case CODE_FOR_xop_vpermil2v4sf3:
35014 case CODE_FOR_xop_vpermil2v4df3:
35015 case CODE_FOR_xop_vpermil2v8sf3:
35016 error ("the last argument must be a 2-bit immediate");
35017 return gen_reg_rtx (tmode);
35018 case CODE_FOR_xop_rotlv2di3:
35019 new_icode = CODE_FOR_rotlv2di3;
35020 goto xop_rotl;
35021 case CODE_FOR_xop_rotlv4si3:
35022 new_icode = CODE_FOR_rotlv4si3;
35023 goto xop_rotl;
35024 case CODE_FOR_xop_rotlv8hi3:
35025 new_icode = CODE_FOR_rotlv8hi3;
35026 goto xop_rotl;
35027 case CODE_FOR_xop_rotlv16qi3:
35028 new_icode = CODE_FOR_rotlv16qi3;
35029 xop_rotl:
35030 if (CONST_INT_P (op))
35032 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
35033 op = GEN_INT (INTVAL (op) & mask);
35034 gcc_checking_assert
35035 (insn_data[icode].operand[i + 1].predicate (op, mode));
35037 else
35039 gcc_checking_assert
35040 (nargs == 2
35041 && insn_data[new_icode].operand[0].mode == tmode
35042 && insn_data[new_icode].operand[1].mode == tmode
35043 && insn_data[new_icode].operand[2].mode == mode
35044 && insn_data[new_icode].operand[0].predicate
35045 == insn_data[icode].operand[0].predicate
35046 && insn_data[new_icode].operand[1].predicate
35047 == insn_data[icode].operand[1].predicate);
35048 icode = new_icode;
35049 goto non_constant;
35051 break;
35052 default:
35053 gcc_unreachable ();
35057 else
35059 non_constant:
35060 if (VECTOR_MODE_P (mode))
35061 op = safe_vector_operand (op, mode);
35063 /* If we aren't optimizing, only allow one memory operand to be
35064 generated. */
35065 if (memory_operand (op, mode))
35066 num_memory++;
35068 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
35070 if (optimize
35071 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
35072 || num_memory > 1)
35073 op = force_reg (mode, op);
35076 args[i].op = op;
35077 args[i].mode = mode;
35080 switch (nargs)
35082 case 1:
35083 pat = GEN_FCN (icode) (target, args[0].op);
35084 break;
35086 case 2:
35087 if (tf_p)
35088 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35089 GEN_INT ((int)sub_code));
35090 else if (! comparison_p)
35091 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35092 else
35094 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
35095 args[0].op,
35096 args[1].op);
35098 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
35100 break;
35102 case 3:
35103 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35104 break;
35106 case 4:
35107 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
35108 break;
35110 default:
35111 gcc_unreachable ();
35114 if (! pat)
35115 return 0;
35117 emit_insn (pat);
35118 return target;
35121 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
35122 insns with vec_merge. */
35124 static rtx
35125 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
35126 rtx target)
35128 rtx pat;
35129 tree arg0 = CALL_EXPR_ARG (exp, 0);
35130 rtx op1, op0 = expand_normal (arg0);
35131 machine_mode tmode = insn_data[icode].operand[0].mode;
35132 machine_mode mode0 = insn_data[icode].operand[1].mode;
35134 if (optimize || !target
35135 || GET_MODE (target) != tmode
35136 || !insn_data[icode].operand[0].predicate (target, tmode))
35137 target = gen_reg_rtx (tmode);
35139 if (VECTOR_MODE_P (mode0))
35140 op0 = safe_vector_operand (op0, mode0);
35142 if ((optimize && !register_operand (op0, mode0))
35143 || !insn_data[icode].operand[1].predicate (op0, mode0))
35144 op0 = copy_to_mode_reg (mode0, op0);
35146 op1 = op0;
35147 if (!insn_data[icode].operand[2].predicate (op1, mode0))
35148 op1 = copy_to_mode_reg (mode0, op1);
35150 pat = GEN_FCN (icode) (target, op0, op1);
35151 if (! pat)
35152 return 0;
35153 emit_insn (pat);
35154 return target;
35157 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
35159 static rtx
35160 ix86_expand_sse_compare (const struct builtin_description *d,
35161 tree exp, rtx target, bool swap)
35163 rtx pat;
35164 tree arg0 = CALL_EXPR_ARG (exp, 0);
35165 tree arg1 = CALL_EXPR_ARG (exp, 1);
35166 rtx op0 = expand_normal (arg0);
35167 rtx op1 = expand_normal (arg1);
35168 rtx op2;
35169 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35170 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35171 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35172 enum rtx_code comparison = d->comparison;
35174 if (VECTOR_MODE_P (mode0))
35175 op0 = safe_vector_operand (op0, mode0);
35176 if (VECTOR_MODE_P (mode1))
35177 op1 = safe_vector_operand (op1, mode1);
35179 /* Swap operands if we have a comparison that isn't available in
35180 hardware. */
35181 if (swap)
35182 std::swap (op0, op1);
35184 if (optimize || !target
35185 || GET_MODE (target) != tmode
35186 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35187 target = gen_reg_rtx (tmode);
35189 if ((optimize && !register_operand (op0, mode0))
35190 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
35191 op0 = copy_to_mode_reg (mode0, op0);
35192 if ((optimize && !register_operand (op1, mode1))
35193 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
35194 op1 = copy_to_mode_reg (mode1, op1);
35196 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
35197 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35198 if (! pat)
35199 return 0;
35200 emit_insn (pat);
35201 return target;
35204 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
35206 static rtx
35207 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
35208 rtx target)
35210 rtx pat;
35211 tree arg0 = CALL_EXPR_ARG (exp, 0);
35212 tree arg1 = CALL_EXPR_ARG (exp, 1);
35213 rtx op0 = expand_normal (arg0);
35214 rtx op1 = expand_normal (arg1);
35215 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35216 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35217 enum rtx_code comparison = d->comparison;
35219 if (VECTOR_MODE_P (mode0))
35220 op0 = safe_vector_operand (op0, mode0);
35221 if (VECTOR_MODE_P (mode1))
35222 op1 = safe_vector_operand (op1, mode1);
35224 /* Swap operands if we have a comparison that isn't available in
35225 hardware. */
35226 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
35227 std::swap (op0, op1);
35229 target = gen_reg_rtx (SImode);
35230 emit_move_insn (target, const0_rtx);
35231 target = gen_rtx_SUBREG (QImode, target, 0);
35233 if ((optimize && !register_operand (op0, mode0))
35234 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35235 op0 = copy_to_mode_reg (mode0, op0);
35236 if ((optimize && !register_operand (op1, mode1))
35237 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35238 op1 = copy_to_mode_reg (mode1, op1);
35240 pat = GEN_FCN (d->icode) (op0, op1);
35241 if (! pat)
35242 return 0;
35243 emit_insn (pat);
35244 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35245 gen_rtx_fmt_ee (comparison, QImode,
35246 SET_DEST (pat),
35247 const0_rtx)));
35249 return SUBREG_REG (target);
35252 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
35254 static rtx
35255 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
35256 rtx target)
35258 rtx pat;
35259 tree arg0 = CALL_EXPR_ARG (exp, 0);
35260 rtx op1, op0 = expand_normal (arg0);
35261 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35262 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35264 if (optimize || target == 0
35265 || GET_MODE (target) != tmode
35266 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35267 target = gen_reg_rtx (tmode);
35269 if (VECTOR_MODE_P (mode0))
35270 op0 = safe_vector_operand (op0, mode0);
35272 if ((optimize && !register_operand (op0, mode0))
35273 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35274 op0 = copy_to_mode_reg (mode0, op0);
35276 op1 = GEN_INT (d->comparison);
35278 pat = GEN_FCN (d->icode) (target, op0, op1);
35279 if (! pat)
35280 return 0;
35281 emit_insn (pat);
35282 return target;
35285 static rtx
35286 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
35287 tree exp, rtx target)
35289 rtx pat;
35290 tree arg0 = CALL_EXPR_ARG (exp, 0);
35291 tree arg1 = CALL_EXPR_ARG (exp, 1);
35292 rtx op0 = expand_normal (arg0);
35293 rtx op1 = expand_normal (arg1);
35294 rtx op2;
35295 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35296 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35297 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35299 if (optimize || target == 0
35300 || GET_MODE (target) != tmode
35301 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35302 target = gen_reg_rtx (tmode);
35304 op0 = safe_vector_operand (op0, mode0);
35305 op1 = safe_vector_operand (op1, mode1);
35307 if ((optimize && !register_operand (op0, mode0))
35308 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35309 op0 = copy_to_mode_reg (mode0, op0);
35310 if ((optimize && !register_operand (op1, mode1))
35311 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35312 op1 = copy_to_mode_reg (mode1, op1);
35314 op2 = GEN_INT (d->comparison);
35316 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35317 if (! pat)
35318 return 0;
35319 emit_insn (pat);
35320 return target;
35323 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
35325 static rtx
35326 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
35327 rtx target)
35329 rtx pat;
35330 tree arg0 = CALL_EXPR_ARG (exp, 0);
35331 tree arg1 = CALL_EXPR_ARG (exp, 1);
35332 rtx op0 = expand_normal (arg0);
35333 rtx op1 = expand_normal (arg1);
35334 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35335 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35336 enum rtx_code comparison = d->comparison;
35338 if (VECTOR_MODE_P (mode0))
35339 op0 = safe_vector_operand (op0, mode0);
35340 if (VECTOR_MODE_P (mode1))
35341 op1 = safe_vector_operand (op1, mode1);
35343 target = gen_reg_rtx (SImode);
35344 emit_move_insn (target, const0_rtx);
35345 target = gen_rtx_SUBREG (QImode, target, 0);
35347 if ((optimize && !register_operand (op0, mode0))
35348 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35349 op0 = copy_to_mode_reg (mode0, op0);
35350 if ((optimize && !register_operand (op1, mode1))
35351 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35352 op1 = copy_to_mode_reg (mode1, op1);
35354 pat = GEN_FCN (d->icode) (op0, op1);
35355 if (! pat)
35356 return 0;
35357 emit_insn (pat);
35358 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35359 gen_rtx_fmt_ee (comparison, QImode,
35360 SET_DEST (pat),
35361 const0_rtx)));
35363 return SUBREG_REG (target);
35366 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
35368 static rtx
35369 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
35370 tree exp, rtx target)
35372 rtx pat;
35373 tree arg0 = CALL_EXPR_ARG (exp, 0);
35374 tree arg1 = CALL_EXPR_ARG (exp, 1);
35375 tree arg2 = CALL_EXPR_ARG (exp, 2);
35376 tree arg3 = CALL_EXPR_ARG (exp, 3);
35377 tree arg4 = CALL_EXPR_ARG (exp, 4);
35378 rtx scratch0, scratch1;
35379 rtx op0 = expand_normal (arg0);
35380 rtx op1 = expand_normal (arg1);
35381 rtx op2 = expand_normal (arg2);
35382 rtx op3 = expand_normal (arg3);
35383 rtx op4 = expand_normal (arg4);
35384 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
35386 tmode0 = insn_data[d->icode].operand[0].mode;
35387 tmode1 = insn_data[d->icode].operand[1].mode;
35388 modev2 = insn_data[d->icode].operand[2].mode;
35389 modei3 = insn_data[d->icode].operand[3].mode;
35390 modev4 = insn_data[d->icode].operand[4].mode;
35391 modei5 = insn_data[d->icode].operand[5].mode;
35392 modeimm = insn_data[d->icode].operand[6].mode;
35394 if (VECTOR_MODE_P (modev2))
35395 op0 = safe_vector_operand (op0, modev2);
35396 if (VECTOR_MODE_P (modev4))
35397 op2 = safe_vector_operand (op2, modev4);
35399 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35400 op0 = copy_to_mode_reg (modev2, op0);
35401 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
35402 op1 = copy_to_mode_reg (modei3, op1);
35403 if ((optimize && !register_operand (op2, modev4))
35404 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
35405 op2 = copy_to_mode_reg (modev4, op2);
35406 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
35407 op3 = copy_to_mode_reg (modei5, op3);
35409 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
35411 error ("the fifth argument must be an 8-bit immediate");
35412 return const0_rtx;
35415 if (d->code == IX86_BUILTIN_PCMPESTRI128)
35417 if (optimize || !target
35418 || GET_MODE (target) != tmode0
35419 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35420 target = gen_reg_rtx (tmode0);
35422 scratch1 = gen_reg_rtx (tmode1);
35424 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
35426 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
35428 if (optimize || !target
35429 || GET_MODE (target) != tmode1
35430 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35431 target = gen_reg_rtx (tmode1);
35433 scratch0 = gen_reg_rtx (tmode0);
35435 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
35437 else
35439 gcc_assert (d->flag);
35441 scratch0 = gen_reg_rtx (tmode0);
35442 scratch1 = gen_reg_rtx (tmode1);
35444 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
35447 if (! pat)
35448 return 0;
35450 emit_insn (pat);
35452 if (d->flag)
35454 target = gen_reg_rtx (SImode);
35455 emit_move_insn (target, const0_rtx);
35456 target = gen_rtx_SUBREG (QImode, target, 0);
35458 emit_insn
35459 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35460 gen_rtx_fmt_ee (EQ, QImode,
35461 gen_rtx_REG ((machine_mode) d->flag,
35462 FLAGS_REG),
35463 const0_rtx)));
35464 return SUBREG_REG (target);
35466 else
35467 return target;
35471 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
35473 static rtx
35474 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
35475 tree exp, rtx target)
35477 rtx pat;
35478 tree arg0 = CALL_EXPR_ARG (exp, 0);
35479 tree arg1 = CALL_EXPR_ARG (exp, 1);
35480 tree arg2 = CALL_EXPR_ARG (exp, 2);
35481 rtx scratch0, scratch1;
35482 rtx op0 = expand_normal (arg0);
35483 rtx op1 = expand_normal (arg1);
35484 rtx op2 = expand_normal (arg2);
35485 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
35487 tmode0 = insn_data[d->icode].operand[0].mode;
35488 tmode1 = insn_data[d->icode].operand[1].mode;
35489 modev2 = insn_data[d->icode].operand[2].mode;
35490 modev3 = insn_data[d->icode].operand[3].mode;
35491 modeimm = insn_data[d->icode].operand[4].mode;
35493 if (VECTOR_MODE_P (modev2))
35494 op0 = safe_vector_operand (op0, modev2);
35495 if (VECTOR_MODE_P (modev3))
35496 op1 = safe_vector_operand (op1, modev3);
35498 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35499 op0 = copy_to_mode_reg (modev2, op0);
35500 if ((optimize && !register_operand (op1, modev3))
35501 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
35502 op1 = copy_to_mode_reg (modev3, op1);
35504 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
35506 error ("the third argument must be an 8-bit immediate");
35507 return const0_rtx;
35510 if (d->code == IX86_BUILTIN_PCMPISTRI128)
35512 if (optimize || !target
35513 || GET_MODE (target) != tmode0
35514 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35515 target = gen_reg_rtx (tmode0);
35517 scratch1 = gen_reg_rtx (tmode1);
35519 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
35521 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
35523 if (optimize || !target
35524 || GET_MODE (target) != tmode1
35525 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35526 target = gen_reg_rtx (tmode1);
35528 scratch0 = gen_reg_rtx (tmode0);
35530 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
35532 else
35534 gcc_assert (d->flag);
35536 scratch0 = gen_reg_rtx (tmode0);
35537 scratch1 = gen_reg_rtx (tmode1);
35539 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
35542 if (! pat)
35543 return 0;
35545 emit_insn (pat);
35547 if (d->flag)
35549 target = gen_reg_rtx (SImode);
35550 emit_move_insn (target, const0_rtx);
35551 target = gen_rtx_SUBREG (QImode, target, 0);
35553 emit_insn
35554 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35555 gen_rtx_fmt_ee (EQ, QImode,
35556 gen_rtx_REG ((machine_mode) d->flag,
35557 FLAGS_REG),
35558 const0_rtx)));
35559 return SUBREG_REG (target);
35561 else
35562 return target;
35565 /* Subroutine of ix86_expand_builtin to take care of insns with
35566 a variable number of operands. */
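/* Illustrative note (assumed, matching the case labels below): d->flag is
   an ix86_builtin_func_type whose name spells out the signature, e.g.
   V4SF_FTYPE_V4SF_V4SF_INT is a builtin returning a V4SF and taking
   (V4SF, V4SF, int), where the trailing INT is normally required to be an
   immediate (nargs_constant).  The switch below only decodes that shape;
   the actual insn comes from d->icode.  */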
35568 static rtx
35569 ix86_expand_args_builtin (const struct builtin_description *d,
35570 tree exp, rtx target)
35572 rtx pat, real_target;
35573 unsigned int i, nargs;
35574 unsigned int nargs_constant = 0;
35575 unsigned int mask_pos = 0;
35576 int num_memory = 0;
35577 struct
35579 rtx op;
35580 machine_mode mode;
35581 } args[6];
35582 bool second_arg_count = false;
35583 enum insn_code icode = d->icode;
35584 const struct insn_data_d *insn_p = &insn_data[icode];
35585 machine_mode tmode = insn_p->operand[0].mode;
35586 machine_mode rmode = VOIDmode;
35587 bool swap = false;
35588 enum rtx_code comparison = d->comparison;
35590 switch ((enum ix86_builtin_func_type) d->flag)
35592 case V2DF_FTYPE_V2DF_ROUND:
35593 case V4DF_FTYPE_V4DF_ROUND:
35594 case V8DF_FTYPE_V8DF_ROUND:
35595 case V4SF_FTYPE_V4SF_ROUND:
35596 case V8SF_FTYPE_V8SF_ROUND:
35597 case V16SF_FTYPE_V16SF_ROUND:
35598 case V4SI_FTYPE_V4SF_ROUND:
35599 case V8SI_FTYPE_V8SF_ROUND:
35600 case V16SI_FTYPE_V16SF_ROUND:
35601 return ix86_expand_sse_round (d, exp, target);
35602 case V4SI_FTYPE_V2DF_V2DF_ROUND:
35603 case V8SI_FTYPE_V4DF_V4DF_ROUND:
35604 case V16SI_FTYPE_V8DF_V8DF_ROUND:
35605 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
35606 case INT_FTYPE_V8SF_V8SF_PTEST:
35607 case INT_FTYPE_V4DI_V4DI_PTEST:
35608 case INT_FTYPE_V4DF_V4DF_PTEST:
35609 case INT_FTYPE_V4SF_V4SF_PTEST:
35610 case INT_FTYPE_V2DI_V2DI_PTEST:
35611 case INT_FTYPE_V2DF_V2DF_PTEST:
35612 return ix86_expand_sse_ptest (d, exp, target);
35613 case FLOAT128_FTYPE_FLOAT128:
35614 case FLOAT_FTYPE_FLOAT:
35615 case INT_FTYPE_INT:
35616 case UINT_FTYPE_UINT:
35617 case UINT16_FTYPE_UINT16:
35618 case UINT64_FTYPE_INT:
35619 case UINT64_FTYPE_UINT64:
35620 case INT64_FTYPE_INT64:
35621 case INT64_FTYPE_V4SF:
35622 case INT64_FTYPE_V2DF:
35623 case INT_FTYPE_V16QI:
35624 case INT_FTYPE_V8QI:
35625 case INT_FTYPE_V8SF:
35626 case INT_FTYPE_V4DF:
35627 case INT_FTYPE_V4SF:
35628 case INT_FTYPE_V2DF:
35629 case INT_FTYPE_V32QI:
35630 case V16QI_FTYPE_V16QI:
35631 case V8SI_FTYPE_V8SF:
35632 case V8SI_FTYPE_V4SI:
35633 case V8HI_FTYPE_V8HI:
35634 case V8HI_FTYPE_V16QI:
35635 case V8QI_FTYPE_V8QI:
35636 case V8SF_FTYPE_V8SF:
35637 case V8SF_FTYPE_V8SI:
35638 case V8SF_FTYPE_V4SF:
35639 case V8SF_FTYPE_V8HI:
35640 case V4SI_FTYPE_V4SI:
35641 case V4SI_FTYPE_V16QI:
35642 case V4SI_FTYPE_V4SF:
35643 case V4SI_FTYPE_V8SI:
35644 case V4SI_FTYPE_V8HI:
35645 case V4SI_FTYPE_V4DF:
35646 case V4SI_FTYPE_V2DF:
35647 case V4HI_FTYPE_V4HI:
35648 case V4DF_FTYPE_V4DF:
35649 case V4DF_FTYPE_V4SI:
35650 case V4DF_FTYPE_V4SF:
35651 case V4DF_FTYPE_V2DF:
35652 case V4SF_FTYPE_V4SF:
35653 case V4SF_FTYPE_V4SI:
35654 case V4SF_FTYPE_V8SF:
35655 case V4SF_FTYPE_V4DF:
35656 case V4SF_FTYPE_V8HI:
35657 case V4SF_FTYPE_V2DF:
35658 case V2DI_FTYPE_V2DI:
35659 case V2DI_FTYPE_V16QI:
35660 case V2DI_FTYPE_V8HI:
35661 case V2DI_FTYPE_V4SI:
35662 case V2DF_FTYPE_V2DF:
35663 case V2DF_FTYPE_V4SI:
35664 case V2DF_FTYPE_V4DF:
35665 case V2DF_FTYPE_V4SF:
35666 case V2DF_FTYPE_V2SI:
35667 case V2SI_FTYPE_V2SI:
35668 case V2SI_FTYPE_V4SF:
35669 case V2SI_FTYPE_V2SF:
35670 case V2SI_FTYPE_V2DF:
35671 case V2SF_FTYPE_V2SF:
35672 case V2SF_FTYPE_V2SI:
35673 case V32QI_FTYPE_V32QI:
35674 case V32QI_FTYPE_V16QI:
35675 case V16HI_FTYPE_V16HI:
35676 case V16HI_FTYPE_V8HI:
35677 case V8SI_FTYPE_V8SI:
35678 case V16HI_FTYPE_V16QI:
35679 case V8SI_FTYPE_V16QI:
35680 case V4DI_FTYPE_V16QI:
35681 case V8SI_FTYPE_V8HI:
35682 case V4DI_FTYPE_V8HI:
35683 case V4DI_FTYPE_V4SI:
35684 case V4DI_FTYPE_V2DI:
35685 case UQI_FTYPE_UQI:
35686 case UHI_FTYPE_UHI:
35687 case USI_FTYPE_USI:
35688 case USI_FTYPE_UQI:
35689 case USI_FTYPE_UHI:
35690 case UDI_FTYPE_UDI:
35691 case UHI_FTYPE_V16QI:
35692 case USI_FTYPE_V32QI:
35693 case UDI_FTYPE_V64QI:
35694 case V16QI_FTYPE_UHI:
35695 case V32QI_FTYPE_USI:
35696 case V64QI_FTYPE_UDI:
35697 case V8HI_FTYPE_UQI:
35698 case V16HI_FTYPE_UHI:
35699 case V32HI_FTYPE_USI:
35700 case V4SI_FTYPE_UQI:
35701 case V8SI_FTYPE_UQI:
35702 case V4SI_FTYPE_UHI:
35703 case V8SI_FTYPE_UHI:
35704 case UQI_FTYPE_V8HI:
35705 case UHI_FTYPE_V16HI:
35706 case USI_FTYPE_V32HI:
35707 case UQI_FTYPE_V4SI:
35708 case UQI_FTYPE_V8SI:
35709 case UHI_FTYPE_V16SI:
35710 case UQI_FTYPE_V2DI:
35711 case UQI_FTYPE_V4DI:
35712 case UQI_FTYPE_V8DI:
35713 case V16SI_FTYPE_UHI:
35714 case V2DI_FTYPE_UQI:
35715 case V4DI_FTYPE_UQI:
35716 case V16SI_FTYPE_INT:
35717 case V16SF_FTYPE_V8SF:
35718 case V16SI_FTYPE_V8SI:
35719 case V16SF_FTYPE_V4SF:
35720 case V16SI_FTYPE_V4SI:
35721 case V16SI_FTYPE_V16SF:
35722 case V16SI_FTYPE_V16SI:
35723 case V16SF_FTYPE_V16SF:
35724 case V8DI_FTYPE_UQI:
35725 case V8DI_FTYPE_V8DI:
35726 case V8DF_FTYPE_V4DF:
35727 case V8DF_FTYPE_V2DF:
35728 case V8DF_FTYPE_V8DF:
35729 nargs = 1;
35730 break;
35731 case V4SF_FTYPE_V4SF_VEC_MERGE:
35732 case V2DF_FTYPE_V2DF_VEC_MERGE:
35733 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
35734 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
35735 case V16QI_FTYPE_V16QI_V16QI:
35736 case V16QI_FTYPE_V8HI_V8HI:
35737 case V16SF_FTYPE_V16SF_V16SF:
35738 case V8QI_FTYPE_V8QI_V8QI:
35739 case V8QI_FTYPE_V4HI_V4HI:
35740 case V8HI_FTYPE_V8HI_V8HI:
35741 case V8HI_FTYPE_V16QI_V16QI:
35742 case V8HI_FTYPE_V4SI_V4SI:
35743 case V8SF_FTYPE_V8SF_V8SF:
35744 case V8SF_FTYPE_V8SF_V8SI:
35745 case V8DF_FTYPE_V8DF_V8DF:
35746 case V4SI_FTYPE_V4SI_V4SI:
35747 case V4SI_FTYPE_V8HI_V8HI:
35748 case V4SI_FTYPE_V2DF_V2DF:
35749 case V4HI_FTYPE_V4HI_V4HI:
35750 case V4HI_FTYPE_V8QI_V8QI:
35751 case V4HI_FTYPE_V2SI_V2SI:
35752 case V4DF_FTYPE_V4DF_V4DF:
35753 case V4DF_FTYPE_V4DF_V4DI:
35754 case V4SF_FTYPE_V4SF_V4SF:
35755 case V4SF_FTYPE_V4SF_V4SI:
35756 case V4SF_FTYPE_V4SF_V2SI:
35757 case V4SF_FTYPE_V4SF_V2DF:
35758 case V4SF_FTYPE_V4SF_UINT:
35759 case V4SF_FTYPE_V4SF_DI:
35760 case V4SF_FTYPE_V4SF_SI:
35761 case V2DI_FTYPE_V2DI_V2DI:
35762 case V2DI_FTYPE_V16QI_V16QI:
35763 case V2DI_FTYPE_V4SI_V4SI:
35764 case V2DI_FTYPE_V2DI_V16QI:
35765 case V2SI_FTYPE_V2SI_V2SI:
35766 case V2SI_FTYPE_V4HI_V4HI:
35767 case V2SI_FTYPE_V2SF_V2SF:
35768 case V2DF_FTYPE_V2DF_V2DF:
35769 case V2DF_FTYPE_V2DF_V4SF:
35770 case V2DF_FTYPE_V2DF_V2DI:
35771 case V2DF_FTYPE_V2DF_DI:
35772 case V2DF_FTYPE_V2DF_SI:
35773 case V2DF_FTYPE_V2DF_UINT:
35774 case V2SF_FTYPE_V2SF_V2SF:
35775 case V1DI_FTYPE_V1DI_V1DI:
35776 case V1DI_FTYPE_V8QI_V8QI:
35777 case V1DI_FTYPE_V2SI_V2SI:
35778 case V32QI_FTYPE_V16HI_V16HI:
35779 case V16HI_FTYPE_V8SI_V8SI:
35780 case V32QI_FTYPE_V32QI_V32QI:
35781 case V16HI_FTYPE_V32QI_V32QI:
35782 case V16HI_FTYPE_V16HI_V16HI:
35783 case V8SI_FTYPE_V4DF_V4DF:
35784 case V8SI_FTYPE_V8SI_V8SI:
35785 case V8SI_FTYPE_V16HI_V16HI:
35786 case V4DI_FTYPE_V4DI_V4DI:
35787 case V4DI_FTYPE_V8SI_V8SI:
35788 case V8DI_FTYPE_V64QI_V64QI:
35789 if (comparison == UNKNOWN)
35790 return ix86_expand_binop_builtin (icode, exp, target);
35791 nargs = 2;
35792 break;
35793 case V4SF_FTYPE_V4SF_V4SF_SWAP:
35794 case V2DF_FTYPE_V2DF_V2DF_SWAP:
35795 gcc_assert (comparison != UNKNOWN);
35796 nargs = 2;
35797 swap = true;
35798 break;
35799 case V16HI_FTYPE_V16HI_V8HI_COUNT:
35800 case V16HI_FTYPE_V16HI_SI_COUNT:
35801 case V8SI_FTYPE_V8SI_V4SI_COUNT:
35802 case V8SI_FTYPE_V8SI_SI_COUNT:
35803 case V4DI_FTYPE_V4DI_V2DI_COUNT:
35804 case V4DI_FTYPE_V4DI_INT_COUNT:
35805 case V8HI_FTYPE_V8HI_V8HI_COUNT:
35806 case V8HI_FTYPE_V8HI_SI_COUNT:
35807 case V4SI_FTYPE_V4SI_V4SI_COUNT:
35808 case V4SI_FTYPE_V4SI_SI_COUNT:
35809 case V4HI_FTYPE_V4HI_V4HI_COUNT:
35810 case V4HI_FTYPE_V4HI_SI_COUNT:
35811 case V2DI_FTYPE_V2DI_V2DI_COUNT:
35812 case V2DI_FTYPE_V2DI_SI_COUNT:
35813 case V2SI_FTYPE_V2SI_V2SI_COUNT:
35814 case V2SI_FTYPE_V2SI_SI_COUNT:
35815 case V1DI_FTYPE_V1DI_V1DI_COUNT:
35816 case V1DI_FTYPE_V1DI_SI_COUNT:
35817 nargs = 2;
35818 second_arg_count = true;
35819 break;
35820 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
35821 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
35822 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
35823 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
35824 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
35825 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
35826 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
35827 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
35828 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
35829 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
35830 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
35831 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
35832 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
35833 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
35834 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
35835 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
35836 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
35837 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
35838 nargs = 4;
35839 second_arg_count = true;
35840 break;
35841 case UINT64_FTYPE_UINT64_UINT64:
35842 case UINT_FTYPE_UINT_UINT:
35843 case UINT_FTYPE_UINT_USHORT:
35844 case UINT_FTYPE_UINT_UCHAR:
35845 case UINT16_FTYPE_UINT16_INT:
35846 case UINT8_FTYPE_UINT8_INT:
35847 case UQI_FTYPE_UQI_UQI:
35848 case UHI_FTYPE_UHI_UHI:
35849 case USI_FTYPE_USI_USI:
35850 case UDI_FTYPE_UDI_UDI:
35851 case V16SI_FTYPE_V8DF_V8DF:
35852 nargs = 2;
35853 break;
35854 case V2DI_FTYPE_V2DI_INT_CONVERT:
35855 nargs = 2;
35856 rmode = V1TImode;
35857 nargs_constant = 1;
35858 break;
35859 case V4DI_FTYPE_V4DI_INT_CONVERT:
35860 nargs = 2;
35861 rmode = V2TImode;
35862 nargs_constant = 1;
35863 break;
35864 case V8DI_FTYPE_V8DI_INT_CONVERT:
35865 nargs = 2;
35866 rmode = V4TImode;
35867 nargs_constant = 1;
35868 break;
35869 case V8HI_FTYPE_V8HI_INT:
35870 case V8HI_FTYPE_V8SF_INT:
35871 case V16HI_FTYPE_V16SF_INT:
35872 case V8HI_FTYPE_V4SF_INT:
35873 case V8SF_FTYPE_V8SF_INT:
35874 case V4SF_FTYPE_V16SF_INT:
35875 case V16SF_FTYPE_V16SF_INT:
35876 case V4SI_FTYPE_V4SI_INT:
35877 case V4SI_FTYPE_V8SI_INT:
35878 case V4HI_FTYPE_V4HI_INT:
35879 case V4DF_FTYPE_V4DF_INT:
35880 case V4DF_FTYPE_V8DF_INT:
35881 case V4SF_FTYPE_V4SF_INT:
35882 case V4SF_FTYPE_V8SF_INT:
35883 case V2DI_FTYPE_V2DI_INT:
35884 case V2DF_FTYPE_V2DF_INT:
35885 case V2DF_FTYPE_V4DF_INT:
35886 case V16HI_FTYPE_V16HI_INT:
35887 case V8SI_FTYPE_V8SI_INT:
35888 case V16SI_FTYPE_V16SI_INT:
35889 case V4SI_FTYPE_V16SI_INT:
35890 case V4DI_FTYPE_V4DI_INT:
35891 case V2DI_FTYPE_V4DI_INT:
35892 case V4DI_FTYPE_V8DI_INT:
35893 case QI_FTYPE_V4SF_INT:
35894 case QI_FTYPE_V2DF_INT:
35895 case UQI_FTYPE_UQI_UQI_CONST:
35896 case UHI_FTYPE_UHI_UQI:
35897 case USI_FTYPE_USI_UQI:
35898 case UDI_FTYPE_UDI_UQI:
35899 nargs = 2;
35900 nargs_constant = 1;
35901 break;
35902 case V16QI_FTYPE_V16QI_V16QI_V16QI:
35903 case V8SF_FTYPE_V8SF_V8SF_V8SF:
35904 case V4DF_FTYPE_V4DF_V4DF_V4DF:
35905 case V4SF_FTYPE_V4SF_V4SF_V4SF:
35906 case V2DF_FTYPE_V2DF_V2DF_V2DF:
35907 case V32QI_FTYPE_V32QI_V32QI_V32QI:
35908 case UHI_FTYPE_V16SI_V16SI_UHI:
35909 case UQI_FTYPE_V8DI_V8DI_UQI:
35910 case V16HI_FTYPE_V16SI_V16HI_UHI:
35911 case V16QI_FTYPE_V16SI_V16QI_UHI:
35912 case V16QI_FTYPE_V8DI_V16QI_UQI:
35913 case V16SF_FTYPE_V16SF_V16SF_UHI:
35914 case V16SF_FTYPE_V4SF_V16SF_UHI:
35915 case V16SI_FTYPE_SI_V16SI_UHI:
35916 case V16SI_FTYPE_V16HI_V16SI_UHI:
35917 case V16SI_FTYPE_V16QI_V16SI_UHI:
35918 case V8SF_FTYPE_V4SF_V8SF_UQI:
35919 case V4DF_FTYPE_V2DF_V4DF_UQI:
35920 case V8SI_FTYPE_V4SI_V8SI_UQI:
35921 case V8SI_FTYPE_SI_V8SI_UQI:
35922 case V4SI_FTYPE_V4SI_V4SI_UQI:
35923 case V4SI_FTYPE_SI_V4SI_UQI:
35924 case V4DI_FTYPE_V2DI_V4DI_UQI:
35925 case V4DI_FTYPE_DI_V4DI_UQI:
35926 case V2DI_FTYPE_V2DI_V2DI_UQI:
35927 case V2DI_FTYPE_DI_V2DI_UQI:
35928 case V64QI_FTYPE_V64QI_V64QI_UDI:
35929 case V64QI_FTYPE_V16QI_V64QI_UDI:
35930 case V64QI_FTYPE_QI_V64QI_UDI:
35931 case V32QI_FTYPE_V32QI_V32QI_USI:
35932 case V32QI_FTYPE_V16QI_V32QI_USI:
35933 case V32QI_FTYPE_QI_V32QI_USI:
35934 case V16QI_FTYPE_V16QI_V16QI_UHI:
35935 case V16QI_FTYPE_QI_V16QI_UHI:
35936 case V32HI_FTYPE_V8HI_V32HI_USI:
35937 case V32HI_FTYPE_HI_V32HI_USI:
35938 case V16HI_FTYPE_V8HI_V16HI_UHI:
35939 case V16HI_FTYPE_HI_V16HI_UHI:
35940 case V8HI_FTYPE_V8HI_V8HI_UQI:
35941 case V8HI_FTYPE_HI_V8HI_UQI:
35942 case V8SF_FTYPE_V8HI_V8SF_UQI:
35943 case V4SF_FTYPE_V8HI_V4SF_UQI:
35944 case V8SI_FTYPE_V8SF_V8SI_UQI:
35945 case V4SI_FTYPE_V4SF_V4SI_UQI:
35946 case V4DI_FTYPE_V4SF_V4DI_UQI:
35947 case V2DI_FTYPE_V4SF_V2DI_UQI:
35948 case V4SF_FTYPE_V4DI_V4SF_UQI:
35949 case V4SF_FTYPE_V2DI_V4SF_UQI:
35950 case V4DF_FTYPE_V4DI_V4DF_UQI:
35951 case V2DF_FTYPE_V2DI_V2DF_UQI:
35952 case V16QI_FTYPE_V8HI_V16QI_UQI:
35953 case V16QI_FTYPE_V16HI_V16QI_UHI:
35954 case V16QI_FTYPE_V4SI_V16QI_UQI:
35955 case V16QI_FTYPE_V8SI_V16QI_UQI:
35956 case V8HI_FTYPE_V4SI_V8HI_UQI:
35957 case V8HI_FTYPE_V8SI_V8HI_UQI:
35958 case V16QI_FTYPE_V2DI_V16QI_UQI:
35959 case V16QI_FTYPE_V4DI_V16QI_UQI:
35960 case V8HI_FTYPE_V2DI_V8HI_UQI:
35961 case V8HI_FTYPE_V4DI_V8HI_UQI:
35962 case V4SI_FTYPE_V2DI_V4SI_UQI:
35963 case V4SI_FTYPE_V4DI_V4SI_UQI:
35964 case V32QI_FTYPE_V32HI_V32QI_USI:
35965 case UHI_FTYPE_V16QI_V16QI_UHI:
35966 case USI_FTYPE_V32QI_V32QI_USI:
35967 case UDI_FTYPE_V64QI_V64QI_UDI:
35968 case UQI_FTYPE_V8HI_V8HI_UQI:
35969 case UHI_FTYPE_V16HI_V16HI_UHI:
35970 case USI_FTYPE_V32HI_V32HI_USI:
35971 case UQI_FTYPE_V4SI_V4SI_UQI:
35972 case UQI_FTYPE_V8SI_V8SI_UQI:
35973 case UQI_FTYPE_V2DI_V2DI_UQI:
35974 case UQI_FTYPE_V4DI_V4DI_UQI:
35975 case V4SF_FTYPE_V2DF_V4SF_UQI:
35976 case V4SF_FTYPE_V4DF_V4SF_UQI:
35977 case V16SI_FTYPE_V16SI_V16SI_UHI:
35978 case V16SI_FTYPE_V4SI_V16SI_UHI:
35979 case V2DI_FTYPE_V4SI_V2DI_UQI:
35980 case V2DI_FTYPE_V8HI_V2DI_UQI:
35981 case V2DI_FTYPE_V16QI_V2DI_UQI:
35982 case V4DI_FTYPE_V4DI_V4DI_UQI:
35983 case V4DI_FTYPE_V4SI_V4DI_UQI:
35984 case V4DI_FTYPE_V8HI_V4DI_UQI:
35985 case V4DI_FTYPE_V16QI_V4DI_UQI:
35986 case V4DI_FTYPE_V4DF_V4DI_UQI:
35987 case V2DI_FTYPE_V2DF_V2DI_UQI:
35988 case V4SI_FTYPE_V4DF_V4SI_UQI:
35989 case V4SI_FTYPE_V2DF_V4SI_UQI:
35990 case V4SI_FTYPE_V8HI_V4SI_UQI:
35991 case V4SI_FTYPE_V16QI_V4SI_UQI:
35992 case V4DI_FTYPE_V4DI_V4DI_V4DI:
35993 case V8DF_FTYPE_V2DF_V8DF_UQI:
35994 case V8DF_FTYPE_V4DF_V8DF_UQI:
35995 case V8DF_FTYPE_V8DF_V8DF_UQI:
35996 case V8SF_FTYPE_V8SF_V8SF_UQI:
35997 case V8SF_FTYPE_V8SI_V8SF_UQI:
35998 case V4DF_FTYPE_V4DF_V4DF_UQI:
35999 case V4SF_FTYPE_V4SF_V4SF_UQI:
36000 case V2DF_FTYPE_V2DF_V2DF_UQI:
36001 case V2DF_FTYPE_V4SF_V2DF_UQI:
36002 case V2DF_FTYPE_V4SI_V2DF_UQI:
36003 case V4SF_FTYPE_V4SI_V4SF_UQI:
36004 case V4DF_FTYPE_V4SF_V4DF_UQI:
36005 case V4DF_FTYPE_V4SI_V4DF_UQI:
36006 case V8SI_FTYPE_V8SI_V8SI_UQI:
36007 case V8SI_FTYPE_V8HI_V8SI_UQI:
36008 case V8SI_FTYPE_V16QI_V8SI_UQI:
36009 case V8DF_FTYPE_V8SI_V8DF_UQI:
36010 case V8DI_FTYPE_DI_V8DI_UQI:
36011 case V16SF_FTYPE_V8SF_V16SF_UHI:
36012 case V16SI_FTYPE_V8SI_V16SI_UHI:
36013 case V16HI_FTYPE_V16HI_V16HI_UHI:
36014 case V8HI_FTYPE_V16QI_V8HI_UQI:
36015 case V16HI_FTYPE_V16QI_V16HI_UHI:
36016 case V32HI_FTYPE_V32HI_V32HI_USI:
36017 case V32HI_FTYPE_V32QI_V32HI_USI:
36018 case V8DI_FTYPE_V16QI_V8DI_UQI:
36019 case V8DI_FTYPE_V2DI_V8DI_UQI:
36020 case V8DI_FTYPE_V4DI_V8DI_UQI:
36021 case V8DI_FTYPE_V8DI_V8DI_UQI:
36022 case V8DI_FTYPE_V8HI_V8DI_UQI:
36023 case V8DI_FTYPE_V8SI_V8DI_UQI:
36024 case V8HI_FTYPE_V8DI_V8HI_UQI:
36025 case V8SI_FTYPE_V8DI_V8SI_UQI:
36026 case V4SI_FTYPE_V4SI_V4SI_V4SI:
36027 nargs = 3;
36028 break;
36029 case V32QI_FTYPE_V32QI_V32QI_INT:
36030 case V16HI_FTYPE_V16HI_V16HI_INT:
36031 case V16QI_FTYPE_V16QI_V16QI_INT:
36032 case V4DI_FTYPE_V4DI_V4DI_INT:
36033 case V8HI_FTYPE_V8HI_V8HI_INT:
36034 case V8SI_FTYPE_V8SI_V8SI_INT:
36035 case V8SI_FTYPE_V8SI_V4SI_INT:
36036 case V8SF_FTYPE_V8SF_V8SF_INT:
36037 case V8SF_FTYPE_V8SF_V4SF_INT:
36038 case V4SI_FTYPE_V4SI_V4SI_INT:
36039 case V4DF_FTYPE_V4DF_V4DF_INT:
36040 case V16SF_FTYPE_V16SF_V16SF_INT:
36041 case V16SF_FTYPE_V16SF_V4SF_INT:
36042 case V16SI_FTYPE_V16SI_V4SI_INT:
36043 case V4DF_FTYPE_V4DF_V2DF_INT:
36044 case V4SF_FTYPE_V4SF_V4SF_INT:
36045 case V2DI_FTYPE_V2DI_V2DI_INT:
36046 case V4DI_FTYPE_V4DI_V2DI_INT:
36047 case V2DF_FTYPE_V2DF_V2DF_INT:
36048 case UQI_FTYPE_V8DI_V8UDI_INT:
36049 case UQI_FTYPE_V8DF_V8DF_INT:
36050 case UQI_FTYPE_V2DF_V2DF_INT:
36051 case UQI_FTYPE_V4SF_V4SF_INT:
36052 case UHI_FTYPE_V16SI_V16SI_INT:
36053 case UHI_FTYPE_V16SF_V16SF_INT:
36054 nargs = 3;
36055 nargs_constant = 1;
36056 break;
36057 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
36058 nargs = 3;
36059 rmode = V4DImode;
36060 nargs_constant = 1;
36061 break;
36062 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
36063 nargs = 3;
36064 rmode = V2DImode;
36065 nargs_constant = 1;
36066 break;
36067 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
36068 nargs = 3;
36069 rmode = DImode;
36070 nargs_constant = 1;
36071 break;
36072 case V2DI_FTYPE_V2DI_UINT_UINT:
36073 nargs = 3;
36074 nargs_constant = 2;
36075 break;
36076 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
36077 nargs = 3;
36078 rmode = V8DImode;
36079 nargs_constant = 1;
36080 break;
36081 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
36082 nargs = 5;
36083 rmode = V8DImode;
36084 mask_pos = 2;
36085 nargs_constant = 1;
36086 break;
36087 case QI_FTYPE_V8DF_INT_UQI:
36088 case QI_FTYPE_V4DF_INT_UQI:
36089 case QI_FTYPE_V2DF_INT_UQI:
36090 case HI_FTYPE_V16SF_INT_UHI:
36091 case QI_FTYPE_V8SF_INT_UQI:
36092 case QI_FTYPE_V4SF_INT_UQI:
36093 nargs = 3;
36094 mask_pos = 1;
36095 nargs_constant = 1;
36096 break;
36097 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
36098 nargs = 5;
36099 rmode = V4DImode;
36100 mask_pos = 2;
36101 nargs_constant = 1;
36102 break;
36103 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
36104 nargs = 5;
36105 rmode = V2DImode;
36106 mask_pos = 2;
36107 nargs_constant = 1;
36108 break;
36109 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
36110 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
36111 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
36112 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
36113 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
36114 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
36115 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
36116 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
36117 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
36118 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
36119 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
36120 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
36121 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
36122 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
36123 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
36124 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
36125 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
36126 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
36127 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
36128 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
36129 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
36130 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
36131 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
36132 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
36133 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
36134 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
36135 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
36136 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
36137 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
36138 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
36139 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
36140 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
36141 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
36142 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
36143 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
36144 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
36145 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
36146 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
36147 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
36148 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
36149 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
36150 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
36151 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
36152 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
36153 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
36154 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
36155 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
36156 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
36157 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
36158 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
36159 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
36160 nargs = 4;
36161 break;
36162 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
36163 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
36164 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
36165 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
36166 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
36167 nargs = 4;
36168 nargs_constant = 1;
36169 break;
36170 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
36171 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
36172 case QI_FTYPE_V4DF_V4DF_INT_UQI:
36173 case QI_FTYPE_V8SF_V8SF_INT_UQI:
36174 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
36175 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
36176 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
36177 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
36178 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
36179 case USI_FTYPE_V32QI_V32QI_INT_USI:
36180 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
36181 case USI_FTYPE_V32HI_V32HI_INT_USI:
36182 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
36183 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
36184 nargs = 4;
36185 mask_pos = 1;
36186 nargs_constant = 1;
36187 break;
36188 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
36189 nargs = 4;
36190 nargs_constant = 2;
36191 break;
36192 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
36193 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
36194 nargs = 4;
36195 break;
36196 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
36197 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
36198 mask_pos = 1;
36199 nargs = 4;
36200 nargs_constant = 1;
36201 break;
36202 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
36203 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
36204 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
36205 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
36206 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
36207 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
36208 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
36209 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
36210 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
36211 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
36212 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
36213 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
36214 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
36215 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
36216 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
36217 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
36218 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
36219 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
36220 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
36221 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
36222 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
36223 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
36224 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
36225 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
36226 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
36227 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
36228 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
36229 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
36230 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
36231 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
36232 nargs = 4;
36233 mask_pos = 2;
36234 nargs_constant = 1;
36235 break;
36236 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
36237 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
36238 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
36239 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
36240 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
36241 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
36242 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
36243 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
36244 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
36245 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
36246 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
36247 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
36248 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
36249 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
36250 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
36251 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
36252 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
36253 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
36254 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
36255 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
36256 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
36257 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
36258 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
36259 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
36260 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
36261 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
36262 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
36263 nargs = 5;
36264 mask_pos = 2;
36265 nargs_constant = 1;
36266 break;
36267 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
36268 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
36269 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
36270 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
36271 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
36272 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
36273 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
36274 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
36275 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
36276 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
36277 nargs = 5;
36278 mask_pos = 1;
36279 nargs_constant = 1;
36280 break;
36282 default:
36283 gcc_unreachable ();
36286 gcc_assert (nargs <= ARRAY_SIZE (args));
36288 if (comparison != UNKNOWN)
36290 gcc_assert (nargs == 2);
36291 return ix86_expand_sse_compare (d, exp, target, swap);
36294 if (rmode == VOIDmode || rmode == tmode)
36296 if (optimize
36297 || target == 0
36298 || GET_MODE (target) != tmode
36299 || !insn_p->operand[0].predicate (target, tmode))
36300 target = gen_reg_rtx (tmode);
36301 else if (memory_operand (target, tmode))
36302 num_memory++;
36303 real_target = target;
36305 else
36307 real_target = gen_reg_rtx (tmode);
36308 target = lowpart_subreg (rmode, real_target, tmode);
36311 for (i = 0; i < nargs; i++)
36313 tree arg = CALL_EXPR_ARG (exp, i);
36314 rtx op = expand_normal (arg);
36315 machine_mode mode = insn_p->operand[i + 1].mode;
36316 bool match = insn_p->operand[i + 1].predicate (op, mode);
36318 if (second_arg_count && i == 1)
36320 /* SIMD shift insns take either an 8-bit immediate or a
36321 register as the count. But the builtin functions take an
36322 int as the count. If the count doesn't match, put it in a
36323 register. The instructions use a 64-bit count; if op is
36324 only 32 bits wide, zero-extend it, since negative shift
36325 counts are undefined behavior and zero extension is more
36326 efficient. */
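/* A sketch of the intrinsic-level view (assumed; see emmintrin.h for the
   exact definitions): _mm_slli_epi32 (v, n) passes n as a plain int to
   __builtin_ia32_pslldi128, so when n is not an immediate it arrives here
   as a 32-bit register and is widened to the count mode of the shift
   insn.  */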
36327 if (!match)
36329 if (SCALAR_INT_MODE_P (GET_MODE (op)))
36330 op = convert_modes (mode, GET_MODE (op), op, 1);
36331 else
36332 op = lowpart_subreg (mode, op, GET_MODE (op));
36333 if (!insn_p->operand[i + 1].predicate (op, mode))
36334 op = copy_to_reg (op);
36337 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
36338 || (!mask_pos && (nargs - i) <= nargs_constant))
36340 if (!match)
36341 switch (icode)
36343 case CODE_FOR_avx_vinsertf128v4di:
36344 case CODE_FOR_avx_vextractf128v4di:
36345 error ("the last argument must be a 1-bit immediate");
36346 return const0_rtx;
36348 case CODE_FOR_avx512f_cmpv8di3_mask:
36349 case CODE_FOR_avx512f_cmpv16si3_mask:
36350 case CODE_FOR_avx512f_ucmpv8di3_mask:
36351 case CODE_FOR_avx512f_ucmpv16si3_mask:
36352 case CODE_FOR_avx512vl_cmpv4di3_mask:
36353 case CODE_FOR_avx512vl_cmpv8si3_mask:
36354 case CODE_FOR_avx512vl_ucmpv4di3_mask:
36355 case CODE_FOR_avx512vl_ucmpv8si3_mask:
36356 case CODE_FOR_avx512vl_cmpv2di3_mask:
36357 case CODE_FOR_avx512vl_cmpv4si3_mask:
36358 case CODE_FOR_avx512vl_ucmpv2di3_mask:
36359 case CODE_FOR_avx512vl_ucmpv4si3_mask:
36360 error ("the last argument must be a 3-bit immediate");
36361 return const0_rtx;
36363 case CODE_FOR_sse4_1_roundsd:
36364 case CODE_FOR_sse4_1_roundss:
36366 case CODE_FOR_sse4_1_roundpd:
36367 case CODE_FOR_sse4_1_roundps:
36368 case CODE_FOR_avx_roundpd256:
36369 case CODE_FOR_avx_roundps256:
36371 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
36372 case CODE_FOR_sse4_1_roundps_sfix:
36373 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
36374 case CODE_FOR_avx_roundps_sfix256:
36376 case CODE_FOR_sse4_1_blendps:
36377 case CODE_FOR_avx_blendpd256:
36378 case CODE_FOR_avx_vpermilv4df:
36379 case CODE_FOR_avx_vpermilv4df_mask:
36380 case CODE_FOR_avx512f_getmantv8df_mask:
36381 case CODE_FOR_avx512f_getmantv16sf_mask:
36382 case CODE_FOR_avx512vl_getmantv8sf_mask:
36383 case CODE_FOR_avx512vl_getmantv4df_mask:
36384 case CODE_FOR_avx512vl_getmantv4sf_mask:
36385 case CODE_FOR_avx512vl_getmantv2df_mask:
36386 case CODE_FOR_avx512dq_rangepv8df_mask_round:
36387 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
36388 case CODE_FOR_avx512dq_rangepv4df_mask:
36389 case CODE_FOR_avx512dq_rangepv8sf_mask:
36390 case CODE_FOR_avx512dq_rangepv2df_mask:
36391 case CODE_FOR_avx512dq_rangepv4sf_mask:
36392 case CODE_FOR_avx_shufpd256_mask:
36393 error ("the last argument must be a 4-bit immediate");
36394 return const0_rtx;
36396 case CODE_FOR_sha1rnds4:
36397 case CODE_FOR_sse4_1_blendpd:
36398 case CODE_FOR_avx_vpermilv2df:
36399 case CODE_FOR_avx_vpermilv2df_mask:
36400 case CODE_FOR_xop_vpermil2v2df3:
36401 case CODE_FOR_xop_vpermil2v4sf3:
36402 case CODE_FOR_xop_vpermil2v4df3:
36403 case CODE_FOR_xop_vpermil2v8sf3:
36404 case CODE_FOR_avx512f_vinsertf32x4_mask:
36405 case CODE_FOR_avx512f_vinserti32x4_mask:
36406 case CODE_FOR_avx512f_vextractf32x4_mask:
36407 case CODE_FOR_avx512f_vextracti32x4_mask:
36408 case CODE_FOR_sse2_shufpd:
36409 case CODE_FOR_sse2_shufpd_mask:
36410 case CODE_FOR_avx512dq_shuf_f64x2_mask:
36411 case CODE_FOR_avx512dq_shuf_i64x2_mask:
36412 case CODE_FOR_avx512vl_shuf_i32x4_mask:
36413 case CODE_FOR_avx512vl_shuf_f32x4_mask:
36414 error ("the last argument must be a 2-bit immediate");
36415 return const0_rtx;
36417 case CODE_FOR_avx_vextractf128v4df:
36418 case CODE_FOR_avx_vextractf128v8sf:
36419 case CODE_FOR_avx_vextractf128v8si:
36420 case CODE_FOR_avx_vinsertf128v4df:
36421 case CODE_FOR_avx_vinsertf128v8sf:
36422 case CODE_FOR_avx_vinsertf128v8si:
36423 case CODE_FOR_avx512f_vinsertf64x4_mask:
36424 case CODE_FOR_avx512f_vinserti64x4_mask:
36425 case CODE_FOR_avx512f_vextractf64x4_mask:
36426 case CODE_FOR_avx512f_vextracti64x4_mask:
36427 case CODE_FOR_avx512dq_vinsertf32x8_mask:
36428 case CODE_FOR_avx512dq_vinserti32x8_mask:
36429 case CODE_FOR_avx512vl_vinsertv4df:
36430 case CODE_FOR_avx512vl_vinsertv4di:
36431 case CODE_FOR_avx512vl_vinsertv8sf:
36432 case CODE_FOR_avx512vl_vinsertv8si:
36433 error ("the last argument must be a 1-bit immediate");
36434 return const0_rtx;
36436 case CODE_FOR_avx_vmcmpv2df3:
36437 case CODE_FOR_avx_vmcmpv4sf3:
36438 case CODE_FOR_avx_cmpv2df3:
36439 case CODE_FOR_avx_cmpv4sf3:
36440 case CODE_FOR_avx_cmpv4df3:
36441 case CODE_FOR_avx_cmpv8sf3:
36442 case CODE_FOR_avx512f_cmpv8df3_mask:
36443 case CODE_FOR_avx512f_cmpv16sf3_mask:
36444 case CODE_FOR_avx512f_vmcmpv2df3_mask:
36445 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
36446 error ("the last argument must be a 5-bit immediate");
36447 return const0_rtx;
36449 default:
36450 switch (nargs_constant)
36452 case 2:
36453 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
36454 || (!mask_pos && (nargs - i) == nargs_constant))
36456 error ("the next to last argument must be an 8-bit immediate");
36457 break;
36459 /* FALLTHRU */
36460 case 1:
36461 error ("the last argument must be an 8-bit immediate");
36462 break;
36463 default:
36464 gcc_unreachable ();
36466 return const0_rtx;
36469 else
36471 if (VECTOR_MODE_P (mode))
36472 op = safe_vector_operand (op, mode);
36474 /* If we aren't optimizing, only allow one memory operand to
36475 be generated. */
36476 if (memory_operand (op, mode))
36477 num_memory++;
36479 op = fixup_modeless_constant (op, mode);
36481 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36483 if (optimize || !match || num_memory > 1)
36484 op = copy_to_mode_reg (mode, op);
36486 else
36488 op = copy_to_reg (op);
36489 op = lowpart_subreg (mode, op, GET_MODE (op));
36493 args[i].op = op;
36494 args[i].mode = mode;
36497 switch (nargs)
36499 case 1:
36500 pat = GEN_FCN (icode) (real_target, args[0].op);
36501 break;
36502 case 2:
36503 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
36504 break;
36505 case 3:
36506 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36507 args[2].op);
36508 break;
36509 case 4:
36510 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36511 args[2].op, args[3].op);
36512 break;
36513 case 5:
36514 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36515 args[2].op, args[3].op, args[4].op);
36516 break;
36517 case 6:
36518 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36519 args[2].op, args[3].op, args[4].op,
36520 args[5].op);
36521 break;
36522 default:
36523 gcc_unreachable ();
36526 if (! pat)
36527 return 0;
36529 emit_insn (pat);
36530 return target;
36533 /* Transform a pattern of the following layout:
36534 (set A
36535 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
36537 into:
36538 (set A B)  */
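/* A concrete sketch (assumed RTL shapes, not lifted from the md files):

     (set (reg:V2DF r)
          (unspec:V2DF [(plus:V2DF (reg:V2DF a) (reg:V2DF b))
                        (const_int 8)]
                       UNSPEC_EMBEDDED_ROUNDING))

   becomes

     (set (reg:V2DF r) (plus:V2DF (reg:V2DF a) (reg:V2DF b)))

   i.e. the rounding-control operand C is dropped together with the
   unspec wrapper.  */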
36540 static rtx
36541 ix86_erase_embedded_rounding (rtx pat)
36543 if (GET_CODE (pat) == INSN)
36544 pat = PATTERN (pat);
36546 gcc_assert (GET_CODE (pat) == SET);
36547 rtx src = SET_SRC (pat);
36548 gcc_assert (XVECLEN (src, 0) == 2);
36549 rtx p0 = XVECEXP (src, 0, 0);
36550 gcc_assert (GET_CODE (src) == UNSPEC
36551 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
36552 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
36553 return res;
36556 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
36557 with rounding. */
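/* These are assumed to correspond to the AVX-512 comi intrinsics with
   explicit SAE control, e.g. roughly

     int _mm_comi_round_ss (__m128 a, __m128 b, int predicate, int sae);

   where the predicate selects an entry from the tables below and the sae
   operand is validated against the insn's rounding predicate.  */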
36558 static rtx
36559 ix86_expand_sse_comi_round (const struct builtin_description *d,
36560 tree exp, rtx target)
36562 rtx pat, set_dst;
36563 tree arg0 = CALL_EXPR_ARG (exp, 0);
36564 tree arg1 = CALL_EXPR_ARG (exp, 1);
36565 tree arg2 = CALL_EXPR_ARG (exp, 2);
36566 tree arg3 = CALL_EXPR_ARG (exp, 3);
36567 rtx op0 = expand_normal (arg0);
36568 rtx op1 = expand_normal (arg1);
36569 rtx op2 = expand_normal (arg2);
36570 rtx op3 = expand_normal (arg3);
36571 enum insn_code icode = d->icode;
36572 const struct insn_data_d *insn_p = &insn_data[icode];
36573 machine_mode mode0 = insn_p->operand[0].mode;
36574 machine_mode mode1 = insn_p->operand[1].mode;
36575 enum rtx_code comparison = UNEQ;
36576 bool need_ucomi = false;
36578 /* See avxintrin.h for values. */
36579 enum rtx_code comi_comparisons[32] =
36581 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
36582 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
36583 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
36585 bool need_ucomi_values[32] =
36587 true, false, false, true, true, false, false, true,
36588 true, false, false, true, true, false, false, true,
36589 false, true, true, false, false, true, true, false,
36590 false, true, true, false, false, true, true, false
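  /* For orientation (values as documented for the _CMP_* macros in
     avxintrin.h): predicate 0 (_CMP_EQ_OQ) maps to UNEQ using the
     unordered ucomi form, while predicate 14 (_CMP_GT_OS) maps to GT
     using the ordered comi form, matching the two tables above.  */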
36593 if (!CONST_INT_P (op2))
36595 error ("the third argument must be a comparison constant");
36596 return const0_rtx;
36598 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
36600 error ("incorrect comparison mode");
36601 return const0_rtx;
36604 if (!insn_p->operand[2].predicate (op3, SImode))
36606 error ("incorrect rounding operand");
36607 return const0_rtx;
36610 comparison = comi_comparisons[INTVAL (op2)];
36611 need_ucomi = need_ucomi_values[INTVAL (op2)];
36613 if (VECTOR_MODE_P (mode0))
36614 op0 = safe_vector_operand (op0, mode0);
36615 if (VECTOR_MODE_P (mode1))
36616 op1 = safe_vector_operand (op1, mode1);
36618 target = gen_reg_rtx (SImode);
36619 emit_move_insn (target, const0_rtx);
36620 target = gen_rtx_SUBREG (QImode, target, 0);
36622 if ((optimize && !register_operand (op0, mode0))
36623 || !insn_p->operand[0].predicate (op0, mode0))
36624 op0 = copy_to_mode_reg (mode0, op0);
36625 if ((optimize && !register_operand (op1, mode1))
36626 || !insn_p->operand[1].predicate (op1, mode1))
36627 op1 = copy_to_mode_reg (mode1, op1);
36629 if (need_ucomi)
36630 icode = icode == CODE_FOR_sse_comi_round
36631 ? CODE_FOR_sse_ucomi_round
36632 : CODE_FOR_sse2_ucomi_round;
36634 pat = GEN_FCN (icode) (op0, op1, op3);
36635 if (! pat)
36636 return 0;
36638 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
36639 if (INTVAL (op3) == NO_ROUND)
36641 pat = ix86_erase_embedded_rounding (pat);
36642 if (! pat)
36643 return 0;
36645 set_dst = SET_DEST (pat);
36647 else
36649 gcc_assert (GET_CODE (pat) == SET);
36650 set_dst = SET_DEST (pat);
36653 emit_insn (pat);
36654 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
36655 gen_rtx_fmt_ee (comparison, QImode,
36656 set_dst,
36657 const0_rtx)));
36659 return SUBREG_REG (target);
36662 static rtx
36663 ix86_expand_round_builtin (const struct builtin_description *d,
36664 tree exp, rtx target)
36666 rtx pat;
36667 unsigned int i, nargs;
36668 struct
36670 rtx op;
36671 machine_mode mode;
36672 } args[6];
36673 enum insn_code icode = d->icode;
36674 const struct insn_data_d *insn_p = &insn_data[icode];
36675 machine_mode tmode = insn_p->operand[0].mode;
36676 unsigned int nargs_constant = 0;
36677 unsigned int redundant_embed_rnd = 0;
36679 switch ((enum ix86_builtin_func_type) d->flag)
36681 case UINT64_FTYPE_V2DF_INT:
36682 case UINT64_FTYPE_V4SF_INT:
36683 case UINT_FTYPE_V2DF_INT:
36684 case UINT_FTYPE_V4SF_INT:
36685 case INT64_FTYPE_V2DF_INT:
36686 case INT64_FTYPE_V4SF_INT:
36687 case INT_FTYPE_V2DF_INT:
36688 case INT_FTYPE_V4SF_INT:
36689 nargs = 2;
36690 break;
36691 case V4SF_FTYPE_V4SF_UINT_INT:
36692 case V4SF_FTYPE_V4SF_UINT64_INT:
36693 case V2DF_FTYPE_V2DF_UINT64_INT:
36694 case V4SF_FTYPE_V4SF_INT_INT:
36695 case V4SF_FTYPE_V4SF_INT64_INT:
36696 case V2DF_FTYPE_V2DF_INT64_INT:
36697 case V4SF_FTYPE_V4SF_V4SF_INT:
36698 case V2DF_FTYPE_V2DF_V2DF_INT:
36699 case V4SF_FTYPE_V4SF_V2DF_INT:
36700 case V2DF_FTYPE_V2DF_V4SF_INT:
36701 nargs = 3;
36702 break;
36703 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
36704 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
36705 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
36706 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
36707 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
36708 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
36709 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
36710 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
36711 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
36712 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
36713 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
36714 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
36715 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
36716 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
36717 nargs = 4;
36718 break;
36719 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
36720 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
36721 nargs_constant = 2;
36722 nargs = 4;
36723 break;
36724 case INT_FTYPE_V4SF_V4SF_INT_INT:
36725 case INT_FTYPE_V2DF_V2DF_INT_INT:
36726 return ix86_expand_sse_comi_round (d, exp, target);
36727 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
36728 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
36729 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
36730 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
36731 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
36732 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
36733 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
36734 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
36735 nargs = 5;
36736 break;
36737 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
36738 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
36739 nargs_constant = 4;
36740 nargs = 5;
36741 break;
36742 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
36743 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
36744 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
36745 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
36746 nargs_constant = 3;
36747 nargs = 5;
36748 break;
36749 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
36750 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
36751 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
36752 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
36753 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
36754 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
36755 nargs = 6;
36756 nargs_constant = 4;
36757 break;
36758 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
36759 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
36760 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
36761 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
36762 nargs = 6;
36763 nargs_constant = 3;
36764 break;
36765 default:
36766 gcc_unreachable ();
36768 gcc_assert (nargs <= ARRAY_SIZE (args));
36770 if (optimize
36771 || target == 0
36772 || GET_MODE (target) != tmode
36773 || !insn_p->operand[0].predicate (target, tmode))
36774 target = gen_reg_rtx (tmode);
36776 for (i = 0; i < nargs; i++)
36778 tree arg = CALL_EXPR_ARG (exp, i);
36779 rtx op = expand_normal (arg);
36780 machine_mode mode = insn_p->operand[i + 1].mode;
36781 bool match = insn_p->operand[i + 1].predicate (op, mode);
36783 if (i == nargs - nargs_constant)
36785 if (!match)
36787 switch (icode)
36789 case CODE_FOR_avx512f_getmantv8df_mask_round:
36790 case CODE_FOR_avx512f_getmantv16sf_mask_round:
36791 case CODE_FOR_avx512f_vgetmantv2df_round:
36792 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
36793 case CODE_FOR_avx512f_vgetmantv4sf_round:
36794 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
36795 error ("the immediate argument must be a 4-bit immediate");
36796 return const0_rtx;
36797 case CODE_FOR_avx512f_cmpv8df3_mask_round:
36798 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
36799 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
36800 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
36801 error ("the immediate argument must be a 5-bit immediate");
36802 return const0_rtx;
36803 default:
36804 error ("the immediate argument must be an 8-bit immediate");
36805 return const0_rtx;
36809 else if (i == nargs - 1)
36811 if (!insn_p->operand[nargs].predicate (op, SImode))
36813 error ("incorrect rounding operand");
36814 return const0_rtx;
36817 /* If there is no rounding, use the normal version of the pattern. */
36818 if (INTVAL (op) == NO_ROUND)
36819 redundant_embed_rnd = 1;
36821 else
36823 if (VECTOR_MODE_P (mode))
36824 op = safe_vector_operand (op, mode);
36826 op = fixup_modeless_constant (op, mode);
36828 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36830 if (optimize || !match)
36831 op = copy_to_mode_reg (mode, op);
36833 else
36835 op = copy_to_reg (op);
36836 op = lowpart_subreg (mode, op, GET_MODE (op));
36840 args[i].op = op;
36841 args[i].mode = mode;
36844 switch (nargs)
36846 case 1:
36847 pat = GEN_FCN (icode) (target, args[0].op);
36848 break;
36849 case 2:
36850 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36851 break;
36852 case 3:
36853 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36854 args[2].op);
36855 break;
36856 case 4:
36857 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36858 args[2].op, args[3].op);
36859 break;
36860 case 5:
36861 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36862 args[2].op, args[3].op, args[4].op);
36863 break;
36864 case 6:
36865 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36866 args[2].op, args[3].op, args[4].op,
36867 args[5].op);
36868 break;
36869 default:
36870 gcc_unreachable ();
36873 if (!pat)
36874 return 0;
36876 if (redundant_embed_rnd)
36877 pat = ix86_erase_embedded_rounding (pat);
36879 emit_insn (pat);
36880 return target;
36883 /* Subroutine of ix86_expand_builtin to take care of special insns
36884 with a variable number of operands. */
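/* Illustrative note (an assumed mapping; see avxintrin.h for the real
   prototype): a non-temporal store such as

     void _mm256_stream_ps (float *p, __m256 a);

   arrives here with type VOID_FTYPE_PFLOAT_V8SF and an avx_movntv8sf-style
   icode, so klass becomes store, the pointer argument is turned into the
   MEM target, and aligned_mem forces the full mode alignment onto it.  */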
36886 static rtx
36887 ix86_expand_special_args_builtin (const struct builtin_description *d,
36888 tree exp, rtx target)
36890 tree arg;
36891 rtx pat, op;
36892 unsigned int i, nargs, arg_adjust, memory;
36893 bool aligned_mem = false;
36894 struct
36896 rtx op;
36897 machine_mode mode;
36898 } args[3];
36899 enum insn_code icode = d->icode;
36900 bool last_arg_constant = false;
36901 const struct insn_data_d *insn_p = &insn_data[icode];
36902 machine_mode tmode = insn_p->operand[0].mode;
36903 enum { load, store } klass;
36905 switch ((enum ix86_builtin_func_type) d->flag)
36907 case VOID_FTYPE_VOID:
36908 emit_insn (GEN_FCN (icode) (target));
36909 return 0;
36910 case VOID_FTYPE_UINT64:
36911 case VOID_FTYPE_UNSIGNED:
36912 nargs = 0;
36913 klass = store;
36914 memory = 0;
36915 break;
36917 case INT_FTYPE_VOID:
36918 case USHORT_FTYPE_VOID:
36919 case UINT64_FTYPE_VOID:
36920 case UNSIGNED_FTYPE_VOID:
36921 nargs = 0;
36922 klass = load;
36923 memory = 0;
36924 break;
36925 case UINT64_FTYPE_PUNSIGNED:
36926 case V2DI_FTYPE_PV2DI:
36927 case V4DI_FTYPE_PV4DI:
36928 case V32QI_FTYPE_PCCHAR:
36929 case V16QI_FTYPE_PCCHAR:
36930 case V8SF_FTYPE_PCV4SF:
36931 case V8SF_FTYPE_PCFLOAT:
36932 case V4SF_FTYPE_PCFLOAT:
36933 case V4DF_FTYPE_PCV2DF:
36934 case V4DF_FTYPE_PCDOUBLE:
36935 case V2DF_FTYPE_PCDOUBLE:
36936 case VOID_FTYPE_PVOID:
36937 case V8DI_FTYPE_PV8DI:
36938 nargs = 1;
36939 klass = load;
36940 memory = 0;
36941 switch (icode)
36943 case CODE_FOR_sse4_1_movntdqa:
36944 case CODE_FOR_avx2_movntdqa:
36945 case CODE_FOR_avx512f_movntdqa:
36946 aligned_mem = true;
36947 break;
36948 default:
36949 break;
36951 break;
36952 case VOID_FTYPE_PV2SF_V4SF:
36953 case VOID_FTYPE_PV8DI_V8DI:
36954 case VOID_FTYPE_PV4DI_V4DI:
36955 case VOID_FTYPE_PV2DI_V2DI:
36956 case VOID_FTYPE_PCHAR_V32QI:
36957 case VOID_FTYPE_PCHAR_V16QI:
36958 case VOID_FTYPE_PFLOAT_V16SF:
36959 case VOID_FTYPE_PFLOAT_V8SF:
36960 case VOID_FTYPE_PFLOAT_V4SF:
36961 case VOID_FTYPE_PDOUBLE_V8DF:
36962 case VOID_FTYPE_PDOUBLE_V4DF:
36963 case VOID_FTYPE_PDOUBLE_V2DF:
36964 case VOID_FTYPE_PLONGLONG_LONGLONG:
36965 case VOID_FTYPE_PULONGLONG_ULONGLONG:
36966 case VOID_FTYPE_PINT_INT:
36967 nargs = 1;
36968 klass = store;
36969 /* Reserve memory operand for target. */
36970 memory = ARRAY_SIZE (args);
36971 switch (icode)
36973 /* These builtins and instructions require the memory
36974 to be properly aligned. */
36975 case CODE_FOR_avx_movntv4di:
36976 case CODE_FOR_sse2_movntv2di:
36977 case CODE_FOR_avx_movntv8sf:
36978 case CODE_FOR_sse_movntv4sf:
36979 case CODE_FOR_sse4a_vmmovntv4sf:
36980 case CODE_FOR_avx_movntv4df:
36981 case CODE_FOR_sse2_movntv2df:
36982 case CODE_FOR_sse4a_vmmovntv2df:
36983 case CODE_FOR_sse2_movntidi:
36984 case CODE_FOR_sse_movntq:
36985 case CODE_FOR_sse2_movntisi:
36986 case CODE_FOR_avx512f_movntv16sf:
36987 case CODE_FOR_avx512f_movntv8df:
36988 case CODE_FOR_avx512f_movntv8di:
36989 aligned_mem = true;
36990 break;
36991 default:
36992 break;
36994 break;
36995 case V4SF_FTYPE_V4SF_PCV2SF:
36996 case V2DF_FTYPE_V2DF_PCDOUBLE:
36997 nargs = 2;
36998 klass = load;
36999 memory = 1;
37000 break;
37001 case V8SF_FTYPE_PCV8SF_V8SI:
37002 case V4DF_FTYPE_PCV4DF_V4DI:
37003 case V4SF_FTYPE_PCV4SF_V4SI:
37004 case V2DF_FTYPE_PCV2DF_V2DI:
37005 case V8SI_FTYPE_PCV8SI_V8SI:
37006 case V4DI_FTYPE_PCV4DI_V4DI:
37007 case V4SI_FTYPE_PCV4SI_V4SI:
37008 case V2DI_FTYPE_PCV2DI_V2DI:
37009 case VOID_FTYPE_INT_INT64:
37010 nargs = 2;
37011 klass = load;
37012 memory = 0;
37013 break;
37014 case VOID_FTYPE_PV8DF_V8DF_UQI:
37015 case VOID_FTYPE_PV4DF_V4DF_UQI:
37016 case VOID_FTYPE_PV2DF_V2DF_UQI:
37017 case VOID_FTYPE_PV16SF_V16SF_UHI:
37018 case VOID_FTYPE_PV8SF_V8SF_UQI:
37019 case VOID_FTYPE_PV4SF_V4SF_UQI:
37020 case VOID_FTYPE_PV8DI_V8DI_UQI:
37021 case VOID_FTYPE_PV4DI_V4DI_UQI:
37022 case VOID_FTYPE_PV2DI_V2DI_UQI:
37023 case VOID_FTYPE_PV16SI_V16SI_UHI:
37024 case VOID_FTYPE_PV8SI_V8SI_UQI:
37025 case VOID_FTYPE_PV4SI_V4SI_UQI:
37026 switch (icode)
37028 /* These builtins and instructions require the memory
37029 to be properly aligned. */
37030 case CODE_FOR_avx512f_storev16sf_mask:
37031 case CODE_FOR_avx512f_storev16si_mask:
37032 case CODE_FOR_avx512f_storev8df_mask:
37033 case CODE_FOR_avx512f_storev8di_mask:
37034 case CODE_FOR_avx512vl_storev8sf_mask:
37035 case CODE_FOR_avx512vl_storev8si_mask:
37036 case CODE_FOR_avx512vl_storev4df_mask:
37037 case CODE_FOR_avx512vl_storev4di_mask:
37038 case CODE_FOR_avx512vl_storev4sf_mask:
37039 case CODE_FOR_avx512vl_storev4si_mask:
37040 case CODE_FOR_avx512vl_storev2df_mask:
37041 case CODE_FOR_avx512vl_storev2di_mask:
37042 aligned_mem = true;
37043 break;
37044 default:
37045 break;
37047 /* FALLTHRU */
37048 case VOID_FTYPE_PV8SF_V8SI_V8SF:
37049 case VOID_FTYPE_PV4DF_V4DI_V4DF:
37050 case VOID_FTYPE_PV4SF_V4SI_V4SF:
37051 case VOID_FTYPE_PV2DF_V2DI_V2DF:
37052 case VOID_FTYPE_PV8SI_V8SI_V8SI:
37053 case VOID_FTYPE_PV4DI_V4DI_V4DI:
37054 case VOID_FTYPE_PV4SI_V4SI_V4SI:
37055 case VOID_FTYPE_PV2DI_V2DI_V2DI:
37056 case VOID_FTYPE_PV8SI_V8DI_UQI:
37057 case VOID_FTYPE_PV8HI_V8DI_UQI:
37058 case VOID_FTYPE_PV16HI_V16SI_UHI:
37059 case VOID_FTYPE_PV16QI_V8DI_UQI:
37060 case VOID_FTYPE_PV16QI_V16SI_UHI:
37061 case VOID_FTYPE_PV4SI_V4DI_UQI:
37062 case VOID_FTYPE_PV4SI_V2DI_UQI:
37063 case VOID_FTYPE_PV8HI_V4DI_UQI:
37064 case VOID_FTYPE_PV8HI_V2DI_UQI:
37065 case VOID_FTYPE_PV8HI_V8SI_UQI:
37066 case VOID_FTYPE_PV8HI_V4SI_UQI:
37067 case VOID_FTYPE_PV16QI_V4DI_UQI:
37068 case VOID_FTYPE_PV16QI_V2DI_UQI:
37069 case VOID_FTYPE_PV16QI_V8SI_UQI:
37070 case VOID_FTYPE_PV16QI_V4SI_UQI:
37071 case VOID_FTYPE_PCHAR_V64QI_UDI:
37072 case VOID_FTYPE_PCHAR_V32QI_USI:
37073 case VOID_FTYPE_PCHAR_V16QI_UHI:
37074 case VOID_FTYPE_PSHORT_V32HI_USI:
37075 case VOID_FTYPE_PSHORT_V16HI_UHI:
37076 case VOID_FTYPE_PSHORT_V8HI_UQI:
37077 case VOID_FTYPE_PINT_V16SI_UHI:
37078 case VOID_FTYPE_PINT_V8SI_UQI:
37079 case VOID_FTYPE_PINT_V4SI_UQI:
37080 case VOID_FTYPE_PINT64_V8DI_UQI:
37081 case VOID_FTYPE_PINT64_V4DI_UQI:
37082 case VOID_FTYPE_PINT64_V2DI_UQI:
37083 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
37084 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
37085 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
37086 case VOID_FTYPE_PFLOAT_V16SF_UHI:
37087 case VOID_FTYPE_PFLOAT_V8SF_UQI:
37088 case VOID_FTYPE_PFLOAT_V4SF_UQI:
37089 case VOID_FTYPE_PV32QI_V32HI_USI:
37090 case VOID_FTYPE_PV16QI_V16HI_UHI:
37091 case VOID_FTYPE_PV8QI_V8HI_UQI:
37092 nargs = 2;
37093 klass = store;
37094 /* Reserve memory operand for target. */
37095 memory = ARRAY_SIZE (args);
37096 break;
37097 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
37098 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
37099 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
37100 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
37101 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
37102 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
37103 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
37104 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
37105 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
37106 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
37107 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
37108 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
37109 switch (icode)
37111 /* These builtins and instructions require the memory
37112 to be properly aligned. */
37113 case CODE_FOR_avx512f_loadv16sf_mask:
37114 case CODE_FOR_avx512f_loadv16si_mask:
37115 case CODE_FOR_avx512f_loadv8df_mask:
37116 case CODE_FOR_avx512f_loadv8di_mask:
37117 case CODE_FOR_avx512vl_loadv8sf_mask:
37118 case CODE_FOR_avx512vl_loadv8si_mask:
37119 case CODE_FOR_avx512vl_loadv4df_mask:
37120 case CODE_FOR_avx512vl_loadv4di_mask:
37121 case CODE_FOR_avx512vl_loadv4sf_mask:
37122 case CODE_FOR_avx512vl_loadv4si_mask:
37123 case CODE_FOR_avx512vl_loadv2df_mask:
37124 case CODE_FOR_avx512vl_loadv2di_mask:
37125 case CODE_FOR_avx512bw_loadv64qi_mask:
37126 case CODE_FOR_avx512vl_loadv32qi_mask:
37127 case CODE_FOR_avx512vl_loadv16qi_mask:
37128 case CODE_FOR_avx512bw_loadv32hi_mask:
37129 case CODE_FOR_avx512vl_loadv16hi_mask:
37130 case CODE_FOR_avx512vl_loadv8hi_mask:
37131 aligned_mem = true;
37132 break;
37133 default:
37134 break;
37136 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
37137 case V32QI_FTYPE_PCCHAR_V32QI_USI:
37138 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
37139 case V32HI_FTYPE_PCSHORT_V32HI_USI:
37140 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
37141 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
37142 case V16SI_FTYPE_PCINT_V16SI_UHI:
37143 case V8SI_FTYPE_PCINT_V8SI_UQI:
37144 case V4SI_FTYPE_PCINT_V4SI_UQI:
37145 case V8DI_FTYPE_PCINT64_V8DI_UQI:
37146 case V4DI_FTYPE_PCINT64_V4DI_UQI:
37147 case V2DI_FTYPE_PCINT64_V2DI_UQI:
37148 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
37149 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
37150 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
37151 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
37152 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
37153 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
37154 nargs = 3;
37155 klass = load;
37156 memory = 0;
37157 break;
37158 case VOID_FTYPE_UINT_UINT_UINT:
37159 case VOID_FTYPE_UINT64_UINT_UINT:
37160 case UCHAR_FTYPE_UINT_UINT_UINT:
37161 case UCHAR_FTYPE_UINT64_UINT_UINT:
37162 nargs = 3;
37163 klass = load;
37164 memory = ARRAY_SIZE (args);
37165 last_arg_constant = true;
37166 break;
37167 default:
37168 gcc_unreachable ();
37171 gcc_assert (nargs <= ARRAY_SIZE (args));
37173 if (klass == store)
37175 arg = CALL_EXPR_ARG (exp, 0);
37176 op = expand_normal (arg);
37177 gcc_assert (target == 0);
37178 if (memory)
37180 op = ix86_zero_extend_to_Pmode (op);
37181 target = gen_rtx_MEM (tmode, op);
37182 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
37183 on it. Try to improve it using get_pointer_alignment,
37184 and if the special builtin is one that requires strict
37185 mode alignment, also from its GET_MODE_ALIGNMENT.
37186 Failure to do so could lead to ix86_legitimate_combined_insn
37187 rejecting all changes to such insns. */
37188 unsigned int align = get_pointer_alignment (arg);
37189 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
37190 align = GET_MODE_ALIGNMENT (tmode);
37191 if (MEM_ALIGN (target) < align)
37192 set_mem_align (target, align);
37194 else
37195 target = force_reg (tmode, op);
37196 arg_adjust = 1;
37198 else
37200 arg_adjust = 0;
37201 if (optimize
37202 || target == 0
37203 || !register_operand (target, tmode)
37204 || GET_MODE (target) != tmode)
37205 target = gen_reg_rtx (tmode);
37208 for (i = 0; i < nargs; i++)
37210 machine_mode mode = insn_p->operand[i + 1].mode;
37211 bool match;
37213 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
37214 op = expand_normal (arg);
37215 match = insn_p->operand[i + 1].predicate (op, mode);
37217 if (last_arg_constant && (i + 1) == nargs)
37219 if (!match)
37221 if (icode == CODE_FOR_lwp_lwpvalsi3
37222 || icode == CODE_FOR_lwp_lwpinssi3
37223 || icode == CODE_FOR_lwp_lwpvaldi3
37224 || icode == CODE_FOR_lwp_lwpinsdi3)
37225 error ("the last argument must be a 32-bit immediate");
37226 else
37227 error ("the last argument must be an 8-bit immediate");
37228 return const0_rtx;
37231 else
37233 if (i == memory)
37235 /* This must be the memory operand. */
37236 op = ix86_zero_extend_to_Pmode (op);
37237 op = gen_rtx_MEM (mode, op);
37238 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
37239 on it. Try to improve it using get_pointer_alignment,
37240 and if the special builtin is one that requires strict
37241 mode alignment, also from its GET_MODE_ALIGNMENT.
37242 Failure to do so could lead to ix86_legitimate_combined_insn
37243 rejecting all changes to such insns. */
37244 unsigned int align = get_pointer_alignment (arg);
37245 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
37246 align = GET_MODE_ALIGNMENT (mode);
37247 if (MEM_ALIGN (op) < align)
37248 set_mem_align (op, align);
37250 else
37252 /* This must be a register. */
37253 if (VECTOR_MODE_P (mode))
37254 op = safe_vector_operand (op, mode);
37256 op = fixup_modeless_constant (op, mode);
37258 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
37259 op = copy_to_mode_reg (mode, op);
37260 else
37262 op = copy_to_reg (op);
37263 op = lowpart_subreg (mode, op, GET_MODE (op));
37268 args[i].op = op;
37269 args[i].mode = mode;
37272 switch (nargs)
37274 case 0:
37275 pat = GEN_FCN (icode) (target);
37276 break;
37277 case 1:
37278 pat = GEN_FCN (icode) (target, args[0].op);
37279 break;
37280 case 2:
37281 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
37282 break;
37283 case 3:
37284 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
37285 break;
37286 default:
37287 gcc_unreachable ();
37290 if (! pat)
37291 return 0;
37292 emit_insn (pat);
37293 return klass == store ? 0 : target;
37296 /* Return the integer constant in ARG. Constrain it to be in the range
37297 of the subparts of VEC_TYPE; issue an error if not. */
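/* For example, for a four-element vector type such as V4HI the valid
   selectors are the integer constants 0 through 3.  */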
37299 static int
37300 get_element_number (tree vec_type, tree arg)
37302 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
37304 if (!tree_fits_uhwi_p (arg)
37305 || (elt = tree_to_uhwi (arg), elt > max))
37307 error ("selector must be an integer constant in the range 0..%wi", max);
37308 return 0;
37311 return elt;
37314 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37315 ix86_expand_vector_init. We DO have language-level syntax for this, in
37316 the form of (type){ init-list }. Except that since we can't place emms
37317 instructions from inside the compiler, we can't allow the use of MMX
37318 registers unless the user explicitly asks for it. So we do *not* define
37319 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
37320 we have builtins invoked by mmintrin.h that give us license to emit
37321 these sorts of instructions. */
37323 static rtx
37324 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
37326 machine_mode tmode = TYPE_MODE (type);
37327 machine_mode inner_mode = GET_MODE_INNER (tmode);
37328 int i, n_elt = GET_MODE_NUNITS (tmode);
37329 rtvec v = rtvec_alloc (n_elt);
37331 gcc_assert (VECTOR_MODE_P (tmode));
37332 gcc_assert (call_expr_nargs (exp) == n_elt);
37334 for (i = 0; i < n_elt; ++i)
37336 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
37337 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
37340 if (!target || !register_operand (target, tmode))
37341 target = gen_reg_rtx (tmode);
37343 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
37344 return target;
37347 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37348 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
37349 had a language-level syntax for referencing vector elements. */
37351 static rtx
37352 ix86_expand_vec_ext_builtin (tree exp, rtx target)
37354 machine_mode tmode, mode0;
37355 tree arg0, arg1;
37356 int elt;
37357 rtx op0;
37359 arg0 = CALL_EXPR_ARG (exp, 0);
37360 arg1 = CALL_EXPR_ARG (exp, 1);
37362 op0 = expand_normal (arg0);
37363 elt = get_element_number (TREE_TYPE (arg0), arg1);
37365 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37366 mode0 = TYPE_MODE (TREE_TYPE (arg0));
37367 gcc_assert (VECTOR_MODE_P (mode0));
37369 op0 = force_reg (mode0, op0);
37371 if (optimize || !target || !register_operand (target, tmode))
37372 target = gen_reg_rtx (tmode);
37374 ix86_expand_vector_extract (true, target, op0, elt);
37376 return target;
37379 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37380 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
37381 a language-level syntax for referencing vector elements. */
37383 static rtx
37384 ix86_expand_vec_set_builtin (tree exp)
37386 machine_mode tmode, mode1;
37387 tree arg0, arg1, arg2;
37388 int elt;
37389 rtx op0, op1, target;
37391 arg0 = CALL_EXPR_ARG (exp, 0);
37392 arg1 = CALL_EXPR_ARG (exp, 1);
37393 arg2 = CALL_EXPR_ARG (exp, 2);
37395 tmode = TYPE_MODE (TREE_TYPE (arg0));
37396 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37397 gcc_assert (VECTOR_MODE_P (tmode));
37399 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
37400 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
37401 elt = get_element_number (TREE_TYPE (arg0), arg2);
37403 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
37404 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
37406 op0 = force_reg (tmode, op0);
37407 op1 = force_reg (mode1, op1);
37409 /* OP0 is the source of these builtin functions and shouldn't be
37410 modified. Create a copy, use it and return it as target. */
37411 target = gen_reg_rtx (tmode);
37412 emit_move_insn (target, op0);
37413 ix86_expand_vector_set (true, target, op1, elt);
37415 return target;
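/* For illustration, the value semantics produced by the copy above,
   written as a rough scalar sketch (hypothetical pseudo-C, not the
   generated RTL):

     result = *src_vector;           // fresh copy; *src_vector is untouched
     result.element[elt] = new_val;  // only the copy is updated
     return result;
*/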
37418 /* Emit conditional move of SRC to DST with condition
37419 OP1 CODE OP2. */
37420 static void
37421 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
37423 rtx t;
37425 if (TARGET_CMOVE)
37427 t = ix86_expand_compare (code, op1, op2);
37428 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
37429 src, dst)));
37431 else
37433 rtx_code_label *nomove = gen_label_rtx ();
37434 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
37435 const0_rtx, GET_MODE (op1), 1, nomove);
37436 emit_move_insn (dst, src);
37437 emit_label (nomove);
37441 /* Choose the unsigned max of DST and SRC and put it in DST. */
37442 static void
37443 ix86_emit_move_max (rtx dst, rtx src)
37445 ix86_emit_cmove (dst, src, LTU, dst, src);
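/* For illustration: with the LTU condition above this is an unsigned
   maximum, roughly

     if (dst < src)    // unsigned comparison
       dst = src;      // CMOV when available, test-and-branch otherwise

   The MPX bound computations below rely on this: upper bounds are kept
   in one's complement form, so "max" serves for both bound halves.  */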
37448 /* Expand an expression EXP that calls a built-in function,
37449 with result going to TARGET if that's convenient
37450 (and in mode MODE if that's convenient).
37451 SUBTARGET may be used as the target for computing one of EXP's operands.
37452 IGNORE is nonzero if the value is to be ignored. */
37454 static rtx
37455 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
37456 machine_mode mode, int ignore)
37458 size_t i;
37459 enum insn_code icode;
37460 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
37461 tree arg0, arg1, arg2, arg3, arg4;
37462 rtx op0, op1, op2, op3, op4, pat, insn;
37463 machine_mode mode0, mode1, mode2, mode3, mode4;
37464 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
37466 /* For CPU builtins that can be folded, fold first and expand the fold. */
37467 switch (fcode)
37469 case IX86_BUILTIN_CPU_INIT:
37471 /* Make it call __cpu_indicator_init in libgcc. */
37472 tree call_expr, fndecl, type;
37473 type = build_function_type_list (integer_type_node, NULL_TREE);
37474 fndecl = build_fn_decl ("__cpu_indicator_init", type);
37475 call_expr = build_call_expr (fndecl, 0);
37476 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
37478 case IX86_BUILTIN_CPU_IS:
37479 case IX86_BUILTIN_CPU_SUPPORTS:
37481 tree arg0 = CALL_EXPR_ARG (exp, 0);
37482 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
37483 gcc_assert (fold_expr != NULL_TREE);
37484 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
37488 /* Determine whether the builtin function is available under the current ISA.
37489 Originally the builtin was not created if it wasn't applicable to the
37490 current ISA based on the command line switches. With function specific
37491 options, we need to check in the context of the function making the call
37492 whether it is supported. Treat AVX512VL specially. For other flags,
37493 if isa includes more than one ISA bit, treat those as requiring any
37494 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
37495 ISAs. Similarly for 64BIT, but we shouldn't be building such builtins
37496 at all, -m64 is a whole TU option. */
37497 if (((ix86_builtins_isa[fcode].isa
37498 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT))
37499 && !(ix86_builtins_isa[fcode].isa
37500 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT)
37501 & ix86_isa_flags))
37502 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
37503 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
37504 || (ix86_builtins_isa[fcode].isa2
37505 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
37507 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
37508 ix86_builtins_isa[fcode].isa2, 0, 0,
37509 NULL, NULL, (enum fpmath_unit) 0,
37510 false);
37511 if (!opts)
37512 error ("%qE needs unknown isa option", fndecl);
37513 else
37515 gcc_assert (opts != NULL);
37516 error ("%qE needs isa option %s", fndecl, opts);
37517 free (opts);
37519 return expand_call (exp, target, ignore);
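/* For illustration, the kind of code that reaches the error above (a
   sketch only; the builtin name is just an example and the exact option
   string comes from ix86_target_string):

     typedef int v8si __attribute__ ((vector_size (32)));
     v8si f (v8si a, v8si b)
     {
       return __builtin_ia32_paddd256 (a, b);   // AVX2-only builtin
     }

   Compiled without -mavx2 this is rejected with something like
   "needs isa option -mavx2", while adding
   __attribute__ ((target ("avx2"))) to f satisfies the check, because
   it is made in the context of the calling function.  */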
37522 switch (fcode)
37524 case IX86_BUILTIN_BNDMK:
37525 if (!target
37526 || GET_MODE (target) != BNDmode
37527 || !register_operand (target, BNDmode))
37528 target = gen_reg_rtx (BNDmode);
37530 arg0 = CALL_EXPR_ARG (exp, 0);
37531 arg1 = CALL_EXPR_ARG (exp, 1);
37533 op0 = expand_normal (arg0);
37534 op1 = expand_normal (arg1);
37536 if (!register_operand (op0, Pmode))
37537 op0 = ix86_zero_extend_to_Pmode (op0);
37538 if (!register_operand (op1, Pmode))
37539 op1 = ix86_zero_extend_to_Pmode (op1);
37541 /* Builtin arg1 is the size of the block, but instruction op1 should
37542 be (size - 1). */
37543 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
37544 NULL_RTX, 1, OPTAB_DIRECT);
37546 emit_insn (BNDmode == BND64mode
37547 ? gen_bnd64_mk (target, op0, op1)
37548 : gen_bnd32_mk (target, op0, op1));
37549 return target;
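/* For illustration, why the (size - 1) adjustment above is needed (a
   sketch with example numbers, not taken from the original source):
   for a block of SIZE bytes at P the instruction operand is SIZE - 1,
   so the bounds cover the closed range [P, P + SIZE - 1]:

     char *p = __builtin_malloc (16);
     // conceptually: lb = p, instruction op1 = 16 - 1 = 15,
     // so ub points at p + 15, the last valid byte of the block.
*/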
37551 case IX86_BUILTIN_BNDSTX:
37552 arg0 = CALL_EXPR_ARG (exp, 0);
37553 arg1 = CALL_EXPR_ARG (exp, 1);
37554 arg2 = CALL_EXPR_ARG (exp, 2);
37556 op0 = expand_normal (arg0);
37557 op1 = expand_normal (arg1);
37558 op2 = expand_normal (arg2);
37560 if (!register_operand (op0, Pmode))
37561 op0 = ix86_zero_extend_to_Pmode (op0);
37562 if (!register_operand (op1, BNDmode))
37563 op1 = copy_to_mode_reg (BNDmode, op1);
37564 if (!register_operand (op2, Pmode))
37565 op2 = ix86_zero_extend_to_Pmode (op2);
37567 emit_insn (BNDmode == BND64mode
37568 ? gen_bnd64_stx (op2, op0, op1)
37569 : gen_bnd32_stx (op2, op0, op1));
37570 return 0;
37572 case IX86_BUILTIN_BNDLDX:
37573 if (!target
37574 || GET_MODE (target) != BNDmode
37575 || !register_operand (target, BNDmode))
37576 target = gen_reg_rtx (BNDmode);
37578 arg0 = CALL_EXPR_ARG (exp, 0);
37579 arg1 = CALL_EXPR_ARG (exp, 1);
37581 op0 = expand_normal (arg0);
37582 op1 = expand_normal (arg1);
37584 if (!register_operand (op0, Pmode))
37585 op0 = ix86_zero_extend_to_Pmode (op0);
37586 if (!register_operand (op1, Pmode))
37587 op1 = ix86_zero_extend_to_Pmode (op1);
37589 emit_insn (BNDmode == BND64mode
37590 ? gen_bnd64_ldx (target, op0, op1)
37591 : gen_bnd32_ldx (target, op0, op1));
37592 return target;
37594 case IX86_BUILTIN_BNDCL:
37595 arg0 = CALL_EXPR_ARG (exp, 0);
37596 arg1 = CALL_EXPR_ARG (exp, 1);
37598 op0 = expand_normal (arg0);
37599 op1 = expand_normal (arg1);
37601 if (!register_operand (op0, Pmode))
37602 op0 = ix86_zero_extend_to_Pmode (op0);
37603 if (!register_operand (op1, BNDmode))
37604 op1 = copy_to_mode_reg (BNDmode, op1);
37606 emit_insn (BNDmode == BND64mode
37607 ? gen_bnd64_cl (op1, op0)
37608 : gen_bnd32_cl (op1, op0));
37609 return 0;
37611 case IX86_BUILTIN_BNDCU:
37612 arg0 = CALL_EXPR_ARG (exp, 0);
37613 arg1 = CALL_EXPR_ARG (exp, 1);
37615 op0 = expand_normal (arg0);
37616 op1 = expand_normal (arg1);
37618 if (!register_operand (op0, Pmode))
37619 op0 = ix86_zero_extend_to_Pmode (op0);
37620 if (!register_operand (op1, BNDmode))
37621 op1 = copy_to_mode_reg (BNDmode, op1);
37623 emit_insn (BNDmode == BND64mode
37624 ? gen_bnd64_cu (op1, op0)
37625 : gen_bnd32_cu (op1, op0));
37626 return 0;
37628 case IX86_BUILTIN_BNDRET:
37629 arg0 = CALL_EXPR_ARG (exp, 0);
37630 target = chkp_get_rtl_bounds (arg0);
37632 /* If no bounds were specified for the returned value,
37633 then use INIT bounds.  This usually happens when
37634 some built-in function is expanded. */
37635 if (!target)
37637 rtx t1 = gen_reg_rtx (Pmode);
37638 rtx t2 = gen_reg_rtx (Pmode);
37639 target = gen_reg_rtx (BNDmode);
37640 emit_move_insn (t1, const0_rtx);
37641 emit_move_insn (t2, constm1_rtx);
37642 emit_insn (BNDmode == BND64mode
37643 ? gen_bnd64_mk (target, t1, t2)
37644 : gen_bnd32_mk (target, t1, t2));
37647 gcc_assert (target && REG_P (target));
37648 return target;
37650 case IX86_BUILTIN_BNDNARROW:
37652 rtx m1, m1h1, m1h2, lb, ub, t1;
37654 /* Return value and lb. */
37655 arg0 = CALL_EXPR_ARG (exp, 0);
37656 /* Bounds. */
37657 arg1 = CALL_EXPR_ARG (exp, 1);
37658 /* Size. */
37659 arg2 = CALL_EXPR_ARG (exp, 2);
37661 lb = expand_normal (arg0);
37662 op1 = expand_normal (arg1);
37663 op2 = expand_normal (arg2);
37665 /* Size was passed but we need to use (size - 1) as for bndmk. */
37666 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
37667 NULL_RTX, 1, OPTAB_DIRECT);
37669 /* Add LB to size and invert to get UB. */
37670 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
37671 op2, 1, OPTAB_DIRECT);
37672 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
37674 if (!register_operand (lb, Pmode))
37675 lb = ix86_zero_extend_to_Pmode (lb);
37676 if (!register_operand (ub, Pmode))
37677 ub = ix86_zero_extend_to_Pmode (ub);
37679 /* We need to move bounds to memory before any computations. */
37680 if (MEM_P (op1))
37681 m1 = op1;
37682 else
37684 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
37685 emit_move_insn (m1, op1);
37688 /* Generate mem expression to be used for access to LB and UB. */
37689 m1h1 = adjust_address (m1, Pmode, 0);
37690 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
37692 t1 = gen_reg_rtx (Pmode);
37694 /* Compute LB. */
37695 emit_move_insn (t1, m1h1);
37696 ix86_emit_move_max (t1, lb);
37697 emit_move_insn (m1h1, t1);
37699 /* Compute UB. UB is stored in 1's complement form. Therefore
37700 we also use max here. */
37701 emit_move_insn (t1, m1h2);
37702 ix86_emit_move_max (t1, ub);
37703 emit_move_insn (m1h2, t1);
37705 op2 = gen_reg_rtx (BNDmode);
37706 emit_move_insn (op2, m1);
37708 return chkp_join_splitted_slot (lb, op2);
37711 case IX86_BUILTIN_BNDINT:
37713 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
37715 if (!target
37716 || GET_MODE (target) != BNDmode
37717 || !register_operand (target, BNDmode))
37718 target = gen_reg_rtx (BNDmode);
37720 arg0 = CALL_EXPR_ARG (exp, 0);
37721 arg1 = CALL_EXPR_ARG (exp, 1);
37723 op0 = expand_normal (arg0);
37724 op1 = expand_normal (arg1);
37726 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
37727 rh1 = adjust_address (res, Pmode, 0);
37728 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
37730 /* Put first bounds into temporaries. */
37731 lb1 = gen_reg_rtx (Pmode);
37732 ub1 = gen_reg_rtx (Pmode);
37733 if (MEM_P (op0))
37735 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
37736 emit_move_insn (ub1, adjust_address (op0, Pmode,
37737 GET_MODE_SIZE (Pmode)));
37739 else
37741 emit_move_insn (res, op0);
37742 emit_move_insn (lb1, rh1);
37743 emit_move_insn (ub1, rh2);
37746 /* Put second bounds into temporaries. */
37747 lb2 = gen_reg_rtx (Pmode);
37748 ub2 = gen_reg_rtx (Pmode);
37749 if (MEM_P (op1))
37751 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
37752 emit_move_insn (ub2, adjust_address (op1, Pmode,
37753 GET_MODE_SIZE (Pmode)));
37755 else
37757 emit_move_insn (res, op1);
37758 emit_move_insn (lb2, rh1);
37759 emit_move_insn (ub2, rh2);
37762 /* Compute LB. */
37763 ix86_emit_move_max (lb1, lb2);
37764 emit_move_insn (rh1, lb1);
37766 /* Compute UB. UB is stored in 1's complement form. Therefore
37767 we also use max here. */
37768 ix86_emit_move_max (ub1, ub2);
37769 emit_move_insn (rh2, ub1);
37771 emit_move_insn (target, res);
37773 return target;
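/* For illustration, a worked example of the one's complement trick used
   above (made-up numbers): intersecting bounds [8, 31] and [16, 63],
   the lower bounds combine as max (8, 16) = 16.  Upper bounds are
   stored inverted, so the slots hold ~31 and ~63; on unsigned values
   max (~31, ~63) = ~31, which decodes back to 31 = min (31, 63).
   Hence "max" computes the intersection for both halves.  */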
37776 case IX86_BUILTIN_SIZEOF:
37778 tree name;
37779 rtx symbol;
37781 if (!target
37782 || GET_MODE (target) != Pmode
37783 || !register_operand (target, Pmode))
37784 target = gen_reg_rtx (Pmode);
37786 arg0 = CALL_EXPR_ARG (exp, 0);
37787 gcc_assert (VAR_P (arg0));
37789 name = DECL_ASSEMBLER_NAME (arg0);
37790 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
37792 emit_insn (Pmode == SImode
37793 ? gen_move_size_reloc_si (target, symbol)
37794 : gen_move_size_reloc_di (target, symbol));
37796 return target;
37799 case IX86_BUILTIN_BNDLOWER:
37801 rtx mem, hmem;
37803 if (!target
37804 || GET_MODE (target) != Pmode
37805 || !register_operand (target, Pmode))
37806 target = gen_reg_rtx (Pmode);
37808 arg0 = CALL_EXPR_ARG (exp, 0);
37809 op0 = expand_normal (arg0);
37811 /* We need to move bounds to memory first. */
37812 if (MEM_P (op0))
37813 mem = op0;
37814 else
37816 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37817 emit_move_insn (mem, op0);
37820 /* Generate mem expression to access LB and load it. */
37821 hmem = adjust_address (mem, Pmode, 0);
37822 emit_move_insn (target, hmem);
37824 return target;
37827 case IX86_BUILTIN_BNDUPPER:
37829 rtx mem, hmem, res;
37831 if (!target
37832 || GET_MODE (target) != Pmode
37833 || !register_operand (target, Pmode))
37834 target = gen_reg_rtx (Pmode);
37836 arg0 = CALL_EXPR_ARG (exp, 0);
37837 op0 = expand_normal (arg0);
37839 /* We need to move bounds to memory first. */
37840 if (MEM_P (op0))
37841 mem = op0;
37842 else
37844 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37845 emit_move_insn (mem, op0);
37848 /* Generate mem expression to access UB. */
37849 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
37851 /* We need to invert all bits of UB. */
37852 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
37854 if (res != target)
37855 emit_move_insn (target, res);
37857 return target;
37860 case IX86_BUILTIN_MASKMOVQ:
37861 case IX86_BUILTIN_MASKMOVDQU:
37862 icode = (fcode == IX86_BUILTIN_MASKMOVQ
37863 ? CODE_FOR_mmx_maskmovq
37864 : CODE_FOR_sse2_maskmovdqu);
37865 /* Note the arg order is different from the operand order. */
37866 arg1 = CALL_EXPR_ARG (exp, 0);
37867 arg2 = CALL_EXPR_ARG (exp, 1);
37868 arg0 = CALL_EXPR_ARG (exp, 2);
37869 op0 = expand_normal (arg0);
37870 op1 = expand_normal (arg1);
37871 op2 = expand_normal (arg2);
37872 mode0 = insn_data[icode].operand[0].mode;
37873 mode1 = insn_data[icode].operand[1].mode;
37874 mode2 = insn_data[icode].operand[2].mode;
37876 op0 = ix86_zero_extend_to_Pmode (op0);
37877 op0 = gen_rtx_MEM (mode1, op0);
37879 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37880 op0 = copy_to_mode_reg (mode0, op0);
37881 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37882 op1 = copy_to_mode_reg (mode1, op1);
37883 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37884 op2 = copy_to_mode_reg (mode2, op2);
37885 pat = GEN_FCN (icode) (op0, op1, op2);
37886 if (! pat)
37887 return 0;
37888 emit_insn (pat);
37889 return 0;
37891 case IX86_BUILTIN_LDMXCSR:
37892 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
37893 target = assign_386_stack_local (SImode, SLOT_TEMP);
37894 emit_move_insn (target, op0);
37895 emit_insn (gen_sse_ldmxcsr (target));
37896 return 0;
37898 case IX86_BUILTIN_STMXCSR:
37899 target = assign_386_stack_local (SImode, SLOT_TEMP);
37900 emit_insn (gen_sse_stmxcsr (target));
37901 return copy_to_mode_reg (SImode, target);
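/* For illustration, a usage sketch of the two builtins above (an
   assumption: the usual <xmmintrin.h> wrappers expand to them):

     #include <xmmintrin.h>
     void set_ftz_daz (void)
     {
       unsigned int csr = _mm_getcsr ();  // stmxcsr through a stack slot
       _mm_setcsr (csr | 0x8040);         // set FTZ (bit 15) and DAZ (bit 6)
     }

   Both go through a SImode stack temporary as above because the
   hardware instructions only accept a memory operand.  */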
37903 case IX86_BUILTIN_CLFLUSH:
37904 arg0 = CALL_EXPR_ARG (exp, 0);
37905 op0 = expand_normal (arg0);
37906 icode = CODE_FOR_sse2_clflush;
37907 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37908 op0 = ix86_zero_extend_to_Pmode (op0);
37910 emit_insn (gen_sse2_clflush (op0));
37911 return 0;
37913 case IX86_BUILTIN_CLWB:
37914 arg0 = CALL_EXPR_ARG (exp, 0);
37915 op0 = expand_normal (arg0);
37916 icode = CODE_FOR_clwb;
37917 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37918 op0 = ix86_zero_extend_to_Pmode (op0);
37920 emit_insn (gen_clwb (op0));
37921 return 0;
37923 case IX86_BUILTIN_CLFLUSHOPT:
37924 arg0 = CALL_EXPR_ARG (exp, 0);
37925 op0 = expand_normal (arg0);
37926 icode = CODE_FOR_clflushopt;
37927 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37928 op0 = ix86_zero_extend_to_Pmode (op0);
37930 emit_insn (gen_clflushopt (op0));
37931 return 0;
37933 case IX86_BUILTIN_MONITOR:
37934 case IX86_BUILTIN_MONITORX:
37935 arg0 = CALL_EXPR_ARG (exp, 0);
37936 arg1 = CALL_EXPR_ARG (exp, 1);
37937 arg2 = CALL_EXPR_ARG (exp, 2);
37938 op0 = expand_normal (arg0);
37939 op1 = expand_normal (arg1);
37940 op2 = expand_normal (arg2);
37941 if (!REG_P (op0))
37942 op0 = ix86_zero_extend_to_Pmode (op0);
37943 if (!REG_P (op1))
37944 op1 = copy_to_mode_reg (SImode, op1);
37945 if (!REG_P (op2))
37946 op2 = copy_to_mode_reg (SImode, op2);
37948 emit_insn (fcode == IX86_BUILTIN_MONITOR
37949 ? ix86_gen_monitor (op0, op1, op2)
37950 : ix86_gen_monitorx (op0, op1, op2));
37951 return 0;
37953 case IX86_BUILTIN_MWAIT:
37954 arg0 = CALL_EXPR_ARG (exp, 0);
37955 arg1 = CALL_EXPR_ARG (exp, 1);
37956 op0 = expand_normal (arg0);
37957 op1 = expand_normal (arg1);
37958 if (!REG_P (op0))
37959 op0 = copy_to_mode_reg (SImode, op0);
37960 if (!REG_P (op1))
37961 op1 = copy_to_mode_reg (SImode, op1);
37962 emit_insn (gen_sse3_mwait (op0, op1));
37963 return 0;
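/* For illustration, a usage sketch of the monitor/mwait pair handled
   above (an assumption: the <pmmintrin.h> wrappers; extensions and
   hints are simply 0 here):

     #include <pmmintrin.h>
     void wait_for_store (volatile int *flag)
     {
       _mm_monitor ((const void *) flag, 0, 0);  // arm address monitoring
       if (*flag == 0)
	 _mm_mwait (0, 0);  // sleep until a write to the line or an interrupt
     }
*/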
37965 case IX86_BUILTIN_MWAITX:
37966 arg0 = CALL_EXPR_ARG (exp, 0);
37967 arg1 = CALL_EXPR_ARG (exp, 1);
37968 arg2 = CALL_EXPR_ARG (exp, 2);
37969 op0 = expand_normal (arg0);
37970 op1 = expand_normal (arg1);
37971 op2 = expand_normal (arg2);
37972 if (!REG_P (op0))
37973 op0 = copy_to_mode_reg (SImode, op0);
37974 if (!REG_P (op1))
37975 op1 = copy_to_mode_reg (SImode, op1);
37976 if (!REG_P (op2))
37977 op2 = copy_to_mode_reg (SImode, op2);
37978 emit_insn (gen_mwaitx (op0, op1, op2));
37979 return 0;
37981 case IX86_BUILTIN_CLZERO:
37982 arg0 = CALL_EXPR_ARG (exp, 0);
37983 op0 = expand_normal (arg0);
37984 if (!REG_P (op0))
37985 op0 = ix86_zero_extend_to_Pmode (op0);
37986 emit_insn (ix86_gen_clzero (op0));
37987 return 0;
37989 case IX86_BUILTIN_VEC_INIT_V2SI:
37990 case IX86_BUILTIN_VEC_INIT_V4HI:
37991 case IX86_BUILTIN_VEC_INIT_V8QI:
37992 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
37994 case IX86_BUILTIN_VEC_EXT_V2DF:
37995 case IX86_BUILTIN_VEC_EXT_V2DI:
37996 case IX86_BUILTIN_VEC_EXT_V4SF:
37997 case IX86_BUILTIN_VEC_EXT_V4SI:
37998 case IX86_BUILTIN_VEC_EXT_V8HI:
37999 case IX86_BUILTIN_VEC_EXT_V2SI:
38000 case IX86_BUILTIN_VEC_EXT_V4HI:
38001 case IX86_BUILTIN_VEC_EXT_V16QI:
38002 return ix86_expand_vec_ext_builtin (exp, target);
38004 case IX86_BUILTIN_VEC_SET_V2DI:
38005 case IX86_BUILTIN_VEC_SET_V4SF:
38006 case IX86_BUILTIN_VEC_SET_V4SI:
38007 case IX86_BUILTIN_VEC_SET_V8HI:
38008 case IX86_BUILTIN_VEC_SET_V4HI:
38009 case IX86_BUILTIN_VEC_SET_V16QI:
38010 return ix86_expand_vec_set_builtin (exp);
38012 case IX86_BUILTIN_NANQ:
38013 case IX86_BUILTIN_NANSQ:
38014 return expand_call (exp, target, ignore);
38016 case IX86_BUILTIN_RDPMC:
38017 case IX86_BUILTIN_RDTSC:
38018 case IX86_BUILTIN_RDTSCP:
38019 case IX86_BUILTIN_XGETBV:
38021 op0 = gen_reg_rtx (DImode);
38022 op1 = gen_reg_rtx (DImode);
38024 if (fcode == IX86_BUILTIN_RDPMC)
38026 arg0 = CALL_EXPR_ARG (exp, 0);
38027 op2 = expand_normal (arg0);
38028 if (!register_operand (op2, SImode))
38029 op2 = copy_to_mode_reg (SImode, op2);
38031 insn = (TARGET_64BIT
38032 ? gen_rdpmc_rex64 (op0, op1, op2)
38033 : gen_rdpmc (op0, op2));
38034 emit_insn (insn);
38036 else if (fcode == IX86_BUILTIN_XGETBV)
38038 arg0 = CALL_EXPR_ARG (exp, 0);
38039 op2 = expand_normal (arg0);
38040 if (!register_operand (op2, SImode))
38041 op2 = copy_to_mode_reg (SImode, op2);
38043 insn = (TARGET_64BIT
38044 ? gen_xgetbv_rex64 (op0, op1, op2)
38045 : gen_xgetbv (op0, op2));
38046 emit_insn (insn);
38048 else if (fcode == IX86_BUILTIN_RDTSC)
38050 insn = (TARGET_64BIT
38051 ? gen_rdtsc_rex64 (op0, op1)
38052 : gen_rdtsc (op0));
38053 emit_insn (insn);
38055 else
38057 op2 = gen_reg_rtx (SImode);
38059 insn = (TARGET_64BIT
38060 ? gen_rdtscp_rex64 (op0, op1, op2)
38061 : gen_rdtscp (op0, op2));
38062 emit_insn (insn);
38064 arg0 = CALL_EXPR_ARG (exp, 0);
38065 op4 = expand_normal (arg0);
38066 if (!address_operand (op4, VOIDmode))
38068 op4 = convert_memory_address (Pmode, op4);
38069 op4 = copy_addr_to_reg (op4);
38071 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
38074 if (target == 0)
38076 /* mode is VOIDmode if __builtin_rd* has been called
38077 without an lhs. */
38078 if (mode == VOIDmode)
38079 return target;
38080 target = gen_reg_rtx (mode);
38083 if (TARGET_64BIT)
38085 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
38086 op1, 1, OPTAB_DIRECT);
38087 op0 = expand_simple_binop (DImode, IOR, op0, op1,
38088 op0, 1, OPTAB_DIRECT);
38091 emit_move_insn (target, op0);
38092 return target;
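/* For illustration, usage sketches for the timestamp builtins combined
   above (an assumption: the __rdtsc/__rdtscp wrappers from
   <x86intrin.h>); on 64-bit targets the EDX:EAX halves are merged with
   the shift/ior sequence just emitted:

     #include <x86intrin.h>
     unsigned long long cycles (void)
     {
       unsigned int aux;
       unsigned long long t0 = __rdtsc ();
       unsigned long long t1 = __rdtscp (&aux);  // also stores TSC_AUX through the pointer
       return t1 - t0;
     }
*/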
38094 case IX86_BUILTIN_FXSAVE:
38095 case IX86_BUILTIN_FXRSTOR:
38096 case IX86_BUILTIN_FXSAVE64:
38097 case IX86_BUILTIN_FXRSTOR64:
38098 case IX86_BUILTIN_FNSTENV:
38099 case IX86_BUILTIN_FLDENV:
38100 mode0 = BLKmode;
38101 switch (fcode)
38103 case IX86_BUILTIN_FXSAVE:
38104 icode = CODE_FOR_fxsave;
38105 break;
38106 case IX86_BUILTIN_FXRSTOR:
38107 icode = CODE_FOR_fxrstor;
38108 break;
38109 case IX86_BUILTIN_FXSAVE64:
38110 icode = CODE_FOR_fxsave64;
38111 break;
38112 case IX86_BUILTIN_FXRSTOR64:
38113 icode = CODE_FOR_fxrstor64;
38114 break;
38115 case IX86_BUILTIN_FNSTENV:
38116 icode = CODE_FOR_fnstenv;
38117 break;
38118 case IX86_BUILTIN_FLDENV:
38119 icode = CODE_FOR_fldenv;
38120 break;
38121 default:
38122 gcc_unreachable ();
38125 arg0 = CALL_EXPR_ARG (exp, 0);
38126 op0 = expand_normal (arg0);
38128 if (!address_operand (op0, VOIDmode))
38130 op0 = convert_memory_address (Pmode, op0);
38131 op0 = copy_addr_to_reg (op0);
38133 op0 = gen_rtx_MEM (mode0, op0);
38135 pat = GEN_FCN (icode) (op0);
38136 if (pat)
38137 emit_insn (pat);
38138 return 0;
38140 case IX86_BUILTIN_XSETBV:
38141 arg0 = CALL_EXPR_ARG (exp, 0);
38142 arg1 = CALL_EXPR_ARG (exp, 1);
38143 op0 = expand_normal (arg0);
38144 op1 = expand_normal (arg1);
38146 if (!REG_P (op0))
38147 op0 = copy_to_mode_reg (SImode, op0);
38149 if (TARGET_64BIT)
38151 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38152 NULL, 1, OPTAB_DIRECT);
38154 op2 = gen_lowpart (SImode, op2);
38155 op1 = gen_lowpart (SImode, op1);
38156 if (!REG_P (op1))
38157 op1 = copy_to_mode_reg (SImode, op1);
38158 if (!REG_P (op2))
38159 op2 = copy_to_mode_reg (SImode, op2);
38160 icode = CODE_FOR_xsetbv_rex64;
38161 pat = GEN_FCN (icode) (op0, op1, op2);
38163 else
38165 if (!REG_P (op1))
38166 op1 = copy_to_mode_reg (DImode, op1);
38167 icode = CODE_FOR_xsetbv;
38168 pat = GEN_FCN (icode) (op0, op1);
38170 if (pat)
38171 emit_insn (pat);
38172 return 0;
38174 case IX86_BUILTIN_XSAVE:
38175 case IX86_BUILTIN_XRSTOR:
38176 case IX86_BUILTIN_XSAVE64:
38177 case IX86_BUILTIN_XRSTOR64:
38178 case IX86_BUILTIN_XSAVEOPT:
38179 case IX86_BUILTIN_XSAVEOPT64:
38180 case IX86_BUILTIN_XSAVES:
38181 case IX86_BUILTIN_XRSTORS:
38182 case IX86_BUILTIN_XSAVES64:
38183 case IX86_BUILTIN_XRSTORS64:
38184 case IX86_BUILTIN_XSAVEC:
38185 case IX86_BUILTIN_XSAVEC64:
38186 arg0 = CALL_EXPR_ARG (exp, 0);
38187 arg1 = CALL_EXPR_ARG (exp, 1);
38188 op0 = expand_normal (arg0);
38189 op1 = expand_normal (arg1);
38191 if (!address_operand (op0, VOIDmode))
38193 op0 = convert_memory_address (Pmode, op0);
38194 op0 = copy_addr_to_reg (op0);
38196 op0 = gen_rtx_MEM (BLKmode, op0);
38198 op1 = force_reg (DImode, op1);
38200 if (TARGET_64BIT)
38202 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38203 NULL, 1, OPTAB_DIRECT);
38204 switch (fcode)
38206 case IX86_BUILTIN_XSAVE:
38207 icode = CODE_FOR_xsave_rex64;
38208 break;
38209 case IX86_BUILTIN_XRSTOR:
38210 icode = CODE_FOR_xrstor_rex64;
38211 break;
38212 case IX86_BUILTIN_XSAVE64:
38213 icode = CODE_FOR_xsave64;
38214 break;
38215 case IX86_BUILTIN_XRSTOR64:
38216 icode = CODE_FOR_xrstor64;
38217 break;
38218 case IX86_BUILTIN_XSAVEOPT:
38219 icode = CODE_FOR_xsaveopt_rex64;
38220 break;
38221 case IX86_BUILTIN_XSAVEOPT64:
38222 icode = CODE_FOR_xsaveopt64;
38223 break;
38224 case IX86_BUILTIN_XSAVES:
38225 icode = CODE_FOR_xsaves_rex64;
38226 break;
38227 case IX86_BUILTIN_XRSTORS:
38228 icode = CODE_FOR_xrstors_rex64;
38229 break;
38230 case IX86_BUILTIN_XSAVES64:
38231 icode = CODE_FOR_xsaves64;
38232 break;
38233 case IX86_BUILTIN_XRSTORS64:
38234 icode = CODE_FOR_xrstors64;
38235 break;
38236 case IX86_BUILTIN_XSAVEC:
38237 icode = CODE_FOR_xsavec_rex64;
38238 break;
38239 case IX86_BUILTIN_XSAVEC64:
38240 icode = CODE_FOR_xsavec64;
38241 break;
38242 default:
38243 gcc_unreachable ();
38246 op2 = gen_lowpart (SImode, op2);
38247 op1 = gen_lowpart (SImode, op1);
38248 pat = GEN_FCN (icode) (op0, op1, op2);
38250 else
38252 switch (fcode)
38254 case IX86_BUILTIN_XSAVE:
38255 icode = CODE_FOR_xsave;
38256 break;
38257 case IX86_BUILTIN_XRSTOR:
38258 icode = CODE_FOR_xrstor;
38259 break;
38260 case IX86_BUILTIN_XSAVEOPT:
38261 icode = CODE_FOR_xsaveopt;
38262 break;
38263 case IX86_BUILTIN_XSAVES:
38264 icode = CODE_FOR_xsaves;
38265 break;
38266 case IX86_BUILTIN_XRSTORS:
38267 icode = CODE_FOR_xrstors;
38268 break;
38269 case IX86_BUILTIN_XSAVEC:
38270 icode = CODE_FOR_xsavec;
38271 break;
38272 default:
38273 gcc_unreachable ();
38275 pat = GEN_FCN (icode) (op0, op1);
38278 if (pat)
38279 emit_insn (pat);
38280 return 0;
38282 case IX86_BUILTIN_LLWPCB:
38283 arg0 = CALL_EXPR_ARG (exp, 0);
38284 op0 = expand_normal (arg0);
38285 icode = CODE_FOR_lwp_llwpcb;
38286 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38287 op0 = ix86_zero_extend_to_Pmode (op0);
38288 emit_insn (gen_lwp_llwpcb (op0));
38289 return 0;
38291 case IX86_BUILTIN_SLWPCB:
38292 icode = CODE_FOR_lwp_slwpcb;
38293 if (!target
38294 || !insn_data[icode].operand[0].predicate (target, Pmode))
38295 target = gen_reg_rtx (Pmode);
38296 emit_insn (gen_lwp_slwpcb (target));
38297 return target;
38299 case IX86_BUILTIN_BEXTRI32:
38300 case IX86_BUILTIN_BEXTRI64:
38301 arg0 = CALL_EXPR_ARG (exp, 0);
38302 arg1 = CALL_EXPR_ARG (exp, 1);
38303 op0 = expand_normal (arg0);
38304 op1 = expand_normal (arg1);
38305 icode = (fcode == IX86_BUILTIN_BEXTRI32
38306 ? CODE_FOR_tbm_bextri_si
38307 : CODE_FOR_tbm_bextri_di);
38308 if (!CONST_INT_P (op1))
38310 error ("last argument must be an immediate");
38311 return const0_rtx;
38313 else
38315 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
38316 unsigned char lsb_index = INTVAL (op1) & 0xFF;
38317 op1 = GEN_INT (length);
38318 op2 = GEN_INT (lsb_index);
38319 pat = GEN_FCN (icode) (target, op0, op1, op2);
38320 if (pat)
38321 emit_insn (pat);
38322 return target;
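/* For illustration, how the immediate is decoded above (an assumption:
   the TBM __bextri_u32 wrapper feeds this case): bits 7:0 give the
   start bit and bits 15:8 the field length, so

     #include <x86intrin.h>
     unsigned int field (unsigned int x)
     {
       return __bextri_u32 (x, (8 << 8) | 4);  // extract 8 bits starting at bit 4
     }

   splits into length = 8 and lsb_index = 4 exactly as computed above.  */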
38325 case IX86_BUILTIN_RDRAND16_STEP:
38326 icode = CODE_FOR_rdrandhi_1;
38327 mode0 = HImode;
38328 goto rdrand_step;
38330 case IX86_BUILTIN_RDRAND32_STEP:
38331 icode = CODE_FOR_rdrandsi_1;
38332 mode0 = SImode;
38333 goto rdrand_step;
38335 case IX86_BUILTIN_RDRAND64_STEP:
38336 icode = CODE_FOR_rdranddi_1;
38337 mode0 = DImode;
38339 rdrand_step:
38340 arg0 = CALL_EXPR_ARG (exp, 0);
38341 op1 = expand_normal (arg0);
38342 if (!address_operand (op1, VOIDmode))
38344 op1 = convert_memory_address (Pmode, op1);
38345 op1 = copy_addr_to_reg (op1);
38348 op0 = gen_reg_rtx (mode0);
38349 emit_insn (GEN_FCN (icode) (op0));
38351 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38353 op1 = gen_reg_rtx (SImode);
38354 emit_move_insn (op1, CONST1_RTX (SImode));
38356 /* Emit SImode conditional move. */
38357 if (mode0 == HImode)
38359 if (TARGET_ZERO_EXTEND_WITH_AND
38360 && optimize_function_for_speed_p (cfun))
38362 op2 = force_reg (SImode, const0_rtx);
38364 emit_insn (gen_movstricthi
38365 (gen_lowpart (HImode, op2), op0));
38367 else
38369 op2 = gen_reg_rtx (SImode);
38371 emit_insn (gen_zero_extendhisi2 (op2, op0));
38374 else if (mode0 == SImode)
38375 op2 = op0;
38376 else
38377 op2 = gen_rtx_SUBREG (SImode, op0, 0);
38379 if (target == 0
38380 || !register_operand (target, SImode))
38381 target = gen_reg_rtx (SImode);
38383 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
38384 const0_rtx);
38385 emit_insn (gen_rtx_SET (target,
38386 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
38387 return target;
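/* For illustration, a usage sketch of the step builtin expanded above
   (an assumption: the <immintrin.h> _rdrand32_step wrapper): the random
   value is stored through the pointer and the carry flag becomes the
   0/1 return value:

     #include <immintrin.h>
     int get_random (unsigned int *out)
     {
       return _rdrand32_step (out);   // 1 on success; retry in a loop on 0
     }
*/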
38389 case IX86_BUILTIN_RDSEED16_STEP:
38390 icode = CODE_FOR_rdseedhi_1;
38391 mode0 = HImode;
38392 goto rdseed_step;
38394 case IX86_BUILTIN_RDSEED32_STEP:
38395 icode = CODE_FOR_rdseedsi_1;
38396 mode0 = SImode;
38397 goto rdseed_step;
38399 case IX86_BUILTIN_RDSEED64_STEP:
38400 icode = CODE_FOR_rdseeddi_1;
38401 mode0 = DImode;
38403 rdseed_step:
38404 arg0 = CALL_EXPR_ARG (exp, 0);
38405 op1 = expand_normal (arg0);
38406 if (!address_operand (op1, VOIDmode))
38408 op1 = convert_memory_address (Pmode, op1);
38409 op1 = copy_addr_to_reg (op1);
38412 op0 = gen_reg_rtx (mode0);
38413 emit_insn (GEN_FCN (icode) (op0));
38415 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38417 op2 = gen_reg_rtx (QImode);
38419 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
38420 const0_rtx);
38421 emit_insn (gen_rtx_SET (op2, pat));
38423 if (target == 0
38424 || !register_operand (target, SImode))
38425 target = gen_reg_rtx (SImode);
38427 emit_insn (gen_zero_extendqisi2 (target, op2));
38428 return target;
38430 case IX86_BUILTIN_SBB32:
38431 icode = CODE_FOR_subborrowsi;
38432 mode0 = SImode;
38433 goto handlecarry;
38435 case IX86_BUILTIN_SBB64:
38436 icode = CODE_FOR_subborrowdi;
38437 mode0 = DImode;
38438 goto handlecarry;
38440 case IX86_BUILTIN_ADDCARRYX32:
38441 icode = CODE_FOR_addcarrysi;
38442 mode0 = SImode;
38443 goto handlecarry;
38445 case IX86_BUILTIN_ADDCARRYX64:
38446 icode = CODE_FOR_addcarrydi;
38447 mode0 = DImode;
38449 handlecarry:
38450 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
38451 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
38452 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
38453 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
38455 op1 = expand_normal (arg0);
38456 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
38458 op2 = expand_normal (arg1);
38459 if (!register_operand (op2, mode0))
38460 op2 = copy_to_mode_reg (mode0, op2);
38462 op3 = expand_normal (arg2);
38463 if (!register_operand (op3, mode0))
38464 op3 = copy_to_mode_reg (mode0, op3);
38466 op4 = expand_normal (arg3);
38467 if (!address_operand (op4, VOIDmode))
38469 op4 = convert_memory_address (Pmode, op4);
38470 op4 = copy_addr_to_reg (op4);
38473 /* Generate CF from input operand. */
38474 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
38476 /* Generate instruction that consumes CF. */
38477 op0 = gen_reg_rtx (mode0);
38479 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
38480 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
38481 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
38483 /* Return current CF value. */
38484 if (target == 0)
38485 target = gen_reg_rtx (QImode);
38487 PUT_MODE (pat, QImode);
38488 emit_insn (gen_rtx_SET (target, pat));
38490 /* Store the result. */
38491 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
38493 return target;
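/* For illustration, a usage sketch of the carry-chain builtins handled
   above (an assumption: the <immintrin.h> _addcarry_u32 wrapper): the
   incoming carry is materialized into CF and the outgoing CF becomes
   the QImode return value:

     #include <immintrin.h>
     unsigned char add128 (const unsigned int a[4], const unsigned int b[4],
			   unsigned int out[4])
     {
       unsigned char c = 0;
       for (int i = 0; i < 4; i++)
	 c = _addcarry_u32 (c, a[i], b[i], &out[i]);
       return c;   // final carry out
     }
*/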
38495 case IX86_BUILTIN_READ_FLAGS:
38496 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
38498 if (optimize
38499 || target == NULL_RTX
38500 || !nonimmediate_operand (target, word_mode)
38501 || GET_MODE (target) != word_mode)
38502 target = gen_reg_rtx (word_mode);
38504 emit_insn (gen_pop (target));
38505 return target;
38507 case IX86_BUILTIN_WRITE_FLAGS:
38509 arg0 = CALL_EXPR_ARG (exp, 0);
38510 op0 = expand_normal (arg0);
38511 if (!general_no_elim_operand (op0, word_mode))
38512 op0 = copy_to_mode_reg (word_mode, op0);
38514 emit_insn (gen_push (op0));
38515 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
38516 return 0;
38518 case IX86_BUILTIN_KTESTC8:
38519 icode = CODE_FOR_ktestqi;
38520 mode3 = CCCmode;
38521 goto kortest;
38523 case IX86_BUILTIN_KTESTZ8:
38524 icode = CODE_FOR_ktestqi;
38525 mode3 = CCZmode;
38526 goto kortest;
38528 case IX86_BUILTIN_KTESTC16:
38529 icode = CODE_FOR_ktesthi;
38530 mode3 = CCCmode;
38531 goto kortest;
38533 case IX86_BUILTIN_KTESTZ16:
38534 icode = CODE_FOR_ktesthi;
38535 mode3 = CCZmode;
38536 goto kortest;
38538 case IX86_BUILTIN_KTESTC32:
38539 icode = CODE_FOR_ktestsi;
38540 mode3 = CCCmode;
38541 goto kortest;
38543 case IX86_BUILTIN_KTESTZ32:
38544 icode = CODE_FOR_ktestsi;
38545 mode3 = CCZmode;
38546 goto kortest;
38548 case IX86_BUILTIN_KTESTC64:
38549 icode = CODE_FOR_ktestdi;
38550 mode3 = CCCmode;
38551 goto kortest;
38553 case IX86_BUILTIN_KTESTZ64:
38554 icode = CODE_FOR_ktestdi;
38555 mode3 = CCZmode;
38556 goto kortest;
38558 case IX86_BUILTIN_KORTESTC8:
38559 icode = CODE_FOR_kortestqi;
38560 mode3 = CCCmode;
38561 goto kortest;
38563 case IX86_BUILTIN_KORTESTZ8:
38564 icode = CODE_FOR_kortestqi;
38565 mode3 = CCZmode;
38566 goto kortest;
38568 case IX86_BUILTIN_KORTESTC16:
38569 icode = CODE_FOR_kortesthi;
38570 mode3 = CCCmode;
38571 goto kortest;
38573 case IX86_BUILTIN_KORTESTZ16:
38574 icode = CODE_FOR_kortesthi;
38575 mode3 = CCZmode;
38576 goto kortest;
38578 case IX86_BUILTIN_KORTESTC32:
38579 icode = CODE_FOR_kortestsi;
38580 mode3 = CCCmode;
38581 goto kortest;
38583 case IX86_BUILTIN_KORTESTZ32:
38584 icode = CODE_FOR_kortestsi;
38585 mode3 = CCZmode;
38586 goto kortest;
38588 case IX86_BUILTIN_KORTESTC64:
38589 icode = CODE_FOR_kortestdi;
38590 mode3 = CCCmode;
38591 goto kortest;
38593 case IX86_BUILTIN_KORTESTZ64:
38594 icode = CODE_FOR_kortestdi;
38595 mode3 = CCZmode;
38597 kortest:
38598 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
38599 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
38600 op0 = expand_normal (arg0);
38601 op1 = expand_normal (arg1);
38603 mode0 = insn_data[icode].operand[0].mode;
38604 mode1 = insn_data[icode].operand[1].mode;
38606 if (GET_MODE (op0) != VOIDmode)
38607 op0 = force_reg (GET_MODE (op0), op0);
38609 op0 = gen_lowpart (mode0, op0);
38611 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38612 op0 = copy_to_mode_reg (mode0, op0);
38614 if (GET_MODE (op1) != VOIDmode)
38615 op1 = force_reg (GET_MODE (op1), op1);
38617 op1 = gen_lowpart (mode1, op1);
38619 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38620 op1 = copy_to_mode_reg (mode1, op1);
38622 target = gen_reg_rtx (QImode);
38624 /* Emit kortest. */
38625 emit_insn (GEN_FCN (icode) (op0, op1));
38626 /* And use setcc to return result from flags. */
38627 ix86_expand_setcc (target, EQ,
38628 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
38629 return target;
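/* For illustration, a usage sketch of the kortest expansion above (an
   assumption: the AVX-512 _mm512_kortestz wrapper): the flag selected
   by mode3 (ZF here, CF for the *_kortestc variants) is turned into a
   0/1 result by the setcc just emitted:

     #include <immintrin.h>
     int both_masks_empty (__mmask16 a, __mmask16 b)
     {
       return _mm512_kortestz (a, b);   // 1 iff (a | b) == 0, i.e. ZF set
     }
*/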
38631 case IX86_BUILTIN_GATHERSIV2DF:
38632 icode = CODE_FOR_avx2_gathersiv2df;
38633 goto gather_gen;
38634 case IX86_BUILTIN_GATHERSIV4DF:
38635 icode = CODE_FOR_avx2_gathersiv4df;
38636 goto gather_gen;
38637 case IX86_BUILTIN_GATHERDIV2DF:
38638 icode = CODE_FOR_avx2_gatherdiv2df;
38639 goto gather_gen;
38640 case IX86_BUILTIN_GATHERDIV4DF:
38641 icode = CODE_FOR_avx2_gatherdiv4df;
38642 goto gather_gen;
38643 case IX86_BUILTIN_GATHERSIV4SF:
38644 icode = CODE_FOR_avx2_gathersiv4sf;
38645 goto gather_gen;
38646 case IX86_BUILTIN_GATHERSIV8SF:
38647 icode = CODE_FOR_avx2_gathersiv8sf;
38648 goto gather_gen;
38649 case IX86_BUILTIN_GATHERDIV4SF:
38650 icode = CODE_FOR_avx2_gatherdiv4sf;
38651 goto gather_gen;
38652 case IX86_BUILTIN_GATHERDIV8SF:
38653 icode = CODE_FOR_avx2_gatherdiv8sf;
38654 goto gather_gen;
38655 case IX86_BUILTIN_GATHERSIV2DI:
38656 icode = CODE_FOR_avx2_gathersiv2di;
38657 goto gather_gen;
38658 case IX86_BUILTIN_GATHERSIV4DI:
38659 icode = CODE_FOR_avx2_gathersiv4di;
38660 goto gather_gen;
38661 case IX86_BUILTIN_GATHERDIV2DI:
38662 icode = CODE_FOR_avx2_gatherdiv2di;
38663 goto gather_gen;
38664 case IX86_BUILTIN_GATHERDIV4DI:
38665 icode = CODE_FOR_avx2_gatherdiv4di;
38666 goto gather_gen;
38667 case IX86_BUILTIN_GATHERSIV4SI:
38668 icode = CODE_FOR_avx2_gathersiv4si;
38669 goto gather_gen;
38670 case IX86_BUILTIN_GATHERSIV8SI:
38671 icode = CODE_FOR_avx2_gathersiv8si;
38672 goto gather_gen;
38673 case IX86_BUILTIN_GATHERDIV4SI:
38674 icode = CODE_FOR_avx2_gatherdiv4si;
38675 goto gather_gen;
38676 case IX86_BUILTIN_GATHERDIV8SI:
38677 icode = CODE_FOR_avx2_gatherdiv8si;
38678 goto gather_gen;
38679 case IX86_BUILTIN_GATHERALTSIV4DF:
38680 icode = CODE_FOR_avx2_gathersiv4df;
38681 goto gather_gen;
38682 case IX86_BUILTIN_GATHERALTDIV8SF:
38683 icode = CODE_FOR_avx2_gatherdiv8sf;
38684 goto gather_gen;
38685 case IX86_BUILTIN_GATHERALTSIV4DI:
38686 icode = CODE_FOR_avx2_gathersiv4di;
38687 goto gather_gen;
38688 case IX86_BUILTIN_GATHERALTDIV8SI:
38689 icode = CODE_FOR_avx2_gatherdiv8si;
38690 goto gather_gen;
38691 case IX86_BUILTIN_GATHER3SIV16SF:
38692 icode = CODE_FOR_avx512f_gathersiv16sf;
38693 goto gather_gen;
38694 case IX86_BUILTIN_GATHER3SIV8DF:
38695 icode = CODE_FOR_avx512f_gathersiv8df;
38696 goto gather_gen;
38697 case IX86_BUILTIN_GATHER3DIV16SF:
38698 icode = CODE_FOR_avx512f_gatherdiv16sf;
38699 goto gather_gen;
38700 case IX86_BUILTIN_GATHER3DIV8DF:
38701 icode = CODE_FOR_avx512f_gatherdiv8df;
38702 goto gather_gen;
38703 case IX86_BUILTIN_GATHER3SIV16SI:
38704 icode = CODE_FOR_avx512f_gathersiv16si;
38705 goto gather_gen;
38706 case IX86_BUILTIN_GATHER3SIV8DI:
38707 icode = CODE_FOR_avx512f_gathersiv8di;
38708 goto gather_gen;
38709 case IX86_BUILTIN_GATHER3DIV16SI:
38710 icode = CODE_FOR_avx512f_gatherdiv16si;
38711 goto gather_gen;
38712 case IX86_BUILTIN_GATHER3DIV8DI:
38713 icode = CODE_FOR_avx512f_gatherdiv8di;
38714 goto gather_gen;
38715 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38716 icode = CODE_FOR_avx512f_gathersiv8df;
38717 goto gather_gen;
38718 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38719 icode = CODE_FOR_avx512f_gatherdiv16sf;
38720 goto gather_gen;
38721 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38722 icode = CODE_FOR_avx512f_gathersiv8di;
38723 goto gather_gen;
38724 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38725 icode = CODE_FOR_avx512f_gatherdiv16si;
38726 goto gather_gen;
38727 case IX86_BUILTIN_GATHER3SIV2DF:
38728 icode = CODE_FOR_avx512vl_gathersiv2df;
38729 goto gather_gen;
38730 case IX86_BUILTIN_GATHER3SIV4DF:
38731 icode = CODE_FOR_avx512vl_gathersiv4df;
38732 goto gather_gen;
38733 case IX86_BUILTIN_GATHER3DIV2DF:
38734 icode = CODE_FOR_avx512vl_gatherdiv2df;
38735 goto gather_gen;
38736 case IX86_BUILTIN_GATHER3DIV4DF:
38737 icode = CODE_FOR_avx512vl_gatherdiv4df;
38738 goto gather_gen;
38739 case IX86_BUILTIN_GATHER3SIV4SF:
38740 icode = CODE_FOR_avx512vl_gathersiv4sf;
38741 goto gather_gen;
38742 case IX86_BUILTIN_GATHER3SIV8SF:
38743 icode = CODE_FOR_avx512vl_gathersiv8sf;
38744 goto gather_gen;
38745 case IX86_BUILTIN_GATHER3DIV4SF:
38746 icode = CODE_FOR_avx512vl_gatherdiv4sf;
38747 goto gather_gen;
38748 case IX86_BUILTIN_GATHER3DIV8SF:
38749 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38750 goto gather_gen;
38751 case IX86_BUILTIN_GATHER3SIV2DI:
38752 icode = CODE_FOR_avx512vl_gathersiv2di;
38753 goto gather_gen;
38754 case IX86_BUILTIN_GATHER3SIV4DI:
38755 icode = CODE_FOR_avx512vl_gathersiv4di;
38756 goto gather_gen;
38757 case IX86_BUILTIN_GATHER3DIV2DI:
38758 icode = CODE_FOR_avx512vl_gatherdiv2di;
38759 goto gather_gen;
38760 case IX86_BUILTIN_GATHER3DIV4DI:
38761 icode = CODE_FOR_avx512vl_gatherdiv4di;
38762 goto gather_gen;
38763 case IX86_BUILTIN_GATHER3SIV4SI:
38764 icode = CODE_FOR_avx512vl_gathersiv4si;
38765 goto gather_gen;
38766 case IX86_BUILTIN_GATHER3SIV8SI:
38767 icode = CODE_FOR_avx512vl_gathersiv8si;
38768 goto gather_gen;
38769 case IX86_BUILTIN_GATHER3DIV4SI:
38770 icode = CODE_FOR_avx512vl_gatherdiv4si;
38771 goto gather_gen;
38772 case IX86_BUILTIN_GATHER3DIV8SI:
38773 icode = CODE_FOR_avx512vl_gatherdiv8si;
38774 goto gather_gen;
38775 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38776 icode = CODE_FOR_avx512vl_gathersiv4df;
38777 goto gather_gen;
38778 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38779 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38780 goto gather_gen;
38781 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38782 icode = CODE_FOR_avx512vl_gathersiv4di;
38783 goto gather_gen;
38784 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38785 icode = CODE_FOR_avx512vl_gatherdiv8si;
38786 goto gather_gen;
38787 case IX86_BUILTIN_SCATTERSIV16SF:
38788 icode = CODE_FOR_avx512f_scattersiv16sf;
38789 goto scatter_gen;
38790 case IX86_BUILTIN_SCATTERSIV8DF:
38791 icode = CODE_FOR_avx512f_scattersiv8df;
38792 goto scatter_gen;
38793 case IX86_BUILTIN_SCATTERDIV16SF:
38794 icode = CODE_FOR_avx512f_scatterdiv16sf;
38795 goto scatter_gen;
38796 case IX86_BUILTIN_SCATTERDIV8DF:
38797 icode = CODE_FOR_avx512f_scatterdiv8df;
38798 goto scatter_gen;
38799 case IX86_BUILTIN_SCATTERSIV16SI:
38800 icode = CODE_FOR_avx512f_scattersiv16si;
38801 goto scatter_gen;
38802 case IX86_BUILTIN_SCATTERSIV8DI:
38803 icode = CODE_FOR_avx512f_scattersiv8di;
38804 goto scatter_gen;
38805 case IX86_BUILTIN_SCATTERDIV16SI:
38806 icode = CODE_FOR_avx512f_scatterdiv16si;
38807 goto scatter_gen;
38808 case IX86_BUILTIN_SCATTERDIV8DI:
38809 icode = CODE_FOR_avx512f_scatterdiv8di;
38810 goto scatter_gen;
38811 case IX86_BUILTIN_SCATTERSIV8SF:
38812 icode = CODE_FOR_avx512vl_scattersiv8sf;
38813 goto scatter_gen;
38814 case IX86_BUILTIN_SCATTERSIV4SF:
38815 icode = CODE_FOR_avx512vl_scattersiv4sf;
38816 goto scatter_gen;
38817 case IX86_BUILTIN_SCATTERSIV4DF:
38818 icode = CODE_FOR_avx512vl_scattersiv4df;
38819 goto scatter_gen;
38820 case IX86_BUILTIN_SCATTERSIV2DF:
38821 icode = CODE_FOR_avx512vl_scattersiv2df;
38822 goto scatter_gen;
38823 case IX86_BUILTIN_SCATTERDIV8SF:
38824 icode = CODE_FOR_avx512vl_scatterdiv8sf;
38825 goto scatter_gen;
38826 case IX86_BUILTIN_SCATTERDIV4SF:
38827 icode = CODE_FOR_avx512vl_scatterdiv4sf;
38828 goto scatter_gen;
38829 case IX86_BUILTIN_SCATTERDIV4DF:
38830 icode = CODE_FOR_avx512vl_scatterdiv4df;
38831 goto scatter_gen;
38832 case IX86_BUILTIN_SCATTERDIV2DF:
38833 icode = CODE_FOR_avx512vl_scatterdiv2df;
38834 goto scatter_gen;
38835 case IX86_BUILTIN_SCATTERSIV8SI:
38836 icode = CODE_FOR_avx512vl_scattersiv8si;
38837 goto scatter_gen;
38838 case IX86_BUILTIN_SCATTERSIV4SI:
38839 icode = CODE_FOR_avx512vl_scattersiv4si;
38840 goto scatter_gen;
38841 case IX86_BUILTIN_SCATTERSIV4DI:
38842 icode = CODE_FOR_avx512vl_scattersiv4di;
38843 goto scatter_gen;
38844 case IX86_BUILTIN_SCATTERSIV2DI:
38845 icode = CODE_FOR_avx512vl_scattersiv2di;
38846 goto scatter_gen;
38847 case IX86_BUILTIN_SCATTERDIV8SI:
38848 icode = CODE_FOR_avx512vl_scatterdiv8si;
38849 goto scatter_gen;
38850 case IX86_BUILTIN_SCATTERDIV4SI:
38851 icode = CODE_FOR_avx512vl_scatterdiv4si;
38852 goto scatter_gen;
38853 case IX86_BUILTIN_SCATTERDIV4DI:
38854 icode = CODE_FOR_avx512vl_scatterdiv4di;
38855 goto scatter_gen;
38856 case IX86_BUILTIN_SCATTERDIV2DI:
38857 icode = CODE_FOR_avx512vl_scatterdiv2di;
38858 goto scatter_gen;
38859 case IX86_BUILTIN_GATHERPFDPD:
38860 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
38861 goto vec_prefetch_gen;
38862 case IX86_BUILTIN_SCATTERALTSIV8DF:
38863 icode = CODE_FOR_avx512f_scattersiv8df;
38864 goto scatter_gen;
38865 case IX86_BUILTIN_SCATTERALTDIV16SF:
38866 icode = CODE_FOR_avx512f_scatterdiv16sf;
38867 goto scatter_gen;
38868 case IX86_BUILTIN_SCATTERALTSIV8DI:
38869 icode = CODE_FOR_avx512f_scattersiv8di;
38870 goto scatter_gen;
38871 case IX86_BUILTIN_SCATTERALTDIV16SI:
38872 icode = CODE_FOR_avx512f_scatterdiv16si;
38873 goto scatter_gen;
38874 case IX86_BUILTIN_GATHERPFDPS:
38875 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
38876 goto vec_prefetch_gen;
38877 case IX86_BUILTIN_GATHERPFQPD:
38878 icode = CODE_FOR_avx512pf_gatherpfv8didf;
38879 goto vec_prefetch_gen;
38880 case IX86_BUILTIN_GATHERPFQPS:
38881 icode = CODE_FOR_avx512pf_gatherpfv8disf;
38882 goto vec_prefetch_gen;
38883 case IX86_BUILTIN_SCATTERPFDPD:
38884 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
38885 goto vec_prefetch_gen;
38886 case IX86_BUILTIN_SCATTERPFDPS:
38887 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
38888 goto vec_prefetch_gen;
38889 case IX86_BUILTIN_SCATTERPFQPD:
38890 icode = CODE_FOR_avx512pf_scatterpfv8didf;
38891 goto vec_prefetch_gen;
38892 case IX86_BUILTIN_SCATTERPFQPS:
38893 icode = CODE_FOR_avx512pf_scatterpfv8disf;
38894 goto vec_prefetch_gen;
38896 gather_gen:
38897 rtx half;
38898 rtx (*gen) (rtx, rtx);
38900 arg0 = CALL_EXPR_ARG (exp, 0);
38901 arg1 = CALL_EXPR_ARG (exp, 1);
38902 arg2 = CALL_EXPR_ARG (exp, 2);
38903 arg3 = CALL_EXPR_ARG (exp, 3);
38904 arg4 = CALL_EXPR_ARG (exp, 4);
38905 op0 = expand_normal (arg0);
38906 op1 = expand_normal (arg1);
38907 op2 = expand_normal (arg2);
38908 op3 = expand_normal (arg3);
38909 op4 = expand_normal (arg4);
38910 /* Note the arg order is different from the operand order. */
38911 mode0 = insn_data[icode].operand[1].mode;
38912 mode2 = insn_data[icode].operand[3].mode;
38913 mode3 = insn_data[icode].operand[4].mode;
38914 mode4 = insn_data[icode].operand[5].mode;
38916 if (target == NULL_RTX
38917 || GET_MODE (target) != insn_data[icode].operand[0].mode
38918 || !insn_data[icode].operand[0].predicate (target,
38919 GET_MODE (target)))
38920 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
38921 else
38922 subtarget = target;
38924 switch (fcode)
38926 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38927 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38928 half = gen_reg_rtx (V8SImode);
38929 if (!nonimmediate_operand (op2, V16SImode))
38930 op2 = copy_to_mode_reg (V16SImode, op2);
38931 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38932 op2 = half;
38933 break;
38934 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38935 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38936 case IX86_BUILTIN_GATHERALTSIV4DF:
38937 case IX86_BUILTIN_GATHERALTSIV4DI:
38938 half = gen_reg_rtx (V4SImode);
38939 if (!nonimmediate_operand (op2, V8SImode))
38940 op2 = copy_to_mode_reg (V8SImode, op2);
38941 emit_insn (gen_vec_extract_lo_v8si (half, op2));
38942 op2 = half;
38943 break;
38944 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38945 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38946 half = gen_reg_rtx (mode0);
38947 if (mode0 == V8SFmode)
38948 gen = gen_vec_extract_lo_v16sf;
38949 else
38950 gen = gen_vec_extract_lo_v16si;
38951 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38952 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38953 emit_insn (gen (half, op0));
38954 op0 = half;
38955 if (GET_MODE (op3) != VOIDmode)
38957 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38958 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38959 emit_insn (gen (half, op3));
38960 op3 = half;
38962 break;
38963 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38964 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38965 case IX86_BUILTIN_GATHERALTDIV8SF:
38966 case IX86_BUILTIN_GATHERALTDIV8SI:
38967 half = gen_reg_rtx (mode0);
38968 if (mode0 == V4SFmode)
38969 gen = gen_vec_extract_lo_v8sf;
38970 else
38971 gen = gen_vec_extract_lo_v8si;
38972 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38973 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38974 emit_insn (gen (half, op0));
38975 op0 = half;
38976 if (GET_MODE (op3) != VOIDmode)
38978 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38979 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38980 emit_insn (gen (half, op3));
38981 op3 = half;
38983 break;
38984 default:
38985 break;
38988 /* Force the memory operand into a base register here.  We don't
38989 want to do this for the memory operands of other builtin
38990 functions. */
38991 op1 = ix86_zero_extend_to_Pmode (op1);
38993 if (!insn_data[icode].operand[1].predicate (op0, mode0))
38994 op0 = copy_to_mode_reg (mode0, op0);
38995 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
38996 op1 = copy_to_mode_reg (Pmode, op1);
38997 if (!insn_data[icode].operand[3].predicate (op2, mode2))
38998 op2 = copy_to_mode_reg (mode2, op2);
39000 op3 = fixup_modeless_constant (op3, mode3);
39002 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
39004 if (!insn_data[icode].operand[4].predicate (op3, mode3))
39005 op3 = copy_to_mode_reg (mode3, op3);
39007 else
39009 op3 = copy_to_reg (op3);
39010 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
39012 if (!insn_data[icode].operand[5].predicate (op4, mode4))
39014 error ("the last argument must be scale 1, 2, 4, 8");
39015 return const0_rtx;
39018 /* Optimize. If mask is known to have all high bits set,
39019 replace op0 with pc_rtx to signal that the instruction
39020 overwrites the whole destination and doesn't use its
39021 previous contents. */
39022 if (optimize)
39024 if (TREE_CODE (arg3) == INTEGER_CST)
39026 if (integer_all_onesp (arg3))
39027 op0 = pc_rtx;
39029 else if (TREE_CODE (arg3) == VECTOR_CST)
39031 unsigned int negative = 0;
39032 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
39034 tree cst = VECTOR_CST_ELT (arg3, i);
39035 if (TREE_CODE (cst) == INTEGER_CST
39036 && tree_int_cst_sign_bit (cst))
39037 negative++;
39038 else if (TREE_CODE (cst) == REAL_CST
39039 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
39040 negative++;
39042 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
39043 op0 = pc_rtx;
39045 else if (TREE_CODE (arg3) == SSA_NAME
39046 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
39048 /* Recognize also when mask is like:
39049 __v2df src = _mm_setzero_pd ();
39050 __v2df mask = _mm_cmpeq_pd (src, src);
39052 __v8sf src = _mm256_setzero_ps ();
39053 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
39054 as that is a cheaper way to load all ones into
39055 a register than having to load a constant from
39056 memory. */
39057 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
39058 if (is_gimple_call (def_stmt))
39060 tree fndecl = gimple_call_fndecl (def_stmt);
39061 if (fndecl
39062 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
39063 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
39065 case IX86_BUILTIN_CMPPD:
39066 case IX86_BUILTIN_CMPPS:
39067 case IX86_BUILTIN_CMPPD256:
39068 case IX86_BUILTIN_CMPPS256:
39069 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
39070 break;
39071 /* FALLTHRU */
39072 case IX86_BUILTIN_CMPEQPD:
39073 case IX86_BUILTIN_CMPEQPS:
39074 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
39075 && initializer_zerop (gimple_call_arg (def_stmt,
39076 1)))
39077 op0 = pc_rtx;
39078 break;
39079 default:
39080 break;
39086 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
39087 if (! pat)
39088 return const0_rtx;
39089 emit_insn (pat);
39091 switch (fcode)
39093 case IX86_BUILTIN_GATHER3DIV16SF:
39094 if (target == NULL_RTX)
39095 target = gen_reg_rtx (V8SFmode);
39096 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
39097 break;
39098 case IX86_BUILTIN_GATHER3DIV16SI:
39099 if (target == NULL_RTX)
39100 target = gen_reg_rtx (V8SImode);
39101 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
39102 break;
39103 case IX86_BUILTIN_GATHER3DIV8SF:
39104 case IX86_BUILTIN_GATHERDIV8SF:
39105 if (target == NULL_RTX)
39106 target = gen_reg_rtx (V4SFmode);
39107 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
39108 break;
39109 case IX86_BUILTIN_GATHER3DIV8SI:
39110 case IX86_BUILTIN_GATHERDIV8SI:
39111 if (target == NULL_RTX)
39112 target = gen_reg_rtx (V4SImode);
39113 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
39114 break;
39115 default:
39116 target = subtarget;
39117 break;
39119 return target;
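/* For illustration, a usage sketch of one of the gathers expanded above
   (an assumption: the AVX2 _mm256_i32gather_pd wrapper): the last
   argument is the scale and must be a literal 1, 2, 4 or 8, matching
   the operand 5 predicate check above:

     #include <immintrin.h>
     __m256d gather4 (const double *base, __m128i idx)
     {
       return _mm256_i32gather_pd (base, idx, 8);  // scale 8: idx counts doubles
     }
*/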
39121 scatter_gen:
39122 arg0 = CALL_EXPR_ARG (exp, 0);
39123 arg1 = CALL_EXPR_ARG (exp, 1);
39124 arg2 = CALL_EXPR_ARG (exp, 2);
39125 arg3 = CALL_EXPR_ARG (exp, 3);
39126 arg4 = CALL_EXPR_ARG (exp, 4);
39127 op0 = expand_normal (arg0);
39128 op1 = expand_normal (arg1);
39129 op2 = expand_normal (arg2);
39130 op3 = expand_normal (arg3);
39131 op4 = expand_normal (arg4);
39132 mode1 = insn_data[icode].operand[1].mode;
39133 mode2 = insn_data[icode].operand[2].mode;
39134 mode3 = insn_data[icode].operand[3].mode;
39135 mode4 = insn_data[icode].operand[4].mode;
39137 /* The scatter instruction stores operand op3 to memory using
39138 indices from op2 and the scale from op4, under writemask op1.
39139 If index operand op2 has more elements than source operand
39140 op3, only its low half is used, and vice versa. */
39141 switch (fcode)
39143 case IX86_BUILTIN_SCATTERALTSIV8DF:
39144 case IX86_BUILTIN_SCATTERALTSIV8DI:
39145 half = gen_reg_rtx (V8SImode);
39146 if (!nonimmediate_operand (op2, V16SImode))
39147 op2 = copy_to_mode_reg (V16SImode, op2);
39148 emit_insn (gen_vec_extract_lo_v16si (half, op2));
39149 op2 = half;
39150 break;
39151 case IX86_BUILTIN_SCATTERALTDIV16SF:
39152 case IX86_BUILTIN_SCATTERALTDIV16SI:
39153 half = gen_reg_rtx (mode3);
39154 if (mode3 == V8SFmode)
39155 gen = gen_vec_extract_lo_v16sf;
39156 else
39157 gen = gen_vec_extract_lo_v16si;
39158 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39159 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39160 emit_insn (gen (half, op3));
39161 op3 = half;
39162 break;
39163 default:
39164 break;
39167 /* Force the memory operand into a base register here.  We don't
39168 want to do this for the memory operands of other builtin
39169 functions. */
39170 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
39172 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
39173 op0 = copy_to_mode_reg (Pmode, op0);
39175 op1 = fixup_modeless_constant (op1, mode1);
39177 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
39179 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39180 op1 = copy_to_mode_reg (mode1, op1);
39182 else
39184 op1 = copy_to_reg (op1);
39185 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
39188 if (!insn_data[icode].operand[2].predicate (op2, mode2))
39189 op2 = copy_to_mode_reg (mode2, op2);
39191 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39192 op3 = copy_to_mode_reg (mode3, op3);
39194 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39196 error ("the last argument must be scale 1, 2, 4, 8");
39197 return const0_rtx;
39200 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39201 if (! pat)
39202 return const0_rtx;
39204 emit_insn (pat);
39205 return 0;
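/* For illustration, a usage sketch of one of the scatters expanded above
   (an assumption: the AVX-512F _mm512_i32scatter_ps wrapper and its
   base/index/value/scale argument order); as with gather, the scale
   operand must be a literal 1, 2, 4 or 8:

     #include <immintrin.h>
     void scatter16 (float *base, __m512i idx, __m512 v)
     {
       _mm512_i32scatter_ps (base, idx, v, 4);  // scale 4: idx counts floats
     }
*/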
39207 vec_prefetch_gen:
39208 arg0 = CALL_EXPR_ARG (exp, 0);
39209 arg1 = CALL_EXPR_ARG (exp, 1);
39210 arg2 = CALL_EXPR_ARG (exp, 2);
39211 arg3 = CALL_EXPR_ARG (exp, 3);
39212 arg4 = CALL_EXPR_ARG (exp, 4);
39213 op0 = expand_normal (arg0);
39214 op1 = expand_normal (arg1);
39215 op2 = expand_normal (arg2);
39216 op3 = expand_normal (arg3);
39217 op4 = expand_normal (arg4);
39218 mode0 = insn_data[icode].operand[0].mode;
39219 mode1 = insn_data[icode].operand[1].mode;
39220 mode3 = insn_data[icode].operand[3].mode;
39221 mode4 = insn_data[icode].operand[4].mode;
39223 op0 = fixup_modeless_constant (op0, mode0);
39225 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
39227 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39228 op0 = copy_to_mode_reg (mode0, op0);
39230 else
39232 op0 = copy_to_reg (op0);
39233 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
39236 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39237 op1 = copy_to_mode_reg (mode1, op1);
39239 /* Force the memory operand to use only a base register here. But we
39240 don't want to do it on the memory operand for other builtin
39241 functions. */
39242 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
39244 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
39245 op2 = copy_to_mode_reg (Pmode, op2);
39247 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39249 error ("the fourth argument must be scale 1, 2, 4, 8");
39250 return const0_rtx;
39253 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39255 error ("incorrect hint operand");
39256 return const0_rtx;
39259 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39260 if (! pat)
39261 return const0_rtx;
39263 emit_insn (pat);
39265 return 0;
39267 case IX86_BUILTIN_XABORT:
39268 icode = CODE_FOR_xabort;
39269 arg0 = CALL_EXPR_ARG (exp, 0);
39270 op0 = expand_normal (arg0);
39271 mode0 = insn_data[icode].operand[0].mode;
39272 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39274 error ("the argument to xabort must be an 8-bit immediate");
39275 return const0_rtx;
39277 emit_insn (gen_xabort (op0));
39278 return 0;
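/* User-level sketch of the constraint enforced above (assuming the usual
   <immintrin.h> _xabort wrapper around this builtin): _xabort (0x42) is
   accepted because 0x42 fits in the imm8 field of the XABORT instruction,
   while passing a run-time variable fails the operand predicate and is
   rejected with the error above. */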
39280 default:
39281 break;
39284 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
39285 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
39287 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
39288 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
39289 target);
39292 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
39293 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
39295 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
39296 switch (fcode)
39298 case IX86_BUILTIN_FABSQ:
39299 case IX86_BUILTIN_COPYSIGNQ:
39300 if (!TARGET_SSE)
39301 /* Emit a normal call if SSE isn't available. */
39302 return expand_call (exp, target, ignore);
39303 /* FALLTHRU */
39304 default:
39305 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
39309 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
39310 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
39312 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
39313 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
39314 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
39315 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
39316 int masked = 1;
39317 machine_mode mode, wide_mode, nar_mode;
39319 nar_mode = V4SFmode;
39320 mode = V16SFmode;
39321 wide_mode = V64SFmode;
39322 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
39323 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
39325 switch (fcode)
39327 case IX86_BUILTIN_4FMAPS:
39328 fcn = gen_avx5124fmaddps_4fmaddps;
39329 masked = 0;
39330 goto v4fma_expand;
39332 case IX86_BUILTIN_4DPWSSD:
39333 nar_mode = V4SImode;
39334 mode = V16SImode;
39335 wide_mode = V64SImode;
39336 fcn = gen_avx5124vnniw_vp4dpwssd;
39337 masked = 0;
39338 goto v4fma_expand;
39340 case IX86_BUILTIN_4DPWSSDS:
39341 nar_mode = V4SImode;
39342 mode = V16SImode;
39343 wide_mode = V64SImode;
39344 fcn = gen_avx5124vnniw_vp4dpwssds;
39345 masked = 0;
39346 goto v4fma_expand;
39348 case IX86_BUILTIN_4FNMAPS:
39349 fcn = gen_avx5124fmaddps_4fnmaddps;
39350 masked = 0;
39351 goto v4fma_expand;
39353 case IX86_BUILTIN_4FNMAPS_MASK:
39354 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
39355 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
39356 goto v4fma_expand;
39358 case IX86_BUILTIN_4DPWSSD_MASK:
39359 nar_mode = V4SImode;
39360 mode = V16SImode;
39361 wide_mode = V64SImode;
39362 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
39363 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
39364 goto v4fma_expand;
39366 case IX86_BUILTIN_4DPWSSDS_MASK:
39367 nar_mode = V4SImode;
39368 mode = V16SImode;
39369 wide_mode = V64SImode;
39370 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
39371 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
39372 goto v4fma_expand;
39374 case IX86_BUILTIN_4FMAPS_MASK:
39376 tree args[4];
39377 rtx ops[4];
39378 rtx wide_reg;
39379 rtx accum;
39380 rtx addr;
39381 rtx mem;
39383 v4fma_expand:
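/* Argument layout handled below: call arguments 0-3 are the four narrow
   source vectors packed into WIDE_REG, argument 4 is the accumulator,
   argument 5 is the memory address, and for the masked variants argument 6
   is the merge source and argument 7 is the mask. */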
39384 wide_reg = gen_reg_rtx (wide_mode);
39385 for (i = 0; i < 4; i++)
39387 args[i] = CALL_EXPR_ARG (exp, i);
39388 ops[i] = expand_normal (args[i]);
39390 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
39391 ops[i]);
39394 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39395 accum = force_reg (mode, accum);
39397 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39398 addr = force_reg (Pmode, addr);
39400 mem = gen_rtx_MEM (nar_mode, addr);
39402 target = gen_reg_rtx (mode);
39404 emit_move_insn (target, accum);
39406 if (! masked)
39407 emit_insn (fcn (target, accum, wide_reg, mem));
39408 else
39410 rtx merge, mask;
39411 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39413 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39415 if (CONST_INT_P (mask))
39416 mask = fixup_modeless_constant (mask, HImode);
39418 mask = force_reg (HImode, mask);
39420 if (GET_MODE (mask) != HImode)
39421 mask = gen_rtx_SUBREG (HImode, mask, 0);
39423 /* If merge is 0 then we're about to emit z-masked variant. */
39424 if (const0_operand (merge, mode))
39425 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39426 /* If merge is the same as accum then emit merge-masked variant. */
39427 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39429 merge = force_reg (mode, merge);
39430 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39432 /* Merge with something unknown might happen if we z-mask w/ -O0. */
39433 else
39435 target = gen_reg_rtx (mode);
39436 emit_move_insn (target, merge);
39437 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39440 return target;
39443 case IX86_BUILTIN_4FNMASS:
39444 fcn = gen_avx5124fmaddps_4fnmaddss;
39445 masked = 0;
39446 goto s4fma_expand;
39448 case IX86_BUILTIN_4FMASS:
39449 fcn = gen_avx5124fmaddps_4fmaddss;
39450 masked = 0;
39451 goto s4fma_expand;
39453 case IX86_BUILTIN_4FNMASS_MASK:
39454 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
39455 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
39456 goto s4fma_expand;
39458 case IX86_BUILTIN_4FMASS_MASK:
39460 tree args[4];
39461 rtx ops[4];
39462 rtx wide_reg;
39463 rtx accum;
39464 rtx addr;
39465 rtx mem;
39467 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
39468 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
39470 s4fma_expand:
39471 mode = V4SFmode;
39472 wide_reg = gen_reg_rtx (V64SFmode);
39473 for (i = 0; i < 4; i++)
39475 rtx tmp;
39476 args[i] = CALL_EXPR_ARG (exp, i);
39477 ops[i] = expand_normal (args[i]);
39479 tmp = gen_reg_rtx (SFmode);
39480 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
39482 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
39483 gen_rtx_SUBREG (V16SFmode, tmp, 0));
39486 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39487 accum = force_reg (V4SFmode, accum);
39489 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39490 addr = force_reg (Pmode, addr);
39492 mem = gen_rtx_MEM (V4SFmode, addr);
39494 target = gen_reg_rtx (V4SFmode);
39496 emit_move_insn (target, accum);
39498 if (! masked)
39499 emit_insn (fcn (target, accum, wide_reg, mem));
39500 else
39502 rtx merge, mask;
39503 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39505 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39507 if (CONST_INT_P (mask))
39508 mask = fixup_modeless_constant (mask, QImode);
39510 mask = force_reg (QImode, mask);
39512 if (GET_MODE (mask) != QImode)
39513 mask = gen_rtx_SUBREG (QImode, mask, 0);
39515 /* If merge is 0 then we're about to emit z-masked variant. */
39516 if (const0_operand (merge, mode))
39517 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39518 /* If merge is the same as accum then emit merge-masked
39519 variant. */
39520 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39522 merge = force_reg (mode, merge);
39523 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39525 /* Merge with something unknown might happen if we z-mask
39526 w/ -O0. */
39527 else
39529 target = gen_reg_rtx (mode);
39530 emit_move_insn (target, merge);
39531 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39534 return target;
39536 case IX86_BUILTIN_RDPID:
39537 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
39538 target);
39539 default:
39540 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
39544 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
39545 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
39547 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
39548 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
39551 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
39552 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
39554 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
39555 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
39558 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
39559 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
39561 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
39562 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
39565 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
39566 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
39568 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
39569 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
39572 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
39573 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
39575 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
39576 const struct builtin_description *d = bdesc_multi_arg + i;
39577 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
39578 (enum ix86_builtin_func_type)
39579 d->flag, d->comparison);
39582 gcc_unreachable ();
39585 /* This returns the target-specific builtin with code CODE if
39586 current_function_decl has visibility on this builtin, which is checked
39587 using isa flags. Returns NULL_TREE otherwise. */
39589 static tree ix86_get_builtin (enum ix86_builtins code)
39591 struct cl_target_option *opts;
39592 tree target_tree = NULL_TREE;
39594 /* Determine the isa flags of current_function_decl. */
39596 if (current_function_decl)
39597 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
39599 if (target_tree == NULL)
39600 target_tree = target_option_default_node;
39602 opts = TREE_TARGET_OPTION (target_tree);
39604 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
39605 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
39606 return ix86_builtin_decl (code, true);
39607 else
39608 return NULL_TREE;
39611 /* Return the function decl for the target-specific builtin
39612 for the given MPX builtin passed in FCODE. */
39613 static tree
39614 ix86_builtin_mpx_function (unsigned fcode)
39616 switch (fcode)
39618 case BUILT_IN_CHKP_BNDMK:
39619 return ix86_builtins[IX86_BUILTIN_BNDMK];
39621 case BUILT_IN_CHKP_BNDSTX:
39622 return ix86_builtins[IX86_BUILTIN_BNDSTX];
39624 case BUILT_IN_CHKP_BNDLDX:
39625 return ix86_builtins[IX86_BUILTIN_BNDLDX];
39627 case BUILT_IN_CHKP_BNDCL:
39628 return ix86_builtins[IX86_BUILTIN_BNDCL];
39630 case BUILT_IN_CHKP_BNDCU:
39631 return ix86_builtins[IX86_BUILTIN_BNDCU];
39633 case BUILT_IN_CHKP_BNDRET:
39634 return ix86_builtins[IX86_BUILTIN_BNDRET];
39636 case BUILT_IN_CHKP_INTERSECT:
39637 return ix86_builtins[IX86_BUILTIN_BNDINT];
39639 case BUILT_IN_CHKP_NARROW:
39640 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
39642 case BUILT_IN_CHKP_SIZEOF:
39643 return ix86_builtins[IX86_BUILTIN_SIZEOF];
39645 case BUILT_IN_CHKP_EXTRACT_LOWER:
39646 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
39648 case BUILT_IN_CHKP_EXTRACT_UPPER:
39649 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
39651 default:
39652 return NULL_TREE;
39655 gcc_unreachable ();
39658 /* Helper function for ix86_load_bounds and ix86_store_bounds.
39660 Return an address to be used to load/store bounds for pointer
39661 passed in SLOT.
39663 SLOT_NO is an integer constant holding number of a target
39664 dependent special slot to be used in case SLOT is not a memory.
39666 SPECIAL_BASE is a pointer to be used as a base of fake address
39667 to access special slots in Bounds Table. SPECIAL_BASE[-1],
39668 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
39670 static rtx
39671 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
39673 rtx addr = NULL;
39675 /* A NULL slot means we pass bounds for a pointer not passed to the
39676 function at all. A register slot means we pass the pointer in a
39677 register. In both these cases bounds are passed via the Bounds
39678 Table. Since we do not have an actual pointer stored in memory,
39679 we have to use fake addresses to access the Bounds Table. We
39680 start with (special_base - sizeof (void*)) and decrease this
39681 address by the pointer size to get addresses for other slots. */
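/* Worked example (assuming 64-bit Pmode, so GET_MODE_SIZE (Pmode) is 8):
   slot_no 0 maps to special_base - 8 and slot_no 1 to special_base - 16,
   i.e. the SPECIAL_BASE[-1], SPECIAL_BASE[-2] locations mentioned in the
   comment before this function. */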
39682 if (!slot || REG_P (slot))
39684 gcc_assert (CONST_INT_P (slot_no));
39685 addr = plus_constant (Pmode, special_base,
39686 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
39688 /* If the pointer is passed in memory then its address is used to
39689 access the Bounds Table. */
39690 else if (MEM_P (slot))
39692 addr = XEXP (slot, 0);
39693 if (!register_operand (addr, Pmode))
39694 addr = copy_addr_to_reg (addr);
39696 else
39697 gcc_unreachable ();
39699 return addr;
39702 /* Expand pass uses this hook to load bounds for function parameter
39703 PTR passed in SLOT in case its bounds are not passed in a register.
39705 If SLOT is a memory, then bounds are loaded as for regular pointer
39706 loaded from memory. PTR may be NULL in case SLOT is a memory.
39707 In such case value of PTR (if required) may be loaded from SLOT.
39709 If SLOT is NULL or a register then SLOT_NO is an integer constant
39710 holding number of the target dependent special slot which should be
39711 used to obtain bounds.
39713 Return loaded bounds. */
39715 static rtx
39716 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
39718 rtx reg = gen_reg_rtx (BNDmode);
39719 rtx addr;
39721 /* Get address to be used to access Bounds Table. Special slots start
39722 at the location of return address of the current function. */
39723 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
39725 /* Load the pointer value from memory if we don't have it. */
39726 if (!ptr)
39728 gcc_assert (MEM_P (slot));
39729 ptr = copy_addr_to_reg (slot);
39732 if (!register_operand (ptr, Pmode))
39733 ptr = ix86_zero_extend_to_Pmode (ptr);
39735 emit_insn (BNDmode == BND64mode
39736 ? gen_bnd64_ldx (reg, addr, ptr)
39737 : gen_bnd32_ldx (reg, addr, ptr));
39739 return reg;
39742 /* Expand pass uses this hook to store BOUNDS for call argument PTR
39743 passed in SLOT in case BOUNDS are not passed in a register.
39745 If SLOT is a memory, then BOUNDS are stored as for regular pointer
39746 stored in memory. PTR may be NULL in case SLOT is a memory.
39747 In such case value of PTR (if required) may be loaded from SLOT.
39749 If SLOT is NULL or a register then SLOT_NO is an integer constant
39750 holding number of the target dependent special slot which should be
39751 used to store BOUNDS. */
39753 static void
39754 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
39756 rtx addr;
39758 /* Get address to be used to access Bounds Table. Special slots start
39759 at the location of return address of a called function. */
39760 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
39762 /* Load pointer value from a memory if we don't have it. */
39763 if (!ptr)
39765 gcc_assert (MEM_P (slot));
39766 ptr = copy_addr_to_reg (slot);
39769 if (!register_operand (ptr, Pmode))
39770 ptr = ix86_zero_extend_to_Pmode (ptr);
39772 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
39773 if (!register_operand (bounds, BNDmode))
39774 bounds = copy_to_mode_reg (BNDmode, bounds);
39776 emit_insn (BNDmode == BND64mode
39777 ? gen_bnd64_stx (addr, ptr, bounds)
39778 : gen_bnd32_stx (addr, ptr, bounds));
39781 /* Load and return bounds returned by function in SLOT. */
39783 static rtx
39784 ix86_load_returned_bounds (rtx slot)
39786 rtx res;
39788 gcc_assert (REG_P (slot));
39789 res = gen_reg_rtx (BNDmode);
39790 emit_move_insn (res, slot);
39792 return res;
39795 /* Store BOUNDS returned by function into SLOT. */
39797 static void
39798 ix86_store_returned_bounds (rtx slot, rtx bounds)
39800 gcc_assert (REG_P (slot));
39801 emit_move_insn (slot, bounds);
39804 /* Returns a function decl for a vectorized version of the combined function
39805 with combined_fn code FN and the result vector type TYPE_OUT, or NULL_TREE
39806 if it is not available. */
39808 static tree
39809 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
39810 tree type_in)
39812 machine_mode in_mode, out_mode;
39813 int in_n, out_n;
39815 if (TREE_CODE (type_out) != VECTOR_TYPE
39816 || TREE_CODE (type_in) != VECTOR_TYPE)
39817 return NULL_TREE;
39819 out_mode = TYPE_MODE (TREE_TYPE (type_out));
39820 out_n = TYPE_VECTOR_SUBPARTS (type_out);
39821 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39822 in_n = TYPE_VECTOR_SUBPARTS (type_in);
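/* Illustrative mapping: vectorizing floor over 2 x double, i.e. CFN_FLOOR
   with V2DF for both TYPE_OUT and TYPE_IN, resolves to IX86_BUILTIN_FLOORPD
   below, provided TARGET_ROUND is set, -ftrapping-math is disabled and the
   builtin is enabled for the current ISA. */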
39824 switch (fn)
39826 CASE_CFN_EXP2:
39827 if (out_mode == SFmode && in_mode == SFmode)
39829 if (out_n == 16 && in_n == 16)
39830 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
39832 break;
39834 CASE_CFN_IFLOOR:
39835 CASE_CFN_LFLOOR:
39836 CASE_CFN_LLFLOOR:
39837 /* The round insn does not trap on denormals. */
39838 if (flag_trapping_math || !TARGET_ROUND)
39839 break;
39841 if (out_mode == SImode && in_mode == DFmode)
39843 if (out_n == 4 && in_n == 2)
39844 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
39845 else if (out_n == 8 && in_n == 4)
39846 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
39847 else if (out_n == 16 && in_n == 8)
39848 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
39850 if (out_mode == SImode && in_mode == SFmode)
39852 if (out_n == 4 && in_n == 4)
39853 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
39854 else if (out_n == 8 && in_n == 8)
39855 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
39856 else if (out_n == 16 && in_n == 16)
39857 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
39859 break;
39861 CASE_CFN_ICEIL:
39862 CASE_CFN_LCEIL:
39863 CASE_CFN_LLCEIL:
39864 /* The round insn does not trap on denormals. */
39865 if (flag_trapping_math || !TARGET_ROUND)
39866 break;
39868 if (out_mode == SImode && in_mode == DFmode)
39870 if (out_n == 4 && in_n == 2)
39871 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
39872 else if (out_n == 8 && in_n == 4)
39873 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
39874 else if (out_n == 16 && in_n == 8)
39875 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
39877 if (out_mode == SImode && in_mode == SFmode)
39879 if (out_n == 4 && in_n == 4)
39880 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
39881 else if (out_n == 8 && in_n == 8)
39882 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
39883 else if (out_n == 16 && in_n == 16)
39884 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
39886 break;
39888 CASE_CFN_IRINT:
39889 CASE_CFN_LRINT:
39890 CASE_CFN_LLRINT:
39891 if (out_mode == SImode && in_mode == DFmode)
39893 if (out_n == 4 && in_n == 2)
39894 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
39895 else if (out_n == 8 && in_n == 4)
39896 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
39897 else if (out_n == 16 && in_n == 8)
39898 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
39900 if (out_mode == SImode && in_mode == SFmode)
39902 if (out_n == 4 && in_n == 4)
39903 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
39904 else if (out_n == 8 && in_n == 8)
39905 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
39906 else if (out_n == 16 && in_n == 16)
39907 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
39909 break;
39911 CASE_CFN_IROUND:
39912 CASE_CFN_LROUND:
39913 CASE_CFN_LLROUND:
39914 /* The round insn does not trap on denormals. */
39915 if (flag_trapping_math || !TARGET_ROUND)
39916 break;
39918 if (out_mode == SImode && in_mode == DFmode)
39920 if (out_n == 4 && in_n == 2)
39921 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
39922 else if (out_n == 8 && in_n == 4)
39923 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
39924 else if (out_n == 16 && in_n == 8)
39925 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
39927 if (out_mode == SImode && in_mode == SFmode)
39929 if (out_n == 4 && in_n == 4)
39930 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
39931 else if (out_n == 8 && in_n == 8)
39932 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
39933 else if (out_n == 16 && in_n == 16)
39934 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
39936 break;
39938 CASE_CFN_FLOOR:
39939 /* The round insn does not trap on denormals. */
39940 if (flag_trapping_math || !TARGET_ROUND)
39941 break;
39943 if (out_mode == DFmode && in_mode == DFmode)
39945 if (out_n == 2 && in_n == 2)
39946 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
39947 else if (out_n == 4 && in_n == 4)
39948 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
39949 else if (out_n == 8 && in_n == 8)
39950 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
39952 if (out_mode == SFmode && in_mode == SFmode)
39954 if (out_n == 4 && in_n == 4)
39955 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
39956 else if (out_n == 8 && in_n == 8)
39957 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
39958 else if (out_n == 16 && in_n == 16)
39959 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
39961 break;
39963 CASE_CFN_CEIL:
39964 /* The round insn does not trap on denormals. */
39965 if (flag_trapping_math || !TARGET_ROUND)
39966 break;
39968 if (out_mode == DFmode && in_mode == DFmode)
39970 if (out_n == 2 && in_n == 2)
39971 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
39972 else if (out_n == 4 && in_n == 4)
39973 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
39974 else if (out_n == 8 && in_n == 8)
39975 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
39977 if (out_mode == SFmode && in_mode == SFmode)
39979 if (out_n == 4 && in_n == 4)
39980 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
39981 else if (out_n == 8 && in_n == 8)
39982 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
39983 else if (out_n == 16 && in_n == 16)
39984 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
39986 break;
39988 CASE_CFN_TRUNC:
39989 /* The round insn does not trap on denormals. */
39990 if (flag_trapping_math || !TARGET_ROUND)
39991 break;
39993 if (out_mode == DFmode && in_mode == DFmode)
39995 if (out_n == 2 && in_n == 2)
39996 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
39997 else if (out_n == 4 && in_n == 4)
39998 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
39999 else if (out_n == 8 && in_n == 8)
40000 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
40002 if (out_mode == SFmode && in_mode == SFmode)
40004 if (out_n == 4 && in_n == 4)
40005 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
40006 else if (out_n == 8 && in_n == 8)
40007 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
40008 else if (out_n == 16 && in_n == 16)
40009 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
40011 break;
40013 CASE_CFN_RINT:
40014 /* The round insn does not trap on denormals. */
40015 if (flag_trapping_math || !TARGET_ROUND)
40016 break;
40018 if (out_mode == DFmode && in_mode == DFmode)
40020 if (out_n == 2 && in_n == 2)
40021 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
40022 else if (out_n == 4 && in_n == 4)
40023 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
40025 if (out_mode == SFmode && in_mode == SFmode)
40027 if (out_n == 4 && in_n == 4)
40028 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
40029 else if (out_n == 8 && in_n == 8)
40030 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
40032 break;
40034 CASE_CFN_FMA:
40035 if (out_mode == DFmode && in_mode == DFmode)
40037 if (out_n == 2 && in_n == 2)
40038 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
40039 if (out_n == 4 && in_n == 4)
40040 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
40042 if (out_mode == SFmode && in_mode == SFmode)
40044 if (out_n == 4 && in_n == 4)
40045 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
40046 if (out_n == 8 && in_n == 8)
40047 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
40049 break;
40051 default:
40052 break;
40055 /* Dispatch to a handler for a vectorization library. */
40056 if (ix86_veclib_handler)
40057 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
40059 return NULL_TREE;
40062 /* Handler for an SVML-style interface to
40063 a library with vectorized intrinsics. */
40065 static tree
40066 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
40068 char name[20];
40069 tree fntype, new_fndecl, args;
40070 unsigned arity;
40071 const char *bname;
40072 machine_mode el_mode, in_mode;
40073 int n, in_n;
40075 /* The SVML is suitable for unsafe math only. */
40076 if (!flag_unsafe_math_optimizations)
40077 return NULL_TREE;
40079 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40080 n = TYPE_VECTOR_SUBPARTS (type_out);
40081 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40082 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40083 if (el_mode != in_mode
40084 || n != in_n)
40085 return NULL_TREE;
40087 switch (fn)
40089 CASE_CFN_EXP:
40090 CASE_CFN_LOG:
40091 CASE_CFN_LOG10:
40092 CASE_CFN_POW:
40093 CASE_CFN_TANH:
40094 CASE_CFN_TAN:
40095 CASE_CFN_ATAN:
40096 CASE_CFN_ATAN2:
40097 CASE_CFN_ATANH:
40098 CASE_CFN_CBRT:
40099 CASE_CFN_SINH:
40100 CASE_CFN_SIN:
40101 CASE_CFN_ASINH:
40102 CASE_CFN_ASIN:
40103 CASE_CFN_COSH:
40104 CASE_CFN_COS:
40105 CASE_CFN_ACOSH:
40106 CASE_CFN_ACOS:
40107 if ((el_mode != DFmode || n != 2)
40108 && (el_mode != SFmode || n != 4))
40109 return NULL_TREE;
40110 break;
40112 default:
40113 return NULL_TREE;
40116 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40117 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40119 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
40120 strcpy (name, "vmlsLn4");
40121 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
40122 strcpy (name, "vmldLn2");
40123 else if (n == 4)
40125 sprintf (name, "vmls%s", bname+10);
40126 name[strlen (name)-1] = '4';
40128 else
40129 sprintf (name, "vmld%s2", bname+10);
40131 /* Convert to uppercase. */
40132 name[4] &= ~0x20;
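/* Names produced by the mangling above, for illustration: "__builtin_sinf"
   with n == 4 becomes "vmlsSin4", "__builtin_sin" becomes "vmldSin2", and
   the log cases are special-cased to "vmlsLn4" / "vmldLn2". */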
40134 arity = 0;
40135 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40136 arity++;
40138 if (arity == 1)
40139 fntype = build_function_type_list (type_out, type_in, NULL);
40140 else
40141 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40143 /* Build a function declaration for the vectorized function. */
40144 new_fndecl = build_decl (BUILTINS_LOCATION,
40145 FUNCTION_DECL, get_identifier (name), fntype);
40146 TREE_PUBLIC (new_fndecl) = 1;
40147 DECL_EXTERNAL (new_fndecl) = 1;
40148 DECL_IS_NOVOPS (new_fndecl) = 1;
40149 TREE_READONLY (new_fndecl) = 1;
40151 return new_fndecl;
40154 /* Handler for an ACML-style interface to
40155 a library with vectorized intrinsics. */
40157 static tree
40158 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
40160 char name[20] = "__vr.._";
40161 tree fntype, new_fndecl, args;
40162 unsigned arity;
40163 const char *bname;
40164 machine_mode el_mode, in_mode;
40165 int n, in_n;
40167 /* The ACML is 64-bit only and suitable for unsafe math only, as
40168 it does not correctly support parts of IEEE with the required
40169 precision, such as denormals. */
40170 if (!TARGET_64BIT
40171 || !flag_unsafe_math_optimizations)
40172 return NULL_TREE;
40174 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40175 n = TYPE_VECTOR_SUBPARTS (type_out);
40176 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40177 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40178 if (el_mode != in_mode
40179 || n != in_n)
40180 return NULL_TREE;
40182 switch (fn)
40184 CASE_CFN_SIN:
40185 CASE_CFN_COS:
40186 CASE_CFN_EXP:
40187 CASE_CFN_LOG:
40188 CASE_CFN_LOG2:
40189 CASE_CFN_LOG10:
40190 if (el_mode == DFmode && n == 2)
40192 name[4] = 'd';
40193 name[5] = '2';
40195 else if (el_mode == SFmode && n == 4)
40197 name[4] = 's';
40198 name[5] = '4';
40200 else
40201 return NULL_TREE;
40202 break;
40204 default:
40205 return NULL_TREE;
40208 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40209 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40210 sprintf (name + 7, "%s", bname+10);
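/* For illustration: with el_mode == DFmode and n == 2 the "__vr.._" template
   becomes "__vrd2_", so "__builtin_sin" maps to "__vrd2_sin"; the SFmode,
   n == 4 case yields names such as "__vrs4_sinf". */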
40212 arity = 0;
40213 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40214 arity++;
40216 if (arity == 1)
40217 fntype = build_function_type_list (type_out, type_in, NULL);
40218 else
40219 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40221 /* Build a function declaration for the vectorized function. */
40222 new_fndecl = build_decl (BUILTINS_LOCATION,
40223 FUNCTION_DECL, get_identifier (name), fntype);
40224 TREE_PUBLIC (new_fndecl) = 1;
40225 DECL_EXTERNAL (new_fndecl) = 1;
40226 DECL_IS_NOVOPS (new_fndecl) = 1;
40227 TREE_READONLY (new_fndecl) = 1;
40229 return new_fndecl;
40232 /* Returns a decl of a function that implements gather load with
40233 memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
40234 Return NULL_TREE if it is not available. */
40236 static tree
40237 ix86_vectorize_builtin_gather (const_tree mem_vectype,
40238 const_tree index_type, int scale)
40240 bool si;
40241 enum ix86_builtins code;
40243 if (! TARGET_AVX2)
40244 return NULL_TREE;
40246 if ((TREE_CODE (index_type) != INTEGER_TYPE
40247 && !POINTER_TYPE_P (index_type))
40248 || (TYPE_MODE (index_type) != SImode
40249 && TYPE_MODE (index_type) != DImode))
40250 return NULL_TREE;
40252 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40253 return NULL_TREE;
40255 /* v*gather* insn sign extends index to pointer mode. */
40256 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40257 && TYPE_UNSIGNED (index_type))
40258 return NULL_TREE;
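/* Only scales 1, 2, 4 and 8 are encodable in the instruction; the
   power-of-two test below rejects everything else. */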
40260 if (scale <= 0
40261 || scale > 8
40262 || (scale & (scale - 1)) != 0)
40263 return NULL_TREE;
40265 si = TYPE_MODE (index_type) == SImode;
40266 switch (TYPE_MODE (mem_vectype))
40268 case V2DFmode:
40269 if (TARGET_AVX512VL)
40270 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
40271 else
40272 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
40273 break;
40274 case V4DFmode:
40275 if (TARGET_AVX512VL)
40276 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
40277 else
40278 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
40279 break;
40280 case V2DImode:
40281 if (TARGET_AVX512VL)
40282 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
40283 else
40284 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
40285 break;
40286 case V4DImode:
40287 if (TARGET_AVX512VL)
40288 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
40289 else
40290 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
40291 break;
40292 case V4SFmode:
40293 if (TARGET_AVX512VL)
40294 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
40295 else
40296 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
40297 break;
40298 case V8SFmode:
40299 if (TARGET_AVX512VL)
40300 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
40301 else
40302 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
40303 break;
40304 case V4SImode:
40305 if (TARGET_AVX512VL)
40306 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
40307 else
40308 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
40309 break;
40310 case V8SImode:
40311 if (TARGET_AVX512VL)
40312 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
40313 else
40314 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
40315 break;
40316 case V8DFmode:
40317 if (TARGET_AVX512F)
40318 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
40319 else
40320 return NULL_TREE;
40321 break;
40322 case V8DImode:
40323 if (TARGET_AVX512F)
40324 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
40325 else
40326 return NULL_TREE;
40327 break;
40328 case V16SFmode:
40329 if (TARGET_AVX512F)
40330 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
40331 else
40332 return NULL_TREE;
40333 break;
40334 case V16SImode:
40335 if (TARGET_AVX512F)
40336 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
40337 else
40338 return NULL_TREE;
40339 break;
40340 default:
40341 return NULL_TREE;
40344 return ix86_get_builtin (code);
40347 /* Returns a decl of a function that implements scatter store with
40348 register type VECTYPE and index type INDEX_TYPE and SCALE.
40349 Return NULL_TREE if it is not available. */
40351 static tree
40352 ix86_vectorize_builtin_scatter (const_tree vectype,
40353 const_tree index_type, int scale)
40355 bool si;
40356 enum ix86_builtins code;
40358 if (!TARGET_AVX512F)
40359 return NULL_TREE;
40361 if ((TREE_CODE (index_type) != INTEGER_TYPE
40362 && !POINTER_TYPE_P (index_type))
40363 || (TYPE_MODE (index_type) != SImode
40364 && TYPE_MODE (index_type) != DImode))
40365 return NULL_TREE;
40367 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40368 return NULL_TREE;
40370 /* v*scatter* insn sign extends index to pointer mode. */
40371 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40372 && TYPE_UNSIGNED (index_type))
40373 return NULL_TREE;
40375 /* Scale can be 1, 2, 4 or 8. */
40376 if (scale <= 0
40377 || scale > 8
40378 || (scale & (scale - 1)) != 0)
40379 return NULL_TREE;
40381 si = TYPE_MODE (index_type) == SImode;
40382 switch (TYPE_MODE (vectype))
40384 case V8DFmode:
40385 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
40386 break;
40387 case V8DImode:
40388 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
40389 break;
40390 case V16SFmode:
40391 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
40392 break;
40393 case V16SImode:
40394 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
40395 break;
40396 default:
40397 return NULL_TREE;
40400 return ix86_builtins[code];
40403 /* Return true if it is safe to use the rsqrt optabs to optimize
40404 1.0/sqrt. */
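/* All of the conditions tested below are normally satisfied by compiling
   with -ffast-math (which implies -ffinite-math-only, -fno-trapping-math
   and -funsafe-math-optimizations) together with SSE math, which is the
   typical way the rsqrt path gets enabled. */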
40406 static bool
40407 use_rsqrt_p ()
40409 return (TARGET_SSE_MATH
40410 && flag_finite_math_only
40411 && !flag_trapping_math
40412 && flag_unsafe_math_optimizations);
40415 /* Returns a code for a target-specific builtin that implements
40416 reciprocal of the function, or NULL_TREE if not available. */
40418 static tree
40419 ix86_builtin_reciprocal (tree fndecl)
40421 switch (DECL_FUNCTION_CODE (fndecl))
40423 /* Vectorized version of sqrt to rsqrt conversion. */
40424 case IX86_BUILTIN_SQRTPS_NR:
40425 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
40427 case IX86_BUILTIN_SQRTPS_NR256:
40428 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
40430 default:
40431 return NULL_TREE;
40435 /* Helper for avx_vpermilps256_operand et al. This is also used by
40436 the expansion functions to turn the parallel back into a mask.
40437 The return value is 0 for no match and the imm8+1 for a match. */
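/* Sketch of the encoding for the simplest (V4SF) case: a parallel selecting
   elements {1, 0, 3, 2} packs two bits per element, giving
   mask = 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0xb1, and the function
   returns 0xb1 + 1. */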
40440 avx_vpermilp_parallel (rtx par, machine_mode mode)
40442 unsigned i, nelt = GET_MODE_NUNITS (mode);
40443 unsigned mask = 0;
40444 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
40446 if (XVECLEN (par, 0) != (int) nelt)
40447 return 0;
40449 /* Validate that all of the elements are constants, and not totally
40450 out of range. Copy the data into an integral array to make the
40451 subsequent checks easier. */
40452 for (i = 0; i < nelt; ++i)
40454 rtx er = XVECEXP (par, 0, i);
40455 unsigned HOST_WIDE_INT ei;
40457 if (!CONST_INT_P (er))
40458 return 0;
40459 ei = INTVAL (er);
40460 if (ei >= nelt)
40461 return 0;
40462 ipar[i] = ei;
40465 switch (mode)
40467 case V8DFmode:
40468 /* In the 512-bit DFmode case, we can only move elements within
40469 a 128-bit lane. First fill the second part of the mask,
40470 then fallthru. */
40471 for (i = 4; i < 6; ++i)
40473 if (ipar[i] < 4 || ipar[i] >= 6)
40474 return 0;
40475 mask |= (ipar[i] - 4) << i;
40477 for (i = 6; i < 8; ++i)
40479 if (ipar[i] < 6)
40480 return 0;
40481 mask |= (ipar[i] - 6) << i;
40483 /* FALLTHRU */
40485 case V4DFmode:
40486 /* In the 256-bit DFmode case, we can only move elements within
40487 a 128-bit lane. */
40488 for (i = 0; i < 2; ++i)
40490 if (ipar[i] >= 2)
40491 return 0;
40492 mask |= ipar[i] << i;
40494 for (i = 2; i < 4; ++i)
40496 if (ipar[i] < 2)
40497 return 0;
40498 mask |= (ipar[i] - 2) << i;
40500 break;
40502 case V16SFmode:
40503 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
40504 must mirror the permutation in the lower 256 bits. */
40505 for (i = 0; i < 8; ++i)
40506 if (ipar[i] + 8 != ipar[i + 8])
40507 return 0;
40508 /* FALLTHRU */
40510 case V8SFmode:
40511 /* In the 256-bit SFmode case, we have full freedom of
40512 movement within the low 128-bit lane, but the high 128-bit
40513 lane must mirror the exact same pattern. */
40514 for (i = 0; i < 4; ++i)
40515 if (ipar[i] + 4 != ipar[i + 4])
40516 return 0;
40517 nelt = 4;
40518 /* FALLTHRU */
40520 case V2DFmode:
40521 case V4SFmode:
40522 /* In the 128-bit case, we have full freedom in the placement of
40523 the elements from the source operand. */
40524 for (i = 0; i < nelt; ++i)
40525 mask |= ipar[i] << (i * (nelt / 2));
40526 break;
40528 default:
40529 gcc_unreachable ();
40532 /* Make sure success has a non-zero value by adding one. */
40533 return mask + 1;
40536 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
40537 the expansion functions to turn the parallel back into a mask.
40538 The return value is 0 for no match and the imm8+1 for a match. */
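/* Sketch for the V8SF case: a parallel of {4, 5, 6, 7, 0, 1, 2, 3} (the two
   128-bit lanes of the first operand swapped) encodes lane 1 in the low
   nibble and lane 0 in the high nibble, i.e. mask = 0x01, and the function
   returns 0x01 + 1. */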
40541 avx_vperm2f128_parallel (rtx par, machine_mode mode)
40543 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
40544 unsigned mask = 0;
40545 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
40547 if (XVECLEN (par, 0) != (int) nelt)
40548 return 0;
40550 /* Validate that all of the elements are constants, and not totally
40551 out of range. Copy the data into an integral array to make the
40552 subsequent checks easier. */
40553 for (i = 0; i < nelt; ++i)
40555 rtx er = XVECEXP (par, 0, i);
40556 unsigned HOST_WIDE_INT ei;
40558 if (!CONST_INT_P (er))
40559 return 0;
40560 ei = INTVAL (er);
40561 if (ei >= 2 * nelt)
40562 return 0;
40563 ipar[i] = ei;
40566 /* Validate that each half of the permute consists of consecutive elements. */
40567 for (i = 0; i < nelt2 - 1; ++i)
40568 if (ipar[i] + 1 != ipar[i + 1])
40569 return 0;
40570 for (i = nelt2; i < nelt - 1; ++i)
40571 if (ipar[i] + 1 != ipar[i + 1])
40572 return 0;
40574 /* Reconstruct the mask. */
40575 for (i = 0; i < 2; ++i)
40577 unsigned e = ipar[i * nelt2];
40578 if (e % nelt2)
40579 return 0;
40580 e /= nelt2;
40581 mask |= e << (i * 4);
40584 /* Make sure success has a non-zero value by adding one. */
40585 return mask + 1;
40588 /* Return a register priority for hard reg REGNO. */
40589 static int
40590 ix86_register_priority (int hard_regno)
40592 /* ebp and r13 as the base always want a displacement, and r12 as the
40593 base always wants an index. So discourage their use in an
40594 address. */
40595 if (hard_regno == R12_REG || hard_regno == R13_REG)
40596 return 0;
40597 if (hard_regno == BP_REG)
40598 return 1;
40599 /* New x86-64 int registers result in bigger code size. Discourage
40600 them. */
40601 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
40602 return 2;
40603 /* New x86-64 SSE registers result in bigger code size. Discourage
40604 them. */
40605 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
40606 return 2;
40607 /* Usage of AX register results in smaller code. Prefer it. */
40608 if (hard_regno == AX_REG)
40609 return 4;
40610 return 3;
40613 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
40615 Put float CONST_DOUBLE in the constant pool instead of fp regs.
40616 QImode must go into class Q_REGS.
40617 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
40618 movdf to do mem-to-mem moves through integer regs. */
40620 static reg_class_t
40621 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
40623 machine_mode mode = GET_MODE (x);
40625 /* We're only allowed to return a subclass of CLASS. Many of the
40626 following checks fail for NO_REGS, so eliminate that early. */
40627 if (regclass == NO_REGS)
40628 return NO_REGS;
40630 /* All classes can load zeros. */
40631 if (x == CONST0_RTX (mode))
40632 return regclass;
40634 /* Force constants into memory if we are loading a (nonzero) constant into
40635 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
40636 instructions to load from a constant. */
40637 if (CONSTANT_P (x)
40638 && (MAYBE_MMX_CLASS_P (regclass)
40639 || MAYBE_SSE_CLASS_P (regclass)
40640 || MAYBE_MASK_CLASS_P (regclass)))
40641 return NO_REGS;
40643 /* Floating-point constants need more complex checks. */
40644 if (CONST_DOUBLE_P (x))
40646 /* General regs can load everything. */
40647 if (INTEGER_CLASS_P (regclass))
40648 return regclass;
40650 /* Floats can load 0 and 1 plus some others. Note that we eliminated
40651 zero above. We only want to wind up preferring 80387 registers if
40652 we plan on doing computation with them. */
40653 if (IS_STACK_MODE (mode)
40654 && standard_80387_constant_p (x) > 0)
40656 /* Limit class to FP regs. */
40657 if (FLOAT_CLASS_P (regclass))
40658 return FLOAT_REGS;
40659 else if (regclass == FP_TOP_SSE_REGS)
40660 return FP_TOP_REG;
40661 else if (regclass == FP_SECOND_SSE_REGS)
40662 return FP_SECOND_REG;
40665 return NO_REGS;
40668 /* Prefer SSE regs only, if we can use them for math. */
40669 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40670 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
40672 /* Generally when we see PLUS here, it's the function invariant
40673 (plus soft-fp const_int), which can only be computed into general
40674 regs. */
40675 if (GET_CODE (x) == PLUS)
40676 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
40678 /* QImode constants are easy to load, but non-constant QImode data
40679 must go into Q_REGS. */
40680 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
40682 if (Q_CLASS_P (regclass))
40683 return regclass;
40684 else if (reg_class_subset_p (Q_REGS, regclass))
40685 return Q_REGS;
40686 else
40687 return NO_REGS;
40690 return regclass;
40693 /* Discourage putting floating-point values in SSE registers unless
40694 SSE math is being used, and likewise for the 387 registers. */
40695 static reg_class_t
40696 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
40698 machine_mode mode = GET_MODE (x);
40700 /* Restrict the output reload class to the register bank that we are doing
40701 math on. If we would like not to return a subset of CLASS, reject this
40702 alternative: if reload cannot do this, it will still use its choice. */
40703 mode = GET_MODE (x);
40704 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40705 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
40707 if (IS_STACK_MODE (mode))
40709 if (regclass == FP_TOP_SSE_REGS)
40710 return FP_TOP_REG;
40711 else if (regclass == FP_SECOND_SSE_REGS)
40712 return FP_SECOND_REG;
40713 else
40714 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
40717 return regclass;
40720 static reg_class_t
40721 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
40722 machine_mode mode, secondary_reload_info *sri)
40724 /* Double-word spills from general registers to non-offsettable memory
40725 references (zero-extended addresses) require special handling. */
40726 if (TARGET_64BIT
40727 && MEM_P (x)
40728 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
40729 && INTEGER_CLASS_P (rclass)
40730 && !offsettable_memref_p (x))
40732 sri->icode = (in_p
40733 ? CODE_FOR_reload_noff_load
40734 : CODE_FOR_reload_noff_store);
40735 /* Add the cost of moving address to a temporary. */
40736 sri->extra_cost = 1;
40738 return NO_REGS;
40741 /* QImode spills from non-QI registers require
40742 an intermediate register on 32-bit targets. */
40743 if (mode == QImode
40744 && ((!TARGET_64BIT && !in_p
40745 && INTEGER_CLASS_P (rclass)
40746 && MAYBE_NON_Q_CLASS_P (rclass))
40747 || (!TARGET_AVX512DQ
40748 && MAYBE_MASK_CLASS_P (rclass))))
40750 int regno = true_regnum (x);
40752 /* Return Q_REGS if the operand is in memory. */
40753 if (regno == -1)
40754 return Q_REGS;
40756 return NO_REGS;
40759 /* This condition handles the corner case where an expression involving
40760 pointers gets vectorized. We're trying to use the address of a
40761 stack slot as a vector initializer.
40763 (set (reg:V2DI 74 [ vect_cst_.2 ])
40764 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
40766 Eventually frame gets turned into sp+offset like this:
40768 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40769 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40770 (const_int 392 [0x188]))))
40772 That later gets turned into:
40774 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40775 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40776 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
40778 We'll have the following reload recorded:
40780 Reload 0: reload_in (DI) =
40781 (plus:DI (reg/f:DI 7 sp)
40782 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
40783 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40784 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
40785 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
40786 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40787 reload_reg_rtx: (reg:V2DI 22 xmm1)
40789 Which isn't going to work since SSE instructions can't handle scalar
40790 additions. Returning GENERAL_REGS forces the addition into an integer
40791 register, and reload can handle subsequent reloads without problems. */
40793 if (in_p && GET_CODE (x) == PLUS
40794 && SSE_CLASS_P (rclass)
40795 && SCALAR_INT_MODE_P (mode))
40796 return GENERAL_REGS;
40798 return NO_REGS;
40801 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
40803 static bool
40804 ix86_class_likely_spilled_p (reg_class_t rclass)
40806 switch (rclass)
40808 case AREG:
40809 case DREG:
40810 case CREG:
40811 case BREG:
40812 case AD_REGS:
40813 case SIREG:
40814 case DIREG:
40815 case SSE_FIRST_REG:
40816 case FP_TOP_REG:
40817 case FP_SECOND_REG:
40818 case BND_REGS:
40819 return true;
40821 default:
40822 break;
40825 return false;
40828 /* If we are copying between registers from different register sets
40829 (e.g. FP and integer), we may need a memory location.
40831 The function can't work reliably when one of the CLASSES is a class
40832 containing registers from multiple sets. We avoid this by never combining
40833 different sets in a single alternative in the machine description.
40834 Ensure that this constraint holds to avoid unexpected surprises.
40836 When STRICT is false, we are being called from REGISTER_MOVE_COST,
40837 so do not enforce these sanity checks.
40839 To optimize register_move_cost performance, define an inline variant. */
40841 static inline bool
40842 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40843 machine_mode mode, int strict)
40845 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
40846 return false;
40848 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
40849 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
40850 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
40851 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
40852 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
40853 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
40854 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
40855 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
40857 gcc_assert (!strict || lra_in_progress);
40858 return true;
40861 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
40862 return true;
40864 /* Between mask and general, we have moves no larger than word size. */
40865 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
40866 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
40867 return true;
40869 /* ??? This is a lie. We do have moves between mmx/general, and between
40870 mmx/sse2. But by saying we need secondary memory we discourage the
40871 register allocator from using the mmx registers unless needed. */
40872 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
40873 return true;
40875 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40877 /* SSE1 doesn't have any direct moves from other classes. */
40878 if (!TARGET_SSE2)
40879 return true;
40881 /* If the target says that inter-unit moves are more expensive
40882 than moving through memory, then don't generate them. */
40883 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
40884 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
40885 return true;
40887 /* Between SSE and general, we have moves no larger than word size. */
40888 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40889 return true;
40892 return false;
40895 bool
40896 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40897 machine_mode mode, int strict)
40899 return inline_secondary_memory_needed (class1, class2, mode, strict);
40902 /* Implement the TARGET_CLASS_MAX_NREGS hook.
40904 On the 80386, this is the size of MODE in words,
40905 except in the FP regs, where a single reg is always enough. */
40907 static unsigned char
40908 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
40910 if (MAYBE_INTEGER_CLASS_P (rclass))
40912 if (mode == XFmode)
40913 return (TARGET_64BIT ? 2 : 3);
40914 else if (mode == XCmode)
40915 return (TARGET_64BIT ? 4 : 6);
40916 else
40917 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40919 else
40921 if (COMPLEX_MODE_P (mode))
40922 return 2;
40923 else
40924 return 1;
40928 /* Return true if the registers in CLASS cannot represent the change from
40929 modes FROM to TO. */
40931 bool
40932 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
40933 enum reg_class regclass)
40935 if (from == to)
40936 return false;
40938 /* x87 registers can't do subreg at all, as all values are reformatted
40939 to extended precision. */
40940 if (MAYBE_FLOAT_CLASS_P (regclass))
40941 return true;
40943 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
40945 /* Vector registers do not support QI or HImode loads. If we don't
40946 disallow a change to these modes, reload will assume it's ok to
40947 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
40948 the vec_dupv4hi pattern. */
40949 if (GET_MODE_SIZE (from) < 4)
40950 return true;
40953 return false;
40956 /* Return the cost of moving data of mode M between a
40957 register and memory. A value of 2 is the default; this cost is
40958 relative to those in `REGISTER_MOVE_COST'.
40960 This function is used extensively by register_move_cost that is used to
40961 build tables at startup. Make it inline in this case.
40962 When IN is 2, return maximum of in and out move cost.
40964 If moving between registers and memory is more expensive than
40965 between two registers, you should define this macro to express the
40966 relative cost.
40968 Model also increased moving costs of QImode registers in non
40969 Q_REGS classes. */
40971 static inline int
40972 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
40973 int in)
40975 int cost;
40976 if (FLOAT_CLASS_P (regclass))
40978 int index;
40979 switch (mode)
40981 case SFmode:
40982 index = 0;
40983 break;
40984 case DFmode:
40985 index = 1;
40986 break;
40987 case XFmode:
40988 index = 2;
40989 break;
40990 default:
40991 return 100;
40993 if (in == 2)
40994 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
40995 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
40997 if (SSE_CLASS_P (regclass))
40999 int index;
41000 switch (GET_MODE_SIZE (mode))
41002 case 4:
41003 index = 0;
41004 break;
41005 case 8:
41006 index = 1;
41007 break;
41008 case 16:
41009 index = 2;
41010 break;
41011 default:
41012 return 100;
41014 if (in == 2)
41015 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
41016 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
41018 if (MMX_CLASS_P (regclass))
41020 int index;
41021 switch (GET_MODE_SIZE (mode))
41023 case 4:
41024 index = 0;
41025 break;
41026 case 8:
41027 index = 1;
41028 break;
41029 default:
41030 return 100;
41032 if (in == 2)
41033 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
41034 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
41036 switch (GET_MODE_SIZE (mode))
41038 case 1:
41039 if (Q_CLASS_P (regclass) || TARGET_64BIT)
41041 if (!in)
41042 return ix86_cost->int_store[0];
41043 if (TARGET_PARTIAL_REG_DEPENDENCY
41044 && optimize_function_for_speed_p (cfun))
41045 cost = ix86_cost->movzbl_load;
41046 else
41047 cost = ix86_cost->int_load[0];
41048 if (in == 2)
41049 return MAX (cost, ix86_cost->int_store[0]);
41050 return cost;
41052 else
41054 if (in == 2)
41055 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
41056 if (in)
41057 return ix86_cost->movzbl_load;
41058 else
41059 return ix86_cost->int_store[0] + 4;
41061 break;
41062 case 2:
41063 if (in == 2)
41064 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
41065 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
41066 default:
41067 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
41068 if (mode == TFmode)
41069 mode = XFmode;
41070 if (in == 2)
41071 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
41072 else if (in)
41073 cost = ix86_cost->int_load[2];
41074 else
41075 cost = ix86_cost->int_store[2];
41076 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
41080 static int
41081 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
41082 bool in)
41084 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
41088 /* Return the cost of moving data from a register in class CLASS1 to
41089 one in class CLASS2.
41091 It is not required that the cost always equal 2 when FROM is the same as TO;
41092 on some machines it is expensive to move between registers if they are not
41093 general registers. */
41095 static int
41096 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
41097 reg_class_t class2_i)
41099 enum reg_class class1 = (enum reg_class) class1_i;
41100 enum reg_class class2 = (enum reg_class) class2_i;
41102 /* In case we require secondary memory, compute cost of the store followed
41103 by load. In order to avoid bad register allocation choices, we need
41104 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
41106 if (inline_secondary_memory_needed (class1, class2, mode, 0))
41108 int cost = 1;
41110 cost += inline_memory_move_cost (mode, class1, 2);
41111 cost += inline_memory_move_cost (mode, class2, 2);
41113 /* In case of copying from a general purpose register we may emit multiple
41114 stores followed by a single load, causing a memory size mismatch stall.
41115 Count this as an arbitrarily high cost of 20. */
41116 if (targetm.class_max_nregs (class1, mode)
41117 > targetm.class_max_nregs (class2, mode))
41118 cost += 20;
41120 /* In the case of FP/MMX moves, the registers actually overlap, and we
41121 have to switch modes in order to treat them differently. */
41122 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
41123 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
41124 cost += 20;
41126 return cost;
41129 /* Moves between SSE/MMX and integer unit are expensive. */
41130 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
41131 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
41133 /* ??? By keeping returned value relatively high, we limit the number
41134 of moves between integer and MMX/SSE registers for all targets.
41135 Additionally, high value prevents problem with x86_modes_tieable_p(),
41136 where integer modes in MMX/SSE registers are not tieable
41137 because of missing QImode and HImode moves to, from or between
41138 MMX/SSE registers. */
41139 return MAX (8, ix86_cost->mmxsse_to_integer);
41141 if (MAYBE_FLOAT_CLASS_P (class1))
41142 return ix86_cost->fp_move;
41143 if (MAYBE_SSE_CLASS_P (class1))
41144 return ix86_cost->sse_move;
41145 if (MAYBE_MMX_CLASS_P (class1))
41146 return ix86_cost->mmx_move;
41147 return 2;
41150 /* Return TRUE if hard register REGNO can hold a value of machine-mode
41151 MODE. */
41153 bool
41154 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
41156 /* Flags and only flags can only hold CCmode values. */
41157 if (CC_REGNO_P (regno))
41158 return GET_MODE_CLASS (mode) == MODE_CC;
41159 if (GET_MODE_CLASS (mode) == MODE_CC
41160 || GET_MODE_CLASS (mode) == MODE_RANDOM
41161 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
41162 return false;
41163 if (STACK_REGNO_P (regno))
41164 return VALID_FP_MODE_P (mode);
41165 if (MASK_REGNO_P (regno))
41166 return (VALID_MASK_REG_MODE (mode)
41167 || (TARGET_AVX512BW
41168 && VALID_MASK_AVX512BW_MODE (mode)));
41169 if (BND_REGNO_P (regno))
41170 return VALID_BND_REG_MODE (mode);
41171 if (SSE_REGNO_P (regno))
41173 /* We implement the move patterns for all vector modes into and
41174 out of SSE registers, even when no operation instructions
41175 are available. */
41177 /* For AVX-512 we allow, regardless of regno:
41178 - XImode
41179 - any 512-bit wide vector mode
41180 - any scalar mode. */
41181 if (TARGET_AVX512F
41182 && (mode == XImode
41183 || VALID_AVX512F_REG_MODE (mode)
41184 || VALID_AVX512F_SCALAR_MODE (mode)))
41185 return true;
41187 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
41188 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41189 && MOD4_SSE_REGNO_P (regno)
41190 && mode == V64SFmode)
41191 return true;
41193 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
41194 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41195 && MOD4_SSE_REGNO_P (regno)
41196 && mode == V64SImode)
41197 return true;
41199 /* TODO check for QI/HI scalars. */
41200 /* AVX512VL allows SSE registers 16+ for 128/256-bit modes. */
41201 if (TARGET_AVX512VL
41202 && (mode == OImode
41203 || mode == TImode
41204 || VALID_AVX256_REG_MODE (mode)
41205 || VALID_AVX512VL_128_REG_MODE (mode)))
41206 return true;
41208 /* xmm16-xmm31 are only available for AVX-512. */
41209 if (EXT_REX_SSE_REGNO_P (regno))
41210 return false;
41212 /* OImode and AVX modes are available only when AVX is enabled. */
41213 return ((TARGET_AVX
41214 && VALID_AVX256_REG_OR_OI_MODE (mode))
41215 || VALID_SSE_REG_MODE (mode)
41216 || VALID_SSE2_REG_MODE (mode)
41217 || VALID_MMX_REG_MODE (mode)
41218 || VALID_MMX_REG_MODE_3DNOW (mode));
41220 if (MMX_REGNO_P (regno))
41222 /* We implement the move patterns for 3DNOW modes even in MMX mode,
41223 so if the register is available at all, then we can move data of
41224 the given mode into or out of it. */
41225 return (VALID_MMX_REG_MODE (mode)
41226 || VALID_MMX_REG_MODE_3DNOW (mode));
41229 if (mode == QImode)
41231 /* Take care with QImode values: they can live in non-QI regs,
41232 but then they do cause partial register stalls. */
41233 if (ANY_QI_REGNO_P (regno))
41234 return true;
41235 if (!TARGET_PARTIAL_REG_STALL)
41236 return true;
41237 /* LRA checks if the hard register is OK for the given mode.
41238 QImode values can live in non-QI regs, so we allow all
41239 registers here. */
41240 if (lra_in_progress)
41241 return true;
41242 return !can_create_pseudo_p ();
41244 /* We handle both integer and floats in the general purpose registers. */
41245 else if (VALID_INT_MODE_P (mode))
41246 return true;
41247 else if (VALID_FP_MODE_P (mode))
41248 return true;
41249 else if (VALID_DFP_MODE_P (mode))
41250 return true;
41251 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
41252 on to use that value in smaller contexts, this can easily force a
41253 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
41254 supporting DImode, allow it. */
41255 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
41256 return true;
41258 return false;
41261 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
41262 tieable integer mode. */
41264 static bool
41265 ix86_tieable_integer_mode_p (machine_mode mode)
41267 switch (mode)
41269 case HImode:
41270 case SImode:
41271 return true;
41273 case QImode:
41274 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
41276 case DImode:
41277 return TARGET_64BIT;
41279 default:
41280 return false;
41284 /* Return true if MODE1 is accessible in a register that can hold MODE2
41285 without copying. That is, all register classes that can hold MODE2
41286 can also hold MODE1. */
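/* For instance, on 64-bit targets QImode, HImode, SImode and DImode are
   mutually tieable, XFmode ties with SFmode and DFmode, and two 16-byte
   vector modes tie with each other whenever SSE registers accept both.  */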
41288 bool
41289 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
41291 if (mode1 == mode2)
41292 return true;
41294 if (ix86_tieable_integer_mode_p (mode1)
41295 && ix86_tieable_integer_mode_p (mode2))
41296 return true;
41298 /* MODE2 being XFmode implies fp stack or general regs, which means we
41299 can tie any smaller floating point modes to it. Note that we do not
41300 tie this with TFmode. */
41301 if (mode2 == XFmode)
41302 return mode1 == SFmode || mode1 == DFmode;
41304 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
41305 that we can tie it with SFmode. */
41306 if (mode2 == DFmode)
41307 return mode1 == SFmode;
41309 /* If MODE2 is only appropriate for an SSE register, then tie with
41310 any other mode acceptable to SSE registers. */
41311 if (GET_MODE_SIZE (mode2) == 32
41312 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41313 return (GET_MODE_SIZE (mode1) == 32
41314 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41315 if (GET_MODE_SIZE (mode2) == 16
41316 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41317 return (GET_MODE_SIZE (mode1) == 16
41318 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41320 /* If MODE2 is appropriate for an MMX register, then tie
41321 with any other mode acceptable to MMX registers. */
41322 if (GET_MODE_SIZE (mode2) == 8
41323 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
41324 return (GET_MODE_SIZE (mode1) == 8
41325 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
41327 return false;
41330 /* Return the cost of moving between two registers of mode MODE. */
41332 static int
41333 ix86_set_reg_reg_cost (machine_mode mode)
41335 unsigned int units = UNITS_PER_WORD;
41337 switch (GET_MODE_CLASS (mode))
41339 default:
41340 break;
41342 case MODE_CC:
41343 units = GET_MODE_SIZE (CCmode);
41344 break;
41346 case MODE_FLOAT:
41347 if ((TARGET_SSE && mode == TFmode)
41348 || (TARGET_80387 && mode == XFmode)
41349 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
41350 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
41351 units = GET_MODE_SIZE (mode);
41352 break;
41354 case MODE_COMPLEX_FLOAT:
41355 if ((TARGET_SSE && mode == TCmode)
41356 || (TARGET_80387 && mode == XCmode)
41357 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
41358 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
41359 units = GET_MODE_SIZE (mode);
41360 break;
41362 case MODE_VECTOR_INT:
41363 case MODE_VECTOR_FLOAT:
41364 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
41365 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
41366 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
41367 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
41368 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
41369 units = GET_MODE_SIZE (mode);
41372 /* Return the cost of moving between two registers of mode MODE,
41373 assuming that the move will be in pieces of at most UNITS bytes. */
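/* As a worked example on a 64-bit target: a 32-byte vector mode with AVX
   enabled sets UNITS to 32, giving COSTS_N_INSNS (1); without AVX the same
   mode is moved in UNITS_PER_WORD pieces, giving COSTS_N_INSNS (4).  */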
41374 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
41377 /* Compute a (partial) cost for rtx X. Return true if the complete
41378 cost has been computed, and false if subexpressions should be
41379 scanned. In either case, *TOTAL contains the cost result. */
41381 static bool
41382 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
41383 int *total, bool speed)
41385 rtx mask;
41386 enum rtx_code code = GET_CODE (x);
41387 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
41388 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
41389 int src_cost;
41391 switch (code)
41393 case SET:
41394 if (register_operand (SET_DEST (x), VOIDmode)
41395 && reg_or_0_operand (SET_SRC (x), VOIDmode))
41397 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
41398 return true;
41401 if (register_operand (SET_SRC (x), VOIDmode))
41402 /* Avoid potentially incorrect high cost from rtx_costs
41403 for non-tieable SUBREGs. */
41404 src_cost = 0;
41405 else
41407 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
41409 if (CONSTANT_P (SET_SRC (x)))
41410 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
41411 a small value, possibly zero for cheap constants. */
41412 src_cost += COSTS_N_INSNS (1);
41415 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
41416 return true;
41418 case CONST_INT:
41419 case CONST:
41420 case LABEL_REF:
41421 case SYMBOL_REF:
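      /* The tiers below: on 64-bit targets a constant that is not usable as a
	 sign-extended 32-bit immediate costs 3, one that is usable there but
	 not as a zero-extended 32-bit immediate (e.g. a negative value) costs
	 2, most symbolic references under PIC cost 1, and the rest are free.  */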
41422 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
41423 *total = 3;
41424 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
41425 *total = 2;
41426 else if (flag_pic && SYMBOLIC_CONST (x)
41427 && !(TARGET_64BIT
41428 && (GET_CODE (x) == LABEL_REF
41429 || (GET_CODE (x) == SYMBOL_REF
41430 && SYMBOL_REF_LOCAL_P (x))))
41431 /* Use 0 cost for CONST to improve its propagation. */
41432 && (TARGET_64BIT || GET_CODE (x) != CONST))
41433 *total = 1;
41434 else
41435 *total = 0;
41436 return true;
41438 case CONST_DOUBLE:
41439 if (IS_STACK_MODE (mode))
41440 switch (standard_80387_constant_p (x))
41442 case -1:
41443 case 0:
41444 break;
41445 case 1: /* 0.0 */
41446 *total = 1;
41447 return true;
41448 default: /* Other constants */
41449 *total = 2;
41450 return true;
41452 /* FALLTHRU */
41454 case CONST_VECTOR:
41455 switch (standard_sse_constant_p (x, mode))
41457 case 0:
41458 break;
41459 case 1: /* 0: xor eliminates false dependency */
41460 *total = 0;
41461 return true;
41462 default: /* -1: cmp contains false dependency */
41463 *total = 1;
41464 return true;
41466 /* FALLTHRU */
41468 case CONST_WIDE_INT:
41469 /* Fall back to (MEM (SYMBOL_REF)), since that's where
41470 it'll probably end up. Add a penalty for size. */
41471 *total = (COSTS_N_INSNS (1)
41472 + (!TARGET_64BIT && flag_pic)
41473 + (GET_MODE_SIZE (mode) <= 4
41474 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
41475 return true;
41477 case ZERO_EXTEND:
41478 /* Zero extension is often completely free on x86_64, so make
41479 it as cheap as possible. */
41480 if (TARGET_64BIT && mode == DImode
41481 && GET_MODE (XEXP (x, 0)) == SImode)
41482 *total = 1;
41483 else if (TARGET_ZERO_EXTEND_WITH_AND)
41484 *total = cost->add;
41485 else
41486 *total = cost->movzx;
41487 return false;
41489 case SIGN_EXTEND:
41490 *total = cost->movsx;
41491 return false;
41493 case ASHIFT:
41494 if (SCALAR_INT_MODE_P (mode)
41495 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
41496 && CONST_INT_P (XEXP (x, 1)))
41498 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41499 if (value == 1)
41501 *total = cost->add;
41502 return false;
41504 if ((value == 2 || value == 3)
41505 && cost->lea <= cost->shift_const)
41507 *total = cost->lea;
41508 return false;
41511 /* FALLTHRU */
41513 case ROTATE:
41514 case ASHIFTRT:
41515 case LSHIFTRT:
41516 case ROTATERT:
41517 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41519 /* ??? Should be SSE vector operation cost. */
41520 /* At least for published AMD latencies, this really is the same
41521 as the latency for a simple fpu operation like fabs. */
41522 /* V*QImode is emulated with 1-11 insns. */
41523 if (mode == V16QImode || mode == V32QImode)
41525 int count = 11;
41526 if (TARGET_XOP && mode == V16QImode)
41528 /* For XOP we use vpshab, which requires a broadcast of the
41529 value to the variable shift insn. For constants this
41530 means a V16QI const in mem; even when we can perform the
41531 shift with one insn, set the cost to prefer paddb. */
41532 if (CONSTANT_P (XEXP (x, 1)))
41534 *total = (cost->fabs
41535 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
41536 + (speed ? 2 : COSTS_N_BYTES (16)));
41537 return true;
41539 count = 3;
41541 else if (TARGET_SSSE3)
41542 count = 7;
41543 *total = cost->fabs * count;
41545 else
41546 *total = cost->fabs;
41548 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41550 if (CONST_INT_P (XEXP (x, 1)))
41552 if (INTVAL (XEXP (x, 1)) > 32)
41553 *total = cost->shift_const + COSTS_N_INSNS (2);
41554 else
41555 *total = cost->shift_const * 2;
41557 else
41559 if (GET_CODE (XEXP (x, 1)) == AND)
41560 *total = cost->shift_var * 2;
41561 else
41562 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
41565 else
41567 if (CONST_INT_P (XEXP (x, 1)))
41568 *total = cost->shift_const;
41569 else if (SUBREG_P (XEXP (x, 1))
41570 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
41572 /* Return the cost after shift-and truncation. */
41573 *total = cost->shift_var;
41574 return true;
41576 else
41577 *total = cost->shift_var;
41579 return false;
41581 case FMA:
41583 rtx sub;
41585 gcc_assert (FLOAT_MODE_P (mode));
41586 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
41588 /* ??? SSE scalar/vector cost should be used here. */
41589 /* ??? Bald assumption that fma has the same cost as fmul. */
41590 *total = cost->fmul;
41591 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
41593 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
41594 sub = XEXP (x, 0);
41595 if (GET_CODE (sub) == NEG)
41596 sub = XEXP (sub, 0);
41597 *total += rtx_cost (sub, mode, FMA, 0, speed);
41599 sub = XEXP (x, 2);
41600 if (GET_CODE (sub) == NEG)
41601 sub = XEXP (sub, 0);
41602 *total += rtx_cost (sub, mode, FMA, 2, speed);
41603 return true;
41606 case MULT:
41607 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41609 /* ??? SSE scalar cost should be used here. */
41610 *total = cost->fmul;
41611 return false;
41613 else if (X87_FLOAT_MODE_P (mode))
41615 *total = cost->fmul;
41616 return false;
41618 else if (FLOAT_MODE_P (mode))
41620 /* ??? SSE vector cost should be used here. */
41621 *total = cost->fmul;
41622 return false;
41624 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41626 /* V*QImode is emulated with 7-13 insns. */
41627 if (mode == V16QImode || mode == V32QImode)
41629 int extra = 11;
41630 if (TARGET_XOP && mode == V16QImode)
41631 extra = 5;
41632 else if (TARGET_SSSE3)
41633 extra = 6;
41634 *total = cost->fmul * 2 + cost->fabs * extra;
41636 /* V*DImode is emulated with 5-8 insns. */
41637 else if (mode == V2DImode || mode == V4DImode)
41639 if (TARGET_XOP && mode == V2DImode)
41640 *total = cost->fmul * 2 + cost->fabs * 3;
41641 else
41642 *total = cost->fmul * 3 + cost->fabs * 5;
41644 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
41645 insns, including two PMULUDQ. */
41646 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
41647 *total = cost->fmul * 2 + cost->fabs * 5;
41648 else
41649 *total = cost->fmul;
41650 return false;
41652 else
41654 rtx op0 = XEXP (x, 0);
41655 rtx op1 = XEXP (x, 1);
41656 int nbits;
41657 if (CONST_INT_P (XEXP (x, 1)))
41659 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41660 for (nbits = 0; value != 0; value &= value - 1)
41661 nbits++;
41663 else
41664 /* This is arbitrary. */
41665 nbits = 7;
41667 /* Compute costs correctly for widening multiplication. */
41668 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
41669 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
41670 == GET_MODE_SIZE (mode))
41672 int is_mulwiden = 0;
41673 machine_mode inner_mode = GET_MODE (op0);
41675 if (GET_CODE (op0) == GET_CODE (op1))
41676 is_mulwiden = 1, op1 = XEXP (op1, 0);
41677 else if (CONST_INT_P (op1))
41679 if (GET_CODE (op0) == SIGN_EXTEND)
41680 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
41681 == INTVAL (op1);
41682 else
41683 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
41686 if (is_mulwiden)
41687 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
41690 *total = (cost->mult_init[MODE_INDEX (mode)]
41691 + nbits * cost->mult_bit
41692 + rtx_cost (op0, mode, outer_code, opno, speed)
41693 + rtx_cost (op1, mode, outer_code, opno, speed));
41695 return true;
41698 case DIV:
41699 case UDIV:
41700 case MOD:
41701 case UMOD:
41702 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41703 /* ??? SSE cost should be used here. */
41704 *total = cost->fdiv;
41705 else if (X87_FLOAT_MODE_P (mode))
41706 *total = cost->fdiv;
41707 else if (FLOAT_MODE_P (mode))
41708 /* ??? SSE vector cost should be used here. */
41709 *total = cost->fdiv;
41710 else
41711 *total = cost->divide[MODE_INDEX (mode)];
41712 return false;
41714 case PLUS:
41715 if (GET_MODE_CLASS (mode) == MODE_INT
41716 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
41718 if (GET_CODE (XEXP (x, 0)) == PLUS
41719 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
41720 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
41721 && CONSTANT_P (XEXP (x, 1)))
41723 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
41724 if (val == 2 || val == 4 || val == 8)
41726 *total = cost->lea;
41727 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41728 outer_code, opno, speed);
41729 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
41730 outer_code, opno, speed);
41731 *total += rtx_cost (XEXP (x, 1), mode,
41732 outer_code, opno, speed);
41733 return true;
41736 else if (GET_CODE (XEXP (x, 0)) == MULT
41737 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
41739 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
41740 if (val == 2 || val == 4 || val == 8)
41742 *total = cost->lea;
41743 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41744 outer_code, opno, speed);
41745 *total += rtx_cost (XEXP (x, 1), mode,
41746 outer_code, opno, speed);
41747 return true;
41750 else if (GET_CODE (XEXP (x, 0)) == PLUS)
41752 /* Add with carry, ignore the cost of adding a carry flag. */
41753 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
41754 *total = cost->add;
41755 else
41757 *total = cost->lea;
41758 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41759 outer_code, opno, speed);
41762 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41763 outer_code, opno, speed);
41764 *total += rtx_cost (XEXP (x, 1), mode,
41765 outer_code, opno, speed);
41766 return true;
41769 /* FALLTHRU */
41771 case MINUS:
41772 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
41773 if (GET_MODE_CLASS (mode) == MODE_INT
41774 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
41775 && GET_CODE (XEXP (x, 0)) == MINUS
41776 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
41778 *total = cost->add;
41779 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41780 outer_code, opno, speed);
41781 *total += rtx_cost (XEXP (x, 1), mode,
41782 outer_code, opno, speed);
41783 return true;
41786 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41788 /* ??? SSE cost should be used here. */
41789 *total = cost->fadd;
41790 return false;
41792 else if (X87_FLOAT_MODE_P (mode))
41794 *total = cost->fadd;
41795 return false;
41797 else if (FLOAT_MODE_P (mode))
41799 /* ??? SSE vector cost should be used here. */
41800 *total = cost->fadd;
41801 return false;
41803 /* FALLTHRU */
41805 case AND:
41806 case IOR:
41807 case XOR:
41808 if (GET_MODE_CLASS (mode) == MODE_INT
41809 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41811 *total = (cost->add * 2
41812 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
41813 << (GET_MODE (XEXP (x, 0)) != DImode))
41814 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
41815 << (GET_MODE (XEXP (x, 1)) != DImode)));
41816 return true;
41818 /* FALLTHRU */
41820 case NEG:
41821 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41823 /* ??? SSE cost should be used here. */
41824 *total = cost->fchs;
41825 return false;
41827 else if (X87_FLOAT_MODE_P (mode))
41829 *total = cost->fchs;
41830 return false;
41832 else if (FLOAT_MODE_P (mode))
41834 /* ??? SSE vector cost should be used here. */
41835 *total = cost->fchs;
41836 return false;
41838 /* FALLTHRU */
41840 case NOT:
41841 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41843 /* ??? Should be SSE vector operation cost. */
41844 /* At least for published AMD latencies, this really is the same
41845 as the latency for a simple fpu operation like fabs. */
41846 *total = cost->fabs;
41848 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41849 *total = cost->add * 2;
41850 else
41851 *total = cost->add;
41852 return false;
41854 case COMPARE:
41855 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41856 && XEXP (XEXP (x, 0), 1) == const1_rtx
41857 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41858 && XEXP (x, 1) == const0_rtx)
41860 /* This kind of construct is implemented using test[bwl].
41861 Treat it as if we had an AND. */
41862 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41863 *total = (cost->add
41864 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41865 opno, speed)
41866 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41867 return true;
41870 /* The embedded comparison operand is completely free. */
41871 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
41872 && XEXP (x, 1) == const0_rtx)
41873 *total = 0;
41875 return false;
41877 case FLOAT_EXTEND:
41878 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41879 *total = 0;
41880 return false;
41882 case ABS:
41883 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41884 /* ??? SSE cost should be used here. */
41885 *total = cost->fabs;
41886 else if (X87_FLOAT_MODE_P (mode))
41887 *total = cost->fabs;
41888 else if (FLOAT_MODE_P (mode))
41889 /* ??? SSE vector cost should be used here. */
41890 *total = cost->fabs;
41891 return false;
41893 case SQRT:
41894 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41895 /* ??? SSE cost should be used here. */
41896 *total = cost->fsqrt;
41897 else if (X87_FLOAT_MODE_P (mode))
41898 *total = cost->fsqrt;
41899 else if (FLOAT_MODE_P (mode))
41900 /* ??? SSE vector cost should be used here. */
41901 *total = cost->fsqrt;
41902 return false;
41904 case UNSPEC:
41905 if (XINT (x, 1) == UNSPEC_TP)
41906 *total = 0;
41907 return false;
41909 case VEC_SELECT:
41910 case VEC_CONCAT:
41911 case VEC_DUPLICATE:
41912 /* ??? Assume all of these vector manipulation patterns are
41913 recognizable, in which case they all pretty much have the
41914 same cost. */
41915 *total = cost->fabs;
41916 return true;
41917 case VEC_MERGE:
41918 mask = XEXP (x, 2);
41919 /* This is a masked instruction; assume the same cost
41920 as the non-masked variant. */
41921 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
41922 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
41923 else
41924 *total = cost->fabs;
41925 return true;
41927 default:
41928 return false;
41932 #if TARGET_MACHO
41934 static int current_machopic_label_num;
41936 /* Given a symbol name and its associated stub, write out the
41937 definition of the stub. */
41939 void
41940 machopic_output_stub (FILE *file, const char *symb, const char *stub)
41942 unsigned int length;
41943 char *binder_name, *symbol_name, lazy_ptr_name[32];
41944 int label = ++current_machopic_label_num;
41946 /* For 64-bit we shouldn't get here. */
41947 gcc_assert (!TARGET_64BIT);
41949 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
41950 symb = targetm.strip_name_encoding (symb);
41952 length = strlen (stub);
41953 binder_name = XALLOCAVEC (char, length + 32);
41954 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
41956 length = strlen (symb);
41957 symbol_name = XALLOCAVEC (char, length + 32);
41958 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
41960 sprintf (lazy_ptr_name, "L%d$lz", label);
41962 if (MACHOPIC_ATT_STUB)
41963 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
41964 else if (MACHOPIC_PURE)
41965 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
41966 else
41967 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
41969 fprintf (file, "%s:\n", stub);
41970 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41972 if (MACHOPIC_ATT_STUB)
41974 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
41976 else if (MACHOPIC_PURE)
41978 /* PIC stub. */
41979 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41980 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
41981 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
41982 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
41983 label, lazy_ptr_name, label);
41984 fprintf (file, "\tjmp\t*%%ecx\n");
41986 else
41987 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
41989 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
41990 it needs no stub-binding-helper. */
41991 if (MACHOPIC_ATT_STUB)
41992 return;
41994 fprintf (file, "%s:\n", binder_name);
41996 if (MACHOPIC_PURE)
41998 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
41999 fprintf (file, "\tpushl\t%%ecx\n");
42001 else
42002 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
42004 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
42006 /* N.B. Keep the correspondence of these
42007 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
42008 old-pic/new-pic/non-pic stubs; altering this will break
42009 compatibility with existing dylibs. */
42010 if (MACHOPIC_PURE)
42012 /* 25-byte PIC stub using "CALL get_pc_thunk". */
42013 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
42015 else
42016 /* 16-byte -mdynamic-no-pic stub. */
42017 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
42019 fprintf (file, "%s:\n", lazy_ptr_name);
42020 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
42021 fprintf (file, ASM_LONG "%s\n", binder_name);
42023 #endif /* TARGET_MACHO */
42025 /* Order the registers for register allocator. */
42027 void
42028 x86_order_regs_for_local_alloc (void)
42030 int pos = 0;
42031 int i;
42033 /* First allocate the local general purpose registers. */
42034 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42035 if (GENERAL_REGNO_P (i) && call_used_regs[i])
42036 reg_alloc_order [pos++] = i;
42038 /* Global general purpose registers. */
42039 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42040 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
42041 reg_alloc_order [pos++] = i;
42043 /* x87 registers come first in case we are doing FP math
42044 using them. */
42045 if (!TARGET_SSE_MATH)
42046 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42047 reg_alloc_order [pos++] = i;
42049 /* SSE registers. */
42050 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
42051 reg_alloc_order [pos++] = i;
42052 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
42053 reg_alloc_order [pos++] = i;
42055 /* Extended REX SSE registers. */
42056 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
42057 reg_alloc_order [pos++] = i;
42059 /* Mask register. */
42060 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
42061 reg_alloc_order [pos++] = i;
42063 /* MPX bound registers. */
42064 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
42065 reg_alloc_order [pos++] = i;
42067 /* x87 registers. */
42068 if (TARGET_SSE_MATH)
42069 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42070 reg_alloc_order [pos++] = i;
42072 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
42073 reg_alloc_order [pos++] = i;
42075 /* Initialize the rest of the array, as we do not allocate some registers
42076 at all. */
42077 while (pos < FIRST_PSEUDO_REGISTER)
42078 reg_alloc_order [pos++] = 0;
42081 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
42082 in struct attribute_spec handler. */
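/* For reference, in user code the attribute is applied to a function type
   and takes a 0/1 argument, e.g.
     struct s ret_aggr (void) __attribute__ ((callee_pop_aggregate_return (1)));
   selecting whether the callee pops the hidden aggregate-return pointer.  */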
42083 static tree
42084 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
42085 tree args,
42086 int,
42087 bool *no_add_attrs)
42089 if (TREE_CODE (*node) != FUNCTION_TYPE
42090 && TREE_CODE (*node) != METHOD_TYPE
42091 && TREE_CODE (*node) != FIELD_DECL
42092 && TREE_CODE (*node) != TYPE_DECL)
42094 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42095 name);
42096 *no_add_attrs = true;
42097 return NULL_TREE;
42099 if (TARGET_64BIT)
42101 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
42102 name);
42103 *no_add_attrs = true;
42104 return NULL_TREE;
42106 if (is_attribute_p ("callee_pop_aggregate_return", name))
42108 tree cst;
42110 cst = TREE_VALUE (args);
42111 if (TREE_CODE (cst) != INTEGER_CST)
42113 warning (OPT_Wattributes,
42114 "%qE attribute requires an integer constant argument",
42115 name);
42116 *no_add_attrs = true;
42118 else if (compare_tree_int (cst, 0) != 0
42119 && compare_tree_int (cst, 1) != 0)
42121 warning (OPT_Wattributes,
42122 "argument to %qE attribute is neither zero, nor one",
42123 name);
42124 *no_add_attrs = true;
42127 return NULL_TREE;
42130 return NULL_TREE;
42133 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
42134 struct attribute_spec.handler. */
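/* For example, on a 64-bit SysV target a declaration such as
     void win64_fn (void) __attribute__ ((ms_abi));
   switches that function to the Microsoft calling convention; the handler
   below only rejects combining ms_abi and sysv_abi on the same type.  */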
42135 static tree
42136 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
42137 bool *no_add_attrs)
42139 if (TREE_CODE (*node) != FUNCTION_TYPE
42140 && TREE_CODE (*node) != METHOD_TYPE
42141 && TREE_CODE (*node) != FIELD_DECL
42142 && TREE_CODE (*node) != TYPE_DECL)
42144 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42145 name);
42146 *no_add_attrs = true;
42147 return NULL_TREE;
42150 /* Can combine regparm with all attributes but fastcall. */
42151 if (is_attribute_p ("ms_abi", name))
42153 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
42155 error ("ms_abi and sysv_abi attributes are not compatible");
42158 return NULL_TREE;
42160 else if (is_attribute_p ("sysv_abi", name))
42162 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
42164 error ("ms_abi and sysv_abi attributes are not compatible");
42167 return NULL_TREE;
42170 return NULL_TREE;
42173 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
42174 struct attribute_spec.handler. */
42175 static tree
42176 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
42177 bool *no_add_attrs)
42179 tree *type = NULL;
42180 if (DECL_P (*node))
42182 if (TREE_CODE (*node) == TYPE_DECL)
42183 type = &TREE_TYPE (*node);
42185 else
42186 type = node;
42188 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
42190 warning (OPT_Wattributes, "%qE attribute ignored",
42191 name);
42192 *no_add_attrs = true;
42195 else if ((is_attribute_p ("ms_struct", name)
42196 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
42197 || ((is_attribute_p ("gcc_struct", name)
42198 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
42200 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
42201 name);
42202 *no_add_attrs = true;
42205 return NULL_TREE;
42208 static tree
42209 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
42210 bool *no_add_attrs)
42212 if (TREE_CODE (*node) != FUNCTION_DECL)
42214 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42215 name);
42216 *no_add_attrs = true;
42218 return NULL_TREE;
42221 static tree
42222 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
42223 int, bool *)
42225 return NULL_TREE;
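/* Handle an "interrupt" attribute; check that the function type looks like
   an interrupt service routine.  A typical use, following the GCC manual, is
     void isr (struct interrupt_frame *frame) __attribute__ ((interrupt));
   with an optional unsigned word-sized error-code second argument for
   exception handlers.  */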
42228 static tree
42229 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
42231 /* DECL_RESULT and DECL_ARGUMENTS do not exist here yet,
42232 but the function type contains argument and return type data. */
42233 tree func_type = *node;
42234 tree return_type = TREE_TYPE (func_type);
42236 int nargs = 0;
42237 tree current_arg_type = TYPE_ARG_TYPES (func_type);
42238 while (current_arg_type
42239 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
42241 if (nargs == 0)
42243 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
42244 error ("interrupt service routine should have a pointer "
42245 "as the first argument");
42247 else if (nargs == 1)
42249 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
42250 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
42251 error ("interrupt service routine should have unsigned %s"
42252 "int as the second argument",
42253 TARGET_64BIT
42254 ? (TARGET_X32 ? "long long " : "long ")
42255 : "");
42257 nargs++;
42258 current_arg_type = TREE_CHAIN (current_arg_type);
42260 if (!nargs || nargs > 2)
42261 error ("interrupt service routine can only have a pointer argument "
42262 "and an optional integer argument");
42263 if (! VOID_TYPE_P (return_type))
42264 error ("interrupt service routine can't have non-void return value");
42266 return NULL_TREE;
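/* Return true if RECORD_TYPE should use the Microsoft bitfield layout rules:
   either -mms-bitfields is in effect and the type is not marked gcc_struct,
   or the type carries the ms_struct attribute.  */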
42269 static bool
42270 ix86_ms_bitfield_layout_p (const_tree record_type)
42272 return ((TARGET_MS_BITFIELD_LAYOUT
42273 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
42274 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
42277 /* Returns an expression indicating where the this parameter is
42278 located on entry to the FUNCTION. */
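/* For instance, under the 64-bit SysV ABI this is %rdi, or %rsi when the
   return value is an aggregate returned through a hidden pointer; the MS ABI
   uses %rcx and %rdx in the same way.  */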
42280 static rtx
42281 x86_this_parameter (tree function)
42283 tree type = TREE_TYPE (function);
42284 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
42285 int nregs;
42287 if (TARGET_64BIT)
42289 const int *parm_regs;
42291 if (ix86_function_type_abi (type) == MS_ABI)
42292 parm_regs = x86_64_ms_abi_int_parameter_registers;
42293 else
42294 parm_regs = x86_64_int_parameter_registers;
42295 return gen_rtx_REG (Pmode, parm_regs[aggr]);
42298 nregs = ix86_function_regparm (type, function);
42300 if (nregs > 0 && !stdarg_p (type))
42302 int regno;
42303 unsigned int ccvt = ix86_get_callcvt (type);
42305 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42306 regno = aggr ? DX_REG : CX_REG;
42307 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42309 regno = CX_REG;
42310 if (aggr)
42311 return gen_rtx_MEM (SImode,
42312 plus_constant (Pmode, stack_pointer_rtx, 4));
42314 else
42316 regno = AX_REG;
42317 if (aggr)
42319 regno = DX_REG;
42320 if (nregs == 1)
42321 return gen_rtx_MEM (SImode,
42322 plus_constant (Pmode,
42323 stack_pointer_rtx, 4));
42326 return gen_rtx_REG (SImode, regno);
42329 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
42330 aggr ? 8 : 4));
42333 /* Determine whether x86_output_mi_thunk can succeed. */
42335 static bool
42336 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
42337 const_tree function)
42339 /* 64-bit can handle anything. */
42340 if (TARGET_64BIT)
42341 return true;
42343 /* For 32-bit, everything's fine if we have one free register. */
42344 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
42345 return true;
42347 /* Need a free register for vcall_offset. */
42348 if (vcall_offset)
42349 return false;
42351 /* Need a free register for GOT references. */
42352 if (flag_pic && !targetm.binds_local_p (function))
42353 return false;
42355 /* Otherwise ok. */
42356 return true;
42359 /* Output the assembler code for a thunk function. THUNK_DECL is the
42360 declaration for the thunk function itself, FUNCTION is the decl for
42361 the target function. DELTA is an immediate constant offset to be
42362 added to THIS. If VCALL_OFFSET is nonzero, the word at
42363 *(*this + vcall_offset) should be added to THIS. */
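/* In the common 64-bit case with a small DELTA and no VCALL_OFFSET this
   boils down to a single add to the register holding THIS followed by a
   direct sibling jump to FUNCTION.  */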
42365 static void
42366 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
42367 HOST_WIDE_INT vcall_offset, tree function)
42369 rtx this_param = x86_this_parameter (function);
42370 rtx this_reg, tmp, fnaddr;
42371 unsigned int tmp_regno;
42372 rtx_insn *insn;
42374 if (TARGET_64BIT)
42375 tmp_regno = R10_REG;
42376 else
42378 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
42379 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42380 tmp_regno = AX_REG;
42381 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42382 tmp_regno = DX_REG;
42383 else
42384 tmp_regno = CX_REG;
42387 emit_note (NOTE_INSN_PROLOGUE_END);
42389 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
42390 pull it in now and let DELTA benefit. */
42391 if (REG_P (this_param))
42392 this_reg = this_param;
42393 else if (vcall_offset)
42395 /* Put the this parameter into %eax. */
42396 this_reg = gen_rtx_REG (Pmode, AX_REG);
42397 emit_move_insn (this_reg, this_param);
42399 else
42400 this_reg = NULL_RTX;
42402 /* Adjust the this parameter by a fixed constant. */
42403 if (delta)
42405 rtx delta_rtx = GEN_INT (delta);
42406 rtx delta_dst = this_reg ? this_reg : this_param;
42408 if (TARGET_64BIT)
42410 if (!x86_64_general_operand (delta_rtx, Pmode))
42412 tmp = gen_rtx_REG (Pmode, tmp_regno);
42413 emit_move_insn (tmp, delta_rtx);
42414 delta_rtx = tmp;
42418 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
42421 /* Adjust the this parameter by a value stored in the vtable. */
42422 if (vcall_offset)
42424 rtx vcall_addr, vcall_mem, this_mem;
42426 tmp = gen_rtx_REG (Pmode, tmp_regno);
42428 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
42429 if (Pmode != ptr_mode)
42430 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
42431 emit_move_insn (tmp, this_mem);
42433 /* Adjust the this parameter. */
42434 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
42435 if (TARGET_64BIT
42436 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
42438 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
42439 emit_move_insn (tmp2, GEN_INT (vcall_offset));
42440 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
42443 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
42444 if (Pmode != ptr_mode)
42445 emit_insn (gen_addsi_1_zext (this_reg,
42446 gen_rtx_REG (ptr_mode,
42447 REGNO (this_reg)),
42448 vcall_mem));
42449 else
42450 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
42453 /* If necessary, drop THIS back to its stack slot. */
42454 if (this_reg && this_reg != this_param)
42455 emit_move_insn (this_param, this_reg);
42457 fnaddr = XEXP (DECL_RTL (function), 0);
42458 if (TARGET_64BIT)
42460 if (!flag_pic || targetm.binds_local_p (function)
42461 || TARGET_PECOFF)
42463 else
42465 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
42466 tmp = gen_rtx_CONST (Pmode, tmp);
42467 fnaddr = gen_const_mem (Pmode, tmp);
42470 else
42472 if (!flag_pic || targetm.binds_local_p (function))
42474 #if TARGET_MACHO
42475 else if (TARGET_MACHO)
42477 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
42478 fnaddr = XEXP (fnaddr, 0);
42480 #endif /* TARGET_MACHO */
42481 else
42483 tmp = gen_rtx_REG (Pmode, CX_REG);
42484 output_set_got (tmp, NULL_RTX);
42486 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
42487 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
42488 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
42489 fnaddr = gen_const_mem (Pmode, fnaddr);
42493 /* Our sibling call patterns do not allow memories, because we have no
42494 predicate that can distinguish between frame and non-frame memory.
42495 For our purposes here, we can get away with (ab)using a jump pattern,
42496 because we're going to do no optimization. */
42497 if (MEM_P (fnaddr))
42499 if (sibcall_insn_operand (fnaddr, word_mode))
42501 fnaddr = XEXP (DECL_RTL (function), 0);
42502 tmp = gen_rtx_MEM (QImode, fnaddr);
42503 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42504 tmp = emit_call_insn (tmp);
42505 SIBLING_CALL_P (tmp) = 1;
42507 else
42508 emit_jump_insn (gen_indirect_jump (fnaddr));
42510 else
42512 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
42514 // CM_LARGE_PIC always uses a pseudo PIC register, which is
42515 // uninitialized. Since FUNCTION is local and calling it
42516 // doesn't go through the PLT, we use scratch register %r11 as
42517 // the PIC register and initialize it here.
42518 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
42519 ix86_init_large_pic_reg (tmp_regno);
42520 fnaddr = legitimize_pic_address (fnaddr,
42521 gen_rtx_REG (Pmode, tmp_regno));
42524 if (!sibcall_insn_operand (fnaddr, word_mode))
42526 tmp = gen_rtx_REG (word_mode, tmp_regno);
42527 if (GET_MODE (fnaddr) != word_mode)
42528 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
42529 emit_move_insn (tmp, fnaddr);
42530 fnaddr = tmp;
42533 tmp = gen_rtx_MEM (QImode, fnaddr);
42534 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42535 tmp = emit_call_insn (tmp);
42536 SIBLING_CALL_P (tmp) = 1;
42538 emit_barrier ();
42540 /* Emit just enough of rest_of_compilation to get the insns emitted.
42541 Note that use_thunk calls assemble_start_function et al. */
42542 insn = get_insns ();
42543 shorten_branches (insn);
42544 final_start_function (insn, file, 1);
42545 final (insn, file, 1);
42546 final_end_function ();
42549 static void
42550 x86_file_start (void)
42552 default_file_start ();
42553 if (TARGET_16BIT)
42554 fputs ("\t.code16gcc\n", asm_out_file);
42555 #if TARGET_MACHO
42556 darwin_file_start ();
42557 #endif
42558 if (X86_FILE_START_VERSION_DIRECTIVE)
42559 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
42560 if (X86_FILE_START_FLTUSED)
42561 fputs ("\t.global\t__fltused\n", asm_out_file);
42562 if (ix86_asm_dialect == ASM_INTEL)
42563 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
42566 int
42567 x86_field_alignment (tree type, int computed)
42569 machine_mode mode;
42571 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
42572 return computed;
42573 if (TARGET_IAMCU)
42574 return iamcu_alignment (type, computed);
42575 mode = TYPE_MODE (strip_array_types (type));
42576 if (mode == DFmode || mode == DCmode
42577 || GET_MODE_CLASS (mode) == MODE_INT
42578 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
42579 return MIN (32, computed);
42580 return computed;
42583 /* Print call to TARGET to FILE. */
42585 static void
42586 x86_print_call_or_nop (FILE *file, const char *target)
42588 if (flag_nop_mcount)
42589 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
42590 else
42591 fprintf (file, "1:\tcall\t%s\n", target);
42594 /* Output assembler code to FILE to increment profiler label # LABELNO
42595 for profiling a function entry. */
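/* For example, for 64-bit PIC code this emits an indirect
   "call *mcount_name@GOTPCREL(%rip)"; in the non-PIC cases it emits either a
   direct call to the configured mcount function or, with -mnop-mcount, a
   5-byte NOP.  */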
42596 void
42597 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
42599 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
42600 : MCOUNT_NAME);
42601 if (TARGET_64BIT)
42603 #ifndef NO_PROFILE_COUNTERS
42604 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
42605 #endif
42607 if (!TARGET_PECOFF && flag_pic)
42608 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
42609 else
42610 x86_print_call_or_nop (file, mcount_name);
42612 else if (flag_pic)
42614 #ifndef NO_PROFILE_COUNTERS
42615 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
42616 LPREFIX, labelno);
42617 #endif
42618 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
42620 else
42622 #ifndef NO_PROFILE_COUNTERS
42623 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
42624 LPREFIX, labelno);
42625 #endif
42626 x86_print_call_or_nop (file, mcount_name);
42629 if (flag_record_mcount)
42631 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
42632 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
42633 fprintf (file, "\t.previous\n");
42637 /* We don't have exact information about the insn sizes, but we may assume
42638 quite safely that we are informed about all 1 byte insns and memory
42639 address sizes. This is enough to eliminate unnecessary padding in
42640 99% of cases. */
42642 static int
42643 min_insn_size (rtx_insn *insn)
42645 int l = 0, len;
42647 if (!INSN_P (insn) || !active_insn_p (insn))
42648 return 0;
42650 /* Discard alignments we've emitted and jump instructions. */
42651 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
42652 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
42653 return 0;
42655 /* Important case: calls are always 5 bytes.
42656 It is common to have many calls in a row. */
42657 if (CALL_P (insn)
42658 && symbolic_reference_mentioned_p (PATTERN (insn))
42659 && !SIBLING_CALL_P (insn))
42660 return 5;
42661 len = get_attr_length (insn);
42662 if (len <= 1)
42663 return 1;
42665 /* For normal instructions we rely on get_attr_length being exact,
42666 with a few exceptions. */
42667 if (!JUMP_P (insn))
42669 enum attr_type type = get_attr_type (insn);
42671 switch (type)
42673 case TYPE_MULTI:
42674 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
42675 || asm_noperands (PATTERN (insn)) >= 0)
42676 return 0;
42677 break;
42678 case TYPE_OTHER:
42679 case TYPE_FCMP:
42680 break;
42681 default:
42682 /* Otherwise trust get_attr_length. */
42683 return len;
42686 l = get_attr_length_address (insn);
42687 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
42688 l = 4;
42690 if (l)
42691 return 1+l;
42692 else
42693 return 2;
42696 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42698 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
42699 window. */
42701 static void
42702 ix86_avoid_jump_mispredicts (void)
42704 rtx_insn *insn, *start = get_insns ();
42705 int nbytes = 0, njumps = 0;
42706 bool isjump = false;
42708 /* Look for all minimal intervals of instructions containing 4 jumps.
42709 The intervals are bounded by START and INSN. NBYTES is the total
42710 size of instructions in the interval including INSN and not including
42711 START. When NBYTES is smaller than 16, it is possible
42712 that the end of START and INSN end up in the same 16-byte page.
42714 The smallest offset in the page at which INSN can start occurs when START
42715 ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
42716 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
42718 Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
42719 have to, control transfer to its label(s) can be performed through other
42720 means, and we also estimate the minimum length of all asm stmts as 0. */
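      /* A quick example of the formula above: if the interval already holds
	 three jumps, NBYTES is 10 and INSN itself is 2 bytes, the insn is
	 padded by 15 - 10 + 2 = 7 bytes so that all four jumps cannot end up
	 in the same 16-byte window.  */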
42721 for (insn = start; insn; insn = NEXT_INSN (insn))
42723 int min_size;
42725 if (LABEL_P (insn))
42727 int align = label_to_alignment (insn);
42728 int max_skip = label_to_max_skip (insn);
42730 if (max_skip > 15)
42731 max_skip = 15;
42732 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
42733 already in the current 16-byte page, because otherwise
42734 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
42735 bytes to reach a 16-byte boundary. */
42736 if (align <= 0
42737 || (align <= 3 && max_skip != (1 << align) - 1))
42738 max_skip = 0;
42739 if (dump_file)
42740 fprintf (dump_file, "Label %i with max_skip %i\n",
42741 INSN_UID (insn), max_skip);
42742 if (max_skip)
42744 while (nbytes + max_skip >= 16)
42746 start = NEXT_INSN (start);
42747 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42748 || CALL_P (start))
42749 njumps--, isjump = true;
42750 else
42751 isjump = false;
42752 nbytes -= min_insn_size (start);
42755 continue;
42758 min_size = min_insn_size (insn);
42759 nbytes += min_size;
42760 if (dump_file)
42761 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
42762 INSN_UID (insn), min_size);
42763 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
42764 || CALL_P (insn))
42765 njumps++;
42766 else
42767 continue;
42769 while (njumps > 3)
42771 start = NEXT_INSN (start);
42772 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42773 || CALL_P (start))
42774 njumps--, isjump = true;
42775 else
42776 isjump = false;
42777 nbytes -= min_insn_size (start);
42779 gcc_assert (njumps >= 0);
42780 if (dump_file)
42781 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
42782 INSN_UID (start), INSN_UID (insn), nbytes);
42784 if (njumps == 3 && isjump && nbytes < 16)
42786 int padsize = 15 - nbytes + min_insn_size (insn);
42788 if (dump_file)
42789 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
42790 INSN_UID (insn), padsize);
42791 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
42795 #endif
42797 /* AMD Athlon works faster
42798 when RET is not the destination of a conditional jump and is not directly
42799 preceded by another jump instruction. We avoid the penalty by inserting a
42800 NOP just before the RET instructions in such cases. */
42801 static void
42802 ix86_pad_returns (void)
42804 edge e;
42805 edge_iterator ei;
42807 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42809 basic_block bb = e->src;
42810 rtx_insn *ret = BB_END (bb);
42811 rtx_insn *prev;
42812 bool replace = false;
42814 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
42815 || optimize_bb_for_size_p (bb))
42816 continue;
42817 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
42818 if (active_insn_p (prev) || LABEL_P (prev))
42819 break;
42820 if (prev && LABEL_P (prev))
42822 edge e;
42823 edge_iterator ei;
42825 FOR_EACH_EDGE (e, ei, bb->preds)
42826 if (EDGE_FREQUENCY (e) && e->src->index >= 0
42827 && !(e->flags & EDGE_FALLTHRU))
42829 replace = true;
42830 break;
42833 if (!replace)
42835 prev = prev_active_insn (ret);
42836 if (prev
42837 && ((JUMP_P (prev) && any_condjump_p (prev))
42838 || CALL_P (prev)))
42839 replace = true;
42840 /* Empty functions get a branch mispredict even when
42841 the jump destination is not visible to us. */
42842 if (!prev && !optimize_function_for_size_p (cfun))
42843 replace = true;
42845 if (replace)
42847 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42848 delete_insn (ret);
42853 /* Count the minimum number of instructions in BB. Return 4 if the
42854 number of instructions >= 4. */
42856 static int
42857 ix86_count_insn_bb (basic_block bb)
42859 rtx_insn *insn;
42860 int insn_count = 0;
42862 /* Count number of instructions in this block. Return 4 if the number
42863 of instructions >= 4. */
42864 FOR_BB_INSNS (bb, insn)
42866 /* This only happens in exit blocks. */
42867 if (JUMP_P (insn)
42868 && ANY_RETURN_P (PATTERN (insn)))
42869 break;
42871 if (NONDEBUG_INSN_P (insn)
42872 && GET_CODE (PATTERN (insn)) != USE
42873 && GET_CODE (PATTERN (insn)) != CLOBBER)
42875 insn_count++;
42876 if (insn_count >= 4)
42877 return insn_count;
42881 return insn_count;
42885 /* Count the minimum number of instructions in code path in BB.
42886 Return 4 if the number of instructions >= 4. */
42888 static int
42889 ix86_count_insn (basic_block bb)
42891 edge e;
42892 edge_iterator ei;
42893 int min_prev_count;
42895 /* Only bother counting instructions along paths with no
42896 more than 2 basic blocks between entry and exit. Given
42897 that BB has an edge to exit, determine if a predecessor
42898 of BB has an edge from entry. If so, compute the number
42899 of instructions in the predecessor block. If there
42900 happen to be multiple such blocks, compute the minimum. */
42901 min_prev_count = 4;
42902 FOR_EACH_EDGE (e, ei, bb->preds)
42904 edge prev_e;
42905 edge_iterator prev_ei;
42907 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42909 min_prev_count = 0;
42910 break;
42912 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
42914 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42916 int count = ix86_count_insn_bb (e->src);
42917 if (count < min_prev_count)
42918 min_prev_count = count;
42919 break;
42924 if (min_prev_count < 4)
42925 min_prev_count += ix86_count_insn_bb (bb);
42927 return min_prev_count;
42930 /* Pad short function to 4 instructions. */
42932 static void
42933 ix86_pad_short_function (void)
42935 edge e;
42936 edge_iterator ei;
42938 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42940 rtx_insn *ret = BB_END (e->src);
42941 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
42943 int insn_count = ix86_count_insn (e->src);
42945 /* Pad short function. */
42946 if (insn_count < 4)
42948 rtx_insn *insn = ret;
42950 /* Find epilogue. */
42951 while (insn
42952 && (!NOTE_P (insn)
42953 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
42954 insn = PREV_INSN (insn);
42956 if (!insn)
42957 insn = ret;
42959 /* Two NOPs count as one instruction. */
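	      /* E.g. if only one real instruction was found, 2 * (4 - 1) = 6
		 NOPs are emitted, which the heuristic counts as the three
		 missing instructions.  */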
42960 insn_count = 2 * (4 - insn_count);
42961 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
42967 /* Fix up a Windows system unwinder issue. If an EH region falls through into
42968 the epilogue, the Windows system unwinder will apply epilogue logic and
42969 produce incorrect offsets. This can be avoided by adding a nop between
42970 the last insn that can throw and the first insn of the epilogue. */
42972 static void
42973 ix86_seh_fixup_eh_fallthru (void)
42975 edge e;
42976 edge_iterator ei;
42978 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42980 rtx_insn *insn, *next;
42982 /* Find the beginning of the epilogue. */
42983 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42984 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42985 break;
42986 if (insn == NULL)
42987 continue;
42989 /* We only care about preceding insns that can throw. */
42990 insn = prev_active_insn (insn);
42991 if (insn == NULL || !can_throw_internal (insn))
42992 continue;
42994 /* Do not separate calls from their debug information. */
42995 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42996 if (NOTE_P (next)
42997 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
42998 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
42999 insn = next;
43000 else
43001 break;
43003 emit_insn_after (gen_nops (const1_rtx), insn);
43007 /* Given a register number BASE, the lowest of a group of registers, update
43008 regsets IN and OUT with the registers that should be avoided in input
43009 and output operands respectively when trying to avoid generating a modr/m
43010 byte for -fmitigate-rop. */
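/* For example, with BASE being the first SSE register this marks xmm0 and
   xmm1 as risky when used as outputs and xmm2 and xmm3 as risky when used
   as inputs.  */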
43012 static void
43013 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
43015 SET_HARD_REG_BIT (out, base);
43016 SET_HARD_REG_BIT (out, base + 1);
43017 SET_HARD_REG_BIT (in, base + 2);
43018 SET_HARD_REG_BIT (in, base + 3);
43021 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
43022 that certain encodings of modr/m bytes do not occur. */
43023 static void
43024 ix86_mitigate_rop (void)
43026 HARD_REG_SET input_risky;
43027 HARD_REG_SET output_risky;
43028 HARD_REG_SET inout_risky;
43030 CLEAR_HARD_REG_SET (output_risky);
43031 CLEAR_HARD_REG_SET (input_risky);
43032 SET_HARD_REG_BIT (output_risky, AX_REG);
43033 SET_HARD_REG_BIT (output_risky, CX_REG);
43034 SET_HARD_REG_BIT (input_risky, BX_REG);
43035 SET_HARD_REG_BIT (input_risky, DX_REG);
43036 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
43037 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
43038 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
43039 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
43040 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
43041 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
43042 COPY_HARD_REG_SET (inout_risky, input_risky);
43043 IOR_HARD_REG_SET (inout_risky, output_risky);
43045 df_note_add_problem ();
43046 /* Fix up what stack-regs did. */
43047 df_insn_rescan_all ();
43048 df_analyze ();
43050 regrename_init (true);
43051 regrename_analyze (NULL);
43053 auto_vec<du_head_p> cands;
43055 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
43057 if (!NONDEBUG_INSN_P (insn))
43058 continue;
43060 if (GET_CODE (PATTERN (insn)) == USE
43061 || GET_CODE (PATTERN (insn)) == CLOBBER)
43062 continue;
43064 extract_insn (insn);
43066 int opno0, opno1;
43067 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43068 recog_data.n_operands, &opno0,
43069 &opno1);
43071 if (!ix86_rop_should_change_byte_p (modrm))
43072 continue;
43074 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
43076 /* This happens when regrename has to fail a block. */
43077 if (!info->op_info)
43078 continue;
43080 if (info->op_info[opno0].n_chains != 0)
43082 gcc_assert (info->op_info[opno0].n_chains == 1);
43083 du_head_p op0c;
43084 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
43085 if (op0c->target_data_1 + op0c->target_data_2 == 0
43086 && !op0c->cannot_rename)
43087 cands.safe_push (op0c);
43089 op0c->target_data_1++;
43091 if (info->op_info[opno1].n_chains != 0)
43093 gcc_assert (info->op_info[opno1].n_chains == 1);
43094 du_head_p op1c;
43095 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
43096 if (op1c->target_data_1 + op1c->target_data_2 == 0
43097 && !op1c->cannot_rename)
43098 cands.safe_push (op1c);
43100 op1c->target_data_2++;
43104 int i;
43105 du_head_p head;
43106 FOR_EACH_VEC_ELT (cands, i, head)
43108 int old_reg, best_reg;
43109 HARD_REG_SET unavailable;
43111 CLEAR_HARD_REG_SET (unavailable);
43112 if (head->target_data_1)
43113 IOR_HARD_REG_SET (unavailable, output_risky);
43114 if (head->target_data_2)
43115 IOR_HARD_REG_SET (unavailable, input_risky);
43117 int n_uses;
43118 reg_class superclass = regrename_find_superclass (head, &n_uses,
43119 &unavailable);
43120 old_reg = head->regno;
43121 best_reg = find_rename_reg (head, superclass, &unavailable,
43122 old_reg, false);
43123 bool ok = regrename_do_replace (head, best_reg);
43124 gcc_assert (ok);
43125 if (dump_file)
43126 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
43127 reg_names[best_reg], reg_class_names[superclass]);
43131 regrename_finish ();
43133 df_analyze ();
43135 basic_block bb;
43136 regset_head live;
43138 INIT_REG_SET (&live);
43140 FOR_EACH_BB_FN (bb, cfun)
43142 rtx_insn *insn;
43144 COPY_REG_SET (&live, DF_LR_OUT (bb));
43145 df_simulate_initialize_backwards (bb, &live);
43147 FOR_BB_INSNS_REVERSE (bb, insn)
43149 if (!NONDEBUG_INSN_P (insn))
43150 continue;
43152 df_simulate_one_insn_backwards (bb, insn, &live);
43154 if (GET_CODE (PATTERN (insn)) == USE
43155 || GET_CODE (PATTERN (insn)) == CLOBBER)
43156 continue;
43158 extract_insn (insn);
43159 constrain_operands_cached (insn, reload_completed);
43160 int opno0, opno1;
43161 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43162 recog_data.n_operands, &opno0,
43163 &opno1);
43164 if (modrm < 0
43165 || !ix86_rop_should_change_byte_p (modrm)
43166 || opno0 == opno1)
43167 continue;
43169 rtx oldreg = recog_data.operand[opno1];
43170 preprocess_constraints (insn);
43171 const operand_alternative *alt = which_op_alt ();
43173 int i;
43174 for (i = 0; i < recog_data.n_operands; i++)
43175 if (i != opno1
43176 && alt[i].earlyclobber
43177 && reg_overlap_mentioned_p (recog_data.operand[i],
43178 oldreg))
43179 break;
43181 if (i < recog_data.n_operands)
43182 continue;
43184 if (dump_file)
43185 fprintf (dump_file,
43186 "attempting to fix modrm byte in insn %d:"
43187 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
43188 reg_class_names[alt[opno1].cl]);
43190 HARD_REG_SET unavailable;
43191 REG_SET_TO_HARD_REG_SET (unavailable, &live);
43192 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
43193 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
43194 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
43195 IOR_HARD_REG_SET (unavailable, output_risky);
43196 IOR_COMPL_HARD_REG_SET (unavailable,
43197 reg_class_contents[alt[opno1].cl]);
43199 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
43200 if (!TEST_HARD_REG_BIT (unavailable, i))
43201 break;
43202 if (i == FIRST_PSEUDO_REGISTER)
43204 if (dump_file)
43205 fprintf (dump_file, ", none available\n");
43206 continue;
43208 if (dump_file)
43209 fprintf (dump_file, " -> %d\n", i);
43210 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
43211 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
43212 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
43217 /* Implement machine specific optimizations.  We implement padding of returns
43218 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window.  */
43219 static void
43220 ix86_reorg (void)
43222 /* We are freeing block_for_insn in the toplev to keep compatibility
43223 with old MDEP_REORGS that are not CFG based. Recompute it now. */
43224 compute_bb_for_insn ();
43226 if (flag_mitigate_rop)
43227 ix86_mitigate_rop ();
43229 if (TARGET_SEH && current_function_has_exception_handlers ())
43230 ix86_seh_fixup_eh_fallthru ();
43232 if (optimize && optimize_function_for_speed_p (cfun))
43234 if (TARGET_PAD_SHORT_FUNCTION)
43235 ix86_pad_short_function ();
43236 else if (TARGET_PAD_RETURNS)
43237 ix86_pad_returns ();
43238 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
43239 if (TARGET_FOUR_JUMP_LIMIT)
43240 ix86_avoid_jump_mispredicts ();
43241 #endif
43245 /* Return nonzero when a QImode register that must be represented via a REX prefix
43246 is used. */
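/* Illustrative note: with !QI_REGNO_P this catches QImode uses of %spl,
   %bpl, %sil, %dil and %r8b-%r15b, whose low bytes are only addressable
   when a REX prefix is emitted.  */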
43247 bool
43248 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
43250 int i;
43251 extract_insn_cached (insn);
43252 for (i = 0; i < recog_data.n_operands; i++)
43253 if (GENERAL_REG_P (recog_data.operand[i])
43254 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
43255 return true;
43256 return false;
43259 /* Return true when INSN mentions a register that must be encoded using a REX
43260 prefix. */
43261 bool
43262 x86_extended_reg_mentioned_p (rtx insn)
43264 subrtx_iterator::array_type array;
43265 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
43267 const_rtx x = *iter;
43268 if (REG_P (x)
43269 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
43270 return true;
43272 return false;
43275 /* If profitable, negate (without causing overflow) integer constant
43276 of mode MODE at location LOC. Return true in this case. */
43277 bool
43278 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
43280 HOST_WIDE_INT val;
43282 if (!CONST_INT_P (*loc))
43283 return false;
43285 switch (mode)
43287 case DImode:
43288 /* DImode x86_64 constants must fit in 32 bits. */
43289 gcc_assert (x86_64_immediate_operand (*loc, mode));
43291 mode = SImode;
43292 break;
43294 case SImode:
43295 case HImode:
43296 case QImode:
43297 break;
43299 default:
43300 gcc_unreachable ();
43303 /* Avoid overflows. */
43304 if (mode_signbit_p (mode, *loc))
43305 return false;
43307 val = INTVAL (*loc);
43309 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
43310 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
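/* For example, 'addl $-4, %eax' is rewritten as 'subl $4, %eax', and
   'addl $128, %eax' (4-byte immediate) as 'subl $-128, %eax' (1-byte
   sign-extended immediate), saving three bytes.  */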
43311 if ((val < 0 && val != -128)
43312 || val == 128)
43314 *loc = GEN_INT (-val);
43315 return true;
43318 return false;
43321 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
43322 optabs would emit if we didn't have TFmode patterns. */
43324 void
43325 x86_emit_floatuns (rtx operands[2])
43327 rtx_code_label *neglab, *donelab;
43328 rtx i0, i1, f0, in, out;
43329 machine_mode mode, inmode;
43331 inmode = GET_MODE (operands[1]);
43332 gcc_assert (inmode == SImode || inmode == DImode);
43334 out = operands[0];
43335 in = force_reg (inmode, operands[1]);
43336 mode = GET_MODE (out);
43337 neglab = gen_label_rtx ();
43338 donelab = gen_label_rtx ();
43339 f0 = gen_reg_rtx (mode);
43341 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
43343 expand_float (out, in, 0);
43345 emit_jump_insn (gen_jump (donelab));
43346 emit_barrier ();
43348 emit_label (neglab);
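/* IN has its sign bit set, so a signed conversion would see it as
   negative.  Halve it first: i0 = (in >> 1) | (in & 1) keeps the dropped
   low bit as a sticky bit so the conversion still rounds correctly, and
   the final f0 + f0 doubles the converted value back.  */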
43350 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
43351 1, OPTAB_DIRECT);
43352 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
43353 1, OPTAB_DIRECT);
43354 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
43356 expand_float (f0, i0, 0);
43358 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
43360 emit_label (donelab);
43363 static bool canonicalize_perm (struct expand_vec_perm_d *d);
43364 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
43365 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
43366 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
43368 /* Get a vector mode of the same size as the original but with elements
43369 twice as wide. This is only guaranteed to apply to integral vectors. */
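/* For example, V16QImode yields V8HImode and V8HImode yields V4SImode.  */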
43371 static inline machine_mode
43372 get_mode_wider_vector (machine_mode o)
43374 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
43375 machine_mode n = GET_MODE_WIDER_MODE (o);
43376 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
43377 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
43378 return n;
43381 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
43382 fill target with val via vec_duplicate. */
43384 static bool
43385 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
43387 bool ok;
43388 rtx_insn *insn;
43389 rtx dup;
43391 /* First attempt to recognize VAL as-is. */
43392 dup = gen_rtx_VEC_DUPLICATE (mode, val);
43393 insn = emit_insn (gen_rtx_SET (target, dup));
43394 if (recog_memoized (insn) < 0)
43396 rtx_insn *seq;
43397 machine_mode innermode = GET_MODE_INNER (mode);
43398 rtx reg;
43400 /* If that fails, force VAL into a register. */
43402 start_sequence ();
43403 reg = force_reg (innermode, val);
43404 if (GET_MODE (reg) != innermode)
43405 reg = gen_lowpart (innermode, reg);
43406 XEXP (dup, 0) = reg;
43407 seq = get_insns ();
43408 end_sequence ();
43409 if (seq)
43410 emit_insn_before (seq, insn);
43412 ok = recog_memoized (insn) >= 0;
43413 gcc_assert (ok);
43415 return true;
43418 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43419 with all elements equal to VAR. Return true if successful. */
43421 static bool
43422 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
43423 rtx target, rtx val)
43425 bool ok;
43427 switch (mode)
43429 case V2SImode:
43430 case V2SFmode:
43431 if (!mmx_ok)
43432 return false;
43433 /* FALLTHRU */
43435 case V4DFmode:
43436 case V4DImode:
43437 case V8SFmode:
43438 case V8SImode:
43439 case V2DFmode:
43440 case V2DImode:
43441 case V4SFmode:
43442 case V4SImode:
43443 case V16SImode:
43444 case V8DImode:
43445 case V16SFmode:
43446 case V8DFmode:
43447 return ix86_vector_duplicate_value (mode, target, val);
43449 case V4HImode:
43450 if (!mmx_ok)
43451 return false;
43452 if (TARGET_SSE || TARGET_3DNOW_A)
43454 rtx x;
43456 val = gen_lowpart (SImode, val);
43457 x = gen_rtx_TRUNCATE (HImode, val);
43458 x = gen_rtx_VEC_DUPLICATE (mode, x);
43459 emit_insn (gen_rtx_SET (target, x));
43460 return true;
43462 goto widen;
43464 case V8QImode:
43465 if (!mmx_ok)
43466 return false;
43467 goto widen;
43469 case V8HImode:
43470 if (TARGET_AVX2)
43471 return ix86_vector_duplicate_value (mode, target, val);
43473 if (TARGET_SSE2)
43475 struct expand_vec_perm_d dperm;
43476 rtx tmp1, tmp2;
43478 permute:
43479 memset (&dperm, 0, sizeof (dperm));
43480 dperm.target = target;
43481 dperm.vmode = mode;
43482 dperm.nelt = GET_MODE_NUNITS (mode);
43483 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
43484 dperm.one_operand_p = true;
43486 /* Extend to SImode using a paradoxical SUBREG. */
43487 tmp1 = gen_reg_rtx (SImode);
43488 emit_move_insn (tmp1, gen_lowpart (SImode, val));
43490 /* Insert the SImode value as low element of a V4SImode vector. */
43491 tmp2 = gen_reg_rtx (V4SImode);
43492 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
43493 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
43495 ok = (expand_vec_perm_1 (&dperm)
43496 || expand_vec_perm_broadcast_1 (&dperm));
43497 gcc_assert (ok);
43498 return ok;
43500 goto widen;
43502 case V16QImode:
43503 if (TARGET_AVX2)
43504 return ix86_vector_duplicate_value (mode, target, val);
43506 if (TARGET_SSE2)
43507 goto permute;
43508 goto widen;
43510 widen:
43511 /* Replicate the value once into the next wider mode and recurse. */
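/* E.g. a QImode VAL of 0xab first becomes the HImode value 0xabab; the
   recursive call then broadcasts that through a vector with half as many,
   twice as wide, elements.  */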
43513 machine_mode smode, wsmode, wvmode;
43514 rtx x;
43516 smode = GET_MODE_INNER (mode);
43517 wvmode = get_mode_wider_vector (mode);
43518 wsmode = GET_MODE_INNER (wvmode);
43520 val = convert_modes (wsmode, smode, val, true);
43521 x = expand_simple_binop (wsmode, ASHIFT, val,
43522 GEN_INT (GET_MODE_BITSIZE (smode)),
43523 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43524 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
43526 x = gen_reg_rtx (wvmode);
43527 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
43528 gcc_assert (ok);
43529 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
43530 return ok;
43533 case V16HImode:
43534 case V32QImode:
43535 if (TARGET_AVX2)
43536 return ix86_vector_duplicate_value (mode, target, val);
43537 else
43539 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
43540 rtx x = gen_reg_rtx (hvmode);
43542 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43543 gcc_assert (ok);
43545 x = gen_rtx_VEC_CONCAT (mode, x, x);
43546 emit_insn (gen_rtx_SET (target, x));
43548 return true;
43550 case V64QImode:
43551 case V32HImode:
43552 if (TARGET_AVX512BW)
43553 return ix86_vector_duplicate_value (mode, target, val);
43554 else
43556 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
43557 rtx x = gen_reg_rtx (hvmode);
43559 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43560 gcc_assert (ok);
43562 x = gen_rtx_VEC_CONCAT (mode, x, x);
43563 emit_insn (gen_rtx_SET (target, x));
43565 return true;
43567 default:
43568 return false;
43572 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43573 whose ONE_VAR element is VAR, and other elements are zero. Return true
43574 if successful. */
43576 static bool
43577 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
43578 rtx target, rtx var, int one_var)
43580 machine_mode vsimode;
43581 rtx new_target;
43582 rtx x, tmp;
43583 bool use_vector_set = false;
43585 switch (mode)
43587 case V2DImode:
43588 /* For SSE4.1, we normally use vector set. But if the second
43589 element is zero and inter-unit moves are OK, we use movq
43590 instead. */
43591 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
43592 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
43593 && one_var == 0));
43594 break;
43595 case V16QImode:
43596 case V4SImode:
43597 case V4SFmode:
43598 use_vector_set = TARGET_SSE4_1;
43599 break;
43600 case V8HImode:
43601 use_vector_set = TARGET_SSE2;
43602 break;
43603 case V4HImode:
43604 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
43605 break;
43606 case V32QImode:
43607 case V16HImode:
43608 case V8SImode:
43609 case V8SFmode:
43610 case V4DFmode:
43611 use_vector_set = TARGET_AVX;
43612 break;
43613 case V4DImode:
43614 /* Use ix86_expand_vector_set in 64bit mode only. */
43615 use_vector_set = TARGET_AVX && TARGET_64BIT;
43616 break;
43617 default:
43618 break;
43621 if (use_vector_set)
43623 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
43624 var = force_reg (GET_MODE_INNER (mode), var);
43625 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43626 return true;
43629 switch (mode)
43631 case V2SFmode:
43632 case V2SImode:
43633 if (!mmx_ok)
43634 return false;
43635 /* FALLTHRU */
43637 case V2DFmode:
43638 case V2DImode:
43639 if (one_var != 0)
43640 return false;
43641 var = force_reg (GET_MODE_INNER (mode), var);
43642 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
43643 emit_insn (gen_rtx_SET (target, x));
43644 return true;
43646 case V4SFmode:
43647 case V4SImode:
43648 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
43649 new_target = gen_reg_rtx (mode);
43650 else
43651 new_target = target;
43652 var = force_reg (GET_MODE_INNER (mode), var);
43653 x = gen_rtx_VEC_DUPLICATE (mode, var);
43654 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
43655 emit_insn (gen_rtx_SET (new_target, x));
43656 if (one_var != 0)
43658 /* We need to shuffle the value to the correct position, so
43659 create a new pseudo to store the intermediate result. */
43661 /* With SSE2, we can use the integer shuffle insns. */
43662 if (mode != V4SFmode && TARGET_SSE2)
43664 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
43665 const1_rtx,
43666 GEN_INT (one_var == 1 ? 0 : 1),
43667 GEN_INT (one_var == 2 ? 0 : 1),
43668 GEN_INT (one_var == 3 ? 0 : 1)));
43669 if (target != new_target)
43670 emit_move_insn (target, new_target);
43671 return true;
43674 /* Otherwise convert the intermediate result to V4SFmode and
43675 use the SSE1 shuffle instructions. */
43676 if (mode != V4SFmode)
43678 tmp = gen_reg_rtx (V4SFmode);
43679 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
43681 else
43682 tmp = new_target;
43684 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
43685 const1_rtx,
43686 GEN_INT (one_var == 1 ? 0 : 1),
43687 GEN_INT (one_var == 2 ? 0+4 : 1+4),
43688 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
43690 if (mode != V4SFmode)
43691 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
43692 else if (tmp != target)
43693 emit_move_insn (target, tmp);
43695 else if (target != new_target)
43696 emit_move_insn (target, new_target);
43697 return true;
43699 case V8HImode:
43700 case V16QImode:
43701 vsimode = V4SImode;
43702 goto widen;
43703 case V4HImode:
43704 case V8QImode:
43705 if (!mmx_ok)
43706 return false;
43707 vsimode = V2SImode;
43708 goto widen;
43709 widen:
43710 if (one_var != 0)
43711 return false;
43713 /* Zero extend the variable element to SImode and recurse. */
43714 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
43716 x = gen_reg_rtx (vsimode);
43717 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
43718 var, one_var))
43719 gcc_unreachable ();
43721 emit_move_insn (target, gen_lowpart (mode, x));
43722 return true;
43724 default:
43725 return false;
43729 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43730 consisting of the values in VALS. It is known that all elements
43731 except ONE_VAR are constants. Return true if successful. */
43733 static bool
43734 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
43735 rtx target, rtx vals, int one_var)
43737 rtx var = XVECEXP (vals, 0, one_var);
43738 machine_mode wmode;
43739 rtx const_vec, x;
43741 const_vec = copy_rtx (vals);
43742 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
43743 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
43745 switch (mode)
43747 case V2DFmode:
43748 case V2DImode:
43749 case V2SFmode:
43750 case V2SImode:
43751 /* For the two element vectors, it's just as easy to use
43752 the general case. */
43753 return false;
43755 case V4DImode:
43756 /* Use ix86_expand_vector_set in 64bit mode only. */
43757 if (!TARGET_64BIT)
43758 return false;
43759 /* FALLTHRU */
43760 case V4DFmode:
43761 case V8SFmode:
43762 case V8SImode:
43763 case V16HImode:
43764 case V32QImode:
43765 case V4SFmode:
43766 case V4SImode:
43767 case V8HImode:
43768 case V4HImode:
43769 break;
43771 case V16QImode:
43772 if (TARGET_SSE4_1)
43773 break;
43774 wmode = V8HImode;
43775 goto widen;
43776 case V8QImode:
43777 wmode = V4HImode;
43778 goto widen;
43779 widen:
43780 /* There's no way to set one QImode entry easily. Combine
43781 the variable value with its adjacent constant value, and
43782 promote to an HImode set. */
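/* E.g. with a V16QImode vector and ONE_VAR == 3, VAR is shifted into the
   high byte of an HImode value, combined with the constant byte at index
   2 in the low byte, and the pair is stored as element 1 of the V8HImode
   view of the vector.  */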
43783 x = XVECEXP (vals, 0, one_var ^ 1);
43784 if (one_var & 1)
43786 var = convert_modes (HImode, QImode, var, true);
43787 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
43788 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43789 x = GEN_INT (INTVAL (x) & 0xff);
43791 else
43793 var = convert_modes (HImode, QImode, var, true);
43794 x = gen_int_mode (INTVAL (x) << 8, HImode);
43796 if (x != const0_rtx)
43797 var = expand_simple_binop (HImode, IOR, var, x, var,
43798 1, OPTAB_LIB_WIDEN);
43800 x = gen_reg_rtx (wmode);
43801 emit_move_insn (x, gen_lowpart (wmode, const_vec));
43802 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
43804 emit_move_insn (target, gen_lowpart (mode, x));
43805 return true;
43807 default:
43808 return false;
43811 emit_move_insn (target, const_vec);
43812 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43813 return true;
43816 /* A subroutine of ix86_expand_vector_init_general. Use vector
43817 concatenate to handle the most general case: all values variable,
43818 and none identical. */
43820 static void
43821 ix86_expand_vector_init_concat (machine_mode mode,
43822 rtx target, rtx *ops, int n)
43824 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
43825 rtx first[16], second[8], third[4];
43826 rtvec v;
43827 int i, j;
43829 switch (n)
43831 case 2:
43832 switch (mode)
43834 case V16SImode:
43835 cmode = V8SImode;
43836 break;
43837 case V16SFmode:
43838 cmode = V8SFmode;
43839 break;
43840 case V8DImode:
43841 cmode = V4DImode;
43842 break;
43843 case V8DFmode:
43844 cmode = V4DFmode;
43845 break;
43846 case V8SImode:
43847 cmode = V4SImode;
43848 break;
43849 case V8SFmode:
43850 cmode = V4SFmode;
43851 break;
43852 case V4DImode:
43853 cmode = V2DImode;
43854 break;
43855 case V4DFmode:
43856 cmode = V2DFmode;
43857 break;
43858 case V4SImode:
43859 cmode = V2SImode;
43860 break;
43861 case V4SFmode:
43862 cmode = V2SFmode;
43863 break;
43864 case V2DImode:
43865 cmode = DImode;
43866 break;
43867 case V2SImode:
43868 cmode = SImode;
43869 break;
43870 case V2DFmode:
43871 cmode = DFmode;
43872 break;
43873 case V2SFmode:
43874 cmode = SFmode;
43875 break;
43876 default:
43877 gcc_unreachable ();
43880 if (!register_operand (ops[1], cmode))
43881 ops[1] = force_reg (cmode, ops[1]);
43882 if (!register_operand (ops[0], cmode))
43883 ops[0] = force_reg (cmode, ops[0]);
43884 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
43885 ops[1])));
43886 break;
43888 case 4:
43889 switch (mode)
43891 case V4DImode:
43892 cmode = V2DImode;
43893 break;
43894 case V4DFmode:
43895 cmode = V2DFmode;
43896 break;
43897 case V4SImode:
43898 cmode = V2SImode;
43899 break;
43900 case V4SFmode:
43901 cmode = V2SFmode;
43902 break;
43903 default:
43904 gcc_unreachable ();
43906 goto half;
43908 case 8:
43909 switch (mode)
43911 case V8DImode:
43912 cmode = V2DImode;
43913 hmode = V4DImode;
43914 break;
43915 case V8DFmode:
43916 cmode = V2DFmode;
43917 hmode = V4DFmode;
43918 break;
43919 case V8SImode:
43920 cmode = V2SImode;
43921 hmode = V4SImode;
43922 break;
43923 case V8SFmode:
43924 cmode = V2SFmode;
43925 hmode = V4SFmode;
43926 break;
43927 default:
43928 gcc_unreachable ();
43930 goto half;
43932 case 16:
43933 switch (mode)
43935 case V16SImode:
43936 cmode = V2SImode;
43937 hmode = V4SImode;
43938 gmode = V8SImode;
43939 break;
43940 case V16SFmode:
43941 cmode = V2SFmode;
43942 hmode = V4SFmode;
43943 gmode = V8SFmode;
43944 break;
43945 default:
43946 gcc_unreachable ();
43948 goto half;
43950 half:
43951 /* FIXME: We process inputs backward to help RA. PR 36222. */
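/* Pair the N scalar inputs into N/2 two-element vectors (first[]), then
   keep concatenating adjacent halves (second[], third[]) until a single
   MODE-wide vector remains.  */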
43952 i = n - 1;
43953 j = (n >> 1) - 1;
43954 for (; i > 0; i -= 2, j--)
43956 first[j] = gen_reg_rtx (cmode);
43957 v = gen_rtvec (2, ops[i - 1], ops[i]);
43958 ix86_expand_vector_init (false, first[j],
43959 gen_rtx_PARALLEL (cmode, v));
43962 n >>= 1;
43963 if (n > 4)
43965 gcc_assert (hmode != VOIDmode);
43966 gcc_assert (gmode != VOIDmode);
43967 for (i = j = 0; i < n; i += 2, j++)
43969 second[j] = gen_reg_rtx (hmode);
43970 ix86_expand_vector_init_concat (hmode, second [j],
43971 &first [i], 2);
43973 n >>= 1;
43974 for (i = j = 0; i < n; i += 2, j++)
43976 third[j] = gen_reg_rtx (gmode);
43977 ix86_expand_vector_init_concat (gmode, third[j],
43978 &second[i], 2);
43980 n >>= 1;
43981 ix86_expand_vector_init_concat (mode, target, third, n);
43983 else if (n > 2)
43985 gcc_assert (hmode != VOIDmode);
43986 for (i = j = 0; i < n; i += 2, j++)
43988 second[j] = gen_reg_rtx (hmode);
43989 ix86_expand_vector_init_concat (hmode, second [j],
43990 &first [i], 2);
43992 n >>= 1;
43993 ix86_expand_vector_init_concat (mode, target, second, n);
43995 else
43996 ix86_expand_vector_init_concat (mode, target, first, n);
43997 break;
43999 default:
44000 gcc_unreachable ();
44004 /* A subroutine of ix86_expand_vector_init_general. Use vector
44005 interleave to handle the most general case: all values variable,
44006 and none identical. */
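/* Roughly: each pair of scalars is packed into the two low lanes of a
   vector register, and successive "interleave low" steps then merge those
   partial vectors, preserving element order, until the whole destination
   is populated.  */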
44008 static void
44009 ix86_expand_vector_init_interleave (machine_mode mode,
44010 rtx target, rtx *ops, int n)
44012 machine_mode first_imode, second_imode, third_imode, inner_mode;
44013 int i, j;
44014 rtx op0, op1;
44015 rtx (*gen_load_even) (rtx, rtx, rtx);
44016 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
44017 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
44019 switch (mode)
44021 case V8HImode:
44022 gen_load_even = gen_vec_setv8hi;
44023 gen_interleave_first_low = gen_vec_interleave_lowv4si;
44024 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44025 inner_mode = HImode;
44026 first_imode = V4SImode;
44027 second_imode = V2DImode;
44028 third_imode = VOIDmode;
44029 break;
44030 case V16QImode:
44031 gen_load_even = gen_vec_setv16qi;
44032 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
44033 gen_interleave_second_low = gen_vec_interleave_lowv4si;
44034 inner_mode = QImode;
44035 first_imode = V8HImode;
44036 second_imode = V4SImode;
44037 third_imode = V2DImode;
44038 break;
44039 default:
44040 gcc_unreachable ();
44043 for (i = 0; i < n; i++)
44045 /* Extend the odd element to SImode using a paradoxical SUBREG. */
44046 op0 = gen_reg_rtx (SImode);
44047 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
44049 /* Insert the SImode value as low element of V4SImode vector. */
44050 op1 = gen_reg_rtx (V4SImode);
44051 op0 = gen_rtx_VEC_MERGE (V4SImode,
44052 gen_rtx_VEC_DUPLICATE (V4SImode,
44053 op0),
44054 CONST0_RTX (V4SImode),
44055 const1_rtx);
44056 emit_insn (gen_rtx_SET (op1, op0));
44058 /* Cast the V4SImode vector back to a vector in the original mode. */
44059 op0 = gen_reg_rtx (mode);
44060 emit_move_insn (op0, gen_lowpart (mode, op1));
44062 /* Load even elements into the second position. */
44063 emit_insn (gen_load_even (op0,
44064 force_reg (inner_mode,
44065 ops [i + i + 1]),
44066 const1_rtx));
44068 /* Cast vector to FIRST_IMODE vector. */
44069 ops[i] = gen_reg_rtx (first_imode);
44070 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
44073 /* Interleave low FIRST_IMODE vectors. */
44074 for (i = j = 0; i < n; i += 2, j++)
44076 op0 = gen_reg_rtx (first_imode);
44077 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
44079 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
44080 ops[j] = gen_reg_rtx (second_imode);
44081 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
44084 /* Interleave low SECOND_IMODE vectors. */
44085 switch (second_imode)
44087 case V4SImode:
44088 for (i = j = 0; i < n / 2; i += 2, j++)
44090 op0 = gen_reg_rtx (second_imode);
44091 emit_insn (gen_interleave_second_low (op0, ops[i],
44092 ops[i + 1]));
44094 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
44095 vector. */
44096 ops[j] = gen_reg_rtx (third_imode);
44097 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
44099 second_imode = V2DImode;
44100 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44101 /* FALLTHRU */
44103 case V2DImode:
44104 op0 = gen_reg_rtx (second_imode);
44105 emit_insn (gen_interleave_second_low (op0, ops[0],
44106 ops[1]));
44108 /* Cast the SECOND_IMODE vector back to a vector in the original
44109 mode. */
44110 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
44111 break;
44113 default:
44114 gcc_unreachable ();
44118 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
44119 all values variable, and none identical. */
44121 static void
44122 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
44123 rtx target, rtx vals)
44125 rtx ops[64], op0, op1, op2, op3, op4, op5;
44126 machine_mode half_mode = VOIDmode;
44127 machine_mode quarter_mode = VOIDmode;
44128 int n, i;
44130 switch (mode)
44132 case V2SFmode:
44133 case V2SImode:
44134 if (!mmx_ok && !TARGET_SSE)
44135 break;
44136 /* FALLTHRU */
44138 case V16SImode:
44139 case V16SFmode:
44140 case V8DFmode:
44141 case V8DImode:
44142 case V8SFmode:
44143 case V8SImode:
44144 case V4DFmode:
44145 case V4DImode:
44146 case V4SFmode:
44147 case V4SImode:
44148 case V2DFmode:
44149 case V2DImode:
44150 n = GET_MODE_NUNITS (mode);
44151 for (i = 0; i < n; i++)
44152 ops[i] = XVECEXP (vals, 0, i);
44153 ix86_expand_vector_init_concat (mode, target, ops, n);
44154 return;
44156 case V2TImode:
44157 for (i = 0; i < 2; i++)
44158 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44159 op0 = gen_reg_rtx (V4DImode);
44160 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
44161 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44162 return;
44164 case V4TImode:
44165 for (i = 0; i < 4; i++)
44166 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44167 ops[4] = gen_reg_rtx (V4DImode);
44168 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
44169 ops[5] = gen_reg_rtx (V4DImode);
44170 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
44171 op0 = gen_reg_rtx (V8DImode);
44172 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
44173 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44174 return;
44176 case V32QImode:
44177 half_mode = V16QImode;
44178 goto half;
44180 case V16HImode:
44181 half_mode = V8HImode;
44182 goto half;
44184 half:
44185 n = GET_MODE_NUNITS (mode);
44186 for (i = 0; i < n; i++)
44187 ops[i] = XVECEXP (vals, 0, i);
44188 op0 = gen_reg_rtx (half_mode);
44189 op1 = gen_reg_rtx (half_mode);
44190 ix86_expand_vector_init_interleave (half_mode, op0, ops,
44191 n >> 2);
44192 ix86_expand_vector_init_interleave (half_mode, op1,
44193 &ops [n >> 1], n >> 2);
44194 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
44195 return;
44197 case V64QImode:
44198 quarter_mode = V16QImode;
44199 half_mode = V32QImode;
44200 goto quarter;
44202 case V32HImode:
44203 quarter_mode = V8HImode;
44204 half_mode = V16HImode;
44205 goto quarter;
44207 quarter:
44208 n = GET_MODE_NUNITS (mode);
44209 for (i = 0; i < n; i++)
44210 ops[i] = XVECEXP (vals, 0, i);
44211 op0 = gen_reg_rtx (quarter_mode);
44212 op1 = gen_reg_rtx (quarter_mode);
44213 op2 = gen_reg_rtx (quarter_mode);
44214 op3 = gen_reg_rtx (quarter_mode);
44215 op4 = gen_reg_rtx (half_mode);
44216 op5 = gen_reg_rtx (half_mode);
44217 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
44218 n >> 3);
44219 ix86_expand_vector_init_interleave (quarter_mode, op1,
44220 &ops [n >> 2], n >> 3);
44221 ix86_expand_vector_init_interleave (quarter_mode, op2,
44222 &ops [n >> 1], n >> 3);
44223 ix86_expand_vector_init_interleave (quarter_mode, op3,
44224 &ops [(n >> 1) | (n >> 2)], n >> 3);
44225 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
44226 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
44227 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
44228 return;
44230 case V16QImode:
44231 if (!TARGET_SSE4_1)
44232 break;
44233 /* FALLTHRU */
44235 case V8HImode:
44236 if (!TARGET_SSE2)
44237 break;
44239 /* Don't use ix86_expand_vector_init_interleave if we can't
44240 move from GPR to SSE register directly. */
44241 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
44242 break;
44244 n = GET_MODE_NUNITS (mode);
44245 for (i = 0; i < n; i++)
44246 ops[i] = XVECEXP (vals, 0, i);
44247 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
44248 return;
44250 case V4HImode:
44251 case V8QImode:
44252 break;
44254 default:
44255 gcc_unreachable ();
44259 int i, j, n_elts, n_words, n_elt_per_word;
44260 machine_mode inner_mode;
44261 rtx words[4], shift;
44263 inner_mode = GET_MODE_INNER (mode);
44264 n_elts = GET_MODE_NUNITS (mode);
44265 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
44266 n_elt_per_word = n_elts / n_words;
44267 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
44269 for (i = 0; i < n_words; ++i)
44271 rtx word = NULL_RTX;
44273 for (j = 0; j < n_elt_per_word; ++j)
44275 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
44276 elt = convert_modes (word_mode, inner_mode, elt, true);
44278 if (j == 0)
44279 word = elt;
44280 else
44282 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
44283 word, 1, OPTAB_LIB_WIDEN);
44284 word = expand_simple_binop (word_mode, IOR, word, elt,
44285 word, 1, OPTAB_LIB_WIDEN);
44289 words[i] = word;
44292 if (n_words == 1)
44293 emit_move_insn (target, gen_lowpart (mode, words[0]));
44294 else if (n_words == 2)
44296 rtx tmp = gen_reg_rtx (mode);
44297 emit_clobber (tmp);
44298 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
44299 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
44300 emit_move_insn (target, tmp);
44302 else if (n_words == 4)
44304 rtx tmp = gen_reg_rtx (V4SImode);
44305 gcc_assert (word_mode == SImode);
44306 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
44307 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
44308 emit_move_insn (target, gen_lowpart (mode, tmp));
44310 else
44311 gcc_unreachable ();
44315 /* Initialize vector TARGET via VALS. Suppress the use of MMX
44316 instructions unless MMX_OK is true. */
44318 void
44319 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
44321 machine_mode mode = GET_MODE (target);
44322 machine_mode inner_mode = GET_MODE_INNER (mode);
44323 int n_elts = GET_MODE_NUNITS (mode);
44324 int n_var = 0, one_var = -1;
44325 bool all_same = true, all_const_zero = true;
44326 int i;
44327 rtx x;
44329 /* First, handle initialization from vector elts, where each element of VALS is itself a vector. */
44330 if (n_elts != XVECLEN (vals, 0))
44332 rtx subtarget = target;
44333 x = XVECEXP (vals, 0, 0);
44334 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
44335 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
44337 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
44338 if (inner_mode == QImode || inner_mode == HImode)
44340 mode = mode_for_vector (SImode,
44341 n_elts * GET_MODE_SIZE (inner_mode) / 4);
44342 inner_mode
44343 = mode_for_vector (SImode,
44344 n_elts * GET_MODE_SIZE (inner_mode) / 8);
44345 ops[0] = gen_lowpart (inner_mode, ops[0]);
44346 ops[1] = gen_lowpart (inner_mode, ops[1]);
44347 subtarget = gen_reg_rtx (mode);
44349 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
44350 if (subtarget != target)
44351 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
44352 return;
44354 gcc_unreachable ();
44357 for (i = 0; i < n_elts; ++i)
44359 x = XVECEXP (vals, 0, i);
44360 if (!(CONST_SCALAR_INT_P (x)
44361 || CONST_DOUBLE_P (x)
44362 || CONST_FIXED_P (x)))
44363 n_var++, one_var = i;
44364 else if (x != CONST0_RTX (inner_mode))
44365 all_const_zero = false;
44366 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
44367 all_same = false;
44370 /* Constants are best loaded from the constant pool. */
44371 if (n_var == 0)
44373 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
44374 return;
44377 /* If all values are identical, broadcast the value. */
44378 if (all_same
44379 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
44380 XVECEXP (vals, 0, 0)))
44381 return;
44383 /* Values where only one field is non-constant are best loaded from
44384 the pool and overwritten via move later. */
44385 if (n_var == 1)
44387 if (all_const_zero
44388 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
44389 XVECEXP (vals, 0, one_var),
44390 one_var))
44391 return;
44393 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
44394 return;
44397 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
44400 void
44401 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
44403 machine_mode mode = GET_MODE (target);
44404 machine_mode inner_mode = GET_MODE_INNER (mode);
44405 machine_mode half_mode;
44406 bool use_vec_merge = false;
44407 rtx tmp;
44408 static rtx (*gen_extract[6][2]) (rtx, rtx)
44410 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
44411 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
44412 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
44413 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
44414 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
44415 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
44417 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
44419 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
44420 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
44421 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
44422 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
44423 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
44424 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
44426 int i, j, n;
44427 machine_mode mmode = VOIDmode;
44428 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
44430 switch (mode)
44432 case V2SFmode:
44433 case V2SImode:
44434 if (mmx_ok)
44436 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44437 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
44438 if (elt == 0)
44439 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44440 else
44441 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44442 emit_insn (gen_rtx_SET (target, tmp));
44443 return;
44445 break;
44447 case V2DImode:
44448 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
44449 if (use_vec_merge)
44450 break;
44452 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44453 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
44454 if (elt == 0)
44455 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44456 else
44457 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44458 emit_insn (gen_rtx_SET (target, tmp));
44459 return;
44461 case V2DFmode:
44463 rtx op0, op1;
44465 /* For the two element vectors, we implement a VEC_CONCAT with
44466 the extraction of the other element. */
44468 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
44469 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
44471 if (elt == 0)
44472 op0 = val, op1 = tmp;
44473 else
44474 op0 = tmp, op1 = val;
44476 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
44477 emit_insn (gen_rtx_SET (target, tmp));
44479 return;
44481 case V4SFmode:
44482 use_vec_merge = TARGET_SSE4_1;
44483 if (use_vec_merge)
44484 break;
44486 switch (elt)
44488 case 0:
44489 use_vec_merge = true;
44490 break;
44492 case 1:
44493 /* tmp = target = A B C D */
44494 tmp = copy_to_reg (target);
44495 /* target = A A B B */
44496 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
44497 /* target = X A B B */
44498 ix86_expand_vector_set (false, target, val, 0);
44499 /* target = A X C D */
44500 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44501 const1_rtx, const0_rtx,
44502 GEN_INT (2+4), GEN_INT (3+4)));
44503 return;
44505 case 2:
44506 /* tmp = target = A B C D */
44507 tmp = copy_to_reg (target);
44508 /* tmp = X B C D */
44509 ix86_expand_vector_set (false, tmp, val, 0);
44510 /* target = A B X D */
44511 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44512 const0_rtx, const1_rtx,
44513 GEN_INT (0+4), GEN_INT (3+4)));
44514 return;
44516 case 3:
44517 /* tmp = target = A B C D */
44518 tmp = copy_to_reg (target);
44519 /* tmp = X B C D */
44520 ix86_expand_vector_set (false, tmp, val, 0);
44521 /* target = A B C X */
44522 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44523 const0_rtx, const1_rtx,
44524 GEN_INT (2+4), GEN_INT (0+4)));
44525 return;
44527 default:
44528 gcc_unreachable ();
44530 break;
44532 case V4SImode:
44533 use_vec_merge = TARGET_SSE4_1;
44534 if (use_vec_merge)
44535 break;
44537 /* Element 0 handled by vec_merge below. */
44538 if (elt == 0)
44540 use_vec_merge = true;
44541 break;
44544 if (TARGET_SSE2)
44546 /* With SSE2, use integer shuffles to swap element 0 and ELT,
44547 store into element 0, then shuffle them back. */
44549 rtx order[4];
44551 order[0] = GEN_INT (elt);
44552 order[1] = const1_rtx;
44553 order[2] = const2_rtx;
44554 order[3] = GEN_INT (3);
44555 order[elt] = const0_rtx;
44557 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44558 order[1], order[2], order[3]));
44560 ix86_expand_vector_set (false, target, val, 0);
44562 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44563 order[1], order[2], order[3]));
44565 else
44567 /* For SSE1, we have to reuse the V4SF code. */
44568 rtx t = gen_reg_rtx (V4SFmode);
44569 emit_move_insn (t, gen_lowpart (V4SFmode, target));
44570 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
44571 emit_move_insn (target, gen_lowpart (mode, t));
44573 return;
44575 case V8HImode:
44576 use_vec_merge = TARGET_SSE2;
44577 break;
44578 case V4HImode:
44579 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44580 break;
44582 case V16QImode:
44583 use_vec_merge = TARGET_SSE4_1;
44584 break;
44586 case V8QImode:
44587 break;
44589 case V32QImode:
44590 half_mode = V16QImode;
44591 j = 0;
44592 n = 16;
44593 goto half;
44595 case V16HImode:
44596 half_mode = V8HImode;
44597 j = 1;
44598 n = 8;
44599 goto half;
44601 case V8SImode:
44602 half_mode = V4SImode;
44603 j = 2;
44604 n = 4;
44605 goto half;
44607 case V4DImode:
44608 half_mode = V2DImode;
44609 j = 3;
44610 n = 2;
44611 goto half;
44613 case V8SFmode:
44614 half_mode = V4SFmode;
44615 j = 4;
44616 n = 4;
44617 goto half;
44619 case V4DFmode:
44620 half_mode = V2DFmode;
44621 j = 5;
44622 n = 2;
44623 goto half;
44625 half:
44626 /* Compute offset. */
44627 i = elt / n;
44628 elt %= n;
44630 gcc_assert (i <= 1);
44632 /* Extract the half. */
44633 tmp = gen_reg_rtx (half_mode);
44634 emit_insn (gen_extract[j][i] (tmp, target));
44636 /* Put val in tmp at elt. */
44637 ix86_expand_vector_set (false, tmp, val, elt);
44639 /* Put it back. */
44640 emit_insn (gen_insert[j][i] (target, target, tmp));
44641 return;
44643 case V8DFmode:
44644 if (TARGET_AVX512F)
44646 mmode = QImode;
44647 gen_blendm = gen_avx512f_blendmv8df;
44649 break;
44651 case V8DImode:
44652 if (TARGET_AVX512F)
44654 mmode = QImode;
44655 gen_blendm = gen_avx512f_blendmv8di;
44657 break;
44659 case V16SFmode:
44660 if (TARGET_AVX512F)
44662 mmode = HImode;
44663 gen_blendm = gen_avx512f_blendmv16sf;
44665 break;
44667 case V16SImode:
44668 if (TARGET_AVX512F)
44670 mmode = HImode;
44671 gen_blendm = gen_avx512f_blendmv16si;
44673 break;
44675 case V32HImode:
44676 if (TARGET_AVX512F && TARGET_AVX512BW)
44678 mmode = SImode;
44679 gen_blendm = gen_avx512bw_blendmv32hi;
44681 break;
44683 case V64QImode:
44684 if (TARGET_AVX512F && TARGET_AVX512BW)
44686 mmode = DImode;
44687 gen_blendm = gen_avx512bw_blendmv64qi;
44689 break;
44691 default:
44692 break;
44695 if (mmode != VOIDmode)
44697 tmp = gen_reg_rtx (mode);
44698 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
44699 /* The avx512*_blendm<mode> expanders have different operand order
44700 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
44701 elements where the mask is set and second input operand otherwise,
44702 in {sse,avx}*_*blend* the first input operand is used for elements
44703 where the mask is clear and second input operand otherwise. */
44704 emit_insn (gen_blendm (target, target, tmp,
44705 force_reg (mmode,
44706 gen_int_mode (1 << elt, mmode))));
44708 else if (use_vec_merge)
44710 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
44711 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
44712 emit_insn (gen_rtx_SET (target, tmp));
44714 else
44716 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44718 emit_move_insn (mem, target);
44720 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44721 emit_move_insn (tmp, val);
44723 emit_move_insn (target, mem);
44727 void
44728 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
44730 machine_mode mode = GET_MODE (vec);
44731 machine_mode inner_mode = GET_MODE_INNER (mode);
44732 bool use_vec_extr = false;
44733 rtx tmp;
44735 switch (mode)
44737 case V2SImode:
44738 case V2SFmode:
44739 if (!mmx_ok)
44740 break;
44741 /* FALLTHRU */
44743 case V2DFmode:
44744 case V2DImode:
44745 case V2TImode:
44746 case V4TImode:
44747 use_vec_extr = true;
44748 break;
44750 case V4SFmode:
44751 use_vec_extr = TARGET_SSE4_1;
44752 if (use_vec_extr)
44753 break;
44755 switch (elt)
44757 case 0:
44758 tmp = vec;
44759 break;
44761 case 1:
44762 case 3:
44763 tmp = gen_reg_rtx (mode);
44764 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
44765 GEN_INT (elt), GEN_INT (elt),
44766 GEN_INT (elt+4), GEN_INT (elt+4)));
44767 break;
44769 case 2:
44770 tmp = gen_reg_rtx (mode);
44771 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
44772 break;
44774 default:
44775 gcc_unreachable ();
44777 vec = tmp;
44778 use_vec_extr = true;
44779 elt = 0;
44780 break;
44782 case V4SImode:
44783 use_vec_extr = TARGET_SSE4_1;
44784 if (use_vec_extr)
44785 break;
44787 if (TARGET_SSE2)
44789 switch (elt)
44791 case 0:
44792 tmp = vec;
44793 break;
44795 case 1:
44796 case 3:
44797 tmp = gen_reg_rtx (mode);
44798 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
44799 GEN_INT (elt), GEN_INT (elt),
44800 GEN_INT (elt), GEN_INT (elt)));
44801 break;
44803 case 2:
44804 tmp = gen_reg_rtx (mode);
44805 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
44806 break;
44808 default:
44809 gcc_unreachable ();
44811 vec = tmp;
44812 use_vec_extr = true;
44813 elt = 0;
44815 else
44817 /* For SSE1, we have to reuse the V4SF code. */
44818 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
44819 gen_lowpart (V4SFmode, vec), elt);
44820 return;
44822 break;
44824 case V8HImode:
44825 use_vec_extr = TARGET_SSE2;
44826 break;
44827 case V4HImode:
44828 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44829 break;
44831 case V16QImode:
44832 use_vec_extr = TARGET_SSE4_1;
44833 break;
44835 case V8SFmode:
44836 if (TARGET_AVX)
44838 tmp = gen_reg_rtx (V4SFmode);
44839 if (elt < 4)
44840 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
44841 else
44842 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
44843 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44844 return;
44846 break;
44848 case V4DFmode:
44849 if (TARGET_AVX)
44851 tmp = gen_reg_rtx (V2DFmode);
44852 if (elt < 2)
44853 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44854 else
44855 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44856 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44857 return;
44859 break;
44861 case V32QImode:
44862 if (TARGET_AVX)
44864 tmp = gen_reg_rtx (V16QImode);
44865 if (elt < 16)
44866 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44867 else
44868 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
44869 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44870 return;
44872 break;
44874 case V16HImode:
44875 if (TARGET_AVX)
44877 tmp = gen_reg_rtx (V8HImode);
44878 if (elt < 8)
44879 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
44880 else
44881 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
44882 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44883 return;
44885 break;
44887 case V8SImode:
44888 if (TARGET_AVX)
44890 tmp = gen_reg_rtx (V4SImode);
44891 if (elt < 4)
44892 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
44893 else
44894 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
44895 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44896 return;
44898 break;
44900 case V4DImode:
44901 if (TARGET_AVX)
44903 tmp = gen_reg_rtx (V2DImode);
44904 if (elt < 2)
44905 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
44906 else
44907 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
44908 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44909 return;
44911 break;
44913 case V32HImode:
44914 if (TARGET_AVX512BW)
44916 tmp = gen_reg_rtx (V16HImode);
44917 if (elt < 16)
44918 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
44919 else
44920 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
44921 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44922 return;
44924 break;
44926 case V64QImode:
44927 if (TARGET_AVX512BW)
44929 tmp = gen_reg_rtx (V32QImode);
44930 if (elt < 32)
44931 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
44932 else
44933 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
44934 ix86_expand_vector_extract (false, target, tmp, elt & 31);
44935 return;
44937 break;
44939 case V16SFmode:
44940 tmp = gen_reg_rtx (V8SFmode);
44941 if (elt < 8)
44942 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
44943 else
44944 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
44945 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44946 return;
44948 case V8DFmode:
44949 tmp = gen_reg_rtx (V4DFmode);
44950 if (elt < 4)
44951 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44952 else
44953 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44954 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44955 return;
44957 case V16SImode:
44958 tmp = gen_reg_rtx (V8SImode);
44959 if (elt < 8)
44960 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44961 else
44962 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44963 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44964 return;
44966 case V8DImode:
44967 tmp = gen_reg_rtx (V4DImode);
44968 if (elt < 4)
44969 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44970 else
44971 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44972 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44973 return;
44975 case V8QImode:
44976 /* ??? Could extract the appropriate HImode element and shift. */
44977 default:
44978 break;
44981 if (use_vec_extr)
44983 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44984 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44986 /* Let the rtl optimizers know about the zero extension performed. */
44987 if (inner_mode == QImode || inner_mode == HImode)
44989 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44990 target = gen_lowpart (SImode, target);
44993 emit_insn (gen_rtx_SET (target, tmp));
44995 else
44997 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44999 emit_move_insn (mem, vec);
45001 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
45002 emit_move_insn (target, tmp);
45006 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
45007 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
45008 The upper bits of DEST are undefined, though they shouldn't cause
45009 exceptions (some bits from src or all zeros are ok). */
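/* For example, with a V4SFmode SRC, i == 128 uses movhlps to copy the two
   upper floats into the low half of DEST, while i == 64 uses a shufps to
   move element 1 down to element 0.  */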
45011 static void
45012 emit_reduc_half (rtx dest, rtx src, int i)
45014 rtx tem, d = dest;
45015 switch (GET_MODE (src))
45017 case V4SFmode:
45018 if (i == 128)
45019 tem = gen_sse_movhlps (dest, src, src);
45020 else
45021 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
45022 GEN_INT (1 + 4), GEN_INT (1 + 4));
45023 break;
45024 case V2DFmode:
45025 tem = gen_vec_interleave_highv2df (dest, src, src);
45026 break;
45027 case V16QImode:
45028 case V8HImode:
45029 case V4SImode:
45030 case V2DImode:
45031 d = gen_reg_rtx (V1TImode);
45032 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
45033 GEN_INT (i / 2));
45034 break;
45035 case V8SFmode:
45036 if (i == 256)
45037 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
45038 else
45039 tem = gen_avx_shufps256 (dest, src, src,
45040 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
45041 break;
45042 case V4DFmode:
45043 if (i == 256)
45044 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
45045 else
45046 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
45047 break;
45048 case V32QImode:
45049 case V16HImode:
45050 case V8SImode:
45051 case V4DImode:
45052 if (i == 256)
45054 if (GET_MODE (dest) != V4DImode)
45055 d = gen_reg_rtx (V4DImode);
45056 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
45057 gen_lowpart (V4DImode, src),
45058 const1_rtx);
45060 else
45062 d = gen_reg_rtx (V2TImode);
45063 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
45064 GEN_INT (i / 2));
45066 break;
45067 case V64QImode:
45068 case V32HImode:
45069 case V16SImode:
45070 case V16SFmode:
45071 case V8DImode:
45072 case V8DFmode:
45073 if (i > 128)
45074 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
45075 gen_lowpart (V16SImode, src),
45076 gen_lowpart (V16SImode, src),
45077 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
45078 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
45079 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
45080 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
45081 GEN_INT (0xC), GEN_INT (0xD),
45082 GEN_INT (0xE), GEN_INT (0xF),
45083 GEN_INT (0x10), GEN_INT (0x11),
45084 GEN_INT (0x12), GEN_INT (0x13),
45085 GEN_INT (0x14), GEN_INT (0x15),
45086 GEN_INT (0x16), GEN_INT (0x17));
45087 else
45088 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
45089 gen_lowpart (V16SImode, src),
45090 GEN_INT (i == 128 ? 0x2 : 0x1),
45091 GEN_INT (0x3),
45092 GEN_INT (0x3),
45093 GEN_INT (0x3),
45094 GEN_INT (i == 128 ? 0x6 : 0x5),
45095 GEN_INT (0x7),
45096 GEN_INT (0x7),
45097 GEN_INT (0x7),
45098 GEN_INT (i == 128 ? 0xA : 0x9),
45099 GEN_INT (0xB),
45100 GEN_INT (0xB),
45101 GEN_INT (0xB),
45102 GEN_INT (i == 128 ? 0xE : 0xD),
45103 GEN_INT (0xF),
45104 GEN_INT (0xF),
45105 GEN_INT (0xF));
45106 break;
45107 default:
45108 gcc_unreachable ();
45110 emit_insn (tem);
45111 if (d != dest)
45112 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
45115 /* Expand a vector reduction. FN is the binary pattern to reduce;
45116 DEST is the destination; IN is the input vector. */
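/* The loop below repeatedly folds the upper half of the live part of the
   vector onto its lower half with FN; the scalar result ends up in
   element 0 of DEST and the remaining lanes are don't-cares.  */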
45118 void
45119 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
45121 rtx half, dst, vec = in;
45122 machine_mode mode = GET_MODE (in);
45123 int i;
45125 /* SSE4.1 has a special instruction (phminposuw) for V8HImode UMIN reduction. */
45126 if (TARGET_SSE4_1
45127 && mode == V8HImode
45128 && fn == gen_uminv8hi3)
45130 emit_insn (gen_sse4_1_phminposuw (dest, in));
45131 return;
45134 for (i = GET_MODE_BITSIZE (mode);
45135 i > GET_MODE_UNIT_BITSIZE (mode);
45136 i >>= 1)
45138 half = gen_reg_rtx (mode);
45139 emit_reduc_half (half, vec, i);
45140 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
45141 dst = dest;
45142 else
45143 dst = gen_reg_rtx (mode);
45144 emit_insn (fn (dst, half, vec));
45145 vec = dst;
45149 /* Target hook for scalar_mode_supported_p. */
45150 static bool
45151 ix86_scalar_mode_supported_p (machine_mode mode)
45153 if (DECIMAL_FLOAT_MODE_P (mode))
45154 return default_decimal_float_supported_p ();
45155 else if (mode == TFmode)
45156 return true;
45157 else
45158 return default_scalar_mode_supported_p (mode);
45161 /* Implements target hook vector_mode_supported_p. */
45162 static bool
45163 ix86_vector_mode_supported_p (machine_mode mode)
45165 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
45166 return true;
45167 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
45168 return true;
45169 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
45170 return true;
45171 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
45172 return true;
45173 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
45174 return true;
45175 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
45176 return true;
45177 return false;
45180 /* Target hook for c_mode_for_suffix. */
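/* On x86, floating constants with a 'q'/'Q' suffix have type __float128
   (TFmode) and constants with a 'w'/'W' suffix have type __float80
   (XFmode).  */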
45181 static machine_mode
45182 ix86_c_mode_for_suffix (char suffix)
45184 if (suffix == 'q')
45185 return TFmode;
45186 if (suffix == 'w')
45187 return XFmode;
45189 return VOIDmode;
45192 /* Worker function for TARGET_MD_ASM_ADJUST.
45194 We implement asm flag outputs, and maintain source compatibility
45195 with the old cc0-based compiler. */
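/* A sketch of the flag-output syntax this implements (illustrative, not
   from this file):

     bool carry;
     asm ("addq %2, %0" : "+r" (x), "=@ccc" (carry) : "r" (y));

   The "=@ccc" output asks for the carry flag; the code below rewrites it
   into a read of the flags register followed by a setcc-style extraction
   into the C variable.  */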
45197 static rtx_insn *
45198 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
45199 vec<const char *> &constraints,
45200 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
45202 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
45203 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
45205 bool saw_asm_flag = false;
45207 start_sequence ();
45208 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
45210 const char *con = constraints[i];
45211 if (strncmp (con, "=@cc", 4) != 0)
45212 continue;
45213 con += 4;
45214 if (strchr (con, ',') != NULL)
45216 error ("alternatives not allowed in asm flag output");
45217 continue;
45220 bool invert = false;
45221 if (con[0] == 'n')
45222 invert = true, con++;
45224 machine_mode mode = CCmode;
45225 rtx_code code = UNKNOWN;
45227 switch (con[0])
45229 case 'a':
45230 if (con[1] == 0)
45231 mode = CCAmode, code = EQ;
45232 else if (con[1] == 'e' && con[2] == 0)
45233 mode = CCCmode, code = NE;
45234 break;
45235 case 'b':
45236 if (con[1] == 0)
45237 mode = CCCmode, code = EQ;
45238 else if (con[1] == 'e' && con[2] == 0)
45239 mode = CCAmode, code = NE;
45240 break;
45241 case 'c':
45242 if (con[1] == 0)
45243 mode = CCCmode, code = EQ;
45244 break;
45245 case 'e':
45246 if (con[1] == 0)
45247 mode = CCZmode, code = EQ;
45248 break;
45249 case 'g':
45250 if (con[1] == 0)
45251 mode = CCGCmode, code = GT;
45252 else if (con[1] == 'e' && con[2] == 0)
45253 mode = CCGCmode, code = GE;
45254 break;
45255 case 'l':
45256 if (con[1] == 0)
45257 mode = CCGCmode, code = LT;
45258 else if (con[1] == 'e' && con[2] == 0)
45259 mode = CCGCmode, code = LE;
45260 break;
45261 case 'o':
45262 if (con[1] == 0)
45263 mode = CCOmode, code = EQ;
45264 break;
45265 case 'p':
45266 if (con[1] == 0)
45267 mode = CCPmode, code = EQ;
45268 break;
45269 case 's':
45270 if (con[1] == 0)
45271 mode = CCSmode, code = EQ;
45272 break;
45273 case 'z':
45274 if (con[1] == 0)
45275 mode = CCZmode, code = EQ;
45276 break;
45278 if (code == UNKNOWN)
45280 error ("unknown asm flag output %qs", constraints[i]);
45281 continue;
45283 if (invert)
45284 code = reverse_condition (code);
45286 rtx dest = outputs[i];
45287 if (!saw_asm_flag)
45289 /* This is the first asm flag output. Here we put the flags
45290 register in as the real output and adjust the condition to
45291 allow it. */
45292 constraints[i] = "=Bf";
45293 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
45294 saw_asm_flag = true;
45296 else
45298 /* We don't need the flags register as output twice. */
45299 constraints[i] = "=X";
45300 outputs[i] = gen_rtx_SCRATCH (SImode);
45303 rtx x = gen_rtx_REG (mode, FLAGS_REG);
45304 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
45306 machine_mode dest_mode = GET_MODE (dest);
45307 if (!SCALAR_INT_MODE_P (dest_mode))
45309 error ("invalid type for asm flag output");
45310 continue;
45313 if (dest_mode == DImode && !TARGET_64BIT)
45314 dest_mode = SImode;
45316 if (dest_mode != QImode)
45318 rtx destqi = gen_reg_rtx (QImode);
45319 emit_insn (gen_rtx_SET (destqi, x));
45321 if (TARGET_ZERO_EXTEND_WITH_AND
45322 && optimize_function_for_speed_p (cfun))
45324 x = force_reg (dest_mode, const0_rtx);
45326 emit_insn (gen_movstrictqi
45327 (gen_lowpart (QImode, x), destqi));
45329 else
45330 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
45333 if (dest_mode != GET_MODE (dest))
45335 rtx tmp = gen_reg_rtx (SImode);
45337 emit_insn (gen_rtx_SET (tmp, x));
45338 emit_insn (gen_zero_extendsidi2 (dest, tmp));
45340 else
45341 emit_insn (gen_rtx_SET (dest, x));
45343 rtx_insn *seq = get_insns ();
45344 end_sequence ();
45346 if (saw_asm_flag)
45347 return seq;
45348 else
45350 /* If we had no asm flag outputs, clobber the flags. */
45351 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
45352 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
45353 return NULL;
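/* Illustrative sketch (not part of this file; variable names are made up):
   with asm flag outputs a user can write, for example,

     unsigned carry;
     asm ("addl %2, %1" : "=@ccc" (carry), "+r" (sum) : "r" (addend));

   The loop above rewrites the "=@ccc" output into the flags register plus a
   setcc/zero-extend into CARRY; "=@ccz", "=@ccnz" and the other condition
   suffixes map to the CC modes handled in the switch.  */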
45357 /* Implements target vector targetm.asm.encode_section_info. */
45359 static void ATTRIBUTE_UNUSED
45360 ix86_encode_section_info (tree decl, rtx rtl, int first)
45362 default_encode_section_info (decl, rtl, first);
45364 if (ix86_in_large_data_p (decl))
45365 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
45368 /* Worker function for REVERSE_CONDITION. */
45370 enum rtx_code
45371 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
45373 return (mode != CCFPmode && mode != CCFPUmode
45374 ? reverse_condition (code)
45375 : reverse_condition_maybe_unordered (code));
45378 /* Output code to perform an x87 FP register move, from OPERANDS[1]
45379 to OPERANDS[0]. */
45381 const char *
45382 output_387_reg_move (rtx_insn *insn, rtx *operands)
45384 if (REG_P (operands[0]))
45386 if (REG_P (operands[1])
45387 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45389 if (REGNO (operands[0]) == FIRST_STACK_REG)
45390 return output_387_ffreep (operands, 0);
45391 return "fstp\t%y0";
45393 if (STACK_TOP_P (operands[0]))
45394 return "fld%Z1\t%y1";
45395 return "fst\t%y0";
45397 else if (MEM_P (operands[0]))
45399 gcc_assert (REG_P (operands[1]));
45400 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45401 return "fstp%Z0\t%y0";
45402 else
45404 /* There is no non-popping store to memory for XFmode.
45405 So if we need one, follow the store with a load. */
45406 if (GET_MODE (operands[0]) == XFmode)
45407 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
45408 else
45409 return "fst%Z0\t%y0";
45412 else
45413 gcc_unreachable();
45416 /* Output code to perform a conditional jump to LABEL, if C2 flag in
45417 FP status register is set. */
45419 void
45420 ix86_emit_fp_unordered_jump (rtx label)
45422 rtx reg = gen_reg_rtx (HImode);
45423 rtx temp;
45425 emit_insn (gen_x86_fnstsw_1 (reg));
45427 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
45429 emit_insn (gen_x86_sahf_1 (reg));
45431 temp = gen_rtx_REG (CCmode, FLAGS_REG);
45432 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
45434 else
45436 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
45438 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
45439 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
45442 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
45443 gen_rtx_LABEL_REF (VOIDmode, label),
45444 pc_rtx);
45445 temp = gen_rtx_SET (pc_rtx, temp);
45447 emit_jump_insn (temp);
45448 predict_jump (REG_BR_PROB_BASE * 10 / 100);
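/* Note (informal): after "fnstsw %ax; sahf" the x87 condition bits map as
   C0 -> CF, C2 -> PF and C3 -> ZF, so the UNORDERED test above becomes a
   plain jp.  The non-SAHF fallback instead tests C2 directly, which is
   bit 2 (mask 0x04) of the upper status-word byte stored by fnstsw.  */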
45451 /* Output code to perform a log1p XFmode calculation. */
45453 void ix86_emit_i387_log1p (rtx op0, rtx op1)
45455 rtx_code_label *label1 = gen_label_rtx ();
45456 rtx_code_label *label2 = gen_label_rtx ();
45458 rtx tmp = gen_reg_rtx (XFmode);
45459 rtx tmp2 = gen_reg_rtx (XFmode);
45460 rtx test;
45462 emit_insn (gen_absxf2 (tmp, op1));
45463 test = gen_rtx_GE (VOIDmode, tmp,
45464 const_double_from_real_value (
45465 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
45466 XFmode));
45467 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
45469 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45470 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
45471 emit_jump (label2);
45473 emit_label (label1);
45474 emit_move_insn (tmp, CONST1_RTX (XFmode));
45475 emit_insn (gen_addxf3 (tmp, op1, tmp));
45476 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45477 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
45479 emit_label (label2);
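/* Sketch of the math used above (informal): log1p(x) = ln(2) * log2(1 + x).
   fyl2xp1 computes y * log2(x + 1) but is only specified for small
   arguments (roughly |x| < 1 - sqrt(2)/2 ~= 0.2929), so that range uses
   fyl2xp1 directly, while larger |x| falls back to fyl2x on the explicitly
   formed 1 + x.  In both cases y is ln(2), loaded via fldln2.  */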
45482 /* Emit code for round calculation. */
45483 void ix86_emit_i387_round (rtx op0, rtx op1)
45485 machine_mode inmode = GET_MODE (op1);
45486 machine_mode outmode = GET_MODE (op0);
45487 rtx e1, e2, res, tmp, tmp1, half;
45488 rtx scratch = gen_reg_rtx (HImode);
45489 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
45490 rtx_code_label *jump_label = gen_label_rtx ();
45491 rtx insn;
45492 rtx (*gen_abs) (rtx, rtx);
45493 rtx (*gen_neg) (rtx, rtx);
45495 switch (inmode)
45497 case SFmode:
45498 gen_abs = gen_abssf2;
45499 break;
45500 case DFmode:
45501 gen_abs = gen_absdf2;
45502 break;
45503 case XFmode:
45504 gen_abs = gen_absxf2;
45505 break;
45506 default:
45507 gcc_unreachable ();
45510 switch (outmode)
45512 case SFmode:
45513 gen_neg = gen_negsf2;
45514 break;
45515 case DFmode:
45516 gen_neg = gen_negdf2;
45517 break;
45518 case XFmode:
45519 gen_neg = gen_negxf2;
45520 break;
45521 case HImode:
45522 gen_neg = gen_neghi2;
45523 break;
45524 case SImode:
45525 gen_neg = gen_negsi2;
45526 break;
45527 case DImode:
45528 gen_neg = gen_negdi2;
45529 break;
45530 default:
45531 gcc_unreachable ();
45534 e1 = gen_reg_rtx (inmode);
45535 e2 = gen_reg_rtx (inmode);
45536 res = gen_reg_rtx (outmode);
45538 half = const_double_from_real_value (dconsthalf, inmode);
45540 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
45542 /* scratch = fxam(op1) */
45543 emit_insn (gen_rtx_SET (scratch,
45544 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
45545 UNSPEC_FXAM)));
45546 /* e1 = fabs(op1) */
45547 emit_insn (gen_abs (e1, op1));
45549 /* e2 = e1 + 0.5 */
45550 half = force_reg (inmode, half);
45551 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
45553 /* res = floor(e2) */
45554 if (inmode != XFmode)
45556 tmp1 = gen_reg_rtx (XFmode);
45558 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
45560 else
45561 tmp1 = e2;
45563 switch (outmode)
45565 case SFmode:
45566 case DFmode:
45568 rtx tmp0 = gen_reg_rtx (XFmode);
45570 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
45572 emit_insn (gen_rtx_SET (res,
45573 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
45574 UNSPEC_TRUNC_NOOP)));
45576 break;
45577 case XFmode:
45578 emit_insn (gen_frndintxf2_floor (res, tmp1));
45579 break;
45580 case HImode:
45581 emit_insn (gen_lfloorxfhi2 (res, tmp1));
45582 break;
45583 case SImode:
45584 emit_insn (gen_lfloorxfsi2 (res, tmp1));
45585 break;
45586 case DImode:
45587 emit_insn (gen_lfloorxfdi2 (res, tmp1));
45588 break;
45589 default:
45590 gcc_unreachable ();
45593 /* flags = signbit(a) */
45594 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
45596 /* if (flags) then res = -res */
45597 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
45598 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
45599 gen_rtx_LABEL_REF (VOIDmode, jump_label),
45600 pc_rtx);
45601 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45602 predict_jump (REG_BR_PROB_BASE * 50 / 100);
45603 JUMP_LABEL (insn) = jump_label;
45605 emit_insn (gen_neg (res, res));
45607 emit_label (jump_label);
45608 LABEL_NUSES (jump_label) = 1;
45610 emit_move_insn (op0, res);
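/* Worked example (illustrative): for op1 = -2.5 the sequence computes
   e1 = fabs(-2.5) = 2.5, e2 = 3.0 and res = floor(3.0) = 3; the fxam sign
   bit (C1, mask 0x02 of the upper status byte) then selects the negation,
   giving -3, i.e. round-half-away-from-zero.  */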
45613 /* Output code to perform a Newton-Raphson approximation of a single precision
45614 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
45616 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
45618 rtx x0, x1, e0, e1;
45620 x0 = gen_reg_rtx (mode);
45621 e0 = gen_reg_rtx (mode);
45622 e1 = gen_reg_rtx (mode);
45623 x1 = gen_reg_rtx (mode);
45625 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
45627 b = force_reg (mode, b);
45629 /* x0 = rcp(b) estimate */
45630 if (mode == V16SFmode || mode == V8DFmode)
45632 if (TARGET_AVX512ER)
45634 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45635 UNSPEC_RCP28)));
45636 /* res = a * x0 */
45637 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
45638 return;
45640 else
45641 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45642 UNSPEC_RCP14)));
45644 else
45645 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45646 UNSPEC_RCP)));
45648 /* e0 = x0 * b */
45649 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
45651 /* e0 = x0 * e0 */
45652 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
45654 /* e1 = x0 + x0 */
45655 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
45657 /* x1 = e1 - e0 */
45658 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
45660 /* res = a * x1 */
45661 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
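/* Derivation (informal): one Newton-Raphson step for f(x) = 1/x - b refines
   an estimate x0 of 1/b as

     x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0)

   which is exactly the e1 - e0 computed above; the quotient is then
   approximated as a * x1.  */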
45664 /* Output code to perform a Newton-Raphson approximation of a
45665 single precision floating point [reciprocal] square root. */
45667 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
45669 rtx x0, e0, e1, e2, e3, mthree, mhalf;
45670 REAL_VALUE_TYPE r;
45671 int unspec;
45673 x0 = gen_reg_rtx (mode);
45674 e0 = gen_reg_rtx (mode);
45675 e1 = gen_reg_rtx (mode);
45676 e2 = gen_reg_rtx (mode);
45677 e3 = gen_reg_rtx (mode);
45679 if (TARGET_AVX512ER && mode == V16SFmode)
45681 if (recip)
45682 /* res = rsqrt28(a) estimate */
45683 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45684 UNSPEC_RSQRT28)));
45685 else
45687 /* x0 = rsqrt28(a) estimate */
45688 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45689 UNSPEC_RSQRT28)));
45690 /* res = rcp28(x0) estimate */
45691 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
45692 UNSPEC_RCP28)));
45694 return;
45697 real_from_integer (&r, VOIDmode, -3, SIGNED);
45698 mthree = const_double_from_real_value (r, SFmode);
45700 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
45701 mhalf = const_double_from_real_value (r, SFmode);
45702 unspec = UNSPEC_RSQRT;
45704 if (VECTOR_MODE_P (mode))
45706 mthree = ix86_build_const_vector (mode, true, mthree);
45707 mhalf = ix86_build_const_vector (mode, true, mhalf);
45708 /* There is no 512-bit rsqrt. There is however rsqrt14. */
45709 if (GET_MODE_SIZE (mode) == 64)
45710 unspec = UNSPEC_RSQRT14;
45713 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
45714 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
45716 a = force_reg (mode, a);
45718 /* x0 = rsqrt(a) estimate */
45719 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45720 unspec)));
45722 /* If a == 0.0, filter out the infinite rsqrt estimate so that sqrt(0.0) does not produce a NaN. */
45723 if (!recip)
45725 rtx zero = force_reg (mode, CONST0_RTX(mode));
45726 rtx mask;
45728 /* Handle masked compare. */
45729 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
45731 mask = gen_reg_rtx (HImode);
45732 /* Imm value 0x4 corresponds to not-equal comparison. */
45733 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
45734 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
45736 else
45738 mask = gen_reg_rtx (mode);
45739 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
45740 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
45744 /* e0 = x0 * a */
45745 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
45746 /* e1 = e0 * x0 */
45747 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
45749 /* e2 = e1 - 3. */
45750 mthree = force_reg (mode, mthree);
45751 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
45753 mhalf = force_reg (mode, mhalf);
45754 if (recip)
45755 /* e3 = -.5 * x0 */
45756 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
45757 else
45758 /* e3 = -.5 * e0 */
45759 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
45760 /* ret = e2 * e3 */
45761 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
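/* Derivation (informal): one Newton-Raphson step for f(x) = 1/x**2 - a
   refines an estimate x0 of 1/sqrt(a) as

     x1 = x0 * (3 - a * x0 * x0) / 2 = -0.5 * x0 * (a * x0 * x0 - 3)

   which matches the rsqrt formula above; using e0 = a * x0 instead of x0 in
   the final product multiplies the result by a and yields the sqrt variant.  */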
45764 #ifdef TARGET_SOLARIS
45765 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
45767 static void
45768 i386_solaris_elf_named_section (const char *name, unsigned int flags,
45769 tree decl)
45771 /* With Binutils 2.15, the "@unwind" marker must be specified on
45772 every occurrence of the ".eh_frame" section, not just the first
45773 one. */
45774 if (TARGET_64BIT
45775 && strcmp (name, ".eh_frame") == 0)
45777 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
45778 flags & SECTION_WRITE ? "aw" : "a");
45779 return;
45782 #ifndef USE_GAS
45783 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
45785 solaris_elf_asm_comdat_section (name, flags, decl);
45786 return;
45788 #endif
45790 default_elf_asm_named_section (name, flags, decl);
45792 #endif /* TARGET_SOLARIS */
45794 /* Return the mangling of TYPE if it is an extended fundamental type. */
45796 static const char *
45797 ix86_mangle_type (const_tree type)
45799 type = TYPE_MAIN_VARIANT (type);
45801 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
45802 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
45803 return NULL;
45805 switch (TYPE_MODE (type))
45807 case TFmode:
45808 /* __float128 is "g". */
45809 return "g";
45810 case XFmode:
45811 /* "long double" or __float80 is "e". */
45812 return "e";
45813 default:
45814 return NULL;
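/* Example (illustrative): under the Itanium C++ ABI these manglings make
   void f(__float128) come out as _Z1fg and void f(long double) (XFmode)
   as _Z1fe.  */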
45818 #ifdef TARGET_THREAD_SSP_OFFSET
45819 /* If using TLS guards, don't waste time creating and expanding
45820 __stack_chk_guard decl and MEM as we are going to ignore it. */
45821 static tree
45822 ix86_stack_protect_guard (void)
45824 if (TARGET_SSP_TLS_GUARD)
45825 return NULL_TREE;
45826 return default_stack_protect_guard ();
45828 #endif
45830 /* For 32-bit code we can save PIC register setup by using
45831 __stack_chk_fail_local hidden function instead of calling
45832 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
45833 register, so it is better to call __stack_chk_fail directly. */
45835 static tree ATTRIBUTE_UNUSED
45836 ix86_stack_protect_fail (void)
45838 return TARGET_64BIT
45839 ? default_external_stack_protect_fail ()
45840 : default_hidden_stack_protect_fail ();
45843 /* Select a format to encode pointers in exception handling data. CODE
45844 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
45845 true if the symbol may be affected by dynamic relocations.
45847 ??? All x86 object file formats are capable of representing this.
45848 After all, the relocation needed is the same as for the call insn.
45849 Whether or not a particular assembler allows us to enter such, I
45850 guess we'll have to see. */
45852 int asm_preferred_eh_data_format (int code, int global)
45854 if (flag_pic)
45856 int type = DW_EH_PE_sdata8;
45857 if (!TARGET_64BIT
45858 || ix86_cmodel == CM_SMALL_PIC
45859 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
45860 type = DW_EH_PE_sdata4;
45861 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
45863 if (ix86_cmodel == CM_SMALL
45864 || (ix86_cmodel == CM_MEDIUM && code))
45865 return DW_EH_PE_udata4;
45866 return DW_EH_PE_absptr;
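/* Example (informal): for 32-bit PIC code the result is
   DW_EH_PE_pcrel | DW_EH_PE_sdata4 for local references, with
   DW_EH_PE_indirect added for symbols that may be dynamically relocated;
   non-PIC small/medium-model code falls back to DW_EH_PE_udata4 or
   DW_EH_PE_absptr.  */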
45869 /* Expand copysign from SIGN to the positive value ABS_VALUE
45870 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
45871 the sign-bit. */
45872 static void
45873 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
45875 machine_mode mode = GET_MODE (sign);
45876 rtx sgn = gen_reg_rtx (mode);
45877 if (mask == NULL_RTX)
45879 machine_mode vmode;
45881 if (mode == SFmode)
45882 vmode = V4SFmode;
45883 else if (mode == DFmode)
45884 vmode = V2DFmode;
45885 else
45886 vmode = mode;
45888 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
45889 if (!VECTOR_MODE_P (mode))
45891 /* We need to generate a scalar mode mask in this case. */
45892 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45893 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45894 mask = gen_reg_rtx (mode);
45895 emit_insn (gen_rtx_SET (mask, tmp));
45898 else
45899 mask = gen_rtx_NOT (mode, mask);
45900 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
45901 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
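/* In scalar terms (informal sketch) the above computes

     result = abs_value | (sign & SIGNBIT_MASK)

   where SIGNBIT_MASK has only the sign bit set; when a fabs-style mask
   (all bits except the sign) is passed in, it is inverted first to recover
   the sign-bit mask.  */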
45904 /* Expand fabs (OP0) and return a new rtx that holds the result. The
45905 mask for masking out the sign-bit is stored in *SMASK, if that is
45906 non-null. */
45907 static rtx
45908 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45910 machine_mode vmode, mode = GET_MODE (op0);
45911 rtx xa, mask;
45913 xa = gen_reg_rtx (mode);
45914 if (mode == SFmode)
45915 vmode = V4SFmode;
45916 else if (mode == DFmode)
45917 vmode = V2DFmode;
45918 else
45919 vmode = mode;
45920 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45921 if (!VECTOR_MODE_P (mode))
45923 /* We need to generate a scalar mode mask in this case. */
45924 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45925 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45926 mask = gen_reg_rtx (mode);
45927 emit_insn (gen_rtx_SET (mask, tmp));
45929 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45931 if (smask)
45932 *smask = mask;
45934 return xa;
45937 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45938 swapping the operands if SWAP_OPERANDS is true. The expanded
45939 code is a forward jump to a newly created label in case the
45940 comparison is true. The generated label rtx is returned. */
45941 static rtx_code_label *
45942 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45943 bool swap_operands)
45945 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
45946 rtx_code_label *label;
45947 rtx tmp;
45949 if (swap_operands)
45950 std::swap (op0, op1);
45952 label = gen_label_rtx ();
45953 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
45954 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
45955 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
45956 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45957 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45958 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45959 JUMP_LABEL (tmp) = label;
45961 return label;
45964 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
45965 using comparison code CODE. Operands are swapped for the comparison if
45966 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
45967 static rtx
45968 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45969 bool swap_operands)
45971 rtx (*insn)(rtx, rtx, rtx, rtx);
45972 machine_mode mode = GET_MODE (op0);
45973 rtx mask = gen_reg_rtx (mode);
45975 if (swap_operands)
45976 std::swap (op0, op1);
45978 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45980 emit_insn (insn (mask, op0, op1,
45981 gen_rtx_fmt_ee (code, mode, op0, op1)));
45982 return mask;
45985 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45986 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
45987 static rtx
45988 ix86_gen_TWO52 (machine_mode mode)
45990 REAL_VALUE_TYPE TWO52r;
45991 rtx TWO52;
45993 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45994 TWO52 = const_double_from_real_value (TWO52r, mode);
45995 TWO52 = force_reg (mode, TWO52);
45997 return TWO52;
46000 /* Expand SSE sequence for computing lround from OP1 storing
46001 into OP0. */
46002 void
46003 ix86_expand_lround (rtx op0, rtx op1)
46005 /* C code for the stuff we're doing below:
46006 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
46007 return (long)tmp;
46009 machine_mode mode = GET_MODE (op1);
46010 const struct real_format *fmt;
46011 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46012 rtx adj;
46014 /* load nextafter (0.5, 0.0) */
46015 fmt = REAL_MODE_FORMAT (mode);
46016 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46017 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46019 /* adj = copysign (0.5, op1) */
46020 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
46021 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
46023 /* adj = op1 + adj */
46024 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
46026 /* op0 = (imode)adj */
46027 expand_fix (op0, adj, 0);
46030 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
46031 into OPERAND0. */
46032 void
46033 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
46035 /* C code for the stuff we're doing below (for do_floor):
46036 xi = (long)op1;
46037 xi -= (double)xi > op1 ? 1 : 0;
46038 return xi;
46040 machine_mode fmode = GET_MODE (op1);
46041 machine_mode imode = GET_MODE (op0);
46042 rtx ireg, freg, tmp;
46043 rtx_code_label *label;
46045 /* reg = (long)op1 */
46046 ireg = gen_reg_rtx (imode);
46047 expand_fix (ireg, op1, 0);
46049 /* freg = (double)reg */
46050 freg = gen_reg_rtx (fmode);
46051 expand_float (freg, ireg, 0);
46053 /* ireg = (freg > op1) ? ireg - 1 : ireg */
46054 label = ix86_expand_sse_compare_and_jump (UNLE,
46055 freg, op1, !do_floor);
46056 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
46057 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
46058 emit_move_insn (ireg, tmp);
46060 emit_label (label);
46061 LABEL_NUSES (label) = 1;
46063 emit_move_insn (op0, ireg);
46066 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
46067 result in OPERAND0. */
46068 void
46069 ix86_expand_rint (rtx operand0, rtx operand1)
46071 /* C code for the stuff we're doing below:
46072 xa = fabs (operand1);
46073 if (!isless (xa, 2**52))
46074 return operand1;
46075 xa = xa + 2**52 - 2**52;
46076 return copysign (xa, operand1);
46078 machine_mode mode = GET_MODE (operand0);
46079 rtx res, xa, TWO52, mask;
46080 rtx_code_label *label;
46082 res = gen_reg_rtx (mode);
46083 emit_move_insn (res, operand1);
46085 /* xa = abs (operand1) */
46086 xa = ix86_expand_sse_fabs (res, &mask);
46088 /* if (!isless (xa, TWO52)) goto label; */
46089 TWO52 = ix86_gen_TWO52 (mode);
46090 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46092 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46093 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46095 ix86_sse_copysign_to_positive (res, xa, res, mask);
46097 emit_label (label);
46098 LABEL_NUSES (label) = 1;
46100 emit_move_insn (operand0, res);
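/* Why the TWO52 add/subtract used above (and in several expanders below)
   works, informally: for 0 <= xa < 2**52 the sum xa + 2**52 lies in
   [2**52, 2**53), where the spacing between adjacent doubles is 1.0, so the
   addition itself rounds xa to an integer in the current rounding mode;
   subtracting 2**52 again leaves that integer.  Inputs with |x| >= 2**52
   are already integral and take the early exit.  For SFmode the constant is
   2**23, matching the 24-bit significand.  */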
46103 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46104 into OPERAND0. */
46105 void
46106 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
46108 /* C code for the stuff we expand below.
46109 double xa = fabs (x), x2;
46110 if (!isless (xa, TWO52))
46111 return x;
46112 xa = xa + TWO52 - TWO52;
46113 x2 = copysign (xa, x);
46114 Compensate. Floor:
46115 if (x2 > x)
46116 x2 -= 1;
46117 Compensate. Ceil:
46118 if (x2 < x)
46119 x2 -= -1;
46120 return x2;
46122 machine_mode mode = GET_MODE (operand0);
46123 rtx xa, TWO52, tmp, one, res, mask;
46124 rtx_code_label *label;
46126 TWO52 = ix86_gen_TWO52 (mode);
46128 /* Temporary for holding the result, initialized to the input
46129 operand to ease control flow. */
46130 res = gen_reg_rtx (mode);
46131 emit_move_insn (res, operand1);
46133 /* xa = abs (operand1) */
46134 xa = ix86_expand_sse_fabs (res, &mask);
46136 /* if (!isless (xa, TWO52)) goto label; */
46137 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46139 /* xa = xa + TWO52 - TWO52; */
46140 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46141 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46143 /* xa = copysign (xa, operand1) */
46144 ix86_sse_copysign_to_positive (xa, xa, res, mask);
46146 /* generate 1.0 or -1.0 */
46147 one = force_reg (mode,
46148 const_double_from_real_value (do_floor
46149 ? dconst1 : dconstm1, mode));
46151 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46152 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46153 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46154 /* We always need to subtract here to preserve signed zero. */
46155 tmp = expand_simple_binop (mode, MINUS,
46156 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46157 emit_move_insn (res, tmp);
46159 emit_label (label);
46160 LABEL_NUSES (label) = 1;
46162 emit_move_insn (operand0, res);
46165 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46166 into OPERAND0. */
46167 void
46168 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
46170 /* C code for the stuff we expand below.
46171 double xa = fabs (x), x2;
46172 if (!isless (xa, TWO52))
46173 return x;
46174 x2 = (double)(long)x;
46175 Compensate. Floor:
46176 if (x2 > x)
46177 x2 -= 1;
46178 Compensate. Ceil:
46179 if (x2 < x)
46180 x2 += 1;
46181 if (HONOR_SIGNED_ZEROS (mode))
46182 return copysign (x2, x);
46183 return x2;
46185 machine_mode mode = GET_MODE (operand0);
46186 rtx xa, xi, TWO52, tmp, one, res, mask;
46187 rtx_code_label *label;
46189 TWO52 = ix86_gen_TWO52 (mode);
46191 /* Temporary for holding the result, initialized to the input
46192 operand to ease control flow. */
46193 res = gen_reg_rtx (mode);
46194 emit_move_insn (res, operand1);
46196 /* xa = abs (operand1) */
46197 xa = ix86_expand_sse_fabs (res, &mask);
46199 /* if (!isless (xa, TWO52)) goto label; */
46200 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46202 /* xa = (double)(long)x */
46203 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46204 expand_fix (xi, res, 0);
46205 expand_float (xa, xi, 0);
46207 /* generate 1.0 */
46208 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46210 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46211 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46212 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46213 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
46214 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46215 emit_move_insn (res, tmp);
46217 if (HONOR_SIGNED_ZEROS (mode))
46218 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46220 emit_label (label);
46221 LABEL_NUSES (label) = 1;
46223 emit_move_insn (operand0, res);
46226 /* Expand SSE sequence for computing round from OPERAND1 storing
46227 into OPERAND0. Sequence that works without relying on DImode truncation
46228 via cvttsd2siq that is only available on 64bit targets. */
46229 void
46230 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
46232 /* C code for the stuff we expand below.
46233 double xa = fabs (x), xa2, x2;
46234 if (!isless (xa, TWO52))
46235 return x;
46236 Using the absolute value and copying back sign makes
46237 -0.0 -> -0.0 correct.
46238 xa2 = xa + TWO52 - TWO52;
46239 Compensate.
46240 dxa = xa2 - xa;
46241 if (dxa <= -0.5)
46242 xa2 += 1;
46243 else if (dxa > 0.5)
46244 xa2 -= 1;
46245 x2 = copysign (xa2, x);
46246 return x2;
46248 machine_mode mode = GET_MODE (operand0);
46249 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
46250 rtx_code_label *label;
46252 TWO52 = ix86_gen_TWO52 (mode);
46254 /* Temporary for holding the result, initialized to the input
46255 operand to ease control flow. */
46256 res = gen_reg_rtx (mode);
46257 emit_move_insn (res, operand1);
46259 /* xa = abs (operand1) */
46260 xa = ix86_expand_sse_fabs (res, &mask);
46262 /* if (!isless (xa, TWO52)) goto label; */
46263 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46265 /* xa2 = xa + TWO52 - TWO52; */
46266 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46267 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
46269 /* dxa = xa2 - xa; */
46270 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
46272 /* generate 0.5, 1.0 and -0.5 */
46273 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
46274 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
46275 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
46276 0, OPTAB_DIRECT);
46278 /* Compensate. */
46279 tmp = gen_reg_rtx (mode);
46280 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
46281 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
46282 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46283 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46284 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
46285 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
46286 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46287 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46289 /* res = copysign (xa2, operand1) */
46290 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
46292 emit_label (label);
46293 LABEL_NUSES (label) = 1;
46295 emit_move_insn (operand0, res);
46298 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46299 into OPERAND0. */
46300 void
46301 ix86_expand_trunc (rtx operand0, rtx operand1)
46303 /* C code for SSE variant we expand below.
46304 double xa = fabs (x), x2;
46305 if (!isless (xa, TWO52))
46306 return x;
46307 x2 = (double)(long)x;
46308 if (HONOR_SIGNED_ZEROS (mode))
46309 return copysign (x2, x);
46310 return x2;
46312 machine_mode mode = GET_MODE (operand0);
46313 rtx xa, xi, TWO52, res, mask;
46314 rtx_code_label *label;
46316 TWO52 = ix86_gen_TWO52 (mode);
46318 /* Temporary for holding the result, initialized to the input
46319 operand to ease control flow. */
46320 res = gen_reg_rtx (mode);
46321 emit_move_insn (res, operand1);
46323 /* xa = abs (operand1) */
46324 xa = ix86_expand_sse_fabs (res, &mask);
46326 /* if (!isless (xa, TWO52)) goto label; */
46327 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46329 /* x = (double)(long)x */
46330 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46331 expand_fix (xi, res, 0);
46332 expand_float (res, xi, 0);
46334 if (HONOR_SIGNED_ZEROS (mode))
46335 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46337 emit_label (label);
46338 LABEL_NUSES (label) = 1;
46340 emit_move_insn (operand0, res);
46343 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46344 into OPERAND0. */
46345 void
46346 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
46348 machine_mode mode = GET_MODE (operand0);
46349 rtx xa, mask, TWO52, one, res, smask, tmp;
46350 rtx_code_label *label;
46352 /* C code for SSE variant we expand below.
46353 double xa = fabs (x), x2;
46354 if (!isless (xa, TWO52))
46355 return x;
46356 xa2 = xa + TWO52 - TWO52;
46357 Compensate:
46358 if (xa2 > xa)
46359 xa2 -= 1.0;
46360 x2 = copysign (xa2, x);
46361 return x2;
46364 TWO52 = ix86_gen_TWO52 (mode);
46366 /* Temporary for holding the result, initialized to the input
46367 operand to ease control flow. */
46368 res = gen_reg_rtx (mode);
46369 emit_move_insn (res, operand1);
46371 /* xa = abs (operand1) */
46372 xa = ix86_expand_sse_fabs (res, &smask);
46374 /* if (!isless (xa, TWO52)) goto label; */
46375 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46377 /* res = xa + TWO52 - TWO52; */
46378 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46379 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
46380 emit_move_insn (res, tmp);
46382 /* generate 1.0 */
46383 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46385 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
46386 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
46387 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
46388 tmp = expand_simple_binop (mode, MINUS,
46389 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
46390 emit_move_insn (res, tmp);
46392 /* res = copysign (res, operand1) */
46393 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
46395 emit_label (label);
46396 LABEL_NUSES (label) = 1;
46398 emit_move_insn (operand0, res);
46401 /* Expand SSE sequence for computing round from OPERAND1 storing
46402 into OPERAND0. */
46403 void
46404 ix86_expand_round (rtx operand0, rtx operand1)
46406 /* C code for the stuff we're doing below:
46407 double xa = fabs (x);
46408 if (!isless (xa, TWO52))
46409 return x;
46410 xa = (double)(long)(xa + nextafter (0.5, 0.0));
46411 return copysign (xa, x);
46413 machine_mode mode = GET_MODE (operand0);
46414 rtx res, TWO52, xa, xi, half, mask;
46415 rtx_code_label *label;
46416 const struct real_format *fmt;
46417 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46419 /* Temporary for holding the result, initialized to the input
46420 operand to ease control flow. */
46421 res = gen_reg_rtx (mode);
46422 emit_move_insn (res, operand1);
46424 TWO52 = ix86_gen_TWO52 (mode);
46425 xa = ix86_expand_sse_fabs (res, &mask);
46426 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46428 /* load nextafter (0.5, 0.0) */
46429 fmt = REAL_MODE_FORMAT (mode);
46430 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46431 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46433 /* xa = xa + 0.5 */
46434 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
46435 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
46437 /* xa = (double)(int64_t)xa */
46438 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46439 expand_fix (xi, xa, 0);
46440 expand_float (xa, xi, 0);
46442 /* res = copysign (xa, operand1) */
46443 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
46445 emit_label (label);
46446 LABEL_NUSES (label) = 1;
46448 emit_move_insn (operand0, res);
46451 /* Expand SSE sequence for computing round
46452 from OP1 storing into OP0 using sse4 round insn. */
46453 void
46454 ix86_expand_round_sse4 (rtx op0, rtx op1)
46456 machine_mode mode = GET_MODE (op0);
46457 rtx e1, e2, res, half;
46458 const struct real_format *fmt;
46459 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46460 rtx (*gen_copysign) (rtx, rtx, rtx);
46461 rtx (*gen_round) (rtx, rtx, rtx);
46463 switch (mode)
46465 case SFmode:
46466 gen_copysign = gen_copysignsf3;
46467 gen_round = gen_sse4_1_roundsf2;
46468 break;
46469 case DFmode:
46470 gen_copysign = gen_copysigndf3;
46471 gen_round = gen_sse4_1_rounddf2;
46472 break;
46473 default:
46474 gcc_unreachable ();
46477 /* round (a) = trunc (a + copysign (0.5, a)) */
46479 /* load nextafter (0.5, 0.0) */
46480 fmt = REAL_MODE_FORMAT (mode);
46481 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46482 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46483 half = const_double_from_real_value (pred_half, mode);
46485 /* e1 = copysign (0.5, op1) */
46486 e1 = gen_reg_rtx (mode);
46487 emit_insn (gen_copysign (e1, half, op1));
46489 /* e2 = op1 + e1 */
46490 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
46492 /* res = trunc (e2) */
46493 res = gen_reg_rtx (mode);
46494 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
46496 emit_move_insn (op0, res);
46500 /* Table of valid machine attributes. */
46501 static const struct attribute_spec ix86_attribute_table[] =
46503 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
46504 affects_type_identity } */
46505 /* Stdcall attribute says callee is responsible for popping arguments
46506 if they are not variable. */
46507 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46508 true },
46509 /* Fastcall attribute says callee is responsible for popping arguments
46510 if they are not variable. */
46511 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46512 true },
46513 /* Thiscall attribute says callee is responsible for popping arguments
46514 if they are not variable. */
46515 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46516 true },
46517 /* Cdecl attribute says the callee is a normal C declaration */
46518 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46519 true },
46520 /* Regparm attribute specifies how many integer arguments are to be
46521 passed in registers. */
46522 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
46523 true },
46524 /* Sseregparm attribute says we are using x86_64 calling conventions
46525 for FP arguments. */
46526 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46527 true },
46528 /* The transactional memory builtins are implicitly regparm or fastcall
46529 depending on the ABI. Override the generic do-nothing attribute that
46530 these builtins were declared with. */
46531 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
46532 true },
46533 /* force_align_arg_pointer says this function realigns the stack at entry. */
46534 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
46535 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
46536 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
46537 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
46538 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
46539 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
46540 false },
46541 #endif
46542 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46543 false },
46544 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46545 false },
46546 #ifdef SUBTARGET_ATTRIBUTE_TABLE
46547 SUBTARGET_ATTRIBUTE_TABLE,
46548 #endif
46549 /* ms_abi and sysv_abi calling convention function attributes. */
46550 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46551 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46552 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
46553 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
46554 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
46555 false },
46556 { "callee_pop_aggregate_return", 1, 1, false, true, true,
46557 ix86_handle_callee_pop_aggregate_return, true },
46558 { "interrupt", 0, 0, false, true, true,
46559 ix86_handle_interrupt_attribute, false },
46560 { "no_caller_saved_registers", 0, 0, false, true, true,
46561 ix86_handle_no_caller_saved_registers_attribute, false },
46562 { "naked", 0, 0, true, false, false,
46563 ix86_handle_fndecl_attribute, false },
46565 /* End element. */
46566 { NULL, 0, 0, false, false, false, NULL, false }
46569 /* Implement targetm.vectorize.builtin_vectorization_cost. */
46570 static int
46571 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
46572 tree vectype, int)
46574 switch (type_of_cost)
46576 case scalar_stmt:
46577 return ix86_cost->scalar_stmt_cost;
46579 case scalar_load:
46580 return ix86_cost->scalar_load_cost;
46582 case scalar_store:
46583 return ix86_cost->scalar_store_cost;
46585 case vector_stmt:
46586 return ix86_cost->vec_stmt_cost;
46588 case vector_load:
46589 return ix86_cost->vec_align_load_cost;
46591 case vector_store:
46592 return ix86_cost->vec_store_cost;
46594 case vec_to_scalar:
46595 return ix86_cost->vec_to_scalar_cost;
46597 case scalar_to_vec:
46598 return ix86_cost->scalar_to_vec_cost;
46600 case unaligned_load:
46601 case unaligned_store:
46602 return ix86_cost->vec_unalign_load_cost;
46604 case cond_branch_taken:
46605 return ix86_cost->cond_taken_branch_cost;
46607 case cond_branch_not_taken:
46608 return ix86_cost->cond_not_taken_branch_cost;
46610 case vec_perm:
46611 case vec_promote_demote:
46612 return ix86_cost->vec_stmt_cost;
46614 case vec_construct:
46615 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
46617 default:
46618 gcc_unreachable ();
46622 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
46623 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
46624 insn every time. */
46626 static GTY(()) rtx_insn *vselect_insn;
46628 /* Initialize vselect_insn. */
46630 static void
46631 init_vselect_insn (void)
46633 unsigned i;
46634 rtx x;
46636 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
46637 for (i = 0; i < MAX_VECT_LEN; ++i)
46638 XVECEXP (x, 0, i) = const0_rtx;
46639 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
46640 const0_rtx), x);
46641 x = gen_rtx_SET (const0_rtx, x);
46642 start_sequence ();
46643 vselect_insn = emit_insn (x);
46644 end_sequence ();
46647 /* Construct (set target (vec_select op0 (parallel perm))) and
46648 return true if that's a valid instruction in the active ISA. */
46650 static bool
46651 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
46652 unsigned nelt, bool testing_p)
46654 unsigned int i;
46655 rtx x, save_vconcat;
46656 int icode;
46658 if (vselect_insn == NULL_RTX)
46659 init_vselect_insn ();
46661 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
46662 PUT_NUM_ELEM (XVEC (x, 0), nelt);
46663 for (i = 0; i < nelt; ++i)
46664 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
46665 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46666 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
46667 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
46668 SET_DEST (PATTERN (vselect_insn)) = target;
46669 icode = recog_memoized (vselect_insn);
46671 if (icode >= 0 && !testing_p)
46672 emit_insn (copy_rtx (PATTERN (vselect_insn)));
46674 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
46675 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
46676 INSN_CODE (vselect_insn) = -1;
46678 return icode >= 0;
46681 /* Similar, but generate a vec_concat from op0 and op1 as well. */
46683 static bool
46684 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
46685 const unsigned char *perm, unsigned nelt,
46686 bool testing_p)
46688 machine_mode v2mode;
46689 rtx x;
46690 bool ok;
46692 if (vselect_insn == NULL_RTX)
46693 init_vselect_insn ();
46695 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
46696 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46697 PUT_MODE (x, v2mode);
46698 XEXP (x, 0) = op0;
46699 XEXP (x, 1) = op1;
46700 ok = expand_vselect (target, x, perm, nelt, testing_p);
46701 XEXP (x, 0) = const0_rtx;
46702 XEXP (x, 1) = const0_rtx;
46703 return ok;
46706 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46707 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
46709 static bool
46710 expand_vec_perm_blend (struct expand_vec_perm_d *d)
46712 machine_mode mmode, vmode = d->vmode;
46713 unsigned i, mask, nelt = d->nelt;
46714 rtx target, op0, op1, maskop, x;
46715 rtx rperm[32], vperm;
46717 if (d->one_operand_p)
46718 return false;
46719 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
46720 && (TARGET_AVX512BW
46721 || GET_MODE_UNIT_SIZE (vmode) >= 4))
46723 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46725 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46727 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46729 else
46730 return false;
46732 /* This is a blend, not a permute. Elements must stay in their
46733 respective lanes. */
46734 for (i = 0; i < nelt; ++i)
46736 unsigned e = d->perm[i];
46737 if (!(e == i || e == i + nelt))
46738 return false;
46741 if (d->testing_p)
46742 return true;
46744 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
46745 decision should be extracted elsewhere, so that we only try that
46746 sequence once all budget==3 options have been tried. */
46747 target = d->target;
46748 op0 = d->op0;
46749 op1 = d->op1;
46750 mask = 0;
46752 switch (vmode)
46754 case V8DFmode:
46755 case V16SFmode:
46756 case V4DFmode:
46757 case V8SFmode:
46758 case V2DFmode:
46759 case V4SFmode:
46760 case V8HImode:
46761 case V8SImode:
46762 case V32HImode:
46763 case V64QImode:
46764 case V16SImode:
46765 case V8DImode:
46766 for (i = 0; i < nelt; ++i)
46767 mask |= (d->perm[i] >= nelt) << i;
46768 break;
46770 case V2DImode:
46771 for (i = 0; i < 2; ++i)
46772 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
46773 vmode = V8HImode;
46774 goto do_subreg;
46776 case V4SImode:
46777 for (i = 0; i < 4; ++i)
46778 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46779 vmode = V8HImode;
46780 goto do_subreg;
46782 case V16QImode:
46783 /* See if bytes move in pairs so we can use pblendw with
46784 an immediate argument, rather than pblendvb with a vector
46785 argument. */
46786 for (i = 0; i < 16; i += 2)
46787 if (d->perm[i] + 1 != d->perm[i + 1])
46789 use_pblendvb:
46790 for (i = 0; i < nelt; ++i)
46791 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
46793 finish_pblendvb:
46794 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
46795 vperm = force_reg (vmode, vperm);
46797 if (GET_MODE_SIZE (vmode) == 16)
46798 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
46799 else
46800 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
46801 if (target != d->target)
46802 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46803 return true;
46806 for (i = 0; i < 8; ++i)
46807 mask |= (d->perm[i * 2] >= 16) << i;
46808 vmode = V8HImode;
46809 /* FALLTHRU */
46811 do_subreg:
46812 target = gen_reg_rtx (vmode);
46813 op0 = gen_lowpart (vmode, op0);
46814 op1 = gen_lowpart (vmode, op1);
46815 break;
46817 case V32QImode:
46818 /* See if bytes move in pairs. If not, vpblendvb must be used. */
46819 for (i = 0; i < 32; i += 2)
46820 if (d->perm[i] + 1 != d->perm[i + 1])
46821 goto use_pblendvb;
46822 /* See if bytes move in quadruplets. If yes, vpblendd
46823 with immediate can be used. */
46824 for (i = 0; i < 32; i += 4)
46825 if (d->perm[i] + 2 != d->perm[i + 2])
46826 break;
46827 if (i < 32)
46829 /* See if bytes move the same in both lanes. If yes,
46830 vpblendw with immediate can be used. */
46831 for (i = 0; i < 16; i += 2)
46832 if (d->perm[i] + 16 != d->perm[i + 16])
46833 goto use_pblendvb;
46835 /* Use vpblendw. */
46836 for (i = 0; i < 16; ++i)
46837 mask |= (d->perm[i * 2] >= 32) << i;
46838 vmode = V16HImode;
46839 goto do_subreg;
46842 /* Use vpblendd. */
46843 for (i = 0; i < 8; ++i)
46844 mask |= (d->perm[i * 4] >= 32) << i;
46845 vmode = V8SImode;
46846 goto do_subreg;
46848 case V16HImode:
46849 /* See if words move in pairs. If yes, vpblendd can be used. */
46850 for (i = 0; i < 16; i += 2)
46851 if (d->perm[i] + 1 != d->perm[i + 1])
46852 break;
46853 if (i < 16)
46855 /* See if words move the same in both lanes. If not,
46856 vpblendvb must be used. */
46857 for (i = 0; i < 8; i++)
46858 if (d->perm[i] + 8 != d->perm[i + 8])
46860 /* Use vpblendvb. */
46861 for (i = 0; i < 32; ++i)
46862 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
46864 vmode = V32QImode;
46865 nelt = 32;
46866 target = gen_reg_rtx (vmode);
46867 op0 = gen_lowpart (vmode, op0);
46868 op1 = gen_lowpart (vmode, op1);
46869 goto finish_pblendvb;
46872 /* Use vpblendw. */
46873 for (i = 0; i < 16; ++i)
46874 mask |= (d->perm[i] >= 16) << i;
46875 break;
46878 /* Use vpblendd. */
46879 for (i = 0; i < 8; ++i)
46880 mask |= (d->perm[i * 2] >= 16) << i;
46881 vmode = V8SImode;
46882 goto do_subreg;
46884 case V4DImode:
46885 /* Use vpblendd. */
46886 for (i = 0; i < 4; ++i)
46887 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46888 vmode = V8SImode;
46889 goto do_subreg;
46891 default:
46892 gcc_unreachable ();
46895 switch (vmode)
46897 case V8DFmode:
46898 case V8DImode:
46899 mmode = QImode;
46900 break;
46901 case V16SFmode:
46902 case V16SImode:
46903 mmode = HImode;
46904 break;
46905 case V32HImode:
46906 mmode = SImode;
46907 break;
46908 case V64QImode:
46909 mmode = DImode;
46910 break;
46911 default:
46912 mmode = VOIDmode;
46915 if (mmode != VOIDmode)
46916 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46917 else
46918 maskop = GEN_INT (mask);
46920 /* This matches five different patterns with the different modes. */
46921 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46922 x = gen_rtx_SET (target, x);
46923 emit_insn (x);
46924 if (target != d->target)
46925 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46927 return true;
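/* Illustrative example (not from this file): for V4SFmode with
   d->perm = { 0, 5, 2, 7 } the mask loop above builds mask = 0b1010, so the
   VEC_MERGE takes elements 1 and 3 from op1 and elements 0 and 2 from op0,
   matching a blendps with immediate 0xa.  */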
46930 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46931 in terms of the variable form of vpermilps.
46933 Note that we will have already failed the immediate input vpermilps,
46934 which requires that the high and low part shuffle be identical; the
46935 variable form doesn't require that. */
46937 static bool
46938 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46940 rtx rperm[8], vperm;
46941 unsigned i;
46943 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46944 return false;
46946 /* We can only permute within the 128-bit lane. */
46947 for (i = 0; i < 8; ++i)
46949 unsigned e = d->perm[i];
46950 if (i < 4 ? e >= 4 : e < 4)
46951 return false;
46954 if (d->testing_p)
46955 return true;
46957 for (i = 0; i < 8; ++i)
46959 unsigned e = d->perm[i];
46961 /* Within each 128-bit lane, the elements of op0 are numbered
46962 from 0 and the elements of op1 are numbered from 4. */
46963 if (e >= 8 + 4)
46964 e -= 8;
46965 else if (e >= 4)
46966 e -= 4;
46968 rperm[i] = GEN_INT (e);
46971 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46972 vperm = force_reg (V8SImode, vperm);
46973 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46975 return true;
46978 /* Return true if permutation D can be performed as VMODE permutation
46979 instead. */
46981 static bool
46982 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46984 unsigned int i, j, chunk;
46986 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46987 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46988 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46989 return false;
46991 if (GET_MODE_NUNITS (vmode) >= d->nelt)
46992 return true;
46994 chunk = d->nelt / GET_MODE_NUNITS (vmode);
46995 for (i = 0; i < d->nelt; i += chunk)
46996 if (d->perm[i] & (chunk - 1))
46997 return false;
46998 else
46999 for (j = 1; j < chunk; ++j)
47000 if (d->perm[i] + j != d->perm[i + j])
47001 return false;
47003 return true;
47006 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47007 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
47009 static bool
47010 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
47012 unsigned i, nelt, eltsz, mask;
47013 unsigned char perm[64];
47014 machine_mode vmode = V16QImode;
47015 rtx rperm[64], vperm, target, op0, op1;
47017 nelt = d->nelt;
47019 if (!d->one_operand_p)
47021 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
47023 if (TARGET_AVX2
47024 && valid_perm_using_mode_p (V2TImode, d))
47026 if (d->testing_p)
47027 return true;
47029 /* Use vperm2i128 insn. The pattern uses
47030 V4DImode instead of V2TImode. */
47031 target = d->target;
47032 if (d->vmode != V4DImode)
47033 target = gen_reg_rtx (V4DImode);
47034 op0 = gen_lowpart (V4DImode, d->op0);
47035 op1 = gen_lowpart (V4DImode, d->op1);
47036 rperm[0]
47037 = GEN_INT ((d->perm[0] / (nelt / 2))
47038 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
47039 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
47040 if (target != d->target)
47041 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47042 return true;
47044 return false;
47047 else
47049 if (GET_MODE_SIZE (d->vmode) == 16)
47051 if (!TARGET_SSSE3)
47052 return false;
47054 else if (GET_MODE_SIZE (d->vmode) == 32)
47056 if (!TARGET_AVX2)
47057 return false;
47059 /* V4DImode should be already handled through
47060 expand_vselect by vpermq instruction. */
47061 gcc_assert (d->vmode != V4DImode);
47063 vmode = V32QImode;
47064 if (d->vmode == V8SImode
47065 || d->vmode == V16HImode
47066 || d->vmode == V32QImode)
47068 /* First see if vpermq can be used for
47069 V8SImode/V16HImode/V32QImode. */
47070 if (valid_perm_using_mode_p (V4DImode, d))
47072 for (i = 0; i < 4; i++)
47073 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
47074 if (d->testing_p)
47075 return true;
47076 target = gen_reg_rtx (V4DImode);
47077 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
47078 perm, 4, false))
47080 emit_move_insn (d->target,
47081 gen_lowpart (d->vmode, target));
47082 return true;
47084 return false;
47087 /* Next see if vpermd can be used. */
47088 if (valid_perm_using_mode_p (V8SImode, d))
47089 vmode = V8SImode;
47091 /* Or if vpermps can be used. */
47092 else if (d->vmode == V8SFmode)
47093 vmode = V8SImode;
47095 if (vmode == V32QImode)
47097 /* vpshufb only works within 128-bit lanes; it is not
47098 possible to shuffle bytes across lanes. */
47099 for (i = 0; i < nelt; ++i)
47100 if ((d->perm[i] ^ i) & (nelt / 2))
47101 return false;
47104 else if (GET_MODE_SIZE (d->vmode) == 64)
47106 if (!TARGET_AVX512BW)
47107 return false;
47109 /* If vpermq didn't work, vpshufb won't work either. */
47110 if (d->vmode == V8DFmode || d->vmode == V8DImode)
47111 return false;
47113 vmode = V64QImode;
47114 if (d->vmode == V16SImode
47115 || d->vmode == V32HImode
47116 || d->vmode == V64QImode)
47118 /* First see if vpermq can be used for
47119 V16SImode/V32HImode/V64QImode. */
47120 if (valid_perm_using_mode_p (V8DImode, d))
47122 for (i = 0; i < 8; i++)
47123 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
47124 if (d->testing_p)
47125 return true;
47126 target = gen_reg_rtx (V8DImode);
47127 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
47128 perm, 8, false))
47130 emit_move_insn (d->target,
47131 gen_lowpart (d->vmode, target));
47132 return true;
47134 return false;
47137 /* Next see if vpermd can be used. */
47138 if (valid_perm_using_mode_p (V16SImode, d))
47139 vmode = V16SImode;
47141 /* Or if vpermps can be used. */
47142 else if (d->vmode == V16SFmode)
47143 vmode = V16SImode;
47144 if (vmode == V64QImode)
47146 /* vpshufb only works within 128-bit lanes; it is not
47147 possible to shuffle bytes across lanes. */
47148 for (i = 0; i < nelt; ++i)
47149 if ((d->perm[i] ^ i) & (nelt / 4))
47150 return false;
47153 else
47154 return false;
47157 if (d->testing_p)
47158 return true;
47160 if (vmode == V8SImode)
47161 for (i = 0; i < 8; ++i)
47162 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
47163 else if (vmode == V16SImode)
47164 for (i = 0; i < 16; ++i)
47165 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
47166 else
47168 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47169 if (!d->one_operand_p)
47170 mask = 2 * nelt - 1;
47171 else if (vmode == V16QImode)
47172 mask = nelt - 1;
47173 else if (vmode == V64QImode)
47174 mask = nelt / 4 - 1;
47175 else
47176 mask = nelt / 2 - 1;
47178 for (i = 0; i < nelt; ++i)
47180 unsigned j, e = d->perm[i] & mask;
47181 for (j = 0; j < eltsz; ++j)
47182 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
47186 vperm = gen_rtx_CONST_VECTOR (vmode,
47187 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
47188 vperm = force_reg (vmode, vperm);
47190 target = d->target;
47191 if (d->vmode != vmode)
47192 target = gen_reg_rtx (vmode);
47193 op0 = gen_lowpart (vmode, d->op0);
47194 if (d->one_operand_p)
47196 if (vmode == V16QImode)
47197 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
47198 else if (vmode == V32QImode)
47199 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
47200 else if (vmode == V64QImode)
47201 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
47202 else if (vmode == V8SFmode)
47203 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
47204 else if (vmode == V8SImode)
47205 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
47206 else if (vmode == V16SFmode)
47207 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
47208 else if (vmode == V16SImode)
47209 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
47210 else
47211 gcc_unreachable ();
47213 else
47215 op1 = gen_lowpart (vmode, d->op1);
47216 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
47218 if (target != d->target)
47219 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47221 return true;
47224 /* For V*[QHS]Imode permutations, check whether the same permutation
47225 can be performed in a 2x, 4x or 8x wider inner mode and, if so, rewrite it into ND. */
47227 static bool
47228 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
47229 struct expand_vec_perm_d *nd)
47231 int i;
47232 machine_mode mode = VOIDmode;
47234 switch (d->vmode)
47236 case V16QImode: mode = V8HImode; break;
47237 case V32QImode: mode = V16HImode; break;
47238 case V64QImode: mode = V32HImode; break;
47239 case V8HImode: mode = V4SImode; break;
47240 case V16HImode: mode = V8SImode; break;
47241 case V32HImode: mode = V16SImode; break;
47242 case V4SImode: mode = V2DImode; break;
47243 case V8SImode: mode = V4DImode; break;
47244 case V16SImode: mode = V8DImode; break;
47245 default: return false;
47247 for (i = 0; i < d->nelt; i += 2)
47248 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
47249 return false;
47250 nd->vmode = mode;
47251 nd->nelt = d->nelt / 2;
47252 for (i = 0; i < nd->nelt; i++)
47253 nd->perm[i] = d->perm[2 * i] / 2;
47254 if (GET_MODE_INNER (mode) != DImode)
47255 canonicalize_vector_int_perm (nd, nd);
47256 if (nd != d)
47258 nd->one_operand_p = d->one_operand_p;
47259 nd->testing_p = d->testing_p;
47260 if (d->op0 == d->op1)
47261 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
47262 else
47264 nd->op0 = gen_lowpart (nd->vmode, d->op0);
47265 nd->op1 = gen_lowpart (nd->vmode, d->op1);
47267 if (d->testing_p)
47268 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
47269 else
47270 nd->target = gen_reg_rtx (nd->vmode);
47272 return true;
47275 /* Try to expand one-operand permutation with constant mask. */
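/* For instance, reversing a V16SImode vector, i.e. the permutation
   { 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 }, is emitted here as a single
   vpermd (gen_avx512f_permvarv16si) whose index operand is that constant
   vector forced into a register. */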
47277 static bool
47278 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
47280 machine_mode mode = GET_MODE (d->op0);
47281 machine_mode maskmode = mode;
47282 rtx (*gen) (rtx, rtx, rtx) = NULL;
47283 rtx target, op0, mask;
47284 rtx vec[64];
47286 if (!rtx_equal_p (d->op0, d->op1))
47287 return false;
47289 if (!TARGET_AVX512F)
47290 return false;
47292 switch (mode)
47294 case V16SImode:
47295 gen = gen_avx512f_permvarv16si;
47296 break;
47297 case V16SFmode:
47298 gen = gen_avx512f_permvarv16sf;
47299 maskmode = V16SImode;
47300 break;
47301 case V8DImode:
47302 gen = gen_avx512f_permvarv8di;
47303 break;
47304 case V8DFmode:
47305 gen = gen_avx512f_permvarv8df;
47306 maskmode = V8DImode;
47307 break;
47308 default:
47309 return false;
47312 target = d->target;
47313 op0 = d->op0;
47314 for (int i = 0; i < d->nelt; ++i)
47315 vec[i] = GEN_INT (d->perm[i]);
47316 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
47317 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
47318 return true;
47321 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
47322 in a single instruction. */
47324 static bool
47325 expand_vec_perm_1 (struct expand_vec_perm_d *d)
47327 unsigned i, nelt = d->nelt;
47328 struct expand_vec_perm_d nd;
47330 /* Check plain VEC_SELECT first, because AVX has instructions that could
47331 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
47332 input where SEL+CONCAT may not. */
47333 if (d->one_operand_p)
47335 int mask = nelt - 1;
47336 bool identity_perm = true;
47337 bool broadcast_perm = true;
47339 for (i = 0; i < nelt; i++)
47341 nd.perm[i] = d->perm[i] & mask;
47342 if (nd.perm[i] != i)
47343 identity_perm = false;
47344 if (nd.perm[i])
47345 broadcast_perm = false;
47348 if (identity_perm)
47350 if (!d->testing_p)
47351 emit_move_insn (d->target, d->op0);
47352 return true;
47354 else if (broadcast_perm && TARGET_AVX2)
47356 /* Use vpbroadcast{b,w,d}. */
47357 rtx (*gen) (rtx, rtx) = NULL;
47358 switch (d->vmode)
47360 case V64QImode:
47361 if (TARGET_AVX512BW)
47362 gen = gen_avx512bw_vec_dupv64qi_1;
47363 break;
47364 case V32QImode:
47365 gen = gen_avx2_pbroadcastv32qi_1;
47366 break;
47367 case V32HImode:
47368 if (TARGET_AVX512BW)
47369 gen = gen_avx512bw_vec_dupv32hi_1;
47370 break;
47371 case V16HImode:
47372 gen = gen_avx2_pbroadcastv16hi_1;
47373 break;
47374 case V16SImode:
47375 if (TARGET_AVX512F)
47376 gen = gen_avx512f_vec_dupv16si_1;
47377 break;
47378 case V8SImode:
47379 gen = gen_avx2_pbroadcastv8si_1;
47380 break;
47381 case V16QImode:
47382 gen = gen_avx2_pbroadcastv16qi;
47383 break;
47384 case V8HImode:
47385 gen = gen_avx2_pbroadcastv8hi;
47386 break;
47387 case V16SFmode:
47388 if (TARGET_AVX512F)
47389 gen = gen_avx512f_vec_dupv16sf_1;
47390 break;
47391 case V8SFmode:
47392 gen = gen_avx2_vec_dupv8sf_1;
47393 break;
47394 case V8DFmode:
47395 if (TARGET_AVX512F)
47396 gen = gen_avx512f_vec_dupv8df_1;
47397 break;
47398 case V8DImode:
47399 if (TARGET_AVX512F)
47400 gen = gen_avx512f_vec_dupv8di_1;
47401 break;
47402 /* For other modes, prefer the other shuffles this function creates. */
47403 default: break;
47405 if (gen != NULL)
47407 if (!d->testing_p)
47408 emit_insn (gen (d->target, d->op0));
47409 return true;
47413 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
47414 return true;
47416 /* There are plenty of patterns in sse.md that are written for
47417 SEL+CONCAT and are not replicated for a single op. Perhaps
47418 that should be changed, to avoid the nastiness here. */
47420 /* Recognize interleave style patterns, which means incrementing
47421 every other permutation operand. */
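/* E.g. the one-operand V16QImode permutation { 0 0 1 1 ... 7 7 } becomes
   nd.perm = { 0 16 1 17 ... 7 23 }, which expand_vselect_vconcat can match
   as the interleave-low (punpcklbw) of op0 with itself. */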
47422 for (i = 0; i < nelt; i += 2)
47424 nd.perm[i] = d->perm[i] & mask;
47425 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
47427 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47428 d->testing_p))
47429 return true;
47431 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
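/* E.g. the one-operand V4SFmode permutation { 1 3 0 2 } becomes
   nd.perm = { 1 3 4 6 }, i.e. a shufps selecting elements 1 and 3 from the
   first copy of op0 and elements 0 and 2 from the second. */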
47432 if (nelt >= 4)
47434 for (i = 0; i < nelt; i += 4)
47436 nd.perm[i + 0] = d->perm[i + 0] & mask;
47437 nd.perm[i + 1] = d->perm[i + 1] & mask;
47438 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
47439 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
47442 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47443 d->testing_p))
47444 return true;
47448 /* Finally, try the fully general two operand permute. */
47449 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
47450 d->testing_p))
47451 return true;
47453 /* Recognize interleave style patterns with reversed operands. */
47454 if (!d->one_operand_p)
47456 for (i = 0; i < nelt; ++i)
47458 unsigned e = d->perm[i];
47459 if (e >= nelt)
47460 e -= nelt;
47461 else
47462 e += nelt;
47463 nd.perm[i] = e;
47466 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
47467 d->testing_p))
47468 return true;
47471 /* Try the SSE4.1 blend variable merge instructions. */
47472 if (expand_vec_perm_blend (d))
47473 return true;
47475 /* Try one of the AVX vpermil variable permutations. */
47476 if (expand_vec_perm_vpermil (d))
47477 return true;
47479 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
47480 vpshufb, vpermd, vpermps or vpermq variable permutation. */
47481 if (expand_vec_perm_pshufb (d))
47482 return true;
47484 /* Try the AVX2 vpalignr instruction. */
47485 if (expand_vec_perm_palignr (d, true))
47486 return true;
47488 /* Try the AVX512F vperm{s,d} instructions. */
47489 if (ix86_expand_vec_one_operand_perm_avx512 (d))
47490 return true;
47492 /* Try the AVX512F vpermi2 instructions. */
47493 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
47494 return true;
47496 /* See if we can get the same permutation in different vector integer
47497 mode. */
47498 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47500 if (!d->testing_p)
47501 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47502 return true;
47504 return false;
47507 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47508 in terms of a pair of pshuflw + pshufhw instructions. */
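/* For example, the one-operand V8HImode permutation { 3 2 1 0 7 6 5 4 }
   keeps the low four indices below 4 and the high four at or above 4, so
   it is emitted as a pshuflw reversing the low quadword followed by a
   pshufhw reversing the high quadword. */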
47510 static bool
47511 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
47513 unsigned char perm2[MAX_VECT_LEN];
47514 unsigned i;
47515 bool ok;
47517 if (d->vmode != V8HImode || !d->one_operand_p)
47518 return false;
47520 /* The two permutations only operate in 64-bit lanes. */
47521 for (i = 0; i < 4; ++i)
47522 if (d->perm[i] >= 4)
47523 return false;
47524 for (i = 4; i < 8; ++i)
47525 if (d->perm[i] < 4)
47526 return false;
47528 if (d->testing_p)
47529 return true;
47531 /* Emit the pshuflw. */
47532 memcpy (perm2, d->perm, 4);
47533 for (i = 4; i < 8; ++i)
47534 perm2[i] = i;
47535 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
47536 gcc_assert (ok);
47538 /* Emit the pshufhw. */
47539 memcpy (perm2 + 4, d->perm + 4, 4);
47540 for (i = 0; i < 4; ++i)
47541 perm2[i] = i;
47542 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
47543 gcc_assert (ok);
47545 return true;
47548 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47549 the permutation using the SSSE3 palignr instruction. This succeeds
47550 when all of the elements in PERM fit within one vector and we merely
47551 need to shift them down so that a single vector permutation has a
47552 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
47553 the vpalignr instruction itself can perform the requested permutation. */
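/* As an illustration, the two-operand V16QImode permutation
   { 3 4 5 ... 18 } selects 16 consecutive elements of the concatenated
   operands; a palignr with a 3-byte shift produces it directly (the
   in_order case below), while non-consecutive selections within the same
   16-element window fall through to a single-operand shuffle of the
   shifted result. */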
47555 static bool
47556 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
47558 unsigned i, nelt = d->nelt;
47559 unsigned min, max, minswap, maxswap;
47560 bool in_order, ok, swap = false;
47561 rtx shift, target;
47562 struct expand_vec_perm_d dcopy;
47564 /* Even with AVX, palignr only operates on 128-bit vectors;
47565 with AVX2, palignr operates within each of the two 128-bit lanes. */
47566 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47567 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
47568 return false;
47570 min = 2 * nelt;
47571 max = 0;
47572 minswap = 2 * nelt;
47573 maxswap = 0;
47574 for (i = 0; i < nelt; ++i)
47576 unsigned e = d->perm[i];
47577 unsigned eswap = d->perm[i] ^ nelt;
47578 if (GET_MODE_SIZE (d->vmode) == 32)
47580 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
47581 eswap = e ^ (nelt / 2);
47583 if (e < min)
47584 min = e;
47585 if (e > max)
47586 max = e;
47587 if (eswap < minswap)
47588 minswap = eswap;
47589 if (eswap > maxswap)
47590 maxswap = eswap;
47592 if (min == 0
47593 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
47595 if (d->one_operand_p
47596 || minswap == 0
47597 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
47598 ? nelt / 2 : nelt))
47599 return false;
47600 swap = true;
47601 min = minswap;
47602 max = maxswap;
47605 /* Given that we have SSSE3, we know we'll be able to implement the
47606 single operand permutation after the palignr with pshufb for
47607 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
47608 first. */
47609 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
47610 return true;
47612 dcopy = *d;
47613 if (swap)
47615 dcopy.op0 = d->op1;
47616 dcopy.op1 = d->op0;
47617 for (i = 0; i < nelt; ++i)
47618 dcopy.perm[i] ^= nelt;
47621 in_order = true;
47622 for (i = 0; i < nelt; ++i)
47624 unsigned e = dcopy.perm[i];
47625 if (GET_MODE_SIZE (d->vmode) == 32
47626 && e >= nelt
47627 && (e & (nelt / 2 - 1)) < min)
47628 e = e - min - (nelt / 2);
47629 else
47630 e = e - min;
47631 if (e != i)
47632 in_order = false;
47633 dcopy.perm[i] = e;
47635 dcopy.one_operand_p = true;
47637 if (single_insn_only_p && !in_order)
47638 return false;
47640 /* For AVX2, test whether we can permute the result in one instruction. */
47641 if (d->testing_p)
47643 if (in_order)
47644 return true;
47645 dcopy.op1 = dcopy.op0;
47646 return expand_vec_perm_1 (&dcopy);
47649 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
47650 if (GET_MODE_SIZE (d->vmode) == 16)
47652 target = gen_reg_rtx (TImode);
47653 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
47654 gen_lowpart (TImode, dcopy.op0), shift));
47656 else
47658 target = gen_reg_rtx (V2TImode);
47659 emit_insn (gen_avx2_palignrv2ti (target,
47660 gen_lowpart (V2TImode, dcopy.op1),
47661 gen_lowpart (V2TImode, dcopy.op0),
47662 shift));
47665 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
47667 /* Test for the degenerate case where the alignment by itself
47668 produces the desired permutation. */
47669 if (in_order)
47671 emit_move_insn (d->target, dcopy.op0);
47672 return true;
47675 ok = expand_vec_perm_1 (&dcopy);
47676 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
47678 return ok;
47681 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47682 the permutation using the SSE4_1 pblendv instruction. Potentially
47683 reduces the permutation from 2 pshufb and an ior to 1 pshufb and a pblendv. */
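/* Worked example: for the V4DFmode permutation { 0 7 2 5 } only positions
   1 and 3 are out of place, and both take their element from op1
   (which == 2), so op1 alone is first permuted with { 0 3 2 1 } and the
   result is then blended with op0, taking positions 1 and 3 from the
   permuted copy. */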
47685 static bool
47686 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
47688 unsigned i, which, nelt = d->nelt;
47689 struct expand_vec_perm_d dcopy, dcopy1;
47690 machine_mode vmode = d->vmode;
47691 bool ok;
47693 /* Use the same checks as in expand_vec_perm_blend. */
47694 if (d->one_operand_p)
47695 return false;
47696 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47698 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47700 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47702 else
47703 return false;
47705 /* Figure out which permutation elements do not stay in their
47706 respective lanes. */
47707 for (i = 0, which = 0; i < nelt; ++i)
47709 unsigned e = d->perm[i];
47710 if (e != i)
47711 which |= (e < nelt ? 1 : 2);
47713 /* We can pblend the part where elements do not stay in their
47714 respective lanes only when these elements are all taken from one
47715 half of the permutation.
47716 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not at their respective
47717 lanes, but both 8 and 9 >= 8;
47718 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not at their
47719 respective lanes and 8 >= 8, but 2 is not. */
47720 if (which != 1 && which != 2)
47721 return false;
47722 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
47723 return true;
47725 /* First we apply one operand permutation to the part where
47726 elements stay not in their respective lanes. */
47727 dcopy = *d;
47728 if (which == 2)
47729 dcopy.op0 = dcopy.op1 = d->op1;
47730 else
47731 dcopy.op0 = dcopy.op1 = d->op0;
47732 if (!d->testing_p)
47733 dcopy.target = gen_reg_rtx (vmode);
47734 dcopy.one_operand_p = true;
47736 for (i = 0; i < nelt; ++i)
47737 dcopy.perm[i] = d->perm[i] & (nelt - 1);
47739 ok = expand_vec_perm_1 (&dcopy);
47740 if (GET_MODE_SIZE (vmode) != 16 && !ok)
47741 return false;
47742 else
47743 gcc_assert (ok);
47744 if (d->testing_p)
47745 return true;
47747 /* Next we put permuted elements into their positions. */
47748 dcopy1 = *d;
47749 if (which == 2)
47750 dcopy1.op1 = dcopy.target;
47751 else
47752 dcopy1.op0 = dcopy.target;
47754 for (i = 0; i < nelt; ++i)
47755 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
47757 ok = expand_vec_perm_blend (&dcopy1);
47758 gcc_assert (ok);
47760 return true;
47763 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
47765 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47766 a two vector permutation into a single vector permutation by using
47767 an interleave operation to merge the vectors. */
47769 static bool
47770 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
47772 struct expand_vec_perm_d dremap, dfinal;
47773 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
47774 unsigned HOST_WIDE_INT contents;
47775 unsigned char remap[2 * MAX_VECT_LEN];
47776 rtx_insn *seq;
47777 bool ok, same_halves = false;
47779 if (GET_MODE_SIZE (d->vmode) == 16)
47781 if (d->one_operand_p)
47782 return false;
47784 else if (GET_MODE_SIZE (d->vmode) == 32)
47786 if (!TARGET_AVX)
47787 return false;
47788 /* For 32-byte modes, allow even the d->one_operand_p case.
47789 The lack of cross-lane shuffling in some instructions
47790 might prevent a single insn shuffle. */
47791 dfinal = *d;
47792 dfinal.testing_p = true;
47793 /* If expand_vec_perm_interleave3 can expand this into
47794 a 3 insn sequence, give up and let it be expanded as
47795 a 3 insn sequence. While that is one insn longer,
47796 it doesn't need a memory operand, and in the common
47797 case where both the interleave low and interleave high
47798 permutations with the same operands are adjacent, the
47799 pair needs 4 insns in total after CSE. */
47800 if (expand_vec_perm_interleave3 (&dfinal))
47801 return false;
47803 else
47804 return false;
47806 /* Examine from whence the elements come. */
47807 contents = 0;
47808 for (i = 0; i < nelt; ++i)
47809 contents |= HOST_WIDE_INT_1U << d->perm[i];
47811 memset (remap, 0xff, sizeof (remap));
47812 dremap = *d;
47814 if (GET_MODE_SIZE (d->vmode) == 16)
47816 unsigned HOST_WIDE_INT h1, h2, h3, h4;
47818 /* Split the two input vectors into 4 halves. */
47819 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47820 h2 = h1 << nelt2;
47821 h3 = h2 << nelt2;
47822 h4 = h3 << nelt2;
47824 /* If the elements are from the low halves, use interleave low; similarly
47825 for interleave high. If the elements are from mismatched halves, we
47826 can use shufps for V4SF/V4SI or do a DImode shuffle. */
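/* E.g. the V4SImode permutation { 1 4 0 5 } draws only on the low halves
   h1 and h3, so dremap becomes the interleave-low { 0 4 1 5 } (punpckldq)
   and the remap array turns the original indices into the single-insn
   follow-up shuffle { 2 1 0 3 }. */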
47827 if ((contents & (h1 | h3)) == contents)
47829 /* punpckl* */
47830 for (i = 0; i < nelt2; ++i)
47832 remap[i] = i * 2;
47833 remap[i + nelt] = i * 2 + 1;
47834 dremap.perm[i * 2] = i;
47835 dremap.perm[i * 2 + 1] = i + nelt;
47837 if (!TARGET_SSE2 && d->vmode == V4SImode)
47838 dremap.vmode = V4SFmode;
47840 else if ((contents & (h2 | h4)) == contents)
47842 /* punpckh* */
47843 for (i = 0; i < nelt2; ++i)
47845 remap[i + nelt2] = i * 2;
47846 remap[i + nelt + nelt2] = i * 2 + 1;
47847 dremap.perm[i * 2] = i + nelt2;
47848 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
47850 if (!TARGET_SSE2 && d->vmode == V4SImode)
47851 dremap.vmode = V4SFmode;
47853 else if ((contents & (h1 | h4)) == contents)
47855 /* shufps */
47856 for (i = 0; i < nelt2; ++i)
47858 remap[i] = i;
47859 remap[i + nelt + nelt2] = i + nelt2;
47860 dremap.perm[i] = i;
47861 dremap.perm[i + nelt2] = i + nelt + nelt2;
47863 if (nelt != 4)
47865 /* shufpd */
47866 dremap.vmode = V2DImode;
47867 dremap.nelt = 2;
47868 dremap.perm[0] = 0;
47869 dremap.perm[1] = 3;
47872 else if ((contents & (h2 | h3)) == contents)
47874 /* shufps */
47875 for (i = 0; i < nelt2; ++i)
47877 remap[i + nelt2] = i;
47878 remap[i + nelt] = i + nelt2;
47879 dremap.perm[i] = i + nelt2;
47880 dremap.perm[i + nelt2] = i + nelt;
47882 if (nelt != 4)
47884 /* shufpd */
47885 dremap.vmode = V2DImode;
47886 dremap.nelt = 2;
47887 dremap.perm[0] = 1;
47888 dremap.perm[1] = 2;
47891 else
47892 return false;
47894 else
47896 unsigned int nelt4 = nelt / 4, nzcnt = 0;
47897 unsigned HOST_WIDE_INT q[8];
47898 unsigned int nonzero_halves[4];
47900 /* Split the two input vectors into 8 quarters. */
47901 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47902 for (i = 1; i < 8; ++i)
47903 q[i] = q[0] << (nelt4 * i);
47904 for (i = 0; i < 4; ++i)
47905 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47907 nonzero_halves[nzcnt] = i;
47908 ++nzcnt;
47911 if (nzcnt == 1)
47913 gcc_assert (d->one_operand_p);
47914 nonzero_halves[1] = nonzero_halves[0];
47915 same_halves = true;
47917 else if (d->one_operand_p)
47919 gcc_assert (nonzero_halves[0] == 0);
47920 gcc_assert (nonzero_halves[1] == 1);
47923 if (nzcnt <= 2)
47925 if (d->perm[0] / nelt2 == nonzero_halves[1])
47927 /* Attempt to increase the likelihood that dfinal
47928 shuffle will be intra-lane. */
47929 std::swap (nonzero_halves[0], nonzero_halves[1]);
47932 /* vperm2f128 or vperm2i128. */
47933 for (i = 0; i < nelt2; ++i)
47935 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47936 remap[i + nonzero_halves[0] * nelt2] = i;
47937 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47938 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47941 if (d->vmode != V8SFmode
47942 && d->vmode != V4DFmode
47943 && d->vmode != V8SImode)
47945 dremap.vmode = V8SImode;
47946 dremap.nelt = 8;
47947 for (i = 0; i < 4; ++i)
47949 dremap.perm[i] = i + nonzero_halves[0] * 4;
47950 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47954 else if (d->one_operand_p)
47955 return false;
47956 else if (TARGET_AVX2
47957 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47959 /* vpunpckl* */
47960 for (i = 0; i < nelt4; ++i)
47962 remap[i] = i * 2;
47963 remap[i + nelt] = i * 2 + 1;
47964 remap[i + nelt2] = i * 2 + nelt2;
47965 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47966 dremap.perm[i * 2] = i;
47967 dremap.perm[i * 2 + 1] = i + nelt;
47968 dremap.perm[i * 2 + nelt2] = i + nelt2;
47969 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47972 else if (TARGET_AVX2
47973 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47975 /* vpunpckh* */
47976 for (i = 0; i < nelt4; ++i)
47978 remap[i + nelt4] = i * 2;
47979 remap[i + nelt + nelt4] = i * 2 + 1;
47980 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47981 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47982 dremap.perm[i * 2] = i + nelt4;
47983 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47984 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47985 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47988 else
47989 return false;
47992 /* Use the remapping array set up above to move the elements from their
47993 swizzled locations into their final destinations. */
47994 dfinal = *d;
47995 for (i = 0; i < nelt; ++i)
47997 unsigned e = remap[d->perm[i]];
47998 gcc_assert (e < nelt);
47999 /* If same_halves is true, both halves of the remapped vector are the
48000 same. Avoid cross-lane accesses if possible. */
48001 if (same_halves && i >= nelt2)
48003 gcc_assert (e < nelt2);
48004 dfinal.perm[i] = e + nelt2;
48006 else
48007 dfinal.perm[i] = e;
48009 if (!d->testing_p)
48011 dremap.target = gen_reg_rtx (dremap.vmode);
48012 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48014 dfinal.op1 = dfinal.op0;
48015 dfinal.one_operand_p = true;
48017 /* Test if the final remap can be done with a single insn. For V4SFmode or
48018 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
48019 start_sequence ();
48020 ok = expand_vec_perm_1 (&dfinal);
48021 seq = get_insns ();
48022 end_sequence ();
48024 if (!ok)
48025 return false;
48027 if (d->testing_p)
48028 return true;
48030 if (dremap.vmode != dfinal.vmode)
48032 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
48033 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
48036 ok = expand_vec_perm_1 (&dremap);
48037 gcc_assert (ok);
48039 emit_insn (seq);
48040 return true;
48043 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48044 a single vector cross-lane permutation into vpermq followed
48045 by any of the single insn permutations. */
48047 static bool
48048 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
48050 struct expand_vec_perm_d dremap, dfinal;
48051 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
48052 unsigned contents[2];
48053 bool ok;
48055 if (!(TARGET_AVX2
48056 && (d->vmode == V32QImode || d->vmode == V16HImode)
48057 && d->one_operand_p))
48058 return false;
48060 contents[0] = 0;
48061 contents[1] = 0;
48062 for (i = 0; i < nelt2; ++i)
48064 contents[0] |= 1u << (d->perm[i] / nelt4);
48065 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
48068 for (i = 0; i < 2; ++i)
48070 unsigned int cnt = 0;
48071 for (j = 0; j < 4; ++j)
48072 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
48073 return false;
48076 if (d->testing_p)
48077 return true;
48079 dremap = *d;
48080 dremap.vmode = V4DImode;
48081 dremap.nelt = 4;
48082 dremap.target = gen_reg_rtx (V4DImode);
48083 dremap.op0 = gen_lowpart (V4DImode, d->op0);
48084 dremap.op1 = dremap.op0;
48085 dremap.one_operand_p = true;
48086 for (i = 0; i < 2; ++i)
48088 unsigned int cnt = 0;
48089 for (j = 0; j < 4; ++j)
48090 if ((contents[i] & (1u << j)) != 0)
48091 dremap.perm[2 * i + cnt++] = j;
48092 for (; cnt < 2; ++cnt)
48093 dremap.perm[2 * i + cnt] = 0;
48096 dfinal = *d;
48097 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48098 dfinal.op1 = dfinal.op0;
48099 dfinal.one_operand_p = true;
48100 for (i = 0, j = 0; i < nelt; ++i)
48102 if (i == nelt2)
48103 j = 2;
48104 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
48105 if ((d->perm[i] / nelt4) == dremap.perm[j])
48107 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
48108 dfinal.perm[i] |= nelt4;
48109 else
48110 gcc_unreachable ();
48113 ok = expand_vec_perm_1 (&dremap);
48114 gcc_assert (ok);
48116 ok = expand_vec_perm_1 (&dfinal);
48117 gcc_assert (ok);
48119 return true;
48122 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
48123 a vector permutation using two instructions, vperm2f128 resp.
48124 vperm2i128 followed by any single in-lane permutation. */
48126 static bool
48127 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
48129 struct expand_vec_perm_d dfirst, dsecond;
48130 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
48131 bool ok;
48133 if (!TARGET_AVX
48134 || GET_MODE_SIZE (d->vmode) != 32
48135 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
48136 return false;
48138 dsecond = *d;
48139 dsecond.one_operand_p = false;
48140 dsecond.testing_p = true;
48142 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
48143 immediate. For perm < 16 the second permutation uses
48144 d->op0 as the first operand; for perm >= 16 it uses d->op1
48145 as the first operand. The second operand is the result of
48146 vperm2[fi]128. */
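/* E.g. perm == 9 selects the high lane of d->op0 for the low result lane
   and the low lane of d->op1 for the high result lane, giving the
   immediate ((9 << 2) | 9) & 0x33 == 0x21. */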
48147 for (perm = 0; perm < 32; perm++)
48149 /* Ignore permutations which do not move anything cross-lane. */
48150 if (perm < 16)
48152 /* The second shuffle for e.g. V4DFmode has
48153 0123 and ABCD operands.
48154 Ignore AB23, as 23 is already in the second lane
48155 of the first operand. */
48156 if ((perm & 0xc) == (1 << 2)) continue;
48157 /* And 01CD, as 01 is in the first lane of the first
48158 operand. */
48159 if ((perm & 3) == 0) continue;
48160 /* And 4567, as then the vperm2[fi]128 doesn't change
48161 anything on the original 4567 second operand. */
48162 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
48164 else
48166 /* The second shuffle for e.g. V4DFmode has
48167 4567 and ABCD operands.
48168 Ignore AB67, as 67 is already in the second lane
48169 of the first operand. */
48170 if ((perm & 0xc) == (3 << 2)) continue;
48171 /* And 45CD, as 45 is in the first lane of the first
48172 operand. */
48173 if ((perm & 3) == 2) continue;
48174 /* And 0123, as then the vperm2[fi]128 doesn't change
48175 anything on the original 0123 first operand. */
48176 if ((perm & 0xf) == (1 << 2)) continue;
48179 for (i = 0; i < nelt; i++)
48181 j = d->perm[i] / nelt2;
48182 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
48183 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
48184 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
48185 dsecond.perm[i] = d->perm[i] & (nelt - 1);
48186 else
48187 break;
48190 if (i == nelt)
48192 start_sequence ();
48193 ok = expand_vec_perm_1 (&dsecond);
48194 end_sequence ();
48196 else
48197 ok = false;
48199 if (ok)
48201 if (d->testing_p)
48202 return true;
48204 /* Found a usable second shuffle. dfirst will be
48205 vperm2f128 on d->op0 and d->op1. */
48206 dsecond.testing_p = false;
48207 dfirst = *d;
48208 dfirst.target = gen_reg_rtx (d->vmode);
48209 for (i = 0; i < nelt; i++)
48210 dfirst.perm[i] = (i & (nelt2 - 1))
48211 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
48213 canonicalize_perm (&dfirst);
48214 ok = expand_vec_perm_1 (&dfirst);
48215 gcc_assert (ok);
48217 /* And dsecond is some single insn shuffle, taking
48218 d->op0 and result of vperm2f128 (if perm < 16) or
48219 d->op1 and result of vperm2f128 (otherwise). */
48220 if (perm >= 16)
48221 dsecond.op0 = dsecond.op1;
48222 dsecond.op1 = dfirst.target;
48224 ok = expand_vec_perm_1 (&dsecond);
48225 gcc_assert (ok);
48227 return true;
48230 /* For one operand, the only useful vperm2f128 permutation is 0x01
48231 aka lanes swap. */
48232 if (d->one_operand_p)
48233 return false;
48236 return false;
48239 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48240 a two vector permutation using 2 intra-lane interleave insns
48241 and cross-lane shuffle for 32-byte vectors. */
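/* For instance, the two-operand V8SFmode permutation
   { 0 8 1 9 2 10 3 11 } is the full-width interleave-low and is emitted
   via gen_vec_interleave_lowv8sf, which expands to the in-lane unpacks
   plus the cross-lane shuffle described above. */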
48243 static bool
48244 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
48246 unsigned i, nelt;
48247 rtx (*gen) (rtx, rtx, rtx);
48249 if (d->one_operand_p)
48250 return false;
48251 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
48253 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
48255 else
48256 return false;
48258 nelt = d->nelt;
48259 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
48260 return false;
48261 for (i = 0; i < nelt; i += 2)
48262 if (d->perm[i] != d->perm[0] + i / 2
48263 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
48264 return false;
48266 if (d->testing_p)
48267 return true;
48269 switch (d->vmode)
48271 case V32QImode:
48272 if (d->perm[0])
48273 gen = gen_vec_interleave_highv32qi;
48274 else
48275 gen = gen_vec_interleave_lowv32qi;
48276 break;
48277 case V16HImode:
48278 if (d->perm[0])
48279 gen = gen_vec_interleave_highv16hi;
48280 else
48281 gen = gen_vec_interleave_lowv16hi;
48282 break;
48283 case V8SImode:
48284 if (d->perm[0])
48285 gen = gen_vec_interleave_highv8si;
48286 else
48287 gen = gen_vec_interleave_lowv8si;
48288 break;
48289 case V4DImode:
48290 if (d->perm[0])
48291 gen = gen_vec_interleave_highv4di;
48292 else
48293 gen = gen_vec_interleave_lowv4di;
48294 break;
48295 case V8SFmode:
48296 if (d->perm[0])
48297 gen = gen_vec_interleave_highv8sf;
48298 else
48299 gen = gen_vec_interleave_lowv8sf;
48300 break;
48301 case V4DFmode:
48302 if (d->perm[0])
48303 gen = gen_vec_interleave_highv4df;
48304 else
48305 gen = gen_vec_interleave_lowv4df;
48306 break;
48307 default:
48308 gcc_unreachable ();
48311 emit_insn (gen (d->target, d->op0, d->op1));
48312 return true;
48315 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
48316 a single vector permutation using a single intra-lane vector
48317 permutation, vperm2f128 swapping the lanes and vblend* insn blending
48318 the non-swapped and swapped vectors together. */
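/* Worked example: for the one-operand V4DFmode permutation { 2 1 0 3 },
   dfirst ends up as the identity, dsecond swaps the two 128-bit lanes,
   and vblendpd with mask 0b0101 takes positions 0 and 2 from the
   lane-swapped copy. */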
48320 static bool
48321 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
48323 struct expand_vec_perm_d dfirst, dsecond;
48324 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
48325 rtx_insn *seq;
48326 bool ok;
48327 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
48329 if (!TARGET_AVX
48330 || TARGET_AVX2
48331 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
48332 || !d->one_operand_p)
48333 return false;
48335 dfirst = *d;
48336 for (i = 0; i < nelt; i++)
48337 dfirst.perm[i] = 0xff;
48338 for (i = 0, msk = 0; i < nelt; i++)
48340 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
48341 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
48342 return false;
48343 dfirst.perm[j] = d->perm[i];
48344 if (j != i)
48345 msk |= (1 << i);
48347 for (i = 0; i < nelt; i++)
48348 if (dfirst.perm[i] == 0xff)
48349 dfirst.perm[i] = i;
48351 if (!d->testing_p)
48352 dfirst.target = gen_reg_rtx (dfirst.vmode);
48354 start_sequence ();
48355 ok = expand_vec_perm_1 (&dfirst);
48356 seq = get_insns ();
48357 end_sequence ();
48359 if (!ok)
48360 return false;
48362 if (d->testing_p)
48363 return true;
48365 emit_insn (seq);
48367 dsecond = *d;
48368 dsecond.op0 = dfirst.target;
48369 dsecond.op1 = dfirst.target;
48370 dsecond.one_operand_p = true;
48371 dsecond.target = gen_reg_rtx (dsecond.vmode);
48372 for (i = 0; i < nelt; i++)
48373 dsecond.perm[i] = i ^ nelt2;
48375 ok = expand_vec_perm_1 (&dsecond);
48376 gcc_assert (ok);
48378 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
48379 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
48380 return true;
48383 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
48384 permutation using two vperm2f128, followed by a vshufpd insn blending
48385 the two vectors together. */
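/* As an illustration, for d->perm == { 3 4 1 6 } this builds
   dfirst = { 2 3 0 1 } (a lane swap of op0), dsecond = { 4 5 6 7 }
   (a copy of op1) and dthird = { 1 4 3 6 }, the final vshufpd that blends
   the two intermediate results. */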
48387 static bool
48388 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
48390 struct expand_vec_perm_d dfirst, dsecond, dthird;
48391 bool ok;
48393 if (!TARGET_AVX || (d->vmode != V4DFmode))
48394 return false;
48396 if (d->testing_p)
48397 return true;
48399 dfirst = *d;
48400 dsecond = *d;
48401 dthird = *d;
48403 dfirst.perm[0] = (d->perm[0] & ~1);
48404 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
48405 dfirst.perm[2] = (d->perm[2] & ~1);
48406 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
48407 dsecond.perm[0] = (d->perm[1] & ~1);
48408 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
48409 dsecond.perm[2] = (d->perm[3] & ~1);
48410 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
48411 dthird.perm[0] = (d->perm[0] % 2);
48412 dthird.perm[1] = (d->perm[1] % 2) + 4;
48413 dthird.perm[2] = (d->perm[2] % 2) + 2;
48414 dthird.perm[3] = (d->perm[3] % 2) + 6;
48416 dfirst.target = gen_reg_rtx (dfirst.vmode);
48417 dsecond.target = gen_reg_rtx (dsecond.vmode);
48418 dthird.op0 = dfirst.target;
48419 dthird.op1 = dsecond.target;
48420 dthird.one_operand_p = false;
48422 canonicalize_perm (&dfirst);
48423 canonicalize_perm (&dsecond);
48425 ok = expand_vec_perm_1 (&dfirst)
48426 && expand_vec_perm_1 (&dsecond)
48427 && expand_vec_perm_1 (&dthird);
48429 gcc_assert (ok);
48431 return true;
48434 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
48435 permutation with two pshufb insns and an ior. We should have already
48436 failed all two instruction sequences. */
48438 static bool
48439 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
48441 rtx rperm[2][16], vperm, l, h, op, m128;
48442 unsigned int i, nelt, eltsz;
48444 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
48445 return false;
48446 gcc_assert (!d->one_operand_p);
48448 if (d->testing_p)
48449 return true;
48451 nelt = d->nelt;
48452 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48454 /* Generate two permutation masks. If the required element is within
48455 the given vector it is shuffled into the proper lane. If the required
48456 element is in the other vector, force a zero into the lane by setting
48457 bit 7 in the permutation mask. */
48458 m128 = GEN_INT (-128);
48459 for (i = 0; i < nelt; ++i)
48461 unsigned j, e = d->perm[i];
48462 unsigned which = (e >= nelt);
48463 if (e >= nelt)
48464 e -= nelt;
48466 for (j = 0; j < eltsz; ++j)
48468 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
48469 rperm[1-which][i*eltsz + j] = m128;
48473 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
48474 vperm = force_reg (V16QImode, vperm);
48476 l = gen_reg_rtx (V16QImode);
48477 op = gen_lowpart (V16QImode, d->op0);
48478 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
48480 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
48481 vperm = force_reg (V16QImode, vperm);
48483 h = gen_reg_rtx (V16QImode);
48484 op = gen_lowpart (V16QImode, d->op1);
48485 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
48487 op = d->target;
48488 if (d->vmode != V16QImode)
48489 op = gen_reg_rtx (V16QImode);
48490 emit_insn (gen_iorv16qi3 (op, l, h));
48491 if (op != d->target)
48492 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48494 return true;
48497 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
48498 with two vpshufb insns, vpermq and vpor. We should have already failed
48499 all two or three instruction sequences. */
48501 static bool
48502 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
48504 rtx rperm[2][32], vperm, l, h, hp, op, m128;
48505 unsigned int i, nelt, eltsz;
48507 if (!TARGET_AVX2
48508 || !d->one_operand_p
48509 || (d->vmode != V32QImode && d->vmode != V16HImode))
48510 return false;
48512 if (d->testing_p)
48513 return true;
48515 nelt = d->nelt;
48516 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48518 /* Generate two permutation masks. If the required element is within
48519 the same lane, it is shuffled in. If the required element is from the
48520 other lane, force a zero by setting bit 7 in the permutation mask.
48521 The other mask has a non-negative element whenever an element
48522 is requested from the other lane; that element is also moved to the
48523 other lane, so that the result of vpshufb can have its two V2TImode
48524 halves swapped. */
48525 m128 = GEN_INT (-128);
48526 for (i = 0; i < nelt; ++i)
48528 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48529 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48531 for (j = 0; j < eltsz; ++j)
48533 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
48534 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
48538 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48539 vperm = force_reg (V32QImode, vperm);
48541 h = gen_reg_rtx (V32QImode);
48542 op = gen_lowpart (V32QImode, d->op0);
48543 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48545 /* Swap the 128-bit lanes of h into hp. */
48546 hp = gen_reg_rtx (V4DImode);
48547 op = gen_lowpart (V4DImode, h);
48548 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
48549 const1_rtx));
48551 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48552 vperm = force_reg (V32QImode, vperm);
48554 l = gen_reg_rtx (V32QImode);
48555 op = gen_lowpart (V32QImode, d->op0);
48556 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48558 op = d->target;
48559 if (d->vmode != V32QImode)
48560 op = gen_reg_rtx (V32QImode);
48561 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
48562 if (op != d->target)
48563 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48565 return true;
48568 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48569 and extract-odd permutations of two V32QImode or V16HImode operands
48570 with two vpshufb insns, vpor and vpermq. We should have already
48571 failed all two or three instruction sequences. */
48573 static bool
48574 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
48576 rtx rperm[2][32], vperm, l, h, ior, op, m128;
48577 unsigned int i, nelt, eltsz;
48579 if (!TARGET_AVX2
48580 || d->one_operand_p
48581 || (d->vmode != V32QImode && d->vmode != V16HImode))
48582 return false;
48584 for (i = 0; i < d->nelt; ++i)
48585 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
48586 return false;
48588 if (d->testing_p)
48589 return true;
48591 nelt = d->nelt;
48592 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48594 /* Generate two permutation masks. In the first permutation mask
48595 the first quarter will contain indexes for the first half
48596 of the op0, the second quarter will contain bit 7 set, third quarter
48597 will contain indexes for the second half of the op0 and the
48598 last quarter bit 7 set. In the second permutation mask
48599 the first quarter will contain bit 7 set, the second quarter
48600 indexes for the first half of the op1, the third quarter bit 7 set
48601 and last quarter indexes for the second half of the op1.
48602 I.e. the first mask e.g. for V32QImode extract even will be:
48603 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
48604 (all values masked with 0xf except for -128) and second mask
48605 for extract even will be
48606 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
48607 m128 = GEN_INT (-128);
48608 for (i = 0; i < nelt; ++i)
48610 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48611 unsigned which = d->perm[i] >= nelt;
48612 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
48614 for (j = 0; j < eltsz; ++j)
48616 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
48617 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
48621 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48622 vperm = force_reg (V32QImode, vperm);
48624 l = gen_reg_rtx (V32QImode);
48625 op = gen_lowpart (V32QImode, d->op0);
48626 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48628 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48629 vperm = force_reg (V32QImode, vperm);
48631 h = gen_reg_rtx (V32QImode);
48632 op = gen_lowpart (V32QImode, d->op1);
48633 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48635 ior = gen_reg_rtx (V32QImode);
48636 emit_insn (gen_iorv32qi3 (ior, l, h));
48638 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
48639 op = gen_reg_rtx (V4DImode);
48640 ior = gen_lowpart (V4DImode, ior);
48641 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
48642 const1_rtx, GEN_INT (3)));
48643 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48645 return true;
48648 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48649 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
48650 with two "and" and "pack" or two "shift" and "pack" insns. We should
48651 have already failed all two instruction sequences. */
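/* E.g. the V16QImode extract-even permutation { 0 2 4 ... 30 } views both
   operands as V8HImode, masks each word with 0x00ff, and packuswb then
   concatenates the surviving low bytes, i.e. the even bytes of op0
   followed by the even bytes of op1; extract-odd shifts each word right
   by 8 instead of masking. */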
48653 static bool
48654 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
48656 rtx op, dop0, dop1, t, rperm[16];
48657 unsigned i, odd, c, s, nelt = d->nelt;
48658 bool end_perm = false;
48659 machine_mode half_mode;
48660 rtx (*gen_and) (rtx, rtx, rtx);
48661 rtx (*gen_pack) (rtx, rtx, rtx);
48662 rtx (*gen_shift) (rtx, rtx, rtx);
48664 if (d->one_operand_p)
48665 return false;
48667 switch (d->vmode)
48669 case V8HImode:
48670 /* Required for "pack". */
48671 if (!TARGET_SSE4_1)
48672 return false;
48673 c = 0xffff;
48674 s = 16;
48675 half_mode = V4SImode;
48676 gen_and = gen_andv4si3;
48677 gen_pack = gen_sse4_1_packusdw;
48678 gen_shift = gen_lshrv4si3;
48679 break;
48680 case V16QImode:
48681 /* No check as all instructions are SSE2. */
48682 c = 0xff;
48683 s = 8;
48684 half_mode = V8HImode;
48685 gen_and = gen_andv8hi3;
48686 gen_pack = gen_sse2_packuswb;
48687 gen_shift = gen_lshrv8hi3;
48688 break;
48689 case V16HImode:
48690 if (!TARGET_AVX2)
48691 return false;
48692 c = 0xffff;
48693 s = 16;
48694 half_mode = V8SImode;
48695 gen_and = gen_andv8si3;
48696 gen_pack = gen_avx2_packusdw;
48697 gen_shift = gen_lshrv8si3;
48698 end_perm = true;
48699 break;
48700 case V32QImode:
48701 if (!TARGET_AVX2)
48702 return false;
48703 c = 0xff;
48704 s = 8;
48705 half_mode = V16HImode;
48706 gen_and = gen_andv16hi3;
48707 gen_pack = gen_avx2_packuswb;
48708 gen_shift = gen_lshrv16hi3;
48709 end_perm = true;
48710 break;
48711 default:
48712 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
48713 general shuffles. */
48714 return false;
48717 /* Check that permutation is even or odd. */
48718 odd = d->perm[0];
48719 if (odd > 1)
48720 return false;
48722 for (i = 1; i < nelt; ++i)
48723 if (d->perm[i] != 2 * i + odd)
48724 return false;
48726 if (d->testing_p)
48727 return true;
48729 dop0 = gen_reg_rtx (half_mode);
48730 dop1 = gen_reg_rtx (half_mode);
48731 if (odd == 0)
48733 for (i = 0; i < nelt / 2; i++)
48734 rperm[i] = GEN_INT (c);
48735 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
48736 t = force_reg (half_mode, t);
48737 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
48738 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
48740 else
48742 emit_insn (gen_shift (dop0,
48743 gen_lowpart (half_mode, d->op0),
48744 GEN_INT (s)));
48745 emit_insn (gen_shift (dop1,
48746 gen_lowpart (half_mode, d->op1),
48747 GEN_INT (s)));
48749 /* In the AVX2 256-bit case we need to permute the pack result. */
48750 if (TARGET_AVX2 && end_perm)
48752 op = gen_reg_rtx (d->vmode);
48753 t = gen_reg_rtx (V4DImode);
48754 emit_insn (gen_pack (op, dop0, dop1));
48755 emit_insn (gen_avx2_permv4di_1 (t,
48756 gen_lowpart (V4DImode, op),
48757 const0_rtx,
48758 const2_rtx,
48759 const1_rtx,
48760 GEN_INT (3)));
48761 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
48763 else
48764 emit_insn (gen_pack (d->target, dop0, dop1));
48766 return true;
48769 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48770 and extract-odd permutations of two V64QI operands
48771 with two "shift", two "trunc" and one "concat" insns for "odd"
48772 and two "trunc" and one "concat" insn for "even".
48773 We should have already failed all two instruction sequences. */
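/* E.g. V64QImode extract-even views each operand as V32HImode and
   truncates it to V32QImode (a vpmovwb-style truncation keeping the low
   byte of every word) before concatenating the two halves; extract-odd
   first shifts every word right by 8. */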
48775 static bool
48776 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
48778 rtx t1, t2, t3, t4;
48779 unsigned i, odd, nelt = d->nelt;
48781 if (!TARGET_AVX512BW
48782 || d->one_operand_p
48783 || d->vmode != V64QImode)
48784 return false;
48786 /* Check that permutation is even or odd. */
48787 odd = d->perm[0];
48788 if (odd > 1)
48789 return false;
48791 for (i = 1; i < nelt; ++i)
48792 if (d->perm[i] != 2 * i + odd)
48793 return false;
48795 if (d->testing_p)
48796 return true;
48799 if (odd)
48801 t1 = gen_reg_rtx (V32HImode);
48802 t2 = gen_reg_rtx (V32HImode);
48803 emit_insn (gen_lshrv32hi3 (t1,
48804 gen_lowpart (V32HImode, d->op0),
48805 GEN_INT (8)));
48806 emit_insn (gen_lshrv32hi3 (t2,
48807 gen_lowpart (V32HImode, d->op1),
48808 GEN_INT (8)));
48810 else
48812 t1 = gen_lowpart (V32HImode, d->op0);
48813 t2 = gen_lowpart (V32HImode, d->op1);
48816 t3 = gen_reg_rtx (V32QImode);
48817 t4 = gen_reg_rtx (V32QImode);
48818 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
48819 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
48820 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48822 return true;
48825 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
48826 and extract-odd permutations. */
48828 static bool
48829 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
48831 rtx t1, t2, t3, t4, t5;
48833 switch (d->vmode)
48835 case V4DFmode:
48836 if (d->testing_p)
48837 break;
48838 t1 = gen_reg_rtx (V4DFmode);
48839 t2 = gen_reg_rtx (V4DFmode);
48841 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48842 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
48843 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
48845 /* Now an unpck[lh]pd will produce the result required. */
48846 if (odd)
48847 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
48848 else
48849 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
48850 emit_insn (t3);
48851 break;
48853 case V8SFmode:
48855 int mask = odd ? 0xdd : 0x88;
48857 if (d->testing_p)
48858 break;
48859 t1 = gen_reg_rtx (V8SFmode);
48860 t2 = gen_reg_rtx (V8SFmode);
48861 t3 = gen_reg_rtx (V8SFmode);
48863 /* Shuffle within the 128-bit lanes to produce:
48864 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
48865 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
48866 GEN_INT (mask)));
48868 /* Shuffle the lanes around to produce:
48869 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
48870 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
48871 GEN_INT (0x3)));
48873 /* Shuffle within the 128-bit lanes to produce:
48874 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
48875 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
48877 /* Shuffle within the 128-bit lanes to produce:
48878 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
48879 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
48881 /* Shuffle the lanes around to produce:
48882 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
48883 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
48884 GEN_INT (0x20)));
48886 break;
48888 case V2DFmode:
48889 case V4SFmode:
48890 case V2DImode:
48891 case V4SImode:
48892 /* These are always directly implementable by expand_vec_perm_1. */
48893 gcc_unreachable ();
48895 case V8HImode:
48896 if (TARGET_SSE4_1)
48897 return expand_vec_perm_even_odd_pack (d);
48898 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48899 return expand_vec_perm_pshufb2 (d);
48900 else
48902 if (d->testing_p)
48903 break;
48904 /* We need 2*log2(N)-1 operations to achieve odd/even
48905 with interleave. */
48906 t1 = gen_reg_rtx (V8HImode);
48907 t2 = gen_reg_rtx (V8HImode);
48908 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48909 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48910 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48911 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
48912 if (odd)
48913 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48914 else
48915 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48916 emit_insn (t3);
48918 break;
48920 case V16QImode:
48921 return expand_vec_perm_even_odd_pack (d);
48923 case V16HImode:
48924 case V32QImode:
48925 return expand_vec_perm_even_odd_pack (d);
48927 case V64QImode:
48928 return expand_vec_perm_even_odd_trunc (d);
48930 case V4DImode:
48931 if (!TARGET_AVX2)
48933 struct expand_vec_perm_d d_copy = *d;
48934 d_copy.vmode = V4DFmode;
48935 if (d->testing_p)
48936 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48937 else
48938 d_copy.target = gen_reg_rtx (V4DFmode);
48939 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48940 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48941 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48943 if (!d->testing_p)
48944 emit_move_insn (d->target,
48945 gen_lowpart (V4DImode, d_copy.target));
48946 return true;
48948 return false;
48951 if (d->testing_p)
48952 break;
48954 t1 = gen_reg_rtx (V4DImode);
48955 t2 = gen_reg_rtx (V4DImode);
48957 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48958 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48959 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48961 /* Now a vpunpck[lh]qdq will produce the result required. */
48962 if (odd)
48963 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48964 else
48965 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48966 emit_insn (t3);
48967 break;
48969 case V8SImode:
48970 if (!TARGET_AVX2)
48972 struct expand_vec_perm_d d_copy = *d;
48973 d_copy.vmode = V8SFmode;
48974 if (d->testing_p)
48975 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48976 else
48977 d_copy.target = gen_reg_rtx (V8SFmode);
48978 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48979 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48980 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48982 if (!d->testing_p)
48983 emit_move_insn (d->target,
48984 gen_lowpart (V8SImode, d_copy.target));
48985 return true;
48987 return false;
48990 if (d->testing_p)
48991 break;
48993 t1 = gen_reg_rtx (V8SImode);
48994 t2 = gen_reg_rtx (V8SImode);
48995 t3 = gen_reg_rtx (V4DImode);
48996 t4 = gen_reg_rtx (V4DImode);
48997 t5 = gen_reg_rtx (V4DImode);
48999 /* Shuffle the lanes around into
49000 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
49001 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
49002 gen_lowpart (V4DImode, d->op1),
49003 GEN_INT (0x20)));
49004 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
49005 gen_lowpart (V4DImode, d->op1),
49006 GEN_INT (0x31)));
49008 /* Swap the 2nd and 3rd position in each lane into
49009 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
49010 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
49011 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
49012 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
49013 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
49015 /* Now a vpunpck[lh]qdq will produce
49016 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
49017 if (odd)
49018 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
49019 gen_lowpart (V4DImode, t2));
49020 else
49021 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
49022 gen_lowpart (V4DImode, t2));
49023 emit_insn (t3);
49024 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
49025 break;
49027 default:
49028 gcc_unreachable ();
49031 return true;
49034 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49035 extract-even and extract-odd permutations. */
49037 static bool
49038 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
49040 unsigned i, odd, nelt = d->nelt;
49042 odd = d->perm[0];
49043 if (odd != 0 && odd != 1)
49044 return false;
49046 for (i = 1; i < nelt; ++i)
49047 if (d->perm[i] != 2 * i + odd)
49048 return false;
49050 return expand_vec_perm_even_odd_1 (d, odd);
49053 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
49054 permutations. We assume that expand_vec_perm_1 has already failed. */
49056 static bool
49057 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
49059 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
49060 machine_mode vmode = d->vmode;
49061 unsigned char perm2[4];
49062 rtx op0 = d->op0, dest;
49063 bool ok;
49065 switch (vmode)
49067 case V4DFmode:
49068 case V8SFmode:
49069 /* These are special-cased in sse.md so that we can optionally
49070 use the vbroadcast instruction. They expand to two insns
49071 if the input happens to be in a register. */
49072 gcc_unreachable ();
49074 case V2DFmode:
49075 case V2DImode:
49076 case V4SFmode:
49077 case V4SImode:
49078 /* These are always implementable using standard shuffle patterns. */
49079 gcc_unreachable ();
49081 case V8HImode:
49082 case V16QImode:
49083 /* These can be implemented via interleave. We save one insn by
49084 stopping once we have promoted to V4SImode and then use pshufd. */
49085 if (d->testing_p)
49086 return true;
49089 rtx dest;
49090 rtx (*gen) (rtx, rtx, rtx)
49091 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
49092 : gen_vec_interleave_lowv8hi;
49094 if (elt >= nelt2)
49096 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
49097 : gen_vec_interleave_highv8hi;
49098 elt -= nelt2;
49100 nelt2 /= 2;
49102 dest = gen_reg_rtx (vmode);
49103 emit_insn (gen (dest, op0, op0));
49104 vmode = get_mode_wider_vector (vmode);
49105 op0 = gen_lowpart (vmode, dest);
49107 while (vmode != V4SImode);
49109 memset (perm2, elt, 4);
49110 dest = gen_reg_rtx (V4SImode);
49111 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
49112 gcc_assert (ok);
49113 if (!d->testing_p)
49114 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
49115 return true;
49117 case V64QImode:
49118 case V32QImode:
49119 case V16HImode:
49120 case V8SImode:
49121 case V4DImode:
49122 /* For AVX2 broadcasts of the first element vpbroadcast* or
49123 vpermq should be used by expand_vec_perm_1. */
49124 gcc_assert (!TARGET_AVX2 || d->perm[0]);
49125 return false;
49127 default:
49128 gcc_unreachable ();
49132 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49133 broadcast permutations. */
49135 static bool
49136 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
49138 unsigned i, elt, nelt = d->nelt;
49140 if (!d->one_operand_p)
49141 return false;
49143 elt = d->perm[0];
49144 for (i = 1; i < nelt; ++i)
49145 if (d->perm[i] != elt)
49146 return false;
49148 return expand_vec_perm_broadcast_1 (d);
49151 /* Implement arbitrary permutations of two V64QImode operands
49152 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
49153 static bool
49154 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
49156 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
49157 return false;
49159 if (d->testing_p)
49160 return true;
49162 struct expand_vec_perm_d ds[2];
49163 rtx rperm[128], vperm, target0, target1;
49164 unsigned int i, nelt;
49165 machine_mode vmode;
49167 nelt = d->nelt;
49168 vmode = V64QImode;
49170 for (i = 0; i < 2; i++)
49172 ds[i] = *d;
49173 ds[i].vmode = V32HImode;
49174 ds[i].nelt = 32;
49175 ds[i].target = gen_reg_rtx (V32HImode);
49176 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
49177 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
49180 /* Prepare permutations such that the first one (ds[0]) takes care of
49181 putting the even bytes into the right positions or one position
49182 higher, and the second one (ds[1]) takes care of
49183 putting the odd bytes into the right positions or one position
49184 lower. */
49186 for (i = 0; i < nelt; i++)
49188 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
49189 if (i & 1)
49191 rperm[i] = constm1_rtx;
49192 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49194 else
49196 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49197 rperm[i + 64] = constm1_rtx;
49201 bool ok = expand_vec_perm_1 (&ds[0]);
49202 gcc_assert (ok);
49203 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
49205 ok = expand_vec_perm_1 (&ds[1]);
49206 gcc_assert (ok);
49207 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
49209 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
49210 vperm = force_reg (vmode, vperm);
49211 target0 = gen_reg_rtx (V64QImode);
49212 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
49214 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
49215 vperm = force_reg (vmode, vperm);
49216 target1 = gen_reg_rtx (V64QImode);
49217 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
49219 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
49220 return true;
49223 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
49224 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
49225 all the shorter instruction sequences. */
49227 static bool
49228 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
49230 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
49231 unsigned int i, nelt, eltsz;
49232 bool used[4];
49234 if (!TARGET_AVX2
49235 || d->one_operand_p
49236 || (d->vmode != V32QImode && d->vmode != V16HImode))
49237 return false;
49239 if (d->testing_p)
49240 return true;
49242 nelt = d->nelt;
49243 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
49245 /* Generate 4 permutation masks. If the required element is within
49246 the same lane, it is shuffled in. If the required element is from the
49247 other lane, force a zero by setting bit 7 in the permutation mask.
49248 The other mask has a non-negative element whenever an element
49249 is requested from the other lane; that element is also moved to the
49250 other lane, so that the result of vpshufb can have its two V2TImode
49251 halves swapped. */
49252 m128 = GEN_INT (-128);
49253 for (i = 0; i < 32; ++i)
49255 rperm[0][i] = m128;
49256 rperm[1][i] = m128;
49257 rperm[2][i] = m128;
49258 rperm[3][i] = m128;
49260 used[0] = false;
49261 used[1] = false;
49262 used[2] = false;
49263 used[3] = false;
49264 for (i = 0; i < nelt; ++i)
49266 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
49267 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
49268 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
49270 for (j = 0; j < eltsz; ++j)
49271 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
49272 used[which] = true;
49275 for (i = 0; i < 2; ++i)
49277 if (!used[2 * i + 1])
49279 h[i] = NULL_RTX;
49280 continue;
49282 vperm = gen_rtx_CONST_VECTOR (V32QImode,
49283 gen_rtvec_v (32, rperm[2 * i + 1]));
49284 vperm = force_reg (V32QImode, vperm);
49285 h[i] = gen_reg_rtx (V32QImode);
49286 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49287 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
49290 /* Swap the 128-bit lanes of h[X]. */
49291 for (i = 0; i < 2; ++i)
49293 if (h[i] == NULL_RTX)
49294 continue;
49295 op = gen_reg_rtx (V4DImode);
49296 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
49297 const2_rtx, GEN_INT (3), const0_rtx,
49298 const1_rtx));
49299 h[i] = gen_lowpart (V32QImode, op);
49302 for (i = 0; i < 2; ++i)
49304 if (!used[2 * i])
49306 l[i] = NULL_RTX;
49307 continue;
49309 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
49310 vperm = force_reg (V32QImode, vperm);
49311 l[i] = gen_reg_rtx (V32QImode);
49312 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49313 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
49316 for (i = 0; i < 2; ++i)
49318 if (h[i] && l[i])
49320 op = gen_reg_rtx (V32QImode);
49321 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
49322 l[i] = op;
49324 else if (h[i])
49325 l[i] = h[i];
49328 gcc_assert (l[0] && l[1]);
49329 op = d->target;
49330 if (d->vmode != V32QImode)
49331 op = gen_reg_rtx (V32QImode);
49332 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
49333 if (op != d->target)
49334 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
49335 return true;
49338 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
49339 With all of the interface bits taken care of, perform the expansion
49340 in D and return true on success. */
49342 static bool
49343 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
49345 /* Try a single instruction expansion. */
49346 if (expand_vec_perm_1 (d))
49347 return true;
49349 /* Try sequences of two instructions. */
49351 if (expand_vec_perm_pshuflw_pshufhw (d))
49352 return true;
49354 if (expand_vec_perm_palignr (d, false))
49355 return true;
49357 if (expand_vec_perm_interleave2 (d))
49358 return true;
49360 if (expand_vec_perm_broadcast (d))
49361 return true;
49363 if (expand_vec_perm_vpermq_perm_1 (d))
49364 return true;
49366 if (expand_vec_perm_vperm2f128 (d))
49367 return true;
49369 if (expand_vec_perm_pblendv (d))
49370 return true;
49372 /* Try sequences of three instructions. */
49374 if (expand_vec_perm_even_odd_pack (d))
49375 return true;
49377 if (expand_vec_perm_2vperm2f128_vshuf (d))
49378 return true;
49380 if (expand_vec_perm_pshufb2 (d))
49381 return true;
49383 if (expand_vec_perm_interleave3 (d))
49384 return true;
49386 if (expand_vec_perm_vperm2f128_vblend (d))
49387 return true;
49389 /* Try sequences of four instructions. */
49391 if (expand_vec_perm_even_odd_trunc (d))
49392 return true;
49393 if (expand_vec_perm_vpshufb2_vpermq (d))
49394 return true;
49396 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
49397 return true;
49399 if (expand_vec_perm_vpermi2_vpshub2 (d))
49400 return true;
49402 /* ??? Look for narrow permutations whose element orderings would
49403 allow the promotion to a wider mode. */
49405 /* ??? Look for sequences of interleave or a wider permute that place
49406 the data into the correct lanes for a half-vector shuffle like
49407 pshuf[lh]w or vpermilps. */
49409 /* ??? Look for sequences of interleave that produce the desired results.
49410 The combinatorics of punpck[lh] get pretty ugly... */
49412 if (expand_vec_perm_even_odd (d))
49413 return true;
49415 /* Even longer sequences. */
49416 if (expand_vec_perm_vpshufb4_vpermq2 (d))
49417 return true;
49419 /* See if we can get the same permutation in different vector integer
49420 mode. */
49421 struct expand_vec_perm_d nd;
49422 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
49424 if (!d->testing_p)
49425 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
49426 return true;
49429 return false;
49432 /* If a permutation only uses one operand, make it clear. Returns true
49433 if the permutation references both operands. */
49435 static bool
49436 canonicalize_perm (struct expand_vec_perm_d *d)
49438 int i, which, nelt = d->nelt;
49440 for (i = which = 0; i < nelt; ++i)
49441 which |= (d->perm[i] < nelt ? 1 : 2);
49443 d->one_operand_p = true;
49444 switch (which)
49446 default:
49447 gcc_unreachable();
49449 case 3:
49450 if (!rtx_equal_p (d->op0, d->op1))
49452 d->one_operand_p = false;
49453 break;
49455 /* The elements of PERM do not suggest that only the first operand
49456 is used, but both operands are identical. Allow easier matching
49457 of the permutation by folding the permutation into the single
49458 input vector. */
49459 /* FALLTHRU */
49461 case 2:
49462 for (i = 0; i < nelt; ++i)
49463 d->perm[i] &= nelt - 1;
49464 d->op0 = d->op1;
49465 break;
49467 case 1:
49468 d->op1 = d->op0;
49469 break;
49472 return (which == 3);
49475 bool
49476 ix86_expand_vec_perm_const (rtx operands[4])
49478 struct expand_vec_perm_d d;
49479 unsigned char perm[MAX_VECT_LEN];
49480 int i, nelt;
49481 bool two_args;
49482 rtx sel;
49484 d.target = operands[0];
49485 d.op0 = operands[1];
49486 d.op1 = operands[2];
49487 sel = operands[3];
49489 d.vmode = GET_MODE (d.target);
49490 gcc_assert (VECTOR_MODE_P (d.vmode));
49491 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49492 d.testing_p = false;
49494 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
49495 gcc_assert (XVECLEN (sel, 0) == nelt);
49496 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
49498 for (i = 0; i < nelt; ++i)
49500 rtx e = XVECEXP (sel, 0, i);
49501 int ei = INTVAL (e) & (2 * nelt - 1);
49502 d.perm[i] = ei;
49503 perm[i] = ei;
49506 two_args = canonicalize_perm (&d);
49508 if (ix86_expand_vec_perm_const_1 (&d))
49509 return true;
49511 /* If the selector says both arguments are needed, but the operands are the
49512 same, the above tried to expand with one_operand_p and flattened selector.
49513 If that didn't work, retry without one_operand_p; we succeeded with that
49514 during testing. */
49515 if (two_args && d.one_operand_p)
49517 d.one_operand_p = false;
49518 memcpy (d.perm, perm, sizeof (perm));
49519 return ix86_expand_vec_perm_const_1 (&d);
49522 return false;
49525 /* Implement targetm.vectorize.vec_perm_const_ok. */
49527 static bool
49528 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
49529 const unsigned char *sel)
49531 struct expand_vec_perm_d d;
49532 unsigned int i, nelt, which;
49533 bool ret;
49535 d.vmode = vmode;
49536 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49537 d.testing_p = true;
49539 /* Given sufficient ISA support we can just return true here
49540 for selected vector modes. */
49541 switch (d.vmode)
49543 case V16SFmode:
49544 case V16SImode:
49545 case V8DImode:
49546 case V8DFmode:
49547 if (TARGET_AVX512F)
49548 /* All implementable with a single vpermi2 insn. */
49549 return true;
49550 break;
49551 case V32HImode:
49552 if (TARGET_AVX512BW)
49553 /* All implementable with a single vpermi2 insn. */
49554 return true;
49555 break;
49556 case V64QImode:
49557 if (TARGET_AVX512BW)
49558 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
49559 return true;
49560 break;
49561 case V8SImode:
49562 case V8SFmode:
49563 case V4DFmode:
49564 case V4DImode:
49565 if (TARGET_AVX512VL)
49566 /* All implementable with a single vpermi2 insn. */
49567 return true;
49568 break;
49569 case V16HImode:
49570 if (TARGET_AVX2)
49571 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49572 return true;
49573 break;
49574 case V32QImode:
49575 if (TARGET_AVX2)
49576 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49577 return true;
49578 break;
49579 case V4SImode:
49580 case V4SFmode:
49581 case V8HImode:
49582 case V16QImode:
49583 /* All implementable with a single vpperm insn. */
49584 if (TARGET_XOP)
49585 return true;
49586 /* All implementable with 2 pshufb + 1 ior. */
49587 if (TARGET_SSSE3)
49588 return true;
49589 break;
49590 case V2DImode:
49591 case V2DFmode:
49592 /* All implementable with shufpd or unpck[lh]pd. */
49593 return true;
49594 default:
49595 return false;
49598 /* Extract the values from the vector CST into the permutation
49599 array in D. */
49600 memcpy (d.perm, sel, nelt);
49601 for (i = which = 0; i < nelt; ++i)
49603 unsigned char e = d.perm[i];
49604 gcc_assert (e < 2 * nelt);
49605 which |= (e < nelt ? 1 : 2);
49608 /* For all elements from the second vector, fold the elements to the first. */
49609 if (which == 2)
49610 for (i = 0; i < nelt; ++i)
49611 d.perm[i] -= nelt;
49613 /* Check whether the mask can be applied to the vector type. */
49614 d.one_operand_p = (which != 3);
49616 /* Implementable with shufps or pshufd. */
49617 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
49618 return true;
49620 /* Otherwise we have to go through the motions and see if we can
49621 figure out how to generate the requested permutation. */
49622 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
49623 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
49624 if (!d.one_operand_p)
49625 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
49627 start_sequence ();
49628 ret = ix86_expand_vec_perm_const_1 (&d);
49629 end_sequence ();
49631 return ret;
49634 void
49635 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
49637 struct expand_vec_perm_d d;
49638 unsigned i, nelt;
49640 d.target = targ;
49641 d.op0 = op0;
49642 d.op1 = op1;
49643 d.vmode = GET_MODE (targ);
49644 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49645 d.one_operand_p = false;
49646 d.testing_p = false;
49648 for (i = 0; i < nelt; ++i)
49649 d.perm[i] = i * 2 + odd;
49651 /* We'll either be able to implement the permutation directly... */
49652 if (expand_vec_perm_1 (&d))
49653 return;
49655 /* ... or we use the special-case patterns. */
49656 expand_vec_perm_even_odd_1 (&d, odd);
49659 static void
49660 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
49662 struct expand_vec_perm_d d;
49663 unsigned i, nelt, base;
49664 bool ok;
49666 d.target = targ;
49667 d.op0 = op0;
49668 d.op1 = op1;
49669 d.vmode = GET_MODE (targ);
49670 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49671 d.one_operand_p = false;
49672 d.testing_p = false;
49674 base = high_p ? nelt / 2 : 0;
49675 for (i = 0; i < nelt / 2; ++i)
49677 d.perm[i * 2] = i + base;
49678 d.perm[i * 2 + 1] = i + base + nelt;
49681 /* Note that for AVX this isn't one instruction. */
49682 ok = ix86_expand_vec_perm_const_1 (&d);
49683 gcc_assert (ok);
49687 /* Expand a vector operation CODE for a V*QImode in terms of the
49688 same operation on V*HImode. */
49690 void
49691 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
49693 machine_mode qimode = GET_MODE (dest);
49694 machine_mode himode;
49695 rtx (*gen_il) (rtx, rtx, rtx);
49696 rtx (*gen_ih) (rtx, rtx, rtx);
49697 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
49698 struct expand_vec_perm_d d;
49699 bool ok, full_interleave;
49700 bool uns_p = false;
49701 int i;
49703 switch (qimode)
49705 case V16QImode:
49706 himode = V8HImode;
49707 gen_il = gen_vec_interleave_lowv16qi;
49708 gen_ih = gen_vec_interleave_highv16qi;
49709 break;
49710 case V32QImode:
49711 himode = V16HImode;
49712 gen_il = gen_avx2_interleave_lowv32qi;
49713 gen_ih = gen_avx2_interleave_highv32qi;
49714 break;
49715 case V64QImode:
49716 himode = V32HImode;
49717 gen_il = gen_avx512bw_interleave_lowv64qi;
49718 gen_ih = gen_avx512bw_interleave_highv64qi;
49719 break;
49720 default:
49721 gcc_unreachable ();
49724 op2_l = op2_h = op2;
49725 switch (code)
49727 case MULT:
49728 /* Unpack data such that we've got a source byte in each low byte of
49729 each word. We don't care what goes into the high byte of each word.
49730 Rather than trying to get zero in there, most convenient is to let
49731 it be a copy of the low byte. */
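      /* (The low byte of each HImode product depends only on the low bytes
	 of the two factors, so whatever ends up in the high bytes cannot
	 affect the bytes that are extracted afterwards.)  */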
49732 op2_l = gen_reg_rtx (qimode);
49733 op2_h = gen_reg_rtx (qimode);
49734 emit_insn (gen_il (op2_l, op2, op2));
49735 emit_insn (gen_ih (op2_h, op2, op2));
49736 /* FALLTHRU */
49738 op1_l = gen_reg_rtx (qimode);
49739 op1_h = gen_reg_rtx (qimode);
49740 emit_insn (gen_il (op1_l, op1, op1));
49741 emit_insn (gen_ih (op1_h, op1, op1));
49742 full_interleave = qimode == V16QImode;
49743 break;
49745 case ASHIFT:
49746 case LSHIFTRT:
49747 uns_p = true;
49748 /* FALLTHRU */
49749 case ASHIFTRT:
49750 op1_l = gen_reg_rtx (himode);
49751 op1_h = gen_reg_rtx (himode);
49752 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
49753 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
49754 full_interleave = true;
49755 break;
49756 default:
49757 gcc_unreachable ();
49760 /* Perform the operation. */
49761 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
49762 1, OPTAB_DIRECT);
49763 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
49764 1, OPTAB_DIRECT);
49765 gcc_assert (res_l && res_h);
49767 /* Merge the data back into the right place. */
49768 d.target = dest;
49769 d.op0 = gen_lowpart (qimode, res_l);
49770 d.op1 = gen_lowpart (qimode, res_h);
49771 d.vmode = qimode;
49772 d.nelt = GET_MODE_NUNITS (qimode);
49773 d.one_operand_p = false;
49774 d.testing_p = false;
49776 if (full_interleave)
49778 /* For SSE2, we used a full interleave, so the desired
49779 results are in the even elements. */
49780 for (i = 0; i < d.nelt; ++i)
49781 d.perm[i] = i * 2;
49783 else
49785 /* For AVX, the interleave used above was not cross-lane. So the
49786 extraction is evens but with the second and third quarter swapped.
49787 Happily, that is even one insn shorter than even extraction.
49788 For AVX512BW we have 4 lanes. We extract evens from within a lane,
49789 always first from the first and then from the second source operand,
49790 the index bits above the low 4 bits remain the same.
49791 Thus, for d.nelt == 32 we want permutation
49792 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
49793 and for d.nelt == 64 we want permutation
49794 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
49795 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
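      /* As a quick check of the formula below, for d.nelt == 32:
	 i == 8 gives ((16) & 14) + 32 + 0 == 32, and i == 24 gives
	 0 + 32 + 16 == 48, matching the sequence above.  */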
49796 for (i = 0; i < d.nelt; ++i)
49797 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
49800 ok = ix86_expand_vec_perm_const_1 (&d);
49801 gcc_assert (ok);
49803 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49804 gen_rtx_fmt_ee (code, qimode, op1, op2));
49807 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
49808 if op is CONST_VECTOR with all odd elements equal to their
49809 preceding element. */
49811 static bool
49812 const_vector_equal_evenodd_p (rtx op)
49814 machine_mode mode = GET_MODE (op);
49815 int i, nunits = GET_MODE_NUNITS (mode);
49816 if (GET_CODE (op) != CONST_VECTOR
49817 || nunits != CONST_VECTOR_NUNITS (op))
49818 return false;
49819 for (i = 0; i < nunits; i += 2)
49820 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
49821 return false;
49822 return true;
49825 void
49826 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
49827 bool uns_p, bool odd_p)
49829 machine_mode mode = GET_MODE (op1);
49830 machine_mode wmode = GET_MODE (dest);
49831 rtx x;
49832 rtx orig_op1 = op1, orig_op2 = op2;
49834 if (!nonimmediate_operand (op1, mode))
49835 op1 = force_reg (mode, op1);
49836 if (!nonimmediate_operand (op2, mode))
49837 op2 = force_reg (mode, op2);
49839 /* We only play even/odd games with vectors of SImode. */
49840 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
49842 /* If we're looking for the odd results, shift those members down to
49843 the even slots. For some cpus this is faster than a PSHUFD. */
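  /* For example, for V4SImode the V2DImode view is shifted right by 32 bits
     below, which moves the odd SImode elements 1 and 3 down into the even
     positions 0 and 2.  */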
49844 if (odd_p)
49846 /* For XOP use vpmacsdqh, but only for smult, as it is only
49847 signed. */
49848 if (TARGET_XOP && mode == V4SImode && !uns_p)
49850 x = force_reg (wmode, CONST0_RTX (wmode));
49851 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
49852 return;
49855 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
49856 if (!const_vector_equal_evenodd_p (orig_op1))
49857 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
49858 x, NULL, 1, OPTAB_DIRECT);
49859 if (!const_vector_equal_evenodd_p (orig_op2))
49860 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
49861 x, NULL, 1, OPTAB_DIRECT);
49862 op1 = gen_lowpart (mode, op1);
49863 op2 = gen_lowpart (mode, op2);
49866 if (mode == V16SImode)
49868 if (uns_p)
49869 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
49870 else
49871 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
49873 else if (mode == V8SImode)
49875 if (uns_p)
49876 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
49877 else
49878 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
49880 else if (uns_p)
49881 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
49882 else if (TARGET_SSE4_1)
49883 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
49884 else
49886 rtx s1, s2, t0, t1, t2;
49888 /* The easiest way to implement this without PMULDQ is to go through
49889 the motions as if we are performing a full 64-bit multiply, with
49890 the exception that we need to do less shuffling of the elements. */
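      /* One way to see that the sequence is correct, with all arithmetic
	 modulo 2^64 per element: write the sign-extended operands as
	 UA + (SA << 32) and UB + (SB << 32), where UA/UB are the unsigned
	 32-bit values and SA/SB are 0 or 0xffffffff.  Then
	   A * B == UA * UB + ((SA * UB + SB * UA) << 32),
	 since the (SA << 32) * (SB << 32) term is shifted out entirely.
	 Below, s1/s2 are SA/SB, t1/t2 the cross products and t0 the low
	 product.  */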
49892 /* Compute the sign-extension, aka highparts, of the two operands. */
49893 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49894 op1, pc_rtx, pc_rtx);
49895 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49896 op2, pc_rtx, pc_rtx);
49898 /* Multiply LO(A) * HI(B), and vice-versa. */
49899 t1 = gen_reg_rtx (wmode);
49900 t2 = gen_reg_rtx (wmode);
49901 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49902 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49904 /* Multiply LO(A) * LO(B). */
49905 t0 = gen_reg_rtx (wmode);
49906 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49908 /* Combine and shift the highparts into place. */
49909 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49910 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49911 1, OPTAB_DIRECT);
49913 /* Combine high and low parts. */
49914 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49915 return;
49917 emit_insn (x);
49920 void
49921 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49922 bool uns_p, bool high_p)
49924 machine_mode wmode = GET_MODE (dest);
49925 machine_mode mode = GET_MODE (op1);
49926 rtx t1, t2, t3, t4, mask;
49928 switch (mode)
49930 case V4SImode:
49931 t1 = gen_reg_rtx (mode);
49932 t2 = gen_reg_rtx (mode);
49933 if (TARGET_XOP && !uns_p)
49935 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
49936 shuffle the elements once so that all elements are in the right
49937 place for immediate use: { A C B D }. */
49938 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49939 const1_rtx, GEN_INT (3)));
49940 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49941 const1_rtx, GEN_INT (3)));
49943 else
49945 /* Put the elements into place for the multiply. */
49946 ix86_expand_vec_interleave (t1, op1, op1, high_p);
49947 ix86_expand_vec_interleave (t2, op2, op2, high_p);
49948 high_p = false;
49950 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49951 break;
49953 case V8SImode:
49954 /* Shuffle the elements between the lanes. After this we
49955 have { A B E F | C D G H } for each operand. */
49956 t1 = gen_reg_rtx (V4DImode);
49957 t2 = gen_reg_rtx (V4DImode);
49958 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49959 const0_rtx, const2_rtx,
49960 const1_rtx, GEN_INT (3)));
49961 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49962 const0_rtx, const2_rtx,
49963 const1_rtx, GEN_INT (3)));
49965 /* Shuffle the elements within the lanes. After this we
49966 have { A A B B | C C D D } or { E E F F | G G H H }. */
49967 t3 = gen_reg_rtx (V8SImode);
49968 t4 = gen_reg_rtx (V8SImode);
49969 mask = GEN_INT (high_p
49970 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49971 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
49972 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49973 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49975 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49976 break;
49978 case V8HImode:
49979 case V16HImode:
49980 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49981 uns_p, OPTAB_DIRECT);
49982 t2 = expand_binop (mode,
49983 uns_p ? umul_highpart_optab : smul_highpart_optab,
49984 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49985 gcc_assert (t1 && t2);
49987 t3 = gen_reg_rtx (mode);
49988 ix86_expand_vec_interleave (t3, t1, t2, high_p);
49989 emit_move_insn (dest, gen_lowpart (wmode, t3));
49990 break;
49992 case V16QImode:
49993 case V32QImode:
49994 case V32HImode:
49995 case V16SImode:
49996 case V64QImode:
49997 t1 = gen_reg_rtx (wmode);
49998 t2 = gen_reg_rtx (wmode);
49999 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
50000 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
50002 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
50003 break;
50005 default:
50006 gcc_unreachable ();
50010 void
50011 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
50013 rtx res_1, res_2, res_3, res_4;
50015 res_1 = gen_reg_rtx (V4SImode);
50016 res_2 = gen_reg_rtx (V4SImode);
50017 res_3 = gen_reg_rtx (V2DImode);
50018 res_4 = gen_reg_rtx (V2DImode);
50019 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
50020 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
50022 /* Move the results in element 2 down to element 1; we don't care
50023 what goes in elements 2 and 3. Then we can merge the parts
50024 back together with an interleave.
50026 Note that two other sequences were tried:
50027 (1) Use interleaves at the start instead of psrldq, which allows
50028 us to use a single shufps to merge things back at the end.
50029 (2) Use shufps here to combine the two vectors, then pshufd to
50030 put the elements in the correct order.
50031 In both cases the cost of the reformatting stall was too high
50032 and the overall sequence was slower. */
50034 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
50035 const0_rtx, const2_rtx,
50036 const0_rtx, const0_rtx));
50037 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
50038 const0_rtx, const2_rtx,
50039 const0_rtx, const0_rtx));
50040 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
50042 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
50045 void
50046 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
50048 machine_mode mode = GET_MODE (op0);
50049 rtx t1, t2, t3, t4, t5, t6;
50051 if (TARGET_AVX512DQ && mode == V8DImode)
50052 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
50053 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
50054 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
50055 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
50056 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
50057 else if (TARGET_XOP && mode == V2DImode)
50059 /* op1: A,B,C,D, op2: E,F,G,H */
50060 op1 = gen_lowpart (V4SImode, op1);
50061 op2 = gen_lowpart (V4SImode, op2);
50063 t1 = gen_reg_rtx (V4SImode);
50064 t2 = gen_reg_rtx (V4SImode);
50065 t3 = gen_reg_rtx (V2DImode);
50066 t4 = gen_reg_rtx (V2DImode);
50068 /* t1: B,A,D,C */
50069 emit_insn (gen_sse2_pshufd_1 (t1, op1,
50070 GEN_INT (1),
50071 GEN_INT (0),
50072 GEN_INT (3),
50073 GEN_INT (2)));
50075 /* t2: (B*E),(A*F),(D*G),(C*H) */
50076 emit_insn (gen_mulv4si3 (t2, t1, op2));
50078 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
50079 emit_insn (gen_xop_phadddq (t3, t2));
50081 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
50082 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
50084 /* Multiply lower parts and add all */
50085 t5 = gen_reg_rtx (V2DImode);
50086 emit_insn (gen_vec_widen_umult_even_v4si (t5,
50087 gen_lowpart (V4SImode, op1),
50088 gen_lowpart (V4SImode, op2)));
50089 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
50092 else
50094 machine_mode nmode;
50095 rtx (*umul) (rtx, rtx, rtx);
50097 if (mode == V2DImode)
50099 umul = gen_vec_widen_umult_even_v4si;
50100 nmode = V4SImode;
50102 else if (mode == V4DImode)
50104 umul = gen_vec_widen_umult_even_v8si;
50105 nmode = V8SImode;
50107 else if (mode == V8DImode)
50109 umul = gen_vec_widen_umult_even_v16si;
50110 nmode = V16SImode;
50112 else
50113 gcc_unreachable ();
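      /* The decomposition used below, per 64-bit element and modulo 2^64:
	   (HI1 * 2^32 + LO1) * (HI2 * 2^32 + LO2)
	     == LO1 * LO2 + ((HI1 * LO2 + HI2 * LO1) << 32),
	 since the HI1 * HI2 term is shifted out entirely.  */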
50116 /* Multiply low parts. */
50117 t1 = gen_reg_rtx (mode);
50118 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
50120 /* Shift input vectors right 32 bits so we can multiply high parts. */
50121 t6 = GEN_INT (32);
50122 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
50123 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
50125 /* Multiply high parts by low parts. */
50126 t4 = gen_reg_rtx (mode);
50127 t5 = gen_reg_rtx (mode);
50128 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
50129 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
50131 /* Combine and shift the highparts back. */
50132 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
50133 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
50135 /* Combine high and low parts. */
50136 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
50139 set_unique_reg_note (get_last_insn (), REG_EQUAL,
50140 gen_rtx_MULT (mode, op1, op2));
50143 /* Return 1 if control transfer instruction INSN
50144 should be encoded with bnd prefix.
50145 If insn is NULL then return 1 when control
50146 transfer instructions should be prefixed with
50147 bnd by default for the current function. */
50149 bool
50150 ix86_bnd_prefixed_insn_p (rtx insn)
50152 /* For call insns check special flag. */
50153 if (insn && CALL_P (insn))
50155 rtx call = get_call_rtx_from (insn);
50156 if (call)
50157 return CALL_EXPR_WITH_BOUNDS_P (call);
50160 /* All other insns are prefixed only if function is instrumented. */
50161 return chkp_function_instrumented_p (current_function_decl);
50164 /* Calculate integer abs() using only SSE2 instructions. */
50166 void
50167 ix86_expand_sse2_abs (rtx target, rtx input)
50169 machine_mode mode = GET_MODE (target);
50170 rtx tmp0, tmp1, x;
50172 switch (mode)
50174 /* For 32-bit signed integer X, the best way to calculate the absolute
50175 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
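    /* E.g. for X == -5: X >> 31 == -1, (-1 ^ -5) == 4 and 4 - (-1) == 5;
       for non-negative X the shift is 0 and X comes through unchanged.  */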
50176 case V4SImode:
50177 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
50178 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
50179 NULL, 0, OPTAB_DIRECT);
50180 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
50181 NULL, 0, OPTAB_DIRECT);
50182 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
50183 target, 0, OPTAB_DIRECT);
50184 break;
50186 /* For 16-bit signed integer X, the best way to calculate the absolute
50187 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
50188 case V8HImode:
50189 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50191 x = expand_simple_binop (mode, SMAX, tmp0, input,
50192 target, 0, OPTAB_DIRECT);
50193 break;
50195 /* For 8-bit signed integer X, the best way to calculate the absolute
50196 value of X is min ((unsigned char) X, (unsigned char) (-X)),
50197 as SSE2 provides the PMINUB insn. */
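    /* E.g. for X == -3: (unsigned char) X == 253 and (unsigned char) -X == 3,
       so the unsigned minimum is 3; for X in [0, 127] the negated value is
       either 0 or at least 129, so X itself is selected.  */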
50198 case V16QImode:
50199 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50201 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
50202 target, 0, OPTAB_DIRECT);
50203 break;
50205 default:
50206 gcc_unreachable ();
50209 if (x != target)
50210 emit_move_insn (target, x);
50213 /* Expand an extract from a vector register through pextr insn.
50214 Return true if successful. */
50216 bool
50217 ix86_expand_pextr (rtx *operands)
50219 rtx dst = operands[0];
50220 rtx src = operands[1];
50222 unsigned int size = INTVAL (operands[2]);
50223 unsigned int pos = INTVAL (operands[3]);
50225 if (SUBREG_P (dst))
50227 /* Reject non-lowpart subregs. */
50228 if (SUBREG_BYTE (dst) > 0)
50229 return false;
50230 dst = SUBREG_REG (dst);
50233 if (SUBREG_P (src))
50235 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
50236 src = SUBREG_REG (src);
50239 switch (GET_MODE (src))
50241 case V16QImode:
50242 case V8HImode:
50243 case V4SImode:
50244 case V2DImode:
50245 case V1TImode:
50246 case TImode:
50248 machine_mode srcmode, dstmode;
50249 rtx d, pat;
50251 dstmode = mode_for_size (size, MODE_INT, 0);
50253 switch (dstmode)
50255 case QImode:
50256 if (!TARGET_SSE4_1)
50257 return false;
50258 srcmode = V16QImode;
50259 break;
50261 case HImode:
50262 if (!TARGET_SSE2)
50263 return false;
50264 srcmode = V8HImode;
50265 break;
50267 case SImode:
50268 if (!TARGET_SSE4_1)
50269 return false;
50270 srcmode = V4SImode;
50271 break;
50273 case DImode:
50274 gcc_assert (TARGET_64BIT);
50275 if (!TARGET_SSE4_1)
50276 return false;
50277 srcmode = V2DImode;
50278 break;
50280 default:
50281 return false;
50284 /* Reject extractions from misaligned positions. */
50285 if (pos & (size-1))
50286 return false;
50288 if (GET_MODE (dst) == dstmode)
50289 d = dst;
50290 else
50291 d = gen_reg_rtx (dstmode);
50293 /* Construct insn pattern. */
50294 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
50295 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
50297 /* Let the rtl optimizers know about the zero extension performed. */
50298 if (dstmode == QImode || dstmode == HImode)
50300 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
50301 d = gen_lowpart (SImode, d);
50304 emit_insn (gen_rtx_SET (d, pat));
50306 if (d != dst)
50307 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50308 return true;
50311 default:
50312 return false;
50316 /* Expand an insert into a vector register through pinsr insn.
50317 Return true if successful. */
50319 bool
50320 ix86_expand_pinsr (rtx *operands)
50322 rtx dst = operands[0];
50323 rtx src = operands[3];
50325 unsigned int size = INTVAL (operands[1]);
50326 unsigned int pos = INTVAL (operands[2]);
50328 if (SUBREG_P (dst))
50330 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
50331 dst = SUBREG_REG (dst);
50334 switch (GET_MODE (dst))
50336 case V16QImode:
50337 case V8HImode:
50338 case V4SImode:
50339 case V2DImode:
50340 case V1TImode:
50341 case TImode:
50343 machine_mode srcmode, dstmode;
50344 rtx (*pinsr)(rtx, rtx, rtx, rtx);
50345 rtx d;
50347 srcmode = mode_for_size (size, MODE_INT, 0);
50349 switch (srcmode)
50351 case QImode:
50352 if (!TARGET_SSE4_1)
50353 return false;
50354 dstmode = V16QImode;
50355 pinsr = gen_sse4_1_pinsrb;
50356 break;
50358 case HImode:
50359 if (!TARGET_SSE2)
50360 return false;
50361 dstmode = V8HImode;
50362 pinsr = gen_sse2_pinsrw;
50363 break;
50365 case SImode:
50366 if (!TARGET_SSE4_1)
50367 return false;
50368 dstmode = V4SImode;
50369 pinsr = gen_sse4_1_pinsrd;
50370 break;
50372 case DImode:
50373 gcc_assert (TARGET_64BIT);
50374 if (!TARGET_SSE4_1)
50375 return false;
50376 dstmode = V2DImode;
50377 pinsr = gen_sse4_1_pinsrq;
50378 break;
50380 default:
50381 return false;
50384 /* Reject insertions to misaligned positions. */
50385 if (pos & (size-1))
50386 return false;
50388 if (SUBREG_P (src))
50390 unsigned int srcpos = SUBREG_BYTE (src);
50392 if (srcpos > 0)
50394 rtx extr_ops[4];
50396 extr_ops[0] = gen_reg_rtx (srcmode);
50397 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
50398 extr_ops[2] = GEN_INT (size);
50399 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
50401 if (!ix86_expand_pextr (extr_ops))
50402 return false;
50404 src = extr_ops[0];
50406 else
50407 src = gen_lowpart (srcmode, SUBREG_REG (src));
50410 if (GET_MODE (dst) == dstmode)
50411 d = dst;
50412 else
50413 d = gen_reg_rtx (dstmode);
50415 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
50416 gen_lowpart (srcmode, src),
50417 GEN_INT (1 << (pos / size))));
50418 if (d != dst)
50419 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50420 return true;
50423 default:
50424 return false;
50428 /* This function returns the calling-ABI-specific va_list type node.
50429 It returns the FNDECL specific va_list type. */
50431 static tree
50432 ix86_fn_abi_va_list (tree fndecl)
50434 if (!TARGET_64BIT)
50435 return va_list_type_node;
50436 gcc_assert (fndecl != NULL_TREE);
50438 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
50439 return ms_va_list_type_node;
50440 else
50441 return sysv_va_list_type_node;
50444 /* Returns the canonical va_list type specified by TYPE. If there
50445 is no valid TYPE provided, it returns NULL_TREE. */
50447 static tree
50448 ix86_canonical_va_list_type (tree type)
50450 if (TARGET_64BIT)
50452 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
50453 return ms_va_list_type_node;
50455 if ((TREE_CODE (type) == ARRAY_TYPE
50456 && integer_zerop (array_type_nelts (type)))
50457 || POINTER_TYPE_P (type))
50459 tree elem_type = TREE_TYPE (type);
50460 if (TREE_CODE (elem_type) == RECORD_TYPE
50461 && lookup_attribute ("sysv_abi va_list",
50462 TYPE_ATTRIBUTES (elem_type)))
50463 return sysv_va_list_type_node;
50466 return NULL_TREE;
50469 return std_canonical_va_list_type (type);
50472 /* Iterate through the target-specific builtin types for va_list.
50473 IDX denotes the iterator, *PTREE is set to the result type of
50474 the va_list builtin, and *PNAME to its internal type.
50475 Returns zero if there is no element for this index, otherwise
50476 IDX should be increased upon the next call.
50477 Note, do not iterate a base builtin's name like __builtin_va_list.
50478 Used from c_common_nodes_and_builtins. */
50480 static int
50481 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
50483 if (TARGET_64BIT)
50485 switch (idx)
50487 default:
50488 break;
50490 case 0:
50491 *ptree = ms_va_list_type_node;
50492 *pname = "__builtin_ms_va_list";
50493 return 1;
50495 case 1:
50496 *ptree = sysv_va_list_type_node;
50497 *pname = "__builtin_sysv_va_list";
50498 return 1;
50502 return 0;
50505 #undef TARGET_SCHED_DISPATCH
50506 #define TARGET_SCHED_DISPATCH has_dispatch
50507 #undef TARGET_SCHED_DISPATCH_DO
50508 #define TARGET_SCHED_DISPATCH_DO do_dispatch
50509 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50510 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50511 #undef TARGET_SCHED_REORDER
50512 #define TARGET_SCHED_REORDER ix86_sched_reorder
50513 #undef TARGET_SCHED_ADJUST_PRIORITY
50514 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50515 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50516 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50517 ix86_dependencies_evaluation_hook
50519 /* The size of the dispatch window is the total number of bytes of
50520 object code allowed in a window. */
50521 #define DISPATCH_WINDOW_SIZE 16
50523 /* Number of dispatch windows considered for scheduling. */
50524 #define MAX_DISPATCH_WINDOWS 3
50526 /* Maximum number of instructions in a window. */
50527 #define MAX_INSN 4
50529 /* Maximum number of immediate operands in a window. */
50530 #define MAX_IMM 4
50532 /* Maximum number of immediate bits allowed in a window. */
50533 #define MAX_IMM_SIZE 128
50535 /* Maximum number of 32 bit immediates allowed in a window. */
50536 #define MAX_IMM_32 4
50538 /* Maximum number of 64 bit immediates allowed in a window. */
50539 #define MAX_IMM_64 2
50541 /* Maximum total of loads or prefetches allowed in a window. */
50542 #define MAX_LOAD 2
50544 /* Maximum total of stores allowed in a window. */
50545 #define MAX_STORE 1
50547 #undef BIG
50548 #define BIG 100
50551 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
50552 enum dispatch_group {
50553 disp_no_group = 0,
50554 disp_load,
50555 disp_store,
50556 disp_load_store,
50557 disp_prefetch,
50558 disp_imm,
50559 disp_imm_32,
50560 disp_imm_64,
50561 disp_branch,
50562 disp_cmp,
50563 disp_jcc,
50564 disp_last
50567 /* Number of allowable groups in a dispatch window. It is an array
50568 indexed by dispatch_group enum. 100 is used as a big number,
50569 because the number of these kinds of operations does not have any
50570 effect on the dispatch window, but we need them for other reasons in
50571 the table. */
50572 static unsigned int num_allowable_groups[disp_last] = {
50573 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
50576 char group_name[disp_last + 1][16] = {
50577 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
50578 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
50579 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
50582 /* Instruction path. */
50583 enum insn_path {
50584 no_path = 0,
50585 path_single, /* Single micro op. */
50586 path_double, /* Double micro op. */
50587 path_multi, /* Instructions with more than 2 micro ops. */
50588 last_path
50591 /* sched_insn_info defines a window to the instructions scheduled in
50592 the basic block. It contains a pointer to the insn_info table and
50593 the instruction scheduled.
50595 Windows are allocated for each basic block and are linked
50596 together. */
50597 typedef struct sched_insn_info_s {
50598 rtx insn;
50599 enum dispatch_group group;
50600 enum insn_path path;
50601 int byte_len;
50602 int imm_bytes;
50603 } sched_insn_info;
50605 /* Linked list of dispatch windows. This is a two way list of
50606 dispatch windows of a basic block. It contains information about
50607 the number of uops in the window and the total number of
50608 instructions and of bytes in the object code for this dispatch
50609 window. */
50610 typedef struct dispatch_windows_s {
50611 int num_insn; /* Number of insn in the window. */
50612 int num_uops; /* Number of uops in the window. */
50613 int window_size; /* Number of bytes in the window. */
50614 int window_num; /* Window number, either 0 or 1. */
50615 int num_imm; /* Number of immediates in an insn. */
50616 int num_imm_32; /* Number of 32 bit immediates in an insn. */
50617 int num_imm_64; /* Number of 64 bit immediates in an insn. */
50618 int imm_size; /* Total immediates in the window. */
50619 int num_loads; /* Total memory loads in the window. */
50620 int num_stores; /* Total memory stores in the window. */
50621 int violation; /* Violation exists in window. */
50622 sched_insn_info *window; /* Pointer to the window. */
50623 struct dispatch_windows_s *next;
50624 struct dispatch_windows_s *prev;
50625 } dispatch_windows;
50627 /* Immediate values used in an insn. */
50628 typedef struct imm_info_s
50630 int imm;
50631 int imm32;
50632 int imm64;
50633 } imm_info;
50635 static dispatch_windows *dispatch_window_list;
50636 static dispatch_windows *dispatch_window_list1;
50638 /* Get dispatch group of insn. */
50640 static enum dispatch_group
50641 get_mem_group (rtx_insn *insn)
50643 enum attr_memory memory;
50645 if (INSN_CODE (insn) < 0)
50646 return disp_no_group;
50647 memory = get_attr_memory (insn);
50648 if (memory == MEMORY_STORE)
50649 return disp_store;
50651 if (memory == MEMORY_LOAD)
50652 return disp_load;
50654 if (memory == MEMORY_BOTH)
50655 return disp_load_store;
50657 return disp_no_group;
50660 /* Return true if insn is a compare instruction. */
50662 static bool
50663 is_cmp (rtx_insn *insn)
50665 enum attr_type type;
50667 type = get_attr_type (insn);
50668 return (type == TYPE_TEST
50669 || type == TYPE_ICMP
50670 || type == TYPE_FCMP
50671 || GET_CODE (PATTERN (insn)) == COMPARE);
50674 /* Return true if a dispatch violation was encountered. */
50676 static bool
50677 dispatch_violation (void)
50679 if (dispatch_window_list->next)
50680 return dispatch_window_list->next->violation;
50681 return dispatch_window_list->violation;
50684 /* Return true if insn is a branch instruction. */
50686 static bool
50687 is_branch (rtx_insn *insn)
50689 return (CALL_P (insn) || JUMP_P (insn));
50692 /* Return true if insn is a prefetch instruction. */
50694 static bool
50695 is_prefetch (rtx_insn *insn)
50697 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
50700 /* This function initializes a dispatch window and the list container holding a
50701 pointer to the window. */
50703 static void
50704 init_window (int window_num)
50706 int i;
50707 dispatch_windows *new_list;
50709 if (window_num == 0)
50710 new_list = dispatch_window_list;
50711 else
50712 new_list = dispatch_window_list1;
50714 new_list->num_insn = 0;
50715 new_list->num_uops = 0;
50716 new_list->window_size = 0;
50717 new_list->next = NULL;
50718 new_list->prev = NULL;
50719 new_list->window_num = window_num;
50720 new_list->num_imm = 0;
50721 new_list->num_imm_32 = 0;
50722 new_list->num_imm_64 = 0;
50723 new_list->imm_size = 0;
50724 new_list->num_loads = 0;
50725 new_list->num_stores = 0;
50726 new_list->violation = false;
50728 for (i = 0; i < MAX_INSN; i++)
50730 new_list->window[i].insn = NULL;
50731 new_list->window[i].group = disp_no_group;
50732 new_list->window[i].path = no_path;
50733 new_list->window[i].byte_len = 0;
50734 new_list->window[i].imm_bytes = 0;
50736 return;
50739 /* This function allocates and initializes a dispatch window and the
50740 list container holding a pointer to the window. */
50742 static dispatch_windows *
50743 allocate_window (void)
50745 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
50746 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
50748 return new_list;
50751 /* This routine initializes the dispatch scheduling information. It
50752 initiates building dispatch scheduler tables and constructs the
50753 first dispatch window. */
50755 static void
50756 init_dispatch_sched (void)
50758 /* Allocate a dispatch list and a window. */
50759 dispatch_window_list = allocate_window ();
50760 dispatch_window_list1 = allocate_window ();
50761 init_window (0);
50762 init_window (1);
50765 /* This function returns true if a branch is detected. End of a basic block
50766 does not have to be a branch, but here we assume only branches end a
50767 window. */
50769 static bool
50770 is_end_basic_block (enum dispatch_group group)
50772 return group == disp_branch;
50775 /* This function is called when the end of a window processing is reached. */
50777 static void
50778 process_end_window (void)
50780 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
50781 if (dispatch_window_list->next)
50783 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
50784 gcc_assert (dispatch_window_list->window_size
50785 + dispatch_window_list1->window_size <= 48);
50786 init_window (1);
50788 init_window (0);
50791 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
50792 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
50793 for 48 bytes of instructions. Note that these windows are not dispatch
50794 windows whose sizes are DISPATCH_WINDOW_SIZE. */
50796 static dispatch_windows *
50797 allocate_next_window (int window_num)
50799 if (window_num == 0)
50801 if (dispatch_window_list->next)
50802 init_window (1);
50803 init_window (0);
50804 return dispatch_window_list;
50807 dispatch_window_list->next = dispatch_window_list1;
50808 dispatch_window_list1->prev = dispatch_window_list;
50810 return dispatch_window_list1;
50813 /* Compute number of immediate operands of an instruction. */
50815 static void
50816 find_constant (rtx in_rtx, imm_info *imm_values)
50818 if (INSN_P (in_rtx))
50819 in_rtx = PATTERN (in_rtx);
50820 subrtx_iterator::array_type array;
50821 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
50822 if (const_rtx x = *iter)
50823 switch (GET_CODE (x))
50825 case CONST:
50826 case SYMBOL_REF:
50827 case CONST_INT:
50828 (imm_values->imm)++;
50829 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
50830 (imm_values->imm32)++;
50831 else
50832 (imm_values->imm64)++;
50833 break;
50835 case CONST_DOUBLE:
50836 case CONST_WIDE_INT:
50837 (imm_values->imm)++;
50838 (imm_values->imm64)++;
50839 break;
50841 case CODE_LABEL:
50842 if (LABEL_KIND (x) == LABEL_NORMAL)
50844 (imm_values->imm)++;
50845 (imm_values->imm32)++;
50847 break;
50849 default:
50850 break;
50854 /* Return total size of immediate operands of an instruction along with number
50855 of corresponding immediate-operands. It initializes its parameters to zero
50856 before calling FIND_CONSTANT.
50857 INSN is the input instruction. IMM is the total of immediates.
50858 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
50859 bit immediates. */
50861 static int
50862 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
50864 imm_info imm_values = {0, 0, 0};
50866 find_constant (insn, &imm_values);
50867 *imm = imm_values.imm;
50868 *imm32 = imm_values.imm32;
50869 *imm64 = imm_values.imm64;
50870 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
50873 /* This function indicates if an operand of an instruction is an
50874 immediate. */
50876 static bool
50877 has_immediate (rtx_insn *insn)
50879 int num_imm_operand;
50880 int num_imm32_operand;
50881 int num_imm64_operand;
50883 if (insn)
50884 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50885 &num_imm64_operand);
50886 return false;
50889 /* Return single or double path for instructions. */
50891 static enum insn_path
50892 get_insn_path (rtx_insn *insn)
50894 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
50896 if ((int)path == 0)
50897 return path_single;
50899 if ((int)path == 1)
50900 return path_double;
50902 return path_multi;
50905 /* Return insn dispatch group. */
50907 static enum dispatch_group
50908 get_insn_group (rtx_insn *insn)
50910 enum dispatch_group group = get_mem_group (insn);
50911 if (group)
50912 return group;
50914 if (is_branch (insn))
50915 return disp_branch;
50917 if (is_cmp (insn))
50918 return disp_cmp;
50920 if (has_immediate (insn))
50921 return disp_imm;
50923 if (is_prefetch (insn))
50924 return disp_prefetch;
50926 return disp_no_group;
50929 /* Count number of GROUP restricted instructions in a dispatch
50930 window WINDOW_LIST. */
50932 static int
50933 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
50935 enum dispatch_group group = get_insn_group (insn);
50936 int imm_size;
50937 int num_imm_operand;
50938 int num_imm32_operand;
50939 int num_imm64_operand;
50941 if (group == disp_no_group)
50942 return 0;
50944 if (group == disp_imm)
50946 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50947 &num_imm64_operand);
50948 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
50949 || num_imm_operand + window_list->num_imm > MAX_IMM
50950 || (num_imm32_operand > 0
50951 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
50952 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
50953 || (num_imm64_operand > 0
50954 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
50955 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
50956 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
50957 && num_imm64_operand > 0
50958 && ((window_list->num_imm_64 > 0
50959 && window_list->num_insn >= 2)
50960 || window_list->num_insn >= 3)))
50961 return BIG;
50963 return 1;
50966 if ((group == disp_load_store
50967 && (window_list->num_loads >= MAX_LOAD
50968 || window_list->num_stores >= MAX_STORE))
50969 || ((group == disp_load
50970 || group == disp_prefetch)
50971 && window_list->num_loads >= MAX_LOAD)
50972 || (group == disp_store
50973 && window_list->num_stores >= MAX_STORE))
50974 return BIG;
50976 return 1;
50979 /* This function returns true if insn satisfies dispatch rules on the
50980 last window scheduled. */
50982 static bool
50983 fits_dispatch_window (rtx_insn *insn)
50985 dispatch_windows *window_list = dispatch_window_list;
50986 dispatch_windows *window_list_next = dispatch_window_list->next;
50987 unsigned int num_restrict;
50988 enum dispatch_group group = get_insn_group (insn);
50989 enum insn_path path = get_insn_path (insn);
50990 int sum;
50992 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
50993 instructions should be given the lowest priority in the
50994 scheduling process in Haifa scheduler to make sure they will be
50995 scheduled in the same dispatch window as the reference to them. */
50996 if (group == disp_jcc || group == disp_cmp)
50997 return false;
50999 /* Check nonrestricted. */
51000 if (group == disp_no_group || group == disp_branch)
51001 return true;
51003 /* Get last dispatch window. */
51004 if (window_list_next)
51005 window_list = window_list_next;
51007 if (window_list->window_num == 1)
51009 sum = window_list->prev->window_size + window_list->window_size;
51011 if (sum == 32
51012 || (min_insn_size (insn) + sum) >= 48)
51013 /* Window 1 is full. Go for next window. */
51014 return true;
51017 num_restrict = count_num_restricted (insn, window_list);
51019 if (num_restrict > num_allowable_groups[group])
51020 return false;
51022 /* See if it fits in the first window. */
51023 if (window_list->window_num == 0)
51025 /* The first window should have only single and double path
51026 uops. */
51027 if (path == path_double
51028 && (window_list->num_uops + 2) > MAX_INSN)
51029 return false;
51030 else if (path != path_single)
51031 return false;
51033 return true;
51036 /* Add an instruction INSN with NUM_UOPS micro-operations to the
51037 dispatch window WINDOW_LIST. */
51039 static void
51040 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
51042 int byte_len = min_insn_size (insn);
51043 int num_insn = window_list->num_insn;
51044 int imm_size;
51045 sched_insn_info *window = window_list->window;
51046 enum dispatch_group group = get_insn_group (insn);
51047 enum insn_path path = get_insn_path (insn);
51048 int num_imm_operand;
51049 int num_imm32_operand;
51050 int num_imm64_operand;
51052 if (!window_list->violation && group != disp_cmp
51053 && !fits_dispatch_window (insn))
51054 window_list->violation = true;
51056 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51057 &num_imm64_operand);
51059 /* Initialize window with new instruction. */
51060 window[num_insn].insn = insn;
51061 window[num_insn].byte_len = byte_len;
51062 window[num_insn].group = group;
51063 window[num_insn].path = path;
51064 window[num_insn].imm_bytes = imm_size;
51066 window_list->window_size += byte_len;
51067 window_list->num_insn = num_insn + 1;
51068 window_list->num_uops = window_list->num_uops + num_uops;
51069 window_list->imm_size += imm_size;
51070 window_list->num_imm += num_imm_operand;
51071 window_list->num_imm_32 += num_imm32_operand;
51072 window_list->num_imm_64 += num_imm64_operand;
51074 if (group == disp_store)
51075 window_list->num_stores += 1;
51076 else if (group == disp_load
51077 || group == disp_prefetch)
51078 window_list->num_loads += 1;
51079 else if (group == disp_load_store)
51081 window_list->num_stores += 1;
51082 window_list->num_loads += 1;
51086 /* Adds a scheduled instruction, INSN, to the current dispatch window.
51087 If the total bytes of instructions or the number of instructions in
51088 the window exceed the allowable limits, it allocates a new window. */
51090 static void
51091 add_to_dispatch_window (rtx_insn *insn)
51093 int byte_len;
51094 dispatch_windows *window_list;
51095 dispatch_windows *next_list;
51096 dispatch_windows *window0_list;
51097 enum insn_path path;
51098 enum dispatch_group insn_group;
51099 bool insn_fits;
51100 int num_insn;
51101 int num_uops;
51102 int window_num;
51103 int insn_num_uops;
51104 int sum;
51106 if (INSN_CODE (insn) < 0)
51107 return;
51109 byte_len = min_insn_size (insn);
51110 window_list = dispatch_window_list;
51111 next_list = window_list->next;
51112 path = get_insn_path (insn);
51113 insn_group = get_insn_group (insn);
51115 /* Get the last dispatch window. */
51116 if (next_list)
51117 window_list = dispatch_window_list->next;
51119 if (path == path_single)
51120 insn_num_uops = 1;
51121 else if (path == path_double)
51122 insn_num_uops = 2;
51123 else
51124 insn_num_uops = (int) path;
51126 /* If current window is full, get a new window.
51127 Window number zero is full if MAX_INSN uops are scheduled in it.
51128 Window number one is full if window zero's bytes plus window
51129 one's bytes equal 32, if the bytes of the new instruction added
51130 to the total make it greater than 48, or if it already has MAX_INSN
51131 instructions in it. */
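  /* For example, once window zero already holds MAX_INSN instructions,
     the next instruction flips window_num to 1 and is added to window one
     instead (see allocate_next_window).  */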
51132 num_insn = window_list->num_insn;
51133 num_uops = window_list->num_uops;
51134 window_num = window_list->window_num;
51135 insn_fits = fits_dispatch_window (insn);
51137 if (num_insn >= MAX_INSN
51138 || num_uops + insn_num_uops > MAX_INSN
51139 || !(insn_fits))
51141 window_num = ~window_num & 1;
51142 window_list = allocate_next_window (window_num);
51145 if (window_num == 0)
51147 add_insn_window (insn, window_list, insn_num_uops);
51148 if (window_list->num_insn >= MAX_INSN
51149 && insn_group == disp_branch)
51151 process_end_window ();
51152 return;
51155 else if (window_num == 1)
51157 window0_list = window_list->prev;
51158 sum = window0_list->window_size + window_list->window_size;
51159 if (sum == 32
51160 || (byte_len + sum) >= 48)
51162 process_end_window ();
51163 window_list = dispatch_window_list;
51166 add_insn_window (insn, window_list, insn_num_uops);
51168 else
51169 gcc_unreachable ();
51171 if (is_end_basic_block (insn_group))
51173 /* End of basic block is reached; do end-of-basic-block processing. */
51174 process_end_window ();
51175 return;
51179 /* Print the dispatch window, WINDOW_NUM, to FILE. */
51181 DEBUG_FUNCTION static void
51182 debug_dispatch_window_file (FILE *file, int window_num)
51184 dispatch_windows *list;
51185 int i;
51187 if (window_num == 0)
51188 list = dispatch_window_list;
51189 else
51190 list = dispatch_window_list1;
51192 fprintf (file, "Window #%d:\n", list->window_num);
51193 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
51194 list->num_insn, list->num_uops, list->window_size);
51195 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51196 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
51198 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
51199 list->num_stores);
51200 fprintf (file, " insn info:\n");
51202 for (i = 0; i < MAX_INSN; i++)
51204 if (!list->window[i].insn)
51205 break;
51206 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
51207 i, group_name[list->window[i].group],
51208 i, (void *)list->window[i].insn,
51209 i, list->window[i].path,
51210 i, list->window[i].byte_len,
51211 i, list->window[i].imm_bytes);
51215 /* Print to stdout a dispatch window. */
51217 DEBUG_FUNCTION void
51218 debug_dispatch_window (int window_num)
51220 debug_dispatch_window_file (stdout, window_num);
51223 /* Print INSN dispatch information to FILE. */
51225 DEBUG_FUNCTION static void
51226 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
51228 int byte_len;
51229 enum insn_path path;
51230 enum dispatch_group group;
51231 int imm_size;
51232 int num_imm_operand;
51233 int num_imm32_operand;
51234 int num_imm64_operand;
51236 if (INSN_CODE (insn) < 0)
51237 return;
51239 byte_len = min_insn_size (insn);
51240 path = get_insn_path (insn);
51241 group = get_insn_group (insn);
51242 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51243 &num_imm64_operand);
51245 fprintf (file, " insn info:\n");
51246 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
51247 group_name[group], path, byte_len);
51248 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51249 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
51252 /* Print to STDOUT the status of the ready list with respect to
51253 dispatch windows. */
51255 DEBUG_FUNCTION void
51256 debug_ready_dispatch (void)
51258 int i;
51259 int no_ready = number_in_ready ();
51261 fprintf (stdout, "Number of ready: %d\n", no_ready);
51263 for (i = 0; i < no_ready; i++)
51264 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
51267 /* This routine is the driver of the dispatch scheduler. */
51269 static void
51270 do_dispatch (rtx_insn *insn, int mode)
51272 if (mode == DISPATCH_INIT)
51273 init_dispatch_sched ();
51274 else if (mode == ADD_TO_DISPATCH_WINDOW)
51275 add_to_dispatch_window (insn);
51278 /* Return TRUE if Dispatch Scheduling is supported. */
51280 static bool
51281 has_dispatch (rtx_insn *insn, int action)
51283 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
51284 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
51285 switch (action)
51287 default:
51288 return false;
51290 case IS_DISPATCH_ON:
51291 return true;
51293 case IS_CMP:
51294 return is_cmp (insn);
51296 case DISPATCH_VIOLATION:
51297 return dispatch_violation ();
51299 case FITS_DISPATCH_WINDOW:
51300 return fits_dispatch_window (insn);
51303 return false;
51306 /* Implementation of reassociation_width target hook used by
51307 reassoc phase to identify parallelism level in reassociated
51308 tree. The statement's tree_code is passed in OPC. The argument's type
51309 is passed in MODE.
51311 Currently parallel reassociation is enabled for Atom
51312 processors only and we set reassociation width to be 2
51313 because Atom may issue up to 2 instructions per cycle.
51315 Return value should be fixed if parallel reassociation is
51316 enabled for other processors. */
51318 static int
51319 ix86_reassociation_width (unsigned int, machine_mode mode)
51321 /* Vector part. */
51322 if (VECTOR_MODE_P (mode))
51324 if (TARGET_VECTOR_PARALLEL_EXECUTION)
51325 return 2;
51326 else
51327 return 1;
51330 /* Scalar part. */
51331 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
51332 return 2;
51333 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
51334 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL) ? 4 : 2);
51335 else
51336 return 1;
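/* Illustrative sketch (not part of the implementation; function and
   variable names are only examples): with a reassociation width of 2 and
   -ffast-math in effect, the reassoc pass may rebalance a serial
   dependence chain such as

     double sum4 (double a, double b, double c, double d)
     {
       return a + b + c + d;        // ((a + b) + c) + d: 3 dependent adds
     }

   into (a + b) + (c + d), letting two of the additions issue in the same
   cycle on a 2-wide machine; with a width of 1 the chain is left serial.  */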
51339 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
51340 place emms and femms instructions. */
51342 static machine_mode
51343 ix86_preferred_simd_mode (machine_mode mode)
51345 if (!TARGET_SSE)
51346 return word_mode;
51348 switch (mode)
51350 case QImode:
51351 return TARGET_AVX512BW ? V64QImode :
51352 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
51353 case HImode:
51354 return TARGET_AVX512BW ? V32HImode :
51355 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
51356 case SImode:
51357 return TARGET_AVX512F ? V16SImode :
51358 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
51359 case DImode:
51360 return TARGET_AVX512F ? V8DImode :
51361 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
51363 case SFmode:
51364 if (TARGET_AVX512F)
51365 return V16SFmode;
51366 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51367 return V8SFmode;
51368 else
51369 return V4SFmode;
51371 case DFmode:
51372 if (TARGET_AVX512F)
51373 return V8DFmode;
51374 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51375 return V4DFmode;
51376 else if (TARGET_SSE2)
51377 return V2DFmode;
51378 /* FALLTHRU */
51380 default:
51381 return word_mode;
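/* Worked example (illustrative): for an SImode element type this hook
   returns V16SImode under -mavx512f, V8SImode under -mavx2 (unless
   -mprefer-avx128 is given), V4SImode under plain -msse2, and word_mode
   when SSE is disabled, in which case the vectorizer effectively stays
   with scalar code.  */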
51385 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
51386 vectors. If AVX512F is enabled then try vectorizing with 512bit,
51387 256bit and 128bit vectors. */
51389 static unsigned int
51390 ix86_autovectorize_vector_sizes (void)
51392 return TARGET_AVX512F ? 64 | 32 | 16 :
51393 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
51396 /* Implementation of targetm.vectorize.get_mask_mode. */
51398 static machine_mode
51399 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
51401 unsigned elem_size = vector_size / nunits;
51403 /* Scalar mask case. */
51404 if ((TARGET_AVX512F && vector_size == 64)
51405 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
51407 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
51408 return smallest_mode_for_size (nunits, MODE_INT);
51411 machine_mode elem_mode
51412 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
51414 gcc_assert (elem_size * nunits == vector_size);
51416 return mode_for_vector (elem_mode, nunits);
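/* Worked example (illustrative): for a 64-byte vector of 16 SFmode
   elements under AVX-512F, nunits == 16 and elem_size == 4, so the
   scalar-mask path returns smallest_mode_for_size (16, MODE_INT), i.e.
   HImode, describing a 16-bit k-register mask.  Without AVX-512 the
   fallback builds an integer vector mask instead, e.g. V8SImode for a
   32-byte vector of 8 floats.  */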
51421 /* Return class of registers which could be used for pseudo of MODE
51422 and of class RCLASS for spilling instead of memory. Return NO_REGS
51423 if it is not possible or non-profitable. */
51425 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
51427 static reg_class_t
51428 ix86_spill_class (reg_class_t rclass, machine_mode mode)
51430 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
51431 && TARGET_SSE2
51432 && TARGET_INTER_UNIT_MOVES_TO_VEC
51433 && TARGET_INTER_UNIT_MOVES_FROM_VEC
51434 && (mode == SImode || (TARGET_64BIT && mode == DImode))
51435 && INTEGER_CLASS_P (rclass))
51436 return ALL_SSE_REGS;
51437 return NO_REGS;
51440 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
51441 but returns a lower bound. */
51443 static unsigned int
51444 ix86_max_noce_ifcvt_seq_cost (edge e)
51446 bool predictable_p = predictable_edge_p (e);
51448 enum compiler_param param
51449 = (predictable_p
51450 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
51451 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
51453 /* If we have a parameter set, use that, otherwise take a guess using
51454 BRANCH_COST. */
51455 if (global_options_set.x_param_values[param])
51456 return PARAM_VALUE (param);
51457 else
51458 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
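/* Example (a sketch): the bound can be tuned from the command line, e.g.

     gcc -O2 --param max-rtl-if-conversion-unpredictable-cost=120 foo.c

   When neither --param is given explicitly, the guess above of
   BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2) is used.  */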
51461 /* Return true if SEQ is a good candidate as a replacement for the
51462 if-convertible sequence described in IF_INFO. */
51464 static bool
51465 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
51467 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
51469 int cmov_cnt = 0;
51470 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
51471 Maybe we should allow even more conditional moves as long as they
51472 are used far enough not to stall the CPU, or also consider
51473 IF_INFO->TEST_BB succ edge probabilities. */
51474 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
51476 rtx set = single_set (insn);
51477 if (!set)
51478 continue;
51479 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
51480 continue;
51481 rtx src = SET_SRC (set);
51482 machine_mode mode = GET_MODE (src);
51483 if (GET_MODE_CLASS (mode) != MODE_INT
51484 && GET_MODE_CLASS (mode) != MODE_FLOAT)
51485 continue;
51486 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
51487 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
51488 continue;
51489 /* insn is CMOV or FCMOV. */
51490 if (++cmov_cnt > 1)
51491 return false;
51494 return default_noce_conversion_profitable_p (seq, if_info);
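/* Example (illustrative): on tunings that set TARGET_ONE_IF_CONV_INSN,
   if-converting a block that needs two conditional moves, e.g.

     if (c) { x = a; y = b; }

   produces a SEQ with two IF_THEN_ELSE sets and is rejected here, while a
   single-cmov replacement is still passed on to the generic cost check.  */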
51497 /* Implement targetm.vectorize.init_cost. */
51499 static void *
51500 ix86_init_cost (struct loop *)
51502 unsigned *cost = XNEWVEC (unsigned, 3);
51503 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
51504 return cost;
51507 /* Implement targetm.vectorize.add_stmt_cost. */
51509 static unsigned
51510 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
51511 struct _stmt_vec_info *stmt_info, int misalign,
51512 enum vect_cost_model_location where)
51514 unsigned *cost = (unsigned *) data;
51515 unsigned retval = 0;
51517 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
51518 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
51520 /* Penalize DFmode vector operations for Bonnell. */
51521 if (TARGET_BONNELL && kind == vector_stmt
51522 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
51523 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
51525 /* Statements in an inner loop relative to the loop being
51526 vectorized are weighted more heavily. The value here is
51527 arbitrary and could potentially be improved with analysis. */
51528 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
51529 count *= 50; /* FIXME. */
51531 retval = (unsigned) (count * stmt_cost);
51533 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
51534 for Silvermont, as it has an out-of-order integer pipeline and can execute
51535 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
51536 if ((TARGET_SILVERMONT || TARGET_INTEL)
51537 && stmt_info && stmt_info->stmt)
51539 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
51540 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
51541 retval = (retval * 17) / 10;
51544 cost[where] += retval;
51546 return retval;
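/* Worked example (illustrative, assuming a base statement cost of 1):
   a vector_stmt with count == 2 in the loop body gives retval == 2;
   on Silvermont, if the statement produces an integer result, the
   in-order-SIMD penalty scales this to (2 * 17) / 10 == 3 before it is
   added into cost[vect_body].  */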
51549 /* Implement targetm.vectorize.finish_cost. */
51551 static void
51552 ix86_finish_cost (void *data, unsigned *prologue_cost,
51553 unsigned *body_cost, unsigned *epilogue_cost)
51555 unsigned *cost = (unsigned *) data;
51556 *prologue_cost = cost[vect_prologue];
51557 *body_cost = cost[vect_body];
51558 *epilogue_cost = cost[vect_epilogue];
51561 /* Implement targetm.vectorize.destroy_cost_data. */
51563 static void
51564 ix86_destroy_cost_data (void *data)
51566 free (data);
51569 /* Validate target specific memory model bits in VAL. */
51571 static unsigned HOST_WIDE_INT
51572 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
51574 enum memmodel model = memmodel_from_int (val);
51575 bool strong;
51577 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
51578 |MEMMODEL_MASK)
51579 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
51581 warning (OPT_Winvalid_memory_model,
51582 "Unknown architecture specific memory model");
51583 return MEMMODEL_SEQ_CST;
51585 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
51586 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
51588 warning (OPT_Winvalid_memory_model,
51589 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
51590 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
51592 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
51594 warning (OPT_Winvalid_memory_model,
51595 "HLE_RELEASE not used with RELEASE or stronger memory model");
51596 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
51598 return val;
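/* Usage sketch (user level, with -mhle; the "lock" variable is only
   illustrative): the HLE bits are OR-ed into a C11 memory model by the
   programmer, e.g. for an int lock

     while (__atomic_exchange_n (&lock, 1,
                                 __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
       ;                                       // elided lock acquisition
     // ... critical section ...
     __atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);

   The checks above warn about mismatches such as HLE_ACQUIRE combined
   with a relaxed model and fall back to SEQ_CST plus the requested HLE
   bit.  */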
51601 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
51602 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
51603 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
51604 or number of vecsize_mangle variants that should be emitted. */
51606 static int
51607 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
51608 struct cgraph_simd_clone *clonei,
51609 tree base_type, int num)
51611 int ret = 1;
51613 if (clonei->simdlen
51614 && (clonei->simdlen < 2
51615 || clonei->simdlen > 1024
51616 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
51618 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51619 "unsupported simdlen %d", clonei->simdlen);
51620 return 0;
51623 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
51624 if (TREE_CODE (ret_type) != VOID_TYPE)
51625 switch (TYPE_MODE (ret_type))
51627 case QImode:
51628 case HImode:
51629 case SImode:
51630 case DImode:
51631 case SFmode:
51632 case DFmode:
51633 /* case SCmode: */
51634 /* case DCmode: */
51635 break;
51636 default:
51637 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51638 "unsupported return type %qT for simd\n", ret_type);
51639 return 0;
51642 tree t;
51643 int i;
51645 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
51646 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
51647 switch (TYPE_MODE (TREE_TYPE (t)))
51649 case QImode:
51650 case HImode:
51651 case SImode:
51652 case DImode:
51653 case SFmode:
51654 case DFmode:
51655 /* case SCmode: */
51656 /* case DCmode: */
51657 break;
51658 default:
51659 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51660 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
51661 return 0;
51664 if (clonei->cilk_elemental)
51666 /* Parse the processor clause here. If not present, default to 'b'. */
51667 clonei->vecsize_mangle = 'b';
51669 else if (!TREE_PUBLIC (node->decl))
51671 /* If the function isn't exported, we can pick up just one ISA
51672 for the clones. */
51673 if (TARGET_AVX512F)
51674 clonei->vecsize_mangle = 'e';
51675 else if (TARGET_AVX2)
51676 clonei->vecsize_mangle = 'd';
51677 else if (TARGET_AVX)
51678 clonei->vecsize_mangle = 'c';
51679 else
51680 clonei->vecsize_mangle = 'b';
51681 ret = 1;
51683 else
51685 clonei->vecsize_mangle = "bcde"[num];
51686 ret = 4;
51688 clonei->mask_mode = VOIDmode;
51689 switch (clonei->vecsize_mangle)
51691 case 'b':
51692 clonei->vecsize_int = 128;
51693 clonei->vecsize_float = 128;
51694 break;
51695 case 'c':
51696 clonei->vecsize_int = 128;
51697 clonei->vecsize_float = 256;
51698 break;
51699 case 'd':
51700 clonei->vecsize_int = 256;
51701 clonei->vecsize_float = 256;
51702 break;
51703 case 'e':
51704 clonei->vecsize_int = 512;
51705 clonei->vecsize_float = 512;
51706 if (TYPE_MODE (base_type) == QImode)
51707 clonei->mask_mode = DImode;
51708 else
51709 clonei->mask_mode = SImode;
51710 break;
51712 if (clonei->simdlen == 0)
51714 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
51715 clonei->simdlen = clonei->vecsize_int;
51716 else
51717 clonei->simdlen = clonei->vecsize_float;
51718 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
51720 else if (clonei->simdlen > 16)
51722 /* For compatibility with ICC, use the same upper bounds
51723 for simdlen. In particular, for CTYPE below, use the return type,
51724 unless the function returns void, in which case use the characteristic
51725 type. If it is possible for the given SIMDLEN to pass a CTYPE value
51726 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
51727 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
51728 emit the corresponding clone. */
51729 tree ctype = ret_type;
51730 if (TREE_CODE (ret_type) == VOID_TYPE)
51731 ctype = base_type;
51732 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
51733 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
51734 cnt /= clonei->vecsize_int;
51735 else
51736 cnt /= clonei->vecsize_float;
51737 if (cnt > (TARGET_64BIT ? 16 : 8))
51739 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51740 "unsupported simdlen %d", clonei->simdlen);
51741 return 0;
51744 return ret;
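/* Example (a sketch of how the hook is exercised; the function "f" is
   only illustrative): for an exported

     #pragma omp declare simd
     int f (int x);

   all four ISA variants are requested (ret == 4), with mangles 'b' (SSE2,
   128-bit), 'c' (AVX), 'd' (AVX2) and 'e' (AVX-512F).  With simdlen left
   to the hook, an SImode base type gives simdlen 4, 4, 8 and 16
   respectively, i.e. clone symbols of the form _ZGVbN4v_f, _ZGVcN4v_f,
   _ZGVdN8v_f and _ZGVeN16v_f per the x86 vector ABI.  */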
51747 /* Add target attribute to SIMD clone NODE if needed. */
51749 static void
51750 ix86_simd_clone_adjust (struct cgraph_node *node)
51752 const char *str = NULL;
51753 gcc_assert (node->decl == cfun->decl);
51754 switch (node->simdclone->vecsize_mangle)
51756 case 'b':
51757 if (!TARGET_SSE2)
51758 str = "sse2";
51759 break;
51760 case 'c':
51761 if (!TARGET_AVX)
51762 str = "avx";
51763 break;
51764 case 'd':
51765 if (!TARGET_AVX2)
51766 str = "avx2";
51767 break;
51768 case 'e':
51769 if (!TARGET_AVX512F)
51770 str = "avx512f";
51771 break;
51772 default:
51773 gcc_unreachable ();
51775 if (str == NULL)
51776 return;
51777 push_cfun (NULL);
51778 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
51779 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
51780 gcc_assert (ok);
51781 pop_cfun ();
51782 ix86_reset_previous_fndecl ();
51783 ix86_set_current_function (node->decl);
51786 /* If SIMD clone NODE can't be used in a vectorized loop
51787 in current function, return -1, otherwise return a badness of using it
51788 (0 if it is most desirable from vecsize_mangle point of view, 1
51789 slightly less desirable, etc.). */
51791 static int
51792 ix86_simd_clone_usable (struct cgraph_node *node)
51794 switch (node->simdclone->vecsize_mangle)
51796 case 'b':
51797 if (!TARGET_SSE2)
51798 return -1;
51799 if (!TARGET_AVX)
51800 return 0;
51801 return TARGET_AVX2 ? 2 : 1;
51802 case 'c':
51803 if (!TARGET_AVX)
51804 return -1;
51805 return TARGET_AVX2 ? 1 : 0;
51806 case 'd':
51807 if (!TARGET_AVX2)
51808 return -1;
51809 return 0;
51810 case 'e':
51811 if (!TARGET_AVX512F)
51812 return -1;
51813 return 0;
51814 default:
51815 gcc_unreachable ();
51819 /* This function adjusts the unroll factor based on
51820 the hardware capabilities. For example, bdver3 has
51821 a loop buffer which makes unrolling of smaller
51822 loops less important. This function decides the
51823 unroll factor using the number of memory references
51824 (a budget of 32 is used) as a heuristic. */
51826 static unsigned
51827 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
51829 basic_block *bbs;
51830 rtx_insn *insn;
51831 unsigned i;
51832 unsigned mem_count = 0;
51834 if (!TARGET_ADJUST_UNROLL)
51835 return nunroll;
51837 /* Count the number of memory references within the loop body.
51838 This value determines the unrolling factor for bdver3 and bdver4
51839 architectures. */
51840 subrtx_iterator::array_type array;
51841 bbs = get_loop_body (loop);
51842 for (i = 0; i < loop->num_nodes; i++)
51843 FOR_BB_INSNS (bbs[i], insn)
51844 if (NONDEBUG_INSN_P (insn))
51845 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
51846 if (const_rtx x = *iter)
51847 if (MEM_P (x))
51849 machine_mode mode = GET_MODE (x);
51850 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
51851 if (n_words > 4)
51852 mem_count += 2;
51853 else
51854 mem_count += 1;
51856 free (bbs);
51858 if (mem_count && mem_count <= 32)
51859 return 32 / mem_count;
51861 return nunroll;
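/* Worked example (illustrative): a loop body containing four word-sized
   memory references gets an unroll factor of 32 / 4 == 8; a reference
   wider than four words (e.g. a 64-byte ZMM load on a 64-bit target)
   counts twice toward the budget of 32.  */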
51865 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
51867 static bool
51868 ix86_float_exceptions_rounding_supported_p (void)
51870 /* For x87 floating point with standard excess precision handling,
51871 there is no adddf3 pattern (since x87 floating point only has
51872 XFmode operations) so the default hook implementation gets this
51873 wrong. */
51874 return TARGET_80387 || TARGET_SSE_MATH;
51877 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
51879 static void
51880 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
51882 if (!TARGET_80387 && !TARGET_SSE_MATH)
51883 return;
51884 tree exceptions_var = create_tmp_var_raw (integer_type_node);
51885 if (TARGET_80387)
51887 tree fenv_index_type = build_index_type (size_int (6));
51888 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
51889 tree fenv_var = create_tmp_var_raw (fenv_type);
51890 TREE_ADDRESSABLE (fenv_var) = 1;
51891 tree fenv_ptr = build_pointer_type (fenv_type);
51892 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
51893 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
51894 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
51895 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
51896 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
51897 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
51898 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
51899 tree hold_fnclex = build_call_expr (fnclex, 0);
51900 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
51901 NULL_TREE, NULL_TREE);
51902 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
51903 hold_fnclex);
51904 *clear = build_call_expr (fnclex, 0);
51905 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
51906 tree fnstsw_call = build_call_expr (fnstsw, 0);
51907 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
51908 sw_var, fnstsw_call);
51909 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
51910 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
51911 exceptions_var, exceptions_x87);
51912 *update = build2 (COMPOUND_EXPR, integer_type_node,
51913 sw_mod, update_mod);
51914 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
51915 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
51917 if (TARGET_SSE_MATH)
51919 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
51920 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
51921 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
51922 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
51923 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
51924 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
51925 mxcsr_orig_var, stmxcsr_hold_call);
51926 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
51927 mxcsr_orig_var,
51928 build_int_cst (unsigned_type_node, 0x1f80));
51929 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
51930 build_int_cst (unsigned_type_node, 0xffffffc0));
51931 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
51932 mxcsr_mod_var, hold_mod_val);
51933 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
51934 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
51935 hold_assign_orig, hold_assign_mod);
51936 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
51937 ldmxcsr_hold_call);
51938 if (*hold)
51939 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
51940 else
51941 *hold = hold_all;
51942 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
51943 if (*clear)
51944 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
51945 ldmxcsr_clear_call);
51946 else
51947 *clear = ldmxcsr_clear_call;
51948 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
51949 tree exceptions_sse = fold_convert (integer_type_node,
51950 stxmcsr_update_call);
51951 if (*update)
51953 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
51954 exceptions_var, exceptions_sse);
51955 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
51956 exceptions_var, exceptions_mod);
51957 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
51958 exceptions_assign);
51960 else
51961 *update = build2 (MODIFY_EXPR, integer_type_node,
51962 exceptions_var, exceptions_sse);
51963 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
51964 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51965 ldmxcsr_update_call);
51967 tree atomic_feraiseexcept
51968 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
51969 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
51970 1, exceptions_var);
51971 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51972 atomic_feraiseexcept_call);
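/* The HOLD/CLEAR/UPDATE sequences built above are used when expanding an
   atomic compound assignment on a floating-point object, e.g. (a sketch,
   with "d" purely illustrative)

     _Atomic double d;
     ...
     d *= 2.0;   // expands to a compare-exchange loop

   HOLD saves the x87/SSE environment and masks exceptions so that failed
   loop iterations cannot raise spurious traps, CLEAR wipes the sticky
   exception flags before a retry, and UPDATE restores the saved
   environment and re-raises, via __atomic_feraiseexcept, any exceptions
   the successful iteration produced.  */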
51975 /* Return mode to be used for bounds or VOIDmode
51976 if bounds are not supported. */
51978 static machine_mode
51979 ix86_mpx_bound_mode ()
51981 /* Do not support pointer checker if MPX
51982 is not enabled. */
51983 if (!TARGET_MPX)
51985 if (flag_check_pointer_bounds)
51986 warning (0, "Pointer Checker requires MPX support on this target."
51987 " Use the -mmpx option to enable MPX.");
51988 return VOIDmode;
51991 return BNDmode;
51994 /* Return constant used to statically initialize constant bounds.
51996 This function is used to create special bound values. For now
51997 only INIT bounds and NONE bounds are expected. More special
51998 values may be added later. */
52000 static tree
52001 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
52003 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
52004 : build_zero_cst (pointer_sized_int_node);
52005 tree high = ub ? build_zero_cst (pointer_sized_int_node)
52006 : build_minus_one_cst (pointer_sized_int_node);
52008 /* This function is supposed to be used to create INIT and
52009 NONE bounds only. */
52010 gcc_assert ((lb == 0 && ub == -1)
52011 || (lb == -1 && ub == 0));
52013 return build_complex (NULL, low, high);
52016 /* Generate a list of statements STMTS to initialize pointer bounds
52017 variable VAR with bounds LB and UB. Return the number of generated
52018 statements. */
52020 static int
52021 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
52023 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
52024 tree lhs, modify, var_p;
52026 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
52027 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
52029 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
52030 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
52031 append_to_statement_list (modify, stmts);
52033 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
52034 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
52035 TYPE_SIZE_UNIT (pointer_sized_int_node)));
52036 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
52037 append_to_statement_list (modify, stmts);
52039 return 2;
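/* Effectively (a sketch in C-like form, for a 64-bit target) the two
   generated statements are

     ((unsigned long long *) &var)[0] = lb;
     ((unsigned long long *) &var)[1] = ~ub;

   i.e. the upper bound is stored in complemented form, matching the MPX
   in-memory representation of bounds.  */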
52042 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
52043 /* For i386, a common symbol is local only for non-PIE binaries. For
52044 x86-64, a common symbol is local only for non-PIE binaries, or if the
52045 linker supports copy relocs in PIE binaries. */
52047 static bool
52048 ix86_binds_local_p (const_tree exp)
52050 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
52051 (!flag_pic
52052 || (TARGET_64BIT
52053 && HAVE_LD_PIE_COPYRELOC != 0)));
52055 #endif
52057 /* If MEM is in the form of [base+offset], extract the two parts
52058 of address and set to BASE and OFFSET, otherwise return false. */
52060 static bool
52061 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
52063 rtx addr;
52065 gcc_assert (MEM_P (mem));
52067 addr = XEXP (mem, 0);
52069 if (GET_CODE (addr) == CONST)
52070 addr = XEXP (addr, 0);
52072 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
52074 *base = addr;
52075 *offset = const0_rtx;
52076 return true;
52079 if (GET_CODE (addr) == PLUS
52080 && (REG_P (XEXP (addr, 0))
52081 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
52082 && CONST_INT_P (XEXP (addr, 1)))
52084 *base = XEXP (addr, 0);
52085 *offset = XEXP (addr, 1);
52086 return true;
52089 return false;
52092 /* Given OPERANDS of consecutive load/store, check if we can merge
52093 them into move multiple. LOAD is true if they are load instructions.
52094 MODE is the mode of memory operands. */
52096 bool
52097 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
52098 machine_mode mode)
52100 HOST_WIDE_INT offval_1, offval_2, msize;
52101 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
52103 if (load)
52105 mem_1 = operands[1];
52106 mem_2 = operands[3];
52107 reg_1 = operands[0];
52108 reg_2 = operands[2];
52110 else
52112 mem_1 = operands[0];
52113 mem_2 = operands[2];
52114 reg_1 = operands[1];
52115 reg_2 = operands[3];
52118 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
52120 if (REGNO (reg_1) != REGNO (reg_2))
52121 return false;
52123 /* Check if the addresses are in the form of [base+offset]. */
52124 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
52125 return false;
52126 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
52127 return false;
52129 /* Check if the bases are the same. */
52130 if (!rtx_equal_p (base_1, base_2))
52131 return false;
52133 offval_1 = INTVAL (offset_1);
52134 offval_2 = INTVAL (offset_2);
52135 msize = GET_MODE_SIZE (mode);
52136 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
52137 if (offval_1 + msize != offval_2)
52138 return false;
52140 return true;
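/* Worked example (illustrative): with a 4-byte MODE, the memory pair
   [base + 8] and [base + 12] is accepted (same base, offsets differing by
   exactly GET_MODE_SIZE, lower address first), while [base + 8] with
   [base + 16], or operands using different base registers, are rejected.  */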
52143 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
52145 static bool
52146 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
52147 optimization_type opt_type)
52149 switch (op)
52151 case asin_optab:
52152 case acos_optab:
52153 case log1p_optab:
52154 case exp_optab:
52155 case exp10_optab:
52156 case exp2_optab:
52157 case expm1_optab:
52158 case ldexp_optab:
52159 case scalb_optab:
52160 case round_optab:
52161 return opt_type == OPTIMIZE_FOR_SPEED;
52163 case rint_optab:
52164 if (SSE_FLOAT_MODE_P (mode1)
52165 && TARGET_SSE_MATH
52166 && !flag_trapping_math
52167 && !TARGET_ROUND)
52168 return opt_type == OPTIMIZE_FOR_SPEED;
52169 return true;
52171 case floor_optab:
52172 case ceil_optab:
52173 case btrunc_optab:
52174 if (SSE_FLOAT_MODE_P (mode1)
52175 && TARGET_SSE_MATH
52176 && !flag_trapping_math
52177 && TARGET_ROUND)
52178 return true;
52179 return opt_type == OPTIMIZE_FOR_SPEED;
52181 case rsqrt_optab:
52182 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
52184 default:
52185 return true;
52189 /* Address space support.
52191 This is not "far pointers" in the 16-bit sense, but an easy way
52192 to use %fs and %gs segment prefixes. Therefore:
52194 (a) All address spaces have the same modes,
52195 (b) All address spaces have the same address forms,
52196 (c) While %fs and %gs are technically subsets of the generic
52197 address space, they are probably not subsets of each other.
52198 (d) Since we have no access to the segment base register values
52199 without resorting to a system call, we cannot convert a
52200 non-default address space to a default address space.
52201 Therefore we do not claim %fs or %gs are subsets of generic.
52203 Therefore we can (mostly) use the default hooks. */
52205 /* All use of segmentation is assumed to make address 0 valid. */
52207 static bool
52208 ix86_addr_space_zero_address_valid (addr_space_t as)
52210 return as != ADDR_SPACE_GENERIC;
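/* Usage sketch (identifiers are only illustrative): at the source level
   these address spaces appear as the __seg_fs and __seg_gs qualifiers,
   e.g.

     struct pcpu { int counter; };
     extern struct pcpu __seg_gs this_cpu;   // addressed relative to %gs

     int get_counter (void) { return this_cpu.counter; }

   Offset 0 from the %fs or %gs base is an ordinary reference, which is
   why address 0 is reported as valid for the non-generic spaces.  */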
52213 static void
52214 ix86_init_libfuncs (void)
52216 if (TARGET_64BIT)
52218 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
52219 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
52221 else
52223 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
52224 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
52227 #if TARGET_MACHO
52228 darwin_rename_builtins ();
52229 #endif
52232 /* Generate call to __divmoddi4. */
52234 static void
52235 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
52236 rtx op0, rtx op1,
52237 rtx *quot_p, rtx *rem_p)
52239 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
52241 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
52242 mode, 3,
52243 op0, GET_MODE (op0),
52244 op1, GET_MODE (op1),
52245 XEXP (rem, 0), Pmode);
52246 *quot_p = quot;
52247 *rem_p = rem;
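/* The libgcc routine being called has, in C terms, the signature (sketch)

     long long __divmoddi4 (long long num, long long den, long long *rem);

   so the quotient comes back as the return value while the remainder is
   stored through the third argument, which here points at the
   stack-allocated SLOT_TEMP temporary.  */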
52250 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
52251 FPU, assume that the fpcw is set to extended precision; when using
52252 only SSE, rounding is correct; when using both SSE and the FPU,
52253 the rounding precision is indeterminate, since either may be chosen
52254 apparently at random. */
52256 static enum flt_eval_method
52257 ix86_excess_precision (enum excess_precision_type type)
52259 switch (type)
52261 case EXCESS_PRECISION_TYPE_FAST:
52262 /* The fastest type to promote to will always be the native type,
52263 whether that occurs with implicit excess precision or
52264 otherwise. */
52265 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52266 case EXCESS_PRECISION_TYPE_STANDARD:
52267 case EXCESS_PRECISION_TYPE_IMPLICIT:
52268 /* Otherwise, the excess precision we want when we are
52269 in a standards compliant mode, and the implicit precision we
52270 provide would be identical were it not for the unpredictable
52271 cases. */
52272 if (!TARGET_80387)
52273 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52274 else if (!TARGET_MIX_SSE_I387)
52276 if (!TARGET_SSE_MATH)
52277 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
52278 else if (TARGET_SSE2)
52279 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52282 /* If we are in standards compliant mode, but we know we will
52283 calculate in unpredictable precision, return
52284 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
52285 excess precision if the target can't guarantee it will honor
52286 it. */
52287 return (type == EXCESS_PRECISION_TYPE_STANDARD
52288 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
52289 : FLT_EVAL_METHOD_UNPREDICTABLE);
52290 default:
52291 gcc_unreachable ();
52294 return FLT_EVAL_METHOD_UNPREDICTABLE;
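/* Example (a sketch) for the standard and implicit cases: with
   -mfpmath=387 this reports FLT_EVAL_METHOD == 2 (promote to long
   double); with -msse2 -mfpmath=sse it reports 0 (evaluate in the
   nominal type); and with -mfpmath=both the implicit method is
   unpredictable (-1) while -fexcess-precision=standard still yields 0.  */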
52297 /* Target-specific selftests. */
52299 #if CHECKING_P
52301 namespace selftest {
52303 /* Verify that hard regs are dumped as expected (in compact mode). */
52305 static void
52306 ix86_test_dumping_hard_regs ()
52308 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
52309 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
52312 /* Test dumping an insn with repeated references to the same SCRATCH,
52313 to verify the rtx_reuse code. */
52315 static void
52316 ix86_test_dumping_memory_blockage ()
52318 set_new_first_and_last_insn (NULL, NULL);
52320 rtx pat = gen_memory_blockage ();
52321 rtx_reuse_manager r;
52322 r.preprocess (pat);
52324 /* Verify that the repeated references to the SCRATCH are dumped using
52325 reuse IDs. The first should be prefixed with a reuse ID,
52326 and the second should be dumped as a "reuse_rtx" of that ID.
52327 The expected string assumes Pmode == DImode. */
52328 if (Pmode == DImode)
52329 ASSERT_RTL_DUMP_EQ_WITH_REUSE
52330 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
52331 " (unspec:BLK [\n"
52332 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
52333 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
52336 /* Verify loading an RTL dump; specifically a dump of copying
52337 a param on x86_64 from a hard reg into the frame.
52338 This test is target-specific since the dump contains target-specific
52339 hard reg names. */
52341 static void
52342 ix86_test_loading_dump_fragment_1 ()
52344 rtl_dump_test t (SELFTEST_LOCATION,
52345 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
52347 rtx_insn *insn = get_insn_by_uid (1);
52349 /* The block structure and indentation here is purely for
52350 readability; it mirrors the structure of the rtx. */
52351 tree mem_expr;
52353 rtx pat = PATTERN (insn);
52354 ASSERT_EQ (SET, GET_CODE (pat));
52356 rtx dest = SET_DEST (pat);
52357 ASSERT_EQ (MEM, GET_CODE (dest));
52358 /* Verify the "/c" was parsed. */
52359 ASSERT_TRUE (RTX_FLAG (dest, call));
52360 ASSERT_EQ (SImode, GET_MODE (dest));
52362 rtx addr = XEXP (dest, 0);
52363 ASSERT_EQ (PLUS, GET_CODE (addr));
52364 ASSERT_EQ (DImode, GET_MODE (addr));
52366 rtx lhs = XEXP (addr, 0);
52367 /* Verify that the "frame" REG was consolidated. */
52368 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
52371 rtx rhs = XEXP (addr, 1);
52372 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
52373 ASSERT_EQ (-4, INTVAL (rhs));
52376 /* Verify the "[1 i+0 S4 A32]" was parsed. */
52377 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
52378 /* "i" should have been handled by synthesizing a global int
52379 variable named "i". */
52380 mem_expr = MEM_EXPR (dest);
52381 ASSERT_NE (mem_expr, NULL);
52382 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
52383 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
52384 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
52385 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
52386 /* "+0". */
52387 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
52388 ASSERT_EQ (0, MEM_OFFSET (dest));
52389 /* "S4". */
52390 ASSERT_EQ (4, MEM_SIZE (dest));
52391 /* "A32. */
52392 ASSERT_EQ (32, MEM_ALIGN (dest));
52395 rtx src = SET_SRC (pat);
52396 ASSERT_EQ (REG, GET_CODE (src));
52397 ASSERT_EQ (SImode, GET_MODE (src));
52398 ASSERT_EQ (5, REGNO (src));
52399 tree reg_expr = REG_EXPR (src);
52400 /* "i" here should point to the same var as for the MEM_EXPR. */
52401 ASSERT_EQ (reg_expr, mem_expr);
52406 /* Verify that the RTL loader copes with a call_insn dump.
52407 This test is target-specific since the dump contains a target-specific
52408 hard reg name. */
52410 static void
52411 ix86_test_loading_call_insn ()
52413 /* The test dump includes register "xmm0", which requires TARGET_SSE
52414 to exist. */
52415 if (!TARGET_SSE)
52416 return;
52418 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
52420 rtx_insn *insn = get_insns ();
52421 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
52423 /* "/j". */
52424 ASSERT_TRUE (RTX_FLAG (insn, jump));
52426 rtx pat = PATTERN (insn);
52427 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
52429 /* Verify REG_NOTES. */
52431 /* "(expr_list:REG_CALL_DECL". */
52432 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
52433 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
52434 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
52436 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
52437 rtx_expr_list *note1 = note0->next ();
52438 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
52440 ASSERT_EQ (NULL, note1->next ());
52443 /* Verify CALL_INSN_FUNCTION_USAGE. */
52445 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
52446 rtx_expr_list *usage
52447 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
52448 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
52449 ASSERT_EQ (DFmode, GET_MODE (usage));
52450 ASSERT_EQ (USE, GET_CODE (usage->element ()));
52451 ASSERT_EQ (NULL, usage->next ());
52455 /* Verify that the RTL loader copes with a dump from print_rtx_function.
52456 This test is target-specific since the dump contains target-specific
52457 hard reg names. */
52459 static void
52460 ix86_test_loading_full_dump ()
52462 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
52464 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52466 rtx_insn *insn_1 = get_insn_by_uid (1);
52467 ASSERT_EQ (NOTE, GET_CODE (insn_1));
52469 rtx_insn *insn_7 = get_insn_by_uid (7);
52470 ASSERT_EQ (INSN, GET_CODE (insn_7));
52471 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
52473 rtx_insn *insn_15 = get_insn_by_uid (15);
52474 ASSERT_EQ (INSN, GET_CODE (insn_15));
52475 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
52477 /* Verify crtl->return_rtx. */
52478 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
52479 ASSERT_EQ (0, REGNO (crtl->return_rtx));
52480 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
52483 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
52484 In particular, verify that it correctly loads the 2nd operand.
52485 This test is target-specific since these are machine-specific
52486 operands (and enums). */
52488 static void
52489 ix86_test_loading_unspec ()
52491 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
52493 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52495 ASSERT_TRUE (cfun);
52497 /* Test of an UNSPEC. */
52498 rtx_insn *insn = get_insns ();
52499 ASSERT_EQ (INSN, GET_CODE (insn));
52500 rtx set = single_set (insn);
52501 ASSERT_NE (NULL, set);
52502 rtx dst = SET_DEST (set);
52503 ASSERT_EQ (MEM, GET_CODE (dst));
52504 rtx src = SET_SRC (set);
52505 ASSERT_EQ (UNSPEC, GET_CODE (src));
52506 ASSERT_EQ (BLKmode, GET_MODE (src));
52507 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
52509 rtx v0 = XVECEXP (src, 0, 0);
52511 /* Verify that the two uses of the first SCRATCH have pointer
52512 equality. */
52513 rtx scratch_a = XEXP (dst, 0);
52514 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
52516 rtx scratch_b = XEXP (v0, 0);
52517 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
52519 ASSERT_EQ (scratch_a, scratch_b);
52521 /* Verify that the two mems are thus treated as equal. */
52522 ASSERT_TRUE (rtx_equal_p (dst, v0));
52524 /* Verify that the insn is recognized. */
52525 ASSERT_NE (-1, recog_memoized (insn));
52527 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
52528 insn = NEXT_INSN (insn);
52529 ASSERT_EQ (INSN, GET_CODE (insn));
52531 set = single_set (insn);
52532 ASSERT_NE (NULL, set);
52534 src = SET_SRC (set);
52535 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
52536 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
52539 /* Run all target-specific selftests. */
52541 static void
52542 ix86_run_selftests (void)
52544 ix86_test_dumping_hard_regs ();
52545 ix86_test_dumping_memory_blockage ();
52547 /* Various tests of loading RTL dumps, here because they contain
52548 ix86-isms (e.g. names of hard regs). */
52549 ix86_test_loading_dump_fragment_1 ();
52550 ix86_test_loading_call_insn ();
52551 ix86_test_loading_full_dump ();
52552 ix86_test_loading_unspec ();
52555 } // namespace selftest
52557 #endif /* CHECKING_P */
52559 /* Initialize the GCC target structure. */
52560 #undef TARGET_RETURN_IN_MEMORY
52561 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
52563 #undef TARGET_LEGITIMIZE_ADDRESS
52564 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
52566 #undef TARGET_ATTRIBUTE_TABLE
52567 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
52568 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
52569 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
52570 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52571 # undef TARGET_MERGE_DECL_ATTRIBUTES
52572 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
52573 #endif
52575 #undef TARGET_COMP_TYPE_ATTRIBUTES
52576 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
52578 #undef TARGET_INIT_BUILTINS
52579 #define TARGET_INIT_BUILTINS ix86_init_builtins
52580 #undef TARGET_BUILTIN_DECL
52581 #define TARGET_BUILTIN_DECL ix86_builtin_decl
52582 #undef TARGET_EXPAND_BUILTIN
52583 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
52585 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
52586 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
52587 ix86_builtin_vectorized_function
52589 #undef TARGET_VECTORIZE_BUILTIN_GATHER
52590 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
52592 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
52593 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
52595 #undef TARGET_BUILTIN_RECIPROCAL
52596 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
52598 #undef TARGET_ASM_FUNCTION_EPILOGUE
52599 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
52601 #undef TARGET_ENCODE_SECTION_INFO
52602 #ifndef SUBTARGET_ENCODE_SECTION_INFO
52603 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
52604 #else
52605 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
52606 #endif
52608 #undef TARGET_ASM_OPEN_PAREN
52609 #define TARGET_ASM_OPEN_PAREN ""
52610 #undef TARGET_ASM_CLOSE_PAREN
52611 #define TARGET_ASM_CLOSE_PAREN ""
52613 #undef TARGET_ASM_BYTE_OP
52614 #define TARGET_ASM_BYTE_OP ASM_BYTE
52616 #undef TARGET_ASM_ALIGNED_HI_OP
52617 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
52618 #undef TARGET_ASM_ALIGNED_SI_OP
52619 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
52620 #ifdef ASM_QUAD
52621 #undef TARGET_ASM_ALIGNED_DI_OP
52622 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
52623 #endif
52625 #undef TARGET_PROFILE_BEFORE_PROLOGUE
52626 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
52628 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
52629 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
52631 #undef TARGET_ASM_UNALIGNED_HI_OP
52632 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
52633 #undef TARGET_ASM_UNALIGNED_SI_OP
52634 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
52635 #undef TARGET_ASM_UNALIGNED_DI_OP
52636 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
52638 #undef TARGET_PRINT_OPERAND
52639 #define TARGET_PRINT_OPERAND ix86_print_operand
52640 #undef TARGET_PRINT_OPERAND_ADDRESS
52641 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
52642 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
52643 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
52644 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
52645 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
52647 #undef TARGET_SCHED_INIT_GLOBAL
52648 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
52649 #undef TARGET_SCHED_ADJUST_COST
52650 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
52651 #undef TARGET_SCHED_ISSUE_RATE
52652 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
52653 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
52654 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
52655 ia32_multipass_dfa_lookahead
52656 #undef TARGET_SCHED_MACRO_FUSION_P
52657 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
52658 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
52659 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
52661 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
52662 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
52664 #undef TARGET_MEMMODEL_CHECK
52665 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
52667 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
52668 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
52670 #ifdef HAVE_AS_TLS
52671 #undef TARGET_HAVE_TLS
52672 #define TARGET_HAVE_TLS true
52673 #endif
52674 #undef TARGET_CANNOT_FORCE_CONST_MEM
52675 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
52676 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
52677 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
52679 #undef TARGET_DELEGITIMIZE_ADDRESS
52680 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
52682 #undef TARGET_MS_BITFIELD_LAYOUT_P
52683 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
52685 #if TARGET_MACHO
52686 #undef TARGET_BINDS_LOCAL_P
52687 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
52688 #else
52689 #undef TARGET_BINDS_LOCAL_P
52690 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
52691 #endif
52692 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52693 #undef TARGET_BINDS_LOCAL_P
52694 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
52695 #endif
52697 #undef TARGET_ASM_OUTPUT_MI_THUNK
52698 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
52699 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
52700 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
52702 #undef TARGET_ASM_FILE_START
52703 #define TARGET_ASM_FILE_START x86_file_start
52705 #undef TARGET_OPTION_OVERRIDE
52706 #define TARGET_OPTION_OVERRIDE ix86_option_override
52708 #undef TARGET_REGISTER_MOVE_COST
52709 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
52710 #undef TARGET_MEMORY_MOVE_COST
52711 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
52712 #undef TARGET_RTX_COSTS
52713 #define TARGET_RTX_COSTS ix86_rtx_costs
52714 #undef TARGET_ADDRESS_COST
52715 #define TARGET_ADDRESS_COST ix86_address_cost
52717 #undef TARGET_FLAGS_REGNUM
52718 #define TARGET_FLAGS_REGNUM FLAGS_REG
52719 #undef TARGET_FIXED_CONDITION_CODE_REGS
52720 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
52721 #undef TARGET_CC_MODES_COMPATIBLE
52722 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
52724 #undef TARGET_MACHINE_DEPENDENT_REORG
52725 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
52727 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
52728 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
52730 #undef TARGET_BUILD_BUILTIN_VA_LIST
52731 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
52733 #undef TARGET_FOLD_BUILTIN
52734 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
52736 #undef TARGET_GIMPLE_FOLD_BUILTIN
52737 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
52739 #undef TARGET_COMPARE_VERSION_PRIORITY
52740 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
52742 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
52743 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
52744 ix86_generate_version_dispatcher_body
52746 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
52747 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
52748 ix86_get_function_versions_dispatcher
52750 #undef TARGET_ENUM_VA_LIST_P
52751 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
52753 #undef TARGET_FN_ABI_VA_LIST
52754 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
52756 #undef TARGET_CANONICAL_VA_LIST_TYPE
52757 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
52759 #undef TARGET_EXPAND_BUILTIN_VA_START
52760 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
52762 #undef TARGET_MD_ASM_ADJUST
52763 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
52765 #undef TARGET_C_EXCESS_PRECISION
52766 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
52767 #undef TARGET_PROMOTE_PROTOTYPES
52768 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
52769 #undef TARGET_SETUP_INCOMING_VARARGS
52770 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
52771 #undef TARGET_MUST_PASS_IN_STACK
52772 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
52773 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
52774 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
52775 #undef TARGET_FUNCTION_ARG_ADVANCE
52776 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
52777 #undef TARGET_FUNCTION_ARG
52778 #define TARGET_FUNCTION_ARG ix86_function_arg
52779 #undef TARGET_INIT_PIC_REG
52780 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
52781 #undef TARGET_USE_PSEUDO_PIC_REG
52782 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
52783 #undef TARGET_FUNCTION_ARG_BOUNDARY
52784 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
52785 #undef TARGET_PASS_BY_REFERENCE
52786 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
52787 #undef TARGET_INTERNAL_ARG_POINTER
52788 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
52789 #undef TARGET_UPDATE_STACK_BOUNDARY
52790 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
52791 #undef TARGET_GET_DRAP_RTX
52792 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
52793 #undef TARGET_STRICT_ARGUMENT_NAMING
52794 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
52795 #undef TARGET_STATIC_CHAIN
52796 #define TARGET_STATIC_CHAIN ix86_static_chain
52797 #undef TARGET_TRAMPOLINE_INIT
52798 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
52799 #undef TARGET_RETURN_POPS_ARGS
52800 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
52802 #undef TARGET_WARN_FUNC_RETURN
52803 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
52805 #undef TARGET_LEGITIMATE_COMBINED_INSN
52806 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
52808 #undef TARGET_ASAN_SHADOW_OFFSET
52809 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
52811 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
52812 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
52814 #undef TARGET_SCALAR_MODE_SUPPORTED_P
52815 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
52817 #undef TARGET_VECTOR_MODE_SUPPORTED_P
52818 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
52820 #undef TARGET_C_MODE_FOR_SUFFIX
52821 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
52823 #ifdef HAVE_AS_TLS
52824 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
52825 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
52826 #endif
52828 #ifdef SUBTARGET_INSERT_ATTRIBUTES
52829 #undef TARGET_INSERT_ATTRIBUTES
52830 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
52831 #endif
52833 #undef TARGET_MANGLE_TYPE
52834 #define TARGET_MANGLE_TYPE ix86_mangle_type
52836 #ifdef TARGET_THREAD_SSP_OFFSET
52837 #undef TARGET_STACK_PROTECT_GUARD
52838 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
52839 #endif
52841 #if !TARGET_MACHO
52842 #undef TARGET_STACK_PROTECT_FAIL
52843 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
52844 #endif
52846 #undef TARGET_FUNCTION_VALUE
52847 #define TARGET_FUNCTION_VALUE ix86_function_value
52849 #undef TARGET_FUNCTION_VALUE_REGNO_P
52850 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
52852 #undef TARGET_PROMOTE_FUNCTION_MODE
52853 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
52855 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
52856 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
52858 #undef TARGET_MEMBER_TYPE_FORCES_BLK
52859 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
52861 #undef TARGET_INSTANTIATE_DECLS
52862 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
52864 #undef TARGET_SECONDARY_RELOAD
52865 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
52867 #undef TARGET_CLASS_MAX_NREGS
52868 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
52870 #undef TARGET_PREFERRED_RELOAD_CLASS
52871 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
52872 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
52873 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
52874 #undef TARGET_CLASS_LIKELY_SPILLED_P
52875 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
52877 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
52878 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
52879 ix86_builtin_vectorization_cost
52880 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
52881 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
52882 ix86_vectorize_vec_perm_const_ok
52883 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
52884 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
52885 ix86_preferred_simd_mode
52886 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
52887 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
52888 ix86_autovectorize_vector_sizes
52889 #undef TARGET_VECTORIZE_GET_MASK_MODE
52890 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
52891 #undef TARGET_VECTORIZE_INIT_COST
52892 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
52893 #undef TARGET_VECTORIZE_ADD_STMT_COST
52894 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
52895 #undef TARGET_VECTORIZE_FINISH_COST
52896 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
52897 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
52898 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
52900 #undef TARGET_SET_CURRENT_FUNCTION
52901 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
52903 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
52904 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
52906 #undef TARGET_OPTION_SAVE
52907 #define TARGET_OPTION_SAVE ix86_function_specific_save
52909 #undef TARGET_OPTION_RESTORE
52910 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
52912 #undef TARGET_OPTION_POST_STREAM_IN
52913 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
52915 #undef TARGET_OPTION_PRINT
52916 #define TARGET_OPTION_PRINT ix86_function_specific_print
52918 #undef TARGET_OPTION_FUNCTION_VERSIONS
52919 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
52921 #undef TARGET_CAN_INLINE_P
52922 #define TARGET_CAN_INLINE_P ix86_can_inline_p
52924 #undef TARGET_LEGITIMATE_ADDRESS_P
52925 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
52927 #undef TARGET_REGISTER_PRIORITY
52928 #define TARGET_REGISTER_PRIORITY ix86_register_priority
52930 #undef TARGET_REGISTER_USAGE_LEVELING_P
52931 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
52933 #undef TARGET_LEGITIMATE_CONSTANT_P
52934 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
52936 #undef TARGET_COMPUTE_FRAME_LAYOUT
52937 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
52939 #undef TARGET_FRAME_POINTER_REQUIRED
52940 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
52942 #undef TARGET_CAN_ELIMINATE
52943 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
52945 #undef TARGET_EXTRA_LIVE_ON_ENTRY
52946 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
52948 #undef TARGET_ASM_CODE_END
52949 #define TARGET_ASM_CODE_END ix86_code_end
52951 #undef TARGET_CONDITIONAL_REGISTER_USAGE
52952 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
52954 #undef TARGET_LOOP_UNROLL_ADJUST
52955 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
52957 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
52958 #undef TARGET_SPILL_CLASS
52959 #define TARGET_SPILL_CLASS ix86_spill_class
52961 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
52962 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
52963 ix86_simd_clone_compute_vecsize_and_simdlen
52965 #undef TARGET_SIMD_CLONE_ADJUST
52966 #define TARGET_SIMD_CLONE_ADJUST \
52967 ix86_simd_clone_adjust
52969 #undef TARGET_SIMD_CLONE_USABLE
52970 #define TARGET_SIMD_CLONE_USABLE \
52971 ix86_simd_clone_usable
52973 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
52974 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
52975 ix86_float_exceptions_rounding_supported_p
52977 #undef TARGET_MODE_EMIT
52978 #define TARGET_MODE_EMIT ix86_emit_mode_set
52980 #undef TARGET_MODE_NEEDED
52981 #define TARGET_MODE_NEEDED ix86_mode_needed
52983 #undef TARGET_MODE_AFTER
52984 #define TARGET_MODE_AFTER ix86_mode_after
52986 #undef TARGET_MODE_ENTRY
52987 #define TARGET_MODE_ENTRY ix86_mode_entry
52989 #undef TARGET_MODE_EXIT
52990 #define TARGET_MODE_EXIT ix86_mode_exit
52992 #undef TARGET_MODE_PRIORITY
52993 #define TARGET_MODE_PRIORITY ix86_mode_priority
52995 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
52996 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
52998 #undef TARGET_LOAD_BOUNDS_FOR_ARG
52999 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
53001 #undef TARGET_STORE_BOUNDS_FOR_ARG
53002 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
53004 #undef TARGET_LOAD_RETURNED_BOUNDS
53005 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
53007 #undef TARGET_STORE_RETURNED_BOUNDS
53008 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
53010 #undef TARGET_CHKP_BOUND_MODE
53011 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
53013 #undef TARGET_BUILTIN_CHKP_FUNCTION
53014 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
53016 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
53017 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
53019 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
53020 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
53022 #undef TARGET_CHKP_INITIALIZE_BOUNDS
53023 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
53025 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
53026 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
53028 #undef TARGET_OFFLOAD_OPTIONS
53029 #define TARGET_OFFLOAD_OPTIONS \
53030 ix86_offload_options
53032 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
53033 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
53035 #undef TARGET_OPTAB_SUPPORTED_P
53036 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
53038 #undef TARGET_HARD_REGNO_SCRATCH_OK
53039 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
53041 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
53042 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
53044 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
53045 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
53047 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
53048 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
53050 #undef TARGET_INIT_LIBFUNCS
53051 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
53053 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
53054 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
53056 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
53057 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
53059 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
53060 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
53062 #if CHECKING_P
53063 #undef TARGET_RUN_TARGET_SELFTESTS
53064 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
53065 #endif /* #if CHECKING_P */
53067 struct gcc_target targetm = TARGET_INITIALIZER;
53069 #include "gt-i386.h"