x86: Properly check saved register CFA offset
[official-gcc.git] / gcc / config / i386 / i386.c
blob: f1486ff3750dc49799dce44ac0333df2816f399a
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
89 /* This file should be included last. */
90 #include "target-def.h"
92 static rtx legitimize_dllimport_symbol (rtx, bool);
93 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
94 static rtx legitimize_pe_coff_symbol (rtx, bool);
95 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
96 static bool ix86_save_reg (unsigned int, bool, bool);
98 #ifndef CHECK_STACK_LIMIT
99 #define CHECK_STACK_LIMIT (-1)
100 #endif
102 /* Return index of given mode in mult and division cost tables. */
103 #define MODE_INDEX(mode) \
104 ((mode) == QImode ? 0 \
105 : (mode) == HImode ? 1 \
106 : (mode) == SImode ? 2 \
107 : (mode) == DImode ? 3 \
108 : 4)
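/* Illustrative sketch (not part of the original file): how MODE_INDEX is used
   to select a slot in the per-mode cost arrays of struct processor_costs.
   The field names mult_init[] and mult_bit are taken from the declaration in
   i386.h and should be treated as assumptions here.  */
#if 0
static int
example_mult_cost (const struct processor_costs *cost, machine_mode mode,
                   int bits_set)
{
  /* QImode..DImode map to slots 0..3; any other mode falls into slot 4,
     the "other" entry, so e.g. MODE_INDEX (SImode) == 2.  */
  return cost->mult_init[MODE_INDEX (mode)] + bits_set * cost->mult_bit;
}
#endif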
110 /* Processor costs (relative to an add) */
111 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
112 #define COSTS_N_BYTES(N) ((N) * 2)
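/* Worked example of the scale: with COSTS_N_INSNS (N) == (N) * 4 and an add
   taking 2 bytes, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so the
   size-tuning table below uses the same units as the speed-tuning tables:
   one add-sized (2-byte) instruction costs 4, and COSTS_N_BYTES (3) == 6
   models a 3-byte instruction as one and a half adds.  */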
114 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
116 static stringop_algs ix86_size_memcpy[2] = {
117 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
118 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
119 static stringop_algs ix86_size_memset[2] = {
120 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
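/* Reader's note (illustrative; the field names follow the stringop_algs
   declaration in i386.h and are assumptions here): each initializer above is
   { alg_for_unknown_size, {{max, alg, noalign}, ...}}, where max == -1 means
   "any size".  Slot [0] of the two-element arrays is the 32-bit strategy and
   slot [1] the 64-bit one.  ix86_size_memcpy and ix86_size_memset therefore
   say: when tuning for size, use "rep movsb"/"rep stosb" (rep_prefix_1_byte)
   for every block size on both targets.  DUMMY_STRINGOP_ALGS fills slots
   that can never be selected, e.g. the 64-bit slot of 32-bit-only CPUs.  */
#if 0
/* Simplified sketch of how such a table could be consulted; the real
   selection logic lives in decide_alg later in this file.  */
static enum stringop_alg
example_pick_alg (const stringop_algs *algs, HOST_WIDE_INT size)
{
  for (unsigned int i = 0; i < MAX_STRINGOP_ALGS; i++)
    if (algs->size[i].max == -1 || size <= algs->size[i].max)
      return algs->size[i].alg;
  return libcall;
}
#endif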
123 const
124 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
125 COSTS_N_BYTES (2), /* cost of an add instruction */
126 COSTS_N_BYTES (3), /* cost of a lea instruction */
127 COSTS_N_BYTES (2), /* variable shift costs */
128 COSTS_N_BYTES (3), /* constant shift costs */
129 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
130 COSTS_N_BYTES (3), /* HI */
131 COSTS_N_BYTES (3), /* SI */
132 COSTS_N_BYTES (3), /* DI */
133 COSTS_N_BYTES (5)}, /* other */
134 0, /* cost of multiply per each bit set */
135 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
136 COSTS_N_BYTES (3), /* HI */
137 COSTS_N_BYTES (3), /* SI */
138 COSTS_N_BYTES (3), /* DI */
139 COSTS_N_BYTES (5)}, /* other */
140 COSTS_N_BYTES (3), /* cost of movsx */
141 COSTS_N_BYTES (3), /* cost of movzx */
142 0, /* "large" insn */
143 2, /* MOVE_RATIO */
144 2, /* cost for loading QImode using movzbl */
145 {2, 2, 2}, /* cost of loading integer registers
146 in QImode, HImode and SImode.
147 Relative to reg-reg move (2). */
148 {2, 2, 2}, /* cost of storing integer registers */
149 2, /* cost of reg,reg fld/fst */
150 {2, 2, 2}, /* cost of loading fp registers
151 in SFmode, DFmode and XFmode */
152 {2, 2, 2}, /* cost of storing fp registers
153 in SFmode, DFmode and XFmode */
154 3, /* cost of moving MMX register */
155 {3, 3}, /* cost of loading MMX registers
156 in SImode and DImode */
157 {3, 3}, /* cost of storing MMX registers
158 in SImode and DImode */
159 3, /* cost of moving SSE register */
160 {3, 3, 3}, /* cost of loading SSE registers
161 in SImode, DImode and TImode */
162 {3, 3, 3}, /* cost of storing SSE registers
163 in SImode, DImode and TImode */
164 3, /* MMX or SSE register to integer */
165 0, /* size of l1 cache */
166 0, /* size of l2 cache */
167 0, /* size of prefetch block */
168 0, /* number of parallel prefetches */
169 2, /* Branch cost */
170 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
171 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
172 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
173 COSTS_N_BYTES (2), /* cost of FABS instruction. */
174 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
175 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
176 ix86_size_memcpy,
177 ix86_size_memset,
178 1, /* scalar_stmt_cost. */
179 1, /* scalar load_cost. */
180 1, /* scalar_store_cost. */
181 1, /* vec_stmt_cost. */
182 1, /* vec_to_scalar_cost. */
183 1, /* scalar_to_vec_cost. */
184 1, /* vec_align_load_cost. */
185 1, /* vec_unalign_load_cost. */
186 1, /* vec_store_cost. */
187 1, /* cond_taken_branch_cost. */
188 1, /* cond_not_taken_branch_cost. */
189 };
191 /* Processor costs (relative to an add) */
192 static stringop_algs i386_memcpy[2] = {
193 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
194 DUMMY_STRINGOP_ALGS};
195 static stringop_algs i386_memset[2] = {
196 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
197 DUMMY_STRINGOP_ALGS};
199 static const
200 struct processor_costs i386_cost = { /* 386 specific costs */
201 COSTS_N_INSNS (1), /* cost of an add instruction */
202 COSTS_N_INSNS (1), /* cost of a lea instruction */
203 COSTS_N_INSNS (3), /* variable shift costs */
204 COSTS_N_INSNS (2), /* constant shift costs */
205 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
206 COSTS_N_INSNS (6), /* HI */
207 COSTS_N_INSNS (6), /* SI */
208 COSTS_N_INSNS (6), /* DI */
209 COSTS_N_INSNS (6)}, /* other */
210 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
211 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
212 COSTS_N_INSNS (23), /* HI */
213 COSTS_N_INSNS (23), /* SI */
214 COSTS_N_INSNS (23), /* DI */
215 COSTS_N_INSNS (23)}, /* other */
216 COSTS_N_INSNS (3), /* cost of movsx */
217 COSTS_N_INSNS (2), /* cost of movzx */
218 15, /* "large" insn */
219 3, /* MOVE_RATIO */
220 4, /* cost for loading QImode using movzbl */
221 {2, 4, 2}, /* cost of loading integer registers
222 in QImode, HImode and SImode.
223 Relative to reg-reg move (2). */
224 {2, 4, 2}, /* cost of storing integer registers */
225 2, /* cost of reg,reg fld/fst */
226 {8, 8, 8}, /* cost of loading fp registers
227 in SFmode, DFmode and XFmode */
228 {8, 8, 8}, /* cost of storing fp registers
229 in SFmode, DFmode and XFmode */
230 2, /* cost of moving MMX register */
231 {4, 8}, /* cost of loading MMX registers
232 in SImode and DImode */
233 {4, 8}, /* cost of storing MMX registers
234 in SImode and DImode */
235 2, /* cost of moving SSE register */
236 {4, 8, 16}, /* cost of loading SSE registers
237 in SImode, DImode and TImode */
238 {4, 8, 16}, /* cost of storing SSE registers
239 in SImode, DImode and TImode */
240 3, /* MMX or SSE register to integer */
241 0, /* size of l1 cache */
242 0, /* size of l2 cache */
243 0, /* size of prefetch block */
244 0, /* number of parallel prefetches */
245 1, /* Branch cost */
246 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
247 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
248 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
249 COSTS_N_INSNS (22), /* cost of FABS instruction. */
250 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
251 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
252 i386_memcpy,
253 i386_memset,
254 1, /* scalar_stmt_cost. */
255 1, /* scalar load_cost. */
256 1, /* scalar_store_cost. */
257 1, /* vec_stmt_cost. */
258 1, /* vec_to_scalar_cost. */
259 1, /* scalar_to_vec_cost. */
260 1, /* vec_align_load_cost. */
261 2, /* vec_unalign_load_cost. */
262 1, /* vec_store_cost. */
263 3, /* cond_taken_branch_cost. */
264 1, /* cond_not_taken_branch_cost. */
265 };
267 static stringop_algs i486_memcpy[2] = {
268 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
269 DUMMY_STRINGOP_ALGS};
270 static stringop_algs i486_memset[2] = {
271 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
272 DUMMY_STRINGOP_ALGS};
274 static const
275 struct processor_costs i486_cost = { /* 486 specific costs */
276 COSTS_N_INSNS (1), /* cost of an add instruction */
277 COSTS_N_INSNS (1), /* cost of a lea instruction */
278 COSTS_N_INSNS (3), /* variable shift costs */
279 COSTS_N_INSNS (2), /* constant shift costs */
280 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
281 COSTS_N_INSNS (12), /* HI */
282 COSTS_N_INSNS (12), /* SI */
283 COSTS_N_INSNS (12), /* DI */
284 COSTS_N_INSNS (12)}, /* other */
285 1, /* cost of multiply per each bit set */
286 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
287 COSTS_N_INSNS (40), /* HI */
288 COSTS_N_INSNS (40), /* SI */
289 COSTS_N_INSNS (40), /* DI */
290 COSTS_N_INSNS (40)}, /* other */
291 COSTS_N_INSNS (3), /* cost of movsx */
292 COSTS_N_INSNS (2), /* cost of movzx */
293 15, /* "large" insn */
294 3, /* MOVE_RATIO */
295 4, /* cost for loading QImode using movzbl */
296 {2, 4, 2}, /* cost of loading integer registers
297 in QImode, HImode and SImode.
298 Relative to reg-reg move (2). */
299 {2, 4, 2}, /* cost of storing integer registers */
300 2, /* cost of reg,reg fld/fst */
301 {8, 8, 8}, /* cost of loading fp registers
302 in SFmode, DFmode and XFmode */
303 {8, 8, 8}, /* cost of storing fp registers
304 in SFmode, DFmode and XFmode */
305 2, /* cost of moving MMX register */
306 {4, 8}, /* cost of loading MMX registers
307 in SImode and DImode */
308 {4, 8}, /* cost of storing MMX registers
309 in SImode and DImode */
310 2, /* cost of moving SSE register */
311 {4, 8, 16}, /* cost of loading SSE registers
312 in SImode, DImode and TImode */
313 {4, 8, 16}, /* cost of storing SSE registers
314 in SImode, DImode and TImode */
315 3, /* MMX or SSE register to integer */
316 4, /* size of l1 cache. 486 has 8kB cache
317 shared for code and data, so 4kB is
318 not really precise. */
319 4, /* size of l2 cache */
320 0, /* size of prefetch block */
321 0, /* number of parallel prefetches */
322 1, /* Branch cost */
323 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
324 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
325 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
326 COSTS_N_INSNS (3), /* cost of FABS instruction. */
327 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
328 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
329 i486_memcpy,
330 i486_memset,
331 1, /* scalar_stmt_cost. */
332 1, /* scalar load_cost. */
333 1, /* scalar_store_cost. */
334 1, /* vec_stmt_cost. */
335 1, /* vec_to_scalar_cost. */
336 1, /* scalar_to_vec_cost. */
337 1, /* vec_align_load_cost. */
338 2, /* vec_unalign_load_cost. */
339 1, /* vec_store_cost. */
340 3, /* cond_taken_branch_cost. */
341 1, /* cond_not_taken_branch_cost. */
342 };
344 static stringop_algs pentium_memcpy[2] = {
345 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
346 DUMMY_STRINGOP_ALGS};
347 static stringop_algs pentium_memset[2] = {
348 {libcall, {{-1, rep_prefix_4_byte, false}}},
349 DUMMY_STRINGOP_ALGS};
351 static const
352 struct processor_costs pentium_cost = {
353 COSTS_N_INSNS (1), /* cost of an add instruction */
354 COSTS_N_INSNS (1), /* cost of a lea instruction */
355 COSTS_N_INSNS (4), /* variable shift costs */
356 COSTS_N_INSNS (1), /* constant shift costs */
357 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
358 COSTS_N_INSNS (11), /* HI */
359 COSTS_N_INSNS (11), /* SI */
360 COSTS_N_INSNS (11), /* DI */
361 COSTS_N_INSNS (11)}, /* other */
362 0, /* cost of multiply per each bit set */
363 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
364 COSTS_N_INSNS (25), /* HI */
365 COSTS_N_INSNS (25), /* SI */
366 COSTS_N_INSNS (25), /* DI */
367 COSTS_N_INSNS (25)}, /* other */
368 COSTS_N_INSNS (3), /* cost of movsx */
369 COSTS_N_INSNS (2), /* cost of movzx */
370 8, /* "large" insn */
371 6, /* MOVE_RATIO */
372 6, /* cost for loading QImode using movzbl */
373 {2, 4, 2}, /* cost of loading integer registers
374 in QImode, HImode and SImode.
375 Relative to reg-reg move (2). */
376 {2, 4, 2}, /* cost of storing integer registers */
377 2, /* cost of reg,reg fld/fst */
378 {2, 2, 6}, /* cost of loading fp registers
379 in SFmode, DFmode and XFmode */
380 {4, 4, 6}, /* cost of storing fp registers
381 in SFmode, DFmode and XFmode */
382 8, /* cost of moving MMX register */
383 {8, 8}, /* cost of loading MMX registers
384 in SImode and DImode */
385 {8, 8}, /* cost of storing MMX registers
386 in SImode and DImode */
387 2, /* cost of moving SSE register */
388 {4, 8, 16}, /* cost of loading SSE registers
389 in SImode, DImode and TImode */
390 {4, 8, 16}, /* cost of storing SSE registers
391 in SImode, DImode and TImode */
392 3, /* MMX or SSE register to integer */
393 8, /* size of l1 cache. */
394 8, /* size of l2 cache */
395 0, /* size of prefetch block */
396 0, /* number of parallel prefetches */
397 2, /* Branch cost */
398 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
399 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
400 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
401 COSTS_N_INSNS (1), /* cost of FABS instruction. */
402 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
403 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
404 pentium_memcpy,
405 pentium_memset,
406 1, /* scalar_stmt_cost. */
407 1, /* scalar load_cost. */
408 1, /* scalar_store_cost. */
409 1, /* vec_stmt_cost. */
410 1, /* vec_to_scalar_cost. */
411 1, /* scalar_to_vec_cost. */
412 1, /* vec_align_load_cost. */
413 2, /* vec_unalign_load_cost. */
414 1, /* vec_store_cost. */
415 3, /* cond_taken_branch_cost. */
416 1, /* cond_not_taken_branch_cost. */
417 };
419 static const
420 struct processor_costs lakemont_cost = {
421 COSTS_N_INSNS (1), /* cost of an add instruction */
422 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
423 COSTS_N_INSNS (1), /* variable shift costs */
424 COSTS_N_INSNS (1), /* constant shift costs */
425 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
426 COSTS_N_INSNS (11), /* HI */
427 COSTS_N_INSNS (11), /* SI */
428 COSTS_N_INSNS (11), /* DI */
429 COSTS_N_INSNS (11)}, /* other */
430 0, /* cost of multiply per each bit set */
431 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
432 COSTS_N_INSNS (25), /* HI */
433 COSTS_N_INSNS (25), /* SI */
434 COSTS_N_INSNS (25), /* DI */
435 COSTS_N_INSNS (25)}, /* other */
436 COSTS_N_INSNS (3), /* cost of movsx */
437 COSTS_N_INSNS (2), /* cost of movzx */
438 8, /* "large" insn */
439 17, /* MOVE_RATIO */
440 6, /* cost for loading QImode using movzbl */
441 {2, 4, 2}, /* cost of loading integer registers
442 in QImode, HImode and SImode.
443 Relative to reg-reg move (2). */
444 {2, 4, 2}, /* cost of storing integer registers */
445 2, /* cost of reg,reg fld/fst */
446 {2, 2, 6}, /* cost of loading fp registers
447 in SFmode, DFmode and XFmode */
448 {4, 4, 6}, /* cost of storing fp registers
449 in SFmode, DFmode and XFmode */
450 8, /* cost of moving MMX register */
451 {8, 8}, /* cost of loading MMX registers
452 in SImode and DImode */
453 {8, 8}, /* cost of storing MMX registers
454 in SImode and DImode */
455 2, /* cost of moving SSE register */
456 {4, 8, 16}, /* cost of loading SSE registers
457 in SImode, DImode and TImode */
458 {4, 8, 16}, /* cost of storing SSE registers
459 in SImode, DImode and TImode */
460 3, /* MMX or SSE register to integer */
461 8, /* size of l1 cache. */
462 8, /* size of l2 cache */
463 0, /* size of prefetch block */
464 0, /* number of parallel prefetches */
465 2, /* Branch cost */
466 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
467 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
468 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
469 COSTS_N_INSNS (1), /* cost of FABS instruction. */
470 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
471 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
472 pentium_memcpy,
473 pentium_memset,
474 1, /* scalar_stmt_cost. */
475 1, /* scalar load_cost. */
476 1, /* scalar_store_cost. */
477 1, /* vec_stmt_cost. */
478 1, /* vec_to_scalar_cost. */
479 1, /* scalar_to_vec_cost. */
480 1, /* vec_align_load_cost. */
481 2, /* vec_unalign_load_cost. */
482 1, /* vec_store_cost. */
483 3, /* cond_taken_branch_cost. */
484 1, /* cond_not_taken_branch_cost. */
485 };
487 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
488 (we ensure the alignment). For small blocks an inline loop is still a
489 noticeable win; for bigger blocks either rep movsl or rep movsb is
490 the way to go. Rep movsb apparently has a more expensive startup time in the CPU,
491 but after 4K the difference is down in the noise. */
492 static stringop_algs pentiumpro_memcpy[2] = {
493 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
494 {8192, rep_prefix_4_byte, false},
495 {-1, rep_prefix_1_byte, false}}},
496 DUMMY_STRINGOP_ALGS};
497 static stringop_algs pentiumpro_memset[2] = {
498 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
499 {8192, rep_prefix_4_byte, false},
500 {-1, libcall, false}}},
501 DUMMY_STRINGOP_ALGS};
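/* Reading pentiumpro_memcpy against the comment above (illustrative note,
   using the stringop_algs layout described earlier): copies of up to 128
   bytes use an inline loop, up to 1024 bytes an unrolled loop, up to 8192
   bytes "rep movsl" (rep_prefix_4_byte), and anything larger "rep movsb"
   (rep_prefix_1_byte); a compile-time-unknown size also gets
   rep_prefix_4_byte.  Only the 32-bit slot is populated, since PentiumPro
   has no 64-bit mode.  */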
502 static const
503 struct processor_costs pentiumpro_cost = {
504 COSTS_N_INSNS (1), /* cost of an add instruction */
505 COSTS_N_INSNS (1), /* cost of a lea instruction */
506 COSTS_N_INSNS (1), /* variable shift costs */
507 COSTS_N_INSNS (1), /* constant shift costs */
508 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
509 COSTS_N_INSNS (4), /* HI */
510 COSTS_N_INSNS (4), /* SI */
511 COSTS_N_INSNS (4), /* DI */
512 COSTS_N_INSNS (4)}, /* other */
513 0, /* cost of multiply per each bit set */
514 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
515 COSTS_N_INSNS (17), /* HI */
516 COSTS_N_INSNS (17), /* SI */
517 COSTS_N_INSNS (17), /* DI */
518 COSTS_N_INSNS (17)}, /* other */
519 COSTS_N_INSNS (1), /* cost of movsx */
520 COSTS_N_INSNS (1), /* cost of movzx */
521 8, /* "large" insn */
522 6, /* MOVE_RATIO */
523 2, /* cost for loading QImode using movzbl */
524 {4, 4, 4}, /* cost of loading integer registers
525 in QImode, HImode and SImode.
526 Relative to reg-reg move (2). */
527 {2, 2, 2}, /* cost of storing integer registers */
528 2, /* cost of reg,reg fld/fst */
529 {2, 2, 6}, /* cost of loading fp registers
530 in SFmode, DFmode and XFmode */
531 {4, 4, 6}, /* cost of storing fp registers
532 in SFmode, DFmode and XFmode */
533 2, /* cost of moving MMX register */
534 {2, 2}, /* cost of loading MMX registers
535 in SImode and DImode */
536 {2, 2}, /* cost of storing MMX registers
537 in SImode and DImode */
538 2, /* cost of moving SSE register */
539 {2, 2, 8}, /* cost of loading SSE registers
540 in SImode, DImode and TImode */
541 {2, 2, 8}, /* cost of storing SSE registers
542 in SImode, DImode and TImode */
543 3, /* MMX or SSE register to integer */
544 8, /* size of l1 cache. */
545 256, /* size of l2 cache */
546 32, /* size of prefetch block */
547 6, /* number of parallel prefetches */
548 2, /* Branch cost */
549 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
550 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
551 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
552 COSTS_N_INSNS (2), /* cost of FABS instruction. */
553 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
554 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
555 pentiumpro_memcpy,
556 pentiumpro_memset,
557 1, /* scalar_stmt_cost. */
558 1, /* scalar load_cost. */
559 1, /* scalar_store_cost. */
560 1, /* vec_stmt_cost. */
561 1, /* vec_to_scalar_cost. */
562 1, /* scalar_to_vec_cost. */
563 1, /* vec_align_load_cost. */
564 2, /* vec_unalign_load_cost. */
565 1, /* vec_store_cost. */
566 3, /* cond_taken_branch_cost. */
567 1, /* cond_not_taken_branch_cost. */
568 };
570 static stringop_algs geode_memcpy[2] = {
571 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
572 DUMMY_STRINGOP_ALGS};
573 static stringop_algs geode_memset[2] = {
574 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
575 DUMMY_STRINGOP_ALGS};
576 static const
577 struct processor_costs geode_cost = {
578 COSTS_N_INSNS (1), /* cost of an add instruction */
579 COSTS_N_INSNS (1), /* cost of a lea instruction */
580 COSTS_N_INSNS (2), /* variable shift costs */
581 COSTS_N_INSNS (1), /* constant shift costs */
582 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
583 COSTS_N_INSNS (4), /* HI */
584 COSTS_N_INSNS (7), /* SI */
585 COSTS_N_INSNS (7), /* DI */
586 COSTS_N_INSNS (7)}, /* other */
587 0, /* cost of multiply per each bit set */
588 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
589 COSTS_N_INSNS (23), /* HI */
590 COSTS_N_INSNS (39), /* SI */
591 COSTS_N_INSNS (39), /* DI */
592 COSTS_N_INSNS (39)}, /* other */
593 COSTS_N_INSNS (1), /* cost of movsx */
594 COSTS_N_INSNS (1), /* cost of movzx */
595 8, /* "large" insn */
596 4, /* MOVE_RATIO */
597 1, /* cost for loading QImode using movzbl */
598 {1, 1, 1}, /* cost of loading integer registers
599 in QImode, HImode and SImode.
600 Relative to reg-reg move (2). */
601 {1, 1, 1}, /* cost of storing integer registers */
602 1, /* cost of reg,reg fld/fst */
603 {1, 1, 1}, /* cost of loading fp registers
604 in SFmode, DFmode and XFmode */
605 {4, 6, 6}, /* cost of storing fp registers
606 in SFmode, DFmode and XFmode */
608 2, /* cost of moving MMX register */
609 {2, 2}, /* cost of loading MMX registers
610 in SImode and DImode */
611 {2, 2}, /* cost of storing MMX registers
612 in SImode and DImode */
613 2, /* cost of moving SSE register */
614 {2, 2, 8}, /* cost of loading SSE registers
615 in SImode, DImode and TImode */
616 {2, 2, 8}, /* cost of storing SSE registers
617 in SImode, DImode and TImode */
618 3, /* MMX or SSE register to integer */
619 64, /* size of l1 cache. */
620 128, /* size of l2 cache. */
621 32, /* size of prefetch block */
622 1, /* number of parallel prefetches */
623 1, /* Branch cost */
624 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
625 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
626 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
627 COSTS_N_INSNS (1), /* cost of FABS instruction. */
628 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
629 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
630 geode_memcpy,
631 geode_memset,
632 1, /* scalar_stmt_cost. */
633 1, /* scalar load_cost. */
634 1, /* scalar_store_cost. */
635 1, /* vec_stmt_cost. */
636 1, /* vec_to_scalar_cost. */
637 1, /* scalar_to_vec_cost. */
638 1, /* vec_align_load_cost. */
639 2, /* vec_unalign_load_cost. */
640 1, /* vec_store_cost. */
641 3, /* cond_taken_branch_cost. */
642 1, /* cond_not_taken_branch_cost. */
643 };
645 static stringop_algs k6_memcpy[2] = {
646 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
647 DUMMY_STRINGOP_ALGS};
648 static stringop_algs k6_memset[2] = {
649 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
650 DUMMY_STRINGOP_ALGS};
651 static const
652 struct processor_costs k6_cost = {
653 COSTS_N_INSNS (1), /* cost of an add instruction */
654 COSTS_N_INSNS (2), /* cost of a lea instruction */
655 COSTS_N_INSNS (1), /* variable shift costs */
656 COSTS_N_INSNS (1), /* constant shift costs */
657 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
658 COSTS_N_INSNS (3), /* HI */
659 COSTS_N_INSNS (3), /* SI */
660 COSTS_N_INSNS (3), /* DI */
661 COSTS_N_INSNS (3)}, /* other */
662 0, /* cost of multiply per each bit set */
663 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
664 COSTS_N_INSNS (18), /* HI */
665 COSTS_N_INSNS (18), /* SI */
666 COSTS_N_INSNS (18), /* DI */
667 COSTS_N_INSNS (18)}, /* other */
668 COSTS_N_INSNS (2), /* cost of movsx */
669 COSTS_N_INSNS (2), /* cost of movzx */
670 8, /* "large" insn */
671 4, /* MOVE_RATIO */
672 3, /* cost for loading QImode using movzbl */
673 {4, 5, 4}, /* cost of loading integer registers
674 in QImode, HImode and SImode.
675 Relative to reg-reg move (2). */
676 {2, 3, 2}, /* cost of storing integer registers */
677 4, /* cost of reg,reg fld/fst */
678 {6, 6, 6}, /* cost of loading fp registers
679 in SFmode, DFmode and XFmode */
680 {4, 4, 4}, /* cost of storing fp registers
681 in SFmode, DFmode and XFmode */
682 2, /* cost of moving MMX register */
683 {2, 2}, /* cost of loading MMX registers
684 in SImode and DImode */
685 {2, 2}, /* cost of storing MMX registers
686 in SImode and DImode */
687 2, /* cost of moving SSE register */
688 {2, 2, 8}, /* cost of loading SSE registers
689 in SImode, DImode and TImode */
690 {2, 2, 8}, /* cost of storing SSE registers
691 in SImode, DImode and TImode */
692 6, /* MMX or SSE register to integer */
693 32, /* size of l1 cache. */
694 32, /* size of l2 cache. Some models
695 have integrated l2 cache, but
696 optimizing for k6 is not important
697 enough to worry about that. */
698 32, /* size of prefetch block */
699 1, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (2), /* cost of FABS instruction. */
705 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
707 k6_memcpy,
708 k6_memset,
709 1, /* scalar_stmt_cost. */
710 1, /* scalar load_cost. */
711 1, /* scalar_store_cost. */
712 1, /* vec_stmt_cost. */
713 1, /* vec_to_scalar_cost. */
714 1, /* scalar_to_vec_cost. */
715 1, /* vec_align_load_cost. */
716 2, /* vec_unalign_load_cost. */
717 1, /* vec_store_cost. */
718 3, /* cond_taken_branch_cost. */
719 1, /* cond_not_taken_branch_cost. */
720 };
722 /* For some reason, Athlon deals better with REP prefix (relative to loops)
723 compared to K8. Alignment becomes important after 8 bytes for memcpy and
724 128 bytes for memset. */
725 static stringop_algs athlon_memcpy[2] = {
726 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
727 DUMMY_STRINGOP_ALGS};
728 static stringop_algs athlon_memset[2] = {
729 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
730 DUMMY_STRINGOP_ALGS};
731 static const
732 struct processor_costs athlon_cost = {
733 COSTS_N_INSNS (1), /* cost of an add instruction */
734 COSTS_N_INSNS (2), /* cost of a lea instruction */
735 COSTS_N_INSNS (1), /* variable shift costs */
736 COSTS_N_INSNS (1), /* constant shift costs */
737 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
738 COSTS_N_INSNS (5), /* HI */
739 COSTS_N_INSNS (5), /* SI */
740 COSTS_N_INSNS (5), /* DI */
741 COSTS_N_INSNS (5)}, /* other */
742 0, /* cost of multiply per each bit set */
743 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
744 COSTS_N_INSNS (26), /* HI */
745 COSTS_N_INSNS (42), /* SI */
746 COSTS_N_INSNS (74), /* DI */
747 COSTS_N_INSNS (74)}, /* other */
748 COSTS_N_INSNS (1), /* cost of movsx */
749 COSTS_N_INSNS (1), /* cost of movzx */
750 8, /* "large" insn */
751 9, /* MOVE_RATIO */
752 4, /* cost for loading QImode using movzbl */
753 {3, 4, 3}, /* cost of loading integer registers
754 in QImode, HImode and SImode.
755 Relative to reg-reg move (2). */
756 {3, 4, 3}, /* cost of storing integer registers */
757 4, /* cost of reg,reg fld/fst */
758 {4, 4, 12}, /* cost of loading fp registers
759 in SFmode, DFmode and XFmode */
760 {6, 6, 8}, /* cost of storing fp registers
761 in SFmode, DFmode and XFmode */
762 2, /* cost of moving MMX register */
763 {4, 4}, /* cost of loading MMX registers
764 in SImode and DImode */
765 {4, 4}, /* cost of storing MMX registers
766 in SImode and DImode */
767 2, /* cost of moving SSE register */
768 {4, 4, 6}, /* cost of loading SSE registers
769 in SImode, DImode and TImode */
770 {4, 4, 5}, /* cost of storing SSE registers
771 in SImode, DImode and TImode */
772 5, /* MMX or SSE register to integer */
773 64, /* size of l1 cache. */
774 256, /* size of l2 cache. */
775 64, /* size of prefetch block */
776 6, /* number of parallel prefetches */
777 5, /* Branch cost */
778 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
779 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
780 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
781 COSTS_N_INSNS (2), /* cost of FABS instruction. */
782 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
783 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
784 athlon_memcpy,
785 athlon_memset,
786 1, /* scalar_stmt_cost. */
787 1, /* scalar load_cost. */
788 1, /* scalar_store_cost. */
789 1, /* vec_stmt_cost. */
790 1, /* vec_to_scalar_cost. */
791 1, /* scalar_to_vec_cost. */
792 1, /* vec_align_load_cost. */
793 2, /* vec_unalign_load_cost. */
794 1, /* vec_store_cost. */
795 3, /* cond_taken_branch_cost. */
796 1, /* cond_not_taken_branch_cost. */
797 };
799 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
800 small blocks it is better to use a loop. For large blocks, a libcall can
801 do nontemporal accesses and beat inline code considerably. */
802 static stringop_algs k8_memcpy[2] = {
803 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
804 {-1, rep_prefix_4_byte, false}}},
805 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
806 {-1, libcall, false}}}};
807 static stringop_algs k8_memset[2] = {
808 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
809 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
810 {libcall, {{48, unrolled_loop, false},
811 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
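/* Illustrative note: unlike the 32-bit-only tables above, k8_memcpy and
   k8_memset populate both slots.  For 64-bit K8, for instance, copies of up
   to 16 bytes use an inline loop, up to 8192 bytes "rep movsq"
   (rep_prefix_8_byte), and larger or compile-time-unknown sizes go through a
   libcall, matching the comment above about libcalls doing nontemporal
   accesses for large blocks.  */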
812 static const
813 struct processor_costs k8_cost = {
814 COSTS_N_INSNS (1), /* cost of an add instruction */
815 COSTS_N_INSNS (2), /* cost of a lea instruction */
816 COSTS_N_INSNS (1), /* variable shift costs */
817 COSTS_N_INSNS (1), /* constant shift costs */
818 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
819 COSTS_N_INSNS (4), /* HI */
820 COSTS_N_INSNS (3), /* SI */
821 COSTS_N_INSNS (4), /* DI */
822 COSTS_N_INSNS (5)}, /* other */
823 0, /* cost of multiply per each bit set */
824 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
825 COSTS_N_INSNS (26), /* HI */
826 COSTS_N_INSNS (42), /* SI */
827 COSTS_N_INSNS (74), /* DI */
828 COSTS_N_INSNS (74)}, /* other */
829 COSTS_N_INSNS (1), /* cost of movsx */
830 COSTS_N_INSNS (1), /* cost of movzx */
831 8, /* "large" insn */
832 9, /* MOVE_RATIO */
833 4, /* cost for loading QImode using movzbl */
834 {3, 4, 3}, /* cost of loading integer registers
835 in QImode, HImode and SImode.
836 Relative to reg-reg move (2). */
837 {3, 4, 3}, /* cost of storing integer registers */
838 4, /* cost of reg,reg fld/fst */
839 {4, 4, 12}, /* cost of loading fp registers
840 in SFmode, DFmode and XFmode */
841 {6, 6, 8}, /* cost of storing fp registers
842 in SFmode, DFmode and XFmode */
843 2, /* cost of moving MMX register */
844 {3, 3}, /* cost of loading MMX registers
845 in SImode and DImode */
846 {4, 4}, /* cost of storing MMX registers
847 in SImode and DImode */
848 2, /* cost of moving SSE register */
849 {4, 3, 6}, /* cost of loading SSE registers
850 in SImode, DImode and TImode */
851 {4, 4, 5}, /* cost of storing SSE registers
852 in SImode, DImode and TImode */
853 5, /* MMX or SSE register to integer */
854 64, /* size of l1 cache. */
855 512, /* size of l2 cache. */
856 64, /* size of prefetch block */
857 /* New AMD processors never drop prefetches; if they cannot be performed
858 immediately, they are queued. We set the number of simultaneous prefetches
859 to a large constant to reflect this (it probably is not a good idea not
860 to limit the number of prefetches at all, as their execution also takes some
861 time). */
862 100, /* number of parallel prefetches */
863 3, /* Branch cost */
864 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
865 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
866 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
867 COSTS_N_INSNS (2), /* cost of FABS instruction. */
868 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
869 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
871 k8_memcpy,
872 k8_memset,
873 4, /* scalar_stmt_cost. */
874 2, /* scalar load_cost. */
875 2, /* scalar_store_cost. */
876 5, /* vec_stmt_cost. */
877 0, /* vec_to_scalar_cost. */
878 2, /* scalar_to_vec_cost. */
879 2, /* vec_align_load_cost. */
880 3, /* vec_unalign_load_cost. */
881 3, /* vec_store_cost. */
882 3, /* cond_taken_branch_cost. */
883 2, /* cond_not_taken_branch_cost. */
884 };
886 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
887 very small blocks it is better to use a loop. For large blocks, a libcall can
888 do nontemporal accesses and beat inline code considerably. */
889 static stringop_algs amdfam10_memcpy[2] = {
890 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
891 {-1, rep_prefix_4_byte, false}}},
892 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
893 {-1, libcall, false}}}};
894 static stringop_algs amdfam10_memset[2] = {
895 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
896 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
897 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
898 {-1, libcall, false}}}};
899 struct processor_costs amdfam10_cost = {
900 COSTS_N_INSNS (1), /* cost of an add instruction */
901 COSTS_N_INSNS (2), /* cost of a lea instruction */
902 COSTS_N_INSNS (1), /* variable shift costs */
903 COSTS_N_INSNS (1), /* constant shift costs */
904 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
905 COSTS_N_INSNS (4), /* HI */
906 COSTS_N_INSNS (3), /* SI */
907 COSTS_N_INSNS (4), /* DI */
908 COSTS_N_INSNS (5)}, /* other */
909 0, /* cost of multiply per each bit set */
910 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
911 COSTS_N_INSNS (35), /* HI */
912 COSTS_N_INSNS (51), /* SI */
913 COSTS_N_INSNS (83), /* DI */
914 COSTS_N_INSNS (83)}, /* other */
915 COSTS_N_INSNS (1), /* cost of movsx */
916 COSTS_N_INSNS (1), /* cost of movzx */
917 8, /* "large" insn */
918 9, /* MOVE_RATIO */
919 4, /* cost for loading QImode using movzbl */
920 {3, 4, 3}, /* cost of loading integer registers
921 in QImode, HImode and SImode.
922 Relative to reg-reg move (2). */
923 {3, 4, 3}, /* cost of storing integer registers */
924 4, /* cost of reg,reg fld/fst */
925 {4, 4, 12}, /* cost of loading fp registers
926 in SFmode, DFmode and XFmode */
927 {6, 6, 8}, /* cost of storing fp registers
928 in SFmode, DFmode and XFmode */
929 2, /* cost of moving MMX register */
930 {3, 3}, /* cost of loading MMX registers
931 in SImode and DImode */
932 {4, 4}, /* cost of storing MMX registers
933 in SImode and DImode */
934 2, /* cost of moving SSE register */
935 {4, 4, 3}, /* cost of loading SSE registers
936 in SImode, DImode and TImode */
937 {4, 4, 5}, /* cost of storing SSE registers
938 in SImode, DImode and TImode */
939 3, /* MMX or SSE register to integer */
940 /* On K8:
941 MOVD reg64, xmmreg Double FSTORE 4
942 MOVD reg32, xmmreg Double FSTORE 4
943 On AMDFAM10:
944 MOVD reg64, xmmreg Double FADD 3
945 1/1 1/1
946 MOVD reg32, xmmreg Double FADD 3
947 1/1 1/1 */
948 64, /* size of l1 cache. */
949 512, /* size of l2 cache. */
950 64, /* size of prefetch block */
951 /* New AMD processors never drop prefetches; if they cannot be performed
952 immediately, they are queued. We set the number of simultaneous prefetches
953 to a large constant to reflect this (it probably is not a good idea not
954 to limit the number of prefetches at all, as their execution also takes some
955 time). */
956 100, /* number of parallel prefetches */
957 2, /* Branch cost */
958 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
959 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
960 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
961 COSTS_N_INSNS (2), /* cost of FABS instruction. */
962 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
963 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
965 amdfam10_memcpy,
966 amdfam10_memset,
967 4, /* scalar_stmt_cost. */
968 2, /* scalar load_cost. */
969 2, /* scalar_store_cost. */
970 6, /* vec_stmt_cost. */
971 0, /* vec_to_scalar_cost. */
972 2, /* scalar_to_vec_cost. */
973 2, /* vec_align_load_cost. */
974 2, /* vec_unalign_load_cost. */
975 2, /* vec_store_cost. */
976 2, /* cond_taken_branch_cost. */
977 1, /* cond_not_taken_branch_cost. */
978 };
980 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
981 very small blocks it is better to use a loop. For large blocks, a libcall
982 can do nontemporal accesses and beat inline code considerably. */
983 static stringop_algs bdver1_memcpy[2] = {
984 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
985 {-1, rep_prefix_4_byte, false}}},
986 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
987 {-1, libcall, false}}}};
988 static stringop_algs bdver1_memset[2] = {
989 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
990 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
991 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
992 {-1, libcall, false}}}};
994 const struct processor_costs bdver1_cost = {
995 COSTS_N_INSNS (1), /* cost of an add instruction */
996 COSTS_N_INSNS (1), /* cost of a lea instruction */
997 COSTS_N_INSNS (1), /* variable shift costs */
998 COSTS_N_INSNS (1), /* constant shift costs */
999 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1000 COSTS_N_INSNS (4), /* HI */
1001 COSTS_N_INSNS (4), /* SI */
1002 COSTS_N_INSNS (6), /* DI */
1003 COSTS_N_INSNS (6)}, /* other */
1004 0, /* cost of multiply per each bit set */
1005 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1006 COSTS_N_INSNS (35), /* HI */
1007 COSTS_N_INSNS (51), /* SI */
1008 COSTS_N_INSNS (83), /* DI */
1009 COSTS_N_INSNS (83)}, /* other */
1010 COSTS_N_INSNS (1), /* cost of movsx */
1011 COSTS_N_INSNS (1), /* cost of movzx */
1012 8, /* "large" insn */
1013 9, /* MOVE_RATIO */
1014 4, /* cost for loading QImode using movzbl */
1015 {5, 5, 4}, /* cost of loading integer registers
1016 in QImode, HImode and SImode.
1017 Relative to reg-reg move (2). */
1018 {4, 4, 4}, /* cost of storing integer registers */
1019 2, /* cost of reg,reg fld/fst */
1020 {5, 5, 12}, /* cost of loading fp registers
1021 in SFmode, DFmode and XFmode */
1022 {4, 4, 8}, /* cost of storing fp registers
1023 in SFmode, DFmode and XFmode */
1024 2, /* cost of moving MMX register */
1025 {4, 4}, /* cost of loading MMX registers
1026 in SImode and DImode */
1027 {4, 4}, /* cost of storing MMX registers
1028 in SImode and DImode */
1029 2, /* cost of moving SSE register */
1030 {4, 4, 4}, /* cost of loading SSE registers
1031 in SImode, DImode and TImode */
1032 {4, 4, 4}, /* cost of storing SSE registers
1033 in SImode, DImode and TImode */
1034 2, /* MMX or SSE register to integer */
1035 /* On K8:
1036 MOVD reg64, xmmreg Double FSTORE 4
1037 MOVD reg32, xmmreg Double FSTORE 4
1038 On AMDFAM10:
1039 MOVD reg64, xmmreg Double FADD 3
1040 1/1 1/1
1041 MOVD reg32, xmmreg Double FADD 3
1042 1/1 1/1 */
1043 16, /* size of l1 cache. */
1044 2048, /* size of l2 cache. */
1045 64, /* size of prefetch block */
1046 /* New AMD processors never drop prefetches; if they cannot be performed
1047 immediately, they are queued. We set the number of simultaneous prefetches
1048 to a large constant to reflect this (it probably is not a good idea not
1049 to limit the number of prefetches at all, as their execution also takes some
1050 time). */
1051 100, /* number of parallel prefetches */
1052 2, /* Branch cost */
1053 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1054 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1055 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1056 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1057 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1058 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1060 bdver1_memcpy,
1061 bdver1_memset,
1062 6, /* scalar_stmt_cost. */
1063 4, /* scalar load_cost. */
1064 4, /* scalar_store_cost. */
1065 6, /* vec_stmt_cost. */
1066 0, /* vec_to_scalar_cost. */
1067 2, /* scalar_to_vec_cost. */
1068 4, /* vec_align_load_cost. */
1069 4, /* vec_unalign_load_cost. */
1070 4, /* vec_store_cost. */
1071 4, /* cond_taken_branch_cost. */
1072 2, /* cond_not_taken_branch_cost. */
1073 };
1075 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1076 very small blocks it is better to use a loop. For large blocks, a libcall
1077 can do nontemporal accesses and beat inline code considerably. */
1079 static stringop_algs bdver2_memcpy[2] = {
1080 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1081 {-1, rep_prefix_4_byte, false}}},
1082 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1083 {-1, libcall, false}}}};
1084 static stringop_algs bdver2_memset[2] = {
1085 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1086 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1087 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1088 {-1, libcall, false}}}};
1090 const struct processor_costs bdver2_cost = {
1091 COSTS_N_INSNS (1), /* cost of an add instruction */
1092 COSTS_N_INSNS (1), /* cost of a lea instruction */
1093 COSTS_N_INSNS (1), /* variable shift costs */
1094 COSTS_N_INSNS (1), /* constant shift costs */
1095 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1096 COSTS_N_INSNS (4), /* HI */
1097 COSTS_N_INSNS (4), /* SI */
1098 COSTS_N_INSNS (6), /* DI */
1099 COSTS_N_INSNS (6)}, /* other */
1100 0, /* cost of multiply per each bit set */
1101 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1102 COSTS_N_INSNS (35), /* HI */
1103 COSTS_N_INSNS (51), /* SI */
1104 COSTS_N_INSNS (83), /* DI */
1105 COSTS_N_INSNS (83)}, /* other */
1106 COSTS_N_INSNS (1), /* cost of movsx */
1107 COSTS_N_INSNS (1), /* cost of movzx */
1108 8, /* "large" insn */
1109 9, /* MOVE_RATIO */
1110 4, /* cost for loading QImode using movzbl */
1111 {5, 5, 4}, /* cost of loading integer registers
1112 in QImode, HImode and SImode.
1113 Relative to reg-reg move (2). */
1114 {4, 4, 4}, /* cost of storing integer registers */
1115 2, /* cost of reg,reg fld/fst */
1116 {5, 5, 12}, /* cost of loading fp registers
1117 in SFmode, DFmode and XFmode */
1118 {4, 4, 8}, /* cost of storing fp registers
1119 in SFmode, DFmode and XFmode */
1120 2, /* cost of moving MMX register */
1121 {4, 4}, /* cost of loading MMX registers
1122 in SImode and DImode */
1123 {4, 4}, /* cost of storing MMX registers
1124 in SImode and DImode */
1125 2, /* cost of moving SSE register */
1126 {4, 4, 4}, /* cost of loading SSE registers
1127 in SImode, DImode and TImode */
1128 {4, 4, 4}, /* cost of storing SSE registers
1129 in SImode, DImode and TImode */
1130 2, /* MMX or SSE register to integer */
1131 /* On K8:
1132 MOVD reg64, xmmreg Double FSTORE 4
1133 MOVD reg32, xmmreg Double FSTORE 4
1134 On AMDFAM10:
1135 MOVD reg64, xmmreg Double FADD 3
1136 1/1 1/1
1137 MOVD reg32, xmmreg Double FADD 3
1138 1/1 1/1 */
1139 16, /* size of l1 cache. */
1140 2048, /* size of l2 cache. */
1141 64, /* size of prefetch block */
1142 /* New AMD processors never drop prefetches; if they cannot be performed
1143 immediately, they are queued. We set the number of simultaneous prefetches
1144 to a large constant to reflect this (it probably is not a good idea not
1145 to limit the number of prefetches at all, as their execution also takes some
1146 time). */
1147 100, /* number of parallel prefetches */
1148 2, /* Branch cost */
1149 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1150 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1151 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1152 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1153 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1154 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1156 bdver2_memcpy,
1157 bdver2_memset,
1158 6, /* scalar_stmt_cost. */
1159 4, /* scalar load_cost. */
1160 4, /* scalar_store_cost. */
1161 6, /* vec_stmt_cost. */
1162 0, /* vec_to_scalar_cost. */
1163 2, /* scalar_to_vec_cost. */
1164 4, /* vec_align_load_cost. */
1165 4, /* vec_unalign_load_cost. */
1166 4, /* vec_store_cost. */
1167 4, /* cond_taken_branch_cost. */
1168 2, /* cond_not_taken_branch_cost. */
1169 };
1172 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1173 very small blocks it is better to use a loop. For large blocks, a libcall
1174 can do nontemporal accesses and beat inline code considerably. */
1175 static stringop_algs bdver3_memcpy[2] = {
1176 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1177 {-1, rep_prefix_4_byte, false}}},
1178 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1179 {-1, libcall, false}}}};
1180 static stringop_algs bdver3_memset[2] = {
1181 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1182 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1183 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1184 {-1, libcall, false}}}};
1185 struct processor_costs bdver3_cost = {
1186 COSTS_N_INSNS (1), /* cost of an add instruction */
1187 COSTS_N_INSNS (1), /* cost of a lea instruction */
1188 COSTS_N_INSNS (1), /* variable shift costs */
1189 COSTS_N_INSNS (1), /* constant shift costs */
1190 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1191 COSTS_N_INSNS (4), /* HI */
1192 COSTS_N_INSNS (4), /* SI */
1193 COSTS_N_INSNS (6), /* DI */
1194 COSTS_N_INSNS (6)}, /* other */
1195 0, /* cost of multiply per each bit set */
1196 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1197 COSTS_N_INSNS (35), /* HI */
1198 COSTS_N_INSNS (51), /* SI */
1199 COSTS_N_INSNS (83), /* DI */
1200 COSTS_N_INSNS (83)}, /* other */
1201 COSTS_N_INSNS (1), /* cost of movsx */
1202 COSTS_N_INSNS (1), /* cost of movzx */
1203 8, /* "large" insn */
1204 9, /* MOVE_RATIO */
1205 4, /* cost for loading QImode using movzbl */
1206 {5, 5, 4}, /* cost of loading integer registers
1207 in QImode, HImode and SImode.
1208 Relative to reg-reg move (2). */
1209 {4, 4, 4}, /* cost of storing integer registers */
1210 2, /* cost of reg,reg fld/fst */
1211 {5, 5, 12}, /* cost of loading fp registers
1212 in SFmode, DFmode and XFmode */
1213 {4, 4, 8}, /* cost of storing fp registers
1214 in SFmode, DFmode and XFmode */
1215 2, /* cost of moving MMX register */
1216 {4, 4}, /* cost of loading MMX registers
1217 in SImode and DImode */
1218 {4, 4}, /* cost of storing MMX registers
1219 in SImode and DImode */
1220 2, /* cost of moving SSE register */
1221 {4, 4, 4}, /* cost of loading SSE registers
1222 in SImode, DImode and TImode */
1223 {4, 4, 4}, /* cost of storing SSE registers
1224 in SImode, DImode and TImode */
1225 2, /* MMX or SSE register to integer */
1226 16, /* size of l1 cache. */
1227 2048, /* size of l2 cache. */
1228 64, /* size of prefetch block */
1229 /* New AMD processors never drop prefetches; if they cannot be performed
1230 immediately, they are queued. We set the number of simultaneous prefetches
1231 to a large constant to reflect this (it probably is not a good idea not
1232 to limit the number of prefetches at all, as their execution also takes some
1233 time). */
1234 100, /* number of parallel prefetches */
1235 2, /* Branch cost */
1236 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1237 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1238 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1239 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1240 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1241 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1243 bdver3_memcpy,
1244 bdver3_memset,
1245 6, /* scalar_stmt_cost. */
1246 4, /* scalar load_cost. */
1247 4, /* scalar_store_cost. */
1248 6, /* vec_stmt_cost. */
1249 0, /* vec_to_scalar_cost. */
1250 2, /* scalar_to_vec_cost. */
1251 4, /* vec_align_load_cost. */
1252 4, /* vec_unalign_load_cost. */
1253 4, /* vec_store_cost. */
1254 4, /* cond_taken_branch_cost. */
1255 2, /* cond_not_taken_branch_cost. */
1256 };
1258 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1259 very small blocks it is better to use a loop. For large blocks, a libcall
1260 can do nontemporal accesses and beat inline code considerably. */
1261 static stringop_algs bdver4_memcpy[2] = {
1262 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1263 {-1, rep_prefix_4_byte, false}}},
1264 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1265 {-1, libcall, false}}}};
1266 static stringop_algs bdver4_memset[2] = {
1267 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1268 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1269 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1270 {-1, libcall, false}}}};
1271 struct processor_costs bdver4_cost = {
1272 COSTS_N_INSNS (1), /* cost of an add instruction */
1273 COSTS_N_INSNS (1), /* cost of a lea instruction */
1274 COSTS_N_INSNS (1), /* variable shift costs */
1275 COSTS_N_INSNS (1), /* constant shift costs */
1276 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1277 COSTS_N_INSNS (4), /* HI */
1278 COSTS_N_INSNS (4), /* SI */
1279 COSTS_N_INSNS (6), /* DI */
1280 COSTS_N_INSNS (6)}, /* other */
1281 0, /* cost of multiply per each bit set */
1282 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1283 COSTS_N_INSNS (35), /* HI */
1284 COSTS_N_INSNS (51), /* SI */
1285 COSTS_N_INSNS (83), /* DI */
1286 COSTS_N_INSNS (83)}, /* other */
1287 COSTS_N_INSNS (1), /* cost of movsx */
1288 COSTS_N_INSNS (1), /* cost of movzx */
1289 8, /* "large" insn */
1290 9, /* MOVE_RATIO */
1291 4, /* cost for loading QImode using movzbl */
1292 {5, 5, 4}, /* cost of loading integer registers
1293 in QImode, HImode and SImode.
1294 Relative to reg-reg move (2). */
1295 {4, 4, 4}, /* cost of storing integer registers */
1296 2, /* cost of reg,reg fld/fst */
1297 {5, 5, 12}, /* cost of loading fp registers
1298 in SFmode, DFmode and XFmode */
1299 {4, 4, 8}, /* cost of storing fp registers
1300 in SFmode, DFmode and XFmode */
1301 2, /* cost of moving MMX register */
1302 {4, 4}, /* cost of loading MMX registers
1303 in SImode and DImode */
1304 {4, 4}, /* cost of storing MMX registers
1305 in SImode and DImode */
1306 2, /* cost of moving SSE register */
1307 {4, 4, 4}, /* cost of loading SSE registers
1308 in SImode, DImode and TImode */
1309 {4, 4, 4}, /* cost of storing SSE registers
1310 in SImode, DImode and TImode */
1311 2, /* MMX or SSE register to integer */
1312 16, /* size of l1 cache. */
1313 2048, /* size of l2 cache. */
1314 64, /* size of prefetch block */
1315 /* New AMD processors never drop prefetches; if they cannot be performed
1316 immediately, they are queued. We set the number of simultaneous prefetches
1317 to a large constant to reflect this (it probably is not a good idea not
1318 to limit the number of prefetches at all, as their execution also takes some
1319 time). */
1320 100, /* number of parallel prefetches */
1321 2, /* Branch cost */
1322 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1323 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1324 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1325 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1326 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1327 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1329 bdver4_memcpy,
1330 bdver4_memset,
1331 6, /* scalar_stmt_cost. */
1332 4, /* scalar load_cost. */
1333 4, /* scalar_store_cost. */
1334 6, /* vec_stmt_cost. */
1335 0, /* vec_to_scalar_cost. */
1336 2, /* scalar_to_vec_cost. */
1337 4, /* vec_align_load_cost. */
1338 4, /* vec_unalign_load_cost. */
1339 4, /* vec_store_cost. */
1340 4, /* cond_taken_branch_cost. */
1341 2, /* cond_not_taken_branch_cost. */
1342 };
1345 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1346 very small blocks it is better to use a loop. For large blocks, a libcall
1347 can do nontemporal accesses and beat inline code considerably. */
1348 static stringop_algs znver1_memcpy[2] = {
1349 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1350 {-1, rep_prefix_4_byte, false}}},
1351 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1352 {-1, libcall, false}}}};
1353 static stringop_algs znver1_memset[2] = {
1354 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1355 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1356 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1357 {-1, libcall, false}}}};
1358 struct processor_costs znver1_cost = {
1359 COSTS_N_INSNS (1), /* cost of an add instruction. */
1360 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1361 COSTS_N_INSNS (1), /* variable shift costs. */
1362 COSTS_N_INSNS (1), /* constant shift costs. */
1363 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1364 COSTS_N_INSNS (3), /* HI. */
1365 COSTS_N_INSNS (3), /* SI. */
1366 COSTS_N_INSNS (4), /* DI. */
1367 COSTS_N_INSNS (4)}, /* other. */
1368 0, /* cost of multiply per each bit
1369 set. */
1370 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1371 COSTS_N_INSNS (35), /* HI. */
1372 COSTS_N_INSNS (51), /* SI. */
1373 COSTS_N_INSNS (83), /* DI. */
1374 COSTS_N_INSNS (83)}, /* other. */
1375 COSTS_N_INSNS (1), /* cost of movsx. */
1376 COSTS_N_INSNS (1), /* cost of movzx. */
1377 8, /* "large" insn. */
1378 9, /* MOVE_RATIO. */
1379 4, /* cost for loading QImode using
1380 movzbl. */
1381 {5, 5, 4}, /* cost of loading integer registers
1382 in QImode, HImode and SImode.
1383 Relative to reg-reg move (2). */
1384 {4, 4, 4}, /* cost of storing integer
1385 registers. */
1386 2, /* cost of reg,reg fld/fst. */
1387 {5, 5, 12}, /* cost of loading fp registers
1388 in SFmode, DFmode and XFmode. */
1389 {4, 4, 8}, /* cost of storing fp registers
1390 in SFmode, DFmode and XFmode. */
1391 2, /* cost of moving MMX register. */
1392 {4, 4}, /* cost of loading MMX registers
1393 in SImode and DImode. */
1394 {4, 4}, /* cost of storing MMX registers
1395 in SImode and DImode. */
1396 2, /* cost of moving SSE register. */
1397 {4, 4, 4}, /* cost of loading SSE registers
1398 in SImode, DImode and TImode. */
1399 {4, 4, 4}, /* cost of storing SSE registers
1400 in SImode, DImode and TImode. */
1401 2, /* MMX or SSE register to integer. */
1402 32, /* size of l1 cache. */
1403 512, /* size of l2 cache. */
1404 64, /* size of prefetch block. */
1405 /* New AMD processors never drop prefetches; if they cannot be performed
1406 immediately, they are queued. We set the number of simultaneous prefetches
1407 to a large constant to reflect this (it probably is not a good idea not
1408 to limit the number of prefetches at all, as their execution also takes some
1409 time). */
1410 100, /* number of parallel prefetches. */
1411 2, /* Branch cost. */
1412 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1413 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1414 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1415 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1416 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1417 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1419 znver1_memcpy,
1420 znver1_memset,
1421 6, /* scalar_stmt_cost. */
1422 4, /* scalar load_cost. */
1423 4, /* scalar_store_cost. */
1424 6, /* vec_stmt_cost. */
1425 0, /* vec_to_scalar_cost. */
1426 2, /* scalar_to_vec_cost. */
1427 4, /* vec_align_load_cost. */
1428 4, /* vec_unalign_load_cost. */
1429 4, /* vec_store_cost. */
1430 4, /* cond_taken_branch_cost. */
1431 2, /* cond_not_taken_branch_cost. */
1434 /* BTVER1 has an optimized REP instruction for medium sized blocks, but for
1435 very small blocks it is better to use a loop. For large blocks, libcall can
1436 do nontemporal accesses and beat inline considerably. */
1437 static stringop_algs btver1_memcpy[2] = {
1438 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1439 {-1, rep_prefix_4_byte, false}}},
1440 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1441 {-1, libcall, false}}}};
1442 static stringop_algs btver1_memset[2] = {
1443 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1444 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1445 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1446 {-1, libcall, false}}}};
1447 const struct processor_costs btver1_cost = {
1448 COSTS_N_INSNS (1), /* cost of an add instruction */
1449 COSTS_N_INSNS (2), /* cost of a lea instruction */
1450 COSTS_N_INSNS (1), /* variable shift costs */
1451 COSTS_N_INSNS (1), /* constant shift costs */
1452 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1453 COSTS_N_INSNS (4), /* HI */
1454 COSTS_N_INSNS (3), /* SI */
1455 COSTS_N_INSNS (4), /* DI */
1456 COSTS_N_INSNS (5)}, /* other */
1457 0, /* cost of multiply per each bit set */
1458 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1459 COSTS_N_INSNS (35), /* HI */
1460 COSTS_N_INSNS (51), /* SI */
1461 COSTS_N_INSNS (83), /* DI */
1462 COSTS_N_INSNS (83)}, /* other */
1463 COSTS_N_INSNS (1), /* cost of movsx */
1464 COSTS_N_INSNS (1), /* cost of movzx */
1465 8, /* "large" insn */
1466 9, /* MOVE_RATIO */
1467 4, /* cost for loading QImode using movzbl */
1468 {3, 4, 3}, /* cost of loading integer registers
1469 in QImode, HImode and SImode.
1470 Relative to reg-reg move (2). */
1471 {3, 4, 3}, /* cost of storing integer registers */
1472 4, /* cost of reg,reg fld/fst */
1473 {4, 4, 12}, /* cost of loading fp registers
1474 in SFmode, DFmode and XFmode */
1475 {6, 6, 8}, /* cost of storing fp registers
1476 in SFmode, DFmode and XFmode */
1477 2, /* cost of moving MMX register */
1478 {3, 3}, /* cost of loading MMX registers
1479 in SImode and DImode */
1480 {4, 4}, /* cost of storing MMX registers
1481 in SImode and DImode */
1482 2, /* cost of moving SSE register */
1483 {4, 4, 3}, /* cost of loading SSE registers
1484 in SImode, DImode and TImode */
1485 {4, 4, 5}, /* cost of storing SSE registers
1486 in SImode, DImode and TImode */
1487 3, /* MMX or SSE register to integer */
1488 /* On K8:
1489 MOVD reg64, xmmreg Double FSTORE 4
1490 MOVD reg32, xmmreg Double FSTORE 4
1491 On AMDFAM10:
1492 MOVD reg64, xmmreg Double FADD 3
1493 1/1 1/1
1494 MOVD reg32, xmmreg Double FADD 3
1495 1/1 1/1 */
1496 32, /* size of l1 cache. */
1497 512, /* size of l2 cache. */
1498 64, /* size of prefetch block */
1499 100, /* number of parallel prefetches */
1500 2, /* Branch cost */
1501 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1502 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1503 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1504 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1505 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1506 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1508 btver1_memcpy,
1509 btver1_memset,
1510 4, /* scalar_stmt_cost. */
1511 2, /* scalar load_cost. */
1512 2, /* scalar_store_cost. */
1513 6, /* vec_stmt_cost. */
1514 0, /* vec_to_scalar_cost. */
1515 2, /* scalar_to_vec_cost. */
1516 2, /* vec_align_load_cost. */
1517 2, /* vec_unalign_load_cost. */
1518 2, /* vec_store_cost. */
1519 2, /* cond_taken_branch_cost. */
1520 1, /* cond_not_taken_branch_cost. */
1523 static stringop_algs btver2_memcpy[2] = {
1524 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1525 {-1, rep_prefix_4_byte, false}}},
1526 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1527 {-1, libcall, false}}}};
1528 static stringop_algs btver2_memset[2] = {
1529 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1530 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1531 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1532 {-1, libcall, false}}}};
1533 const struct processor_costs btver2_cost = {
1534 COSTS_N_INSNS (1), /* cost of an add instruction */
1535 COSTS_N_INSNS (2), /* cost of a lea instruction */
1536 COSTS_N_INSNS (1), /* variable shift costs */
1537 COSTS_N_INSNS (1), /* constant shift costs */
1538 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1539 COSTS_N_INSNS (4), /* HI */
1540 COSTS_N_INSNS (3), /* SI */
1541 COSTS_N_INSNS (4), /* DI */
1542 COSTS_N_INSNS (5)}, /* other */
1543 0, /* cost of multiply per each bit set */
1544 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1545 COSTS_N_INSNS (35), /* HI */
1546 COSTS_N_INSNS (51), /* SI */
1547 COSTS_N_INSNS (83), /* DI */
1548 COSTS_N_INSNS (83)}, /* other */
1549 COSTS_N_INSNS (1), /* cost of movsx */
1550 COSTS_N_INSNS (1), /* cost of movzx */
1551 8, /* "large" insn */
1552 9, /* MOVE_RATIO */
1553 4, /* cost for loading QImode using movzbl */
1554 {3, 4, 3}, /* cost of loading integer registers
1555 in QImode, HImode and SImode.
1556 Relative to reg-reg move (2). */
1557 {3, 4, 3}, /* cost of storing integer registers */
1558 4, /* cost of reg,reg fld/fst */
1559 {4, 4, 12}, /* cost of loading fp registers
1560 in SFmode, DFmode and XFmode */
1561 {6, 6, 8}, /* cost of storing fp registers
1562 in SFmode, DFmode and XFmode */
1563 2, /* cost of moving MMX register */
1564 {3, 3}, /* cost of loading MMX registers
1565 in SImode and DImode */
1566 {4, 4}, /* cost of storing MMX registers
1567 in SImode and DImode */
1568 2, /* cost of moving SSE register */
1569 {4, 4, 3}, /* cost of loading SSE registers
1570 in SImode, DImode and TImode */
1571 {4, 4, 5}, /* cost of storing SSE registers
1572 in SImode, DImode and TImode */
1573 3, /* MMX or SSE register to integer */
1574 /* On K8:
1575 MOVD reg64, xmmreg Double FSTORE 4
1576 MOVD reg32, xmmreg Double FSTORE 4
1577 On AMDFAM10:
1578 MOVD reg64, xmmreg Double FADD 3
1579 1/1 1/1
1580 MOVD reg32, xmmreg Double FADD 3
1581 1/1 1/1 */
1582 32, /* size of l1 cache. */
1583 2048, /* size of l2 cache. */
1584 64, /* size of prefetch block */
1585 100, /* number of parallel prefetches */
1586 2, /* Branch cost */
1587 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1588 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1589 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1590 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1591 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1592 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1593 btver2_memcpy,
1594 btver2_memset,
1595 4, /* scalar_stmt_cost. */
1596 2, /* scalar load_cost. */
1597 2, /* scalar_store_cost. */
1598 6, /* vec_stmt_cost. */
1599 0, /* vec_to_scalar_cost. */
1600 2, /* scalar_to_vec_cost. */
1601 2, /* vec_align_load_cost. */
1602 2, /* vec_unalign_load_cost. */
1603 2, /* vec_store_cost. */
1604 2, /* cond_taken_branch_cost. */
1605 1, /* cond_not_taken_branch_cost. */
1608 static stringop_algs pentium4_memcpy[2] = {
1609 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1610 DUMMY_STRINGOP_ALGS};
1611 static stringop_algs pentium4_memset[2] = {
1612 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1613 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1614 DUMMY_STRINGOP_ALGS};
1616 static const
1617 struct processor_costs pentium4_cost = {
1618 COSTS_N_INSNS (1), /* cost of an add instruction */
1619 COSTS_N_INSNS (3), /* cost of a lea instruction */
1620 COSTS_N_INSNS (4), /* variable shift costs */
1621 COSTS_N_INSNS (4), /* constant shift costs */
1622 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1623 COSTS_N_INSNS (15), /* HI */
1624 COSTS_N_INSNS (15), /* SI */
1625 COSTS_N_INSNS (15), /* DI */
1626 COSTS_N_INSNS (15)}, /* other */
1627 0, /* cost of multiply per each bit set */
1628 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1629 COSTS_N_INSNS (56), /* HI */
1630 COSTS_N_INSNS (56), /* SI */
1631 COSTS_N_INSNS (56), /* DI */
1632 COSTS_N_INSNS (56)}, /* other */
1633 COSTS_N_INSNS (1), /* cost of movsx */
1634 COSTS_N_INSNS (1), /* cost of movzx */
1635 16, /* "large" insn */
1636 6, /* MOVE_RATIO */
1637 2, /* cost for loading QImode using movzbl */
1638 {4, 5, 4}, /* cost of loading integer registers
1639 in QImode, HImode and SImode.
1640 Relative to reg-reg move (2). */
1641 {2, 3, 2}, /* cost of storing integer registers */
1642 2, /* cost of reg,reg fld/fst */
1643 {2, 2, 6}, /* cost of loading fp registers
1644 in SFmode, DFmode and XFmode */
1645 {4, 4, 6}, /* cost of storing fp registers
1646 in SFmode, DFmode and XFmode */
1647 2, /* cost of moving MMX register */
1648 {2, 2}, /* cost of loading MMX registers
1649 in SImode and DImode */
1650 {2, 2}, /* cost of storing MMX registers
1651 in SImode and DImode */
1652 12, /* cost of moving SSE register */
1653 {12, 12, 12}, /* cost of loading SSE registers
1654 in SImode, DImode and TImode */
1655 {2, 2, 8}, /* cost of storing SSE registers
1656 in SImode, DImode and TImode */
1657 10, /* MMX or SSE register to integer */
1658 8, /* size of l1 cache. */
1659 256, /* size of l2 cache. */
1660 64, /* size of prefetch block */
1661 6, /* number of parallel prefetches */
1662 2, /* Branch cost */
1663 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1664 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1665 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1666 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1667 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1668 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1669 pentium4_memcpy,
1670 pentium4_memset,
1671 1, /* scalar_stmt_cost. */
1672 1, /* scalar load_cost. */
1673 1, /* scalar_store_cost. */
1674 1, /* vec_stmt_cost. */
1675 1, /* vec_to_scalar_cost. */
1676 1, /* scalar_to_vec_cost. */
1677 1, /* vec_align_load_cost. */
1678 2, /* vec_unalign_load_cost. */
1679 1, /* vec_store_cost. */
1680 3, /* cond_taken_branch_cost. */
1681 1, /* cond_not_taken_branch_cost. */
1684 static stringop_algs nocona_memcpy[2] = {
1685 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1686 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1687 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1689 static stringop_algs nocona_memset[2] = {
1690 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1691 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1692 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1693 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1695 static const
1696 struct processor_costs nocona_cost = {
1697 COSTS_N_INSNS (1), /* cost of an add instruction */
1698 COSTS_N_INSNS (1), /* cost of a lea instruction */
1699 COSTS_N_INSNS (1), /* variable shift costs */
1700 COSTS_N_INSNS (1), /* constant shift costs */
1701 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1702 COSTS_N_INSNS (10), /* HI */
1703 COSTS_N_INSNS (10), /* SI */
1704 COSTS_N_INSNS (10), /* DI */
1705 COSTS_N_INSNS (10)}, /* other */
1706 0, /* cost of multiply per each bit set */
1707 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1708 COSTS_N_INSNS (66), /* HI */
1709 COSTS_N_INSNS (66), /* SI */
1710 COSTS_N_INSNS (66), /* DI */
1711 COSTS_N_INSNS (66)}, /* other */
1712 COSTS_N_INSNS (1), /* cost of movsx */
1713 COSTS_N_INSNS (1), /* cost of movzx */
1714 16, /* "large" insn */
1715 17, /* MOVE_RATIO */
1716 4, /* cost for loading QImode using movzbl */
1717 {4, 4, 4}, /* cost of loading integer registers
1718 in QImode, HImode and SImode.
1719 Relative to reg-reg move (2). */
1720 {4, 4, 4}, /* cost of storing integer registers */
1721 3, /* cost of reg,reg fld/fst */
1722 {12, 12, 12}, /* cost of loading fp registers
1723 in SFmode, DFmode and XFmode */
1724 {4, 4, 4}, /* cost of storing fp registers
1725 in SFmode, DFmode and XFmode */
1726 6, /* cost of moving MMX register */
1727 {12, 12}, /* cost of loading MMX registers
1728 in SImode and DImode */
1729 {12, 12}, /* cost of storing MMX registers
1730 in SImode and DImode */
1731 6, /* cost of moving SSE register */
1732 {12, 12, 12}, /* cost of loading SSE registers
1733 in SImode, DImode and TImode */
1734 {12, 12, 12}, /* cost of storing SSE registers
1735 in SImode, DImode and TImode */
1736 8, /* MMX or SSE register to integer */
1737 8, /* size of l1 cache. */
1738 1024, /* size of l2 cache. */
1739 64, /* size of prefetch block */
1740 8, /* number of parallel prefetches */
1741 1, /* Branch cost */
1742 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1743 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1744 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1745 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1746 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1747 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1748 nocona_memcpy,
1749 nocona_memset,
1750 1, /* scalar_stmt_cost. */
1751 1, /* scalar load_cost. */
1752 1, /* scalar_store_cost. */
1753 1, /* vec_stmt_cost. */
1754 1, /* vec_to_scalar_cost. */
1755 1, /* scalar_to_vec_cost. */
1756 1, /* vec_align_load_cost. */
1757 2, /* vec_unalign_load_cost. */
1758 1, /* vec_store_cost. */
1759 3, /* cond_taken_branch_cost. */
1760 1, /* cond_not_taken_branch_cost. */
1763 static stringop_algs atom_memcpy[2] = {
1764 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1765 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1766 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1767 static stringop_algs atom_memset[2] = {
1768 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1769 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1770 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1771 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1772 static const
1773 struct processor_costs atom_cost = {
1774 COSTS_N_INSNS (1), /* cost of an add instruction */
1775 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1776 COSTS_N_INSNS (1), /* variable shift costs */
1777 COSTS_N_INSNS (1), /* constant shift costs */
1778 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1779 COSTS_N_INSNS (4), /* HI */
1780 COSTS_N_INSNS (3), /* SI */
1781 COSTS_N_INSNS (4), /* DI */
1782 COSTS_N_INSNS (2)}, /* other */
1783 0, /* cost of multiply per each bit set */
1784 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1785 COSTS_N_INSNS (26), /* HI */
1786 COSTS_N_INSNS (42), /* SI */
1787 COSTS_N_INSNS (74), /* DI */
1788 COSTS_N_INSNS (74)}, /* other */
1789 COSTS_N_INSNS (1), /* cost of movsx */
1790 COSTS_N_INSNS (1), /* cost of movzx */
1791 8, /* "large" insn */
1792 17, /* MOVE_RATIO */
1793 4, /* cost for loading QImode using movzbl */
1794 {4, 4, 4}, /* cost of loading integer registers
1795 in QImode, HImode and SImode.
1796 Relative to reg-reg move (2). */
1797 {4, 4, 4}, /* cost of storing integer registers */
1798 4, /* cost of reg,reg fld/fst */
1799 {12, 12, 12}, /* cost of loading fp registers
1800 in SFmode, DFmode and XFmode */
1801 {6, 6, 8}, /* cost of storing fp registers
1802 in SFmode, DFmode and XFmode */
1803 2, /* cost of moving MMX register */
1804 {8, 8}, /* cost of loading MMX registers
1805 in SImode and DImode */
1806 {8, 8}, /* cost of storing MMX registers
1807 in SImode and DImode */
1808 2, /* cost of moving SSE register */
1809 {8, 8, 8}, /* cost of loading SSE registers
1810 in SImode, DImode and TImode */
1811 {8, 8, 8}, /* cost of storing SSE registers
1812 in SImode, DImode and TImode */
1813 5, /* MMX or SSE register to integer */
1814 32, /* size of l1 cache. */
1815 256, /* size of l2 cache. */
1816 64, /* size of prefetch block */
1817 6, /* number of parallel prefetches */
1818 3, /* Branch cost */
1819 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1820 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1821 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1822 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1823 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1824 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1825 atom_memcpy,
1826 atom_memset,
1827 1, /* scalar_stmt_cost. */
1828 1, /* scalar load_cost. */
1829 1, /* scalar_store_cost. */
1830 1, /* vec_stmt_cost. */
1831 1, /* vec_to_scalar_cost. */
1832 1, /* scalar_to_vec_cost. */
1833 1, /* vec_align_load_cost. */
1834 2, /* vec_unalign_load_cost. */
1835 1, /* vec_store_cost. */
1836 3, /* cond_taken_branch_cost. */
1837 1, /* cond_not_taken_branch_cost. */
1840 static stringop_algs slm_memcpy[2] = {
1841 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1842 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1843 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1844 static stringop_algs slm_memset[2] = {
1845 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1846 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1847 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1848 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1849 static const
1850 struct processor_costs slm_cost = {
1851 COSTS_N_INSNS (1), /* cost of an add instruction */
1852 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1853 COSTS_N_INSNS (1), /* variable shift costs */
1854 COSTS_N_INSNS (1), /* constant shift costs */
1855 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1856 COSTS_N_INSNS (3), /* HI */
1857 COSTS_N_INSNS (3), /* SI */
1858 COSTS_N_INSNS (4), /* DI */
1859 COSTS_N_INSNS (2)}, /* other */
1860 0, /* cost of multiply per each bit set */
1861 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1862 COSTS_N_INSNS (26), /* HI */
1863 COSTS_N_INSNS (42), /* SI */
1864 COSTS_N_INSNS (74), /* DI */
1865 COSTS_N_INSNS (74)}, /* other */
1866 COSTS_N_INSNS (1), /* cost of movsx */
1867 COSTS_N_INSNS (1), /* cost of movzx */
1868 8, /* "large" insn */
1869 17, /* MOVE_RATIO */
1870 4, /* cost for loading QImode using movzbl */
1871 {4, 4, 4}, /* cost of loading integer registers
1872 in QImode, HImode and SImode.
1873 Relative to reg-reg move (2). */
1874 {4, 4, 4}, /* cost of storing integer registers */
1875 4, /* cost of reg,reg fld/fst */
1876 {12, 12, 12}, /* cost of loading fp registers
1877 in SFmode, DFmode and XFmode */
1878 {6, 6, 8}, /* cost of storing fp registers
1879 in SFmode, DFmode and XFmode */
1880 2, /* cost of moving MMX register */
1881 {8, 8}, /* cost of loading MMX registers
1882 in SImode and DImode */
1883 {8, 8}, /* cost of storing MMX registers
1884 in SImode and DImode */
1885 2, /* cost of moving SSE register */
1886 {8, 8, 8}, /* cost of loading SSE registers
1887 in SImode, DImode and TImode */
1888 {8, 8, 8}, /* cost of storing SSE registers
1889 in SImode, DImode and TImode */
1890 5, /* MMX or SSE register to integer */
1891 32, /* size of l1 cache. */
1892 256, /* size of l2 cache. */
1893 64, /* size of prefetch block */
1894 6, /* number of parallel prefetches */
1895 3, /* Branch cost */
1896 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1897 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1898 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1899 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1900 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1901 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1902 slm_memcpy,
1903 slm_memset,
1904 1, /* scalar_stmt_cost. */
1905 1, /* scalar load_cost. */
1906 1, /* scalar_store_cost. */
1907 1, /* vec_stmt_cost. */
1908 4, /* vec_to_scalar_cost. */
1909 1, /* scalar_to_vec_cost. */
1910 1, /* vec_align_load_cost. */
1911 2, /* vec_unalign_load_cost. */
1912 1, /* vec_store_cost. */
1913 3, /* cond_taken_branch_cost. */
1914 1, /* cond_not_taken_branch_cost. */
1917 static stringop_algs intel_memcpy[2] = {
1918 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1919 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1920 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1921 static stringop_algs intel_memset[2] = {
1922 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1923 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1924 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1925 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1926 static const
1927 struct processor_costs intel_cost = {
1928 COSTS_N_INSNS (1), /* cost of an add instruction */
1929 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1930 COSTS_N_INSNS (1), /* variable shift costs */
1931 COSTS_N_INSNS (1), /* constant shift costs */
1932 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1933 COSTS_N_INSNS (3), /* HI */
1934 COSTS_N_INSNS (3), /* SI */
1935 COSTS_N_INSNS (4), /* DI */
1936 COSTS_N_INSNS (2)}, /* other */
1937 0, /* cost of multiply per each bit set */
1938 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1939 COSTS_N_INSNS (26), /* HI */
1940 COSTS_N_INSNS (42), /* SI */
1941 COSTS_N_INSNS (74), /* DI */
1942 COSTS_N_INSNS (74)}, /* other */
1943 COSTS_N_INSNS (1), /* cost of movsx */
1944 COSTS_N_INSNS (1), /* cost of movzx */
1945 8, /* "large" insn */
1946 17, /* MOVE_RATIO */
1947 4, /* cost for loading QImode using movzbl */
1948 {4, 4, 4}, /* cost of loading integer registers
1949 in QImode, HImode and SImode.
1950 Relative to reg-reg move (2). */
1951 {4, 4, 4}, /* cost of storing integer registers */
1952 4, /* cost of reg,reg fld/fst */
1953 {12, 12, 12}, /* cost of loading fp registers
1954 in SFmode, DFmode and XFmode */
1955 {6, 6, 8}, /* cost of storing fp registers
1956 in SFmode, DFmode and XFmode */
1957 2, /* cost of moving MMX register */
1958 {8, 8}, /* cost of loading MMX registers
1959 in SImode and DImode */
1960 {8, 8}, /* cost of storing MMX registers
1961 in SImode and DImode */
1962 2, /* cost of moving SSE register */
1963 {8, 8, 8}, /* cost of loading SSE registers
1964 in SImode, DImode and TImode */
1965 {8, 8, 8}, /* cost of storing SSE registers
1966 in SImode, DImode and TImode */
1967 5, /* MMX or SSE register to integer */
1968 32, /* size of l1 cache. */
1969 256, /* size of l2 cache. */
1970 64, /* size of prefetch block */
1971 6, /* number of parallel prefetches */
1972 3, /* Branch cost */
1973 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1974 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1975 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1976 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1977 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1978 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1979 intel_memcpy,
1980 intel_memset,
1981 1, /* scalar_stmt_cost. */
1982 1, /* scalar load_cost. */
1983 1, /* scalar_store_cost. */
1984 1, /* vec_stmt_cost. */
1985 4, /* vec_to_scalar_cost. */
1986 1, /* scalar_to_vec_cost. */
1987 1, /* vec_align_load_cost. */
1988 2, /* vec_unalign_load_cost. */
1989 1, /* vec_store_cost. */
1990 3, /* cond_taken_branch_cost. */
1991 1, /* cond_not_taken_branch_cost. */
1994 /* Generic should produce code tuned for Core-i7 (and newer chips)
1995 and btver1 (and newer chips). */
1997 static stringop_algs generic_memcpy[2] = {
1998 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1999 {-1, libcall, false}}},
2000 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2001 {-1, libcall, false}}}};
2002 static stringop_algs generic_memset[2] = {
2003 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2004 {-1, libcall, false}}},
2005 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2006 {-1, libcall, false}}}};
2007 static const
2008 struct processor_costs generic_cost = {
2009 COSTS_N_INSNS (1), /* cost of an add instruction */
2010 /* On all chips taken into consideration, lea is 2 cycles or more. With
2011 this cost, however, our current implementation of synth_mult results in
2012 the use of unnecessary temporary registers, causing regressions on several
2013 SPECfp benchmarks. */
2014 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2015 COSTS_N_INSNS (1), /* variable shift costs */
2016 COSTS_N_INSNS (1), /* constant shift costs */
2017 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2018 COSTS_N_INSNS (4), /* HI */
2019 COSTS_N_INSNS (3), /* SI */
2020 COSTS_N_INSNS (4), /* DI */
2021 COSTS_N_INSNS (2)}, /* other */
2022 0, /* cost of multiply per each bit set */
2023 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2024 COSTS_N_INSNS (26), /* HI */
2025 COSTS_N_INSNS (42), /* SI */
2026 COSTS_N_INSNS (74), /* DI */
2027 COSTS_N_INSNS (74)}, /* other */
2028 COSTS_N_INSNS (1), /* cost of movsx */
2029 COSTS_N_INSNS (1), /* cost of movzx */
2030 8, /* "large" insn */
2031 17, /* MOVE_RATIO */
2032 4, /* cost for loading QImode using movzbl */
2033 {4, 4, 4}, /* cost of loading integer registers
2034 in QImode, HImode and SImode.
2035 Relative to reg-reg move (2). */
2036 {4, 4, 4}, /* cost of storing integer registers */
2037 4, /* cost of reg,reg fld/fst */
2038 {12, 12, 12}, /* cost of loading fp registers
2039 in SFmode, DFmode and XFmode */
2040 {6, 6, 8}, /* cost of storing fp registers
2041 in SFmode, DFmode and XFmode */
2042 2, /* cost of moving MMX register */
2043 {8, 8}, /* cost of loading MMX registers
2044 in SImode and DImode */
2045 {8, 8}, /* cost of storing MMX registers
2046 in SImode and DImode */
2047 2, /* cost of moving SSE register */
2048 {8, 8, 8}, /* cost of loading SSE registers
2049 in SImode, DImode and TImode */
2050 {8, 8, 8}, /* cost of storing SSE registers
2051 in SImode, DImode and TImode */
2052 5, /* MMX or SSE register to integer */
2053 32, /* size of l1 cache. */
2054 512, /* size of l2 cache. */
2055 64, /* size of prefetch block */
2056 6, /* number of parallel prefetches */
2057 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2058 value is increased to the perhaps more appropriate value of 5. */
2059 3, /* Branch cost */
2060 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2061 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2062 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2063 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2064 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2065 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2066 generic_memcpy,
2067 generic_memset,
2068 1, /* scalar_stmt_cost. */
2069 1, /* scalar load_cost. */
2070 1, /* scalar_store_cost. */
2071 1, /* vec_stmt_cost. */
2072 1, /* vec_to_scalar_cost. */
2073 1, /* scalar_to_vec_cost. */
2074 1, /* vec_align_load_cost. */
2075 2, /* vec_unalign_load_cost. */
2076 1, /* vec_store_cost. */
2077 3, /* cond_taken_branch_cost. */
2078 1, /* cond_not_taken_branch_cost. */
2081 /* core_cost should produce code tuned for the Core family of CPUs. */
2082 static stringop_algs core_memcpy[2] = {
2083 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2084 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2085 {-1, libcall, false}}}};
2086 static stringop_algs core_memset[2] = {
2087 {libcall, {{6, loop_1_byte, true},
2088 {24, loop, true},
2089 {8192, rep_prefix_4_byte, true},
2090 {-1, libcall, false}}},
2091 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2092 {-1, libcall, false}}}};
2094 static const
2095 struct processor_costs core_cost = {
2096 COSTS_N_INSNS (1), /* cost of an add instruction */
2097 /* On all chips taken into consideration, lea is 2 cycles or more. With
2098 this cost, however, our current implementation of synth_mult results in
2099 the use of unnecessary temporary registers, causing regressions on several
2100 SPECfp benchmarks. */
2101 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2102 COSTS_N_INSNS (1), /* variable shift costs */
2103 COSTS_N_INSNS (1), /* constant shift costs */
2104 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2105 COSTS_N_INSNS (4), /* HI */
2106 COSTS_N_INSNS (3), /* SI */
2107 COSTS_N_INSNS (4), /* DI */
2108 COSTS_N_INSNS (2)}, /* other */
2109 0, /* cost of multiply per each bit set */
2110 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2111 COSTS_N_INSNS (26), /* HI */
2112 COSTS_N_INSNS (42), /* SI */
2113 COSTS_N_INSNS (74), /* DI */
2114 COSTS_N_INSNS (74)}, /* other */
2115 COSTS_N_INSNS (1), /* cost of movsx */
2116 COSTS_N_INSNS (1), /* cost of movzx */
2117 8, /* "large" insn */
2118 17, /* MOVE_RATIO */
2119 4, /* cost for loading QImode using movzbl */
2120 {4, 4, 4}, /* cost of loading integer registers
2121 in QImode, HImode and SImode.
2122 Relative to reg-reg move (2). */
2123 {4, 4, 4}, /* cost of storing integer registers */
2124 4, /* cost of reg,reg fld/fst */
2125 {12, 12, 12}, /* cost of loading fp registers
2126 in SFmode, DFmode and XFmode */
2127 {6, 6, 8}, /* cost of storing fp registers
2128 in SFmode, DFmode and XFmode */
2129 2, /* cost of moving MMX register */
2130 {8, 8}, /* cost of loading MMX registers
2131 in SImode and DImode */
2132 {8, 8}, /* cost of storing MMX registers
2133 in SImode and DImode */
2134 2, /* cost of moving SSE register */
2135 {8, 8, 8}, /* cost of loading SSE registers
2136 in SImode, DImode and TImode */
2137 {8, 8, 8}, /* cost of storing SSE registers
2138 in SImode, DImode and TImode */
2139 5, /* MMX or SSE register to integer */
2140 64, /* size of l1 cache. */
2141 512, /* size of l2 cache. */
2142 64, /* size of prefetch block */
2143 6, /* number of parallel prefetches */
2144 /* FIXME: perhaps a more appropriate value is 5. */
2145 3, /* Branch cost */
2146 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2147 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2148 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2149 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2150 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2151 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2152 core_memcpy,
2153 core_memset,
2154 1, /* scalar_stmt_cost. */
2155 1, /* scalar load_cost. */
2156 1, /* scalar_store_cost. */
2157 1, /* vec_stmt_cost. */
2158 1, /* vec_to_scalar_cost. */
2159 1, /* scalar_to_vec_cost. */
2160 1, /* vec_align_load_cost. */
2161 2, /* vec_unalign_load_cost. */
2162 1, /* vec_store_cost. */
2163 3, /* cond_taken_branch_cost. */
2164 1, /* cond_not_taken_branch_cost. */
2168 /* Set by -mtune. */
2169 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2171 /* Set by -mtune or -Os. */
2172 const struct processor_costs *ix86_cost = &pentium_cost;
2174 /* Processor feature/optimization bitmasks. */
2175 #define m_386 (1U<<PROCESSOR_I386)
2176 #define m_486 (1U<<PROCESSOR_I486)
2177 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2178 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2179 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2180 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2181 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2182 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2183 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2184 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2185 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2186 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2187 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2188 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2189 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2190 #define m_KNL (1U<<PROCESSOR_KNL)
2191 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2192 #define m_INTEL (1U<<PROCESSOR_INTEL)
2194 #define m_GEODE (1U<<PROCESSOR_GEODE)
2195 #define m_K6 (1U<<PROCESSOR_K6)
2196 #define m_K6_GEODE (m_K6 | m_GEODE)
2197 #define m_K8 (1U<<PROCESSOR_K8)
2198 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2199 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2200 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2201 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2202 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2203 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2204 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2205 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2206 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2207 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2208 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2209 #define m_BTVER (m_BTVER1 | m_BTVER2)
2210 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2211 | m_ZNVER1)
2213 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
2215 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2216 #undef DEF_TUNE
2217 #define DEF_TUNE(tune, name, selector) name,
2218 #include "x86-tune.def"
2219 #undef DEF_TUNE
2222 /* Feature tests against the various tunings. */
2223 unsigned char ix86_tune_features[X86_TUNE_LAST];
2225 /* Feature tests against the various tunings used to create ix86_tune_features
2226 based on the processor mask. */
2227 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2228 #undef DEF_TUNE
2229 #define DEF_TUNE(tune, name, selector) selector,
2230 #include "x86-tune.def"
2231 #undef DEF_TUNE
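/* Sketch of how the DEF_TUNE X-macro above is consumed.  An entry in
   x86-tune.def has the shape (the selector mask here is abbreviated and
   only illustrative):

     DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
	       m_PENT | m_PPRO | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)

   The first #include expansion above turns each entry into its name
   string inside ix86_tune_feature_names[], and the second turns it into
   its processor bitmask inside initial_ix86_tune_features[], so both
   arrays stay in sync with the X86_TUNE_* enum by construction.  */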
2234 /* Feature tests against the various architecture variations. */
2235 unsigned char ix86_arch_features[X86_ARCH_LAST];
2237 /* Feature tests against the various architecture variations, used to create
2238 ix86_arch_features based on the processor mask. */
2239 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2240 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2241 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2243 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2244 ~m_386,
2246 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2247 ~(m_386 | m_486),
2249 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2250 ~m_386,
2252 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2253 ~m_386,
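/* Sketch of how the masks above become per-compilation booleans once the
   -march processor is known; the helper name is illustrative and the real
   loop lives in the option-override code.  */
static void
example_init_arch_features (enum processor_type arch)
{
  unsigned int arch_mask = 1U << arch;
  for (unsigned int i = 0; i < X86_ARCH_LAST; ++i)
    /* The feature is available iff the current -march target survives the
       "all processors except ..." mask recorded above.  */
    ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & arch_mask);
}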
2256 /* If the average insn count for a single function invocation is
2257 lower than this constant, emit fast (but longer) prologue and
2258 epilogue code. */
2259 #define FAST_PROLOGUE_INSN_COUNT 20
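/* Purely illustrative sketch of how such a threshold could be applied;
   the helper name is hypothetical and the real decision is made later,
   when the frame layout for the function is computed.  */
static bool
example_want_fast_prologue (int estimated_insn_count)
{
  /* For small functions the longer move-based prologue/epilogue is still
     cheap in absolute terms, so prefer the faster sequence.  */
  return estimated_insn_count < FAST_PROLOGUE_INSN_COUNT;
}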
2261 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2262 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2263 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2264 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2266 /* Array of the smallest class containing reg number REGNO, indexed by
2267 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2269 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2271 /* ax, dx, cx, bx */
2272 AREG, DREG, CREG, BREG,
2273 /* si, di, bp, sp */
2274 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2275 /* FP registers */
2276 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2277 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2278 /* arg pointer */
2279 NON_Q_REGS,
2280 /* flags, fpsr, fpcr, frame */
2281 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2282 /* SSE registers */
2283 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2284 SSE_REGS, SSE_REGS,
2285 /* MMX registers */
2286 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2287 MMX_REGS, MMX_REGS,
2288 /* REX registers */
2289 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2290 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2291 /* SSE REX registers */
2292 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2293 SSE_REGS, SSE_REGS,
2294 /* AVX-512 SSE registers */
2295 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2296 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2297 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2298 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2299 /* Mask registers. */
2300 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2301 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2302 /* MPX bound registers */
2303 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2306 /* The "default" register map used in 32bit mode. */
2308 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2310 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2311 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2312 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2313 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2314 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2315 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2316 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2317 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2318 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2319 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2320 101, 102, 103, 104, /* bound registers */
2323 /* The "default" register map used in 64bit mode. */
2325 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2327 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2328 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2329 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2330 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2331 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2332 8,9,10,11,12,13,14,15, /* extended integer registers */
2333 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2334 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2335 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2336 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2337 126, 127, 128, 129, /* bound registers */
2340 /* Define the register numbers to be used in Dwarf debugging information.
2341 The SVR4 reference port C compiler uses the following register numbers
2342 in its Dwarf output code:
2343 0 for %eax (gcc regno = 0)
2344 1 for %ecx (gcc regno = 2)
2345 2 for %edx (gcc regno = 1)
2346 3 for %ebx (gcc regno = 3)
2347 4 for %esp (gcc regno = 7)
2348 5 for %ebp (gcc regno = 6)
2349 6 for %esi (gcc regno = 4)
2350 7 for %edi (gcc regno = 5)
2351 The following three DWARF register numbers are never generated by
2352 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2353 believes these numbers have these meanings.
2354 8 for %eip (no gcc equivalent)
2355 9 for %eflags (gcc regno = 17)
2356 10 for %trapno (no gcc equivalent)
2357 It is not at all clear how we should number the FP stack registers
2358 for the x86 architecture. If the version of SDB on x86/svr4 were
2359 a bit less brain dead with respect to floating-point then we would
2360 have a precedent to follow with respect to DWARF register numbers
2361 for x86 FP registers, but the SDB on x86/svr4 is so completely
2362 broken with respect to FP registers that it is hardly worth thinking
2363 of it as something to strive for compatibility with.
2364 The version of x86/svr4 SDB I have at the moment does (partially)
2365 seem to believe that DWARF register number 11 is associated with
2366 the x86 register %st(0), but that's about all. Higher DWARF
2367 register numbers don't seem to be associated with anything in
2368 particular, and even for DWARF regno 11, SDB only seems to under-
2369 stand that it should say that a variable lives in %st(0) (when
2370 asked via an `=' command) if we said it was in DWARF regno 11,
2371 but SDB still prints garbage when asked for the value of the
2372 variable in question (via a `/' command).
2373 (Also note that the labels SDB prints for various FP stack regs
2374 when doing an `x' command are all wrong.)
2375 Note that these problems generally don't affect the native SVR4
2376 C compiler because it doesn't allow the use of -O with -g and
2377 because when it is *not* optimizing, it allocates a memory
2378 location for each floating-point variable, and the memory
2379 location is what gets described in the DWARF AT_location
2380 attribute for the variable in question.
2381 Regardless of the severe mental illness of the x86/svr4 SDB, we
2382 do something sensible here and we use the following DWARF
2383 register numbers. Note that these are all stack-top-relative
2384 numbers.
2385 11 for %st(0) (gcc regno = 8)
2386 12 for %st(1) (gcc regno = 9)
2387 13 for %st(2) (gcc regno = 10)
2388 14 for %st(3) (gcc regno = 11)
2389 15 for %st(4) (gcc regno = 12)
2390 16 for %st(5) (gcc regno = 13)
2391 17 for %st(6) (gcc regno = 14)
2392 18 for %st(7) (gcc regno = 15)
2394 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2396 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2397 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2398 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2399 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2400 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2401 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2402 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2403 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2404 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2405 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2406 101, 102, 103, 104, /* bound registers */
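/* Sketch of how these maps are consumed: DBX_REGISTER_NUMBER in the target
   headers does essentially this lookup (the helper name is illustrative).
   An entry of -1 means the register has no debug register number.  For
   example, gcc regno 2 is %ecx and svr4_dbx_register_map[2] == 1, matching
   the SVR4 numbering documented above.  */
static int
example_debug_regno (unsigned int regno, bool use_svr4_numbering)
{
  if (TARGET_64BIT)
    return dbx64_register_map[regno];
  return use_svr4_numbering ? svr4_dbx_register_map[regno]
			    : dbx_register_map[regno];
}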
2409 /* Define parameter passing and return registers. */
2411 static int const x86_64_int_parameter_registers[6] =
2413 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2416 static int const x86_64_ms_abi_int_parameter_registers[4] =
2418 CX_REG, DX_REG, R8_REG, R9_REG
2421 static int const x86_64_int_return_registers[4] =
2423 AX_REG, DX_REG, DI_REG, SI_REG
2426 /* Additional registers that are clobbered by SYSV calls. */
2428 #define NUM_X86_64_MS_CLOBBERED_REGS 12
2429 static int const x86_64_ms_sysv_extra_clobbered_registers
2430 [NUM_X86_64_MS_CLOBBERED_REGS] =
2432 SI_REG, DI_REG,
2433 XMM6_REG, XMM7_REG,
2434 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2435 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2438 enum xlogue_stub {
2439 XLOGUE_STUB_SAVE,
2440 XLOGUE_STUB_RESTORE,
2441 XLOGUE_STUB_RESTORE_TAIL,
2442 XLOGUE_STUB_SAVE_HFP,
2443 XLOGUE_STUB_RESTORE_HFP,
2444 XLOGUE_STUB_RESTORE_HFP_TAIL,
2446 XLOGUE_STUB_COUNT
2449 enum xlogue_stub_sets {
2450 XLOGUE_SET_ALIGNED,
2451 XLOGUE_SET_ALIGNED_PLUS_8,
2452 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
2453 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
2455 XLOGUE_SET_COUNT
2458 /* Register save/restore layout used by out-of-line stubs. */
2459 class xlogue_layout {
2460 public:
2461 struct reginfo
2463 unsigned regno;
2464 HOST_WIDE_INT offset; /* Offset from the stub base pointer (rax or
2465 rsi) to where each register is stored. */
2468 unsigned get_nregs () const {return m_nregs;}
2469 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
2471 const reginfo &get_reginfo (unsigned reg) const
2473 gcc_assert (reg < m_nregs);
2474 return m_regs[reg];
2477 static const char *get_stub_name (enum xlogue_stub stub,
2478 unsigned n_extra_args);
2480 /* Returns an rtx for the stub's symbol based upon
2481 1.) the specified stub (save, restore or restore_ret) and
2482 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
2483 3.) whether or not stack alignment is being performed. */
2484 static rtx get_stub_rtx (enum xlogue_stub stub);
2486 /* Returns the amount of stack space (including padding) that the stub
2487 needs to store registers based upon data in the machine_function. */
2488 HOST_WIDE_INT get_stack_space_used () const
2490 const struct machine_function *m = cfun->machine;
2491 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
2493 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
2494 return m_regs[last_reg].offset
2495 + (m->call_ms2sysv_pad_out ? 8 : 0)
2496 + STUB_INDEX_OFFSET;
2499 /* Returns the offset for the base pointer used by the stub. */
2500 HOST_WIDE_INT get_stub_ptr_offset () const
2502 return STUB_INDEX_OFFSET + m_stack_align_off_in;
2505 static const struct xlogue_layout &get_instance ();
2506 static unsigned count_stub_managed_regs ();
2507 static bool is_stub_managed_reg (unsigned regno, unsigned count);
2509 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
2510 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
2511 static const unsigned MAX_REGS = 18;
2512 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
2513 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
2514 static const unsigned STUB_NAME_MAX_LEN = 16;
2515 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
2516 static const unsigned REG_ORDER[MAX_REGS];
2517 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
2519 private:
2520 xlogue_layout ();
2521 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
2522 xlogue_layout (const xlogue_layout &);
2524 /* True if hard frame pointer is used. */
2525 bool m_hfp;
2527 /* Max number of registers this layout manages. */
2528 unsigned m_nregs;
2530 /* Incoming offset from 16-byte alignment. */
2531 HOST_WIDE_INT m_stack_align_off_in;
2533 /* Register order and offsets. */
2534 struct reginfo m_regs[MAX_REGS];
2536 /* Lazy-inited cache of symbol names for stubs. */
2537 static char s_stub_names[XLOGUE_STUB_COUNT][VARIANT_COUNT]
2538 [STUB_NAME_MAX_LEN];
2540 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
2543 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
2544 "savms64",
2545 "resms64",
2546 "resms64x",
2547 "savms64f",
2548 "resms64f",
2549 "resms64fx"
2552 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
2553 /* The below offset values are where each register is stored for the layout
2554 relative to incoming stack pointer. The value of each m_regs[].offset will
2555 be relative to the incoming base pointer (rax or rsi) used by the stub.
2557 s_instances: 0 1 2 3
2558 Offset: realigned or aligned + 8
2559 Register aligned aligned + 8 aligned w/HFP w/HFP */
2560 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
2561 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
2562 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
2563 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
2564 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
2565 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
2566 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
2567 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
2568 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
2569 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
2570 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
2571 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
2572 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
2573 BP_REG, /* 0xc0 0xc8 N/A N/A */
2574 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
2575 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
2576 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
2577 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
2580 /* Instantiate static const values. */
2581 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
2582 const unsigned xlogue_layout::MIN_REGS;
2583 const unsigned xlogue_layout::MAX_REGS;
2584 const unsigned xlogue_layout::MAX_EXTRA_REGS;
2585 const unsigned xlogue_layout::VARIANT_COUNT;
2586 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
2588 /* Initialize xlogue_layout::s_stub_names to zero. */
2589 char xlogue_layout::s_stub_names[XLOGUE_STUB_COUNT][VARIANT_COUNT]
2590 [STUB_NAME_MAX_LEN];
2592 /* Instantiates all xlogue_layout instances. */
2593 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
2594 xlogue_layout (0, false),
2595 xlogue_layout (8, false),
2596 xlogue_layout (0, true),
2597 xlogue_layout (8, true)
2600 /* Return an appropriate const instance of xlogue_layout based upon values
2601 in cfun->machine and crtl. */
2602 const struct xlogue_layout &
2603 xlogue_layout::get_instance ()
2605 enum xlogue_stub_sets stub_set;
2606 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
2608 if (stack_realign_fp)
2609 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2610 else if (frame_pointer_needed)
2611 stub_set = aligned_plus_8
2612 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
2613 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2614 else
2615 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
2617 return s_instances[stub_set];
2620 /* Determine how many clobbered registers can be saved by the stub.
2621 Returns the count of registers the stub will save and restore. */
2622 unsigned
2623 xlogue_layout::count_stub_managed_regs ()
2625 bool hfp = frame_pointer_needed || stack_realign_fp;
2626 unsigned i, count;
2627 unsigned regno;
2629 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
2631 regno = REG_ORDER[i];
2632 if (regno == BP_REG && hfp)
2633 continue;
2634 if (!ix86_save_reg (regno, false, false))
2635 break;
2636 ++count;
2638 return count;
2641 /* Determine if register REGNO is a stub managed register given the
2642 total COUNT of stub managed registers. */
2643 bool
2644 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
2646 bool hfp = frame_pointer_needed || stack_realign_fp;
2647 unsigned i;
2649 for (i = 0; i < count; ++i)
2651 gcc_assert (i < MAX_REGS);
2652 if (REG_ORDER[i] == BP_REG && hfp)
2653 ++count;
2654 else if (REG_ORDER[i] == regno)
2655 return true;
2657 return false;
2660 /* Constructor for xlogue_layout. */
2661 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
2662 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
2663 m_stack_align_off_in (stack_align_off_in)
2665 HOST_WIDE_INT offset = stack_align_off_in;
2666 unsigned i, j;
2668 for (i = j = 0; i < MAX_REGS; ++i)
2670 unsigned regno = REG_ORDER[i];
2672 if (regno == BP_REG && hfp)
2673 continue;
2674 if (SSE_REGNO_P (regno))
2676 offset += 16;
2677 /* Verify that SSE regs are always aligned. */
2678 gcc_assert (!((stack_align_off_in + offset) & 15));
2680 else
2681 offset += 8;
2683 m_regs[j].regno = regno;
2684 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
2686 gcc_assert (j == m_nregs);
2689 const char *
2690 xlogue_layout::get_stub_name (enum xlogue_stub stub,
2691 unsigned n_extra_regs)
2693 char *name = s_stub_names[stub][n_extra_regs];
2695 /* Lazy init */
2696 if (!*name)
2698 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%u",
2699 STUB_BASE_NAMES[stub], MIN_REGS + n_extra_regs);
2700 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
2703 return name;
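/* For reference, the names produced above follow the pattern
   "__<base>_<nregs>", where <nregs> ranges from MIN_REGS (12) to
   MAX_REGS (18).  For example, get_stub_name (XLOGUE_STUB_SAVE, 0)
   yields "__savms64_12" and get_stub_name (XLOGUE_STUB_RESTORE_TAIL, 6)
   yields "__resms64x_18", naming the out-of-line save/restore stubs
   that are expected to be provided externally (e.g. by libgcc).  */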
2706 /* Return rtx of a symbol ref for the entry point (based upon
2707 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
2709 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
2711 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
2712 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
2713 gcc_assert (stub < XLOGUE_STUB_COUNT);
2714 gcc_assert (crtl->stack_realign_finalized);
2716 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
2719 /* Define the structure for the machine field in struct function. */
2721 struct GTY(()) stack_local_entry {
2722 unsigned short mode;
2723 unsigned short n;
2724 rtx rtl;
2725 struct stack_local_entry *next;
2728 /* Which cpu are we scheduling for. */
2729 enum attr_cpu ix86_schedule;
2731 /* Which cpu are we optimizing for. */
2732 enum processor_type ix86_tune;
2734 /* Which instruction set architecture to use. */
2735 enum processor_type ix86_arch;
2737 /* True if processor has SSE prefetch instruction. */
2738 unsigned char x86_prefetch_sse;
2740 /* -mstackrealign option */
2741 static const char ix86_force_align_arg_pointer_string[]
2742 = "force_align_arg_pointer";
2744 static rtx (*ix86_gen_leave) (void);
2745 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2746 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2747 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2748 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2749 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2750 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2751 static rtx (*ix86_gen_clzero) (rtx);
2752 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2753 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2754 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2755 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2756 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2757 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2759 /* Preferred alignment for stack boundary in bits. */
2760 unsigned int ix86_preferred_stack_boundary;
2762 /* Alignment for incoming stack boundary in bits specified at
2763 command line. */
2764 static unsigned int ix86_user_incoming_stack_boundary;
2766 /* Default alignment for incoming stack boundary in bits. */
2767 static unsigned int ix86_default_incoming_stack_boundary;
2769 /* Alignment for incoming stack boundary in bits. */
2770 unsigned int ix86_incoming_stack_boundary;
2772 /* Calling ABI specific va_list type nodes. */
2773 static GTY(()) tree sysv_va_list_type_node;
2774 static GTY(()) tree ms_va_list_type_node;
2776 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2777 char internal_label_prefix[16];
2778 int internal_label_prefix_len;
2780 /* Fence to use after loop using movnt. */
2781 tree x86_mfence;
2783 /* Register class used for passing a given 64bit part of the argument.
2784 These represent the classes documented by the psABI, with the exception
2785 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2786 uses an SF or DFmode move instead of DImode to avoid reformatting penalties.
2788 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2789 whenever possible (the upper half contains only padding). */
2790 enum x86_64_reg_class
2792 X86_64_NO_CLASS,
2793 X86_64_INTEGER_CLASS,
2794 X86_64_INTEGERSI_CLASS,
2795 X86_64_SSE_CLASS,
2796 X86_64_SSESF_CLASS,
2797 X86_64_SSEDF_CLASS,
2798 X86_64_SSEUP_CLASS,
2799 X86_64_X87_CLASS,
2800 X86_64_X87UP_CLASS,
2801 X86_64_COMPLEX_X87_CLASS,
2802 X86_64_MEMORY_CLASS
2805 #define MAX_CLASSES 8
2807 /* Table of constants used by fldpi, fldln2, etc.... */
2808 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2809 static bool ext_80387_constants_init;
2812 static struct machine_function * ix86_init_machine_status (void);
2813 static rtx ix86_function_value (const_tree, const_tree, bool);
2814 static bool ix86_function_value_regno_p (const unsigned int);
2815 static unsigned int ix86_function_arg_boundary (machine_mode,
2816 const_tree);
2817 static rtx ix86_static_chain (const_tree, bool);
2818 static int ix86_function_regparm (const_tree, const_tree);
2819 static void ix86_compute_frame_layout (void);
2820 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2821 rtx, rtx, int);
2822 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
2823 static tree ix86_canonical_va_list_type (tree);
2824 static void predict_jump (int);
2825 static unsigned int split_stack_prologue_scratch_regno (void);
2826 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2828 enum ix86_function_specific_strings
2830 IX86_FUNCTION_SPECIFIC_ARCH,
2831 IX86_FUNCTION_SPECIFIC_TUNE,
2832 IX86_FUNCTION_SPECIFIC_MAX
2835 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
2836 const char *, const char *, enum fpmath_unit,
2837 bool);
2838 static void ix86_function_specific_save (struct cl_target_option *,
2839 struct gcc_options *opts);
2840 static void ix86_function_specific_restore (struct gcc_options *opts,
2841 struct cl_target_option *);
2842 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2843 static void ix86_function_specific_print (FILE *, int,
2844 struct cl_target_option *);
2845 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2846 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2847 struct gcc_options *,
2848 struct gcc_options *,
2849 struct gcc_options *);
2850 static bool ix86_can_inline_p (tree, tree);
2851 static void ix86_set_current_function (tree);
2852 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2854 static enum calling_abi ix86_function_abi (const_tree);
2857 #ifndef SUBTARGET32_DEFAULT_CPU
2858 #define SUBTARGET32_DEFAULT_CPU "i386"
2859 #endif
2861 /* Whether -mtune= or -march= were specified */
2862 static int ix86_tune_defaulted;
2863 static int ix86_arch_specified;
2865 /* Vectorization library interface and handlers. */
2866 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2868 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2869 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2871 /* Processor target table, indexed by processor number */
2872 struct ptt
2874 const char *const name; /* processor name */
2875 const struct processor_costs *cost; /* Processor costs */
2876 const int align_loop; /* Default alignments. */
2877 const int align_loop_max_skip;
2878 const int align_jump;
2879 const int align_jump_max_skip;
2880 const int align_func;
2883 /* This table must be in sync with enum processor_type in i386.h. */
2884 static const struct ptt processor_target_table[PROCESSOR_max] =
2886 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2887 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2888 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2889 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2890 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2891 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2892 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2893 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2894 {"core2", &core_cost, 16, 10, 16, 10, 16},
2895 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2896 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2897 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2898 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2899 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2900 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2901 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2902 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2903 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2904 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2905 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2906 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2907 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2908 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2909 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2910 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2911 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2912 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2913 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2914 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
2917 static unsigned int
2918 rest_of_handle_insert_vzeroupper (void)
2920 int i;
2922 /* vzeroupper instructions are inserted immediately after reload to
2923 account for possible spills from 256-bit registers. The pass
2924 reuses the mode-switching infrastructure by re-running the mode
2925 insertion pass, so disable entities that have already been processed. */
2926 for (i = 0; i < MAX_386_ENTITIES; i++)
2927 ix86_optimize_mode_switching[i] = 0;
2929 ix86_optimize_mode_switching[AVX_U128] = 1;
2931 /* Call optimize_mode_switching. */
2932 g->get_passes ()->execute_pass_mode_switching ();
2933 return 0;
2936 /* Return true if INSN uses or defines a hard register.
2937 Hard register uses in a memory address are ignored.
2938 Clobbers and flags definitions are ignored. */
2940 static bool
2941 has_non_address_hard_reg (rtx_insn *insn)
2943 df_ref ref;
2944 FOR_EACH_INSN_DEF (ref, insn)
2945 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2946 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2947 && DF_REF_REGNO (ref) != FLAGS_REG)
2948 return true;
2950 FOR_EACH_INSN_USE (ref, insn)
2951 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2952 return true;
2954 return false;
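/* An illustrative sketch of what this predicate accepts (the register
   numbers are made up): an insn such as

     (set (reg:DI 100) (mem:DI (plus:SI (reg:SI 7 sp) (const_int 16))))

   does not count as touching a hard register here, because the stack
   pointer only appears inside a memory address; an insn that sets or
   reads a hard register directly (flags definitions and must-clobbers
   excepted) would disqualify itself instead.  */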
2957 /* Check whether comparison INSN may be transformed
2958 into a vector comparison. Currently we only transform
2959 zero checks, which look like:
2961 (set (reg:CCZ 17 flags)
2962 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2963 (subreg:SI (reg:DI x) 0))
2964 (const_int 0 [0]))) */
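/* As an illustrative sketch, such a pattern typically arises in 32-bit
   code from a source-level test like

     long long x;
     ...
     if (x == 0)
       ...

   where the 64-bit equality check is expanded as an OR of the two
   32-bit halves compared against zero; convert_insn later rewrites the
   accepted form into a vector PTEST.  */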
2966 static bool
2967 convertible_comparison_p (rtx_insn *insn)
2969 if (!TARGET_SSE4_1)
2970 return false;
2972 rtx def_set = single_set (insn);
2974 gcc_assert (def_set);
2976 rtx src = SET_SRC (def_set);
2977 rtx dst = SET_DEST (def_set);
2979 gcc_assert (GET_CODE (src) == COMPARE);
2981 if (GET_CODE (dst) != REG
2982 || REGNO (dst) != FLAGS_REG
2983 || GET_MODE (dst) != CCZmode)
2984 return false;
2986 rtx op1 = XEXP (src, 0);
2987 rtx op2 = XEXP (src, 1);
2989 if (op2 != CONST0_RTX (GET_MODE (op2)))
2990 return false;
2992 if (GET_CODE (op1) != IOR)
2993 return false;
2995 op2 = XEXP (op1, 1);
2996 op1 = XEXP (op1, 0);
2998 if (!SUBREG_P (op1)
2999 || !SUBREG_P (op2)
3000 || GET_MODE (op1) != SImode
3001 || GET_MODE (op2) != SImode
3002 || ((SUBREG_BYTE (op1) != 0
3003 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
3004 && (SUBREG_BYTE (op2) != 0
3005 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
3006 return false;
3008 op1 = SUBREG_REG (op1);
3009 op2 = SUBREG_REG (op2);
3011 if (op1 != op2
3012 || !REG_P (op1)
3013 || GET_MODE (op1) != DImode)
3014 return false;
3016 return true;
3019 /* The DImode version of scalar_to_vector_candidate_p. */
3021 static bool
3022 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
3024 rtx def_set = single_set (insn);
3026 if (!def_set)
3027 return false;
3029 if (has_non_address_hard_reg (insn))
3030 return false;
3032 rtx src = SET_SRC (def_set);
3033 rtx dst = SET_DEST (def_set);
3035 if (GET_CODE (src) == COMPARE)
3036 return convertible_comparison_p (insn);
3038 /* We are interested in DImode promotion only. */
3039 if ((GET_MODE (src) != DImode
3040 && !CONST_INT_P (src))
3041 || GET_MODE (dst) != DImode)
3042 return false;
3044 if (!REG_P (dst) && !MEM_P (dst))
3045 return false;
3047 switch (GET_CODE (src))
3049 case ASHIFTRT:
3050 if (!TARGET_AVX512VL)
3051 return false;
3052 /* FALLTHRU */
3054 case ASHIFT:
3055 case LSHIFTRT:
3056 if (!REG_P (XEXP (src, 1))
3057 && (!SUBREG_P (XEXP (src, 1))
3058 || SUBREG_BYTE (XEXP (src, 1)) != 0
3059 || !REG_P (SUBREG_REG (XEXP (src, 1))))
3060 && (!CONST_INT_P (XEXP (src, 1))
3061 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
3062 return false;
3064 if (GET_MODE (XEXP (src, 1)) != QImode
3065 && !CONST_INT_P (XEXP (src, 1)))
3066 return false;
3067 break;
3069 case PLUS:
3070 case MINUS:
3071 case IOR:
3072 case XOR:
3073 case AND:
3074 if (!REG_P (XEXP (src, 1))
3075 && !MEM_P (XEXP (src, 1))
3076 && !CONST_INT_P (XEXP (src, 1)))
3077 return false;
3079 if (GET_MODE (XEXP (src, 1)) != DImode
3080 && !CONST_INT_P (XEXP (src, 1)))
3081 return false;
3082 break;
3084 case NEG:
3085 case NOT:
3086 break;
3088 case REG:
3089 return true;
3091 case MEM:
3092 case CONST_INT:
3093 return REG_P (dst);
3095 default:
3096 return false;
3099 if (!REG_P (XEXP (src, 0))
3100 && !MEM_P (XEXP (src, 0))
3101 && !CONST_INT_P (XEXP (src, 0))
3102 /* Check for andnot case. */
3103 && (GET_CODE (src) != AND
3104 || GET_CODE (XEXP (src, 0)) != NOT
3105 || !REG_P (XEXP (XEXP (src, 0), 0))))
3106 return false;
3108 if (GET_MODE (XEXP (src, 0)) != DImode
3109 && !CONST_INT_P (XEXP (src, 0)))
3110 return false;
3112 return true;
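/* A hypothetical candidate for the DImode case: in 32-bit code,

     unsigned long long a, b, c;
     c = a | b;

   normally needs two 32-bit OR instructions plus the accompanying
   moves, whereas the whole DImode IOR can be carried out as a single
   V2DImode IOR on an SSE register once the chain is converted.  */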
3115 /* The TImode version of scalar_to_vector_candidate_p. */
3117 static bool
3118 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
3120 rtx def_set = single_set (insn);
3122 if (!def_set)
3123 return false;
3125 if (has_non_address_hard_reg (insn))
3126 return false;
3128 rtx src = SET_SRC (def_set);
3129 rtx dst = SET_DEST (def_set);
3131 /* Only TImode loads and stores are allowed. */
3132 if (GET_MODE (dst) != TImode)
3133 return false;
3135 if (MEM_P (dst))
3137 /* Check for a store. Memory must be aligned, or unaligned stores
3138 must be optimal. Only support stores from a register, a standard
3139 SSE constant or a CONST_WIDE_INT generated from a piecewise store.
3141 ??? Verify performance impact before enabling CONST_INT for
3142 __int128 store. */
3143 if (misaligned_operand (dst, TImode)
3144 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
3145 return false;
3147 switch (GET_CODE (src))
3149 default:
3150 return false;
3152 case REG:
3153 case CONST_WIDE_INT:
3154 return true;
3156 case CONST_INT:
3157 return standard_sse_constant_p (src, TImode);
3160 else if (MEM_P (src))
3162 /* Check for a load. Memory must be aligned, or unaligned loads
3163 must be optimal. */
3164 return (REG_P (dst)
3165 && (!misaligned_operand (src, TImode)
3166 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
3169 return false;
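/* A hypothetical candidate for the TImode case: a 128-bit copy such as

     __int128 *p, *q;
     *p = *q;

   is a TImode load followed by a TImode store; when the accesses are
   suitably aligned (or unaligned SSE accesses are optimal for the
   target), both can be rewritten as V1TImode moves, i.e. single
   16-byte SSE memory operations instead of pairs of 64-bit moves.  */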
3172 /* Return true if INSN may be converted into a vector
3173 instruction. */
3175 static bool
3176 scalar_to_vector_candidate_p (rtx_insn *insn)
3178 if (TARGET_64BIT)
3179 return timode_scalar_to_vector_candidate_p (insn);
3180 else
3181 return dimode_scalar_to_vector_candidate_p (insn);
3184 /* The DImode version of remove_non_convertible_regs. */
3186 static void
3187 dimode_remove_non_convertible_regs (bitmap candidates)
3189 bitmap_iterator bi;
3190 unsigned id;
3191 bitmap regs = BITMAP_ALLOC (NULL);
3193 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3195 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3196 rtx reg = SET_DEST (def_set);
3198 if (!REG_P (reg)
3199 || bitmap_bit_p (regs, REGNO (reg))
3200 || HARD_REGISTER_P (reg))
3201 continue;
3203 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
3204 def;
3205 def = DF_REF_NEXT_REG (def))
3207 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3209 if (dump_file)
3210 fprintf (dump_file,
3211 "r%d has non convertible definition in insn %d\n",
3212 REGNO (reg), DF_REF_INSN_UID (def));
3214 bitmap_set_bit (regs, REGNO (reg));
3215 break;
3220 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3222 for (df_ref def = DF_REG_DEF_CHAIN (id);
3223 def;
3224 def = DF_REF_NEXT_REG (def))
3225 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3227 if (dump_file)
3228 fprintf (dump_file, "Removing insn %d from candidates list\n",
3229 DF_REF_INSN_UID (def));
3231 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3235 BITMAP_FREE (regs);
3238 /* For a register REGNO, scan instructions for its defs and uses.
3239 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
3241 static void
3242 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
3243 unsigned int regno)
3245 for (df_ref def = DF_REG_DEF_CHAIN (regno);
3246 def;
3247 def = DF_REF_NEXT_REG (def))
3249 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3251 if (dump_file)
3252 fprintf (dump_file,
3253 "r%d has non convertible def in insn %d\n",
3254 regno, DF_REF_INSN_UID (def));
3256 bitmap_set_bit (regs, regno);
3257 break;
3261 for (df_ref ref = DF_REG_USE_CHAIN (regno);
3262 ref;
3263 ref = DF_REF_NEXT_REG (ref))
3265 /* Debug instructions are skipped. */
3266 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
3267 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3269 if (dump_file)
3270 fprintf (dump_file,
3271 "r%d has non convertible use in insn %d\n",
3272 regno, DF_REF_INSN_UID (ref));
3274 bitmap_set_bit (regs, regno);
3275 break;
3280 /* The TImode version of remove_non_convertible_regs. */
3282 static void
3283 timode_remove_non_convertible_regs (bitmap candidates)
3285 bitmap_iterator bi;
3286 unsigned id;
3287 bitmap regs = BITMAP_ALLOC (NULL);
3289 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3291 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3292 rtx dest = SET_DEST (def_set);
3293 rtx src = SET_SRC (def_set);
3295 if ((!REG_P (dest)
3296 || bitmap_bit_p (regs, REGNO (dest))
3297 || HARD_REGISTER_P (dest))
3298 && (!REG_P (src)
3299 || bitmap_bit_p (regs, REGNO (src))
3300 || HARD_REGISTER_P (src)))
3301 continue;
3303 if (REG_P (dest))
3304 timode_check_non_convertible_regs (candidates, regs,
3305 REGNO (dest));
3307 if (REG_P (src))
3308 timode_check_non_convertible_regs (candidates, regs,
3309 REGNO (src));
3312 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3314 for (df_ref def = DF_REG_DEF_CHAIN (id);
3315 def;
3316 def = DF_REF_NEXT_REG (def))
3317 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3319 if (dump_file)
3320 fprintf (dump_file, "Removing insn %d from candidates list\n",
3321 DF_REF_INSN_UID (def));
3323 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3326 for (df_ref ref = DF_REG_USE_CHAIN (id);
3327 ref;
3328 ref = DF_REF_NEXT_REG (ref))
3329 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3331 if (dump_file)
3332 fprintf (dump_file, "Removing insn %d from candidates list\n",
3333 DF_REF_INSN_UID (ref));
3335 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3339 BITMAP_FREE (regs);
3342 /* For a given bitmap of insn UIDs, scan all instructions and
3343 remove an insn from CANDIDATES if it has both convertible
3344 and non-convertible definitions.
3346 All insns in a bitmap are conversion candidates according to
3347 scalar_to_vector_candidate_p. Currently it implies all insns
3348 are single_set. */
3350 static void
3351 remove_non_convertible_regs (bitmap candidates)
3353 if (TARGET_64BIT)
3354 timode_remove_non_convertible_regs (candidates);
3355 else
3356 dimode_remove_non_convertible_regs (candidates);
3359 class scalar_chain
3361 public:
3362 scalar_chain ();
3363 virtual ~scalar_chain ();
3365 static unsigned max_id;
3367 /* ID of a chain. */
3368 unsigned int chain_id;
3369 /* A queue of instructions to be included into a chain. */
3370 bitmap queue;
3371 /* Instructions included into a chain. */
3372 bitmap insns;
3373 /* All registers defined by a chain. */
3374 bitmap defs;
3375 /* Registers used in both vector and scalar modes. */
3376 bitmap defs_conv;
3378 void build (bitmap candidates, unsigned insn_uid);
3379 virtual int compute_convert_gain () = 0;
3380 int convert ();
3382 protected:
3383 void add_to_queue (unsigned insn_uid);
3384 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3386 private:
3387 void add_insn (bitmap candidates, unsigned insn_uid);
3388 void analyze_register_chain (bitmap candidates, df_ref ref);
3389 virtual void mark_dual_mode_def (df_ref def) = 0;
3390 virtual void convert_insn (rtx_insn *insn) = 0;
3391 virtual void convert_registers () = 0;
3394 class dimode_scalar_chain : public scalar_chain
3396 public:
3397 int compute_convert_gain ();
3398 private:
3399 void mark_dual_mode_def (df_ref def);
3400 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3401 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3402 void convert_insn (rtx_insn *insn);
3403 void convert_op (rtx *op, rtx_insn *insn);
3404 void convert_reg (unsigned regno);
3405 void make_vector_copies (unsigned regno);
3406 void convert_registers ();
3407 int vector_const_cost (rtx exp);
3410 class timode_scalar_chain : public scalar_chain
3412 public:
3413 /* Converting from TImode to V1TImode is always faster. */
3414 int compute_convert_gain () { return 1; }
3416 private:
3417 void mark_dual_mode_def (df_ref def);
3418 void fix_debug_reg_uses (rtx reg);
3419 void convert_insn (rtx_insn *insn);
3420 /* We don't convert registers to a different size. */
3421 void convert_registers () {}
3424 unsigned scalar_chain::max_id = 0;
3426 /* Initialize new chain. */
3428 scalar_chain::scalar_chain ()
3430 chain_id = ++max_id;
3432 if (dump_file)
3433 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3435 bitmap_obstack_initialize (NULL);
3436 insns = BITMAP_ALLOC (NULL);
3437 defs = BITMAP_ALLOC (NULL);
3438 defs_conv = BITMAP_ALLOC (NULL);
3439 queue = NULL;
3442 /* Free chain's data. */
3444 scalar_chain::~scalar_chain ()
3446 BITMAP_FREE (insns);
3447 BITMAP_FREE (defs);
3448 BITMAP_FREE (defs_conv);
3449 bitmap_obstack_release (NULL);
3452 /* Add instruction into chains' queue. */
3454 void
3455 scalar_chain::add_to_queue (unsigned insn_uid)
3457 if (bitmap_bit_p (insns, insn_uid)
3458 || bitmap_bit_p (queue, insn_uid))
3459 return;
3461 if (dump_file)
3462 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3463 insn_uid, chain_id);
3464 bitmap_set_bit (queue, insn_uid);
3467 /* For DImode conversion, mark register defined by DEF as requiring
3468 conversion. */
3470 void
3471 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3473 gcc_assert (DF_REF_REG_DEF_P (def));
3475 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3476 return;
3478 if (dump_file)
3479 fprintf (dump_file,
3480 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3481 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3483 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3486 /* For TImode conversion, it is unused. */
3488 void
3489 timode_scalar_chain::mark_dual_mode_def (df_ref)
3491 gcc_unreachable ();
3494 /* Check REF's chain to add new insns into a queue
3495 and find registers requiring conversion. */
3497 void
3498 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3500 df_link *chain;
3502 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3503 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3504 add_to_queue (DF_REF_INSN_UID (ref));
3506 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3508 unsigned uid = DF_REF_INSN_UID (chain->ref);
3510 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3511 continue;
3513 if (!DF_REF_REG_MEM_P (chain->ref))
3515 if (bitmap_bit_p (insns, uid))
3516 continue;
3518 if (bitmap_bit_p (candidates, uid))
3520 add_to_queue (uid);
3521 continue;
3525 if (DF_REF_REG_DEF_P (chain->ref))
3527 if (dump_file)
3528 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3529 DF_REF_REGNO (chain->ref), uid);
3530 mark_dual_mode_def (chain->ref);
3532 else
3534 if (dump_file)
3535 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3536 DF_REF_REGNO (chain->ref), uid);
3537 mark_dual_mode_def (ref);
3542 /* Add instruction into a chain. */
3544 void
3545 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3547 if (bitmap_bit_p (insns, insn_uid))
3548 return;
3550 if (dump_file)
3551 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3553 bitmap_set_bit (insns, insn_uid);
3555 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3556 rtx def_set = single_set (insn);
3557 if (def_set && REG_P (SET_DEST (def_set))
3558 && !HARD_REGISTER_P (SET_DEST (def_set)))
3559 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3561 df_ref ref;
3562 df_ref def;
3563 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3564 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3565 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3566 def;
3567 def = DF_REF_NEXT_REG (def))
3568 analyze_register_chain (candidates, def);
3569 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3570 if (!DF_REF_REG_MEM_P (ref))
3571 analyze_register_chain (candidates, ref);
3574 /* Build new chain starting from insn INSN_UID recursively
3575 adding all dependent uses and definitions. */
3577 void
3578 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3580 queue = BITMAP_ALLOC (NULL);
3581 bitmap_set_bit (queue, insn_uid);
3583 if (dump_file)
3584 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3586 while (!bitmap_empty_p (queue))
3588 insn_uid = bitmap_first_set_bit (queue);
3589 bitmap_clear_bit (queue, insn_uid);
3590 bitmap_clear_bit (candidates, insn_uid);
3591 add_insn (candidates, insn_uid);
3594 if (dump_file)
3596 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3597 fprintf (dump_file, " insns: ");
3598 dump_bitmap (dump_file, insns);
3599 if (!bitmap_empty_p (defs_conv))
3601 bitmap_iterator bi;
3602 unsigned id;
3603 const char *comma = "";
3604 fprintf (dump_file, " defs to convert: ");
3605 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3607 fprintf (dump_file, "%sr%d", comma, id);
3608 comma = ", ";
3610 fprintf (dump_file, "\n");
3614 BITMAP_FREE (queue);
3617 /* Return the cost of building a vector constant
3618 instead of using a scalar one. */
3621 dimode_scalar_chain::vector_const_cost (rtx exp)
3623 gcc_assert (CONST_INT_P (exp));
3625 if (standard_sse_constant_p (exp, V2DImode))
3626 return COSTS_N_INSNS (1);
3627 return ix86_cost->sse_load[1];
3630 /* Compute a gain for chain conversion. */
3633 dimode_scalar_chain::compute_convert_gain ()
3635 bitmap_iterator bi;
3636 unsigned insn_uid;
3637 int gain = 0;
3638 int cost = 0;
3640 if (dump_file)
3641 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3643 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3645 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3646 rtx def_set = single_set (insn);
3647 rtx src = SET_SRC (def_set);
3648 rtx dst = SET_DEST (def_set);
3650 if (REG_P (src) && REG_P (dst))
3651 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3652 else if (REG_P (src) && MEM_P (dst))
3653 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3654 else if (MEM_P (src) && REG_P (dst))
3655 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3656 else if (GET_CODE (src) == ASHIFT
3657 || GET_CODE (src) == ASHIFTRT
3658 || GET_CODE (src) == LSHIFTRT)
3660 if (CONST_INT_P (XEXP (src, 0)))
3661 gain -= vector_const_cost (XEXP (src, 0));
3662 if (CONST_INT_P (XEXP (src, 1)))
3664 gain += ix86_cost->shift_const;
3665 if (INTVAL (XEXP (src, 1)) >= 32)
3666 gain -= COSTS_N_INSNS (1);
3668 else
3669 /* Additional gain for omitting two CMOVs. */
3670 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
3672 else if (GET_CODE (src) == PLUS
3673 || GET_CODE (src) == MINUS
3674 || GET_CODE (src) == IOR
3675 || GET_CODE (src) == XOR
3676 || GET_CODE (src) == AND)
3678 gain += ix86_cost->add;
3679 /* Additional gain for andnot for targets without BMI. */
3680 if (GET_CODE (XEXP (src, 0)) == NOT
3681 && !TARGET_BMI)
3682 gain += 2 * ix86_cost->add;
3684 if (CONST_INT_P (XEXP (src, 0)))
3685 gain -= vector_const_cost (XEXP (src, 0));
3686 if (CONST_INT_P (XEXP (src, 1)))
3687 gain -= vector_const_cost (XEXP (src, 1));
3689 else if (GET_CODE (src) == NEG
3690 || GET_CODE (src) == NOT)
3691 gain += ix86_cost->add - COSTS_N_INSNS (1);
3692 else if (GET_CODE (src) == COMPARE)
3694 /* Assume comparison cost is the same. */
3696 else if (CONST_INT_P (src))
3698 if (REG_P (dst))
3699 gain += COSTS_N_INSNS (2);
3700 else if (MEM_P (dst))
3701 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3702 gain -= vector_const_cost (src);
3704 else
3705 gcc_unreachable ();
3708 if (dump_file)
3709 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3711 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3712 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3714 if (dump_file)
3715 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3717 gain -= cost;
3719 if (dump_file)
3720 fprintf (dump_file, " Total gain: %d\n", gain);
3722 return gain;
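/* A minimal worked example of the computation above (the costs are
   symbolic and depend on the active cost table): for a chain built
   around a single "reg = reg1 | reg2" DImode insn, the insn itself
   contributes ix86_cost->add to the gain, while every register in
   defs_conv contributes DF_REG_DEF_COUNT * ix86_cost->mmxsse_to_integer
   to the cost of keeping a scalar copy; the chain is converted only
   when gain minus cost ends up positive.  */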
3725 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3728 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3730 if (x == reg)
3731 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3733 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3734 int i, j;
3735 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3737 if (fmt[i] == 'e')
3738 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3739 else if (fmt[i] == 'E')
3740 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3741 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3742 reg, new_reg);
3745 return x;
3748 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3750 void
3751 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3752 rtx reg, rtx new_reg)
3754 replace_with_subreg (single_set (insn), reg, new_reg);
3757 /* Insert the generated conversion instruction sequence INSNS
3758 after instruction AFTER. A new BB may be required if the
3759 instruction has an EH region attached. */
3761 void
3762 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3764 if (!control_flow_insn_p (after))
3766 emit_insn_after (insns, after);
3767 return;
3770 basic_block bb = BLOCK_FOR_INSN (after);
3771 edge e = find_fallthru_edge (bb->succs);
3772 gcc_assert (e);
3774 basic_block new_bb = split_edge (e);
3775 emit_insn_after (insns, BB_HEAD (new_bb));
3778 /* Make vector copies for all definitions of register REGNO
3779 and replace its uses in a chain. */
3781 void
3782 dimode_scalar_chain::make_vector_copies (unsigned regno)
3784 rtx reg = regno_reg_rtx[regno];
3785 rtx vreg = gen_reg_rtx (DImode);
3786 bool count_reg = false;
3787 df_ref ref;
3789 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3790 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3792 df_ref use;
3794 /* Detect the count register of a shift instruction. */
3795 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
3796 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
3798 rtx_insn *insn = DF_REF_INSN (use);
3799 rtx def_set = single_set (insn);
3801 gcc_assert (def_set);
3803 rtx src = SET_SRC (def_set);
3805 if ((GET_CODE (src) == ASHIFT
3806 || GET_CODE (src) == ASHIFTRT
3807 || GET_CODE (src) == LSHIFTRT)
3808 && !CONST_INT_P (XEXP (src, 1))
3809 && reg_or_subregno (XEXP (src, 1)) == regno)
3810 count_reg = true;
3813 start_sequence ();
3814 if (count_reg)
3816 rtx qreg = gen_lowpart (QImode, reg);
3817 rtx tmp = gen_reg_rtx (SImode);
3819 if (TARGET_ZERO_EXTEND_WITH_AND
3820 && optimize_function_for_speed_p (cfun))
3822 emit_move_insn (tmp, const0_rtx);
3823 emit_insn (gen_movstrictqi
3824 (gen_lowpart (QImode, tmp), qreg));
3826 else
3827 emit_insn (gen_rtx_SET
3828 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
3830 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3832 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
3833 emit_move_insn (slot, tmp);
3834 tmp = copy_rtx (slot);
3837 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
3839 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3841 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3842 emit_move_insn (adjust_address (tmp, SImode, 0),
3843 gen_rtx_SUBREG (SImode, reg, 0));
3844 emit_move_insn (adjust_address (tmp, SImode, 4),
3845 gen_rtx_SUBREG (SImode, reg, 4));
3846 emit_move_insn (vreg, tmp);
3848 else if (TARGET_SSE4_1)
3850 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3851 CONST0_RTX (V4SImode),
3852 gen_rtx_SUBREG (SImode, reg, 0)));
3853 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3854 gen_rtx_SUBREG (V4SImode, vreg, 0),
3855 gen_rtx_SUBREG (SImode, reg, 4),
3856 GEN_INT (2)));
3858 else
3860 rtx tmp = gen_reg_rtx (DImode);
3861 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3862 CONST0_RTX (V4SImode),
3863 gen_rtx_SUBREG (SImode, reg, 0)));
3864 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3865 CONST0_RTX (V4SImode),
3866 gen_rtx_SUBREG (SImode, reg, 4)));
3867 emit_insn (gen_vec_interleave_lowv4si
3868 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3869 gen_rtx_SUBREG (V4SImode, vreg, 0),
3870 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3872 rtx_insn *seq = get_insns ();
3873 end_sequence ();
3874 rtx_insn *insn = DF_REF_INSN (ref);
3875 emit_conversion_insns (seq, insn);
3877 if (dump_file)
3878 fprintf (dump_file,
3879 " Copied r%d to a vector register r%d for insn %d\n",
3880 regno, REGNO (vreg), INSN_UID (insn));
3883 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3884 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3886 rtx_insn *insn = DF_REF_INSN (ref);
3887 if (count_reg)
3889 rtx def_set = single_set (insn);
3890 gcc_assert (def_set);
3892 rtx src = SET_SRC (def_set);
3894 if ((GET_CODE (src) == ASHIFT
3895 || GET_CODE (src) == ASHIFTRT
3896 || GET_CODE (src) == LSHIFTRT)
3897 && !CONST_INT_P (XEXP (src, 1))
3898 && reg_or_subregno (XEXP (src, 1)) == regno)
3899 XEXP (src, 1) = vreg;
3901 else
3902 replace_with_subreg_in_insn (insn, reg, vreg);
3904 if (dump_file)
3905 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3906 regno, REGNO (vreg), INSN_UID (insn));
3910 /* Convert all definitions of register REGNO
3911 and fix its uses. Scalar copies may be created
3912 if the register is used in a non-convertible insn. */
3914 void
3915 dimode_scalar_chain::convert_reg (unsigned regno)
3917 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3918 rtx reg = regno_reg_rtx[regno];
3919 rtx scopy = NULL_RTX;
3920 df_ref ref;
3921 bitmap conv;
3923 conv = BITMAP_ALLOC (NULL);
3924 bitmap_copy (conv, insns);
3926 if (scalar_copy)
3927 scopy = gen_reg_rtx (DImode);
3929 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3931 rtx_insn *insn = DF_REF_INSN (ref);
3932 rtx def_set = single_set (insn);
3933 rtx src = SET_SRC (def_set);
3934 rtx reg = DF_REF_REG (ref);
3936 if (!MEM_P (src))
3938 replace_with_subreg_in_insn (insn, reg, reg);
3939 bitmap_clear_bit (conv, INSN_UID (insn));
3942 if (scalar_copy)
3944 start_sequence ();
3945 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
3947 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3948 emit_move_insn (tmp, reg);
3949 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3950 adjust_address (tmp, SImode, 0));
3951 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3952 adjust_address (tmp, SImode, 4));
3954 else if (TARGET_SSE4_1)
3956 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
3957 emit_insn
3958 (gen_rtx_SET
3959 (gen_rtx_SUBREG (SImode, scopy, 0),
3960 gen_rtx_VEC_SELECT (SImode,
3961 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3963 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
3964 emit_insn
3965 (gen_rtx_SET
3966 (gen_rtx_SUBREG (SImode, scopy, 4),
3967 gen_rtx_VEC_SELECT (SImode,
3968 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3970 else
3972 rtx vcopy = gen_reg_rtx (V2DImode);
3973 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3974 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3975 gen_rtx_SUBREG (SImode, vcopy, 0));
3976 emit_move_insn (vcopy,
3977 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3978 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3979 gen_rtx_SUBREG (SImode, vcopy, 0));
3981 rtx_insn *seq = get_insns ();
3982 end_sequence ();
3983 emit_conversion_insns (seq, insn);
3985 if (dump_file)
3986 fprintf (dump_file,
3987 " Copied r%d to a scalar register r%d for insn %d\n",
3988 regno, REGNO (scopy), INSN_UID (insn));
3992 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3993 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3995 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3997 rtx_insn *insn = DF_REF_INSN (ref);
3999 rtx def_set = single_set (insn);
4000 gcc_assert (def_set);
4002 rtx src = SET_SRC (def_set);
4003 rtx dst = SET_DEST (def_set);
4005 if ((GET_CODE (src) == ASHIFT
4006 || GET_CODE (src) == ASHIFTRT
4007 || GET_CODE (src) == LSHIFTRT)
4008 && !CONST_INT_P (XEXP (src, 1))
4009 && reg_or_subregno (XEXP (src, 1)) == regno)
4011 rtx tmp2 = gen_reg_rtx (V2DImode);
4013 start_sequence ();
4015 if (TARGET_SSE4_1)
4016 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
4017 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
4018 else
4020 rtx vec_cst
4021 = gen_rtx_CONST_VECTOR (V2DImode,
4022 gen_rtvec (2, GEN_INT (0xff),
4023 const0_rtx));
4024 vec_cst
4025 = validize_mem (force_const_mem (V2DImode, vec_cst));
4027 emit_insn (gen_rtx_SET
4028 (tmp2,
4029 gen_rtx_AND (V2DImode,
4030 gen_rtx_SUBREG (V2DImode, reg, 0),
4031 vec_cst)));
4033 rtx_insn *seq = get_insns ();
4034 end_sequence ();
4036 emit_insn_before (seq, insn);
4038 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
4040 else if (!MEM_P (dst) || !REG_P (src))
4041 replace_with_subreg_in_insn (insn, reg, reg);
4043 bitmap_clear_bit (conv, INSN_UID (insn));
4046 /* Skip debug insns and uninitialized uses. */
4047 else if (DF_REF_CHAIN (ref)
4048 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
4050 gcc_assert (scopy);
4051 replace_rtx (DF_REF_INSN (ref), reg, scopy);
4052 df_insn_rescan (DF_REF_INSN (ref));
4055 BITMAP_FREE (conv);
4058 /* Convert operand OP in INSN. We should handle
4059 memory operands and uninitialized registers.
4060 All other register uses are converted during
4061 register conversion. */
4063 void
4064 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
4066 *op = copy_rtx_if_shared (*op);
4068 if (GET_CODE (*op) == NOT)
4070 convert_op (&XEXP (*op, 0), insn);
4071 PUT_MODE (*op, V2DImode);
4073 else if (MEM_P (*op))
4075 rtx tmp = gen_reg_rtx (DImode);
4077 emit_insn_before (gen_move_insn (tmp, *op), insn);
4078 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
4080 if (dump_file)
4081 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
4082 INSN_UID (insn), REGNO (tmp));
4084 else if (REG_P (*op))
4086 /* We may not have converted this register use if
4087 the register has no definition. Otherwise it
4088 should have been converted in convert_reg. */
4089 df_ref ref;
4090 FOR_EACH_INSN_USE (ref, insn)
4091 if (DF_REF_REGNO (ref) == REGNO (*op))
4093 gcc_assert (!DF_REF_CHAIN (ref));
4094 break;
4096 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
4098 else if (CONST_INT_P (*op))
4100 rtx vec_cst;
4101 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
4103 /* Prefer all ones vector in case of -1. */
4104 if (constm1_operand (*op, GET_MODE (*op)))
4105 vec_cst = CONSTM1_RTX (V2DImode);
4106 else
4107 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
4108 gen_rtvec (2, *op, const0_rtx));
4110 if (!standard_sse_constant_p (vec_cst, V2DImode))
4112 start_sequence ();
4113 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
4114 rtx_insn *seq = get_insns ();
4115 end_sequence ();
4116 emit_insn_before (seq, insn);
4119 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
4120 *op = tmp;
4122 else
4124 gcc_assert (SUBREG_P (*op));
4125 gcc_assert (GET_MODE (*op) == V2DImode);
4129 /* Convert INSN to vector mode. */
4131 void
4132 dimode_scalar_chain::convert_insn (rtx_insn *insn)
4134 rtx def_set = single_set (insn);
4135 rtx src = SET_SRC (def_set);
4136 rtx dst = SET_DEST (def_set);
4137 rtx subreg;
4139 if (MEM_P (dst) && !REG_P (src))
4141 /* There are no scalar integer instructions and therefore
4142 temporary register usage is required. */
4143 rtx tmp = gen_reg_rtx (DImode);
4144 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
4145 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
4148 switch (GET_CODE (src))
4150 case ASHIFT:
4151 case ASHIFTRT:
4152 case LSHIFTRT:
4153 convert_op (&XEXP (src, 0), insn);
4154 PUT_MODE (src, V2DImode);
4155 break;
4157 case PLUS:
4158 case MINUS:
4159 case IOR:
4160 case XOR:
4161 case AND:
4162 convert_op (&XEXP (src, 0), insn);
4163 convert_op (&XEXP (src, 1), insn);
4164 PUT_MODE (src, V2DImode);
4165 break;
4167 case NEG:
4168 src = XEXP (src, 0);
4169 convert_op (&src, insn);
4170 subreg = gen_reg_rtx (V2DImode);
4171 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
4172 src = gen_rtx_MINUS (V2DImode, subreg, src);
4173 break;
4175 case NOT:
4176 src = XEXP (src, 0);
4177 convert_op (&src, insn);
4178 subreg = gen_reg_rtx (V2DImode);
4179 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
4180 src = gen_rtx_XOR (V2DImode, src, subreg);
4181 break;
4183 case MEM:
4184 if (!REG_P (dst))
4185 convert_op (&src, insn);
4186 break;
4188 case REG:
4189 if (!MEM_P (dst))
4190 convert_op (&src, insn);
4191 break;
4193 case SUBREG:
4194 gcc_assert (GET_MODE (src) == V2DImode);
4195 break;
4197 case COMPARE:
4198 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
4200 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
4201 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
4203 if (REG_P (src))
4204 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
4205 else
4206 subreg = copy_rtx_if_shared (src);
4207 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
4208 copy_rtx_if_shared (subreg),
4209 copy_rtx_if_shared (subreg)),
4210 insn);
4211 dst = gen_rtx_REG (CCmode, FLAGS_REG);
4212 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
4213 copy_rtx_if_shared (src)),
4214 UNSPEC_PTEST);
4215 break;
4217 case CONST_INT:
4218 convert_op (&src, insn);
4219 break;
4221 default:
4222 gcc_unreachable ();
4225 SET_SRC (def_set) = src;
4226 SET_DEST (def_set) = dst;
4228 /* Drop possible dead definitions. */
4229 PATTERN (insn) = def_set;
4231 INSN_CODE (insn) = -1;
4232 recog_memoized (insn);
4233 df_insn_rescan (insn);
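/* A schematic before/after sketch for the NOT case above (register
   numbers are made up): an insn like

     (set (reg:DI 90) (not:DI (reg:DI 91)))

   ends up, after register conversion, as a load of an all-ones
   V2DImode constant into a fresh register, say (reg:V2DI 92), followed by

     (set (subreg:V2DI (reg:DI 90) 0)
          (xor:V2DI (subreg:V2DI (reg:DI 91) 0) (reg:V2DI 92)))

   since SSE has no direct vector NOT instruction.  */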
4236 /* Fix uses of converted REG in debug insns. */
4238 void
4239 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
4241 if (!flag_var_tracking)
4242 return;
4244 df_ref ref, next;
4245 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
4247 rtx_insn *insn = DF_REF_INSN (ref);
4248 /* Make sure the next ref is for a different instruction,
4249 so that we're not affected by the rescan. */
4250 next = DF_REF_NEXT_REG (ref);
4251 while (next && DF_REF_INSN (next) == insn)
4252 next = DF_REF_NEXT_REG (next);
4254 if (DEBUG_INSN_P (insn))
4256 /* It may be a debug insn with a TImode variable in
4257 a register. */
4258 bool changed = false;
4259 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
4261 rtx *loc = DF_REF_LOC (ref);
4262 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
4264 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
4265 changed = true;
4268 if (changed)
4269 df_insn_rescan (insn);
4274 /* Convert INSN from TImode to V1TImode. */
4276 void
4277 timode_scalar_chain::convert_insn (rtx_insn *insn)
4279 rtx def_set = single_set (insn);
4280 rtx src = SET_SRC (def_set);
4281 rtx dst = SET_DEST (def_set);
4283 switch (GET_CODE (dst))
4285 case REG:
4287 rtx tmp = find_reg_equal_equiv_note (insn);
4288 if (tmp)
4289 PUT_MODE (XEXP (tmp, 0), V1TImode);
4290 PUT_MODE (dst, V1TImode);
4291 fix_debug_reg_uses (dst);
4293 break;
4294 case MEM:
4295 PUT_MODE (dst, V1TImode);
4296 break;
4298 default:
4299 gcc_unreachable ();
4302 switch (GET_CODE (src))
4304 case REG:
4305 PUT_MODE (src, V1TImode);
4306 /* Call fix_debug_reg_uses only if SRC is never defined. */
4307 if (!DF_REG_DEF_CHAIN (REGNO (src)))
4308 fix_debug_reg_uses (src);
4309 break;
4311 case MEM:
4312 PUT_MODE (src, V1TImode);
4313 break;
4315 case CONST_WIDE_INT:
4316 if (NONDEBUG_INSN_P (insn))
4318 /* Since there are no instructions to store a 128-bit constant,
4319 temporary register usage is required. */
4320 rtx tmp = gen_reg_rtx (V1TImode);
4321 start_sequence ();
4322 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
4323 src = validize_mem (force_const_mem (V1TImode, src));
4324 rtx_insn *seq = get_insns ();
4325 end_sequence ();
4326 if (seq)
4327 emit_insn_before (seq, insn);
4328 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4329 dst = tmp;
4331 break;
4333 case CONST_INT:
4334 switch (standard_sse_constant_p (src, TImode))
4336 case 1:
4337 src = CONST0_RTX (GET_MODE (dst));
4338 break;
4339 case 2:
4340 src = CONSTM1_RTX (GET_MODE (dst));
4341 break;
4342 default:
4343 gcc_unreachable ();
4345 if (NONDEBUG_INSN_P (insn))
4347 rtx tmp = gen_reg_rtx (V1TImode);
4348 /* Since there are no instructions to store a standard SSE
4349 constant, temporary register usage is required. */
4350 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4351 dst = tmp;
4353 break;
4355 default:
4356 gcc_unreachable ();
4359 SET_SRC (def_set) = src;
4360 SET_DEST (def_set) = dst;
4362 /* Drop possible dead definitions. */
4363 PATTERN (insn) = def_set;
4365 INSN_CODE (insn) = -1;
4366 recog_memoized (insn);
4367 df_insn_rescan (insn);
4370 void
4371 dimode_scalar_chain::convert_registers ()
4373 bitmap_iterator bi;
4374 unsigned id;
4376 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
4377 convert_reg (id);
4379 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
4380 make_vector_copies (id);
4383 /* Convert the whole chain, creating the required register
4384 conversions and copies. */
4387 scalar_chain::convert ()
4389 bitmap_iterator bi;
4390 unsigned id;
4391 int converted_insns = 0;
4393 if (!dbg_cnt (stv_conversion))
4394 return 0;
4396 if (dump_file)
4397 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
4399 convert_registers ();
4401 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
4403 convert_insn (DF_INSN_UID_GET (id)->insn);
4404 converted_insns++;
4407 return converted_insns;
4410 /* Main STV pass function. Find and convert scalar
4411 instructions into vector mode when profitable. */
4413 static unsigned int
4414 convert_scalars_to_vector ()
4416 basic_block bb;
4417 bitmap candidates;
4418 int converted_insns = 0;
4420 bitmap_obstack_initialize (NULL);
4421 candidates = BITMAP_ALLOC (NULL);
4423 calculate_dominance_info (CDI_DOMINATORS);
4424 df_set_flags (DF_DEFER_INSN_RESCAN);
4425 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
4426 df_md_add_problem ();
4427 df_analyze ();
4429 /* Find all instructions we want to convert into vector mode. */
4430 if (dump_file)
4431 fprintf (dump_file, "Searching for mode conversion candidates...\n");
4433 FOR_EACH_BB_FN (bb, cfun)
4435 rtx_insn *insn;
4436 FOR_BB_INSNS (bb, insn)
4437 if (scalar_to_vector_candidate_p (insn))
4439 if (dump_file)
4440 fprintf (dump_file, " insn %d is marked as a candidate\n",
4441 INSN_UID (insn));
4443 bitmap_set_bit (candidates, INSN_UID (insn));
4447 remove_non_convertible_regs (candidates);
4449 if (bitmap_empty_p (candidates))
4450 if (dump_file)
4451 fprintf (dump_file, "There are no candidates for optimization.\n");
4453 while (!bitmap_empty_p (candidates))
4455 unsigned uid = bitmap_first_set_bit (candidates);
4456 scalar_chain *chain;
4458 if (TARGET_64BIT)
4459 chain = new timode_scalar_chain;
4460 else
4461 chain = new dimode_scalar_chain;
4463 /* Find the instruction chain we want to convert to vector mode.
4464 Check all uses and definitions to estimate all required
4465 conversions. */
4466 chain->build (candidates, uid);
4468 if (chain->compute_convert_gain () > 0)
4469 converted_insns += chain->convert ();
4470 else
4471 if (dump_file)
4472 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4473 chain->chain_id);
4475 delete chain;
4478 if (dump_file)
4479 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4481 BITMAP_FREE (candidates);
4482 bitmap_obstack_release (NULL);
4483 df_process_deferred_rescans ();
4485 /* Conversion means we may have 128-bit register spills/fills,
4486 which require an aligned stack. */
4487 if (converted_insns)
4489 if (crtl->stack_alignment_needed < 128)
4490 crtl->stack_alignment_needed = 128;
4491 if (crtl->stack_alignment_estimated < 128)
4492 crtl->stack_alignment_estimated = 128;
4493 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
4494 if (TARGET_64BIT)
4495 for (tree parm = DECL_ARGUMENTS (current_function_decl);
4496 parm; parm = DECL_CHAIN (parm))
4498 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
4499 continue;
4500 if (DECL_RTL_SET_P (parm)
4501 && GET_MODE (DECL_RTL (parm)) == V1TImode)
4503 rtx r = DECL_RTL (parm);
4504 if (REG_P (r))
4505 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
4507 if (DECL_INCOMING_RTL (parm)
4508 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
4510 rtx r = DECL_INCOMING_RTL (parm);
4511 if (REG_P (r))
4512 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
4517 return 0;
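/* A usage sketch, assuming the usual GCC dump-file conventions: the
   effect of this pass on a testcase can be inspected by compiling
   32-bit code with something like

     gcc -m32 -O2 -msse2 -mstv -fdump-rtl-all test.c

   (test.c being any source file) and reading the stv dump among the
   generated RTL dumps; it records the candidate insns, the chains that
   were built, and the computed gain for each chain.  */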
4520 namespace {
4522 const pass_data pass_data_insert_vzeroupper =
4524 RTL_PASS, /* type */
4525 "vzeroupper", /* name */
4526 OPTGROUP_NONE, /* optinfo_flags */
4527 TV_MACH_DEP, /* tv_id */
4528 0, /* properties_required */
4529 0, /* properties_provided */
4530 0, /* properties_destroyed */
4531 0, /* todo_flags_start */
4532 TODO_df_finish, /* todo_flags_finish */
4535 class pass_insert_vzeroupper : public rtl_opt_pass
4537 public:
4538 pass_insert_vzeroupper(gcc::context *ctxt)
4539 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4542 /* opt_pass methods: */
4543 virtual bool gate (function *)
4545 return TARGET_AVX && !TARGET_AVX512F
4546 && TARGET_VZEROUPPER && flag_expensive_optimizations
4547 && !optimize_size;
4550 virtual unsigned int execute (function *)
4552 return rest_of_handle_insert_vzeroupper ();
4555 }; // class pass_insert_vzeroupper
4557 const pass_data pass_data_stv =
4559 RTL_PASS, /* type */
4560 "stv", /* name */
4561 OPTGROUP_NONE, /* optinfo_flags */
4562 TV_MACH_DEP, /* tv_id */
4563 0, /* properties_required */
4564 0, /* properties_provided */
4565 0, /* properties_destroyed */
4566 0, /* todo_flags_start */
4567 TODO_df_finish, /* todo_flags_finish */
4570 class pass_stv : public rtl_opt_pass
4572 public:
4573 pass_stv (gcc::context *ctxt)
4574 : rtl_opt_pass (pass_data_stv, ctxt),
4575 timode_p (false)
4578 /* opt_pass methods: */
4579 virtual bool gate (function *)
4581 return (timode_p == !!TARGET_64BIT
4582 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4585 virtual unsigned int execute (function *)
4587 return convert_scalars_to_vector ();
4590 opt_pass *clone ()
4592 return new pass_stv (m_ctxt);
4595 void set_pass_param (unsigned int n, bool param)
4597 gcc_assert (n == 0);
4598 timode_p = param;
4601 private:
4602 bool timode_p;
4603 }; // class pass_stv
4605 } // anon namespace
4607 rtl_opt_pass *
4608 make_pass_insert_vzeroupper (gcc::context *ctxt)
4610 return new pass_insert_vzeroupper (ctxt);
4613 rtl_opt_pass *
4614 make_pass_stv (gcc::context *ctxt)
4616 return new pass_stv (ctxt);
4619 /* Return true if a red-zone is in use. */
4621 bool
4622 ix86_using_red_zone (void)
4624 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4627 /* Return a string that documents the current -m options. The caller is
4628 responsible for freeing the string. */
4630 static char *
4631 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
4632 int flags, int flags2,
4633 const char *arch, const char *tune,
4634 enum fpmath_unit fpmath, bool add_nl_p)
4636 struct ix86_target_opts
4638 const char *option; /* option string */
4639 HOST_WIDE_INT mask; /* isa mask options */
4642 /* This table is ordered so that options like -msse4.2 that imply other
4643 ISAs come first. The target string will be displayed in the same order. */
4644 static struct ix86_target_opts isa2_opts[] =
4646 { "-mrdpid", OPTION_MASK_ISA_RDPID },
4647 { "-msgx", OPTION_MASK_ISA_SGX },
4648 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
4649 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
4650 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }
4652 static struct ix86_target_opts isa_opts[] =
4654 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4655 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4656 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4657 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4658 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4659 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4660 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4661 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4662 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4663 { "-mavx2", OPTION_MASK_ISA_AVX2 },
4664 { "-mfma", OPTION_MASK_ISA_FMA },
4665 { "-mxop", OPTION_MASK_ISA_XOP },
4666 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4667 { "-mf16c", OPTION_MASK_ISA_F16C },
4668 { "-mavx", OPTION_MASK_ISA_AVX },
4669 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
4670 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4671 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4672 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4673 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4674 { "-msse3", OPTION_MASK_ISA_SSE3 },
4675 { "-maes", OPTION_MASK_ISA_AES },
4676 { "-msha", OPTION_MASK_ISA_SHA },
4677 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4678 { "-msse2", OPTION_MASK_ISA_SSE2 },
4679 { "-msse", OPTION_MASK_ISA_SSE },
4680 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4681 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4682 { "-mmmx", OPTION_MASK_ISA_MMX },
4683 { "-mrtm", OPTION_MASK_ISA_RTM },
4684 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4685 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4686 { "-madx", OPTION_MASK_ISA_ADX },
4687 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4688 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4689 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4690 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4691 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4692 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4693 { "-mabm", OPTION_MASK_ISA_ABM },
4694 { "-mbmi", OPTION_MASK_ISA_BMI },
4695 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4696 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4697 { "-mtbm", OPTION_MASK_ISA_TBM },
4698 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4699 { "-mcx16", OPTION_MASK_ISA_CX16 },
4700 { "-msahf", OPTION_MASK_ISA_SAHF },
4701 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4702 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4703 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4704 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4705 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4706 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4707 { "-mpku", OPTION_MASK_ISA_PKU },
4708 { "-mlwp", OPTION_MASK_ISA_LWP },
4709 { "-mhle", OPTION_MASK_ISA_HLE },
4710 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4711 { "-mmpx", OPTION_MASK_ISA_MPX },
4712 { "-mclwb", OPTION_MASK_ISA_CLWB }
4715 /* Flag options. */
4716 static struct ix86_target_opts flag_opts[] =
4718 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4719 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4720 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4721 { "-m80387", MASK_80387 },
4722 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4723 { "-malign-double", MASK_ALIGN_DOUBLE },
4724 { "-mcld", MASK_CLD },
4725 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4726 { "-mieee-fp", MASK_IEEE_FP },
4727 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4728 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4729 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4730 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4731 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4732 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4733 { "-mno-red-zone", MASK_NO_RED_ZONE },
4734 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4735 { "-mrecip", MASK_RECIP },
4736 { "-mrtd", MASK_RTD },
4737 { "-msseregparm", MASK_SSEREGPARM },
4738 { "-mstack-arg-probe", MASK_STACK_PROBE },
4739 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4740 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4741 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4742 { "-mvzeroupper", MASK_VZEROUPPER },
4743 { "-mstv", MASK_STV },
4744 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
4745 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
4746 { "-mprefer-avx128", MASK_PREFER_AVX128 },
4747 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
4750 /* Additional flag options. */
4751 static struct ix86_target_opts flag2_opts[] =
4753 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4756 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
4757 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
4759 char isa_other[40];
4760 char isa2_other[40];
4761 char flags_other[40];
4762 char flags2_other[40];
4763 unsigned num = 0;
4764 unsigned i, j;
4765 char *ret;
4766 char *ptr;
4767 size_t len;
4768 size_t line_len;
4769 size_t sep_len;
4770 const char *abi;
4772 memset (opts, '\0', sizeof (opts));
4774 /* Add -march= option. */
4775 if (arch)
4777 opts[num][0] = "-march=";
4778 opts[num++][1] = arch;
4781 /* Add -mtune= option. */
4782 if (tune)
4784 opts[num][0] = "-mtune=";
4785 opts[num++][1] = tune;
4788 /* Add -m32/-m64/-mx32. */
4789 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4791 if ((isa & OPTION_MASK_ABI_64) != 0)
4792 abi = "-m64";
4793 else
4794 abi = "-mx32";
4795 isa &= ~ (OPTION_MASK_ISA_64BIT
4796 | OPTION_MASK_ABI_64
4797 | OPTION_MASK_ABI_X32);
4799 else
4800 abi = "-m32";
4801 opts[num++][0] = abi;
4803 /* Pick out the options in isa2 options. */
4804 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
4806 if ((isa2 & isa2_opts[i].mask) != 0)
4808 opts[num++][0] = isa2_opts[i].option;
4809 isa2 &= ~ isa2_opts[i].mask;
4813 if (isa2 && add_nl_p)
4815 opts[num++][0] = isa2_other;
4816 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
4819 /* Pick out the options in isa options. */
4820 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4822 if ((isa & isa_opts[i].mask) != 0)
4824 opts[num++][0] = isa_opts[i].option;
4825 isa &= ~ isa_opts[i].mask;
4829 if (isa && add_nl_p)
4831 opts[num++][0] = isa_other;
4832 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
4835 /* Add flag options. */
4836 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4838 if ((flags & flag_opts[i].mask) != 0)
4840 opts[num++][0] = flag_opts[i].option;
4841 flags &= ~ flag_opts[i].mask;
4845 if (flags && add_nl_p)
4847 opts[num++][0] = flags_other;
4848 sprintf (flags_other, "(other flags: %#x)", flags);
4851 /* Add additional flag options. */
4852 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
4854 if ((flags2 & flag2_opts[i].mask) != 0)
4856 opts[num++][0] = flag2_opts[i].option;
4857 flags2 &= ~ flag2_opts[i].mask;
4861 if (flags2 && add_nl_p)
4863 opts[num++][0] = flags2_other;
4864 sprintf (flags2_other, "(other flags2: %#x)", flags2);
4867 /* Add -fpmath= option. */
4868 if (fpmath)
4870 opts[num][0] = "-mfpmath=";
4871 switch ((int) fpmath)
4873 case FPMATH_387:
4874 opts[num++][1] = "387";
4875 break;
4877 case FPMATH_SSE:
4878 opts[num++][1] = "sse";
4879 break;
4881 case FPMATH_387 | FPMATH_SSE:
4882 opts[num++][1] = "sse+387";
4883 break;
4885 default:
4886 gcc_unreachable ();
4890 /* Any options? */
4891 if (num == 0)
4892 return NULL;
4894 gcc_assert (num < ARRAY_SIZE (opts));
4896 /* Size the string. */
4897 len = 0;
4898 sep_len = (add_nl_p) ? 3 : 1;
4899 for (i = 0; i < num; i++)
4901 len += sep_len;
4902 for (j = 0; j < 2; j++)
4903 if (opts[i][j])
4904 len += strlen (opts[i][j]);
4907 /* Build the string. */
4908 ret = ptr = (char *) xmalloc (len);
4909 line_len = 0;
4911 for (i = 0; i < num; i++)
4913 size_t len2[2];
4915 for (j = 0; j < 2; j++)
4916 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4918 if (i != 0)
4920 *ptr++ = ' ';
4921 line_len++;
4923 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4925 *ptr++ = '\\';
4926 *ptr++ = '\n';
4927 line_len = 0;
4931 for (j = 0; j < 2; j++)
4932 if (opts[i][j])
4934 memcpy (ptr, opts[i][j], len2[j]);
4935 ptr += len2[j];
4936 line_len += len2[j];
4940 *ptr = '\0';
4941 gcc_assert (ret + len >= ptr);
4943 return ret;
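/* A hypothetical sketch of the result: for a 64-bit AVX2 configuration
   the returned string would read something along the lines of

     "-march=haswell -mtune=haswell -m64 -mavx2 ... -mfpmath=sse"

   i.e. -march=/-mtune= first, then the ABI option, then the ISA and
   flag options taken from the tables above, and -mfpmath= last; the
   exact contents depend on the masks passed in.  */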
4946 /* Return true if profiling code should be emitted before the
4947 prologue, and false otherwise.
4948 Note: for x86 with "hotfix" it is sorried (not supported). */
4949 static bool
4950 ix86_profile_before_prologue (void)
4952 return flag_fentry != 0;
4955 /* Function that is callable from the debugger to print the current
4956 options. */
4957 void ATTRIBUTE_UNUSED
4958 ix86_debug_options (void)
4960 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
4961 target_flags, ix86_target_flags,
4962 ix86_arch_string,ix86_tune_string,
4963 ix86_fpmath, true);
4965 if (opts)
4967 fprintf (stderr, "%s\n\n", opts);
4968 free (opts);
4970 else
4971 fputs ("<no options>\n\n", stderr);
4973 return;
4976 /* Return true if T is one of the bytes we should avoid with
4977 -fmitigate-rop. */
4979 static bool
4980 ix86_rop_should_change_byte_p (int t)
4982 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
4985 static const char *stringop_alg_names[] = {
4986 #define DEF_ENUM
4987 #define DEF_ALG(alg, name) #name,
4988 #include "stringop.def"
4989 #undef DEF_ENUM
4990 #undef DEF_ALG
4993 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4994 The string is of the following form (or comma separated list of it):
4996 strategy_alg:max_size:[align|noalign]
4998 where the full size range for the strategy is either [0, max_size] or
4999 [min_size, max_size], in which min_size is the max_size + 1 of the
5000 preceding range. The last size range must have max_size == -1.
5002 Examples:
5005 -mmemcpy-strategy=libcall:-1:noalign
5007 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
5011 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
5013 This is to tell the compiler to use the following strategy for memset
5014 1) when the expected size is between [1, 16], use rep_8byte strategy;
5015 2) when the size is between [17, 2048], use vector_loop;
5016 3) when the size is > 2048, use libcall. */
5018 struct stringop_size_range
5020 int max;
5021 stringop_alg alg;
5022 bool noalign;
5025 static void
5026 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
5028 const struct stringop_algs *default_algs;
5029 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
5030 char *curr_range_str, *next_range_str;
5031 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
5032 int i = 0, n = 0;
5034 if (is_memset)
5035 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
5036 else
5037 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
5039 curr_range_str = strategy_str;
5043 int maxs;
5044 char alg_name[128];
5045 char align[16];
5046 next_range_str = strchr (curr_range_str, ',');
5047 if (next_range_str)
5048 *next_range_str++ = '\0';
5050 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
5051 alg_name, &maxs, align))
5053 error ("wrong argument %qs to option %qs", curr_range_str, opt);
5054 return;
5057 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
5059 error ("size ranges of option %qs should be increasing", opt);
5060 return;
5063 for (i = 0; i < last_alg; i++)
5064 if (!strcmp (alg_name, stringop_alg_names[i]))
5065 break;
5067 if (i == last_alg)
5069 error ("wrong strategy name %qs specified for option %qs",
5070 alg_name, opt);
5072 auto_vec <const char *> candidates;
5073 for (i = 0; i < last_alg; i++)
5074 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
5075 candidates.safe_push (stringop_alg_names[i]);
5077 char *s;
5078 const char *hint
5079 = candidates_list_and_hint (alg_name, s, candidates);
5080 if (hint)
5081 inform (input_location,
5082 "valid arguments to %qs are: %s; did you mean %qs?",
5083 opt, s, hint);
5084 else
5085 inform (input_location, "valid arguments to %qs are: %s",
5086 opt, s);
5087 XDELETEVEC (s);
5088 return;
5091 if ((stringop_alg) i == rep_prefix_8_byte
5092 && !TARGET_64BIT)
5094 /* rep; movq isn't available in 32-bit code. */
5095 error ("strategy name %qs specified for option %qs "
5096 "not supported for 32-bit code", alg_name, opt);
5097 return;
5100 input_ranges[n].max = maxs;
5101 input_ranges[n].alg = (stringop_alg) i;
5102 if (!strcmp (align, "align"))
5103 input_ranges[n].noalign = false;
5104 else if (!strcmp (align, "noalign"))
5105 input_ranges[n].noalign = true;
5106 else
5108 error ("unknown alignment %qs specified for option %qs", align, opt);
5109 return;
5111 n++;
5112 curr_range_str = next_range_str;
5114 while (curr_range_str);
5116 if (input_ranges[n - 1].max != -1)
5118 error ("the max value for the last size range should be -1"
5119 " for option %qs", opt);
5120 return;
5123 if (n > MAX_STRINGOP_ALGS)
5125 error ("too many size ranges specified in option %qs", opt);
5126 return;
5129 /* Now override the default algs array. */
5130 for (i = 0; i < n; i++)
5132 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
5133 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
5134 = input_ranges[i].alg;
5135 *const_cast<int *>(&default_algs->size[i].noalign)
5136 = input_ranges[i].noalign;
5141 /* Parse the -mtune-ctrl= option.  When DUMP is true,
5142 print the features that are explicitly set. */
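/* A usage sketch (the feature names here are placeholders; the real names
   come from ix86_tune_feature_names): -mtune-ctrl=featA,^featB sets
   ix86_tune_features for featA and clears it for featB, since a leading '^'
   requests clearing the feature.  */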
5144 static void
5145 parse_mtune_ctrl_str (bool dump)
5147 if (!ix86_tune_ctrl_string)
5148 return;
5150 char *next_feature_string = NULL;
5151 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
5152 char *orig = curr_feature_string;
5153 int i;
5156 bool clear = false;
5158 next_feature_string = strchr (curr_feature_string, ',');
5159 if (next_feature_string)
5160 *next_feature_string++ = '\0';
5161 if (*curr_feature_string == '^')
5163 curr_feature_string++;
5164 clear = true;
5166 for (i = 0; i < X86_TUNE_LAST; i++)
5168 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
5170 ix86_tune_features[i] = !clear;
5171 if (dump)
5172 fprintf (stderr, "Explicitly %s feature %s\n",
5173 clear ? "clear" : "set", ix86_tune_feature_names[i]);
5174 break;
5177 if (i == X86_TUNE_LAST)
5178 error ("Unknown parameter to option -mtune-ctrl: %s",
5179 clear ? curr_feature_string - 1 : curr_feature_string);
5180 curr_feature_string = next_feature_string;
5182 while (curr_feature_string);
5183 free (orig);
5186 /* Helper function to set ix86_tune_features. IX86_TUNE is the
5187 processor type. */
5189 static void
5190 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
5192 unsigned int ix86_tune_mask = 1u << ix86_tune;
5193 int i;
5195 for (i = 0; i < X86_TUNE_LAST; ++i)
5197 if (ix86_tune_no_default)
5198 ix86_tune_features[i] = 0;
5199 else
5200 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
5203 if (dump)
5205 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
5206 for (i = 0; i < X86_TUNE_LAST; i++)
5207 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
5208 ix86_tune_features[i] ? "on" : "off");
5211 parse_mtune_ctrl_str (dump);
5215 /* Default align_* from the processor table. */
5217 static void
5218 ix86_default_align (struct gcc_options *opts)
5220 if (opts->x_align_loops == 0)
5222 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
5223 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
5225 if (opts->x_align_jumps == 0)
5227 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
5228 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
5230 if (opts->x_align_functions == 0)
5232 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
5236 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
5238 static void
5239 ix86_override_options_after_change (void)
5241 ix86_default_align (&global_options);
5244 /* Override various settings based on options.  If MAIN_ARGS_P, the
5245 options are from the command line, otherwise they are from
5246 attributes.  Return false if there is an error related to the
5247 -march= option. */
5249 static bool
5250 ix86_option_override_internal (bool main_args_p,
5251 struct gcc_options *opts,
5252 struct gcc_options *opts_set)
5254 int i;
5255 unsigned int ix86_arch_mask;
5256 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
5258 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
5259 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
5260 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
5261 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
5262 #define PTA_AES (HOST_WIDE_INT_1 << 4)
5263 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
5264 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
5265 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
5266 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
5267 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
5268 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
5269 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
5270 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
5271 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
5272 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
5273 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
5274 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
5275 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
5276 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
5277 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
5278 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
5279 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
5280 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
5281 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
5282 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
5283 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
5284 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
5285 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
5286 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
5287 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
5288 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
5289 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
5290 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
5291 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
5292 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
5293 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
5294 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
5295 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
5296 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
5297 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
5298 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
5299 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
5300 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
5301 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
5302 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
5303 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
5304 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
5305 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
5306 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
5307 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
5308 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
5309 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
5310 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
5311 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
5312 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
5313 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
5314 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
5315 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
5316 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
5317 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
5318 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
5319 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
5320 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
5321 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
5323 #define PTA_CORE2 \
5324 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
5325 | PTA_CX16 | PTA_FXSR)
5326 #define PTA_NEHALEM \
5327 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
5328 #define PTA_WESTMERE \
5329 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
5330 #define PTA_SANDYBRIDGE \
5331 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
5332 #define PTA_IVYBRIDGE \
5333 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
5334 #define PTA_HASWELL \
5335 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
5336 | PTA_FMA | PTA_MOVBE | PTA_HLE)
5337 #define PTA_BROADWELL \
5338 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
5339 #define PTA_SKYLAKE \
5340 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
5341 #define PTA_SKYLAKE_AVX512 \
5342 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
5343 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
5344 #define PTA_KNL \
5345 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
5346 #define PTA_BONNELL \
5347 (PTA_CORE2 | PTA_MOVBE)
5348 #define PTA_SILVERMONT \
5349 (PTA_WESTMERE | PTA_MOVBE)
5351 /* If this reaches 64, we need to widen the pta flags field below. */
5353 static struct pta
5355 const char *const name; /* processor name or nickname. */
5356 const enum processor_type processor;
5357 const enum attr_cpu schedule;
5358 const unsigned HOST_WIDE_INT flags;
5360 const processor_alias_table[] =
5362 {"i386", PROCESSOR_I386, CPU_NONE, 0},
5363 {"i486", PROCESSOR_I486, CPU_NONE, 0},
5364 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5365 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5366 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
5367 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
5368 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
5369 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5370 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5371 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5372 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5373 PTA_MMX | PTA_SSE | PTA_FXSR},
5374 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5375 PTA_MMX | PTA_SSE | PTA_FXSR},
5376 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5377 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5378 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5379 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5380 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5381 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5382 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
5383 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5384 PTA_MMX | PTA_SSE | PTA_FXSR},
5385 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5386 PTA_MMX | PTA_SSE | PTA_FXSR},
5387 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5388 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5389 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
5390 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
5391 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
5392 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5393 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
5394 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5395 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
5396 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5397 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
5398 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
5399 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5400 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5401 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
5402 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5403 PTA_SANDYBRIDGE},
5404 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5405 PTA_SANDYBRIDGE},
5406 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5407 PTA_IVYBRIDGE},
5408 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5409 PTA_IVYBRIDGE},
5410 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5411 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5412 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
5413 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
5414 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
5415 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5416 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5417 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5418 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5419 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
5420 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
5421 {"geode", PROCESSOR_GEODE, CPU_GEODE,
5422 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5423 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
5424 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5425 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5426 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
5427 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5428 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
5429 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5430 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
5431 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5432 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
5433 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5434 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
5435 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5436 {"x86-64", PROCESSOR_K8, CPU_K8,
5437 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5438 {"eden-x2", PROCESSOR_K8, CPU_K8,
5439 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5440 {"nano", PROCESSOR_K8, CPU_K8,
5441 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5442 | PTA_SSSE3 | PTA_FXSR},
5443 {"nano-1000", PROCESSOR_K8, CPU_K8,
5444 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5445 | PTA_SSSE3 | PTA_FXSR},
5446 {"nano-2000", PROCESSOR_K8, CPU_K8,
5447 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5448 | PTA_SSSE3 | PTA_FXSR},
5449 {"nano-3000", PROCESSOR_K8, CPU_K8,
5450 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5451 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5452 {"nano-x2", PROCESSOR_K8, CPU_K8,
5453 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5454 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5455 {"eden-x4", PROCESSOR_K8, CPU_K8,
5456 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5457 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5458 {"nano-x4", PROCESSOR_K8, CPU_K8,
5459 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5460 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5461 {"k8", PROCESSOR_K8, CPU_K8,
5462 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5463 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5464 {"k8-sse3", PROCESSOR_K8, CPU_K8,
5465 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5466 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5467 {"opteron", PROCESSOR_K8, CPU_K8,
5468 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5469 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5470 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
5471 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5472 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5473 {"athlon64", PROCESSOR_K8, CPU_K8,
5474 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5475 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5476 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
5477 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5478 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5479 {"athlon-fx", PROCESSOR_K8, CPU_K8,
5480 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5481 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5482 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5483 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5484 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5485 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5486 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5487 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5488 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
5489 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5490 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5491 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5492 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5493 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
5494 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5495 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5496 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5497 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5498 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5499 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
5500 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5501 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5502 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5503 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5504 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5505 | PTA_XSAVEOPT | PTA_FSGSBASE},
5506 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5507 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5508 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5509 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5510 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5511 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5512 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5513 | PTA_MOVBE | PTA_MWAITX},
5514 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5515 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5516 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5517 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5518 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5519 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5520 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5521 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5522 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5523 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5524 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5525 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
5526 | PTA_FXSR | PTA_XSAVE},
5527 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5528 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5529 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
5530 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5531 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5532 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5534 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5535 PTA_64BIT
5536 | PTA_HLE /* flags are only used for -march switch. */ },
5539 /* -mrecip options. */
5540 static struct
5542 const char *string; /* option name */
5543 unsigned int mask; /* mask bits to set */
5545 const recip_options[] =
5547 { "all", RECIP_MASK_ALL },
5548 { "none", RECIP_MASK_NONE },
5549 { "div", RECIP_MASK_DIV },
5550 { "sqrt", RECIP_MASK_SQRT },
5551 { "vec-div", RECIP_MASK_VEC_DIV },
5552 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5555 int const pta_size = ARRAY_SIZE (processor_alias_table);
5557 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5558 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5559 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5560 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5561 #ifdef TARGET_BI_ARCH
5562 else
5564 #if TARGET_BI_ARCH == 1
5565 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5566 is on and OPTION_MASK_ABI_X32 is off. We turn off
5567 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5568 -mx32. */
5569 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5570 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5571 #else
5572 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5573 on and OPTION_MASK_ABI_64 is off. We turn off
5574 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5575 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5576 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5577 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5578 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5579 #endif
5580 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5581 && TARGET_IAMCU_P (opts->x_target_flags))
5582 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5583 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5585 #endif
5587 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5589 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5590 OPTION_MASK_ABI_64 for TARGET_X32. */
5591 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5592 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5594 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5595 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5596 | OPTION_MASK_ABI_X32
5597 | OPTION_MASK_ABI_64);
5598 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5600 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5601 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5602 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5603 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5606 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5607 SUBTARGET_OVERRIDE_OPTIONS;
5608 #endif
5610 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5611 SUBSUBTARGET_OVERRIDE_OPTIONS;
5612 #endif
5614 /* -fPIC (flag_pic == 2) is the default for 64-bit Mach-O (Darwin) targets. */
5615 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5616 opts->x_flag_pic = 2;
5618 /* Need to check -mtune=generic first. */
5619 if (opts->x_ix86_tune_string)
5621 /* As special support for cross compilers we read -mtune=native
5622 as -mtune=generic. With native compilers we won't see the
5623 -mtune=native, as it was changed by the driver. */
5624 if (!strcmp (opts->x_ix86_tune_string, "native"))
5626 opts->x_ix86_tune_string = "generic";
5628 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5629 warning (OPT_Wdeprecated,
5630 main_args_p
5631 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5632 "or %<-mtune=generic%> instead as appropriate")
5633 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
5634 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
5635 " instead as appropriate"));
5637 else
5639 if (opts->x_ix86_arch_string)
5640 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5641 if (!opts->x_ix86_tune_string)
5643 opts->x_ix86_tune_string
5644 = processor_target_table[TARGET_CPU_DEFAULT].name;
5645 ix86_tune_defaulted = 1;
5648 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5649 or defaulted. We need to use a sensible tune option. */
5650 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5652 opts->x_ix86_tune_string = "generic";
5656 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5657 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5659 /* rep; movq isn't available in 32-bit code. */
5660 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5661 opts->x_ix86_stringop_alg = no_stringop;
5664 if (!opts->x_ix86_arch_string)
5665 opts->x_ix86_arch_string
5666 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5667 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5668 else
5669 ix86_arch_specified = 1;
5671 if (opts_set->x_ix86_pmode)
5673 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5674 && opts->x_ix86_pmode == PMODE_SI)
5675 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5676 && opts->x_ix86_pmode == PMODE_DI))
5677 error ("address mode %qs not supported in the %s bit mode",
5678 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5679 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5681 else
5682 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5683 ? PMODE_DI : PMODE_SI;
5685 if (!opts_set->x_ix86_abi)
5686 opts->x_ix86_abi = DEFAULT_ABI;
5688 /* For targets using the MS ABI, enable ms-extensions unless they were
5689 explicitly turned off.  For non-MS ABIs we turn this
5690 option off. */
5691 if (!opts_set->x_flag_ms_extensions)
5692 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
5694 if (opts_set->x_ix86_cmodel)
5696 switch (opts->x_ix86_cmodel)
5698 case CM_SMALL:
5699 case CM_SMALL_PIC:
5700 if (opts->x_flag_pic)
5701 opts->x_ix86_cmodel = CM_SMALL_PIC;
5702 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5703 error ("code model %qs not supported in the %s bit mode",
5704 "small", "32");
5705 break;
5707 case CM_MEDIUM:
5708 case CM_MEDIUM_PIC:
5709 if (opts->x_flag_pic)
5710 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5711 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5712 error ("code model %qs not supported in the %s bit mode",
5713 "medium", "32");
5714 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5715 error ("code model %qs not supported in x32 mode",
5716 "medium");
5717 break;
5719 case CM_LARGE:
5720 case CM_LARGE_PIC:
5721 if (opts->x_flag_pic)
5722 opts->x_ix86_cmodel = CM_LARGE_PIC;
5723 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5724 error ("code model %qs not supported in the %s bit mode",
5725 "large", "32");
5726 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5727 error ("code model %qs not supported in x32 mode",
5728 "large");
5729 break;
5731 case CM_32:
5732 if (opts->x_flag_pic)
5733 error ("code model %s does not support PIC mode", "32");
5734 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5735 error ("code model %qs not supported in the %s bit mode",
5736 "32", "64");
5737 break;
5739 case CM_KERNEL:
5740 if (opts->x_flag_pic)
5742 error ("code model %s does not support PIC mode", "kernel");
5743 opts->x_ix86_cmodel = CM_32;
5745 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5746 error ("code model %qs not supported in the %s bit mode",
5747 "kernel", "32");
5748 break;
5750 default:
5751 gcc_unreachable ();
5754 else
5756 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5757 use of rip-relative addressing. This eliminates fixups that
5758 would otherwise be needed if this object is to be placed in a
5759 DLL, and is essentially just as efficient as direct addressing. */
5760 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5761 && (TARGET_RDOS || TARGET_PECOFF))
5762 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5763 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5764 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5765 else
5766 opts->x_ix86_cmodel = CM_32;
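/* For illustration: with no -mcmodel= option, plain 64-bit -fPIC code thus
   defaults to CM_SMALL_PIC (CM_MEDIUM_PIC for PE-COFF/RDOS targets), non-PIC
   64-bit code to CM_SMALL, and 32-bit code to CM_32.  */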
5768 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5770 error ("-masm=intel not supported in this configuration");
5771 opts->x_ix86_asm_dialect = ASM_ATT;
5773 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5774 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5775 sorry ("%i-bit mode not compiled in",
5776 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5778 for (i = 0; i < pta_size; i++)
5779 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5781 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5783 error (main_args_p
5784 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
5785 "switch")
5786 : G_("%<generic%> CPU can be used only for "
5787 "%<target(\"tune=\")%> attribute"));
5788 return false;
5790 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5792 error (main_args_p
5793 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
5794 "switch")
5795 : G_("%<intel%> CPU can be used only for "
5796 "%<target(\"tune=\")%> attribute"));
5797 return false;
5800 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5801 && !(processor_alias_table[i].flags & PTA_64BIT))
5803 error ("CPU you selected does not support x86-64 "
5804 "instruction set");
5805 return false;
5808 ix86_schedule = processor_alias_table[i].schedule;
5809 ix86_arch = processor_alias_table[i].processor;
5810 /* Default cpu tuning to the architecture. */
5811 ix86_tune = ix86_arch;
5813 if (processor_alias_table[i].flags & PTA_MMX
5814 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5815 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5816 if (processor_alias_table[i].flags & PTA_3DNOW
5817 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5818 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5819 if (processor_alias_table[i].flags & PTA_3DNOW_A
5820 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5821 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5822 if (processor_alias_table[i].flags & PTA_SSE
5823 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5824 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5825 if (processor_alias_table[i].flags & PTA_SSE2
5826 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5827 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5828 if (processor_alias_table[i].flags & PTA_SSE3
5829 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5830 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5831 if (processor_alias_table[i].flags & PTA_SSSE3
5832 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5833 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5834 if (processor_alias_table[i].flags & PTA_SSE4_1
5835 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5836 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5837 if (processor_alias_table[i].flags & PTA_SSE4_2
5838 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5839 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5840 if (processor_alias_table[i].flags & PTA_AVX
5841 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5842 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5843 if (processor_alias_table[i].flags & PTA_AVX2
5844 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5845 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5846 if (processor_alias_table[i].flags & PTA_FMA
5847 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5848 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5849 if (processor_alias_table[i].flags & PTA_SSE4A
5850 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5851 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5852 if (processor_alias_table[i].flags & PTA_FMA4
5853 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5854 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5855 if (processor_alias_table[i].flags & PTA_XOP
5856 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5857 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5858 if (processor_alias_table[i].flags & PTA_LWP
5859 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5860 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5861 if (processor_alias_table[i].flags & PTA_ABM
5862 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5863 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5864 if (processor_alias_table[i].flags & PTA_BMI
5865 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5866 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5867 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5868 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5869 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5870 if (processor_alias_table[i].flags & PTA_TBM
5871 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5872 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5873 if (processor_alias_table[i].flags & PTA_BMI2
5874 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5875 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5876 if (processor_alias_table[i].flags & PTA_CX16
5877 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5878 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5879 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5880 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5881 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5882 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5883 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5884 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5885 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5886 if (processor_alias_table[i].flags & PTA_MOVBE
5887 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5888 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5889 if (processor_alias_table[i].flags & PTA_AES
5890 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5891 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
5892 if (processor_alias_table[i].flags & PTA_SHA
5893 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5894 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5895 if (processor_alias_table[i].flags & PTA_PCLMUL
5896 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5897 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5898 if (processor_alias_table[i].flags & PTA_FSGSBASE
5899 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5900 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5901 if (processor_alias_table[i].flags & PTA_RDRND
5902 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5903 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5904 if (processor_alias_table[i].flags & PTA_F16C
5905 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5906 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5907 if (processor_alias_table[i].flags & PTA_RTM
5908 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5909 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5910 if (processor_alias_table[i].flags & PTA_HLE
5911 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5912 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5913 if (processor_alias_table[i].flags & PTA_PRFCHW
5914 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5915 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5916 if (processor_alias_table[i].flags & PTA_RDSEED
5917 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5918 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5919 if (processor_alias_table[i].flags & PTA_ADX
5920 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5921 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5922 if (processor_alias_table[i].flags & PTA_FXSR
5923 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5924 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5925 if (processor_alias_table[i].flags & PTA_XSAVE
5926 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5927 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5928 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5929 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5930 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5931 if (processor_alias_table[i].flags & PTA_AVX512F
5932 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5933 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5934 if (processor_alias_table[i].flags & PTA_AVX512ER
5935 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5936 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5937 if (processor_alias_table[i].flags & PTA_AVX512PF
5938 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5939 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5940 if (processor_alias_table[i].flags & PTA_AVX512CD
5941 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5942 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5943 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5944 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5945 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5946 if (processor_alias_table[i].flags & PTA_CLWB
5947 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5948 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5949 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5950 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5951 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5952 if (processor_alias_table[i].flags & PTA_CLZERO
5953 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5954 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5955 if (processor_alias_table[i].flags & PTA_XSAVEC
5956 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5957 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5958 if (processor_alias_table[i].flags & PTA_XSAVES
5959 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5960 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5961 if (processor_alias_table[i].flags & PTA_AVX512DQ
5962 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5963 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5964 if (processor_alias_table[i].flags & PTA_AVX512BW
5965 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5966 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5967 if (processor_alias_table[i].flags & PTA_AVX512VL
5968 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5969 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5970 if (processor_alias_table[i].flags & PTA_MPX
5971 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5972 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5973 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5974 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5975 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5976 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5977 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5978 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5980 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
5981 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
5982 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
5983 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
5984 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
5985 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
5986 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
5987 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
5988 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
5989 if (processor_alias_table[i].flags & PTA_SGX
5990 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
5991 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
5993 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
5994 x86_prefetch_sse = true;
5995 if (processor_alias_table[i].flags & PTA_MWAITX
5996 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
5997 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
5998 if (processor_alias_table[i].flags & PTA_PKU
5999 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
6000 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
6002 /* Don't enable x87 instructions if only
6003 general registers are allowed. */
6004 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
6005 && !(opts_set->x_target_flags & MASK_80387))
6007 if (processor_alias_table[i].flags & PTA_NO_80387)
6008 opts->x_target_flags &= ~MASK_80387;
6009 else
6010 opts->x_target_flags |= MASK_80387;
6012 break;
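/* The net effect of the table walk above (illustration only): -march=NAME
   turns on every ISA flag in the processor's PTA mask unless the user
   explicitly disabled it; e.g. -march=haswell enables AVX2 by default, but
   -march=haswell -mno-avx2 leaves it off because the explicit bit is set.  */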
6015 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
6016 error ("Intel MPX does not support x32");
6021 if (i == pta_size)
6023 error (main_args_p
6024 ? G_("bad value (%qs) for %<-march=%> switch")
6025 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
6026 opts->x_ix86_arch_string);
6028 auto_vec <const char *> candidates;
6029 for (i = 0; i < pta_size; i++)
6030 if (strcmp (processor_alias_table[i].name, "generic")
6031 && strcmp (processor_alias_table[i].name, "intel")
6032 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6033 || (processor_alias_table[i].flags & PTA_64BIT)))
6034 candidates.safe_push (processor_alias_table[i].name);
6036 char *s;
6037 const char *hint
6038 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
6039 if (hint)
6040 inform (input_location,
6041 main_args_p
6042 ? G_("valid arguments to %<-march=%> switch are: "
6043 "%s; did you mean %qs?")
6044 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
6045 "%s; did you mean %qs?"), s, hint);
6046 else
6047 inform (input_location,
6048 main_args_p
6049 ? G_("valid arguments to %<-march=%> switch are: %s")
6050 : G_("valid arguments to %<target(\"arch=\")%> attribute "
6051 "are: %s"), s);
6052 XDELETEVEC (s);
6055 ix86_arch_mask = 1u << ix86_arch;
6056 for (i = 0; i < X86_ARCH_LAST; ++i)
6057 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6059 for (i = 0; i < pta_size; i++)
6060 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
6062 ix86_schedule = processor_alias_table[i].schedule;
6063 ix86_tune = processor_alias_table[i].processor;
6064 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6066 if (!(processor_alias_table[i].flags & PTA_64BIT))
6068 if (ix86_tune_defaulted)
6070 opts->x_ix86_tune_string = "x86-64";
6071 for (i = 0; i < pta_size; i++)
6072 if (! strcmp (opts->x_ix86_tune_string,
6073 processor_alias_table[i].name))
6074 break;
6075 ix86_schedule = processor_alias_table[i].schedule;
6076 ix86_tune = processor_alias_table[i].processor;
6078 else
6079 error ("CPU you selected does not support x86-64 "
6080 "instruction set");
6083 /* Intel CPUs have always interpreted SSE prefetch instructions as
6084 NOPs; so, we can enable SSE prefetch instructions even when
6085 -mtune (rather than -march) points us to a processor that has them.
6086 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
6087 higher processors. */
6088 if (TARGET_CMOV
6089 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
6090 x86_prefetch_sse = true;
6091 break;
6094 if (ix86_tune_specified && i == pta_size)
6096 error (main_args_p
6097 ? G_("bad value (%qs) for %<-mtune=%> switch")
6098 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
6099 opts->x_ix86_tune_string);
6101 auto_vec <const char *> candidates;
6102 for (i = 0; i < pta_size; i++)
6103 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6104 || (processor_alias_table[i].flags & PTA_64BIT))
6105 candidates.safe_push (processor_alias_table[i].name);
6107 char *s;
6108 const char *hint
6109 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
6110 if (hint)
6111 inform (input_location,
6112 main_args_p
6113 ? G_("valid arguments to %<-mtune=%> switch are: "
6114 "%s; did you mean %qs?")
6115 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
6116 "%s; did you mean %qs?"), s, hint);
6117 else
6118 inform (input_location,
6119 main_args_p
6120 ? G_("valid arguments to %<-mtune=%> switch are: %s")
6121 : G_("valid arguments to %<target(\"tune=\")%> attribute "
6122 "are: %s"), s);
6123 XDELETEVEC (s);
6126 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
6128 #ifndef USE_IX86_FRAME_POINTER
6129 #define USE_IX86_FRAME_POINTER 0
6130 #endif
6132 #ifndef USE_X86_64_FRAME_POINTER
6133 #define USE_X86_64_FRAME_POINTER 0
6134 #endif
6136 /* Set the default values for switches whose default depends on TARGET_64BIT
6137 in case they weren't overwritten by command line options. */
6138 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6140 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6141 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
6142 if (opts->x_flag_asynchronous_unwind_tables
6143 && !opts_set->x_flag_unwind_tables
6144 && TARGET_64BIT_MS_ABI)
6145 opts->x_flag_unwind_tables = 1;
6146 if (opts->x_flag_asynchronous_unwind_tables == 2)
6147 opts->x_flag_unwind_tables
6148 = opts->x_flag_asynchronous_unwind_tables = 1;
6149 if (opts->x_flag_pcc_struct_return == 2)
6150 opts->x_flag_pcc_struct_return = 0;
6152 else
6154 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6155 opts->x_flag_omit_frame_pointer
6156 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
6157 if (opts->x_flag_asynchronous_unwind_tables == 2)
6158 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
6159 if (opts->x_flag_pcc_struct_return == 2)
6161 /* Intel MCU psABI specifies that -freg-struct-return should
6162 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
6163 we check -miamcu so that -freg-struct-return is always
6164 turned on if -miamcu is used. */
6165 if (TARGET_IAMCU_P (opts->x_target_flags))
6166 opts->x_flag_pcc_struct_return = 0;
6167 else
6168 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
6172 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6173 /* TODO: ix86_cost should be chosen at instruction or function granularity
6174 so that for cold code we use size_cost even in !optimize_size compilation. */
6175 if (opts->x_optimize_size)
6176 ix86_cost = &ix86_size_cost;
6177 else
6178 ix86_cost = ix86_tune_cost;
6180 /* Arrange to set up i386_stack_locals for all functions. */
6181 init_machine_status = ix86_init_machine_status;
6183 /* Validate -mregparm= value. */
6184 if (opts_set->x_ix86_regparm)
6186 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6187 warning (0, "-mregparm is ignored in 64-bit mode");
6188 else if (TARGET_IAMCU_P (opts->x_target_flags))
6189 warning (0, "-mregparm is ignored for Intel MCU psABI");
6190 if (opts->x_ix86_regparm > REGPARM_MAX)
6192 error ("-mregparm=%d is not between 0 and %d",
6193 opts->x_ix86_regparm, REGPARM_MAX);
6194 opts->x_ix86_regparm = 0;
6197 if (TARGET_IAMCU_P (opts->x_target_flags)
6198 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
6199 opts->x_ix86_regparm = REGPARM_MAX;
6201 /* Default align_* from the processor table. */
6202 ix86_default_align (opts);
6204 /* Provide default for -mbranch-cost= value. */
6205 if (!opts_set->x_ix86_branch_cost)
6206 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
6208 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6210 opts->x_target_flags
6211 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
6213 /* Enable by default the SSE and MMX builtins. Do allow the user to
6214 explicitly disable any of these. In particular, disabling SSE and
6215 MMX for kernel code is extremely useful. */
6216 if (!ix86_arch_specified)
6217 opts->x_ix86_isa_flags
6218 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
6219 | TARGET_SUBTARGET64_ISA_DEFAULT)
6220 & ~opts->x_ix86_isa_flags_explicit);
6222 if (TARGET_RTD_P (opts->x_target_flags))
6223 warning (0,
6224 main_args_p
6225 ? G_("%<-mrtd%> is ignored in 64bit mode")
6226 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
6228 else
6230 opts->x_target_flags
6231 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
6233 if (!ix86_arch_specified)
6234 opts->x_ix86_isa_flags
6235 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
6237 /* The i386 ABI does not specify a red zone.  It still makes sense to use
6238 one when the programmer takes care to keep the stack from being clobbered. */
6239 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
6240 opts->x_target_flags |= MASK_NO_RED_ZONE;
6243 /* Keep nonleaf frame pointers. */
6244 if (opts->x_flag_omit_frame_pointer)
6245 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
6246 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
6247 opts->x_flag_omit_frame_pointer = 1;
6249 /* If we're doing fast math, we don't care about comparison order
6250 wrt NaNs. This lets us use a shorter comparison sequence. */
6251 if (opts->x_flag_finite_math_only)
6252 opts->x_target_flags &= ~MASK_IEEE_FP;
6254 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
6255 since the insns won't need emulation. */
6256 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
6257 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
6259 /* Likewise, if the target doesn't have a 387, or we've specified
6260 software floating point, don't use 387 inline intrinsics. */
6261 if (!TARGET_80387_P (opts->x_target_flags))
6262 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
6264 /* Turn on MMX builtins for -msse. */
6265 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
6266 opts->x_ix86_isa_flags
6267 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
6269 /* Enable SSE prefetch. */
6270 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
6271 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
6272 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
6273 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
6274 x86_prefetch_sse = true;
6276 /* Enable popcnt instruction for -msse4.2 or -mabm. */
6277 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
6278 || TARGET_ABM_P (opts->x_ix86_isa_flags))
6279 opts->x_ix86_isa_flags
6280 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
6282 /* Enable lzcnt instruction for -mabm. */
6283 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
6284 opts->x_ix86_isa_flags
6285 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
6287 /* Disable BMI, BMI2 and TBM instructions for -m16. */
6288 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
6289 opts->x_ix86_isa_flags
6290 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
6291 & ~opts->x_ix86_isa_flags_explicit);
6293 /* Validate -mpreferred-stack-boundary= value or default it to
6294 PREFERRED_STACK_BOUNDARY_DEFAULT. */
6295 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
6296 if (opts_set->x_ix86_preferred_stack_boundary_arg)
6298 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
6299 int max = TARGET_SEH ? 4 : 12;
6301 if (opts->x_ix86_preferred_stack_boundary_arg < min
6302 || opts->x_ix86_preferred_stack_boundary_arg > max)
6304 if (min == max)
6305 error ("-mpreferred-stack-boundary is not supported "
6306 "for this target");
6307 else
6308 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
6309 opts->x_ix86_preferred_stack_boundary_arg, min, max);
6311 else
6312 ix86_preferred_stack_boundary
6313 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
6316 /* Set the default value for -mstackrealign. */
6317 if (opts->x_ix86_force_align_arg_pointer == -1)
6318 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
6320 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
6322 /* Validate -mincoming-stack-boundary= value or default it to
6323 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
6324 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
6325 if (opts_set->x_ix86_incoming_stack_boundary_arg)
6327 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
6329 if (opts->x_ix86_incoming_stack_boundary_arg < min
6330 || opts->x_ix86_incoming_stack_boundary_arg > 12)
6331 error ("-mincoming-stack-boundary=%d is not between %d and 12",
6332 opts->x_ix86_incoming_stack_boundary_arg, min);
6333 else
6335 ix86_user_incoming_stack_boundary
6336 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
6337 ix86_incoming_stack_boundary
6338 = ix86_user_incoming_stack_boundary;
6342 #ifndef NO_PROFILE_COUNTERS
6343 if (flag_nop_mcount)
6344 error ("-mnop-mcount is not compatible with this target");
6345 #endif
6346 if (flag_nop_mcount && flag_pic)
6347 error ("-mnop-mcount is not implemented for -fPIC");
6349 /* Accept -msseregparm only if at least SSE support is enabled. */
6350 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
6351 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
6352 error (main_args_p
6353 ? G_("%<-msseregparm%> used without SSE enabled")
6354 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
6356 if (opts_set->x_ix86_fpmath)
6358 if (opts->x_ix86_fpmath & FPMATH_SSE)
6360 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
6362 if (TARGET_80387_P (opts->x_target_flags))
6364 warning (0, "SSE instruction set disabled, using 387 arithmetics");
6365 opts->x_ix86_fpmath = FPMATH_387;
6368 else if ((opts->x_ix86_fpmath & FPMATH_387)
6369 && !TARGET_80387_P (opts->x_target_flags))
6371 warning (0, "387 instruction set disabled, using SSE arithmetics");
6372 opts->x_ix86_fpmath = FPMATH_SSE;
6376 /* For all chips supporting SSE2, -mfpmath=sse performs better than
6377 -mfpmath=387.  The latter is nevertheless the default on many targets,
6378 since the extra 80-bit precision of temporaries is considered part of the ABI.
6379 Overwrite the default at least for -ffast-math.
6380 TODO: -mfpmath=both seems to produce similarly performing code with slightly
6381 smaller binaries.  It is, however, not clear whether register allocation is
6382 ready for this setting.
6383 Also, -mfpmath=387 codegen is overall quite a bit more compact (about 4-5%)
6384 than SSE codegen.  We may switch to 387 with -ffast-math for size-optimized
6385 functions. */
6386 else if (fast_math_flags_set_p (&global_options)
6387 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
6388 opts->x_ix86_fpmath = FPMATH_SSE;
6389 else
6390 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
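/* So, for example, -ffast-math on an SSE2-capable target ends up selecting
   -mfpmath=sse here unless the user passed an explicit -mfpmath= value
   (an illustration of the defaulting above, not an exhaustive description).  */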
6392 /* Use external vectorized library in vectorizing intrinsics. */
6393 if (opts_set->x_ix86_veclibabi_type)
6394 switch (opts->x_ix86_veclibabi_type)
6396 case ix86_veclibabi_type_svml:
6397 ix86_veclib_handler = ix86_veclibabi_svml;
6398 break;
6400 case ix86_veclibabi_type_acml:
6401 ix86_veclib_handler = ix86_veclibabi_acml;
6402 break;
6404 default:
6405 gcc_unreachable ();
6408 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
6409 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6410 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6412 /* If stack probes are required, the space used for large function
6413 arguments on the stack must also be probed, so enable
6414 -maccumulate-outgoing-args so this happens in the prologue. */
6415 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
6416 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6418 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6419 warning (0,
6420 main_args_p
6421 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
6422 "for correctness")
6423 : G_("stack probing requires "
6424 "%<target(\"accumulate-outgoing-args\")%> for "
6425 "correctness"));
6426 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6429 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
6430 so enable -maccumulate-outgoing-args when %ebp is fixed. */
6431 if (fixed_regs[BP_REG]
6432 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6434 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6435 warning (0,
6436 main_args_p
6437 ? G_("fixed ebp register requires "
6438 "%<-maccumulate-outgoing-args%>")
6439 : G_("fixed ebp register requires "
6440 "%<target(\"accumulate-outgoing-args\")%>"));
6441 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6444 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
6446 char *p;
6447 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
6448 p = strchr (internal_label_prefix, 'X');
6449 internal_label_prefix_len = p - internal_label_prefix;
6450 *p = '\0';
6453 /* When a scheduling description is not available, disable the scheduler pass
6454 so it won't slow down the compilation and make x87 code slower. */
6455 if (!TARGET_SCHEDULE)
6456 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
6458 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
6459 ix86_tune_cost->simultaneous_prefetches,
6460 opts->x_param_values,
6461 opts_set->x_param_values);
6462 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
6463 ix86_tune_cost->prefetch_block,
6464 opts->x_param_values,
6465 opts_set->x_param_values);
6466 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
6467 ix86_tune_cost->l1_cache_size,
6468 opts->x_param_values,
6469 opts_set->x_param_values);
6470 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
6471 ix86_tune_cost->l2_cache_size,
6472 opts->x_param_values,
6473 opts_set->x_param_values);
6475 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
6476 if (opts->x_flag_prefetch_loop_arrays < 0
6477 && HAVE_prefetch
6478 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
6479 && !opts->x_optimize_size
6480 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
6481 opts->x_flag_prefetch_loop_arrays = 1;
6483 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
6484 can be optimized to ap = __builtin_next_arg (0). */
6485 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
6486 targetm.expand_builtin_va_start = NULL;
6488 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6490 ix86_gen_leave = gen_leave_rex64;
6491 if (Pmode == DImode)
6493 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
6494 ix86_gen_tls_local_dynamic_base_64
6495 = gen_tls_local_dynamic_base_64_di;
6497 else
6499 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
6500 ix86_gen_tls_local_dynamic_base_64
6501 = gen_tls_local_dynamic_base_64_si;
6504 else
6505 ix86_gen_leave = gen_leave;
6507 if (Pmode == DImode)
6509 ix86_gen_add3 = gen_adddi3;
6510 ix86_gen_sub3 = gen_subdi3;
6511 ix86_gen_sub3_carry = gen_subdi3_carry;
6512 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
6513 ix86_gen_andsp = gen_anddi3;
6514 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
6515 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
6516 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
6517 ix86_gen_monitor = gen_sse3_monitor_di;
6518 ix86_gen_monitorx = gen_monitorx_di;
6519 ix86_gen_clzero = gen_clzero_di;
6521 else
6523 ix86_gen_add3 = gen_addsi3;
6524 ix86_gen_sub3 = gen_subsi3;
6525 ix86_gen_sub3_carry = gen_subsi3_carry;
6526 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6527 ix86_gen_andsp = gen_andsi3;
6528 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6529 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6530 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6531 ix86_gen_monitor = gen_sse3_monitor_si;
6532 ix86_gen_monitorx = gen_monitorx_si;
6533 ix86_gen_clzero = gen_clzero_si;
6536 #ifdef USE_IX86_CLD
6537 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6538 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6539 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6540 #endif
6542 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
6544 if (opts->x_flag_fentry > 0)
6545 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6546 "with -fpic");
6547 opts->x_flag_fentry = 0;
6549 else if (TARGET_SEH)
6551 if (opts->x_flag_fentry == 0)
6552 sorry ("-mno-fentry isn%'t compatible with SEH");
6553 opts->x_flag_fentry = 1;
6555 else if (opts->x_flag_fentry < 0)
6557 #if defined(PROFILE_BEFORE_PROLOGUE)
6558 opts->x_flag_fentry = 1;
6559 #else
6560 opts->x_flag_fentry = 0;
6561 #endif
6564 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
6565 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
6567 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6568 opts->x_target_flags |= MASK_VZEROUPPER;
6569 if (!(opts_set->x_target_flags & MASK_STV))
6570 opts->x_target_flags |= MASK_STV;
6571 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6572 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
6573 stack realignment is an extra cost the pass doesn't take into
6574 account, and the pass can't realign the stack. */
6575 if (ix86_preferred_stack_boundary < 128
6576 || ix86_incoming_stack_boundary < 128
6577 || opts->x_ix86_force_align_arg_pointer)
6578 opts->x_target_flags &= ~MASK_STV;
6579 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6580 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6581 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6582 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6583 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6584 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6585 /* Enable 128-bit AVX instruction generation
6586 for the auto-vectorizer. */
6587 if (TARGET_AVX128_OPTIMAL
6588 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6589 opts->x_target_flags |= MASK_PREFER_AVX128;
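/* Parse the -mrecip= string.  As an illustrative example (assuming the
   recip_options table provides entries such as "all" and "sqrtf"),
   "-mrecip=all,!sqrtf" turns on every reciprocal-estimate kind except the
   scalar float square root: each comma-separated token below either sets
   or, when prefixed with '!', clears the corresponding mask bits.  */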
6591 if (opts->x_ix86_recip_name)
6593 char *p = ASTRDUP (opts->x_ix86_recip_name);
6594 char *q;
6595 unsigned int mask, i;
6596 bool invert;
6598 while ((q = strtok (p, ",")) != NULL)
6600 p = NULL;
6601 if (*q == '!')
6603 invert = true;
6604 q++;
6606 else
6607 invert = false;
6609 if (!strcmp (q, "default"))
6610 mask = RECIP_MASK_ALL;
6611 else
6613 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6614 if (!strcmp (q, recip_options[i].string))
6616 mask = recip_options[i].mask;
6617 break;
6620 if (i == ARRAY_SIZE (recip_options))
6622 error ("unknown option for -mrecip=%s", q);
6623 invert = false;
6624 mask = RECIP_MASK_NONE;
6628 opts->x_recip_mask_explicit |= mask;
6629 if (invert)
6630 opts->x_recip_mask &= ~mask;
6631 else
6632 opts->x_recip_mask |= mask;
6636 if (TARGET_RECIP_P (opts->x_target_flags))
6637 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6638 else if (opts_set->x_target_flags & MASK_RECIP)
6639 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
6641 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6642 for 64-bit Bionic. Also default long double to 64-bit for Intel
6643 MCU psABI. */
6644 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6645 && !(opts_set->x_target_flags
6646 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6647 opts->x_target_flags |= (TARGET_64BIT
6648 ? MASK_LONG_DOUBLE_128
6649 : MASK_LONG_DOUBLE_64);
6651 /* Only one of them can be active. */
6652 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6653 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6655 /* Save the initial options in case the user uses function-specific
6656 options. */
6657 if (main_args_p)
6658 target_option_default_node = target_option_current_node
6659 = build_target_option_node (opts);
6661 /* Handle stack protector */
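/* Background (illustrative, not i386-specific documentation): SSP_TLS
   reads the stack-protector canary from thread-local storage -- commonly
   %gs:20 for 32-bit and %fs:40 for 64-bit glibc targets -- while
   SSP_GLOBAL loads it from the global __stack_chk_guard symbol, which is
   what Bionic provides, hence the default chosen below.  */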
6662 if (!opts_set->x_ix86_stack_protector_guard)
6663 opts->x_ix86_stack_protector_guard
6664 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6666 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
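/* The expected option syntax (an illustrative sketch based on the
   -mmemcpy-strategy=/-mmemset-strategy= documentation) is a comma-separated
   list of alg:max_size:dest_align triplets, e.g.
     -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign
   which ix86_parse_stringop_strategy_string decodes below.  */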
6667 if (opts->x_ix86_tune_memcpy_strategy)
6669 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6670 ix86_parse_stringop_strategy_string (str, false);
6671 free (str);
6674 if (opts->x_ix86_tune_memset_strategy)
6676 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6677 ix86_parse_stringop_strategy_string (str, true);
6678 free (str);
6681 return true;
6684 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6686 static void
6687 ix86_option_override (void)
6689 ix86_option_override_internal (true, &global_options, &global_options_set);
6692 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6693 static char *
6694 ix86_offload_options (void)
6696 if (TARGET_LP64)
6697 return xstrdup ("-foffload-abi=lp64");
6698 return xstrdup ("-foffload-abi=ilp32");
6701 /* Update register usage after having seen the compiler flags. */
6703 static void
6704 ix86_conditional_register_usage (void)
6706 int i, c_mask;
6708 /* If there are no caller-saved registers, preserve all registers
6709 except fixed_regs and the registers used for the function return value,
6710 since aggregate_value_p checks call_used_regs[regno] on the return
6711 value. */
6712 if (cfun && cfun->machine->no_caller_saved_registers)
6713 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6714 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6715 call_used_regs[i] = 0;
6717 /* For 32-bit targets, squash the REX registers. */
6718 if (! TARGET_64BIT)
6720 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6721 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6722 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6723 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6724 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6725 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6728 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6729 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6731 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6733 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6735 /* Set/reset conditionally defined registers from
6736 CALL_USED_REGISTERS initializer. */
6737 if (call_used_regs[i] > 1)
6738 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6740 /* Calculate registers of CLOBBERED_REGS register set
6741 as call used registers from GENERAL_REGS register set. */
6742 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6743 && call_used_regs[i])
6744 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6747 /* If MMX is disabled, squash the registers. */
6748 if (! TARGET_MMX)
6749 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6750 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6751 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6753 /* If SSE is disabled, squash the registers. */
6754 if (! TARGET_SSE)
6755 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6756 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6757 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6759 /* If the FPU is disabled, squash the registers. */
6760 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6761 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6762 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6763 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6765 /* If AVX512F is disabled, squash the registers. */
6766 if (! TARGET_AVX512F)
6768 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6769 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6771 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6772 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6775 /* If MPX is disabled, squash the registers. */
6776 if (! TARGET_MPX)
6777 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6778 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6782 /* Save the current options */
6784 static void
6785 ix86_function_specific_save (struct cl_target_option *ptr,
6786 struct gcc_options *opts)
6788 ptr->arch = ix86_arch;
6789 ptr->schedule = ix86_schedule;
6790 ptr->prefetch_sse = x86_prefetch_sse;
6791 ptr->tune = ix86_tune;
6792 ptr->branch_cost = ix86_branch_cost;
6793 ptr->tune_defaulted = ix86_tune_defaulted;
6794 ptr->arch_specified = ix86_arch_specified;
6795 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6796 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
6797 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6798 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6799 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6800 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6801 ptr->x_ix86_abi = opts->x_ix86_abi;
6802 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6803 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6804 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6805 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6806 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6807 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6808 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6809 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6810 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6811 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6812 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6813 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6814 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6815 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6816 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6817 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6818 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6819 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6820 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6821 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6823 /* The fields are char but the variables are not; make sure the
6824 values fit in the fields. */
6825 gcc_assert (ptr->arch == ix86_arch);
6826 gcc_assert (ptr->schedule == ix86_schedule);
6827 gcc_assert (ptr->tune == ix86_tune);
6828 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6831 /* Restore the current options */
6833 static void
6834 ix86_function_specific_restore (struct gcc_options *opts,
6835 struct cl_target_option *ptr)
6837 enum processor_type old_tune = ix86_tune;
6838 enum processor_type old_arch = ix86_arch;
6839 unsigned int ix86_arch_mask;
6840 int i;
6842 /* We don't change -fPIC. */
6843 opts->x_flag_pic = flag_pic;
6845 ix86_arch = (enum processor_type) ptr->arch;
6846 ix86_schedule = (enum attr_cpu) ptr->schedule;
6847 ix86_tune = (enum processor_type) ptr->tune;
6848 x86_prefetch_sse = ptr->prefetch_sse;
6849 opts->x_ix86_branch_cost = ptr->branch_cost;
6850 ix86_tune_defaulted = ptr->tune_defaulted;
6851 ix86_arch_specified = ptr->arch_specified;
6852 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6853 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
6854 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6855 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6856 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6857 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6858 opts->x_ix86_abi = ptr->x_ix86_abi;
6859 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6860 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6861 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6862 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6863 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6864 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6865 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6866 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6867 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6868 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6869 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6870 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6871 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6872 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6873 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6874 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6875 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6876 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6877 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6878 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6879 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6880 /* TODO: ix86_cost should be chosen at instruction or function granularity
6881 so that for cold code we use size_cost even in !optimize_size compilation. */
6882 if (opts->x_optimize_size)
6883 ix86_cost = &ix86_size_cost;
6884 else
6885 ix86_cost = ix86_tune_cost;
6887 /* Recreate the arch feature tests if the arch changed */
6888 if (old_arch != ix86_arch)
6890 ix86_arch_mask = 1u << ix86_arch;
6891 for (i = 0; i < X86_ARCH_LAST; ++i)
6892 ix86_arch_features[i]
6893 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6896 /* Recreate the tune optimization tests */
6897 if (old_tune != ix86_tune)
6898 set_ix86_tune_features (ix86_tune, false);
6901 /* Adjust target options after streaming them in. This is mainly about
6902 reconciling them with global options. */
6904 static void
6905 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6907 /* flag_pic is a global option, but ix86_cmodel is target saved option
6908 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6909 for PIC, or error out. */
6910 if (flag_pic)
6911 switch (ptr->x_ix86_cmodel)
6913 case CM_SMALL:
6914 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6915 break;
6917 case CM_MEDIUM:
6918 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6919 break;
6921 case CM_LARGE:
6922 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6923 break;
6925 case CM_KERNEL:
6926 error ("code model %s does not support PIC mode", "kernel");
6927 break;
6929 default:
6930 break;
6932 else
6933 switch (ptr->x_ix86_cmodel)
6935 case CM_SMALL_PIC:
6936 ptr->x_ix86_cmodel = CM_SMALL;
6937 break;
6939 case CM_MEDIUM_PIC:
6940 ptr->x_ix86_cmodel = CM_MEDIUM;
6941 break;
6943 case CM_LARGE_PIC:
6944 ptr->x_ix86_cmodel = CM_LARGE;
6945 break;
6947 default:
6948 break;
6952 /* Print the current options */
6954 static void
6955 ix86_function_specific_print (FILE *file, int indent,
6956 struct cl_target_option *ptr)
6958 char *target_string
6959 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
6960 ptr->x_target_flags, ptr->x_ix86_target_flags,
6961 NULL, NULL, ptr->x_ix86_fpmath, false);
6963 gcc_assert (ptr->arch < PROCESSOR_max);
6964 fprintf (file, "%*sarch = %d (%s)\n",
6965 indent, "",
6966 ptr->arch, processor_target_table[ptr->arch].name);
6968 gcc_assert (ptr->tune < PROCESSOR_max);
6969 fprintf (file, "%*stune = %d (%s)\n",
6970 indent, "",
6971 ptr->tune, processor_target_table[ptr->tune].name);
6973 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
6975 if (target_string)
6977 fprintf (file, "%*s%s\n", indent, "", target_string);
6978 free (target_string);
6983 /* Inner function to process the attribute((target(...))), take an argument and
6984 set the current options from the argument. If we have a list, recursively go
6985 over the list. */
6987 static bool
6988 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
6989 struct gcc_options *opts,
6990 struct gcc_options *opts_set,
6991 struct gcc_options *enum_opts_set)
6993 char *next_optstr;
6994 bool ret = true;
6996 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
6997 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
6998 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
6999 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
7000 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
7002 enum ix86_opt_type
7004 ix86_opt_unknown,
7005 ix86_opt_yes,
7006 ix86_opt_no,
7007 ix86_opt_str,
7008 ix86_opt_enum,
7009 ix86_opt_isa
7012 static const struct
7014 const char *string;
7015 size_t len;
7016 enum ix86_opt_type type;
7017 int opt;
7018 int mask;
7019 } attrs[] = {
7020 /* isa options */
7021 IX86_ATTR_ISA ("sgx", OPT_msgx),
7022 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
7023 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
7024 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
7026 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
7027 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
7028 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
7029 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
7030 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
7031 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
7032 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
7033 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
7034 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
7035 IX86_ATTR_ISA ("avx2", OPT_mavx2),
7036 IX86_ATTR_ISA ("fma", OPT_mfma),
7037 IX86_ATTR_ISA ("xop", OPT_mxop),
7038 IX86_ATTR_ISA ("fma4", OPT_mfma4),
7039 IX86_ATTR_ISA ("f16c", OPT_mf16c),
7040 IX86_ATTR_ISA ("avx", OPT_mavx),
7041 IX86_ATTR_ISA ("sse4", OPT_msse4),
7042 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
7043 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
7044 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
7045 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
7046 IX86_ATTR_ISA ("sse3", OPT_msse3),
7047 IX86_ATTR_ISA ("aes", OPT_maes),
7048 IX86_ATTR_ISA ("sha", OPT_msha),
7049 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
7050 IX86_ATTR_ISA ("sse2", OPT_msse2),
7051 IX86_ATTR_ISA ("sse", OPT_msse),
7052 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
7053 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
7054 IX86_ATTR_ISA ("mmx", OPT_mmmx),
7055 IX86_ATTR_ISA ("rtm", OPT_mrtm),
7056 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
7057 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
7058 IX86_ATTR_ISA ("adx", OPT_madx),
7059 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
7060 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
7061 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
7062 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
7063 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
7064 IX86_ATTR_ISA ("xsave", OPT_mxsave),
7065 IX86_ATTR_ISA ("abm", OPT_mabm),
7066 IX86_ATTR_ISA ("bmi", OPT_mbmi),
7067 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
7068 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
7069 IX86_ATTR_ISA ("tbm", OPT_mtbm),
7070 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
7071 IX86_ATTR_ISA ("cx16", OPT_mcx16),
7072 IX86_ATTR_ISA ("sahf", OPT_msahf),
7073 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
7074 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
7075 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
7076 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
7077 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
7078 IX86_ATTR_ISA ("clzero", OPT_mclzero),
7079 IX86_ATTR_ISA ("pku", OPT_mpku),
7080 IX86_ATTR_ISA ("lwp", OPT_mlwp),
7081 IX86_ATTR_ISA ("hle", OPT_mhle),
7082 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
7083 IX86_ATTR_ISA ("mpx", OPT_mmpx),
7084 IX86_ATTR_ISA ("clwb", OPT_mclwb),
7085 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
7087 /* enum options */
7088 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
7090 /* string options */
7091 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
7092 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
7094 /* flag options */
7095 IX86_ATTR_YES ("cld",
7096 OPT_mcld,
7097 MASK_CLD),
7099 IX86_ATTR_NO ("fancy-math-387",
7100 OPT_mfancy_math_387,
7101 MASK_NO_FANCY_MATH_387),
7103 IX86_ATTR_YES ("ieee-fp",
7104 OPT_mieee_fp,
7105 MASK_IEEE_FP),
7107 IX86_ATTR_YES ("inline-all-stringops",
7108 OPT_minline_all_stringops,
7109 MASK_INLINE_ALL_STRINGOPS),
7111 IX86_ATTR_YES ("inline-stringops-dynamically",
7112 OPT_minline_stringops_dynamically,
7113 MASK_INLINE_STRINGOPS_DYNAMICALLY),
7115 IX86_ATTR_NO ("align-stringops",
7116 OPT_mno_align_stringops,
7117 MASK_NO_ALIGN_STRINGOPS),
7119 IX86_ATTR_YES ("recip",
7120 OPT_mrecip,
7121 MASK_RECIP),
7125 /* If this is a list, recurse to get the options. */
7126 if (TREE_CODE (args) == TREE_LIST)
7128 bool ret = true;
7130 for (; args; args = TREE_CHAIN (args))
7131 if (TREE_VALUE (args)
7132 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
7133 p_strings, opts, opts_set,
7134 enum_opts_set))
7135 ret = false;
7137 return ret;
7140 else if (TREE_CODE (args) != STRING_CST)
7142 error ("attribute %<target%> argument not a string");
7143 return false;
7146 /* Handle multiple arguments separated by commas. */
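/* Illustrative example: attribute ((target ("avx2,no-fma,arch=haswell")))
   reaches this point as the single string "avx2,no-fma,arch=haswell"; the
   loop below splits it at the commas, strips a leading "no-" to compute
   opt_set_p, and dispatches each token through the attrs[] table above.  */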
7147 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
7149 while (next_optstr && *next_optstr != '\0')
7151 char *p = next_optstr;
7152 char *orig_p = p;
7153 char *comma = strchr (next_optstr, ',');
7154 const char *opt_string;
7155 size_t len, opt_len;
7156 int opt;
7157 bool opt_set_p;
7158 char ch;
7159 unsigned i;
7160 enum ix86_opt_type type = ix86_opt_unknown;
7161 int mask = 0;
7163 if (comma)
7165 *comma = '\0';
7166 len = comma - next_optstr;
7167 next_optstr = comma + 1;
7169 else
7171 len = strlen (p);
7172 next_optstr = NULL;
7175 /* Recognize no-xxx. */
7176 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
7178 opt_set_p = false;
7179 p += 3;
7180 len -= 3;
7182 else
7183 opt_set_p = true;
7185 /* Find the option. */
7186 ch = *p;
7187 opt = N_OPTS;
7188 for (i = 0; i < ARRAY_SIZE (attrs); i++)
7190 type = attrs[i].type;
7191 opt_len = attrs[i].len;
7192 if (ch == attrs[i].string[0]
7193 && ((type != ix86_opt_str && type != ix86_opt_enum)
7194 ? len == opt_len
7195 : len > opt_len)
7196 && memcmp (p, attrs[i].string, opt_len) == 0)
7198 opt = attrs[i].opt;
7199 mask = attrs[i].mask;
7200 opt_string = attrs[i].string;
7201 break;
7205 /* Process the option. */
7206 if (opt == N_OPTS)
7208 error ("attribute(target(\"%s\")) is unknown", orig_p);
7209 ret = false;
7212 else if (type == ix86_opt_isa)
7214 struct cl_decoded_option decoded;
7216 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
7217 ix86_handle_option (opts, opts_set,
7218 &decoded, input_location);
7221 else if (type == ix86_opt_yes || type == ix86_opt_no)
7223 if (type == ix86_opt_no)
7224 opt_set_p = !opt_set_p;
7226 if (opt_set_p)
7227 opts->x_target_flags |= mask;
7228 else
7229 opts->x_target_flags &= ~mask;
7232 else if (type == ix86_opt_str)
7234 if (p_strings[opt])
7236 error ("option(\"%s\") was already specified", opt_string);
7237 ret = false;
7239 else
7240 p_strings[opt] = xstrdup (p + opt_len);
7243 else if (type == ix86_opt_enum)
7245 bool arg_ok;
7246 int value;
7248 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
7249 if (arg_ok)
7250 set_option (opts, enum_opts_set, opt, value,
7251 p + opt_len, DK_UNSPECIFIED, input_location,
7252 global_dc);
7253 else
7255 error ("attribute(target(\"%s\")) is unknown", orig_p);
7256 ret = false;
7260 else
7261 gcc_unreachable ();
7264 return ret;
7267 /* Release allocated strings. */
7268 static void
7269 release_options_strings (char **option_strings)
7271 /* Free up memory allocated to hold the strings */
7272 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
7273 free (option_strings[i]);
7276 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
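/* This is reached both for __attribute__ ((target ("..."))) and, via its
   callers, for #pragma GCC target; a typical (illustrative) pragma use is
     #pragma GCC push_options
     #pragma GCC target ("avx2")
     ...
     #pragma GCC pop_options
   */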
7278 tree
7279 ix86_valid_target_attribute_tree (tree args,
7280 struct gcc_options *opts,
7281 struct gcc_options *opts_set)
7283 const char *orig_arch_string = opts->x_ix86_arch_string;
7284 const char *orig_tune_string = opts->x_ix86_tune_string;
7285 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
7286 int orig_tune_defaulted = ix86_tune_defaulted;
7287 int orig_arch_specified = ix86_arch_specified;
7288 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
7289 tree t = NULL_TREE;
7290 struct cl_target_option *def
7291 = TREE_TARGET_OPTION (target_option_default_node);
7292 struct gcc_options enum_opts_set;
7294 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
7296 /* Process each of the options on the chain. */
7297 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
7298 opts_set, &enum_opts_set))
7299 return error_mark_node;
7301 /* If the changed options are different from the default, rerun
7302 ix86_option_override_internal, and then save the options away.
7303 The string options are attribute options, and will be undone
7304 when we copy the save structure. */
7305 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
7306 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
7307 || opts->x_target_flags != def->x_target_flags
7308 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
7309 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
7310 || enum_opts_set.x_ix86_fpmath)
7312 /* If we are using the default tune= or arch=, undo the string assigned,
7313 and use the default. */
7314 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
7316 opts->x_ix86_arch_string
7317 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
7319 /* If arch= is set, clear all bits in x_ix86_isa_flags,
7320 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
7321 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
7322 | OPTION_MASK_ABI_64
7323 | OPTION_MASK_ABI_X32
7324 | OPTION_MASK_CODE16);
7325 opts->x_ix86_isa_flags2 = 0;
7327 else if (!orig_arch_specified)
7328 opts->x_ix86_arch_string = NULL;
7330 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
7331 opts->x_ix86_tune_string
7332 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
7333 else if (orig_tune_defaulted)
7334 opts->x_ix86_tune_string = NULL;
7336 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
7337 if (enum_opts_set.x_ix86_fpmath)
7338 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
7339 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
7340 && TARGET_SSE_P (opts->x_ix86_isa_flags))
7342 if (TARGET_80387_P (opts->x_target_flags))
7343 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE
7344 | FPMATH_387);
7345 else
7346 opts->x_ix86_fpmath = (enum fpmath_unit) FPMATH_SSE;
7347 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
7350 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
7351 bool r = ix86_option_override_internal (false, opts, opts_set);
7352 if (!r)
7354 release_options_strings (option_strings);
7355 return error_mark_node;
7358 /* Add any builtin functions with the new isa if any. */
7359 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
7361 /* Save the current options unless we are validating options for
7362 #pragma. */
7363 t = build_target_option_node (opts);
7365 opts->x_ix86_arch_string = orig_arch_string;
7366 opts->x_ix86_tune_string = orig_tune_string;
7367 opts_set->x_ix86_fpmath = orig_fpmath_set;
7369 release_options_strings (option_strings);
7372 return t;
7375 /* Hook to validate attribute((target("string"))). */
7377 static bool
7378 ix86_valid_target_attribute_p (tree fndecl,
7379 tree ARG_UNUSED (name),
7380 tree args,
7381 int ARG_UNUSED (flags))
7383 struct gcc_options func_options;
7384 tree new_target, new_optimize;
7385 bool ret = true;
7387 /* attribute((target("default"))) does nothing, beyond
7388 affecting multi-versioning. */
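/* Illustrative function-multiversioning use of "default":
     __attribute__ ((target ("default"))) int foo (void);
     __attribute__ ((target ("avx2")))    int foo (void);
   The "default" version carries no target options of its own, hence the
   early return just below.  */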
7389 if (TREE_VALUE (args)
7390 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
7391 && TREE_CHAIN (args) == NULL_TREE
7392 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
7393 return true;
7395 tree old_optimize = build_optimization_node (&global_options);
7397 /* Get the optimization options of the current function. */
7398 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
7400 if (!func_optimize)
7401 func_optimize = old_optimize;
7403 /* Init func_options. */
7404 memset (&func_options, 0, sizeof (func_options));
7405 init_options_struct (&func_options, NULL);
7406 lang_hooks.init_options_struct (&func_options);
7408 cl_optimization_restore (&func_options,
7409 TREE_OPTIMIZATION (func_optimize));
7411 /* Initialize func_options to the default before its target options can
7412 be set. */
7413 cl_target_option_restore (&func_options,
7414 TREE_TARGET_OPTION (target_option_default_node));
7416 new_target = ix86_valid_target_attribute_tree (args, &func_options,
7417 &global_options_set);
7419 new_optimize = build_optimization_node (&func_options);
7421 if (new_target == error_mark_node)
7422 ret = false;
7424 else if (fndecl && new_target)
7426 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
7428 if (old_optimize != new_optimize)
7429 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
7432 finalize_options_struct (&func_options);
7434 return ret;
7438 /* Hook to determine if one function can safely inline another. */
7440 static bool
7441 ix86_can_inline_p (tree caller, tree callee)
7443 bool ret = false;
7444 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
7445 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
7447 /* If callee has no option attributes, then it is ok to inline. */
7448 if (!callee_tree)
7449 ret = true;
7451 /* If caller has no option attributes, but callee does then it is not ok to
7452 inline. */
7453 else if (!caller_tree)
7454 ret = false;
7456 else
7458 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
7459 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
7461 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
7462 function can inline an SSE2 function but an SSE2 function can't inline
7463 an SSE4 function. */
7464 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
7465 != callee_opts->x_ix86_isa_flags)
7466 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
7467 != callee_opts->x_ix86_isa_flags2))
7468 ret = false;
7470 /* See if we have the same non-isa options. */
7471 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
7472 ret = false;
7474 /* See if arch, tune, etc. are the same. */
7475 else if (caller_opts->arch != callee_opts->arch)
7476 ret = false;
7478 else if (caller_opts->tune != callee_opts->tune)
7479 ret = false;
7481 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
7482 ret = false;
7484 else if (caller_opts->branch_cost != callee_opts->branch_cost)
7485 ret = false;
7487 else
7488 ret = true;
7491 return ret;
7495 /* Remember the last target of ix86_set_current_function. */
7496 static GTY(()) tree ix86_previous_fndecl;
7498 /* Set targets globals to the default (or current #pragma GCC target
7499 if active). Invalidate ix86_previous_fndecl cache. */
7501 void
7502 ix86_reset_previous_fndecl (void)
7504 tree new_tree = target_option_current_node;
7505 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7506 if (TREE_TARGET_GLOBALS (new_tree))
7507 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7508 else if (new_tree == target_option_default_node)
7509 restore_target_globals (&default_target_globals);
7510 else
7511 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7512 ix86_previous_fndecl = NULL_TREE;
7515 /* Set the func_type field from the function FNDECL. */
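/* Illustrative user-level declarations handled below (the type names are
   placeholders, not definitions made here):
     void __attribute__ ((interrupt)) handle_irq (struct iframe *frame);
     void __attribute__ ((interrupt)) handle_exc (struct iframe *frame,
                                                  uword_t error_code);
   One argument yields TYPE_INTERRUPT; two arguments (frame plus error
   code) yield TYPE_EXCEPTION.  */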
7517 static void
7518 ix86_set_func_type (tree fndecl)
7520 if (cfun->machine->func_type == TYPE_UNKNOWN)
7522 if (lookup_attribute ("interrupt",
7523 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7525 int nargs = 0;
7526 for (tree arg = DECL_ARGUMENTS (fndecl);
7527 arg;
7528 arg = TREE_CHAIN (arg))
7529 nargs++;
7530 cfun->machine->no_caller_saved_registers = true;
7531 cfun->machine->func_type
7532 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7534 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7536 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7537 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7538 sorry ("Only DWARF debug format is supported for interrupt "
7539 "service routine.");
7541 else
7543 cfun->machine->func_type = TYPE_NORMAL;
7544 if (lookup_attribute ("no_caller_saved_registers",
7545 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7546 cfun->machine->no_caller_saved_registers = true;
7551 /* Establish appropriate back-end context for processing the function
7552 FNDECL. The argument might be NULL to indicate processing at top
7553 level, outside of any function scope. */
7554 static void
7555 ix86_set_current_function (tree fndecl)
7557 /* Only change the context if the function changes. This hook is called
7558 several times in the course of compiling a function, and we don't want to
7559 slow things down too much or call target_reinit when it isn't safe. */
7560 if (fndecl == ix86_previous_fndecl)
7562 /* There may be 2 function bodies for the same function FNDECL,
7563 one is extern inline and one isn't. Call ix86_set_func_type
7564 to set the func_type field. */
7565 if (fndecl != NULL_TREE)
7566 ix86_set_func_type (fndecl);
7567 return;
7570 tree old_tree;
7571 if (ix86_previous_fndecl == NULL_TREE)
7572 old_tree = target_option_current_node;
7573 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7574 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7575 else
7576 old_tree = target_option_default_node;
7578 if (fndecl == NULL_TREE)
7580 if (old_tree != target_option_current_node)
7581 ix86_reset_previous_fndecl ();
7582 return;
7585 ix86_set_func_type (fndecl);
7587 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7588 if (new_tree == NULL_TREE)
7589 new_tree = target_option_default_node;
7591 if (old_tree != new_tree)
7593 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7594 if (TREE_TARGET_GLOBALS (new_tree))
7595 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7596 else if (new_tree == target_option_default_node)
7597 restore_target_globals (&default_target_globals);
7598 else
7599 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7601 ix86_previous_fndecl = fndecl;
7603 static bool prev_no_caller_saved_registers;
7605 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7606 Avoid expensive re-initialization of init_regs each time we switch
7607 function context. */
7608 if (TARGET_64BIT
7609 && (call_used_regs[SI_REG]
7610 == (cfun->machine->call_abi == MS_ABI)))
7611 reinit_regs ();
7612 /* Need to re-initialize init_regs if caller-saved registers are
7613 changed. */
7614 else if (prev_no_caller_saved_registers
7615 != cfun->machine->no_caller_saved_registers)
7616 reinit_regs ();
7618 if (cfun->machine->func_type != TYPE_NORMAL
7619 || cfun->machine->no_caller_saved_registers)
7621 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
7622 may change processor state. */
7623 const char *isa;
7624 if (TARGET_MPX)
7625 isa = "MPX";
7626 else if (TARGET_SSE)
7627 isa = "SSE";
7628 else if (TARGET_MMX)
7629 isa = "MMX/3Dnow";
7630 else if (TARGET_80387)
7631 isa = "80387";
7632 else
7633 isa = NULL;
7634 if (isa != NULL)
7636 if (cfun->machine->func_type != TYPE_NORMAL)
7637 sorry ("%s instructions aren't allowed in %s service routine",
7638 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7639 ? "exception" : "interrupt"));
7640 else
7641 sorry ("%s instructions aren't allowed in function with "
7642 "no_caller_saved_registers attribute", isa);
7643 /* Don't issue the same error twice. */
7644 cfun->machine->func_type = TYPE_NORMAL;
7645 cfun->machine->no_caller_saved_registers = false;
7649 prev_no_caller_saved_registers
7650 = cfun->machine->no_caller_saved_registers;
7654 /* Return true if this goes in large data/bss. */
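/* Illustrative background: with -mcmodel=medium, objects larger than
   -mlarge-data-threshold= (ix86_section_threshold) go into the
   .ldata/.lbss sections, which are not guaranteed to be addressable with
   32-bit displacements, while smaller objects stay in the ordinary
   sections.  */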
7656 static bool
7657 ix86_in_large_data_p (tree exp)
7659 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7660 return false;
7662 if (exp == NULL_TREE)
7663 return false;
7665 /* Functions are never large data. */
7666 if (TREE_CODE (exp) == FUNCTION_DECL)
7667 return false;
7669 /* Automatic variables are never large data. */
7670 if (VAR_P (exp) && !is_global_var (exp))
7671 return false;
7673 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
7675 const char *section = DECL_SECTION_NAME (exp);
7676 if (strcmp (section, ".ldata") == 0
7677 || strcmp (section, ".lbss") == 0)
7678 return true;
7679 return false;
7681 else
7683 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7685 /* If this is an incomplete type with size 0, then we can't put it
7686 in data because it might be too big when completed. Also,
7687 int_size_in_bytes returns -1 if the size can vary or is larger than
7688 an integer, in which case it is also safer to assume that it goes in
7689 large data. */
7690 if (size <= 0 || size > ix86_section_threshold)
7691 return true;
7694 return false;
7697 /* i386-specific section flag to mark large sections. */
7698 #define SECTION_LARGE SECTION_MACH_DEP
7700 /* Switch to the appropriate section for output of DECL.
7701 DECL is either a `VAR_DECL' node or a constant of some sort.
7702 RELOC indicates whether forming the initial value of DECL requires
7703 link-time relocations. */
7705 ATTRIBUTE_UNUSED static section *
7706 x86_64_elf_select_section (tree decl, int reloc,
7707 unsigned HOST_WIDE_INT align)
7709 if (ix86_in_large_data_p (decl))
7711 const char *sname = NULL;
7712 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7713 switch (categorize_decl_for_section (decl, reloc))
7715 case SECCAT_DATA:
7716 sname = ".ldata";
7717 break;
7718 case SECCAT_DATA_REL:
7719 sname = ".ldata.rel";
7720 break;
7721 case SECCAT_DATA_REL_LOCAL:
7722 sname = ".ldata.rel.local";
7723 break;
7724 case SECCAT_DATA_REL_RO:
7725 sname = ".ldata.rel.ro";
7726 break;
7727 case SECCAT_DATA_REL_RO_LOCAL:
7728 sname = ".ldata.rel.ro.local";
7729 break;
7730 case SECCAT_BSS:
7731 sname = ".lbss";
7732 flags |= SECTION_BSS;
7733 break;
7734 case SECCAT_RODATA:
7735 case SECCAT_RODATA_MERGE_STR:
7736 case SECCAT_RODATA_MERGE_STR_INIT:
7737 case SECCAT_RODATA_MERGE_CONST:
7738 sname = ".lrodata";
7739 flags &= ~SECTION_WRITE;
7740 break;
7741 case SECCAT_SRODATA:
7742 case SECCAT_SDATA:
7743 case SECCAT_SBSS:
7744 gcc_unreachable ();
7745 case SECCAT_TEXT:
7746 case SECCAT_TDATA:
7747 case SECCAT_TBSS:
7748 /* We don't split these for the medium model. Place them into
7749 default sections and hope for the best. */
7750 break;
7752 if (sname)
7754 /* We might get called with string constants, but get_named_section
7755 doesn't like them as they are not DECLs. Also, we need to set
7756 flags in that case. */
7757 if (!DECL_P (decl))
7758 return get_section (sname, flags, NULL);
7759 return get_named_section (decl, sname, reloc);
7762 return default_elf_select_section (decl, reloc, align);
7765 /* Select a set of attributes for section NAME based on the properties
7766 of DECL and whether or not RELOC indicates that DECL's initializer
7767 might contain runtime relocations. */
7769 static unsigned int ATTRIBUTE_UNUSED
7770 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7772 unsigned int flags = default_section_type_flags (decl, name, reloc);
7774 if (ix86_in_large_data_p (decl))
7775 flags |= SECTION_LARGE;
7777 if (decl == NULL_TREE
7778 && (strcmp (name, ".ldata.rel.ro") == 0
7779 || strcmp (name, ".ldata.rel.ro.local") == 0))
7780 flags |= SECTION_RELRO;
7782 if (strcmp (name, ".lbss") == 0
7783 || strncmp (name, ".lbss.", 5) == 0
7784 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
7785 flags |= SECTION_BSS;
7787 return flags;
7790 /* Build up a unique section name, expressed as a
7791 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7792 RELOC indicates whether the initial value of EXP requires
7793 link-time relocations. */
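/* For illustration: a large-data global "foo" categorized as SECCAT_DATA is
   given the section name ".ldata.foo", or ".gnu.linkonce.ld.foo" when it is
   one-only and COMDAT groups are unavailable.  */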
7795 static void ATTRIBUTE_UNUSED
7796 x86_64_elf_unique_section (tree decl, int reloc)
7798 if (ix86_in_large_data_p (decl))
7800 const char *prefix = NULL;
7801 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7802 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7804 switch (categorize_decl_for_section (decl, reloc))
7806 case SECCAT_DATA:
7807 case SECCAT_DATA_REL:
7808 case SECCAT_DATA_REL_LOCAL:
7809 case SECCAT_DATA_REL_RO:
7810 case SECCAT_DATA_REL_RO_LOCAL:
7811 prefix = one_only ? ".ld" : ".ldata";
7812 break;
7813 case SECCAT_BSS:
7814 prefix = one_only ? ".lb" : ".lbss";
7815 break;
7816 case SECCAT_RODATA:
7817 case SECCAT_RODATA_MERGE_STR:
7818 case SECCAT_RODATA_MERGE_STR_INIT:
7819 case SECCAT_RODATA_MERGE_CONST:
7820 prefix = one_only ? ".lr" : ".lrodata";
7821 break;
7822 case SECCAT_SRODATA:
7823 case SECCAT_SDATA:
7824 case SECCAT_SBSS:
7825 gcc_unreachable ();
7826 case SECCAT_TEXT:
7827 case SECCAT_TDATA:
7828 case SECCAT_TBSS:
7829 /* We don't split these for the medium model. Place them into
7830 default sections and hope for the best. */
7831 break;
7833 if (prefix)
7835 const char *name, *linkonce;
7836 char *string;
7838 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7839 name = targetm.strip_name_encoding (name);
7841 /* If we're using one_only, then there needs to be a .gnu.linkonce
7842 prefix to the section name. */
7843 linkonce = one_only ? ".gnu.linkonce" : "";
7845 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7847 set_decl_section_name (decl, string);
7848 return;
7851 default_unique_section (decl, reloc);
7854 #ifdef COMMON_ASM_OP
7856 #ifndef LARGECOMM_SECTION_ASM_OP
7857 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7858 #endif
7860 /* This says how to output assembler code to declare an
7861 uninitialized external linkage data object.
7863 For medium-model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive for
7864 large objects. */
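/* Roughly what gets emitted (illustrative, assuming the default
   COMMON_ASM_OP and LARGECOMM_SECTION_ASM_OP strings): a 1 MiB object
   "big_buf" above the section threshold produces
       .largecomm big_buf,1048576,32
   while smaller objects are emitted with ".comm" instead.  */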
7865 void
7866 x86_elf_aligned_decl_common (FILE *file, tree decl,
7867 const char *name, unsigned HOST_WIDE_INT size,
7868 int align)
7870 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7871 && size > (unsigned int)ix86_section_threshold)
7873 switch_to_section (get_named_section (decl, ".lbss", 0));
7874 fputs (LARGECOMM_SECTION_ASM_OP, file);
7876 else
7877 fputs (COMMON_ASM_OP, file);
7878 assemble_name (file, name);
7879 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7880 size, align / BITS_PER_UNIT);
7882 #endif
7884 /* Utility function for targets to use in implementing
7885 ASM_OUTPUT_ALIGNED_BSS. */
7887 void
7888 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7889 unsigned HOST_WIDE_INT size, int align)
7891 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7892 && size > (unsigned int)ix86_section_threshold)
7893 switch_to_section (get_named_section (decl, ".lbss", 0));
7894 else
7895 switch_to_section (bss_section);
7896 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7897 #ifdef ASM_DECLARE_OBJECT_NAME
7898 last_assemble_variable_decl = decl;
7899 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7900 #else
7901 /* The standard thing is just to output a label for the object. */
7902 ASM_OUTPUT_LABEL (file, name);
7903 #endif /* ASM_DECLARE_OBJECT_NAME */
7904 ASM_OUTPUT_SKIP (file, size ? size : 1);
7907 /* Decide whether we must probe the stack before any space allocation
7908 on this target. It's essentially TARGET_STACK_PROBE except when
7909 -fstack-check causes the stack to be already probed differently. */
7911 bool
7912 ix86_target_stack_probe (void)
7914 /* Do not probe the stack twice if static stack checking is enabled. */
7915 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7916 return false;
7918 return TARGET_STACK_PROBE;
7921 /* Decide whether we can make a sibling call to a function. DECL is the
7922 declaration of the function being targeted by the call and EXP is the
7923 CALL_EXPR representing the call. */
7925 static bool
7926 ix86_function_ok_for_sibcall (tree decl, tree exp)
7928 tree type, decl_or_type;
7929 rtx a, b;
7930 bool bind_global = decl && !targetm.binds_local_p (decl);
7932 /* Sibling call isn't OK if there are no caller-saved registers
7933 since all registers must be preserved before return. */
7934 if (cfun->machine->no_caller_saved_registers)
7935 return false;
7937 /* If we are generating position-independent code, we cannot sibcall
7938 optimize direct calls to global functions, as the PLT requires
7939 %ebx be live. (Darwin does not have a PLT.) */
7940 if (!TARGET_MACHO
7941 && !TARGET_64BIT
7942 && flag_pic
7943 && flag_plt
7944 && bind_global)
7945 return false;
7947 /* If we need to align the outgoing stack, then sibcalling would
7948 unalign the stack, which may break the called function. */
7949 if (ix86_minimum_incoming_stack_boundary (true)
7950 < PREFERRED_STACK_BOUNDARY)
7951 return false;
7953 if (decl)
7955 decl_or_type = decl;
7956 type = TREE_TYPE (decl);
7958 else
7960 /* We're looking at the CALL_EXPR, we need the type of the function. */
7961 type = CALL_EXPR_FN (exp); /* pointer expression */
7962 type = TREE_TYPE (type); /* pointer type */
7963 type = TREE_TYPE (type); /* function type */
7964 decl_or_type = type;
7967 /* Check that the return value locations are the same. Like
7968 if we are returning floats on the 80387 register stack, we cannot
7969 make a sibcall from a function that doesn't return a float to a
7970 function that does or, conversely, from a function that does return
7971 a float to a function that doesn't; the necessary stack adjustment
7972 would not be executed. This is also the place we notice
7973 differences in the return value ABI. Note that it is ok for one
7974 of the functions to have void return type as long as the return
7975 value of the other is passed in a register. */
7976 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
7977 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
7978 cfun->decl, false);
7979 if (STACK_REG_P (a) || STACK_REG_P (b))
7981 if (!rtx_equal_p (a, b))
7982 return false;
7984 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
7986 else if (!rtx_equal_p (a, b))
7987 return false;
7989 if (TARGET_64BIT)
7991 /* The SYSV ABI has more call-clobbered registers;
7992 disallow sibcalls from MS to SYSV. */
7993 if (cfun->machine->call_abi == MS_ABI
7994 && ix86_function_type_abi (type) == SYSV_ABI)
7995 return false;
7997 else
7999 /* If this call is indirect, we'll need to be able to use a
8000 call-clobbered register for the address of the target function.
8001 Make sure that all such registers are not used for passing
8002 parameters. Note that DLLIMPORT functions and call to global
8003 function via GOT slot are indirect. */
8004 if (!decl
8005 || (bind_global && flag_pic && !flag_plt)
8006 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
8008 /* Check if regparm >= 3 since arg_reg_available is set to
8009 false if regparm == 0. If regparm is 1 or 2, there is
8010 always a call-clobbered register available.
8012 ??? The symbol indirect call doesn't need a call-clobbered
8013 register. But we don't know if this is a symbol indirect
8014 call or not here. */
8015 if (ix86_function_regparm (type, NULL) >= 3
8016 && !cfun->machine->arg_reg_available)
8017 return false;
8021 /* Otherwise okay. That also includes certain types of indirect calls. */
8022 return true;
8025 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
8026 and "sseregparm" calling convention attributes;
8027 arguments as in struct attribute_spec.handler. */
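/* Typical (illustrative) uses of these attributes in user code:
     int __attribute__ ((regparm (3))) f (int a, int b, int c);
     int __attribute__ ((fastcall)) g (int a, int b);
   The handler below diagnoses incompatible combinations, e.g. regparm
   together with fastcall or thiscall.  */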
8029 static tree
8030 ix86_handle_cconv_attribute (tree *node, tree name,
8031 tree args,
8032 int,
8033 bool *no_add_attrs)
8035 if (TREE_CODE (*node) != FUNCTION_TYPE
8036 && TREE_CODE (*node) != METHOD_TYPE
8037 && TREE_CODE (*node) != FIELD_DECL
8038 && TREE_CODE (*node) != TYPE_DECL)
8040 warning (OPT_Wattributes, "%qE attribute only applies to functions",
8041 name);
8042 *no_add_attrs = true;
8043 return NULL_TREE;
8046 /* Can combine regparm with all attributes but fastcall, and thiscall. */
8047 if (is_attribute_p ("regparm", name))
8049 tree cst;
8051 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8053 error ("fastcall and regparm attributes are not compatible");
8056 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8058 error ("regparam and thiscall attributes are not compatible");
8061 cst = TREE_VALUE (args);
8062 if (TREE_CODE (cst) != INTEGER_CST)
8064 warning (OPT_Wattributes,
8065 "%qE attribute requires an integer constant argument",
8066 name);
8067 *no_add_attrs = true;
8069 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
8071 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
8072 name, REGPARM_MAX);
8073 *no_add_attrs = true;
8076 return NULL_TREE;
8079 if (TARGET_64BIT)
8081 /* Do not warn when emulating the MS ABI. */
8082 if ((TREE_CODE (*node) != FUNCTION_TYPE
8083 && TREE_CODE (*node) != METHOD_TYPE)
8084 || ix86_function_type_abi (*node) != MS_ABI)
8085 warning (OPT_Wattributes, "%qE attribute ignored",
8086 name);
8087 *no_add_attrs = true;
8088 return NULL_TREE;
8091 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
8092 if (is_attribute_p ("fastcall", name))
8094 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8096 error ("fastcall and cdecl attributes are not compatible");
8098 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8100 error ("fastcall and stdcall attributes are not compatible");
8102 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
8104 error ("fastcall and regparm attributes are not compatible");
8106 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8108 error ("fastcall and thiscall attributes are not compatible");
8112 /* Can combine stdcall with fastcall (redundant), regparm and
8113 sseregparm. */
8114 else if (is_attribute_p ("stdcall", name))
8116 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8118 error ("stdcall and cdecl attributes are not compatible");
8120 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8122 error ("stdcall and fastcall attributes are not compatible");
8124 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8126 error ("stdcall and thiscall attributes are not compatible");
8130 /* Can combine cdecl with regparm and sseregparm. */
8131 else if (is_attribute_p ("cdecl", name))
8133 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8135 error ("stdcall and cdecl attributes are not compatible");
8137 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8139 error ("fastcall and cdecl attributes are not compatible");
8141 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8143 error ("cdecl and thiscall attributes are not compatible");
8146 else if (is_attribute_p ("thiscall", name))
8148 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
8149 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
8150 name);
8151 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8153 error ("stdcall and thiscall attributes are not compatible");
8155 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8157 error ("fastcall and thiscall attributes are not compatible");
8159 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8161 error ("cdecl and thiscall attributes are not compatible");
8165 /* Can combine sseregparm with all attributes. */
8167 return NULL_TREE;
8170 /* The transactional memory builtins are implicitly regparm or fastcall
8171 depending on the ABI. Override the generic do-nothing attribute that
8172 these builtins were declared with, and replace it with one of the two
8173 attributes that we expect elsewhere. */
8175 static tree
8176 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
8177 int flags, bool *no_add_attrs)
8179 tree alt;
8181 /* In no case do we want to add the placeholder attribute. */
8182 *no_add_attrs = true;
8184 /* The 64-bit ABI is unchanged for transactional memory. */
8185 if (TARGET_64BIT)
8186 return NULL_TREE;
8188 /* ??? Is there a better way to validate 32-bit windows? We have
8189 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
8190 if (CHECK_STACK_LIMIT > 0)
8191 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
8192 else
8194 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
8195 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
8197 decl_attributes (node, alt, flags);
8199 return NULL_TREE;
8202 /* This function determines the calling convention from TYPE. */
8204 unsigned int
8205 ix86_get_callcvt (const_tree type)
8207 unsigned int ret = 0;
8208 bool is_stdarg;
8209 tree attrs;
8211 if (TARGET_64BIT)
8212 return IX86_CALLCVT_CDECL;
8214 attrs = TYPE_ATTRIBUTES (type);
8215 if (attrs != NULL_TREE)
8217 if (lookup_attribute ("cdecl", attrs))
8218 ret |= IX86_CALLCVT_CDECL;
8219 else if (lookup_attribute ("stdcall", attrs))
8220 ret |= IX86_CALLCVT_STDCALL;
8221 else if (lookup_attribute ("fastcall", attrs))
8222 ret |= IX86_CALLCVT_FASTCALL;
8223 else if (lookup_attribute ("thiscall", attrs))
8224 ret |= IX86_CALLCVT_THISCALL;
8226 /* Regparm isn't allowed for thiscall and fastcall. */
8227 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
8229 if (lookup_attribute ("regparm", attrs))
8230 ret |= IX86_CALLCVT_REGPARM;
8231 if (lookup_attribute ("sseregparm", attrs))
8232 ret |= IX86_CALLCVT_SSEREGPARM;
8235 if (IX86_BASE_CALLCVT(ret) != 0)
8236 return ret;
8239 is_stdarg = stdarg_p (type);
8240 if (TARGET_RTD && !is_stdarg)
8241 return IX86_CALLCVT_STDCALL | ret;
8243 if (ret != 0
8244 || is_stdarg
8245 || TREE_CODE (type) != METHOD_TYPE
8246 || ix86_function_type_abi (type) != MS_ABI)
8247 return IX86_CALLCVT_CDECL | ret;
8249 return IX86_CALLCVT_THISCALL;
8252 /* Return 0 if the attributes for two types are incompatible, 1 if they
8253 are compatible, and 2 if they are nearly compatible (which causes a
8254 warning to be generated). */
8256 static int
8257 ix86_comp_type_attributes (const_tree type1, const_tree type2)
8259 unsigned int ccvt1, ccvt2;
8261 if (TREE_CODE (type1) != FUNCTION_TYPE
8262 && TREE_CODE (type1) != METHOD_TYPE)
8263 return 1;
8265 ccvt1 = ix86_get_callcvt (type1);
8266 ccvt2 = ix86_get_callcvt (type2);
8267 if (ccvt1 != ccvt2)
8268 return 0;
8269 if (ix86_function_regparm (type1, NULL)
8270 != ix86_function_regparm (type2, NULL))
8271 return 0;
8273 return 1;
8276 /* Return the regparm value for a function with the indicated TYPE and DECL.
8277 DECL may be NULL when calling function indirectly
8278 or considering a libcall. */
8280 static int
8281 ix86_function_regparm (const_tree type, const_tree decl)
8283 tree attr;
8284 int regparm;
8285 unsigned int ccvt;
8287 if (TARGET_64BIT)
8288 return (ix86_function_type_abi (type) == SYSV_ABI
8289 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
8290 ccvt = ix86_get_callcvt (type);
8291 regparm = ix86_regparm;
8293 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
8295 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
8296 if (attr)
8298 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
8299 return regparm;
8302 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8303 return 2;
8304 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8305 return 1;
8307 /* Use register calling convention for local functions when possible. */
8308 if (decl
8309 && TREE_CODE (decl) == FUNCTION_DECL)
8311 cgraph_node *target = cgraph_node::get (decl);
8312 if (target)
8313 target = target->function_symbol ();
8315 /* Caller and callee must agree on the calling convention, so
8316 checking just the optimize attribute here would mean that with
8317 __attribute__((optimize (...))) the caller could use the regparm convention
8318 and the callee not, or vice versa. Instead look at whether the callee
8319 is optimized or not. */
8320 if (target && opt_for_fn (target->decl, optimize)
8321 && !(profile_flag && !flag_fentry))
8323 cgraph_local_info *i = &target->local;
8324 if (i && i->local && i->can_change_signature)
8326 int local_regparm, globals = 0, regno;
8328 /* Make sure no regparm register is taken by a
8329 fixed register variable. */
8330 for (local_regparm = 0; local_regparm < REGPARM_MAX;
8331 local_regparm++)
8332 if (fixed_regs[local_regparm])
8333 break;
8335 /* We don't want to use regparm(3) for nested functions as
8336 these use a static chain pointer in the third argument. */
8337 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
8338 local_regparm = 2;
8340 /* Save a register for the split stack. */
8341 if (flag_split_stack)
8343 if (local_regparm == 3)
8344 local_regparm = 2;
8345 else if (local_regparm == 2
8346 && DECL_STATIC_CHAIN (target->decl))
8347 local_regparm = 1;
8350 /* Each fixed register usage increases register pressure,
8351 so fewer registers should be used for argument passing.
8352 This functionality can be overridden by an explicit
8353 regparm value. */
8354 for (regno = AX_REG; regno <= DI_REG; regno++)
8355 if (fixed_regs[regno])
8356 globals++;
8358 local_regparm
8359 = globals < local_regparm ? local_regparm - globals : 0;
8361 if (local_regparm > regparm)
8362 regparm = local_regparm;
8367 return regparm;
8370 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
8371 DFmode (2) arguments in SSE registers for a function with the
8372 indicated TYPE and DECL. DECL may be NULL when calling the function
8373 indirectly or considering a libcall. Return -1 if any FP parameter
8374 should be rejected with an error; this is used in situations where we
8375 imply the SSE calling convention but the function is called from another
8376 function with SSE disabled. Otherwise return 0. */
8378 static int
8379 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
8381 gcc_assert (!TARGET_64BIT);
8383 /* Use SSE registers to pass SFmode and DFmode arguments if requested
8384 by the sseregparm attribute. */
8385 if (TARGET_SSEREGPARM
8386 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
8388 if (!TARGET_SSE)
8390 if (warn)
8392 if (decl)
8393 error ("calling %qD with attribute sseregparm without "
8394 "SSE/SSE2 enabled", decl);
8395 else
8396 error ("calling %qT with attribute sseregparm without "
8397 "SSE/SSE2 enabled", type);
8399 return 0;
8402 return 2;
8405 if (!decl)
8406 return 0;
8408 cgraph_node *target = cgraph_node::get (decl);
8409 if (target)
8410 target = target->function_symbol ();
8412 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
8413 (and DFmode for SSE2) arguments in SSE registers. */
8414 if (target
8415 /* TARGET_SSE_MATH */
8416 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
8417 && opt_for_fn (target->decl, optimize)
8418 && !(profile_flag && !flag_fentry))
8420 cgraph_local_info *i = &target->local;
8421 if (i && i->local && i->can_change_signature)
8423 /* Refuse to produce wrong code when local function with SSE enabled
8424 is called from SSE disabled function.
8425 FIXME: We need a way to detect these cases cross-ltrans partition
8426 and avoid using SSE calling conventions on local functions called
8427 from function with SSE disabled. For now at least delay the
8428 warning until we know we are going to produce wrong code.
8429 See PR66047 */
8430 if (!TARGET_SSE && warn)
8431 return -1;
8432 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
8433 ->x_ix86_isa_flags) ? 2 : 1;
8437 return 0;
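/* Editorial sketch, not part of the original source; `dot' is a hypothetical
   name.  With -m32 -msse2,

     double __attribute__((sseregparm)) dot (double x, double y);

   passes x and y in %xmm0 and %xmm1, and the function above returns 2
   (both SFmode and DFmode arguments go in SSE registers); without SSE
   enabled the error issued above is reported instead.  */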
8440 /* Return true if EAX is live at the start of the function. Used by
8441 ix86_expand_prologue to determine if we need special help before
8442 calling allocate_stack_worker. */
8444 static bool
8445 ix86_eax_live_at_start_p (void)
8447 /* Cheat. Don't bother working forward from ix86_function_regparm
8448 to the function type to whether an actual argument is located in
8449 eax. Instead just look at cfg info, which is still close enough
8450 to correct at this point. This gives false positives for broken
8451 functions that might use uninitialized data that happens to be
8452 allocated in eax, but who cares? */
8453 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
8456 static bool
8457 ix86_keep_aggregate_return_pointer (tree fntype)
8459 tree attr;
8461 if (!TARGET_64BIT)
8463 attr = lookup_attribute ("callee_pop_aggregate_return",
8464 TYPE_ATTRIBUTES (fntype));
8465 if (attr)
8466 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
8468 /* For the 32-bit MS ABI the default is to keep the aggregate
8469 return pointer. */
8470 if (ix86_function_type_abi (fntype) == MS_ABI)
8471 return true;
8473 return KEEP_AGGREGATE_RETURN_POINTER != 0;
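/* Editorial example, not part of the original source; `make_pt' is a
   hypothetical name.  On 32-bit targets,

     struct pt { int x, y, z; };
     struct pt __attribute__((callee_pop_aggregate_return (0))) make_pt (void);

   asks that the caller pop the hidden aggregate-return pointer (the function
   above then returns true), while an argument of 1 requests callee-pop, the
   usual SysV i386 behaviour handled in ix86_return_pops_args below.  */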
8476 /* Value is the number of bytes of arguments automatically
8477 popped when returning from a subroutine call.
8478 FUNDECL is the declaration node of the function (as a tree),
8479 FUNTYPE is the data type of the function (as a tree),
8480 or for a library call it is an identifier node for the subroutine name.
8481 SIZE is the number of bytes of arguments passed on the stack.
8483 On the 80386, the RTD insn may be used to pop them if the number
8484 of args is fixed, but if the number is variable then the caller
8485 must pop them all. RTD can't be used for library calls now
8486 because the library is compiled with the Unix compiler.
8487 Use of RTD is a selectable option, since it is incompatible with
8488 standard Unix calling sequences. If the option is not selected,
8489 the caller must always pop the args.
8491 The attribute stdcall is equivalent to RTD on a per module basis. */
8493 static int
8494 ix86_return_pops_args (tree fundecl, tree funtype, int size)
8496 unsigned int ccvt;
8498 /* None of the 64-bit ABIs pop arguments. */
8499 if (TARGET_64BIT)
8500 return 0;
8502 ccvt = ix86_get_callcvt (funtype);
8504 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
8505 | IX86_CALLCVT_THISCALL)) != 0
8506 && ! stdarg_p (funtype))
8507 return size;
8509 /* Lose any fake structure return argument if it is passed on the stack. */
8510 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
8511 && !ix86_keep_aggregate_return_pointer (funtype))
8513 int nregs = ix86_function_regparm (funtype, fundecl);
8514 if (nregs == 0)
8515 return GET_MODE_SIZE (Pmode);
8518 return 0;
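/* Editorial example, not part of the original source; `f' is a hypothetical
   name.  For

     void __attribute__((stdcall)) f (int a, int b);

   on 32-bit targets the function above returns 8, so the callee pops its own
   arguments with `ret $8'; a variadic or plain cdecl function returns with a
   bare `ret' and the caller adjusts %esp.  */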
8521 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
8523 static bool
8524 ix86_legitimate_combined_insn (rtx_insn *insn)
8526 int i;
8528 /* Check operand constraints in case hard registers were propagated
8529 into insn pattern. This check prevents combine pass from
8530 generating insn patterns with invalid hard register operands.
8531 These invalid insns can eventually confuse reload to error out
8532 with a spill failure. See also PRs 46829 and 46843. */
8534 gcc_assert (INSN_CODE (insn) >= 0);
8536 extract_insn (insn);
8537 preprocess_constraints (insn);
8539 int n_operands = recog_data.n_operands;
8540 int n_alternatives = recog_data.n_alternatives;
8541 for (i = 0; i < n_operands; i++)
8543 rtx op = recog_data.operand[i];
8544 machine_mode mode = GET_MODE (op);
8545 const operand_alternative *op_alt;
8546 int offset = 0;
8547 bool win;
8548 int j;
8550 /* A unary operator may be accepted by the predicate, but it
8551 is irrelevant for matching constraints. */
8552 if (UNARY_P (op))
8553 op = XEXP (op, 0);
8555 if (SUBREG_P (op))
8557 if (REG_P (SUBREG_REG (op))
8558 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8559 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8560 GET_MODE (SUBREG_REG (op)),
8561 SUBREG_BYTE (op),
8562 GET_MODE (op));
8563 op = SUBREG_REG (op);
8566 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8567 continue;
8569 op_alt = recog_op_alt;
8571 /* Operand has no constraints, anything is OK. */
8572 win = !n_alternatives;
8574 alternative_mask preferred = get_preferred_alternatives (insn);
8575 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8577 if (!TEST_BIT (preferred, j))
8578 continue;
8579 if (op_alt[i].anything_ok
8580 || (op_alt[i].matches != -1
8581 && operands_match_p
8582 (recog_data.operand[i],
8583 recog_data.operand[op_alt[i].matches]))
8584 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8586 win = true;
8587 break;
8591 if (!win)
8592 return false;
8595 return true;
8598 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
8600 static unsigned HOST_WIDE_INT
8601 ix86_asan_shadow_offset (void)
8603 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8604 : HOST_WIDE_INT_C (0x7fff8000))
8605 : (HOST_WIDE_INT_1 << 29);
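/* Editorial sketch of how the offset above is consumed; `addr' and
   `shadow_offset' are illustrative names, not GCC variables.
   AddressSanitizer maps every 8 bytes of application memory to one shadow
   byte, roughly

     unsigned char *shadow = (unsigned char *) ((addr >> 3) + shadow_offset);

   so the constant is 0x7fff8000 on LP64 Linux and 1 << 29 on 32-bit and x32
   targets, matching the values returned above.  */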
8608 /* Argument support functions. */
8610 /* Return true when register may be used to pass function parameters. */
8611 bool
8612 ix86_function_arg_regno_p (int regno)
8614 int i;
8615 enum calling_abi call_abi;
8616 const int *parm_regs;
8618 if (TARGET_MPX && BND_REGNO_P (regno))
8619 return true;
8621 if (!TARGET_64BIT)
8623 if (TARGET_MACHO)
8624 return (regno < REGPARM_MAX
8625 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8626 else
8627 return (regno < REGPARM_MAX
8628 || (TARGET_MMX && MMX_REGNO_P (regno)
8629 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8630 || (TARGET_SSE && SSE_REGNO_P (regno)
8631 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8634 if (TARGET_SSE && SSE_REGNO_P (regno)
8635 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8636 return true;
8638 /* TODO: The function should depend on current function ABI but
8639 builtins.c would need updating then. Therefore we use the
8640 default ABI. */
8641 call_abi = ix86_cfun_abi ();
8643 /* RAX is used as hidden argument to va_arg functions. */
8644 if (call_abi == SYSV_ABI && regno == AX_REG)
8645 return true;
8647 if (call_abi == MS_ABI)
8648 parm_regs = x86_64_ms_abi_int_parameter_registers;
8649 else
8650 parm_regs = x86_64_int_parameter_registers;
8652 for (i = 0; i < (call_abi == MS_ABI
8653 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8654 if (regno == parm_regs[i])
8655 return true;
8656 return false;
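/* Editorial note, not part of the original source: the integer argument
   registers accepted above are, in order,
     SysV x86-64:  RDI, RSI, RDX, RCX, R8, R9  (x86_64_int_parameter_registers)
     MS x64:       RCX, RDX, R8, R9            (x86_64_ms_abi_int_parameter_registers)
   plus XMM0-XMM7 (SysV) or XMM0-XMM3 (MS) for floating-point and vector
   arguments.  */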
8659 /* Return if we do not know how to pass TYPE solely in registers. */
8661 static bool
8662 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8664 if (must_pass_in_stack_var_size_or_pad (mode, type))
8665 return true;
8667 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8668 The layout_type routine is crafty and tries to trick us into passing
8669 currently unsupported vector types on the stack by using TImode. */
8670 return (!TARGET_64BIT && mode == TImode
8671 && type && TREE_CODE (type) != VECTOR_TYPE);
8674 /* Return the size, in bytes, of the area reserved for arguments passed
8675 in registers for the function represented by FNDECL, depending on the
8676 ABI used. */
8677 int
8678 ix86_reg_parm_stack_space (const_tree fndecl)
8680 enum calling_abi call_abi = SYSV_ABI;
8681 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8682 call_abi = ix86_function_abi (fndecl);
8683 else
8684 call_abi = ix86_function_type_abi (fndecl);
8685 if (TARGET_64BIT && call_abi == MS_ABI)
8686 return 32;
8687 return 0;
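/* Editorial sketch, not part of the original source; `g' is a hypothetical
   callee.  The 32 bytes above are the MS x64 register "home" (shadow) area
   that every caller must reserve:

     subq  $32, %rsp        reserve home area (plus any alignment padding)
     call  g
     addq  $32, %rsp

   SysV targets reserve no such area, hence the return value 0.  */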
8690 /* We add this as a workaround in order to use libc_has_function
8691 hook in i386.md. */
8692 bool
8693 ix86_libc_has_function (enum function_class fn_class)
8695 return targetm.libc_has_function (fn_class);
8698 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE,
8699 specifying the calling ABI used. */
8700 enum calling_abi
8701 ix86_function_type_abi (const_tree fntype)
8703 enum calling_abi abi = ix86_abi;
8705 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8706 return abi;
8708 if (abi == SYSV_ABI
8709 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8711 if (TARGET_X32)
8712 error ("X32 does not support ms_abi attribute");
8714 abi = MS_ABI;
8716 else if (abi == MS_ABI
8717 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8718 abi = SYSV_ABI;
8720 return abi;
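/* Editorial example, not part of the original source; `wincall' and
   `nixcall' are hypothetical names:

     void __attribute__((ms_abi))   wincall (int a, int b, int c, int d, int e);
     void __attribute__((sysv_abi)) nixcall (int a, int b, int c, int d, int e);

   The first uses the Microsoft x64 convention (RCX, RDX, R8, R9, then the
   stack) even on a SysV default target, the second the reverse; on x32 the
   ms_abi form is rejected with the error above.  */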
8723 static enum calling_abi
8724 ix86_function_abi (const_tree fndecl)
8726 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8729 /* Return SYSV_ABI or MS_ABI, depending on cfun,
8730 specifying the calling ABI used. */
8731 enum calling_abi
8732 ix86_cfun_abi (void)
8734 return cfun ? cfun->machine->call_abi : ix86_abi;
8737 static bool
8738 ix86_function_ms_hook_prologue (const_tree fn)
8740 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8742 if (decl_function_context (fn) != NULL_TREE)
8743 error_at (DECL_SOURCE_LOCATION (fn),
8744 "ms_hook_prologue is not compatible with nested function");
8745 else
8746 return true;
8748 return false;
8751 /* Write the extra assembler code needed to declare a function properly. */
8753 void
8754 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8755 tree decl)
8757 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8759 if (is_ms_hook)
8761 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8762 unsigned int filler_cc = 0xcccccccc;
8764 for (i = 0; i < filler_count; i += 4)
8765 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8768 #ifdef SUBTARGET_ASM_UNWIND_INIT
8769 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8770 #endif
8772 ASM_OUTPUT_LABEL (asm_out_file, fname);
8774 /* Output magic byte marker, if hot-patch attribute is set. */
8775 if (is_ms_hook)
8777 if (TARGET_64BIT)
8779 /* leaq [%rsp + 0], %rsp */
8780 asm_fprintf (asm_out_file, ASM_BYTE
8781 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
8783 else
8785 /* movl.s %edi, %edi
8786 push %ebp
8787 movl.s %esp, %ebp */
8788 asm_fprintf (asm_out_file, ASM_BYTE
8789 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
8794 /* Implementation of call abi switching target hook. Specific to FNDECL
8795 the specific call register sets are set. See also
8796 ix86_conditional_register_usage for more details. */
8797 void
8798 ix86_call_abi_override (const_tree fndecl)
8800 cfun->machine->call_abi = ix86_function_abi (fndecl);
8803 /* Return 1 if pseudo register should be created and used to hold
8804 GOT address for PIC code. */
8805 bool
8806 ix86_use_pseudo_pic_reg (void)
8808 if ((TARGET_64BIT
8809 && (ix86_cmodel == CM_SMALL_PIC
8810 || TARGET_PECOFF))
8811 || !flag_pic)
8812 return false;
8813 return true;
8816 /* Initialize large model PIC register. */
8818 static void
8819 ix86_init_large_pic_reg (unsigned int tmp_regno)
8821 rtx_code_label *label;
8822 rtx tmp_reg;
8824 gcc_assert (Pmode == DImode);
8825 label = gen_label_rtx ();
8826 emit_label (label);
8827 LABEL_PRESERVE_P (label) = 1;
8828 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8829 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8830 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8831 label));
8832 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8833 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8834 pic_offset_table_rtx, tmp_reg));
8837 /* Create and initialize PIC register if required. */
8838 static void
8839 ix86_init_pic_reg (void)
8841 edge entry_edge;
8842 rtx_insn *seq;
8844 if (!ix86_use_pseudo_pic_reg ())
8845 return;
8847 start_sequence ();
8849 if (TARGET_64BIT)
8851 if (ix86_cmodel == CM_LARGE_PIC)
8852 ix86_init_large_pic_reg (R11_REG);
8853 else
8854 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8856 else
8858 /* If there is a future mcount call in the function, it is more profitable
8859 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8860 rtx reg = crtl->profile
8861 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8862 : pic_offset_table_rtx;
8863 rtx_insn *insn = emit_insn (gen_set_got (reg));
8864 RTX_FRAME_RELATED_P (insn) = 1;
8865 if (crtl->profile)
8866 emit_move_insn (pic_offset_table_rtx, reg);
8867 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8870 seq = get_insns ();
8871 end_sequence ();
8873 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8874 insert_insn_on_edge (seq, entry_edge);
8875 commit_one_edge_insertion (entry_edge);
8878 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8879 for a call to a function whose data type is FNTYPE.
8880 For a library call, FNTYPE is 0. */
8882 void
8883 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8884 tree fntype, /* tree ptr for function decl */
8885 rtx libname, /* SYMBOL_REF of library name or 0 */
8886 tree fndecl,
8887 int caller)
8889 struct cgraph_local_info *i = NULL;
8890 struct cgraph_node *target = NULL;
8892 memset (cum, 0, sizeof (*cum));
8894 if (fndecl)
8896 target = cgraph_node::get (fndecl);
8897 if (target)
8899 target = target->function_symbol ();
8900 i = cgraph_node::local_info (target->decl);
8901 cum->call_abi = ix86_function_abi (target->decl);
8903 else
8904 cum->call_abi = ix86_function_abi (fndecl);
8906 else
8907 cum->call_abi = ix86_function_type_abi (fntype);
8909 cum->caller = caller;
8911 /* Set up the number of registers to use for passing arguments. */
8912 cum->nregs = ix86_regparm;
8913 if (TARGET_64BIT)
8915 cum->nregs = (cum->call_abi == SYSV_ABI
8916 ? X86_64_REGPARM_MAX
8917 : X86_64_MS_REGPARM_MAX);
8919 if (TARGET_SSE)
8921 cum->sse_nregs = SSE_REGPARM_MAX;
8922 if (TARGET_64BIT)
8924 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8925 ? X86_64_SSE_REGPARM_MAX
8926 : X86_64_MS_SSE_REGPARM_MAX);
8929 if (TARGET_MMX)
8930 cum->mmx_nregs = MMX_REGPARM_MAX;
8931 cum->warn_avx512f = true;
8932 cum->warn_avx = true;
8933 cum->warn_sse = true;
8934 cum->warn_mmx = true;
8936 /* Because types might mismatch between caller and callee, we need to
8937 use the actual type of the function for local calls.
8938 FIXME: cgraph_analyze can be told to actually record if a function uses
8939 va_start, so for local functions maybe_vaarg can be made more aggressive,
8940 helping K&R code.
8941 FIXME: once the type system is fixed, we won't need this code anymore. */
8942 if (i && i->local && i->can_change_signature)
8943 fntype = TREE_TYPE (target->decl);
8944 cum->stdarg = stdarg_p (fntype);
8945 cum->maybe_vaarg = (fntype
8946 ? (!prototype_p (fntype) || stdarg_p (fntype))
8947 : !libname);
8949 cum->bnd_regno = FIRST_BND_REG;
8950 cum->bnds_in_bt = 0;
8951 cum->force_bnd_pass = 0;
8952 cum->decl = fndecl;
8954 if (!TARGET_64BIT)
8956 /* If there are variable arguments, then we won't pass anything
8957 in registers in 32-bit mode. */
8958 if (stdarg_p (fntype))
8960 cum->nregs = 0;
8961 /* Since in 32-bit mode variable arguments are always passed on
8962 the stack, there is a scratch register available for an indirect
8963 sibcall. */
8964 cfun->machine->arg_reg_available = true;
8965 cum->sse_nregs = 0;
8966 cum->mmx_nregs = 0;
8967 cum->warn_avx512f = false;
8968 cum->warn_avx = false;
8969 cum->warn_sse = false;
8970 cum->warn_mmx = false;
8971 return;
8974 /* Use ecx and edx registers if function has fastcall attribute,
8975 else look for regparm information. */
8976 if (fntype)
8978 unsigned int ccvt = ix86_get_callcvt (fntype);
8979 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8981 cum->nregs = 1;
8982 cum->fastcall = 1; /* Same first register as in fastcall. */
8984 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8986 cum->nregs = 2;
8987 cum->fastcall = 1;
8989 else
8990 cum->nregs = ix86_function_regparm (fntype, fndecl);
8993 /* Set up the number of SSE registers used for passing SFmode
8994 and DFmode arguments. Warn for mismatching ABI. */
8995 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
8998 cfun->machine->arg_reg_available = (cum->nregs > 0);
9001 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
9002 But in the case of vector types, it is some vector mode.
9004 When we have only some of our vector isa extensions enabled, then there
9005 are some modes for which vector_mode_supported_p is false. For these
9006 modes, the generic vector support in gcc will choose some non-vector mode
9007 in order to implement the type. By computing the natural mode, we'll
9008 select the proper ABI location for the operand and not depend on whatever
9009 the middle-end decides to do with these vector types.
9011 The middle-end can't deal with vector types > 16 bytes. In this
9012 case, we return the original mode and warn about the ABI change if CUM
9013 isn't NULL.
9015 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
9016 available for the function return value. */
9018 static machine_mode
9019 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
9020 bool in_return)
9022 machine_mode mode = TYPE_MODE (type);
9024 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
9026 HOST_WIDE_INT size = int_size_in_bytes (type);
9027 if ((size == 8 || size == 16 || size == 32 || size == 64)
9028 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
9029 && TYPE_VECTOR_SUBPARTS (type) > 1)
9031 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
9033 /* There are no XFmode vector modes. */
9034 if (innermode == XFmode)
9035 return mode;
9037 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
9038 mode = MIN_MODE_VECTOR_FLOAT;
9039 else
9040 mode = MIN_MODE_VECTOR_INT;
9042 /* Get the mode which has this inner mode and number of units. */
9043 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
9044 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
9045 && GET_MODE_INNER (mode) == innermode)
9047 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
9049 static bool warnedavx512f;
9050 static bool warnedavx512f_ret;
9052 if (cum && cum->warn_avx512f && !warnedavx512f)
9054 if (warning (OPT_Wpsabi, "AVX512F vector argument "
9055 "without AVX512F enabled changes the ABI"))
9056 warnedavx512f = true;
9058 else if (in_return && !warnedavx512f_ret)
9060 if (warning (OPT_Wpsabi, "AVX512F vector return "
9061 "without AVX512F enabled changes the ABI"))
9062 warnedavx512f_ret = true;
9065 return TYPE_MODE (type);
9067 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
9069 static bool warnedavx;
9070 static bool warnedavx_ret;
9072 if (cum && cum->warn_avx && !warnedavx)
9074 if (warning (OPT_Wpsabi, "AVX vector argument "
9075 "without AVX enabled changes the ABI"))
9076 warnedavx = true;
9078 else if (in_return && !warnedavx_ret)
9080 if (warning (OPT_Wpsabi, "AVX vector return "
9081 "without AVX enabled changes the ABI"))
9082 warnedavx_ret = true;
9085 return TYPE_MODE (type);
9087 else if (((size == 8 && TARGET_64BIT) || size == 16)
9088 && !TARGET_SSE
9089 && !TARGET_IAMCU)
9091 static bool warnedsse;
9092 static bool warnedsse_ret;
9094 if (cum && cum->warn_sse && !warnedsse)
9096 if (warning (OPT_Wpsabi, "SSE vector argument "
9097 "without SSE enabled changes the ABI"))
9098 warnedsse = true;
9100 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
9102 if (warning (OPT_Wpsabi, "SSE vector return "
9103 "without SSE enabled changes the ABI"))
9104 warnedsse_ret = true;
9107 else if ((size == 8 && !TARGET_64BIT)
9108 && (!cfun
9109 || cfun->machine->func_type == TYPE_NORMAL)
9110 && !TARGET_MMX
9111 && !TARGET_IAMCU)
9113 static bool warnedmmx;
9114 static bool warnedmmx_ret;
9116 if (cum && cum->warn_mmx && !warnedmmx)
9118 if (warning (OPT_Wpsabi, "MMX vector argument "
9119 "without MMX enabled changes the ABI"))
9120 warnedmmx = true;
9122 else if (in_return && !warnedmmx_ret)
9124 if (warning (OPT_Wpsabi, "MMX vector return "
9125 "without MMX enabled changes the ABI"))
9126 warnedmmx_ret = true;
9129 return mode;
9132 gcc_unreachable ();
9136 return mode;
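/* Editorial reproducer for the -Wpsabi diagnostics above, not part of the
   original source; the type name `v8sf' is illustrative:

     typedef float v8sf __attribute__((vector_size (32)));
     v8sf pass_through (v8sf x) { return x; }

   Compiled for x86-64 without -mavx this warns "AVX vector argument without
   AVX enabled changes the ABI", since the 32-byte vector is then passed in
   memory instead of a YMM register.  */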
9139 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
9140 this may not agree with the mode that the type system has chosen for the
9141 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
9142 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
9144 static rtx
9145 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
9146 unsigned int regno)
9148 rtx tmp;
9150 if (orig_mode != BLKmode)
9151 tmp = gen_rtx_REG (orig_mode, regno);
9152 else
9154 tmp = gen_rtx_REG (mode, regno);
9155 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
9156 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
9159 return tmp;
9162 /* x86-64 register passing implementation. See the x86-64 psABI for details.
9163 The goal of this code is to classify each eightbyte of an incoming argument
9164 by register class and assign registers accordingly. */
9166 /* Return the union class of CLASS1 and CLASS2.
9167 See the x86-64 PS ABI for details. */
9169 static enum x86_64_reg_class
9170 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
9172 /* Rule #1: If both classes are equal, this is the resulting class. */
9173 if (class1 == class2)
9174 return class1;
9176 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
9177 the other class. */
9178 if (class1 == X86_64_NO_CLASS)
9179 return class2;
9180 if (class2 == X86_64_NO_CLASS)
9181 return class1;
9183 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
9184 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
9185 return X86_64_MEMORY_CLASS;
9187 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
9188 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
9189 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
9190 return X86_64_INTEGERSI_CLASS;
9191 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
9192 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
9193 return X86_64_INTEGER_CLASS;
9195 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
9196 MEMORY is used. */
9197 if (class1 == X86_64_X87_CLASS
9198 || class1 == X86_64_X87UP_CLASS
9199 || class1 == X86_64_COMPLEX_X87_CLASS
9200 || class2 == X86_64_X87_CLASS
9201 || class2 == X86_64_X87UP_CLASS
9202 || class2 == X86_64_COMPLEX_X87_CLASS)
9203 return X86_64_MEMORY_CLASS;
9205 /* Rule #6: Otherwise class SSE is used. */
9206 return X86_64_SSE_CLASS;
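/* Editorial worked instances of the merge rules above:

     merge_classes (X86_64_NO_CLASS, X86_64_SSE_CLASS)
       returns X86_64_SSE_CLASS                        (rule #2)
     merge_classes (X86_64_INTEGERSI_CLASS, X86_64_SSESF_CLASS)
       returns X86_64_INTEGERSI_CLASS                  (rule #4)

   so an eightbyte containing both an int and a float ends up in an integer
   register.  */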
9209 /* Classify the argument of type TYPE and mode MODE.
9210 CLASSES will be filled by the register class used to pass each word
9211 of the operand. The number of words is returned. In case the parameter
9212 should be passed in memory, 0 is returned. As a special case for zero
9213 sized containers, classes[0] will be NO_CLASS and 1 is returned.
9215 BIT_OFFSET is used internally for handling records; it specifies the
9216 offset within the containing object, in bits modulo 512, to avoid overflow cases.
9218 See the x86-64 PS ABI for details.
9221 static int
9222 classify_argument (machine_mode mode, const_tree type,
9223 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
9225 HOST_WIDE_INT bytes =
9226 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9227 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
9229 /* Variable sized entities are always passed/returned in memory. */
9230 if (bytes < 0)
9231 return 0;
9233 if (mode != VOIDmode
9234 && targetm.calls.must_pass_in_stack (mode, type))
9235 return 0;
9237 if (type && AGGREGATE_TYPE_P (type))
9239 int i;
9240 tree field;
9241 enum x86_64_reg_class subclasses[MAX_CLASSES];
9243 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
9244 if (bytes > 64)
9245 return 0;
9247 for (i = 0; i < words; i++)
9248 classes[i] = X86_64_NO_CLASS;
9250 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
9251 signal the memory class, so handle this as a special case. */
9252 if (!words)
9254 classes[0] = X86_64_NO_CLASS;
9255 return 1;
9258 /* Classify each field of record and merge classes. */
9259 switch (TREE_CODE (type))
9261 case RECORD_TYPE:
9262 /* And now merge the fields of structure. */
9263 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9265 if (TREE_CODE (field) == FIELD_DECL)
9267 int num;
9269 if (TREE_TYPE (field) == error_mark_node)
9270 continue;
9272 /* Bitfields are always classified as integer. Handle them
9273 early, since later code would consider them to be
9274 misaligned integers. */
9275 if (DECL_BIT_FIELD (field))
9277 for (i = (int_bit_position (field)
9278 + (bit_offset % 64)) / 8 / 8;
9279 i < ((int_bit_position (field) + (bit_offset % 64))
9280 + tree_to_shwi (DECL_SIZE (field))
9281 + 63) / 8 / 8; i++)
9282 classes[i] =
9283 merge_classes (X86_64_INTEGER_CLASS,
9284 classes[i]);
9286 else
9288 int pos;
9290 type = TREE_TYPE (field);
9292 /* Flexible array member is ignored. */
9293 if (TYPE_MODE (type) == BLKmode
9294 && TREE_CODE (type) == ARRAY_TYPE
9295 && TYPE_SIZE (type) == NULL_TREE
9296 && TYPE_DOMAIN (type) != NULL_TREE
9297 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
9298 == NULL_TREE))
9300 static bool warned;
9302 if (!warned && warn_psabi)
9304 warned = true;
9305 inform (input_location,
9306 "the ABI of passing struct with"
9307 " a flexible array member has"
9308 " changed in GCC 4.4");
9310 continue;
9312 num = classify_argument (TYPE_MODE (type), type,
9313 subclasses,
9314 (int_bit_position (field)
9315 + bit_offset) % 512);
9316 if (!num)
9317 return 0;
9318 pos = (int_bit_position (field)
9319 + (bit_offset % 64)) / 8 / 8;
9320 for (i = 0; i < num && (i + pos) < words; i++)
9321 classes[i + pos] =
9322 merge_classes (subclasses[i], classes[i + pos]);
9326 break;
9328 case ARRAY_TYPE:
9329 /* Arrays are handled as small records. */
9331 int num;
9332 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
9333 TREE_TYPE (type), subclasses, bit_offset);
9334 if (!num)
9335 return 0;
9337 /* The partial classes are now full classes. */
9338 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
9339 subclasses[0] = X86_64_SSE_CLASS;
9340 if (subclasses[0] == X86_64_INTEGERSI_CLASS
9341 && !((bit_offset % 64) == 0 && bytes == 4))
9342 subclasses[0] = X86_64_INTEGER_CLASS;
9344 for (i = 0; i < words; i++)
9345 classes[i] = subclasses[i % num];
9347 break;
9349 case UNION_TYPE:
9350 case QUAL_UNION_TYPE:
9351 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
9353 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9355 if (TREE_CODE (field) == FIELD_DECL)
9357 int num;
9359 if (TREE_TYPE (field) == error_mark_node)
9360 continue;
9362 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
9363 TREE_TYPE (field), subclasses,
9364 bit_offset);
9365 if (!num)
9366 return 0;
9367 for (i = 0; i < num && i < words; i++)
9368 classes[i] = merge_classes (subclasses[i], classes[i]);
9371 break;
9373 default:
9374 gcc_unreachable ();
9377 if (words > 2)
9379 /* When size > 16 bytes, if the first eightbyte isn't
9380 X86_64_SSE_CLASS or any of the remaining ones isn't
9381 X86_64_SSEUP_CLASS, everything should be passed in
9382 memory. */
9383 if (classes[0] != X86_64_SSE_CLASS)
9384 return 0;
9386 for (i = 1; i < words; i++)
9387 if (classes[i] != X86_64_SSEUP_CLASS)
9388 return 0;
9391 /* Final merger cleanup. */
9392 for (i = 0; i < words; i++)
9394 /* If one class is MEMORY, everything should be passed in
9395 memory. */
9396 if (classes[i] == X86_64_MEMORY_CLASS)
9397 return 0;
9399 /* The X86_64_SSEUP_CLASS should always be preceded by
9400 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
9401 if (classes[i] == X86_64_SSEUP_CLASS
9402 && classes[i - 1] != X86_64_SSE_CLASS
9403 && classes[i - 1] != X86_64_SSEUP_CLASS)
9405 /* The first one should never be X86_64_SSEUP_CLASS. */
9406 gcc_assert (i != 0);
9407 classes[i] = X86_64_SSE_CLASS;
9410 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
9411 everything should be passed in memory. */
9412 if (classes[i] == X86_64_X87UP_CLASS
9413 && (classes[i - 1] != X86_64_X87_CLASS))
9415 static bool warned;
9417 /* The first one should never be X86_64_X87UP_CLASS. */
9418 gcc_assert (i != 0);
9419 if (!warned && warn_psabi)
9421 warned = true;
9422 inform (input_location,
9423 "the ABI of passing union with long double"
9424 " has changed in GCC 4.4");
9426 return 0;
9429 return words;
9432 /* Compute the alignment needed. We align all types to natural boundaries,
9433 with the exception of XFmode, which is aligned to 64 bits. */
9434 if (mode != VOIDmode && mode != BLKmode)
9436 int mode_alignment = GET_MODE_BITSIZE (mode);
9438 if (mode == XFmode)
9439 mode_alignment = 128;
9440 else if (mode == XCmode)
9441 mode_alignment = 256;
9442 if (COMPLEX_MODE_P (mode))
9443 mode_alignment /= 2;
9444 /* Misaligned fields are always returned in memory. */
9445 if (bit_offset % mode_alignment)
9446 return 0;
9449 /* For V1xx modes, just use the base mode. */
9450 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
9451 && GET_MODE_UNIT_SIZE (mode) == bytes)
9452 mode = GET_MODE_INNER (mode);
9454 /* Classification of atomic types. */
9455 switch (mode)
9457 case SDmode:
9458 case DDmode:
9459 classes[0] = X86_64_SSE_CLASS;
9460 return 1;
9461 case TDmode:
9462 classes[0] = X86_64_SSE_CLASS;
9463 classes[1] = X86_64_SSEUP_CLASS;
9464 return 2;
9465 case DImode:
9466 case SImode:
9467 case HImode:
9468 case QImode:
9469 case CSImode:
9470 case CHImode:
9471 case CQImode:
9473 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
9475 /* Analyze last 128 bits only. */
9476 size = (size - 1) & 0x7f;
9478 if (size < 32)
9480 classes[0] = X86_64_INTEGERSI_CLASS;
9481 return 1;
9483 else if (size < 64)
9485 classes[0] = X86_64_INTEGER_CLASS;
9486 return 1;
9488 else if (size < 64+32)
9490 classes[0] = X86_64_INTEGER_CLASS;
9491 classes[1] = X86_64_INTEGERSI_CLASS;
9492 return 2;
9494 else if (size < 64+64)
9496 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9497 return 2;
9499 else
9500 gcc_unreachable ();
9502 case CDImode:
9503 case TImode:
9504 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9505 return 2;
9506 case COImode:
9507 case OImode:
9508 /* OImode shouldn't be used directly. */
9509 gcc_unreachable ();
9510 case CTImode:
9511 return 0;
9512 case SFmode:
9513 if (!(bit_offset % 64))
9514 classes[0] = X86_64_SSESF_CLASS;
9515 else
9516 classes[0] = X86_64_SSE_CLASS;
9517 return 1;
9518 case DFmode:
9519 classes[0] = X86_64_SSEDF_CLASS;
9520 return 1;
9521 case XFmode:
9522 classes[0] = X86_64_X87_CLASS;
9523 classes[1] = X86_64_X87UP_CLASS;
9524 return 2;
9525 case TFmode:
9526 classes[0] = X86_64_SSE_CLASS;
9527 classes[1] = X86_64_SSEUP_CLASS;
9528 return 2;
9529 case SCmode:
9530 classes[0] = X86_64_SSE_CLASS;
9531 if (!(bit_offset % 64))
9532 return 1;
9533 else
9535 static bool warned;
9537 if (!warned && warn_psabi)
9539 warned = true;
9540 inform (input_location,
9541 "the ABI of passing structure with complex float"
9542 " member has changed in GCC 4.4");
9544 classes[1] = X86_64_SSESF_CLASS;
9545 return 2;
9547 case DCmode:
9548 classes[0] = X86_64_SSEDF_CLASS;
9549 classes[1] = X86_64_SSEDF_CLASS;
9550 return 2;
9551 case XCmode:
9552 classes[0] = X86_64_COMPLEX_X87_CLASS;
9553 return 1;
9554 case TCmode:
9555 /* This mode is larger than 16 bytes. */
9556 return 0;
9557 case V8SFmode:
9558 case V8SImode:
9559 case V32QImode:
9560 case V16HImode:
9561 case V4DFmode:
9562 case V4DImode:
9563 classes[0] = X86_64_SSE_CLASS;
9564 classes[1] = X86_64_SSEUP_CLASS;
9565 classes[2] = X86_64_SSEUP_CLASS;
9566 classes[3] = X86_64_SSEUP_CLASS;
9567 return 4;
9568 case V8DFmode:
9569 case V16SFmode:
9570 case V8DImode:
9571 case V16SImode:
9572 case V32HImode:
9573 case V64QImode:
9574 classes[0] = X86_64_SSE_CLASS;
9575 classes[1] = X86_64_SSEUP_CLASS;
9576 classes[2] = X86_64_SSEUP_CLASS;
9577 classes[3] = X86_64_SSEUP_CLASS;
9578 classes[4] = X86_64_SSEUP_CLASS;
9579 classes[5] = X86_64_SSEUP_CLASS;
9580 classes[6] = X86_64_SSEUP_CLASS;
9581 classes[7] = X86_64_SSEUP_CLASS;
9582 return 8;
9583 case V4SFmode:
9584 case V4SImode:
9585 case V16QImode:
9586 case V8HImode:
9587 case V2DFmode:
9588 case V2DImode:
9589 classes[0] = X86_64_SSE_CLASS;
9590 classes[1] = X86_64_SSEUP_CLASS;
9591 return 2;
9592 case V1TImode:
9593 case V1DImode:
9594 case V2SFmode:
9595 case V2SImode:
9596 case V4HImode:
9597 case V8QImode:
9598 classes[0] = X86_64_SSE_CLASS;
9599 return 1;
9600 case BLKmode:
9601 case VOIDmode:
9602 return 0;
9603 default:
9604 gcc_assert (VECTOR_MODE_P (mode));
9606 if (bytes > 16)
9607 return 0;
9609 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9611 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9612 classes[0] = X86_64_INTEGERSI_CLASS;
9613 else
9614 classes[0] = X86_64_INTEGER_CLASS;
9615 classes[1] = X86_64_INTEGER_CLASS;
9616 return 1 + (bytes > 8);
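/* Editorial worked example of the classification above, not part of the
   original source.  For the SysV x86-64 ABI,

     struct df { double d; int i; };      16 bytes including tail padding

   the first eightbyte holds `d' and classifies as X86_64_SSEDF_CLASS, the
   second holds `i' plus padding and classifies as X86_64_INTEGER_CLASS, so
   classify_argument returns 2 and the struct travels in one SSE register
   and one integer register.  */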
9620 /* Examine the argument and record the number of registers required in each
9621 class. Return true iff the parameter should be passed in memory. */
9623 static bool
9624 examine_argument (machine_mode mode, const_tree type, int in_return,
9625 int *int_nregs, int *sse_nregs)
9627 enum x86_64_reg_class regclass[MAX_CLASSES];
9628 int n = classify_argument (mode, type, regclass, 0);
9630 *int_nregs = 0;
9631 *sse_nregs = 0;
9633 if (!n)
9634 return true;
9635 for (n--; n >= 0; n--)
9636 switch (regclass[n])
9638 case X86_64_INTEGER_CLASS:
9639 case X86_64_INTEGERSI_CLASS:
9640 (*int_nregs)++;
9641 break;
9642 case X86_64_SSE_CLASS:
9643 case X86_64_SSESF_CLASS:
9644 case X86_64_SSEDF_CLASS:
9645 (*sse_nregs)++;
9646 break;
9647 case X86_64_NO_CLASS:
9648 case X86_64_SSEUP_CLASS:
9649 break;
9650 case X86_64_X87_CLASS:
9651 case X86_64_X87UP_CLASS:
9652 case X86_64_COMPLEX_X87_CLASS:
9653 if (!in_return)
9654 return true;
9655 break;
9656 case X86_64_MEMORY_CLASS:
9657 gcc_unreachable ();
9660 return false;
9663 /* Construct container for the argument used by GCC interface. See
9664 FUNCTION_ARG for the detailed description. */
9666 static rtx
9667 construct_container (machine_mode mode, machine_mode orig_mode,
9668 const_tree type, int in_return, int nintregs, int nsseregs,
9669 const int *intreg, int sse_regno)
9671 /* The following variables hold the static issued_error state. */
9672 static bool issued_sse_arg_error;
9673 static bool issued_sse_ret_error;
9674 static bool issued_x87_ret_error;
9676 machine_mode tmpmode;
9677 int bytes =
9678 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9679 enum x86_64_reg_class regclass[MAX_CLASSES];
9680 int n;
9681 int i;
9682 int nexps = 0;
9683 int needed_sseregs, needed_intregs;
9684 rtx exp[MAX_CLASSES];
9685 rtx ret;
9687 n = classify_argument (mode, type, regclass, 0);
9688 if (!n)
9689 return NULL;
9690 if (examine_argument (mode, type, in_return, &needed_intregs,
9691 &needed_sseregs))
9692 return NULL;
9693 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9694 return NULL;
9696 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9697 some less clueful developer tries to use floating-point anyway. */
9698 if (needed_sseregs && !TARGET_SSE)
9700 if (in_return)
9702 if (!issued_sse_ret_error)
9704 error ("SSE register return with SSE disabled");
9705 issued_sse_ret_error = true;
9708 else if (!issued_sse_arg_error)
9710 error ("SSE register argument with SSE disabled");
9711 issued_sse_arg_error = true;
9713 return NULL;
9716 /* Likewise, error if the ABI requires us to return values in the
9717 x87 registers and the user specified -mno-80387. */
9718 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9719 for (i = 0; i < n; i++)
9720 if (regclass[i] == X86_64_X87_CLASS
9721 || regclass[i] == X86_64_X87UP_CLASS
9722 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9724 if (!issued_x87_ret_error)
9726 error ("x87 register return with x87 disabled");
9727 issued_x87_ret_error = true;
9729 return NULL;
9732 /* First construct the simple cases. Avoid SCmode, since we want to use
9733 a single register to pass this type. */
9734 if (n == 1 && mode != SCmode)
9735 switch (regclass[0])
9737 case X86_64_INTEGER_CLASS:
9738 case X86_64_INTEGERSI_CLASS:
9739 return gen_rtx_REG (mode, intreg[0]);
9740 case X86_64_SSE_CLASS:
9741 case X86_64_SSESF_CLASS:
9742 case X86_64_SSEDF_CLASS:
9743 if (mode != BLKmode)
9744 return gen_reg_or_parallel (mode, orig_mode,
9745 SSE_REGNO (sse_regno));
9746 break;
9747 case X86_64_X87_CLASS:
9748 case X86_64_COMPLEX_X87_CLASS:
9749 return gen_rtx_REG (mode, FIRST_STACK_REG);
9750 case X86_64_NO_CLASS:
9751 /* Zero sized array, struct or class. */
9752 return NULL;
9753 default:
9754 gcc_unreachable ();
9756 if (n == 2
9757 && regclass[0] == X86_64_SSE_CLASS
9758 && regclass[1] == X86_64_SSEUP_CLASS
9759 && mode != BLKmode)
9760 return gen_reg_or_parallel (mode, orig_mode,
9761 SSE_REGNO (sse_regno));
9762 if (n == 4
9763 && regclass[0] == X86_64_SSE_CLASS
9764 && regclass[1] == X86_64_SSEUP_CLASS
9765 && regclass[2] == X86_64_SSEUP_CLASS
9766 && regclass[3] == X86_64_SSEUP_CLASS
9767 && mode != BLKmode)
9768 return gen_reg_or_parallel (mode, orig_mode,
9769 SSE_REGNO (sse_regno));
9770 if (n == 8
9771 && regclass[0] == X86_64_SSE_CLASS
9772 && regclass[1] == X86_64_SSEUP_CLASS
9773 && regclass[2] == X86_64_SSEUP_CLASS
9774 && regclass[3] == X86_64_SSEUP_CLASS
9775 && regclass[4] == X86_64_SSEUP_CLASS
9776 && regclass[5] == X86_64_SSEUP_CLASS
9777 && regclass[6] == X86_64_SSEUP_CLASS
9778 && regclass[7] == X86_64_SSEUP_CLASS
9779 && mode != BLKmode)
9780 return gen_reg_or_parallel (mode, orig_mode,
9781 SSE_REGNO (sse_regno));
9782 if (n == 2
9783 && regclass[0] == X86_64_X87_CLASS
9784 && regclass[1] == X86_64_X87UP_CLASS)
9785 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9787 if (n == 2
9788 && regclass[0] == X86_64_INTEGER_CLASS
9789 && regclass[1] == X86_64_INTEGER_CLASS
9790 && (mode == CDImode || mode == TImode)
9791 && intreg[0] + 1 == intreg[1])
9792 return gen_rtx_REG (mode, intreg[0]);
9794 /* Otherwise figure out the entries of the PARALLEL. */
9795 for (i = 0; i < n; i++)
9797 int pos;
9799 switch (regclass[i])
9801 case X86_64_NO_CLASS:
9802 break;
9803 case X86_64_INTEGER_CLASS:
9804 case X86_64_INTEGERSI_CLASS:
9805 /* Merge TImodes on aligned occasions here too. */
9806 if (i * 8 + 8 > bytes)
9807 tmpmode
9808 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9809 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9810 tmpmode = SImode;
9811 else
9812 tmpmode = DImode;
9813 /* We've requested a size for which there
9814 is no integer mode. Use DImode. */
9815 if (tmpmode == BLKmode)
9816 tmpmode = DImode;
9817 exp [nexps++]
9818 = gen_rtx_EXPR_LIST (VOIDmode,
9819 gen_rtx_REG (tmpmode, *intreg),
9820 GEN_INT (i*8));
9821 intreg++;
9822 break;
9823 case X86_64_SSESF_CLASS:
9824 exp [nexps++]
9825 = gen_rtx_EXPR_LIST (VOIDmode,
9826 gen_rtx_REG (SFmode,
9827 SSE_REGNO (sse_regno)),
9828 GEN_INT (i*8));
9829 sse_regno++;
9830 break;
9831 case X86_64_SSEDF_CLASS:
9832 exp [nexps++]
9833 = gen_rtx_EXPR_LIST (VOIDmode,
9834 gen_rtx_REG (DFmode,
9835 SSE_REGNO (sse_regno)),
9836 GEN_INT (i*8));
9837 sse_regno++;
9838 break;
9839 case X86_64_SSE_CLASS:
9840 pos = i;
9841 switch (n)
9843 case 1:
9844 tmpmode = DImode;
9845 break;
9846 case 2:
9847 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9849 tmpmode = TImode;
9850 i++;
9852 else
9853 tmpmode = DImode;
9854 break;
9855 case 4:
9856 gcc_assert (i == 0
9857 && regclass[1] == X86_64_SSEUP_CLASS
9858 && regclass[2] == X86_64_SSEUP_CLASS
9859 && regclass[3] == X86_64_SSEUP_CLASS);
9860 tmpmode = OImode;
9861 i += 3;
9862 break;
9863 case 8:
9864 gcc_assert (i == 0
9865 && regclass[1] == X86_64_SSEUP_CLASS
9866 && regclass[2] == X86_64_SSEUP_CLASS
9867 && regclass[3] == X86_64_SSEUP_CLASS
9868 && regclass[4] == X86_64_SSEUP_CLASS
9869 && regclass[5] == X86_64_SSEUP_CLASS
9870 && regclass[6] == X86_64_SSEUP_CLASS
9871 && regclass[7] == X86_64_SSEUP_CLASS);
9872 tmpmode = XImode;
9873 i += 7;
9874 break;
9875 default:
9876 gcc_unreachable ();
9878 exp [nexps++]
9879 = gen_rtx_EXPR_LIST (VOIDmode,
9880 gen_rtx_REG (tmpmode,
9881 SSE_REGNO (sse_regno)),
9882 GEN_INT (pos*8));
9883 sse_regno++;
9884 break;
9885 default:
9886 gcc_unreachable ();
9890 /* Empty aligned struct, union or class. */
9891 if (nexps == 0)
9892 return NULL;
9894 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9895 for (i = 0; i < nexps; i++)
9896 XVECEXP (ret, 0, i) = exp [i];
9897 return ret;
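/* Editorial sketch, continuing the struct { double d; int i; } example: for
   a first argument the PARALLEL built above looks roughly like

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)   (const_int 8))])

   i.e. the double at byte offset 0 in %xmm0 and the second eightbyte at
   offset 8 in %rdi; the exact registers depend on how many arguments
   precede it.  */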
9900 /* Update the data in CUM to advance over an argument of mode MODE
9901 and data type TYPE. (TYPE is null for libcalls where that information
9902 may not be available.)
9904 Return the number of integer registers advanced over. */
9906 static int
9907 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9908 const_tree type, HOST_WIDE_INT bytes,
9909 HOST_WIDE_INT words)
9911 int res = 0;
9912 bool error_p = false;
9914 if (TARGET_IAMCU)
9916 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9917 bytes in registers. */
9918 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9919 goto pass_in_reg;
9920 return res;
9923 switch (mode)
9925 default:
9926 break;
9928 case BLKmode:
9929 if (bytes < 0)
9930 break;
9931 /* FALLTHRU */
9933 case DImode:
9934 case SImode:
9935 case HImode:
9936 case QImode:
9937 pass_in_reg:
9938 cum->words += words;
9939 cum->nregs -= words;
9940 cum->regno += words;
9941 if (cum->nregs >= 0)
9942 res = words;
9943 if (cum->nregs <= 0)
9945 cum->nregs = 0;
9946 cfun->machine->arg_reg_available = false;
9947 cum->regno = 0;
9949 break;
9951 case OImode:
9952 /* OImode shouldn't be used directly. */
9953 gcc_unreachable ();
9955 case DFmode:
9956 if (cum->float_in_sse == -1)
9957 error_p = true;
9958 if (cum->float_in_sse < 2)
9959 break;
9960 /* FALLTHRU */
9961 case SFmode:
9962 if (cum->float_in_sse == -1)
9963 error_p = true;
9964 if (cum->float_in_sse < 1)
9965 break;
9966 /* FALLTHRU */
9968 case V8SFmode:
9969 case V8SImode:
9970 case V64QImode:
9971 case V32HImode:
9972 case V16SImode:
9973 case V8DImode:
9974 case V16SFmode:
9975 case V8DFmode:
9976 case V32QImode:
9977 case V16HImode:
9978 case V4DFmode:
9979 case V4DImode:
9980 case TImode:
9981 case V16QImode:
9982 case V8HImode:
9983 case V4SImode:
9984 case V2DImode:
9985 case V4SFmode:
9986 case V2DFmode:
9987 if (!type || !AGGREGATE_TYPE_P (type))
9989 cum->sse_words += words;
9990 cum->sse_nregs -= 1;
9991 cum->sse_regno += 1;
9992 if (cum->sse_nregs <= 0)
9994 cum->sse_nregs = 0;
9995 cum->sse_regno = 0;
9998 break;
10000 case V8QImode:
10001 case V4HImode:
10002 case V2SImode:
10003 case V2SFmode:
10004 case V1TImode:
10005 case V1DImode:
10006 if (!type || !AGGREGATE_TYPE_P (type))
10008 cum->mmx_words += words;
10009 cum->mmx_nregs -= 1;
10010 cum->mmx_regno += 1;
10011 if (cum->mmx_nregs <= 0)
10013 cum->mmx_nregs = 0;
10014 cum->mmx_regno = 0;
10017 break;
10019 if (error_p)
10021 cum->float_in_sse = 0;
10022 error ("calling %qD with SSE calling convention without "
10023 "SSE/SSE2 enabled", cum->decl);
10024 sorry ("this is a GCC bug that can be worked around by adding "
10025 "attribute used to function called");
10028 return res;
10031 static int
10032 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
10033 const_tree type, HOST_WIDE_INT words, bool named)
10035 int int_nregs, sse_nregs;
10037 /* Unnamed 512-bit and 256-bit vector mode parameters are passed on the stack. */
10038 if (!named && (VALID_AVX512F_REG_MODE (mode)
10039 || VALID_AVX256_REG_MODE (mode)))
10040 return 0;
10042 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
10043 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
10045 cum->nregs -= int_nregs;
10046 cum->sse_nregs -= sse_nregs;
10047 cum->regno += int_nregs;
10048 cum->sse_regno += sse_nregs;
10049 return int_nregs;
10051 else
10053 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
10054 cum->words = ROUND_UP (cum->words, align);
10055 cum->words += words;
10056 return 0;
10060 static int
10061 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
10062 HOST_WIDE_INT words)
10064 /* Otherwise, this should be passed indirect. */
10065 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
10067 cum->words += words;
10068 if (cum->nregs > 0)
10070 cum->nregs -= 1;
10071 cum->regno += 1;
10072 return 1;
10074 return 0;
10077 /* Update the data in CUM to advance over an argument of mode MODE and
10078 data type TYPE. (TYPE is null for libcalls where that information
10079 may not be available.) */
10081 static void
10082 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
10083 const_tree type, bool named)
10085 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10086 HOST_WIDE_INT bytes, words;
10087 int nregs;
10089 /* The argument of interrupt handler is a special case and is
10090 handled in ix86_function_arg. */
10091 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10092 return;
10094 if (mode == BLKmode)
10095 bytes = int_size_in_bytes (type);
10096 else
10097 bytes = GET_MODE_SIZE (mode);
10098 words = CEIL (bytes, UNITS_PER_WORD);
10100 if (type)
10101 mode = type_natural_mode (type, NULL, false);
10103 if ((type && POINTER_BOUNDS_TYPE_P (type))
10104 || POINTER_BOUNDS_MODE_P (mode))
10106 /* If we pass bounds in the Bounds Table, then just update the remaining bounds count. */
10107 if (cum->bnds_in_bt)
10109 cum->bnds_in_bt--;
10110 return;
10113 /* Update the remaining number of bounds to force. */
10114 if (cum->force_bnd_pass)
10115 cum->force_bnd_pass--;
10117 cum->bnd_regno++;
10119 return;
10122 /* The first arg not going to Bounds Tables resets this counter. */
10123 cum->bnds_in_bt = 0;
10124 /* For unnamed args we always pass bounds, to avoid a bounds mismatch when
10125 the passed and received types do not match. If bounds do not follow an
10126 unnamed arg, still pretend the required number of bounds were passed. */
10127 if (cum->force_bnd_pass)
10129 cum->bnd_regno += cum->force_bnd_pass;
10130 cum->force_bnd_pass = 0;
10133 if (TARGET_64BIT)
10135 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10137 if (call_abi == MS_ABI)
10138 nregs = function_arg_advance_ms_64 (cum, bytes, words);
10139 else
10140 nregs = function_arg_advance_64 (cum, mode, type, words, named);
10142 else
10143 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
10145 /* For stdarg we expect bounds to be passed for each value passed
10146 in a register. */
10147 if (cum->stdarg)
10148 cum->force_bnd_pass = nregs;
10149 /* For pointers passed in memory we expect bounds passed in Bounds
10150 Table. */
10151 if (!nregs)
10153 /* Track if there are outgoing arguments on stack. */
10154 if (cum->caller)
10155 cfun->machine->outgoing_args_on_stack = true;
10157 cum->bnds_in_bt = chkp_type_bounds_count (type);
10161 /* Define where to put the arguments to a function.
10162 Value is zero to push the argument on the stack,
10163 or a hard register in which to store the argument.
10165 MODE is the argument's machine mode.
10166 TYPE is the data type of the argument (as a tree).
10167 This is null for libcalls where that information may
10168 not be available.
10169 CUM is a variable of type CUMULATIVE_ARGS which gives info about
10170 the preceding args and about the function being called.
10171 NAMED is nonzero if this argument is a named parameter
10172 (otherwise it is an extra parameter matching an ellipsis). */
10174 static rtx
10175 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
10176 machine_mode orig_mode, const_tree type,
10177 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
10179 bool error_p = false;
10181 /* Avoid the AL settings for the Unix64 ABI. */
10182 if (mode == VOIDmode)
10183 return constm1_rtx;
10185 if (TARGET_IAMCU)
10187 /* Intel MCU psABI passes scalars and aggregates no larger than 8
10188 bytes in registers. */
10189 if (!VECTOR_MODE_P (mode) && bytes <= 8)
10190 goto pass_in_reg;
10191 return NULL_RTX;
10194 switch (mode)
10196 default:
10197 break;
10199 case BLKmode:
10200 if (bytes < 0)
10201 break;
10202 /* FALLTHRU */
10203 case DImode:
10204 case SImode:
10205 case HImode:
10206 case QImode:
10207 pass_in_reg:
10208 if (words <= cum->nregs)
10210 int regno = cum->regno;
10212 /* Fastcall allocates the first two DWORD (SImode) or
10213 smaller arguments to ECX and EDX if the argument isn't an
10214 aggregate type. */
10215 if (cum->fastcall)
10217 if (mode == BLKmode
10218 || mode == DImode
10219 || (type && AGGREGATE_TYPE_P (type)))
10220 break;
10222 /* ECX, not EAX, is the first allocated register. */
10223 if (regno == AX_REG)
10224 regno = CX_REG;
10226 return gen_rtx_REG (mode, regno);
10228 break;
10230 case DFmode:
10231 if (cum->float_in_sse == -1)
10232 error_p = true;
10233 if (cum->float_in_sse < 2)
10234 break;
10235 /* FALLTHRU */
10236 case SFmode:
10237 if (cum->float_in_sse == -1)
10238 error_p = true;
10239 if (cum->float_in_sse < 1)
10240 break;
10241 /* FALLTHRU */
10242 case TImode:
10243 /* In 32-bit mode, we pass TImode in XMM registers. */
10244 case V16QImode:
10245 case V8HImode:
10246 case V4SImode:
10247 case V2DImode:
10248 case V4SFmode:
10249 case V2DFmode:
10250 if (!type || !AGGREGATE_TYPE_P (type))
10252 if (cum->sse_nregs)
10253 return gen_reg_or_parallel (mode, orig_mode,
10254 cum->sse_regno + FIRST_SSE_REG);
10256 break;
10258 case OImode:
10259 case XImode:
10260 /* OImode and XImode shouldn't be used directly. */
10261 gcc_unreachable ();
10263 case V64QImode:
10264 case V32HImode:
10265 case V16SImode:
10266 case V8DImode:
10267 case V16SFmode:
10268 case V8DFmode:
10269 case V8SFmode:
10270 case V8SImode:
10271 case V32QImode:
10272 case V16HImode:
10273 case V4DFmode:
10274 case V4DImode:
10275 if (!type || !AGGREGATE_TYPE_P (type))
10277 if (cum->sse_nregs)
10278 return gen_reg_or_parallel (mode, orig_mode,
10279 cum->sse_regno + FIRST_SSE_REG);
10281 break;
10283 case V8QImode:
10284 case V4HImode:
10285 case V2SImode:
10286 case V2SFmode:
10287 case V1TImode:
10288 case V1DImode:
10289 if (!type || !AGGREGATE_TYPE_P (type))
10291 if (cum->mmx_nregs)
10292 return gen_reg_or_parallel (mode, orig_mode,
10293 cum->mmx_regno + FIRST_MMX_REG);
10295 break;
10297 if (error_p)
10299 cum->float_in_sse = 0;
10300 error ("calling %qD with SSE calling convention without "
10301 "SSE/SSE2 enabled", cum->decl);
10302 sorry ("this is a GCC bug that can be worked around by adding "
10303 "attribute used to function called");
10306 return NULL_RTX;
10309 static rtx
10310 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10311 machine_mode orig_mode, const_tree type, bool named)
10313 /* Handle a hidden AL argument containing number of registers
10314 for varargs x86-64 functions. */
10315 if (mode == VOIDmode)
10316 return GEN_INT (cum->maybe_vaarg
10317 ? (cum->sse_nregs < 0
10318 ? X86_64_SSE_REGPARM_MAX
10319 : cum->sse_regno)
10320 : -1);
10322 switch (mode)
10324 default:
10325 break;
10327 case V8SFmode:
10328 case V8SImode:
10329 case V32QImode:
10330 case V16HImode:
10331 case V4DFmode:
10332 case V4DImode:
10333 case V16SFmode:
10334 case V16SImode:
10335 case V64QImode:
10336 case V32HImode:
10337 case V8DFmode:
10338 case V8DImode:
10339 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
10340 if (!named)
10341 return NULL;
10342 break;
10345 return construct_container (mode, orig_mode, type, 0, cum->nregs,
10346 cum->sse_nregs,
10347 &x86_64_int_parameter_registers [cum->regno],
10348 cum->sse_regno);
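/* Editorial note on the VOIDmode case above: the constant returned there
   feeds the code that loads %al before a variadic SysV x86-64 call, where
   %al is an upper bound on the number of vector registers used.  For
   example, printf ("%f\n", 3.5) is preceded by

     movl  $1, %eax
     call  printf

   while a variadic call passing no FP arguments uses `xorl %eax, %eax'.  */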
10351 static rtx
10352 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10353 machine_mode orig_mode, bool named,
10354 HOST_WIDE_INT bytes)
10356 unsigned int regno;
10358 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
10359 We use the value -2 to specify that the current function call uses the MS ABI. */
10360 if (mode == VOIDmode)
10361 return GEN_INT (-2);
10363 /* If we've run out of registers, it goes on the stack. */
10364 if (cum->nregs == 0)
10365 return NULL_RTX;
10367 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
10369 /* Only floating point modes are passed in anything but integer regs. */
10370 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
10372 if (named)
10373 regno = cum->regno + FIRST_SSE_REG;
10374 else
10376 rtx t1, t2;
10378 /* Unnamed floating parameters are passed in both the
10379 SSE and integer registers. */
10380 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
10381 t2 = gen_rtx_REG (mode, regno);
10382 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
10383 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
10384 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
10387 /* Handle aggregate types passed in registers. */
10388 if (orig_mode == BLKmode)
10390 if (bytes > 0 && bytes <= 8)
10391 mode = (bytes > 4 ? DImode : SImode);
10392 if (mode == BLKmode)
10393 mode = DImode;
10396 return gen_reg_or_parallel (mode, orig_mode, regno);
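/* Editorial illustration of the MS x64 rules above; the prototype is
   hypothetical:

     void g (int a, double b, int c, double d);

   passes a in %ecx, b in %xmm1, c in %r8d and d in %xmm3: each argument
   consumes its positional slot whether it is integer or floating point.
   An unnamed double in a variadic call is placed in both its integer slot
   and the corresponding XMM register, as the PARALLEL above shows.  */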
10399 /* Return where to put the arguments to a function.
10400 Return zero to push the argument on the stack, or a hard register in which to store the argument.
10402 MODE is the argument's machine mode. TYPE is the data type of the
10403 argument. It is null for libcalls where that information may not be
10404 available. CUM gives information about the preceding args and about
10405 the function being called. NAMED is nonzero if this argument is a
10406 named parameter (otherwise it is an extra parameter matching an
10407 ellipsis). */
10409 static rtx
10410 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
10411 const_tree type, bool named)
10413 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10414 machine_mode mode = omode;
10415 HOST_WIDE_INT bytes, words;
10416 rtx arg;
10418 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10420 gcc_assert (type != NULL_TREE);
10421 if (POINTER_TYPE_P (type))
10423 /* This is the pointer argument. */
10424 gcc_assert (TYPE_MODE (type) == Pmode);
10425 if (cfun->machine->func_type == TYPE_INTERRUPT)
10426 /* -WORD(AP) in the current frame in interrupt handler. */
10427 arg = plus_constant (Pmode, arg_pointer_rtx,
10428 -UNITS_PER_WORD);
10429 else
10430 /* (AP) in the current frame in exception handler. */
10431 arg = arg_pointer_rtx;
10433 else
10435 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
10436 && TREE_CODE (type) == INTEGER_TYPE
10437 && TYPE_MODE (type) == word_mode);
10438 /* The integer argument is the error code at -WORD(AP) in
10439 the current frame in exception handler. */
10440 arg = gen_rtx_MEM (word_mode,
10441 plus_constant (Pmode,
10442 arg_pointer_rtx,
10443 -UNITS_PER_WORD));
10445 return arg;
10448 /* All pointer bounds arguments are handled separately here. */
10449 if ((type && POINTER_BOUNDS_TYPE_P (type))
10450 || POINTER_BOUNDS_MODE_P (mode))
10452 /* Return NULL if bounds are forced to go in Bounds Table. */
10453 if (cum->bnds_in_bt)
10454 arg = NULL;
10455 /* Return the next available bound reg if any. */
10456 else if (cum->bnd_regno <= LAST_BND_REG)
10457 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
10458 /* Return the next special slot number otherwise. */
10459 else
10460 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
10462 return arg;
10465 if (mode == BLKmode)
10466 bytes = int_size_in_bytes (type);
10467 else
10468 bytes = GET_MODE_SIZE (mode);
10469 words = CEIL (bytes, UNITS_PER_WORD);
10471 /* To simplify the code below, represent vector types with a vector mode
10472 even if MMX/SSE are not active. */
10473 if (type && TREE_CODE (type) == VECTOR_TYPE)
10474 mode = type_natural_mode (type, cum, false);
10476 if (TARGET_64BIT)
10478 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10480 if (call_abi == MS_ABI)
10481 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
10482 else
10483 arg = function_arg_64 (cum, mode, omode, type, named);
10485 else
10486 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
10488 /* Track if there are outgoing arguments on stack. */
10489 if (arg == NULL_RTX && cum->caller)
10490 cfun->machine->outgoing_args_on_stack = true;
10492 return arg;
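/* The TYPE_INTERRUPT / TYPE_EXCEPTION paths above correspond to functions
   declared with the x86 "interrupt" attribute.  A minimal user-level sketch
   (kept under #if 0 since it does not belong to this file; the uword_t
   typedef follows the pattern suggested in the GCC manual and is only an
   assumption here):  */
#if 0
struct interrupt_frame;
typedef unsigned long long uword_t __attribute__ ((mode (__word__)));

__attribute__ ((interrupt))
void isr (struct interrupt_frame *frame)
{
  /* FRAME is the pointer argument located relative to AP above.  */
}

__attribute__ ((interrupt))
void fault (struct interrupt_frame *frame, uword_t error_code)
{
  /* ERROR_CODE is the word-sized integer read from -WORD(AP) above.  */
  (void) error_code;
}
#endif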
10495 /* A C expression that indicates when an argument must be passed by
10496 reference. If nonzero for an argument, a copy of that argument is
10497 made in memory and a pointer to the argument is passed instead of
10498 the argument itself. The pointer is passed in whatever way is
10499 appropriate for passing a pointer to that type. */
10501 static bool
10502 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
10503 const_tree type, bool)
10505 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10507 /* Bounds are never passed by reference. */
10508 if ((type && POINTER_BOUNDS_TYPE_P (type))
10509 || POINTER_BOUNDS_MODE_P (mode))
10510 return false;
10512 if (TARGET_64BIT)
10514 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10516 /* See Windows x64 Software Convention. */
10517 if (call_abi == MS_ABI)
10519 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
10521 if (type)
10523 /* Arrays are passed by reference. */
10524 if (TREE_CODE (type) == ARRAY_TYPE)
10525 return true;
10527 if (RECORD_OR_UNION_TYPE_P (type))
10529 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
10530 are passed by reference. */
10531 msize = int_size_in_bytes (type);
10535 /* __m128 is passed by reference. */
10536 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
10538 else if (type && int_size_in_bytes (type) == -1)
10539 return true;
10542 return false;
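/* A compact sketch of the MS-ABI size test used above: aggregates whose
   size is not exactly 1, 2, 4 or 8 bytes (and variable-sized types) are
   passed by a hidden reference.  The helper name and shape are illustrative
   only, not part of this file.  */
#if 0
#include <stdbool.h>

static bool
ms_abi_passed_by_reference (long long bytes)
{
  if (bytes < 0)          /* int_size_in_bytes returned -1: variable size.  */
    return true;
  /* __m128 (16 bytes) or, say, a 3-byte struct go by reference;
     an 8-byte struct goes by value.  */
  return bytes != 1 && bytes != 2 && bytes != 4 && bytes != 8;
}
#endif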
10545 /* Return true when TYPE should be 128bit aligned for 32bit argument
10546 passing ABI. XXX: This function is obsolete and is only used for
10547 checking psABI compatibility with previous versions of GCC. */
10549 static bool
10550 ix86_compat_aligned_value_p (const_tree type)
10552 machine_mode mode = TYPE_MODE (type);
10553 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10554 || mode == TDmode
10555 || mode == TFmode
10556 || mode == TCmode)
10557 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10558 return true;
10559 if (TYPE_ALIGN (type) < 128)
10560 return false;
10562 if (AGGREGATE_TYPE_P (type))
10564 /* Walk the aggregates recursively. */
10565 switch (TREE_CODE (type))
10567 case RECORD_TYPE:
10568 case UNION_TYPE:
10569 case QUAL_UNION_TYPE:
10571 tree field;
10573 /* Walk all the structure fields. */
10574 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10576 if (TREE_CODE (field) == FIELD_DECL
10577 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10578 return true;
10580 break;
10583 case ARRAY_TYPE:
10584 /* Just for use if some languages pass arrays by value. */
10585 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10586 return true;
10587 break;
10589 default:
10590 gcc_unreachable ();
10593 return false;
10596 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10597 XXX: This function is obsolete and is only used for checking psABI
10598 compatibility with previous versions of GCC. */
10600 static unsigned int
10601 ix86_compat_function_arg_boundary (machine_mode mode,
10602 const_tree type, unsigned int align)
10604 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10605 natural boundaries. */
10606 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10608 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10609 make an exception for SSE modes since these require 128bit
10610 alignment.
10612 The handling here differs from field_alignment. ICC aligns MMX
10613 arguments to 4 byte boundaries, while structure fields are aligned
10614 to 8 byte boundaries. */
10615 if (!type)
10617 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10618 align = PARM_BOUNDARY;
10620 else
10622 if (!ix86_compat_aligned_value_p (type))
10623 align = PARM_BOUNDARY;
10626 if (align > BIGGEST_ALIGNMENT)
10627 align = BIGGEST_ALIGNMENT;
10628 return align;
10631 /* Return true when TYPE should be 128bit aligned for 32bit argument
10632 passing ABI. */
10634 static bool
10635 ix86_contains_aligned_value_p (const_tree type)
10637 machine_mode mode = TYPE_MODE (type);
10639 if (mode == XFmode || mode == XCmode)
10640 return false;
10642 if (TYPE_ALIGN (type) < 128)
10643 return false;
10645 if (AGGREGATE_TYPE_P (type))
10647 /* Walk the aggregates recursively. */
10648 switch (TREE_CODE (type))
10650 case RECORD_TYPE:
10651 case UNION_TYPE:
10652 case QUAL_UNION_TYPE:
10654 tree field;
10656 /* Walk all the structure fields. */
10657 for (field = TYPE_FIELDS (type);
10658 field;
10659 field = DECL_CHAIN (field))
10661 if (TREE_CODE (field) == FIELD_DECL
10662 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10663 return true;
10665 break;
10668 case ARRAY_TYPE:
10669 /* Just for use if some languages pass arrays by value. */
10670 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10671 return true;
10672 break;
10674 default:
10675 gcc_unreachable ();
10678 else
10679 return TYPE_ALIGN (type) >= 128;
10681 return false;
10684 /* Gives the alignment boundary, in bits, of an argument with the
10685 specified mode and type. */
10687 static unsigned int
10688 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10690 unsigned int align;
10691 if (type)
10693 /* Since the main variant type is used for the call, we convert
10694 TYPE to its main variant. */
10695 type = TYPE_MAIN_VARIANT (type);
10696 align = TYPE_ALIGN (type);
10698 else
10699 align = GET_MODE_ALIGNMENT (mode);
10700 if (align < PARM_BOUNDARY)
10701 align = PARM_BOUNDARY;
10702 else
10704 static bool warned;
10705 unsigned int saved_align = align;
10707 if (!TARGET_64BIT)
10709 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10710 if (!type)
10712 if (mode == XFmode || mode == XCmode)
10713 align = PARM_BOUNDARY;
10715 else if (!ix86_contains_aligned_value_p (type))
10716 align = PARM_BOUNDARY;
10718 if (align < 128)
10719 align = PARM_BOUNDARY;
10722 if (warn_psabi
10723 && !warned
10724 && align != ix86_compat_function_arg_boundary (mode, type,
10725 saved_align))
10727 warned = true;
10728 inform (input_location,
10729 "The ABI for passing parameters with %d-byte"
10730 " alignment has changed in GCC 4.6",
10731 align / BITS_PER_UNIT);
10735 return align;
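/* A rough model of the 32-bit branch above, with everything in bits as in
   the function itself (PARM_BOUNDARY is 32 on i386); the helper name is
   made up for illustration.  */
#if 0
static unsigned int
arg_boundary_32_sketch (unsigned int type_align, int contains_128bit_value)
{
  unsigned int align = type_align < 32 ? 32 : type_align;
  if (align > 32)
    {
      if (!contains_128bit_value)
        align = 32;               /* e.g. a 64-bit aligned struct  */
      if (align < 128)
        align = 32;
    }
  return align;
}
/* arg_boundary_32_sketch (64, 0) == 32,
   arg_boundary_32_sketch (128, 1) == 128 (SSE-style operands).  */
#endif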
10738 /* Return true if N is a possible register number of function value. */
10740 static bool
10741 ix86_function_value_regno_p (const unsigned int regno)
10743 switch (regno)
10745 case AX_REG:
10746 return true;
10747 case DX_REG:
10748 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10749 case DI_REG:
10750 case SI_REG:
10751 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10753 case BND0_REG:
10754 case BND1_REG:
10755 return chkp_function_instrumented_p (current_function_decl);
10757 /* Complex values are returned in %st(0)/%st(1) pair. */
10758 case ST0_REG:
10759 case ST1_REG:
10760 /* TODO: The function should depend on the current function's ABI, but
10761 builtins.c would need updating then. Therefore we use the
10762 default ABI. */
10763 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10764 return false;
10765 return TARGET_FLOAT_RETURNS_IN_80387;
10767 /* Complex values are returned in %xmm0/%xmm1 pair. */
10768 case XMM0_REG:
10769 case XMM1_REG:
10770 return TARGET_SSE;
10772 case MM0_REG:
10773 if (TARGET_MACHO || TARGET_64BIT)
10774 return false;
10775 return TARGET_MMX;
10778 return false;
10781 /* Define how to find the value returned by a function.
10782 VALTYPE is the data type of the value (as a tree).
10783 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10784 otherwise, FUNC is 0. */
10786 static rtx
10787 function_value_32 (machine_mode orig_mode, machine_mode mode,
10788 const_tree fntype, const_tree fn)
10790 unsigned int regno;
10792 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10793 we normally prevent this case when mmx is not available. However
10794 some ABIs may require the result to be returned like DImode. */
10795 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10796 regno = FIRST_MMX_REG;
10798 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10799 we prevent this case when sse is not available. However some ABIs
10800 may require the result to be returned like integer TImode. */
10801 else if (mode == TImode
10802 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10803 regno = FIRST_SSE_REG;
10805 /* 32-byte vector modes in %ymm0. */
10806 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10807 regno = FIRST_SSE_REG;
10809 /* 64-byte vector modes in %zmm0. */
10810 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10811 regno = FIRST_SSE_REG;
10813 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10814 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10815 regno = FIRST_FLOAT_REG;
10816 else
10817 /* Most things go in %eax. */
10818 regno = AX_REG;
10820 /* Override FP return register with %xmm0 for local functions when
10821 SSE math is enabled or for functions with sseregparm attribute. */
10822 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10824 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10825 if (sse_level == -1)
10827 error ("calling %qD with SSE calling convention without "
10828 "SSE/SSE2 enabled", fn);
10829 sorry ("this is a GCC bug that can be worked around by adding "
10830 "attribute used to function called");
10832 else if ((sse_level >= 1 && mode == SFmode)
10833 || (sse_level == 2 && mode == DFmode))
10834 regno = FIRST_SSE_REG;
10837 /* OImode shouldn't be used directly. */
10838 gcc_assert (mode != OImode);
10840 return gen_rtx_REG (orig_mode, regno);
10843 static rtx
10844 function_value_64 (machine_mode orig_mode, machine_mode mode,
10845 const_tree valtype)
10847 rtx ret;
10849 /* Handle libcalls, which don't provide a type node. */
10850 if (valtype == NULL)
10852 unsigned int regno;
10854 switch (mode)
10856 case SFmode:
10857 case SCmode:
10858 case DFmode:
10859 case DCmode:
10860 case TFmode:
10861 case SDmode:
10862 case DDmode:
10863 case TDmode:
10864 regno = FIRST_SSE_REG;
10865 break;
10866 case XFmode:
10867 case XCmode:
10868 regno = FIRST_FLOAT_REG;
10869 break;
10870 case TCmode:
10871 return NULL;
10872 default:
10873 regno = AX_REG;
10876 return gen_rtx_REG (mode, regno);
10878 else if (POINTER_TYPE_P (valtype))
10880 /* Pointers are always returned in word_mode. */
10881 mode = word_mode;
10884 ret = construct_container (mode, orig_mode, valtype, 1,
10885 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10886 x86_64_int_return_registers, 0);
10888 /* For zero-sized structures, construct_container returns NULL, but we
10889 need to keep the rest of the compiler happy by returning a meaningful value. */
10890 if (!ret)
10891 ret = gen_rtx_REG (orig_mode, AX_REG);
10893 return ret;
10896 static rtx
10897 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10898 const_tree valtype)
10900 unsigned int regno = AX_REG;
10902 if (TARGET_SSE)
10904 switch (GET_MODE_SIZE (mode))
10906 case 16:
10907 if (valtype != NULL_TREE
10908 && !VECTOR_INTEGER_TYPE_P (valtype)
10910 && !INTEGRAL_TYPE_P (valtype)
10911 && !VECTOR_FLOAT_TYPE_P (valtype))
10912 break;
10913 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10914 && !COMPLEX_MODE_P (mode))
10915 regno = FIRST_SSE_REG;
10916 break;
10917 case 8:
10918 case 4:
10919 if (mode == SFmode || mode == DFmode)
10920 regno = FIRST_SSE_REG;
10921 break;
10922 default:
10923 break;
10926 return gen_rtx_REG (orig_mode, regno);
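/* For orientation, the expected effect of function_value_ms_64 above on a
   few common return types (a hedged sketch; declarations only, and the
   register comments assume SSE is enabled):  */
#if 0
#include <emmintrin.h>

double    ret_double (void);   /* XMM0: SFmode/DFmode case (size 8/4)     */
__m128i   ret_vec128 (void);   /* XMM0: 16-byte non-complex vector case   */
long long ret_int64  (void);   /* RAX:  default AX_REG case               */
#endif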
10929 static rtx
10930 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10931 machine_mode orig_mode, machine_mode mode)
10933 const_tree fn, fntype;
10935 fn = NULL_TREE;
10936 if (fntype_or_decl && DECL_P (fntype_or_decl))
10937 fn = fntype_or_decl;
10938 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
10940 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
10941 || POINTER_BOUNDS_MODE_P (mode))
10942 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
10943 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
10944 return function_value_ms_64 (orig_mode, mode, valtype);
10945 else if (TARGET_64BIT)
10946 return function_value_64 (orig_mode, mode, valtype);
10947 else
10948 return function_value_32 (orig_mode, mode, fntype, fn);
10951 static rtx
10952 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
10954 machine_mode mode, orig_mode;
10956 orig_mode = TYPE_MODE (valtype);
10957 mode = type_natural_mode (valtype, NULL, true);
10958 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
10961 /* Return an RTX representing a place where a function returns
10962 or receives pointer bounds, or NULL if no bounds are returned.
10964 VALTYPE is a data type of a value returned by the function.
10966 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
10967 or FUNCTION_TYPE of the function.
10969 If OUTGOING is false, return a place in which the caller will
10970 see the return value. Otherwise, return a place where a
10971 function returns a value. */
10973 static rtx
10974 ix86_function_value_bounds (const_tree valtype,
10975 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
10976 bool outgoing ATTRIBUTE_UNUSED)
10978 rtx res = NULL_RTX;
10980 if (BOUNDED_TYPE_P (valtype))
10981 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
10982 else if (chkp_type_has_pointer (valtype))
10984 bitmap slots;
10985 rtx bounds[2];
10986 bitmap_iterator bi;
10987 unsigned i, bnd_no = 0;
10989 bitmap_obstack_initialize (NULL);
10990 slots = BITMAP_ALLOC (NULL);
10991 chkp_find_bound_slots (valtype, slots);
10993 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
10995 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
10996 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
10997 gcc_assert (bnd_no < 2);
10998 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
11001 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
11003 BITMAP_FREE (slots);
11004 bitmap_obstack_release (NULL);
11006 else
11007 res = NULL_RTX;
11009 return res;
11012 /* Pointer function arguments and return values are promoted to
11013 word_mode for normal functions. */
11015 static machine_mode
11016 ix86_promote_function_mode (const_tree type, machine_mode mode,
11017 int *punsignedp, const_tree fntype,
11018 int for_return)
11020 if (cfun->machine->func_type == TYPE_NORMAL
11021 && type != NULL_TREE
11022 && POINTER_TYPE_P (type))
11024 *punsignedp = POINTERS_EXTEND_UNSIGNED;
11025 return word_mode;
11027 return default_promote_function_mode (type, mode, punsignedp, fntype,
11028 for_return);
11031 /* Return true if a structure, union or array with MODE containing FIELD
11032 should be accessed using BLKmode. */
11034 static bool
11035 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
11037 /* Union with XFmode must be in BLKmode. */
11038 return (mode == XFmode
11039 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
11040 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
11044 ix86_libcall_value (machine_mode mode)
11046 return ix86_function_value_1 (NULL, NULL, mode, mode);
11049 /* Return true iff type is returned in memory. */
11051 static bool
11052 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
11054 #ifdef SUBTARGET_RETURN_IN_MEMORY
11055 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
11056 #else
11057 const machine_mode mode = type_natural_mode (type, NULL, true);
11058 HOST_WIDE_INT size;
11060 if (POINTER_BOUNDS_TYPE_P (type))
11061 return false;
11063 if (TARGET_64BIT)
11065 if (ix86_function_type_abi (fntype) == MS_ABI)
11067 size = int_size_in_bytes (type);
11069 /* __m128 is returned in xmm0. */
11070 if ((!type || VECTOR_INTEGER_TYPE_P (type)
11071 || INTEGRAL_TYPE_P (type)
11072 || VECTOR_FLOAT_TYPE_P (type))
11073 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
11074 && !COMPLEX_MODE_P (mode)
11075 && (GET_MODE_SIZE (mode) == 16 || size == 16))
11076 return false;
11078 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
11079 return size != 1 && size != 2 && size != 4 && size != 8;
11081 else
11083 int needed_intregs, needed_sseregs;
11085 return examine_argument (mode, type, 1,
11086 &needed_intregs, &needed_sseregs);
11089 else
11091 size = int_size_in_bytes (type);
11093 /* Intel MCU psABI returns scalars and aggregates no larger than 8
11094 bytes in registers. */
11095 if (TARGET_IAMCU)
11096 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
11098 if (mode == BLKmode)
11099 return true;
11101 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
11102 return false;
11104 if (VECTOR_MODE_P (mode) || mode == TImode)
11106 /* User-created vectors small enough to fit in EAX. */
11107 if (size < 8)
11108 return false;
11110 /* Unless the ABI prescribes otherwise,
11111 MMX/3dNow values are returned in MM0 if available. */
11113 if (size == 8)
11114 return TARGET_VECT8_RETURNS || !TARGET_MMX;
11116 /* SSE values are returned in XMM0 if available. */
11117 if (size == 16)
11118 return !TARGET_SSE;
11120 /* AVX values are returned in YMM0 if available. */
11121 if (size == 32)
11122 return !TARGET_AVX;
11124 /* AVX512F values are returned in ZMM0 if available. */
11125 if (size == 64)
11126 return !TARGET_AVX512F;
11129 if (mode == XFmode)
11130 return false;
11132 if (size > 12)
11133 return true;
11135 /* OImode shouldn't be used directly. */
11136 gcc_assert (mode != OImode);
11138 return false;
11140 #endif
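/* As a user-level reading of the 32-bit branch above (hedged; the exact
   outcome depends on the -m flags in effect): an 8-byte vector result
   wants %mm0 and falls back to memory when MMX is unavailable or the
   target prefers memory returns, and a 16-byte vector result wants %xmm0
   and falls back to memory without SSE.  */
#if 0
#include <xmmintrin.h>

typedef int v2si __attribute__ ((vector_size (8)));

v2si   ret_v2si (void);   /* size == 8 case above: %mm0 or memory    */
__m128 ret_m128 (void);   /* size == 16 case above: %xmm0 or memory  */
#endif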
11144 /* Create the va_list data type. */
11146 static tree
11147 ix86_build_builtin_va_list_64 (void)
11149 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
11151 record = lang_hooks.types.make_type (RECORD_TYPE);
11152 type_decl = build_decl (BUILTINS_LOCATION,
11153 TYPE_DECL, get_identifier ("__va_list_tag"), record);
11155 f_gpr = build_decl (BUILTINS_LOCATION,
11156 FIELD_DECL, get_identifier ("gp_offset"),
11157 unsigned_type_node);
11158 f_fpr = build_decl (BUILTINS_LOCATION,
11159 FIELD_DECL, get_identifier ("fp_offset"),
11160 unsigned_type_node);
11161 f_ovf = build_decl (BUILTINS_LOCATION,
11162 FIELD_DECL, get_identifier ("overflow_arg_area"),
11163 ptr_type_node);
11164 f_sav = build_decl (BUILTINS_LOCATION,
11165 FIELD_DECL, get_identifier ("reg_save_area"),
11166 ptr_type_node);
11168 va_list_gpr_counter_field = f_gpr;
11169 va_list_fpr_counter_field = f_fpr;
11171 DECL_FIELD_CONTEXT (f_gpr) = record;
11172 DECL_FIELD_CONTEXT (f_fpr) = record;
11173 DECL_FIELD_CONTEXT (f_ovf) = record;
11174 DECL_FIELD_CONTEXT (f_sav) = record;
11176 TYPE_STUB_DECL (record) = type_decl;
11177 TYPE_NAME (record) = type_decl;
11178 TYPE_FIELDS (record) = f_gpr;
11179 DECL_CHAIN (f_gpr) = f_fpr;
11180 DECL_CHAIN (f_fpr) = f_ovf;
11181 DECL_CHAIN (f_ovf) = f_sav;
11183 layout_type (record);
11185 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
11186 NULL_TREE, TYPE_ATTRIBUTES (record));
11188 /* The correct type is an array type of one element. */
11189 return build_array_type (record, build_index_type (size_zero_node));
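/* Written out as user-visible C, the record built above is the familiar
   SysV x86-64 va_list (a sketch of the layout only; the real type is the
   builtin constructed here, and the typedef names below are invented):  */
#if 0
typedef struct __va_list_tag_sketch
{
  unsigned int gp_offset;      /* byte offset of the next GPR slot in
                                  reg_save_area                        */
  unsigned int fp_offset;      /* byte offset of the next SSE slot     */
  void *overflow_arg_area;     /* next argument passed on the stack    */
  void *reg_save_area;         /* base of the register save area       */
} __va_list_tag_sketch;

typedef __va_list_tag_sketch va_list_sketch[1];  /* array of one element */
#endif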
11192 /* Set up the builtin va_list data type, and for 64-bit the additional
11193 calling-convention-specific va_list data types. */
11195 static tree
11196 ix86_build_builtin_va_list (void)
11198 if (TARGET_64BIT)
11200 /* Initialize ABI specific va_list builtin types.
11202 In lto1, we can encounter two va_list types:
11203 - one as a result of the type-merge across TUs, and
11204 - the one constructed here.
11205 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
11206 a type identity check in canonical_va_list_type based on
11207 TYPE_MAIN_VARIANT (which we used to have) will not work.
11208 Instead, we tag each va_list_type_node with its unique attribute, and
11209 look for the attribute in the type identity check in
11210 canonical_va_list_type.
11212 Tagging sysv_va_list_type_node directly with the attribute is
11213 problematic since it's an array of one record, which will degrade into a
11214 pointer to the record when used as a parameter (see build_va_arg comments for
11215 an example), dropping the attribute in the process. So we tag the
11216 record instead. */
11218 /* For SYSV_ABI we use an array of one record. */
11219 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
11221 /* For MS_ABI we use a plain pointer to the argument area. */
11222 tree char_ptr_type = build_pointer_type (char_type_node);
11223 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
11224 TYPE_ATTRIBUTES (char_ptr_type));
11225 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
11227 return ((ix86_abi == MS_ABI)
11228 ? ms_va_list_type_node
11229 : sysv_va_list_type_node);
11231 else
11233 /* For i386 we use a plain pointer to the argument area. */
11234 return build_pointer_type (char_type_node);
11238 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
11240 static void
11241 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
11243 rtx save_area, mem;
11244 alias_set_type set;
11245 int i, max;
11247 /* GPR size of varargs save area. */
11248 if (cfun->va_list_gpr_size)
11249 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
11250 else
11251 ix86_varargs_gpr_size = 0;
11253 /* FPR size of varargs save area. We don't need it if we don't pass
11254 anything in SSE registers. */
11255 if (TARGET_SSE && cfun->va_list_fpr_size)
11256 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
11257 else
11258 ix86_varargs_fpr_size = 0;
11260 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
11261 return;
11263 save_area = frame_pointer_rtx;
11264 set = get_varargs_alias_set ();
11266 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11267 if (max > X86_64_REGPARM_MAX)
11268 max = X86_64_REGPARM_MAX;
11270 for (i = cum->regno; i < max; i++)
11272 mem = gen_rtx_MEM (word_mode,
11273 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
11274 MEM_NOTRAP_P (mem) = 1;
11275 set_mem_alias_set (mem, set);
11276 emit_move_insn (mem,
11277 gen_rtx_REG (word_mode,
11278 x86_64_int_parameter_registers[i]));
11281 if (ix86_varargs_fpr_size)
11283 machine_mode smode;
11284 rtx_code_label *label;
11285 rtx test;
11287 /* Now emit code to save SSE registers. The AX parameter contains the
11288 number of SSE parameter registers used to call this function, though all we
11289 actually check here is the zero/non-zero status. */
11291 label = gen_label_rtx ();
11292 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
11293 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
11294 label));
11296 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
11297 we used movdqa (i.e. TImode) instead? Perhaps even better would
11298 be if we could determine the real mode of the data, via a hook
11299 into pass_stdarg. Ignore all that for now. */
11300 smode = V4SFmode;
11301 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
11302 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
11304 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
11305 if (max > X86_64_SSE_REGPARM_MAX)
11306 max = X86_64_SSE_REGPARM_MAX;
11308 for (i = cum->sse_regno; i < max; ++i)
11310 mem = plus_constant (Pmode, save_area,
11311 i * 16 + ix86_varargs_gpr_size);
11312 mem = gen_rtx_MEM (smode, mem);
11313 MEM_NOTRAP_P (mem) = 1;
11314 set_mem_alias_set (mem, set);
11315 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
11317 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
11320 emit_label (label);
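/* The save area written above thus holds up to six word-sized GPR slots
   followed by up to eight 16-byte SSE slots, and the SSE stores are
   branched around when the caller left %al at zero.  A sketch of the
   byte offsets, assuming the usual X86_64_REGPARM_MAX == 6 and
   X86_64_SSE_REGPARM_MAX == 8 (names below are invented):  */
#if 0
enum
{
  SAVE_AREA_GPR_BYTES = 6 * 8,                    /* 48: %rdi..%r9      */
  SAVE_AREA_SSE_BYTES = 8 * 16,                   /* 128: %xmm0..%xmm7  */
  SAVE_AREA_TOTAL = SAVE_AREA_GPR_BYTES + SAVE_AREA_SSE_BYTES
};
/* The n-th GPR slot lives at n * 8 and the n-th SSE slot at 48 + n * 16,
   matching the gp_offset / fp_offset initialization in ix86_va_start.  */
#endif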
11324 static void
11325 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
11327 alias_set_type set = get_varargs_alias_set ();
11328 int i;
11330 /* Reset to zero, as a sysv va_arg might have been
11331 used before. */
11332 ix86_varargs_gpr_size = 0;
11333 ix86_varargs_fpr_size = 0;
11335 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
11337 rtx reg, mem;
11339 mem = gen_rtx_MEM (Pmode,
11340 plus_constant (Pmode, virtual_incoming_args_rtx,
11341 i * UNITS_PER_WORD));
11342 MEM_NOTRAP_P (mem) = 1;
11343 set_mem_alias_set (mem, set);
11345 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
11346 emit_move_insn (mem, reg);
11350 static void
11351 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
11352 tree type, int *, int no_rtl)
11354 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11355 CUMULATIVE_ARGS next_cum;
11356 tree fntype;
11358 /* This argument doesn't appear to be used anymore. Which is good,
11359 because the old code here didn't suppress rtl generation. */
11360 gcc_assert (!no_rtl);
11362 if (!TARGET_64BIT)
11363 return;
11365 fntype = TREE_TYPE (current_function_decl);
11367 /* For varargs, we do not want to skip the dummy va_dcl argument.
11368 For stdargs, we do want to skip the last named argument. */
11369 next_cum = *cum;
11370 if (stdarg_p (fntype))
11371 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11372 true);
11374 if (cum->call_abi == MS_ABI)
11375 setup_incoming_varargs_ms_64 (&next_cum);
11376 else
11377 setup_incoming_varargs_64 (&next_cum);
11380 static void
11381 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
11382 machine_mode mode,
11383 tree type,
11384 int *pretend_size ATTRIBUTE_UNUSED,
11385 int no_rtl)
11387 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11388 CUMULATIVE_ARGS next_cum;
11389 tree fntype;
11390 rtx save_area;
11391 int bnd_reg, i, max;
11393 gcc_assert (!no_rtl);
11395 /* Do nothing if we use plain pointer to argument area. */
11396 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
11397 return;
11399 fntype = TREE_TYPE (current_function_decl);
11401 /* For varargs, we do not want to skip the dummy va_dcl argument.
11402 For stdargs, we do want to skip the last named argument. */
11403 next_cum = *cum;
11404 if (stdarg_p (fntype))
11405 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11406 true);
11407 save_area = frame_pointer_rtx;
11409 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11410 if (max > X86_64_REGPARM_MAX)
11411 max = X86_64_REGPARM_MAX;
11413 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
11414 if (chkp_function_instrumented_p (current_function_decl))
11415 for (i = cum->regno; i < max; i++)
11417 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
11418 rtx ptr = gen_rtx_REG (Pmode,
11419 x86_64_int_parameter_registers[i]);
11420 rtx bounds;
11422 if (bnd_reg <= LAST_BND_REG)
11423 bounds = gen_rtx_REG (BNDmode, bnd_reg);
11424 else
11426 rtx ldx_addr =
11427 plus_constant (Pmode, arg_pointer_rtx,
11428 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
11429 bounds = gen_reg_rtx (BNDmode);
11430 emit_insn (BNDmode == BND64mode
11431 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
11432 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
11435 emit_insn (BNDmode == BND64mode
11436 ? gen_bnd64_stx (addr, ptr, bounds)
11437 : gen_bnd32_stx (addr, ptr, bounds));
11439 bnd_reg++;
11444 /* Return true if TYPE is a va_list of kind char *. */
11446 static bool
11447 is_va_list_char_pointer (tree type)
11449 tree canonic;
11451 /* For 32-bit it is always true. */
11452 if (!TARGET_64BIT)
11453 return true;
11454 canonic = ix86_canonical_va_list_type (type);
11455 return (canonic == ms_va_list_type_node
11456 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
11459 /* Implement va_start. */
11461 static void
11462 ix86_va_start (tree valist, rtx nextarg)
11464 HOST_WIDE_INT words, n_gpr, n_fpr;
11465 tree f_gpr, f_fpr, f_ovf, f_sav;
11466 tree gpr, fpr, ovf, sav, t;
11467 tree type;
11468 rtx ovf_rtx;
11470 if (flag_split_stack
11471 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11473 unsigned int scratch_regno;
11475 /* When we are splitting the stack, we can't refer to the stack
11476 arguments using internal_arg_pointer, because they may be on
11477 the old stack. The split stack prologue will arrange to
11478 leave a pointer to the old stack arguments in a scratch
11479 register, which we here copy to a pseudo-register. The split
11480 stack prologue can't set the pseudo-register directly because
11481 it (the prologue) runs before any registers have been saved. */
11483 scratch_regno = split_stack_prologue_scratch_regno ();
11484 if (scratch_regno != INVALID_REGNUM)
11486 rtx reg;
11487 rtx_insn *seq;
11489 reg = gen_reg_rtx (Pmode);
11490 cfun->machine->split_stack_varargs_pointer = reg;
11492 start_sequence ();
11493 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
11494 seq = get_insns ();
11495 end_sequence ();
11497 push_topmost_sequence ();
11498 emit_insn_after (seq, entry_of_function ());
11499 pop_topmost_sequence ();
11503 /* Only 64-bit targets need something special. */
11504 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11506 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11507 std_expand_builtin_va_start (valist, nextarg);
11508 else
11510 rtx va_r, next;
11512 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
11513 next = expand_binop (ptr_mode, add_optab,
11514 cfun->machine->split_stack_varargs_pointer,
11515 crtl->args.arg_offset_rtx,
11516 NULL_RTX, 0, OPTAB_LIB_WIDEN);
11517 convert_move (va_r, next, 0);
11519 /* Store zero bounds for va_list. */
11520 if (chkp_function_instrumented_p (current_function_decl))
11521 chkp_expand_bounds_reset_for_mem (valist,
11522 make_tree (TREE_TYPE (valist),
11523 next));
11526 return;
11529 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11530 f_fpr = DECL_CHAIN (f_gpr);
11531 f_ovf = DECL_CHAIN (f_fpr);
11532 f_sav = DECL_CHAIN (f_ovf);
11534 valist = build_simple_mem_ref (valist);
11535 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
11536 /* The following should be folded into the MEM_REF offset. */
11537 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11538 f_gpr, NULL_TREE);
11539 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11540 f_fpr, NULL_TREE);
11541 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11542 f_ovf, NULL_TREE);
11543 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11544 f_sav, NULL_TREE);
11546 /* Count number of gp and fp argument registers used. */
11547 words = crtl->args.info.words;
11548 n_gpr = crtl->args.info.regno;
11549 n_fpr = crtl->args.info.sse_regno;
11551 if (cfun->va_list_gpr_size)
11553 type = TREE_TYPE (gpr);
11554 t = build2 (MODIFY_EXPR, type,
11555 gpr, build_int_cst (type, n_gpr * 8));
11556 TREE_SIDE_EFFECTS (t) = 1;
11557 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11560 if (TARGET_SSE && cfun->va_list_fpr_size)
11562 type = TREE_TYPE (fpr);
11563 t = build2 (MODIFY_EXPR, type, fpr,
11564 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11565 TREE_SIDE_EFFECTS (t) = 1;
11566 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11569 /* Find the overflow area. */
11570 type = TREE_TYPE (ovf);
11571 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11572 ovf_rtx = crtl->args.internal_arg_pointer;
11573 else
11574 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11575 t = make_tree (type, ovf_rtx);
11576 if (words != 0)
11577 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11579 /* Store zero bounds for overflow area pointer. */
11580 if (chkp_function_instrumented_p (current_function_decl))
11581 chkp_expand_bounds_reset_for_mem (ovf, t);
11583 t = build2 (MODIFY_EXPR, type, ovf, t);
11584 TREE_SIDE_EFFECTS (t) = 1;
11585 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11587 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11589 /* Find the register save area.
11590 The function prologue saves it right above the stack frame. */
11591 type = TREE_TYPE (sav);
11592 t = make_tree (type, frame_pointer_rtx);
11593 if (!ix86_varargs_gpr_size)
11594 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11596 /* Store zero bounds for save area pointer. */
11597 if (chkp_function_instrumented_p (current_function_decl))
11598 chkp_expand_bounds_reset_for_mem (sav, t);
11600 t = build2 (MODIFY_EXPR, type, sav, t);
11601 TREE_SIDE_EFFECTS (t) = 1;
11602 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
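/* In user terms, the stores above mean that right after va_start the
   counters already account for the named arguments: gp_offset is
   8 * <named GPR args> and fp_offset is 48 + 16 * <named SSE args>.
   A hedged example (function name invented):  */
#if 0
#include <stdarg.h>

/* One named GPR argument (fmt) and one named SSE argument (x), so the
   expectation is gp_offset == 8 and fp_offset == 64 after va_start.  */
static double
sum_rest (const char *fmt, double x, ...)
{
  va_list ap;
  double total = x;

  va_start (ap, x);
  if (fmt && *fmt == 'd')
    total += va_arg (ap, double);   /* fetched via the SSE save area */
  va_end (ap);
  return total;
}
#endif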
11606 /* Implement va_arg. */
11608 static tree
11609 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11610 gimple_seq *post_p)
11612 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11613 tree f_gpr, f_fpr, f_ovf, f_sav;
11614 tree gpr, fpr, ovf, sav, t;
11615 int size, rsize;
11616 tree lab_false, lab_over = NULL_TREE;
11617 tree addr, t2;
11618 rtx container;
11619 int indirect_p = 0;
11620 tree ptrtype;
11621 machine_mode nat_mode;
11622 unsigned int arg_boundary;
11624 /* Only 64-bit targets need something special. */
11625 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11626 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11628 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11629 f_fpr = DECL_CHAIN (f_gpr);
11630 f_ovf = DECL_CHAIN (f_fpr);
11631 f_sav = DECL_CHAIN (f_ovf);
11633 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11634 valist, f_gpr, NULL_TREE);
11636 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11637 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11638 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11640 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11641 if (indirect_p)
11642 type = build_pointer_type (type);
11643 size = int_size_in_bytes (type);
11644 rsize = CEIL (size, UNITS_PER_WORD);
11646 nat_mode = type_natural_mode (type, NULL, false);
11647 switch (nat_mode)
11649 case V8SFmode:
11650 case V8SImode:
11651 case V32QImode:
11652 case V16HImode:
11653 case V4DFmode:
11654 case V4DImode:
11655 case V16SFmode:
11656 case V16SImode:
11657 case V64QImode:
11658 case V32HImode:
11659 case V8DFmode:
11660 case V8DImode:
11661 /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack. */
11662 if (!TARGET_64BIT_MS_ABI)
11664 container = NULL;
11665 break;
11667 /* FALLTHRU */
11669 default:
11670 container = construct_container (nat_mode, TYPE_MODE (type),
11671 type, 0, X86_64_REGPARM_MAX,
11672 X86_64_SSE_REGPARM_MAX, intreg,
11674 break;
11677 /* Pull the value out of the saved registers. */
11679 addr = create_tmp_var (ptr_type_node, "addr");
11681 if (container)
11683 int needed_intregs, needed_sseregs;
11684 bool need_temp;
11685 tree int_addr, sse_addr;
11687 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11688 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11690 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11692 need_temp = (!REG_P (container)
11693 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11694 || TYPE_ALIGN (type) > 128));
11696 /* If we are passing a structure, verify that it occupies a consecutive
11697 block of the register save area. If not, we need to do moves. */
11698 if (!need_temp && !REG_P (container))
11700 /* Verify that all registers are strictly consecutive */
11701 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11703 int i;
11705 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11707 rtx slot = XVECEXP (container, 0, i);
11708 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11709 || INTVAL (XEXP (slot, 1)) != i * 16)
11710 need_temp = true;
11713 else
11715 int i;
11717 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11719 rtx slot = XVECEXP (container, 0, i);
11720 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11721 || INTVAL (XEXP (slot, 1)) != i * 8)
11722 need_temp = true;
11726 if (!need_temp)
11728 int_addr = addr;
11729 sse_addr = addr;
11731 else
11733 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11734 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11737 /* First ensure that we fit completely in registers. */
11738 if (needed_intregs)
11740 t = build_int_cst (TREE_TYPE (gpr),
11741 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11742 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11743 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11744 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11745 gimplify_and_add (t, pre_p);
11747 if (needed_sseregs)
11749 t = build_int_cst (TREE_TYPE (fpr),
11750 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11751 + X86_64_REGPARM_MAX * 8);
11752 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11753 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11754 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11755 gimplify_and_add (t, pre_p);
11758 /* Compute index to start of area used for integer regs. */
11759 if (needed_intregs)
11761 /* int_addr = gpr + sav; */
11762 t = fold_build_pointer_plus (sav, gpr);
11763 gimplify_assign (int_addr, t, pre_p);
11765 if (needed_sseregs)
11767 /* sse_addr = fpr + sav; */
11768 t = fold_build_pointer_plus (sav, fpr);
11769 gimplify_assign (sse_addr, t, pre_p);
11771 if (need_temp)
11773 int i, prev_size = 0;
11774 tree temp = create_tmp_var (type, "va_arg_tmp");
11776 /* addr = &temp; */
11777 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11778 gimplify_assign (addr, t, pre_p);
11780 for (i = 0; i < XVECLEN (container, 0); i++)
11782 rtx slot = XVECEXP (container, 0, i);
11783 rtx reg = XEXP (slot, 0);
11784 machine_mode mode = GET_MODE (reg);
11785 tree piece_type;
11786 tree addr_type;
11787 tree daddr_type;
11788 tree src_addr, src;
11789 int src_offset;
11790 tree dest_addr, dest;
11791 int cur_size = GET_MODE_SIZE (mode);
11793 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11794 prev_size = INTVAL (XEXP (slot, 1));
11795 if (prev_size + cur_size > size)
11797 cur_size = size - prev_size;
11798 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11799 if (mode == BLKmode)
11800 mode = QImode;
11802 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11803 if (mode == GET_MODE (reg))
11804 addr_type = build_pointer_type (piece_type);
11805 else
11806 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11807 true);
11808 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11809 true);
11811 if (SSE_REGNO_P (REGNO (reg)))
11813 src_addr = sse_addr;
11814 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11816 else
11818 src_addr = int_addr;
11819 src_offset = REGNO (reg) * 8;
11821 src_addr = fold_convert (addr_type, src_addr);
11822 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11824 dest_addr = fold_convert (daddr_type, addr);
11825 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11826 if (cur_size == GET_MODE_SIZE (mode))
11828 src = build_va_arg_indirect_ref (src_addr);
11829 dest = build_va_arg_indirect_ref (dest_addr);
11831 gimplify_assign (dest, src, pre_p);
11833 else
11835 tree copy
11836 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11837 3, dest_addr, src_addr,
11838 size_int (cur_size));
11839 gimplify_and_add (copy, pre_p);
11841 prev_size += cur_size;
11845 if (needed_intregs)
11847 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11848 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11849 gimplify_assign (gpr, t, pre_p);
11852 if (needed_sseregs)
11854 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11855 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11856 gimplify_assign (unshare_expr (fpr), t, pre_p);
11859 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11861 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11864 /* ... otherwise out of the overflow area. */
11866 /* When we align a parameter on the stack for the caller, if its
11867 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
11868 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee
11869 with the caller here. */
11870 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11871 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11872 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11874 /* Care for on-stack alignment if needed. */
11875 if (arg_boundary <= 64 || size == 0)
11876 t = ovf;
11877 else
11879 HOST_WIDE_INT align = arg_boundary / 8;
11880 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11881 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11882 build_int_cst (TREE_TYPE (t), -align));
11885 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11886 gimplify_assign (addr, t, pre_p);
11888 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11889 gimplify_assign (unshare_expr (ovf), t, pre_p);
11891 if (container)
11892 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11894 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11895 addr = fold_convert (ptrtype, addr);
11897 if (indirect_p)
11898 addr = build_va_arg_indirect_ref (addr);
11899 return build_va_arg_indirect_ref (addr);
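/* The on-stack alignment above is the usual round-up-to-a-power-of-two
   idiom, addr = (ovf + align - 1) & -align.  As plain C (hypothetical
   helper; ALIGN is in bytes and must be a power of two):  */
#if 0
#include <stdint.h>

static inline uintptr_t
align_up (uintptr_t p, uintptr_t align)
{
  return (p + align - 1) & -align;
}
/* align_up (0x1001, 16) == 0x1010.  */
#endif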
11902 /* Return true if OPNUM's MEM should be matched
11903 in movabs* patterns. */
11905 bool
11906 ix86_check_movabs (rtx insn, int opnum)
11908 rtx set, mem;
11910 set = PATTERN (insn);
11911 if (GET_CODE (set) == PARALLEL)
11912 set = XVECEXP (set, 0, 0);
11913 gcc_assert (GET_CODE (set) == SET);
11914 mem = XEXP (set, opnum);
11915 while (SUBREG_P (mem))
11916 mem = SUBREG_REG (mem);
11917 gcc_assert (MEM_P (mem));
11918 return volatile_ok || !MEM_VOLATILE_P (mem);
11921 /* Return false if INSN contains a MEM with a non-default address space. */
11922 bool
11923 ix86_check_no_addr_space (rtx insn)
11925 subrtx_var_iterator::array_type array;
11926 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11928 rtx x = *iter;
11929 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11930 return false;
11932 return true;
11935 /* Initialize the table of extra 80387 mathematical constants. */
11937 static void
11938 init_ext_80387_constants (void)
11940 static const char * cst[5] =
11942 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
11943 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
11944 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
11945 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
11946 "3.1415926535897932385128089594061862044", /* 4: fldpi */
11948 int i;
11950 for (i = 0; i < 5; i++)
11952 real_from_string (&ext_80387_constants_table[i], cst[i]);
11953 /* Ensure each constant is rounded to XFmode precision. */
11954 real_convert (&ext_80387_constants_table[i],
11955 XFmode, &ext_80387_constants_table[i]);
11958 ext_80387_constants_init = 1;
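/* The five strings above are simply log10(2), ln(2), log2(e), log2(10)
   and pi.  A throwaway double-precision cross-check against libm (it can
   only confirm the leading digits, since the table itself is kept to
   XFmode precision):  */
#if 0
#include <math.h>
#include <stdio.h>

int
main (void)
{
  printf ("fldlg2 %.16f\n", log10 (2.0));      /* 0.3010299956639812... */
  printf ("fldln2 %.16f\n", log (2.0));        /* 0.6931471805599453... */
  printf ("fldl2e %.16f\n", 1.0 / log (2.0));  /* 1.4426950408889634... */
  printf ("fldl2t %.16f\n", log2 (10.0));      /* 3.3219280948873623... */
  printf ("fldpi  %.16f\n", acos (-1.0));      /* 3.1415926535897932... */
  return 0;
}
#endif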
11961 /* Return non-zero if the constant is something that
11962 can be loaded with a special instruction. */
11965 standard_80387_constant_p (rtx x)
11967 machine_mode mode = GET_MODE (x);
11969 const REAL_VALUE_TYPE *r;
11971 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
11972 return -1;
11974 if (x == CONST0_RTX (mode))
11975 return 1;
11976 if (x == CONST1_RTX (mode))
11977 return 2;
11979 r = CONST_DOUBLE_REAL_VALUE (x);
11981 /* For XFmode constants, try to find a special 80387 instruction when
11982 optimizing for size or on those CPUs that benefit from them. */
11983 if (mode == XFmode
11984 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
11986 int i;
11988 if (! ext_80387_constants_init)
11989 init_ext_80387_constants ();
11991 for (i = 0; i < 5; i++)
11992 if (real_identical (r, &ext_80387_constants_table[i]))
11993 return i + 3;
11996 /* A load of the constant -0.0 or -1.0 will be split into an
11997 fldz;fchs or fld1;fchs sequence. */
11998 if (real_isnegzero (r))
11999 return 8;
12000 if (real_identical (r, &dconstm1))
12001 return 9;
12003 return 0;
12006 /* Return the opcode of the special instruction to be used to load
12007 the constant X. */
12009 const char *
12010 standard_80387_constant_opcode (rtx x)
12012 switch (standard_80387_constant_p (x))
12014 case 1:
12015 return "fldz";
12016 case 2:
12017 return "fld1";
12018 case 3:
12019 return "fldlg2";
12020 case 4:
12021 return "fldln2";
12022 case 5:
12023 return "fldl2e";
12024 case 6:
12025 return "fldl2t";
12026 case 7:
12027 return "fldpi";
12028 case 8:
12029 case 9:
12030 return "#";
12031 default:
12032 gcc_unreachable ();
12036 /* Return the CONST_DOUBLE representing the 80387 constant that is
12037 loaded by the specified special instruction. The argument IDX
12038 matches the return value from standard_80387_constant_p. */
12041 standard_80387_constant_rtx (int idx)
12043 int i;
12045 if (! ext_80387_constants_init)
12046 init_ext_80387_constants ();
12048 switch (idx)
12050 case 3:
12051 case 4:
12052 case 5:
12053 case 6:
12054 case 7:
12055 i = idx - 3;
12056 break;
12058 default:
12059 gcc_unreachable ();
12062 return const_double_from_real_value (ext_80387_constants_table[i],
12063 XFmode);
12066 /* Return 1 if X is all bits 0, and 2 if X is all bits 1,
12067 in a supported SSE/AVX vector mode. */
12070 standard_sse_constant_p (rtx x, machine_mode pred_mode)
12072 machine_mode mode;
12074 if (!TARGET_SSE)
12075 return 0;
12077 mode = GET_MODE (x);
12079 if (x == const0_rtx || const0_operand (x, mode))
12080 return 1;
12082 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12084 /* VOIDmode integer constant, get mode from the predicate. */
12085 if (mode == VOIDmode)
12086 mode = pred_mode;
12088 switch (GET_MODE_SIZE (mode))
12090 case 64:
12091 if (TARGET_AVX512F)
12092 return 2;
12093 break;
12094 case 32:
12095 if (TARGET_AVX2)
12096 return 2;
12097 break;
12098 case 16:
12099 if (TARGET_SSE2)
12100 return 2;
12101 break;
12102 case 0:
12103 /* VOIDmode */
12104 gcc_unreachable ();
12105 default:
12106 break;
12110 return 0;
12113 /* Return the opcode of the special instruction to be used to load
12114 the constant X. */
12116 const char *
12117 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
12119 machine_mode mode;
12121 gcc_assert (TARGET_SSE);
12123 mode = GET_MODE (x);
12125 if (x == const0_rtx || const0_operand (x, mode))
12127 switch (get_attr_mode (insn))
12129 case MODE_XI:
12130 return "vpxord\t%g0, %g0, %g0";
12131 case MODE_OI:
12132 return (TARGET_AVX512VL
12133 ? "vpxord\t%x0, %x0, %x0"
12134 : "vpxor\t%x0, %x0, %x0");
12135 case MODE_TI:
12136 return (TARGET_AVX512VL
12137 ? "vpxord\t%t0, %t0, %t0"
12138 : "%vpxor\t%0, %d0");
12140 case MODE_V8DF:
12141 return (TARGET_AVX512DQ
12142 ? "vxorpd\t%g0, %g0, %g0"
12143 : "vpxorq\t%g0, %g0, %g0");
12144 case MODE_V4DF:
12145 return "vxorpd\t%x0, %x0, %x0";
12146 case MODE_V2DF:
12147 return "%vxorpd\t%0, %d0";
12149 case MODE_V16SF:
12150 return (TARGET_AVX512DQ
12151 ? "vxorps\t%g0, %g0, %g0"
12152 : "vpxord\t%g0, %g0, %g0");
12153 case MODE_V8SF:
12154 return "vxorps\t%x0, %x0, %x0";
12155 case MODE_V4SF:
12156 return "%vxorps\t%0, %d0";
12158 default:
12159 gcc_unreachable ();
12162 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12164 enum attr_mode insn_mode = get_attr_mode (insn);
12166 switch (insn_mode)
12168 case MODE_XI:
12169 case MODE_V8DF:
12170 case MODE_V16SF:
12171 gcc_assert (TARGET_AVX512F);
12172 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
12174 case MODE_OI:
12175 case MODE_V4DF:
12176 case MODE_V8SF:
12177 gcc_assert (TARGET_AVX2);
12178 /* FALLTHRU */
12179 case MODE_TI:
12180 case MODE_V2DF:
12181 case MODE_V4SF:
12182 gcc_assert (TARGET_SSE2);
12183 return (TARGET_AVX
12184 ? "vpcmpeqd\t%0, %0, %0"
12185 : "pcmpeqd\t%0, %0");
12187 default:
12188 gcc_unreachable ();
12192 gcc_unreachable ();
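/* In user terms, the two constants recognized above are the all-zeros and
   all-ones vectors, typically materialized with an xor or a compare-equal
   of a register with itself rather than a load.  A hedged intrinsics
   example (exact instruction selection depends on the enabled ISA):  */
#if 0
#include <emmintrin.h>

static __m128i
all_zeros (void)
{
  return _mm_setzero_si128 ();     /* usually pxor / vpxor        */
}

static __m128i
all_ones (void)
{
  return _mm_set1_epi32 (-1);      /* usually pcmpeqd / vpcmpeqd  */
}
#endif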
12195 /* Returns true if INSN can be transformed from a memory load
12196 to a supported FP constant load. */
12198 bool
12199 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
12201 rtx src = find_constant_src (insn);
12203 gcc_assert (REG_P (dst));
12205 if (src == NULL
12206 || (SSE_REGNO_P (REGNO (dst))
12207 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
12208 || (STACK_REGNO_P (REGNO (dst))
12209 && standard_80387_constant_p (src) < 1))
12210 return false;
12212 return true;
12215 /* Returns true if OP contains a symbol reference */
12217 bool
12218 symbolic_reference_mentioned_p (rtx op)
12220 const char *fmt;
12221 int i;
12223 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
12224 return true;
12226 fmt = GET_RTX_FORMAT (GET_CODE (op));
12227 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
12229 if (fmt[i] == 'E')
12231 int j;
12233 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
12234 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
12235 return true;
12238 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
12239 return true;
12242 return false;
12245 /* Return true if it is appropriate to emit `ret' instructions in the
12246 body of a function. Do this only if the epilogue is simple, needing a
12247 couple of insns. Prior to reloading, we can't tell how many registers
12248 must be saved, so return false then. Return false if there is no frame
12249 marker to de-allocate. */
12251 bool
12252 ix86_can_use_return_insn_p (void)
12254 struct ix86_frame frame;
12256 /* Don't use `ret' instruction in interrupt handler. */
12257 if (! reload_completed
12258 || frame_pointer_needed
12259 || cfun->machine->func_type != TYPE_NORMAL)
12260 return 0;
12262 /* Don't allow more than 32k pop, since that's all we can do
12263 with one instruction. */
12264 if (crtl->args.pops_args && crtl->args.size >= 32768)
12265 return 0;
12267 frame = cfun->machine->frame;
12268 return (frame.stack_pointer_offset == UNITS_PER_WORD
12269 && (frame.nregs + frame.nsseregs) == 0);
12272 /* Value should be nonzero if functions must have frame pointers.
12273 Zero means the frame pointer need not be set up (and parms may
12274 be accessed via the stack pointer) in functions that seem suitable. */
12276 static bool
12277 ix86_frame_pointer_required (void)
12279 /* If we accessed previous frames, then the generated code expects
12280 to be able to access the saved ebp value in our frame. */
12281 if (cfun->machine->accesses_prev_frame)
12282 return true;
12284 /* Several x86 OSes need a frame pointer for other reasons,
12285 usually pertaining to setjmp. */
12286 if (SUBTARGET_FRAME_POINTER_REQUIRED)
12287 return true;
12289 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
12290 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
12291 return true;
12293 /* With Win64 SEH, very large frames need a frame pointer, as the maximum
12294 stack allocation is 4GB. */
12295 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
12296 return true;
12298 /* SSE saves require a frame pointer when the stack is misaligned. */
12299 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
12300 return true;
12302 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
12303 turns off the frame pointer by default. Turn it back on now if
12304 we've not got a leaf function. */
12305 if (TARGET_OMIT_LEAF_FRAME_POINTER
12306 && (!crtl->is_leaf
12307 || ix86_current_function_calls_tls_descriptor))
12308 return true;
12310 if (crtl->profile && !flag_fentry)
12311 return true;
12313 return false;
12316 /* Record that the current function accesses previous call frames. */
12318 void
12319 ix86_setup_frame_addresses (void)
12321 cfun->machine->accesses_prev_frame = 1;
12324 #ifndef USE_HIDDEN_LINKONCE
12325 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
12326 # define USE_HIDDEN_LINKONCE 1
12327 # else
12328 # define USE_HIDDEN_LINKONCE 0
12329 # endif
12330 #endif
12332 static int pic_labels_used;
12334 /* Fills in the label name that should be used for a pc thunk for
12335 the given register. */
12337 static void
12338 get_pc_thunk_name (char name[32], unsigned int regno)
12340 gcc_assert (!TARGET_64BIT);
12342 if (USE_HIDDEN_LINKONCE)
12343 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
12344 else
12345 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
12349 /* This function generates code for -fpic that loads %ebx with
12350 the return address of the caller and then returns. */
12352 static void
12353 ix86_code_end (void)
12355 rtx xops[2];
12356 int regno;
12358 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
12360 char name[32];
12361 tree decl;
12363 if (!(pic_labels_used & (1 << regno)))
12364 continue;
12366 get_pc_thunk_name (name, regno);
12368 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
12369 get_identifier (name),
12370 build_function_type_list (void_type_node, NULL_TREE));
12371 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
12372 NULL_TREE, void_type_node);
12373 TREE_PUBLIC (decl) = 1;
12374 TREE_STATIC (decl) = 1;
12375 DECL_IGNORED_P (decl) = 1;
12377 #if TARGET_MACHO
12378 if (TARGET_MACHO)
12380 switch_to_section (darwin_sections[picbase_thunk_section]);
12381 fputs ("\t.weak_definition\t", asm_out_file);
12382 assemble_name (asm_out_file, name);
12383 fputs ("\n\t.private_extern\t", asm_out_file);
12384 assemble_name (asm_out_file, name);
12385 putc ('\n', asm_out_file);
12386 ASM_OUTPUT_LABEL (asm_out_file, name);
12387 DECL_WEAK (decl) = 1;
12389 else
12390 #endif
12391 if (USE_HIDDEN_LINKONCE)
12393 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
12395 targetm.asm_out.unique_section (decl, 0);
12396 switch_to_section (get_named_section (decl, NULL, 0));
12398 targetm.asm_out.globalize_label (asm_out_file, name);
12399 fputs ("\t.hidden\t", asm_out_file);
12400 assemble_name (asm_out_file, name);
12401 putc ('\n', asm_out_file);
12402 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
12404 else
12406 switch_to_section (text_section);
12407 ASM_OUTPUT_LABEL (asm_out_file, name);
12410 DECL_INITIAL (decl) = make_node (BLOCK);
12411 current_function_decl = decl;
12412 allocate_struct_function (decl, false);
12413 init_function_start (decl);
12414 /* We're about to hide the function body from callees of final_* by
12415 emitting it directly; tell them we're a thunk, if they care. */
12416 cfun->is_thunk = true;
12417 first_function_block_is_cold = false;
12418 /* Make sure unwind info is emitted for the thunk if needed. */
12419 final_start_function (emit_barrier (), asm_out_file, 1);
12421 /* Pad stack IP move with 4 instructions (two NOPs count
12422 as one instruction). */
12423 if (TARGET_PAD_SHORT_FUNCTION)
12425 int i = 8;
12427 while (i--)
12428 fputs ("\tnop\n", asm_out_file);
12431 xops[0] = gen_rtx_REG (Pmode, regno);
12432 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
12433 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
12434 output_asm_insn ("%!ret", NULL);
12435 final_end_function ();
12436 init_insn_lengths ();
12437 free_after_compilation (cfun);
12438 set_cfun (NULL);
12439 current_function_decl = NULL;
12442 if (flag_split_stack)
12443 file_end_indicate_split_stack ();
12446 /* Emit code for the SET_GOT patterns. */
12448 const char *
12449 output_set_got (rtx dest, rtx label)
12451 rtx xops[3];
12453 xops[0] = dest;
12455 if (TARGET_VXWORKS_RTP && flag_pic)
12457 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
12458 xops[2] = gen_rtx_MEM (Pmode,
12459 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
12460 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
12462 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
12463 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
12464 an unadorned address. */
12465 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
12466 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
12467 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
12468 return "";
12471 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
12473 if (flag_pic)
12475 char name[32];
12476 get_pc_thunk_name (name, REGNO (dest));
12477 pic_labels_used |= 1 << REGNO (dest);
12479 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
12480 xops[2] = gen_rtx_MEM (QImode, xops[2]);
12481 output_asm_insn ("%!call\t%X2", xops);
12483 #if TARGET_MACHO
12484 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
12485 This is what will be referenced by the Mach-O PIC subsystem. */
12486 if (machopic_should_output_picbase_label () || !label)
12487 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
12489 /* When we are restoring the pic base at the site of a nonlocal label,
12490 and we decided to emit the pic base above, we will still output a
12491 local label used for calculating the correction offset (even though
12492 the offset will be 0 in that case). */
12493 if (label)
12494 targetm.asm_out.internal_label (asm_out_file, "L",
12495 CODE_LABEL_NUMBER (label));
12496 #endif
12498 else
12500 if (TARGET_MACHO)
12501 /* We don't need a pic base, we're not producing pic. */
12502 gcc_unreachable ();
12504 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
12505 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
12506 targetm.asm_out.internal_label (asm_out_file, "L",
12507 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
12510 if (!TARGET_MACHO)
12511 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
12513 return "";
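/* As an illustration (a sketch, assuming a 32-bit ELF target and %ebx as
   DEST), the flag_pic path above typically expands to something like:

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   i.e. a call to the PC thunk followed by an add of the GOT symbol, after
   which %ebx points at the GOT.  */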
12516 /* Generate a "push" pattern for input ARG. */

12518 static rtx
12519 gen_push (rtx arg)
12521 struct machine_function *m = cfun->machine;
12523 if (m->fs.cfa_reg == stack_pointer_rtx)
12524 m->fs.cfa_offset += UNITS_PER_WORD;
12525 m->fs.sp_offset += UNITS_PER_WORD;
12527 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12528 arg = gen_rtx_REG (word_mode, REGNO (arg));
12530 return gen_rtx_SET (gen_rtx_MEM (word_mode,
12531 gen_rtx_PRE_DEC (Pmode,
12532 stack_pointer_rtx)),
12533 arg);
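/* Illustrative only: on a 64-bit target, gen_push applied to %rbx yields

	(set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI bx))

   which the backend's push pattern emits as "pushq %rbx", while the
   bookkeeping above advances cfa_offset/sp_offset by UNITS_PER_WORD (8).  */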
12536 /* Generate a "pop" pattern for input ARG. */
12538 static rtx
12539 gen_pop (rtx arg)
12541 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12542 arg = gen_rtx_REG (word_mode, REGNO (arg));
12544 return gen_rtx_SET (arg,
12545 gen_rtx_MEM (word_mode,
12546 gen_rtx_POST_INC (Pmode,
12547 stack_pointer_rtx)));
12550 /* Return the number of an unused call-clobbered register available
12551 for the entire function, or INVALID_REGNUM if there is none. */
12553 static unsigned int
12554 ix86_select_alt_pic_regnum (void)
12556 if (ix86_use_pseudo_pic_reg ())
12557 return INVALID_REGNUM;
12559 if (crtl->is_leaf
12560 && !crtl->profile
12561 && !ix86_current_function_calls_tls_descriptor)
12563 int i, drap;
12564 /* Can't use the same register for both PIC and DRAP. */
12565 if (crtl->drap_reg)
12566 drap = REGNO (crtl->drap_reg);
12567 else
12568 drap = -1;
12569 for (i = 2; i >= 0; --i)
12570 if (i != drap && !df_regs_ever_live_p (i))
12571 return i;
12574 return INVALID_REGNUM;
12577 /* Return true if REGNO is used by the epilogue. */
12579 bool
12580 ix86_epilogue_uses (int regno)
12582 /* If there are no caller-saved registers, we preserve all registers,
12583 except for MMX and x87 registers which aren't supported when saving
12584 and restoring registers. Don't explicitly save SP register since
12585 it is always preserved. */
12586 return (epilogue_completed
12587 && cfun->machine->no_caller_saved_registers
12588 && !fixed_regs[regno]
12589 && !STACK_REGNO_P (regno)
12590 && !MMX_REGNO_P (regno));
12593 /* Return nonzero if register REGNO can be used as a scratch register
12594 in peephole2. */
12596 static bool
12597 ix86_hard_regno_scratch_ok (unsigned int regno)
12599 /* If there are no caller-saved registers, we can't use any register
12600 as a scratch register after the epilogue, so use REGNO as a scratch
12601 register only if it has been used before, to avoid saving and
12602 restoring it. */
12603 return (!cfun->machine->no_caller_saved_registers
12604 || (!epilogue_completed
12605 && df_regs_ever_live_p (regno)));
12608 /* Return true if register class CL should be an additional allocno
12609 class. */
12611 static bool
12612 ix86_additional_allocno_class_p (reg_class_t cl)
12614 return cl == MOD4_SSE_REGS;
12617 /* Return TRUE if we need to save REGNO. */
12619 static bool
12620 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
12622 /* If there are no caller-saved registers, we preserve all registers,
12623 except for MMX and x87 registers which aren't supported when saving
12624 and restoring registers. Don't explicitly save SP register since
12625 it is always preserved. */
12626 if (cfun->machine->no_caller_saved_registers)
12628 /* Don't preserve registers used for function return value. */
12629 rtx reg = crtl->return_rtx;
12630 if (reg)
12632 unsigned int i = REGNO (reg);
12633 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12634 while (nregs-- > 0)
12635 if ((i + nregs) == regno)
12636 return false;
12638 reg = crtl->return_bnd;
12639 if (reg)
12641 i = REGNO (reg);
12642 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12643 while (nregs-- > 0)
12644 if ((i + nregs) == regno)
12645 return false;
12649 return (df_regs_ever_live_p (regno)
12650 && !fixed_regs[regno]
12651 && !STACK_REGNO_P (regno)
12652 && !MMX_REGNO_P (regno)
12653 && (regno != HARD_FRAME_POINTER_REGNUM
12654 || !frame_pointer_needed));
12657 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12658 && pic_offset_table_rtx)
12660 if (ix86_use_pseudo_pic_reg ())
12662 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12663 _mcount in prologue. */
12664 if (!TARGET_64BIT && flag_pic && crtl->profile)
12665 return true;
12667 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12668 || crtl->profile
12669 || crtl->calls_eh_return
12670 || crtl->uses_const_pool
12671 || cfun->has_nonlocal_label)
12672 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12675 if (crtl->calls_eh_return && maybe_eh_return)
12677 unsigned i;
12678 for (i = 0; ; i++)
12680 unsigned test = EH_RETURN_DATA_REGNO (i);
12681 if (test == INVALID_REGNUM)
12682 break;
12683 if (test == regno)
12684 return true;
12688 if (ignore_outlined && cfun->machine->call_ms2sysv)
12690 unsigned count = cfun->machine->call_ms2sysv_extra_regs
12691 + xlogue_layout::MIN_REGS;
12692 if (xlogue_layout::is_stub_managed_reg (regno, count))
12693 return false;
12696 if (crtl->drap_reg
12697 && regno == REGNO (crtl->drap_reg)
12698 && !cfun->machine->no_drap_save_restore)
12699 return true;
12701 return (df_regs_ever_live_p (regno)
12702 && !call_used_regs[regno]
12703 && !fixed_regs[regno]
12704 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12707 /* Return number of saved general purpose registers. */
12709 static int
12710 ix86_nsaved_regs (void)
12712 int nregs = 0;
12713 int regno;
12715 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12716 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12717 nregs ++;
12718 return nregs;
12721 /* Return number of saved SSE registers. */
12723 static int
12724 ix86_nsaved_sseregs (void)
12726 int nregs = 0;
12727 int regno;
12729 if (!TARGET_64BIT_MS_ABI)
12730 return 0;
12731 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12732 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12733 nregs ++;
12734 return nregs;
12737 /* Given FROM and TO register numbers, say whether this elimination is
12738 allowed. If stack alignment is needed, we can only replace argument
12739 pointer with hard frame pointer, or replace frame pointer with stack
12740 pointer. Otherwise, frame pointer elimination is automatically
12741 handled and all other eliminations are valid. */
12743 static bool
12744 ix86_can_eliminate (const int from, const int to)
12746 if (stack_realign_fp)
12747 return ((from == ARG_POINTER_REGNUM
12748 && to == HARD_FRAME_POINTER_REGNUM)
12749 || (from == FRAME_POINTER_REGNUM
12750 && to == STACK_POINTER_REGNUM));
12751 else
12752 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12755 /* Return the offset between two registers, one to be eliminated, and the other
12756 its replacement, at the start of a routine. */
12758 HOST_WIDE_INT
12759 ix86_initial_elimination_offset (int from, int to)
12761 struct ix86_frame frame = cfun->machine->frame;
12763 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12764 return frame.hard_frame_pointer_offset;
12765 else if (from == FRAME_POINTER_REGNUM
12766 && to == HARD_FRAME_POINTER_REGNUM)
12767 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12768 else
12770 gcc_assert (to == STACK_POINTER_REGNUM);
12772 if (from == ARG_POINTER_REGNUM)
12773 return frame.stack_pointer_offset;
12775 gcc_assert (from == FRAME_POINTER_REGNUM);
12776 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12780 /* In a dynamically-aligned function, we can't know the offset from
12781 stack pointer to frame pointer, so we must ensure that setjmp
12782 eliminates fp against the hard fp (%ebp) rather than trying to
12783 index from %esp up to the top of the frame across a gap that is
12784 of unknown (at compile-time) size. */
12785 static rtx
12786 ix86_builtin_setjmp_frame_value (void)
12788 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12791 /* Emit a warning for unsupported ms_abi to sysv prologues/epilogues. */
12792 static void warn_once_call_ms2sysv_xlogues (const char *feature)
12794 static bool warned_once = false;
12795 if (!warned_once)
12797 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
12798 feature);
12799 warned_once = true;
12803 /* When using -fsplit-stack, the allocation routines set a field in
12804 the TCB to the bottom of the stack plus this much space, measured
12805 in bytes. */
12807 #define SPLIT_STACK_AVAILABLE 256
12809 /* Fill the ix86_frame structure describing the frame of the currently compiled function. */
12811 static void
12812 ix86_compute_frame_layout (void)
12814 struct ix86_frame *frame = &cfun->machine->frame;
12815 struct machine_function *m = cfun->machine;
12816 unsigned HOST_WIDE_INT stack_alignment_needed;
12817 HOST_WIDE_INT offset;
12818 unsigned HOST_WIDE_INT preferred_alignment;
12819 HOST_WIDE_INT size = get_frame_size ();
12820 HOST_WIDE_INT to_allocate;
12822 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
12823 * ms_abi functions that call a sysv function. We now need to prune away
12824 * cases where it should be disabled. */
12825 if (TARGET_64BIT && m->call_ms2sysv)
12827 gcc_assert (TARGET_64BIT_MS_ABI);
12828 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
12829 gcc_assert (!TARGET_SEH);
12830 gcc_assert (TARGET_SSE);
12831 gcc_assert (!ix86_using_red_zone ());
12833 if (crtl->calls_eh_return)
12835 gcc_assert (!reload_completed);
12836 m->call_ms2sysv = false;
12837 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
12840 else if (ix86_static_chain_on_stack)
12842 gcc_assert (!reload_completed);
12843 m->call_ms2sysv = false;
12844 warn_once_call_ms2sysv_xlogues ("static call chains");
12847 /* Finally, compute which registers the stub will manage. */
12848 else
12850 unsigned count = xlogue_layout::count_stub_managed_regs ();
12851 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
12855 frame->nregs = ix86_nsaved_regs ();
12856 frame->nsseregs = ix86_nsaved_sseregs ();
12857 m->call_ms2sysv_pad_in = 0;
12858 m->call_ms2sysv_pad_out = 0;
12860 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
12861 except for function prologues, leaf functions and when the default
12862 incoming stack boundary is overridden at the command line or via the
12863 force_align_arg_pointer attribute. */
12864 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12865 && (!crtl->is_leaf || cfun->calls_alloca != 0
12866 || ix86_current_function_calls_tls_descriptor
12867 || ix86_incoming_stack_boundary < 128))
12869 crtl->preferred_stack_boundary = 128;
12870 crtl->stack_alignment_needed = 128;
12873 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12874 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12876 gcc_assert (!size || stack_alignment_needed);
12877 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12878 gcc_assert (preferred_alignment <= stack_alignment_needed);
12880 /* For SEH we have to limit the amount of code movement into the prologue.
12881 At present we do this via a BLOCKAGE, at which point there's very little
12882 scheduling that can be done, which means that there's very little point
12883 in doing anything except PUSHs. */
12884 if (TARGET_SEH)
12885 m->use_fast_prologue_epilogue = false;
12886 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
12888 int count = frame->nregs;
12889 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12891 /* The fast prologue uses move instead of push to save registers. This
12892 is significantly longer, but also executes faster as modern hardware
12893 can execute the moves in parallel, but can't do that for push/pop.
12895 Be careful about choosing which prologue to emit: when the function takes
12896 many instructions to execute we may use the slow version, as well as when
12897 the function is known to be outside a hot spot (this is known with
12898 feedback only). Weight the size of the function by the number of registers
12899 to save, as it is cheap to use one or two push instructions but very
12900 slow to use many of them. */
12901 if (count)
12902 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12903 if (node->frequency < NODE_FREQUENCY_NORMAL
12904 || (flag_branch_probabilities
12905 && node->frequency < NODE_FREQUENCY_HOT))
12906 m->use_fast_prologue_epilogue = false;
12907 else
12908 m->use_fast_prologue_epilogue
12909 = !expensive_function_p (count);
12912 frame->save_regs_using_mov
12913 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
12914 /* If static stack checking is enabled and done with probes,
12915 the registers need to be saved before allocating the frame. */
12916 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
12918 /* Skip return address. */
12919 offset = UNITS_PER_WORD;
12921 /* Skip pushed static chain. */
12922 if (ix86_static_chain_on_stack)
12923 offset += UNITS_PER_WORD;
12925 /* Skip saved base pointer. */
12926 if (frame_pointer_needed)
12927 offset += UNITS_PER_WORD;
12928 frame->hfp_save_offset = offset;
12930 /* The traditional frame pointer location is at the top of the frame. */
12931 frame->hard_frame_pointer_offset = offset;
12933 /* Register save area */
12934 offset += frame->nregs * UNITS_PER_WORD;
12935 frame->reg_save_offset = offset;
12937 /* On SEH targets, registers are pushed just before the frame pointer
12938 location. */
12939 if (TARGET_SEH)
12940 frame->hard_frame_pointer_offset = offset;
12942 /* When re-aligning the stack frame, but not saving SSE registers, this
12943 is the offset we want to adjust the stack pointer to. */
12944 frame->stack_realign_allocate_offset = offset;
12946 /* The re-aligned stack starts here. Values before this point are not
12947 directly comparable with values below this point. Use sp_valid_at
12948 to determine if the stack pointer is valid for a given offset and
12949 fp_valid_at for the frame pointer. */
12950 if (stack_realign_fp)
12951 offset = ROUND_UP (offset, stack_alignment_needed);
12952 frame->stack_realign_offset = offset;
12954 if (TARGET_64BIT && m->call_ms2sysv)
12956 gcc_assert (stack_alignment_needed >= 16);
12957 gcc_assert (!frame->nsseregs);
12959 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
12961 /* Select an appropriate layout for incoming stack offset. */
12962 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
12964 if ((offset + xlogue.get_stack_space_used ()) & UNITS_PER_WORD)
12965 m->call_ms2sysv_pad_out = 1;
12967 offset += xlogue.get_stack_space_used ();
12968 gcc_assert (!(offset & 0xf));
12969 frame->outlined_save_offset = offset;
12972 /* Align and set SSE register save area. */
12973 else if (frame->nsseregs)
12975 /* The only ABI that has saved SSE registers (Win64) also has a
12976 16-byte aligned default stack. However, many programs violate
12977 the ABI, and Wine64 forces stack realignment to compensate.
12979 If the incoming stack boundary is at least 16 bytes, or DRAP is
12980 required and the DRAP re-alignment boundary is at least 16 bytes,
12981 then we want the SSE register save area properly aligned. */
12982 if (ix86_incoming_stack_boundary >= 128
12983 || (stack_realign_drap && stack_alignment_needed >= 16))
12984 offset = ROUND_UP (offset, 16);
12985 offset += frame->nsseregs * 16;
12986 frame->stack_realign_allocate_offset = offset;
12989 frame->sse_reg_save_offset = offset;
12991 /* Va-arg area */
12992 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
12993 offset += frame->va_arg_size;
12995 /* Align start of frame for local function. */
12996 if (stack_realign_fp
12997 || offset != frame->sse_reg_save_offset
12998 || size != 0
12999 || !crtl->is_leaf
13000 || cfun->calls_alloca
13001 || ix86_current_function_calls_tls_descriptor)
13002 offset = ROUND_UP (offset, stack_alignment_needed);
13004 /* Frame pointer points here. */
13005 frame->frame_pointer_offset = offset;
13007 offset += size;
13009 /* Add outgoing arguments area. Can be skipped if we eliminated
13010 all the function calls as dead code.
13011 Skipping is, however, impossible when the function calls alloca. The
13012 alloca expander assumes that the last crtl->outgoing_args_size bytes
13013 of the stack frame are unused. */
13014 if (ACCUMULATE_OUTGOING_ARGS
13015 && (!crtl->is_leaf || cfun->calls_alloca
13016 || ix86_current_function_calls_tls_descriptor))
13018 offset += crtl->outgoing_args_size;
13019 frame->outgoing_arguments_size = crtl->outgoing_args_size;
13021 else
13022 frame->outgoing_arguments_size = 0;
13024 /* Align stack boundary. Only needed if we're calling another function
13025 or using alloca. */
13026 if (!crtl->is_leaf || cfun->calls_alloca
13027 || ix86_current_function_calls_tls_descriptor)
13028 offset = ROUND_UP (offset, preferred_alignment);
13030 /* We've reached end of stack frame. */
13031 frame->stack_pointer_offset = offset;
13033 /* Size prologue needs to allocate. */
13034 to_allocate = offset - frame->sse_reg_save_offset;
13036 if ((!to_allocate && frame->nregs <= 1)
13037 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
13038 frame->save_regs_using_mov = false;
13040 if (ix86_using_red_zone ()
13041 && crtl->sp_is_unchanging
13042 && crtl->is_leaf
13043 && !ix86_pc_thunk_call_expanded
13044 && !ix86_current_function_calls_tls_descriptor)
13046 frame->red_zone_size = to_allocate;
13047 if (frame->save_regs_using_mov)
13048 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
13049 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
13050 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
13052 else
13053 frame->red_zone_size = 0;
13054 frame->stack_pointer_offset -= frame->red_zone_size;
13056 /* The SEH frame pointer location is near the bottom of the frame.
13057 This is enforced by the fact that the difference between the
13058 stack pointer and the frame pointer is limited to 240 bytes in
13059 the unwind data structure. */
13060 if (TARGET_SEH)
13062 HOST_WIDE_INT diff;
13064 /* If we can leave the frame pointer where it is, do so. Also, this returns
13065 the establisher frame for __builtin_frame_address (0). */
13066 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
13067 if (diff <= SEH_MAX_FRAME_SIZE
13068 && (diff > 240 || (diff & 15) != 0)
13069 && !crtl->accesses_prior_frames)
13071 /* Ideally we'd determine what portion of the local stack frame
13072 (within the constraint of the lowest 240) is most heavily used.
13073 But without that complication, simply bias the frame pointer
13074 by 128 bytes so as to maximize the amount of the local stack
13075 frame that is addressable with 8-bit offsets. */
13076 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
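/* A rough sketch (not authoritative; see the code above for the exact
   boundaries and the re-alignment padding, outlined ms2sysv area and
   red-zone adjustments) of the order in which the areas are laid out,
   going downward from the CFA:

	return address
	pushed static chain (if any)
	saved frame pointer (if needed)     -- hard_frame_pointer_offset
	GP register save area               -- reg_save_offset
	SSE / outlined register save area   -- sse_reg_save_offset
	va_arg register save area
	local variables                     -- frame_pointer_offset
	outgoing argument area
	end of frame                        -- stack_pointer_offset  */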
13081 /* This is semi-inlined memory_address_length, but simplified
13082 since we know that we're always dealing with reg+offset, and
13083 to avoid having to create and discard all that rtl. */
13085 static inline int
13086 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
13088 int len = 4;
13090 if (offset == 0)
13092 /* EBP and R13 cannot be encoded without an offset. */
13093 len = (regno == BP_REG || regno == R13_REG);
13095 else if (IN_RANGE (offset, -128, 127))
13096 len = 1;
13098 /* ESP and R12 must be encoded with a SIB byte. */
13099 if (regno == SP_REG || regno == R12_REG)
13100 len++;
13102 return len;
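/* Worked examples of the encoding length computed above (a sketch,
   following the usual x86 addressing-mode rules):

	(%rax)       -> 0   no displacement, no SIB byte
	(%rbp)       -> 1   %rbp/%r13 need at least a disp8 of zero
	16(%rbx)     -> 1   disp8
	16(%rsp)     -> 2   disp8 + SIB byte (%rsp/%r12 require a SIB)
	1024(%rbx)   -> 4   disp32  */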
13105 /* Determine if the stack pointer is valid for accessing the cfa_offset.
13106 The register is saved at CFA - CFA_OFFSET. */
13108 static inline bool
13109 sp_valid_at (HOST_WIDE_INT cfa_offset)
13111 const struct machine_frame_state &fs = cfun->machine->fs;
13112 return fs.sp_valid && !(fs.sp_realigned
13113 && cfa_offset <= fs.sp_realigned_offset);
13116 /* Determine if the frame pointer is valid for accessing the cfa_offset.
13117 The register is saved at CFA - CFA_OFFSET. */
13119 static inline bool
13120 fp_valid_at (HOST_WIDE_INT cfa_offset)
13122 const struct machine_frame_state &fs = cfun->machine->fs;
13123 return fs.fp_valid && !(fs.sp_valid && fs.sp_realigned
13124 && cfa_offset > fs.sp_realigned_offset);
13127 /* Choose a base register based upon alignment requested, speed and/or
13128 size. */
13130 static void
13131 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
13132 HOST_WIDE_INT &base_offset,
13133 unsigned int align_requested, unsigned int *align)
13135 const struct machine_function *m = cfun->machine;
13136 unsigned int hfp_align;
13137 unsigned int drap_align;
13138 unsigned int sp_align;
13139 bool hfp_ok = fp_valid_at (cfa_offset);
13140 bool drap_ok = m->fs.drap_valid;
13141 bool sp_ok = sp_valid_at (cfa_offset);
13143 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
13145 /* Filter out any registers that don't meet the requested alignment
13146 criteria. */
13147 if (align_requested)
13149 if (m->fs.realigned)
13150 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
13151 /* The SEH unwind code does not currently support REG_CFA_EXPRESSION
13152 notes (which we would need in order to use a realigned stack pointer),
13153 so disable this on SEH targets. */
13154 else if (m->fs.sp_realigned)
13155 sp_align = crtl->stack_alignment_needed;
13157 hfp_ok = hfp_ok && hfp_align >= align_requested;
13158 drap_ok = drap_ok && drap_align >= align_requested;
13159 sp_ok = sp_ok && sp_align >= align_requested;
13162 if (m->use_fast_prologue_epilogue)
13164 /* Choose the base register most likely to allow the most scheduling
13165 opportunities. Generally FP is valid throughout the function,
13166 while DRAP must be reloaded within the epilogue. But choose either
13167 over the SP due to increased encoding size. */
13169 if (hfp_ok)
13171 base_reg = hard_frame_pointer_rtx;
13172 base_offset = m->fs.fp_offset - cfa_offset;
13174 else if (drap_ok)
13176 base_reg = crtl->drap_reg;
13177 base_offset = 0 - cfa_offset;
13179 else if (sp_ok)
13181 base_reg = stack_pointer_rtx;
13182 base_offset = m->fs.sp_offset - cfa_offset;
13185 else
13187 HOST_WIDE_INT toffset;
13188 int len = 16, tlen;
13190 /* Choose the base register with the smallest address encoding.
13191 With a tie, choose FP > DRAP > SP. */
13192 if (sp_ok)
13194 base_reg = stack_pointer_rtx;
13195 base_offset = m->fs.sp_offset - cfa_offset;
13196 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
13198 if (drap_ok)
13200 toffset = 0 - cfa_offset;
13201 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
13202 if (tlen <= len)
13204 base_reg = crtl->drap_reg;
13205 base_offset = toffset;
13206 len = tlen;
13209 if (hfp_ok)
13211 toffset = m->fs.fp_offset - cfa_offset;
13212 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
13213 if (tlen <= len)
13215 base_reg = hard_frame_pointer_rtx;
13216 base_offset = toffset;
13217 len = tlen;
13222 /* Set the align return value. */
13223 if (align)
13225 if (base_reg == stack_pointer_rtx)
13226 *align = sp_align;
13227 else if (base_reg == crtl->drap_reg)
13228 *align = drap_align;
13229 else if (base_reg == hard_frame_pointer_rtx)
13230 *align = hfp_align;
13234 /* Return an RTX that points to CFA_OFFSET within the stack frame and
13235 the alignment of the address. If ALIGN is non-null, it should point to
13236 an alignment value (in bits) that is preferred or zero and will
13237 receive the alignment of the base register that was selected. The
13238 valid base registers are taken from CFUN->MACHINE->FS. */
13240 static rtx
13241 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align)
13243 rtx base_reg = NULL;
13244 HOST_WIDE_INT base_offset = 0;
13246 /* If a specific alignment is requested, try to get a base register
13247 with that alignment first. */
13248 if (align && *align)
13249 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
13251 if (!base_reg)
13252 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
13254 gcc_assert (base_reg != NULL);
13255 return plus_constant (Pmode, base_reg, base_offset);
13258 /* Emit code to save registers in the prologue. */
13260 static void
13261 ix86_emit_save_regs (void)
13263 unsigned int regno;
13264 rtx_insn *insn;
13266 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
13267 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13269 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
13270 RTX_FRAME_RELATED_P (insn) = 1;
13274 /* Emit a single register save at CFA - CFA_OFFSET. */
13276 static void
13277 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
13278 HOST_WIDE_INT cfa_offset)
13280 struct machine_function *m = cfun->machine;
13281 rtx reg = gen_rtx_REG (mode, regno);
13282 rtx mem, addr, base, insn;
13283 unsigned int align = GET_MODE_ALIGNMENT (mode);
13285 addr = choose_baseaddr (cfa_offset, &align);
13286 mem = gen_frame_mem (mode, addr);
13288 /* The location alignment depends upon the base register. */
13289 align = MIN (GET_MODE_ALIGNMENT (mode), align);
13290 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13291 set_mem_align (mem, align);
13293 insn = emit_insn (gen_rtx_SET (mem, reg));
13294 RTX_FRAME_RELATED_P (insn) = 1;
13296 base = addr;
13297 if (GET_CODE (base) == PLUS)
13298 base = XEXP (base, 0);
13299 gcc_checking_assert (REG_P (base));
13301 /* When saving registers into a re-aligned local stack frame, avoid
13302 any tricky guessing by dwarf2out. */
13303 if (m->fs.realigned)
13305 gcc_checking_assert (stack_realign_drap);
13307 if (regno == REGNO (crtl->drap_reg))
13309 /* A bit of a hack. We force the DRAP register to be saved in
13310 the re-aligned stack frame, which provides us with a copy
13311 of the CFA that will last past the prologue. Install it. */
13312 gcc_checking_assert (cfun->machine->fs.fp_valid);
13313 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13314 cfun->machine->fs.fp_offset - cfa_offset);
13315 mem = gen_rtx_MEM (mode, addr);
13316 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
13318 else
13320 /* The frame pointer is a stable reference within the
13321 aligned frame. Use it. */
13322 gcc_checking_assert (cfun->machine->fs.fp_valid);
13323 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13324 cfun->machine->fs.fp_offset - cfa_offset);
13325 mem = gen_rtx_MEM (mode, addr);
13326 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13330 else if (base == stack_pointer_rtx && m->fs.sp_realigned
13331 && cfa_offset >= m->fs.sp_realigned_offset)
13333 gcc_checking_assert (stack_realign_fp);
13334 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13337 /* The memory may not be relative to the current CFA register,
13338 which means that we may need to generate a new pattern for
13339 use by the unwind info. */
13340 else if (base != m->fs.cfa_reg)
13342 addr = plus_constant (Pmode, m->fs.cfa_reg,
13343 m->fs.cfa_offset - cfa_offset);
13344 mem = gen_rtx_MEM (mode, addr);
13345 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
13349 /* Emit code to save registers using MOV insns.
13350 First register is stored at CFA - CFA_OFFSET. */
13351 static void
13352 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
13354 unsigned int regno;
13356 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13357 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13359 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
13360 cfa_offset -= UNITS_PER_WORD;
13364 /* Emit code to save SSE registers using MOV insns.
13365 First register is stored at CFA - CFA_OFFSET. */
13366 static void
13367 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
13369 unsigned int regno;
13371 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13372 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13374 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
13375 cfa_offset -= GET_MODE_SIZE (V4SFmode);
13379 static GTY(()) rtx queued_cfa_restores;
13381 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
13382 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
13383 Don't add the note if the previously saved value will be left untouched
13384 within stack red-zone till return, as unwinders can find the same value
13385 in the register and on the stack. */
13387 static void
13388 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
13390 if (!crtl->shrink_wrapped
13391 && cfa_offset <= cfun->machine->fs.red_zone_offset)
13392 return;
13394 if (insn)
13396 add_reg_note (insn, REG_CFA_RESTORE, reg);
13397 RTX_FRAME_RELATED_P (insn) = 1;
13399 else
13400 queued_cfa_restores
13401 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
13404 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
13406 static void
13407 ix86_add_queued_cfa_restore_notes (rtx insn)
13409 rtx last;
13410 if (!queued_cfa_restores)
13411 return;
13412 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
13414 XEXP (last, 1) = REG_NOTES (insn);
13415 REG_NOTES (insn) = queued_cfa_restores;
13416 queued_cfa_restores = NULL_RTX;
13417 RTX_FRAME_RELATED_P (insn) = 1;
13420 /* Expand prologue or epilogue stack adjustment.
13421 The pattern exists to put a dependency on all ebp-based memory accesses.
13422 STYLE should be negative if instructions should be marked as frame related,
13423 zero if the %r11 register is live and cannot be freely used, and positive
13424 otherwise. */
13426 static void
13427 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
13428 int style, bool set_cfa)
13430 struct machine_function *m = cfun->machine;
13431 rtx insn;
13432 bool add_frame_related_expr = false;
13434 if (Pmode == SImode)
13435 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
13436 else if (x86_64_immediate_operand (offset, DImode))
13437 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
13438 else
13440 rtx tmp;
13441 /* r11 is used by indirect sibcall return as well, set before the
13442 epilogue and used after the epilogue. */
13443 if (style)
13444 tmp = gen_rtx_REG (DImode, R11_REG);
13445 else
13447 gcc_assert (src != hard_frame_pointer_rtx
13448 && dest != hard_frame_pointer_rtx);
13449 tmp = hard_frame_pointer_rtx;
13451 insn = emit_insn (gen_rtx_SET (tmp, offset));
13452 if (style < 0)
13453 add_frame_related_expr = true;
13455 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
13458 insn = emit_insn (insn);
13459 if (style >= 0)
13460 ix86_add_queued_cfa_restore_notes (insn);
13462 if (set_cfa)
13464 rtx r;
13466 gcc_assert (m->fs.cfa_reg == src);
13467 m->fs.cfa_offset += INTVAL (offset);
13468 m->fs.cfa_reg = dest;
13470 r = gen_rtx_PLUS (Pmode, src, offset);
13471 r = gen_rtx_SET (dest, r);
13472 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
13473 RTX_FRAME_RELATED_P (insn) = 1;
13475 else if (style < 0)
13477 RTX_FRAME_RELATED_P (insn) = 1;
13478 if (add_frame_related_expr)
13480 rtx r = gen_rtx_PLUS (Pmode, src, offset);
13481 r = gen_rtx_SET (dest, r);
13482 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
13486 if (dest == stack_pointer_rtx)
13488 HOST_WIDE_INT ooffset = m->fs.sp_offset;
13489 bool valid = m->fs.sp_valid;
13490 bool realigned = m->fs.sp_realigned;
13492 if (src == hard_frame_pointer_rtx)
13494 valid = m->fs.fp_valid;
13495 realigned = false;
13496 ooffset = m->fs.fp_offset;
13498 else if (src == crtl->drap_reg)
13500 valid = m->fs.drap_valid;
13501 realigned = false;
13502 ooffset = 0;
13504 else
13506 /* Otherwise there are two possibilities: SP itself, which we set
13507 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
13508 taken care of by hand along the eh_return path. */
13509 gcc_checking_assert (src == stack_pointer_rtx
13510 || offset == const0_rtx);
13513 m->fs.sp_offset = ooffset - INTVAL (offset);
13514 m->fs.sp_valid = valid;
13515 m->fs.sp_realigned = realigned;
13519 /* Find an available register to be used as a dynamic realign argument
13520 pointer register. Such a register will be written in the prologue and
13521 used at the beginning of the body, so it must not be
13522 1. a parameter passing register.
13523 2. the GOT pointer.
13524 We reuse the static-chain register if it is available. Otherwise, we
13525 use DI for i386 and R13 for x86-64. We chose R13 since it has a
13526 shorter encoding.
13528 Return: the regno of the chosen register. */
13530 static unsigned int
13531 find_drap_reg (void)
13533 tree decl = cfun->decl;
13535 /* Always use callee-saved register if there are no caller-saved
13536 registers. */
13537 if (TARGET_64BIT)
13539 /* Use R13 for a nested function or a function that needs a static chain.
13540 Since a function with a tail call may use any caller-saved
13541 register in the epilogue, DRAP must not use a caller-saved
13542 register in such a case. */
13543 if (DECL_STATIC_CHAIN (decl)
13544 || cfun->machine->no_caller_saved_registers
13545 || crtl->tail_call_emit)
13546 return R13_REG;
13548 return R10_REG;
13550 else
13552 /* Use DI for a nested function or a function that needs a static chain.
13553 Since a function with a tail call may use any caller-saved
13554 register in the epilogue, DRAP must not use a caller-saved
13555 register in such a case. */
13556 if (DECL_STATIC_CHAIN (decl)
13557 || cfun->machine->no_caller_saved_registers
13558 || crtl->tail_call_emit)
13559 return DI_REG;
13561 /* Reuse static chain register if it isn't used for parameter
13562 passing. */
13563 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
13565 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
13566 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
13567 return CX_REG;
13569 return DI_REG;
13573 /* Handle a "force_align_arg_pointer" attribute. */
13575 static tree
13576 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
13577 tree, int, bool *no_add_attrs)
13579 if (TREE_CODE (*node) != FUNCTION_TYPE
13580 && TREE_CODE (*node) != METHOD_TYPE
13581 && TREE_CODE (*node) != FIELD_DECL
13582 && TREE_CODE (*node) != TYPE_DECL)
13584 warning (OPT_Wattributes, "%qE attribute only applies to functions",
13585 name);
13586 *no_add_attrs = true;
13589 return NULL_TREE;
13592 /* Return minimum incoming stack alignment. */
13594 static unsigned int
13595 ix86_minimum_incoming_stack_boundary (bool sibcall)
13597 unsigned int incoming_stack_boundary;
13599 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
13600 if (cfun->machine->func_type != TYPE_NORMAL)
13601 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
13602 /* Prefer the one specified at command line. */
13603 else if (ix86_user_incoming_stack_boundary)
13604 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
13605 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
13606 when -mstackrealign is used, this is not a sibcall check, and the
13607 estimated stack alignment is 128 bits. */
13608 else if (!sibcall
13609 && ix86_force_align_arg_pointer
13610 && crtl->stack_alignment_estimated == 128)
13611 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13612 else
13613 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
13615 /* Incoming stack alignment can be changed on individual functions
13616 via force_align_arg_pointer attribute. We use the smallest
13617 incoming stack boundary. */
13618 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
13619 && lookup_attribute (ix86_force_align_arg_pointer_string,
13620 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
13621 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13623 /* The incoming stack frame has to be aligned at least at
13624 parm_stack_boundary. */
13625 if (incoming_stack_boundary < crtl->parm_stack_boundary)
13626 incoming_stack_boundary = crtl->parm_stack_boundary;
13628 /* Stack at entrance of main is aligned by runtime. We use the
13629 smallest incoming stack boundary. */
13630 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
13631 && DECL_NAME (current_function_decl)
13632 && MAIN_NAME_P (DECL_NAME (current_function_decl))
13633 && DECL_FILE_SCOPE_P (current_function_decl))
13634 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
13636 return incoming_stack_boundary;
13639 /* Update incoming stack boundary and estimated stack alignment. */
13641 static void
13642 ix86_update_stack_boundary (void)
13644 ix86_incoming_stack_boundary
13645 = ix86_minimum_incoming_stack_boundary (false);
13647 /* x86_64 varargs need 16-byte stack alignment for the register save
13648 area. */
13649 if (TARGET_64BIT
13650 && cfun->stdarg
13651 && crtl->stack_alignment_estimated < 128)
13652 crtl->stack_alignment_estimated = 128;
13654 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
13655 if (ix86_tls_descriptor_calls_expanded_in_cfun
13656 && crtl->preferred_stack_boundary < 128)
13657 crtl->preferred_stack_boundary = 128;
13660 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
13661 needed or an rtx for DRAP otherwise. */
13663 static rtx
13664 ix86_get_drap_rtx (void)
13666 /* We must use DRAP if there are outgoing arguments on stack and
13667 ACCUMULATE_OUTGOING_ARGS is false. */
13668 if (ix86_force_drap
13669 || (cfun->machine->outgoing_args_on_stack
13670 && !ACCUMULATE_OUTGOING_ARGS))
13671 crtl->need_drap = true;
13673 if (stack_realign_drap)
13675 /* Assign DRAP to vDRAP and return vDRAP. */
13676 unsigned int regno = find_drap_reg ();
13677 rtx drap_vreg;
13678 rtx arg_ptr;
13679 rtx_insn *seq, *insn;
13681 arg_ptr = gen_rtx_REG (Pmode, regno);
13682 crtl->drap_reg = arg_ptr;
13684 start_sequence ();
13685 drap_vreg = copy_to_reg (arg_ptr);
13686 seq = get_insns ();
13687 end_sequence ();
13689 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
13690 if (!optimize)
13692 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
13693 RTX_FRAME_RELATED_P (insn) = 1;
13695 return drap_vreg;
13697 else
13698 return NULL;
13701 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
13703 static rtx
13704 ix86_internal_arg_pointer (void)
13706 return virtual_incoming_args_rtx;
13709 struct scratch_reg {
13710 rtx reg;
13711 bool saved;
13714 /* Return a short-lived scratch register for use on function entry.
13715 In 32-bit mode, it is valid only after the registers are saved
13716 in the prologue. This register must be released by means of
13717 release_scratch_register_on_entry once it is dead. */
13719 static void
13720 get_scratch_register_on_entry (struct scratch_reg *sr)
13722 int regno;
13724 sr->saved = false;
13726 if (TARGET_64BIT)
13728 /* We always use R11 in 64-bit mode. */
13729 regno = R11_REG;
13731 else
13733 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13734 bool fastcall_p
13735 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13736 bool thiscall_p
13737 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13738 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13739 int regparm = ix86_function_regparm (fntype, decl);
13740 int drap_regno
13741 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13743 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13744 for the static chain register. */
13745 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13746 && drap_regno != AX_REG)
13747 regno = AX_REG;
13748 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13749 for the static chain register. */
13750 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13751 regno = AX_REG;
13752 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13753 regno = DX_REG;
13754 /* ecx is the static chain register. */
13755 else if (regparm < 3 && !fastcall_p && !thiscall_p
13756 && !static_chain_p
13757 && drap_regno != CX_REG)
13758 regno = CX_REG;
13759 else if (ix86_save_reg (BX_REG, true, false))
13760 regno = BX_REG;
13761 /* esi is the static chain register. */
13762 else if (!(regparm == 3 && static_chain_p)
13763 && ix86_save_reg (SI_REG, true, false))
13764 regno = SI_REG;
13765 else if (ix86_save_reg (DI_REG, true, false))
13766 regno = DI_REG;
13767 else
13769 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13770 sr->saved = true;
13774 sr->reg = gen_rtx_REG (Pmode, regno);
13775 if (sr->saved)
13777 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13778 RTX_FRAME_RELATED_P (insn) = 1;
13782 /* Release a scratch register obtained from the preceding function. */
13784 static void
13785 release_scratch_register_on_entry (struct scratch_reg *sr)
13787 if (sr->saved)
13789 struct machine_function *m = cfun->machine;
13790 rtx x, insn = emit_insn (gen_pop (sr->reg));
13792 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13793 RTX_FRAME_RELATED_P (insn) = 1;
13794 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13795 x = gen_rtx_SET (stack_pointer_rtx, x);
13796 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13797 m->fs.sp_offset -= UNITS_PER_WORD;
13801 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
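/* Note: STACK_CHECK_PROBE_INTERVAL_EXP normally defaults to 12 (see
   defaults.h), so unless a target overrides it the probes below are
   typically issued every 4096 bytes, i.e. once per page.  */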
13803 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13805 static void
13806 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13808 /* We skip the probe for the first interval + a small dope of 4 words and
13809 probe that many bytes past the specified size to maintain a protection
13810 area at the bottom of the stack. */
13811 const int dope = 4 * UNITS_PER_WORD;
13812 rtx size_rtx = GEN_INT (size), last;
13814 /* See if we have a constant small number of probes to generate. If so,
13815 that's the easy case. The run-time loop is made up of 9 insns in the
13816 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13817 for n # of intervals. */
13818 if (size <= 4 * PROBE_INTERVAL)
13820 HOST_WIDE_INT i, adjust;
13821 bool first_probe = true;
13823 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13824 values of N from 1 until it exceeds SIZE. If only one probe is
13825 needed, this will not generate any code. Then adjust and probe
13826 to PROBE_INTERVAL + SIZE. */
13827 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13829 if (first_probe)
13831 adjust = 2 * PROBE_INTERVAL + dope;
13832 first_probe = false;
13834 else
13835 adjust = PROBE_INTERVAL;
13837 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13838 plus_constant (Pmode, stack_pointer_rtx,
13839 -adjust)));
13840 emit_stack_probe (stack_pointer_rtx);
13843 if (first_probe)
13844 adjust = size + PROBE_INTERVAL + dope;
13845 else
13846 adjust = size + PROBE_INTERVAL - i;
13848 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13849 plus_constant (Pmode, stack_pointer_rtx,
13850 -adjust)));
13851 emit_stack_probe (stack_pointer_rtx);
13853 /* Adjust back to account for the additional first interval. */
13854 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13855 plus_constant (Pmode, stack_pointer_rtx,
13856 PROBE_INTERVAL + dope)));
13859 /* Otherwise, do the same as above, but in a loop. Note that we must be
13860 extra careful with variables wrapping around because we might be at
13861 the very top (or the very bottom) of the address space and we have
13862 to be able to handle this case properly; in particular, we use an
13863 equality test for the loop condition. */
13864 else
13866 HOST_WIDE_INT rounded_size;
13867 struct scratch_reg sr;
13869 get_scratch_register_on_entry (&sr);
13872 /* Step 1: round SIZE to the previous multiple of the interval. */
13874 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13877 /* Step 2: compute initial and final value of the loop counter. */
13879 /* SP = SP_0 + PROBE_INTERVAL. */
13880 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13881 plus_constant (Pmode, stack_pointer_rtx,
13882 - (PROBE_INTERVAL + dope))));
13884 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13885 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13886 emit_insn (gen_rtx_SET (sr.reg,
13887 plus_constant (Pmode, stack_pointer_rtx,
13888 -rounded_size)));
13889 else
13891 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13892 emit_insn (gen_rtx_SET (sr.reg,
13893 gen_rtx_PLUS (Pmode, sr.reg,
13894 stack_pointer_rtx)));
13898 /* Step 3: the loop
13902 SP = SP + PROBE_INTERVAL
13903 probe at SP
13905 while (SP != LAST_ADDR)
13907 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13908 values of N from 1 until it is equal to ROUNDED_SIZE. */
13910 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13913 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13914 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13916 if (size != rounded_size)
13918 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13919 plus_constant (Pmode, stack_pointer_rtx,
13920 rounded_size - size)));
13921 emit_stack_probe (stack_pointer_rtx);
13924 /* Adjust back to account for the additional first interval. */
13925 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13926 plus_constant (Pmode, stack_pointer_rtx,
13927 PROBE_INTERVAL + dope)));
13929 release_scratch_register_on_entry (&sr);
13932 /* Even if the stack pointer isn't the CFA register, we need to correctly
13933 describe the adjustments made to it, in particular differentiate the
13934 frame-related ones from the frame-unrelated ones. */
13935 if (size > 0)
13937 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13938 XVECEXP (expr, 0, 0)
13939 = gen_rtx_SET (stack_pointer_rtx,
13940 plus_constant (Pmode, stack_pointer_rtx, -size));
13941 XVECEXP (expr, 0, 1)
13942 = gen_rtx_SET (stack_pointer_rtx,
13943 plus_constant (Pmode, stack_pointer_rtx,
13944 PROBE_INTERVAL + dope + size));
13945 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13946 RTX_FRAME_RELATED_P (last) = 1;
13948 cfun->machine->fs.sp_offset += size;
13951 /* Make sure nothing is scheduled before we are done. */
13952 emit_insn (gen_blockage ());
13955 /* Adjust the stack pointer up to REG while probing it. */
13957 const char *
13958 output_adjust_stack_and_probe (rtx reg)
13960 static int labelno = 0;
13961 char loop_lab[32];
13962 rtx xops[2];
13964 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13966 /* Loop. */
13967 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13969 /* SP = SP + PROBE_INTERVAL. */
13970 xops[0] = stack_pointer_rtx;
13971 xops[1] = GEN_INT (PROBE_INTERVAL);
13972 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13974 /* Probe at SP. */
13975 xops[1] = const0_rtx;
13976 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13978 /* Test if SP == LAST_ADDR. */
13979 xops[0] = stack_pointer_rtx;
13980 xops[1] = reg;
13981 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13983 /* Branch. */
13984 fputs ("\tjne\t", asm_out_file);
13985 assemble_name_raw (asm_out_file, loop_lab);
13986 fputc ('\n', asm_out_file);
13988 return "";
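/* Illustration only: on x86-64 with the default 4 KiB probe interval and
   %r11 as the scratch register, the loop printed above comes out roughly as:

	.LPSRL0:
		subq	$4096, %rsp
		orq	$0, (%rsp)
		cmpq	%r11, %rsp
		jne	.LPSRL0  */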
13991 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13992 inclusive. These are offsets from the current stack pointer. */
13994 static void
13995 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
13997 /* See if we have a constant small number of probes to generate. If so,
13998 that's the easy case. The run-time loop is made up of 6 insns in the
13999 generic case while the compile-time loop is made up of n insns for n #
14000 of intervals. */
14001 if (size <= 6 * PROBE_INTERVAL)
14003 HOST_WIDE_INT i;
14005 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
14006 it exceeds SIZE. If only one probe is needed, this will not
14007 generate any code. Then probe at FIRST + SIZE. */
14008 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
14009 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14010 -(first + i)));
14012 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14013 -(first + size)));
14016 /* Otherwise, do the same as above, but in a loop. Note that we must be
14017 extra careful with variables wrapping around because we might be at
14018 the very top (or the very bottom) of the address space and we have
14019 to be able to handle this case properly; in particular, we use an
14020 equality test for the loop condition. */
14021 else
14023 HOST_WIDE_INT rounded_size, last;
14024 struct scratch_reg sr;
14026 get_scratch_register_on_entry (&sr);
14029 /* Step 1: round SIZE to the previous multiple of the interval. */
14031 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
14034 /* Step 2: compute initial and final value of the loop counter. */
14036 /* TEST_OFFSET = FIRST. */
14037 emit_move_insn (sr.reg, GEN_INT (-first));
14039 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
14040 last = first + rounded_size;
14043 /* Step 3: the loop
14047 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
14048 probe at TEST_ADDR
14050 while (TEST_ADDR != LAST_ADDR)
14052 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
14053 until it is equal to ROUNDED_SIZE. */
14055 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
14058 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
14059 that SIZE is equal to ROUNDED_SIZE. */
14061 if (size != rounded_size)
14062 emit_stack_probe (plus_constant (Pmode,
14063 gen_rtx_PLUS (Pmode,
14064 stack_pointer_rtx,
14065 sr.reg),
14066 rounded_size - size));
14068 release_scratch_register_on_entry (&sr);
14071 /* Make sure nothing is scheduled before we are done. */
14072 emit_insn (gen_blockage ());
14075 /* Probe a range of stack addresses from REG to END, inclusive. These are
14076 offsets from the current stack pointer. */
14078 const char *
14079 output_probe_stack_range (rtx reg, rtx end)
14081 static int labelno = 0;
14082 char loop_lab[32];
14083 rtx xops[3];
14085 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
14087 /* Loop. */
14088 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
14090 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
14091 xops[0] = reg;
14092 xops[1] = GEN_INT (PROBE_INTERVAL);
14093 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
14095 /* Probe at TEST_ADDR. */
14096 xops[0] = stack_pointer_rtx;
14097 xops[1] = reg;
14098 xops[2] = const0_rtx;
14099 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
14101 /* Test if TEST_ADDR == LAST_ADDR. */
14102 xops[0] = reg;
14103 xops[1] = end;
14104 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
14106 /* Branch. */
14107 fputs ("\tjne\t", asm_out_file);
14108 assemble_name_raw (asm_out_file, loop_lab);
14109 fputc ('\n', asm_out_file);
14111 return "";
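/* Illustration only: with %r11 as the scratch register holding a negative
   offset and END an immediate, the loop printed above comes out roughly as:

	.LPSRL1:
		subq	$4096, %r11
		orq	$0, (%rsp,%r11)
		cmpq	$-16384, %r11
		jne	.LPSRL1

   where -16384 stands in for -(FIRST + ROUNDED_SIZE) in this sketch.  */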
14114 /* Finalize the stack_realign_needed flag, which guides the prologue/epilogue
14115 to be generated in the correct form. */
14116 static void
14117 ix86_finalize_stack_realign_flags (void)
14119 /* Check if stack realignment is really needed after reload, and
14120 store the result in cfun. */
14121 unsigned int incoming_stack_boundary
14122 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
14123 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
14124 unsigned int stack_realign
14125 = (incoming_stack_boundary
14126 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
14127 ? crtl->max_used_stack_slot_alignment
14128 : crtl->stack_alignment_needed));
14129 bool recompute_frame_layout_p = false;
14131 if (crtl->stack_realign_finalized)
14133 /* After stack_realign_needed is finalized, we can no longer
14134 change it. */
14135 gcc_assert (crtl->stack_realign_needed == stack_realign);
14136 return;
14139 /* If the only reason for frame_pointer_needed is that we conservatively
14140 assumed stack realignment might be needed, but in the end nothing that
14141 needed the stack alignment had been spilled, clear frame_pointer_needed
14142 and say we don't need stack realignment. */
14143 if (stack_realign
14144 && frame_pointer_needed
14145 && crtl->is_leaf
14146 && flag_omit_frame_pointer
14147 && crtl->sp_is_unchanging
14148 && !ix86_current_function_calls_tls_descriptor
14149 && !crtl->accesses_prior_frames
14150 && !cfun->calls_alloca
14151 && !crtl->calls_eh_return
14152 /* See ira_setup_eliminable_regset for the rationale. */
14153 && !(STACK_CHECK_MOVING_SP
14154 && flag_stack_check
14155 && flag_exceptions
14156 && cfun->can_throw_non_call_exceptions)
14157 && !ix86_frame_pointer_required ()
14158 && get_frame_size () == 0
14159 && ix86_nsaved_sseregs () == 0
14160 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
14162 HARD_REG_SET set_up_by_prologue, prologue_used;
14163 basic_block bb;
14165 CLEAR_HARD_REG_SET (prologue_used);
14166 CLEAR_HARD_REG_SET (set_up_by_prologue);
14167 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
14168 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
14169 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
14170 HARD_FRAME_POINTER_REGNUM);
14171 FOR_EACH_BB_FN (bb, cfun)
14173 rtx_insn *insn;
14174 FOR_BB_INSNS (bb, insn)
14175 if (NONDEBUG_INSN_P (insn)
14176 && requires_stack_frame_p (insn, prologue_used,
14177 set_up_by_prologue))
14179 if (crtl->stack_realign_needed != stack_realign)
14180 recompute_frame_layout_p = true;
14181 crtl->stack_realign_needed = stack_realign;
14182 crtl->stack_realign_finalized = true;
14183 if (recompute_frame_layout_p)
14184 ix86_compute_frame_layout ();
14185 return;
14189 /* If drap has been set, but it actually isn't live at the start
14190 of the function, there is no reason to set it up. */
14191 if (crtl->drap_reg)
14193 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14194 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
14196 crtl->drap_reg = NULL_RTX;
14197 crtl->need_drap = false;
14200 else
14201 cfun->machine->no_drap_save_restore = true;
14203 frame_pointer_needed = false;
14204 stack_realign = false;
14205 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
14206 crtl->stack_alignment_needed = incoming_stack_boundary;
14207 crtl->stack_alignment_estimated = incoming_stack_boundary;
14208 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
14209 crtl->preferred_stack_boundary = incoming_stack_boundary;
14210 df_finish_pass (true);
14211 df_scan_alloc (NULL);
14212 df_scan_blocks ();
14213 df_compute_regs_ever_live (true);
14214 df_analyze ();
14215 recompute_frame_layout_p = true;
14218 if (crtl->stack_realign_needed != stack_realign)
14219 recompute_frame_layout_p = true;
14220 crtl->stack_realign_needed = stack_realign;
14221 crtl->stack_realign_finalized = true;
14222 if (recompute_frame_layout_p)
14223 ix86_compute_frame_layout ();
14226 /* Delete SET_GOT right after entry block if it is allocated to reg. */
14228 static void
14229 ix86_elim_entry_set_got (rtx reg)
14231 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14232 rtx_insn *c_insn = BB_HEAD (bb);
14233 if (!NONDEBUG_INSN_P (c_insn))
14234 c_insn = next_nonnote_nondebug_insn (c_insn);
14235 if (c_insn && NONJUMP_INSN_P (c_insn))
14237 rtx pat = PATTERN (c_insn);
14238 if (GET_CODE (pat) == PARALLEL)
14240 rtx vec = XVECEXP (pat, 0, 0);
14241 if (GET_CODE (vec) == SET
14242 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
14243 && REGNO (XEXP (vec, 0)) == REGNO (reg))
14244 delete_insn (c_insn);
14249 static rtx
14250 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
14252 rtx addr, mem;
14254 if (offset)
14255 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
14256 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
14257 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
14260 static inline rtx
14261 gen_frame_load (rtx reg, rtx frame_reg, int offset)
14263 return gen_frame_set (reg, frame_reg, offset, false);
14266 static inline rtx
14267 gen_frame_store (rtx reg, rtx frame_reg, int offset)
14269 return gen_frame_set (reg, frame_reg, offset, true);
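/* Illustrative only: gen_frame_store (reg, frame_reg, 16) builds

	(set (mem (plus frame_reg (const_int 16))) reg)

   (or a MEM of FRAME_REG alone when OFFSET is zero), and gen_frame_load
   builds the mirror image with the MEM as the source.  */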
14272 static void
14273 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
14275 struct machine_function *m = cfun->machine;
14276 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14277 + m->call_ms2sysv_extra_regs;
14278 rtvec v = rtvec_alloc (ncregs + 1);
14279 unsigned int align, i, vi = 0;
14280 rtx_insn *insn;
14281 rtx sym, addr;
14282 rtx rax = gen_rtx_REG (word_mode, AX_REG);
14283 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14284 HOST_WIDE_INT rax_offset = xlogue.get_stub_ptr_offset () + m->fs.sp_offset;
14285 HOST_WIDE_INT stack_alloc_size = frame.stack_pointer_offset - m->fs.sp_offset;
14286 HOST_WIDE_INT stack_align_off_in = xlogue.get_stack_align_off_in ();
14288 /* Verify that the incoming stack 16-byte alignment offset matches the
14289 layout we're using. */
14290 gcc_assert (stack_align_off_in == (m->fs.sp_offset & UNITS_PER_WORD));
14292 /* Get the stub symbol. */
14293 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
14294 : XLOGUE_STUB_SAVE);
14295 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14297 /* Setup RAX as the stub's base pointer. */
14298 align = GET_MODE_ALIGNMENT (V4SFmode);
14299 addr = choose_baseaddr (rax_offset, &align);
14300 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14301 insn = emit_insn (gen_rtx_SET (rax, addr));
14303 gcc_assert (stack_alloc_size >= xlogue.get_stack_space_used ());
14304 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14305 GEN_INT (-stack_alloc_size), -1,
14306 m->fs.cfa_reg == stack_pointer_rtx);
14307 for (i = 0; i < ncregs; ++i)
14309 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14310 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
14311 r.regno);
14312 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
14315 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
14317 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
14318 RTX_FRAME_RELATED_P (insn) = true;
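/* For illustration, on a typical x86_64 ms_abi function the sequence built
   above corresponds roughly to

	lea	rax, [rsp + rax_offset]		; stub base pointer
	sub	rsp, stack_alloc_size
	call	__savms64_<N>			; out-of-line libgcc stub that
						; stores RSI, RDI and XMM6-XMM15
						; at fixed offsets below RAX

   where the stub name and the exact save offsets come from xlogue_layout;
   the asm above is only an approximation, not the literal output.  */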
14321 /* Expand the prologue into a bunch of separate insns. */
14323 void
14324 ix86_expand_prologue (void)
14326 struct machine_function *m = cfun->machine;
14327 rtx insn, t;
14328 struct ix86_frame frame;
14329 HOST_WIDE_INT allocate;
14330 bool int_registers_saved;
14331 bool sse_registers_saved;
14332 rtx static_chain = NULL_RTX;
14334 ix86_finalize_stack_realign_flags ();
14336 /* DRAP should not coexist with stack_realign_fp */
14337 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
14339 memset (&m->fs, 0, sizeof (m->fs));
14341 /* Initialize CFA state for before the prologue. */
14342 m->fs.cfa_reg = stack_pointer_rtx;
14343 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
14345 /* Track SP offset to the CFA. We continue tracking this after we've
14346 swapped the CFA register away from SP. In the case of re-alignment
14347 this is fudged; we're interested in offsets within the local frame. */
14348 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14349 m->fs.sp_valid = true;
14350 m->fs.sp_realigned = false;
14352 frame = m->frame;
14354 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
14356 /* We should have already generated an error for any use of
14357 ms_hook on a nested function. */
14358 gcc_checking_assert (!ix86_static_chain_on_stack);
14360 /* Check if profiling is active and whether we shall use the
14361 profiling-before-prologue variant. If so, sorry. */
14362 if (crtl->profile && flag_fentry != 0)
14363 sorry ("ms_hook_prologue attribute isn%'t compatible "
14364 "with -mfentry for 32-bit");
14366 /* In ix86_asm_output_function_label we emitted:
14367 8b ff movl.s %edi,%edi
14368 55 push %ebp
14369 8b ec movl.s %esp,%ebp
14371 This matches the hookable function prologue in Win32 API
14372 functions in Microsoft Windows XP Service Pack 2 and newer.
14373 Wine uses this to enable Windows apps to hook the Win32 API
14374 functions provided by Wine.
14376 What that means is that we've already set up the frame pointer. */
14378 if (frame_pointer_needed
14379 && !(crtl->drap_reg && crtl->stack_realign_needed))
14381 rtx push, mov;
14383 /* We've decided to use the frame pointer already set up.
14384 Describe this to the unwinder by pretending that both
14385 push and mov insns happen right here.
14387 Putting the unwind info here at the end of the ms_hook
14388 is done so that we can make absolutely certain we get
14389 the required byte sequence at the start of the function,
14390 rather than relying on an assembler that can produce
14391 the exact encoding required.
14393 However it does mean (in the unpatched case) that we have
14394 a 1 insn window where the asynchronous unwind info is
14395 incorrect. However, if we placed the unwind info at
14396 its correct location we would have incorrect unwind info
14397 in the patched case. Which is probably all moot since
14398 I don't expect Wine generates dwarf2 unwind info for the
14399 system libraries that use this feature. */
14401 insn = emit_insn (gen_blockage ());
14403 push = gen_push (hard_frame_pointer_rtx);
14404 mov = gen_rtx_SET (hard_frame_pointer_rtx,
14405 stack_pointer_rtx);
14406 RTX_FRAME_RELATED_P (push) = 1;
14407 RTX_FRAME_RELATED_P (mov) = 1;
14409 RTX_FRAME_RELATED_P (insn) = 1;
14410 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14411 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
14413 /* Note that gen_push incremented m->fs.cfa_offset, even
14414 though we didn't emit the push insn here. */
14415 m->fs.cfa_reg = hard_frame_pointer_rtx;
14416 m->fs.fp_offset = m->fs.cfa_offset;
14417 m->fs.fp_valid = true;
14419 else
14421 /* The frame pointer is not needed so pop %ebp again.
14422 This leaves us with a pristine state. */
14423 emit_insn (gen_pop (hard_frame_pointer_rtx));
14427 /* The first insn of a function that accepts its static chain on the
14428 stack is to push the register that would be filled in by a direct
14429 call. This insn will be skipped by the trampoline. */
14430 else if (ix86_static_chain_on_stack)
14432 static_chain = ix86_static_chain (cfun->decl, false);
14433 insn = emit_insn (gen_push (static_chain));
14434 emit_insn (gen_blockage ());
14436 /* We don't want to interpret this push insn as a register save,
14437 only as a stack adjustment. The real copy of the register as
14438 a save will be done later, if needed. */
14439 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
14440 t = gen_rtx_SET (stack_pointer_rtx, t);
14441 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
14442 RTX_FRAME_RELATED_P (insn) = 1;
14445 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
14446 DRAP is needed and stack realignment is really needed after reload. */
14447 if (stack_realign_drap)
14449 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14451 /* Can't use DRAP in interrupt function. */
14452 if (cfun->machine->func_type != TYPE_NORMAL)
14453 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
14454 "in interrupt service routine. This may be worked "
14455 "around by avoiding functions with aggregate return.");
14457 /* Only need to push parameter pointer reg if it is caller saved. */
14458 if (!call_used_regs[REGNO (crtl->drap_reg)])
14460 /* Push arg pointer reg */
14461 insn = emit_insn (gen_push (crtl->drap_reg));
14462 RTX_FRAME_RELATED_P (insn) = 1;
14465 /* Grab the argument pointer. */
14466 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
14467 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14468 RTX_FRAME_RELATED_P (insn) = 1;
14469 m->fs.cfa_reg = crtl->drap_reg;
14470 m->fs.cfa_offset = 0;
14472 /* Align the stack. */
14473 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14474 stack_pointer_rtx,
14475 GEN_INT (-align_bytes)));
14476 RTX_FRAME_RELATED_P (insn) = 1;
14478 /* Replicate the return address on the stack so that return
14479 address can be reached via (argp - 1) slot. This is needed
14480 to implement macro RETURN_ADDR_RTX and intrinsic function
14481 expand_builtin_return_addr etc. */
14482 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
14483 t = gen_frame_mem (word_mode, t);
14484 insn = emit_insn (gen_push (t));
14485 RTX_FRAME_RELATED_P (insn) = 1;
14487 /* For the purposes of frame and register save area addressing,
14488 we've started over with a new frame. */
14489 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14490 m->fs.realigned = true;
14492 if (static_chain)
14494 /* Replicate static chain on the stack so that static chain
14495 can be reached via (argp - 2) slot. This is needed for
14496 nested function with stack realignment. */
14497 insn = emit_insn (gen_push (static_chain));
14498 RTX_FRAME_RELATED_P (insn) = 1;
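/* As a rough illustration, the 32-bit DRAP prologue built above looks
   something like

	lea	ecx, [esp + 4]		; DRAP = incoming argument pointer
	and	esp, -16		; realign the stack
	push	dword ptr [ecx - 4]	; replicate the return address

   preceded by a push of the DRAP register when it is call-saved and
   followed by a replicated static chain slot for nested functions.  The
   register and alignment shown are only examples; the real values depend
   on crtl->drap_reg and crtl->stack_alignment_needed.  */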
14502 int_registers_saved = (frame.nregs == 0);
14503 sse_registers_saved = (frame.nsseregs == 0);
14505 if (frame_pointer_needed && !m->fs.fp_valid)
14507 /* Note: AT&T enter does NOT have reversed args. Enter is probably
14508 slower on all targets. Also sdb doesn't like it. */
14509 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
14510 RTX_FRAME_RELATED_P (insn) = 1;
14512 /* Push registers now, before setting the frame pointer
14513 on SEH target. */
14514 if (!int_registers_saved
14515 && TARGET_SEH
14516 && !frame.save_regs_using_mov)
14518 ix86_emit_save_regs ();
14519 int_registers_saved = true;
14520 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14523 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
14525 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
14526 RTX_FRAME_RELATED_P (insn) = 1;
14528 if (m->fs.cfa_reg == stack_pointer_rtx)
14529 m->fs.cfa_reg = hard_frame_pointer_rtx;
14530 m->fs.fp_offset = m->fs.sp_offset;
14531 m->fs.fp_valid = true;
14535 if (!int_registers_saved)
14537 /* If saving registers via PUSH, do so now. */
14538 if (!frame.save_regs_using_mov)
14540 ix86_emit_save_regs ();
14541 int_registers_saved = true;
14542 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14545 /* When using red zone we may start register saving before allocating
14546 the stack frame saving one cycle of the prologue. However, avoid
14547 doing this if we have to probe the stack; at least on x86_64 the
14548 stack probe can turn into a call that clobbers a red zone location. */
14549 else if (ix86_using_red_zone ()
14550 && (! TARGET_STACK_PROBE
14551 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
14553 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14554 int_registers_saved = true;
14558 if (stack_realign_fp)
14560 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14561 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
14563 /* The computation of the size of the re-aligned stack frame means
14564 that we must allocate the size of the register save area before
14565 performing the actual alignment. Otherwise we cannot guarantee
14566 that there's enough storage above the realignment point. */
14567 allocate = frame.stack_realign_allocate_offset - m->fs.sp_offset;
14568 if (allocate && !m->call_ms2sysv)
14569 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14570 GEN_INT (-allocate), -1, false);
14572 /* Align the stack. */
14573 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14574 stack_pointer_rtx,
14575 GEN_INT (-align_bytes)));
14576 /* For the purposes of register save area addressing, the stack
14577 pointer can no longer be used to access anything in the frame
14578 below m->fs.sp_realigned_offset and the frame pointer cannot be
14579 used for anything at or above. */
14580 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
14581 m->fs.sp_realigned = true;
14582 m->fs.sp_realigned_offset = m->fs.sp_offset - frame.nsseregs * 16;
14583 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
14584 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
14585 is needed to describe where a register is saved using a realigned
14586 stack pointer, so we need to invalidate the stack pointer for that
14587 target. */
14588 if (TARGET_SEH)
14589 m->fs.sp_valid = false;
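/* Roughly, the frame-pointer-based realignment above amounts to

	push	rbp
	mov	rbp, rsp
	sub	rsp, <register save area>	; if anything must be saved
	and	rsp, -<align_bytes>

   after which slots below sp_realigned_offset are addressed via the stack
   pointer and everything above via the frame pointer, matching the
   bookkeeping just recorded in m->fs.  */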
14592 if (m->call_ms2sysv)
14593 ix86_emit_outlined_ms2sysv_save (frame);
14595 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
14597 if (flag_stack_usage_info)
14599 /* We start to count from ARG_POINTER. */
14600 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
14602 /* If it was realigned, take into account the fake frame. */
14603 if (stack_realign_drap)
14605 if (ix86_static_chain_on_stack)
14606 stack_size += UNITS_PER_WORD;
14608 if (!call_used_regs[REGNO (crtl->drap_reg)])
14609 stack_size += UNITS_PER_WORD;
14611 /* This over-estimates by 1 minimal-stack-alignment-unit but
14612 mitigates that by counting in the new return address slot. */
14613 current_function_dynamic_stack_size
14614 += crtl->stack_alignment_needed / BITS_PER_UNIT;
14617 current_function_static_stack_size = stack_size;
14620 /* On SEH target with very large frame size, allocate an area to save
14621 SSE registers (as the very large allocation won't be described). */
14622 if (TARGET_SEH
14623 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
14624 && !sse_registers_saved)
14626 HOST_WIDE_INT sse_size =
14627 frame.sse_reg_save_offset - frame.reg_save_offset;
14629 gcc_assert (int_registers_saved);
14631 /* No need to do stack checking as the area will be immediately
14632 written. */
14633 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14634 GEN_INT (-sse_size), -1,
14635 m->fs.cfa_reg == stack_pointer_rtx);
14636 allocate -= sse_size;
14637 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14638 sse_registers_saved = true;
14641 /* The stack has already been decremented by the instruction calling us
14642 so probe if the size is non-negative to preserve the protection area. */
14643 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
14645 /* We expect the registers to be saved when probes are used. */
14646 gcc_assert (int_registers_saved);
14648 if (STACK_CHECK_MOVING_SP)
14650 if (!(crtl->is_leaf && !cfun->calls_alloca
14651 && allocate <= PROBE_INTERVAL))
14653 ix86_adjust_stack_and_probe (allocate);
14654 allocate = 0;
14657 else
14659 HOST_WIDE_INT size = allocate;
14661 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
14662 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
14664 if (TARGET_STACK_PROBE)
14666 if (crtl->is_leaf && !cfun->calls_alloca)
14668 if (size > PROBE_INTERVAL)
14669 ix86_emit_probe_stack_range (0, size);
14671 else
14672 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
14674 else
14676 if (crtl->is_leaf && !cfun->calls_alloca)
14678 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
14679 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
14680 size - STACK_CHECK_PROTECT);
14682 else
14683 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
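/* As a worked example (assuming the common 4096-byte PROBE_INTERVAL), a
   12 KiB frame checked with the moving-sp strategy is allocated roughly as

	sub	rsp, 4096
	or	qword ptr [rsp], 0
	sub	rsp, 4096
	or	qword ptr [rsp], 0
	sub	rsp, 4096
	or	qword ptr [rsp], 0

   whereas the non-moving strategy probes the same offsets below the
   current stack pointer and leaves the final adjustment to the code
   further down.  The exact instructions are chosen by
   ix86_adjust_stack_and_probe and ix86_emit_probe_stack_range.  */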
14688 if (allocate == 0)
14690 else if (!ix86_target_stack_probe ()
14691 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
14693 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14694 GEN_INT (-allocate), -1,
14695 m->fs.cfa_reg == stack_pointer_rtx);
14697 else
14699 rtx eax = gen_rtx_REG (Pmode, AX_REG);
14700 rtx r10 = NULL;
14701 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
14702 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
14703 bool eax_live = ix86_eax_live_at_start_p ();
14704 bool r10_live = false;
14706 if (TARGET_64BIT)
14707 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
14709 if (eax_live)
14711 insn = emit_insn (gen_push (eax));
14712 allocate -= UNITS_PER_WORD;
14713 /* Note that SEH directives need to continue tracking the stack
14714 pointer even after the frame pointer has been set up. */
14715 if (sp_is_cfa_reg || TARGET_SEH)
14717 if (sp_is_cfa_reg)
14718 m->fs.cfa_offset += UNITS_PER_WORD;
14719 RTX_FRAME_RELATED_P (insn) = 1;
14720 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14721 gen_rtx_SET (stack_pointer_rtx,
14722 plus_constant (Pmode, stack_pointer_rtx,
14723 -UNITS_PER_WORD)));
14727 if (r10_live)
14729 r10 = gen_rtx_REG (Pmode, R10_REG);
14730 insn = emit_insn (gen_push (r10));
14731 allocate -= UNITS_PER_WORD;
14732 if (sp_is_cfa_reg || TARGET_SEH)
14734 if (sp_is_cfa_reg)
14735 m->fs.cfa_offset += UNITS_PER_WORD;
14736 RTX_FRAME_RELATED_P (insn) = 1;
14737 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14738 gen_rtx_SET (stack_pointer_rtx,
14739 plus_constant (Pmode, stack_pointer_rtx,
14740 -UNITS_PER_WORD)));
14744 emit_move_insn (eax, GEN_INT (allocate));
14745 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
14747 /* Use the fact that AX still contains ALLOCATE. */
14748 adjust_stack_insn = (Pmode == DImode
14749 ? gen_pro_epilogue_adjust_stack_di_sub
14750 : gen_pro_epilogue_adjust_stack_si_sub);
14752 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14753 stack_pointer_rtx, eax));
14755 if (sp_is_cfa_reg || TARGET_SEH)
14757 if (sp_is_cfa_reg)
14758 m->fs.cfa_offset += allocate;
14759 RTX_FRAME_RELATED_P (insn) = 1;
14760 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14761 gen_rtx_SET (stack_pointer_rtx,
14762 plus_constant (Pmode, stack_pointer_rtx,
14763 -allocate)));
14765 m->fs.sp_offset += allocate;
14767 /* Use stack_pointer_rtx for relative addressing so that code
14768 works for realigned stack, too. */
14769 if (r10_live && eax_live)
14771 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14772 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14773 gen_frame_mem (word_mode, t));
14774 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14775 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14776 gen_frame_mem (word_mode, t));
14778 else if (eax_live || r10_live)
14780 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14781 emit_move_insn (gen_rtx_REG (word_mode,
14782 (eax_live ? AX_REG : R10_REG)),
14783 gen_frame_mem (word_mode, t));
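/* For illustration, when the allocation goes through the probing helper
   and EAX is live at function entry, the emitted code is roughly

	push	rax				; preserve the live value
	mov	eax, <allocate>
	call	<stack allocation helper>	; e.g. ___chkstk_ms on some targets
	sub	rsp, rax			; the actual adjustment
	mov	rax, [rsp + rax]		; reload; rax still holds <allocate>

   The helper name and whether it adjusts the stack itself vary by target;
   ix86_gen_allocate_stack_worker selects the right pattern.  */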
14786 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
14788 /* If we haven't already set up the frame pointer, do so now. */
14789 if (frame_pointer_needed && !m->fs.fp_valid)
14791 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14792 GEN_INT (frame.stack_pointer_offset
14793 - frame.hard_frame_pointer_offset));
14794 insn = emit_insn (insn);
14795 RTX_FRAME_RELATED_P (insn) = 1;
14796 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14798 if (m->fs.cfa_reg == stack_pointer_rtx)
14799 m->fs.cfa_reg = hard_frame_pointer_rtx;
14800 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14801 m->fs.fp_valid = true;
14804 if (!int_registers_saved)
14805 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14806 if (!sse_registers_saved)
14807 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14809 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
14810 in the PROLOGUE. */
14811 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14813 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14814 insn = emit_insn (gen_set_got (pic));
14815 RTX_FRAME_RELATED_P (insn) = 1;
14816 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14817 emit_insn (gen_prologue_use (pic));
14818 /* Delete an already emitted SET_GOT if it exists and is allocated to
14819 REAL_PIC_OFFSET_TABLE_REGNUM. */
14820 ix86_elim_entry_set_got (pic);
14823 if (crtl->drap_reg && !crtl->stack_realign_needed)
14825 /* vDRAP is set up, but after reload it turns out stack realignment
14826 isn't necessary; here we emit the prologue to set up DRAP
14827 without the stack realignment adjustment. */
14828 t = choose_baseaddr (0, NULL);
14829 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14832 /* Prevent instructions from being scheduled into register save push
14833 sequence when access to the redzone area is done through frame pointer.
14834 The offset between the frame pointer and the stack pointer is calculated
14835 relative to the value of the stack pointer at the end of the function
14836 prologue, and moving instructions that access redzone area via frame
14837 pointer inside push sequence violates this assumption. */
14838 if (frame_pointer_needed && frame.red_zone_size)
14839 emit_insn (gen_memory_blockage ());
14841 /* SEH requires that the prologue end within 256 bytes of the start of
14842 the function. Prevent instruction schedules that would extend that.
14843 Further, prevent alloca modifications to the stack pointer from being
14844 combined with prologue modifications. */
14845 if (TARGET_SEH)
14846 emit_insn (gen_prologue_use (stack_pointer_rtx));
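/* For orientation, a plain 64-bit function that needs a frame pointer, one
   saved call-saved register and a small local area typically gets a
   prologue along the lines of

	push	rbp
	mov	rbp, rsp
	push	rbx
	sub	rsp, 24

   with a CFA note attached to each frame-related insn so the unwinder can
   locate the save slots.  The exact shape follows the layout computed by
   ix86_compute_frame_layout.  */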
14849 /* Emit code to restore REG using a POP insn. */
14851 static void
14852 ix86_emit_restore_reg_using_pop (rtx reg)
14854 struct machine_function *m = cfun->machine;
14855 rtx_insn *insn = emit_insn (gen_pop (reg));
14857 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14858 m->fs.sp_offset -= UNITS_PER_WORD;
14860 if (m->fs.cfa_reg == crtl->drap_reg
14861 && REGNO (reg) == REGNO (crtl->drap_reg))
14863 /* Previously we'd represented the CFA as an expression
14864 like *(%ebp - 8). We've just popped that value from
14865 the stack, which means we need to reset the CFA to
14866 the drap register. This will remain until we restore
14867 the stack pointer. */
14868 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14869 RTX_FRAME_RELATED_P (insn) = 1;
14871 /* This means that the DRAP register is valid for addressing too. */
14872 m->fs.drap_valid = true;
14873 return;
14876 if (m->fs.cfa_reg == stack_pointer_rtx)
14878 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14879 x = gen_rtx_SET (stack_pointer_rtx, x);
14880 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14881 RTX_FRAME_RELATED_P (insn) = 1;
14883 m->fs.cfa_offset -= UNITS_PER_WORD;
14886 /* When the frame pointer is the CFA, and we pop it, we are
14887 swapping back to the stack pointer as the CFA. This happens
14888 for stack frames that don't allocate other data, so we assume
14889 the stack pointer is now pointing at the return address, i.e.
14890 the function entry state, which makes the offset be 1 word. */
14891 if (reg == hard_frame_pointer_rtx)
14893 m->fs.fp_valid = false;
14894 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14896 m->fs.cfa_reg = stack_pointer_rtx;
14897 m->fs.cfa_offset -= UNITS_PER_WORD;
14899 add_reg_note (insn, REG_CFA_DEF_CFA,
14900 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14901 GEN_INT (m->fs.cfa_offset)));
14902 RTX_FRAME_RELATED_P (insn) = 1;
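/* In dwarf2 terms, popping the frame pointer while it is the CFA register
   corresponds roughly to

	pop	rbp
	.cfi_def_cfa	rsp, 8
	.cfi_restore	rbp

   i.e. the CFA switches back to the stack pointer at its entry-state
   offset and the register is marked restored, which is what the notes
   attached above express.  */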
14907 /* Emit code to restore saved registers using POP insns. */
14909 static void
14910 ix86_emit_restore_regs_using_pop (void)
14912 unsigned int regno;
14914 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14915 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
14916 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14919 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
14920 the emit is omitted and only the notes are attached. */
14922 static void
14923 ix86_emit_leave (rtx_insn *insn)
14925 struct machine_function *m = cfun->machine;
14926 if (!insn)
14927 insn = emit_insn (ix86_gen_leave ());
14929 ix86_add_queued_cfa_restore_notes (insn);
14931 gcc_assert (m->fs.fp_valid);
14932 m->fs.sp_valid = true;
14933 m->fs.sp_realigned = false;
14934 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14935 m->fs.fp_valid = false;
14937 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14939 m->fs.cfa_reg = stack_pointer_rtx;
14940 m->fs.cfa_offset = m->fs.sp_offset;
14942 add_reg_note (insn, REG_CFA_DEF_CFA,
14943 plus_constant (Pmode, stack_pointer_rtx,
14944 m->fs.sp_offset));
14945 RTX_FRAME_RELATED_P (insn) = 1;
14947 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14948 m->fs.fp_offset);
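/* "leave" is equivalent to "mov rsp, rbp" followed by "pop rbp", so after
   it the stack pointer is valid again at fp_offset - UNITS_PER_WORD, which
   is exactly the state recorded in m->fs above.  */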
14951 /* Emit code to restore saved registers using MOV insns.
14952 First register is restored from CFA - CFA_OFFSET. */
14953 static void
14954 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14955 bool maybe_eh_return)
14957 struct machine_function *m = cfun->machine;
14958 unsigned int regno;
14960 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14961 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14963 rtx reg = gen_rtx_REG (word_mode, regno);
14964 rtx mem;
14965 rtx_insn *insn;
14967 mem = choose_baseaddr (cfa_offset, NULL);
14968 mem = gen_frame_mem (word_mode, mem);
14969 insn = emit_move_insn (reg, mem);
14971 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14973 /* Previously we'd represented the CFA as an expression
14974 like *(%ebp - 8). We've just loaded that value from
14975 the stack, which means we need to reset the CFA to
14976 the drap register. This will remain until we restore
14977 the stack pointer. */
14978 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14979 RTX_FRAME_RELATED_P (insn) = 1;
14981 /* This means that the DRAP register is valid for addressing. */
14982 m->fs.drap_valid = true;
14984 else
14985 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14987 cfa_offset -= UNITS_PER_WORD;
14991 /* Emit code to restore saved SSE registers using MOV insns.
14992 First register is restored from CFA - CFA_OFFSET. */
14993 static void
14994 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14995 bool maybe_eh_return)
14997 unsigned int regno;
14999 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15000 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
15002 rtx reg = gen_rtx_REG (V4SFmode, regno);
15003 rtx mem;
15004 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
15006 mem = choose_baseaddr (cfa_offset, &align);
15007 mem = gen_rtx_MEM (V4SFmode, mem);
15009 /* The location alignment depends upon the base register. */
15010 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
15011 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
15012 set_mem_align (mem, align);
15013 emit_insn (gen_rtx_SET (reg, mem));
15015 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
15017 cfa_offset -= GET_MODE_SIZE (V4SFmode);
15021 static void
15022 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
15023 bool use_call, int style)
15025 struct machine_function *m = cfun->machine;
15026 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
15027 + m->call_ms2sysv_extra_regs;
15028 rtvec v;
15029 unsigned int elems_needed, align, i, vi = 0;
15030 rtx_insn *insn;
15031 rtx sym, tmp;
15032 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
15033 rtx r10 = NULL_RTX;
15034 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
15035 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
15036 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
15037 rtx rsi_frame_load = NULL_RTX;
15038 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
15039 enum xlogue_stub stub;
15041 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
15043 /* If using a realigned stack, we should never start with padding. */
15044 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
15046 /* Setup RSI as the stub's base pointer. */
15047 align = GET_MODE_ALIGNMENT (V4SFmode);
15048 tmp = choose_baseaddr (rsi_offset, &align);
15049 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
15050 emit_insn (gen_rtx_SET (rsi, tmp));
15052 /* Get a symbol for the stub. */
15053 if (frame_pointer_needed)
15054 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
15055 : XLOGUE_STUB_RESTORE_HFP_TAIL;
15056 else
15057 stub = use_call ? XLOGUE_STUB_RESTORE
15058 : XLOGUE_STUB_RESTORE_TAIL;
15059 sym = xlogue.get_stub_rtx (stub);
15061 elems_needed = ncregs;
15062 if (use_call)
15063 elems_needed += 1;
15064 else
15065 elems_needed += frame_pointer_needed ? 5 : 3;
15066 v = rtvec_alloc (elems_needed);
15068 /* We call the epilogue stub when we need to pop incoming args or we are
15069 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
15070 epilogue stub and it is the tail-call. */
15071 if (use_call)
15072 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15073 else
15075 RTVEC_ELT (v, vi++) = ret_rtx;
15076 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15077 if (frame_pointer_needed)
15079 rtx rbp = gen_rtx_REG (DImode, BP_REG);
15080 gcc_assert (m->fs.fp_valid);
15081 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
15083 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
15084 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
15085 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
15086 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
15087 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
15089 else
15091 /* If no hard frame pointer, we set R10 to the SP restore value. */
15092 gcc_assert (!m->fs.fp_valid);
15093 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15094 gcc_assert (m->fs.sp_valid);
15096 r10 = gen_rtx_REG (DImode, R10_REG);
15097 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
15098 emit_insn (gen_rtx_SET (r10, tmp));
15100 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
15104 /* Generate frame load insns and restore notes. */
15105 for (i = 0; i < ncregs; ++i)
15107 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
15108 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
15109 rtx reg, frame_load;
15111 reg = gen_rtx_REG (mode, r.regno);
15112 frame_load = gen_frame_load (reg, rsi, r.offset);
15114 /* Save RSI frame load insn & note to add last. */
15115 if (r.regno == SI_REG)
15117 gcc_assert (!rsi_frame_load);
15118 rsi_frame_load = frame_load;
15119 rsi_restore_offset = r.offset;
15121 else
15123 RTVEC_ELT (v, vi++) = frame_load;
15124 ix86_add_cfa_restore_note (NULL, reg, r.offset);
15128 /* Add RSI frame load & restore note at the end. */
15129 gcc_assert (rsi_frame_load);
15130 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
15131 RTVEC_ELT (v, vi++) = rsi_frame_load;
15132 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
15133 rsi_restore_offset);
15135 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
15136 if (!use_call && !frame_pointer_needed)
15138 gcc_assert (m->fs.sp_valid);
15139 gcc_assert (!m->fs.sp_realigned);
15141 /* At this point, R10 should point to frame.stack_realign_offset. */
15142 if (m->fs.cfa_reg == stack_pointer_rtx)
15143 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
15144 m->fs.sp_offset = frame.stack_realign_offset;
15147 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
15148 tmp = gen_rtx_PARALLEL (VOIDmode, v);
15149 if (use_call)
15150 insn = emit_insn (tmp);
15151 else
15153 insn = emit_jump_insn (tmp);
15154 JUMP_LABEL (insn) = ret_rtx;
15156 if (frame_pointer_needed)
15157 ix86_emit_leave (insn);
15158 else
15160 /* Need CFA adjust note. */
15161 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
15162 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
15166 RTX_FRAME_RELATED_P (insn) = true;
15167 ix86_add_queued_cfa_restore_notes (insn);
15169 /* If we're not doing a tail-call, we need to adjust the stack. */
15170 if (use_call && m->fs.sp_valid)
15172 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
15173 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15174 GEN_INT (dealloc), style,
15175 m->fs.cfa_reg == stack_pointer_rtx);
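/* For illustration, the restore counterpart of the out-of-line save is
   roughly

	lea	rsi, [<base> + rsi_offset]	; stub base pointer
	call	__resms64_<N>			; reload the clobbered regs
	add	rsp, <dealloc>			; when not a tail call

   or, for the tail-call form without a frame pointer,

	lea	r10, [rsi + stub_ptr_offset]
	jmp	__resms64x_<N>			; stub restores regs, moves
						; R10 into RSP and returns

   The stub names follow the xlogue conventions in libgcc and are shown
   here only as an approximation.  */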
15179 /* Restore function stack, frame, and registers. */
15181 void
15182 ix86_expand_epilogue (int style)
15184 struct machine_function *m = cfun->machine;
15185 struct machine_frame_state frame_state_save = m->fs;
15186 struct ix86_frame frame;
15187 bool restore_regs_via_mov;
15188 bool using_drap;
15189 bool restore_stub_is_tail = false;
15191 ix86_finalize_stack_realign_flags ();
15192 frame = m->frame;
15194 m->fs.sp_realigned = stack_realign_fp;
15195 m->fs.sp_valid = stack_realign_fp
15196 || !frame_pointer_needed
15197 || crtl->sp_is_unchanging;
15198 gcc_assert (!m->fs.sp_valid
15199 || m->fs.sp_offset == frame.stack_pointer_offset);
15201 /* The FP must be valid if the frame pointer is present. */
15202 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
15203 gcc_assert (!m->fs.fp_valid
15204 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
15206 /* We must have *some* valid pointer to the stack frame. */
15207 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
15209 /* The DRAP is never valid at this point. */
15210 gcc_assert (!m->fs.drap_valid);
15212 /* See the comment about red zone and frame
15213 pointer usage in ix86_expand_prologue. */
15214 if (frame_pointer_needed && frame.red_zone_size)
15215 emit_insn (gen_memory_blockage ());
15217 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
15218 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
15220 /* Determine the CFA offset of the end of the red-zone. */
15221 m->fs.red_zone_offset = 0;
15222 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
15224 /* The red-zone begins below the return address. */
15225 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
15227 /* When the register save area is in the aligned portion of
15228 the stack, determine the maximum runtime displacement that
15229 matches up with the aligned frame. */
15230 if (stack_realign_drap)
15231 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
15232 + UNITS_PER_WORD);
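/* As a concrete example, with the usual 128-byte x86_64 red zone this makes
   red_zone_offset 128 + 8 = 136: the red zone extends down to 136 bytes
   below the CFA (128 bytes below the return-address slot), so frame slots
   within that range stay valid even after the stack pointer has been
   restored to its entry value.  */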
15235 /* Special care must be taken for the normal return case of a function
15236 using eh_return: the eax and edx registers are marked as saved, but
15237 not restored along this path. Adjust the save location to match. */
15238 if (crtl->calls_eh_return && style != 2)
15239 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
15241 /* EH_RETURN requires the use of moves to function properly. */
15242 if (crtl->calls_eh_return)
15243 restore_regs_via_mov = true;
15244 /* SEH requires the use of pops to identify the epilogue. */
15245 else if (TARGET_SEH)
15246 restore_regs_via_mov = false;
15247 /* If we're only restoring one register and sp cannot be used, then
15248 use a move instruction to restore the register, since it's
15249 less work than reloading sp and popping the register. */
15250 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
15251 restore_regs_via_mov = true;
15252 else if (TARGET_EPILOGUE_USING_MOVE
15253 && cfun->machine->use_fast_prologue_epilogue
15254 && (frame.nregs > 1
15255 || m->fs.sp_offset != frame.reg_save_offset))
15256 restore_regs_via_mov = true;
15257 else if (frame_pointer_needed
15258 && !frame.nregs
15259 && m->fs.sp_offset != frame.reg_save_offset)
15260 restore_regs_via_mov = true;
15261 else if (frame_pointer_needed
15262 && TARGET_USE_LEAVE
15263 && cfun->machine->use_fast_prologue_epilogue
15264 && frame.nregs == 1)
15265 restore_regs_via_mov = true;
15266 else
15267 restore_regs_via_mov = false;
15269 if (restore_regs_via_mov || frame.nsseregs)
15271 /* Ensure that the entire register save area is addressable via
15272 the stack pointer, if we will restore via sp. */
15273 if (TARGET_64BIT
15274 && m->fs.sp_offset > 0x7fffffff
15275 && !(fp_valid_at (frame.stack_realign_offset) || m->fs.drap_valid)
15276 && (frame.nsseregs + frame.nregs) != 0)
15278 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15279 GEN_INT (m->fs.sp_offset
15280 - frame.sse_reg_save_offset),
15281 style,
15282 m->fs.cfa_reg == stack_pointer_rtx);
15286 /* If there are any SSE registers to restore, then we have to do it
15287 via moves, since there's obviously no pop for SSE regs. */
15288 if (frame.nsseregs)
15289 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
15290 style == 2);
15292 if (m->call_ms2sysv)
15294 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
15296 /* We cannot use a tail-call for the stub if:
15297 1. We have to pop incoming args,
15298 2. We have additional int regs to restore, or
15299 3. A sibling call will be the tail-call, or
15300 4. We are emitting an eh_return_internal epilogue.
15302 TODO: Item 4 has not yet been tested!
15304 If any of the above are true, we will call the stub rather than
15305 jump to it. */
15306 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
15307 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
15310 /* If using an out-of-line stub that is a tail-call, then... */
15311 if (m->call_ms2sysv && restore_stub_is_tail)
15313 /* TODO: paranoid tests. (remove eventually) */
15314 gcc_assert (m->fs.sp_valid);
15315 gcc_assert (!m->fs.sp_realigned);
15316 gcc_assert (!m->fs.fp_valid);
15317 gcc_assert (!m->fs.realigned);
15318 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
15319 gcc_assert (!crtl->drap_reg);
15320 gcc_assert (!frame.nregs);
15322 else if (restore_regs_via_mov)
15324 rtx t;
15326 if (frame.nregs)
15327 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
15329 /* eh_return epilogues need %ecx added to the stack pointer. */
15330 if (style == 2)
15332 rtx sa = EH_RETURN_STACKADJ_RTX;
15333 rtx_insn *insn;
15335 /* %ecx can't be used for both DRAP register and eh_return. */
15336 if (crtl->drap_reg)
15337 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
15339 /* regparm nested functions don't work with eh_return. */
15340 gcc_assert (!ix86_static_chain_on_stack);
15342 if (frame_pointer_needed)
15344 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
15345 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
15346 emit_insn (gen_rtx_SET (sa, t));
15348 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
15349 insn = emit_move_insn (hard_frame_pointer_rtx, t);
15351 /* Note that we use SA as a temporary CFA, as the return
15352 address is at the proper place relative to it. We
15353 pretend this happens at the FP restore insn because
15354 prior to this insn the FP would be stored at the wrong
15355 offset relative to SA, and after this insn we have no
15356 other reasonable register to use for the CFA. We don't
15357 bother resetting the CFA to the SP for the duration of
15358 the return insn. */
15359 add_reg_note (insn, REG_CFA_DEF_CFA,
15360 plus_constant (Pmode, sa, UNITS_PER_WORD));
15361 ix86_add_queued_cfa_restore_notes (insn);
15362 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
15363 RTX_FRAME_RELATED_P (insn) = 1;
15365 m->fs.cfa_reg = sa;
15366 m->fs.cfa_offset = UNITS_PER_WORD;
15367 m->fs.fp_valid = false;
15369 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
15370 const0_rtx, style, false);
15372 else
15374 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
15375 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
15376 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
15377 ix86_add_queued_cfa_restore_notes (insn);
15379 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15380 if (m->fs.cfa_offset != UNITS_PER_WORD)
15382 m->fs.cfa_offset = UNITS_PER_WORD;
15383 add_reg_note (insn, REG_CFA_DEF_CFA,
15384 plus_constant (Pmode, stack_pointer_rtx,
15385 UNITS_PER_WORD));
15386 RTX_FRAME_RELATED_P (insn) = 1;
15389 m->fs.sp_offset = UNITS_PER_WORD;
15390 m->fs.sp_valid = true;
15391 m->fs.sp_realigned = false;
15394 else
15396 /* SEH requires that the function end with (1) a stack adjustment
15397 if necessary, (2) a sequence of pops, and (3) a return or
15398 jump instruction. Prevent insns from the function body from
15399 being scheduled into this sequence. */
15400 if (TARGET_SEH)
15402 /* Prevent a catch region from being adjacent to the standard
15403 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
15404 several other flags that would be interesting to test are
15405 set up yet. */
15406 if (flag_non_call_exceptions)
15407 emit_insn (gen_nops (const1_rtx));
15408 else
15409 emit_insn (gen_blockage ());
15412 /* First step is to deallocate the stack frame so that we can
15413 pop the registers. If the stack pointer was realigned, it needs
15414 to be restored now. Also do it on SEH target for very large
15415 frame as the emitted instructions aren't allowed by the ABI
15416 in epilogues. */
15417 if (!m->fs.sp_valid || m->fs.sp_realigned
15418 || (TARGET_SEH
15419 && (m->fs.sp_offset - frame.reg_save_offset
15420 >= SEH_MAX_FRAME_SIZE)))
15422 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
15423 GEN_INT (m->fs.fp_offset
15424 - frame.reg_save_offset),
15425 style, false);
15427 else if (m->fs.sp_offset != frame.reg_save_offset)
15429 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15430 GEN_INT (m->fs.sp_offset
15431 - frame.reg_save_offset),
15432 style,
15433 m->fs.cfa_reg == stack_pointer_rtx);
15436 ix86_emit_restore_regs_using_pop ();
15439 /* If we used a frame pointer and haven't already got rid of it,
15440 then do so now. */
15441 if (m->fs.fp_valid)
15443 /* If the stack pointer is valid and pointing at the frame
15444 pointer store address, then we only need a pop. */
15445 if (sp_valid_at (frame.hfp_save_offset)
15446 && m->fs.sp_offset == frame.hfp_save_offset)
15447 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15448 /* Leave results in shorter dependency chains on CPUs that are
15449 able to grok it fast. */
15450 else if (TARGET_USE_LEAVE
15451 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
15452 || !cfun->machine->use_fast_prologue_epilogue)
15453 ix86_emit_leave (NULL);
15454 else
15456 pro_epilogue_adjust_stack (stack_pointer_rtx,
15457 hard_frame_pointer_rtx,
15458 const0_rtx, style, !using_drap);
15459 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15463 if (using_drap)
15465 int param_ptr_offset = UNITS_PER_WORD;
15466 rtx_insn *insn;
15468 gcc_assert (stack_realign_drap);
15470 if (ix86_static_chain_on_stack)
15471 param_ptr_offset += UNITS_PER_WORD;
15472 if (!call_used_regs[REGNO (crtl->drap_reg)])
15473 param_ptr_offset += UNITS_PER_WORD;
15475 insn = emit_insn (gen_rtx_SET
15476 (stack_pointer_rtx,
15477 gen_rtx_PLUS (Pmode,
15478 crtl->drap_reg,
15479 GEN_INT (-param_ptr_offset))));
15480 m->fs.cfa_reg = stack_pointer_rtx;
15481 m->fs.cfa_offset = param_ptr_offset;
15482 m->fs.sp_offset = param_ptr_offset;
15483 m->fs.realigned = false;
15485 add_reg_note (insn, REG_CFA_DEF_CFA,
15486 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15487 GEN_INT (param_ptr_offset)));
15488 RTX_FRAME_RELATED_P (insn) = 1;
15490 if (!call_used_regs[REGNO (crtl->drap_reg)])
15491 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
15494 /* At this point the stack pointer must be valid, and we must have
15495 restored all of the registers. We may not have deallocated the
15496 entire stack frame. We've delayed this until now because it may
15497 be possible to merge the local stack deallocation with the
15498 deallocation forced by ix86_static_chain_on_stack. */
15499 gcc_assert (m->fs.sp_valid);
15500 gcc_assert (!m->fs.sp_realigned);
15501 gcc_assert (!m->fs.fp_valid);
15502 gcc_assert (!m->fs.realigned);
15503 if (m->fs.sp_offset != UNITS_PER_WORD)
15505 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15506 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
15507 style, true);
15509 else
15510 ix86_add_queued_cfa_restore_notes (get_last_insn ());
15512 /* Sibcall epilogues don't want a return instruction. */
15513 if (style == 0)
15515 m->fs = frame_state_save;
15516 return;
15519 if (cfun->machine->func_type != TYPE_NORMAL)
15521 /* Return with the "IRET" instruction from interrupt handler.
15522 Pop the 'ERROR_CODE' off the stack before the 'IRET'
15523 instruction in exception handler. */
15524 if (cfun->machine->func_type == TYPE_EXCEPTION)
15526 rtx r = plus_constant (Pmode, stack_pointer_rtx,
15527 UNITS_PER_WORD);
15528 emit_insn (gen_rtx_SET (stack_pointer_rtx, r));
15530 emit_jump_insn (gen_interrupt_return ());
15532 else if (crtl->args.pops_args && crtl->args.size)
15534 rtx popc = GEN_INT (crtl->args.pops_args);
15536 /* i386 can only pop 64K bytes. If asked to pop more, pop return
15537 address, do explicit add, and jump indirectly to the caller. */
15539 if (crtl->args.pops_args >= 65536)
15541 rtx ecx = gen_rtx_REG (SImode, CX_REG);
15542 rtx_insn *insn;
15544 /* There is no "pascal" calling convention in any 64bit ABI. */
15545 gcc_assert (!TARGET_64BIT);
15547 insn = emit_insn (gen_pop (ecx));
15548 m->fs.cfa_offset -= UNITS_PER_WORD;
15549 m->fs.sp_offset -= UNITS_PER_WORD;
15551 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
15552 x = gen_rtx_SET (stack_pointer_rtx, x);
15553 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
15554 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
15555 RTX_FRAME_RELATED_P (insn) = 1;
15557 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15558 popc, -1, true);
15559 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
15561 else
15562 emit_jump_insn (gen_simple_return_pop_internal (popc));
15564 else if (!m->call_ms2sysv || !restore_stub_is_tail)
15565 emit_jump_insn (gen_simple_return_internal ());
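/* Worked example: a 32-bit stdcall-style function that pops 12 bytes of
   arguments normally ends in "ret 12".  Since "ret imm16" cannot encode
   65536 or more, a function asked to pop, say, 70000 bytes instead takes
   the indirect path above:

	pop	ecx		; fetch the return address
	add	esp, 70000	; pop the arguments
	jmp	*ecx		; return to the caller

   The byte counts are illustrative only.  */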
15567 /* Restore the state back to the state from the prologue,
15568 so that it's correct for the next epilogue. */
15569 m->fs = frame_state_save;
15572 /* Reset from the function's potential modifications. */
15574 static void
15575 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, HOST_WIDE_INT)
15577 if (pic_offset_table_rtx
15578 && !ix86_use_pseudo_pic_reg ())
15579 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
15581 if (TARGET_MACHO)
15583 rtx_insn *insn = get_last_insn ();
15584 rtx_insn *deleted_debug_label = NULL;
15586 /* Mach-O doesn't support labels at the end of objects, so if
15587 it looks like we might want one, take special action.
15588 First, collect any sequence of deleted debug labels. */
15589 while (insn
15590 && NOTE_P (insn)
15591 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
15593 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
15594 notes only, instead set their CODE_LABEL_NUMBER to -1,
15595 otherwise there would be code generation differences
15596 in between -g and -g0. */
15597 if (NOTE_P (insn) && NOTE_KIND (insn)
15598 == NOTE_INSN_DELETED_DEBUG_LABEL)
15599 deleted_debug_label = insn;
15600 insn = PREV_INSN (insn);
15603 /* If we have:
15604 label:
15605 barrier
15606 then this needs to be detected, so skip past the barrier. */
15608 if (insn && BARRIER_P (insn))
15609 insn = PREV_INSN (insn);
15611 /* Up to now we've only seen notes or barriers. */
15612 if (insn)
15614 if (LABEL_P (insn)
15615 || (NOTE_P (insn)
15616 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
15617 /* Trailing label. */
15618 fputs ("\tnop\n", file);
15619 else if (cfun && ! cfun->is_thunk)
15621 /* See if we have a completely empty function body, skipping
15622 the special case of the picbase thunk emitted as asm. */
15623 while (insn && ! INSN_P (insn))
15624 insn = PREV_INSN (insn);
15625 /* If we don't find any insns, we've got an empty function body;
15626 I.e. completely empty - without a return or branch. This is
15627 taken as the case where a function body has been removed
15628 because it contains an inline __builtin_unreachable(). GCC
15629 declares that reaching __builtin_unreachable() means UB so
15630 we're not obliged to do anything special; however, we want
15631 non-zero-sized function bodies. To meet this, and help the
15632 user out, let's trap the case. */
15633 if (insn == NULL)
15634 fputs ("\tud2\n", file);
15637 else if (deleted_debug_label)
15638 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
15639 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
15640 CODE_LABEL_NUMBER (insn) = -1;
15644 /* Return a scratch register to use in the split stack prologue. The
15645 split stack prologue is used for -fsplit-stack. It is the first
15646 instructions in the function, even before the regular prologue.
15647 The scratch register can be any caller-saved register which is not
15648 used for parameters or for the static chain. */
15650 static unsigned int
15651 split_stack_prologue_scratch_regno (void)
15653 if (TARGET_64BIT)
15654 return R11_REG;
15655 else
15657 bool is_fastcall, is_thiscall;
15658 int regparm;
15660 is_fastcall = (lookup_attribute ("fastcall",
15661 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15662 != NULL);
15663 is_thiscall = (lookup_attribute ("thiscall",
15664 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15665 != NULL);
15666 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
15668 if (is_fastcall)
15670 if (DECL_STATIC_CHAIN (cfun->decl))
15672 sorry ("-fsplit-stack does not support fastcall with "
15673 "nested function");
15674 return INVALID_REGNUM;
15676 return AX_REG;
15678 else if (is_thiscall)
15680 if (!DECL_STATIC_CHAIN (cfun->decl))
15681 return DX_REG;
15682 return AX_REG;
15684 else if (regparm < 3)
15686 if (!DECL_STATIC_CHAIN (cfun->decl))
15687 return CX_REG;
15688 else
15690 if (regparm >= 2)
15692 sorry ("-fsplit-stack does not support 2 register "
15693 "parameters for a nested function");
15694 return INVALID_REGNUM;
15696 return DX_REG;
15699 else
15701 /* FIXME: We could make this work by pushing a register
15702 around the addition and comparison. */
15703 sorry ("-fsplit-stack does not support 3 register parameters");
15704 return INVALID_REGNUM;
15709 /* A SYMBOL_REF for the function which allocates new stackspace for
15710 -fsplit-stack. */
15712 static GTY(()) rtx split_stack_fn;
15714 /* A SYMBOL_REF for the __morestack function when using the large
15715 model. */
15717 static GTY(()) rtx split_stack_fn_large;
15719 /* Handle -fsplit-stack. These are the first instructions in the
15720 function, even before the regular prologue. */
15722 void
15723 ix86_expand_split_stack_prologue (void)
15725 struct ix86_frame frame;
15726 HOST_WIDE_INT allocate;
15727 unsigned HOST_WIDE_INT args_size;
15728 rtx_code_label *label;
15729 rtx limit, current, allocate_rtx, call_insn, call_fusage;
15730 rtx scratch_reg = NULL_RTX;
15731 rtx_code_label *varargs_label = NULL;
15732 rtx fn;
15734 gcc_assert (flag_split_stack && reload_completed);
15736 ix86_finalize_stack_realign_flags ();
15737 frame = cfun->machine->frame;
15738 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
15740 /* This is the label we will branch to if we have enough stack
15741 space. We expect the basic block reordering pass to reverse this
15742 branch if optimizing, so that we branch in the unlikely case. */
15743 label = gen_label_rtx ();
15745 /* We need to compare the stack pointer minus the frame size with
15746 the stack boundary in the TCB. The stack boundary always gives
15747 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
15748 can compare directly. Otherwise we need to do an addition. */
15750 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
15751 UNSPEC_STACK_CHECK);
15752 limit = gen_rtx_CONST (Pmode, limit);
15753 limit = gen_rtx_MEM (Pmode, limit);
15754 if (allocate < SPLIT_STACK_AVAILABLE)
15755 current = stack_pointer_rtx;
15756 else
15758 unsigned int scratch_regno;
15759 rtx offset;
15761 /* We need a scratch register to hold the stack pointer minus
15762 the required frame size. Since this is the very start of the
15763 function, the scratch register can be any caller-saved
15764 register which is not used for parameters. */
15765 offset = GEN_INT (- allocate);
15766 scratch_regno = split_stack_prologue_scratch_regno ();
15767 if (scratch_regno == INVALID_REGNUM)
15768 return;
15769 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15770 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
15772 /* We don't use ix86_gen_add3 in this case because it will
15773 want to split to lea, but when not optimizing the insn
15774 will not be split after this point. */
15775 emit_insn (gen_rtx_SET (scratch_reg,
15776 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15777 offset)));
15779 else
15781 emit_move_insn (scratch_reg, offset);
15782 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
15783 stack_pointer_rtx));
15785 current = scratch_reg;
15788 ix86_expand_branch (GEU, current, limit, label);
15789 rtx_insn *jump_insn = get_last_insn ();
15790 JUMP_LABEL (jump_insn) = label;
15792 /* Mark the jump as very likely to be taken. */
15793 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
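/* For illustration, on x86_64 glibc the comparison built above typically
   assembles to something like

	cmp	rsp, fs:[0x70]			; small frames
   or
	lea	r11, [rsp - <frame size>]
	cmp	r11, fs:[0x70]			; larger frames
	jae	.Lhave_stack
	... set up arguments and call __morestack ...
   .Lhave_stack:

   The 0x70 TCB slot, the scratch register and the label name are examples;
   the real offset comes from the UNSPEC_STACK_CHECK address and the
   register from split_stack_prologue_scratch_regno.  */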
15795 if (split_stack_fn == NULL_RTX)
15797 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
15798 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
15800 fn = split_stack_fn;
15802 /* Get more stack space. We pass in the desired stack space and the
15803 size of the arguments to copy to the new stack. In 32-bit mode
15804 we push the parameters; __morestack will return on a new stack
15805 anyhow. In 64-bit mode we pass the parameters in r10 and
15806 r11. */
15807 allocate_rtx = GEN_INT (allocate);
15808 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
15809 call_fusage = NULL_RTX;
15810 rtx pop = NULL_RTX;
15811 if (TARGET_64BIT)
15813 rtx reg10, reg11;
15815 reg10 = gen_rtx_REG (Pmode, R10_REG);
15816 reg11 = gen_rtx_REG (Pmode, R11_REG);
15818 /* If this function uses a static chain, it will be in %r10.
15819 Preserve it across the call to __morestack. */
15820 if (DECL_STATIC_CHAIN (cfun->decl))
15822 rtx rax;
15824 rax = gen_rtx_REG (word_mode, AX_REG);
15825 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
15826 use_reg (&call_fusage, rax);
15829 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
15830 && !TARGET_PECOFF)
15832 HOST_WIDE_INT argval;
15834 gcc_assert (Pmode == DImode);
15835 /* When using the large model we need to load the address
15836 into a register, and we've run out of registers. So we
15837 switch to a different calling convention, and we call a
15838 different function: __morestack_large. We pass the
15839 argument size in the upper 32 bits of r10 and pass the
15840 frame size in the lower 32 bits. */
15841 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15842 gcc_assert ((args_size & 0xffffffff) == args_size);
15844 if (split_stack_fn_large == NULL_RTX)
15846 split_stack_fn_large =
15847 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15848 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15850 if (ix86_cmodel == CM_LARGE_PIC)
15852 rtx_code_label *label;
15853 rtx x;
15855 label = gen_label_rtx ();
15856 emit_label (label);
15857 LABEL_PRESERVE_P (label) = 1;
15858 emit_insn (gen_set_rip_rex64 (reg10, label));
15859 emit_insn (gen_set_got_offset_rex64 (reg11, label));
15860 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15861 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15862 UNSPEC_GOT);
15863 x = gen_rtx_CONST (Pmode, x);
15864 emit_move_insn (reg11, x);
15865 x = gen_rtx_PLUS (Pmode, reg10, reg11);
15866 x = gen_const_mem (Pmode, x);
15867 emit_move_insn (reg11, x);
15869 else
15870 emit_move_insn (reg11, split_stack_fn_large);
15872 fn = reg11;
15874 argval = ((args_size << 16) << 16) + allocate;
15875 emit_move_insn (reg10, GEN_INT (argval));
15877 else
15879 emit_move_insn (reg10, allocate_rtx);
15880 emit_move_insn (reg11, GEN_INT (args_size));
15881 use_reg (&call_fusage, reg11);
15884 use_reg (&call_fusage, reg10);
15886 else
15888 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15889 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15890 insn = emit_insn (gen_push (allocate_rtx));
15891 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15892 pop = GEN_INT (2 * UNITS_PER_WORD);
15894 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15895 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15896 pop, false);
15897 add_function_usage_to (call_insn, call_fusage);
15898 if (!TARGET_64BIT)
15899 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15900 /* Indicate that this function can't jump to non-local gotos. */
15901 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
15903 /* In order to make call/return prediction work right, we now need
15904 to execute a return instruction. See
15905 libgcc/config/i386/morestack.S for the details on how this works.
15907 For flow purposes gcc must not see this as a return
15908 instruction--we need control flow to continue at the subsequent
15909 label. Therefore, we use an unspec. */
15910 gcc_assert (crtl->args.pops_args < 65536);
15911 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15913 /* If we are in 64-bit mode and this function uses a static chain,
15914 we saved %r10 in %rax before calling __morestack. */
15915 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15916 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15917 gen_rtx_REG (word_mode, AX_REG));
15919 /* If this function calls va_start, we need to store a pointer to
15920 the arguments on the old stack, because they may not have been
15921 all copied to the new stack. At this point the old stack can be
15922 found at the frame pointer value used by __morestack, because
15923 __morestack has set that up before calling back to us. Here we
15924 store that pointer in a scratch register, and in
15925 ix86_expand_prologue we store the scratch register in a stack
15926 slot. */
15927 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15929 unsigned int scratch_regno;
15930 rtx frame_reg;
15931 int words;
15933 scratch_regno = split_stack_prologue_scratch_regno ();
15934 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15935 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15937 /* 64-bit:
15938 fp -> old fp value
15939 return address within this function
15940 return address of caller of this function
15941 stack arguments
15942 So we add three words to get to the stack arguments.
15944 32-bit:
15945 fp -> old fp value
15946 return address within this function
15947 first argument to __morestack
15948 second argument to __morestack
15949 return address of caller of this function
15950 stack arguments
15951 So we add five words to get to the stack arguments.
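   /* For illustration, assuming the usual x86 word sizes (8 bytes in
      64-bit mode, 4 bytes in 32-bit mode), the scratch register computed
      below is
	64-bit:  scratch = fp + 3 * 8 = fp + 24
	32-bit:  scratch = fp + 5 * 4 = fp + 20
      i.e. the address of the first stack argument described above.  */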
15953 words = TARGET_64BIT ? 3 : 5;
15954 emit_insn (gen_rtx_SET (scratch_reg,
15955 gen_rtx_PLUS (Pmode, frame_reg,
15956 GEN_INT (words * UNITS_PER_WORD))));
15958 varargs_label = gen_label_rtx ();
15959 emit_jump_insn (gen_jump (varargs_label));
15960 JUMP_LABEL (get_last_insn ()) = varargs_label;
15962 emit_barrier ();
15965 emit_label (label);
15966 LABEL_NUSES (label) = 1;
15968 /* If this function calls va_start, we now have to set the scratch
15969 register for the case where we do not call __morestack. In this
15970 case we need to set it based on the stack pointer. */
15971 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15973 emit_insn (gen_rtx_SET (scratch_reg,
15974 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15975 GEN_INT (UNITS_PER_WORD))));
15977 emit_label (varargs_label);
15978 LABEL_NUSES (varargs_label) = 1;
15982 /* We may have to tell the dataflow pass that the split stack prologue
15983 is initializing a scratch register. */
15985 static void
15986 ix86_live_on_entry (bitmap regs)
15988 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15990 gcc_assert (flag_split_stack);
15991 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
15995 /* Extract the parts of an RTL expression that is a valid memory address
15996 for an instruction. Return 0 if the structure of the address is
15997 grossly off. Return -1 if the address contains ASHIFT, so it is not
15998 strictly valid, but is still used for computing the length of the lea instruction. */
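   /* As an illustrative example (register choices arbitrary): an address
      (plus:SI (reg:SI ax) (mult:SI (reg:SI bx) (const_int 4))) decomposes
      into base = ax, index = bx, scale = 4, disp = NULL and returns 1,
      while a bare (ashift:SI (reg:SI bx) (const_int 2)) yields index = bx,
      scale = 4 and returns -1, as described above.  */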
16001 ix86_decompose_address (rtx addr, struct ix86_address *out)
16003 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
16004 rtx base_reg, index_reg;
16005 HOST_WIDE_INT scale = 1;
16006 rtx scale_rtx = NULL_RTX;
16007 rtx tmp;
16008 int retval = 1;
16009 addr_space_t seg = ADDR_SPACE_GENERIC;
16011 /* Allow zero-extended SImode addresses,
16012 they will be emitted with addr32 prefix. */
16013 if (TARGET_64BIT && GET_MODE (addr) == DImode)
16015 if (GET_CODE (addr) == ZERO_EXTEND
16016 && GET_MODE (XEXP (addr, 0)) == SImode)
16018 addr = XEXP (addr, 0);
16019 if (CONST_INT_P (addr))
16020 return 0;
16022 else if (GET_CODE (addr) == AND
16023 && const_32bit_mask (XEXP (addr, 1), DImode))
16025 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
16026 if (addr == NULL_RTX)
16027 return 0;
16029 if (CONST_INT_P (addr))
16030 return 0;
16034 /* Allow SImode subregs of DImode addresses,
16035 they will be emitted with addr32 prefix. */
16036 if (TARGET_64BIT && GET_MODE (addr) == SImode)
16038 if (SUBREG_P (addr)
16039 && GET_MODE (SUBREG_REG (addr)) == DImode)
16041 addr = SUBREG_REG (addr);
16042 if (CONST_INT_P (addr))
16043 return 0;
16047 if (REG_P (addr))
16048 base = addr;
16049 else if (SUBREG_P (addr))
16051 if (REG_P (SUBREG_REG (addr)))
16052 base = addr;
16053 else
16054 return 0;
16056 else if (GET_CODE (addr) == PLUS)
16058 rtx addends[4], op;
16059 int n = 0, i;
16061 op = addr;
16064 if (n >= 4)
16065 return 0;
16066 addends[n++] = XEXP (op, 1);
16067 op = XEXP (op, 0);
16069 while (GET_CODE (op) == PLUS);
16070 if (n >= 4)
16071 return 0;
16072 addends[n] = op;
16074 for (i = n; i >= 0; --i)
16076 op = addends[i];
16077 switch (GET_CODE (op))
16079 case MULT:
16080 if (index)
16081 return 0;
16082 index = XEXP (op, 0);
16083 scale_rtx = XEXP (op, 1);
16084 break;
16086 case ASHIFT:
16087 if (index)
16088 return 0;
16089 index = XEXP (op, 0);
16090 tmp = XEXP (op, 1);
16091 if (!CONST_INT_P (tmp))
16092 return 0;
16093 scale = INTVAL (tmp);
16094 if ((unsigned HOST_WIDE_INT) scale > 3)
16095 return 0;
16096 scale = 1 << scale;
16097 break;
16099 case ZERO_EXTEND:
16100 op = XEXP (op, 0);
16101 if (GET_CODE (op) != UNSPEC)
16102 return 0;
16103 /* FALLTHRU */
16105 case UNSPEC:
16106 if (XINT (op, 1) == UNSPEC_TP
16107 && TARGET_TLS_DIRECT_SEG_REFS
16108 && seg == ADDR_SPACE_GENERIC)
16109 seg = DEFAULT_TLS_SEG_REG;
16110 else
16111 return 0;
16112 break;
16114 case SUBREG:
16115 if (!REG_P (SUBREG_REG (op)))
16116 return 0;
16117 /* FALLTHRU */
16119 case REG:
16120 if (!base)
16121 base = op;
16122 else if (!index)
16123 index = op;
16124 else
16125 return 0;
16126 break;
16128 case CONST:
16129 case CONST_INT:
16130 case SYMBOL_REF:
16131 case LABEL_REF:
16132 if (disp)
16133 return 0;
16134 disp = op;
16135 break;
16137 default:
16138 return 0;
16142 else if (GET_CODE (addr) == MULT)
16144 index = XEXP (addr, 0); /* index*scale */
16145 scale_rtx = XEXP (addr, 1);
16147 else if (GET_CODE (addr) == ASHIFT)
16149 /* We're called for lea too, which implements ashift on occasion. */
16150 index = XEXP (addr, 0);
16151 tmp = XEXP (addr, 1);
16152 if (!CONST_INT_P (tmp))
16153 return 0;
16154 scale = INTVAL (tmp);
16155 if ((unsigned HOST_WIDE_INT) scale > 3)
16156 return 0;
16157 scale = 1 << scale;
16158 retval = -1;
16160 else
16161 disp = addr; /* displacement */
16163 if (index)
16165 if (REG_P (index))
16167 else if (SUBREG_P (index)
16168 && REG_P (SUBREG_REG (index)))
16170 else
16171 return 0;
16174 /* Extract the integral value of scale. */
16175 if (scale_rtx)
16177 if (!CONST_INT_P (scale_rtx))
16178 return 0;
16179 scale = INTVAL (scale_rtx);
16182 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
16183 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
16185 /* Avoid useless 0 displacement. */
16186 if (disp == const0_rtx && (base || index))
16187 disp = NULL_RTX;
16189 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
16190 if (base_reg && index_reg && scale == 1
16191 && (REGNO (index_reg) == ARG_POINTER_REGNUM
16192 || REGNO (index_reg) == FRAME_POINTER_REGNUM
16193 || REGNO (index_reg) == SP_REG))
16195 std::swap (base, index);
16196 std::swap (base_reg, index_reg);
16199 /* Special case: %ebp cannot be encoded as a base without a displacement.
16200 Similarly %r13. */
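   /* Illustrative note on the encoding: with mod = 00 the %ebp/%r13 slot in
      ModR/M (and as a SIB base) is reused to mean "disp32, no base", so a
      plain (%ebp) or (%r13) cannot be expressed and must be emitted as
      0(%ebp) / 0(%r13); forcing a zero displacement here makes that
      explicit.  */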
16201 if (!disp && base_reg
16202 && (REGNO (base_reg) == ARG_POINTER_REGNUM
16203 || REGNO (base_reg) == FRAME_POINTER_REGNUM
16204 || REGNO (base_reg) == BP_REG
16205 || REGNO (base_reg) == R13_REG))
16206 disp = const0_rtx;
16208 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
16209 Avoid this by transforming it to [%esi+0].
16210 Reload calls address legitimization without cfun defined, so we need
16211 to test cfun for being non-NULL. */
16212 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
16213 && base_reg && !index_reg && !disp
16214 && REGNO (base_reg) == SI_REG)
16215 disp = const0_rtx;
16217 /* Special case: encode reg+reg instead of reg*2. */
16218 if (!base && index && scale == 2)
16219 base = index, base_reg = index_reg, scale = 1;
16221 /* Special case: scaling cannot be encoded without base or displacement. */
16222 if (!base && !disp && index && scale != 1)
16223 disp = const0_rtx;
16225 out->base = base;
16226 out->index = index;
16227 out->disp = disp;
16228 out->scale = scale;
16229 out->seg = seg;
16231 return retval;
16234 /* Return cost of the memory address x.
16235 For i386, it is better to use a complex address than let gcc copy
16236 the address into a reg and make a new pseudo. But not if the address
16237 requires two regs - that would mean more pseudos with longer
16238 lifetimes. */
16239 static int
16240 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
16242 struct ix86_address parts;
16243 int cost = 1;
16244 int ok = ix86_decompose_address (x, &parts);
16246 gcc_assert (ok);
16248 if (parts.base && SUBREG_P (parts.base))
16249 parts.base = SUBREG_REG (parts.base);
16250 if (parts.index && SUBREG_P (parts.index))
16251 parts.index = SUBREG_REG (parts.index);
16253 /* Attempt to minimize the number of registers in the address by increasing
16254 the address cost for each register used. We don't increase the address
16255 cost for "pic_offset_table_rtx". When a memory operand using
16256 "pic_offset_table_rtx" is not invariant itself, it most likely means that
16257 the base or index is not invariant. Therefore only "pic_offset_table_rtx"
16258 could be hoisted out, which is not profitable for x86. */
16259 if (parts.base
16260 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
16261 && (current_pass->type == GIMPLE_PASS
16262 || !pic_offset_table_rtx
16263 || !REG_P (parts.base)
16264 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
16265 cost++;
16267 if (parts.index
16268 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
16269 && (current_pass->type == GIMPLE_PASS
16270 || !pic_offset_table_rtx
16271 || !REG_P (parts.index)
16272 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
16273 cost++;
16275 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
16276 since its predecode logic can't detect the length of such instructions
16277 and decoding degenerates to vector decoding. Increase the cost of such
16278 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
16279 to split such addresses or even refuse them altogether.
16281 The following addressing modes are affected:
16282 [base+scale*index]
16283 [scale*index+disp]
16284 [base+index]
16286 The first and last cases may be avoidable by explicitly coding a zero
16287 displacement into the memory address, but I don't have an AMD-K6 machine
16288 handy to check this theory. */
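   /* As a rough mapping to AT&T syntax, the affected forms correspond to
      addresses like (%eax,%ebx,2), 4(,%ebx,2) and (%eax,%ebx); writing an
      explicit zero displacement, e.g. 0(%eax,%ebx,2), would switch the
      encoding to a disp8 form and avoid the 00_xxx_100b pattern for the
      first and last cases.  */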
16290 if (TARGET_K6
16291 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
16292 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
16293 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
16294 cost += 10;
16296 return cost;
16299 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
16300 this is used to form addresses to local data when -fPIC is in
16301 use. */
16303 static bool
16304 darwin_local_data_pic (rtx disp)
16306 return (GET_CODE (disp) == UNSPEC
16307 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
16310 /* True if operand X should be loaded from GOT. */
16312 bool
16313 ix86_force_load_from_GOT_p (rtx x)
16315 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
16316 && !TARGET_PECOFF && !TARGET_MACHO
16317 && !flag_plt && !flag_pic
16318 && ix86_cmodel != CM_LARGE
16319 && GET_CODE (x) == SYMBOL_REF
16320 && SYMBOL_REF_FUNCTION_P (x)
16321 && !SYMBOL_REF_LOCAL_P (x));
16324 /* Determine if a given RTX is a valid constant. We already know this
16325 satisfies CONSTANT_P. */
16327 static bool
16328 ix86_legitimate_constant_p (machine_mode mode, rtx x)
16330 /* Pointer bounds constants are not valid. */
16331 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
16332 return false;
16334 switch (GET_CODE (x))
16336 case CONST:
16337 x = XEXP (x, 0);
16339 if (GET_CODE (x) == PLUS)
16341 if (!CONST_INT_P (XEXP (x, 1)))
16342 return false;
16343 x = XEXP (x, 0);
16346 if (TARGET_MACHO && darwin_local_data_pic (x))
16347 return true;
16349 /* Only some unspecs are valid as "constants". */
16350 if (GET_CODE (x) == UNSPEC)
16351 switch (XINT (x, 1))
16353 case UNSPEC_GOT:
16354 case UNSPEC_GOTOFF:
16355 case UNSPEC_PLTOFF:
16356 return TARGET_64BIT;
16357 case UNSPEC_TPOFF:
16358 case UNSPEC_NTPOFF:
16359 x = XVECEXP (x, 0, 0);
16360 return (GET_CODE (x) == SYMBOL_REF
16361 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16362 case UNSPEC_DTPOFF:
16363 x = XVECEXP (x, 0, 0);
16364 return (GET_CODE (x) == SYMBOL_REF
16365 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
16366 default:
16367 return false;
16370 /* We must have drilled down to a symbol. */
16371 if (GET_CODE (x) == LABEL_REF)
16372 return true;
16373 if (GET_CODE (x) != SYMBOL_REF)
16374 return false;
16375 /* FALLTHRU */
16377 case SYMBOL_REF:
16378 /* TLS symbols are never valid. */
16379 if (SYMBOL_REF_TLS_MODEL (x))
16380 return false;
16382 /* DLLIMPORT symbols are never valid. */
16383 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
16384 && SYMBOL_REF_DLLIMPORT_P (x))
16385 return false;
16387 #if TARGET_MACHO
16388 /* mdynamic-no-pic */
16389 if (MACHO_DYNAMIC_NO_PIC_P)
16390 return machopic_symbol_defined_p (x);
16391 #endif
16393 /* External function address should be loaded
16394 via the GOT slot to avoid PLT. */
16395 if (ix86_force_load_from_GOT_p (x))
16396 return false;
16398 break;
16400 CASE_CONST_SCALAR_INT:
16401 switch (mode)
16403 case TImode:
16404 if (TARGET_64BIT)
16405 return true;
16406 /* FALLTHRU */
16407 case OImode:
16408 case XImode:
16409 if (!standard_sse_constant_p (x, mode))
16410 return false;
16411 default:
16412 break;
16414 break;
16416 case CONST_VECTOR:
16417 if (!standard_sse_constant_p (x, mode))
16418 return false;
16420 default:
16421 break;
16424 /* Otherwise we handle everything else in the move patterns. */
16425 return true;
16428 /* Determine if it's legal to put X into the constant pool. This
16429 is not possible for the address of thread-local symbols, which
16430 is checked above. */
16432 static bool
16433 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
16435 /* We can put any immediate constant in memory. */
16436 switch (GET_CODE (x))
16438 CASE_CONST_ANY:
16439 return false;
16441 default:
16442 break;
16445 return !ix86_legitimate_constant_p (mode, x);
16448 /* True if the symbol is marked as dllimport, or as a stub variable;
16449 otherwise false. */
16451 static bool
16452 is_imported_p (rtx x)
16454 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
16455 || GET_CODE (x) != SYMBOL_REF)
16456 return false;
16458 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
16462 /* Nonzero if the constant value X is a legitimate general operand
16463 when generating PIC code. It is given that flag_pic is on and
16464 that X satisfies CONSTANT_P. */
16466 bool
16467 legitimate_pic_operand_p (rtx x)
16469 rtx inner;
16471 switch (GET_CODE (x))
16473 case CONST:
16474 inner = XEXP (x, 0);
16475 if (GET_CODE (inner) == PLUS
16476 && CONST_INT_P (XEXP (inner, 1)))
16477 inner = XEXP (inner, 0);
16479 /* Only some unspecs are valid as "constants". */
16480 if (GET_CODE (inner) == UNSPEC)
16481 switch (XINT (inner, 1))
16483 case UNSPEC_GOT:
16484 case UNSPEC_GOTOFF:
16485 case UNSPEC_PLTOFF:
16486 return TARGET_64BIT;
16487 case UNSPEC_TPOFF:
16488 x = XVECEXP (inner, 0, 0);
16489 return (GET_CODE (x) == SYMBOL_REF
16490 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16491 case UNSPEC_MACHOPIC_OFFSET:
16492 return legitimate_pic_address_disp_p (x);
16493 default:
16494 return false;
16496 /* FALLTHRU */
16498 case SYMBOL_REF:
16499 case LABEL_REF:
16500 return legitimate_pic_address_disp_p (x);
16502 default:
16503 return true;
16507 /* Determine if a given CONST RTX is a valid memory displacement
16508 in PIC mode. */
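/* For instance, in 64-bit mode a displacement of the form SYM + OFFSET is
   only accepted while OFFSET stays within +/- 16MB (the 16*1024*1024
   bounds below), which keeps it comfortably inside the signed 32-bit
   range used for RIP-relative addressing.  */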
16510 bool
16511 legitimate_pic_address_disp_p (rtx disp)
16513 bool saw_plus;
16515 /* In 64bit mode we can allow direct addresses of symbols and labels
16516 when they are not dynamic symbols. */
16517 if (TARGET_64BIT)
16519 rtx op0 = disp, op1;
16521 switch (GET_CODE (disp))
16523 case LABEL_REF:
16524 return true;
16526 case CONST:
16527 if (GET_CODE (XEXP (disp, 0)) != PLUS)
16528 break;
16529 op0 = XEXP (XEXP (disp, 0), 0);
16530 op1 = XEXP (XEXP (disp, 0), 1);
16531 if (!CONST_INT_P (op1)
16532 || INTVAL (op1) >= 16*1024*1024
16533 || INTVAL (op1) < -16*1024*1024)
16534 break;
16535 if (GET_CODE (op0) == LABEL_REF)
16536 return true;
16537 if (GET_CODE (op0) == CONST
16538 && GET_CODE (XEXP (op0, 0)) == UNSPEC
16539 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
16540 return true;
16541 if (GET_CODE (op0) == UNSPEC
16542 && XINT (op0, 1) == UNSPEC_PCREL)
16543 return true;
16544 if (GET_CODE (op0) != SYMBOL_REF)
16545 break;
16546 /* FALLTHRU */
16548 case SYMBOL_REF:
16549 /* TLS references should always be enclosed in UNSPEC.
16550 The dllimported symbol always needs to be resolved. */
16551 if (SYMBOL_REF_TLS_MODEL (op0)
16552 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
16553 return false;
16555 if (TARGET_PECOFF)
16557 if (is_imported_p (op0))
16558 return true;
16560 if (SYMBOL_REF_FAR_ADDR_P (op0)
16561 || !SYMBOL_REF_LOCAL_P (op0))
16562 break;
16564 /* Function symbols need to be resolved only for
16565 the large model.
16566 For the small model we don't need to resolve anything
16567 here. */
16568 if ((ix86_cmodel != CM_LARGE_PIC
16569 && SYMBOL_REF_FUNCTION_P (op0))
16570 || ix86_cmodel == CM_SMALL_PIC)
16571 return true;
16572 /* Non-external symbols don't need to be resolved for
16573 the large and medium models. */
16574 if ((ix86_cmodel == CM_LARGE_PIC
16575 || ix86_cmodel == CM_MEDIUM_PIC)
16576 && !SYMBOL_REF_EXTERNAL_P (op0))
16577 return true;
16579 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
16580 && (SYMBOL_REF_LOCAL_P (op0)
16581 || (HAVE_LD_PIE_COPYRELOC
16582 && flag_pie
16583 && !SYMBOL_REF_WEAK (op0)
16584 && !SYMBOL_REF_FUNCTION_P (op0)))
16585 && ix86_cmodel != CM_LARGE_PIC)
16586 return true;
16587 break;
16589 default:
16590 break;
16593 if (GET_CODE (disp) != CONST)
16594 return false;
16595 disp = XEXP (disp, 0);
16597 if (TARGET_64BIT)
16599 /* It is unsafe to allow PLUS expressions here; this limits the allowed
16600 distance of GOT table references. We should not need these anyway. */
16601 if (GET_CODE (disp) != UNSPEC
16602 || (XINT (disp, 1) != UNSPEC_GOTPCREL
16603 && XINT (disp, 1) != UNSPEC_GOTOFF
16604 && XINT (disp, 1) != UNSPEC_PCREL
16605 && XINT (disp, 1) != UNSPEC_PLTOFF))
16606 return false;
16608 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
16609 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
16610 return false;
16611 return true;
16614 saw_plus = false;
16615 if (GET_CODE (disp) == PLUS)
16617 if (!CONST_INT_P (XEXP (disp, 1)))
16618 return false;
16619 disp = XEXP (disp, 0);
16620 saw_plus = true;
16623 if (TARGET_MACHO && darwin_local_data_pic (disp))
16624 return true;
16626 if (GET_CODE (disp) != UNSPEC)
16627 return false;
16629 switch (XINT (disp, 1))
16631 case UNSPEC_GOT:
16632 if (saw_plus)
16633 return false;
16634 /* We need to check for both symbols and labels because VxWorks loads
16635 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
16636 details. */
16637 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16638 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
16639 case UNSPEC_GOTOFF:
16640 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
16641 While the ABI also specifies a 32bit relocation, we don't produce it in
16642 the small PIC model at all. */
16643 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16644 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
16645 && !TARGET_64BIT)
16646 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
16647 return false;
16648 case UNSPEC_GOTTPOFF:
16649 case UNSPEC_GOTNTPOFF:
16650 case UNSPEC_INDNTPOFF:
16651 if (saw_plus)
16652 return false;
16653 disp = XVECEXP (disp, 0, 0);
16654 return (GET_CODE (disp) == SYMBOL_REF
16655 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
16656 case UNSPEC_NTPOFF:
16657 disp = XVECEXP (disp, 0, 0);
16658 return (GET_CODE (disp) == SYMBOL_REF
16659 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
16660 case UNSPEC_DTPOFF:
16661 disp = XVECEXP (disp, 0, 0);
16662 return (GET_CODE (disp) == SYMBOL_REF
16663 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
16666 return false;
16669 /* Determine if OP is a suitable RTX for an address register.
16670 Return the naked register if a register or a register subreg is
16671 found, otherwise return NULL_RTX. */
16673 static rtx
16674 ix86_validate_address_register (rtx op)
16676 machine_mode mode = GET_MODE (op);
16678 /* Only SImode or DImode registers can form the address. */
16679 if (mode != SImode && mode != DImode)
16680 return NULL_RTX;
16682 if (REG_P (op))
16683 return op;
16684 else if (SUBREG_P (op))
16686 rtx reg = SUBREG_REG (op);
16688 if (!REG_P (reg))
16689 return NULL_RTX;
16691 mode = GET_MODE (reg);
16693 /* Don't allow SUBREGs that span more than a word. It can
16694 lead to spill failures when the register is one word out
16695 of a two word structure. */
16696 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
16697 return NULL_RTX;
16699 /* Allow only SUBREGs of non-eliminable hard registers. */
16700 if (register_no_elim_operand (reg, mode))
16701 return reg;
16704 /* Op is not a register. */
16705 return NULL_RTX;
16708 /* Recognizes RTL expressions that are valid memory addresses for an
16709 instruction. The MODE argument is the machine mode for the MEM
16710 expression that wants to use this address.
16712 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
16713 convert common non-canonical forms to canonical form so that they will
16714 be recognized. */
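/* As an example of the canonical form relied on here: an address written
   as (plus (ashift (reg) (const_int 2)) (reg)) is expected to arrive as
   (plus (mult (reg) (const_int 4)) (reg)); ix86_legitimize_address below
   performs exactly this shift-to-multiply rewrite.  */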
16716 static bool
16717 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
16719 struct ix86_address parts;
16720 rtx base, index, disp;
16721 HOST_WIDE_INT scale;
16722 addr_space_t seg;
16724 if (ix86_decompose_address (addr, &parts) <= 0)
16725 /* Decomposition failed. */
16726 return false;
16728 base = parts.base;
16729 index = parts.index;
16730 disp = parts.disp;
16731 scale = parts.scale;
16732 seg = parts.seg;
16734 /* Validate base register. */
16735 if (base)
16737 rtx reg = ix86_validate_address_register (base);
16739 if (reg == NULL_RTX)
16740 return false;
16742 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
16743 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
16744 /* Base is not valid. */
16745 return false;
16748 /* Validate index register. */
16749 if (index)
16751 rtx reg = ix86_validate_address_register (index);
16753 if (reg == NULL_RTX)
16754 return false;
16756 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
16757 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
16758 /* Index is not valid. */
16759 return false;
16762 /* Index and base should have the same mode. */
16763 if (base && index
16764 && GET_MODE (base) != GET_MODE (index))
16765 return false;
16767 /* Address override works only on the (%reg) part of %fs:(%reg). */
16768 if (seg != ADDR_SPACE_GENERIC
16769 && ((base && GET_MODE (base) != word_mode)
16770 || (index && GET_MODE (index) != word_mode)))
16771 return false;
16773 /* Validate scale factor. */
16774 if (scale != 1)
16776 if (!index)
16777 /* Scale without index. */
16778 return false;
16780 if (scale != 2 && scale != 4 && scale != 8)
16781 /* Scale is not a valid multiplier. */
16782 return false;
16785 /* Validate displacement. */
16786 if (disp)
16788 if (GET_CODE (disp) == CONST
16789 && GET_CODE (XEXP (disp, 0)) == UNSPEC
16790 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
16791 switch (XINT (XEXP (disp, 0), 1))
16793 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
16794 when used. While the ABI also specifies 32bit relocations, we
16795 don't produce them at all and use IP-relative addressing instead.
16796 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
16797 should be loaded via the GOT. */
16798 case UNSPEC_GOT:
16799 if (!TARGET_64BIT
16800 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16801 goto is_legitimate_pic;
16802 /* FALLTHRU */
16803 case UNSPEC_GOTOFF:
16804 gcc_assert (flag_pic);
16805 if (!TARGET_64BIT)
16806 goto is_legitimate_pic;
16808 /* 64bit address unspec. */
16809 return false;
16811 case UNSPEC_GOTPCREL:
16812 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16813 goto is_legitimate_pic;
16814 /* FALLTHRU */
16815 case UNSPEC_PCREL:
16816 gcc_assert (flag_pic);
16817 goto is_legitimate_pic;
16819 case UNSPEC_GOTTPOFF:
16820 case UNSPEC_GOTNTPOFF:
16821 case UNSPEC_INDNTPOFF:
16822 case UNSPEC_NTPOFF:
16823 case UNSPEC_DTPOFF:
16824 break;
16826 case UNSPEC_STACK_CHECK:
16827 gcc_assert (flag_split_stack);
16828 break;
16830 default:
16831 /* Invalid address unspec. */
16832 return false;
16835 else if (SYMBOLIC_CONST (disp)
16836 && (flag_pic
16837 || (TARGET_MACHO
16838 #if TARGET_MACHO
16839 && MACHOPIC_INDIRECT
16840 && !machopic_operand_p (disp)
16841 #endif
16845 is_legitimate_pic:
16846 if (TARGET_64BIT && (index || base))
16848 /* foo@dtpoff(%rX) is ok. */
16849 if (GET_CODE (disp) != CONST
16850 || GET_CODE (XEXP (disp, 0)) != PLUS
16851 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16852 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16853 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16854 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16855 /* Non-constant pic memory reference. */
16856 return false;
16858 else if ((!TARGET_MACHO || flag_pic)
16859 && ! legitimate_pic_address_disp_p (disp))
16860 /* Displacement is an invalid pic construct. */
16861 return false;
16862 #if TARGET_MACHO
16863 else if (MACHO_DYNAMIC_NO_PIC_P
16864 && !ix86_legitimate_constant_p (Pmode, disp))
16865 /* displacement must be referenced via non_lazy_pointer */
16866 return false;
16867 #endif
16869 /* This code used to verify that a symbolic pic displacement
16870 includes the pic_offset_table_rtx register.
16872 While this is a good idea, unfortunately these constructs may
16873 be created by the "adds using lea" optimization for incorrect
16874 code like:
16876 int a;
16877 int foo(int i)
16879 return *(&a+i);
16882 This code is nonsensical, but results in addressing the
16883 GOT table with a pic_offset_table_rtx base. We can't
16884 easily refuse it, since it gets matched by the
16885 "addsi3" pattern, which later gets split to lea when
16886 the output register differs from the input. While this
16887 could be handled by a separate addsi pattern for this case
16888 that never results in lea, disabling this test seems to be
16889 the easier and correct fix for the crash. */
16891 else if (GET_CODE (disp) != LABEL_REF
16892 && !CONST_INT_P (disp)
16893 && (GET_CODE (disp) != CONST
16894 || !ix86_legitimate_constant_p (Pmode, disp))
16895 && (GET_CODE (disp) != SYMBOL_REF
16896 || !ix86_legitimate_constant_p (Pmode, disp)))
16897 /* Displacement is not constant. */
16898 return false;
16899 else if (TARGET_64BIT
16900 && !x86_64_immediate_operand (disp, VOIDmode))
16901 /* Displacement is out of range. */
16902 return false;
16903 /* In x32 mode, constant addresses are sign extended to 64bit, so
16904 we have to reject addresses in the range 0x80000000 to 0xffffffff. */
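   /* E.g. the constant address 0x80000000 would sign extend to
      0xffffffff80000000 when used as a 64-bit effective address, hence the
      rejection below.  */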
16905 else if (TARGET_X32 && !(index || base)
16906 && CONST_INT_P (disp)
16907 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16908 return false;
16911 /* Everything looks valid. */
16912 return true;
16915 /* Determine if a given RTX is a valid constant address. */
16917 bool
16918 constant_address_p (rtx x)
16920 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16923 /* Return a unique alias set for the GOT. */
16925 static alias_set_type
16926 ix86_GOT_alias_set (void)
16928 static alias_set_type set = -1;
16929 if (set == -1)
16930 set = new_alias_set ();
16931 return set;
16934 /* Return a legitimate reference for ORIG (an address) using the
16935 register REG. If REG is 0, a new pseudo is generated.
16937 There are two types of references that must be handled:
16939 1. Global data references must load the address from the GOT, via
16940 the PIC reg. An insn is emitted to do this load, and the reg is
16941 returned.
16943 2. Static data references, constant pool addresses, and code labels
16944 compute the address as an offset from the GOT, whose base is in
16945 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16946 differentiate them from global data objects. The returned
16947 address is the PIC reg + an unspec constant.
16949 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16950 reg also appears in the address. */
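/* As a rough sketch of the two cases for a 32-bit ELF target: a global
   symbol SYM is rewritten to a load from the GOT, approximately
   (mem (plus pic_reg (const (unspec [SYM] UNSPEC_GOT)))), while a local
   symbol becomes an offset from the PIC base, approximately
   (plus pic_reg (const (unspec [SYM] UNSPEC_GOTOFF))).  */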
16952 static rtx
16953 legitimize_pic_address (rtx orig, rtx reg)
16955 rtx addr = orig;
16956 rtx new_rtx = orig;
16958 #if TARGET_MACHO
16959 if (TARGET_MACHO && !TARGET_64BIT)
16961 if (reg == 0)
16962 reg = gen_reg_rtx (Pmode);
16963 /* Use the generic Mach-O PIC machinery. */
16964 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16966 #endif
16968 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16970 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16971 if (tmp)
16972 return tmp;
16975 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16976 new_rtx = addr;
16977 else if ((!TARGET_64BIT
16978 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16979 && !TARGET_PECOFF
16980 && gotoff_operand (addr, Pmode))
16982 /* This symbol may be referenced via a displacement
16983 from the PIC base address (@GOTOFF). */
16984 if (GET_CODE (addr) == CONST)
16985 addr = XEXP (addr, 0);
16987 if (GET_CODE (addr) == PLUS)
16989 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16990 UNSPEC_GOTOFF);
16991 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16993 else
16994 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
16996 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16998 if (TARGET_64BIT)
16999 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17001 if (reg != 0)
17003 gcc_assert (REG_P (reg));
17004 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
17005 new_rtx, reg, 1, OPTAB_DIRECT);
17007 else
17008 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17010 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
17011 /* We can't use @GOTOFF for text labels
17012 on VxWorks, see gotoff_operand. */
17013 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
17015 rtx tmp = legitimize_pe_coff_symbol (addr, true);
17016 if (tmp)
17017 return tmp;
17019 /* For x64 PE-COFF there is no GOT table,
17020 so we use the address directly. */
17021 if (TARGET_64BIT && TARGET_PECOFF)
17023 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
17024 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17026 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
17028 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
17029 UNSPEC_GOTPCREL);
17030 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17031 new_rtx = gen_const_mem (Pmode, new_rtx);
17032 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17034 else
17036 /* This symbol must be referenced via a load
17037 from the Global Offset Table (@GOT). */
17038 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
17039 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17040 if (TARGET_64BIT)
17041 new_rtx = force_reg (Pmode, new_rtx);
17042 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17043 new_rtx = gen_const_mem (Pmode, new_rtx);
17044 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17047 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17049 else
17051 if (CONST_INT_P (addr)
17052 && !x86_64_immediate_operand (addr, VOIDmode))
17053 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
17054 else if (GET_CODE (addr) == CONST)
17056 addr = XEXP (addr, 0);
17058 /* We must match stuff we generate before. Assume the only
17059 unspecs that can get here are ours. Not that we could do
17060 anything with them anyway.... */
17061 if (GET_CODE (addr) == UNSPEC
17062 || (GET_CODE (addr) == PLUS
17063 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
17064 return orig;
17065 gcc_assert (GET_CODE (addr) == PLUS);
17068 if (GET_CODE (addr) == PLUS)
17070 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
17072 /* Check first to see if this is a constant
17073 offset from a @GOTOFF symbol reference. */
17074 if (!TARGET_PECOFF
17075 && gotoff_operand (op0, Pmode)
17076 && CONST_INT_P (op1))
17078 if (!TARGET_64BIT)
17080 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
17081 UNSPEC_GOTOFF);
17082 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
17083 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17085 if (reg != 0)
17087 gcc_assert (REG_P (reg));
17088 new_rtx = expand_simple_binop (Pmode, PLUS,
17089 pic_offset_table_rtx,
17090 new_rtx, reg, 1,
17091 OPTAB_DIRECT);
17093 else
17094 new_rtx
17095 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17097 else
17099 if (INTVAL (op1) < -16*1024*1024
17100 || INTVAL (op1) >= 16*1024*1024)
17102 if (!x86_64_immediate_operand (op1, Pmode))
17103 op1 = force_reg (Pmode, op1);
17105 new_rtx
17106 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
17110 else
17112 rtx base = legitimize_pic_address (op0, reg);
17113 machine_mode mode = GET_MODE (base);
17114 new_rtx
17115 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
17117 if (CONST_INT_P (new_rtx))
17119 if (INTVAL (new_rtx) < -16*1024*1024
17120 || INTVAL (new_rtx) >= 16*1024*1024)
17122 if (!x86_64_immediate_operand (new_rtx, mode))
17123 new_rtx = force_reg (mode, new_rtx);
17125 new_rtx
17126 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
17128 else
17129 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
17131 else
17133 /* For %rip addressing, we have to use
17134 just disp32, not base nor index. */
17135 if (TARGET_64BIT
17136 && (GET_CODE (base) == SYMBOL_REF
17137 || GET_CODE (base) == LABEL_REF))
17138 base = force_reg (mode, base);
17139 if (GET_CODE (new_rtx) == PLUS
17140 && CONSTANT_P (XEXP (new_rtx, 1)))
17142 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
17143 new_rtx = XEXP (new_rtx, 1);
17145 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
17150 return new_rtx;
17153 /* Load the thread pointer. If TO_REG is true, force it into a register. */
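/* The thread pointer is represented as (unspec [(const_int 0)] UNSPEC_TP);
   on typical GNU/Linux configurations this ends up as a %fs-relative
   (64-bit) or %gs-relative (32-bit) access, cf. DEFAULT_TLS_SEG_REG.  */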
17155 static rtx
17156 get_thread_pointer (machine_mode tp_mode, bool to_reg)
17158 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
17160 if (GET_MODE (tp) != tp_mode)
17162 gcc_assert (GET_MODE (tp) == SImode);
17163 gcc_assert (tp_mode == DImode);
17165 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
17168 if (to_reg)
17169 tp = copy_to_mode_reg (tp_mode, tp);
17171 return tp;
17174 /* Construct the SYMBOL_REF for the tls_get_addr function. */
17176 static GTY(()) rtx ix86_tls_symbol;
17178 static rtx
17179 ix86_tls_get_addr (void)
17181 if (!ix86_tls_symbol)
17183 const char *sym
17184 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
17185 ? "___tls_get_addr" : "__tls_get_addr");
17187 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
17190 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
17192 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
17193 UNSPEC_PLTOFF);
17194 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
17195 gen_rtx_CONST (Pmode, unspec));
17198 return ix86_tls_symbol;
17201 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
17203 static GTY(()) rtx ix86_tls_module_base_symbol;
17206 ix86_tls_module_base (void)
17208 if (!ix86_tls_module_base_symbol)
17210 ix86_tls_module_base_symbol
17211 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
17213 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
17214 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
17217 return ix86_tls_module_base_symbol;
17220 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
17221 false if we expect this to be used for a memory address and true if
17222 we expect to load the address into a register. */
17224 static rtx
17225 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
17227 rtx dest, base, off;
17228 rtx pic = NULL_RTX, tp = NULL_RTX;
17229 machine_mode tp_mode = Pmode;
17230 int type;
17232 /* Fall back to the global dynamic model if the tool chain cannot support
17233 local dynamic. */
17234 if (TARGET_SUN_TLS && !TARGET_64BIT
17235 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
17236 && model == TLS_MODEL_LOCAL_DYNAMIC)
17237 model = TLS_MODEL_GLOBAL_DYNAMIC;
17239 switch (model)
17241 case TLS_MODEL_GLOBAL_DYNAMIC:
17242 dest = gen_reg_rtx (Pmode);
17244 if (!TARGET_64BIT)
17246 if (flag_pic && !TARGET_PECOFF)
17247 pic = pic_offset_table_rtx;
17248 else
17250 pic = gen_reg_rtx (Pmode);
17251 emit_insn (gen_set_got (pic));
17255 if (TARGET_GNU2_TLS)
17257 if (TARGET_64BIT)
17258 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
17259 else
17260 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
17262 tp = get_thread_pointer (Pmode, true);
17263 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
17265 if (GET_MODE (x) != Pmode)
17266 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17268 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17270 else
17272 rtx caddr = ix86_tls_get_addr ();
17274 if (TARGET_64BIT)
17276 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17277 rtx_insn *insns;
17279 start_sequence ();
17280 emit_call_insn
17281 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
17282 insns = get_insns ();
17283 end_sequence ();
17285 if (GET_MODE (x) != Pmode)
17286 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17288 RTL_CONST_CALL_P (insns) = 1;
17289 emit_libcall_block (insns, dest, rax, x);
17291 else
17292 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
17294 break;
17296 case TLS_MODEL_LOCAL_DYNAMIC:
17297 base = gen_reg_rtx (Pmode);
17299 if (!TARGET_64BIT)
17301 if (flag_pic)
17302 pic = pic_offset_table_rtx;
17303 else
17305 pic = gen_reg_rtx (Pmode);
17306 emit_insn (gen_set_got (pic));
17310 if (TARGET_GNU2_TLS)
17312 rtx tmp = ix86_tls_module_base ();
17314 if (TARGET_64BIT)
17315 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
17316 else
17317 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
17319 tp = get_thread_pointer (Pmode, true);
17320 set_unique_reg_note (get_last_insn (), REG_EQUAL,
17321 gen_rtx_MINUS (Pmode, tmp, tp));
17323 else
17325 rtx caddr = ix86_tls_get_addr ();
17327 if (TARGET_64BIT)
17329 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17330 rtx_insn *insns;
17331 rtx eqv;
17333 start_sequence ();
17334 emit_call_insn
17335 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
17336 insns = get_insns ();
17337 end_sequence ();
17339 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
17340 share the LD_BASE result with other LD model accesses. */
17341 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
17342 UNSPEC_TLS_LD_BASE);
17344 RTL_CONST_CALL_P (insns) = 1;
17345 emit_libcall_block (insns, base, rax, eqv);
17347 else
17348 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
17351 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
17352 off = gen_rtx_CONST (Pmode, off);
17354 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
17356 if (TARGET_GNU2_TLS)
17358 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
17360 if (GET_MODE (x) != Pmode)
17361 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17363 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17365 break;
17367 case TLS_MODEL_INITIAL_EXEC:
17368 if (TARGET_64BIT)
17370 if (TARGET_SUN_TLS && !TARGET_X32)
17372 /* The Sun linker took the AMD64 TLS spec literally
17373 and can only handle %rax as the destination of the
17374 initial-exec code sequence. */
17376 dest = gen_reg_rtx (DImode);
17377 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
17378 return dest;
17381 /* Generate DImode references to avoid %fs:(%reg32)
17382 problems and a linker IE->LE relaxation bug. */
17383 tp_mode = DImode;
17384 pic = NULL;
17385 type = UNSPEC_GOTNTPOFF;
17387 else if (flag_pic)
17389 pic = pic_offset_table_rtx;
17390 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
17392 else if (!TARGET_ANY_GNU_TLS)
17394 pic = gen_reg_rtx (Pmode);
17395 emit_insn (gen_set_got (pic));
17396 type = UNSPEC_GOTTPOFF;
17398 else
17400 pic = NULL;
17401 type = UNSPEC_INDNTPOFF;
17404 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
17405 off = gen_rtx_CONST (tp_mode, off);
17406 if (pic)
17407 off = gen_rtx_PLUS (tp_mode, pic, off);
17408 off = gen_const_mem (tp_mode, off);
17409 set_mem_alias_set (off, ix86_GOT_alias_set ());
17411 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17413 base = get_thread_pointer (tp_mode,
17414 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17415 off = force_reg (tp_mode, off);
17416 dest = gen_rtx_PLUS (tp_mode, base, off);
17417 if (tp_mode != Pmode)
17418 dest = convert_to_mode (Pmode, dest, 1);
17420 else
17422 base = get_thread_pointer (Pmode, true);
17423 dest = gen_reg_rtx (Pmode);
17424 emit_insn (ix86_gen_sub3 (dest, base, off));
17426 break;
17428 case TLS_MODEL_LOCAL_EXEC:
17429 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
17430 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17431 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
17432 off = gen_rtx_CONST (Pmode, off);
17434 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17436 base = get_thread_pointer (Pmode,
17437 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17438 return gen_rtx_PLUS (Pmode, base, off);
17440 else
17442 base = get_thread_pointer (Pmode, true);
17443 dest = gen_reg_rtx (Pmode);
17444 emit_insn (ix86_gen_sub3 (dest, base, off));
17446 break;
17448 default:
17449 gcc_unreachable ();
17452 return dest;
17455 /* Create or return the unique __imp_DECL dllimport symbol corresponding
17456 to symbol DECL if BEIMPORT is true. Otherwise create or return the
17457 unique refptr-DECL symbol corresponding to symbol DECL. */
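/* For instance, assuming a decl whose stripped assembler name is "foo",
   the symbol created below is "*__imp_foo" or "*__imp__foo" in the
   dllimport case (depending on FASTCALL_PREFIX and user_label_prefix),
   and "*.refptr.foo" or "*refptr.foo" in the refptr case.  */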
17459 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
17461 static inline hashval_t hash (tree_map *m) { return m->hash; }
17462 static inline bool
17463 equal (tree_map *a, tree_map *b)
17465 return a->base.from == b->base.from;
17468 static int
17469 keep_cache_entry (tree_map *&m)
17471 return ggc_marked_p (m->base.from);
17475 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
17477 static tree
17478 get_dllimport_decl (tree decl, bool beimport)
17480 struct tree_map *h, in;
17481 const char *name;
17482 const char *prefix;
17483 size_t namelen, prefixlen;
17484 char *imp_name;
17485 tree to;
17486 rtx rtl;
17488 if (!dllimport_map)
17489 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
17491 in.hash = htab_hash_pointer (decl);
17492 in.base.from = decl;
17493 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
17494 h = *loc;
17495 if (h)
17496 return h->to;
17498 *loc = h = ggc_alloc<tree_map> ();
17499 h->hash = in.hash;
17500 h->base.from = decl;
17501 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
17502 VAR_DECL, NULL, ptr_type_node);
17503 DECL_ARTIFICIAL (to) = 1;
17504 DECL_IGNORED_P (to) = 1;
17505 DECL_EXTERNAL (to) = 1;
17506 TREE_READONLY (to) = 1;
17508 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
17509 name = targetm.strip_name_encoding (name);
17510 if (beimport)
17511 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
17512 ? "*__imp_" : "*__imp__";
17513 else
17514 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
17515 namelen = strlen (name);
17516 prefixlen = strlen (prefix);
17517 imp_name = (char *) alloca (namelen + prefixlen + 1);
17518 memcpy (imp_name, prefix, prefixlen);
17519 memcpy (imp_name + prefixlen, name, namelen + 1);
17521 name = ggc_alloc_string (imp_name, namelen + prefixlen);
17522 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
17523 SET_SYMBOL_REF_DECL (rtl, to);
17524 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
17525 if (!beimport)
17527 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
17528 #ifdef SUB_TARGET_RECORD_STUB
17529 SUB_TARGET_RECORD_STUB (name);
17530 #endif
17533 rtl = gen_const_mem (Pmode, rtl);
17534 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
17536 SET_DECL_RTL (to, rtl);
17537 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
17539 return to;
17542 /* Expand SYMBOL into its corresponding far-address symbol.
17543 WANT_REG is true if we require the result be a register. */
17545 static rtx
17546 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
17548 tree imp_decl;
17549 rtx x;
17551 gcc_assert (SYMBOL_REF_DECL (symbol));
17552 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
17554 x = DECL_RTL (imp_decl);
17555 if (want_reg)
17556 x = force_reg (Pmode, x);
17557 return x;
17560 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
17561 true if we require the result be a register. */
17563 static rtx
17564 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
17566 tree imp_decl;
17567 rtx x;
17569 gcc_assert (SYMBOL_REF_DECL (symbol));
17570 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
17572 x = DECL_RTL (imp_decl);
17573 if (want_reg)
17574 x = force_reg (Pmode, x);
17575 return x;
17578 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
17579 is true if we require the result be a register. */
17581 static rtx
17582 legitimize_pe_coff_symbol (rtx addr, bool inreg)
17584 if (!TARGET_PECOFF)
17585 return NULL_RTX;
17587 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17589 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
17590 return legitimize_dllimport_symbol (addr, inreg);
17591 if (GET_CODE (addr) == CONST
17592 && GET_CODE (XEXP (addr, 0)) == PLUS
17593 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17594 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
17596 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
17597 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17601 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
17602 return NULL_RTX;
17603 if (GET_CODE (addr) == SYMBOL_REF
17604 && !is_imported_p (addr)
17605 && SYMBOL_REF_EXTERNAL_P (addr)
17606 && SYMBOL_REF_DECL (addr))
17607 return legitimize_pe_coff_extern_decl (addr, inreg);
17609 if (GET_CODE (addr) == CONST
17610 && GET_CODE (XEXP (addr, 0)) == PLUS
17611 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17612 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
17613 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
17614 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
17616 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
17617 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17619 return NULL_RTX;
17622 /* Try machine-dependent ways of modifying an illegitimate address
17623 to be legitimate. If we find one, return the new, valid address.
17624 This macro is used in only one place: `memory_address' in explow.c.
17626 OLDX is the address as it was before break_out_memory_refs was called.
17627 In some cases it is useful to look at this to decide what needs to be done.
17629 It is always safe for this macro to do nothing. It exists to recognize
17630 opportunities to optimize the output.
17632 For the 80386, we handle X+REG by loading X into a register R and
17633 using R+REG. R will go in a general reg and indexing will be used.
17634 However, if REG is a broken-out memory address or multiplication,
17635 nothing needs to be done because REG can certainly go in a general reg.
17637 When -fpic is used, special handling is needed for symbolic references.
17638 See comments by legitimize_pic_address in i386.c for details. */
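/* One concrete rewrite performed below, as a sketch: a sum such as
   (plus (mult (reg) (const_int 4)) (plus (reg) (const_int 8))), as created
   by virtual register instantiation or register elimination, is
   reassociated into (plus (plus (mult (reg) (const_int 4)) (reg))
   (const_int 8)), i.e. the base + index*scale + disp shape that
   ix86_decompose_address expects.  */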
17640 static rtx
17641 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
17643 bool changed = false;
17644 unsigned log;
17646 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
17647 if (log)
17648 return legitimize_tls_address (x, (enum tls_model) log, false);
17649 if (GET_CODE (x) == CONST
17650 && GET_CODE (XEXP (x, 0)) == PLUS
17651 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
17652 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
17654 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
17655 (enum tls_model) log, false);
17656 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
17659 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17661 rtx tmp = legitimize_pe_coff_symbol (x, true);
17662 if (tmp)
17663 return tmp;
17666 if (flag_pic && SYMBOLIC_CONST (x))
17667 return legitimize_pic_address (x, 0);
17669 #if TARGET_MACHO
17670 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
17671 return machopic_indirect_data_reference (x, 0);
17672 #endif
17674 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
17675 if (GET_CODE (x) == ASHIFT
17676 && CONST_INT_P (XEXP (x, 1))
17677 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
17679 changed = true;
17680 log = INTVAL (XEXP (x, 1));
17681 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
17682 GEN_INT (1 << log));
17685 if (GET_CODE (x) == PLUS)
17687 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17689 if (GET_CODE (XEXP (x, 0)) == ASHIFT
17690 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17691 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
17693 changed = true;
17694 log = INTVAL (XEXP (XEXP (x, 0), 1));
17695 XEXP (x, 0) = gen_rtx_MULT (Pmode,
17696 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
17697 GEN_INT (1 << log));
17700 if (GET_CODE (XEXP (x, 1)) == ASHIFT
17701 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
17702 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
17704 changed = true;
17705 log = INTVAL (XEXP (XEXP (x, 1), 1));
17706 XEXP (x, 1) = gen_rtx_MULT (Pmode,
17707 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
17708 GEN_INT (1 << log));
17711 /* Put multiply first if it isn't already. */
17712 if (GET_CODE (XEXP (x, 1)) == MULT)
17714 std::swap (XEXP (x, 0), XEXP (x, 1));
17715 changed = true;
17718 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
17719 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
17720 created by virtual register instantiation, register elimination, and
17721 similar optimizations. */
17722 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
17724 changed = true;
17725 x = gen_rtx_PLUS (Pmode,
17726 gen_rtx_PLUS (Pmode, XEXP (x, 0),
17727 XEXP (XEXP (x, 1), 0)),
17728 XEXP (XEXP (x, 1), 1));
17731 /* Canonicalize
17732 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
17733 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
17734 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
17735 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
17736 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
17737 && CONSTANT_P (XEXP (x, 1)))
17739 rtx constant;
17740 rtx other = NULL_RTX;
17742 if (CONST_INT_P (XEXP (x, 1)))
17744 constant = XEXP (x, 1);
17745 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17747 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17749 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17750 other = XEXP (x, 1);
17752 else
17753 constant = 0;
17755 if (constant)
17757 changed = true;
17758 x = gen_rtx_PLUS (Pmode,
17759 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17760 XEXP (XEXP (XEXP (x, 0), 1), 0)),
17761 plus_constant (Pmode, other,
17762 INTVAL (constant)));
17766 if (changed && ix86_legitimate_address_p (mode, x, false))
17767 return x;
17769 if (GET_CODE (XEXP (x, 0)) == MULT)
17771 changed = true;
17772 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17775 if (GET_CODE (XEXP (x, 1)) == MULT)
17777 changed = true;
17778 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17781 if (changed
17782 && REG_P (XEXP (x, 1))
17783 && REG_P (XEXP (x, 0)))
17784 return x;
17786 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17788 changed = true;
17789 x = legitimize_pic_address (x, 0);
17792 if (changed && ix86_legitimate_address_p (mode, x, false))
17793 return x;
17795 if (REG_P (XEXP (x, 0)))
17797 rtx temp = gen_reg_rtx (Pmode);
17798 rtx val = force_operand (XEXP (x, 1), temp);
17799 if (val != temp)
17801 val = convert_to_mode (Pmode, val, 1);
17802 emit_move_insn (temp, val);
17805 XEXP (x, 1) = temp;
17806 return x;
17809 else if (REG_P (XEXP (x, 1)))
17811 rtx temp = gen_reg_rtx (Pmode);
17812 rtx val = force_operand (XEXP (x, 0), temp);
17813 if (val != temp)
17815 val = convert_to_mode (Pmode, val, 1);
17816 emit_move_insn (temp, val);
17819 XEXP (x, 0) = temp;
17820 return x;
17824 return x;
17827 /* Print an integer constant expression in assembler syntax. Addition
17828 and subtraction are the only arithmetic that may appear in these
17829 expressions. FILE is the stdio stream to write to, X is the rtx, and
17830 CODE is the operand print code from the output string. */
17832 static void
17833 output_pic_addr_const (FILE *file, rtx x, int code)
17835 char buf[256];
17837 switch (GET_CODE (x))
17839 case PC:
17840 gcc_assert (flag_pic);
17841 putc ('.', file);
17842 break;
17844 case SYMBOL_REF:
17845 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17846 output_addr_const (file, x);
17847 else
17849 const char *name = XSTR (x, 0);
17851 /* Mark the decl as referenced so that cgraph will
17852 output the function. */
17853 if (SYMBOL_REF_DECL (x))
17854 mark_decl_referenced (SYMBOL_REF_DECL (x));
17856 #if TARGET_MACHO
17857 if (MACHOPIC_INDIRECT
17858 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17859 name = machopic_indirection_name (x, /*stub_p=*/true);
17860 #endif
17861 assemble_name (file, name);
17863 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17864 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17865 fputs ("@PLT", file);
17866 break;
17868 case LABEL_REF:
17869 x = XEXP (x, 0);
17870 /* FALLTHRU */
17871 case CODE_LABEL:
17872 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17873 assemble_name (asm_out_file, buf);
17874 break;
17876 case CONST_INT:
17877 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17878 break;
17880 case CONST:
17881 /* This used to output parentheses around the expression,
17882 but that does not work on the 386 (either ATT or BSD assembler). */
17883 output_pic_addr_const (file, XEXP (x, 0), code);
17884 break;
17886 case CONST_DOUBLE:
17887 /* We can't handle floating point constants;
17888 TARGET_PRINT_OPERAND must handle them. */
17889 output_operand_lossage ("floating constant misused");
17890 break;
17892 case PLUS:
17893 /* Some assemblers need integer constants to appear first. */
17894 if (CONST_INT_P (XEXP (x, 0)))
17896 output_pic_addr_const (file, XEXP (x, 0), code);
17897 putc ('+', file);
17898 output_pic_addr_const (file, XEXP (x, 1), code);
17900 else
17902 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17903 output_pic_addr_const (file, XEXP (x, 1), code);
17904 putc ('+', file);
17905 output_pic_addr_const (file, XEXP (x, 0), code);
17907 break;
17909 case MINUS:
17910 if (!TARGET_MACHO)
17911 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17912 output_pic_addr_const (file, XEXP (x, 0), code);
17913 putc ('-', file);
17914 output_pic_addr_const (file, XEXP (x, 1), code);
17915 if (!TARGET_MACHO)
17916 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17917 break;
17919 case UNSPEC:
17920 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
17922 bool f = i386_asm_output_addr_const_extra (file, x);
17923 gcc_assert (f);
17924 break;
17927 gcc_assert (XVECLEN (x, 0) == 1);
17928 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17929 switch (XINT (x, 1))
17931 case UNSPEC_GOT:
17932 fputs ("@GOT", file);
17933 break;
17934 case UNSPEC_GOTOFF:
17935 fputs ("@GOTOFF", file);
17936 break;
17937 case UNSPEC_PLTOFF:
17938 fputs ("@PLTOFF", file);
17939 break;
17940 case UNSPEC_PCREL:
17941 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17942 "(%rip)" : "[rip]", file);
17943 break;
17944 case UNSPEC_GOTPCREL:
17945 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17946 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17947 break;
17948 case UNSPEC_GOTTPOFF:
17949 /* FIXME: This might be @TPOFF in Sun ld too. */
17950 fputs ("@gottpoff", file);
17951 break;
17952 case UNSPEC_TPOFF:
17953 fputs ("@tpoff", file);
17954 break;
17955 case UNSPEC_NTPOFF:
17956 if (TARGET_64BIT)
17957 fputs ("@tpoff", file);
17958 else
17959 fputs ("@ntpoff", file);
17960 break;
17961 case UNSPEC_DTPOFF:
17962 fputs ("@dtpoff", file);
17963 break;
17964 case UNSPEC_GOTNTPOFF:
17965 if (TARGET_64BIT)
17966 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17967 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17968 else
17969 fputs ("@gotntpoff", file);
17970 break;
17971 case UNSPEC_INDNTPOFF:
17972 fputs ("@indntpoff", file);
17973 break;
17974 #if TARGET_MACHO
17975 case UNSPEC_MACHOPIC_OFFSET:
17976 putc ('-', file);
17977 machopic_output_function_base_name (file);
17978 break;
17979 #endif
17980 default:
17981 output_operand_lossage ("invalid UNSPEC as operand");
17982 break;
17984 break;
17986 default:
17987 output_operand_lossage ("invalid expression as operand");
17991 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17992 We need to emit DTP-relative relocations. */
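/* For a symbol FOO this emits, roughly, "ASM_LONG FOO@dtpoff" when SIZE
   is 4 and "ASM_LONG FOO@dtpoff, 0" when SIZE is 8, marking the value as
   DTP-relative.  */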
17994 static void ATTRIBUTE_UNUSED
17995 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
17997 fputs (ASM_LONG, file);
17998 output_addr_const (file, x);
17999 fputs ("@dtpoff", file);
18000 switch (size)
18002 case 4:
18003 break;
18004 case 8:
18005 fputs (", 0", file);
18006 break;
18007 default:
18008 gcc_unreachable ();
18012 /* Return true if X is a representation of the PIC register. This copes
18013 with calls from ix86_find_base_term, where the register might have
18014 been replaced by a cselib value. */
18016 static bool
18017 ix86_pic_register_p (rtx x)
18019 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
18020 return (pic_offset_table_rtx
18021 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
18022 else if (!REG_P (x))
18023 return false;
18024 else if (pic_offset_table_rtx)
18026 if (REGNO (x) == REGNO (pic_offset_table_rtx))
18027 return true;
18028 if (HARD_REGISTER_P (x)
18029 && !HARD_REGISTER_P (pic_offset_table_rtx)
18030 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
18031 return true;
18032 return false;
18034 else
18035 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
18038 /* Helper function for ix86_delegitimize_address.
18039 Attempt to delegitimize TLS local-exec accesses. */
18041 static rtx
18042 ix86_delegitimize_tls_address (rtx orig_x)
18044 rtx x = orig_x, unspec;
18045 struct ix86_address addr;
18047 if (!TARGET_TLS_DIRECT_SEG_REFS)
18048 return orig_x;
18049 if (MEM_P (x))
18050 x = XEXP (x, 0);
18051 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
18052 return orig_x;
18053 if (ix86_decompose_address (x, &addr) == 0
18054 || addr.seg != DEFAULT_TLS_SEG_REG
18055 || addr.disp == NULL_RTX
18056 || GET_CODE (addr.disp) != CONST)
18057 return orig_x;
18058 unspec = XEXP (addr.disp, 0);
18059 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
18060 unspec = XEXP (unspec, 0);
18061 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
18062 return orig_x;
18063 x = XVECEXP (unspec, 0, 0);
18064 gcc_assert (GET_CODE (x) == SYMBOL_REF);
18065 if (unspec != XEXP (addr.disp, 0))
18066 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
18067 if (addr.index)
18069 rtx idx = addr.index;
18070 if (addr.scale != 1)
18071 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
18072 x = gen_rtx_PLUS (Pmode, idx, x);
18074 if (addr.base)
18075 x = gen_rtx_PLUS (Pmode, addr.base, x);
18076 if (MEM_P (orig_x))
18077 x = replace_equiv_address_nv (orig_x, x);
18078 return x;
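/* Illustrative sketch (hypothetical code, assuming a 32-bit target with
   direct TLS segment references): a local-exec access such as

       movl %gs:foo@ntpoff, %eax

   reaches this routine as an address whose displacement wraps the symbol
   in an UNSPEC_NTPOFF; the code above strips that wrapper and returns
   (symbol_ref foo), re-adding any base or index that was present.  */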
18081 /* In the name of slightly smaller debug output, and to cater to
18082 general assembler lossage, recognize PIC+GOTOFF and turn it back
18083 into a direct symbol reference.
18085 On Darwin, this is necessary to avoid a crash, because Darwin
18086 has a different PIC label for each routine but the DWARF debugging
18087 information is not associated with any particular routine, so it's
18088 necessary to remove references to the PIC label from RTL stored by
18089 the DWARF output code.
18091 This helper is used in the normal ix86_delegitimize_address
18092 entrypoint (e.g. used in the target delegitimization hook) and
18093 in ix86_find_base_term. As a compile-time memory optimization, we
18094 avoid allocating rtxes that will not change the outcome for the
18095 callers (find_base_value and find_base_term). */
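/* A minimal sketch of the transformation described above (hypothetical
   RTL, for illustration only): a -m32 PIC reference of the form

       (plus (reg ebx) (const (unspec [(symbol_ref foo)] UNSPEC_GOTOFF)))

   is turned back into (symbol_ref foo), with any register or constant
   addend that was present re-applied around the result.  */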
18097 static inline rtx
18098 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
18100 rtx orig_x = delegitimize_mem_from_attrs (x);
18101 /* addend is NULL or some rtx if x is something+GOTOFF where
18102 something doesn't include the PIC register. */
18103 rtx addend = NULL_RTX;
18104 /* reg_addend is NULL or a multiple of some register. */
18105 rtx reg_addend = NULL_RTX;
18106 /* const_addend is NULL or a const_int. */
18107 rtx const_addend = NULL_RTX;
18108 /* This is the result, or NULL. */
18109 rtx result = NULL_RTX;
18111 x = orig_x;
18113 if (MEM_P (x))
18114 x = XEXP (x, 0);
18116 if (TARGET_64BIT)
18118 if (GET_CODE (x) == CONST
18119 && GET_CODE (XEXP (x, 0)) == PLUS
18120 && GET_MODE (XEXP (x, 0)) == Pmode
18121 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
18122 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
18123 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
18125 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
18126 base. A CONST can't be arg_pointer_rtx based. */
18127 if (base_term_p && MEM_P (orig_x))
18128 return orig_x;
18129 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
18130 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
18131 if (MEM_P (orig_x))
18132 x = replace_equiv_address_nv (orig_x, x);
18133 return x;
18136 if (GET_CODE (x) == CONST
18137 && GET_CODE (XEXP (x, 0)) == UNSPEC
18138 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
18139 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
18140 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
18142 x = XVECEXP (XEXP (x, 0), 0, 0);
18143 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
18145 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
18146 if (x == NULL_RTX)
18147 return orig_x;
18149 return x;
18152 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
18153 return ix86_delegitimize_tls_address (orig_x);
18155 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
18156 and -mcmodel=medium -fpic. */
18159 if (GET_CODE (x) != PLUS
18160 || GET_CODE (XEXP (x, 1)) != CONST)
18161 return ix86_delegitimize_tls_address (orig_x);
18163 if (ix86_pic_register_p (XEXP (x, 0)))
18164 /* %ebx + GOT/GOTOFF */
18166 else if (GET_CODE (XEXP (x, 0)) == PLUS)
18168 /* %ebx + %reg * scale + GOT/GOTOFF */
18169 reg_addend = XEXP (x, 0);
18170 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
18171 reg_addend = XEXP (reg_addend, 1);
18172 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
18173 reg_addend = XEXP (reg_addend, 0);
18174 else
18176 reg_addend = NULL_RTX;
18177 addend = XEXP (x, 0);
18180 else
18181 addend = XEXP (x, 0);
18183 x = XEXP (XEXP (x, 1), 0);
18184 if (GET_CODE (x) == PLUS
18185 && CONST_INT_P (XEXP (x, 1)))
18187 const_addend = XEXP (x, 1);
18188 x = XEXP (x, 0);
18191 if (GET_CODE (x) == UNSPEC
18192 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
18193 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
18194 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
18195 && !MEM_P (orig_x) && !addend)))
18196 result = XVECEXP (x, 0, 0);
18198 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
18199 && !MEM_P (orig_x))
18200 result = XVECEXP (x, 0, 0);
18202 if (! result)
18203 return ix86_delegitimize_tls_address (orig_x);
18205 /* For (PLUS something CONST_INT) both find_base_{value,term} just
18206 recurse on the first operand. */
18207 if (const_addend && !base_term_p)
18208 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
18209 if (reg_addend)
18210 result = gen_rtx_PLUS (Pmode, reg_addend, result);
18211 if (addend)
18213 /* If the rest of original X doesn't involve the PIC register, add
18214 addend and subtract pic_offset_table_rtx. This can happen e.g.
18215 for code like:
18216 leal (%ebx, %ecx, 4), %ecx
18218 movl foo@GOTOFF(%ecx), %edx
18219 in which case we return (%ecx - %ebx) + foo
18220 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
18221 and reload has completed. */
18222 if (pic_offset_table_rtx
18223 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
18224 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
18225 pic_offset_table_rtx),
18226 result);
18227 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
18229 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
18230 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
18231 result = gen_rtx_PLUS (Pmode, tmp, result);
18233 else
18234 return orig_x;
18236 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
18238 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
18239 if (result == NULL_RTX)
18240 return orig_x;
18242 return result;
18245 /* The normal instantiation of the above template. */
18247 static rtx
18248 ix86_delegitimize_address (rtx x)
18250 return ix86_delegitimize_address_1 (x, false);
18253 /* If X is a machine specific address (i.e. a symbol or label being
18254 referenced as a displacement from the GOT implemented using an
18255 UNSPEC), then return the base term. Otherwise return X. */
18258 ix86_find_base_term (rtx x)
18260 rtx term;
18262 if (TARGET_64BIT)
18264 if (GET_CODE (x) != CONST)
18265 return x;
18266 term = XEXP (x, 0);
18267 if (GET_CODE (term) == PLUS
18268 && CONST_INT_P (XEXP (term, 1)))
18269 term = XEXP (term, 0);
18270 if (GET_CODE (term) != UNSPEC
18271 || (XINT (term, 1) != UNSPEC_GOTPCREL
18272 && XINT (term, 1) != UNSPEC_PCREL))
18273 return x;
18275 return XVECEXP (term, 0, 0);
18278 return ix86_delegitimize_address_1 (x, true);
18281 static void
18282 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
18283 bool fp, FILE *file)
18285 const char *suffix;
18287 if (mode == CCFPmode || mode == CCFPUmode)
18289 code = ix86_fp_compare_code_to_integer (code);
18290 mode = CCmode;
18292 if (reverse)
18293 code = reverse_condition (code);
18295 switch (code)
18297 case EQ:
18298 switch (mode)
18300 case CCAmode:
18301 suffix = "a";
18302 break;
18303 case CCCmode:
18304 suffix = "c";
18305 break;
18306 case CCOmode:
18307 suffix = "o";
18308 break;
18309 case CCPmode:
18310 suffix = "p";
18311 break;
18312 case CCSmode:
18313 suffix = "s";
18314 break;
18315 default:
18316 suffix = "e";
18317 break;
18319 break;
18320 case NE:
18321 switch (mode)
18323 case CCAmode:
18324 suffix = "na";
18325 break;
18326 case CCCmode:
18327 suffix = "nc";
18328 break;
18329 case CCOmode:
18330 suffix = "no";
18331 break;
18332 case CCPmode:
18333 suffix = "np";
18334 break;
18335 case CCSmode:
18336 suffix = "ns";
18337 break;
18338 default:
18339 suffix = "ne";
18340 break;
18342 break;
18343 case GT:
18344 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
18345 suffix = "g";
18346 break;
18347 case GTU:
18348 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
18349 Those same assemblers have the same but opposite lossage on cmov. */
18350 if (mode == CCmode)
18351 suffix = fp ? "nbe" : "a";
18352 else
18353 gcc_unreachable ();
18354 break;
18355 case LT:
18356 switch (mode)
18358 case CCNOmode:
18359 case CCGOCmode:
18360 suffix = "s";
18361 break;
18363 case CCmode:
18364 case CCGCmode:
18365 suffix = "l";
18366 break;
18368 default:
18369 gcc_unreachable ();
18371 break;
18372 case LTU:
18373 if (mode == CCmode)
18374 suffix = "b";
18375 else if (mode == CCCmode)
18376 suffix = fp ? "b" : "c";
18377 else
18378 gcc_unreachable ();
18379 break;
18380 case GE:
18381 switch (mode)
18383 case CCNOmode:
18384 case CCGOCmode:
18385 suffix = "ns";
18386 break;
18388 case CCmode:
18389 case CCGCmode:
18390 suffix = "ge";
18391 break;
18393 default:
18394 gcc_unreachable ();
18396 break;
18397 case GEU:
18398 if (mode == CCmode)
18399 suffix = "nb";
18400 else if (mode == CCCmode)
18401 suffix = fp ? "nb" : "nc";
18402 else
18403 gcc_unreachable ();
18404 break;
18405 case LE:
18406 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
18407 suffix = "le";
18408 break;
18409 case LEU:
18410 if (mode == CCmode)
18411 suffix = "be";
18412 else
18413 gcc_unreachable ();
18414 break;
18415 case UNORDERED:
18416 suffix = fp ? "u" : "p";
18417 break;
18418 case ORDERED:
18419 suffix = fp ? "nu" : "np";
18420 break;
18421 default:
18422 gcc_unreachable ();
18424 fputs (suffix, file);
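/* Illustrative examples of the mapping implemented above (not from the
   original source): (EQ, CCmode) prints "e", (GTU, CCmode) prints "a"
   (or "nbe" for the fp/fcmov case), and with REVERSE set an EQ in CCmode
   is first turned into NE and therefore prints "ne".  */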
18427 /* Print the name of register X to FILE based on its machine mode and number.
18428 If CODE is 'w', pretend the mode is HImode.
18429 If CODE is 'b', pretend the mode is QImode.
18430 If CODE is 'k', pretend the mode is SImode.
18431 If CODE is 'q', pretend the mode is DImode.
18432 If CODE is 'x', pretend the mode is V4SFmode.
18433 If CODE is 't', pretend the mode is V8SFmode.
18434 If CODE is 'g', pretend the mode is V16SFmode.
18435 If CODE is 'h', pretend the reg is the 'high' byte register.
18436 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
18437 If CODE is 'd', duplicate the operand for AVX instruction.
18440 void
18441 print_reg (rtx x, int code, FILE *file)
18443 const char *reg;
18444 int msize;
18445 unsigned int regno;
18446 bool duplicated;
18448 if (ASSEMBLER_DIALECT == ASM_ATT)
18449 putc ('%', file);
18451 if (x == pc_rtx)
18453 gcc_assert (TARGET_64BIT);
18454 fputs ("rip", file);
18455 return;
18458 if (code == 'y' && STACK_TOP_P (x))
18460 fputs ("st(0)", file);
18461 return;
18464 if (code == 'w')
18465 msize = 2;
18466 else if (code == 'b')
18467 msize = 1;
18468 else if (code == 'k')
18469 msize = 4;
18470 else if (code == 'q')
18471 msize = 8;
18472 else if (code == 'h')
18473 msize = 0;
18474 else if (code == 'x')
18475 msize = 16;
18476 else if (code == 't')
18477 msize = 32;
18478 else if (code == 'g')
18479 msize = 64;
18480 else
18481 msize = GET_MODE_SIZE (GET_MODE (x));
18483 regno = REGNO (x);
18485 if (regno == ARG_POINTER_REGNUM
18486 || regno == FRAME_POINTER_REGNUM
18487 || regno == FPSR_REG
18488 || regno == FPCR_REG)
18490 output_operand_lossage
18491 ("invalid use of register '%s'", reg_names[regno]);
18492 return;
18494 else if (regno == FLAGS_REG)
18496 output_operand_lossage ("invalid use of asm flag output");
18497 return;
18500 duplicated = code == 'd' && TARGET_AVX;
18502 switch (msize)
18504 case 16:
18505 case 12:
18506 case 8:
18507 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
18508 warning (0, "unsupported size for integer register");
18509 /* FALLTHRU */
18510 case 4:
18511 if (LEGACY_INT_REGNO_P (regno))
18512 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
18513 /* FALLTHRU */
18514 case 2:
18515 normal:
18516 reg = hi_reg_name[regno];
18517 break;
18518 case 1:
18519 if (regno >= ARRAY_SIZE (qi_reg_name))
18520 goto normal;
18521 if (!ANY_QI_REGNO_P (regno))
18522 error ("unsupported size for integer register");
18523 reg = qi_reg_name[regno];
18524 break;
18525 case 0:
18526 if (regno >= ARRAY_SIZE (qi_high_reg_name))
18527 goto normal;
18528 reg = qi_high_reg_name[regno];
18529 break;
18530 case 32:
18531 case 64:
18532 if (SSE_REGNO_P (regno))
18534 gcc_assert (!duplicated);
18535 putc (msize == 32 ? 'y' : 'z', file);
18536 reg = hi_reg_name[regno] + 1;
18537 break;
18539 goto normal;
18540 default:
18541 gcc_unreachable ();
18544 fputs (reg, file);
18546 /* Irritatingly, AMD extended registers use a
18547 different naming convention: "r%d[bwd]". */
18548 if (REX_INT_REGNO_P (regno))
18550 gcc_assert (TARGET_64BIT);
18551 switch (msize)
18553 case 0:
18554 error ("extended registers have no high halves");
18555 break;
18556 case 1:
18557 putc ('b', file);
18558 break;
18559 case 2:
18560 putc ('w', file);
18561 break;
18562 case 4:
18563 putc ('d', file);
18564 break;
18565 case 8:
18566 /* no suffix */
18567 break;
18568 default:
18569 error ("unsupported operand size for extended register");
18570 break;
18572 return;
18575 if (duplicated)
18577 if (ASSEMBLER_DIALECT == ASM_ATT)
18578 fprintf (file, ", %%%s", reg);
18579 else
18580 fprintf (file, ", %s", reg);
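/* Illustrative sketch (hypothetical calls, AT&T dialect on a 64-bit
   target assumed), where `ax' stands for an rtx for hard register AX_REG:

       print_reg (ax, 'q', file)   ->  %rax
       print_reg (ax, 'w', file)   ->  %ax
       print_reg (ax, 'b', file)   ->  %al
       print_reg (ax, 'h', file)   ->  %ah   */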
18584 /* Meaning of CODE:
18585 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
18586 C -- print opcode suffix for set/cmov insn.
18587 c -- like C, but print reversed condition
18588 F,f -- likewise, but for floating-point.
18589 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
18590 otherwise nothing
18591 R -- print embedded rounding and sae.
18592 r -- print only sae.
18593 z -- print the opcode suffix for the size of the current operand.
18594 Z -- likewise, with special suffixes for x87 instructions.
18595 * -- print a star (in certain assembler syntax)
18596 A -- print an absolute memory reference.
18597 E -- print address with DImode register names if TARGET_64BIT.
18598 w -- print the operand as if it's a "word" (HImode) even if it isn't.
18599 s -- print a shift double count, followed by the assembler's argument
18600 delimiter.
18601 b -- print the QImode name of the register for the indicated operand.
18602 %b0 would print %al if operands[0] is reg 0.
18603 w -- likewise, print the HImode name of the register.
18604 k -- likewise, print the SImode name of the register.
18605 q -- likewise, print the DImode name of the register.
18606 x -- likewise, print the V4SFmode name of the register.
18607 t -- likewise, print the V8SFmode name of the register.
18608 g -- likewise, print the V16SFmode name of the register.
18609 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
18610 y -- print "st(0)" instead of "st" as a register.
18611 d -- print duplicated register operand for AVX instruction.
18612 D -- print condition for SSE cmp instruction.
18613 P -- if PIC, print an @PLT suffix.
18614 p -- print raw symbol name.
18615 X -- don't print any sort of PIC '@' suffix for a symbol.
18616 & -- print some in-use local-dynamic symbol name.
18617 H -- print a memory address offset by 8; used for sse high-parts
18618 Y -- print condition for XOP pcom* instruction.
18619 + -- print a branch hint as 'cs' or 'ds' prefix
18620 ; -- print a semicolon (after prefixes due to a bug in older gas).
18621 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
18622 @ -- print the segment register of the thread base pointer load
18623 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
18624 ! -- print MPX prefix for jxx/call/ret instructions if required.
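/* For illustration (a hypothetical output template, not from this file):
   in a template such as

       "mov%z0\t{%1, %0|%0, %1}"

   %z0 expands to the AT&T size suffix of operand 0 (e.g. "l" for an
   SImode operand, and nothing in the Intel dialect), while the
   {att|intel} braces select the operand order for the current
   assembler dialect.  */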
18627 void
18628 ix86_print_operand (FILE *file, rtx x, int code)
18630 if (code)
18632 switch (code)
18634 case 'A':
18635 switch (ASSEMBLER_DIALECT)
18637 case ASM_ATT:
18638 putc ('*', file);
18639 break;
18641 case ASM_INTEL:
18642 /* Intel syntax. For absolute addresses, registers should not
18643 be surrounded by brackets. */
18644 if (!REG_P (x))
18646 putc ('[', file);
18647 ix86_print_operand (file, x, 0);
18648 putc (']', file);
18649 return;
18651 break;
18653 default:
18654 gcc_unreachable ();
18657 ix86_print_operand (file, x, 0);
18658 return;
18660 case 'E':
18661 /* Wrap address in an UNSPEC to declare special handling. */
18662 if (TARGET_64BIT)
18663 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
18665 output_address (VOIDmode, x);
18666 return;
18668 case 'L':
18669 if (ASSEMBLER_DIALECT == ASM_ATT)
18670 putc ('l', file);
18671 return;
18673 case 'W':
18674 if (ASSEMBLER_DIALECT == ASM_ATT)
18675 putc ('w', file);
18676 return;
18678 case 'B':
18679 if (ASSEMBLER_DIALECT == ASM_ATT)
18680 putc ('b', file);
18681 return;
18683 case 'Q':
18684 if (ASSEMBLER_DIALECT == ASM_ATT)
18685 putc ('l', file);
18686 return;
18688 case 'S':
18689 if (ASSEMBLER_DIALECT == ASM_ATT)
18690 putc ('s', file);
18691 return;
18693 case 'T':
18694 if (ASSEMBLER_DIALECT == ASM_ATT)
18695 putc ('t', file);
18696 return;
18698 case 'O':
18699 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18700 if (ASSEMBLER_DIALECT != ASM_ATT)
18701 return;
18703 switch (GET_MODE_SIZE (GET_MODE (x)))
18705 case 2:
18706 putc ('w', file);
18707 break;
18709 case 4:
18710 putc ('l', file);
18711 break;
18713 case 8:
18714 putc ('q', file);
18715 break;
18717 default:
18718 output_operand_lossage ("invalid operand size for operand "
18719 "code 'O'");
18720 return;
18723 putc ('.', file);
18724 #endif
18725 return;
18727 case 'z':
18728 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18730 /* Opcodes don't get size suffixes when using the Intel syntax. */
18731 if (ASSEMBLER_DIALECT == ASM_INTEL)
18732 return;
18734 switch (GET_MODE_SIZE (GET_MODE (x)))
18736 case 1:
18737 putc ('b', file);
18738 return;
18740 case 2:
18741 putc ('w', file);
18742 return;
18744 case 4:
18745 putc ('l', file);
18746 return;
18748 case 8:
18749 putc ('q', file);
18750 return;
18752 default:
18753 output_operand_lossage ("invalid operand size for operand "
18754 "code 'z'");
18755 return;
18759 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18760 warning (0, "non-integer operand used with operand code 'z'");
18761 /* FALLTHRU */
18763 case 'Z':
18764 /* 387 opcodes don't get size suffixes when using the Intel syntax. */
18765 if (ASSEMBLER_DIALECT == ASM_INTEL)
18766 return;
18768 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18770 switch (GET_MODE_SIZE (GET_MODE (x)))
18772 case 2:
18773 #ifdef HAVE_AS_IX86_FILDS
18774 putc ('s', file);
18775 #endif
18776 return;
18778 case 4:
18779 putc ('l', file);
18780 return;
18782 case 8:
18783 #ifdef HAVE_AS_IX86_FILDQ
18784 putc ('q', file);
18785 #else
18786 fputs ("ll", file);
18787 #endif
18788 return;
18790 default:
18791 break;
18794 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18796 /* 387 opcodes don't get size suffixes
18797 if the operands are registers. */
18798 if (STACK_REG_P (x))
18799 return;
18801 switch (GET_MODE_SIZE (GET_MODE (x)))
18803 case 4:
18804 putc ('s', file);
18805 return;
18807 case 8:
18808 putc ('l', file);
18809 return;
18811 case 12:
18812 case 16:
18813 putc ('t', file);
18814 return;
18816 default:
18817 break;
18820 else
18822 output_operand_lossage ("invalid operand type used with "
18823 "operand code 'Z'");
18824 return;
18827 output_operand_lossage ("invalid operand size for operand code 'Z'");
18828 return;
18830 case 'd':
18831 case 'b':
18832 case 'w':
18833 case 'k':
18834 case 'q':
18835 case 'h':
18836 case 't':
18837 case 'g':
18838 case 'y':
18839 case 'x':
18840 case 'X':
18841 case 'P':
18842 case 'p':
18843 break;
18845 case 's':
18846 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
18848 ix86_print_operand (file, x, 0);
18849 fputs (", ", file);
18851 return;
18853 case 'Y':
18854 switch (GET_CODE (x))
18856 case NE:
18857 fputs ("neq", file);
18858 break;
18859 case EQ:
18860 fputs ("eq", file);
18861 break;
18862 case GE:
18863 case GEU:
18864 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
18865 break;
18866 case GT:
18867 case GTU:
18868 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18869 break;
18870 case LE:
18871 case LEU:
18872 fputs ("le", file);
18873 break;
18874 case LT:
18875 case LTU:
18876 fputs ("lt", file);
18877 break;
18878 case UNORDERED:
18879 fputs ("unord", file);
18880 break;
18881 case ORDERED:
18882 fputs ("ord", file);
18883 break;
18884 case UNEQ:
18885 fputs ("ueq", file);
18886 break;
18887 case UNGE:
18888 fputs ("nlt", file);
18889 break;
18890 case UNGT:
18891 fputs ("nle", file);
18892 break;
18893 case UNLE:
18894 fputs ("ule", file);
18895 break;
18896 case UNLT:
18897 fputs ("ult", file);
18898 break;
18899 case LTGT:
18900 fputs ("une", file);
18901 break;
18902 default:
18903 output_operand_lossage ("operand is not a condition code, "
18904 "invalid operand code 'Y'");
18905 return;
18907 return;
18909 case 'D':
18910 /* A little bit of brain damage here. The SSE compare instructions
18911 use completely different names for the comparisons than the
18912 fp conditional moves do. */
18913 switch (GET_CODE (x))
18915 case UNEQ:
18916 if (TARGET_AVX)
18918 fputs ("eq_us", file);
18919 break;
18921 /* FALLTHRU */
18922 case EQ:
18923 fputs ("eq", file);
18924 break;
18925 case UNLT:
18926 if (TARGET_AVX)
18928 fputs ("nge", file);
18929 break;
18931 /* FALLTHRU */
18932 case LT:
18933 fputs ("lt", file);
18934 break;
18935 case UNLE:
18936 if (TARGET_AVX)
18938 fputs ("ngt", file);
18939 break;
18941 /* FALLTHRU */
18942 case LE:
18943 fputs ("le", file);
18944 break;
18945 case UNORDERED:
18946 fputs ("unord", file);
18947 break;
18948 case LTGT:
18949 if (TARGET_AVX)
18951 fputs ("neq_oq", file);
18952 break;
18954 /* FALLTHRU */
18955 case NE:
18956 fputs ("neq", file);
18957 break;
18958 case GE:
18959 if (TARGET_AVX)
18961 fputs ("ge", file);
18962 break;
18964 /* FALLTHRU */
18965 case UNGE:
18966 fputs ("nlt", file);
18967 break;
18968 case GT:
18969 if (TARGET_AVX)
18971 fputs ("gt", file);
18972 break;
18974 /* FALLTHRU */
18975 case UNGT:
18976 fputs ("nle", file);
18977 break;
18978 case ORDERED:
18979 fputs ("ord", file);
18980 break;
18981 default:
18982 output_operand_lossage ("operand is not a condition code, "
18983 "invalid operand code 'D'");
18984 return;
18986 return;
18988 case 'F':
18989 case 'f':
18990 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18991 if (ASSEMBLER_DIALECT == ASM_ATT)
18992 putc ('.', file);
18993 gcc_fallthrough ();
18994 #endif
18996 case 'C':
18997 case 'c':
18998 if (!COMPARISON_P (x))
19000 output_operand_lossage ("operand is not a condition code, "
19001 "invalid operand code '%c'", code);
19002 return;
19004 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
19005 code == 'c' || code == 'f',
19006 code == 'F' || code == 'f',
19007 file);
19008 return;
19010 case 'H':
19011 if (!offsettable_memref_p (x))
19013 output_operand_lossage ("operand is not an offsettable memory "
19014 "reference, invalid operand code 'H'");
19015 return;
19017 /* It doesn't actually matter what mode we use here, as we're
19018 only going to use this for printing. */
19019 x = adjust_address_nv (x, DImode, 8);
19020 /* Output 'qword ptr' for intel assembler dialect. */
19021 if (ASSEMBLER_DIALECT == ASM_INTEL)
19022 code = 'q';
19023 break;
19025 case 'K':
19026 if (!CONST_INT_P (x))
19028 output_operand_lossage ("operand is not an integer, invalid "
19029 "operand code 'K'");
19030 return;
19033 if (INTVAL (x) & IX86_HLE_ACQUIRE)
19034 #ifdef HAVE_AS_IX86_HLE
19035 fputs ("xacquire ", file);
19036 #else
19037 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
19038 #endif
19039 else if (INTVAL (x) & IX86_HLE_RELEASE)
19040 #ifdef HAVE_AS_IX86_HLE
19041 fputs ("xrelease ", file);
19042 #else
19043 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
19044 #endif
19045 /* We do not want to print the value of the operand. */
19046 return;
19048 case 'N':
19049 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
19050 fputs ("{z}", file);
19051 return;
19053 case 'r':
19054 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
19056 output_operand_lossage ("operand is not a specific integer, "
19057 "invalid operand code 'r'");
19058 return;
19061 if (ASSEMBLER_DIALECT == ASM_INTEL)
19062 fputs (", ", file);
19064 fputs ("{sae}", file);
19066 if (ASSEMBLER_DIALECT == ASM_ATT)
19067 fputs (", ", file);
19069 return;
19071 case 'R':
19072 if (!CONST_INT_P (x))
19074 output_operand_lossage ("operand is not an integer, invalid "
19075 "operand code 'R'");
19076 return;
19079 if (ASSEMBLER_DIALECT == ASM_INTEL)
19080 fputs (", ", file);
19082 switch (INTVAL (x))
19084 case ROUND_NEAREST_INT | ROUND_SAE:
19085 fputs ("{rn-sae}", file);
19086 break;
19087 case ROUND_NEG_INF | ROUND_SAE:
19088 fputs ("{rd-sae}", file);
19089 break;
19090 case ROUND_POS_INF | ROUND_SAE:
19091 fputs ("{ru-sae}", file);
19092 break;
19093 case ROUND_ZERO | ROUND_SAE:
19094 fputs ("{rz-sae}", file);
19095 break;
19096 default:
19097 output_operand_lossage ("operand is not a specific integer, "
19098 "invalid operand code 'R'");
19101 if (ASSEMBLER_DIALECT == ASM_ATT)
19102 fputs (", ", file);
19104 return;
19106 case '*':
19107 if (ASSEMBLER_DIALECT == ASM_ATT)
19108 putc ('*', file);
19109 return;
19111 case '&':
19113 const char *name = get_some_local_dynamic_name ();
19114 if (name == NULL)
19115 output_operand_lossage ("'%%&' used without any "
19116 "local dynamic TLS references");
19117 else
19118 assemble_name (file, name);
19119 return;
19122 case '+':
19124 rtx x;
19126 if (!optimize
19127 || optimize_function_for_size_p (cfun)
19128 || !TARGET_BRANCH_PREDICTION_HINTS)
19129 return;
19131 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
19132 if (x)
19134 int pred_val = profile_probability::from_reg_br_prob_note
19135 (XINT (x, 0)).to_reg_br_prob_base ();
19137 if (pred_val < REG_BR_PROB_BASE * 45 / 100
19138 || pred_val > REG_BR_PROB_BASE * 55 / 100)
19140 bool taken = pred_val > REG_BR_PROB_BASE / 2;
19141 bool cputaken
19142 = final_forward_branch_p (current_output_insn) == 0;
19144 /* Emit hints only in the case default branch prediction
19145 heuristics would fail. */
19146 if (taken != cputaken)
19148 /* We use 3e (DS) prefix for taken branches and
19149 2e (CS) prefix for not taken branches. */
19150 if (taken)
19151 fputs ("ds ; ", file);
19152 else
19153 fputs ("cs ; ", file);
19157 return;
19160 case ';':
19161 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
19162 putc (';', file);
19163 #endif
19164 return;
19166 case '@':
19167 if (ASSEMBLER_DIALECT == ASM_ATT)
19168 putc ('%', file);
19170 /* The kernel uses a different segment register for performance
19171 reasons; a system call would not have to trash the userspace
19172 segment register, which would be expensive. */
19173 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
19174 fputs ("fs", file);
19175 else
19176 fputs ("gs", file);
19177 return;
19179 case '~':
19180 putc (TARGET_AVX2 ? 'i' : 'f', file);
19181 return;
19183 case '^':
19184 if (TARGET_64BIT && Pmode != word_mode)
19185 fputs ("addr32 ", file);
19186 return;
19188 case '!':
19189 if (ix86_bnd_prefixed_insn_p (current_output_insn))
19190 fputs ("bnd ", file);
19191 return;
19193 default:
19194 output_operand_lossage ("invalid operand code '%c'", code);
19198 if (REG_P (x))
19199 print_reg (x, code, file);
19201 else if (MEM_P (x))
19203 rtx addr = XEXP (x, 0);
19205 /* No `byte ptr' prefix for call instructions ... */
19206 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
19208 machine_mode mode = GET_MODE (x);
19209 const char *size;
19211 /* Check for explicit size override codes. */
19212 if (code == 'b')
19213 size = "BYTE";
19214 else if (code == 'w')
19215 size = "WORD";
19216 else if (code == 'k')
19217 size = "DWORD";
19218 else if (code == 'q')
19219 size = "QWORD";
19220 else if (code == 'x')
19221 size = "XMMWORD";
19222 else if (code == 't')
19223 size = "YMMWORD";
19224 else if (code == 'g')
19225 size = "ZMMWORD";
19226 else if (mode == BLKmode)
19227 /* ... or BLKmode operands, when not overridden. */
19228 size = NULL;
19229 else
19230 switch (GET_MODE_SIZE (mode))
19232 case 1: size = "BYTE"; break;
19233 case 2: size = "WORD"; break;
19234 case 4: size = "DWORD"; break;
19235 case 8: size = "QWORD"; break;
19236 case 12: size = "TBYTE"; break;
19237 case 16:
19238 if (mode == XFmode)
19239 size = "TBYTE";
19240 else
19241 size = "XMMWORD";
19242 break;
19243 case 32: size = "YMMWORD"; break;
19244 case 64: size = "ZMMWORD"; break;
19245 default:
19246 gcc_unreachable ();
19248 if (size)
19250 fputs (size, file);
19251 fputs (" PTR ", file);
19255 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
19256 output_operand_lossage ("invalid constraints for operand");
19257 else
19258 ix86_print_operand_address_as
19259 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
19262 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
19264 long l;
19266 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19268 if (ASSEMBLER_DIALECT == ASM_ATT)
19269 putc ('$', file);
19270 /* Sign extend 32bit SFmode immediate to 8 bytes. */
19271 if (code == 'q')
19272 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
19273 (unsigned long long) (int) l);
19274 else
19275 fprintf (file, "0x%08x", (unsigned int) l);
19278 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
19280 long l[2];
19282 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19284 if (ASSEMBLER_DIALECT == ASM_ATT)
19285 putc ('$', file);
19286 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
19289 /* These float cases don't actually occur as immediate operands. */
19290 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
19292 char dstr[30];
19294 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
19295 fputs (dstr, file);
19298 else
19300 /* We have patterns that allow zero sets of memory, for instance.
19301 In 64-bit mode, we should probably support all 8-byte vectors,
19302 since we can in fact encode that into an immediate. */
19303 if (GET_CODE (x) == CONST_VECTOR)
19305 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
19306 x = const0_rtx;
19309 if (code != 'P' && code != 'p')
19311 if (CONST_INT_P (x))
19313 if (ASSEMBLER_DIALECT == ASM_ATT)
19314 putc ('$', file);
19316 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
19317 || GET_CODE (x) == LABEL_REF)
19319 if (ASSEMBLER_DIALECT == ASM_ATT)
19320 putc ('$', file);
19321 else
19322 fputs ("OFFSET FLAT:", file);
19325 if (CONST_INT_P (x))
19326 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
19327 else if (flag_pic || MACHOPIC_INDIRECT)
19328 output_pic_addr_const (file, x, code);
19329 else
19330 output_addr_const (file, x);
19334 static bool
19335 ix86_print_operand_punct_valid_p (unsigned char code)
19337 return (code == '@' || code == '*' || code == '+' || code == '&'
19338 || code == ';' || code == '~' || code == '^' || code == '!');
19341 /* Print a memory operand whose address is ADDR. */
19343 static void
19344 ix86_print_operand_address_as (FILE *file, rtx addr,
19345 addr_space_t as, bool no_rip)
19347 struct ix86_address parts;
19348 rtx base, index, disp;
19349 int scale;
19350 int ok;
19351 bool vsib = false;
19352 int code = 0;
19354 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
19356 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19357 gcc_assert (parts.index == NULL_RTX);
19358 parts.index = XVECEXP (addr, 0, 1);
19359 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
19360 addr = XVECEXP (addr, 0, 0);
19361 vsib = true;
19363 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
19365 gcc_assert (TARGET_64BIT);
19366 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19367 code = 'q';
19369 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
19371 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
19372 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
19373 if (parts.base != NULL_RTX)
19375 parts.index = parts.base;
19376 parts.scale = 1;
19378 parts.base = XVECEXP (addr, 0, 0);
19379 addr = XVECEXP (addr, 0, 0);
19381 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
19383 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19384 gcc_assert (parts.index == NULL_RTX);
19385 parts.index = XVECEXP (addr, 0, 1);
19386 addr = XVECEXP (addr, 0, 0);
19388 else
19389 ok = ix86_decompose_address (addr, &parts);
19391 gcc_assert (ok);
19393 base = parts.base;
19394 index = parts.index;
19395 disp = parts.disp;
19396 scale = parts.scale;
19398 if (ADDR_SPACE_GENERIC_P (as))
19399 as = parts.seg;
19400 else
19401 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
19403 if (!ADDR_SPACE_GENERIC_P (as))
19405 const char *string;
19407 if (as == ADDR_SPACE_SEG_FS)
19408 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
19409 else if (as == ADDR_SPACE_SEG_GS)
19410 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
19411 else
19412 gcc_unreachable ();
19413 fputs (string, file);
19416 /* Use one byte shorter RIP relative addressing for 64bit mode. */
19417 if (TARGET_64BIT && !base && !index && !no_rip)
19419 rtx symbol = disp;
19421 if (GET_CODE (disp) == CONST
19422 && GET_CODE (XEXP (disp, 0)) == PLUS
19423 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19424 symbol = XEXP (XEXP (disp, 0), 0);
19426 if (GET_CODE (symbol) == LABEL_REF
19427 || (GET_CODE (symbol) == SYMBOL_REF
19428 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
19429 base = pc_rtx;
19432 if (!base && !index)
19434 /* A displacement-only address requires special attention. */
19435 if (CONST_INT_P (disp))
19437 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == ADDR_SPACE_GENERIC)
19438 fputs ("ds:", file);
19439 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
19441 /* Load the external function address via the GOT slot to avoid PLT. */
19442 else if (GET_CODE (disp) == CONST
19443 && GET_CODE (XEXP (disp, 0)) == UNSPEC
19444 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
19445 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
19446 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
19447 output_pic_addr_const (file, disp, 0);
19448 else if (flag_pic)
19449 output_pic_addr_const (file, disp, 0);
19450 else
19451 output_addr_const (file, disp);
19453 else
19455 /* Print SImode register names to force addr32 prefix. */
19456 if (SImode_address_operand (addr, VOIDmode))
19458 if (flag_checking)
19460 gcc_assert (TARGET_64BIT);
19461 switch (GET_CODE (addr))
19463 case SUBREG:
19464 gcc_assert (GET_MODE (addr) == SImode);
19465 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
19466 break;
19467 case ZERO_EXTEND:
19468 case AND:
19469 gcc_assert (GET_MODE (addr) == DImode);
19470 break;
19471 default:
19472 gcc_unreachable ();
19475 gcc_assert (!code);
19476 code = 'k';
19478 else if (code == 0
19479 && TARGET_X32
19480 && disp
19481 && CONST_INT_P (disp)
19482 && INTVAL (disp) < -16*1024*1024)
19484 /* X32 runs in 64-bit mode, where displacement, DISP, in
19485 address DISP(%r64), is encoded as 32-bit immediate sign-
19486 extended from 32-bit to 64-bit. For -0x40000300(%r64),
19487 address is %r64 + 0xffffffffbffffd00. When %r64 <
19488 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
19489 which is invalid for x32. The correct address is %r64
19490 - 0x40000300 == 0xf7ffdd64. To properly encode
19491 -0x40000300(%r64) for x32, we zero-extend negative
19492 displacement by forcing addr32 prefix which truncates
19493 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
19494 zero-extend all negative displacements, including -1(%rsp).
19495 However, for small negative displacements, sign-extension
19496 won't cause overflow. We only zero-extend negative
19497 displacements if they are < -16*1024*1024, which is also used
19498 to check legitimate address displacements for PIC. */
19499 code = 'k';
19502 if (ASSEMBLER_DIALECT == ASM_ATT)
19504 if (disp)
19506 if (flag_pic)
19507 output_pic_addr_const (file, disp, 0);
19508 else if (GET_CODE (disp) == LABEL_REF)
19509 output_asm_label (disp);
19510 else
19511 output_addr_const (file, disp);
19514 putc ('(', file);
19515 if (base)
19516 print_reg (base, code, file);
19517 if (index)
19519 putc (',', file);
19520 print_reg (index, vsib ? 0 : code, file);
19521 if (scale != 1 || vsib)
19522 fprintf (file, ",%d", scale);
19524 putc (')', file);
19526 else
19528 rtx offset = NULL_RTX;
19530 if (disp)
19532 /* Pull out the offset of a symbol; print any symbol itself. */
19533 if (GET_CODE (disp) == CONST
19534 && GET_CODE (XEXP (disp, 0)) == PLUS
19535 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19537 offset = XEXP (XEXP (disp, 0), 1);
19538 disp = gen_rtx_CONST (VOIDmode,
19539 XEXP (XEXP (disp, 0), 0));
19542 if (flag_pic)
19543 output_pic_addr_const (file, disp, 0);
19544 else if (GET_CODE (disp) == LABEL_REF)
19545 output_asm_label (disp);
19546 else if (CONST_INT_P (disp))
19547 offset = disp;
19548 else
19549 output_addr_const (file, disp);
19552 putc ('[', file);
19553 if (base)
19555 print_reg (base, code, file);
19556 if (offset)
19558 if (INTVAL (offset) >= 0)
19559 putc ('+', file);
19560 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19563 else if (offset)
19564 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19565 else
19566 putc ('0', file);
19568 if (index)
19570 putc ('+', file);
19571 print_reg (index, vsib ? 0 : code, file);
19572 if (scale != 1 || vsib)
19573 fprintf (file, "*%d", scale);
19575 putc (']', file);
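/* Illustrative sketch of the two dialects produced above (hypothetical
   operands): for a base %ebp, index %ecx, scale 4 and displacement -4
   the routine prints

       AT&T:   -4(%ebp,%ecx,4)
       Intel:  [ebp-4+ecx*4]

   matching the ASM_ATT and ASM_INTEL branches above.  */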
19580 static void
19581 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
19583 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
19586 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
19588 static bool
19589 i386_asm_output_addr_const_extra (FILE *file, rtx x)
19591 rtx op;
19593 if (GET_CODE (x) != UNSPEC)
19594 return false;
19596 op = XVECEXP (x, 0, 0);
19597 switch (XINT (x, 1))
19599 case UNSPEC_GOTTPOFF:
19600 output_addr_const (file, op);
19601 /* FIXME: This might be @TPOFF in Sun ld. */
19602 fputs ("@gottpoff", file);
19603 break;
19604 case UNSPEC_TPOFF:
19605 output_addr_const (file, op);
19606 fputs ("@tpoff", file);
19607 break;
19608 case UNSPEC_NTPOFF:
19609 output_addr_const (file, op);
19610 if (TARGET_64BIT)
19611 fputs ("@tpoff", file);
19612 else
19613 fputs ("@ntpoff", file);
19614 break;
19615 case UNSPEC_DTPOFF:
19616 output_addr_const (file, op);
19617 fputs ("@dtpoff", file);
19618 break;
19619 case UNSPEC_GOTNTPOFF:
19620 output_addr_const (file, op);
19621 if (TARGET_64BIT)
19622 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
19623 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
19624 else
19625 fputs ("@gotntpoff", file);
19626 break;
19627 case UNSPEC_INDNTPOFF:
19628 output_addr_const (file, op);
19629 fputs ("@indntpoff", file);
19630 break;
19631 #if TARGET_MACHO
19632 case UNSPEC_MACHOPIC_OFFSET:
19633 output_addr_const (file, op);
19634 putc ('-', file);
19635 machopic_output_function_base_name (file);
19636 break;
19637 #endif
19639 case UNSPEC_STACK_CHECK:
19641 int offset;
19643 gcc_assert (flag_split_stack);
19645 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
19646 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
19647 #else
19648 gcc_unreachable ();
19649 #endif
19651 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
19653 break;
19655 default:
19656 return false;
19659 return true;
19662 /* Split one or more double-mode RTL references into pairs of half-mode
19663 references. The RTL can be REG, offsettable MEM, integer constant, or
19664 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
19665 split and "num" is its length. lo_half and hi_half are output arrays
19666 that parallel "operands". */
19668 void
19669 split_double_mode (machine_mode mode, rtx operands[],
19670 int num, rtx lo_half[], rtx hi_half[])
19672 machine_mode half_mode;
19673 unsigned int byte;
19675 switch (mode)
19677 case TImode:
19678 half_mode = DImode;
19679 break;
19680 case DImode:
19681 half_mode = SImode;
19682 break;
19683 default:
19684 gcc_unreachable ();
19687 byte = GET_MODE_SIZE (half_mode);
19689 while (num--)
19691 rtx op = operands[num];
19693 /* simplify_subreg refuses to split volatile memory addresses,
19694 but we still have to handle them. */
19695 if (MEM_P (op))
19697 lo_half[num] = adjust_address (op, half_mode, 0);
19698 hi_half[num] = adjust_address (op, half_mode, byte);
19700 else
19702 lo_half[num] = simplify_gen_subreg (half_mode, op,
19703 GET_MODE (op) == VOIDmode
19704 ? mode : GET_MODE (op), 0);
19705 hi_half[num] = simplify_gen_subreg (half_mode, op,
19706 GET_MODE (op) == VOIDmode
19707 ? mode : GET_MODE (op), byte);
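/* A minimal usage sketch (hypothetical operands, not from this file):

       rtx lo[2], hi[2];
       split_double_mode (DImode, operands, 2, lo, hi);

   leaves the low SImode halves in lo[] and the high halves in hi[];
   e.g. a DImode constant 0x1122334455667788 splits into a low half of
   0x55667788 and a high half of 0x11223344.  */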
19712 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
19713 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
19714 is the expression of the binary operation. The output may either be
19715 emitted here, or returned to the caller, like all output_* functions.
19717 There is no guarantee that the operands are the same mode, as they
19718 might be within FLOAT or FLOAT_EXTEND expressions. */
19720 #ifndef SYSV386_COMPAT
19721 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
19722 wants to fix the assemblers because that causes incompatibility
19723 with gcc. No-one wants to fix gcc because that causes
19724 incompatibility with assemblers... You can use the option of
19725 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19726 #define SYSV386_COMPAT 1
19727 #endif
19729 const char *
19730 output_387_binary_op (rtx_insn *insn, rtx *operands)
19732 static char buf[40];
19733 const char *p;
19734 const char *ssep;
19735 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
19737 /* Even if we do not want to check the inputs, this documents input
19738 constraints, which helps in understanding the following code. */
19739 if (flag_checking)
19741 if (STACK_REG_P (operands[0])
19742 && ((REG_P (operands[1])
19743 && REGNO (operands[0]) == REGNO (operands[1])
19744 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19745 || (REG_P (operands[2])
19746 && REGNO (operands[0]) == REGNO (operands[2])
19747 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19748 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19749 ; /* ok */
19750 else
19751 gcc_assert (is_sse);
19754 switch (GET_CODE (operands[3]))
19756 case PLUS:
19757 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19758 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19759 p = "fiadd";
19760 else
19761 p = "fadd";
19762 ssep = "vadd";
19763 break;
19765 case MINUS:
19766 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19767 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19768 p = "fisub";
19769 else
19770 p = "fsub";
19771 ssep = "vsub";
19772 break;
19774 case MULT:
19775 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19776 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19777 p = "fimul";
19778 else
19779 p = "fmul";
19780 ssep = "vmul";
19781 break;
19783 case DIV:
19784 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19785 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19786 p = "fidiv";
19787 else
19788 p = "fdiv";
19789 ssep = "vdiv";
19790 break;
19792 default:
19793 gcc_unreachable ();
19796 if (is_sse)
19798 if (TARGET_AVX)
19800 strcpy (buf, ssep);
19801 if (GET_MODE (operands[0]) == SFmode)
19802 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
19803 else
19804 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
19806 else
19808 strcpy (buf, ssep + 1);
19809 if (GET_MODE (operands[0]) == SFmode)
19810 strcat (buf, "ss\t{%2, %0|%0, %2}");
19811 else
19812 strcat (buf, "sd\t{%2, %0|%0, %2}");
19814 return buf;
19816 strcpy (buf, p);
19818 switch (GET_CODE (operands[3]))
19820 case MULT:
19821 case PLUS:
19822 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19823 std::swap (operands[1], operands[2]);
19825 /* We know that operands[0] == operands[1]. */
19827 if (MEM_P (operands[2]))
19829 p = "%Z2\t%2";
19830 break;
19833 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19835 if (STACK_TOP_P (operands[0]))
19836 /* How is it that we are storing to a dead operand[2]?
19837 Well, presumably operands[1] is dead too. We can't
19838 store the result to st(0) as st(0) gets popped on this
19839 instruction. Instead store to operands[2] (which I
19840 think has to be st(1)). st(1) will be popped later.
19841 gcc <= 2.8.1 didn't have this check and generated
19842 assembly code that the Unixware assembler rejected. */
19843 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19844 else
19845 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19846 break;
19849 if (STACK_TOP_P (operands[0]))
19850 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19851 else
19852 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19853 break;
19855 case MINUS:
19856 case DIV:
19857 if (MEM_P (operands[1]))
19859 p = "r%Z1\t%1";
19860 break;
19863 if (MEM_P (operands[2]))
19865 p = "%Z2\t%2";
19866 break;
19869 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19871 #if SYSV386_COMPAT
19872 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19873 derived assemblers, confusingly reverse the direction of
19874 the operation for fsub{r} and fdiv{r} when the
19875 destination register is not st(0). The Intel assembler
19876 doesn't have this brain damage. Read !SYSV386_COMPAT to
19877 figure out what the hardware really does. */
19878 if (STACK_TOP_P (operands[0]))
19879 p = "{p\t%0, %2|rp\t%2, %0}";
19880 else
19881 p = "{rp\t%2, %0|p\t%0, %2}";
19882 #else
19883 if (STACK_TOP_P (operands[0]))
19884 /* As above for fmul/fadd, we can't store to st(0). */
19885 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19886 else
19887 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19888 #endif
19889 break;
19892 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19894 #if SYSV386_COMPAT
19895 if (STACK_TOP_P (operands[0]))
19896 p = "{rp\t%0, %1|p\t%1, %0}";
19897 else
19898 p = "{p\t%1, %0|rp\t%0, %1}";
19899 #else
19900 if (STACK_TOP_P (operands[0]))
19901 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
19902 else
19903 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
19904 #endif
19905 break;
19908 if (STACK_TOP_P (operands[0]))
19910 if (STACK_TOP_P (operands[1]))
19911 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19912 else
19913 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
19914 break;
19916 else if (STACK_TOP_P (operands[1]))
19918 #if SYSV386_COMPAT
19919 p = "{\t%1, %0|r\t%0, %1}";
19920 #else
19921 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
19922 #endif
19924 else
19926 #if SYSV386_COMPAT
19927 p = "{r\t%2, %0|\t%0, %2}";
19928 #else
19929 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19930 #endif
19932 break;
19934 default:
19935 gcc_unreachable ();
19938 strcat (buf, p);
19939 return buf;
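/* Illustrative sketch of the strings built above (hypothetical insn):
   for an SFmode PLUS with SSE operands the function returns

       AVX:      "vaddss\t{%2, %1, %0|%0, %1, %2}"
       non-AVX:  "addss\t{%2, %0|%0, %2}"

   while the x87 paths append the register/popping variants selected by
   the switch on GET_CODE (operands[3]) above.  */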
19942 /* Return needed mode for entity in optimize_mode_switching pass. */
19944 static int
19945 ix86_dirflag_mode_needed (rtx_insn *insn)
19947 if (CALL_P (insn))
19949 if (cfun->machine->func_type == TYPE_NORMAL)
19950 return X86_DIRFLAG_ANY;
19951 else
19952 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
19953 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19956 if (recog_memoized (insn) < 0)
19957 return X86_DIRFLAG_ANY;
19959 if (get_attr_type (insn) == TYPE_STR)
19961 /* Emit cld instruction if stringops are used in the function. */
19962 if (cfun->machine->func_type == TYPE_NORMAL)
19963 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19964 else
19965 return X86_DIRFLAG_RESET;
19968 return X86_DIRFLAG_ANY;
19971 /* Check if a 256bit AVX register is referenced inside EXP. */
19973 static bool
19974 ix86_check_avx256_register (const_rtx exp)
19976 if (SUBREG_P (exp))
19977 exp = SUBREG_REG (exp);
19979 return (REG_P (exp)
19980 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
19983 /* Return needed mode for entity in optimize_mode_switching pass. */
19985 static int
19986 ix86_avx_u128_mode_needed (rtx_insn *insn)
19988 if (CALL_P (insn))
19990 rtx link;
19992 /* Needed mode is set to AVX_U128_CLEAN if there are
19993 no 256bit modes used in function arguments. */
19994 for (link = CALL_INSN_FUNCTION_USAGE (insn);
19995 link;
19996 link = XEXP (link, 1))
19998 if (GET_CODE (XEXP (link, 0)) == USE)
20000 rtx arg = XEXP (XEXP (link, 0), 0);
20002 if (ix86_check_avx256_register (arg))
20003 return AVX_U128_DIRTY;
20007 return AVX_U128_CLEAN;
20010 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
20011 changes state only when a 256bit register is written to, but we need
20012 to prevent the compiler from moving the optimal insertion point above
20013 an eventual read from a 256bit register. */
20014 subrtx_iterator::array_type array;
20015 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
20016 if (ix86_check_avx256_register (*iter))
20017 return AVX_U128_DIRTY;
20019 return AVX_U128_ANY;
20022 /* Return mode that i387 must be switched into
20023 prior to the execution of insn. */
20025 static int
20026 ix86_i387_mode_needed (int entity, rtx_insn *insn)
20028 enum attr_i387_cw mode;
20030 /* The mode UNINITIALIZED is used to store the control word after a
20031 function call or ASM pattern. The mode ANY specifies that the function
20032 has no requirements on the control word and makes no changes in the
20033 bits we are interested in. */
20035 if (CALL_P (insn)
20036 || (NONJUMP_INSN_P (insn)
20037 && (asm_noperands (PATTERN (insn)) >= 0
20038 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
20039 return I387_CW_UNINITIALIZED;
20041 if (recog_memoized (insn) < 0)
20042 return I387_CW_ANY;
20044 mode = get_attr_i387_cw (insn);
20046 switch (entity)
20048 case I387_TRUNC:
20049 if (mode == I387_CW_TRUNC)
20050 return mode;
20051 break;
20053 case I387_FLOOR:
20054 if (mode == I387_CW_FLOOR)
20055 return mode;
20056 break;
20058 case I387_CEIL:
20059 if (mode == I387_CW_CEIL)
20060 return mode;
20061 break;
20063 case I387_MASK_PM:
20064 if (mode == I387_CW_MASK_PM)
20065 return mode;
20066 break;
20068 default:
20069 gcc_unreachable ();
20072 return I387_CW_ANY;
20075 /* Return mode that entity must be switched into
20076 prior to the execution of insn. */
20078 static int
20079 ix86_mode_needed (int entity, rtx_insn *insn)
20081 switch (entity)
20083 case X86_DIRFLAG:
20084 return ix86_dirflag_mode_needed (insn);
20085 case AVX_U128:
20086 return ix86_avx_u128_mode_needed (insn);
20087 case I387_TRUNC:
20088 case I387_FLOOR:
20089 case I387_CEIL:
20090 case I387_MASK_PM:
20091 return ix86_i387_mode_needed (entity, insn);
20092 default:
20093 gcc_unreachable ();
20095 return 0;
20098 /* Check if a 256bit AVX register is referenced in stores. */
20100 static void
20101 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
20103 if (ix86_check_avx256_register (dest))
20105 bool *used = (bool *) data;
20106 *used = true;
20110 /* Calculate mode of upper 128bit AVX registers after the insn. */
20112 static int
20113 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
20115 rtx pat = PATTERN (insn);
20117 if (vzeroupper_operation (pat, VOIDmode)
20118 || vzeroall_operation (pat, VOIDmode))
20119 return AVX_U128_CLEAN;
20121 /* We know that the state is clean after a CALL insn if there are no
20122 256bit registers used in the function return register. */
20123 if (CALL_P (insn))
20125 bool avx_reg256_found = false;
20126 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
20128 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
20131 /* Otherwise, return the current mode. Remember that if the insn
20132 references AVX 256bit registers, the mode was already changed
20133 to DIRTY from MODE_NEEDED. */
20134 return mode;
20137 /* Return the mode that an insn results in. */
20139 static int
20140 ix86_mode_after (int entity, int mode, rtx_insn *insn)
20142 switch (entity)
20144 case X86_DIRFLAG:
20145 return mode;
20146 case AVX_U128:
20147 return ix86_avx_u128_mode_after (mode, insn);
20148 case I387_TRUNC:
20149 case I387_FLOOR:
20150 case I387_CEIL:
20151 case I387_MASK_PM:
20152 return mode;
20153 default:
20154 gcc_unreachable ();
20158 static int
20159 ix86_dirflag_mode_entry (void)
20161 /* For TARGET_CLD or in the interrupt handler we can't assume
20162 the direction flag state at function entry. */
20163 if (TARGET_CLD
20164 || cfun->machine->func_type != TYPE_NORMAL)
20165 return X86_DIRFLAG_ANY;
20167 return X86_DIRFLAG_RESET;
20170 static int
20171 ix86_avx_u128_mode_entry (void)
20173 tree arg;
20175 /* Entry mode is set to AVX_U128_DIRTY if there are
20176 256bit modes used in function arguments. */
20177 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
20178 arg = TREE_CHAIN (arg))
20180 rtx incoming = DECL_INCOMING_RTL (arg);
20182 if (incoming && ix86_check_avx256_register (incoming))
20183 return AVX_U128_DIRTY;
20186 return AVX_U128_CLEAN;
20189 /* Return a mode that ENTITY is assumed to be
20190 switched to at function entry. */
20192 static int
20193 ix86_mode_entry (int entity)
20195 switch (entity)
20197 case X86_DIRFLAG:
20198 return ix86_dirflag_mode_entry ();
20199 case AVX_U128:
20200 return ix86_avx_u128_mode_entry ();
20201 case I387_TRUNC:
20202 case I387_FLOOR:
20203 case I387_CEIL:
20204 case I387_MASK_PM:
20205 return I387_CW_ANY;
20206 default:
20207 gcc_unreachable ();
20211 static int
20212 ix86_avx_u128_mode_exit (void)
20214 rtx reg = crtl->return_rtx;
20216 /* Exit mode is set to AVX_U128_DIRTY if there are
20217 256bit modes used in the function return register. */
20218 if (reg && ix86_check_avx256_register (reg))
20219 return AVX_U128_DIRTY;
20221 return AVX_U128_CLEAN;
20224 /* Return a mode that ENTITY is assumed to be
20225 switched to at function exit. */
20227 static int
20228 ix86_mode_exit (int entity)
20230 switch (entity)
20232 case X86_DIRFLAG:
20233 return X86_DIRFLAG_ANY;
20234 case AVX_U128:
20235 return ix86_avx_u128_mode_exit ();
20236 case I387_TRUNC:
20237 case I387_FLOOR:
20238 case I387_CEIL:
20239 case I387_MASK_PM:
20240 return I387_CW_ANY;
20241 default:
20242 gcc_unreachable ();
20246 static int
20247 ix86_mode_priority (int, int n)
20249 return n;
20252 /* Output code to initialize control word copies used by trunc?f?i and
20253 rounding patterns. CURRENT_MODE is set to the current control word,
20254 while NEW_MODE is set to the new control word. */
20256 static void
20257 emit_i387_cw_initialization (int mode)
20259 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
20260 rtx new_mode;
20262 enum ix86_stack_slot slot;
20264 rtx reg = gen_reg_rtx (HImode);
20266 emit_insn (gen_x86_fnstcw_1 (stored_mode));
20267 emit_move_insn (reg, copy_rtx (stored_mode));
20269 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
20270 || optimize_insn_for_size_p ())
20272 switch (mode)
20274 case I387_CW_TRUNC:
20275 /* round toward zero (truncate) */
20276 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
20277 slot = SLOT_CW_TRUNC;
20278 break;
20280 case I387_CW_FLOOR:
20281 /* round down toward -oo */
20282 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20283 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
20284 slot = SLOT_CW_FLOOR;
20285 break;
20287 case I387_CW_CEIL:
20288 /* round up toward +oo */
20289 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20290 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
20291 slot = SLOT_CW_CEIL;
20292 break;
20294 case I387_CW_MASK_PM:
20295 /* mask precision exception for nearbyint() */
20296 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20297 slot = SLOT_CW_MASK_PM;
20298 break;
20300 default:
20301 gcc_unreachable ();
20304 else
20306 switch (mode)
20308 case I387_CW_TRUNC:
20309 /* round toward zero (truncate) */
20310 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
20311 slot = SLOT_CW_TRUNC;
20312 break;
20314 case I387_CW_FLOOR:
20315 /* round down toward -oo */
20316 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
20317 slot = SLOT_CW_FLOOR;
20318 break;
20320 case I387_CW_CEIL:
20321 /* round up toward +oo */
20322 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
20323 slot = SLOT_CW_CEIL;
20324 break;
20326 case I387_CW_MASK_PM:
20327 /* mask precision exception for nearbyint() */
20328 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20329 slot = SLOT_CW_MASK_PM;
20330 break;
20332 default:
20333 gcc_unreachable ();
20337 gcc_assert (slot < MAX_386_STACK_LOCALS);
20339 new_mode = assign_386_stack_local (HImode, slot);
20340 emit_move_insn (new_mode, reg);
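To make the control-word bit manipulation above easier to follow, here is a small illustrative sketch (not part of i386.c; plain C, assuming only <stdint.h>) of how each variant is derived from the value saved by fnstcw: bits 11:10 hold the rounding-control field and bit 5 masks the precision exception.

#include <stdint.h>

/* Illustrative sketch only: mirrors the constants used above.  */
static uint16_t cw_trunc (uint16_t cw)   { return cw | 0x0c00; }              /* RC = 11: toward zero   */
static uint16_t cw_floor (uint16_t cw)   { return (cw & ~0x0c00) | 0x0400; }  /* RC = 01: toward -inf   */
static uint16_t cw_ceil (uint16_t cw)    { return (cw & ~0x0c00) | 0x0800; }  /* RC = 10: toward +inf   */
static uint16_t cw_mask_pm (uint16_t cw) { return cw | 0x0020; }              /* mask precision except. */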
20343 /* Emit vzeroupper. */
20345 void
20346 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
20348 int i;
20350 /* Cancel automatic vzeroupper insertion if there are
20351 live call-saved SSE registers at the insertion point. */
20353 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
20354 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20355 return;
20357 if (TARGET_64BIT)
20358 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
20359 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20360 return;
20362 emit_insn (gen_avx_vzeroupper ());
20367 /* Generate one or more insns to set ENTITY to MODE.  REGS_LIVE
20368 is the set of hard registers live at the point where the insn(s)
20369 are to be inserted. */
20371 static void
20372 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
20373 HARD_REG_SET regs_live)
20375 switch (entity)
20377 case X86_DIRFLAG:
20378 if (mode == X86_DIRFLAG_RESET)
20379 emit_insn (gen_cld ());
20380 break;
20381 case AVX_U128:
20382 if (mode == AVX_U128_CLEAN)
20383 ix86_avx_emit_vzeroupper (regs_live);
20384 break;
20385 case I387_TRUNC:
20386 case I387_FLOOR:
20387 case I387_CEIL:
20388 case I387_MASK_PM:
20389 if (mode != I387_CW_ANY
20390 && mode != I387_CW_UNINITIALIZED)
20391 emit_i387_cw_initialization (mode);
20392 break;
20393 default:
20394 gcc_unreachable ();
20398 /* Output code for INSN to convert a float to a signed int. OPERANDS
20399 are the insn operands. The output may be [HSD]Imode and the input
20400 operand may be [SDX]Fmode. */
20402 const char *
20403 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
20405 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20406 int dimode_p = GET_MODE (operands[0]) == DImode;
20407 int round_mode = get_attr_i387_cw (insn);
20409 /* Jump through a hoop or two for DImode, since the hardware has no
20410 non-popping instruction. We used to do this a different way, but
20411 that was somewhat fragile and broke with post-reload splitters. */
20412 if ((dimode_p || fisttp) && !stack_top_dies)
20413 output_asm_insn ("fld\t%y1", operands);
20415 gcc_assert (STACK_TOP_P (operands[1]));
20416 gcc_assert (MEM_P (operands[0]));
20417 gcc_assert (GET_MODE (operands[1]) != TFmode);
20419 if (fisttp)
20420 output_asm_insn ("fisttp%Z0\t%0", operands);
20421 else
20423 if (round_mode != I387_CW_ANY)
20424 output_asm_insn ("fldcw\t%3", operands);
20425 if (stack_top_dies || dimode_p)
20426 output_asm_insn ("fistp%Z0\t%0", operands);
20427 else
20428 output_asm_insn ("fist%Z0\t%0", operands);
20429 if (round_mode != I387_CW_ANY)
20430 output_asm_insn ("fldcw\t%2", operands);
20433 return "";
20436 /* Output code for x87 ffreep insn. The OPNO argument, which may only
20437 have the values zero or one, indicates the ffreep insn's operand
20438 from the OPERANDS array. */
20440 static const char *
20441 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
20443 if (TARGET_USE_FFREEP)
20444 #ifdef HAVE_AS_IX86_FFREEP
20445 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
20446 #else
20448 static char retval[32];
20449 int regno = REGNO (operands[opno]);
20451 gcc_assert (STACK_REGNO_P (regno));
20453 regno -= FIRST_STACK_REG;
20455 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
20456 return retval;
20458 #endif
20460 return opno ? "fstp\t%y1" : "fstp\t%y0";
20464 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
20465 should be used. UNORDERED_P is true when fucom should be used. */
20467 const char *
20468 output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p)
20470 int stack_top_dies;
20471 rtx cmp_op0, cmp_op1;
20472 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
20474 if (eflags_p)
20476 cmp_op0 = operands[0];
20477 cmp_op1 = operands[1];
20479 else
20481 cmp_op0 = operands[1];
20482 cmp_op1 = operands[2];
20485 if (is_sse)
20487 if (GET_MODE (operands[0]) == SFmode)
20488 if (unordered_p)
20489 return "%vucomiss\t{%1, %0|%0, %1}";
20490 else
20491 return "%vcomiss\t{%1, %0|%0, %1}";
20492 else
20493 if (unordered_p)
20494 return "%vucomisd\t{%1, %0|%0, %1}";
20495 else
20496 return "%vcomisd\t{%1, %0|%0, %1}";
20499 gcc_assert (STACK_TOP_P (cmp_op0));
20501 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20503 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
20505 if (stack_top_dies)
20507 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
20508 return output_387_ffreep (operands, 1);
20510 else
20511 return "ftst\n\tfnstsw\t%0";
20514 if (STACK_REG_P (cmp_op1)
20515 && stack_top_dies
20516 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
20517 && REGNO (cmp_op1) != FIRST_STACK_REG)
20519 /* If both the top of the 387 stack and the other operand (also a
20520 stack register) die, then this must be an `fcompp' float
20521 compare. */
20523 if (eflags_p)
20525 /* There is no double popping fcomi variant. Fortunately,
20526 eflags is immune from the fstp's cc clobbering. */
20527 if (unordered_p)
20528 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
20529 else
20530 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
20531 return output_387_ffreep (operands, 0);
20533 else
20535 if (unordered_p)
20536 return "fucompp\n\tfnstsw\t%0";
20537 else
20538 return "fcompp\n\tfnstsw\t%0";
20541 else
20543 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
20545 static const char * const alt[16] =
20547 "fcom%Z2\t%y2\n\tfnstsw\t%0",
20548 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
20549 "fucom%Z2\t%y2\n\tfnstsw\t%0",
20550 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
20552 "ficom%Z2\t%y2\n\tfnstsw\t%0",
20553 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
20554 NULL,
20555 NULL,
20557 "fcomi\t{%y1, %0|%0, %y1}",
20558 "fcomip\t{%y1, %0|%0, %y1}",
20559 "fucomi\t{%y1, %0|%0, %y1}",
20560 "fucomip\t{%y1, %0|%0, %y1}",
20562 NULL,
20563 NULL,
20564 NULL,
20565 NULL
20568 int mask;
20569 const char *ret;
20571 mask = eflags_p << 3;
20572 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
20573 mask |= unordered_p << 1;
20574 mask |= stack_top_dies;
20576 gcc_assert (mask < 16);
20577 ret = alt[mask];
20578 gcc_assert (ret);
20580 return ret;
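As a hedged worked example of the table lookup above (illustrative only, not taken from i386.c; the helper name is hypothetical): for an fcomi-style compare of two FP stack registers with unordered semantics where the stack top dies, the index is computed as follows.

/* Illustrative sketch only.  */
static int
fp_compare_alt_index (int eflags_p, int intmode, int unordered_p, int stack_top_dies)
{
  return (eflags_p << 3) | (intmode << 2) | (unordered_p << 1) | stack_top_dies;
}
/* fp_compare_alt_index (1, 0, 1, 1) == 11, which selects
   "fucomip\t{%y1, %0|%0, %y1}" from alt[].  */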
20584 void
20585 ix86_output_addr_vec_elt (FILE *file, int value)
20587 const char *directive = ASM_LONG;
20589 #ifdef ASM_QUAD
20590 if (TARGET_LP64)
20591 directive = ASM_QUAD;
20592 #else
20593 gcc_assert (!TARGET_64BIT);
20594 #endif
20596 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
20599 void
20600 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
20602 const char *directive = ASM_LONG;
20604 #ifdef ASM_QUAD
20605 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
20606 directive = ASM_QUAD;
20607 #else
20608 gcc_assert (!TARGET_64BIT);
20609 #endif
20610 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
20611 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
20612 fprintf (file, "%s%s%d-%s%d\n",
20613 directive, LPREFIX, value, LPREFIX, rel);
20614 else if (HAVE_AS_GOTOFF_IN_DATA)
20615 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
20616 #if TARGET_MACHO
20617 else if (TARGET_MACHO)
20619 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
20620 machopic_output_function_base_name (file);
20621 putc ('\n', file);
20623 #endif
20624 else
20625 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
20626 GOT_SYMBOL_NAME, LPREFIX, value);
20629 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
20630 for the target. */
20632 void
20633 ix86_expand_clear (rtx dest)
20635 rtx tmp;
20637 /* We play register width games, which are only valid after reload. */
20638 gcc_assert (reload_completed);
20640 /* Avoid HImode and its attendant prefix byte. */
20641 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
20642 dest = gen_rtx_REG (SImode, REGNO (dest));
20643 tmp = gen_rtx_SET (dest, const0_rtx);
20645 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
20647 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20648 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
20651 emit_insn (tmp);
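For orientation, an illustrative note (not part of i386.c) on the two forms this expands to for a 32-bit destination:

/* Illustrative note only:

       xorl  %eax, %eax      (2 bytes, clobbers EFLAGS)
       movl  $0, %eax        (5 bytes, leaves EFLAGS intact)

   The flags-clobbering xor form is used unless the target prefers mov $0
   (TARGET_USE_MOV0) and the insn is not being optimized for size.  */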
20654 /* X is an unchanging MEM. If it is a constant pool reference, return
20655 the constant pool rtx, else NULL. */
20658 maybe_get_pool_constant (rtx x)
20660 x = ix86_delegitimize_address (XEXP (x, 0));
20662 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
20663 return get_pool_constant (x);
20665 return NULL_RTX;
20668 void
20669 ix86_expand_move (machine_mode mode, rtx operands[])
20671 rtx op0, op1;
20672 rtx tmp, addend = NULL_RTX;
20673 enum tls_model model;
20675 op0 = operands[0];
20676 op1 = operands[1];
20678 switch (GET_CODE (op1))
20680 case CONST:
20681 tmp = XEXP (op1, 0);
20683 if (GET_CODE (tmp) != PLUS
20684 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
20685 break;
20687 op1 = XEXP (tmp, 0);
20688 addend = XEXP (tmp, 1);
20689 /* FALLTHRU */
20691 case SYMBOL_REF:
20692 model = SYMBOL_REF_TLS_MODEL (op1);
20694 if (model)
20695 op1 = legitimize_tls_address (op1, model, true);
20696 else if (ix86_force_load_from_GOT_p (op1))
20698 /* Load the external function address via GOT slot to avoid PLT. */
20699 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
20700 (TARGET_64BIT
20701 ? UNSPEC_GOTPCREL
20702 : UNSPEC_GOT));
20703 op1 = gen_rtx_CONST (Pmode, op1);
20704 op1 = gen_const_mem (Pmode, op1);
20705 set_mem_alias_set (op1, ix86_GOT_alias_set ());
20707 else
20709 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
20710 if (tmp)
20712 op1 = tmp;
20713 if (!addend)
20714 break;
20716 else
20718 op1 = operands[1];
20719 break;
20723 if (addend)
20725 op1 = force_operand (op1, NULL_RTX);
20726 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
20727 op0, 1, OPTAB_DIRECT);
20729 else
20730 op1 = force_operand (op1, op0);
20732 if (op1 == op0)
20733 return;
20735 op1 = convert_to_mode (mode, op1, 1);
20737 default:
20738 break;
20741 if ((flag_pic || MACHOPIC_INDIRECT)
20742 && symbolic_operand (op1, mode))
20744 if (TARGET_MACHO && !TARGET_64BIT)
20746 #if TARGET_MACHO
20747 /* dynamic-no-pic */
20748 if (MACHOPIC_INDIRECT)
20750 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
20751 ? op0 : gen_reg_rtx (Pmode);
20752 op1 = machopic_indirect_data_reference (op1, temp);
20753 if (MACHOPIC_PURE)
20754 op1 = machopic_legitimize_pic_address (op1, mode,
20755 temp == op1 ? 0 : temp);
20757 if (op0 != op1 && GET_CODE (op0) != MEM)
20759 rtx insn = gen_rtx_SET (op0, op1);
20760 emit_insn (insn);
20761 return;
20763 if (GET_CODE (op0) == MEM)
20764 op1 = force_reg (Pmode, op1);
20765 else
20767 rtx temp = op0;
20768 if (GET_CODE (temp) != REG)
20769 temp = gen_reg_rtx (Pmode);
20770 temp = legitimize_pic_address (op1, temp);
20771 if (temp == op0)
20772 return;
20773 op1 = temp;
20775 /* dynamic-no-pic */
20776 #endif
20778 else
20780 if (MEM_P (op0))
20781 op1 = force_reg (mode, op1);
20782 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
20784 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
20785 op1 = legitimize_pic_address (op1, reg);
20786 if (op0 == op1)
20787 return;
20788 op1 = convert_to_mode (mode, op1, 1);
20792 else
20794 if (MEM_P (op0)
20795 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
20796 || !push_operand (op0, mode))
20797 && MEM_P (op1))
20798 op1 = force_reg (mode, op1);
20800 if (push_operand (op0, mode)
20801 && ! general_no_elim_operand (op1, mode))
20802 op1 = copy_to_mode_reg (mode, op1);
20804 /* Force large constants in 64bit compilation into register
20805 to get them CSEed. */
20806 if (can_create_pseudo_p ()
20807 && (mode == DImode) && TARGET_64BIT
20808 && immediate_operand (op1, mode)
20809 && !x86_64_zext_immediate_operand (op1, VOIDmode)
20810 && !register_operand (op0, mode)
20811 && optimize)
20812 op1 = copy_to_mode_reg (mode, op1);
20814 if (can_create_pseudo_p ()
20815 && CONST_DOUBLE_P (op1))
20817 /* If we are loading a floating point constant to a register,
20818 force the value to memory now, since we'll get better code
20819 out the back end. */
20821 op1 = validize_mem (force_const_mem (mode, op1));
20822 if (!register_operand (op0, mode))
20824 rtx temp = gen_reg_rtx (mode);
20825 emit_insn (gen_rtx_SET (temp, op1));
20826 emit_move_insn (op0, temp);
20827 return;
20832 emit_insn (gen_rtx_SET (op0, op1));
20835 void
20836 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20838 rtx op0 = operands[0], op1 = operands[1];
20839 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
20840 psABI since the biggest alignment is 4 bytes for the IA MCU psABI. */
20841 unsigned int align = (TARGET_IAMCU
20842 ? GET_MODE_BITSIZE (mode)
20843 : GET_MODE_ALIGNMENT (mode));
20845 if (push_operand (op0, VOIDmode))
20846 op0 = emit_move_resolve_push (mode, op0);
20848 /* Force constants other than zero into memory. We do not know how
20849 the instructions used to build constants modify the upper 64 bits
20850 of the register; once we have that information we may be able
20851 to handle some of them more efficiently. */
20852 if (can_create_pseudo_p ()
20853 && (CONSTANT_P (op1)
20854 || (SUBREG_P (op1)
20855 && CONSTANT_P (SUBREG_REG (op1))))
20856 && ((register_operand (op0, mode)
20857 && !standard_sse_constant_p (op1, mode))
20858 /* ix86_expand_vector_move_misalign() does not like constants. */
20859 || (SSE_REG_MODE_P (mode)
20860 && MEM_P (op0)
20861 && MEM_ALIGN (op0) < align)))
20863 if (SUBREG_P (op1))
20865 machine_mode imode = GET_MODE (SUBREG_REG (op1));
20866 rtx r = force_const_mem (imode, SUBREG_REG (op1));
20867 if (r)
20868 r = validize_mem (r);
20869 else
20870 r = force_reg (imode, SUBREG_REG (op1));
20871 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20873 else
20874 op1 = validize_mem (force_const_mem (mode, op1));
20877 /* We need to check memory alignment for SSE mode since an attribute
20878 can make operands unaligned. */
20879 if (can_create_pseudo_p ()
20880 && SSE_REG_MODE_P (mode)
20881 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20882 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20884 rtx tmp[2];
20886 /* ix86_expand_vector_move_misalign() does not like both
20887 arguments in memory. */
20888 if (!register_operand (op0, mode)
20889 && !register_operand (op1, mode))
20890 op1 = force_reg (mode, op1);
20892 tmp[0] = op0; tmp[1] = op1;
20893 ix86_expand_vector_move_misalign (mode, tmp);
20894 return;
20897 /* Make operand1 a register if it isn't already. */
20898 if (can_create_pseudo_p ()
20899 && !register_operand (op0, mode)
20900 && !register_operand (op1, mode))
20902 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20903 return;
20906 emit_insn (gen_rtx_SET (op0, op1));
20909 /* Split 32-byte AVX unaligned load and store if needed. */
20911 static void
20912 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20914 rtx m;
20915 rtx (*extract) (rtx, rtx, rtx);
20916 machine_mode mode;
20918 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20919 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20921 emit_insn (gen_rtx_SET (op0, op1));
20922 return;
20925 rtx orig_op0 = NULL_RTX;
20926 mode = GET_MODE (op0);
20927 switch (GET_MODE_CLASS (mode))
20929 case MODE_VECTOR_INT:
20930 case MODE_INT:
20931 if (mode != V32QImode)
20933 if (!MEM_P (op0))
20935 orig_op0 = op0;
20936 op0 = gen_reg_rtx (V32QImode);
20938 else
20939 op0 = gen_lowpart (V32QImode, op0);
20940 op1 = gen_lowpart (V32QImode, op1);
20941 mode = V32QImode;
20943 break;
20944 case MODE_VECTOR_FLOAT:
20945 break;
20946 default:
20947 gcc_unreachable ();
20950 switch (mode)
20952 default:
20953 gcc_unreachable ();
20954 case V32QImode:
20955 extract = gen_avx_vextractf128v32qi;
20956 mode = V16QImode;
20957 break;
20958 case V8SFmode:
20959 extract = gen_avx_vextractf128v8sf;
20960 mode = V4SFmode;
20961 break;
20962 case V4DFmode:
20963 extract = gen_avx_vextractf128v4df;
20964 mode = V2DFmode;
20965 break;
20968 if (MEM_P (op1))
20970 rtx r = gen_reg_rtx (mode);
20971 m = adjust_address (op1, mode, 0);
20972 emit_move_insn (r, m);
20973 m = adjust_address (op1, mode, 16);
20974 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20975 emit_move_insn (op0, r);
20977 else if (MEM_P (op0))
20979 m = adjust_address (op0, mode, 0);
20980 emit_insn (extract (m, op1, const0_rtx));
20981 m = adjust_address (op0, mode, 16);
20982 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20984 else
20985 gcc_unreachable ();
20987 if (orig_op0)
20988 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
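A minimal sketch of the effect of this splitter, written with AVX intrinsics (illustrative only, not part of i386.c; assumes <immintrin.h> and compilation with -mavx): an unaligned 256-bit load becomes two 128-bit loads recombined with vinsertf128, which is what the VEC_CONCAT above ends up emitting.

#include <immintrin.h>

/* Illustrative sketch only: models the split unaligned 256-bit load.  */
static __m256i
load_256_split (const void *p)
{
  __m128i lo = _mm_loadu_si128 ((const __m128i *) p);       /* 16-byte load at p      */
  __m128i hi = _mm_loadu_si128 ((const __m128i *) p + 1);   /* 16-byte load at p + 16 */
  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (lo), hi, 1);
}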
20991 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
20992 straight to ix86_expand_vector_move. */
20993 /* Code generation for scalar reg-reg moves of single and double precision data:
20994 if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
20995 movaps reg, reg
20996 else
20997 movss reg, reg
20998 if (x86_sse_partial_reg_dependency == true)
20999 movapd reg, reg
21000 else
21001 movsd reg, reg
21003 Code generation for scalar loads of double precision data:
21004 if (x86_sse_split_regs == true)
21005 movlpd mem, reg (gas syntax)
21006 else
21007 movsd mem, reg
21009 Code generation for unaligned packed loads of single precision data
21010 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
21011 if (x86_sse_unaligned_move_optimal)
21012 movups mem, reg
21014 if (x86_sse_partial_reg_dependency == true)
21016 xorps reg, reg
21017 movlps mem, reg
21018 movhps mem+8, reg
21020 else
21022 movlps mem, reg
21023 movhps mem+8, reg
21026 Code generation for unaligned packed loads of double precision data
21027 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
21028 if (x86_sse_unaligned_move_optimal)
21029 movupd mem, reg
21031 if (x86_sse_split_regs == true)
21033 movlpd mem, reg
21034 movhpd mem+8, reg
21036 else
21038 movsd mem, reg
21039 movhpd mem+8, reg
21043 void
21044 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
21046 rtx op0, op1, m;
21048 op0 = operands[0];
21049 op1 = operands[1];
21051 /* Use unaligned load/store for AVX512 or when optimizing for size. */
21052 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
21054 emit_insn (gen_rtx_SET (op0, op1));
21055 return;
21058 if (TARGET_AVX)
21060 if (GET_MODE_SIZE (mode) == 32)
21061 ix86_avx256_split_vector_move_misalign (op0, op1);
21062 else
21063 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
21064 emit_insn (gen_rtx_SET (op0, op1));
21065 return;
21068 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
21069 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
21071 emit_insn (gen_rtx_SET (op0, op1));
21072 return;
21075 /* ??? If we have typed data, then it would appear that using
21076 movdqu is the only way to get unaligned data loaded with
21077 integer type. */
21078 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
21080 emit_insn (gen_rtx_SET (op0, op1));
21081 return;
21084 if (MEM_P (op1))
21086 if (TARGET_SSE2 && mode == V2DFmode)
21088 rtx zero;
21090 /* When SSE registers are split into halves, we can avoid
21091 writing to the top half twice. */
21092 if (TARGET_SSE_SPLIT_REGS)
21094 emit_clobber (op0);
21095 zero = op0;
21097 else
21099 /* ??? Not sure about the best option for the Intel chips.
21100 The following would seem to satisfy; the register is
21101 entirely cleared, breaking the dependency chain. We
21102 then store to the upper half, with a dependency depth
21103 of one. A rumor has it that Intel recommends two movsd
21104 followed by an unpacklpd, but this is unconfirmed. And
21105 given that the dependency depth of the unpacklpd would
21106 still be one, I'm not sure why this would be better. */
21107 zero = CONST0_RTX (V2DFmode);
21110 m = adjust_address (op1, DFmode, 0);
21111 emit_insn (gen_sse2_loadlpd (op0, zero, m));
21112 m = adjust_address (op1, DFmode, 8);
21113 emit_insn (gen_sse2_loadhpd (op0, op0, m));
21115 else
21117 rtx t;
21119 if (mode != V4SFmode)
21120 t = gen_reg_rtx (V4SFmode);
21121 else
21122 t = op0;
21124 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
21125 emit_move_insn (t, CONST0_RTX (V4SFmode));
21126 else
21127 emit_clobber (t);
21129 m = adjust_address (op1, V2SFmode, 0);
21130 emit_insn (gen_sse_loadlps (t, t, m));
21131 m = adjust_address (op1, V2SFmode, 8);
21132 emit_insn (gen_sse_loadhps (t, t, m));
21133 if (mode != V4SFmode)
21134 emit_move_insn (op0, gen_lowpart (mode, t));
21137 else if (MEM_P (op0))
21139 if (TARGET_SSE2 && mode == V2DFmode)
21141 m = adjust_address (op0, DFmode, 0);
21142 emit_insn (gen_sse2_storelpd (m, op1));
21143 m = adjust_address (op0, DFmode, 8);
21144 emit_insn (gen_sse2_storehpd (m, op1));
21146 else
21148 if (mode != V4SFmode)
21149 op1 = gen_lowpart (V4SFmode, op1);
21151 m = adjust_address (op0, V2SFmode, 0);
21152 emit_insn (gen_sse_storelps (m, op1));
21153 m = adjust_address (op0, V2SFmode, 8);
21154 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
21157 else
21158 gcc_unreachable ();
21161 /* Helper function of ix86_fixup_binary_operands to canonicalize
21162 operand order. Returns true if the operands should be swapped. */
21164 static bool
21165 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
21166 rtx operands[])
21168 rtx dst = operands[0];
21169 rtx src1 = operands[1];
21170 rtx src2 = operands[2];
21172 /* If the operation is not commutative, we can't do anything. */
21173 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
21174 return false;
21176 /* Highest priority is that src1 should match dst. */
21177 if (rtx_equal_p (dst, src1))
21178 return false;
21179 if (rtx_equal_p (dst, src2))
21180 return true;
21182 /* Next highest priority is that immediate constants come second. */
21183 if (immediate_operand (src2, mode))
21184 return false;
21185 if (immediate_operand (src1, mode))
21186 return true;
21188 /* Lowest priority is that memory references should come second. */
21189 if (MEM_P (src2))
21190 return false;
21191 if (MEM_P (src1))
21192 return true;
21194 return false;
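A few hedged examples of these priorities in action (illustrative only, not part of i386.c), for a commutative PLUS:

/* Illustrative examples only:

     (set r1 (plus r2 r1))              -> swap, so src1 matches the destination
     (set r1 (plus (const_int 5) r2))   -> swap, so the immediate comes second
     (set r1 (plus (mem m) r2))         -> swap, so the memory operand comes second  */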
21198 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
21199 destination to use for the operation. If different from the true
21200 destination in operands[0], a copy operation will be required. */
21203 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
21204 rtx operands[])
21206 rtx dst = operands[0];
21207 rtx src1 = operands[1];
21208 rtx src2 = operands[2];
21210 /* Canonicalize operand order. */
21211 if (ix86_swap_binary_operands_p (code, mode, operands))
21213 /* It is invalid to swap operands of different modes. */
21214 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
21216 std::swap (src1, src2);
21219 /* Both source operands cannot be in memory. */
21220 if (MEM_P (src1) && MEM_P (src2))
21222 /* Optimization: Only read from memory once. */
21223 if (rtx_equal_p (src1, src2))
21225 src2 = force_reg (mode, src2);
21226 src1 = src2;
21228 else if (rtx_equal_p (dst, src1))
21229 src2 = force_reg (mode, src2);
21230 else
21231 src1 = force_reg (mode, src1);
21234 /* If the destination is memory, and we do not have matching source
21235 operands, do things in registers. */
21236 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21237 dst = gen_reg_rtx (mode);
21239 /* Source 1 cannot be a constant. */
21240 if (CONSTANT_P (src1))
21241 src1 = force_reg (mode, src1);
21243 /* Source 1 cannot be a non-matching memory. */
21244 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21245 src1 = force_reg (mode, src1);
21247 /* Improve address combine. */
21248 if (code == PLUS
21249 && GET_MODE_CLASS (mode) == MODE_INT
21250 && MEM_P (src2))
21251 src2 = force_reg (mode, src2);
21253 operands[1] = src1;
21254 operands[2] = src2;
21255 return dst;
21258 /* Similarly, but assume that the destination has already been
21259 set up properly. */
21261 void
21262 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
21263 machine_mode mode, rtx operands[])
21265 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
21266 gcc_assert (dst == operands[0]);
21269 /* Attempt to expand a binary operator. Make the expansion closer to the
21270 actual machine than just general_operand, which will allow 3 separate
21271 memory references (one output, two input) in a single insn. */
21273 void
21274 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
21275 rtx operands[])
21277 rtx src1, src2, dst, op, clob;
21279 dst = ix86_fixup_binary_operands (code, mode, operands);
21280 src1 = operands[1];
21281 src2 = operands[2];
21283 /* Emit the instruction. */
21285 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
21287 if (reload_completed
21288 && code == PLUS
21289 && !rtx_equal_p (dst, src1))
21291 /* This is going to be an LEA; avoid splitting it later. */
21292 emit_insn (op);
21294 else
21296 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21297 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21300 /* Fix up the destination if needed. */
21301 if (dst != operands[0])
21302 emit_move_insn (operands[0], dst);
21305 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
21306 the given OPERANDS. */
21308 void
21309 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
21310 rtx operands[])
21312 rtx op1 = NULL_RTX, op2 = NULL_RTX;
21313 if (SUBREG_P (operands[1]))
21315 op1 = operands[1];
21316 op2 = operands[2];
21318 else if (SUBREG_P (operands[2]))
21320 op1 = operands[2];
21321 op2 = operands[1];
21323 /* Optimize (__m128i) d | (__m128i) e and similar code
21324 when d and e are float vectors into float vector logical
21325 insn. In C/C++ without using intrinsics there is no other way
21326 to express vector logical operation on float vectors than
21327 to cast them temporarily to integer vectors. */
21328 if (op1
21329 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
21330 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
21331 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
21332 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
21333 && SUBREG_BYTE (op1) == 0
21334 && (GET_CODE (op2) == CONST_VECTOR
21335 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
21336 && SUBREG_BYTE (op2) == 0))
21337 && can_create_pseudo_p ())
21339 rtx dst;
21340 switch (GET_MODE (SUBREG_REG (op1)))
21342 case V4SFmode:
21343 case V8SFmode:
21344 case V16SFmode:
21345 case V2DFmode:
21346 case V4DFmode:
21347 case V8DFmode:
21348 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
21349 if (GET_CODE (op2) == CONST_VECTOR)
21351 op2 = gen_lowpart (GET_MODE (dst), op2);
21352 op2 = force_reg (GET_MODE (dst), op2);
21354 else
21356 op1 = operands[1];
21357 op2 = SUBREG_REG (operands[2]);
21358 if (!vector_operand (op2, GET_MODE (dst)))
21359 op2 = force_reg (GET_MODE (dst), op2);
21361 op1 = SUBREG_REG (op1);
21362 if (!vector_operand (op1, GET_MODE (dst)))
21363 op1 = force_reg (GET_MODE (dst), op1);
21364 emit_insn (gen_rtx_SET (dst,
21365 gen_rtx_fmt_ee (code, GET_MODE (dst),
21366 op1, op2)));
21367 emit_move_insn (operands[0], gen_lowpart (mode, dst));
21368 return;
21369 default:
21370 break;
21373 if (!vector_operand (operands[1], mode))
21374 operands[1] = force_reg (mode, operands[1]);
21375 if (!vector_operand (operands[2], mode))
21376 operands[2] = force_reg (mode, operands[2]);
21377 ix86_fixup_binary_operands_no_copy (code, mode, operands);
21378 emit_insn (gen_rtx_SET (operands[0],
21379 gen_rtx_fmt_ee (code, mode, operands[1],
21380 operands[2])));
21383 /* Return TRUE or FALSE depending on whether the binary operator meets the
21384 appropriate constraints. */
21386 bool
21387 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
21388 rtx operands[3])
21390 rtx dst = operands[0];
21391 rtx src1 = operands[1];
21392 rtx src2 = operands[2];
21394 /* Both source operands cannot be in memory. */
21395 if (MEM_P (src1) && MEM_P (src2))
21396 return false;
21398 /* Canonicalize operand order for commutative operators. */
21399 if (ix86_swap_binary_operands_p (code, mode, operands))
21400 std::swap (src1, src2);
21402 /* If the destination is memory, we must have a matching source operand. */
21403 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21404 return false;
21406 /* Source 1 cannot be a constant. */
21407 if (CONSTANT_P (src1))
21408 return false;
21410 /* Source 1 cannot be a non-matching memory. */
21411 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21412 /* Support "andhi/andsi/anddi" as a zero-extending move. */
21413 return (code == AND
21414 && (mode == HImode
21415 || mode == SImode
21416 || (TARGET_64BIT && mode == DImode))
21417 && satisfies_constraint_L (src2));
21419 return true;
21422 /* Attempt to expand a unary operator. Make the expansion closer to the
21423 actual machine than just general_operand, which will allow 2 separate
21424 memory references (one output, one input) in a single insn. */
21426 void
21427 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
21428 rtx operands[])
21430 bool matching_memory = false;
21431 rtx src, dst, op, clob;
21433 dst = operands[0];
21434 src = operands[1];
21436 /* If the destination is memory, and we do not have matching source
21437 operands, do things in registers. */
21438 if (MEM_P (dst))
21440 if (rtx_equal_p (dst, src))
21441 matching_memory = true;
21442 else
21443 dst = gen_reg_rtx (mode);
21446 /* When source operand is memory, destination must match. */
21447 if (MEM_P (src) && !matching_memory)
21448 src = force_reg (mode, src);
21450 /* Emit the instruction. */
21452 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
21454 if (code == NOT)
21455 emit_insn (op);
21456 else
21458 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21459 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21462 /* Fix up the destination if needed. */
21463 if (dst != operands[0])
21464 emit_move_insn (operands[0], dst);
21467 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
21468 divisor are within the range [0-255]. */
21470 void
21471 ix86_split_idivmod (machine_mode mode, rtx operands[],
21472 bool signed_p)
21474 rtx_code_label *end_label, *qimode_label;
21475 rtx div, mod;
21476 rtx_insn *insn;
21477 rtx scratch, tmp0, tmp1, tmp2;
21478 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
21479 rtx (*gen_zero_extend) (rtx, rtx);
21480 rtx (*gen_test_ccno_1) (rtx, rtx);
21482 switch (mode)
21484 case SImode:
21485 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
21486 gen_test_ccno_1 = gen_testsi_ccno_1;
21487 gen_zero_extend = gen_zero_extendqisi2;
21488 break;
21489 case DImode:
21490 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
21491 gen_test_ccno_1 = gen_testdi_ccno_1;
21492 gen_zero_extend = gen_zero_extendqidi2;
21493 break;
21494 default:
21495 gcc_unreachable ();
21498 end_label = gen_label_rtx ();
21499 qimode_label = gen_label_rtx ();
21501 scratch = gen_reg_rtx (mode);
21503 /* Use 8bit unsigned divmod if dividend and divisor are within
21504 the range [0-255]. */
21505 emit_move_insn (scratch, operands[2]);
21506 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
21507 scratch, 1, OPTAB_DIRECT);
21508 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
21509 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
21510 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
21511 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
21512 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
21513 pc_rtx);
21514 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
21515 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21516 JUMP_LABEL (insn) = qimode_label;
21518 /* Generate original signed/unsigned divmod. */
21519 div = gen_divmod4_1 (operands[0], operands[1],
21520 operands[2], operands[3]);
21521 emit_insn (div);
21523 /* Branch to the end. */
21524 emit_jump_insn (gen_jump (end_label));
21525 emit_barrier ();
21527 /* Generate 8bit unsigned divide. */
21528 emit_label (qimode_label);
21529 /* Don't use operands[0] for result of 8bit divide since not all
21530 registers support QImode ZERO_EXTRACT. */
21531 tmp0 = lowpart_subreg (HImode, scratch, mode);
21532 tmp1 = lowpart_subreg (HImode, operands[2], mode);
21533 tmp2 = lowpart_subreg (QImode, operands[3], mode);
21534 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
21536 if (signed_p)
21538 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
21539 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
21541 else
21543 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
21544 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
21547 /* Extract remainder from AH. */
21548 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
21549 if (REG_P (operands[1]))
21550 insn = emit_move_insn (operands[1], tmp1);
21551 else
21553 /* Need a new scratch register since the old one has result
21554 of 8bit divide. */
21555 scratch = gen_reg_rtx (mode);
21556 emit_move_insn (scratch, tmp1);
21557 insn = emit_move_insn (operands[1], scratch);
21559 set_unique_reg_note (insn, REG_EQUAL, mod);
21561 /* Zero extend quotient from AL. */
21562 tmp1 = gen_lowpart (QImode, tmp0);
21563 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
21564 set_unique_reg_note (insn, REG_EQUAL, div);
21566 emit_label (end_label);
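The transformation above can be summarized by a small C model (an illustrative sketch only, not part of i386.c; shown for the unsigned 32-bit case):

#include <stdint.h>

/* Illustrative sketch only: if both operands fit in 8 bits, a single
   16-by-8 bit DIV yields the quotient in AL and the remainder in AH.  */
static void
divmod_u32 (uint32_t a, uint32_t b, uint32_t *q, uint32_t *r)
{
  if ((a | b) < 0x100)
    {
      *q = (uint8_t) a / (uint8_t) b;
      *r = (uint8_t) a % (uint8_t) b;
    }
  else
    {
      *q = a / b;
      *r = a % b;
    }
}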
21569 #define LEA_MAX_STALL (3)
21570 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
21572 /* Increase given DISTANCE in half-cycles according to
21573 dependencies between PREV and NEXT instructions.
21574 Add 1 half-cycle if there is no dependency and
21575 go to the next cycle if there is some dependency. */
21577 static unsigned int
21578 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
21580 df_ref def, use;
21582 if (!prev || !next)
21583 return distance + (distance & 1) + 2;
21585 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
21586 return distance + 1;
21588 FOR_EACH_INSN_USE (use, next)
21589 FOR_EACH_INSN_DEF (def, prev)
21590 if (!DF_REF_IS_ARTIFICIAL (def)
21591 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
21592 return distance + (distance & 1) + 2;
21594 return distance + 1;
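A short worked example of the arithmetic above (illustrative only, not part of i386.c):

/* Illustrative note only: with a dependency the half-cycle count is rounded
   up to a whole cycle and one more cycle is added, e.g. 3 -> 3 + 1 + 2 = 6
   and 4 -> 4 + 0 + 2 = 6 half-cycles; without a dependency only a single
   half-cycle is added.  */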
21597 /* Function checks if instruction INSN defines register number
21598 REGNO1 or REGNO2. */
21600 static bool
21601 insn_defines_reg (unsigned int regno1, unsigned int regno2,
21602 rtx_insn *insn)
21604 df_ref def;
21606 FOR_EACH_INSN_DEF (def, insn)
21607 if (DF_REF_REG_DEF_P (def)
21608 && !DF_REF_IS_ARTIFICIAL (def)
21609 && (regno1 == DF_REF_REGNO (def)
21610 || regno2 == DF_REF_REGNO (def)))
21611 return true;
21613 return false;
21616 /* Function checks if instruction INSN uses register number
21617 REGNO as a part of address expression. */
21619 static bool
21620 insn_uses_reg_mem (unsigned int regno, rtx insn)
21622 df_ref use;
21624 FOR_EACH_INSN_USE (use, insn)
21625 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
21626 return true;
21628 return false;
21631 /* Search backward for non-agu definition of register number REGNO1
21632 or register number REGNO2 in basic block starting from instruction
21633 START up to head of basic block or instruction INSN.
21635 Function puts true value into *FOUND var if definition was found
21636 and false otherwise.
21638 Distance in half-cycles between START and found instruction or head
21639 of BB is added to DISTANCE and returned. */
21641 static int
21642 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
21643 rtx_insn *insn, int distance,
21644 rtx_insn *start, bool *found)
21646 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
21647 rtx_insn *prev = start;
21648 rtx_insn *next = NULL;
21650 *found = false;
21652 while (prev
21653 && prev != insn
21654 && distance < LEA_SEARCH_THRESHOLD)
21656 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
21658 distance = increase_distance (prev, next, distance);
21659 if (insn_defines_reg (regno1, regno2, prev))
21661 if (recog_memoized (prev) < 0
21662 || get_attr_type (prev) != TYPE_LEA)
21664 *found = true;
21665 return distance;
21669 next = prev;
21671 if (prev == BB_HEAD (bb))
21672 break;
21674 prev = PREV_INSN (prev);
21677 return distance;
21680 /* Search backward for non-agu definition of register number REGNO1
21681 or register number REGNO2 in INSN's basic block until
21682 1. Pass LEA_SEARCH_THRESHOLD instructions, or
21683 2. Reach neighbor BBs boundary, or
21684 3. Reach agu definition.
21685 Returns the distance between the non-agu definition point and INSN.
21686 If no definition point, returns -1. */
21688 static int
21689 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
21690 rtx_insn *insn)
21692 basic_block bb = BLOCK_FOR_INSN (insn);
21693 int distance = 0;
21694 bool found = false;
21696 if (insn != BB_HEAD (bb))
21697 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
21698 distance, PREV_INSN (insn),
21699 &found);
21701 if (!found && distance < LEA_SEARCH_THRESHOLD)
21703 edge e;
21704 edge_iterator ei;
21705 bool simple_loop = false;
21707 FOR_EACH_EDGE (e, ei, bb->preds)
21708 if (e->src == bb)
21710 simple_loop = true;
21711 break;
21714 if (simple_loop)
21715 distance = distance_non_agu_define_in_bb (regno1, regno2,
21716 insn, distance,
21717 BB_END (bb), &found);
21718 else
21720 int shortest_dist = -1;
21721 bool found_in_bb = false;
21723 FOR_EACH_EDGE (e, ei, bb->preds)
21725 int bb_dist
21726 = distance_non_agu_define_in_bb (regno1, regno2,
21727 insn, distance,
21728 BB_END (e->src),
21729 &found_in_bb);
21730 if (found_in_bb)
21732 if (shortest_dist < 0)
21733 shortest_dist = bb_dist;
21734 else if (bb_dist > 0)
21735 shortest_dist = MIN (bb_dist, shortest_dist);
21737 found = true;
21741 distance = shortest_dist;
21745 /* get_attr_type may modify recog data. We want to make sure
21746 that recog data is valid for instruction INSN, on which
21747 distance_non_agu_define is called. INSN is unchanged here. */
21748 extract_insn_cached (insn);
21750 if (!found)
21751 return -1;
21753 return distance >> 1;
21756 /* Return the distance in half-cycles between INSN and the next
21757 insn that uses register number REGNO in a memory address, added
21758 to DISTANCE. Return -1 if REGNO is set.
21760 Put true value into *FOUND if register usage was found and
21761 false otherwise.
21762 Put true value into *REDEFINED if register redefinition was
21763 found and false otherwise. */
21765 static int
21766 distance_agu_use_in_bb (unsigned int regno,
21767 rtx_insn *insn, int distance, rtx_insn *start,
21768 bool *found, bool *redefined)
21770 basic_block bb = NULL;
21771 rtx_insn *next = start;
21772 rtx_insn *prev = NULL;
21774 *found = false;
21775 *redefined = false;
21777 if (start != NULL_RTX)
21779 bb = BLOCK_FOR_INSN (start);
21780 if (start != BB_HEAD (bb))
21781 /* If insn and start belong to the same bb, set prev to insn,
21782 so the call to increase_distance will increase the distance
21783 between insns by 1. */
21784 prev = insn;
21787 while (next
21788 && next != insn
21789 && distance < LEA_SEARCH_THRESHOLD)
21791 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
21793 distance = increase_distance(prev, next, distance);
21794 if (insn_uses_reg_mem (regno, next))
21796 /* Return DISTANCE if OP0 is used in memory
21797 address in NEXT. */
21798 *found = true;
21799 return distance;
21802 if (insn_defines_reg (regno, INVALID_REGNUM, next))
21804 /* Return -1 if OP0 is set in NEXT. */
21805 *redefined = true;
21806 return -1;
21809 prev = next;
21812 if (next == BB_END (bb))
21813 break;
21815 next = NEXT_INSN (next);
21818 return distance;
21821 /* Return the distance between INSN and the next insn that uses
21822 register number REGNO0 in a memory address. Return -1 if no such
21823 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
21825 static int
21826 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21828 basic_block bb = BLOCK_FOR_INSN (insn);
21829 int distance = 0;
21830 bool found = false;
21831 bool redefined = false;
21833 if (insn != BB_END (bb))
21834 distance = distance_agu_use_in_bb (regno0, insn, distance,
21835 NEXT_INSN (insn),
21836 &found, &redefined);
21838 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21840 edge e;
21841 edge_iterator ei;
21842 bool simple_loop = false;
21844 FOR_EACH_EDGE (e, ei, bb->succs)
21845 if (e->dest == bb)
21847 simple_loop = true;
21848 break;
21851 if (simple_loop)
21852 distance = distance_agu_use_in_bb (regno0, insn,
21853 distance, BB_HEAD (bb),
21854 &found, &redefined);
21855 else
21857 int shortest_dist = -1;
21858 bool found_in_bb = false;
21859 bool redefined_in_bb = false;
21861 FOR_EACH_EDGE (e, ei, bb->succs)
21863 int bb_dist
21864 = distance_agu_use_in_bb (regno0, insn,
21865 distance, BB_HEAD (e->dest),
21866 &found_in_bb, &redefined_in_bb);
21867 if (found_in_bb)
21869 if (shortest_dist < 0)
21870 shortest_dist = bb_dist;
21871 else if (bb_dist > 0)
21872 shortest_dist = MIN (bb_dist, shortest_dist);
21874 found = true;
21878 distance = shortest_dist;
21882 if (!found || redefined)
21883 return -1;
21885 return distance >> 1;
21888 /* Define this macro to tune LEA priority vs ADD; it takes effect when
21889 there is a dilemma of choosing LEA or ADD.
21890 Negative value: ADD is preferred over LEA
21891 Zero: Neutral
21892 Positive value: LEA is preferred over ADD */
21893 #define IX86_LEA_PRIORITY 0
21895 /* Return true if use of the lea INSN has a performance advantage
21896 over a sequence of instructions. The instruction sequence has
21897 SPLIT_COST cycles higher latency than the lea itself. */
21899 static bool
21900 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21901 unsigned int regno2, int split_cost, bool has_scale)
21903 int dist_define, dist_use;
21905 /* For Silvermont, if a 2-source or 3-source LEA is used for its
21906 non-destructive destination, or because the ability to use a
21907 SCALE is wanted, the use of LEA is justified. */
21908 if (TARGET_SILVERMONT || TARGET_INTEL)
21910 if (has_scale)
21911 return true;
21912 if (split_cost < 1)
21913 return false;
21914 if (regno0 == regno1 || regno0 == regno2)
21915 return false;
21916 return true;
21919 dist_define = distance_non_agu_define (regno1, regno2, insn);
21920 dist_use = distance_agu_use (regno0, insn);
21922 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21924 /* If there is no non-AGU operand definition, no AGU
21925 operand usage and the split cost is 0, then both the lea
21926 and non-lea variants have the same priority. Currently
21927 we prefer lea for 64-bit code and non-lea for 32-bit
21928 code. */
21929 if (dist_use < 0 && split_cost == 0)
21930 return TARGET_64BIT || IX86_LEA_PRIORITY;
21931 else
21932 return true;
21935 /* With a longer definition distance, lea is preferable.
21936 Here we change it to take into account splitting cost and
21937 lea priority. */
21938 dist_define += split_cost + IX86_LEA_PRIORITY;
21940 /* If there is no use in a memory address then we just check
21941 that the split cost exceeds the AGU stall. */
21942 if (dist_use < 0)
21943 return dist_define > LEA_MAX_STALL;
21945 /* If this insn has both backward non-agu dependence and forward
21946 agu dependence, the one with short distance takes effect. */
21947 return dist_define >= dist_use;
21950 /* Return true if it is legal to clobber flags by INSN and
21951 false otherwise. */
21953 static bool
21954 ix86_ok_to_clobber_flags (rtx_insn *insn)
21956 basic_block bb = BLOCK_FOR_INSN (insn);
21957 df_ref use;
21958 bitmap live;
21960 while (insn)
21962 if (NONDEBUG_INSN_P (insn))
21964 FOR_EACH_INSN_USE (use, insn)
21965 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21966 return false;
21968 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21969 return true;
21972 if (insn == BB_END (bb))
21973 break;
21975 insn = NEXT_INSN (insn);
21978 live = df_get_live_out(bb);
21979 return !REGNO_REG_SET_P (live, FLAGS_REG);
21982 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21983 move and add to avoid AGU stalls. */
21985 bool
21986 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21988 unsigned int regno0, regno1, regno2;
21990 /* Check if we need to optimize. */
21991 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21992 return false;
21994 /* Check it is correct to split here. */
21995 if (!ix86_ok_to_clobber_flags(insn))
21996 return false;
21998 regno0 = true_regnum (operands[0]);
21999 regno1 = true_regnum (operands[1]);
22000 regno2 = true_regnum (operands[2]);
22002 /* We need to split only adds with a non-destructive
22003 destination operand. */
22004 if (regno0 == regno1 || regno0 == regno2)
22005 return false;
22006 else
22007 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
22010 /* Return true if we should emit lea instruction instead of mov
22011 instruction. */
22013 bool
22014 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
22016 unsigned int regno0, regno1;
22018 /* Check if we need to optimize. */
22019 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22020 return false;
22022 /* Use lea for reg to reg moves only. */
22023 if (!REG_P (operands[0]) || !REG_P (operands[1]))
22024 return false;
22026 regno0 = true_regnum (operands[0]);
22027 regno1 = true_regnum (operands[1]);
22029 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
22032 /* Return true if we need to split lea into a sequence of
22033 instructions to avoid AGU stalls. */
22035 bool
22036 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
22038 unsigned int regno0, regno1, regno2;
22039 int split_cost;
22040 struct ix86_address parts;
22041 int ok;
22043 /* Check we need to optimize. */
22044 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
22045 return false;
22047 /* The "at least two components" test below might not catch simple
22048 move or zero extension insns if parts.base is non-NULL and parts.disp
22049 is const0_rtx as the only components in the address, e.g. if the
22050 register is %rbp or %r13. As this test is much cheaper and moves or
22051 zero extensions are the common case, do this check first. */
22052 if (REG_P (operands[1])
22053 || (SImode_address_operand (operands[1], VOIDmode)
22054 && REG_P (XEXP (operands[1], 0))))
22055 return false;
22057 /* Check if it is OK to split here. */
22058 if (!ix86_ok_to_clobber_flags (insn))
22059 return false;
22061 ok = ix86_decompose_address (operands[1], &parts);
22062 gcc_assert (ok);
22064 /* There should be at least two components in the address. */
22065 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
22066 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
22067 return false;
22069 /* We should not split into add if a non-legitimate PIC
22070 operand is used as the displacement. */
22071 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
22072 return false;
22074 regno0 = true_regnum (operands[0]) ;
22075 regno1 = INVALID_REGNUM;
22076 regno2 = INVALID_REGNUM;
22078 if (parts.base)
22079 regno1 = true_regnum (parts.base);
22080 if (parts.index)
22081 regno2 = true_regnum (parts.index);
22083 split_cost = 0;
22085 /* Compute how many cycles we will add to execution time
22086 if we split the lea into a sequence of instructions. */
22087 if (parts.base || parts.index)
22089 /* Have to use a mov instruction if the non-destructive
22090 destination form is used. */
22091 if (regno1 != regno0 && regno2 != regno0)
22092 split_cost += 1;
22094 /* Have to add index to base if both exist. */
22095 if (parts.base && parts.index)
22096 split_cost += 1;
22098 /* Have to use shift and adds if scale is 2 or greater. */
22099 if (parts.scale > 1)
22101 if (regno0 != regno1)
22102 split_cost += 1;
22103 else if (regno2 == regno0)
22104 split_cost += 4;
22105 else
22106 split_cost += parts.scale;
22109 /* Have to use add instruction with immediate if
22110 disp is non zero. */
22111 if (parts.disp && parts.disp != const0_rtx)
22112 split_cost += 1;
22114 /* Subtract the price of lea. */
22115 split_cost -= 1;
22118 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
22119 parts.scale > 1);
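A hedged worked example of the split-cost computation above (illustrative only, not part of i386.c):

/* Illustrative example only: for lea 8(%rbx,%rcx,2), %rax with all three
   registers distinct, the split sequence costs

       +1  mov (non-destructive destination)
       +1  add of index to base
       +1  shift (scale > 1, destination != base)
       +1  add of the displacement
       -1  the lea being replaced
       ---
        3  cycles,

   which ix86_lea_outperforms then weighs against the expected AGU stall.  */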
22122 /* Emit x86 binary operand CODE in mode MODE, where the first operand
22123 matches destination. RTX includes clobber of FLAGS_REG. */
22125 static void
22126 ix86_emit_binop (enum rtx_code code, machine_mode mode,
22127 rtx dst, rtx src)
22129 rtx op, clob;
22131 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
22132 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22134 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
22137 /* Return true if regno1 def is nearest to the insn. */
22139 static bool
22140 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
22142 rtx_insn *prev = insn;
22143 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
22145 if (insn == start)
22146 return false;
22147 while (prev && prev != start)
22149 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
22151 prev = PREV_INSN (prev);
22152 continue;
22154 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
22155 return true;
22156 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
22157 return false;
22158 prev = PREV_INSN (prev);
22161 /* None of the regs is defined in the bb. */
22162 return false;
22165 /* Split lea instructions into a sequence of instructions
22166 which are executed on ALU to avoid AGU stalls.
22167 It is assumed that it is allowed to clobber flags register
22168 at lea position. */
22170 void
22171 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
22173 unsigned int regno0, regno1, regno2;
22174 struct ix86_address parts;
22175 rtx target, tmp;
22176 int ok, adds;
22178 ok = ix86_decompose_address (operands[1], &parts);
22179 gcc_assert (ok);
22181 target = gen_lowpart (mode, operands[0]);
22183 regno0 = true_regnum (target);
22184 regno1 = INVALID_REGNUM;
22185 regno2 = INVALID_REGNUM;
22187 if (parts.base)
22189 parts.base = gen_lowpart (mode, parts.base);
22190 regno1 = true_regnum (parts.base);
22193 if (parts.index)
22195 parts.index = gen_lowpart (mode, parts.index);
22196 regno2 = true_regnum (parts.index);
22199 if (parts.disp)
22200 parts.disp = gen_lowpart (mode, parts.disp);
22202 if (parts.scale > 1)
22204 /* Case r1 = r1 + ... */
22205 if (regno1 == regno0)
22207 /* If we have a case like r1 = r1 + C * r2 then we
22208 would have to use multiplication, which is very
22209 expensive. Assume the cost model is wrong if we
22210 have such a case here. */
22211 gcc_assert (regno2 != regno0);
22213 for (adds = parts.scale; adds > 0; adds--)
22214 ix86_emit_binop (PLUS, mode, target, parts.index);
22216 else
22218 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
22219 if (regno0 != regno2)
22220 emit_insn (gen_rtx_SET (target, parts.index));
22222 /* Use shift for scaling. */
22223 ix86_emit_binop (ASHIFT, mode, target,
22224 GEN_INT (exact_log2 (parts.scale)));
22226 if (parts.base)
22227 ix86_emit_binop (PLUS, mode, target, parts.base);
22229 if (parts.disp && parts.disp != const0_rtx)
22230 ix86_emit_binop (PLUS, mode, target, parts.disp);
22233 else if (!parts.base && !parts.index)
22235 gcc_assert(parts.disp);
22236 emit_insn (gen_rtx_SET (target, parts.disp));
22238 else
22240 if (!parts.base)
22242 if (regno0 != regno2)
22243 emit_insn (gen_rtx_SET (target, parts.index));
22245 else if (!parts.index)
22247 if (regno0 != regno1)
22248 emit_insn (gen_rtx_SET (target, parts.base));
22250 else
22252 if (regno0 == regno1)
22253 tmp = parts.index;
22254 else if (regno0 == regno2)
22255 tmp = parts.base;
22256 else
22258 rtx tmp1;
22260 /* Find better operand for SET instruction, depending
22261 on which definition is farther from the insn. */
22262 if (find_nearest_reg_def (insn, regno1, regno2))
22263 tmp = parts.index, tmp1 = parts.base;
22264 else
22265 tmp = parts.base, tmp1 = parts.index;
22267 emit_insn (gen_rtx_SET (target, tmp));
22269 if (parts.disp && parts.disp != const0_rtx)
22270 ix86_emit_binop (PLUS, mode, target, parts.disp);
22272 ix86_emit_binop (PLUS, mode, target, tmp1);
22273 return;
22276 ix86_emit_binop (PLUS, mode, target, tmp);
22279 if (parts.disp && parts.disp != const0_rtx)
22280 ix86_emit_binop (PLUS, mode, target, parts.disp);
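As an illustrative example of the resulting sequence (not part of i386.c): for an address %rbx + %rcx*4 + 8 with a destination %rax distinct from both inputs, the code above emits roughly the following.

/* Illustrative example only:

       movq  %rcx, %rax       # target = index
       salq  $2, %rax         # shift by log2 (scale)
       addq  %rbx, %rax       # add the base
       addq  $8, %rax         # add the displacement  */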
22284 /* Return true if it is ok to optimize an ADD operation to LEA
22285 operation to avoid flag register consumption. For most processors,
22286 ADD is faster than LEA. For processors like BONNELL, if the
22287 destination register of the LEA holds an actual address which will be
22288 used soon, LEA is better; otherwise ADD is better. */
22290 bool
22291 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
22293 unsigned int regno0 = true_regnum (operands[0]);
22294 unsigned int regno1 = true_regnum (operands[1]);
22295 unsigned int regno2 = true_regnum (operands[2]);
22297 /* If a = b + c, (a!=b && a!=c), must use lea form. */
22298 if (regno0 != regno1 && regno0 != regno2)
22299 return true;
22301 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22302 return false;
22304 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
22307 /* Return true if destination reg of SET_BODY is shift count of
22308 USE_BODY. */
22310 static bool
22311 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
22313 rtx set_dest;
22314 rtx shift_rtx;
22315 int i;
22317 /* Retrieve destination of SET_BODY. */
22318 switch (GET_CODE (set_body))
22320 case SET:
22321 set_dest = SET_DEST (set_body);
22322 if (!set_dest || !REG_P (set_dest))
22323 return false;
22324 break;
22325 case PARALLEL:
22326 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
22327 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
22328 use_body))
22329 return true;
22330 /* FALLTHROUGH */
22331 default:
22332 return false;
22335 /* Retrieve shift count of USE_BODY. */
22336 switch (GET_CODE (use_body))
22338 case SET:
22339 shift_rtx = XEXP (use_body, 1);
22340 break;
22341 case PARALLEL:
22342 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
22343 if (ix86_dep_by_shift_count_body (set_body,
22344 XVECEXP (use_body, 0, i)))
22345 return true;
22346 /* FALLTHROUGH */
22347 default:
22348 return false;
22351 if (shift_rtx
22352 && (GET_CODE (shift_rtx) == ASHIFT
22353 || GET_CODE (shift_rtx) == LSHIFTRT
22354 || GET_CODE (shift_rtx) == ASHIFTRT
22355 || GET_CODE (shift_rtx) == ROTATE
22356 || GET_CODE (shift_rtx) == ROTATERT))
22358 rtx shift_count = XEXP (shift_rtx, 1);
22360 /* Return true if shift count is dest of SET_BODY. */
22361 if (REG_P (shift_count))
22363 /* Add this check since the function can be invoked before
22364 register allocation by the pre-reload scheduler. */
22365 if (reload_completed
22366 && true_regnum (set_dest) == true_regnum (shift_count))
22367 return true;
22368 else if (REGNO(set_dest) == REGNO(shift_count))
22369 return true;
22373 return false;
22376 /* Return true if destination reg of SET_INSN is shift count of
22377 USE_INSN. */
22379 bool
22380 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
22382 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
22383 PATTERN (use_insn));
22386 /* Return TRUE or FALSE depending on whether the unary operator meets the
22387 appropriate constraints. */
22389 bool
22390 ix86_unary_operator_ok (enum rtx_code,
22391 machine_mode,
22392 rtx operands[2])
22394 /* If one of operands is memory, source and destination must match. */
22395 if ((MEM_P (operands[0])
22396 || MEM_P (operands[1]))
22397 && ! rtx_equal_p (operands[0], operands[1]))
22398 return false;
22399 return true;
22402 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
22403 are ok, keeping in mind the possible movddup alternative. */
22405 bool
22406 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
22408 if (MEM_P (operands[0]))
22409 return rtx_equal_p (operands[0], operands[1 + high]);
22410 if (MEM_P (operands[1]) && MEM_P (operands[2]))
22411 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
22412 return true;
22415 /* Post-reload splitter for converting an SF or DFmode value in an
22416 SSE register into an unsigned SImode. */
22418 void
22419 ix86_split_convert_uns_si_sse (rtx operands[])
22421 machine_mode vecmode;
22422 rtx value, large, zero_or_two31, input, two31, x;
22424 large = operands[1];
22425 zero_or_two31 = operands[2];
22426 input = operands[3];
22427 two31 = operands[4];
22428 vecmode = GET_MODE (large);
22429 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
22431 /* Load up the value into the low element. We must ensure that the other
22432 elements are valid floats -- zero is the easiest such value. */
22433 if (MEM_P (input))
22435 if (vecmode == V4SFmode)
22436 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
22437 else
22438 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
22440 else
22442 input = gen_rtx_REG (vecmode, REGNO (input));
22443 emit_move_insn (value, CONST0_RTX (vecmode));
22444 if (vecmode == V4SFmode)
22445 emit_insn (gen_sse_movss (value, value, input));
22446 else
22447 emit_insn (gen_sse2_movsd (value, value, input));
22450 emit_move_insn (large, two31);
22451 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
22453 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
22454 emit_insn (gen_rtx_SET (large, x));
22456 x = gen_rtx_AND (vecmode, zero_or_two31, large);
22457 emit_insn (gen_rtx_SET (zero_or_two31, x));
22459 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
22460 emit_insn (gen_rtx_SET (value, x));
22462 large = gen_rtx_REG (V4SImode, REGNO (large));
22463 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
22465 x = gen_rtx_REG (V4SImode, REGNO (value));
22466 if (vecmode == V4SFmode)
22467 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
22468 else
22469 emit_insn (gen_sse2_cvttpd2dq (x, value));
22470 value = x;
22472 emit_insn (gen_xorv4si3 (value, value, large));
22475 /* Convert an unsigned DImode value into a DFmode, using only SSE.
22476 Expects the 64-bit DImode to be supplied in a pair of integral
22477 registers. Requires SSE2; will use SSE3 if available. For x86_32,
22478 -mfpmath=sse, !optimize_size only. */
22480 void
22481 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
22483 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
22484 rtx int_xmm, fp_xmm;
22485 rtx biases, exponents;
22486 rtx x;
22488 int_xmm = gen_reg_rtx (V4SImode);
22489 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
22490 emit_insn (gen_movdi_to_sse (int_xmm, input));
22491 else if (TARGET_SSE_SPLIT_REGS)
22493 emit_clobber (int_xmm);
22494 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
22496 else
22498 x = gen_reg_rtx (V2DImode);
22499 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
22500 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
22503 x = gen_rtx_CONST_VECTOR (V4SImode,
22504 gen_rtvec (4, GEN_INT (0x43300000UL),
22505 GEN_INT (0x45300000UL),
22506 const0_rtx, const0_rtx));
22507 exponents = validize_mem (force_const_mem (V4SImode, x));
22509 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
22510 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
22512 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_lo_xmm)
22513 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
22514 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
22515 (0x1.0p84 + double(fp_value_hi_xmm)).
22516 Note these exponents differ by 32. */
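/* Worked example: for the input v = hi * 2**32 + lo with hi = 1 and
   lo = 3, the two concatenations above yield the doubles
   (0x1.0p52 + 3.0) and (0x1.0p84 + 0x1.0p32); after the bias
   subtraction below the two lanes hold exactly 3.0 and 4294967296.0,
   and their sum 4294967299.0 is precisely v.  */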
22518 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
22520 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
22521 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
22522 real_ldexp (&bias_lo_rvt, &dconst1, 52);
22523 real_ldexp (&bias_hi_rvt, &dconst1, 84);
22524 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
22525 x = const_double_from_real_value (bias_hi_rvt, DFmode);
22526 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
22527 biases = validize_mem (force_const_mem (V2DFmode, biases));
22528 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
22530 /* Add the upper and lower DFmode values together. */
22531 if (TARGET_SSE3)
22532 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
22533 else
22535 x = copy_to_mode_reg (V2DFmode, fp_xmm);
22536 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
22537 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
22540 ix86_expand_vector_extract (false, target, fp_xmm, 0);
22543 /* Not used, but eases macroization of patterns. */
22544 void
22545 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
22547 gcc_unreachable ();
22550 /* Convert an unsigned SImode value into a DFmode. Only currently used
22551 for SSE, but applicable anywhere. */
22553 void
22554 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
22556 REAL_VALUE_TYPE TWO31r;
22557 rtx x, fp;
22559 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
22560 NULL, 1, OPTAB_DIRECT);
22562 fp = gen_reg_rtx (DFmode);
22563 emit_insn (gen_floatsidf2 (fp, x));
22565 real_ldexp (&TWO31r, &dconst1, 31);
22566 x = const_double_from_real_value (TWO31r, DFmode);
22568 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
22569 if (x != target)
22570 emit_move_insn (target, x);
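/* For example, with input 0xffffffff the PLUS above wraps it to
   0x7fffffff, the signed conversion gives 2147483647.0, and adding
   0x1.0p31 back yields 4294967295.0, the original unsigned value.  */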
22573 /* Convert a signed DImode value into a DFmode. Only used for SSE in
22574 32-bit mode; otherwise we have a direct convert instruction. */
22576 void
22577 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
22579 REAL_VALUE_TYPE TWO32r;
22580 rtx fp_lo, fp_hi, x;
22582 fp_lo = gen_reg_rtx (DFmode);
22583 fp_hi = gen_reg_rtx (DFmode);
22585 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
22587 real_ldexp (&TWO32r, &dconst1, 32);
22588 x = const_double_from_real_value (TWO32r, DFmode);
22589 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
22591 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
22593 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
22594 0, OPTAB_DIRECT);
22595 if (x != target)
22596 emit_move_insn (target, x);
22599 /* Convert an unsigned SImode value into a SFmode, using only SSE.
22600 For x86_32, -mfpmath=sse, !optimize_size only. */
22601 void
22602 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
22604 REAL_VALUE_TYPE ONE16r;
22605 rtx fp_hi, fp_lo, int_hi, int_lo, x;
22607 real_ldexp (&ONE16r, &dconst1, 16);
22608 x = const_double_from_real_value (ONE16r, SFmode);
22609 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
22610 NULL, 0, OPTAB_DIRECT);
22611 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
22612 NULL, 0, OPTAB_DIRECT);
22613 fp_hi = gen_reg_rtx (SFmode);
22614 fp_lo = gen_reg_rtx (SFmode);
22615 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
22616 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
22617 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
22618 0, OPTAB_DIRECT);
22619 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
22620 0, OPTAB_DIRECT);
22621 if (!rtx_equal_p (target, fp_hi))
22622 emit_move_insn (target, fp_hi);
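/* For example, input 0x12345678 is split into int_hi = 0x1234 and
   int_lo = 0x5678; both halves are exactly representable in SFmode,
   so 4660.0f * 0x1.0p16 + 22136.0f rounds only once and gives the
   correctly rounded float for 305419896.  */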
22625 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
22626 a vector of unsigned ints VAL to vector of floats TARGET. */
22628 void
22629 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
22631 rtx tmp[8];
22632 REAL_VALUE_TYPE TWO16r;
22633 machine_mode intmode = GET_MODE (val);
22634 machine_mode fltmode = GET_MODE (target);
22635 rtx (*cvt) (rtx, rtx);
22637 if (intmode == V4SImode)
22638 cvt = gen_floatv4siv4sf2;
22639 else
22640 cvt = gen_floatv8siv8sf2;
22641 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
22642 tmp[0] = force_reg (intmode, tmp[0]);
22643 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
22644 OPTAB_DIRECT);
22645 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
22646 NULL_RTX, 1, OPTAB_DIRECT);
22647 tmp[3] = gen_reg_rtx (fltmode);
22648 emit_insn (cvt (tmp[3], tmp[1]));
22649 tmp[4] = gen_reg_rtx (fltmode);
22650 emit_insn (cvt (tmp[4], tmp[2]));
22651 real_ldexp (&TWO16r, &dconst1, 16);
22652 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
22653 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
22654 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
22655 OPTAB_DIRECT);
22656 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
22657 OPTAB_DIRECT);
22658 if (tmp[7] != target)
22659 emit_move_insn (target, tmp[7]);
22662 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
22663 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
22664 This is done by doing just signed conversion if < 0x1p31, and otherwise by
22665 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
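/* For example, a lane holding 3000000000.0 is reduced to 852516352.0
   (= 3000000000 - 0x1.0p31), the signed fix_trunc of that is
   0x32d05e00, and xoring in 0x80000000 afterwards restores the
   unsigned result 0xb2d05e00 == 3000000000.  Lanes below 0x1.0p31
   get a zero mask and are left untouched.  */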
22667 rtx
22668 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
22670 REAL_VALUE_TYPE TWO31r;
22671 rtx two31r, tmp[4];
22672 machine_mode mode = GET_MODE (val);
22673 machine_mode scalarmode = GET_MODE_INNER (mode);
22674 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
22675 rtx (*cmp) (rtx, rtx, rtx, rtx);
22676 int i;
22678 for (i = 0; i < 3; i++)
22679 tmp[i] = gen_reg_rtx (mode);
22680 real_ldexp (&TWO31r, &dconst1, 31);
22681 two31r = const_double_from_real_value (TWO31r, scalarmode);
22682 two31r = ix86_build_const_vector (mode, 1, two31r);
22683 two31r = force_reg (mode, two31r);
22684 switch (mode)
22686 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
22687 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
22688 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
22689 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
22690 default: gcc_unreachable ();
22692 tmp[3] = gen_rtx_LE (mode, two31r, val);
22693 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
22694 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
22695 0, OPTAB_DIRECT);
22696 if (intmode == V4SImode || TARGET_AVX2)
22697 *xorp = expand_simple_binop (intmode, ASHIFT,
22698 gen_lowpart (intmode, tmp[0]),
22699 GEN_INT (31), NULL_RTX, 0,
22700 OPTAB_DIRECT);
22701 else
22703 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
22704 two31 = ix86_build_const_vector (intmode, 1, two31);
22705 *xorp = expand_simple_binop (intmode, AND,
22706 gen_lowpart (intmode, tmp[0]),
22707 two31, NULL_RTX, 0,
22708 OPTAB_DIRECT);
22710 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
22711 0, OPTAB_DIRECT);
22714 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
22715 then replicate the value for all elements of the vector
22716 register. */
22718 rtx
22719 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
22721 int i, n_elt;
22722 rtvec v;
22723 machine_mode scalar_mode;
22725 switch (mode)
22727 case V64QImode:
22728 case V32QImode:
22729 case V16QImode:
22730 case V32HImode:
22731 case V16HImode:
22732 case V8HImode:
22733 case V16SImode:
22734 case V8SImode:
22735 case V4SImode:
22736 case V8DImode:
22737 case V4DImode:
22738 case V2DImode:
22739 gcc_assert (vect);
22740 /* FALLTHRU */
22741 case V16SFmode:
22742 case V8SFmode:
22743 case V4SFmode:
22744 case V8DFmode:
22745 case V4DFmode:
22746 case V2DFmode:
22747 n_elt = GET_MODE_NUNITS (mode);
22748 v = rtvec_alloc (n_elt);
22749 scalar_mode = GET_MODE_INNER (mode);
22751 RTVEC_ELT (v, 0) = value;
22753 for (i = 1; i < n_elt; ++i)
22754 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
22756 return gen_rtx_CONST_VECTOR (mode, v);
22758 default:
22759 gcc_unreachable ();
22763 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
22764 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
22765 for an SSE register. If VECT is true, then replicate the mask for
22766 all elements of the vector register. If INVERT is true, then create
22767 a mask excluding the sign bit. */
22769 rtx
22770 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
22772 machine_mode vec_mode, imode;
22773 wide_int w;
22774 rtx mask, v;
22776 switch (mode)
22778 case V16SImode:
22779 case V16SFmode:
22780 case V8SImode:
22781 case V4SImode:
22782 case V8SFmode:
22783 case V4SFmode:
22784 vec_mode = mode;
22785 imode = SImode;
22786 break;
22788 case V8DImode:
22789 case V4DImode:
22790 case V2DImode:
22791 case V8DFmode:
22792 case V4DFmode:
22793 case V2DFmode:
22794 vec_mode = mode;
22795 imode = DImode;
22796 break;
22798 case TImode:
22799 case TFmode:
22800 vec_mode = VOIDmode;
22801 imode = TImode;
22802 break;
22804 default:
22805 gcc_unreachable ();
22808 machine_mode inner_mode = GET_MODE_INNER (mode);
22809 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
22810 GET_MODE_BITSIZE (inner_mode));
22811 if (invert)
22812 w = wi::bit_not (w);
22814 /* Force this value into the low part of a fp vector constant. */
22815 mask = immed_wide_int_const (w, imode);
22816 mask = gen_lowpart (inner_mode, mask);
22818 if (vec_mode == VOIDmode)
22819 return force_reg (inner_mode, mask);
22821 v = ix86_build_const_vector (vec_mode, vect, mask);
22822 return force_reg (vec_mode, v);
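/* For instance, ix86_build_signbit_mask (V2DFmode, true, false) yields
   a vector register holding { -0.0, -0.0 }, i.e. both 64-bit lanes set
   to 0x8000000000000000; with INVERT the lanes are 0x7fffffffffffffff
   instead, masking everything but the sign bit.  */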
22825 /* Generate code for floating point ABS or NEG. */
22827 void
22828 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22829 rtx operands[])
22831 rtx mask, set, dst, src;
22832 bool use_sse = false;
22833 bool vector_mode = VECTOR_MODE_P (mode);
22834 machine_mode vmode = mode;
22836 if (vector_mode)
22837 use_sse = true;
22838 else if (mode == TFmode)
22839 use_sse = true;
22840 else if (TARGET_SSE_MATH)
22842 use_sse = SSE_FLOAT_MODE_P (mode);
22843 if (mode == SFmode)
22844 vmode = V4SFmode;
22845 else if (mode == DFmode)
22846 vmode = V2DFmode;
22849 /* NEG and ABS performed with SSE use bitwise mask operations.
22850 Create the appropriate mask now. */
22851 if (use_sse)
22852 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22853 else
22854 mask = NULL_RTX;
22856 dst = operands[0];
22857 src = operands[1];
22859 set = gen_rtx_fmt_e (code, mode, src);
22860 set = gen_rtx_SET (dst, set);
22862 if (mask)
22864 rtx use, clob;
22865 rtvec par;
22867 use = gen_rtx_USE (VOIDmode, mask);
22868 if (vector_mode)
22869 par = gen_rtvec (2, set, use);
22870 else
22872 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22873 par = gen_rtvec (3, set, use, clob);
22875 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22877 else
22878 emit_insn (set);
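/* The mask built above turns the operation into pure bit arithmetic;
   for DFmode the splitters end up emitting roughly

       xorpd .LC_signbit(%rip), %xmm0    # NEG: flip the sign bit
       andpd .LC_notsign(%rip), %xmm0    # ABS: clear the sign bit

   where the .LC_* names are just placeholders for the constant pool
   entries holding the masks.  */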
22881 /* Expand a copysign operation. Special case operand 0 being a constant. */
22883 void
22884 ix86_expand_copysign (rtx operands[])
22886 machine_mode mode, vmode;
22887 rtx dest, op0, op1, mask, nmask;
22889 dest = operands[0];
22890 op0 = operands[1];
22891 op1 = operands[2];
22893 mode = GET_MODE (dest);
22895 if (mode == SFmode)
22896 vmode = V4SFmode;
22897 else if (mode == DFmode)
22898 vmode = V2DFmode;
22899 else
22900 vmode = mode;
22902 if (CONST_DOUBLE_P (op0))
22904 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22906 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22907 op0 = simplify_unary_operation (ABS, mode, op0, mode);
22909 if (mode == SFmode || mode == DFmode)
22911 if (op0 == CONST0_RTX (mode))
22912 op0 = CONST0_RTX (vmode);
22913 else
22915 rtx v = ix86_build_const_vector (vmode, false, op0);
22917 op0 = force_reg (vmode, v);
22920 else if (op0 != CONST0_RTX (mode))
22921 op0 = force_reg (mode, op0);
22923 mask = ix86_build_signbit_mask (vmode, 0, 0);
22925 if (mode == SFmode)
22926 copysign_insn = gen_copysignsf3_const;
22927 else if (mode == DFmode)
22928 copysign_insn = gen_copysigndf3_const;
22929 else
22930 copysign_insn = gen_copysigntf3_const;
22932 emit_insn (copysign_insn (dest, op0, op1, mask));
22934 else
22936 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22938 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22939 mask = ix86_build_signbit_mask (vmode, 0, 0);
22941 if (mode == SFmode)
22942 copysign_insn = gen_copysignsf3_var;
22943 else if (mode == DFmode)
22944 copysign_insn = gen_copysigndf3_var;
22945 else
22946 copysign_insn = gen_copysigntf3_var;
22948 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
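/* In essence the expansion implements the bit-level identity
       copysign (x, y) == (x & ~signmask) | (y & signmask)
   e.g. copysign (3.0, -0.0) flips only the sign bit and gives -3.0;
   NMASK/MASK above are the vector constants for ~signmask and
   signmask.  */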
22952 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22953 be a constant, and so has already been expanded into a vector constant. */
22955 void
22956 ix86_split_copysign_const (rtx operands[])
22958 machine_mode mode, vmode;
22959 rtx dest, op0, mask, x;
22961 dest = operands[0];
22962 op0 = operands[1];
22963 mask = operands[3];
22965 mode = GET_MODE (dest);
22966 vmode = GET_MODE (mask);
22968 dest = lowpart_subreg (vmode, dest, mode);
22969 x = gen_rtx_AND (vmode, dest, mask);
22970 emit_insn (gen_rtx_SET (dest, x));
22972 if (op0 != CONST0_RTX (vmode))
22974 x = gen_rtx_IOR (vmode, dest, op0);
22975 emit_insn (gen_rtx_SET (dest, x));
22979 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22980 so we have to do two masks. */
22982 void
22983 ix86_split_copysign_var (rtx operands[])
22985 machine_mode mode, vmode;
22986 rtx dest, scratch, op0, op1, mask, nmask, x;
22988 dest = operands[0];
22989 scratch = operands[1];
22990 op0 = operands[2];
22991 op1 = operands[3];
22992 nmask = operands[4];
22993 mask = operands[5];
22995 mode = GET_MODE (dest);
22996 vmode = GET_MODE (mask);
22998 if (rtx_equal_p (op0, op1))
23000 /* Shouldn't happen often (it's useless, obviously), but when it does
23001 we'd generate incorrect code if we continue below. */
23002 emit_move_insn (dest, op0);
23003 return;
23006 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
23008 gcc_assert (REGNO (op1) == REGNO (scratch));
23010 x = gen_rtx_AND (vmode, scratch, mask);
23011 emit_insn (gen_rtx_SET (scratch, x));
23013 dest = mask;
23014 op0 = lowpart_subreg (vmode, op0, mode);
23015 x = gen_rtx_NOT (vmode, dest);
23016 x = gen_rtx_AND (vmode, x, op0);
23017 emit_insn (gen_rtx_SET (dest, x));
23019 else
23021 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
23023 x = gen_rtx_AND (vmode, scratch, mask);
23025 else /* alternative 2,4 */
23027 gcc_assert (REGNO (mask) == REGNO (scratch));
23028 op1 = lowpart_subreg (vmode, op1, mode);
23029 x = gen_rtx_AND (vmode, scratch, op1);
23031 emit_insn (gen_rtx_SET (scratch, x));
23033 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
23035 dest = lowpart_subreg (vmode, op0, mode);
23036 x = gen_rtx_AND (vmode, dest, nmask);
23038 else /* alternative 3,4 */
23040 gcc_assert (REGNO (nmask) == REGNO (dest));
23041 dest = nmask;
23042 op0 = lowpart_subreg (vmode, op0, mode);
23043 x = gen_rtx_AND (vmode, dest, op0);
23045 emit_insn (gen_rtx_SET (dest, x));
23048 x = gen_rtx_IOR (vmode, dest, scratch);
23049 emit_insn (gen_rtx_SET (dest, x));
23052 /* Return TRUE or FALSE depending on whether the first SET in INSN
23053 has source and destination with matching CC modes, and that the
23054 CC mode is at least as constrained as REQ_MODE. */
23056 bool
23057 ix86_match_ccmode (rtx insn, machine_mode req_mode)
23059 rtx set;
23060 machine_mode set_mode;
23062 set = PATTERN (insn);
23063 if (GET_CODE (set) == PARALLEL)
23064 set = XVECEXP (set, 0, 0);
23065 gcc_assert (GET_CODE (set) == SET);
23066 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
23068 set_mode = GET_MODE (SET_DEST (set));
23069 switch (set_mode)
23071 case CCNOmode:
23072 if (req_mode != CCNOmode
23073 && (req_mode != CCmode
23074 || XEXP (SET_SRC (set), 1) != const0_rtx))
23075 return false;
23076 break;
23077 case CCmode:
23078 if (req_mode == CCGCmode)
23079 return false;
23080 /* FALLTHRU */
23081 case CCGCmode:
23082 if (req_mode == CCGOCmode || req_mode == CCNOmode)
23083 return false;
23084 /* FALLTHRU */
23085 case CCGOCmode:
23086 if (req_mode == CCZmode)
23087 return false;
23088 /* FALLTHRU */
23089 case CCZmode:
23090 break;
23092 case CCAmode:
23093 case CCCmode:
23094 case CCOmode:
23095 case CCPmode:
23096 case CCSmode:
23097 if (set_mode != req_mode)
23098 return false;
23099 break;
23101 default:
23102 gcc_unreachable ();
23105 return GET_MODE (SET_SRC (set)) == set_mode;
23108 /* Generate insn patterns to do an integer compare of OPERANDS. */
23110 static rtx
23111 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
23113 machine_mode cmpmode;
23114 rtx tmp, flags;
23116 cmpmode = SELECT_CC_MODE (code, op0, op1);
23117 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
23119 /* This is very simple, but making the interface the same as in the
23120 FP case makes the rest of the code easier. */
23121 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
23122 emit_insn (gen_rtx_SET (flags, tmp));
23124 /* Return the test that should be put into the flags user, i.e.
23125 the bcc, scc, or cmov instruction. */
23126 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
23129 /* Figure out whether to use ordered or unordered fp comparisons.
23130 Return the appropriate mode to use. */
23132 machine_mode
23133 ix86_fp_compare_mode (enum rtx_code)
23135 /* ??? In order to make all comparisons reversible, we do all comparisons
23136 non-trapping when compiling for IEEE. Once gcc is able to distinguish
23137 all forms of trapping and nontrapping comparisons, we can make inequality
23138 comparisons trapping again, since that results in better code when using
23139 FCOM-based compares. */
23140 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
23143 machine_mode
23144 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
23146 machine_mode mode = GET_MODE (op0);
23148 if (SCALAR_FLOAT_MODE_P (mode))
23150 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23151 return ix86_fp_compare_mode (code);
23154 switch (code)
23156 /* Only zero flag is needed. */
23157 case EQ: /* ZF=0 */
23158 case NE: /* ZF!=0 */
23159 return CCZmode;
23160 /* Codes needing carry flag. */
23161 case GEU: /* CF=0 */
23162 case LTU: /* CF=1 */
23163 /* Detect overflow checks. They need just the carry flag. */
23164 if (GET_CODE (op0) == PLUS
23165 && (rtx_equal_p (op1, XEXP (op0, 0))
23166 || rtx_equal_p (op1, XEXP (op0, 1))))
23167 return CCCmode;
23168 else
23169 return CCmode;
23170 case GTU: /* CF=0 & ZF=0 */
23171 case LEU: /* CF=1 | ZF=1 */
23172 return CCmode;
23173 /* Codes possibly doable only with sign flag when
23174 comparing against zero. */
23175 case GE: /* SF=OF or SF=0 */
23176 case LT: /* SF<>OF or SF=1 */
23177 if (op1 == const0_rtx)
23178 return CCGOCmode;
23179 else
23180 /* For other cases Carry flag is not required. */
23181 return CCGCmode;
23182 /* Codes doable only with the sign flag when comparing
23183 against zero, but we lack a jump instruction for that,
23184 so we need to use relational tests against overflow,
23185 which thus needs to be zero. */
23186 case GT: /* ZF=0 & SF=OF */
23187 case LE: /* ZF=1 | SF<>OF */
23188 if (op1 == const0_rtx)
23189 return CCNOmode;
23190 else
23191 return CCGCmode;
23192 /* The strcmp pattern does (use flags), and combine may ask us for the
23193 proper mode. */
23194 case USE:
23195 return CCmode;
23196 default:
23197 gcc_unreachable ();
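/* The CCCmode case above is what catches overflow checks; a sketch of
   the idiom it is looking for:

       unsigned int sum = a + b;
       if (sum < a)              /* LTU of (plus a b) against a */
         ...                     /* overflow: only CF is needed  */

   so the flags producer can be the addition itself and no separate
   compare is required.  */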
23201 /* Return the fixed registers used for condition codes. */
23203 static bool
23204 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
23206 *p1 = FLAGS_REG;
23207 *p2 = FPSR_REG;
23208 return true;
23211 /* If two condition code modes are compatible, return a condition code
23212 mode which is compatible with both. Otherwise, return
23213 VOIDmode. */
23215 static machine_mode
23216 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
23218 if (m1 == m2)
23219 return m1;
23221 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
23222 return VOIDmode;
23224 if ((m1 == CCGCmode && m2 == CCGOCmode)
23225 || (m1 == CCGOCmode && m2 == CCGCmode))
23226 return CCGCmode;
23228 if ((m1 == CCNOmode && m2 == CCGOCmode)
23229 || (m1 == CCGOCmode && m2 == CCNOmode))
23230 return CCNOmode;
23232 if (m1 == CCZmode
23233 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
23234 return m2;
23235 else if (m2 == CCZmode
23236 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
23237 return m1;
23239 switch (m1)
23241 default:
23242 gcc_unreachable ();
23244 case CCmode:
23245 case CCGCmode:
23246 case CCGOCmode:
23247 case CCNOmode:
23248 case CCAmode:
23249 case CCCmode:
23250 case CCOmode:
23251 case CCPmode:
23252 case CCSmode:
23253 case CCZmode:
23254 switch (m2)
23256 default:
23257 return VOIDmode;
23259 case CCmode:
23260 case CCGCmode:
23261 case CCGOCmode:
23262 case CCNOmode:
23263 case CCAmode:
23264 case CCCmode:
23265 case CCOmode:
23266 case CCPmode:
23267 case CCSmode:
23268 case CCZmode:
23269 return CCmode;
23272 case CCFPmode:
23273 case CCFPUmode:
23274 /* These are only compatible with themselves, which we already
23275 checked above. */
23276 return VOIDmode;
23281 /* Return a comparison we can do that is equivalent to
23282 swap_condition (code), apart possibly from orderedness.
23283 But, never change orderedness if TARGET_IEEE_FP, returning
23284 UNKNOWN in that case if necessary. */
23286 static enum rtx_code
23287 ix86_fp_swap_condition (enum rtx_code code)
23289 switch (code)
23291 case GT: /* GTU - CF=0 & ZF=0 */
23292 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
23293 case GE: /* GEU - CF=0 */
23294 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
23295 case UNLT: /* LTU - CF=1 */
23296 return TARGET_IEEE_FP ? UNKNOWN : GT;
23297 case UNLE: /* LEU - CF=1 | ZF=1 */
23298 return TARGET_IEEE_FP ? UNKNOWN : GE;
23299 default:
23300 return swap_condition (code);
23304 /* Return the cost of comparison CODE using the best strategy for performance.
23305 All of the following functions use the number of instructions as a cost metric.
23306 In the future this should be tweaked to compute bytes for optimize_size and to
23307 take into account the performance of various instructions on various CPUs. */
23309 static int
23310 ix86_fp_comparison_cost (enum rtx_code code)
23312 int arith_cost;
23314 /* The cost of code using bit-twiddling on %ah. */
23315 switch (code)
23317 case UNLE:
23318 case UNLT:
23319 case LTGT:
23320 case GT:
23321 case GE:
23322 case UNORDERED:
23323 case ORDERED:
23324 case UNEQ:
23325 arith_cost = 4;
23326 break;
23327 case LT:
23328 case NE:
23329 case EQ:
23330 case UNGE:
23331 arith_cost = TARGET_IEEE_FP ? 5 : 4;
23332 break;
23333 case LE:
23334 case UNGT:
23335 arith_cost = TARGET_IEEE_FP ? 6 : 4;
23336 break;
23337 default:
23338 gcc_unreachable ();
23341 switch (ix86_fp_comparison_strategy (code))
23343 case IX86_FPCMP_COMI:
23344 return arith_cost > 4 ? 3 : 2;
23345 case IX86_FPCMP_SAHF:
23346 return arith_cost > 4 ? 4 : 3;
23347 default:
23348 return arith_cost;
23352 /* Return the strategy to use for floating-point comparisons. We assume that
23353 fcomi is always preferable where available, since that is also true when
23354 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
23356 enum ix86_fpcmp_strategy
23357 ix86_fp_comparison_strategy (enum rtx_code)
23359 /* Do fcomi/sahf based test when profitable. */
23361 if (TARGET_CMOVE)
23362 return IX86_FPCMP_COMI;
23364 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
23365 return IX86_FPCMP_SAHF;
23367 return IX86_FPCMP_ARITH;
23370 /* Swap, force into registers, or otherwise massage the two operands
23371 to a fp comparison. The operands are updated in place; the new
23372 comparison code is returned. */
23374 static enum rtx_code
23375 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
23377 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
23378 rtx op0 = *pop0, op1 = *pop1;
23379 machine_mode op_mode = GET_MODE (op0);
23380 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
23382 /* All of the unordered compare instructions only work on registers.
23383 The same is true of the fcomi compare instructions. The XFmode
23384 compare instructions require registers except when comparing
23385 against zero or when converting operand 1 from fixed point to
23386 floating point. */
23388 if (!is_sse
23389 && (fpcmp_mode == CCFPUmode
23390 || (op_mode == XFmode
23391 && ! (standard_80387_constant_p (op0) == 1
23392 || standard_80387_constant_p (op1) == 1)
23393 && GET_CODE (op1) != FLOAT)
23394 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
23396 op0 = force_reg (op_mode, op0);
23397 op1 = force_reg (op_mode, op1);
23399 else
23401 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
23402 things around if they appear profitable, otherwise force op0
23403 into a register. */
23405 if (standard_80387_constant_p (op0) == 0
23406 || (MEM_P (op0)
23407 && ! (standard_80387_constant_p (op1) == 0
23408 || MEM_P (op1))))
23410 enum rtx_code new_code = ix86_fp_swap_condition (code);
23411 if (new_code != UNKNOWN)
23413 std::swap (op0, op1);
23414 code = new_code;
23418 if (!REG_P (op0))
23419 op0 = force_reg (op_mode, op0);
23421 if (CONSTANT_P (op1))
23423 int tmp = standard_80387_constant_p (op1);
23424 if (tmp == 0)
23425 op1 = validize_mem (force_const_mem (op_mode, op1));
23426 else if (tmp == 1)
23428 if (TARGET_CMOVE)
23429 op1 = force_reg (op_mode, op1);
23431 else
23432 op1 = force_reg (op_mode, op1);
23436 /* Try to rearrange the comparison to make it cheaper. */
23437 if (ix86_fp_comparison_cost (code)
23438 > ix86_fp_comparison_cost (swap_condition (code))
23439 && (REG_P (op1) || can_create_pseudo_p ()))
23441 std::swap (op0, op1);
23442 code = swap_condition (code);
23443 if (!REG_P (op0))
23444 op0 = force_reg (op_mode, op0);
23447 *pop0 = op0;
23448 *pop1 = op1;
23449 return code;
23452 /* Convert comparison codes we use to represent FP comparison to integer
23453 code that will result in proper branch. Return UNKNOWN if no such code
23454 is available. */
23456 enum rtx_code
23457 ix86_fp_compare_code_to_integer (enum rtx_code code)
23459 switch (code)
23461 case GT:
23462 return GTU;
23463 case GE:
23464 return GEU;
23465 case ORDERED:
23466 case UNORDERED:
23467 return code;
23468 case UNEQ:
23469 return EQ;
23470 case UNLT:
23471 return LTU;
23472 case UNLE:
23473 return LEU;
23474 case LTGT:
23475 return NE;
23476 default:
23477 return UNKNOWN;
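/* The mapping above mirrors how fcomi/ucomis* report their result:
   when comparing a with b, a > b leaves CF=0 and ZF=0, i.e. exactly
   the unsigned "ja" condition, hence GT -> GTU; an unordered result
   sets ZF, PF and CF all at once, which is why UNLE maps to LEU and
   so on.  */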
23481 /* Generate insn patterns to do a floating point compare of OPERANDS. */
23483 static rtx
23484 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
23486 machine_mode fpcmp_mode, intcmp_mode;
23487 rtx tmp, tmp2;
23489 fpcmp_mode = ix86_fp_compare_mode (code);
23490 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
23492 /* Do fcomi/sahf based test when profitable. */
23493 switch (ix86_fp_comparison_strategy (code))
23495 case IX86_FPCMP_COMI:
23496 intcmp_mode = fpcmp_mode;
23497 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23498 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23499 emit_insn (tmp);
23500 break;
23502 case IX86_FPCMP_SAHF:
23503 intcmp_mode = fpcmp_mode;
23504 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23505 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23507 if (!scratch)
23508 scratch = gen_reg_rtx (HImode);
23509 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
23510 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
23511 break;
23513 case IX86_FPCMP_ARITH:
23514 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
23515 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23516 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
23517 if (!scratch)
23518 scratch = gen_reg_rtx (HImode);
23519 emit_insn (gen_rtx_SET (scratch, tmp2));
23521 /* In the unordered case, we have to check C2 for NaN's, which
23522 doesn't happen to work out to anything nice combination-wise.
23523 So do some bit twiddling on the value we've got in AH to come
23524 up with an appropriate set of condition codes. */
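/* For reference: after fnstsw the condition bits land in AH as
   C0 = 0x01, C2 = 0x04 and C3 = 0x40, and fcom reports ">" as
   C3=C2=C0=0, "<" as C0=1, "==" as C3=1 and unordered as C3=C2=C0=1.
   So, e.g., the GT case below tests AH against 0x45 (C3|C2|C0) and
   requires all three bits to be clear.  */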
23526 intcmp_mode = CCNOmode;
23527 switch (code)
23529 case GT:
23530 case UNGT:
23531 if (code == GT || !TARGET_IEEE_FP)
23533 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23534 code = EQ;
23536 else
23538 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23539 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23540 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
23541 intcmp_mode = CCmode;
23542 code = GEU;
23544 break;
23545 case LT:
23546 case UNLT:
23547 if (code == LT && TARGET_IEEE_FP)
23549 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23550 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
23551 intcmp_mode = CCmode;
23552 code = EQ;
23554 else
23556 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
23557 code = NE;
23559 break;
23560 case GE:
23561 case UNGE:
23562 if (code == GE || !TARGET_IEEE_FP)
23564 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
23565 code = EQ;
23567 else
23569 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23570 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
23571 code = NE;
23573 break;
23574 case LE:
23575 case UNLE:
23576 if (code == LE && TARGET_IEEE_FP)
23578 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23579 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23580 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23581 intcmp_mode = CCmode;
23582 code = LTU;
23584 else
23586 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23587 code = NE;
23589 break;
23590 case EQ:
23591 case UNEQ:
23592 if (code == EQ && TARGET_IEEE_FP)
23594 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23595 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23596 intcmp_mode = CCmode;
23597 code = EQ;
23599 else
23601 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23602 code = NE;
23604 break;
23605 case NE:
23606 case LTGT:
23607 if (code == NE && TARGET_IEEE_FP)
23609 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23610 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
23611 GEN_INT (0x40)));
23612 code = NE;
23614 else
23616 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23617 code = EQ;
23619 break;
23621 case UNORDERED:
23622 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23623 code = NE;
23624 break;
23625 case ORDERED:
23626 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23627 code = EQ;
23628 break;
23630 default:
23631 gcc_unreachable ();
23633 break;
23635 default:
23636 gcc_unreachable();
23639 /* Return the test that should be put into the flags user, i.e.
23640 the bcc, scc, or cmov instruction. */
23641 return gen_rtx_fmt_ee (code, VOIDmode,
23642 gen_rtx_REG (intcmp_mode, FLAGS_REG),
23643 const0_rtx);
23646 static rtx
23647 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
23649 rtx ret;
23651 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
23652 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
23654 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
23656 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
23657 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23659 else
23660 ret = ix86_expand_int_compare (code, op0, op1);
23662 return ret;
23665 void
23666 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
23668 machine_mode mode = GET_MODE (op0);
23669 rtx tmp;
23671 /* Handle the special case of a vector comparison with a boolean result;
23672 transform it using the ptest instruction. */
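/* The sequence emitted for this case is roughly

       pxor   op1, tmp      # tmp = op0 ^ op1
       ptest  tmp, tmp      # ZF <- (tmp == all-zero vector)
       je/jne label

   since ptest sets ZF exactly when the AND of its operands is zero,
   and tmp AND tmp is zero only when the two inputs were equal.  */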
23673 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
23675 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
23676 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
23678 gcc_assert (code == EQ || code == NE);
23679 /* Generate XOR since we can't check that one operand is zero vector. */
23680 tmp = gen_reg_rtx (mode);
23681 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
23682 tmp = gen_lowpart (p_mode, tmp);
23683 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
23684 gen_rtx_UNSPEC (CCmode,
23685 gen_rtvec (2, tmp, tmp),
23686 UNSPEC_PTEST)));
23687 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
23688 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23689 gen_rtx_LABEL_REF (VOIDmode, label),
23690 pc_rtx);
23691 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23692 return;
23695 switch (mode)
23697 case SFmode:
23698 case DFmode:
23699 case XFmode:
23700 case QImode:
23701 case HImode:
23702 case SImode:
23703 simple:
23704 tmp = ix86_expand_compare (code, op0, op1);
23705 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23706 gen_rtx_LABEL_REF (VOIDmode, label),
23707 pc_rtx);
23708 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23709 return;
23711 case DImode:
23712 if (TARGET_64BIT)
23713 goto simple;
23714 /* For a 32-bit target, a DImode comparison may be performed on
23715 SSE registers. To allow this we should avoid splitting
23716 to SImode, which is achieved by doing the xor in DImode
23717 and then comparing with zero (which is recognized by the
23718 STV pass). We don't compare using xor when optimizing
23719 for size. */
23720 if (!optimize_insn_for_size_p ()
23721 && TARGET_STV
23722 && (code == EQ || code == NE))
23724 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
23725 op1 = const0_rtx;
23727 /* FALLTHRU */
23728 case TImode:
23729 /* Expand DImode branch into multiple compare+branch. */
23731 rtx lo[2], hi[2];
23732 rtx_code_label *label2;
23733 enum rtx_code code1, code2, code3;
23734 machine_mode submode;
23736 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
23738 std::swap (op0, op1);
23739 code = swap_condition (code);
23742 split_double_mode (mode, &op0, 1, lo+0, hi+0);
23743 split_double_mode (mode, &op1, 1, lo+1, hi+1);
23745 submode = mode == DImode ? SImode : DImode;
23747 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
23748 avoid two branches. This costs one extra insn, so disable when
23749 optimizing for size. */
23751 if ((code == EQ || code == NE)
23752 && (!optimize_insn_for_size_p ()
23753 || hi[1] == const0_rtx || lo[1] == const0_rtx))
23755 rtx xor0, xor1;
23757 xor1 = hi[0];
23758 if (hi[1] != const0_rtx)
23759 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
23760 NULL_RTX, 0, OPTAB_WIDEN);
23762 xor0 = lo[0];
23763 if (lo[1] != const0_rtx)
23764 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
23765 NULL_RTX, 0, OPTAB_WIDEN);
23767 tmp = expand_binop (submode, ior_optab, xor1, xor0,
23768 NULL_RTX, 0, OPTAB_WIDEN);
23770 ix86_expand_branch (code, tmp, const0_rtx, label);
23771 return;
23774 /* Otherwise, if we are doing less-than or greater-or-equal-than,
23775 op1 is a constant and the low word is zero, then we can just
23776 examine the high word. Similarly for low word -1 and
23777 less-or-equal-than or greater-than. */
23779 if (CONST_INT_P (hi[1]))
23780 switch (code)
23782 case LT: case LTU: case GE: case GEU:
23783 if (lo[1] == const0_rtx)
23785 ix86_expand_branch (code, hi[0], hi[1], label);
23786 return;
23788 break;
23789 case LE: case LEU: case GT: case GTU:
23790 if (lo[1] == constm1_rtx)
23792 ix86_expand_branch (code, hi[0], hi[1], label);
23793 return;
23795 break;
23796 default:
23797 break;
23800 /* Otherwise, we need two or three jumps. */
23802 label2 = gen_label_rtx ();
23804 code1 = code;
23805 code2 = swap_condition (code);
23806 code3 = unsigned_condition (code);
23808 switch (code)
23810 case LT: case GT: case LTU: case GTU:
23811 break;
23813 case LE: code1 = LT; code2 = GT; break;
23814 case GE: code1 = GT; code2 = LT; break;
23815 case LEU: code1 = LTU; code2 = GTU; break;
23816 case GEU: code1 = GTU; code2 = LTU; break;
23818 case EQ: code1 = UNKNOWN; code2 = NE; break;
23819 case NE: code2 = UNKNOWN; break;
23821 default:
23822 gcc_unreachable ();
23826 * a < b =>
23827 * if (hi(a) < hi(b)) goto true;
23828 * if (hi(a) > hi(b)) goto false;
23829 * if (lo(a) < lo(b)) goto true;
23830 * false:
23833 if (code1 != UNKNOWN)
23834 ix86_expand_branch (code1, hi[0], hi[1], label);
23835 if (code2 != UNKNOWN)
23836 ix86_expand_branch (code2, hi[0], hi[1], label2);
23838 ix86_expand_branch (code3, lo[0], lo[1], label);
23840 if (code2 != UNKNOWN)
23841 emit_label (label2);
23842 return;
23845 default:
23846 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23847 goto simple;
23851 /* Split branch based on floating point condition. */
23852 void
23853 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
23854 rtx target1, rtx target2, rtx tmp)
23856 rtx condition;
23857 rtx_insn *i;
23859 if (target2 != pc_rtx)
23861 std::swap (target1, target2);
23862 code = reverse_condition_maybe_unordered (code);
23865 condition = ix86_expand_fp_compare (code, op1, op2,
23866 tmp);
23868 i = emit_jump_insn (gen_rtx_SET
23869 (pc_rtx,
23870 gen_rtx_IF_THEN_ELSE (VOIDmode,
23871 condition, target1, target2)));
23872 if (split_branch_probability.initialized_p ())
23873 add_reg_br_prob_note (i, split_branch_probability);
23876 void
23877 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23879 rtx ret;
23881 gcc_assert (GET_MODE (dest) == QImode);
23883 ret = ix86_expand_compare (code, op0, op1);
23884 PUT_MODE (ret, QImode);
23885 emit_insn (gen_rtx_SET (dest, ret));
23888 /* Expand comparison setting or clearing carry flag. Return true when
23889 successful and set pop for the operation. */
23890 static bool
23891 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23893 machine_mode mode =
23894 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23896 /* Do not handle double-mode compares that go through special path. */
23897 if (mode == (TARGET_64BIT ? TImode : DImode))
23898 return false;
23900 if (SCALAR_FLOAT_MODE_P (mode))
23902 rtx compare_op;
23903 rtx_insn *compare_seq;
23905 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23907 /* Shortcut: following common codes never translate
23908 into carry flag compares. */
23909 if (code == EQ || code == NE || code == UNEQ || code == LTGT
23910 || code == ORDERED || code == UNORDERED)
23911 return false;
23913 /* These comparisons require the zero flag; swap operands so they don't. */
23914 if ((code == GT || code == UNLE || code == LE || code == UNGT)
23915 && !TARGET_IEEE_FP)
23917 std::swap (op0, op1);
23918 code = swap_condition (code);
23921 /* Try to expand the comparison and verify that we end up with a
23922 carry-flag-based comparison. This fails only when we decide to
23923 expand the comparison using arithmetic, which is not a very
23924 common scenario. */
23925 start_sequence ();
23926 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23927 compare_seq = get_insns ();
23928 end_sequence ();
23930 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
23931 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
23932 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23933 else
23934 code = GET_CODE (compare_op);
23936 if (code != LTU && code != GEU)
23937 return false;
23939 emit_insn (compare_seq);
23940 *pop = compare_op;
23941 return true;
23944 if (!INTEGRAL_MODE_P (mode))
23945 return false;
23947 switch (code)
23949 case LTU:
23950 case GEU:
23951 break;
23953 /* Convert a==0 into (unsigned)a<1. */
23954 case EQ:
23955 case NE:
23956 if (op1 != const0_rtx)
23957 return false;
23958 op1 = const1_rtx;
23959 code = (code == EQ ? LTU : GEU);
23960 break;
23962 /* Convert a>b into b<a or a>=b+1. */
23963 case GTU:
23964 case LEU:
23965 if (CONST_INT_P (op1))
23967 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23968 /* Bail out on overflow. We still can swap operands but that
23969 would force loading of the constant into register. */
23970 if (op1 == const0_rtx
23971 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23972 return false;
23973 code = (code == GTU ? GEU : LTU);
23975 else
23977 std::swap (op0, op1);
23978 code = (code == GTU ? LTU : GEU);
23980 break;
23982 /* Convert a>=0 into (unsigned)a<0x80000000. */
23983 case LT:
23984 case GE:
23985 if (mode == DImode || op1 != const0_rtx)
23986 return false;
23987 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23988 code = (code == LT ? GEU : LTU);
23989 break;
23990 case LE:
23991 case GT:
23992 if (mode == DImode || op1 != constm1_rtx)
23993 return false;
23994 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23995 code = (code == LE ? GEU : LTU);
23996 break;
23998 default:
23999 return false;
24001 /* Swapping operands may cause constant to appear as first operand. */
24002 if (!nonimmediate_operand (op0, VOIDmode))
24004 if (!can_create_pseudo_p ())
24005 return false;
24006 op0 = force_reg (mode, op0);
24008 *pop = ix86_expand_compare (code, op0, op1);
24009 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
24010 return true;
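/* For example, "a == 0" is rewritten above as (unsigned) a < 1, so a
   single "cmp $1, a" leaves CF set exactly when a is zero, and the
   result can feed sbb/adc based sequences such as those built by
   ix86_expand_int_movcc below.  */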
24013 bool
24014 ix86_expand_int_movcc (rtx operands[])
24016 enum rtx_code code = GET_CODE (operands[1]), compare_code;
24017 rtx_insn *compare_seq;
24018 rtx compare_op;
24019 machine_mode mode = GET_MODE (operands[0]);
24020 bool sign_bit_compare_p = false;
24021 rtx op0 = XEXP (operands[1], 0);
24022 rtx op1 = XEXP (operands[1], 1);
24024 if (GET_MODE (op0) == TImode
24025 || (GET_MODE (op0) == DImode
24026 && !TARGET_64BIT))
24027 return false;
24029 start_sequence ();
24030 compare_op = ix86_expand_compare (code, op0, op1);
24031 compare_seq = get_insns ();
24032 end_sequence ();
24034 compare_code = GET_CODE (compare_op);
24036 if ((op1 == const0_rtx && (code == GE || code == LT))
24037 || (op1 == constm1_rtx && (code == GT || code == LE)))
24038 sign_bit_compare_p = true;
24040 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
24041 HImode insns, we'd be swallowed in word prefix ops. */
24043 if ((mode != HImode || TARGET_FAST_PREFIX)
24044 && (mode != (TARGET_64BIT ? TImode : DImode))
24045 && CONST_INT_P (operands[2])
24046 && CONST_INT_P (operands[3]))
24048 rtx out = operands[0];
24049 HOST_WIDE_INT ct = INTVAL (operands[2]);
24050 HOST_WIDE_INT cf = INTVAL (operands[3]);
24051 HOST_WIDE_INT diff;
24053 diff = ct - cf;
24054 /* Sign bit compares are better done using shifts than by using
24055 sbb. */
24056 if (sign_bit_compare_p
24057 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24059 /* Detect overlap between destination and compare sources. */
24060 rtx tmp = out;
24062 if (!sign_bit_compare_p)
24064 rtx flags;
24065 bool fpcmp = false;
24067 compare_code = GET_CODE (compare_op);
24069 flags = XEXP (compare_op, 0);
24071 if (GET_MODE (flags) == CCFPmode
24072 || GET_MODE (flags) == CCFPUmode)
24074 fpcmp = true;
24075 compare_code
24076 = ix86_fp_compare_code_to_integer (compare_code);
24079 /* To simplify rest of code, restrict to the GEU case. */
24080 if (compare_code == LTU)
24082 std::swap (ct, cf);
24083 compare_code = reverse_condition (compare_code);
24084 code = reverse_condition (code);
24086 else
24088 if (fpcmp)
24089 PUT_CODE (compare_op,
24090 reverse_condition_maybe_unordered
24091 (GET_CODE (compare_op)));
24092 else
24093 PUT_CODE (compare_op,
24094 reverse_condition (GET_CODE (compare_op)));
24096 diff = ct - cf;
24098 if (reg_overlap_mentioned_p (out, op0)
24099 || reg_overlap_mentioned_p (out, op1))
24100 tmp = gen_reg_rtx (mode);
24102 if (mode == DImode)
24103 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
24104 else
24105 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
24106 flags, compare_op));
24108 else
24110 if (code == GT || code == GE)
24111 code = reverse_condition (code);
24112 else
24114 std::swap (ct, cf);
24115 diff = ct - cf;
24117 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
24120 if (diff == 1)
24123 * cmpl op0,op1
24124 * sbbl dest,dest
24125 * [addl dest, ct]
24127 * Size 5 - 8.
24129 if (ct)
24130 tmp = expand_simple_binop (mode, PLUS,
24131 tmp, GEN_INT (ct),
24132 copy_rtx (tmp), 1, OPTAB_DIRECT);
24134 else if (cf == -1)
24137 * cmpl op0,op1
24138 * sbbl dest,dest
24139 * orl $ct, dest
24141 * Size 8.
24143 tmp = expand_simple_binop (mode, IOR,
24144 tmp, GEN_INT (ct),
24145 copy_rtx (tmp), 1, OPTAB_DIRECT);
24147 else if (diff == -1 && ct)
24150 * cmpl op0,op1
24151 * sbbl dest,dest
24152 * notl dest
24153 * [addl dest, cf]
24155 * Size 8 - 11.
24157 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24158 if (cf)
24159 tmp = expand_simple_binop (mode, PLUS,
24160 copy_rtx (tmp), GEN_INT (cf),
24161 copy_rtx (tmp), 1, OPTAB_DIRECT);
24163 else
24166 * cmpl op0,op1
24167 * sbbl dest,dest
24168 * [notl dest]
24169 * andl cf - ct, dest
24170 * [addl dest, ct]
24172 * Size 8 - 11.
24175 if (cf == 0)
24177 cf = ct;
24178 ct = 0;
24179 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24182 tmp = expand_simple_binop (mode, AND,
24183 copy_rtx (tmp),
24184 gen_int_mode (cf - ct, mode),
24185 copy_rtx (tmp), 1, OPTAB_DIRECT);
24186 if (ct)
24187 tmp = expand_simple_binop (mode, PLUS,
24188 copy_rtx (tmp), GEN_INT (ct),
24189 copy_rtx (tmp), 1, OPTAB_DIRECT);
24192 if (!rtx_equal_p (tmp, out))
24193 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
24195 return true;
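/* All of the sequences above lean on "sbb %reg, %reg" computing
   reg - reg - CF, i.e. all-zeros when CF=0 and all-ones when CF=1;
   for instance "x = (a < b) ? 5 : 0" with unsigned operands becomes
   roughly cmp / sbb / and $5.  */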
24198 if (diff < 0)
24200 machine_mode cmp_mode = GET_MODE (op0);
24201 enum rtx_code new_code;
24203 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24205 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24207 /* We may be reversing an unordered compare to a normal compare, which
24208 is not valid in general (we may convert a non-trapping condition
24209 to a trapping one); however, on i386 we currently emit all
24210 comparisons unordered. */
24211 new_code = reverse_condition_maybe_unordered (code);
24213 else
24214 new_code = ix86_reverse_condition (code, cmp_mode);
24215 if (new_code != UNKNOWN)
24217 std::swap (ct, cf);
24218 diff = -diff;
24219 code = new_code;
24223 compare_code = UNKNOWN;
24224 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
24225 && CONST_INT_P (op1))
24227 if (op1 == const0_rtx
24228 && (code == LT || code == GE))
24229 compare_code = code;
24230 else if (op1 == constm1_rtx)
24232 if (code == LE)
24233 compare_code = LT;
24234 else if (code == GT)
24235 compare_code = GE;
24239 /* Optimize dest = (op0 < 0) ? -1 : cf. */
24240 if (compare_code != UNKNOWN
24241 && GET_MODE (op0) == GET_MODE (out)
24242 && (cf == -1 || ct == -1))
24244 /* If lea code below could be used, only optimize
24245 if it results in a 2 insn sequence. */
24247 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
24248 || diff == 3 || diff == 5 || diff == 9)
24249 || (compare_code == LT && ct == -1)
24250 || (compare_code == GE && cf == -1))
24253 * notl op1 (if necessary)
24254 * sarl $31, op1
24255 * orl cf, op1
24257 if (ct != -1)
24259 cf = ct;
24260 ct = -1;
24261 code = reverse_condition (code);
24264 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24266 out = expand_simple_binop (mode, IOR,
24267 out, GEN_INT (cf),
24268 out, 1, OPTAB_DIRECT);
24269 if (out != operands[0])
24270 emit_move_insn (operands[0], out);
24272 return true;
24277 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
24278 || diff == 3 || diff == 5 || diff == 9)
24279 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
24280 && (mode != DImode
24281 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
24284 * xorl dest,dest
24285 * cmpl op1,op2
24286 * setcc dest
24287 * lea cf(dest*(ct-cf)),dest
24289 * Size 14.
24291 * This also catches the degenerate setcc-only case.
24294 rtx tmp;
24295 int nops;
24297 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24299 nops = 0;
24300 /* On x86_64 the lea instruction operates on Pmode, so we need
24301 to do the arithmetic in the proper mode to match. */
24302 if (diff == 1)
24303 tmp = copy_rtx (out);
24304 else
24306 rtx out1;
24307 out1 = copy_rtx (out);
24308 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
24309 nops++;
24310 if (diff & 1)
24312 tmp = gen_rtx_PLUS (mode, tmp, out1);
24313 nops++;
24316 if (cf != 0)
24318 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
24319 nops++;
24321 if (!rtx_equal_p (tmp, out))
24323 if (nops == 1)
24324 out = force_operand (tmp, copy_rtx (out));
24325 else
24326 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
24328 if (!rtx_equal_p (out, operands[0]))
24329 emit_move_insn (operands[0], copy_rtx (out));
24331 return true;
24335 * General case: Jumpful:
24336 * xorl dest,dest cmpl op1, op2
24337 * cmpl op1, op2 movl ct, dest
24338 * setcc dest jcc 1f
24339 * decl dest movl cf, dest
24340 * andl (cf-ct),dest 1:
24341 * addl ct,dest
24343 * Size 20. Size 14.
24345 * This is reasonably steep, but branch mispredict costs are
24346 * high on modern cpus, so consider failing only if optimizing
24347 * for space.
24350 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24351 && BRANCH_COST (optimize_insn_for_speed_p (),
24352 false) >= 2)
24354 if (cf == 0)
24356 machine_mode cmp_mode = GET_MODE (op0);
24357 enum rtx_code new_code;
24359 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24361 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24363 /* We may be reversing an unordered compare to a normal compare,
24364 which is not valid in general (we may convert a non-trapping
24365 condition to a trapping one); however, on i386 we currently
24366 emit all comparisons unordered. */
24367 new_code = reverse_condition_maybe_unordered (code);
24369 else
24371 new_code = ix86_reverse_condition (code, cmp_mode);
24372 if (compare_code != UNKNOWN && new_code != UNKNOWN)
24373 compare_code = reverse_condition (compare_code);
24376 if (new_code != UNKNOWN)
24378 cf = ct;
24379 ct = 0;
24380 code = new_code;
24384 if (compare_code != UNKNOWN)
24386 /* notl op1 (if needed)
24387 sarl $31, op1
24388 andl (cf-ct), op1
24389 addl ct, op1
24391 For x < 0 (resp. x <= -1) there will be no notl,
24392 so if possible swap the constants to get rid of the
24393 complement.
24394 True/false will be -1/0 while code below (store flag
24395 followed by decrement) is 0/-1, so the constants need
24396 to be exchanged once more. */
24398 if (compare_code == GE || !cf)
24400 code = reverse_condition (code);
24401 compare_code = LT;
24403 else
24404 std::swap (ct, cf);
24406 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24408 else
24410 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24412 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
24413 constm1_rtx,
24414 copy_rtx (out), 1, OPTAB_DIRECT);
24417 out = expand_simple_binop (mode, AND, copy_rtx (out),
24418 gen_int_mode (cf - ct, mode),
24419 copy_rtx (out), 1, OPTAB_DIRECT);
24420 if (ct)
24421 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
24422 copy_rtx (out), 1, OPTAB_DIRECT);
24423 if (!rtx_equal_p (out, operands[0]))
24424 emit_move_insn (operands[0], copy_rtx (out));
24426 return true;
24430 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24432 /* Try a few things more with specific constants and a variable. */
24434 optab op;
24435 rtx var, orig_out, out, tmp;
24437 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
24438 return false;
24440 /* If one of the two operands is an interesting constant, load a
24441 constant with the above and mask it in with a logical operation. */
24443 if (CONST_INT_P (operands[2]))
24445 var = operands[3];
24446 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
24447 operands[3] = constm1_rtx, op = and_optab;
24448 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
24449 operands[3] = const0_rtx, op = ior_optab;
24450 else
24451 return false;
24453 else if (CONST_INT_P (operands[3]))
24455 var = operands[2];
24456 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
24457 operands[2] = constm1_rtx, op = and_optab;
24458 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
24459 operands[2] = const0_rtx, op = ior_optab;
24460 else
24461 return false;
24463 else
24464 return false;
24466 orig_out = operands[0];
24467 tmp = gen_reg_rtx (mode);
24468 operands[0] = tmp;
24470 /* Recurse to get the constant loaded. */
24471 if (!ix86_expand_int_movcc (operands))
24472 return false;
24474 /* Mask in the interesting variable. */
24475 out = expand_binop (mode, op, var, tmp, orig_out, 0,
24476 OPTAB_WIDEN);
24477 if (!rtx_equal_p (out, orig_out))
24478 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
24480 return true;
24484 * For comparison with above,
24486 * movl cf,dest
24487 * movl ct,tmp
24488 * cmpl op1,op2
24489 * cmovcc tmp,dest
24491 * Size 15.
24494 if (! nonimmediate_operand (operands[2], mode))
24495 operands[2] = force_reg (mode, operands[2]);
24496 if (! nonimmediate_operand (operands[3], mode))
24497 operands[3] = force_reg (mode, operands[3]);
24499 if (! register_operand (operands[2], VOIDmode)
24500 && (mode == QImode
24501 || ! register_operand (operands[3], VOIDmode)))
24502 operands[2] = force_reg (mode, operands[2]);
24504 if (mode == QImode
24505 && ! register_operand (operands[3], VOIDmode))
24506 operands[3] = force_reg (mode, operands[3]);
24508 emit_insn (compare_seq);
24509 emit_insn (gen_rtx_SET (operands[0],
24510 gen_rtx_IF_THEN_ELSE (mode,
24511 compare_op, operands[2],
24512 operands[3])));
24513 return true;
24516 /* Swap, force into registers, or otherwise massage the two operands
24517 to an sse comparison with a mask result. Thus we differ a bit from
24518 ix86_prepare_fp_compare_args which expects to produce a flags result.
24520 The DEST operand exists to help determine whether to commute commutative
24521 operators. The POP0/POP1 operands are updated in place. The new
24522 comparison code is returned, or UNKNOWN if not implementable. */
24524 static enum rtx_code
24525 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
24526 rtx *pop0, rtx *pop1)
24528 switch (code)
24530 case LTGT:
24531 case UNEQ:
24532 /* AVX supports all the needed comparisons. */
24533 if (TARGET_AVX)
24534 break;
24535 /* We have no LTGT as an operator. We could implement it with
24536 NE & ORDERED, but this requires an extra temporary. It's
24537 not clear that it's worth it. */
24538 return UNKNOWN;
24540 case LT:
24541 case LE:
24542 case UNGT:
24543 case UNGE:
24544 /* These are supported directly. */
24545 break;
24547 case EQ:
24548 case NE:
24549 case UNORDERED:
24550 case ORDERED:
24551 /* AVX has 3 operand comparisons, no need to swap anything. */
24552 if (TARGET_AVX)
24553 break;
24554 /* For commutative operators, try to canonicalize the destination
24555 operand to be first in the comparison - this helps reload to
24556 avoid extra moves. */
24557 if (!dest || !rtx_equal_p (dest, *pop1))
24558 break;
24559 /* FALLTHRU */
24561 case GE:
24562 case GT:
24563 case UNLE:
24564 case UNLT:
24565 /* These are not supported directly before AVX, and furthermore
24566 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
24567 comparison operands to transform into something that is
24568 supported. */
24569 std::swap (*pop0, *pop1);
24570 code = swap_condition (code);
24571 break;
24573 default:
24574 gcc_unreachable ();
24577 return code;
24580 /* Detect conditional moves that exactly match min/max operational
24581 semantics. Note that this is IEEE safe, as long as we don't
24582 interchange the operands.
24584 Returns FALSE if this conditional move doesn't match a MIN/MAX,
24585 and TRUE if the operation is successful and instructions are emitted. */
24587 static bool
24588 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
24589 rtx cmp_op1, rtx if_true, rtx if_false)
24591 machine_mode mode;
24592 bool is_min;
24593 rtx tmp;
24595 if (code == LT)
24597 else if (code == UNGE)
24598 std::swap (if_true, if_false);
24599 else
24600 return false;
24602 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
24603 is_min = true;
24604 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
24605 is_min = false;
24606 else
24607 return false;
24609 mode = GET_MODE (dest);
24611 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
24612 but MODE may be a vector mode and thus not appropriate. */
24613 if (!flag_finite_math_only || flag_signed_zeros)
24615 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
24616 rtvec v;
24618 if_true = force_reg (mode, if_true);
24619 v = gen_rtvec (2, if_true, if_false);
24620 tmp = gen_rtx_UNSPEC (mode, v, u);
24622 else
24624 code = is_min ? SMIN : SMAX;
24625 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
24628 emit_insn (gen_rtx_SET (dest, tmp));
24629 return true;
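/* Illustrative note on the min/max matching above: minps/maxps return
   their second source operand when the inputs are unordered or when
   comparing signed zeros, so "a < b ? a : b" maps onto MIN (a, b)
   exactly, while the operand-swapped form generally does not; hence
   only the LT/UNGE shapes are recognised and the comparison operands
   are never interchanged.  */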
24632 /* Expand an sse vector comparison. Return the register with the result. */
24634 static rtx
24635 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
24636 rtx op_true, rtx op_false)
24638 machine_mode mode = GET_MODE (dest);
24639 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
24641 /* In the general case the result of the comparison can differ from the operands' mode. */
24642 machine_mode cmp_mode;
24644 /* In AVX512F the result of comparison is an integer mask. */
24645 bool maskcmp = false;
24646 rtx x;
24648 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
24650 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
24651 gcc_assert (cmp_mode != BLKmode);
24653 maskcmp = true;
24655 else
24656 cmp_mode = cmp_ops_mode;
24659 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
24660 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
24661 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
24663 if (optimize
24664 || (maskcmp && cmp_mode != mode)
24665 || (op_true && reg_overlap_mentioned_p (dest, op_true))
24666 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
24667 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
24669 /* Compare patterns for int modes are unspec in AVX512F only. */
24670 if (maskcmp && (code == GT || code == EQ))
24672 rtx (*gen)(rtx, rtx, rtx);
24674 switch (cmp_ops_mode)
24676 case V64QImode:
24677 gcc_assert (TARGET_AVX512BW);
24678 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
24679 break;
24680 case V32HImode:
24681 gcc_assert (TARGET_AVX512BW);
24682 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
24683 break;
24684 case V16SImode:
24685 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
24686 break;
24687 case V8DImode:
24688 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
24689 break;
24690 default:
24691 gen = NULL;
24694 if (gen)
24696 emit_insn (gen (dest, cmp_op0, cmp_op1));
24697 return dest;
24700 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
24702 if (cmp_mode != mode && !maskcmp)
24704 x = force_reg (cmp_ops_mode, x);
24705 convert_move (dest, x, false);
24707 else
24708 emit_insn (gen_rtx_SET (dest, x));
24710 return dest;
24713 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
24714 operations. This is used for both scalar and vector conditional moves. */
24716 void
24717 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
24719 machine_mode mode = GET_MODE (dest);
24720 machine_mode cmpmode = GET_MODE (cmp);
24722 /* In AVX512F the result of comparison is an integer mask. */
24723 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
24725 rtx t2, t3, x;
24727 /* If we have an integer mask and FP value then we need
24728 to cast mask to FP mode. */
24729 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
24731 cmp = force_reg (cmpmode, cmp);
24732 cmp = gen_rtx_SUBREG (mode, cmp, 0);
24735 if (vector_all_ones_operand (op_true, mode)
24736 && rtx_equal_p (op_false, CONST0_RTX (mode))
24737 && !maskcmp)
24739 emit_insn (gen_rtx_SET (dest, cmp));
24741 else if (op_false == CONST0_RTX (mode)
24742 && !maskcmp)
24744 op_true = force_reg (mode, op_true);
24745 x = gen_rtx_AND (mode, cmp, op_true);
24746 emit_insn (gen_rtx_SET (dest, x));
24748 else if (op_true == CONST0_RTX (mode)
24749 && !maskcmp)
24751 op_false = force_reg (mode, op_false);
24752 x = gen_rtx_NOT (mode, cmp);
24753 x = gen_rtx_AND (mode, x, op_false);
24754 emit_insn (gen_rtx_SET (dest, x));
24756 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
24757 && !maskcmp)
24759 op_false = force_reg (mode, op_false);
24760 x = gen_rtx_IOR (mode, cmp, op_false);
24761 emit_insn (gen_rtx_SET (dest, x));
24763 else if (TARGET_XOP
24764 && !maskcmp)
24766 op_true = force_reg (mode, op_true);
24768 if (!nonimmediate_operand (op_false, mode))
24769 op_false = force_reg (mode, op_false);
24771 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24772 op_true,
24773 op_false)));
24775 else
24777 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24778 rtx d = dest;
24780 if (!nonimmediate_operand (op_true, mode))
24781 op_true = force_reg (mode, op_true);
24783 op_false = force_reg (mode, op_false);
24785 switch (mode)
24787 case V4SFmode:
24788 if (TARGET_SSE4_1)
24789 gen = gen_sse4_1_blendvps;
24790 break;
24791 case V2DFmode:
24792 if (TARGET_SSE4_1)
24793 gen = gen_sse4_1_blendvpd;
24794 break;
24795 case V16QImode:
24796 case V8HImode:
24797 case V4SImode:
24798 case V2DImode:
24799 if (TARGET_SSE4_1)
24801 gen = gen_sse4_1_pblendvb;
24802 if (mode != V16QImode)
24803 d = gen_reg_rtx (V16QImode);
24804 op_false = gen_lowpart (V16QImode, op_false);
24805 op_true = gen_lowpart (V16QImode, op_true);
24806 cmp = gen_lowpart (V16QImode, cmp);
24808 break;
24809 case V8SFmode:
24810 if (TARGET_AVX)
24811 gen = gen_avx_blendvps256;
24812 break;
24813 case V4DFmode:
24814 if (TARGET_AVX)
24815 gen = gen_avx_blendvpd256;
24816 break;
24817 case V32QImode:
24818 case V16HImode:
24819 case V8SImode:
24820 case V4DImode:
24821 if (TARGET_AVX2)
24823 gen = gen_avx2_pblendvb;
24824 if (mode != V32QImode)
24825 d = gen_reg_rtx (V32QImode);
24826 op_false = gen_lowpart (V32QImode, op_false);
24827 op_true = gen_lowpart (V32QImode, op_true);
24828 cmp = gen_lowpart (V32QImode, cmp);
24830 break;
24832 case V64QImode:
24833 gen = gen_avx512bw_blendmv64qi;
24834 break;
24835 case V32HImode:
24836 gen = gen_avx512bw_blendmv32hi;
24837 break;
24838 case V16SImode:
24839 gen = gen_avx512f_blendmv16si;
24840 break;
24841 case V8DImode:
24842 gen = gen_avx512f_blendmv8di;
24843 break;
24844 case V8DFmode:
24845 gen = gen_avx512f_blendmv8df;
24846 break;
24847 case V16SFmode:
24848 gen = gen_avx512f_blendmv16sf;
24849 break;
24851 default:
24852 break;
24855 if (gen != NULL)
24857 emit_insn (gen (d, op_false, op_true, cmp));
24858 if (d != dest)
24859 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
24861 else
24863 op_true = force_reg (mode, op_true);
24865 t2 = gen_reg_rtx (mode);
24866 if (optimize)
24867 t3 = gen_reg_rtx (mode);
24868 else
24869 t3 = dest;
24871 x = gen_rtx_AND (mode, op_true, cmp);
24872 emit_insn (gen_rtx_SET (t2, x));
24874 x = gen_rtx_NOT (mode, cmp);
24875 x = gen_rtx_AND (mode, x, op_false);
24876 emit_insn (gen_rtx_SET (t3, x));
24878 x = gen_rtx_IOR (mode, t3, t2);
24879 emit_insn (gen_rtx_SET (dest, x));
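/* Illustrative sketch of the fallback select just above: without a
   blend instruction the conditional move is the classic bitwise merge,
       dest = (cmp & op_true) | (~cmp & op_false);
   performed with full-width element masks.  */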
24884 /* Expand a floating-point conditional move. Return true if successful. */
24886 bool
24887 ix86_expand_fp_movcc (rtx operands[])
24889 machine_mode mode = GET_MODE (operands[0]);
24890 enum rtx_code code = GET_CODE (operands[1]);
24891 rtx tmp, compare_op;
24892 rtx op0 = XEXP (operands[1], 0);
24893 rtx op1 = XEXP (operands[1], 1);
24895 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24897 machine_mode cmode;
24899 /* Since we've no cmove for sse registers, don't force bad register
24900 allocation just to gain access to it. Deny movcc when the
24901 comparison mode doesn't match the move mode. */
24902 cmode = GET_MODE (op0);
24903 if (cmode == VOIDmode)
24904 cmode = GET_MODE (op1);
24905 if (cmode != mode)
24906 return false;
24908 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24909 if (code == UNKNOWN)
24910 return false;
24912 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24913 operands[2], operands[3]))
24914 return true;
24916 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24917 operands[2], operands[3]);
24918 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24919 return true;
24922 if (GET_MODE (op0) == TImode
24923 || (GET_MODE (op0) == DImode
24924 && !TARGET_64BIT))
24925 return false;
24927 /* The floating point conditional move instructions don't directly
24928 support conditions resulting from a signed integer comparison. */
24930 compare_op = ix86_expand_compare (code, op0, op1);
24931 if (!fcmov_comparison_operator (compare_op, VOIDmode))
24933 tmp = gen_reg_rtx (QImode);
24934 ix86_expand_setcc (tmp, code, op0, op1);
24936 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24939 emit_insn (gen_rtx_SET (operands[0],
24940 gen_rtx_IF_THEN_ELSE (mode, compare_op,
24941 operands[2], operands[3])));
24943 return true;
24946 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24948 static int
24949 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24951 switch (code)
24953 case EQ:
24954 return 0;
24955 case LT:
24956 case LTU:
24957 return 1;
24958 case LE:
24959 case LEU:
24960 return 2;
24961 case NE:
24962 return 4;
24963 case GE:
24964 case GEU:
24965 return 5;
24966 case GT:
24967 case GTU:
24968 return 6;
24969 default:
24970 gcc_unreachable ();
24974 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
24976 static int
24977 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24979 switch (code)
24981 case EQ:
24982 return 0x00;
24983 case NE:
24984 return 0x04;
24985 case GT:
24986 return 0x0e;
24987 case LE:
24988 return 0x02;
24989 case GE:
24990 return 0x0d;
24991 case LT:
24992 return 0x01;
24993 case UNLE:
24994 return 0x0a;
24995 case UNLT:
24996 return 0x09;
24997 case UNGE:
24998 return 0x05;
24999 case UNGT:
25000 return 0x06;
25001 case UNEQ:
25002 return 0x18;
25003 case LTGT:
25004 return 0x0c;
25005 case ORDERED:
25006 return 0x07;
25007 case UNORDERED:
25008 return 0x03;
25009 default:
25010 gcc_unreachable ();
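/* Illustrative note: the values above appear to be the VCMPPS/VCMPPD
   predicate immediates (e.g. 0x01 is the ordered less-than predicate,
   0x03 the unordered one), so this table just spells the rtx comparison
   codes in that encoding for use in the UNSPEC_PCMP patterns.  */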
25014 /* Return immediate value to be used in UNSPEC_PCMP
25015 for comparison CODE in MODE. */
25017 static int
25018 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
25020 if (FLOAT_MODE_P (mode))
25021 return ix86_fp_cmp_code_to_pcmp_immediate (code);
25022 return ix86_int_cmp_code_to_pcmp_immediate (code);
25025 /* Expand AVX-512 vector comparison. */
25027 bool
25028 ix86_expand_mask_vec_cmp (rtx operands[])
25030 machine_mode mask_mode = GET_MODE (operands[0]);
25031 machine_mode cmp_mode = GET_MODE (operands[2]);
25032 enum rtx_code code = GET_CODE (operands[1]);
25033 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
25034 int unspec_code;
25035 rtx unspec;
25037 switch (code)
25039 case LEU:
25040 case GTU:
25041 case GEU:
25042 case LTU:
25043 unspec_code = UNSPEC_UNSIGNED_PCMP;
25044 break;
25046 default:
25047 unspec_code = UNSPEC_PCMP;
25050 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
25051 operands[3], imm),
25052 unspec_code);
25053 emit_insn (gen_rtx_SET (operands[0], unspec));
25055 return true;
25058 /* Expand fp vector comparison. */
25060 bool
25061 ix86_expand_fp_vec_cmp (rtx operands[])
25063 enum rtx_code code = GET_CODE (operands[1]);
25064 rtx cmp;
25066 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25067 &operands[2], &operands[3]);
25068 if (code == UNKNOWN)
25070 rtx temp;
25071 switch (GET_CODE (operands[1]))
25073 case LTGT:
25074 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
25075 operands[3], NULL, NULL);
25076 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
25077 operands[3], NULL, NULL);
25078 code = AND;
25079 break;
25080 case UNEQ:
25081 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
25082 operands[3], NULL, NULL);
25083 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
25084 operands[3], NULL, NULL);
25085 code = IOR;
25086 break;
25087 default:
25088 gcc_unreachable ();
25090 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25091 OPTAB_DIRECT);
25093 else
25094 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
25095 operands[1], operands[2]);
25097 if (operands[0] != cmp)
25098 emit_move_insn (operands[0], cmp);
25100 return true;
25103 static rtx
25104 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
25105 rtx op_true, rtx op_false, bool *negate)
25107 machine_mode data_mode = GET_MODE (dest);
25108 machine_mode mode = GET_MODE (cop0);
25109 rtx x;
25111 *negate = false;
25113 /* XOP supports all of the comparisons on all 128-bit vector int types. */
25114 if (TARGET_XOP
25115 && (mode == V16QImode || mode == V8HImode
25116 || mode == V4SImode || mode == V2DImode))
25118 else
25120 /* Canonicalize the comparison to EQ, GT, GTU. */
25121 switch (code)
25123 case EQ:
25124 case GT:
25125 case GTU:
25126 break;
25128 case NE:
25129 case LE:
25130 case LEU:
25131 code = reverse_condition (code);
25132 *negate = true;
25133 break;
25135 case GE:
25136 case GEU:
25137 code = reverse_condition (code);
25138 *negate = true;
25139 /* FALLTHRU */
25141 case LT:
25142 case LTU:
25143 std::swap (cop0, cop1);
25144 code = swap_condition (code);
25145 break;
25147 default:
25148 gcc_unreachable ();
25151 /* Only SSE4.1/SSE4.2 supports V2DImode. */
25152 if (mode == V2DImode)
25154 switch (code)
25156 case EQ:
25157 /* SSE4.1 supports EQ. */
25158 if (!TARGET_SSE4_1)
25159 return NULL;
25160 break;
25162 case GT:
25163 case GTU:
25164 /* SSE4.2 supports GT/GTU. */
25165 if (!TARGET_SSE4_2)
25166 return NULL;
25167 break;
25169 default:
25170 gcc_unreachable ();
25174 /* Unsigned parallel compare is not supported by the hardware.
25175 Play some tricks to turn this into a signed comparison
25176 against 0. */
25177 if (code == GTU)
25179 cop0 = force_reg (mode, cop0);
25181 switch (mode)
25183 case V16SImode:
25184 case V8DImode:
25185 case V8SImode:
25186 case V4DImode:
25187 case V4SImode:
25188 case V2DImode:
25190 rtx t1, t2, mask;
25191 rtx (*gen_sub3) (rtx, rtx, rtx);
25193 switch (mode)
25195 case V16SImode: gen_sub3 = gen_subv16si3; break;
25196 case V8DImode: gen_sub3 = gen_subv8di3; break;
25197 case V8SImode: gen_sub3 = gen_subv8si3; break;
25198 case V4DImode: gen_sub3 = gen_subv4di3; break;
25199 case V4SImode: gen_sub3 = gen_subv4si3; break;
25200 case V2DImode: gen_sub3 = gen_subv2di3; break;
25201 default:
25202 gcc_unreachable ();
25204 /* Subtract (-(INT MAX) - 1) from both operands to make
25205 them signed. */
25206 mask = ix86_build_signbit_mask (mode, true, false);
25207 t1 = gen_reg_rtx (mode);
25208 emit_insn (gen_sub3 (t1, cop0, mask));
25210 t2 = gen_reg_rtx (mode);
25211 emit_insn (gen_sub3 (t2, cop1, mask));
25213 cop0 = t1;
25214 cop1 = t2;
25215 code = GT;
25217 break;
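/* Illustrative sketch of the trick above: subtracting the sign-bit
   mask flips each element's sign bit, and for N-bit lanes
       (unsigned) a > (unsigned) b
         ==  (signed) (a ^ (1 << (N-1))) > (signed) (b ^ (1 << (N-1)))
   so the unsigned compare becomes the signed PCMPGT the ISA offers.  */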
25219 case V64QImode:
25220 case V32HImode:
25221 case V32QImode:
25222 case V16HImode:
25223 case V16QImode:
25224 case V8HImode:
25225 /* Perform a parallel unsigned saturating subtraction. */
25226 x = gen_reg_rtx (mode);
25227 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
25228 cop1)));
25230 cop0 = x;
25231 cop1 = CONST0_RTX (mode);
25232 code = EQ;
25233 *negate = !*negate;
25234 break;
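/* Illustrative sketch of the saturating-subtraction trick: a -us b is
   zero exactly when a <= b, so
       (unsigned) a > (unsigned) b   ==   (a -us b) != 0
   and the != 0 is expressed as EQ against zero with *negate flipped.  */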
25236 default:
25237 gcc_unreachable ();
25242 if (*negate)
25243 std::swap (op_true, op_false);
25245 /* Allow the comparison to be done in one mode, but the movcc to
25246 happen in another mode. */
25247 if (data_mode == mode)
25249 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
25250 op_true, op_false);
25252 else
25254 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
25255 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
25256 op_true, op_false);
25257 if (GET_MODE (x) == mode)
25258 x = gen_lowpart (data_mode, x);
25261 return x;
25264 /* Expand integer vector comparison. */
25266 bool
25267 ix86_expand_int_vec_cmp (rtx operands[])
25269 rtx_code code = GET_CODE (operands[1]);
25270 bool negate = false;
25271 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
25272 operands[3], NULL, NULL, &negate);
25274 if (!cmp)
25275 return false;
25277 if (negate)
25278 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
25279 CONST0_RTX (GET_MODE (cmp)),
25280 NULL, NULL, &negate);
25282 gcc_assert (!negate);
25284 if (operands[0] != cmp)
25285 emit_move_insn (operands[0], cmp);
25287 return true;
25290 /* Expand a floating-point vector conditional move; a vcond operation
25291 rather than a movcc operation. */
25293 bool
25294 ix86_expand_fp_vcond (rtx operands[])
25296 enum rtx_code code = GET_CODE (operands[3]);
25297 rtx cmp;
25299 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25300 &operands[4], &operands[5]);
25301 if (code == UNKNOWN)
25303 rtx temp;
25304 switch (GET_CODE (operands[3]))
25306 case LTGT:
25307 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
25308 operands[5], operands[0], operands[0]);
25309 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
25310 operands[5], operands[1], operands[2]);
25311 code = AND;
25312 break;
25313 case UNEQ:
25314 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
25315 operands[5], operands[0], operands[0]);
25316 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
25317 operands[5], operands[1], operands[2]);
25318 code = IOR;
25319 break;
25320 default:
25321 gcc_unreachable ();
25323 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25324 OPTAB_DIRECT);
25325 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25326 return true;
25329 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
25330 operands[5], operands[1], operands[2]))
25331 return true;
25333 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
25334 operands[1], operands[2]);
25335 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25336 return true;
25339 /* Expand a signed/unsigned integral vector conditional move. */
25341 bool
25342 ix86_expand_int_vcond (rtx operands[])
25344 machine_mode data_mode = GET_MODE (operands[0]);
25345 machine_mode mode = GET_MODE (operands[4]);
25346 enum rtx_code code = GET_CODE (operands[3]);
25347 bool negate = false;
25348 rtx x, cop0, cop1;
25350 cop0 = operands[4];
25351 cop1 = operands[5];
25353 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
25354 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
25355 if ((code == LT || code == GE)
25356 && data_mode == mode
25357 && cop1 == CONST0_RTX (mode)
25358 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
25359 && GET_MODE_UNIT_SIZE (data_mode) > 1
25360 && GET_MODE_UNIT_SIZE (data_mode) <= 8
25361 && (GET_MODE_SIZE (data_mode) == 16
25362 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
25364 rtx negop = operands[2 - (code == LT)];
25365 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
25366 if (negop == CONST1_RTX (data_mode))
25368 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
25369 operands[0], 1, OPTAB_DIRECT);
25370 if (res != operands[0])
25371 emit_move_insn (operands[0], res);
25372 return true;
25374 else if (GET_MODE_INNER (data_mode) != DImode
25375 && vector_all_ones_operand (negop, data_mode))
25377 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
25378 operands[0], 0, OPTAB_DIRECT);
25379 if (res != operands[0])
25380 emit_move_insn (operands[0], res);
25381 return true;
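/* Illustrative sketch of the two shortcuts above, per N-bit element:
       x < 0 ? -1 : 0   ==   x >> (N-1)              (arithmetic shift)
       x < 0 ?  1 : 0   ==   (unsigned) x >> (N-1)   (logical shift)
   with the GE form handled by swapping which arm is the constant.  */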
25385 if (!nonimmediate_operand (cop1, mode))
25386 cop1 = force_reg (mode, cop1);
25387 if (!general_operand (operands[1], data_mode))
25388 operands[1] = force_reg (data_mode, operands[1]);
25389 if (!general_operand (operands[2], data_mode))
25390 operands[2] = force_reg (data_mode, operands[2]);
25392 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
25393 operands[1], operands[2], &negate);
25395 if (!x)
25396 return false;
25398 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
25399 operands[2-negate]);
25400 return true;
25403 /* AVX512F does support 64-byte integer vector operations,
25404 thus the longest vector we are faced with is V64QImode. */
25405 #define MAX_VECT_LEN 64
25407 struct expand_vec_perm_d
25409 rtx target, op0, op1;
25410 unsigned char perm[MAX_VECT_LEN];
25411 machine_mode vmode;
25412 unsigned char nelt;
25413 bool one_operand_p;
25414 bool testing_p;
25417 static bool
25418 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
25419 struct expand_vec_perm_d *d)
25421 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
25422 expander, so args are either in d, or in op0, op1 etc. */
25423 machine_mode mode = GET_MODE (d ? d->op0 : op0);
25424 machine_mode maskmode = mode;
25425 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
25427 switch (mode)
25429 case V8HImode:
25430 if (TARGET_AVX512VL && TARGET_AVX512BW)
25431 gen = gen_avx512vl_vpermi2varv8hi3;
25432 break;
25433 case V16HImode:
25434 if (TARGET_AVX512VL && TARGET_AVX512BW)
25435 gen = gen_avx512vl_vpermi2varv16hi3;
25436 break;
25437 case V64QImode:
25438 if (TARGET_AVX512VBMI)
25439 gen = gen_avx512bw_vpermi2varv64qi3;
25440 break;
25441 case V32HImode:
25442 if (TARGET_AVX512BW)
25443 gen = gen_avx512bw_vpermi2varv32hi3;
25444 break;
25445 case V4SImode:
25446 if (TARGET_AVX512VL)
25447 gen = gen_avx512vl_vpermi2varv4si3;
25448 break;
25449 case V8SImode:
25450 if (TARGET_AVX512VL)
25451 gen = gen_avx512vl_vpermi2varv8si3;
25452 break;
25453 case V16SImode:
25454 if (TARGET_AVX512F)
25455 gen = gen_avx512f_vpermi2varv16si3;
25456 break;
25457 case V4SFmode:
25458 if (TARGET_AVX512VL)
25460 gen = gen_avx512vl_vpermi2varv4sf3;
25461 maskmode = V4SImode;
25463 break;
25464 case V8SFmode:
25465 if (TARGET_AVX512VL)
25467 gen = gen_avx512vl_vpermi2varv8sf3;
25468 maskmode = V8SImode;
25470 break;
25471 case V16SFmode:
25472 if (TARGET_AVX512F)
25474 gen = gen_avx512f_vpermi2varv16sf3;
25475 maskmode = V16SImode;
25477 break;
25478 case V2DImode:
25479 if (TARGET_AVX512VL)
25480 gen = gen_avx512vl_vpermi2varv2di3;
25481 break;
25482 case V4DImode:
25483 if (TARGET_AVX512VL)
25484 gen = gen_avx512vl_vpermi2varv4di3;
25485 break;
25486 case V8DImode:
25487 if (TARGET_AVX512F)
25488 gen = gen_avx512f_vpermi2varv8di3;
25489 break;
25490 case V2DFmode:
25491 if (TARGET_AVX512VL)
25493 gen = gen_avx512vl_vpermi2varv2df3;
25494 maskmode = V2DImode;
25496 break;
25497 case V4DFmode:
25498 if (TARGET_AVX512VL)
25500 gen = gen_avx512vl_vpermi2varv4df3;
25501 maskmode = V4DImode;
25503 break;
25504 case V8DFmode:
25505 if (TARGET_AVX512F)
25507 gen = gen_avx512f_vpermi2varv8df3;
25508 maskmode = V8DImode;
25510 break;
25511 default:
25512 break;
25515 if (gen == NULL)
25516 return false;
25518 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
25519 expander, so args are either in d, or in op0, op1 etc. */
25520 if (d)
25522 rtx vec[64];
25523 target = d->target;
25524 op0 = d->op0;
25525 op1 = d->op1;
25526 for (int i = 0; i < d->nelt; ++i)
25527 vec[i] = GEN_INT (d->perm[i]);
25528 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
25531 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
25532 return true;
25535 /* Expand a variable vector permutation. */
25537 void
25538 ix86_expand_vec_perm (rtx operands[])
25540 rtx target = operands[0];
25541 rtx op0 = operands[1];
25542 rtx op1 = operands[2];
25543 rtx mask = operands[3];
25544 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
25545 machine_mode mode = GET_MODE (op0);
25546 machine_mode maskmode = GET_MODE (mask);
25547 int w, e, i;
25548 bool one_operand_shuffle = rtx_equal_p (op0, op1);
25550 /* Number of elements in the vector. */
25551 w = GET_MODE_NUNITS (mode);
25552 e = GET_MODE_UNIT_SIZE (mode);
25553 gcc_assert (w <= 64);
25555 if (TARGET_AVX512F && one_operand_shuffle)
25557 rtx (*gen) (rtx, rtx, rtx) = NULL;
25558 switch (mode)
25560 case V16SImode:
25561 gen = gen_avx512f_permvarv16si;
25562 break;
25563 case V16SFmode:
25564 gen = gen_avx512f_permvarv16sf;
25565 break;
25566 case V8DImode:
25567 gen = gen_avx512f_permvarv8di;
25568 break;
25569 case V8DFmode:
25570 gen = gen_avx512f_permvarv8df;
25571 break;
25572 default:
25573 break;
25575 if (gen != NULL)
25577 emit_insn (gen (target, op0, mask));
25578 return;
25582 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
25583 return;
25585 if (TARGET_AVX2)
25587 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
25589 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
25590 a constant shuffle operand. With a tiny bit of effort we can
25591 use VPERMD instead. A re-interpretation stall for V4DFmode is
25592 unfortunate but there's no avoiding it.
25593 Similarly for V16HImode we don't have instructions for variable
25594 shuffling, while for V32QImode, after preparing suitable masks,
25595 we can use vpshufb; vpshufb; vpermq; vpor. */
25597 if (mode == V16HImode)
25599 maskmode = mode = V32QImode;
25600 w = 32;
25601 e = 1;
25603 else
25605 maskmode = mode = V8SImode;
25606 w = 8;
25607 e = 4;
25609 t1 = gen_reg_rtx (maskmode);
25611 /* Replicate the low bits of the V4DImode mask into V8SImode:
25612 mask = { A B C D }
25613 t1 = { A A B B C C D D }. */
25614 for (i = 0; i < w / 2; ++i)
25615 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
25616 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25617 vt = force_reg (maskmode, vt);
25618 mask = gen_lowpart (maskmode, mask);
25619 if (maskmode == V8SImode)
25620 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
25621 else
25622 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
25624 /* Multiply the shuffle indices by two. */
25625 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
25626 OPTAB_DIRECT);
25628 /* Add one to the odd shuffle indices:
25629 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
25630 for (i = 0; i < w / 2; ++i)
25632 vec[i * 2] = const0_rtx;
25633 vec[i * 2 + 1] = const1_rtx;
25635 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25636 vt = validize_mem (force_const_mem (maskmode, vt));
25637 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
25638 OPTAB_DIRECT);
25640 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
25641 operands[3] = mask = t1;
25642 target = gen_reg_rtx (mode);
25643 op0 = gen_lowpart (mode, op0);
25644 op1 = gen_lowpart (mode, op1);
25647 switch (mode)
25649 case V8SImode:
25650 /* The VPERMD and VPERMPS instructions already properly ignore
25651 the high bits of the shuffle elements. No need for us to
25652 perform an AND ourselves. */
25653 if (one_operand_shuffle)
25655 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
25656 if (target != operands[0])
25657 emit_move_insn (operands[0],
25658 gen_lowpart (GET_MODE (operands[0]), target));
25660 else
25662 t1 = gen_reg_rtx (V8SImode);
25663 t2 = gen_reg_rtx (V8SImode);
25664 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
25665 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
25666 goto merge_two;
25668 return;
25670 case V8SFmode:
25671 mask = gen_lowpart (V8SImode, mask);
25672 if (one_operand_shuffle)
25673 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
25674 else
25676 t1 = gen_reg_rtx (V8SFmode);
25677 t2 = gen_reg_rtx (V8SFmode);
25678 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
25679 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
25680 goto merge_two;
25682 return;
25684 case V4SImode:
25685 /* By combining the two 128-bit input vectors into one 256-bit
25686 input vector, we can use VPERMD and VPERMPS for the full
25687 two-operand shuffle. */
25688 t1 = gen_reg_rtx (V8SImode);
25689 t2 = gen_reg_rtx (V8SImode);
25690 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
25691 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25692 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
25693 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
25694 return;
25696 case V4SFmode:
25697 t1 = gen_reg_rtx (V8SFmode);
25698 t2 = gen_reg_rtx (V8SImode);
25699 mask = gen_lowpart (V4SImode, mask);
25700 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
25701 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25702 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
25703 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
25704 return;
25706 case V32QImode:
25707 t1 = gen_reg_rtx (V32QImode);
25708 t2 = gen_reg_rtx (V32QImode);
25709 t3 = gen_reg_rtx (V32QImode);
25710 vt2 = GEN_INT (-128);
25711 for (i = 0; i < 32; i++)
25712 vec[i] = vt2;
25713 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25714 vt = force_reg (V32QImode, vt);
25715 for (i = 0; i < 32; i++)
25716 vec[i] = i < 16 ? vt2 : const0_rtx;
25717 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25718 vt2 = force_reg (V32QImode, vt2);
25719 /* From mask create two adjusted masks, which contain the same
25720 bits as mask in the low 7 bits of each vector element.
25721 The first mask will have the most significant bit clear
25722 if it requests element from the same 128-bit lane
25723 and MSB set if it requests element from the other 128-bit lane.
25724 The second mask will have the opposite values of the MSB,
25725 and additionally will have its 128-bit lanes swapped.
25726 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
25727 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
25728 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
25729 stands for other 12 bytes. */
25730 /* The bit that tells whether an element comes from the same lane or
25731 the other lane is bit 4, so shift it up by 3 to the MSB position. */
25732 t5 = gen_reg_rtx (V4DImode);
25733 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
25734 GEN_INT (3)));
25735 /* Clear MSB bits from the mask just in case it had them set. */
25736 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
25737 /* After this t1 will have MSB set for elements from other lane. */
25738 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
25739 /* Clear bits other than MSB. */
25740 emit_insn (gen_andv32qi3 (t1, t1, vt));
25741 /* Or in the lower bits from mask into t3. */
25742 emit_insn (gen_iorv32qi3 (t3, t1, t2));
25743 /* And invert MSB bits in t1, so MSB is set for elements from the same
25744 lane. */
25745 emit_insn (gen_xorv32qi3 (t1, t1, vt));
25746 /* Swap 128-bit lanes in t3. */
25747 t6 = gen_reg_rtx (V4DImode);
25748 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
25749 const2_rtx, GEN_INT (3),
25750 const0_rtx, const1_rtx));
25751 /* And or in the lower bits from mask into t1. */
25752 emit_insn (gen_iorv32qi3 (t1, t1, t2));
25753 if (one_operand_shuffle)
25755 /* Each of these shuffles will put 0s in places where an
25756 element from the other 128-bit lane is needed; otherwise
25757 it will shuffle in the requested value. */
25758 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
25759 gen_lowpart (V32QImode, t6)));
25760 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
25761 /* For t3 the 128-bit lanes are swapped again. */
25762 t7 = gen_reg_rtx (V4DImode);
25763 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
25764 const2_rtx, GEN_INT (3),
25765 const0_rtx, const1_rtx));
25766 /* And oring both together leads to the result. */
25767 emit_insn (gen_iorv32qi3 (target, t1,
25768 gen_lowpart (V32QImode, t7)));
25769 if (target != operands[0])
25770 emit_move_insn (operands[0],
25771 gen_lowpart (GET_MODE (operands[0]), target));
25772 return;
25775 t4 = gen_reg_rtx (V32QImode);
25776 /* Similarly to the above one_operand_shuffle code, just
25777 repeated twice, once for each operand. The merge_two:
25778 code below will merge the two results together. */
25779 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25780 gen_lowpart (V32QImode, t6)));
25781 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25782 gen_lowpart (V32QImode, t6)));
25783 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25784 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25785 t7 = gen_reg_rtx (V4DImode);
25786 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25787 const2_rtx, GEN_INT (3),
25788 const0_rtx, const1_rtx));
25789 t8 = gen_reg_rtx (V4DImode);
25790 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25791 const2_rtx, GEN_INT (3),
25792 const0_rtx, const1_rtx));
25793 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25794 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25795 t1 = t4;
25796 t2 = t3;
25797 goto merge_two;
25799 default:
25800 gcc_assert (GET_MODE_SIZE (mode) <= 16);
25801 break;
25805 if (TARGET_XOP)
25807 /* The XOP VPPERM insn supports three inputs. By ignoring the
25808 one_operand_shuffle special case, we avoid creating another
25809 set of constant vectors in memory. */
25810 one_operand_shuffle = false;
25812 /* mask = mask & {2*w-1, ...} */
25813 vt = GEN_INT (2*w - 1);
25815 else
25817 /* mask = mask & {w-1, ...} */
25818 vt = GEN_INT (w - 1);
25821 for (i = 0; i < w; i++)
25822 vec[i] = vt;
25823 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25824 mask = expand_simple_binop (maskmode, AND, mask, vt,
25825 NULL_RTX, 0, OPTAB_DIRECT);
25827 /* For non-QImode operations, convert the word permutation control
25828 into a byte permutation control. */
25829 if (mode != V16QImode)
25831 mask = expand_simple_binop (maskmode, ASHIFT, mask,
25832 GEN_INT (exact_log2 (e)),
25833 NULL_RTX, 0, OPTAB_DIRECT);
25835 /* Convert mask to vector of chars. */
25836 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25838 /* Replicate each of the input bytes into byte positions:
25839 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25840 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25841 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25842 for (i = 0; i < 16; ++i)
25843 vec[i] = GEN_INT (i/e * e);
25844 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25845 vt = validize_mem (force_const_mem (V16QImode, vt));
25846 if (TARGET_XOP)
25847 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25848 else
25849 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25851 /* Convert it into the byte positions by doing
25852 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
25853 for (i = 0; i < 16; ++i)
25854 vec[i] = GEN_INT (i % e);
25855 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25856 vt = validize_mem (force_const_mem (V16QImode, vt));
25857 emit_insn (gen_addv16qi3 (mask, mask, vt));
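/* Illustrative sketch of the conversion above for a V4SI shuffle: a
   word index k is turned into the byte indices of that word,
       k  ->  4*k  ->  {4k,4k,4k,4k}  ->  {4k, 4k+1, 4k+2, 4k+3}
   (shift, replicate with pshufb, add {0,1,2,3,...}), which is the form
   the V16QImode pshufb below expects.  */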
25860 /* The actual shuffle operations all operate on V16QImode. */
25861 op0 = gen_lowpart (V16QImode, op0);
25862 op1 = gen_lowpart (V16QImode, op1);
25864 if (TARGET_XOP)
25866 if (GET_MODE (target) != V16QImode)
25867 target = gen_reg_rtx (V16QImode);
25868 emit_insn (gen_xop_pperm (target, op0, op1, mask));
25869 if (target != operands[0])
25870 emit_move_insn (operands[0],
25871 gen_lowpart (GET_MODE (operands[0]), target));
25873 else if (one_operand_shuffle)
25875 if (GET_MODE (target) != V16QImode)
25876 target = gen_reg_rtx (V16QImode);
25877 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25878 if (target != operands[0])
25879 emit_move_insn (operands[0],
25880 gen_lowpart (GET_MODE (operands[0]), target));
25882 else
25884 rtx xops[6];
25885 bool ok;
25887 /* Shuffle the two input vectors independently. */
25888 t1 = gen_reg_rtx (V16QImode);
25889 t2 = gen_reg_rtx (V16QImode);
25890 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25891 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25893 merge_two:
25894 /* Then merge them together. The key is whether any given control
25895 element contained a bit set that indicates the second word. */
25896 mask = operands[3];
25897 vt = GEN_INT (w);
25898 if (maskmode == V2DImode && !TARGET_SSE4_1)
25900 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
25901 more shuffle to convert the V2DI input mask into a V4SI
25902 input mask. At that point the masking that ix86_expand_int_vcond
25903 performs will work as desired. */
25904 rtx t3 = gen_reg_rtx (V4SImode);
25905 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25906 const0_rtx, const0_rtx,
25907 const2_rtx, const2_rtx));
25908 mask = t3;
25909 maskmode = V4SImode;
25910 e = w = 4;
25913 for (i = 0; i < w; i++)
25914 vec[i] = vt;
25915 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25916 vt = force_reg (maskmode, vt);
25917 mask = expand_simple_binop (maskmode, AND, mask, vt,
25918 NULL_RTX, 0, OPTAB_DIRECT);
25920 if (GET_MODE (target) != mode)
25921 target = gen_reg_rtx (mode);
25922 xops[0] = target;
25923 xops[1] = gen_lowpart (mode, t2);
25924 xops[2] = gen_lowpart (mode, t1);
25925 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25926 xops[4] = mask;
25927 xops[5] = vt;
25928 ok = ix86_expand_int_vcond (xops);
25929 gcc_assert (ok);
25930 if (target != operands[0])
25931 emit_move_insn (operands[0],
25932 gen_lowpart (GET_MODE (operands[0]), target));
25936 /* Unpack SRC into DEST, the next wider integer vector type. UNSIGNED_P is
25937 true if we should do zero extension, else sign extension. HIGH_P is
25938 true if we want the N/2 high elements, else the low elements. */
25940 void
25941 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25943 machine_mode imode = GET_MODE (src);
25944 rtx tmp;
25946 if (TARGET_SSE4_1)
25948 rtx (*unpack)(rtx, rtx);
25949 rtx (*extract)(rtx, rtx) = NULL;
25950 machine_mode halfmode = BLKmode;
25952 switch (imode)
25954 case V64QImode:
25955 if (unsigned_p)
25956 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25957 else
25958 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25959 halfmode = V32QImode;
25960 extract
25961 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25962 break;
25963 case V32QImode:
25964 if (unsigned_p)
25965 unpack = gen_avx2_zero_extendv16qiv16hi2;
25966 else
25967 unpack = gen_avx2_sign_extendv16qiv16hi2;
25968 halfmode = V16QImode;
25969 extract
25970 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25971 break;
25972 case V32HImode:
25973 if (unsigned_p)
25974 unpack = gen_avx512f_zero_extendv16hiv16si2;
25975 else
25976 unpack = gen_avx512f_sign_extendv16hiv16si2;
25977 halfmode = V16HImode;
25978 extract
25979 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25980 break;
25981 case V16HImode:
25982 if (unsigned_p)
25983 unpack = gen_avx2_zero_extendv8hiv8si2;
25984 else
25985 unpack = gen_avx2_sign_extendv8hiv8si2;
25986 halfmode = V8HImode;
25987 extract
25988 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25989 break;
25990 case V16SImode:
25991 if (unsigned_p)
25992 unpack = gen_avx512f_zero_extendv8siv8di2;
25993 else
25994 unpack = gen_avx512f_sign_extendv8siv8di2;
25995 halfmode = V8SImode;
25996 extract
25997 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
25998 break;
25999 case V8SImode:
26000 if (unsigned_p)
26001 unpack = gen_avx2_zero_extendv4siv4di2;
26002 else
26003 unpack = gen_avx2_sign_extendv4siv4di2;
26004 halfmode = V4SImode;
26005 extract
26006 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
26007 break;
26008 case V16QImode:
26009 if (unsigned_p)
26010 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
26011 else
26012 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
26013 break;
26014 case V8HImode:
26015 if (unsigned_p)
26016 unpack = gen_sse4_1_zero_extendv4hiv4si2;
26017 else
26018 unpack = gen_sse4_1_sign_extendv4hiv4si2;
26019 break;
26020 case V4SImode:
26021 if (unsigned_p)
26022 unpack = gen_sse4_1_zero_extendv2siv2di2;
26023 else
26024 unpack = gen_sse4_1_sign_extendv2siv2di2;
26025 break;
26026 default:
26027 gcc_unreachable ();
26030 if (GET_MODE_SIZE (imode) >= 32)
26032 tmp = gen_reg_rtx (halfmode);
26033 emit_insn (extract (tmp, src));
26035 else if (high_p)
26037 /* Shift higher 8 bytes to lower 8 bytes. */
26038 tmp = gen_reg_rtx (V1TImode);
26039 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
26040 GEN_INT (64)));
26041 tmp = gen_lowpart (imode, tmp);
26043 else
26044 tmp = src;
26046 emit_insn (unpack (dest, tmp));
26048 else
26050 rtx (*unpack)(rtx, rtx, rtx);
26052 switch (imode)
26054 case V16QImode:
26055 if (high_p)
26056 unpack = gen_vec_interleave_highv16qi;
26057 else
26058 unpack = gen_vec_interleave_lowv16qi;
26059 break;
26060 case V8HImode:
26061 if (high_p)
26062 unpack = gen_vec_interleave_highv8hi;
26063 else
26064 unpack = gen_vec_interleave_lowv8hi;
26065 break;
26066 case V4SImode:
26067 if (high_p)
26068 unpack = gen_vec_interleave_highv4si;
26069 else
26070 unpack = gen_vec_interleave_lowv4si;
26071 break;
26072 default:
26073 gcc_unreachable ();
26076 if (unsigned_p)
26077 tmp = force_reg (imode, CONST0_RTX (imode));
26078 else
26079 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
26080 src, pc_rtx, pc_rtx);
26082 rtx tmp2 = gen_reg_rtx (imode);
26083 emit_insn (unpack (tmp2, src, tmp));
26084 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
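/* Illustrative sketch of the pre-SSE4.1 path above: widening is done by
   interleaving each element with its extension bits,
       zero extension:  interleave (src, 0)
       sign extension:  interleave (src, (0 > src) mask)
   where the second operand supplies the high half of every widened
   element.  */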
26088 /* Expand conditional increment or decrement using adc/sbb instructions.
26089 The default case using setcc followed by the conditional move can be
26090 done by generic code. */
26091 bool
26092 ix86_expand_int_addcc (rtx operands[])
26094 enum rtx_code code = GET_CODE (operands[1]);
26095 rtx flags;
26096 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
26097 rtx compare_op;
26098 rtx val = const0_rtx;
26099 bool fpcmp = false;
26100 machine_mode mode;
26101 rtx op0 = XEXP (operands[1], 0);
26102 rtx op1 = XEXP (operands[1], 1);
26104 if (operands[3] != const1_rtx
26105 && operands[3] != constm1_rtx)
26106 return false;
26107 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
26108 return false;
26109 code = GET_CODE (compare_op);
26111 flags = XEXP (compare_op, 0);
26113 if (GET_MODE (flags) == CCFPmode
26114 || GET_MODE (flags) == CCFPUmode)
26116 fpcmp = true;
26117 code = ix86_fp_compare_code_to_integer (code);
26120 if (code != LTU)
26122 val = constm1_rtx;
26123 if (fpcmp)
26124 PUT_CODE (compare_op,
26125 reverse_condition_maybe_unordered
26126 (GET_CODE (compare_op)));
26127 else
26128 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
26131 mode = GET_MODE (operands[0]);
26133 /* Construct either adc or sbb insn. */
26134 if ((code == LTU) == (operands[3] == constm1_rtx))
26136 switch (mode)
26138 case QImode:
26139 insn = gen_subqi3_carry;
26140 break;
26141 case HImode:
26142 insn = gen_subhi3_carry;
26143 break;
26144 case SImode:
26145 insn = gen_subsi3_carry;
26146 break;
26147 case DImode:
26148 insn = gen_subdi3_carry;
26149 break;
26150 default:
26151 gcc_unreachable ();
26154 else
26156 switch (mode)
26158 case QImode:
26159 insn = gen_addqi3_carry;
26160 break;
26161 case HImode:
26162 insn = gen_addhi3_carry;
26163 break;
26164 case SImode:
26165 insn = gen_addsi3_carry;
26166 break;
26167 case DImode:
26168 insn = gen_adddi3_carry;
26169 break;
26170 default:
26171 gcc_unreachable ();
26174 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
26176 return true;
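/* Illustrative sketch of the adc/sbb idiom expanded above: with the
   unsigned "a < b" sitting in the carry flag after the compare,
       dest = src + (a < b);    /* compare, then adc $0  */
       dest = src - (a < b);    /* compare, then sbb $0  */
   so a conditional increment or decrement needs no branch or cmove.  */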
26180 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
26181 but works for floating point parameters and non-offsettable memories.
26182 For pushes, it returns just stack offsets; the values will be saved
26183 in the right order. Up to four parts are generated. */
26185 static int
26186 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
26188 int size;
26190 if (!TARGET_64BIT)
26191 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
26192 else
26193 size = (GET_MODE_SIZE (mode) + 4) / 8;
26195 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
26196 gcc_assert (size >= 2 && size <= 4);
26198 /* Optimize constant pool reference to immediates. This is used by fp
26199 moves that force all constants to memory to allow combining. */
26200 if (MEM_P (operand) && MEM_READONLY_P (operand))
26202 rtx tmp = maybe_get_pool_constant (operand);
26203 if (tmp)
26204 operand = tmp;
26207 if (MEM_P (operand) && !offsettable_memref_p (operand))
26210 /* The only non-offsettable memories we handle are pushes. */
26210 int ok = push_operand (operand, VOIDmode);
26212 gcc_assert (ok);
26214 operand = copy_rtx (operand);
26215 PUT_MODE (operand, word_mode);
26216 parts[0] = parts[1] = parts[2] = parts[3] = operand;
26217 return size;
26220 if (GET_CODE (operand) == CONST_VECTOR)
26222 machine_mode imode = int_mode_for_mode (mode);
26223 /* Caution: if we looked through a constant pool memory above,
26224 the operand may actually have a different mode now. That's
26225 ok, since we want to pun this all the way back to an integer. */
26226 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
26227 gcc_assert (operand != NULL);
26228 mode = imode;
26231 if (!TARGET_64BIT)
26233 if (mode == DImode)
26234 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26235 else
26237 int i;
26239 if (REG_P (operand))
26241 gcc_assert (reload_completed);
26242 for (i = 0; i < size; i++)
26243 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
26245 else if (offsettable_memref_p (operand))
26247 operand = adjust_address (operand, SImode, 0);
26248 parts[0] = operand;
26249 for (i = 1; i < size; i++)
26250 parts[i] = adjust_address (operand, SImode, 4 * i);
26252 else if (CONST_DOUBLE_P (operand))
26254 const REAL_VALUE_TYPE *r;
26255 long l[4];
26257 r = CONST_DOUBLE_REAL_VALUE (operand);
26258 switch (mode)
26260 case TFmode:
26261 real_to_target (l, r, mode);
26262 parts[3] = gen_int_mode (l[3], SImode);
26263 parts[2] = gen_int_mode (l[2], SImode);
26264 break;
26265 case XFmode:
26266 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
26267 long double may not be 80-bit. */
26268 real_to_target (l, r, mode);
26269 parts[2] = gen_int_mode (l[2], SImode);
26270 break;
26271 case DFmode:
26272 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
26273 break;
26274 default:
26275 gcc_unreachable ();
26277 parts[1] = gen_int_mode (l[1], SImode);
26278 parts[0] = gen_int_mode (l[0], SImode);
26280 else
26281 gcc_unreachable ();
26284 else
26286 if (mode == TImode)
26287 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26288 if (mode == XFmode || mode == TFmode)
26290 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
26291 if (REG_P (operand))
26293 gcc_assert (reload_completed);
26294 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
26295 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
26297 else if (offsettable_memref_p (operand))
26299 operand = adjust_address (operand, DImode, 0);
26300 parts[0] = operand;
26301 parts[1] = adjust_address (operand, upper_mode, 8);
26303 else if (CONST_DOUBLE_P (operand))
26305 long l[4];
26307 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
26309 /* real_to_target puts 32-bit pieces in each long. */
26310 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
26311 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
26312 << 32), DImode);
26314 if (upper_mode == SImode)
26315 parts[1] = gen_int_mode (l[2], SImode);
26316 else
26317 parts[1]
26318 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
26319 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
26320 << 32), DImode);
26322 else
26323 gcc_unreachable ();
26327 return size;
26330 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
26331 All required insns are emitted by this function.
26332 Operands 2-4 contain the input values
26333 in the correct order; operands 5-7 contain the output values. */
26335 void
26336 ix86_split_long_move (rtx operands[])
26338 rtx part[2][4];
26339 int nparts, i, j;
26340 int push = 0;
26341 int collisions = 0;
26342 machine_mode mode = GET_MODE (operands[0]);
26343 bool collisionparts[4];
26345 /* The DFmode expanders may ask us to move a double.
26346 For a 64-bit target this is a single move. By hiding the fact
26347 here we simplify the i386.md splitters. */
26348 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
26350 /* Optimize constant pool reference to immediates. This is used by
26351 fp moves that force all constants to memory to allow combining. */
26353 if (MEM_P (operands[1])
26354 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
26355 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
26356 operands[1] = get_pool_constant (XEXP (operands[1], 0));
26357 if (push_operand (operands[0], VOIDmode))
26359 operands[0] = copy_rtx (operands[0]);
26360 PUT_MODE (operands[0], word_mode);
26362 else
26363 operands[0] = gen_lowpart (DImode, operands[0]);
26364 operands[1] = gen_lowpart (DImode, operands[1]);
26365 emit_move_insn (operands[0], operands[1]);
26366 return;
26369 /* The only non-offsettable memory we handle is a push. */
26370 if (push_operand (operands[0], VOIDmode))
26371 push = 1;
26372 else
26373 gcc_assert (!MEM_P (operands[0])
26374 || offsettable_memref_p (operands[0]));
26376 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
26377 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
26379 /* When emitting a push, take care of source operands on the stack. */
26380 if (push && MEM_P (operands[1])
26381 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
26383 rtx src_base = XEXP (part[1][nparts - 1], 0);
26385 /* Compensate for the stack decrement by 4. */
26386 if (!TARGET_64BIT && nparts == 3
26387 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
26388 src_base = plus_constant (Pmode, src_base, 4);
26390 /* src_base refers to the stack pointer and is
26391 automatically decreased by the emitted pushes. */
26392 for (i = 0; i < nparts; i++)
26393 part[1][i] = change_address (part[1][i],
26394 GET_MODE (part[1][i]), src_base);
26397 /* We need to do the copy in the right order in case an address register
26398 of the source overlaps the destination. */
26399 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
26401 rtx tmp;
26403 for (i = 0; i < nparts; i++)
26405 collisionparts[i]
26406 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
26407 if (collisionparts[i])
26408 collisions++;
26411 /* Collision in the middle part can be handled by reordering. */
26412 if (collisions == 1 && nparts == 3 && collisionparts [1])
26414 std::swap (part[0][1], part[0][2]);
26415 std::swap (part[1][1], part[1][2]);
26417 else if (collisions == 1
26418 && nparts == 4
26419 && (collisionparts [1] || collisionparts [2]))
26421 if (collisionparts [1])
26423 std::swap (part[0][1], part[0][2]);
26424 std::swap (part[1][1], part[1][2]);
26426 else
26428 std::swap (part[0][2], part[0][3]);
26429 std::swap (part[1][2], part[1][3]);
26433 /* If there are more collisions, we can't handle it by reordering.
26434 Do an lea to the last part and use only one colliding move. */
26435 else if (collisions > 1)
26437 rtx base, addr, tls_base = NULL_RTX;
26439 collisions = 1;
26441 base = part[0][nparts - 1];
26443 /* Handle the case when the last part isn't valid for lea.
26444 Happens in 64-bit mode storing the 12-byte XFmode. */
26445 if (GET_MODE (base) != Pmode)
26446 base = gen_rtx_REG (Pmode, REGNO (base));
26448 addr = XEXP (part[1][0], 0);
26449 if (TARGET_TLS_DIRECT_SEG_REFS)
26451 struct ix86_address parts;
26452 int ok = ix86_decompose_address (addr, &parts);
26453 gcc_assert (ok);
26454 if (parts.seg == DEFAULT_TLS_SEG_REG)
26456 /* It is not valid to use %gs: or %fs: in
26457 lea though, so we need to remove it from the
26458 address used for lea and add it to each individual
26459 memory load instead. */
26460 addr = copy_rtx (addr);
26461 rtx *x = &addr;
26462 while (GET_CODE (*x) == PLUS)
26464 for (i = 0; i < 2; i++)
26466 rtx u = XEXP (*x, i);
26467 if (GET_CODE (u) == ZERO_EXTEND)
26468 u = XEXP (u, 0);
26469 if (GET_CODE (u) == UNSPEC
26470 && XINT (u, 1) == UNSPEC_TP)
26472 tls_base = XEXP (*x, i);
26473 *x = XEXP (*x, 1 - i);
26474 break;
26477 if (tls_base)
26478 break;
26479 x = &XEXP (*x, 0);
26481 gcc_assert (tls_base);
26484 emit_insn (gen_rtx_SET (base, addr));
26485 if (tls_base)
26486 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
26487 part[1][0] = replace_equiv_address (part[1][0], base);
26488 for (i = 1; i < nparts; i++)
26490 if (tls_base)
26491 base = copy_rtx (base);
26492 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
26493 part[1][i] = replace_equiv_address (part[1][i], tmp);
26498 if (push)
26500 if (!TARGET_64BIT)
26502 if (nparts == 3)
26504 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
26505 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
26506 stack_pointer_rtx, GEN_INT (-4)));
26507 emit_move_insn (part[0][2], part[1][2]);
26509 else if (nparts == 4)
26511 emit_move_insn (part[0][3], part[1][3]);
26512 emit_move_insn (part[0][2], part[1][2]);
26515 else
26517 /* In 64-bit mode we don't have a 32-bit push available. In case this is
26518 a register, that is OK - we will just use the larger counterpart. We also
26519 retype memory - these come from an attempt to avoid a REX prefix on
26520 moving the second half of a TFmode value. */
26521 if (GET_MODE (part[1][1]) == SImode)
26523 switch (GET_CODE (part[1][1]))
26525 case MEM:
26526 part[1][1] = adjust_address (part[1][1], DImode, 0);
26527 break;
26529 case REG:
26530 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
26531 break;
26533 default:
26534 gcc_unreachable ();
26537 if (GET_MODE (part[1][0]) == SImode)
26538 part[1][0] = part[1][1];
26541 emit_move_insn (part[0][1], part[1][1]);
26542 emit_move_insn (part[0][0], part[1][0]);
26543 return;
26546 /* Choose the correct order so as not to overwrite the source before it is copied. */
26547 if ((REG_P (part[0][0])
26548 && REG_P (part[1][1])
26549 && (REGNO (part[0][0]) == REGNO (part[1][1])
26550 || (nparts == 3
26551 && REGNO (part[0][0]) == REGNO (part[1][2]))
26552 || (nparts == 4
26553 && REGNO (part[0][0]) == REGNO (part[1][3]))))
26554 || (collisions > 0
26555 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
26557 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
26559 operands[2 + i] = part[0][j];
26560 operands[6 + i] = part[1][j];
26563 else
26565 for (i = 0; i < nparts; i++)
26567 operands[2 + i] = part[0][i];
26568 operands[6 + i] = part[1][i];
26572 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
26573 if (optimize_insn_for_size_p ())
26575 for (j = 0; j < nparts - 1; j++)
26576 if (CONST_INT_P (operands[6 + j])
26577 && operands[6 + j] != const0_rtx
26578 && REG_P (operands[2 + j]))
26579 for (i = j; i < nparts - 1; i++)
26580 if (CONST_INT_P (operands[7 + i])
26581 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
26582 operands[7 + i] = operands[2 + j];
26585 for (i = 0; i < nparts; i++)
26586 emit_move_insn (operands[2 + i], operands[6 + i]);
26588 return;
26591 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
26592 left shift by a constant, either using a single shift or
26593 a sequence of add instructions. */
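/* Illustrative note (not from the original sources): a shift left by 1 is
   always emitted as OPERAND = OPERAND + OPERAND, while e.g. a shift by 2
   becomes two additions only when two adds cost no more than a single
   shift-by-constant and we are not optimizing for size.  OPERAND is one half
   of the double-word value, which is why the DImode split below selects
   gen_addsi3/gen_ashlsi3.  */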
26595 static void
26596 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
26598 rtx (*insn)(rtx, rtx, rtx);
26600 if (count == 1
26601 || (count * ix86_cost->add <= ix86_cost->shift_const
26602 && !optimize_insn_for_size_p ()))
26604 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
26605 while (count-- > 0)
26606 emit_insn (insn (operand, operand, operand));
26608 else
26610 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26611 emit_insn (insn (operand, operand, GEN_INT (count)));
26615 void
26616 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
26618 rtx (*gen_ashl3)(rtx, rtx, rtx);
26619 rtx (*gen_shld)(rtx, rtx, rtx);
26620 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26622 rtx low[2], high[2];
26623 int count;
26625 if (CONST_INT_P (operands[2]))
26627 split_double_mode (mode, operands, 2, low, high);
26628 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26630 if (count >= half_width)
26632 emit_move_insn (high[0], low[1]);
26633 emit_move_insn (low[0], const0_rtx);
26635 if (count > half_width)
26636 ix86_expand_ashl_const (high[0], count - half_width, mode);
26638 else
26640 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26642 if (!rtx_equal_p (operands[0], operands[1]))
26643 emit_move_insn (operands[0], operands[1]);
26645 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
26646 ix86_expand_ashl_const (low[0], count, mode);
26648 return;
26651 split_double_mode (mode, operands, 1, low, high);
26653 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26655 if (operands[1] == const1_rtx)
26657 /* Assuming we've chosen QImode-capable registers, then 1 << N
26658 can be done with two 32/64-bit shifts, no branches, no cmoves. */
26659 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
26661 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
26663 ix86_expand_clear (low[0]);
26664 ix86_expand_clear (high[0]);
26665 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
26667 d = gen_lowpart (QImode, low[0]);
26668 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26669 s = gen_rtx_EQ (QImode, flags, const0_rtx);
26670 emit_insn (gen_rtx_SET (d, s));
26672 d = gen_lowpart (QImode, high[0]);
26673 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26674 s = gen_rtx_NE (QImode, flags, const0_rtx);
26675 emit_insn (gen_rtx_SET (d, s));
26678 /* Otherwise, we can get the same results by manually performing
26679 a bit extract operation on bit 5/6, and then performing the two
26680 shifts. The two methods of getting 0/1 into low/high are exactly
26681 the same size. Avoiding the shift in the bit extract case helps
26682 pentium4 a bit; no one else seems to care much either way. */
26683 else
26685 machine_mode half_mode;
26686 rtx (*gen_lshr3)(rtx, rtx, rtx);
26687 rtx (*gen_and3)(rtx, rtx, rtx);
26688 rtx (*gen_xor3)(rtx, rtx, rtx);
26689 HOST_WIDE_INT bits;
26690 rtx x;
26692 if (mode == DImode)
26694 half_mode = SImode;
26695 gen_lshr3 = gen_lshrsi3;
26696 gen_and3 = gen_andsi3;
26697 gen_xor3 = gen_xorsi3;
26698 bits = 5;
26700 else
26702 half_mode = DImode;
26703 gen_lshr3 = gen_lshrdi3;
26704 gen_and3 = gen_anddi3;
26705 gen_xor3 = gen_xordi3;
26706 bits = 6;
26709 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
26710 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
26711 else
26712 x = gen_lowpart (half_mode, operands[2]);
26713 emit_insn (gen_rtx_SET (high[0], x));
26715 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
26716 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
26717 emit_move_insn (low[0], high[0]);
26718 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
26721 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26722 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
26723 return;
26726 if (operands[1] == constm1_rtx)
26728 /* For -1 << N, we can avoid the shld instruction, because we
26729 know that we're shifting 0...31/63 ones into a -1. */
26730 emit_move_insn (low[0], constm1_rtx);
26731 if (optimize_insn_for_size_p ())
26732 emit_move_insn (high[0], low[0]);
26733 else
26734 emit_move_insn (high[0], constm1_rtx);
26736 else
26738 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26740 if (!rtx_equal_p (operands[0], operands[1]))
26741 emit_move_insn (operands[0], operands[1]);
26743 split_double_mode (mode, operands, 1, low, high);
26744 emit_insn (gen_shld (high[0], low[0], operands[2]));
26747 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26749 if (TARGET_CMOVE && scratch)
26751 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26752 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26754 ix86_expand_clear (scratch);
26755 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
26757 else
26759 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26760 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26762 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
26766 void
26767 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
26769 rtx (*gen_ashr3)(rtx, rtx, rtx)
26770 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
26771 rtx (*gen_shrd)(rtx, rtx, rtx);
26772 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26774 rtx low[2], high[2];
26775 int count;
26777 if (CONST_INT_P (operands[2]))
26779 split_double_mode (mode, operands, 2, low, high);
26780 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26782 if (count == GET_MODE_BITSIZE (mode) - 1)
26784 emit_move_insn (high[0], high[1]);
26785 emit_insn (gen_ashr3 (high[0], high[0],
26786 GEN_INT (half_width - 1)));
26787 emit_move_insn (low[0], high[0]);
26790 else if (count >= half_width)
26792 emit_move_insn (low[0], high[1]);
26793 emit_move_insn (high[0], low[0]);
26794 emit_insn (gen_ashr3 (high[0], high[0],
26795 GEN_INT (half_width - 1)));
26797 if (count > half_width)
26798 emit_insn (gen_ashr3 (low[0], low[0],
26799 GEN_INT (count - half_width)));
26801 else
26803 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26805 if (!rtx_equal_p (operands[0], operands[1]))
26806 emit_move_insn (operands[0], operands[1]);
26808 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26809 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
26812 else
26814 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26816 if (!rtx_equal_p (operands[0], operands[1]))
26817 emit_move_insn (operands[0], operands[1]);
26819 split_double_mode (mode, operands, 1, low, high);
26821 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26822 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
26824 if (TARGET_CMOVE && scratch)
26826 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26827 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26829 emit_move_insn (scratch, high[0]);
26830 emit_insn (gen_ashr3 (scratch, scratch,
26831 GEN_INT (half_width - 1)));
26832 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26833 scratch));
26835 else
26837 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26838 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26840 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
26845 void
26846 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26848 rtx (*gen_lshr3)(rtx, rtx, rtx)
26849 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26850 rtx (*gen_shrd)(rtx, rtx, rtx);
26851 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26853 rtx low[2], high[2];
26854 int count;
26856 if (CONST_INT_P (operands[2]))
26858 split_double_mode (mode, operands, 2, low, high);
26859 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26861 if (count >= half_width)
26863 emit_move_insn (low[0], high[1]);
26864 ix86_expand_clear (high[0]);
26866 if (count > half_width)
26867 emit_insn (gen_lshr3 (low[0], low[0],
26868 GEN_INT (count - half_width)));
26870 else
26872 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26874 if (!rtx_equal_p (operands[0], operands[1]))
26875 emit_move_insn (operands[0], operands[1]);
26877 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26878 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26881 else
26883 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26885 if (!rtx_equal_p (operands[0], operands[1]))
26886 emit_move_insn (operands[0], operands[1]);
26888 split_double_mode (mode, operands, 1, low, high);
26890 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26891 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26893 if (TARGET_CMOVE && scratch)
26895 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26896 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26898 ix86_expand_clear (scratch);
26899 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26900 scratch));
26902 else
26904 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26905 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26907 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26912 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
26913 static void
26914 predict_jump (int prob)
26916 rtx_insn *insn = get_last_insn ();
26917 gcc_assert (JUMP_P (insn));
26918 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
26921 /* Helper function for the string operations below. Test whether VARIABLE
26922 is aligned to VALUE bytes. If so, jump to the label. */
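/* Illustrative note (not from the original sources): the emitted branch to
   the returned label is taken when (VARIABLE & VALUE) == 0, so callers place
   the VALUE-byte move or store between this call and emit_label (label);
   expand_movmem_epilogue and expand_setmem_epilogue below use it that way.  */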
26923 static rtx_code_label *
26924 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26926 rtx_code_label *label = gen_label_rtx ();
26927 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26928 if (GET_MODE (variable) == DImode)
26929 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26930 else
26931 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26932 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26933 1, label);
26934 if (epilogue)
26935 predict_jump (REG_BR_PROB_BASE * 50 / 100);
26936 else
26937 predict_jump (REG_BR_PROB_BASE * 90 / 100);
26938 return label;
26941 /* Adjust COUNTER by the VALUE. */
26942 static void
26943 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26945 rtx (*gen_add)(rtx, rtx, rtx)
26946 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26948 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26951 /* Zero extend possibly SImode EXP to Pmode register. */
26953 ix86_zero_extend_to_Pmode (rtx exp)
26955 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26958 /* Divide COUNTREG by SCALE. */
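/* Illustrative example (not from the original sources): for a rep movs/stos
   operating in SImode, SCALE is 4, so a constant byte count of 259 yields
   GEN_INT (64); any leftover bytes are handled by the caller's epilogue.
   Non-constant counts are shifted right by exact_log2 (SCALE) instead.  */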
26959 static rtx
26960 scale_counter (rtx countreg, int scale)
26962 rtx sc;
26964 if (scale == 1)
26965 return countreg;
26966 if (CONST_INT_P (countreg))
26967 return GEN_INT (INTVAL (countreg) / scale);
26968 gcc_assert (REG_P (countreg));
26970 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26971 GEN_INT (exact_log2 (scale)),
26972 NULL, 1, OPTAB_DIRECT);
26973 return sc;
26976 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26977 DImode for constant loop counts. */
26979 static machine_mode
26980 counter_mode (rtx count_exp)
26982 if (GET_MODE (count_exp) != VOIDmode)
26983 return GET_MODE (count_exp);
26984 if (!CONST_INT_P (count_exp))
26985 return Pmode;
26986 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26987 return DImode;
26988 return SImode;
26991 /* Copy the address to a Pmode register. This is used for x32 to
26992 truncate a DImode TLS address to a SImode register. */
26994 static rtx
26995 ix86_copy_addr_to_reg (rtx addr)
26997 rtx reg;
26998 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
27000 reg = copy_addr_to_reg (addr);
27001 REG_POINTER (reg) = 1;
27002 return reg;
27004 else
27006 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
27007 reg = copy_to_mode_reg (DImode, addr);
27008 REG_POINTER (reg) = 1;
27009 return gen_rtx_SUBREG (SImode, reg, 0);
27013 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
27014 SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size
27015 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
27016 loop to set memory with VALUE (supposed to be in MODE).
27018 The size is rounded down to a whole number of chunks moved at once.
27019 SRCMEM and DESTMEM provide the MEM rtx to feed proper aliasing info. */
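/* A rough sketch of the code emitted below (memcpy case, illustrative only):

	size = count & ~(piece_size - 1);    piece_size = MODE size * UNROLL
	iter = 0;
     top:
	copy UNROLL chunks of MODE from src + iter to dest + iter
	iter += piece_size;
	if (iter < size) goto top;
	destptr += iter;  srcptr += iter;
     out:

   For the setmem case the loads are replaced by stores of VALUE.  */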
27022 static void
27023 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
27024 rtx destptr, rtx srcptr, rtx value,
27025 rtx count, machine_mode mode, int unroll,
27026 int expected_size, bool issetmem)
27028 rtx_code_label *out_label, *top_label;
27029 rtx iter, tmp;
27030 machine_mode iter_mode = counter_mode (count);
27031 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
27032 rtx piece_size = GEN_INT (piece_size_n);
27033 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
27034 rtx size;
27035 int i;
27037 top_label = gen_label_rtx ();
27038 out_label = gen_label_rtx ();
27039 iter = gen_reg_rtx (iter_mode);
27041 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
27042 NULL, 1, OPTAB_DIRECT);
27043 /* Those two should combine. */
27044 if (piece_size == const1_rtx)
27046 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
27047 true, out_label);
27048 predict_jump (REG_BR_PROB_BASE * 10 / 100);
27050 emit_move_insn (iter, const0_rtx);
27052 emit_label (top_label);
27054 tmp = convert_modes (Pmode, iter_mode, iter, true);
27056 /* This assert could be relaxed - in this case we'll need to compute
27057 the smallest power of two containing PIECE_SIZE_N and pass it to
27058 offset_address. */
27059 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
27060 destmem = offset_address (destmem, tmp, piece_size_n);
27061 destmem = adjust_address (destmem, mode, 0);
27063 if (!issetmem)
27065 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
27066 srcmem = adjust_address (srcmem, mode, 0);
27068 /* When unrolling for chips that reorder memory reads and writes,
27069 we can save registers by using a single temporary.
27070 Also using 4 temporaries is overkill in 32bit mode. */
27071 if (!TARGET_64BIT && 0)
27073 for (i = 0; i < unroll; i++)
27075 if (i)
27077 destmem =
27078 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27079 srcmem =
27080 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27082 emit_move_insn (destmem, srcmem);
27085 else
27087 rtx tmpreg[4];
27088 gcc_assert (unroll <= 4);
27089 for (i = 0; i < unroll; i++)
27091 tmpreg[i] = gen_reg_rtx (mode);
27092 if (i)
27094 srcmem =
27095 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27097 emit_move_insn (tmpreg[i], srcmem);
27099 for (i = 0; i < unroll; i++)
27101 if (i)
27103 destmem =
27104 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27106 emit_move_insn (destmem, tmpreg[i]);
27110 else
27111 for (i = 0; i < unroll; i++)
27113 if (i)
27114 destmem =
27115 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27116 emit_move_insn (destmem, value);
27119 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
27120 true, OPTAB_LIB_WIDEN);
27121 if (tmp != iter)
27122 emit_move_insn (iter, tmp);
27124 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
27125 true, top_label);
27126 if (expected_size != -1)
27128 expected_size /= GET_MODE_SIZE (mode) * unroll;
27129 if (expected_size == 0)
27130 predict_jump (0);
27131 else if (expected_size > REG_BR_PROB_BASE)
27132 predict_jump (REG_BR_PROB_BASE - 1);
27133 else
27134 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
27136 else
27137 predict_jump (REG_BR_PROB_BASE * 80 / 100);
27138 iter = ix86_zero_extend_to_Pmode (iter);
27139 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
27140 true, OPTAB_LIB_WIDEN);
27141 if (tmp != destptr)
27142 emit_move_insn (destptr, tmp);
27143 if (!issetmem)
27145 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
27146 true, OPTAB_LIB_WIDEN);
27147 if (tmp != srcptr)
27148 emit_move_insn (srcptr, tmp);
27150 emit_label (out_label);
27153 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
27154 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
27155 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
27156 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
27157 ORIG_VALUE is the original value passed to memset to fill the memory with.
27158 Other arguments have the same meaning as for the previous function. */
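/* Illustrative example (not from the original sources): zeroing a constant
   64-byte block with QImode requested is widened to SImode below and ends up
   roughly as "mov $16, %ecx; rep stosl"; the memcpy case similarly becomes
   "rep movsl"/"rep movsq" depending on MODE.  */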
27160 static void
27161 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
27162 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
27163 rtx count,
27164 machine_mode mode, bool issetmem)
27166 rtx destexp;
27167 rtx srcexp;
27168 rtx countreg;
27169 HOST_WIDE_INT rounded_count;
27171 /* If possible, it is shorter to use rep movs.
27172 TODO: Maybe it is better to move this logic to decide_alg. */
27173 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
27174 && (!issetmem || orig_value == const0_rtx))
27175 mode = SImode;
27177 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
27178 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
27180 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
27181 GET_MODE_SIZE (mode)));
27182 if (mode != QImode)
27184 destexp = gen_rtx_ASHIFT (Pmode, countreg,
27185 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27186 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
27188 else
27189 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
27190 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
27192 rounded_count
27193 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27194 destmem = shallow_copy_rtx (destmem);
27195 set_mem_size (destmem, rounded_count);
27197 else if (MEM_SIZE_KNOWN_P (destmem))
27198 clear_mem_size (destmem);
27200 if (issetmem)
27202 value = force_reg (mode, gen_lowpart (mode, value));
27203 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
27205 else
27207 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
27208 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
27209 if (mode != QImode)
27211 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
27212 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27213 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
27215 else
27216 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
27217 if (CONST_INT_P (count))
27219 rounded_count
27220 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27221 srcmem = shallow_copy_rtx (srcmem);
27222 set_mem_size (srcmem, rounded_count);
27224 else
27226 if (MEM_SIZE_KNOWN_P (srcmem))
27227 clear_mem_size (srcmem);
27229 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
27230 destexp, srcexp));
27234 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
27235 DESTMEM.
27236 SRCMEM is passed by pointer to be updated on return.
27237 The return value is the updated DESTMEM. */
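/* Illustrative example (not from the original sources): for SIZE_TO_MOVE == 16
   on a 64-bit SSE target the code below first picks TImode, then retargets it
   to the corresponding vector mode (V2DImode) when that move pattern exists,
   and falls back to word_mode moves otherwise.  */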
27238 static rtx
27239 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
27240 HOST_WIDE_INT size_to_move)
27242 rtx dst = destmem, src = *srcmem, adjust, tempreg;
27243 enum insn_code code;
27244 machine_mode move_mode;
27245 int piece_size, i;
27247 /* Find the widest mode in which we could perform moves.
27248 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
27249 it until a move of that size is supported. */
27250 piece_size = 1 << floor_log2 (size_to_move);
27251 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
27252 code = optab_handler (mov_optab, move_mode);
27253 while (code == CODE_FOR_nothing && piece_size > 1)
27255 piece_size >>= 1;
27256 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
27257 code = optab_handler (mov_optab, move_mode);
27260 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27261 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27262 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27264 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27265 move_mode = mode_for_vector (word_mode, nunits);
27266 code = optab_handler (mov_optab, move_mode);
27267 if (code == CODE_FOR_nothing)
27269 move_mode = word_mode;
27270 piece_size = GET_MODE_SIZE (move_mode);
27271 code = optab_handler (mov_optab, move_mode);
27274 gcc_assert (code != CODE_FOR_nothing);
27276 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27277 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
27279 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
27280 gcc_assert (size_to_move % piece_size == 0);
27281 adjust = GEN_INT (piece_size);
27282 for (i = 0; i < size_to_move; i += piece_size)
27284 /* We move from memory to memory, so we'll need to do it via
27285 a temporary register. */
27286 tempreg = gen_reg_rtx (move_mode);
27287 emit_insn (GEN_FCN (code) (tempreg, src));
27288 emit_insn (GEN_FCN (code) (dst, tempreg));
27290 emit_move_insn (destptr,
27291 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27292 emit_move_insn (srcptr,
27293 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
27295 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27296 piece_size);
27297 src = adjust_automodify_address_nv (src, move_mode, srcptr,
27298 piece_size);
27301 /* Update DST and SRC rtx. */
27302 *srcmem = src;
27303 return dst;
27306 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
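/* Illustrative example (not from the original sources): with a constant COUNT
   and MAX_SIZE == 16, a remainder of COUNT % 16 == 11 emits one 8-byte, one
   2-byte and one 1-byte move (11 = 8 + 2 + 1).  */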
27307 static void
27308 expand_movmem_epilogue (rtx destmem, rtx srcmem,
27309 rtx destptr, rtx srcptr, rtx count, int max_size)
27311 rtx src, dest;
27312 if (CONST_INT_P (count))
27314 HOST_WIDE_INT countval = INTVAL (count);
27315 HOST_WIDE_INT epilogue_size = countval % max_size;
27316 int i;
27318 /* For now MAX_SIZE should be a power of 2. This assert could be
27319 relaxed, but it'll require a bit more complicated epilogue
27320 expansion. */
27321 gcc_assert ((max_size & (max_size - 1)) == 0);
27322 for (i = max_size; i >= 1; i >>= 1)
27324 if (epilogue_size & i)
27325 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27327 return;
27329 if (max_size > 8)
27331 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
27332 count, 1, OPTAB_DIRECT);
27333 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
27334 count, QImode, 1, 4, false);
27335 return;
27338 /* When there are stringops, we can cheaply increase dest and src pointers.
27339 Otherwise we save code size by maintaining offset (zero is readily
27340 available from preceding rep operation) and using x86 addressing modes. */
27342 if (TARGET_SINGLE_STRINGOP)
27344 if (max_size > 4)
27346 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27347 src = change_address (srcmem, SImode, srcptr);
27348 dest = change_address (destmem, SImode, destptr);
27349 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27350 emit_label (label);
27351 LABEL_NUSES (label) = 1;
27353 if (max_size > 2)
27355 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27356 src = change_address (srcmem, HImode, srcptr);
27357 dest = change_address (destmem, HImode, destptr);
27358 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27359 emit_label (label);
27360 LABEL_NUSES (label) = 1;
27362 if (max_size > 1)
27364 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27365 src = change_address (srcmem, QImode, srcptr);
27366 dest = change_address (destmem, QImode, destptr);
27367 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27368 emit_label (label);
27369 LABEL_NUSES (label) = 1;
27372 else
27374 rtx offset = force_reg (Pmode, const0_rtx);
27375 rtx tmp;
27377 if (max_size > 4)
27379 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27380 src = change_address (srcmem, SImode, srcptr);
27381 dest = change_address (destmem, SImode, destptr);
27382 emit_move_insn (dest, src);
27383 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
27384 true, OPTAB_LIB_WIDEN);
27385 if (tmp != offset)
27386 emit_move_insn (offset, tmp);
27387 emit_label (label);
27388 LABEL_NUSES (label) = 1;
27390 if (max_size > 2)
27392 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27393 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27394 src = change_address (srcmem, HImode, tmp);
27395 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27396 dest = change_address (destmem, HImode, tmp);
27397 emit_move_insn (dest, src);
27398 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
27399 true, OPTAB_LIB_WIDEN);
27400 if (tmp != offset)
27401 emit_move_insn (offset, tmp);
27402 emit_label (label);
27403 LABEL_NUSES (label) = 1;
27405 if (max_size > 1)
27407 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27408 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27409 src = change_address (srcmem, QImode, tmp);
27410 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27411 dest = change_address (destmem, QImode, tmp);
27412 emit_move_insn (dest, src);
27413 emit_label (label);
27414 LABEL_NUSES (label) = 1;
27419 /* This function emits stores to fill SIZE_TO_MOVE bytes starting from DESTMEM
27420 with value PROMOTED_VAL.
27422 The return value is the updated DESTMEM. */
27423 static rtx
27424 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
27425 HOST_WIDE_INT size_to_move)
27427 rtx dst = destmem, adjust;
27428 enum insn_code code;
27429 machine_mode move_mode;
27430 int piece_size, i;
27432 /* Find the widest mode in which we could perform moves.
27433 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
27434 it until move of such size is supported. */
27435 move_mode = GET_MODE (promoted_val);
27436 if (move_mode == VOIDmode)
27437 move_mode = QImode;
27438 if (size_to_move < GET_MODE_SIZE (move_mode))
27440 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
27441 promoted_val = gen_lowpart (move_mode, promoted_val);
27443 piece_size = GET_MODE_SIZE (move_mode);
27444 code = optab_handler (mov_optab, move_mode);
27445 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
27447 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27449 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
27450 gcc_assert (size_to_move % piece_size == 0);
27451 adjust = GEN_INT (piece_size);
27452 for (i = 0; i < size_to_move; i += piece_size)
27454 if (piece_size <= GET_MODE_SIZE (word_mode))
27456 emit_insn (gen_strset (destptr, dst, promoted_val));
27457 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27458 piece_size);
27459 continue;
27462 emit_insn (GEN_FCN (code) (dst, promoted_val));
27464 emit_move_insn (destptr,
27465 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27467 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27468 piece_size);
27471 /* Update DST rtx. */
27472 return dst;
27474 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
27475 static void
27476 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
27477 rtx count, int max_size)
27479 count =
27480 expand_simple_binop (counter_mode (count), AND, count,
27481 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
27482 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
27483 gen_lowpart (QImode, value), count, QImode,
27484 1, max_size / 2, true);
27487 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
27488 static void
27489 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
27490 rtx count, int max_size)
27492 rtx dest;
27494 if (CONST_INT_P (count))
27496 HOST_WIDE_INT countval = INTVAL (count);
27497 HOST_WIDE_INT epilogue_size = countval % max_size;
27498 int i;
27500 /* For now MAX_SIZE should be a power of 2. This assert could be
27501 relaxed, but it'll require a bit more complicated epilogue
27502 expanding. */
27503 gcc_assert ((max_size & (max_size - 1)) == 0);
27504 for (i = max_size; i >= 1; i >>= 1)
27506 if (epilogue_size & i)
27508 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27509 destmem = emit_memset (destmem, destptr, vec_value, i);
27510 else
27511 destmem = emit_memset (destmem, destptr, value, i);
27514 return;
27516 if (max_size > 32)
27518 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
27519 return;
27521 if (max_size > 16)
27523 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
27524 if (TARGET_64BIT)
27526 dest = change_address (destmem, DImode, destptr);
27527 emit_insn (gen_strset (destptr, dest, value));
27528 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
27529 emit_insn (gen_strset (destptr, dest, value));
27531 else
27533 dest = change_address (destmem, SImode, destptr);
27534 emit_insn (gen_strset (destptr, dest, value));
27535 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27536 emit_insn (gen_strset (destptr, dest, value));
27537 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
27538 emit_insn (gen_strset (destptr, dest, value));
27539 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
27540 emit_insn (gen_strset (destptr, dest, value));
27542 emit_label (label);
27543 LABEL_NUSES (label) = 1;
27545 if (max_size > 8)
27547 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
27548 if (TARGET_64BIT)
27550 dest = change_address (destmem, DImode, destptr);
27551 emit_insn (gen_strset (destptr, dest, value));
27553 else
27555 dest = change_address (destmem, SImode, destptr);
27556 emit_insn (gen_strset (destptr, dest, value));
27557 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27558 emit_insn (gen_strset (destptr, dest, value));
27560 emit_label (label);
27561 LABEL_NUSES (label) = 1;
27563 if (max_size > 4)
27565 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27566 dest = change_address (destmem, SImode, destptr);
27567 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
27568 emit_label (label);
27569 LABEL_NUSES (label) = 1;
27571 if (max_size > 2)
27573 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27574 dest = change_address (destmem, HImode, destptr);
27575 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
27576 emit_label (label);
27577 LABEL_NUSES (label) = 1;
27579 if (max_size > 1)
27581 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27582 dest = change_address (destmem, QImode, destptr);
27583 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
27584 emit_label (label);
27585 LABEL_NUSES (label) = 1;
27589 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough of
27590 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
27591 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
27592 ignored.
27593 The return value is the updated DESTMEM. */
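/* Illustrative example (not from the original sources): with ALIGN == 1 and
   DESIRED_ALIGNMENT == 8 this emits three blocks guarded by
   ix86_expand_aligntest on DESTPTR that copy or set 1, 2 and then 4 bytes,
   each one also adjusting COUNT.  */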
27594 static rtx
27595 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
27596 rtx destptr, rtx srcptr, rtx value,
27597 rtx vec_value, rtx count, int align,
27598 int desired_alignment, bool issetmem)
27600 int i;
27601 for (i = 1; i < desired_alignment; i <<= 1)
27603 if (align <= i)
27605 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
27606 if (issetmem)
27608 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27609 destmem = emit_memset (destmem, destptr, vec_value, i);
27610 else
27611 destmem = emit_memset (destmem, destptr, value, i);
27613 else
27614 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27615 ix86_adjust_counter (count, i);
27616 emit_label (label);
27617 LABEL_NUSES (label) = 1;
27618 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
27621 return destmem;
27624 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
27625 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
27626 and jump to DONE_LABEL. */
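/* Illustrative example (not from the original sources): for SIZE == 4 in the
   memcpy case the emitted code is guarded by (COUNT & 4) and copies the first
   4 bytes of the block plus the 4 bytes ending at DESTPTR + COUNT (the two
   ranges may overlap), then jumps to DONE_LABEL.  */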
27627 static void
27628 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
27629 rtx destptr, rtx srcptr,
27630 rtx value, rtx vec_value,
27631 rtx count, int size,
27632 rtx done_label, bool issetmem)
27634 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
27635 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
27636 rtx modesize;
27637 int n;
27639 /* If we do not have a vector value to copy, we must reduce the size. */
27640 if (issetmem)
27642 if (!vec_value)
27644 if (GET_MODE (value) == VOIDmode && size > 8)
27645 mode = Pmode;
27646 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
27647 mode = GET_MODE (value);
27649 else
27650 mode = GET_MODE (vec_value), value = vec_value;
27652 else
27654 /* Choose appropriate vector mode. */
27655 if (size >= 32)
27656 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
27657 else if (size >= 16)
27658 mode = TARGET_SSE ? V16QImode : DImode;
27659 srcmem = change_address (srcmem, mode, srcptr);
27661 destmem = change_address (destmem, mode, destptr);
27662 modesize = GEN_INT (GET_MODE_SIZE (mode));
27663 gcc_assert (GET_MODE_SIZE (mode) <= size);
27664 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27666 if (issetmem)
27667 emit_move_insn (destmem, gen_lowpart (mode, value));
27668 else
27670 emit_move_insn (destmem, srcmem);
27671 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27673 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27676 destmem = offset_address (destmem, count, 1);
27677 destmem = offset_address (destmem, GEN_INT (-2 * size),
27678 GET_MODE_SIZE (mode));
27679 if (!issetmem)
27681 srcmem = offset_address (srcmem, count, 1);
27682 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
27683 GET_MODE_SIZE (mode));
27685 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27687 if (issetmem)
27688 emit_move_insn (destmem, gen_lowpart (mode, value));
27689 else
27691 emit_move_insn (destmem, srcmem);
27692 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27694 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27696 emit_jump_insn (gen_jump (done_label));
27697 emit_barrier ();
27699 emit_label (label);
27700 LABEL_NUSES (label) = 1;
27703 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
27704 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
27705 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that lets us
27706 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
27707 DONE_LABEL is a label after the whole copying sequence. The label is created
27708 on demand if *DONE_LABEL is NULL.
27709 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
27710 bounds after the initial copies.
27712 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
27713 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
27714 we will dispatch to a library call for large blocks.
27716 In pseudocode we do:
27718 if (COUNT < SIZE)
27720 Assume that SIZE is 4. Bigger sizes are handled analogously
27721 if (COUNT & 4)
27723 copy 4 bytes from SRCPTR to DESTPTR
27724 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
27725 goto done_label
27727 if (!COUNT)
27728 goto done_label;
27729 copy 1 byte from SRCPTR to DESTPTR
27730 if (COUNT & 2)
27732 copy 2 bytes from SRCPTR to DESTPTR
27733 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
27736 else
27738 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
27739 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
27741 OLD_DESTPTR = DESTPTR;
27742 Align DESTPTR up to DESIRED_ALIGN
27743 SRCPTR += DESTPTR - OLD_DESTPTR
27744 COUNT -= DESTPTR - OLD_DESTPTR
27745 if (DYNAMIC_CHECK)
27746 Round COUNT down to multiple of SIZE
27747 << optional caller supplied zero size guard is here >>
27748 << optional caller supplied dynamic check is here >>
27749 << caller supplied main copy loop is here >>
27751 done_label:
27753 static void
27754 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
27755 rtx *destptr, rtx *srcptr,
27756 machine_mode mode,
27757 rtx value, rtx vec_value,
27758 rtx *count,
27759 rtx_code_label **done_label,
27760 int size,
27761 int desired_align,
27762 int align,
27763 unsigned HOST_WIDE_INT *min_size,
27764 bool dynamic_check,
27765 bool issetmem)
27767 rtx_code_label *loop_label = NULL, *label;
27768 int n;
27769 rtx modesize;
27770 int prolog_size = 0;
27771 rtx mode_value;
27773 /* Choose the proper value to copy. */
27774 if (issetmem && VECTOR_MODE_P (mode))
27775 mode_value = vec_value;
27776 else
27777 mode_value = value;
27778 gcc_assert (GET_MODE_SIZE (mode) <= size);
27780 /* See if block is big or small, handle small blocks. */
27781 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
27783 int size2 = size;
27784 loop_label = gen_label_rtx ();
27786 if (!*done_label)
27787 *done_label = gen_label_rtx ();
27789 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
27790 1, loop_label);
27791 size2 >>= 1;
27793 /* Handle sizes > 3. */
27794 for (;size2 > 2; size2 >>= 1)
27795 expand_small_movmem_or_setmem (destmem, srcmem,
27796 *destptr, *srcptr,
27797 value, vec_value,
27798 *count,
27799 size2, *done_label, issetmem);
27800 /* Nothing to copy? Jump to DONE_LABEL if so */
27801 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
27802 1, *done_label);
27804 /* Do a byte copy. */
27805 destmem = change_address (destmem, QImode, *destptr);
27806 if (issetmem)
27807 emit_move_insn (destmem, gen_lowpart (QImode, value));
27808 else
27810 srcmem = change_address (srcmem, QImode, *srcptr);
27811 emit_move_insn (destmem, srcmem);
27814 /* Handle sizes 2 and 3. */
27815 label = ix86_expand_aligntest (*count, 2, false);
27816 destmem = change_address (destmem, HImode, *destptr);
27817 destmem = offset_address (destmem, *count, 1);
27818 destmem = offset_address (destmem, GEN_INT (-2), 2);
27819 if (issetmem)
27820 emit_move_insn (destmem, gen_lowpart (HImode, value));
27821 else
27823 srcmem = change_address (srcmem, HImode, *srcptr);
27824 srcmem = offset_address (srcmem, *count, 1);
27825 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27826 emit_move_insn (destmem, srcmem);
27829 emit_label (label);
27830 LABEL_NUSES (label) = 1;
27831 emit_jump_insn (gen_jump (*done_label));
27832 emit_barrier ();
27834 else
27835 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27836 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27838 /* Start memcpy for COUNT >= SIZE. */
27839 if (loop_label)
27841 emit_label (loop_label);
27842 LABEL_NUSES (loop_label) = 1;
27845 /* Copy first desired_align bytes. */
27846 if (!issetmem)
27847 srcmem = change_address (srcmem, mode, *srcptr);
27848 destmem = change_address (destmem, mode, *destptr);
27849 modesize = GEN_INT (GET_MODE_SIZE (mode));
27850 for (n = 0; prolog_size < desired_align - align; n++)
27852 if (issetmem)
27853 emit_move_insn (destmem, mode_value);
27854 else
27856 emit_move_insn (destmem, srcmem);
27857 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27859 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27860 prolog_size += GET_MODE_SIZE (mode);
27864 /* Copy last SIZE bytes. */
27865 destmem = offset_address (destmem, *count, 1);
27866 destmem = offset_address (destmem,
27867 GEN_INT (-size - prolog_size),
27869 if (issetmem)
27870 emit_move_insn (destmem, mode_value);
27871 else
27873 srcmem = offset_address (srcmem, *count, 1);
27874 srcmem = offset_address (srcmem,
27875 GEN_INT (-size - prolog_size),
27877 emit_move_insn (destmem, srcmem);
27879 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27881 destmem = offset_address (destmem, modesize, 1);
27882 if (issetmem)
27883 emit_move_insn (destmem, mode_value);
27884 else
27886 srcmem = offset_address (srcmem, modesize, 1);
27887 emit_move_insn (destmem, srcmem);
27891 /* Align destination. */
27892 if (desired_align > 1 && desired_align > align)
27894 rtx saveddest = *destptr;
27896 gcc_assert (desired_align <= size);
27897 /* Align destptr up, place it to new register. */
27898 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27899 GEN_INT (prolog_size),
27900 NULL_RTX, 1, OPTAB_DIRECT);
27901 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27902 REG_POINTER (*destptr) = 1;
27903 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27904 GEN_INT (-desired_align),
27905 *destptr, 1, OPTAB_DIRECT);
27906 /* See how many bytes we skipped. */
27907 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27908 *destptr,
27909 saveddest, 1, OPTAB_DIRECT);
27910 /* Adjust srcptr and count. */
27911 if (!issetmem)
27912 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27913 saveddest, *srcptr, 1, OPTAB_DIRECT);
27914 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27915 saveddest, *count, 1, OPTAB_DIRECT);
27916 /* We copied at most size + prolog_size. */
27917 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27918 *min_size
27919 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27920 else
27921 *min_size = 0;
27923 /* Our loops always round down the block size, but for dispatch to
27924 the library we need the precise value. */
27925 if (dynamic_check)
27926 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
27927 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27929 else
27931 gcc_assert (prolog_size == 0);
27932 /* Decrease count, so we won't end up copying last word twice. */
27933 if (!CONST_INT_P (*count))
27934 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27935 constm1_rtx, *count, 1, OPTAB_DIRECT);
27936 else
27937 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27938 (unsigned HOST_WIDE_INT)size));
27939 if (*min_size)
27940 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
27945 /* This function is like the previous one, except here we know how many bytes
27946 need to be copied. That allows us to update alignment not only of DST, which
27947 is returned, but also of SRC, which is passed as a pointer for that
27948 reason. */
27949 static rtx
27950 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27951 rtx srcreg, rtx value, rtx vec_value,
27952 int desired_align, int align_bytes,
27953 bool issetmem)
27955 rtx src = NULL;
27956 rtx orig_dst = dst;
27957 rtx orig_src = NULL;
27958 int piece_size = 1;
27959 int copied_bytes = 0;
27961 if (!issetmem)
27963 gcc_assert (srcp != NULL);
27964 src = *srcp;
27965 orig_src = src;
27968 for (piece_size = 1;
27969 piece_size <= desired_align && copied_bytes < align_bytes;
27970 piece_size <<= 1)
27972 if (align_bytes & piece_size)
27974 if (issetmem)
27976 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27977 dst = emit_memset (dst, destreg, vec_value, piece_size);
27978 else
27979 dst = emit_memset (dst, destreg, value, piece_size);
27981 else
27982 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27983 copied_bytes += piece_size;
27986 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27987 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27988 if (MEM_SIZE_KNOWN_P (orig_dst))
27989 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27991 if (!issetmem)
27993 int src_align_bytes = get_mem_align_offset (src, desired_align
27994 * BITS_PER_UNIT);
27995 if (src_align_bytes >= 0)
27996 src_align_bytes = desired_align - src_align_bytes;
27997 if (src_align_bytes >= 0)
27999 unsigned int src_align;
28000 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
28002 if ((src_align_bytes & (src_align - 1))
28003 == (align_bytes & (src_align - 1)))
28004 break;
28006 if (src_align > (unsigned int) desired_align)
28007 src_align = desired_align;
28008 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
28009 set_mem_align (src, src_align * BITS_PER_UNIT);
28011 if (MEM_SIZE_KNOWN_P (orig_src))
28012 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
28013 *srcp = src;
28016 return dst;
28019 /* Return true if ALG can be used in the current context.
28020 Assume we expand memset if MEMSET is true. */
28021 static bool
28022 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
28024 if (alg == no_stringop)
28025 return false;
28026 if (alg == vector_loop)
28027 return TARGET_SSE || TARGET_AVX;
28028 /* Algorithms using the rep prefix want at least edi and ecx;
28029 additionally, memset wants eax and memcpy wants esi. Don't
28030 consider such algorithms if the user has appropriated those
28031 registers for their own purposes, or if we have a non-default
28032 address space, since some string insns cannot override the segment. */
28033 if (alg == rep_prefix_1_byte
28034 || alg == rep_prefix_4_byte
28035 || alg == rep_prefix_8_byte)
28037 if (have_as)
28038 return false;
28039 if (fixed_regs[CX_REG]
28040 || fixed_regs[DI_REG]
28041 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
28042 return false;
28044 return true;
28047 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
28048 static enum stringop_alg
28049 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
28050 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
28051 bool memset, bool zero_memset, bool have_as,
28052 int *dynamic_check, bool *noalign, bool recur)
28054 const struct stringop_algs *algs;
28055 bool optimize_for_speed;
28056 int max = 0;
28057 const struct processor_costs *cost;
28058 int i;
28059 bool any_alg_usable_p = false;
28061 *noalign = false;
28062 *dynamic_check = -1;
28064 /* Even if the string operation call is cold, we still might spend a lot
28065 of time processing large blocks. */
28066 if (optimize_function_for_size_p (cfun)
28067 || (optimize_insn_for_size_p ()
28068 && (max_size < 256
28069 || (expected_size != -1 && expected_size < 256))))
28070 optimize_for_speed = false;
28071 else
28072 optimize_for_speed = true;
28074 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
28075 if (memset)
28076 algs = &cost->memset[TARGET_64BIT != 0];
28077 else
28078 algs = &cost->memcpy[TARGET_64BIT != 0];
28080 /* See maximal size for user defined algorithm. */
28081 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28083 enum stringop_alg candidate = algs->size[i].alg;
28084 bool usable = alg_usable_p (candidate, memset, have_as);
28085 any_alg_usable_p |= usable;
28087 if (candidate != libcall && candidate && usable)
28088 max = algs->size[i].max;
28091 /* If the expected size is not known but the max size is small enough
28092 that the inline version is a win, set the expected size into
28093 the range. */
28094 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
28095 && expected_size == -1)
28096 expected_size = min_size / 2 + max_size / 2;
28098 /* If user specified the algorithm, honor it if possible. */
28099 if (ix86_stringop_alg != no_stringop
28100 && alg_usable_p (ix86_stringop_alg, memset, have_as))
28101 return ix86_stringop_alg;
28102 /* rep; movq or rep; movl is the smallest variant. */
28103 else if (!optimize_for_speed)
28105 *noalign = true;
28106 if (!count || (count & 3) || (memset && !zero_memset))
28107 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
28108 ? rep_prefix_1_byte : loop_1_byte;
28109 else
28110 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
28111 ? rep_prefix_4_byte : loop;
28113 /* Very tiny blocks are best handled via the loop, REP is expensive to
28114 set up. */
28115 else if (expected_size != -1 && expected_size < 4)
28116 return loop_1_byte;
28117 else if (expected_size != -1)
28119 enum stringop_alg alg = libcall;
28120 bool alg_noalign = false;
28121 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28123 /* We get here if the algorithms that were not libcall-based
28124 were rep-prefix based and we are unable to use rep prefixes
28125 based on global register usage. Break out of the loop and
28126 use the heuristic below. */
28127 if (algs->size[i].max == 0)
28128 break;
28129 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
28131 enum stringop_alg candidate = algs->size[i].alg;
28133 if (candidate != libcall
28134 && alg_usable_p (candidate, memset, have_as))
28136 alg = candidate;
28137 alg_noalign = algs->size[i].noalign;
28139 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
28140 last non-libcall inline algorithm. */
28141 if (TARGET_INLINE_ALL_STRINGOPS)
28143 /* When the current size is best to be copied by a libcall,
28144 but we are still forced to inline, run the heuristic below
28145 that will pick code for medium sized blocks. */
28146 if (alg != libcall)
28148 *noalign = alg_noalign;
28149 return alg;
28151 else if (!any_alg_usable_p)
28152 break;
28154 else if (alg_usable_p (candidate, memset, have_as))
28156 *noalign = algs->size[i].noalign;
28157 return candidate;
28162 /* When asked to inline the call anyway, try to pick a meaningful choice.
28163 We look for the maximal size of block that is faster to copy by hand and
28164 take blocks of at most that size, guessing that the average size will
28165 be roughly half of the block.
28167 If this turns out to be bad, we might simply specify the preferred
28168 choice in ix86_costs. */
28169 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28170 && (algs->unknown_size == libcall
28171 || !alg_usable_p (algs->unknown_size, memset, have_as)))
28173 enum stringop_alg alg;
28174 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
28176 /* If there aren't any usable algorithms or if recursing already,
28177 then recursing on smaller sizes or same size isn't going to
28178 find anything. Just return the simple byte-at-a-time copy loop. */
28179 if (!any_alg_usable_p || recur)
28181 /* Pick something reasonable. */
28182 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
28183 *dynamic_check = 128;
28184 return loop_1_byte;
28186 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
28187 zero_memset, have_as, dynamic_check, noalign, true);
28188 gcc_assert (*dynamic_check == -1);
28189 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28190 *dynamic_check = max;
28191 else
28192 gcc_assert (alg != libcall);
28193 return alg;
28195 return (alg_usable_p (algs->unknown_size, memset, have_as)
28196 ? algs->unknown_size : libcall);
28199 /* Decide on alignment. We know that the operand is already aligned to ALIGN
28200 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
28201 static int
28202 decide_alignment (int align,
28203 enum stringop_alg alg,
28204 int expected_size,
28205 machine_mode move_mode)
28207 int desired_align = 0;
28209 gcc_assert (alg != no_stringop);
28211 if (alg == libcall)
28212 return 0;
28213 if (move_mode == VOIDmode)
28214 return 0;
28216 desired_align = GET_MODE_SIZE (move_mode);
28217 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
28218 copying a whole cache line at once. */
28219 if (TARGET_PENTIUMPRO
28220 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
28221 desired_align = 8;
28223 if (optimize_size)
28224 desired_align = 1;
28225 if (desired_align < align)
28226 desired_align = align;
28227 if (expected_size != -1 && expected_size < 4)
28228 desired_align = align;
28230 return desired_align;
28234 /* Helper function for memset. For the QImode value 0xXY produce
28235 0xXYXYXYXY of the width specified by MODE. This is essentially
28236 a * 0x01010101, but we can do slightly better than
28237 synth_mult by unwinding the sequence by hand on CPUs with
28238 slow multiply. */
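/* Illustrative example (not from the original sources): VAL == 0xab yields
   0xabababab for SImode and 0xabababababababab for DImode.  Constant values
   are built directly via v |= v << 8; v |= v << 16; ...; register values are
   either multiplied by 0x01010101 (0x0101010101010101 for DImode) or built
   with the same shift/or sequence, whichever the cost model prefers.  */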
28239 static rtx
28240 promote_duplicated_reg (machine_mode mode, rtx val)
28242 machine_mode valmode = GET_MODE (val);
28243 rtx tmp;
28244 int nops = mode == DImode ? 3 : 2;
28246 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
28247 if (val == const0_rtx)
28248 return copy_to_mode_reg (mode, CONST0_RTX (mode));
28249 if (CONST_INT_P (val))
28251 HOST_WIDE_INT v = INTVAL (val) & 255;
28253 v |= v << 8;
28254 v |= v << 16;
28255 if (mode == DImode)
28256 v |= (v << 16) << 16;
28257 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
28260 if (valmode == VOIDmode)
28261 valmode = QImode;
28262 if (valmode != QImode)
28263 val = gen_lowpart (QImode, val);
28264 if (mode == QImode)
28265 return val;
28266 if (!TARGET_PARTIAL_REG_STALL)
28267 nops--;
28268 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
28269 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
28270 <= (ix86_cost->shift_const + ix86_cost->add) * nops
28271 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
28273 rtx reg = convert_modes (mode, QImode, val, true);
28274 tmp = promote_duplicated_reg (mode, const1_rtx);
28275 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
28276 OPTAB_DIRECT);
28278 else
28280 rtx reg = convert_modes (mode, QImode, val, true);
28282 if (!TARGET_PARTIAL_REG_STALL)
28283 if (mode == SImode)
28284 emit_insn (gen_insvsi_1 (reg, reg));
28285 else
28286 emit_insn (gen_insvdi_1 (reg, reg));
28287 else
28289 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
28290 NULL, 1, OPTAB_DIRECT);
28291 reg =
28292 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28294 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
28295 NULL, 1, OPTAB_DIRECT);
28296 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28297 if (mode == SImode)
28298 return reg;
28299 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
28300 NULL, 1, OPTAB_DIRECT);
28301 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28302 return reg;
28306 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
28307 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
28308 getting the alignment from ALIGN to DESIRED_ALIGN. */
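/* Illustrative example (not from the original sources): on a 64-bit target
   SIZE_NEEDED == 8 yields a DImode value, while SIZE_NEEDED == 4 with no
   extra alignment requirement only needs SImode; wider (vector) values for
   the vector_loop algorithm are produced separately by the caller.  */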
28309 static rtx
28310 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
28311 int align)
28313 rtx promoted_val;
28315 if (TARGET_64BIT
28316 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
28317 promoted_val = promote_duplicated_reg (DImode, val);
28318 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
28319 promoted_val = promote_duplicated_reg (SImode, val);
28320 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
28321 promoted_val = promote_duplicated_reg (HImode, val);
28322 else
28323 promoted_val = val;
28325 return promoted_val;
28328 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
28329 operations when profitable. The code depends upon architecture, block size
28330 and alignment, but always has one of the following overall structures:
28332 Aligned move sequence:
28334 1) Prologue guard: Conditional that jumps up to epilogues for small
28335 blocks that can be handled by epilogue alone. This is faster
28336 but also needed for correctness, since the prologue assumes the block
28337 is larger than the desired alignment.
28339 Optional dynamic check for size and libcall for large
28340 blocks is emitted here too, with -minline-stringops-dynamically.
28342 2) Prologue: copy first few bytes in order to get destination
28343 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
28344 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
28345 copied. We emit either a jump tree on power of two sized
28346 blocks, or a byte loop.
28348 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28349 with specified algorithm.
28351 4) Epilogue: code copying tail of the block that is too small to be
28352 handled by main body (or up to size guarded by prologue guard).
28354 Misaligned move sequence
28356 1) misaligned move prologue/epilogue containing:
28357 a) Prologue handling small memory blocks and jumping to done_label
28358 (skipped if blocks are known to be large enough)
28359 b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
28360 needed by single possibly misaligned move
28361 (skipped if alignment is not needed)
28362 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
28364 2) Zero size guard dispatching to done_label, if needed
28366 3) Dispatch to a library call, if needed.
28368 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28369 with specified algorithm. */
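/* Editorial sketch (not from the original source) of the aligned structure
   described above, in rough C terms for a memcpy-style expansion:

     if (n < epilogue_size_needed)             (1 - prologue guard)
       goto epilogue;
     copy bytes until dst reaches DESIRED_ALIGN;   (2 - prologue)
     while (n >= size_needed)                  (3 - main body)
       copy size_needed bytes, n -= size_needed;
   epilogue:                                   (4 - epilogue)
     copy the remaining tail bytes;
*/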
28370 bool
28371 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
28372 rtx align_exp, rtx expected_align_exp,
28373 rtx expected_size_exp, rtx min_size_exp,
28374 rtx max_size_exp, rtx probable_max_size_exp,
28375 bool issetmem)
28377 rtx destreg;
28378 rtx srcreg = NULL;
28379 rtx_code_label *label = NULL;
28380 rtx tmp;
28381 rtx_code_label *jump_around_label = NULL;
28382 HOST_WIDE_INT align = 1;
28383 unsigned HOST_WIDE_INT count = 0;
28384 HOST_WIDE_INT expected_size = -1;
28385 int size_needed = 0, epilogue_size_needed;
28386 int desired_align = 0, align_bytes = 0;
28387 enum stringop_alg alg;
28388 rtx promoted_val = NULL;
28389 rtx vec_promoted_val = NULL;
28390 bool force_loopy_epilogue = false;
28391 int dynamic_check;
28392 bool need_zero_guard = false;
28393 bool noalign;
28394 machine_mode move_mode = VOIDmode;
28395 int unroll_factor = 1;
28396 /* TODO: Once value ranges are available, fill in proper data. */
28397 unsigned HOST_WIDE_INT min_size = 0;
28398 unsigned HOST_WIDE_INT max_size = -1;
28399 unsigned HOST_WIDE_INT probable_max_size = -1;
28400 bool misaligned_prologue_used = false;
28401 bool have_as;
28403 if (CONST_INT_P (align_exp))
28404 align = INTVAL (align_exp);
28405 /* i386 can do misaligned access at a reasonable extra cost. */
28406 if (CONST_INT_P (expected_align_exp)
28407 && INTVAL (expected_align_exp) > align)
28408 align = INTVAL (expected_align_exp);
28409 /* ALIGN is the minimum of destination and source alignment, but we care here
28410 just about destination alignment. */
28411 else if (!issetmem
28412 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
28413 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
28415 if (CONST_INT_P (count_exp))
28417 min_size = max_size = probable_max_size = count = expected_size
28418 = INTVAL (count_exp);
28419 /* When COUNT is 0, there is nothing to do. */
28420 if (!count)
28421 return true;
28423 else
28425 if (min_size_exp)
28426 min_size = INTVAL (min_size_exp);
28427 if (max_size_exp)
28428 max_size = INTVAL (max_size_exp);
28429 if (probable_max_size_exp)
28430 probable_max_size = INTVAL (probable_max_size_exp);
28431 if (CONST_INT_P (expected_size_exp))
28432 expected_size = INTVAL (expected_size_exp);
28435 /* Make sure we don't need to care about overflow later on. */
28436 if (count > (HOST_WIDE_INT_1U << 30))
28437 return false;
28439 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
28440 if (!issetmem)
28441 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
28443 /* Step 0: Decide on preferred algorithm, desired alignment and
28444 size of chunks to be copied by main loop. */
28445 alg = decide_alg (count, expected_size, min_size, probable_max_size,
28446 issetmem,
28447 issetmem && val_exp == const0_rtx, have_as,
28448 &dynamic_check, &noalign, false);
28449 if (alg == libcall)
28450 return false;
28451 gcc_assert (alg != no_stringop);
28453 /* For now the vector version of memset is generated only for memory zeroing, as
28454 creating a promoted vector value is very cheap in this case. */
28455 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
28456 alg = unrolled_loop;
28458 if (!count)
28459 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
28460 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
28461 if (!issetmem)
28462 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
28464 unroll_factor = 1;
28465 move_mode = word_mode;
28466 switch (alg)
28468 case libcall:
28469 case no_stringop:
28470 case last_alg:
28471 gcc_unreachable ();
28472 case loop_1_byte:
28473 need_zero_guard = true;
28474 move_mode = QImode;
28475 break;
28476 case loop:
28477 need_zero_guard = true;
28478 break;
28479 case unrolled_loop:
28480 need_zero_guard = true;
28481 unroll_factor = (TARGET_64BIT ? 4 : 2);
28482 break;
28483 case vector_loop:
28484 need_zero_guard = true;
28485 unroll_factor = 4;
28486 /* Find the widest supported mode. */
28487 move_mode = word_mode;
28488 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
28489 != CODE_FOR_nothing)
28490 move_mode = GET_MODE_WIDER_MODE (move_mode);
28492 /* Find the corresponding vector mode with the same size as MOVE_MODE.
28493 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
28494 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
28496 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
28497 move_mode = mode_for_vector (word_mode, nunits);
28498 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
28499 move_mode = word_mode;
28501 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
28502 break;
28503 case rep_prefix_8_byte:
28504 move_mode = DImode;
28505 break;
28506 case rep_prefix_4_byte:
28507 move_mode = SImode;
28508 break;
28509 case rep_prefix_1_byte:
28510 move_mode = QImode;
28511 break;
28513 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
28514 epilogue_size_needed = size_needed;
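/* Editorial example (not from the original source): with unrolled_loop on
   a 64-bit target, MOVE_MODE is word_mode (DImode, 8 bytes) and
   UNROLL_FACTOR is 4, so SIZE_NEEDED is 32 and the main loop handles
   32-byte chunks per iteration.  */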
28516 /* If we are going to make any library calls conditionally, make sure any
28517 pending stack adjustment happens before the first conditional branch;
28518 otherwise it will be emitted only before the library call and won't
28519 happen on the other branches. */
28520 if (dynamic_check != -1)
28521 do_pending_stack_adjust ();
28523 desired_align = decide_alignment (align, alg, expected_size, move_mode);
28524 if (!TARGET_ALIGN_STRINGOPS || noalign)
28525 align = desired_align;
28527 /* Step 1: Prologue guard. */
28529 /* Alignment code needs count to be in register. */
28530 if (CONST_INT_P (count_exp) && desired_align > align)
28532 if (INTVAL (count_exp) > desired_align
28533 && INTVAL (count_exp) > size_needed)
28535 align_bytes
28536 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
28537 if (align_bytes <= 0)
28538 align_bytes = 0;
28539 else
28540 align_bytes = desired_align - align_bytes;
28542 if (align_bytes == 0)
28543 count_exp = force_reg (counter_mode (count_exp), count_exp);
28545 gcc_assert (desired_align >= 1 && align >= 1);
28547 /* Misaligned move sequences handle both prologue and epilogue at once.
28548 Default code generation results in smaller code for large alignments
28549 and also avoids redundant work when sizes are known precisely. */
28550 misaligned_prologue_used
28551 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
28552 && MAX (desired_align, epilogue_size_needed) <= 32
28553 && desired_align <= epilogue_size_needed
28554 && ((desired_align > align && !align_bytes)
28555 || (!count && epilogue_size_needed > 1)));
28557 /* Do the cheap promotion to allow better CSE across the
28558 main loop and epilogue (i.e. one load of the big constant at the
28559 front of all code).
28560 For now the misaligned move sequences do not have a fast path
28561 without broadcasting. */
28562 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
28564 if (alg == vector_loop)
28566 gcc_assert (val_exp == const0_rtx);
28567 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
28568 promoted_val = promote_duplicated_reg_to_size (val_exp,
28569 GET_MODE_SIZE (word_mode),
28570 desired_align, align);
28572 else
28574 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28575 desired_align, align);
28578 /* Misaligned move sequences handle both prologues and epilogues at once.
28579 Default code generation results in smaller code for large alignments and
28580 also avoids redundant work when sizes are known precisely. */
28581 if (misaligned_prologue_used)
28583 /* The misaligned move prologue handles small blocks by itself. */
28584 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
28585 (dst, src, &destreg, &srcreg,
28586 move_mode, promoted_val, vec_promoted_val,
28587 &count_exp,
28588 &jump_around_label,
28589 desired_align < align
28590 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
28591 desired_align, align, &min_size, dynamic_check, issetmem);
28592 if (!issetmem)
28593 src = change_address (src, BLKmode, srcreg);
28594 dst = change_address (dst, BLKmode, destreg);
28595 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28596 epilogue_size_needed = 0;
28597 if (need_zero_guard
28598 && min_size < (unsigned HOST_WIDE_INT) size_needed)
28600 /* It is possible that we copied enough so the main loop will not
28601 execute. */
28602 gcc_assert (size_needed > 1);
28603 if (jump_around_label == NULL_RTX)
28604 jump_around_label = gen_label_rtx ();
28605 emit_cmp_and_jump_insns (count_exp,
28606 GEN_INT (size_needed),
28607 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
28608 if (expected_size == -1
28609 || expected_size < (desired_align - align) / 2 + size_needed)
28610 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28611 else
28612 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28615 /* Ensure that alignment prologue won't copy past end of block. */
28616 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
28618 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
28619 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
28620 Make sure it is power of 2. */
28621 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
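/* Editorial example (not from the original source): this yields the
   smallest power of two strictly greater than the value, e.g. a value of
   15 becomes 1 << (3 + 1) == 16.  */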
28623 /* To improve performance of small blocks, we jump around the VAL
28624 promotion. This means that if the promoted VAL is not constant,
28625 we might not use it in the epilogue and have to use the byte
28626 loop variant. */
28627 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
28628 force_loopy_epilogue = true;
28629 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28630 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28632 /* If the main algorithm works on QImode, no epilogue is needed.
28633 For small sizes just don't align anything. */
28634 if (size_needed == 1)
28635 desired_align = align;
28636 else
28637 goto epilogue;
28639 else if (!count
28640 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28642 label = gen_label_rtx ();
28643 emit_cmp_and_jump_insns (count_exp,
28644 GEN_INT (epilogue_size_needed),
28645 LTU, 0, counter_mode (count_exp), 1, label);
28646 if (expected_size == -1 || expected_size < epilogue_size_needed)
28647 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28648 else
28649 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28653 /* Emit code to decide at runtime whether a library call or inline code
28654 should be used. */
28655 if (dynamic_check != -1)
28657 if (!issetmem && CONST_INT_P (count_exp))
28659 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
28661 emit_block_copy_via_libcall (dst, src, count_exp);
28662 count_exp = const0_rtx;
28663 goto epilogue;
28666 else
28668 rtx_code_label *hot_label = gen_label_rtx ();
28669 if (jump_around_label == NULL_RTX)
28670 jump_around_label = gen_label_rtx ();
28671 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
28672 LEU, 0, counter_mode (count_exp),
28673 1, hot_label);
28674 predict_jump (REG_BR_PROB_BASE * 90 / 100);
28675 if (issetmem)
28676 set_storage_via_libcall (dst, count_exp, val_exp);
28677 else
28678 emit_block_copy_via_libcall (dst, src, count_exp);
28679 emit_jump (jump_around_label);
28680 emit_label (hot_label);
28684 /* Step 2: Alignment prologue. */
28685 /* Do the expensive promotion once we branched off the small blocks. */
28686 if (issetmem && !promoted_val)
28687 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28688 desired_align, align);
28690 if (desired_align > align && !misaligned_prologue_used)
28692 if (align_bytes == 0)
28694 /* Except for the first move in the prologue, we no longer know
28695 the constant offset in the aliasing info. It doesn't seem worth
28696 the pain to maintain it for the first move, so throw away
28697 the info early. */
28698 dst = change_address (dst, BLKmode, destreg);
28699 if (!issetmem)
28700 src = change_address (src, BLKmode, srcreg);
28701 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
28702 promoted_val, vec_promoted_val,
28703 count_exp, align, desired_align,
28704 issetmem);
28705 /* At most desired_align - align bytes are copied. */
28706 if (min_size < (unsigned)(desired_align - align))
28707 min_size = 0;
28708 else
28709 min_size -= desired_align - align;
28711 else
28713 /* If we know how many bytes need to be stored before dst is
28714 sufficiently aligned, maintain aliasing info accurately. */
28715 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
28716 srcreg,
28717 promoted_val,
28718 vec_promoted_val,
28719 desired_align,
28720 align_bytes,
28721 issetmem);
28723 count_exp = plus_constant (counter_mode (count_exp),
28724 count_exp, -align_bytes);
28725 count -= align_bytes;
28726 min_size -= align_bytes;
28727 max_size -= align_bytes;
28729 if (need_zero_guard
28730 && min_size < (unsigned HOST_WIDE_INT) size_needed
28731 && (count < (unsigned HOST_WIDE_INT) size_needed
28732 || (align_bytes == 0
28733 && count < ((unsigned HOST_WIDE_INT) size_needed
28734 + desired_align - align))))
28736 /* It is possible that we copied enough so the main loop will not
28737 execute. */
28738 gcc_assert (size_needed > 1);
28739 if (label == NULL_RTX)
28740 label = gen_label_rtx ();
28741 emit_cmp_and_jump_insns (count_exp,
28742 GEN_INT (size_needed),
28743 LTU, 0, counter_mode (count_exp), 1, label);
28744 if (expected_size == -1
28745 || expected_size < (desired_align - align) / 2 + size_needed)
28746 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28747 else
28748 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28751 if (label && size_needed == 1)
28753 emit_label (label);
28754 LABEL_NUSES (label) = 1;
28755 label = NULL;
28756 epilogue_size_needed = 1;
28757 if (issetmem)
28758 promoted_val = val_exp;
28760 else if (label == NULL_RTX && !misaligned_prologue_used)
28761 epilogue_size_needed = size_needed;
28763 /* Step 3: Main loop. */
28765 switch (alg)
28767 case libcall:
28768 case no_stringop:
28769 case last_alg:
28770 gcc_unreachable ();
28771 case loop_1_byte:
28772 case loop:
28773 case unrolled_loop:
28774 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
28775 count_exp, move_mode, unroll_factor,
28776 expected_size, issetmem);
28777 break;
28778 case vector_loop:
28779 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
28780 vec_promoted_val, count_exp, move_mode,
28781 unroll_factor, expected_size, issetmem);
28782 break;
28783 case rep_prefix_8_byte:
28784 case rep_prefix_4_byte:
28785 case rep_prefix_1_byte:
28786 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
28787 val_exp, count_exp, move_mode, issetmem);
28788 break;
28790 /* Adjust properly the offset of src and dest memory for aliasing. */
28791 if (CONST_INT_P (count_exp))
28793 if (!issetmem)
28794 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
28795 (count / size_needed) * size_needed);
28796 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
28797 (count / size_needed) * size_needed);
28799 else
28801 if (!issetmem)
28802 src = change_address (src, BLKmode, srcreg);
28803 dst = change_address (dst, BLKmode, destreg);
28806 /* Step 4: Epilogue to copy the remaining bytes. */
28807 epilogue:
28808 if (label)
28810 /* When the main loop is done, COUNT_EXP might hold original count,
28811 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
28812 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
28813 bytes. Compensate if needed. */
28815 if (size_needed < epilogue_size_needed)
28817 tmp =
28818 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28819 GEN_INT (size_needed - 1), count_exp, 1,
28820 OPTAB_DIRECT);
28821 if (tmp != count_exp)
28822 emit_move_insn (count_exp, tmp);
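/* Editorial example (not from the original source): the AND above masks
   the count down to the bytes the main loop left over, e.g. with
   SIZE_NEEDED == 16 and an original count of 37 the epilogue is left
   with 37 & 15 == 5 bytes.  */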
28824 emit_label (label);
28825 LABEL_NUSES (label) = 1;
28828 if (count_exp != const0_rtx && epilogue_size_needed > 1)
28830 if (force_loopy_epilogue)
28831 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28832 epilogue_size_needed);
28833 else
28835 if (issetmem)
28836 expand_setmem_epilogue (dst, destreg, promoted_val,
28837 vec_promoted_val, count_exp,
28838 epilogue_size_needed);
28839 else
28840 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28841 epilogue_size_needed);
28844 if (jump_around_label)
28845 emit_label (jump_around_label);
28846 return true;
28850 /* Expand the appropriate insns for doing strlen if not just doing
28851 repnz; scasb
28853 out = result, initialized with the start address
28854 align_rtx = alignment of the address.
28855 scratch = scratch register, initialized with the start address when
28856 not aligned, otherwise undefined
28858 This is just the body. It needs the initializations mentioned above and
28859 some address computing at the end. These things are done in i386.md. */
28861 static void
28862 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
28864 int align;
28865 rtx tmp;
28866 rtx_code_label *align_2_label = NULL;
28867 rtx_code_label *align_3_label = NULL;
28868 rtx_code_label *align_4_label = gen_label_rtx ();
28869 rtx_code_label *end_0_label = gen_label_rtx ();
28870 rtx mem;
28871 rtx tmpreg = gen_reg_rtx (SImode);
28872 rtx scratch = gen_reg_rtx (SImode);
28873 rtx cmp;
28875 align = 0;
28876 if (CONST_INT_P (align_rtx))
28877 align = INTVAL (align_rtx);
28879 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
28881 /* Is there a known alignment and is it less than 4? */
28882 if (align < 4)
28884 rtx scratch1 = gen_reg_rtx (Pmode);
28885 emit_move_insn (scratch1, out);
28886 /* Is there a known alignment and is it not 2? */
28887 if (align != 2)
28889 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28890 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28892 /* Leave just the 3 lower bits. */
28893 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28894 NULL_RTX, 0, OPTAB_WIDEN);
28896 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28897 Pmode, 1, align_4_label);
28898 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28899 Pmode, 1, align_2_label);
28900 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28901 Pmode, 1, align_3_label);
28903 else
28905 /* Since the alignment is 2, we have to check 2 or 0 bytes;
28906 check whether it is aligned to a 4-byte boundary. */
28908 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28909 NULL_RTX, 0, OPTAB_WIDEN);
28911 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28912 Pmode, 1, align_4_label);
28915 mem = change_address (src, QImode, out);
28917 /* Now compare the bytes. */
28919 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
28920 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28921 QImode, 1, end_0_label);
28923 /* Increment the address. */
28924 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28926 /* Not needed with an alignment of 2 */
28927 if (align != 2)
28929 emit_label (align_2_label);
28931 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28932 end_0_label);
28934 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28936 emit_label (align_3_label);
28939 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28940 end_0_label);
28942 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28945 /* Generate loop to check 4 bytes at a time. It is not a good idea to
28946 align this loop; it only makes the program larger and does not
28947 speed it up. */
28948 emit_label (align_4_label);
28950 mem = change_address (src, SImode, out);
28951 emit_move_insn (scratch, mem);
28952 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28954 /* This formula yields a nonzero result iff one of the bytes is zero.
28955 This saves three branches inside the loop and many cycles. */
28957 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28958 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28959 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28960 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28961 gen_int_mode (0x80808080, SImode)));
28962 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28963 align_4_label);
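/* Editorial note (not from the original source): in C terms the test
   above is

     ((x - 0x01010101) & ~x & 0x80808080) != 0

   which is nonzero exactly when some byte of x is zero: only a zero byte
   ends up with its high bit set in (x - 0x01010101) while that byte's
   high bit is also set in ~x.  */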
28965 if (TARGET_CMOVE)
28967 rtx reg = gen_reg_rtx (SImode);
28968 rtx reg2 = gen_reg_rtx (Pmode);
28969 emit_move_insn (reg, tmpreg);
28970 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28972 /* If zero is not in the first two bytes, move two bytes forward. */
28973 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28974 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28975 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28976 emit_insn (gen_rtx_SET (tmpreg,
28977 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28978 reg,
28979 tmpreg)));
28980 /* Emit lea manually to avoid clobbering of flags. */
28981 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28983 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28984 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28985 emit_insn (gen_rtx_SET (out,
28986 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28987 reg2,
28988 out)));
28990 else
28992 rtx_code_label *end_2_label = gen_label_rtx ();
28993 /* Is zero in the first two bytes? */
28995 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28996 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28997 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
28998 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28999 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
29000 pc_rtx);
29001 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
29002 JUMP_LABEL (tmp) = end_2_label;
29004 /* Not in the first two. Move two bytes forward. */
29005 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
29006 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
29008 emit_label (end_2_label);
29012 /* Avoid branch in fixing the byte. */
29013 tmpreg = gen_lowpart (QImode, tmpreg);
29014 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
29015 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
29016 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
29017 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
29019 emit_label (end_0_label);
29022 /* Expand strlen. */
29024 bool
29025 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
29027 rtx addr, scratch1, scratch2, scratch3, scratch4;
29029 /* The generic case of the strlen expander is long. Avoid expanding it
29030 unless TARGET_INLINE_ALL_STRINGOPS. */
29032 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29033 && !TARGET_INLINE_ALL_STRINGOPS
29034 && !optimize_insn_for_size_p ()
29035 && (!CONST_INT_P (align) || INTVAL (align) < 4))
29036 return false;
29038 addr = force_reg (Pmode, XEXP (src, 0));
29039 scratch1 = gen_reg_rtx (Pmode);
29041 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29042 && !optimize_insn_for_size_p ())
29044 /* It seems that some optimizers do not combine a call like
29045 foo(strlen(bar), strlen(bar));
29046 when the move and the subtraction are done here. The length is
29047 calculated just once when these instructions are done inside
29048 output_strlen_unroll(). But since &bar[strlen(bar)] is often used,
29049 and this uses one fewer register for the lifetime of
29050 output_strlen_unroll(), it is the better choice. */
29052 emit_move_insn (out, addr);
29054 ix86_expand_strlensi_unroll_1 (out, src, align);
29056 /* strlensi_unroll_1 returns the address of the zero at the end of
29057 the string, like memchr(), so compute the length by subtracting
29058 the start address. */
29059 emit_insn (ix86_gen_sub3 (out, out, addr));
29061 else
29063 rtx unspec;
29065 /* Can't use this if the user has appropriated eax, ecx, or edi. */
29066 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
29067 return false;
29068 /* Can't use this for non-default address spaces. */
29069 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
29070 return false;
29072 scratch2 = gen_reg_rtx (Pmode);
29073 scratch3 = gen_reg_rtx (Pmode);
29074 scratch4 = force_reg (Pmode, constm1_rtx);
29076 emit_move_insn (scratch3, addr);
29077 eoschar = force_reg (QImode, eoschar);
29079 src = replace_equiv_address_nv (src, scratch3);
29081 /* If .md starts supporting :P, this can be done in .md. */
29082 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
29083 scratch4), UNSPEC_SCAS);
29084 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
29085 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
29086 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
29088 return true;
29091 /* For a given symbol (function), construct code to compute the address of
29092 its PLT entry in the large x86-64 PIC model. */
29093 static rtx
29094 construct_plt_address (rtx symbol)
29096 rtx tmp, unspec;
29098 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
29099 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
29100 gcc_assert (Pmode == DImode);
29102 tmp = gen_reg_rtx (Pmode);
29103 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
29105 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
29106 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
29107 return tmp;
29111 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
29112 rtx callarg2,
29113 rtx pop, bool sibcall)
29115 rtx vec[3];
29116 rtx use = NULL, call;
29117 unsigned int vec_len = 0;
29118 tree fndecl;
29120 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29122 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
29123 if (fndecl
29124 && (lookup_attribute ("interrupt",
29125 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
29126 error ("interrupt service routine can't be called directly");
29128 else
29129 fndecl = NULL_TREE;
29131 if (pop == const0_rtx)
29132 pop = NULL;
29133 gcc_assert (!TARGET_64BIT || !pop);
29135 if (TARGET_MACHO && !TARGET_64BIT)
29137 #if TARGET_MACHO
29138 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29139 fnaddr = machopic_indirect_call_target (fnaddr);
29140 #endif
29142 else
29144 /* Static functions and indirect calls don't need the pic register. Also,
29145 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
29146 it an indirect call. */
29147 rtx addr = XEXP (fnaddr, 0);
29148 if (flag_pic
29149 && GET_CODE (addr) == SYMBOL_REF
29150 && !SYMBOL_REF_LOCAL_P (addr))
29152 if (flag_plt
29153 && (SYMBOL_REF_DECL (addr) == NULL_TREE
29154 || !lookup_attribute ("noplt",
29155 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
29157 if (!TARGET_64BIT
29158 || (ix86_cmodel == CM_LARGE_PIC
29159 && DEFAULT_ABI != MS_ABI))
29161 use_reg (&use, gen_rtx_REG (Pmode,
29162 REAL_PIC_OFFSET_TABLE_REGNUM));
29163 if (ix86_use_pseudo_pic_reg ())
29164 emit_move_insn (gen_rtx_REG (Pmode,
29165 REAL_PIC_OFFSET_TABLE_REGNUM),
29166 pic_offset_table_rtx);
29169 else if (!TARGET_PECOFF && !TARGET_MACHO)
29171 if (TARGET_64BIT)
29173 fnaddr = gen_rtx_UNSPEC (Pmode,
29174 gen_rtvec (1, addr),
29175 UNSPEC_GOTPCREL);
29176 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29178 else
29180 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
29181 UNSPEC_GOT);
29182 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29183 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
29184 fnaddr);
29186 fnaddr = gen_const_mem (Pmode, fnaddr);
29187 /* Pmode may not be the same as word_mode for x32, which
29188 doesn't support indirect branch via 32-bit memory slot.
29189 Since x32 GOT slot is 64 bit with zero upper 32 bits,
29190 indirect branch via x32 GOT slot is OK. */
29191 if (GET_MODE (fnaddr) != word_mode)
29192 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
29193 fnaddr = gen_rtx_MEM (QImode, fnaddr);
29198 /* Skip setting up RAX register for -mskip-rax-setup when there are no
29199 parameters passed in vector registers. */
29200 if (TARGET_64BIT
29201 && (INTVAL (callarg2) > 0
29202 || (INTVAL (callarg2) == 0
29203 && (TARGET_SSE || !flag_skip_rax_setup))))
29205 rtx al = gen_rtx_REG (QImode, AX_REG);
29206 emit_move_insn (al, callarg2);
29207 use_reg (&use, al);
29210 if (ix86_cmodel == CM_LARGE_PIC
29211 && !TARGET_PECOFF
29212 && MEM_P (fnaddr)
29213 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
29214 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
29215 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
29216 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
29217 branch via x32 GOT slot is OK. */
29218 else if (!(TARGET_X32
29219 && MEM_P (fnaddr)
29220 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
29221 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
29222 && (sibcall
29223 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
29224 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
29226 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
29227 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
29230 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
29232 if (retval)
29234 /* We should add bound registers as destinations in case a
29235 pointer with bounds may be returned. */
29236 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
29238 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
29239 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
29240 if (GET_CODE (retval) == PARALLEL)
29242 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
29243 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
29244 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
29245 retval = chkp_join_splitted_slot (retval, par);
29247 else
29249 retval = gen_rtx_PARALLEL (VOIDmode,
29250 gen_rtvec (3, retval, b0, b1));
29251 chkp_put_regs_to_expr_list (retval);
29255 call = gen_rtx_SET (retval, call);
29257 vec[vec_len++] = call;
29259 if (pop)
29261 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
29262 pop = gen_rtx_SET (stack_pointer_rtx, pop);
29263 vec[vec_len++] = pop;
29266 if (cfun->machine->no_caller_saved_registers
29267 && (!fndecl
29268 || (!TREE_THIS_VOLATILE (fndecl)
29269 && !lookup_attribute ("no_caller_saved_registers",
29270 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
29272 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
29273 bool is_64bit_ms_abi = (TARGET_64BIT
29274 && ix86_function_abi (fndecl) == MS_ABI);
29275 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
29277 /* If there are no caller-saved registers, add all registers
29278 that are clobbered by the call which returns. */
29279 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
29280 if (!fixed_regs[i]
29281 && (ix86_call_used_regs[i] == 1
29282 || (ix86_call_used_regs[i] & c_mask))
29283 && !STACK_REGNO_P (i)
29284 && !MMX_REGNO_P (i))
29285 clobber_reg (&use,
29286 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
29288 else if (TARGET_64BIT_MS_ABI
29289 && (!callarg2 || INTVAL (callarg2) != -2))
29291 unsigned i;
29293 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
29295 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
29296 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
29298 clobber_reg (&use, gen_rtx_REG (mode, regno));
29301 /* Set here, but it may get cleared later. */
29302 if (TARGET_CALL_MS2SYSV_XLOGUES)
29304 if (!TARGET_SSE)
29307 /* Don't break hot-patched functions. */
29308 else if (ix86_function_ms_hook_prologue (current_function_decl))
29311 /* TODO: Cases not yet examined. */
29312 else if (flag_split_stack)
29313 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
29315 else
29317 gcc_assert (!reload_completed);
29318 cfun->machine->call_ms2sysv = true;
29323 if (vec_len > 1)
29324 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
29325 call = emit_call_insn (call);
29326 if (use)
29327 CALL_INSN_FUNCTION_USAGE (call) = use;
29329 return call;
29332 /* Return true if the function being called was marked with attribute
29333 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
29334 to handle the non-PIC case in the backend because there is no easy
29335 interface for the front-end to force non-PLT calls to use the GOT.
29336 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
29337 to call the function marked "noplt" indirectly. */
29339 static bool
29340 ix86_nopic_noplt_attribute_p (rtx call_op)
29342 if (flag_pic || ix86_cmodel == CM_LARGE
29343 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
29344 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
29345 || SYMBOL_REF_LOCAL_P (call_op))
29346 return false;
29348 tree symbol_decl = SYMBOL_REF_DECL (call_op);
29350 if (!flag_plt
29351 || (symbol_decl != NULL_TREE
29352 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
29353 return true;
29355 return false;
29358 /* Output the assembly for a call instruction. */
29360 const char *
29361 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
29363 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
29364 bool seh_nop_p = false;
29365 const char *xasm;
29367 if (SIBLING_CALL_P (insn))
29369 if (direct_p)
29371 if (ix86_nopic_noplt_attribute_p (call_op))
29373 if (TARGET_64BIT)
29374 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29375 else
29376 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29378 else
29379 xasm = "%!jmp\t%P0";
29381 /* SEH epilogue detection requires the indirect branch case
29382 to include REX.W. */
29383 else if (TARGET_SEH)
29384 xasm = "%!rex.W jmp\t%A0";
29385 else
29386 xasm = "%!jmp\t%A0";
29388 output_asm_insn (xasm, &call_op);
29389 return "";
29392 /* SEH unwinding can require an extra nop to be emitted in several
29393 circumstances. Determine if we have one of those. */
29394 if (TARGET_SEH)
29396 rtx_insn *i;
29398 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29400 /* If we get to another real insn, we don't need the nop. */
29401 if (INSN_P (i))
29402 break;
29404 /* If we get to the epilogue note, prevent a catch region from
29405 being adjacent to the standard epilogue sequence. With non-call
29406 exceptions, we'll have done this during epilogue emission. */
29407 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29408 && !flag_non_call_exceptions
29409 && !can_throw_internal (insn))
29411 seh_nop_p = true;
29412 break;
29416 /* If we didn't find a real insn following the call, prevent the
29417 unwinder from looking into the next function. */
29418 if (i == NULL)
29419 seh_nop_p = true;
29422 if (direct_p)
29424 if (ix86_nopic_noplt_attribute_p (call_op))
29426 if (TARGET_64BIT)
29427 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29428 else
29429 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29431 else
29432 xasm = "%!call\t%P0";
29434 else
29435 xasm = "%!call\t%A0";
29437 output_asm_insn (xasm, &call_op);
29439 if (seh_nop_p)
29440 return "nop";
29442 return "";
29445 /* Clear stack slot assignments remembered from previous functions.
29446 This is called from INIT_EXPANDERS once before RTL is emitted for each
29447 function. */
29449 static struct machine_function *
29450 ix86_init_machine_status (void)
29452 struct machine_function *f;
29454 f = ggc_cleared_alloc<machine_function> ();
29455 f->call_abi = ix86_abi;
29457 return f;
29460 /* Return a MEM corresponding to a stack slot with mode MODE.
29461 Allocate a new slot if necessary.
29463 The RTL for a function can have several slots available: N is
29464 which slot to use. */
29467 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29469 struct stack_local_entry *s;
29471 gcc_assert (n < MAX_386_STACK_LOCALS);
29473 for (s = ix86_stack_locals; s; s = s->next)
29474 if (s->mode == mode && s->n == n)
29475 return validize_mem (copy_rtx (s->rtl));
29477 s = ggc_alloc<stack_local_entry> ();
29478 s->n = n;
29479 s->mode = mode;
29480 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29482 s->next = ix86_stack_locals;
29483 ix86_stack_locals = s;
29484 return validize_mem (copy_rtx (s->rtl));
29487 static void
29488 ix86_instantiate_decls (void)
29490 struct stack_local_entry *s;
29492 for (s = ix86_stack_locals; s; s = s->next)
29493 if (s->rtl != NULL_RTX)
29494 instantiate_decl_rtl (s->rtl);
29497 /* Return the number used for encoding REG, in the range 0..7. */
29499 static int
29500 reg_encoded_number (rtx reg)
29502 unsigned regno = REGNO (reg);
29503 switch (regno)
29505 case AX_REG:
29506 return 0;
29507 case CX_REG:
29508 return 1;
29509 case DX_REG:
29510 return 2;
29511 case BX_REG:
29512 return 3;
29513 case SP_REG:
29514 return 4;
29515 case BP_REG:
29516 return 5;
29517 case SI_REG:
29518 return 6;
29519 case DI_REG:
29520 return 7;
29521 default:
29522 break;
29524 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29525 return regno - FIRST_STACK_REG;
29526 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29527 return regno - FIRST_SSE_REG;
29528 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29529 return regno - FIRST_MMX_REG;
29530 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29531 return regno - FIRST_REX_SSE_REG;
29532 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29533 return regno - FIRST_REX_INT_REG;
29534 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29535 return regno - FIRST_MASK_REG;
29536 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29537 return regno - FIRST_BND_REG;
29538 return -1;
29541 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
29542 in its encoding if it could be relevant for ROP mitigation, otherwise
29543 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29544 used for calculating it into them. */
29546 static int
29547 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29548 int *popno0 = 0, int *popno1 = 0)
29550 if (asm_noperands (PATTERN (insn)) >= 0)
29551 return -1;
29552 int has_modrm = get_attr_modrm (insn);
29553 if (!has_modrm)
29554 return -1;
29555 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29556 rtx op0, op1;
29557 switch (cls)
29559 case MODRM_CLASS_OP02:
29560 gcc_assert (noperands >= 3);
29561 if (popno0)
29563 *popno0 = 0;
29564 *popno1 = 2;
29566 op0 = operands[0];
29567 op1 = operands[2];
29568 break;
29569 case MODRM_CLASS_OP01:
29570 gcc_assert (noperands >= 2);
29571 if (popno0)
29573 *popno0 = 0;
29574 *popno1 = 1;
29576 op0 = operands[0];
29577 op1 = operands[1];
29578 break;
29579 default:
29580 return -1;
29582 if (REG_P (op0) && REG_P (op1))
29584 int enc0 = reg_encoded_number (op0);
29585 int enc1 = reg_encoded_number (op1);
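/* Editorial example (not from the original source): for op0 == %eax
   (encoding 0) and op1 == %ecx (encoding 1) the return value below is
   0xc0 + (1 << 3) + 0 == 0xc8, i.e. mod = 11, reg = 001, r/m = 000.  */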
29586 return 0xc0 + (enc1 << 3) + enc0;
29588 return -1;
29591 /* Check whether x86 address PARTS is a pc-relative address. */
29593 static bool
29594 rip_relative_addr_p (struct ix86_address *parts)
29596 rtx base, index, disp;
29598 base = parts->base;
29599 index = parts->index;
29600 disp = parts->disp;
29602 if (disp && !base && !index)
29604 if (TARGET_64BIT)
29606 rtx symbol = disp;
29608 if (GET_CODE (disp) == CONST)
29609 symbol = XEXP (disp, 0);
29610 if (GET_CODE (symbol) == PLUS
29611 && CONST_INT_P (XEXP (symbol, 1)))
29612 symbol = XEXP (symbol, 0);
29614 if (GET_CODE (symbol) == LABEL_REF
29615 || (GET_CODE (symbol) == SYMBOL_REF
29616 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29617 || (GET_CODE (symbol) == UNSPEC
29618 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29619 || XINT (symbol, 1) == UNSPEC_PCREL
29620 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29621 return true;
29624 return false;
29627 /* Calculate the length of the memory address in the instruction encoding.
29628 Includes addr32 prefix, does not include the one-byte modrm, opcode,
29629 or other prefixes. We never generate addr32 prefix for LEA insn. */
29632 memory_address_length (rtx addr, bool lea)
29634 struct ix86_address parts;
29635 rtx base, index, disp;
29636 int len;
29637 int ok;
29639 if (GET_CODE (addr) == PRE_DEC
29640 || GET_CODE (addr) == POST_INC
29641 || GET_CODE (addr) == PRE_MODIFY
29642 || GET_CODE (addr) == POST_MODIFY)
29643 return 0;
29645 ok = ix86_decompose_address (addr, &parts);
29646 gcc_assert (ok);
29648 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29650 /* If this is not LEA instruction, add the length of addr32 prefix. */
29651 if (TARGET_64BIT && !lea
29652 && (SImode_address_operand (addr, VOIDmode)
29653 || (parts.base && GET_MODE (parts.base) == SImode)
29654 || (parts.index && GET_MODE (parts.index) == SImode)))
29655 len++;
29657 base = parts.base;
29658 index = parts.index;
29659 disp = parts.disp;
29661 if (base && SUBREG_P (base))
29662 base = SUBREG_REG (base);
29663 if (index && SUBREG_P (index))
29664 index = SUBREG_REG (index);
29666 gcc_assert (base == NULL_RTX || REG_P (base));
29667 gcc_assert (index == NULL_RTX || REG_P (index));
29669 /* Rule of thumb:
29670 - esp as the base always wants an index,
29671 - ebp as the base always wants a displacement,
29672 - r12 as the base always wants an index,
29673 - r13 as the base always wants a displacement. */
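/* Editorial note (not from the original source): the rule of thumb above
   follows from the ModRM encoding, where r/m = 100 (esp/r12) means a SIB
   byte follows and mod = 00 with r/m = 101 (ebp/r13) means disp32 (or
   disp32(%rip) in 64-bit mode), so plain (%ebp) or (%r13) must be encoded
   with an explicit zero displacement.  */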
29675 /* Register Indirect. */
29676 if (base && !index && !disp)
29678 /* esp (for its index) and ebp (for its displacement) need
29679 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29680 code. */
29681 if (base == arg_pointer_rtx
29682 || base == frame_pointer_rtx
29683 || REGNO (base) == SP_REG
29684 || REGNO (base) == BP_REG
29685 || REGNO (base) == R12_REG
29686 || REGNO (base) == R13_REG)
29687 len++;
29690 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29691 is not disp32, but disp32(%rip), so for disp32
29692 SIB byte is needed, unless print_operand_address
29693 optimizes it into disp32(%rip) or (%rip) is implied
29694 by UNSPEC. */
29695 else if (disp && !base && !index)
29697 len += 4;
29698 if (!rip_relative_addr_p (&parts))
29699 len++;
29701 else
29703 /* Find the length of the displacement constant. */
29704 if (disp)
29706 if (base && satisfies_constraint_K (disp))
29707 len += 1;
29708 else
29709 len += 4;
29711 /* ebp always wants a displacement. Similarly r13. */
29712 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29713 len++;
29715 /* An index requires the two-byte modrm form.... */
29716 if (index
29717 /* ...like esp (or r12), which always wants an index. */
29718 || base == arg_pointer_rtx
29719 || base == frame_pointer_rtx
29720 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29721 len++;
29724 return len;
29727 /* Compute default value for "length_immediate" attribute. When SHORTFORM
29728 is set, expect the insn to have an 8-bit immediate alternative. */
29730 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29732 int len = 0;
29733 int i;
29734 extract_insn_cached (insn);
29735 for (i = recog_data.n_operands - 1; i >= 0; --i)
29736 if (CONSTANT_P (recog_data.operand[i]))
29738 enum attr_mode mode = get_attr_mode (insn);
29740 gcc_assert (!len);
29741 if (shortform && CONST_INT_P (recog_data.operand[i]))
29743 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29744 switch (mode)
29746 case MODE_QI:
29747 len = 1;
29748 continue;
29749 case MODE_HI:
29750 ival = trunc_int_for_mode (ival, HImode);
29751 break;
29752 case MODE_SI:
29753 ival = trunc_int_for_mode (ival, SImode);
29754 break;
29755 default:
29756 break;
29758 if (IN_RANGE (ival, -128, 127))
29760 len = 1;
29761 continue;
29764 switch (mode)
29766 case MODE_QI:
29767 len = 1;
29768 break;
29769 case MODE_HI:
29770 len = 2;
29771 break;
29772 case MODE_SI:
29773 len = 4;
29774 break;
29775 /* Immediates for DImode instructions are encoded
29776 as 32bit sign extended values. */
29777 case MODE_DI:
29778 len = 4;
29779 break;
29780 default:
29781 fatal_insn ("unknown insn mode", insn);
29784 return len;
29787 /* Compute default value for "length_address" attribute. */
29789 ix86_attr_length_address_default (rtx_insn *insn)
29791 int i;
29793 if (get_attr_type (insn) == TYPE_LEA)
29795 rtx set = PATTERN (insn), addr;
29797 if (GET_CODE (set) == PARALLEL)
29798 set = XVECEXP (set, 0, 0);
29800 gcc_assert (GET_CODE (set) == SET);
29802 addr = SET_SRC (set);
29804 return memory_address_length (addr, true);
29807 extract_insn_cached (insn);
29808 for (i = recog_data.n_operands - 1; i >= 0; --i)
29810 rtx op = recog_data.operand[i];
29811 if (MEM_P (op))
29813 constrain_operands_cached (insn, reload_completed);
29814 if (which_alternative != -1)
29816 const char *constraints = recog_data.constraints[i];
29817 int alt = which_alternative;
29819 while (*constraints == '=' || *constraints == '+')
29820 constraints++;
29821 while (alt-- > 0)
29822 while (*constraints++ != ',')
29824 /* Skip ignored operands. */
29825 if (*constraints == 'X')
29826 continue;
29829 int len = memory_address_length (XEXP (op, 0), false);
29831 /* Account for segment prefix for non-default addr spaces. */
29832 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29833 len++;
29835 return len;
29838 return 0;
29841 /* Compute default value for "length_vex" attribute. It includes
29842 2 or 3 byte VEX prefix and 1 opcode byte. */
29845 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29846 bool has_vex_w)
29848 int i;
29850 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX.W bit
29851 requires the 3-byte VEX prefix. */
29852 if (!has_0f_opcode || has_vex_w)
29853 return 3 + 1;
29855 /* We can always use 2 byte VEX prefix in 32bit. */
29856 if (!TARGET_64BIT)
29857 return 2 + 1;
29859 extract_insn_cached (insn);
29861 for (i = recog_data.n_operands - 1; i >= 0; --i)
29862 if (REG_P (recog_data.operand[i]))
29864 /* REX.W bit uses 3 byte VEX prefix. */
29865 if (GET_MODE (recog_data.operand[i]) == DImode
29866 && GENERAL_REG_P (recog_data.operand[i]))
29867 return 3 + 1;
29869 else
29871 /* REX.X or REX.B bits use 3 byte VEX prefix. */
29872 if (MEM_P (recog_data.operand[i])
29873 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
29874 return 3 + 1;
29877 return 2 + 1;
29880 /* Return the maximum number of instructions a cpu can issue. */
29882 static int
29883 ix86_issue_rate (void)
29885 switch (ix86_tune)
29887 case PROCESSOR_PENTIUM:
29888 case PROCESSOR_LAKEMONT:
29889 case PROCESSOR_BONNELL:
29890 case PROCESSOR_SILVERMONT:
29891 case PROCESSOR_KNL:
29892 case PROCESSOR_INTEL:
29893 case PROCESSOR_K6:
29894 case PROCESSOR_BTVER2:
29895 case PROCESSOR_PENTIUM4:
29896 case PROCESSOR_NOCONA:
29897 return 2;
29899 case PROCESSOR_PENTIUMPRO:
29900 case PROCESSOR_ATHLON:
29901 case PROCESSOR_K8:
29902 case PROCESSOR_AMDFAM10:
29903 case PROCESSOR_GENERIC:
29904 case PROCESSOR_BTVER1:
29905 return 3;
29907 case PROCESSOR_BDVER1:
29908 case PROCESSOR_BDVER2:
29909 case PROCESSOR_BDVER3:
29910 case PROCESSOR_BDVER4:
29911 case PROCESSOR_ZNVER1:
29912 case PROCESSOR_CORE2:
29913 case PROCESSOR_NEHALEM:
29914 case PROCESSOR_SANDYBRIDGE:
29915 case PROCESSOR_HASWELL:
29916 return 4;
29918 default:
29919 return 1;
29923 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
29924 by DEP_INSN and nothing else set by DEP_INSN. */
29926 static bool
29927 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
29929 rtx set, set2;
29931 /* Simplify the test for uninteresting insns. */
29932 if (insn_type != TYPE_SETCC
29933 && insn_type != TYPE_ICMOV
29934 && insn_type != TYPE_FCMOV
29935 && insn_type != TYPE_IBR)
29936 return false;
29938 if ((set = single_set (dep_insn)) != 0)
29940 set = SET_DEST (set);
29941 set2 = NULL_RTX;
29943 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
29944 && XVECLEN (PATTERN (dep_insn), 0) == 2
29945 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
29946 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
29948 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
29949 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
29951 else
29952 return false;
29954 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
29955 return false;
29957 /* This test is true if the dependent insn reads the flags but
29958 not any other potentially set register. */
29959 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
29960 return false;
29962 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
29963 return false;
29965 return true;
29968 /* Return true iff USE_INSN has a memory address with operands set by
29969 SET_INSN. */
29971 bool
29972 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
29974 int i;
29975 extract_insn_cached (use_insn);
29976 for (i = recog_data.n_operands - 1; i >= 0; --i)
29977 if (MEM_P (recog_data.operand[i]))
29979 rtx addr = XEXP (recog_data.operand[i], 0);
29980 if (modified_in_p (addr, set_insn) != 0)
29982 /* No AGI stall if SET_INSN is a push or pop and USE_INSN
29983 has SP based memory (unless index reg is modified in a pop). */
29984 rtx set = single_set (set_insn);
29985 if (set
29986 && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
29987 || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
29989 struct ix86_address parts;
29990 if (ix86_decompose_address (addr, &parts)
29991 && parts.base == stack_pointer_rtx
29992 && (parts.index == NULL_RTX
29993 || MEM_P (SET_DEST (set))
29994 || !modified_in_p (parts.index, set_insn)))
29995 return false;
29997 return true;
29999 return false;
30001 return false;
30004 /* Helper function for exact_store_load_dependency.
30005 Return true if addr is found in insn. */
30006 static bool
30007 exact_dependency_1 (rtx addr, rtx insn)
30009 enum rtx_code code;
30010 const char *format_ptr;
30011 int i, j;
30013 code = GET_CODE (insn);
30014 switch (code)
30016 case MEM:
30017 if (rtx_equal_p (addr, insn))
30018 return true;
30019 break;
30020 case REG:
30021 CASE_CONST_ANY:
30022 case SYMBOL_REF:
30023 case CODE_LABEL:
30024 case PC:
30025 case CC0:
30026 case EXPR_LIST:
30027 return false;
30028 default:
30029 break;
30032 format_ptr = GET_RTX_FORMAT (code);
30033 for (i = 0; i < GET_RTX_LENGTH (code); i++)
30035 switch (*format_ptr++)
30037 case 'e':
30038 if (exact_dependency_1 (addr, XEXP (insn, i)))
30039 return true;
30040 break;
30041 case 'E':
30042 for (j = 0; j < XVECLEN (insn, i); j++)
30043 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
30044 return true;
30045 break;
30048 return false;
30051 /* Return true if there is an exact dependency between the store and the
30052 load, i.e. the same memory address is used in both. */
30053 static bool
30054 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
30056 rtx set1, set2;
30058 set1 = single_set (store);
30059 if (!set1)
30060 return false;
30061 if (!MEM_P (SET_DEST (set1)))
30062 return false;
30063 set2 = single_set (load);
30064 if (!set2)
30065 return false;
30066 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
30067 return true;
30068 return false;
30071 static int
30072 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
30073 unsigned int)
30075 enum attr_type insn_type, dep_insn_type;
30076 enum attr_memory memory;
30077 rtx set, set2;
30078 int dep_insn_code_number;
30080 /* Anti and output dependencies have zero cost on all CPUs. */
30081 if (dep_type != 0)
30082 return 0;
30084 dep_insn_code_number = recog_memoized (dep_insn);
30086 /* If we can't recognize the insns, we can't really do anything. */
30087 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
30088 return cost;
30090 insn_type = get_attr_type (insn);
30091 dep_insn_type = get_attr_type (dep_insn);
30093 switch (ix86_tune)
30095 case PROCESSOR_PENTIUM:
30096 case PROCESSOR_LAKEMONT:
30097 /* Address Generation Interlock adds a cycle of latency. */
30098 if (insn_type == TYPE_LEA)
30100 rtx addr = PATTERN (insn);
30102 if (GET_CODE (addr) == PARALLEL)
30103 addr = XVECEXP (addr, 0, 0);
30105 gcc_assert (GET_CODE (addr) == SET);
30107 addr = SET_SRC (addr);
30108 if (modified_in_p (addr, dep_insn))
30109 cost += 1;
30111 else if (ix86_agi_dependent (dep_insn, insn))
30112 cost += 1;
30114 /* ??? Compares pair with jump/setcc. */
30115 if (ix86_flags_dependent (insn, dep_insn, insn_type))
30116 cost = 0;
30118 /* Floating point stores require value to be ready one cycle earlier. */
30119 if (insn_type == TYPE_FMOV
30120 && get_attr_memory (insn) == MEMORY_STORE
30121 && !ix86_agi_dependent (dep_insn, insn))
30122 cost += 1;
30123 break;
30125 case PROCESSOR_PENTIUMPRO:
30126 /* INT->FP conversion is expensive. */
30127 if (get_attr_fp_int_src (dep_insn))
30128 cost += 5;
30130 /* There is one cycle extra latency between an FP op and a store. */
30131 if (insn_type == TYPE_FMOV
30132 && (set = single_set (dep_insn)) != NULL_RTX
30133 && (set2 = single_set (insn)) != NULL_RTX
30134 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
30135 && MEM_P (SET_DEST (set2)))
30136 cost += 1;
30138 memory = get_attr_memory (insn);
30140 /* Model the ability of the reorder buffer to hide the latency of a load
30141 by executing it in parallel with the previous instruction when the
30142 previous instruction is not needed to compute the address. */
30143 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30144 && !ix86_agi_dependent (dep_insn, insn))
30146 /* Claim that moves take one cycle, as the core can issue one load
30147 at a time and the next load can start a cycle later. */
30148 if (dep_insn_type == TYPE_IMOV
30149 || dep_insn_type == TYPE_FMOV)
30150 cost = 1;
30151 else if (cost > 1)
30152 cost--;
30154 break;
30156 case PROCESSOR_K6:
30157 /* The esp dependency is resolved before
30158 the instruction is really finished. */
30159 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30160 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30161 return 1;
30163 /* INT->FP conversion is expensive. */
30164 if (get_attr_fp_int_src (dep_insn))
30165 cost += 5;
30167 memory = get_attr_memory (insn);
30169 /* Model the ability of the reorder buffer to hide the latency of a load
30170 by executing it in parallel with the previous instruction when the
30171 previous instruction is not needed to compute the address. */
30172 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30173 && !ix86_agi_dependent (dep_insn, insn))
30175 /* Claim that moves take one cycle, as the core can issue one load
30176 at a time and the next load can start a cycle later. */
30177 if (dep_insn_type == TYPE_IMOV
30178 || dep_insn_type == TYPE_FMOV)
30179 cost = 1;
30180 else if (cost > 2)
30181 cost -= 2;
30182 else
30183 cost = 1;
30185 break;
30187 case PROCESSOR_AMDFAM10:
30188 case PROCESSOR_BDVER1:
30189 case PROCESSOR_BDVER2:
30190 case PROCESSOR_BDVER3:
30191 case PROCESSOR_BDVER4:
30192 case PROCESSOR_ZNVER1:
30193 case PROCESSOR_BTVER1:
30194 case PROCESSOR_BTVER2:
30195 case PROCESSOR_GENERIC:
30196 /* The stack engine allows push and pop instructions to execute in parallel. */
30197 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30198 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30199 return 0;
30200 /* FALLTHRU */
30202 case PROCESSOR_ATHLON:
30203 case PROCESSOR_K8:
30204 memory = get_attr_memory (insn);
30206 /* Model the ability of the reorder buffer to hide the latency of a load
30207 by executing it in parallel with the previous instruction when the
30208 previous instruction is not needed to compute the address. */
30209 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30210 && !ix86_agi_dependent (dep_insn, insn))
30212 enum attr_unit unit = get_attr_unit (insn);
30213 int loadcost = 3;
30215 /* Because of the difference between the length of integer and
30216 floating unit pipeline preparation stages, the memory operands
30217 for floating point are cheaper.
30219 ??? For Athlon the difference is most probably 2. */
30220 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
30221 loadcost = 3;
30222 else
30223 loadcost = TARGET_ATHLON ? 2 : 0;
30225 if (cost >= loadcost)
30226 cost -= loadcost;
30227 else
30228 cost = 0;
30230 break;
30232 case PROCESSOR_CORE2:
30233 case PROCESSOR_NEHALEM:
30234 case PROCESSOR_SANDYBRIDGE:
30235 case PROCESSOR_HASWELL:
30236 /* The stack engine allows push and pop instructions to execute in parallel. */
30237 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30238 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30239 return 0;
30241 memory = get_attr_memory (insn);
30243 /* Model the ability of the reorder buffer to hide the latency of a load
30244 by executing it in parallel with the previous instruction when the
30245 previous instruction is not needed to compute the address. */
30246 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30247 && !ix86_agi_dependent (dep_insn, insn))
30249 if (cost >= 4)
30250 cost -= 4;
30251 else
30252 cost = 0;
30254 break;
30256 case PROCESSOR_SILVERMONT:
30257 case PROCESSOR_KNL:
30258 case PROCESSOR_INTEL:
30259 if (!reload_completed)
30260 return cost;
30262 /* Increase cost of integer loads. */
30263 memory = get_attr_memory (dep_insn);
30264 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30266 enum attr_unit unit = get_attr_unit (dep_insn);
30267 if (unit == UNIT_INTEGER && cost == 1)
30269 if (memory == MEMORY_LOAD)
30270 cost = 3;
30271 else
30273 /* Increase the cost of ld/st for short integer types only,
30274 because of the store-forwarding issue. */
30275 rtx set = single_set (dep_insn);
30276 if (set && (GET_MODE (SET_DEST (set)) == QImode
30277 || GET_MODE (SET_DEST (set)) == HImode))
30279 /* Increase the cost of the store/load pair if an exact
30280 dependence exists and INSN is a load. */
30281 enum attr_memory insn_memory = get_attr_memory (insn);
30282 if (insn_memory == MEMORY_LOAD
30283 && exact_store_load_dependency (dep_insn, insn))
30284 cost = 3;
30290 default:
30291 break;
30294 return cost;
30297 /* How many alternative schedules to try. This should be as wide as the
30298 scheduling freedom in the DFA, but no wider. Making this value too
30299 large results in extra work for the scheduler. */
30301 static int
30302 ia32_multipass_dfa_lookahead (void)
30304 switch (ix86_tune)
30306 case PROCESSOR_PENTIUM:
30307 case PROCESSOR_LAKEMONT:
30308 return 2;
30310 case PROCESSOR_PENTIUMPRO:
30311 case PROCESSOR_K6:
30312 return 1;
30314 case PROCESSOR_BDVER1:
30315 case PROCESSOR_BDVER2:
30316 case PROCESSOR_BDVER3:
30317 case PROCESSOR_BDVER4:
30318 /* We use a lookahead value of 4 for BD, both before and after reload
30319 schedules. The plan is to have value 8 included for -O3. */
30320 return 4;
30322 case PROCESSOR_CORE2:
30323 case PROCESSOR_NEHALEM:
30324 case PROCESSOR_SANDYBRIDGE:
30325 case PROCESSOR_HASWELL:
30326 case PROCESSOR_BONNELL:
30327 case PROCESSOR_SILVERMONT:
30328 case PROCESSOR_KNL:
30329 case PROCESSOR_INTEL:
30330 /* Generally, we want haifa-sched:max_issue() to look ahead as far
30331 as the number of instructions that can be executed in a cycle, i.e.,
30332 issue_rate. I wonder why tuning for many CPUs does not do this. */
30333 if (reload_completed)
30334 return ix86_issue_rate ();
30335 /* Don't use lookahead for pre-reload schedule to save compile time. */
30336 return 0;
30338 default:
30339 return 0;
30343 /* Return true if target platform supports macro-fusion. */
30345 static bool
30346 ix86_macro_fusion_p ()
30348 return TARGET_FUSE_CMP_AND_BRANCH;
30351 /* Check whether the current microarchitecture supports macro fusion
30352 for the insn pair "CONDGEN + CONDJMP". Refer to the
30353 "Intel Architectures Optimization Reference Manual". */
30355 static bool
30356 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
30358 rtx src, dest;
30359 enum rtx_code ccode;
30360 rtx compare_set = NULL_RTX, test_if, cond;
30361 rtx alu_set = NULL_RTX, addr = NULL_RTX;
30363 if (!any_condjump_p (condjmp))
30364 return false;
30366 unsigned int condreg1, condreg2;
30367 rtx cc_reg_1;
30368 ix86_fixed_condition_code_regs (&condreg1, &condreg2);
30369 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
30370 if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
30371 || !condgen
30372 || !modified_in_p (cc_reg_1, condgen))
30373 return false;
30375 if (get_attr_type (condgen) != TYPE_TEST
30376 && get_attr_type (condgen) != TYPE_ICMP
30377 && get_attr_type (condgen) != TYPE_INCDEC
30378 && get_attr_type (condgen) != TYPE_ALU)
30379 return false;
30381 compare_set = single_set (condgen);
30382 if (compare_set == NULL_RTX
30383 && !TARGET_FUSE_ALU_AND_BRANCH)
30384 return false;
30386 if (compare_set == NULL_RTX)
30388 int i;
30389 rtx pat = PATTERN (condgen);
30390 for (i = 0; i < XVECLEN (pat, 0); i++)
30391 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
30393 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
30394 if (GET_CODE (set_src) == COMPARE)
30395 compare_set = XVECEXP (pat, 0, i);
30396 else
30397 alu_set = XVECEXP (pat, 0, i);
30400 if (compare_set == NULL_RTX)
30401 return false;
30402 src = SET_SRC (compare_set);
30403 if (GET_CODE (src) != COMPARE)
30404 return false;
30406 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
30407 supported. */
30408 if ((MEM_P (XEXP (src, 0))
30409 && CONST_INT_P (XEXP (src, 1)))
30410 || (MEM_P (XEXP (src, 1))
30411 && CONST_INT_P (XEXP (src, 0))))
30412 return false;
30414 /* No fusion for RIP-relative addresses. */
30415 if (MEM_P (XEXP (src, 0)))
30416 addr = XEXP (XEXP (src, 0), 0);
30417 else if (MEM_P (XEXP (src, 1)))
30418 addr = XEXP (XEXP (src, 1), 0);
30420 if (addr) {
30421 ix86_address parts;
30422 int ok = ix86_decompose_address (addr, &parts);
30423 gcc_assert (ok);
30425 if (rip_relative_addr_p (&parts))
30426 return false;
30429 test_if = SET_SRC (pc_set (condjmp));
30430 cond = XEXP (test_if, 0);
30431 ccode = GET_CODE (cond);
30432 /* Check whether the conditional jump uses the sign or overflow flags. */
30433 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
30434 && (ccode == GE
30435 || ccode == GT
30436 || ccode == LE
30437 || ccode == LT))
30438 return false;
30440 /* Return true for TYPE_TEST and TYPE_ICMP. */
30441 if (get_attr_type (condgen) == TYPE_TEST
30442 || get_attr_type (condgen) == TYPE_ICMP)
30443 return true;
30445 /* What follows is the macro-fusion case for ALU op + jmp. */
30446 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
30447 return false;
30449 /* No fusion for an ALU op with a memory destination operand. */
30450 dest = SET_DEST (alu_set);
30451 if (MEM_P (dest))
30452 return false;
30454 /* Macro-fusion for inc/dec + unsigned conditional jump is not
30455 supported. */
30456 if (get_attr_type (condgen) == TYPE_INCDEC
30457 && (ccode == GEU
30458 || ccode == GTU
30459 || ccode == LEU
30460 || ccode == LTU))
30461 return false;
30463 return true;
30466 /* Try to reorder the ready list to take advantage of Atom's pipelined
30467 IMUL execution. It is applied if
30468 (1) an IMUL instruction is at the top of the list;
30469 (2) there is exactly one producer of an independent IMUL instruction
30470 in the ready list.
30471 Return the index of the IMUL producer if it was found and -1 otherwise. */
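   /* Illustrative example (assuming TARGET_BONNELL): with a ready list
      { ..., B, A, IMUL } where the IMUL is on top and B is the only producer
      feeding another, independent SImode IMUL, the index of B is returned so
      the caller can move it to the top and keep the IMUL pipeline busy. */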
30472 static int
30473 do_reorder_for_imul (rtx_insn **ready, int n_ready)
30475 rtx_insn *insn;
30476 rtx set, insn1, insn2;
30477 sd_iterator_def sd_it;
30478 dep_t dep;
30479 int index = -1;
30480 int i;
30482 if (!TARGET_BONNELL)
30483 return index;
30485 /* Check that IMUL instruction is on the top of ready list. */
30486 insn = ready[n_ready - 1];
30487 set = single_set (insn);
30488 if (!set)
30489 return index;
30490 if (!(GET_CODE (SET_SRC (set)) == MULT
30491 && GET_MODE (SET_SRC (set)) == SImode))
30492 return index;
30494 /* Search for producer of independent IMUL instruction. */
30495 for (i = n_ready - 2; i >= 0; i--)
30497 insn = ready[i];
30498 if (!NONDEBUG_INSN_P (insn))
30499 continue;
30500 /* Skip IMUL instruction. */
30501 insn2 = PATTERN (insn);
30502 if (GET_CODE (insn2) == PARALLEL)
30503 insn2 = XVECEXP (insn2, 0, 0);
30504 if (GET_CODE (insn2) == SET
30505 && GET_CODE (SET_SRC (insn2)) == MULT
30506 && GET_MODE (SET_SRC (insn2)) == SImode)
30507 continue;
30509 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
30511 rtx con;
30512 con = DEP_CON (dep);
30513 if (!NONDEBUG_INSN_P (con))
30514 continue;
30515 insn1 = PATTERN (con);
30516 if (GET_CODE (insn1) == PARALLEL)
30517 insn1 = XVECEXP (insn1, 0, 0);
30519 if (GET_CODE (insn1) == SET
30520 && GET_CODE (SET_SRC (insn1)) == MULT
30521 && GET_MODE (SET_SRC (insn1)) == SImode)
30523 sd_iterator_def sd_it1;
30524 dep_t dep1;
30525 /* Check that there is no other producer for the IMUL. */
30526 index = i;
30527 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
30529 rtx pro;
30530 pro = DEP_PRO (dep1);
30531 if (!NONDEBUG_INSN_P (pro))
30532 continue;
30533 if (pro != insn)
30534 index = -1;
30536 if (index >= 0)
30537 break;
30540 if (index >= 0)
30541 break;
30543 return index;
30546 /* Try to find the best candidate for the top of the ready list if two
30547 insns have the same priority - the best candidate is the one whose
30548 dependees were scheduled earlier. Applied to Silvermont only.
30549 Return true if the top two insns must be interchanged. */
30550 static bool
30551 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
30553 rtx_insn *top = ready[n_ready - 1];
30554 rtx_insn *next = ready[n_ready - 2];
30555 rtx set;
30556 sd_iterator_def sd_it;
30557 dep_t dep;
30558 int clock1 = -1;
30559 int clock2 = -1;
30560 #define INSN_TICK(INSN) (HID (INSN)->tick)
30562 if (!TARGET_SILVERMONT && !TARGET_INTEL)
30563 return false;
30565 if (!NONDEBUG_INSN_P (top))
30566 return false;
30567 if (!NONJUMP_INSN_P (top))
30568 return false;
30569 if (!NONDEBUG_INSN_P (next))
30570 return false;
30571 if (!NONJUMP_INSN_P (next))
30572 return false;
30573 set = single_set (top);
30574 if (!set)
30575 return false;
30576 set = single_set (next);
30577 if (!set)
30578 return false;
30580 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
30582 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
30583 return false;
30584 /* Determine the winner more precisely. */
30585 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
30587 rtx pro;
30588 pro = DEP_PRO (dep);
30589 if (!NONDEBUG_INSN_P (pro))
30590 continue;
30591 if (INSN_TICK (pro) > clock1)
30592 clock1 = INSN_TICK (pro);
30594 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
30596 rtx pro;
30597 pro = DEP_PRO (dep);
30598 if (!NONDEBUG_INSN_P (pro))
30599 continue;
30600 if (INSN_TICK (pro) > clock2)
30601 clock2 = INSN_TICK (pro);
30604 if (clock1 == clock2)
30606 /* Determine the winner - a load must win. */
30607 enum attr_memory memory1, memory2;
30608 memory1 = get_attr_memory (top);
30609 memory2 = get_attr_memory (next);
30610 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
30611 return true;
30613 return (bool) (clock2 < clock1);
30615 return false;
30616 #undef INSN_TICK
30619 /* Perform possible reordering of the ready list, for Atom/Silvermont only.
30620 Return the issue rate. */
30621 static int
30622 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
30623 int *pn_ready, int clock_var)
30625 int issue_rate = -1;
30626 int n_ready = *pn_ready;
30627 int i;
30628 rtx_insn *insn;
30629 int index = -1;
30631 /* Set up issue rate. */
30632 issue_rate = ix86_issue_rate ();
30634 /* Do reordering for BONNELL/SILVERMONT only. */
30635 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
30636 return issue_rate;
30638 /* Nothing to do if ready list contains only 1 instruction. */
30639 if (n_ready <= 1)
30640 return issue_rate;
30642 /* Do reordering for the post-reload scheduler only. */
30643 if (!reload_completed)
30644 return issue_rate;
30646 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
30648 if (sched_verbose > 1)
30649 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
30650 INSN_UID (ready[index]));
30652 /* Put IMUL producer (ready[index]) at the top of ready list. */
30653 insn = ready[index];
30654 for (i = index; i < n_ready - 1; i++)
30655 ready[i] = ready[i + 1];
30656 ready[n_ready - 1] = insn;
30657 return issue_rate;
30660 /* Skip selective scheduling since HID is not populated in it. */
30661 if (clock_var != 0
30662 && !sel_sched_p ()
30663 && swap_top_of_ready_list (ready, n_ready))
30665 if (sched_verbose > 1)
30666 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
30667 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
30668 /* Swap 2 top elements of ready list. */
30669 insn = ready[n_ready - 1];
30670 ready[n_ready - 1] = ready[n_ready - 2];
30671 ready[n_ready - 2] = insn;
30673 return issue_rate;
30676 static bool
30677 ix86_class_likely_spilled_p (reg_class_t);
30679 /* Return true if the LHS of INSN is a hard function-argument register, and
30680 set *IS_SPILLED to true if it is a likely-spilled hard register. */
30681 static bool
30682 insn_is_function_arg (rtx insn, bool* is_spilled)
30684 rtx dst;
30686 if (!NONDEBUG_INSN_P (insn))
30687 return false;
30688 /* Call instructions are not movable, ignore them. */
30689 if (CALL_P (insn))
30690 return false;
30691 insn = PATTERN (insn);
30692 if (GET_CODE (insn) == PARALLEL)
30693 insn = XVECEXP (insn, 0, 0);
30694 if (GET_CODE (insn) != SET)
30695 return false;
30696 dst = SET_DEST (insn);
30697 if (REG_P (dst) && HARD_REGISTER_P (dst)
30698 && ix86_function_arg_regno_p (REGNO (dst)))
30700 /* Is it a likely-spilled hard register? */
30701 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
30702 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
30703 *is_spilled = true;
30704 return true;
30706 return false;
30709 /* Add output dependencies for a chain of adjacent function arguments, but
30710 only if there is a move to a likely-spilled hard register. Return the first
30711 argument if at least one dependence was added, or NULL otherwise. */
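   /* Illustrative example: before a call whose arguments are set up by

          movl ..., %eax
          movl ..., %edx
          movl ..., %ecx
          call foo

      output dependencies are added between the adjacent argument moves (the
      argument registers are likely-spilled hard registers), the intent being
      to keep the pre-reload scheduler from moving unrelated code in between
      and extending the lifetimes of those registers. */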
30712 static rtx_insn *
30713 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
30715 rtx_insn *insn;
30716 rtx_insn *last = call;
30717 rtx_insn *first_arg = NULL;
30718 bool is_spilled = false;
30720 head = PREV_INSN (head);
30722 /* Find the argument-passing instruction nearest to the call. */
30723 while (true)
30725 last = PREV_INSN (last);
30726 if (last == head)
30727 return NULL;
30728 if (!NONDEBUG_INSN_P (last))
30729 continue;
30730 if (insn_is_function_arg (last, &is_spilled))
30731 break;
30732 return NULL;
30735 first_arg = last;
30736 while (true)
30738 insn = PREV_INSN (last);
30739 if (!INSN_P (insn))
30740 break;
30741 if (insn == head)
30742 break;
30743 if (!NONDEBUG_INSN_P (insn))
30745 last = insn;
30746 continue;
30748 if (insn_is_function_arg (insn, &is_spilled))
30750 /* Add an output dependence between two function arguments if the chain
30751 of output arguments contains likely-spilled hard registers. */
30752 if (is_spilled)
30753 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
30754 first_arg = last = insn;
30756 else
30757 break;
30759 if (!is_spilled)
30760 return NULL;
30761 return first_arg;
30764 /* Add an output or anti dependency from INSN to FIRST_ARG to restrict its
30765 code motion. */
30766 static void
30767 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
30769 rtx set;
30770 rtx tmp;
30772 /* Add anti dependencies for bounds stores. */
30773 if (INSN_P (insn)
30774 && GET_CODE (PATTERN (insn)) == PARALLEL
30775 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
30776 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
30778 add_dependence (first_arg, insn, REG_DEP_ANTI);
30779 return;
30782 set = single_set (insn);
30783 if (!set)
30784 return;
30785 tmp = SET_DEST (set);
30786 if (REG_P (tmp))
30788 /* Add output dependency to the first function argument. */
30789 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
30790 return;
30792 /* Add anti dependency. */
30793 add_dependence (first_arg, insn, REG_DEP_ANTI);
30796 /* Avoid cross-block motion of a function argument by adding a dependency
30797 from the first non-jump instruction in BB. */
30798 static void
30799 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
30801 rtx_insn *insn = BB_END (bb);
30803 while (insn)
30805 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
30807 rtx set = single_set (insn);
30808 if (set)
30810 avoid_func_arg_motion (arg, insn);
30811 return;
30814 if (insn == BB_HEAD (bb))
30815 return;
30816 insn = PREV_INSN (insn);
30820 /* Hook for pre-reload schedule - avoid motion of function arguments
30821 passed in likely spilled HW registers. */
30822 static void
30823 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
30825 rtx_insn *insn;
30826 rtx_insn *first_arg = NULL;
30827 if (reload_completed)
30828 return;
30829 while (head != tail && DEBUG_INSN_P (head))
30830 head = NEXT_INSN (head);
30831 for (insn = tail; insn != head; insn = PREV_INSN (insn))
30832 if (INSN_P (insn) && CALL_P (insn))
30834 first_arg = add_parameter_dependencies (insn, head);
30835 if (first_arg)
30837 /* Add a dependee for the first argument to predecessors, but only
30838 if the region contains more than one block. */
30839 basic_block bb = BLOCK_FOR_INSN (insn);
30840 int rgn = CONTAINING_RGN (bb->index);
30841 int nr_blks = RGN_NR_BLOCKS (rgn);
30842 /* Skip trivial regions and region head blocks that can have
30843 predecessors outside of region. */
30844 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
30846 edge e;
30847 edge_iterator ei;
30849 /* Regions are SCCs with the exception of selective
30850 scheduling with pipelining of outer blocks enabled.
30851 So also check that immediate predecessors of a non-head
30852 block are in the same region. */
30853 FOR_EACH_EDGE (e, ei, bb->preds)
30855 /* Avoid creating loop-carried dependencies by
30856 using the topological ordering in the region. */
30857 if (rgn == CONTAINING_RGN (e->src->index)
30858 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
30859 add_dependee_for_func_arg (first_arg, e->src);
30862 insn = first_arg;
30863 if (insn == head)
30864 break;
30867 else if (first_arg)
30868 avoid_func_arg_motion (first_arg, insn);
30871 /* Hook for the pre-reload scheduler - set the priority of moves from likely-
30872 spilled hard registers to the maximum, to schedule them as soon as possible.
30873 These are moves from function argument registers at the top of the function
30874 entry and moves from function return value registers after a call. */
30875 static int
30876 ix86_adjust_priority (rtx_insn *insn, int priority)
30878 rtx set;
30880 if (reload_completed)
30881 return priority;
30883 if (!NONDEBUG_INSN_P (insn))
30884 return priority;
30886 set = single_set (insn);
30887 if (set)
30889 rtx tmp = SET_SRC (set);
30890 if (REG_P (tmp)
30891 && HARD_REGISTER_P (tmp)
30892 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
30893 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
30894 return current_sched_info->sched_max_insns_priority;
30897 return priority;
30900 /* Model decoder of Core 2/i7.
30901 The hooks below, for multipass scheduling (see haifa-sched.c:max_issue),
30902 track the instruction fetch block boundaries and make sure that long
30903 (9+ byte) instructions are assigned to D0. */
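/* Worked example with the Core 2/i7 parameters set in ix86_sched_init_global
   below (16-byte ifetch block, at most 6 insns per cycle, 8-byte limit for
   the secondary decoders): a 10-byte insn can only be taken by decoder D0,
   and once the insns chosen for the current cycle exceed 16 bytes in total,
   the remaining candidates are filtered out of ready_try. */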
30905 /* Maximum length of an insn that can be handled by
30906 a secondary decoder unit. '8' for Core 2/i7. */
30907 static int core2i7_secondary_decoder_max_insn_size;
30909 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
30910 '16' for Core 2/i7. */
30911 static int core2i7_ifetch_block_size;
30913 /* Maximum number of instructions decoder can handle per cycle.
30914 '6' for Core 2/i7. */
30915 static int core2i7_ifetch_block_max_insns;
30917 typedef struct ix86_first_cycle_multipass_data_ *
30918 ix86_first_cycle_multipass_data_t;
30919 typedef const struct ix86_first_cycle_multipass_data_ *
30920 const_ix86_first_cycle_multipass_data_t;
30922 /* A variable to store target state across calls to max_issue within
30923 one cycle. */
30924 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
30925 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
30927 /* Initialize DATA. */
30928 static void
30929 core2i7_first_cycle_multipass_init (void *_data)
30931 ix86_first_cycle_multipass_data_t data
30932 = (ix86_first_cycle_multipass_data_t) _data;
30934 data->ifetch_block_len = 0;
30935 data->ifetch_block_n_insns = 0;
30936 data->ready_try_change = NULL;
30937 data->ready_try_change_size = 0;
30940 /* Advancing the cycle; reset ifetch block counts. */
30941 static void
30942 core2i7_dfa_post_advance_cycle (void)
30944 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
30946 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
30948 data->ifetch_block_len = 0;
30949 data->ifetch_block_n_insns = 0;
30952 static int min_insn_size (rtx_insn *);
30954 /* Filter out insns from ready_try that the core will not be able to issue
30955 on the current cycle due to decoder restrictions. */
30956 static void
30957 core2i7_first_cycle_multipass_filter_ready_try
30958 (const_ix86_first_cycle_multipass_data_t data,
30959 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
30961 while (n_ready--)
30963 rtx_insn *insn;
30964 int insn_size;
30966 if (ready_try[n_ready])
30967 continue;
30969 insn = get_ready_element (n_ready);
30970 insn_size = min_insn_size (insn);
30972 if (/* If this insn is too long for a secondary decoder ... */
30973 (!first_cycle_insn_p
30974 && insn_size > core2i7_secondary_decoder_max_insn_size)
30975 /* ... or it would not fit into the ifetch block ... */
30976 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
30977 /* ... or the decoder is full already ... */
30978 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
30979 /* ... mask the insn out. */
30981 ready_try[n_ready] = 1;
30983 if (data->ready_try_change)
30984 bitmap_set_bit (data->ready_try_change, n_ready);
30989 /* Prepare for a new round of multipass lookahead scheduling. */
30990 static void
30991 core2i7_first_cycle_multipass_begin (void *_data,
30992 signed char *ready_try, int n_ready,
30993 bool first_cycle_insn_p)
30995 ix86_first_cycle_multipass_data_t data
30996 = (ix86_first_cycle_multipass_data_t) _data;
30997 const_ix86_first_cycle_multipass_data_t prev_data
30998 = ix86_first_cycle_multipass_data;
31000 /* Restore the state from the end of the previous round. */
31001 data->ifetch_block_len = prev_data->ifetch_block_len;
31002 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
31004 /* Filter instructions that cannot be issued on current cycle due to
31005 decoder restrictions. */
31006 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31007 first_cycle_insn_p);
31010 /* INSN is being issued in current solution. Account for its impact on
31011 the decoder model. */
31012 static void
31013 core2i7_first_cycle_multipass_issue (void *_data,
31014 signed char *ready_try, int n_ready,
31015 rtx_insn *insn, const void *_prev_data)
31017 ix86_first_cycle_multipass_data_t data
31018 = (ix86_first_cycle_multipass_data_t) _data;
31019 const_ix86_first_cycle_multipass_data_t prev_data
31020 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
31022 int insn_size = min_insn_size (insn);
31024 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
31025 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
31026 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
31027 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
31029 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
31030 if (!data->ready_try_change)
31032 data->ready_try_change = sbitmap_alloc (n_ready);
31033 data->ready_try_change_size = n_ready;
31035 else if (data->ready_try_change_size < n_ready)
31037 data->ready_try_change = sbitmap_resize (data->ready_try_change,
31038 n_ready, 0);
31039 data->ready_try_change_size = n_ready;
31041 bitmap_clear (data->ready_try_change);
31043 /* Filter out insns from ready_try that the core will not be able to issue
31044 on the current cycle due to decoder restrictions. */
31045 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31046 false);
31049 /* Revert the effect on ready_try. */
31050 static void
31051 core2i7_first_cycle_multipass_backtrack (const void *_data,
31052 signed char *ready_try,
31053 int n_ready ATTRIBUTE_UNUSED)
31055 const_ix86_first_cycle_multipass_data_t data
31056 = (const_ix86_first_cycle_multipass_data_t) _data;
31057 unsigned int i = 0;
31058 sbitmap_iterator sbi;
31060 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
31061 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
31063 ready_try[i] = 0;
31067 /* Save the result of multipass lookahead scheduling for the next round. */
31068 static void
31069 core2i7_first_cycle_multipass_end (const void *_data)
31071 const_ix86_first_cycle_multipass_data_t data
31072 = (const_ix86_first_cycle_multipass_data_t) _data;
31073 ix86_first_cycle_multipass_data_t next_data
31074 = ix86_first_cycle_multipass_data;
31076 if (data != NULL)
31078 next_data->ifetch_block_len = data->ifetch_block_len;
31079 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
31083 /* Deallocate target data. */
31084 static void
31085 core2i7_first_cycle_multipass_fini (void *_data)
31087 ix86_first_cycle_multipass_data_t data
31088 = (ix86_first_cycle_multipass_data_t) _data;
31090 if (data->ready_try_change)
31092 sbitmap_free (data->ready_try_change);
31093 data->ready_try_change = NULL;
31094 data->ready_try_change_size = 0;
31098 /* Prepare for scheduling pass. */
31099 static void
31100 ix86_sched_init_global (FILE *, int, int)
31102 /* Install scheduling hooks for current CPU. Some of these hooks are used
31103 in time-critical parts of the scheduler, so we only set them up when
31104 they are actually used. */
31105 switch (ix86_tune)
31107 case PROCESSOR_CORE2:
31108 case PROCESSOR_NEHALEM:
31109 case PROCESSOR_SANDYBRIDGE:
31110 case PROCESSOR_HASWELL:
31111 /* Do not perform multipass scheduling for pre-reload schedule
31112 to save compile time. */
31113 if (reload_completed)
31115 targetm.sched.dfa_post_advance_cycle
31116 = core2i7_dfa_post_advance_cycle;
31117 targetm.sched.first_cycle_multipass_init
31118 = core2i7_first_cycle_multipass_init;
31119 targetm.sched.first_cycle_multipass_begin
31120 = core2i7_first_cycle_multipass_begin;
31121 targetm.sched.first_cycle_multipass_issue
31122 = core2i7_first_cycle_multipass_issue;
31123 targetm.sched.first_cycle_multipass_backtrack
31124 = core2i7_first_cycle_multipass_backtrack;
31125 targetm.sched.first_cycle_multipass_end
31126 = core2i7_first_cycle_multipass_end;
31127 targetm.sched.first_cycle_multipass_fini
31128 = core2i7_first_cycle_multipass_fini;
31130 /* Set decoder parameters. */
31131 core2i7_secondary_decoder_max_insn_size = 8;
31132 core2i7_ifetch_block_size = 16;
31133 core2i7_ifetch_block_max_insns = 6;
31134 break;
31136 /* Fall through. */
31137 default:
31138 targetm.sched.dfa_post_advance_cycle = NULL;
31139 targetm.sched.first_cycle_multipass_init = NULL;
31140 targetm.sched.first_cycle_multipass_begin = NULL;
31141 targetm.sched.first_cycle_multipass_issue = NULL;
31142 targetm.sched.first_cycle_multipass_backtrack = NULL;
31143 targetm.sched.first_cycle_multipass_end = NULL;
31144 targetm.sched.first_cycle_multipass_fini = NULL;
31145 break;
31150 /* Compute the alignment given to a constant that is being placed in memory.
31151 EXP is the constant and ALIGN is the alignment that the object would
31152 ordinarily have.
31153 The value of this function is used instead of that alignment to align
31154 the object. */
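/* For example, a DFmode (double) constant that would ordinarily get 32-bit
   alignment is returned with 64-bit alignment below, a 128-bit vector
   constant with 128-bit alignment, and a long string constant (length of
   at least 31) with word alignment unless optimizing for size. */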
31157 ix86_constant_alignment (tree exp, int align)
31159 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
31160 || TREE_CODE (exp) == INTEGER_CST)
31162 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
31163 return 64;
31164 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
31165 return 128;
31167 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
31168 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
31169 return BITS_PER_WORD;
31171 return align;
31174 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
31175 the data type, and ALIGN is the alignment that the object would
31176 ordinarily have. */
31178 static int
31179 iamcu_alignment (tree type, int align)
31181 machine_mode mode;
31183 if (align < 32 || TYPE_USER_ALIGN (type))
31184 return align;
31186 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
31187 bytes. */
31188 mode = TYPE_MODE (strip_array_types (type));
31189 switch (GET_MODE_CLASS (mode))
31191 case MODE_INT:
31192 case MODE_COMPLEX_INT:
31193 case MODE_COMPLEX_FLOAT:
31194 case MODE_FLOAT:
31195 case MODE_DECIMAL_FLOAT:
31196 return 32;
31197 default:
31198 return align;
31202 /* Compute the alignment for a static variable.
31203 TYPE is the data type, and ALIGN is the alignment that
31204 the object would ordinarily have. The value of this function is used
31205 instead of that alignment to align the object. */
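/* Worked example (assuming the usual 64-byte prefetch block, i.e. a 512-bit
   max_align): when optimizing, a 64-byte aggregate is given cache-line
   (512-bit) alignment, a 32-byte aggregate at least the 256-bit
   GCC-4.8-compatibility alignment, and on x86-64 any array of 16 bytes or
   more at least 128-bit alignment per the psABI rule handled below. */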
31208 ix86_data_alignment (tree type, int align, bool opt)
31210 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
31211 for symbols from other compilation units or symbols that don't need
31212 to bind locally. In order to preserve some ABI compatibility with
31213 those compilers, ensure we don't decrease alignment from what we
31214 used to assume. */
31216 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
31218 /* A data structure equal to or greater than the size of a cache line
31219 (64 bytes in the Pentium 4 and other recent Intel processors, including
31220 processors based on the Intel Core microarchitecture) should be aligned
31221 so that its base address is a multiple of the cache line size. */
31223 int max_align
31224 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
31226 if (max_align < BITS_PER_WORD)
31227 max_align = BITS_PER_WORD;
31229 switch (ix86_align_data_type)
31231 case ix86_align_data_type_abi: opt = false; break;
31232 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
31233 case ix86_align_data_type_cacheline: break;
31236 if (TARGET_IAMCU)
31237 align = iamcu_alignment (type, align);
31239 if (opt
31240 && AGGREGATE_TYPE_P (type)
31241 && TYPE_SIZE (type)
31242 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
31244 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
31245 && align < max_align_compat)
31246 align = max_align_compat;
31247 if (wi::geu_p (TYPE_SIZE (type), max_align)
31248 && align < max_align)
31249 align = max_align;
31252 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
31253 to a 16-byte boundary. */
31254 if (TARGET_64BIT)
31256 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
31257 && TYPE_SIZE (type)
31258 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31259 && wi::geu_p (TYPE_SIZE (type), 128)
31260 && align < 128)
31261 return 128;
31264 if (!opt)
31265 return align;
31267 if (TREE_CODE (type) == ARRAY_TYPE)
31269 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31270 return 64;
31271 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31272 return 128;
31274 else if (TREE_CODE (type) == COMPLEX_TYPE)
31277 if (TYPE_MODE (type) == DCmode && align < 64)
31278 return 64;
31279 if ((TYPE_MODE (type) == XCmode
31280 || TYPE_MODE (type) == TCmode) && align < 128)
31281 return 128;
31283 else if ((TREE_CODE (type) == RECORD_TYPE
31284 || TREE_CODE (type) == UNION_TYPE
31285 || TREE_CODE (type) == QUAL_UNION_TYPE)
31286 && TYPE_FIELDS (type))
31288 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31289 return 64;
31290 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31291 return 128;
31293 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31294 || TREE_CODE (type) == INTEGER_TYPE)
31296 if (TYPE_MODE (type) == DFmode && align < 64)
31297 return 64;
31298 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31299 return 128;
31302 return align;
31305 /* Compute the alignment for a local variable or a stack slot. EXP is
31306 the data type or decl itself, MODE is the widest mode available and
31307 ALIGN is the alignment that the object would ordinarily have. The
31308 value of this macro is used instead of that alignment to align the
31309 object. */
31311 unsigned int
31312 ix86_local_alignment (tree exp, machine_mode mode,
31313 unsigned int align)
31315 tree type, decl;
31317 if (exp && DECL_P (exp))
31319 type = TREE_TYPE (exp);
31320 decl = exp;
31322 else
31324 type = exp;
31325 decl = NULL;
31328 /* Don't do dynamic stack realignment for long long objects with
31329 -mpreferred-stack-boundary=2. */
31330 if (!TARGET_64BIT
31331 && align == 64
31332 && ix86_preferred_stack_boundary < 64
31333 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
31334 && (!type || !TYPE_USER_ALIGN (type))
31335 && (!decl || !DECL_USER_ALIGN (decl)))
31336 align = 32;
31338 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
31339 register in MODE. We will return the larger of the XF
31340 and DF alignments. */
31341 if (!type)
31343 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
31344 align = GET_MODE_ALIGNMENT (DFmode);
31345 return align;
31348 /* Don't increase alignment for Intel MCU psABI. */
31349 if (TARGET_IAMCU)
31350 return align;
31352 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
31353 to a 16-byte boundary. The exact wording is:
31355 An array uses the same alignment as its elements, except that a local or
31356 global array variable of length at least 16 bytes or
31357 a C99 variable-length array variable always has alignment of at least 16 bytes.
31359 This was added to allow use of aligned SSE instructions on arrays. The
31360 rule is meant for static storage (where the compiler cannot do the analysis
31361 by itself); we follow it for automatic variables only when convenient.
31362 We fully control everything in the function being compiled, and functions
31363 from other units cannot rely on the alignment.
31365 Exclude the va_list type. It is the common case of a local array where
31366 we cannot benefit from the alignment.
31368 TODO: Probably one should optimize for size only when the variable does not escape. */
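   /* For example, a local "double buf[4]" (32 bytes) is given 128-bit
      alignment by the check below when compiling for x86-64 with SSE and
      optimizing for speed, while a local va_list variable is not. */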
31369 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
31370 && TARGET_SSE)
31372 if (AGGREGATE_TYPE_P (type)
31373 && (va_list_type_node == NULL_TREE
31374 || (TYPE_MAIN_VARIANT (type)
31375 != TYPE_MAIN_VARIANT (va_list_type_node)))
31376 && TYPE_SIZE (type)
31377 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31378 && wi::geu_p (TYPE_SIZE (type), 128)
31379 && align < 128)
31380 return 128;
31382 if (TREE_CODE (type) == ARRAY_TYPE)
31384 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31385 return 64;
31386 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31387 return 128;
31389 else if (TREE_CODE (type) == COMPLEX_TYPE)
31391 if (TYPE_MODE (type) == DCmode && align < 64)
31392 return 64;
31393 if ((TYPE_MODE (type) == XCmode
31394 || TYPE_MODE (type) == TCmode) && align < 128)
31395 return 128;
31397 else if ((TREE_CODE (type) == RECORD_TYPE
31398 || TREE_CODE (type) == UNION_TYPE
31399 || TREE_CODE (type) == QUAL_UNION_TYPE)
31400 && TYPE_FIELDS (type))
31402 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31403 return 64;
31404 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31405 return 128;
31407 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31408 || TREE_CODE (type) == INTEGER_TYPE)
31411 if (TYPE_MODE (type) == DFmode && align < 64)
31412 return 64;
31413 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31414 return 128;
31416 return align;
31419 /* Compute the minimum required alignment for dynamic stack realignment
31420 purposes for a local variable, parameter or a stack slot. EXP is
31421 the data type or decl itself, MODE is its mode and ALIGN is the
31422 alignment that the object would ordinarily have. */
31424 unsigned int
31425 ix86_minimum_alignment (tree exp, machine_mode mode,
31426 unsigned int align)
31428 tree type, decl;
31430 if (exp && DECL_P (exp))
31432 type = TREE_TYPE (exp);
31433 decl = exp;
31435 else
31437 type = exp;
31438 decl = NULL;
31441 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
31442 return align;
31444 /* Don't do dynamic stack realignment for long long objects with
31445 -mpreferred-stack-boundary=2. */
31446 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
31447 && (!type || !TYPE_USER_ALIGN (type))
31448 && (!decl || !DECL_USER_ALIGN (decl)))
31450 gcc_checking_assert (!TARGET_STV);
31451 return 32;
31454 return align;
31457 /* Find a location for the static chain incoming to a nested function.
31458 This is a register, unless all free registers are used by arguments. */
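/* In summary (see the code below): R10 in 64-bit mode; ECX for the default
   32-bit conventions; EAX for fastcall and thiscall; and for regparm(3),
   where no call-clobbered register is free, the chain travels through the
   stack and ESI as described in the comments below. */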
31460 static rtx
31461 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
31463 unsigned regno;
31465 /* While this function won't be called by the middle-end when a static
31466 chain isn't needed, it's also used throughout the backend so it's
31467 easiest to keep this check centralized. */
31468 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
31469 return NULL;
31471 if (TARGET_64BIT)
31473 /* We always use R10 in 64-bit mode. */
31474 regno = R10_REG;
31476 else
31478 const_tree fntype, fndecl;
31479 unsigned int ccvt;
31481 /* By default in 32-bit mode we use ECX to pass the static chain. */
31482 regno = CX_REG;
31484 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
31486 fntype = TREE_TYPE (fndecl_or_type);
31487 fndecl = fndecl_or_type;
31489 else
31491 fntype = fndecl_or_type;
31492 fndecl = NULL;
31495 ccvt = ix86_get_callcvt (fntype);
31496 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31498 /* Fastcall functions use ecx/edx for arguments, which leaves
31499 us with EAX for the static chain.
31500 Thiscall functions use ecx for arguments, which also
31501 leaves us with EAX for the static chain. */
31502 regno = AX_REG;
31504 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31506 /* Thiscall functions use ecx for arguments, which leaves
31507 us with EAX and EDX for the static chain.
31508 We use EAX for ABI compatibility. */
31509 regno = AX_REG;
31511 else if (ix86_function_regparm (fntype, fndecl) == 3)
31513 /* For regparm 3, we have no free call-clobbered registers in
31514 which to store the static chain. In order to implement this,
31515 we have the trampoline push the static chain to the stack.
31516 However, we can't push a value below the return address when
31517 we call the nested function directly, so we have to use an
31518 alternate entry point. For this we use ESI, and have the
31519 alternate entry point push ESI, so that things appear the
31520 same once we're executing the nested function. */
31521 if (incoming_p)
31523 if (fndecl == current_function_decl
31524 && !ix86_static_chain_on_stack)
31526 gcc_assert (!reload_completed);
31527 ix86_static_chain_on_stack = true;
31529 return gen_frame_mem (SImode,
31530 plus_constant (Pmode,
31531 arg_pointer_rtx, -8));
31533 regno = SI_REG;
31537 return gen_rtx_REG (Pmode, regno);
31540 /* Emit RTL insns to initialize the variable parts of a trampoline.
31541 FNDECL is the decl of the target address; M_TRAMP is a MEM for
31542 the trampoline, and CHAIN_VALUE is an RTX for the static chain
31543 to be passed to the target function. */
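/* As an illustration, the 64-bit trampoline emitted below (assuming
   ptr_mode == DImode and a function address that does not fit the shorter
   movl form) is the 24-byte sequence

       49 bb <8-byte fnaddr>   movabs $FNADDR,      %r11
       49 ba <8-byte chain>    movabs $CHAIN_VALUE, %r10
       49 ff e3                jmp    *%r11
       90                      nop

   matching the 0xbb49, 0xba49 and 0x90e3ff49 constants stored as
   little-endian words further down. */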
31545 static void
31546 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
31548 rtx mem, fnaddr;
31549 int opcode;
31550 int offset = 0;
31552 fnaddr = XEXP (DECL_RTL (fndecl), 0);
31554 if (TARGET_64BIT)
31556 int size;
31558 /* Load the function address into r11. Try to load the address using
31559 the shorter movl instead of movabs. We may want to support
31560 movq for kernel mode, but the kernel does not use trampolines at
31561 the moment. FNADDR is a 32-bit address and may not be in
31562 DImode when ptr_mode == SImode. Always use movl in this
31563 case. */
31564 if (ptr_mode == SImode
31565 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
31567 fnaddr = copy_addr_to_reg (fnaddr);
31569 mem = adjust_address (m_tramp, HImode, offset);
31570 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
31572 mem = adjust_address (m_tramp, SImode, offset + 2);
31573 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
31574 offset += 6;
31576 else
31578 mem = adjust_address (m_tramp, HImode, offset);
31579 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
31581 mem = adjust_address (m_tramp, DImode, offset + 2);
31582 emit_move_insn (mem, fnaddr);
31583 offset += 10;
31586 /* Load the static chain into r10 using movabs. Use the shorter movl
31587 instead of movabs when ptr_mode == SImode. */
31588 if (ptr_mode == SImode)
31590 opcode = 0xba41;
31591 size = 6;
31593 else
31595 opcode = 0xba49;
31596 size = 10;
31599 mem = adjust_address (m_tramp, HImode, offset);
31600 emit_move_insn (mem, gen_int_mode (opcode, HImode));
31602 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
31603 emit_move_insn (mem, chain_value);
31604 offset += size;
31606 /* Jump to r11; the last (unused) byte is a nop, only there to
31607 pad the write out to a single 32-bit store. */
31608 mem = adjust_address (m_tramp, SImode, offset);
31609 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
31610 offset += 4;
31612 else
31614 rtx disp, chain;
31616 /* Depending on the static chain location, either load a register
31617 with a constant, or push the constant to the stack. All of the
31618 instructions are the same size. */
31619 chain = ix86_static_chain (fndecl, true);
31620 if (REG_P (chain))
31622 switch (REGNO (chain))
31624 case AX_REG:
31625 opcode = 0xb8; break;
31626 case CX_REG:
31627 opcode = 0xb9; break;
31628 default:
31629 gcc_unreachable ();
31632 else
31633 opcode = 0x68;
31635 mem = adjust_address (m_tramp, QImode, offset);
31636 emit_move_insn (mem, gen_int_mode (opcode, QImode));
31638 mem = adjust_address (m_tramp, SImode, offset + 1);
31639 emit_move_insn (mem, chain_value);
31640 offset += 5;
31642 mem = adjust_address (m_tramp, QImode, offset);
31643 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
31645 mem = adjust_address (m_tramp, SImode, offset + 1);
31647 /* Compute offset from the end of the jmp to the target function.
31648 In the case in which the trampoline stores the static chain on
31649 the stack, we need to skip the first insn which pushes the
31650 (call-saved) register static chain; this push is 1 byte. */
31651 offset += 5;
31652 disp = expand_binop (SImode, sub_optab, fnaddr,
31653 plus_constant (Pmode, XEXP (m_tramp, 0),
31654 offset - (MEM_P (chain) ? 1 : 0)),
31655 NULL_RTX, 1, OPTAB_DIRECT);
31656 emit_move_insn (mem, disp);
31659 gcc_assert (offset <= TRAMPOLINE_SIZE);
31661 #ifdef HAVE_ENABLE_EXECUTE_STACK
31662 #ifdef CHECK_EXECUTE_STACK_ENABLED
31663 if (CHECK_EXECUTE_STACK_ENABLED)
31664 #endif
31665 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
31666 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
31667 #endif
31670 /* The following file contains several enumerations and data structures
31671 built from the definitions in i386-builtin-types.def. */
31673 #include "i386-builtin-types.inc"
31675 /* Table for the ix86 builtin non-function types. */
31676 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
31678 /* Retrieve an element from the above table, building some of
31679 the types lazily. */
31681 static tree
31682 ix86_get_builtin_type (enum ix86_builtin_type tcode)
31684 unsigned int index;
31685 tree type, itype;
31687 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
31689 type = ix86_builtin_type_tab[(int) tcode];
31690 if (type != NULL)
31691 return type;
31693 gcc_assert (tcode > IX86_BT_LAST_PRIM);
31694 if (tcode <= IX86_BT_LAST_VECT)
31696 machine_mode mode;
31698 index = tcode - IX86_BT_LAST_PRIM - 1;
31699 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
31700 mode = ix86_builtin_type_vect_mode[index];
31702 type = build_vector_type_for_mode (itype, mode);
31704 else
31706 int quals;
31708 index = tcode - IX86_BT_LAST_VECT - 1;
31709 if (tcode <= IX86_BT_LAST_PTR)
31710 quals = TYPE_UNQUALIFIED;
31711 else
31712 quals = TYPE_QUAL_CONST;
31714 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
31715 if (quals != TYPE_UNQUALIFIED)
31716 itype = build_qualified_type (itype, quals);
31718 type = build_pointer_type (itype);
31721 ix86_builtin_type_tab[(int) tcode] = type;
31722 return type;
31725 /* Table for the ix86 builtin function types. */
31726 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
31728 /* Retrieve an element from the above table, building some of
31729 the types lazily. */
31731 static tree
31732 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
31734 tree type;
31736 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
31738 type = ix86_builtin_func_type_tab[(int) tcode];
31739 if (type != NULL)
31740 return type;
31742 if (tcode <= IX86_BT_LAST_FUNC)
31744 unsigned start = ix86_builtin_func_start[(int) tcode];
31745 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
31746 tree rtype, atype, args = void_list_node;
31747 unsigned i;
31749 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
31750 for (i = after - 1; i > start; --i)
31752 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
31753 args = tree_cons (NULL, atype, args);
31756 type = build_function_type (rtype, args);
31758 else
31760 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
31761 enum ix86_builtin_func_type icode;
31763 icode = ix86_builtin_func_alias_base[index];
31764 type = ix86_get_builtin_func_type (icode);
31767 ix86_builtin_func_type_tab[(int) tcode] = type;
31768 return type;
31772 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
31773 bdesc_* arrays below should come first, then builtins for each bdesc_*
31774 array in ascending order, so that we can use direct array accesses. */
31775 enum ix86_builtins
31777 IX86_BUILTIN_MASKMOVQ,
31778 IX86_BUILTIN_LDMXCSR,
31779 IX86_BUILTIN_STMXCSR,
31780 IX86_BUILTIN_MASKMOVDQU,
31781 IX86_BUILTIN_PSLLDQ128,
31782 IX86_BUILTIN_CLFLUSH,
31783 IX86_BUILTIN_MONITOR,
31784 IX86_BUILTIN_MWAIT,
31785 IX86_BUILTIN_CLZERO,
31786 IX86_BUILTIN_VEC_INIT_V2SI,
31787 IX86_BUILTIN_VEC_INIT_V4HI,
31788 IX86_BUILTIN_VEC_INIT_V8QI,
31789 IX86_BUILTIN_VEC_EXT_V2DF,
31790 IX86_BUILTIN_VEC_EXT_V2DI,
31791 IX86_BUILTIN_VEC_EXT_V4SF,
31792 IX86_BUILTIN_VEC_EXT_V4SI,
31793 IX86_BUILTIN_VEC_EXT_V8HI,
31794 IX86_BUILTIN_VEC_EXT_V2SI,
31795 IX86_BUILTIN_VEC_EXT_V4HI,
31796 IX86_BUILTIN_VEC_EXT_V16QI,
31797 IX86_BUILTIN_VEC_SET_V2DI,
31798 IX86_BUILTIN_VEC_SET_V4SF,
31799 IX86_BUILTIN_VEC_SET_V4SI,
31800 IX86_BUILTIN_VEC_SET_V8HI,
31801 IX86_BUILTIN_VEC_SET_V4HI,
31802 IX86_BUILTIN_VEC_SET_V16QI,
31803 IX86_BUILTIN_GATHERSIV2DF,
31804 IX86_BUILTIN_GATHERSIV4DF,
31805 IX86_BUILTIN_GATHERDIV2DF,
31806 IX86_BUILTIN_GATHERDIV4DF,
31807 IX86_BUILTIN_GATHERSIV4SF,
31808 IX86_BUILTIN_GATHERSIV8SF,
31809 IX86_BUILTIN_GATHERDIV4SF,
31810 IX86_BUILTIN_GATHERDIV8SF,
31811 IX86_BUILTIN_GATHERSIV2DI,
31812 IX86_BUILTIN_GATHERSIV4DI,
31813 IX86_BUILTIN_GATHERDIV2DI,
31814 IX86_BUILTIN_GATHERDIV4DI,
31815 IX86_BUILTIN_GATHERSIV4SI,
31816 IX86_BUILTIN_GATHERSIV8SI,
31817 IX86_BUILTIN_GATHERDIV4SI,
31818 IX86_BUILTIN_GATHERDIV8SI,
31819 IX86_BUILTIN_VFMSUBSD3_MASK3,
31820 IX86_BUILTIN_VFMSUBSS3_MASK3,
31821 IX86_BUILTIN_GATHER3SIV8SF,
31822 IX86_BUILTIN_GATHER3SIV4SF,
31823 IX86_BUILTIN_GATHER3SIV4DF,
31824 IX86_BUILTIN_GATHER3SIV2DF,
31825 IX86_BUILTIN_GATHER3DIV8SF,
31826 IX86_BUILTIN_GATHER3DIV4SF,
31827 IX86_BUILTIN_GATHER3DIV4DF,
31828 IX86_BUILTIN_GATHER3DIV2DF,
31829 IX86_BUILTIN_GATHER3SIV8SI,
31830 IX86_BUILTIN_GATHER3SIV4SI,
31831 IX86_BUILTIN_GATHER3SIV4DI,
31832 IX86_BUILTIN_GATHER3SIV2DI,
31833 IX86_BUILTIN_GATHER3DIV8SI,
31834 IX86_BUILTIN_GATHER3DIV4SI,
31835 IX86_BUILTIN_GATHER3DIV4DI,
31836 IX86_BUILTIN_GATHER3DIV2DI,
31837 IX86_BUILTIN_SCATTERSIV8SF,
31838 IX86_BUILTIN_SCATTERSIV4SF,
31839 IX86_BUILTIN_SCATTERSIV4DF,
31840 IX86_BUILTIN_SCATTERSIV2DF,
31841 IX86_BUILTIN_SCATTERDIV8SF,
31842 IX86_BUILTIN_SCATTERDIV4SF,
31843 IX86_BUILTIN_SCATTERDIV4DF,
31844 IX86_BUILTIN_SCATTERDIV2DF,
31845 IX86_BUILTIN_SCATTERSIV8SI,
31846 IX86_BUILTIN_SCATTERSIV4SI,
31847 IX86_BUILTIN_SCATTERSIV4DI,
31848 IX86_BUILTIN_SCATTERSIV2DI,
31849 IX86_BUILTIN_SCATTERDIV8SI,
31850 IX86_BUILTIN_SCATTERDIV4SI,
31851 IX86_BUILTIN_SCATTERDIV4DI,
31852 IX86_BUILTIN_SCATTERDIV2DI,
31853 /* Alternate 4 and 8 element gather/scatter for the vectorizer
31854 where all operands are 32-byte or 64-byte wide respectively. */
31855 IX86_BUILTIN_GATHERALTSIV4DF,
31856 IX86_BUILTIN_GATHERALTDIV8SF,
31857 IX86_BUILTIN_GATHERALTSIV4DI,
31858 IX86_BUILTIN_GATHERALTDIV8SI,
31859 IX86_BUILTIN_GATHER3ALTDIV16SF,
31860 IX86_BUILTIN_GATHER3ALTDIV16SI,
31861 IX86_BUILTIN_GATHER3ALTSIV4DF,
31862 IX86_BUILTIN_GATHER3ALTDIV8SF,
31863 IX86_BUILTIN_GATHER3ALTSIV4DI,
31864 IX86_BUILTIN_GATHER3ALTDIV8SI,
31865 IX86_BUILTIN_GATHER3ALTSIV8DF,
31866 IX86_BUILTIN_GATHER3ALTSIV8DI,
31867 IX86_BUILTIN_GATHER3DIV16SF,
31868 IX86_BUILTIN_GATHER3DIV16SI,
31869 IX86_BUILTIN_GATHER3DIV8DF,
31870 IX86_BUILTIN_GATHER3DIV8DI,
31871 IX86_BUILTIN_GATHER3SIV16SF,
31872 IX86_BUILTIN_GATHER3SIV16SI,
31873 IX86_BUILTIN_GATHER3SIV8DF,
31874 IX86_BUILTIN_GATHER3SIV8DI,
31875 IX86_BUILTIN_SCATTERALTSIV8DF,
31876 IX86_BUILTIN_SCATTERALTDIV16SF,
31877 IX86_BUILTIN_SCATTERALTSIV8DI,
31878 IX86_BUILTIN_SCATTERALTDIV16SI,
31879 IX86_BUILTIN_SCATTERDIV16SF,
31880 IX86_BUILTIN_SCATTERDIV16SI,
31881 IX86_BUILTIN_SCATTERDIV8DF,
31882 IX86_BUILTIN_SCATTERDIV8DI,
31883 IX86_BUILTIN_SCATTERSIV16SF,
31884 IX86_BUILTIN_SCATTERSIV16SI,
31885 IX86_BUILTIN_SCATTERSIV8DF,
31886 IX86_BUILTIN_SCATTERSIV8DI,
31887 IX86_BUILTIN_GATHERPFQPD,
31888 IX86_BUILTIN_GATHERPFDPS,
31889 IX86_BUILTIN_GATHERPFDPD,
31890 IX86_BUILTIN_GATHERPFQPS,
31891 IX86_BUILTIN_SCATTERPFDPD,
31892 IX86_BUILTIN_SCATTERPFDPS,
31893 IX86_BUILTIN_SCATTERPFQPD,
31894 IX86_BUILTIN_SCATTERPFQPS,
31895 IX86_BUILTIN_CLWB,
31896 IX86_BUILTIN_CLFLUSHOPT,
31897 IX86_BUILTIN_INFQ,
31898 IX86_BUILTIN_HUGE_VALQ,
31899 IX86_BUILTIN_NANQ,
31900 IX86_BUILTIN_NANSQ,
31901 IX86_BUILTIN_XABORT,
31902 IX86_BUILTIN_ADDCARRYX32,
31903 IX86_BUILTIN_ADDCARRYX64,
31904 IX86_BUILTIN_SBB32,
31905 IX86_BUILTIN_SBB64,
31906 IX86_BUILTIN_RDRAND16_STEP,
31907 IX86_BUILTIN_RDRAND32_STEP,
31908 IX86_BUILTIN_RDRAND64_STEP,
31909 IX86_BUILTIN_RDSEED16_STEP,
31910 IX86_BUILTIN_RDSEED32_STEP,
31911 IX86_BUILTIN_RDSEED64_STEP,
31912 IX86_BUILTIN_MONITORX,
31913 IX86_BUILTIN_MWAITX,
31914 IX86_BUILTIN_CFSTRING,
31915 IX86_BUILTIN_CPU_INIT,
31916 IX86_BUILTIN_CPU_IS,
31917 IX86_BUILTIN_CPU_SUPPORTS,
31918 IX86_BUILTIN_READ_FLAGS,
31919 IX86_BUILTIN_WRITE_FLAGS,
31921 /* All the remaining builtins are tracked in bdesc_* arrays in
31922 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
31923 this point. */
31924 #define BDESC(mask, icode, name, code, comparison, flag) \
31925 code,
31926 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31927 code, \
31928 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
31929 #define BDESC_END(kind, next_kind)
31931 #include "i386-builtin.def"
31933 #undef BDESC
31934 #undef BDESC_FIRST
31935 #undef BDESC_END
31937 IX86_BUILTIN_MAX,
31939 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
31941 /* Now just the aliases for bdesc_* start/end. */
31942 #define BDESC(mask, icode, name, code, comparison, flag)
31943 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
31944 #define BDESC_END(kind, next_kind) \
31945 IX86_BUILTIN__BDESC_##kind##_LAST \
31946 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
31948 #include "i386-builtin.def"
31950 #undef BDESC
31951 #undef BDESC_FIRST
31952 #undef BDESC_END
31954 /* Just to make sure there is no comma after the last enumerator. */
31955 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
31958 /* Table for the ix86 builtin decls. */
31959 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
31961 /* Table of all the builtin functions that are possible with different ISAs
31962 but are waiting to be built until a function is declared to use that
31963 ISA. */
31964 struct builtin_isa {
31965 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
31966 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
31967 const char *name; /* function name */
31968 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
31969 unsigned char const_p:1; /* true if the declaration is constant */
31970 unsigned char pure_p:1; /* true if the declaration has pure attribute */
31971 bool leaf_p; /* true if the declaration has leaf attribute */
31972 bool nothrow_p; /* true if the declaration has nothrow attribute */
31973 bool set_and_not_built_p;
31976 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
31978 /* Bits that can still enable any inclusion of a builtin. */
31979 static HOST_WIDE_INT deferred_isa_values = 0;
31980 static HOST_WIDE_INT deferred_isa_values2 = 0;
31982 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
31983 of which isa_flags to use in the ix86_builtins_isa array. Stores the
31984 function decl in the ix86_builtins array. Returns the function decl or
31985 NULL_TREE, if the builtin was not added.
31987 If the front end has a special hook for builtin functions, delay adding
31988 builtin functions that aren't in the current ISA until the ISA is changed
31989 with function-specific optimization. Doing so can save about 300K for the
31990 default compiler. When the builtin is expanded, check at that time whether
31991 it is valid.
31993 If the front end doesn't have a special hook, record all builtins, even if
31994 they aren't in the current ISA, in case the user uses
31995 function specific options for a different ISA, so that we don't get scope
31996 errors if a builtin is added in the middle of a function scope. */
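/* Illustrative flow: with a front end whose builtin_function hook differs
   from builtin_function_ext_scope (the C family), a builtin registered with
   mask OPTION_MASK_ISA_AVX2 while compiling without -mavx2 is only recorded
   in ix86_builtins_isa and its mask accumulated in deferred_isa_values;
   ix86_add_new_builtins later creates the real declaration if AVX2 is
   enabled via the target attribute or pragma. */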
31998 static inline tree
31999 def_builtin (HOST_WIDE_INT mask, const char *name,
32000 enum ix86_builtin_func_type tcode,
32001 enum ix86_builtins code)
32003 tree decl = NULL_TREE;
32005 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
32007 ix86_builtins_isa[(int) code].isa = mask;
32009 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
32010 where any set bit means the built-in is enabled, this bit must be *and-ed*
32011 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
32012 means that *both* cpuid bits must be set for the built-in to be available.
32013 Handle this here. */
32014 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32015 mask &= ~OPTION_MASK_ISA_AVX512VL;
32017 mask &= ~OPTION_MASK_ISA_64BIT;
32018 if (mask == 0
32019 || (mask & ix86_isa_flags) != 0
32020 || (lang_hooks.builtin_function
32021 == lang_hooks.builtin_function_ext_scope))
32024 tree type = ix86_get_builtin_func_type (tcode);
32025 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32026 NULL, NULL_TREE);
32027 ix86_builtins[(int) code] = decl;
32028 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32030 else
32032 /* Just a MASK where set_and_not_built_p == true can potentially
32033 include a builtin. */
32034 deferred_isa_values |= mask;
32035 ix86_builtins[(int) code] = NULL_TREE;
32036 ix86_builtins_isa[(int) code].tcode = tcode;
32037 ix86_builtins_isa[(int) code].name = name;
32038 ix86_builtins_isa[(int) code].leaf_p = false;
32039 ix86_builtins_isa[(int) code].nothrow_p = false;
32040 ix86_builtins_isa[(int) code].const_p = false;
32041 ix86_builtins_isa[(int) code].pure_p = false;
32042 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32046 return decl;
32049 /* Like def_builtin, but also marks the function decl "const". */
32051 static inline tree
32052 def_builtin_const (HOST_WIDE_INT mask, const char *name,
32053 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32055 tree decl = def_builtin (mask, name, tcode, code);
32056 if (decl)
32057 TREE_READONLY (decl) = 1;
32058 else
32059 ix86_builtins_isa[(int) code].const_p = true;
32061 return decl;
32064 /* Like def_builtin, but also marks the function decl "pure". */
32066 static inline tree
32067 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
32068 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32070 tree decl = def_builtin (mask, name, tcode, code);
32071 if (decl)
32072 DECL_PURE_P (decl) = 1;
32073 else
32074 ix86_builtins_isa[(int) code].pure_p = true;
32076 return decl;
32079 /* Like def_builtin, but for additional isa2 flags. */
32081 static inline tree
32082 def_builtin2 (HOST_WIDE_INT mask, const char *name,
32083 enum ix86_builtin_func_type tcode,
32084 enum ix86_builtins code)
32086 tree decl = NULL_TREE;
32088 ix86_builtins_isa[(int) code].isa2 = mask;
32090 if (mask == 0
32091 || (mask & ix86_isa_flags2) != 0
32092 || (lang_hooks.builtin_function
32093 == lang_hooks.builtin_function_ext_scope))
32096 tree type = ix86_get_builtin_func_type (tcode);
32097 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32098 NULL, NULL_TREE);
32099 ix86_builtins[(int) code] = decl;
32100 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32102 else
32104 /* Just a MASK where set_and_not_built_p == true can potentially
32105 include a builtin. */
32106 deferred_isa_values2 |= mask;
32107 ix86_builtins[(int) code] = NULL_TREE;
32108 ix86_builtins_isa[(int) code].tcode = tcode;
32109 ix86_builtins_isa[(int) code].name = name;
32110 ix86_builtins_isa[(int) code].leaf_p = false;
32111 ix86_builtins_isa[(int) code].nothrow_p = false;
32112 ix86_builtins_isa[(int) code].const_p = false;
32113 ix86_builtins_isa[(int) code].pure_p = false;
32114 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32117 return decl;
32120 /* Like def_builtin2, but also marks the function decl "const".  */
32122 static inline tree
32123 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
32124 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32126 tree decl = def_builtin2 (mask, name, tcode, code);
32127 if (decl)
32128 TREE_READONLY (decl) = 1;
32129 else
32130 ix86_builtins_isa[(int) code].const_p = true;
32132 return decl;
32135 /* Like def_builtin2, but also marks the function decl "pure".  */
32137 static inline tree
32138 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
32139 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32141 tree decl = def_builtin2 (mask, name, tcode, code);
32142 if (decl)
32143 DECL_PURE_P (decl) = 1;
32144 else
32145 ix86_builtins_isa[(int) code].pure_p = true;
32147 return decl;
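/* The *2 variants record their requirements in the separate isa2 word,
   checked against ix86_isa_flags2 rather than ix86_isa_flags; presumably
   this second word exists because the original HOST_WIDE_INT mask ran out
   of bits for newer ISA extensions (an assumption, not stated here).  */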
32150 /* Add any new builtin functions for a given ISA that may not have been
32151 declared. This saves a bit of space compared to adding all of the
32152 declarations to the tree, even if we didn't use them. */
32154 static void
32155 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
32157 if ((isa & deferred_isa_values) == 0
32158 && (isa2 & deferred_isa_values2) == 0)
32159 return;
32161 /* Bits in ISA value can be removed from potential isa values. */
32162 deferred_isa_values &= ~isa;
32163 deferred_isa_values2 &= ~isa2;
32165 int i;
32166 tree saved_current_target_pragma = current_target_pragma;
32167 current_target_pragma = NULL_TREE;
32169 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
32171 if (((ix86_builtins_isa[i].isa & isa) != 0
32172 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
32173 && ix86_builtins_isa[i].set_and_not_built_p)
32175 tree decl, type;
32177 /* Don't define the builtin again. */
32178 ix86_builtins_isa[i].set_and_not_built_p = false;
32180 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
32181 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
32182 type, i, BUILT_IN_MD, NULL,
32183 NULL_TREE);
32185 ix86_builtins[i] = decl;
32186 if (ix86_builtins_isa[i].const_p)
32187 TREE_READONLY (decl) = 1;
32188 if (ix86_builtins_isa[i].pure_p)
32189 DECL_PURE_P (decl) = 1;
32190 if (ix86_builtins_isa[i].leaf_p)
32191 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32192 NULL_TREE);
32193 if (ix86_builtins_isa[i].nothrow_p)
32194 TREE_NOTHROW (decl) = 1;
32198 current_target_pragma = saved_current_target_pragma;
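/* Sketch of the deferral flow (illustrative): if the command line lacks
   -mavx2 but a function is compiled with __attribute__ ((target ("avx2"))),
   def_builtin leaves the AVX2 builtins in ix86_builtins_isa with
   set_and_not_built_p == true; when the attribute is processed,
   ix86_add_new_builtins is called with the newly enabled isa/isa2 bits and
   declares the still-missing decls via add_builtin_function_ext_scope.  */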
32201 /* Bits for builtin_description.flag. */
32203 /* Set when we don't support the comparison natively, and should
32204 swap_comparison in order to support it. */
32205 #define BUILTIN_DESC_SWAP_OPERANDS 1
32207 struct builtin_description
32209 const HOST_WIDE_INT mask;
32210 const enum insn_code icode;
32211 const char *const name;
32212 const enum ix86_builtins code;
32213 const enum rtx_code comparison;
32214 const int flag;
32217 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
32218 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
32219 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
32220 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
32221 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
32222 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
32223 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
32224 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
32225 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
32226 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
32227 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
32228 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
32229 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
32230 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
32231 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
32232 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
32233 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
32234 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
32235 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
32236 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
32237 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
32238 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
32239 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
32240 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
32241 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
32242 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
32243 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
32244 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
32245 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
32246 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
32247 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
32248 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
32249 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
32250 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
32251 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
32252 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
32253 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
32254 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
32255 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
32256 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
32257 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
32258 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
32259 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
32260 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
32261 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
32262 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
32263 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
32264 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
32265 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
32266 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
32267 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
32268 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
32270 #define BDESC(mask, icode, name, code, comparison, flag) \
32271 { mask, icode, name, code, comparison, flag },
32272 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
32273 static const struct builtin_description bdesc_##kind[] = \
32275 BDESC (mask, icode, name, code, comparison, flag)
32276 #define BDESC_END(kind, next_kind) \
32279 #include "i386-builtin.def"
32281 #undef BDESC
32282 #undef BDESC_FIRST
32283 #undef BDESC_END
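/* Illustrative only (hypothetical entry): a line in i386-builtin.def such as
     BDESC (OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_example",
	    IX86_BUILTIN_EXAMPLE, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI)
   expands, via the macros above, to one initializer in the corresponding
   bdesc_<kind>[] table; BDESC_FIRST additionally opens the table definition
   and BDESC_END closes it.  */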
32285 /* TM vector builtins. */
32287 /* Reuse the existing x86-specific `struct builtin_description' because
32288    we're lazy.  Add casts to make them fit.  */
32289 static const struct builtin_description bdesc_tm[] =
32291 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32292 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32293 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32294 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32295 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32296 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32297 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32299 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32300 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32301 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32302 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32303 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32304 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32305 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32307 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32308 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32309 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32310 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32311 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32312 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32313 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32315 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
32316 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
32317 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
32320 /* Initialize the transactional memory vector load/store builtins. */
32322 static void
32323 ix86_init_tm_builtins (void)
32325 enum ix86_builtin_func_type ftype;
32326 const struct builtin_description *d;
32327 size_t i;
32328 tree decl;
32329 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
32330 tree attrs_log, attrs_type_log;
32332 if (!flag_tm)
32333 return;
32335 /* If there are no builtins defined, we must be compiling in a
32336 language without trans-mem support. */
32337 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
32338 return;
32340 /* Use whatever attributes a normal TM load has. */
32341 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
32342 attrs_load = DECL_ATTRIBUTES (decl);
32343 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32344 /* Use whatever attributes a normal TM store has. */
32345 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
32346 attrs_store = DECL_ATTRIBUTES (decl);
32347 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32348 /* Use whatever attributes a normal TM log has. */
32349 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
32350 attrs_log = DECL_ATTRIBUTES (decl);
32351 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32353 for (i = 0, d = bdesc_tm;
32354 i < ARRAY_SIZE (bdesc_tm);
32355 i++, d++)
32357 if ((d->mask & ix86_isa_flags) != 0
32358 || (lang_hooks.builtin_function
32359 == lang_hooks.builtin_function_ext_scope))
32361 tree type, attrs, attrs_type;
32362 enum built_in_function code = (enum built_in_function) d->code;
32364 ftype = (enum ix86_builtin_func_type) d->flag;
32365 type = ix86_get_builtin_func_type (ftype);
32367 if (BUILTIN_TM_LOAD_P (code))
32369 attrs = attrs_load;
32370 attrs_type = attrs_type_load;
32372 else if (BUILTIN_TM_STORE_P (code))
32374 attrs = attrs_store;
32375 attrs_type = attrs_type_store;
32377 else
32379 attrs = attrs_log;
32380 attrs_type = attrs_type_log;
32382 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
32383 /* The builtin without the prefix for
32384 calling it directly. */
32385 d->name + strlen ("__builtin_"),
32386 attrs);
32387 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
32388 set the TYPE_ATTRIBUTES. */
32389 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
32391 set_builtin_decl (code, decl, false);
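/* Note that each TM builtin is registered under its d->name (e.g.
   "__builtin__ITM_WM64") and, via the library-name argument above, also
   under the same string with the "__builtin_" prefix stripped
   ("_ITM_WM64"), so the _ITM_* runtime entry points can be called
   directly and still resolve to the same decl.  */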
32396 /* Macros for verification of enum ix86_builtins order. */
32397 #define BDESC_VERIFY(x, y, z) \
32398 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
32399 #define BDESC_VERIFYS(x, y, z) \
32400 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
32402 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32403 IX86_BUILTIN__BDESC_COMI_LAST, 1);
32404 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32405 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
32406 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32407 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
32408 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
32409 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
32410 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32411 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
32412 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
32413 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
32414 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
32415 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
32416 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32417 IX86_BUILTIN__BDESC_MPX_LAST, 1);
32418 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32419 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
32420 BDESC_VERIFYS (IX86_BUILTIN_MAX,
32421 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
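/* In other words, these static assertions check that each table's
   IX86_BUILTIN__BDESC_*_FIRST marker immediately follows the previous
   table's *_LAST marker, i.e. that the layout of enum ix86_builtins matches
   the order in which the bdesc_* tables are emitted from
   i386-builtin.def.  */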
32423 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
32424    not in the current target ISA, so that the user can compile particular
32425    modules with target-specific options that differ from the command-line
32426    options.  */
32427 static void
32428 ix86_init_mmx_sse_builtins (void)
32430 const struct builtin_description * d;
32431 enum ix86_builtin_func_type ftype;
32432 size_t i;
32434 /* Add all special builtins with variable number of operands. */
32435 for (i = 0, d = bdesc_special_args;
32436 i < ARRAY_SIZE (bdesc_special_args);
32437 i++, d++)
32439 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
32440 if (d->name == 0)
32441 continue;
32443 ftype = (enum ix86_builtin_func_type) d->flag;
32444 def_builtin (d->mask, d->name, ftype, d->code);
32446 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
32447 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32448 ARRAY_SIZE (bdesc_special_args) - 1);
32450 /* Add all builtins with variable number of operands. */
32451 for (i = 0, d = bdesc_args;
32452 i < ARRAY_SIZE (bdesc_args);
32453 i++, d++)
32455 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
32456 if (d->name == 0)
32457 continue;
32459 ftype = (enum ix86_builtin_func_type) d->flag;
32460 def_builtin_const (d->mask, d->name, ftype, d->code);
32462 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
32463 IX86_BUILTIN__BDESC_ARGS_FIRST,
32464 ARRAY_SIZE (bdesc_args) - 1);
32466 /* Add all builtins with variable number of operands. */
32467 for (i = 0, d = bdesc_args2;
32468 i < ARRAY_SIZE (bdesc_args2);
32469 i++, d++)
32471 if (d->name == 0)
32472 continue;
32474 ftype = (enum ix86_builtin_func_type) d->flag;
32475 def_builtin_const2 (d->mask, d->name, ftype, d->code);
32478 /* Add all builtins with rounding. */
32479 for (i = 0, d = bdesc_round_args;
32480 i < ARRAY_SIZE (bdesc_round_args);
32481 i++, d++)
32483 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
32484 if (d->name == 0)
32485 continue;
32487 ftype = (enum ix86_builtin_func_type) d->flag;
32488 def_builtin_const (d->mask, d->name, ftype, d->code);
32490 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
32491 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32492 ARRAY_SIZE (bdesc_round_args) - 1);
32494 /* pcmpestr[im] insns. */
32495 for (i = 0, d = bdesc_pcmpestr;
32496 i < ARRAY_SIZE (bdesc_pcmpestr);
32497 i++, d++)
32499 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
32500 if (d->code == IX86_BUILTIN_PCMPESTRM128)
32501 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
32502 else
32503 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
32504 def_builtin_const (d->mask, d->name, ftype, d->code);
32506 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
32507 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32508 ARRAY_SIZE (bdesc_pcmpestr) - 1);
32510 /* pcmpistr[im] insns. */
32511 for (i = 0, d = bdesc_pcmpistr;
32512 i < ARRAY_SIZE (bdesc_pcmpistr);
32513 i++, d++)
32515 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
32516 if (d->code == IX86_BUILTIN_PCMPISTRM128)
32517 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
32518 else
32519 ftype = INT_FTYPE_V16QI_V16QI_INT;
32520 def_builtin_const (d->mask, d->name, ftype, d->code);
32522 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
32523 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32524 ARRAY_SIZE (bdesc_pcmpistr) - 1);
32526 /* comi/ucomi insns. */
32527 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32529 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
32530 if (d->mask == OPTION_MASK_ISA_SSE2)
32531 ftype = INT_FTYPE_V2DF_V2DF;
32532 else
32533 ftype = INT_FTYPE_V4SF_V4SF;
32534 def_builtin_const (d->mask, d->name, ftype, d->code);
32536 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
32537 IX86_BUILTIN__BDESC_COMI_FIRST,
32538 ARRAY_SIZE (bdesc_comi) - 1);
32540 /* SSE */
32541 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
32542 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
32543 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
32544 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
32546 /* SSE or 3DNow!A */
32547 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32548 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
32549 IX86_BUILTIN_MASKMOVQ);
32551 /* SSE2 */
32552 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
32553 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
32555 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
32556 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
32557 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
32558 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
32560 /* SSE3. */
32561 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
32562 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
32563 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
32564 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
32566 /* AES */
32567 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
32568 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
32569 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
32570 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
32571 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
32572 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
32573 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
32574 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
32575 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
32576 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
32577 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
32578 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
32580 /* PCLMUL */
32581 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
32582 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
32584 /* RDRND */
32585 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
32586 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
32587 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
32588 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
32589 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
32590 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
32591 IX86_BUILTIN_RDRAND64_STEP);
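/* These RDRAND *_step builtins store the hardware random value through the
   pointer argument and return nonzero iff a value was available (the carry
   flag set by RDRAND); the corresponding <immintrin.h> intrinsics such as
   _rdrand32_step are expected to be thin wrappers around them
   (illustrative mapping).  */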
32593 /* AVX2 */
32594 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
32595 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
32596 IX86_BUILTIN_GATHERSIV2DF);
32598 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
32599 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
32600 IX86_BUILTIN_GATHERSIV4DF);
32602 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
32603 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
32604 IX86_BUILTIN_GATHERDIV2DF);
32606 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
32607 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
32608 IX86_BUILTIN_GATHERDIV4DF);
32610 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
32611 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
32612 IX86_BUILTIN_GATHERSIV4SF);
32614 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
32615 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
32616 IX86_BUILTIN_GATHERSIV8SF);
32618 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
32619 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
32620 IX86_BUILTIN_GATHERDIV4SF);
32622 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
32623 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
32624 IX86_BUILTIN_GATHERDIV8SF);
32626 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
32627 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
32628 IX86_BUILTIN_GATHERSIV2DI);
32630 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
32631 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
32632 IX86_BUILTIN_GATHERSIV4DI);
32634 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
32635 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
32636 IX86_BUILTIN_GATHERDIV2DI);
32638 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
32639 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
32640 IX86_BUILTIN_GATHERDIV4DI);
32642 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
32643 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
32644 IX86_BUILTIN_GATHERSIV4SI);
32646 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
32647 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
32648 IX86_BUILTIN_GATHERSIV8SI);
32650 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
32651 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
32652 IX86_BUILTIN_GATHERDIV4SI);
32654 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
32655 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
32656 IX86_BUILTIN_GATHERDIV8SI);
32658 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
32659 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
32660 IX86_BUILTIN_GATHERALTSIV4DF);
32662 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
32663 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
32664 IX86_BUILTIN_GATHERALTDIV8SF);
32666 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
32667 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
32668 IX86_BUILTIN_GATHERALTSIV4DI);
32670 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
32671 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
32672 IX86_BUILTIN_GATHERALTDIV8SI);
32674 /* AVX512F */
32675 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
32676 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
32677 IX86_BUILTIN_GATHER3SIV16SF);
32679 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
32680 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
32681 IX86_BUILTIN_GATHER3SIV8DF);
32683 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
32684 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
32685 IX86_BUILTIN_GATHER3DIV16SF);
32687 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
32688 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
32689 IX86_BUILTIN_GATHER3DIV8DF);
32691 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
32692 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
32693 IX86_BUILTIN_GATHER3SIV16SI);
32695 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
32696 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
32697 IX86_BUILTIN_GATHER3SIV8DI);
32699 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
32700 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
32701 IX86_BUILTIN_GATHER3DIV16SI);
32703 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
32704 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
32705 IX86_BUILTIN_GATHER3DIV8DI);
32707 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
32708 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
32709 IX86_BUILTIN_GATHER3ALTSIV8DF);
32711 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
32712 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
32713 IX86_BUILTIN_GATHER3ALTDIV16SF);
32715 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
32716 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
32717 IX86_BUILTIN_GATHER3ALTSIV8DI);
32719 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
32720 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
32721 IX86_BUILTIN_GATHER3ALTDIV16SI);
32723 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
32724 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
32725 IX86_BUILTIN_SCATTERSIV16SF);
32727 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
32728 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
32729 IX86_BUILTIN_SCATTERSIV8DF);
32731 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
32732 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
32733 IX86_BUILTIN_SCATTERDIV16SF);
32735 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
32736 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
32737 IX86_BUILTIN_SCATTERDIV8DF);
32739 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
32740 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
32741 IX86_BUILTIN_SCATTERSIV16SI);
32743 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
32744 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
32745 IX86_BUILTIN_SCATTERSIV8DI);
32747 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
32748 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
32749 IX86_BUILTIN_SCATTERDIV16SI);
32751 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
32752 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
32753 IX86_BUILTIN_SCATTERDIV8DI);
32755 /* AVX512VL */
32756 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
32757 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
32758 IX86_BUILTIN_GATHER3SIV2DF);
32760 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
32761 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
32762 IX86_BUILTIN_GATHER3SIV4DF);
32764 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
32765 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
32766 IX86_BUILTIN_GATHER3DIV2DF);
32768 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
32769 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
32770 IX86_BUILTIN_GATHER3DIV4DF);
32772 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
32773 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
32774 IX86_BUILTIN_GATHER3SIV4SF);
32776 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
32777 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
32778 IX86_BUILTIN_GATHER3SIV8SF);
32780 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
32781 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
32782 IX86_BUILTIN_GATHER3DIV4SF);
32784 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
32785 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
32786 IX86_BUILTIN_GATHER3DIV8SF);
32788 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
32789 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
32790 IX86_BUILTIN_GATHER3SIV2DI);
32792 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
32793 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
32794 IX86_BUILTIN_GATHER3SIV4DI);
32796 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
32797 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
32798 IX86_BUILTIN_GATHER3DIV2DI);
32800 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
32801 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
32802 IX86_BUILTIN_GATHER3DIV4DI);
32804 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
32805 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
32806 IX86_BUILTIN_GATHER3SIV4SI);
32808 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
32809 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
32810 IX86_BUILTIN_GATHER3SIV8SI);
32812 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
32813 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
32814 IX86_BUILTIN_GATHER3DIV4SI);
32816 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
32817 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
32818 IX86_BUILTIN_GATHER3DIV8SI);
32820 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
32821 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
32822 IX86_BUILTIN_GATHER3ALTSIV4DF);
32824 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
32825 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
32826 IX86_BUILTIN_GATHER3ALTDIV8SF);
32828 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
32829 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
32830 IX86_BUILTIN_GATHER3ALTSIV4DI);
32832 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
32833 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
32834 IX86_BUILTIN_GATHER3ALTDIV8SI);
32836 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
32837 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
32838 IX86_BUILTIN_SCATTERSIV8SF);
32840 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
32841 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
32842 IX86_BUILTIN_SCATTERSIV4SF);
32844 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
32845 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
32846 IX86_BUILTIN_SCATTERSIV4DF);
32848 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
32849 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
32850 IX86_BUILTIN_SCATTERSIV2DF);
32852 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
32853 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
32854 IX86_BUILTIN_SCATTERDIV8SF);
32856 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
32857 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
32858 IX86_BUILTIN_SCATTERDIV4SF);
32860 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
32861 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
32862 IX86_BUILTIN_SCATTERDIV4DF);
32864 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
32865 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
32866 IX86_BUILTIN_SCATTERDIV2DF);
32868 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
32869 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
32870 IX86_BUILTIN_SCATTERSIV8SI);
32872 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
32873 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
32874 IX86_BUILTIN_SCATTERSIV4SI);
32876 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
32877 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
32878 IX86_BUILTIN_SCATTERSIV4DI);
32880 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
32881 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
32882 IX86_BUILTIN_SCATTERSIV2DI);
32884 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
32885 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
32886 IX86_BUILTIN_SCATTERDIV8SI);
32888 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
32889 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
32890 IX86_BUILTIN_SCATTERDIV4SI);
32892 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
32893 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
32894 IX86_BUILTIN_SCATTERDIV4DI);
32896 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
32897 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
32898 IX86_BUILTIN_SCATTERDIV2DI);
32899 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
32900 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
32901 IX86_BUILTIN_SCATTERALTSIV8DF);
32903 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
32904 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
32905 IX86_BUILTIN_SCATTERALTDIV16SF);
32907 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
32908 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
32909 IX86_BUILTIN_SCATTERALTSIV8DI);
32911 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
32912 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
32913 IX86_BUILTIN_SCATTERALTDIV16SI);
32915 /* AVX512PF */
32916 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
32917 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
32918 IX86_BUILTIN_GATHERPFDPD);
32919 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
32920 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
32921 IX86_BUILTIN_GATHERPFDPS);
32922 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
32923 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32924 IX86_BUILTIN_GATHERPFQPD);
32925 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
32926 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32927 IX86_BUILTIN_GATHERPFQPS);
32928 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
32929 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
32930 IX86_BUILTIN_SCATTERPFDPD);
32931 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
32932 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
32933 IX86_BUILTIN_SCATTERPFDPS);
32934 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
32935 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32936 IX86_BUILTIN_SCATTERPFQPD);
32937 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
32938 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32939 IX86_BUILTIN_SCATTERPFQPS);
32941 /* SHA */
32942 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
32943 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
32944 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
32945 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
32946 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
32947 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
32948 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
32949 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
32950 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
32951 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
32952 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
32953 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
32954 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
32955 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
32957 /* RTM. */
32958 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
32959 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
32961 /* MMX access to the vec_init patterns. */
32962 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
32963 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
32965 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
32966 V4HI_FTYPE_HI_HI_HI_HI,
32967 IX86_BUILTIN_VEC_INIT_V4HI);
32969 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
32970 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
32971 IX86_BUILTIN_VEC_INIT_V8QI);
32973 /* Access to the vec_extract patterns. */
32974 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
32975 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
32976 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
32977 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
32978 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
32979 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
32980 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
32981 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
32982 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
32983 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
32985 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32986 "__builtin_ia32_vec_ext_v4hi",
32987 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
32989 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
32990 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
32992 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
32993 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
32995 /* Access to the vec_set patterns. */
32996 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
32997 "__builtin_ia32_vec_set_v2di",
32998 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
33000 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
33001 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
33003 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
33004 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
33006 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
33007 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
33009 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33010 "__builtin_ia32_vec_set_v4hi",
33011 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
33013 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
33014 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
33016 /* RDSEED */
33017 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
33018 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
33019 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
33020 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
33021 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
33022 "__builtin_ia32_rdseed_di_step",
33023 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
33025 /* ADCX */
33026 def_builtin (0, "__builtin_ia32_addcarryx_u32",
33027 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
33028 def_builtin (OPTION_MASK_ISA_64BIT,
33029 "__builtin_ia32_addcarryx_u64",
33030 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33031 IX86_BUILTIN_ADDCARRYX64);
33033 /* SBB */
33034 def_builtin (0, "__builtin_ia32_sbb_u32",
33035 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
33036 def_builtin (OPTION_MASK_ISA_64BIT,
33037 "__builtin_ia32_sbb_u64",
33038 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33039 IX86_BUILTIN_SBB64);
33041 /* Read/write FLAGS. */
33042 def_builtin (0, "__builtin_ia32_readeflags_u32",
33043 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33044 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
33045 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33046 def_builtin (0, "__builtin_ia32_writeeflags_u32",
33047 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
33048 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
33049 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
33051 /* CLFLUSHOPT. */
33052 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
33053 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
33055 /* CLWB. */
33056 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
33057 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
33059 /* MONITORX and MWAITX. */
33060 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
33061 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
33062 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
33063 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
33065 /* CLZERO. */
33066 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
33067 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
33069   /* Add FMA4 multi-arg instructions.  */
33070 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33072 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
33073 if (d->name == 0)
33074 continue;
33076 ftype = (enum ix86_builtin_func_type) d->flag;
33077 def_builtin_const (d->mask, d->name, ftype, d->code);
33079 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
33080 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
33081 ARRAY_SIZE (bdesc_multi_arg) - 1);
33084 static void
33085 ix86_init_mpx_builtins ()
33087 const struct builtin_description * d;
33088 enum ix86_builtin_func_type ftype;
33089 tree decl;
33090 size_t i;
33092 for (i = 0, d = bdesc_mpx;
33093 i < ARRAY_SIZE (bdesc_mpx);
33094 i++, d++)
33096 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
33097 if (d->name == 0)
33098 continue;
33100 ftype = (enum ix86_builtin_func_type) d->flag;
33101 decl = def_builtin (d->mask, d->name, ftype, d->code);
33103       /* Without the leaf and nothrow flags on MPX builtins,
33104 	 abnormal edges may follow their calls when setjmp
33105 	 is present in the function.  Since we may have a lot
33106 	 of MPX builtin calls, this creates lots of useless
33107 	 edges and enormous PHI nodes.  To avoid this we mark
33108 	 MPX builtins as leaf and nothrow.  */
33109 if (decl)
33111 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33112 NULL_TREE);
33113 TREE_NOTHROW (decl) = 1;
33115 else
33117 ix86_builtins_isa[(int)d->code].leaf_p = true;
33118 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33121 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
33122 IX86_BUILTIN__BDESC_MPX_FIRST,
33123 ARRAY_SIZE (bdesc_mpx) - 1);
33125 for (i = 0, d = bdesc_mpx_const;
33126 i < ARRAY_SIZE (bdesc_mpx_const);
33127 i++, d++)
33129 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
33130 if (d->name == 0)
33131 continue;
33133 ftype = (enum ix86_builtin_func_type) d->flag;
33134 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
33136 if (decl)
33138 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33139 NULL_TREE);
33140 TREE_NOTHROW (decl) = 1;
33142 else
33144 ix86_builtins_isa[(int)d->code].leaf_p = true;
33145 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33148 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
33149 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
33150 ARRAY_SIZE (bdesc_mpx_const) - 1);
33152 #undef BDESC_VERIFY
33153 #undef BDESC_VERIFYS
33155 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
33156 to return a pointer to VERSION_DECL if the outcome of the expression
33157 formed by PREDICATE_CHAIN is true. This function will be called during
33158 version dispatch to decide which function version to execute. It returns
33159 the basic block at the end, to which more conditions can be added. */
33161 static basic_block
33162 add_condition_to_bb (tree function_decl, tree version_decl,
33163 tree predicate_chain, basic_block new_bb)
33165 gimple *return_stmt;
33166 tree convert_expr, result_var;
33167 gimple *convert_stmt;
33168 gimple *call_cond_stmt;
33169 gimple *if_else_stmt;
33171 basic_block bb1, bb2, bb3;
33172 edge e12, e23;
33174 tree cond_var, and_expr_var = NULL_TREE;
33175 gimple_seq gseq;
33177 tree predicate_decl, predicate_arg;
33179 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
33181 gcc_assert (new_bb != NULL);
33182 gseq = bb_seq (new_bb);
33185 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
33186 build_fold_addr_expr (version_decl));
33187 result_var = create_tmp_var (ptr_type_node);
33188 convert_stmt = gimple_build_assign (result_var, convert_expr);
33189 return_stmt = gimple_build_return (result_var);
33191 if (predicate_chain == NULL_TREE)
33193 gimple_seq_add_stmt (&gseq, convert_stmt);
33194 gimple_seq_add_stmt (&gseq, return_stmt);
33195 set_bb_seq (new_bb, gseq);
33196 gimple_set_bb (convert_stmt, new_bb);
33197 gimple_set_bb (return_stmt, new_bb);
33198 pop_cfun ();
33199 return new_bb;
33202 while (predicate_chain != NULL)
33204 cond_var = create_tmp_var (integer_type_node);
33205 predicate_decl = TREE_PURPOSE (predicate_chain);
33206 predicate_arg = TREE_VALUE (predicate_chain);
33207 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
33208 gimple_call_set_lhs (call_cond_stmt, cond_var);
33210 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
33211 gimple_set_bb (call_cond_stmt, new_bb);
33212 gimple_seq_add_stmt (&gseq, call_cond_stmt);
33214 predicate_chain = TREE_CHAIN (predicate_chain);
33216 if (and_expr_var == NULL)
33217 and_expr_var = cond_var;
33218 else
33220 gimple *assign_stmt;
33221 	  /* Use MIN_EXPR to check whether any integer is zero:
33222 	     and_expr_var = MIN_EXPR <cond_var, and_expr_var>.  */
33223 assign_stmt = gimple_build_assign (and_expr_var,
33224 build2 (MIN_EXPR, integer_type_node,
33225 cond_var, and_expr_var));
33227 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
33228 gimple_set_bb (assign_stmt, new_bb);
33229 gimple_seq_add_stmt (&gseq, assign_stmt);
33233 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
33234 integer_zero_node,
33235 NULL_TREE, NULL_TREE);
33236 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
33237 gimple_set_bb (if_else_stmt, new_bb);
33238 gimple_seq_add_stmt (&gseq, if_else_stmt);
33240 gimple_seq_add_stmt (&gseq, convert_stmt);
33241 gimple_seq_add_stmt (&gseq, return_stmt);
33242 set_bb_seq (new_bb, gseq);
33244 bb1 = new_bb;
33245 e12 = split_block (bb1, if_else_stmt);
33246 bb2 = e12->dest;
33247 e12->flags &= ~EDGE_FALLTHRU;
33248 e12->flags |= EDGE_TRUE_VALUE;
33250 e23 = split_block (bb2, return_stmt);
33252 gimple_set_bb (convert_stmt, bb2);
33253 gimple_set_bb (return_stmt, bb2);
33255 bb3 = e23->dest;
33256 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
33258 remove_edge (e23);
33259 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
33261 pop_cfun ();
33263 return bb3;
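/* Sketch of the code built above for a single (predicate, argument) pair
   (illustrative GIMPLE, names abbreviated):
     cond_var = predicate (arg);
     if (cond_var > 0) goto <bb2>; else goto <bb3>;
   <bb2>:
     result_var = (void *) &version_decl;
     return result_var;
   <bb3>:
     ;; further conditions (or the default return) are appended here.
   With several predicates in the chain, their results are combined with
   MIN_EXPR before the single comparison against zero.  */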
33266 /* This parses the attribute arguments to target in DECL and determines
33267 the right builtin to use to match the platform specification.
33268 It returns the priority value for this version decl. If PREDICATE_LIST
33269 is not NULL, it stores the list of cpu features that need to be checked
33270 before dispatching this function. */
33272 static unsigned int
33273 get_builtin_code_for_version (tree decl, tree *predicate_list)
33275 tree attrs;
33276 struct cl_target_option cur_target;
33277 tree target_node;
33278 struct cl_target_option *new_target;
33279 const char *arg_str = NULL;
33280 const char *attrs_str = NULL;
33281 char *tok_str = NULL;
33282 char *token;
33284   /* Priority of i386 features; a greater value means a higher priority.  This
33285      is used to decide the order in which function dispatch must happen.  For
33286      instance, a version specialized for SSE4.2 should be checked for dispatch
33287      before a version for SSE3, as SSE4.2 implies SSE3.  */
33288 enum feature_priority
33290 P_ZERO = 0,
33291 P_MMX,
33292 P_SSE,
33293 P_SSE2,
33294 P_SSE3,
33295 P_SSSE3,
33296 P_PROC_SSSE3,
33297 P_SSE4_A,
33298 P_PROC_SSE4_A,
33299 P_SSE4_1,
33300 P_SSE4_2,
33301 P_PROC_SSE4_2,
33302 P_POPCNT,
33303 P_AES,
33304 P_PCLMUL,
33305 P_AVX,
33306 P_PROC_AVX,
33307 P_BMI,
33308 P_PROC_BMI,
33309 P_FMA4,
33310 P_XOP,
33311 P_PROC_XOP,
33312 P_FMA,
33313 P_PROC_FMA,
33314 P_BMI2,
33315 P_AVX2,
33316 P_PROC_AVX2,
33317 P_AVX512F,
33318 P_PROC_AVX512F
33321 enum feature_priority priority = P_ZERO;
33323 /* These are the target attribute strings for which a dispatcher is
33324 available, from fold_builtin_cpu. */
33326 static struct _feature_list
33328 const char *const name;
33329 const enum feature_priority priority;
33331 const feature_list[] =
33333 {"mmx", P_MMX},
33334 {"sse", P_SSE},
33335 {"sse2", P_SSE2},
33336 {"sse3", P_SSE3},
33337 {"sse4a", P_SSE4_A},
33338 {"ssse3", P_SSSE3},
33339 {"sse4.1", P_SSE4_1},
33340 {"sse4.2", P_SSE4_2},
33341 {"popcnt", P_POPCNT},
33342 {"aes", P_AES},
33343 {"pclmul", P_PCLMUL},
33344 {"avx", P_AVX},
33345 {"bmi", P_BMI},
33346 {"fma4", P_FMA4},
33347 {"xop", P_XOP},
33348 {"fma", P_FMA},
33349 {"bmi2", P_BMI2},
33350 {"avx2", P_AVX2},
33351 {"avx512f", P_AVX512F}
33355 static unsigned int NUM_FEATURES
33356 = sizeof (feature_list) / sizeof (struct _feature_list);
33358 unsigned int i;
33360 tree predicate_chain = NULL_TREE;
33361 tree predicate_decl, predicate_arg;
33363 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33364 gcc_assert (attrs != NULL);
33366 attrs = TREE_VALUE (TREE_VALUE (attrs));
33368 gcc_assert (TREE_CODE (attrs) == STRING_CST);
33369 attrs_str = TREE_STRING_POINTER (attrs);
33371 /* Return priority zero for default function. */
33372 if (strcmp (attrs_str, "default") == 0)
33373 return 0;
33375 /* Handle arch= if specified. For priority, set it to be 1 more than
33376 the best instruction set the processor can handle. For instance, if
33377 there is a version for atom and a version for ssse3 (the highest ISA
33378 priority for atom), the atom version must be checked for dispatch
33379 before the ssse3 version. */
33380 if (strstr (attrs_str, "arch=") != NULL)
33382 cl_target_option_save (&cur_target, &global_options);
33383 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
33384 &global_options_set);
33386 gcc_assert (target_node);
33387 new_target = TREE_TARGET_OPTION (target_node);
33388 gcc_assert (new_target);
33390 if (new_target->arch_specified && new_target->arch > 0)
33392 switch (new_target->arch)
33394 case PROCESSOR_CORE2:
33395 arg_str = "core2";
33396 priority = P_PROC_SSSE3;
33397 break;
33398 case PROCESSOR_NEHALEM:
33399 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
33400 arg_str = "westmere";
33401 else
33402 /* We translate "arch=corei7" and "arch=nehalem" to
33403 "corei7" so that it will be mapped to M_INTEL_COREI7
33404 as cpu type to cover all M_INTEL_COREI7_XXXs. */
33405 arg_str = "corei7";
33406 priority = P_PROC_SSE4_2;
33407 break;
33408 case PROCESSOR_SANDYBRIDGE:
33409 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
33410 arg_str = "ivybridge";
33411 else
33412 arg_str = "sandybridge";
33413 priority = P_PROC_AVX;
33414 break;
33415 case PROCESSOR_HASWELL:
33416 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
33417 arg_str = "skylake-avx512";
33418 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
33419 arg_str = "skylake";
33420 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
33421 arg_str = "broadwell";
33422 else
33423 arg_str = "haswell";
33424 priority = P_PROC_AVX2;
33425 break;
33426 case PROCESSOR_BONNELL:
33427 arg_str = "bonnell";
33428 priority = P_PROC_SSSE3;
33429 break;
33430 case PROCESSOR_KNL:
33431 arg_str = "knl";
33432 priority = P_PROC_AVX512F;
33433 break;
33434 case PROCESSOR_SILVERMONT:
33435 arg_str = "silvermont";
33436 priority = P_PROC_SSE4_2;
33437 break;
33438 case PROCESSOR_AMDFAM10:
33439 arg_str = "amdfam10h";
33440 priority = P_PROC_SSE4_A;
33441 break;
33442 case PROCESSOR_BTVER1:
33443 arg_str = "btver1";
33444 priority = P_PROC_SSE4_A;
33445 break;
33446 case PROCESSOR_BTVER2:
33447 arg_str = "btver2";
33448 priority = P_PROC_BMI;
33449 break;
33450 case PROCESSOR_BDVER1:
33451 arg_str = "bdver1";
33452 priority = P_PROC_XOP;
33453 break;
33454 case PROCESSOR_BDVER2:
33455 arg_str = "bdver2";
33456 priority = P_PROC_FMA;
33457 break;
33458 case PROCESSOR_BDVER3:
33459 arg_str = "bdver3";
33460 priority = P_PROC_FMA;
33461 break;
33462 case PROCESSOR_BDVER4:
33463 arg_str = "bdver4";
33464 priority = P_PROC_AVX2;
33465 break;
33466 case PROCESSOR_ZNVER1:
33467 arg_str = "znver1";
33468 priority = P_PROC_AVX2;
33469 break;
33473 cl_target_option_restore (&global_options, &cur_target);
33475 if (predicate_list && arg_str == NULL)
33477 error_at (DECL_SOURCE_LOCATION (decl),
33478 "No dispatcher found for the versioning attributes");
33479 return 0;
33482 if (predicate_list)
33484 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
33485 /* For a C string literal the length includes the trailing NULL. */
33486 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
33487 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33488 predicate_chain);
33492 /* Process feature name. */
33493 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
33494 strcpy (tok_str, attrs_str);
33495 token = strtok (tok_str, ",");
33496 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
33498 while (token != NULL)
33500 /* Do not process "arch=" */
33501 if (strncmp (token, "arch=", 5) == 0)
33503 token = strtok (NULL, ",");
33504 continue;
33506 for (i = 0; i < NUM_FEATURES; ++i)
33508 if (strcmp (token, feature_list[i].name) == 0)
33510 if (predicate_list)
33512 predicate_arg = build_string_literal (
33513 strlen (feature_list[i].name) + 1,
33514 feature_list[i].name);
33515 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33516 predicate_chain);
33518 /* Find the maximum priority feature. */
33519 if (feature_list[i].priority > priority)
33520 priority = feature_list[i].priority;
33522 break;
33525 if (predicate_list && i == NUM_FEATURES)
33527 error_at (DECL_SOURCE_LOCATION (decl),
33528 "No dispatcher found for %s", token);
33529 return 0;
33531 token = strtok (NULL, ",");
33533 free (tok_str);
33535 if (predicate_list && predicate_chain == NULL_TREE)
33537 error_at (DECL_SOURCE_LOCATION (decl),
33538 "No dispatcher found for the versioning attributes : %s",
33539 attrs_str);
33540 return 0;
33542 else if (predicate_list)
33544 predicate_chain = nreverse (predicate_chain);
33545 *predicate_list = predicate_chain;
33548 return priority;
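/* Illustrative example: for
     __attribute__ ((target ("arch=haswell,avx2")))
   the arch= clause contributes __builtin_cpu_is ("haswell") to the predicate
   chain with priority P_PROC_AVX2, and the "avx2" token adds
   __builtin_cpu_supports ("avx2") with priority P_AVX2; the larger of the
   encountered priorities, here P_PROC_AVX2, is returned.  */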
33551 /* This compares the priority of target features in function DECL1
33552 and DECL2. It returns positive value if DECL1 is higher priority,
33553 negative value if DECL2 is higher priority and 0 if they are the
33554 same. */
33556 static int
33557 ix86_compare_version_priority (tree decl1, tree decl2)
33559 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
33560 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
33562 return (int)priority1 - (int)priority2;
33565 /* V1 and V2 point to function versions with different priorities
33566 based on the target ISA. This function compares their priorities. */
33568 static int
33569 feature_compare (const void *v1, const void *v2)
33571 typedef struct _function_version_info
33573 tree version_decl;
33574 tree predicate_chain;
33575 unsigned int dispatch_priority;
33576 } function_version_info;
33578 const function_version_info c1 = *(const function_version_info *)v1;
33579 const function_version_info c2 = *(const function_version_info *)v2;
33580 return (c2.dispatch_priority - c1.dispatch_priority);
33583 /* This function generates the dispatch function for
33584 multi-versioned functions. DISPATCH_DECL is the function which will
33585 contain the dispatch logic. FNDECLS are the function choices for
33586 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
33587 in DISPATCH_DECL in which the dispatch code is generated. */
33589 static int
33590 dispatch_function_versions (tree dispatch_decl,
33591 void *fndecls_p,
33592 basic_block *empty_bb)
33594 tree default_decl;
33595 gimple *ifunc_cpu_init_stmt;
33596 gimple_seq gseq;
33597 int ix;
33598 tree ele;
33599 vec<tree> *fndecls;
33600 unsigned int num_versions = 0;
33601 unsigned int actual_versions = 0;
33602 unsigned int i;
33604 struct _function_version_info
33606 tree version_decl;
33607 tree predicate_chain;
33608 unsigned int dispatch_priority;
33609 }*function_version_info;
33611 gcc_assert (dispatch_decl != NULL
33612 && fndecls_p != NULL
33613 && empty_bb != NULL);
33615   /* fndecls_p is actually a vector.  */
33616 fndecls = static_cast<vec<tree> *> (fndecls_p);
33618 /* At least one more version other than the default. */
33619 num_versions = fndecls->length ();
33620 gcc_assert (num_versions >= 2);
33622 function_version_info = (struct _function_version_info *)
33623 XNEWVEC (struct _function_version_info, (num_versions - 1));
33625 /* The first version in the vector is the default decl. */
33626 default_decl = (*fndecls)[0];
33628 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
33630 gseq = bb_seq (*empty_bb);
33631 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
33632      constructors, so explicitly call __builtin_cpu_init here.  */
33633 ifunc_cpu_init_stmt = gimple_build_call_vec (
33634 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
33635 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
33636 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
33637 set_bb_seq (*empty_bb, gseq);
33639 pop_cfun ();
33642 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
33644 tree version_decl = ele;
33645 tree predicate_chain = NULL_TREE;
33646 unsigned int priority;
33647 /* Get attribute string, parse it and find the right predicate decl.
33648 The predicate function could be a lengthy combination of many
33649 features, like arch-type and various isa-variants. */
33650 priority = get_builtin_code_for_version (version_decl,
33651 &predicate_chain);
33653 if (predicate_chain == NULL_TREE)
33654 continue;
33656 function_version_info [actual_versions].version_decl = version_decl;
33657 function_version_info [actual_versions].predicate_chain
33658 = predicate_chain;
33659 function_version_info [actual_versions].dispatch_priority = priority;
33660 actual_versions++;
33663 /* Sort the versions in descending order of dispatch priority. The
33664 priority is based on the ISA. This is not a perfect solution: there
33665 can still be ambiguity. If more than one function version is suitable
33666 to execute, which one should be dispatched? In the future, allow the
33667 user to specify a dispatch priority next to the version. */
33668 qsort (function_version_info, actual_versions,
33669 sizeof (struct _function_version_info), feature_compare);
33671 for (i = 0; i < actual_versions; ++i)
33672 *empty_bb = add_condition_to_bb (dispatch_decl,
33673 function_version_info[i].version_decl,
33674 function_version_info[i].predicate_chain,
33675 *empty_bb);
33677 /* Dispatch the default version at the end. */
33678 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
33679 NULL, *empty_bb);
33681 free (function_version_info);
33682 return 0;
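/* Illustrative sketch (not part of this file): the kind of user code the
   dispatcher above serves.  The function name and feature set here are only
   an example; in GCC this form of multiversioning is a C++ front-end feature.

     __attribute__ ((target ("default"))) int foo () { return 0; }
     __attribute__ ((target ("sse4.2")))  int foo () { return 1; }
     __attribute__ ((target ("avx2")))    int foo () { return 2; }

   Every call to foo is routed through the generated IFUNC resolver, which
   tests the predicate chains in priority order and picks the highest-priority
   version supported by the CPU at load time.  */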
33685 /* This function changes the assembler name for functions that are
33686 versions. If DECL is a function version and has a "target"
33687 attribute, it appends the attribute string to its assembler name. */
33689 static tree
33690 ix86_mangle_function_version_assembler_name (tree decl, tree id)
33692 tree version_attr;
33693 const char *orig_name, *version_string;
33694 char *attr_str, *assembler_name;
33696 if (DECL_DECLARED_INLINE_P (decl)
33697 && lookup_attribute ("gnu_inline",
33698 DECL_ATTRIBUTES (decl)))
33699 error_at (DECL_SOURCE_LOCATION (decl),
33700 "Function versions cannot be marked as gnu_inline,"
33701 " bodies have to be generated");
33703 if (DECL_VIRTUAL_P (decl)
33704 || DECL_VINDEX (decl))
33705 sorry ("Virtual function multiversioning not supported");
33707 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33709 /* The target attribute string cannot be NULL. */
33710 gcc_assert (version_attr != NULL_TREE);
33712 orig_name = IDENTIFIER_POINTER (id);
33713 version_string
33714 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
33716 if (strcmp (version_string, "default") == 0)
33717 return id;
33719 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
33720 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
33722 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
33724 /* Allow assembler name to be modified if already set. */
33725 if (DECL_ASSEMBLER_NAME_SET_P (decl))
33726 SET_DECL_RTL (decl, NULL);
33728 tree ret = get_identifier (assembler_name);
33729 XDELETEVEC (attr_str);
33730 XDELETEVEC (assembler_name);
33731 return ret;
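/* Example of the resulting assembler names (a sketch, assuming the C++
   versions of foo shown earlier): the default version keeps its plain
   assembler name "_Z3foov", while the target("avx2") version becomes
   "_Z3foov.avx2", i.e. the sorted attribute string is appended after a
   dot.  */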
33735 static tree
33736 ix86_mangle_decl_assembler_name (tree decl, tree id)
33738 /* For function version, add the target suffix to the assembler name. */
33739 if (TREE_CODE (decl) == FUNCTION_DECL
33740 && DECL_FUNCTION_VERSIONED (decl))
33741 id = ix86_mangle_function_version_assembler_name (decl, id);
33742 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
33743 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
33744 #endif
33746 return id;
33749 /* Make a dispatcher declaration for the multi-versioned function DECL.
33750 Calls to DECL function will be replaced with calls to the dispatcher
33751 by the front-end. Returns the decl of the dispatcher function. */
33753 static tree
33754 ix86_get_function_versions_dispatcher (void *decl)
33756 tree fn = (tree) decl;
33757 struct cgraph_node *node = NULL;
33758 struct cgraph_node *default_node = NULL;
33759 struct cgraph_function_version_info *node_v = NULL;
33760 struct cgraph_function_version_info *first_v = NULL;
33762 tree dispatch_decl = NULL;
33764 struct cgraph_function_version_info *default_version_info = NULL;
33766 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
33768 node = cgraph_node::get (fn);
33769 gcc_assert (node != NULL);
33771 node_v = node->function_version ();
33772 gcc_assert (node_v != NULL);
33774 if (node_v->dispatcher_resolver != NULL)
33775 return node_v->dispatcher_resolver;
33777 /* Find the default version and make it the first node. */
33778 first_v = node_v;
33779 /* Go to the beginning of the chain. */
33780 while (first_v->prev != NULL)
33781 first_v = first_v->prev;
33782 default_version_info = first_v;
33783 while (default_version_info != NULL)
33785 if (is_function_default_version
33786 (default_version_info->this_node->decl))
33787 break;
33788 default_version_info = default_version_info->next;
33791 /* If there is no default node, just return NULL. */
33792 if (default_version_info == NULL)
33793 return NULL;
33795 /* Make default info the first node. */
33796 if (first_v != default_version_info)
33798 default_version_info->prev->next = default_version_info->next;
33799 if (default_version_info->next)
33800 default_version_info->next->prev = default_version_info->prev;
33801 first_v->prev = default_version_info;
33802 default_version_info->next = first_v;
33803 default_version_info->prev = NULL;
33806 default_node = default_version_info->this_node;
33808 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
33809 if (targetm.has_ifunc_p ())
33811 struct cgraph_function_version_info *it_v = NULL;
33812 struct cgraph_node *dispatcher_node = NULL;
33813 struct cgraph_function_version_info *dispatcher_version_info = NULL;
33815 /* Right now, the dispatching is done via ifunc. */
33816 dispatch_decl = make_dispatcher_decl (default_node->decl);
33818 dispatcher_node = cgraph_node::get_create (dispatch_decl);
33819 gcc_assert (dispatcher_node != NULL);
33820 dispatcher_node->dispatcher_function = 1;
33821 dispatcher_version_info
33822 = dispatcher_node->insert_new_function_version ();
33823 dispatcher_version_info->next = default_version_info;
33824 dispatcher_node->definition = 1;
33826 /* Set the dispatcher for all the versions. */
33827 it_v = default_version_info;
33828 while (it_v != NULL)
33830 it_v->dispatcher_resolver = dispatch_decl;
33831 it_v = it_v->next;
33834 else
33835 #endif
33837 error_at (DECL_SOURCE_LOCATION (default_node->decl),
33838 "multiversioning needs ifunc which is not supported "
33839 "on this target");
33842 return dispatch_decl;
33845 /* Make the resolver function decl to dispatch the versions of
33846 a multi-versioned function, DEFAULT_DECL. Create an
33847 empty basic block in the resolver and store the pointer in
33848 EMPTY_BB. Return the decl of the resolver function. */
33850 static tree
33851 make_resolver_func (const tree default_decl,
33852 const tree dispatch_decl,
33853 basic_block *empty_bb)
33855 char *resolver_name;
33856 tree decl, type, decl_name, t;
33857 bool is_uniq = false;
33859 /* IFUNCs have to be globally visible. So, if the default_decl is
33860 not, then the name of the IFUNC should be made unique. */
33861 if (TREE_PUBLIC (default_decl) == 0)
33862 is_uniq = true;
33864 /* Append the filename to the resolver function if the versions are
33865 not externally visible. This is because the resolver function has
33866 to be externally visible for the loader to find it. So, appending
33867 the filename will prevent conflicts with a resolver function from
33868 another module that uses the same version name. */
33869 resolver_name = make_unique_name (default_decl, "resolver", is_uniq);
33871 /* The resolver function should return a (void *). */
33872 type = build_function_type_list (ptr_type_node, NULL_TREE);
33874 decl = build_fn_decl (resolver_name, type);
33875 decl_name = get_identifier (resolver_name);
33876 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
33878 DECL_NAME (decl) = decl_name;
33879 TREE_USED (decl) = 1;
33880 DECL_ARTIFICIAL (decl) = 1;
33881 DECL_IGNORED_P (decl) = 0;
33882 /* IFUNC resolvers have to be externally visible. */
33883 TREE_PUBLIC (decl) = 1;
33884 DECL_UNINLINABLE (decl) = 1;
33886 /* Resolver is not external, body is generated. */
33887 DECL_EXTERNAL (decl) = 0;
33888 DECL_EXTERNAL (dispatch_decl) = 0;
33890 DECL_CONTEXT (decl) = NULL_TREE;
33891 DECL_INITIAL (decl) = make_node (BLOCK);
33892 DECL_STATIC_CONSTRUCTOR (decl) = 0;
33894 if (DECL_COMDAT_GROUP (default_decl)
33895 || TREE_PUBLIC (default_decl))
33897 /* In this case, each translation unit with a call to this
33898 versioned function will put out a resolver. Ensure it
33899 is comdat to keep just one copy. */
33900 DECL_COMDAT (decl) = 1;
33901 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
33903 /* Build result decl and add to function_decl. */
33904 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
33905 DECL_ARTIFICIAL (t) = 1;
33906 DECL_IGNORED_P (t) = 1;
33907 DECL_RESULT (decl) = t;
33909 gimplify_function_tree (decl);
33910 push_cfun (DECL_STRUCT_FUNCTION (decl));
33911 *empty_bb = init_lowered_empty_function (decl, false,
33912 profile_count::uninitialized ());
33914 cgraph_node::add_new_function (decl, true);
33915 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
33917 pop_cfun ();
33919 gcc_assert (dispatch_decl != NULL);
33920 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
33921 DECL_ATTRIBUTES (dispatch_decl)
33922 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
33924 /* Create the alias for dispatch to resolver here. */
33925 /*cgraph_create_function_alias (dispatch_decl, decl);*/
33926 cgraph_node::create_same_body_alias (dispatch_decl, decl);
33927 XDELETEVEC (resolver_name);
33928 return decl;
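/* For reference, the resolver/dispatcher pair built above is the
   compiler-generated analogue of a hand-written GNU IFUNC.  A sketch with
   hypothetical names:

     static int foo_default (void) { return 0; }
     static int foo_avx2 (void)    { return 2; }

     static void *
     resolve_foo (void)
     {
       __builtin_cpu_init ();
       return __builtin_cpu_supports ("avx2")
              ? (void *) foo_avx2 : (void *) foo_default;
     }

     int foo (void) __attribute__ ((ifunc ("resolve_foo")));
   */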
33931 /* Generate the dispatching code body to dispatch multi-versioned function
33932 DECL. The target hook is called to process the "target" attributes and
33933 provide the code to dispatch the right function at run-time. NODE points
33934 to the dispatcher decl whose body will be created. */
33936 static tree
33937 ix86_generate_version_dispatcher_body (void *node_p)
33939 tree resolver_decl;
33940 basic_block empty_bb;
33941 tree default_ver_decl;
33942 struct cgraph_node *versn;
33943 struct cgraph_node *node;
33945 struct cgraph_function_version_info *node_version_info = NULL;
33946 struct cgraph_function_version_info *versn_info = NULL;
33948 node = (cgraph_node *)node_p;
33950 node_version_info = node->function_version ();
33951 gcc_assert (node->dispatcher_function
33952 && node_version_info != NULL);
33954 if (node_version_info->dispatcher_resolver)
33955 return node_version_info->dispatcher_resolver;
33957 /* The first version in the chain corresponds to the default version. */
33958 default_ver_decl = node_version_info->next->this_node->decl;
33960 /* node is going to be an alias, so remove the finalized bit. */
33961 node->definition = false;
33963 resolver_decl = make_resolver_func (default_ver_decl,
33964 node->decl, &empty_bb);
33966 node_version_info->dispatcher_resolver = resolver_decl;
33968 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
33970 auto_vec<tree, 2> fn_ver_vec;
33972 for (versn_info = node_version_info->next; versn_info;
33973 versn_info = versn_info->next)
33975 versn = versn_info->this_node;
33976 /* Check for virtual functions here again, as by this time it should
33977 have been determined if this function needs a vtable index or
33978 not. This happens for methods in derived classes that override
33979 virtual methods in base classes but are not explicitly marked as
33980 virtual. */
33981 if (DECL_VINDEX (versn->decl))
33982 sorry ("Virtual function multiversioning not supported");
33984 fn_ver_vec.safe_push (versn->decl);
33987 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
33988 cgraph_edge::rebuild_edges ();
33989 pop_cfun ();
33990 return resolver_decl;
33992 /* This builds the __processor_model struct type defined in
33993 libgcc/config/i386/cpuinfo.c. */
33995 static tree
33996 build_processor_model_struct (void)
33998 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
33999 "__cpu_features"};
34000 tree field = NULL_TREE, field_chain = NULL_TREE;
34001 int i;
34002 tree type = make_node (RECORD_TYPE);
34004 /* The first 3 fields are unsigned int. */
34005 for (i = 0; i < 3; ++i)
34007 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34008 get_identifier (field_name[i]), unsigned_type_node);
34009 if (field_chain != NULL_TREE)
34010 DECL_CHAIN (field) = field_chain;
34011 field_chain = field;
34014 /* The last field is an array of unsigned integers of size one. */
34015 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34016 get_identifier (field_name[3]),
34017 build_array_type (unsigned_type_node,
34018 build_index_type (size_one_node)));
34019 if (field_chain != NULL_TREE)
34020 DECL_CHAIN (field) = field_chain;
34021 field_chain = field;
34023 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
34024 return type;
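/* The layout built above is intended to mirror the definition in
   libgcc/config/i386/cpuinfo.c, roughly:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
     extern struct __processor_model __cpu_model;
   */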
34027 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
34029 static tree
34030 make_var_decl (tree type, const char *name)
34032 tree new_decl;
34034 new_decl = build_decl (UNKNOWN_LOCATION,
34035 VAR_DECL,
34036 get_identifier(name),
34037 type);
34039 DECL_EXTERNAL (new_decl) = 1;
34040 TREE_STATIC (new_decl) = 1;
34041 TREE_PUBLIC (new_decl) = 1;
34042 DECL_INITIAL (new_decl) = 0;
34043 DECL_ARTIFICIAL (new_decl) = 0;
34044 DECL_PRESERVE_P (new_decl) = 1;
34046 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
34047 assemble_variable (new_decl, 0, 0, 0);
34049 return new_decl;
34052 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
34053 into a check against an integer defined in libgcc/config/i386/cpuinfo.c. */
34055 static tree
34056 fold_builtin_cpu (tree fndecl, tree *args)
34058 unsigned int i;
34059 enum ix86_builtins fn_code = (enum ix86_builtins)
34060 DECL_FUNCTION_CODE (fndecl);
34061 tree param_string_cst = NULL;
34063 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
34064 enum processor_features
34066 F_CMOV = 0,
34067 F_MMX,
34068 F_POPCNT,
34069 F_SSE,
34070 F_SSE2,
34071 F_SSE3,
34072 F_SSSE3,
34073 F_SSE4_1,
34074 F_SSE4_2,
34075 F_AVX,
34076 F_AVX2,
34077 F_SSE4_A,
34078 F_FMA4,
34079 F_XOP,
34080 F_FMA,
34081 F_AVX512F,
34082 F_BMI,
34083 F_BMI2,
34084 F_AES,
34085 F_PCLMUL,
34086 F_AVX512VL,
34087 F_AVX512BW,
34088 F_AVX512DQ,
34089 F_AVX512CD,
34090 F_AVX512ER,
34091 F_AVX512PF,
34092 F_AVX512VBMI,
34093 F_AVX512IFMA,
34094 F_AVX5124VNNIW,
34095 F_AVX5124FMAPS,
34096 F_AVX512VPOPCNTDQ,
34097 F_MAX
34100 /* These are the values for vendor types and CPU types and subtypes
34101 in cpuinfo.c. The corresponding start value must be subtracted from
34102 CPU types and subtypes. */
34103 enum processor_model
34105 M_INTEL = 1,
34106 M_AMD,
34107 M_CPU_TYPE_START,
34108 M_INTEL_BONNELL,
34109 M_INTEL_CORE2,
34110 M_INTEL_COREI7,
34111 M_AMDFAM10H,
34112 M_AMDFAM15H,
34113 M_INTEL_SILVERMONT,
34114 M_INTEL_KNL,
34115 M_AMD_BTVER1,
34116 M_AMD_BTVER2,
34117 M_CPU_SUBTYPE_START,
34118 M_INTEL_COREI7_NEHALEM,
34119 M_INTEL_COREI7_WESTMERE,
34120 M_INTEL_COREI7_SANDYBRIDGE,
34121 M_AMDFAM10H_BARCELONA,
34122 M_AMDFAM10H_SHANGHAI,
34123 M_AMDFAM10H_ISTANBUL,
34124 M_AMDFAM15H_BDVER1,
34125 M_AMDFAM15H_BDVER2,
34126 M_AMDFAM15H_BDVER3,
34127 M_AMDFAM15H_BDVER4,
34128 M_AMDFAM17H_ZNVER1,
34129 M_INTEL_COREI7_IVYBRIDGE,
34130 M_INTEL_COREI7_HASWELL,
34131 M_INTEL_COREI7_BROADWELL,
34132 M_INTEL_COREI7_SKYLAKE,
34133 M_INTEL_COREI7_SKYLAKE_AVX512
34136 static struct _arch_names_table
34138 const char *const name;
34139 const enum processor_model model;
34141 const arch_names_table[] =
34143 {"amd", M_AMD},
34144 {"intel", M_INTEL},
34145 {"atom", M_INTEL_BONNELL},
34146 {"slm", M_INTEL_SILVERMONT},
34147 {"core2", M_INTEL_CORE2},
34148 {"corei7", M_INTEL_COREI7},
34149 {"nehalem", M_INTEL_COREI7_NEHALEM},
34150 {"westmere", M_INTEL_COREI7_WESTMERE},
34151 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
34152 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
34153 {"haswell", M_INTEL_COREI7_HASWELL},
34154 {"broadwell", M_INTEL_COREI7_BROADWELL},
34155 {"skylake", M_INTEL_COREI7_SKYLAKE},
34156 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
34157 {"bonnell", M_INTEL_BONNELL},
34158 {"silvermont", M_INTEL_SILVERMONT},
34159 {"knl", M_INTEL_KNL},
34160 {"amdfam10h", M_AMDFAM10H},
34161 {"barcelona", M_AMDFAM10H_BARCELONA},
34162 {"shanghai", M_AMDFAM10H_SHANGHAI},
34163 {"istanbul", M_AMDFAM10H_ISTANBUL},
34164 {"btver1", M_AMD_BTVER1},
34165 {"amdfam15h", M_AMDFAM15H},
34166 {"bdver1", M_AMDFAM15H_BDVER1},
34167 {"bdver2", M_AMDFAM15H_BDVER2},
34168 {"bdver3", M_AMDFAM15H_BDVER3},
34169 {"bdver4", M_AMDFAM15H_BDVER4},
34170 {"btver2", M_AMD_BTVER2},
34171 {"znver1", M_AMDFAM17H_ZNVER1},
34174 static struct _isa_names_table
34176 const char *const name;
34177 const enum processor_features feature;
34179 const isa_names_table[] =
34181 {"cmov", F_CMOV},
34182 {"mmx", F_MMX},
34183 {"popcnt", F_POPCNT},
34184 {"sse", F_SSE},
34185 {"sse2", F_SSE2},
34186 {"sse3", F_SSE3},
34187 {"ssse3", F_SSSE3},
34188 {"sse4a", F_SSE4_A},
34189 {"sse4.1", F_SSE4_1},
34190 {"sse4.2", F_SSE4_2},
34191 {"avx", F_AVX},
34192 {"fma4", F_FMA4},
34193 {"xop", F_XOP},
34194 {"fma", F_FMA},
34195 {"avx2", F_AVX2},
34196 {"avx512f", F_AVX512F},
34197 {"bmi", F_BMI},
34198 {"bmi2", F_BMI2},
34199 {"aes", F_AES},
34200 {"pclmul", F_PCLMUL},
34201 {"avx512vl",F_AVX512VL},
34202 {"avx512bw",F_AVX512BW},
34203 {"avx512dq",F_AVX512DQ},
34204 {"avx512cd",F_AVX512CD},
34205 {"avx512er",F_AVX512ER},
34206 {"avx512pf",F_AVX512PF},
34207 {"avx512vbmi",F_AVX512VBMI},
34208 {"avx512ifma",F_AVX512IFMA},
34209 {"avx5124vnniw",F_AVX5124VNNIW},
34210 {"avx5124fmaps",F_AVX5124FMAPS},
34211 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
34214 tree __processor_model_type = build_processor_model_struct ();
34215 tree __cpu_model_var = make_var_decl (__processor_model_type,
34216 "__cpu_model");
34219 varpool_node::add (__cpu_model_var);
34221 gcc_assert ((args != NULL) && (*args != NULL));
34223 param_string_cst = *args;
34224 while (param_string_cst
34225 && TREE_CODE (param_string_cst) != STRING_CST)
34227 /* *args must be an expr that can contain other EXPRS leading to a
34228 STRING_CST. */
34229 if (!EXPR_P (param_string_cst))
34231 error ("Parameter to builtin must be a string constant or literal");
34232 return integer_zero_node;
34234 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
34237 gcc_assert (param_string_cst);
34239 if (fn_code == IX86_BUILTIN_CPU_IS)
34241 tree ref;
34242 tree field;
34243 tree final;
34245 unsigned int field_val = 0;
34246 unsigned int NUM_ARCH_NAMES
34247 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
34249 for (i = 0; i < NUM_ARCH_NAMES; i++)
34250 if (strcmp (arch_names_table[i].name,
34251 TREE_STRING_POINTER (param_string_cst)) == 0)
34252 break;
34254 if (i == NUM_ARCH_NAMES)
34256 error ("Parameter to builtin not valid: %s",
34257 TREE_STRING_POINTER (param_string_cst));
34258 return integer_zero_node;
34261 field = TYPE_FIELDS (__processor_model_type);
34262 field_val = arch_names_table[i].model;
34264 /* CPU types are stored in the next field. */
34265 if (field_val > M_CPU_TYPE_START
34266 && field_val < M_CPU_SUBTYPE_START)
34268 field = DECL_CHAIN (field);
34269 field_val -= M_CPU_TYPE_START;
34272 /* CPU subtypes are stored in the next field. */
34273 if (field_val > M_CPU_SUBTYPE_START)
34275 field = DECL_CHAIN (DECL_CHAIN (field));
34276 field_val -= M_CPU_SUBTYPE_START;
34279 /* Get the appropriate field in __cpu_model. */
34280 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34281 field, NULL_TREE);
34283 /* Check the value. */
34284 final = build2 (EQ_EXPR, unsigned_type_node, ref,
34285 build_int_cstu (unsigned_type_node, field_val));
34286 return build1 (CONVERT_EXPR, integer_type_node, final);
34288 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
34290 tree ref;
34291 tree array_elt;
34292 tree field;
34293 tree final;
34295 unsigned int field_val = 0;
34296 unsigned int NUM_ISA_NAMES
34297 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
34299 for (i = 0; i < NUM_ISA_NAMES; i++)
34300 if (strcmp (isa_names_table[i].name,
34301 TREE_STRING_POINTER (param_string_cst)) == 0)
34302 break;
34304 if (i == NUM_ISA_NAMES)
34306 error ("Parameter to builtin not valid: %s",
34307 TREE_STRING_POINTER (param_string_cst));
34308 return integer_zero_node;
34311 field = TYPE_FIELDS (__processor_model_type);
34312 /* Get the last field, which is __cpu_features. */
34313 while (DECL_CHAIN (field))
34314 field = DECL_CHAIN (field);
34316 /* Get the appropriate field: __cpu_model.__cpu_features */
34317 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34318 field, NULL_TREE);
34320 /* Access the 0th element of __cpu_features array. */
34321 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
34322 integer_zero_node, NULL_TREE, NULL_TREE);
34324 field_val = (1 << isa_names_table[i].feature);
34325 /* Return __cpu_model.__cpu_features[0] & field_val */
34326 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
34327 build_int_cstu (unsigned_type_node, field_val));
34328 return build1 (CONVERT_EXPR, integer_type_node, final);
34330 gcc_unreachable ();
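/* Sketch of what the folds above produce (using the names defined in this
   function):

     __builtin_cpu_supports ("avx2")
       -> (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2))

     __builtin_cpu_is ("haswell")
       -> (int) (__cpu_model.__cpu_subtype
                 == M_INTEL_COREI7_HASWELL - M_CPU_SUBTYPE_START)
   */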
34333 static tree
34334 ix86_fold_builtin (tree fndecl, int n_args,
34335 tree *args, bool ignore ATTRIBUTE_UNUSED)
34337 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
34339 enum ix86_builtins fn_code = (enum ix86_builtins)
34340 DECL_FUNCTION_CODE (fndecl);
34341 switch (fn_code)
34343 case IX86_BUILTIN_CPU_IS:
34344 case IX86_BUILTIN_CPU_SUPPORTS:
34345 gcc_assert (n_args == 1);
34346 return fold_builtin_cpu (fndecl, args);
34348 case IX86_BUILTIN_NANQ:
34349 case IX86_BUILTIN_NANSQ:
34351 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34352 const char *str = c_getstr (*args);
34353 int quiet = fn_code == IX86_BUILTIN_NANQ;
34354 REAL_VALUE_TYPE real;
34356 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
34357 return build_real (type, real);
34358 return NULL_TREE;
34361 case IX86_BUILTIN_INFQ:
34362 case IX86_BUILTIN_HUGE_VALQ:
34364 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34365 REAL_VALUE_TYPE inf;
34366 real_inf (&inf);
34367 return build_real (type, inf);
34370 case IX86_BUILTIN_TZCNT16:
34371 case IX86_BUILTIN_CTZS:
34372 case IX86_BUILTIN_TZCNT32:
34373 case IX86_BUILTIN_TZCNT64:
34374 gcc_assert (n_args == 1);
34375 if (TREE_CODE (args[0]) == INTEGER_CST)
34377 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34378 tree arg = args[0];
34379 if (fn_code == IX86_BUILTIN_TZCNT16
34380 || fn_code == IX86_BUILTIN_CTZS)
34381 arg = fold_convert (short_unsigned_type_node, arg);
34382 if (integer_zerop (arg))
34383 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34384 else
34385 return fold_const_call (CFN_CTZ, type, arg);
34387 break;
34389 case IX86_BUILTIN_LZCNT16:
34390 case IX86_BUILTIN_CLZS:
34391 case IX86_BUILTIN_LZCNT32:
34392 case IX86_BUILTIN_LZCNT64:
34393 gcc_assert (n_args == 1);
34394 if (TREE_CODE (args[0]) == INTEGER_CST)
34396 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34397 tree arg = args[0];
34398 if (fn_code == IX86_BUILTIN_LZCNT16
34399 || fn_code == IX86_BUILTIN_CLZS)
34400 arg = fold_convert (short_unsigned_type_node, arg);
34401 if (integer_zerop (arg))
34402 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34403 else
34404 return fold_const_call (CFN_CLZ, type, arg);
34406 break;
34408 case IX86_BUILTIN_BEXTR32:
34409 case IX86_BUILTIN_BEXTR64:
34410 case IX86_BUILTIN_BEXTRI32:
34411 case IX86_BUILTIN_BEXTRI64:
34412 gcc_assert (n_args == 2);
34413 if (tree_fits_uhwi_p (args[1]))
34415 unsigned HOST_WIDE_INT res = 0;
34416 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
34417 unsigned int start = tree_to_uhwi (args[1]);
34418 unsigned int len = (start & 0xff00) >> 8;
34419 start &= 0xff;
34420 if (start >= prec || len == 0)
34421 res = 0;
34422 else if (!tree_fits_uhwi_p (args[0]))
34423 break;
34424 else
34425 res = tree_to_uhwi (args[0]) >> start;
34426 if (len > prec)
34427 len = prec;
34428 if (len < HOST_BITS_PER_WIDE_INT)
34429 res &= (HOST_WIDE_INT_1U << len) - 1;
34430 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34432 break;
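/* Worked example of the BEXTR fold above (builtin assumed to be the one
   behind _bextr_u32): __builtin_ia32_bextr_u32 (0x12345678, 0x0804) has
   start = 4 and len = 8, so it folds to (0x12345678 >> 4) & 0xff = 0x67.  */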
34434 case IX86_BUILTIN_BZHI32:
34435 case IX86_BUILTIN_BZHI64:
34436 gcc_assert (n_args == 2);
34437 if (tree_fits_uhwi_p (args[1]))
34439 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
34440 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
34441 return args[0];
34442 if (!tree_fits_uhwi_p (args[0]))
34443 break;
34444 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
34445 res &= ~(HOST_WIDE_INT_M1U << idx);
34446 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34448 break;
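/* Worked example of the BZHI fold above (builtin assumed to be the one
   behind _bzhi_u32): __builtin_ia32_bzhi_si (0x12345678, 8) keeps the low
   8 bits and folds to 0x78; an index >= 32 returns the first operand
   unchanged.  */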
34450 case IX86_BUILTIN_PDEP32:
34451 case IX86_BUILTIN_PDEP64:
34452 gcc_assert (n_args == 2);
34453 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34455 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34456 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34457 unsigned HOST_WIDE_INT res = 0;
34458 unsigned HOST_WIDE_INT m, k = 1;
34459 for (m = 1; m; m <<= 1)
34460 if ((mask & m) != 0)
34462 if ((src & k) != 0)
34463 res |= m;
34464 k <<= 1;
34466 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34468 break;
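/* Worked example of the PDEP fold above (builtin assumed to be the one
   behind _pdep_u32): __builtin_ia32_pdep_si (0xb, 0xf0) deposits the low
   bits of the source into the set positions of the mask, folding to 0xb0.  */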
34470 case IX86_BUILTIN_PEXT32:
34471 case IX86_BUILTIN_PEXT64:
34472 gcc_assert (n_args == 2);
34473 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34475 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34476 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34477 unsigned HOST_WIDE_INT res = 0;
34478 unsigned HOST_WIDE_INT m, k = 1;
34479 for (m = 1; m; m <<= 1)
34480 if ((mask & m) != 0)
34482 if ((src & m) != 0)
34483 res |= k;
34484 k <<= 1;
34486 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34488 break;
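/* Worked example of the PEXT fold above (builtin assumed to be the one
   behind _pext_u32): __builtin_ia32_pext_si (0xb0, 0xf0) gathers the source
   bits selected by the mask into the low bits, folding to 0xb.  */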
34490 default:
34491 break;
34495 #ifdef SUBTARGET_FOLD_BUILTIN
34496 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
34497 #endif
34499 return NULL_TREE;
34502 /* Fold a MD builtin (use ix86_fold_builtin for folding into
34503 constant) in GIMPLE. */
34505 bool
34506 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
34508 gimple *stmt = gsi_stmt (*gsi);
34509 tree fndecl = gimple_call_fndecl (stmt);
34510 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
34511 int n_args = gimple_call_num_args (stmt);
34512 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
34513 tree decl = NULL_TREE;
34514 tree arg0, arg1;
34516 switch (fn_code)
34518 case IX86_BUILTIN_TZCNT32:
34519 decl = builtin_decl_implicit (BUILT_IN_CTZ);
34520 goto fold_tzcnt_lzcnt;
34522 case IX86_BUILTIN_TZCNT64:
34523 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
34524 goto fold_tzcnt_lzcnt;
34526 case IX86_BUILTIN_LZCNT32:
34527 decl = builtin_decl_implicit (BUILT_IN_CLZ);
34528 goto fold_tzcnt_lzcnt;
34530 case IX86_BUILTIN_LZCNT64:
34531 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
34532 goto fold_tzcnt_lzcnt;
34534 fold_tzcnt_lzcnt:
34535 gcc_assert (n_args == 1);
34536 arg0 = gimple_call_arg (stmt, 0);
34537 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
34539 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
34540 /* If arg0 is provably non-zero, optimize into the generic
34541 __builtin_c[tl]z{,ll} functions, which the middle-end handles
34542 better. */
34543 if (!expr_not_equal_to (arg0, wi::zero (prec)))
34544 return false;
34546 location_t loc = gimple_location (stmt);
34547 gimple *g = gimple_build_call (decl, 1, arg0);
34548 gimple_set_location (g, loc);
34549 tree lhs = make_ssa_name (integer_type_node);
34550 gimple_call_set_lhs (g, lhs);
34551 gsi_insert_before (gsi, g, GSI_SAME_STMT);
34552 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
34553 gimple_set_location (g, loc);
34554 gsi_replace (gsi, g, false);
34555 return true;
34557 break;
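/* Sketch of the rewrite above: when the argument is provably non-zero,
   e.g.

     if (x)
       n = __builtin_ia32_tzcnt_u32 (x);

   the call is replaced by the generic (int) __builtin_ctz (x), which the
   middle-end optimizes better.  */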
34559 case IX86_BUILTIN_BZHI32:
34560 case IX86_BUILTIN_BZHI64:
34561 gcc_assert (n_args == 2);
34562 arg1 = gimple_call_arg (stmt, 1);
34563 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
34565 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
34566 arg0 = gimple_call_arg (stmt, 0);
34567 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
34568 break;
34569 location_t loc = gimple_location (stmt);
34570 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34571 gimple_set_location (g, loc);
34572 gsi_replace (gsi, g, false);
34573 return true;
34575 break;
34577 case IX86_BUILTIN_PDEP32:
34578 case IX86_BUILTIN_PDEP64:
34579 case IX86_BUILTIN_PEXT32:
34580 case IX86_BUILTIN_PEXT64:
34581 gcc_assert (n_args == 2);
34582 arg1 = gimple_call_arg (stmt, 1);
34583 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
34585 location_t loc = gimple_location (stmt);
34586 arg0 = gimple_call_arg (stmt, 0);
34587 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34588 gimple_set_location (g, loc);
34589 gsi_replace (gsi, g, false);
34590 return true;
34592 break;
34594 default:
34595 break;
34598 return false;
34601 /* Make builtins to detect cpu type and features supported. NAME is
34602 the builtin name, CODE is the builtin code, and FTYPE is the function
34603 type of the builtin. IS_CONST is true if the builtin is marked const. */
34605 static void
34606 make_cpu_type_builtin (const char* name, int code,
34607 enum ix86_builtin_func_type ftype, bool is_const)
34609 tree decl;
34610 tree type;
34612 type = ix86_get_builtin_func_type (ftype);
34613 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
34614 NULL, NULL_TREE);
34615 gcc_assert (decl != NULL_TREE);
34616 ix86_builtins[(int) code] = decl;
34617 TREE_READONLY (decl) = is_const;
34620 /* Make builtins to get CPU type and features supported. The created
34621 builtins are:
34623 __builtin_cpu_init (), to detect cpu type and features,
34624 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
34625 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
34628 static void
34629 ix86_init_platform_type_builtins (void)
34631 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
34632 INT_FTYPE_VOID, false);
34633 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
34634 INT_FTYPE_PCCHAR, true);
34635 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
34636 INT_FTYPE_PCCHAR, true);
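/* Example use of the builtins created above (a sketch; the function name is
   hypothetical):

     int
     pick_impl (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("intel") && __builtin_cpu_supports ("avx2"))
         return 2;
       if (__builtin_cpu_supports ("sse4.2"))
         return 1;
       return 0;
     }
   */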
34639 /* Internal method for ix86_init_builtins. */
34641 static void
34642 ix86_init_builtins_va_builtins_abi (void)
34644 tree ms_va_ref, sysv_va_ref;
34645 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
34646 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
34647 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
34648 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
34650 if (!TARGET_64BIT)
34651 return;
34652 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
34653 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
34654 ms_va_ref = build_reference_type (ms_va_list_type_node);
34655 sysv_va_ref =
34656 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
34658 fnvoid_va_end_ms =
34659 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34660 fnvoid_va_start_ms =
34661 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34662 fnvoid_va_end_sysv =
34663 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
34664 fnvoid_va_start_sysv =
34665 build_varargs_function_type_list (void_type_node, sysv_va_ref,
34666 NULL_TREE);
34667 fnvoid_va_copy_ms =
34668 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
34669 NULL_TREE);
34670 fnvoid_va_copy_sysv =
34671 build_function_type_list (void_type_node, sysv_va_ref,
34672 sysv_va_ref, NULL_TREE);
34674 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
34675 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
34676 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
34677 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
34678 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
34679 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
34680 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
34681 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34682 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
34683 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34684 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
34685 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
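/* Example use of the __builtin_ms_va_* builtins registered above (a sketch;
   the function name is hypothetical):

     int __attribute__ ((ms_abi))
     sum_ms (int n, ...)
     {
       __builtin_ms_va_list ap;
       int i, s = 0;
       __builtin_ms_va_start (ap, n);
       for (i = 0; i < n; i++)
         s += __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return s;
     }
   */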
34688 static void
34689 ix86_init_builtin_types (void)
34691 tree float80_type_node, const_string_type_node;
34693 /* The __float80 type. */
34694 float80_type_node = long_double_type_node;
34695 if (TYPE_MODE (float80_type_node) != XFmode)
34697 if (float64x_type_node != NULL_TREE
34698 && TYPE_MODE (float64x_type_node) == XFmode)
34699 float80_type_node = float64x_type_node;
34700 else
34702 /* The __float80 type. */
34703 float80_type_node = make_node (REAL_TYPE);
34705 TYPE_PRECISION (float80_type_node) = 80;
34706 layout_type (float80_type_node);
34709 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
34711 /* The __float128 type. The node has already been created as
34712 _Float128, so we only need to register the __float128 name for
34713 it. */
34714 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
34716 const_string_type_node
34717 = build_pointer_type (build_qualified_type
34718 (char_type_node, TYPE_QUAL_CONST));
34720 /* This macro is built by i386-builtin-types.awk. */
34721 DEFINE_BUILTIN_PRIMITIVE_TYPES;
34724 static void
34725 ix86_init_builtins (void)
34727 tree ftype, decl;
34729 ix86_init_builtin_types ();
34731 /* Builtins to get CPU type and features. */
34732 ix86_init_platform_type_builtins ();
34734 /* TFmode support builtins. */
34735 def_builtin_const (0, "__builtin_infq",
34736 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
34737 def_builtin_const (0, "__builtin_huge_valq",
34738 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
34740 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
34741 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
34742 BUILT_IN_MD, "nanq", NULL_TREE);
34743 TREE_READONLY (decl) = 1;
34744 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
34746 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
34747 BUILT_IN_MD, "nansq", NULL_TREE);
34748 TREE_READONLY (decl) = 1;
34749 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
34751 /* We will expand them to a normal call if SSE isn't available since
34752 they are used by libgcc. */
34753 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
34754 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
34755 BUILT_IN_MD, "__fabstf2", NULL_TREE);
34756 TREE_READONLY (decl) = 1;
34757 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
34759 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
34760 decl = add_builtin_function ("__builtin_copysignq", ftype,
34761 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
34762 "__copysigntf3", NULL_TREE);
34763 TREE_READONLY (decl) = 1;
34764 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
34766 ix86_init_tm_builtins ();
34767 ix86_init_mmx_sse_builtins ();
34768 ix86_init_mpx_builtins ();
34770 if (TARGET_LP64)
34771 ix86_init_builtins_va_builtins_abi ();
34773 #ifdef SUBTARGET_INIT_BUILTINS
34774 SUBTARGET_INIT_BUILTINS;
34775 #endif
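/* Example use of the TFmode builtins registered above (sketch):

     __float128 inf  = __builtin_infq ();
     __float128 qnan = __builtin_nanq ("");
     __float128 neg  = __builtin_copysignq (__builtin_fabsq (qnan), -inf);

   Here neg is a quiet NaN with the sign bit set.  */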
34778 /* Return the ix86 builtin for CODE. */
34780 static tree
34781 ix86_builtin_decl (unsigned code, bool)
34783 if (code >= IX86_BUILTIN_MAX)
34784 return error_mark_node;
34786 return ix86_builtins[code];
34789 /* Errors in the source file can cause expand_expr to return const0_rtx
34790 where we expect a vector. To avoid crashing, use one of the vector
34791 clear instructions. */
34792 static rtx
34793 safe_vector_operand (rtx x, machine_mode mode)
34795 if (x == const0_rtx)
34796 x = CONST0_RTX (mode);
34797 return x;
34800 /* Fix up modeless constants to fit the required mode. */
34801 static rtx
34802 fixup_modeless_constant (rtx x, machine_mode mode)
34804 if (GET_MODE (x) == VOIDmode)
34805 x = convert_to_mode (mode, x, 1);
34806 return x;
34809 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
34811 static rtx
34812 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
34814 rtx pat;
34815 tree arg0 = CALL_EXPR_ARG (exp, 0);
34816 tree arg1 = CALL_EXPR_ARG (exp, 1);
34817 rtx op0 = expand_normal (arg0);
34818 rtx op1 = expand_normal (arg1);
34819 machine_mode tmode = insn_data[icode].operand[0].mode;
34820 machine_mode mode0 = insn_data[icode].operand[1].mode;
34821 machine_mode mode1 = insn_data[icode].operand[2].mode;
34823 if (VECTOR_MODE_P (mode0))
34824 op0 = safe_vector_operand (op0, mode0);
34825 if (VECTOR_MODE_P (mode1))
34826 op1 = safe_vector_operand (op1, mode1);
34828 if (optimize || !target
34829 || GET_MODE (target) != tmode
34830 || !insn_data[icode].operand[0].predicate (target, tmode))
34831 target = gen_reg_rtx (tmode);
34833 if (GET_MODE (op1) == SImode && mode1 == TImode)
34835 rtx x = gen_reg_rtx (V4SImode);
34836 emit_insn (gen_sse2_loadd (x, op1));
34837 op1 = gen_lowpart (TImode, x);
34840 if (!insn_data[icode].operand[1].predicate (op0, mode0))
34841 op0 = copy_to_mode_reg (mode0, op0);
34842 if (!insn_data[icode].operand[2].predicate (op1, mode1))
34843 op1 = copy_to_mode_reg (mode1, op1);
34845 pat = GEN_FCN (icode) (target, op0, op1);
34846 if (! pat)
34847 return 0;
34849 emit_insn (pat);
34851 return target;
34854 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
34856 static rtx
34857 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
34858 enum ix86_builtin_func_type m_type,
34859 enum rtx_code sub_code)
34861 rtx pat;
34862 int i;
34863 int nargs;
34864 bool comparison_p = false;
34865 bool tf_p = false;
34866 bool last_arg_constant = false;
34867 int num_memory = 0;
34868 struct {
34869 rtx op;
34870 machine_mode mode;
34871 } args[4];
34873 machine_mode tmode = insn_data[icode].operand[0].mode;
34875 switch (m_type)
34877 case MULTI_ARG_4_DF2_DI_I:
34878 case MULTI_ARG_4_DF2_DI_I1:
34879 case MULTI_ARG_4_SF2_SI_I:
34880 case MULTI_ARG_4_SF2_SI_I1:
34881 nargs = 4;
34882 last_arg_constant = true;
34883 break;
34885 case MULTI_ARG_3_SF:
34886 case MULTI_ARG_3_DF:
34887 case MULTI_ARG_3_SF2:
34888 case MULTI_ARG_3_DF2:
34889 case MULTI_ARG_3_DI:
34890 case MULTI_ARG_3_SI:
34891 case MULTI_ARG_3_SI_DI:
34892 case MULTI_ARG_3_HI:
34893 case MULTI_ARG_3_HI_SI:
34894 case MULTI_ARG_3_QI:
34895 case MULTI_ARG_3_DI2:
34896 case MULTI_ARG_3_SI2:
34897 case MULTI_ARG_3_HI2:
34898 case MULTI_ARG_3_QI2:
34899 nargs = 3;
34900 break;
34902 case MULTI_ARG_2_SF:
34903 case MULTI_ARG_2_DF:
34904 case MULTI_ARG_2_DI:
34905 case MULTI_ARG_2_SI:
34906 case MULTI_ARG_2_HI:
34907 case MULTI_ARG_2_QI:
34908 nargs = 2;
34909 break;
34911 case MULTI_ARG_2_DI_IMM:
34912 case MULTI_ARG_2_SI_IMM:
34913 case MULTI_ARG_2_HI_IMM:
34914 case MULTI_ARG_2_QI_IMM:
34915 nargs = 2;
34916 last_arg_constant = true;
34917 break;
34919 case MULTI_ARG_1_SF:
34920 case MULTI_ARG_1_DF:
34921 case MULTI_ARG_1_SF2:
34922 case MULTI_ARG_1_DF2:
34923 case MULTI_ARG_1_DI:
34924 case MULTI_ARG_1_SI:
34925 case MULTI_ARG_1_HI:
34926 case MULTI_ARG_1_QI:
34927 case MULTI_ARG_1_SI_DI:
34928 case MULTI_ARG_1_HI_DI:
34929 case MULTI_ARG_1_HI_SI:
34930 case MULTI_ARG_1_QI_DI:
34931 case MULTI_ARG_1_QI_SI:
34932 case MULTI_ARG_1_QI_HI:
34933 nargs = 1;
34934 break;
34936 case MULTI_ARG_2_DI_CMP:
34937 case MULTI_ARG_2_SI_CMP:
34938 case MULTI_ARG_2_HI_CMP:
34939 case MULTI_ARG_2_QI_CMP:
34940 nargs = 2;
34941 comparison_p = true;
34942 break;
34944 case MULTI_ARG_2_SF_TF:
34945 case MULTI_ARG_2_DF_TF:
34946 case MULTI_ARG_2_DI_TF:
34947 case MULTI_ARG_2_SI_TF:
34948 case MULTI_ARG_2_HI_TF:
34949 case MULTI_ARG_2_QI_TF:
34950 nargs = 2;
34951 tf_p = true;
34952 break;
34954 default:
34955 gcc_unreachable ();
34958 if (optimize || !target
34959 || GET_MODE (target) != tmode
34960 || !insn_data[icode].operand[0].predicate (target, tmode))
34961 target = gen_reg_rtx (tmode);
34962 else if (memory_operand (target, tmode))
34963 num_memory++;
34965 gcc_assert (nargs <= 4);
34967 for (i = 0; i < nargs; i++)
34969 tree arg = CALL_EXPR_ARG (exp, i);
34970 rtx op = expand_normal (arg);
34971 int adjust = (comparison_p) ? 1 : 0;
34972 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
34974 if (last_arg_constant && i == nargs - 1)
34976 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
34978 enum insn_code new_icode = icode;
34979 switch (icode)
34981 case CODE_FOR_xop_vpermil2v2df3:
34982 case CODE_FOR_xop_vpermil2v4sf3:
34983 case CODE_FOR_xop_vpermil2v4df3:
34984 case CODE_FOR_xop_vpermil2v8sf3:
34985 error ("the last argument must be a 2-bit immediate");
34986 return gen_reg_rtx (tmode);
34987 case CODE_FOR_xop_rotlv2di3:
34988 new_icode = CODE_FOR_rotlv2di3;
34989 goto xop_rotl;
34990 case CODE_FOR_xop_rotlv4si3:
34991 new_icode = CODE_FOR_rotlv4si3;
34992 goto xop_rotl;
34993 case CODE_FOR_xop_rotlv8hi3:
34994 new_icode = CODE_FOR_rotlv8hi3;
34995 goto xop_rotl;
34996 case CODE_FOR_xop_rotlv16qi3:
34997 new_icode = CODE_FOR_rotlv16qi3;
34998 xop_rotl:
34999 if (CONST_INT_P (op))
35001 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
35002 op = GEN_INT (INTVAL (op) & mask);
35003 gcc_checking_assert
35004 (insn_data[icode].operand[i + 1].predicate (op, mode));
35006 else
35008 gcc_checking_assert
35009 (nargs == 2
35010 && insn_data[new_icode].operand[0].mode == tmode
35011 && insn_data[new_icode].operand[1].mode == tmode
35012 && insn_data[new_icode].operand[2].mode == mode
35013 && insn_data[new_icode].operand[0].predicate
35014 == insn_data[icode].operand[0].predicate
35015 && insn_data[new_icode].operand[1].predicate
35016 == insn_data[icode].operand[1].predicate);
35017 icode = new_icode;
35018 goto non_constant;
35020 break;
35021 default:
35022 gcc_unreachable ();
35026 else
35028 non_constant:
35029 if (VECTOR_MODE_P (mode))
35030 op = safe_vector_operand (op, mode);
35032 /* If we aren't optimizing, only allow one memory operand to be
35033 generated. */
35034 if (memory_operand (op, mode))
35035 num_memory++;
35037 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
35039 if (optimize
35040 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
35041 || num_memory > 1)
35042 op = force_reg (mode, op);
35045 args[i].op = op;
35046 args[i].mode = mode;
35049 switch (nargs)
35051 case 1:
35052 pat = GEN_FCN (icode) (target, args[0].op);
35053 break;
35055 case 2:
35056 if (tf_p)
35057 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35058 GEN_INT ((int)sub_code));
35059 else if (! comparison_p)
35060 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35061 else
35063 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
35064 args[0].op,
35065 args[1].op);
35067 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
35069 break;
35071 case 3:
35072 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35073 break;
35075 case 4:
35076 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
35077 break;
35079 default:
35080 gcc_unreachable ();
35083 if (! pat)
35084 return 0;
35086 emit_insn (pat);
35087 return target;
35090 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
35091 insns with vec_merge. */
35093 static rtx
35094 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
35095 rtx target)
35097 rtx pat;
35098 tree arg0 = CALL_EXPR_ARG (exp, 0);
35099 rtx op1, op0 = expand_normal (arg0);
35100 machine_mode tmode = insn_data[icode].operand[0].mode;
35101 machine_mode mode0 = insn_data[icode].operand[1].mode;
35103 if (optimize || !target
35104 || GET_MODE (target) != tmode
35105 || !insn_data[icode].operand[0].predicate (target, tmode))
35106 target = gen_reg_rtx (tmode);
35108 if (VECTOR_MODE_P (mode0))
35109 op0 = safe_vector_operand (op0, mode0);
35111 if ((optimize && !register_operand (op0, mode0))
35112 || !insn_data[icode].operand[1].predicate (op0, mode0))
35113 op0 = copy_to_mode_reg (mode0, op0);
35115 op1 = op0;
35116 if (!insn_data[icode].operand[2].predicate (op1, mode0))
35117 op1 = copy_to_mode_reg (mode0, op1);
35119 pat = GEN_FCN (icode) (target, op0, op1);
35120 if (! pat)
35121 return 0;
35122 emit_insn (pat);
35123 return target;
35126 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
35128 static rtx
35129 ix86_expand_sse_compare (const struct builtin_description *d,
35130 tree exp, rtx target, bool swap)
35132 rtx pat;
35133 tree arg0 = CALL_EXPR_ARG (exp, 0);
35134 tree arg1 = CALL_EXPR_ARG (exp, 1);
35135 rtx op0 = expand_normal (arg0);
35136 rtx op1 = expand_normal (arg1);
35137 rtx op2;
35138 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35139 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35140 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35141 enum rtx_code comparison = d->comparison;
35143 if (VECTOR_MODE_P (mode0))
35144 op0 = safe_vector_operand (op0, mode0);
35145 if (VECTOR_MODE_P (mode1))
35146 op1 = safe_vector_operand (op1, mode1);
35148 /* Swap operands if we have a comparison that isn't available in
35149 hardware. */
35150 if (swap)
35151 std::swap (op0, op1);
35153 if (optimize || !target
35154 || GET_MODE (target) != tmode
35155 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35156 target = gen_reg_rtx (tmode);
35158 if ((optimize && !register_operand (op0, mode0))
35159 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
35160 op0 = copy_to_mode_reg (mode0, op0);
35161 if ((optimize && !register_operand (op1, mode1))
35162 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
35163 op1 = copy_to_mode_reg (mode1, op1);
35165 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
35166 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35167 if (! pat)
35168 return 0;
35169 emit_insn (pat);
35170 return target;
35173 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
35175 static rtx
35176 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
35177 rtx target)
35179 rtx pat;
35180 tree arg0 = CALL_EXPR_ARG (exp, 0);
35181 tree arg1 = CALL_EXPR_ARG (exp, 1);
35182 rtx op0 = expand_normal (arg0);
35183 rtx op1 = expand_normal (arg1);
35184 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35185 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35186 enum rtx_code comparison = d->comparison;
35188 if (VECTOR_MODE_P (mode0))
35189 op0 = safe_vector_operand (op0, mode0);
35190 if (VECTOR_MODE_P (mode1))
35191 op1 = safe_vector_operand (op1, mode1);
35193 /* Swap operands if we have a comparison that isn't available in
35194 hardware. */
35195 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
35196 std::swap (op0, op1);
35198 target = gen_reg_rtx (SImode);
35199 emit_move_insn (target, const0_rtx);
35200 target = gen_rtx_SUBREG (QImode, target, 0);
35202 if ((optimize && !register_operand (op0, mode0))
35203 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35204 op0 = copy_to_mode_reg (mode0, op0);
35205 if ((optimize && !register_operand (op1, mode1))
35206 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35207 op1 = copy_to_mode_reg (mode1, op1);
35209 pat = GEN_FCN (d->icode) (op0, op1);
35210 if (! pat)
35211 return 0;
35212 emit_insn (pat);
35213 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35214 gen_rtx_fmt_ee (comparison, QImode,
35215 SET_DEST (pat),
35216 const0_rtx)));
35218 return SUBREG_REG (target);
35221 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
35223 static rtx
35224 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
35225 rtx target)
35227 rtx pat;
35228 tree arg0 = CALL_EXPR_ARG (exp, 0);
35229 rtx op1, op0 = expand_normal (arg0);
35230 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35231 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35233 if (optimize || target == 0
35234 || GET_MODE (target) != tmode
35235 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35236 target = gen_reg_rtx (tmode);
35238 if (VECTOR_MODE_P (mode0))
35239 op0 = safe_vector_operand (op0, mode0);
35241 if ((optimize && !register_operand (op0, mode0))
35242 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35243 op0 = copy_to_mode_reg (mode0, op0);
35245 op1 = GEN_INT (d->comparison);
35247 pat = GEN_FCN (d->icode) (target, op0, op1);
35248 if (! pat)
35249 return 0;
35250 emit_insn (pat);
35251 return target;
35254 static rtx
35255 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
35256 tree exp, rtx target)
35258 rtx pat;
35259 tree arg0 = CALL_EXPR_ARG (exp, 0);
35260 tree arg1 = CALL_EXPR_ARG (exp, 1);
35261 rtx op0 = expand_normal (arg0);
35262 rtx op1 = expand_normal (arg1);
35263 rtx op2;
35264 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35265 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35266 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35268 if (optimize || target == 0
35269 || GET_MODE (target) != tmode
35270 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35271 target = gen_reg_rtx (tmode);
35273 op0 = safe_vector_operand (op0, mode0);
35274 op1 = safe_vector_operand (op1, mode1);
35276 if ((optimize && !register_operand (op0, mode0))
35277 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35278 op0 = copy_to_mode_reg (mode0, op0);
35279 if ((optimize && !register_operand (op1, mode1))
35280 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35281 op1 = copy_to_mode_reg (mode1, op1);
35283 op2 = GEN_INT (d->comparison);
35285 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35286 if (! pat)
35287 return 0;
35288 emit_insn (pat);
35289 return target;
35292 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
35294 static rtx
35295 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
35296 rtx target)
35298 rtx pat;
35299 tree arg0 = CALL_EXPR_ARG (exp, 0);
35300 tree arg1 = CALL_EXPR_ARG (exp, 1);
35301 rtx op0 = expand_normal (arg0);
35302 rtx op1 = expand_normal (arg1);
35303 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35304 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35305 enum rtx_code comparison = d->comparison;
35307 if (VECTOR_MODE_P (mode0))
35308 op0 = safe_vector_operand (op0, mode0);
35309 if (VECTOR_MODE_P (mode1))
35310 op1 = safe_vector_operand (op1, mode1);
35312 target = gen_reg_rtx (SImode);
35313 emit_move_insn (target, const0_rtx);
35314 target = gen_rtx_SUBREG (QImode, target, 0);
35316 if ((optimize && !register_operand (op0, mode0))
35317 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35318 op0 = copy_to_mode_reg (mode0, op0);
35319 if ((optimize && !register_operand (op1, mode1))
35320 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35321 op1 = copy_to_mode_reg (mode1, op1);
35323 pat = GEN_FCN (d->icode) (op0, op1);
35324 if (! pat)
35325 return 0;
35326 emit_insn (pat);
35327 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35328 gen_rtx_fmt_ee (comparison, QImode,
35329 SET_DEST (pat),
35330 const0_rtx)));
35332 return SUBREG_REG (target);
35335 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
35337 static rtx
35338 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
35339 tree exp, rtx target)
35341 rtx pat;
35342 tree arg0 = CALL_EXPR_ARG (exp, 0);
35343 tree arg1 = CALL_EXPR_ARG (exp, 1);
35344 tree arg2 = CALL_EXPR_ARG (exp, 2);
35345 tree arg3 = CALL_EXPR_ARG (exp, 3);
35346 tree arg4 = CALL_EXPR_ARG (exp, 4);
35347 rtx scratch0, scratch1;
35348 rtx op0 = expand_normal (arg0);
35349 rtx op1 = expand_normal (arg1);
35350 rtx op2 = expand_normal (arg2);
35351 rtx op3 = expand_normal (arg3);
35352 rtx op4 = expand_normal (arg4);
35353 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
35355 tmode0 = insn_data[d->icode].operand[0].mode;
35356 tmode1 = insn_data[d->icode].operand[1].mode;
35357 modev2 = insn_data[d->icode].operand[2].mode;
35358 modei3 = insn_data[d->icode].operand[3].mode;
35359 modev4 = insn_data[d->icode].operand[4].mode;
35360 modei5 = insn_data[d->icode].operand[5].mode;
35361 modeimm = insn_data[d->icode].operand[6].mode;
35363 if (VECTOR_MODE_P (modev2))
35364 op0 = safe_vector_operand (op0, modev2);
35365 if (VECTOR_MODE_P (modev4))
35366 op2 = safe_vector_operand (op2, modev4);
35368 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35369 op0 = copy_to_mode_reg (modev2, op0);
35370 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
35371 op1 = copy_to_mode_reg (modei3, op1);
35372 if ((optimize && !register_operand (op2, modev4))
35373 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
35374 op2 = copy_to_mode_reg (modev4, op2);
35375 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
35376 op3 = copy_to_mode_reg (modei5, op3);
35378 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
35380 error ("the fifth argument must be an 8-bit immediate");
35381 return const0_rtx;
35384 if (d->code == IX86_BUILTIN_PCMPESTRI128)
35386 if (optimize || !target
35387 || GET_MODE (target) != tmode0
35388 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35389 target = gen_reg_rtx (tmode0);
35391 scratch1 = gen_reg_rtx (tmode1);
35393 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
35395 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
35397 if (optimize || !target
35398 || GET_MODE (target) != tmode1
35399 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35400 target = gen_reg_rtx (tmode1);
35402 scratch0 = gen_reg_rtx (tmode0);
35404 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
35406 else
35408 gcc_assert (d->flag);
35410 scratch0 = gen_reg_rtx (tmode0);
35411 scratch1 = gen_reg_rtx (tmode1);
35413 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
35416 if (! pat)
35417 return 0;
35419 emit_insn (pat);
35421 if (d->flag)
35423 target = gen_reg_rtx (SImode);
35424 emit_move_insn (target, const0_rtx);
35425 target = gen_rtx_SUBREG (QImode, target, 0);
35427 emit_insn
35428 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35429 gen_rtx_fmt_ee (EQ, QImode,
35430 gen_rtx_REG ((machine_mode) d->flag,
35431 FLAGS_REG),
35432 const0_rtx)));
35433 return SUBREG_REG (target);
35435 else
35436 return target;
35440 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
35442 static rtx
35443 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
35444 tree exp, rtx target)
35446 rtx pat;
35447 tree arg0 = CALL_EXPR_ARG (exp, 0);
35448 tree arg1 = CALL_EXPR_ARG (exp, 1);
35449 tree arg2 = CALL_EXPR_ARG (exp, 2);
35450 rtx scratch0, scratch1;
35451 rtx op0 = expand_normal (arg0);
35452 rtx op1 = expand_normal (arg1);
35453 rtx op2 = expand_normal (arg2);
35454 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
35456 tmode0 = insn_data[d->icode].operand[0].mode;
35457 tmode1 = insn_data[d->icode].operand[1].mode;
35458 modev2 = insn_data[d->icode].operand[2].mode;
35459 modev3 = insn_data[d->icode].operand[3].mode;
35460 modeimm = insn_data[d->icode].operand[4].mode;
35462 if (VECTOR_MODE_P (modev2))
35463 op0 = safe_vector_operand (op0, modev2);
35464 if (VECTOR_MODE_P (modev3))
35465 op1 = safe_vector_operand (op1, modev3);
35467 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35468 op0 = copy_to_mode_reg (modev2, op0);
35469 if ((optimize && !register_operand (op1, modev3))
35470 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
35471 op1 = copy_to_mode_reg (modev3, op1);
35473 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
35475 error ("the third argument must be an 8-bit immediate");
35476 return const0_rtx;
35479 if (d->code == IX86_BUILTIN_PCMPISTRI128)
35481 if (optimize || !target
35482 || GET_MODE (target) != tmode0
35483 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35484 target = gen_reg_rtx (tmode0);
35486 scratch1 = gen_reg_rtx (tmode1);
35488 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
35490 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
35492 if (optimize || !target
35493 || GET_MODE (target) != tmode1
35494 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35495 target = gen_reg_rtx (tmode1);
35497 scratch0 = gen_reg_rtx (tmode0);
35499 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
35501 else
35503 gcc_assert (d->flag);
35505 scratch0 = gen_reg_rtx (tmode0);
35506 scratch1 = gen_reg_rtx (tmode1);
35508 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
35511 if (! pat)
35512 return 0;
35514 emit_insn (pat);
35516 if (d->flag)
35518 target = gen_reg_rtx (SImode);
35519 emit_move_insn (target, const0_rtx);
35520 target = gen_rtx_SUBREG (QImode, target, 0);
35522 emit_insn
35523 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35524 gen_rtx_fmt_ee (EQ, QImode,
35525 gen_rtx_REG ((machine_mode) d->flag,
35526 FLAGS_REG),
35527 const0_rtx)));
35528 return SUBREG_REG (target);
35530 else
35531 return target;
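/* Editor's illustrative note (not part of the original source): the
   implicit-length variant handled above is reached from code such as
   (assuming <nmmintrin.h>)

       #include <nmmintrin.h>

       __m128i
       match_mask (__m128i a, __m128i b)
       {
         return _mm_cmpistrm (a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
       }

   which corresponds to the IX86_BUILTIN_PCMPISTRM128 case.  */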
35534 /* Subroutine of ix86_expand_builtin to take care of insns with
35535 a variable number of operands. */
35537 static rtx
35538 ix86_expand_args_builtin (const struct builtin_description *d,
35539 tree exp, rtx target)
35541 rtx pat, real_target;
35542 unsigned int i, nargs;
35543 unsigned int nargs_constant = 0;
35544 unsigned int mask_pos = 0;
35545 int num_memory = 0;
35546 struct
35548 rtx op;
35549 machine_mode mode;
35550 } args[6];
35551 bool second_arg_count = false;
35552 enum insn_code icode = d->icode;
35553 const struct insn_data_d *insn_p = &insn_data[icode];
35554 machine_mode tmode = insn_p->operand[0].mode;
35555 machine_mode rmode = VOIDmode;
35556 bool swap = false;
35557 enum rtx_code comparison = d->comparison;
35559 switch ((enum ix86_builtin_func_type) d->flag)
35561 case V2DF_FTYPE_V2DF_ROUND:
35562 case V4DF_FTYPE_V4DF_ROUND:
35563 case V8DF_FTYPE_V8DF_ROUND:
35564 case V4SF_FTYPE_V4SF_ROUND:
35565 case V8SF_FTYPE_V8SF_ROUND:
35566 case V16SF_FTYPE_V16SF_ROUND:
35567 case V4SI_FTYPE_V4SF_ROUND:
35568 case V8SI_FTYPE_V8SF_ROUND:
35569 case V16SI_FTYPE_V16SF_ROUND:
35570 return ix86_expand_sse_round (d, exp, target);
35571 case V4SI_FTYPE_V2DF_V2DF_ROUND:
35572 case V8SI_FTYPE_V4DF_V4DF_ROUND:
35573 case V16SI_FTYPE_V8DF_V8DF_ROUND:
35574 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
35575 case INT_FTYPE_V8SF_V8SF_PTEST:
35576 case INT_FTYPE_V4DI_V4DI_PTEST:
35577 case INT_FTYPE_V4DF_V4DF_PTEST:
35578 case INT_FTYPE_V4SF_V4SF_PTEST:
35579 case INT_FTYPE_V2DI_V2DI_PTEST:
35580 case INT_FTYPE_V2DF_V2DF_PTEST:
35581 return ix86_expand_sse_ptest (d, exp, target);
35582 case FLOAT128_FTYPE_FLOAT128:
35583 case FLOAT_FTYPE_FLOAT:
35584 case INT_FTYPE_INT:
35585 case UINT_FTYPE_UINT:
35586 case UINT16_FTYPE_UINT16:
35587 case UINT64_FTYPE_INT:
35588 case UINT64_FTYPE_UINT64:
35589 case INT64_FTYPE_INT64:
35590 case INT64_FTYPE_V4SF:
35591 case INT64_FTYPE_V2DF:
35592 case INT_FTYPE_V16QI:
35593 case INT_FTYPE_V8QI:
35594 case INT_FTYPE_V8SF:
35595 case INT_FTYPE_V4DF:
35596 case INT_FTYPE_V4SF:
35597 case INT_FTYPE_V2DF:
35598 case INT_FTYPE_V32QI:
35599 case V16QI_FTYPE_V16QI:
35600 case V8SI_FTYPE_V8SF:
35601 case V8SI_FTYPE_V4SI:
35602 case V8HI_FTYPE_V8HI:
35603 case V8HI_FTYPE_V16QI:
35604 case V8QI_FTYPE_V8QI:
35605 case V8SF_FTYPE_V8SF:
35606 case V8SF_FTYPE_V8SI:
35607 case V8SF_FTYPE_V4SF:
35608 case V8SF_FTYPE_V8HI:
35609 case V4SI_FTYPE_V4SI:
35610 case V4SI_FTYPE_V16QI:
35611 case V4SI_FTYPE_V4SF:
35612 case V4SI_FTYPE_V8SI:
35613 case V4SI_FTYPE_V8HI:
35614 case V4SI_FTYPE_V4DF:
35615 case V4SI_FTYPE_V2DF:
35616 case V4HI_FTYPE_V4HI:
35617 case V4DF_FTYPE_V4DF:
35618 case V4DF_FTYPE_V4SI:
35619 case V4DF_FTYPE_V4SF:
35620 case V4DF_FTYPE_V2DF:
35621 case V4SF_FTYPE_V4SF:
35622 case V4SF_FTYPE_V4SI:
35623 case V4SF_FTYPE_V8SF:
35624 case V4SF_FTYPE_V4DF:
35625 case V4SF_FTYPE_V8HI:
35626 case V4SF_FTYPE_V2DF:
35627 case V2DI_FTYPE_V2DI:
35628 case V2DI_FTYPE_V16QI:
35629 case V2DI_FTYPE_V8HI:
35630 case V2DI_FTYPE_V4SI:
35631 case V2DF_FTYPE_V2DF:
35632 case V2DF_FTYPE_V4SI:
35633 case V2DF_FTYPE_V4DF:
35634 case V2DF_FTYPE_V4SF:
35635 case V2DF_FTYPE_V2SI:
35636 case V2SI_FTYPE_V2SI:
35637 case V2SI_FTYPE_V4SF:
35638 case V2SI_FTYPE_V2SF:
35639 case V2SI_FTYPE_V2DF:
35640 case V2SF_FTYPE_V2SF:
35641 case V2SF_FTYPE_V2SI:
35642 case V32QI_FTYPE_V32QI:
35643 case V32QI_FTYPE_V16QI:
35644 case V16HI_FTYPE_V16HI:
35645 case V16HI_FTYPE_V8HI:
35646 case V8SI_FTYPE_V8SI:
35647 case V16HI_FTYPE_V16QI:
35648 case V8SI_FTYPE_V16QI:
35649 case V4DI_FTYPE_V16QI:
35650 case V8SI_FTYPE_V8HI:
35651 case V4DI_FTYPE_V8HI:
35652 case V4DI_FTYPE_V4SI:
35653 case V4DI_FTYPE_V2DI:
35654 case UQI_FTYPE_UQI:
35655 case UHI_FTYPE_UHI:
35656 case USI_FTYPE_USI:
35657 case USI_FTYPE_UQI:
35658 case USI_FTYPE_UHI:
35659 case UDI_FTYPE_UDI:
35660 case UHI_FTYPE_V16QI:
35661 case USI_FTYPE_V32QI:
35662 case UDI_FTYPE_V64QI:
35663 case V16QI_FTYPE_UHI:
35664 case V32QI_FTYPE_USI:
35665 case V64QI_FTYPE_UDI:
35666 case V8HI_FTYPE_UQI:
35667 case V16HI_FTYPE_UHI:
35668 case V32HI_FTYPE_USI:
35669 case V4SI_FTYPE_UQI:
35670 case V8SI_FTYPE_UQI:
35671 case V4SI_FTYPE_UHI:
35672 case V8SI_FTYPE_UHI:
35673 case UQI_FTYPE_V8HI:
35674 case UHI_FTYPE_V16HI:
35675 case USI_FTYPE_V32HI:
35676 case UQI_FTYPE_V4SI:
35677 case UQI_FTYPE_V8SI:
35678 case UHI_FTYPE_V16SI:
35679 case UQI_FTYPE_V2DI:
35680 case UQI_FTYPE_V4DI:
35681 case UQI_FTYPE_V8DI:
35682 case V16SI_FTYPE_UHI:
35683 case V2DI_FTYPE_UQI:
35684 case V4DI_FTYPE_UQI:
35685 case V16SI_FTYPE_INT:
35686 case V16SF_FTYPE_V8SF:
35687 case V16SI_FTYPE_V8SI:
35688 case V16SF_FTYPE_V4SF:
35689 case V16SI_FTYPE_V4SI:
35690 case V16SI_FTYPE_V16SF:
35691 case V16SI_FTYPE_V16SI:
35692 case V16SF_FTYPE_V16SF:
35693 case V8DI_FTYPE_UQI:
35694 case V8DI_FTYPE_V8DI:
35695 case V8DF_FTYPE_V4DF:
35696 case V8DF_FTYPE_V2DF:
35697 case V8DF_FTYPE_V8DF:
35698 nargs = 1;
35699 break;
35700 case V4SF_FTYPE_V4SF_VEC_MERGE:
35701 case V2DF_FTYPE_V2DF_VEC_MERGE:
35702 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
35703 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
35704 case V16QI_FTYPE_V16QI_V16QI:
35705 case V16QI_FTYPE_V8HI_V8HI:
35706 case V16SF_FTYPE_V16SF_V16SF:
35707 case V8QI_FTYPE_V8QI_V8QI:
35708 case V8QI_FTYPE_V4HI_V4HI:
35709 case V8HI_FTYPE_V8HI_V8HI:
35710 case V8HI_FTYPE_V16QI_V16QI:
35711 case V8HI_FTYPE_V4SI_V4SI:
35712 case V8SF_FTYPE_V8SF_V8SF:
35713 case V8SF_FTYPE_V8SF_V8SI:
35714 case V8DF_FTYPE_V8DF_V8DF:
35715 case V4SI_FTYPE_V4SI_V4SI:
35716 case V4SI_FTYPE_V8HI_V8HI:
35717 case V4SI_FTYPE_V2DF_V2DF:
35718 case V4HI_FTYPE_V4HI_V4HI:
35719 case V4HI_FTYPE_V8QI_V8QI:
35720 case V4HI_FTYPE_V2SI_V2SI:
35721 case V4DF_FTYPE_V4DF_V4DF:
35722 case V4DF_FTYPE_V4DF_V4DI:
35723 case V4SF_FTYPE_V4SF_V4SF:
35724 case V4SF_FTYPE_V4SF_V4SI:
35725 case V4SF_FTYPE_V4SF_V2SI:
35726 case V4SF_FTYPE_V4SF_V2DF:
35727 case V4SF_FTYPE_V4SF_UINT:
35728 case V4SF_FTYPE_V4SF_DI:
35729 case V4SF_FTYPE_V4SF_SI:
35730 case V2DI_FTYPE_V2DI_V2DI:
35731 case V2DI_FTYPE_V16QI_V16QI:
35732 case V2DI_FTYPE_V4SI_V4SI:
35733 case V2DI_FTYPE_V2DI_V16QI:
35734 case V2SI_FTYPE_V2SI_V2SI:
35735 case V2SI_FTYPE_V4HI_V4HI:
35736 case V2SI_FTYPE_V2SF_V2SF:
35737 case V2DF_FTYPE_V2DF_V2DF:
35738 case V2DF_FTYPE_V2DF_V4SF:
35739 case V2DF_FTYPE_V2DF_V2DI:
35740 case V2DF_FTYPE_V2DF_DI:
35741 case V2DF_FTYPE_V2DF_SI:
35742 case V2DF_FTYPE_V2DF_UINT:
35743 case V2SF_FTYPE_V2SF_V2SF:
35744 case V1DI_FTYPE_V1DI_V1DI:
35745 case V1DI_FTYPE_V8QI_V8QI:
35746 case V1DI_FTYPE_V2SI_V2SI:
35747 case V32QI_FTYPE_V16HI_V16HI:
35748 case V16HI_FTYPE_V8SI_V8SI:
35749 case V32QI_FTYPE_V32QI_V32QI:
35750 case V16HI_FTYPE_V32QI_V32QI:
35751 case V16HI_FTYPE_V16HI_V16HI:
35752 case V8SI_FTYPE_V4DF_V4DF:
35753 case V8SI_FTYPE_V8SI_V8SI:
35754 case V8SI_FTYPE_V16HI_V16HI:
35755 case V4DI_FTYPE_V4DI_V4DI:
35756 case V4DI_FTYPE_V8SI_V8SI:
35757 case V8DI_FTYPE_V64QI_V64QI:
35758 if (comparison == UNKNOWN)
35759 return ix86_expand_binop_builtin (icode, exp, target);
35760 nargs = 2;
35761 break;
35762 case V4SF_FTYPE_V4SF_V4SF_SWAP:
35763 case V2DF_FTYPE_V2DF_V2DF_SWAP:
35764 gcc_assert (comparison != UNKNOWN);
35765 nargs = 2;
35766 swap = true;
35767 break;
35768 case V16HI_FTYPE_V16HI_V8HI_COUNT:
35769 case V16HI_FTYPE_V16HI_SI_COUNT:
35770 case V8SI_FTYPE_V8SI_V4SI_COUNT:
35771 case V8SI_FTYPE_V8SI_SI_COUNT:
35772 case V4DI_FTYPE_V4DI_V2DI_COUNT:
35773 case V4DI_FTYPE_V4DI_INT_COUNT:
35774 case V8HI_FTYPE_V8HI_V8HI_COUNT:
35775 case V8HI_FTYPE_V8HI_SI_COUNT:
35776 case V4SI_FTYPE_V4SI_V4SI_COUNT:
35777 case V4SI_FTYPE_V4SI_SI_COUNT:
35778 case V4HI_FTYPE_V4HI_V4HI_COUNT:
35779 case V4HI_FTYPE_V4HI_SI_COUNT:
35780 case V2DI_FTYPE_V2DI_V2DI_COUNT:
35781 case V2DI_FTYPE_V2DI_SI_COUNT:
35782 case V2SI_FTYPE_V2SI_V2SI_COUNT:
35783 case V2SI_FTYPE_V2SI_SI_COUNT:
35784 case V1DI_FTYPE_V1DI_V1DI_COUNT:
35785 case V1DI_FTYPE_V1DI_SI_COUNT:
35786 nargs = 2;
35787 second_arg_count = true;
35788 break;
35789 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
35790 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
35791 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
35792 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
35793 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
35794 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
35795 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
35796 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
35797 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
35798 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
35799 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
35800 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
35801 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
35802 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
35803 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
35804 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
35805 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
35806 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
35807 nargs = 4;
35808 second_arg_count = true;
35809 break;
35810 case UINT64_FTYPE_UINT64_UINT64:
35811 case UINT_FTYPE_UINT_UINT:
35812 case UINT_FTYPE_UINT_USHORT:
35813 case UINT_FTYPE_UINT_UCHAR:
35814 case UINT16_FTYPE_UINT16_INT:
35815 case UINT8_FTYPE_UINT8_INT:
35816 case UQI_FTYPE_UQI_UQI:
35817 case UHI_FTYPE_UHI_UHI:
35818 case USI_FTYPE_USI_USI:
35819 case UDI_FTYPE_UDI_UDI:
35820 case V16SI_FTYPE_V8DF_V8DF:
35821 nargs = 2;
35822 break;
35823 case V2DI_FTYPE_V2DI_INT_CONVERT:
35824 nargs = 2;
35825 rmode = V1TImode;
35826 nargs_constant = 1;
35827 break;
35828 case V4DI_FTYPE_V4DI_INT_CONVERT:
35829 nargs = 2;
35830 rmode = V2TImode;
35831 nargs_constant = 1;
35832 break;
35833 case V8DI_FTYPE_V8DI_INT_CONVERT:
35834 nargs = 2;
35835 rmode = V4TImode;
35836 nargs_constant = 1;
35837 break;
35838 case V8HI_FTYPE_V8HI_INT:
35839 case V8HI_FTYPE_V8SF_INT:
35840 case V16HI_FTYPE_V16SF_INT:
35841 case V8HI_FTYPE_V4SF_INT:
35842 case V8SF_FTYPE_V8SF_INT:
35843 case V4SF_FTYPE_V16SF_INT:
35844 case V16SF_FTYPE_V16SF_INT:
35845 case V4SI_FTYPE_V4SI_INT:
35846 case V4SI_FTYPE_V8SI_INT:
35847 case V4HI_FTYPE_V4HI_INT:
35848 case V4DF_FTYPE_V4DF_INT:
35849 case V4DF_FTYPE_V8DF_INT:
35850 case V4SF_FTYPE_V4SF_INT:
35851 case V4SF_FTYPE_V8SF_INT:
35852 case V2DI_FTYPE_V2DI_INT:
35853 case V2DF_FTYPE_V2DF_INT:
35854 case V2DF_FTYPE_V4DF_INT:
35855 case V16HI_FTYPE_V16HI_INT:
35856 case V8SI_FTYPE_V8SI_INT:
35857 case V16SI_FTYPE_V16SI_INT:
35858 case V4SI_FTYPE_V16SI_INT:
35859 case V4DI_FTYPE_V4DI_INT:
35860 case V2DI_FTYPE_V4DI_INT:
35861 case V4DI_FTYPE_V8DI_INT:
35862 case QI_FTYPE_V4SF_INT:
35863 case QI_FTYPE_V2DF_INT:
35864 case UQI_FTYPE_UQI_UQI_CONST:
35865 case UHI_FTYPE_UHI_UQI:
35866 case USI_FTYPE_USI_UQI:
35867 case UDI_FTYPE_UDI_UQI:
35868 nargs = 2;
35869 nargs_constant = 1;
35870 break;
35871 case V16QI_FTYPE_V16QI_V16QI_V16QI:
35872 case V8SF_FTYPE_V8SF_V8SF_V8SF:
35873 case V4DF_FTYPE_V4DF_V4DF_V4DF:
35874 case V4SF_FTYPE_V4SF_V4SF_V4SF:
35875 case V2DF_FTYPE_V2DF_V2DF_V2DF:
35876 case V32QI_FTYPE_V32QI_V32QI_V32QI:
35877 case UHI_FTYPE_V16SI_V16SI_UHI:
35878 case UQI_FTYPE_V8DI_V8DI_UQI:
35879 case V16HI_FTYPE_V16SI_V16HI_UHI:
35880 case V16QI_FTYPE_V16SI_V16QI_UHI:
35881 case V16QI_FTYPE_V8DI_V16QI_UQI:
35882 case V16SF_FTYPE_V16SF_V16SF_UHI:
35883 case V16SF_FTYPE_V4SF_V16SF_UHI:
35884 case V16SI_FTYPE_SI_V16SI_UHI:
35885 case V16SI_FTYPE_V16HI_V16SI_UHI:
35886 case V16SI_FTYPE_V16QI_V16SI_UHI:
35887 case V8SF_FTYPE_V4SF_V8SF_UQI:
35888 case V4DF_FTYPE_V2DF_V4DF_UQI:
35889 case V8SI_FTYPE_V4SI_V8SI_UQI:
35890 case V8SI_FTYPE_SI_V8SI_UQI:
35891 case V4SI_FTYPE_V4SI_V4SI_UQI:
35892 case V4SI_FTYPE_SI_V4SI_UQI:
35893 case V4DI_FTYPE_V2DI_V4DI_UQI:
35894 case V4DI_FTYPE_DI_V4DI_UQI:
35895 case V2DI_FTYPE_V2DI_V2DI_UQI:
35896 case V2DI_FTYPE_DI_V2DI_UQI:
35897 case V64QI_FTYPE_V64QI_V64QI_UDI:
35898 case V64QI_FTYPE_V16QI_V64QI_UDI:
35899 case V64QI_FTYPE_QI_V64QI_UDI:
35900 case V32QI_FTYPE_V32QI_V32QI_USI:
35901 case V32QI_FTYPE_V16QI_V32QI_USI:
35902 case V32QI_FTYPE_QI_V32QI_USI:
35903 case V16QI_FTYPE_V16QI_V16QI_UHI:
35904 case V16QI_FTYPE_QI_V16QI_UHI:
35905 case V32HI_FTYPE_V8HI_V32HI_USI:
35906 case V32HI_FTYPE_HI_V32HI_USI:
35907 case V16HI_FTYPE_V8HI_V16HI_UHI:
35908 case V16HI_FTYPE_HI_V16HI_UHI:
35909 case V8HI_FTYPE_V8HI_V8HI_UQI:
35910 case V8HI_FTYPE_HI_V8HI_UQI:
35911 case V8SF_FTYPE_V8HI_V8SF_UQI:
35912 case V4SF_FTYPE_V8HI_V4SF_UQI:
35913 case V8SI_FTYPE_V8SF_V8SI_UQI:
35914 case V4SI_FTYPE_V4SF_V4SI_UQI:
35915 case V4DI_FTYPE_V4SF_V4DI_UQI:
35916 case V2DI_FTYPE_V4SF_V2DI_UQI:
35917 case V4SF_FTYPE_V4DI_V4SF_UQI:
35918 case V4SF_FTYPE_V2DI_V4SF_UQI:
35919 case V4DF_FTYPE_V4DI_V4DF_UQI:
35920 case V2DF_FTYPE_V2DI_V2DF_UQI:
35921 case V16QI_FTYPE_V8HI_V16QI_UQI:
35922 case V16QI_FTYPE_V16HI_V16QI_UHI:
35923 case V16QI_FTYPE_V4SI_V16QI_UQI:
35924 case V16QI_FTYPE_V8SI_V16QI_UQI:
35925 case V8HI_FTYPE_V4SI_V8HI_UQI:
35926 case V8HI_FTYPE_V8SI_V8HI_UQI:
35927 case V16QI_FTYPE_V2DI_V16QI_UQI:
35928 case V16QI_FTYPE_V4DI_V16QI_UQI:
35929 case V8HI_FTYPE_V2DI_V8HI_UQI:
35930 case V8HI_FTYPE_V4DI_V8HI_UQI:
35931 case V4SI_FTYPE_V2DI_V4SI_UQI:
35932 case V4SI_FTYPE_V4DI_V4SI_UQI:
35933 case V32QI_FTYPE_V32HI_V32QI_USI:
35934 case UHI_FTYPE_V16QI_V16QI_UHI:
35935 case USI_FTYPE_V32QI_V32QI_USI:
35936 case UDI_FTYPE_V64QI_V64QI_UDI:
35937 case UQI_FTYPE_V8HI_V8HI_UQI:
35938 case UHI_FTYPE_V16HI_V16HI_UHI:
35939 case USI_FTYPE_V32HI_V32HI_USI:
35940 case UQI_FTYPE_V4SI_V4SI_UQI:
35941 case UQI_FTYPE_V8SI_V8SI_UQI:
35942 case UQI_FTYPE_V2DI_V2DI_UQI:
35943 case UQI_FTYPE_V4DI_V4DI_UQI:
35944 case V4SF_FTYPE_V2DF_V4SF_UQI:
35945 case V4SF_FTYPE_V4DF_V4SF_UQI:
35946 case V16SI_FTYPE_V16SI_V16SI_UHI:
35947 case V16SI_FTYPE_V4SI_V16SI_UHI:
35948 case V2DI_FTYPE_V4SI_V2DI_UQI:
35949 case V2DI_FTYPE_V8HI_V2DI_UQI:
35950 case V2DI_FTYPE_V16QI_V2DI_UQI:
35951 case V4DI_FTYPE_V4DI_V4DI_UQI:
35952 case V4DI_FTYPE_V4SI_V4DI_UQI:
35953 case V4DI_FTYPE_V8HI_V4DI_UQI:
35954 case V4DI_FTYPE_V16QI_V4DI_UQI:
35955 case V4DI_FTYPE_V4DF_V4DI_UQI:
35956 case V2DI_FTYPE_V2DF_V2DI_UQI:
35957 case V4SI_FTYPE_V4DF_V4SI_UQI:
35958 case V4SI_FTYPE_V2DF_V4SI_UQI:
35959 case V4SI_FTYPE_V8HI_V4SI_UQI:
35960 case V4SI_FTYPE_V16QI_V4SI_UQI:
35961 case V4DI_FTYPE_V4DI_V4DI_V4DI:
35962 case V8DF_FTYPE_V2DF_V8DF_UQI:
35963 case V8DF_FTYPE_V4DF_V8DF_UQI:
35964 case V8DF_FTYPE_V8DF_V8DF_UQI:
35965 case V8SF_FTYPE_V8SF_V8SF_UQI:
35966 case V8SF_FTYPE_V8SI_V8SF_UQI:
35967 case V4DF_FTYPE_V4DF_V4DF_UQI:
35968 case V4SF_FTYPE_V4SF_V4SF_UQI:
35969 case V2DF_FTYPE_V2DF_V2DF_UQI:
35970 case V2DF_FTYPE_V4SF_V2DF_UQI:
35971 case V2DF_FTYPE_V4SI_V2DF_UQI:
35972 case V4SF_FTYPE_V4SI_V4SF_UQI:
35973 case V4DF_FTYPE_V4SF_V4DF_UQI:
35974 case V4DF_FTYPE_V4SI_V4DF_UQI:
35975 case V8SI_FTYPE_V8SI_V8SI_UQI:
35976 case V8SI_FTYPE_V8HI_V8SI_UQI:
35977 case V8SI_FTYPE_V16QI_V8SI_UQI:
35978 case V8DF_FTYPE_V8SI_V8DF_UQI:
35979 case V8DI_FTYPE_DI_V8DI_UQI:
35980 case V16SF_FTYPE_V8SF_V16SF_UHI:
35981 case V16SI_FTYPE_V8SI_V16SI_UHI:
35982 case V16HI_FTYPE_V16HI_V16HI_UHI:
35983 case V8HI_FTYPE_V16QI_V8HI_UQI:
35984 case V16HI_FTYPE_V16QI_V16HI_UHI:
35985 case V32HI_FTYPE_V32HI_V32HI_USI:
35986 case V32HI_FTYPE_V32QI_V32HI_USI:
35987 case V8DI_FTYPE_V16QI_V8DI_UQI:
35988 case V8DI_FTYPE_V2DI_V8DI_UQI:
35989 case V8DI_FTYPE_V4DI_V8DI_UQI:
35990 case V8DI_FTYPE_V8DI_V8DI_UQI:
35991 case V8DI_FTYPE_V8HI_V8DI_UQI:
35992 case V8DI_FTYPE_V8SI_V8DI_UQI:
35993 case V8HI_FTYPE_V8DI_V8HI_UQI:
35994 case V8SI_FTYPE_V8DI_V8SI_UQI:
35995 case V4SI_FTYPE_V4SI_V4SI_V4SI:
35996 nargs = 3;
35997 break;
35998 case V32QI_FTYPE_V32QI_V32QI_INT:
35999 case V16HI_FTYPE_V16HI_V16HI_INT:
36000 case V16QI_FTYPE_V16QI_V16QI_INT:
36001 case V4DI_FTYPE_V4DI_V4DI_INT:
36002 case V8HI_FTYPE_V8HI_V8HI_INT:
36003 case V8SI_FTYPE_V8SI_V8SI_INT:
36004 case V8SI_FTYPE_V8SI_V4SI_INT:
36005 case V8SF_FTYPE_V8SF_V8SF_INT:
36006 case V8SF_FTYPE_V8SF_V4SF_INT:
36007 case V4SI_FTYPE_V4SI_V4SI_INT:
36008 case V4DF_FTYPE_V4DF_V4DF_INT:
36009 case V16SF_FTYPE_V16SF_V16SF_INT:
36010 case V16SF_FTYPE_V16SF_V4SF_INT:
36011 case V16SI_FTYPE_V16SI_V4SI_INT:
36012 case V4DF_FTYPE_V4DF_V2DF_INT:
36013 case V4SF_FTYPE_V4SF_V4SF_INT:
36014 case V2DI_FTYPE_V2DI_V2DI_INT:
36015 case V4DI_FTYPE_V4DI_V2DI_INT:
36016 case V2DF_FTYPE_V2DF_V2DF_INT:
36017 case UQI_FTYPE_V8DI_V8UDI_INT:
36018 case UQI_FTYPE_V8DF_V8DF_INT:
36019 case UQI_FTYPE_V2DF_V2DF_INT:
36020 case UQI_FTYPE_V4SF_V4SF_INT:
36021 case UHI_FTYPE_V16SI_V16SI_INT:
36022 case UHI_FTYPE_V16SF_V16SF_INT:
36023 nargs = 3;
36024 nargs_constant = 1;
36025 break;
36026 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
36027 nargs = 3;
36028 rmode = V4DImode;
36029 nargs_constant = 1;
36030 break;
36031 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
36032 nargs = 3;
36033 rmode = V2DImode;
36034 nargs_constant = 1;
36035 break;
36036 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
36037 nargs = 3;
36038 rmode = DImode;
36039 nargs_constant = 1;
36040 break;
36041 case V2DI_FTYPE_V2DI_UINT_UINT:
36042 nargs = 3;
36043 nargs_constant = 2;
36044 break;
36045 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
36046 nargs = 3;
36047 rmode = V8DImode;
36048 nargs_constant = 1;
36049 break;
36050 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
36051 nargs = 5;
36052 rmode = V8DImode;
36053 mask_pos = 2;
36054 nargs_constant = 1;
36055 break;
36056 case QI_FTYPE_V8DF_INT_UQI:
36057 case QI_FTYPE_V4DF_INT_UQI:
36058 case QI_FTYPE_V2DF_INT_UQI:
36059 case HI_FTYPE_V16SF_INT_UHI:
36060 case QI_FTYPE_V8SF_INT_UQI:
36061 case QI_FTYPE_V4SF_INT_UQI:
36062 nargs = 3;
36063 mask_pos = 1;
36064 nargs_constant = 1;
36065 break;
36066 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
36067 nargs = 5;
36068 rmode = V4DImode;
36069 mask_pos = 2;
36070 nargs_constant = 1;
36071 break;
36072 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
36073 nargs = 5;
36074 rmode = V2DImode;
36075 mask_pos = 2;
36076 nargs_constant = 1;
36077 break;
36078 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
36079 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
36080 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
36081 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
36082 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
36083 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
36084 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
36085 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
36086 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
36087 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
36088 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
36089 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
36090 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
36091 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
36092 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
36093 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
36094 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
36095 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
36096 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
36097 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
36098 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
36099 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
36100 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
36101 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
36102 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
36103 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
36104 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
36105 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
36106 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
36107 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
36108 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
36109 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
36110 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
36111 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
36112 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
36113 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
36114 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
36115 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
36116 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
36117 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
36118 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
36119 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
36120 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
36121 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
36122 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
36123 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
36124 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
36125 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
36126 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
36127 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
36128 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
36129 nargs = 4;
36130 break;
36131 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
36132 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
36133 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
36134 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
36135 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
36136 nargs = 4;
36137 nargs_constant = 1;
36138 break;
36139 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
36140 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
36141 case QI_FTYPE_V4DF_V4DF_INT_UQI:
36142 case QI_FTYPE_V8SF_V8SF_INT_UQI:
36143 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
36144 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
36145 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
36146 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
36147 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
36148 case USI_FTYPE_V32QI_V32QI_INT_USI:
36149 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
36150 case USI_FTYPE_V32HI_V32HI_INT_USI:
36151 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
36152 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
36153 nargs = 4;
36154 mask_pos = 1;
36155 nargs_constant = 1;
36156 break;
36157 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
36158 nargs = 4;
36159 nargs_constant = 2;
36160 break;
36161 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
36162 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
36163 nargs = 4;
36164 break;
36165 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
36166 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
36167 mask_pos = 1;
36168 nargs = 4;
36169 nargs_constant = 1;
36170 break;
36171 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
36172 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
36173 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
36174 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
36175 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
36176 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
36177 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
36178 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
36179 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
36180 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
36181 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
36182 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
36183 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
36184 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
36185 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
36186 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
36187 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
36188 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
36189 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
36190 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
36191 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
36192 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
36193 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
36194 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
36195 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
36196 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
36197 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
36198 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
36199 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
36200 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
36201 nargs = 4;
36202 mask_pos = 2;
36203 nargs_constant = 1;
36204 break;
36205 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
36206 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
36207 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
36208 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
36209 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
36210 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
36211 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
36212 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
36213 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
36214 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
36215 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
36216 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
36217 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
36218 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
36219 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
36220 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
36221 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
36222 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
36223 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
36224 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
36225 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
36226 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
36227 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
36228 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
36229 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
36230 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
36231 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
36232 nargs = 5;
36233 mask_pos = 2;
36234 nargs_constant = 1;
36235 break;
36236 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
36237 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
36238 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
36239 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
36240 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
36241 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
36242 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
36243 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
36244 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
36245 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
36246 nargs = 5;
36247 mask_pos = 1;
36248 nargs_constant = 1;
36249 break;
36251 default:
36252 gcc_unreachable ();
36255 gcc_assert (nargs <= ARRAY_SIZE (args));
36257 if (comparison != UNKNOWN)
36259 gcc_assert (nargs == 2);
36260 return ix86_expand_sse_compare (d, exp, target, swap);
36263 if (rmode == VOIDmode || rmode == tmode)
36265 if (optimize
36266 || target == 0
36267 || GET_MODE (target) != tmode
36268 || !insn_p->operand[0].predicate (target, tmode))
36269 target = gen_reg_rtx (tmode);
36270 else if (memory_operand (target, tmode))
36271 num_memory++;
36272 real_target = target;
36274 else
36276 real_target = gen_reg_rtx (tmode);
36277 target = lowpart_subreg (rmode, real_target, tmode);
36280 for (i = 0; i < nargs; i++)
36282 tree arg = CALL_EXPR_ARG (exp, i);
36283 rtx op = expand_normal (arg);
36284 machine_mode mode = insn_p->operand[i + 1].mode;
36285 bool match = insn_p->operand[i + 1].predicate (op, mode);
36287 if (second_arg_count && i == 1)
36289 /* SIMD shift insns take either an 8-bit immediate or a
36290 register as the count. But the builtin functions take an
36291 int as the count. If the count doesn't match, put it in a
36292 register. The instructions use a 64-bit count; if op is
36293 only 32-bit, zero-extend it, as negative shift counts
36294 are undefined behavior and zero-extension is more
36295 efficient. */
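/* Editor's illustrative note (not part of the original source): this
   path handles the *_COUNT signatures listed above, i.e. shifts whose
   count is a plain int at the source level.  Assuming the usual
   <emmintrin.h> definition, a variable-count shift such as

       __m128i
       shl (__m128i v, int n)
       {
         return _mm_slli_epi32 (v, n);
       }

   ends up here whenever N is not a matching immediate.  */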
36296 if (!match)
36298 if (SCALAR_INT_MODE_P (GET_MODE (op)))
36299 op = convert_modes (mode, GET_MODE (op), op, 1);
36300 else
36301 op = lowpart_subreg (mode, op, GET_MODE (op));
36302 if (!insn_p->operand[i + 1].predicate (op, mode))
36303 op = copy_to_reg (op);
36306 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
36307 || (!mask_pos && (nargs - i) <= nargs_constant))
36309 if (!match)
36310 switch (icode)
36312 case CODE_FOR_avx_vinsertf128v4di:
36313 case CODE_FOR_avx_vextractf128v4di:
36314 error ("the last argument must be an 1-bit immediate");
36315 return const0_rtx;
36317 case CODE_FOR_avx512f_cmpv8di3_mask:
36318 case CODE_FOR_avx512f_cmpv16si3_mask:
36319 case CODE_FOR_avx512f_ucmpv8di3_mask:
36320 case CODE_FOR_avx512f_ucmpv16si3_mask:
36321 case CODE_FOR_avx512vl_cmpv4di3_mask:
36322 case CODE_FOR_avx512vl_cmpv8si3_mask:
36323 case CODE_FOR_avx512vl_ucmpv4di3_mask:
36324 case CODE_FOR_avx512vl_ucmpv8si3_mask:
36325 case CODE_FOR_avx512vl_cmpv2di3_mask:
36326 case CODE_FOR_avx512vl_cmpv4si3_mask:
36327 case CODE_FOR_avx512vl_ucmpv2di3_mask:
36328 case CODE_FOR_avx512vl_ucmpv4si3_mask:
36329 error ("the last argument must be a 3-bit immediate");
36330 return const0_rtx;
36332 case CODE_FOR_sse4_1_roundsd:
36333 case CODE_FOR_sse4_1_roundss:
36335 case CODE_FOR_sse4_1_roundpd:
36336 case CODE_FOR_sse4_1_roundps:
36337 case CODE_FOR_avx_roundpd256:
36338 case CODE_FOR_avx_roundps256:
36340 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
36341 case CODE_FOR_sse4_1_roundps_sfix:
36342 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
36343 case CODE_FOR_avx_roundps_sfix256:
36345 case CODE_FOR_sse4_1_blendps:
36346 case CODE_FOR_avx_blendpd256:
36347 case CODE_FOR_avx_vpermilv4df:
36348 case CODE_FOR_avx_vpermilv4df_mask:
36349 case CODE_FOR_avx512f_getmantv8df_mask:
36350 case CODE_FOR_avx512f_getmantv16sf_mask:
36351 case CODE_FOR_avx512vl_getmantv8sf_mask:
36352 case CODE_FOR_avx512vl_getmantv4df_mask:
36353 case CODE_FOR_avx512vl_getmantv4sf_mask:
36354 case CODE_FOR_avx512vl_getmantv2df_mask:
36355 case CODE_FOR_avx512dq_rangepv8df_mask_round:
36356 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
36357 case CODE_FOR_avx512dq_rangepv4df_mask:
36358 case CODE_FOR_avx512dq_rangepv8sf_mask:
36359 case CODE_FOR_avx512dq_rangepv2df_mask:
36360 case CODE_FOR_avx512dq_rangepv4sf_mask:
36361 case CODE_FOR_avx_shufpd256_mask:
36362 error ("the last argument must be a 4-bit immediate");
36363 return const0_rtx;
36365 case CODE_FOR_sha1rnds4:
36366 case CODE_FOR_sse4_1_blendpd:
36367 case CODE_FOR_avx_vpermilv2df:
36368 case CODE_FOR_avx_vpermilv2df_mask:
36369 case CODE_FOR_xop_vpermil2v2df3:
36370 case CODE_FOR_xop_vpermil2v4sf3:
36371 case CODE_FOR_xop_vpermil2v4df3:
36372 case CODE_FOR_xop_vpermil2v8sf3:
36373 case CODE_FOR_avx512f_vinsertf32x4_mask:
36374 case CODE_FOR_avx512f_vinserti32x4_mask:
36375 case CODE_FOR_avx512f_vextractf32x4_mask:
36376 case CODE_FOR_avx512f_vextracti32x4_mask:
36377 case CODE_FOR_sse2_shufpd:
36378 case CODE_FOR_sse2_shufpd_mask:
36379 case CODE_FOR_avx512dq_shuf_f64x2_mask:
36380 case CODE_FOR_avx512dq_shuf_i64x2_mask:
36381 case CODE_FOR_avx512vl_shuf_i32x4_mask:
36382 case CODE_FOR_avx512vl_shuf_f32x4_mask:
36383 error ("the last argument must be a 2-bit immediate");
36384 return const0_rtx;
36386 case CODE_FOR_avx_vextractf128v4df:
36387 case CODE_FOR_avx_vextractf128v8sf:
36388 case CODE_FOR_avx_vextractf128v8si:
36389 case CODE_FOR_avx_vinsertf128v4df:
36390 case CODE_FOR_avx_vinsertf128v8sf:
36391 case CODE_FOR_avx_vinsertf128v8si:
36392 case CODE_FOR_avx512f_vinsertf64x4_mask:
36393 case CODE_FOR_avx512f_vinserti64x4_mask:
36394 case CODE_FOR_avx512f_vextractf64x4_mask:
36395 case CODE_FOR_avx512f_vextracti64x4_mask:
36396 case CODE_FOR_avx512dq_vinsertf32x8_mask:
36397 case CODE_FOR_avx512dq_vinserti32x8_mask:
36398 case CODE_FOR_avx512vl_vinsertv4df:
36399 case CODE_FOR_avx512vl_vinsertv4di:
36400 case CODE_FOR_avx512vl_vinsertv8sf:
36401 case CODE_FOR_avx512vl_vinsertv8si:
36402 error ("the last argument must be a 1-bit immediate");
36403 return const0_rtx;
36405 case CODE_FOR_avx_vmcmpv2df3:
36406 case CODE_FOR_avx_vmcmpv4sf3:
36407 case CODE_FOR_avx_cmpv2df3:
36408 case CODE_FOR_avx_cmpv4sf3:
36409 case CODE_FOR_avx_cmpv4df3:
36410 case CODE_FOR_avx_cmpv8sf3:
36411 case CODE_FOR_avx512f_cmpv8df3_mask:
36412 case CODE_FOR_avx512f_cmpv16sf3_mask:
36413 case CODE_FOR_avx512f_vmcmpv2df3_mask:
36414 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
36415 error ("the last argument must be a 5-bit immediate");
36416 return const0_rtx;
36418 default:
36419 switch (nargs_constant)
36421 case 2:
36422 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
36423 || (!mask_pos && (nargs - i) == nargs_constant))
36425 error ("the next to last argument must be an 8-bit immediate");
36426 break;
36428 /* FALLTHRU */
36429 case 1:
36430 error ("the last argument must be an 8-bit immediate");
36431 break;
36432 default:
36433 gcc_unreachable ();
36435 return const0_rtx;
36438 else
36440 if (VECTOR_MODE_P (mode))
36441 op = safe_vector_operand (op, mode);
36443 /* If we aren't optimizing, only allow one memory operand to
36444 be generated. */
36445 if (memory_operand (op, mode))
36446 num_memory++;
36448 op = fixup_modeless_constant (op, mode);
36450 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36452 if (optimize || !match || num_memory > 1)
36453 op = copy_to_mode_reg (mode, op);
36455 else
36457 op = copy_to_reg (op);
36458 op = lowpart_subreg (mode, op, GET_MODE (op));
36462 args[i].op = op;
36463 args[i].mode = mode;
36466 switch (nargs)
36468 case 1:
36469 pat = GEN_FCN (icode) (real_target, args[0].op);
36470 break;
36471 case 2:
36472 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
36473 break;
36474 case 3:
36475 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36476 args[2].op);
36477 break;
36478 case 4:
36479 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36480 args[2].op, args[3].op);
36481 break;
36482 case 5:
36483 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36484 args[2].op, args[3].op, args[4].op);
36485 break;
36486 case 6:
36487 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36488 args[2].op, args[3].op, args[4].op,
36489 args[5].op);
36490 break;
36491 default:
36492 gcc_unreachable ();
36495 if (! pat)
36496 return 0;
36498 emit_insn (pat);
36499 return target;
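/* Editor's illustrative note (not part of the original source): a typical
   builtin routed through ix86_expand_args_builtin is a masked AVX-512
   arithmetic intrinsic, e.g. (a sketch assuming <immintrin.h>; the exact
   builtin name and func type come from the headers)

       #include <immintrin.h>

       __m512i
       masked_add (__m512i src, __mmask16 k, __m512i a, __m512i b)
       {
         return _mm512_mask_add_epi32 (src, k, a, b);
       }

   which is expected to match a four-operand case of the form
   V16SI_FTYPE_V16SI_V16SI_V16SI_UHI in the switch above.  */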
36502 /* Transform a pattern of the following layout:
36503 (set A
36504 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
36506 into:
36507 (set A B) */
36509 static rtx
36510 ix86_erase_embedded_rounding (rtx pat)
36512 if (GET_CODE (pat) == INSN)
36513 pat = PATTERN (pat);
36515 gcc_assert (GET_CODE (pat) == SET);
36516 rtx src = SET_SRC (pat);
36517 gcc_assert (XVECLEN (src, 0) == 2);
36518 rtx p0 = XVECEXP (src, 0, 0);
36519 gcc_assert (GET_CODE (src) == UNSPEC
36520 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
36521 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
36522 return res;
36525 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
36526 with rounding. */
36527 static rtx
36528 ix86_expand_sse_comi_round (const struct builtin_description *d,
36529 tree exp, rtx target)
36531 rtx pat, set_dst;
36532 tree arg0 = CALL_EXPR_ARG (exp, 0);
36533 tree arg1 = CALL_EXPR_ARG (exp, 1);
36534 tree arg2 = CALL_EXPR_ARG (exp, 2);
36535 tree arg3 = CALL_EXPR_ARG (exp, 3);
36536 rtx op0 = expand_normal (arg0);
36537 rtx op1 = expand_normal (arg1);
36538 rtx op2 = expand_normal (arg2);
36539 rtx op3 = expand_normal (arg3);
36540 enum insn_code icode = d->icode;
36541 const struct insn_data_d *insn_p = &insn_data[icode];
36542 machine_mode mode0 = insn_p->operand[0].mode;
36543 machine_mode mode1 = insn_p->operand[1].mode;
36544 enum rtx_code comparison = UNEQ;
36545 bool need_ucomi = false;
36547 /* See avxintrin.h for values. */
36548 enum rtx_code comi_comparisons[32] =
36550 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
36551 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
36552 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
36554 bool need_ucomi_values[32] =
36556 true, false, false, true, true, false, false, true,
36557 true, false, false, true, true, false, false, true,
36558 false, true, true, false, false, true, true, false,
36559 false, true, true, false, false, true, true, false
36562 if (!CONST_INT_P (op2))
36564 error ("the third argument must be comparison constant");
36565 return const0_rtx;
36567 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
36569 error ("incorrect comparison mode");
36570 return const0_rtx;
36573 if (!insn_p->operand[2].predicate (op3, SImode))
36575 error ("incorrect rounding operand");
36576 return const0_rtx;
36579 comparison = comi_comparisons[INTVAL (op2)];
36580 need_ucomi = need_ucomi_values[INTVAL (op2)];
36582 if (VECTOR_MODE_P (mode0))
36583 op0 = safe_vector_operand (op0, mode0);
36584 if (VECTOR_MODE_P (mode1))
36585 op1 = safe_vector_operand (op1, mode1);
36587 target = gen_reg_rtx (SImode);
36588 emit_move_insn (target, const0_rtx);
36589 target = gen_rtx_SUBREG (QImode, target, 0);
36591 if ((optimize && !register_operand (op0, mode0))
36592 || !insn_p->operand[0].predicate (op0, mode0))
36593 op0 = copy_to_mode_reg (mode0, op0);
36594 if ((optimize && !register_operand (op1, mode1))
36595 || !insn_p->operand[1].predicate (op1, mode1))
36596 op1 = copy_to_mode_reg (mode1, op1);
36598 if (need_ucomi)
36599 icode = icode == CODE_FOR_sse_comi_round
36600 ? CODE_FOR_sse_ucomi_round
36601 : CODE_FOR_sse2_ucomi_round;
36603 pat = GEN_FCN (icode) (op0, op1, op3);
36604 if (! pat)
36605 return 0;
36607 /* The rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
36608 if (INTVAL (op3) == NO_ROUND)
36610 pat = ix86_erase_embedded_rounding (pat);
36611 if (! pat)
36612 return 0;
36614 set_dst = SET_DEST (pat);
36616 else
36618 gcc_assert (GET_CODE (pat) == SET);
36619 set_dst = SET_DEST (pat);
36622 emit_insn (pat);
36623 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
36624 gen_rtx_fmt_ee (comparison, QImode,
36625 set_dst,
36626 const0_rtx)));
36628 return SUBREG_REG (target);
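/* Editor's illustrative note (not part of the original source): the
   comi-with-rounding expander above backs the AVX-512 scalar
   compare-into-flags intrinsics.  A minimal sketch, assuming
   <immintrin.h>:

       #include <immintrin.h>

       int
       sd_less (__m128d a, __m128d b)
       {
         return _mm_comi_round_sd (a, b, _CMP_LT_OS, _MM_FROUND_NO_EXC);
       }

   The third argument selects one of the 32 predicates tabulated in
   comi_comparisons[] above; the last one chooses between SAE and the
   default rounding behavior.  */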
36631 static rtx
36632 ix86_expand_round_builtin (const struct builtin_description *d,
36633 tree exp, rtx target)
36635 rtx pat;
36636 unsigned int i, nargs;
36637 struct
36639 rtx op;
36640 machine_mode mode;
36641 } args[6];
36642 enum insn_code icode = d->icode;
36643 const struct insn_data_d *insn_p = &insn_data[icode];
36644 machine_mode tmode = insn_p->operand[0].mode;
36645 unsigned int nargs_constant = 0;
36646 unsigned int redundant_embed_rnd = 0;
36648 switch ((enum ix86_builtin_func_type) d->flag)
36650 case UINT64_FTYPE_V2DF_INT:
36651 case UINT64_FTYPE_V4SF_INT:
36652 case UINT_FTYPE_V2DF_INT:
36653 case UINT_FTYPE_V4SF_INT:
36654 case INT64_FTYPE_V2DF_INT:
36655 case INT64_FTYPE_V4SF_INT:
36656 case INT_FTYPE_V2DF_INT:
36657 case INT_FTYPE_V4SF_INT:
36658 nargs = 2;
36659 break;
36660 case V4SF_FTYPE_V4SF_UINT_INT:
36661 case V4SF_FTYPE_V4SF_UINT64_INT:
36662 case V2DF_FTYPE_V2DF_UINT64_INT:
36663 case V4SF_FTYPE_V4SF_INT_INT:
36664 case V4SF_FTYPE_V4SF_INT64_INT:
36665 case V2DF_FTYPE_V2DF_INT64_INT:
36666 case V4SF_FTYPE_V4SF_V4SF_INT:
36667 case V2DF_FTYPE_V2DF_V2DF_INT:
36668 case V4SF_FTYPE_V4SF_V2DF_INT:
36669 case V2DF_FTYPE_V2DF_V4SF_INT:
36670 nargs = 3;
36671 break;
36672 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
36673 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
36674 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
36675 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
36676 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
36677 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
36678 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
36679 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
36680 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
36681 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
36682 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
36683 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
36684 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
36685 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
36686 nargs = 4;
36687 break;
36688 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
36689 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
36690 nargs_constant = 2;
36691 nargs = 4;
36692 break;
36693 case INT_FTYPE_V4SF_V4SF_INT_INT:
36694 case INT_FTYPE_V2DF_V2DF_INT_INT:
36695 return ix86_expand_sse_comi_round (d, exp, target);
36696 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
36697 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
36698 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
36699 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
36700 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
36701 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
36702 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
36703 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
36704 nargs = 5;
36705 break;
36706 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
36707 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
36708 nargs_constant = 4;
36709 nargs = 5;
36710 break;
36711 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
36712 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
36713 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
36714 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
36715 nargs_constant = 3;
36716 nargs = 5;
36717 break;
36718 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
36719 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
36720 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
36721 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
36722 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
36723 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
36724 nargs = 6;
36725 nargs_constant = 4;
36726 break;
36727 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
36728 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
36729 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
36730 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
36731 nargs = 6;
36732 nargs_constant = 3;
36733 break;
36734 default:
36735 gcc_unreachable ();
36737 gcc_assert (nargs <= ARRAY_SIZE (args));
36739 if (optimize
36740 || target == 0
36741 || GET_MODE (target) != tmode
36742 || !insn_p->operand[0].predicate (target, tmode))
36743 target = gen_reg_rtx (tmode);
36745 for (i = 0; i < nargs; i++)
36747 tree arg = CALL_EXPR_ARG (exp, i);
36748 rtx op = expand_normal (arg);
36749 machine_mode mode = insn_p->operand[i + 1].mode;
36750 bool match = insn_p->operand[i + 1].predicate (op, mode);
36752 if (i == nargs - nargs_constant)
36754 if (!match)
36756 switch (icode)
36758 case CODE_FOR_avx512f_getmantv8df_mask_round:
36759 case CODE_FOR_avx512f_getmantv16sf_mask_round:
36760 case CODE_FOR_avx512f_vgetmantv2df_round:
36761 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
36762 case CODE_FOR_avx512f_vgetmantv4sf_round:
36763 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
36764 error ("the immediate argument must be a 4-bit immediate");
36765 return const0_rtx;
36766 case CODE_FOR_avx512f_cmpv8df3_mask_round:
36767 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
36768 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
36769 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
36770 error ("the immediate argument must be a 5-bit immediate");
36771 return const0_rtx;
36772 default:
36773 error ("the immediate argument must be an 8-bit immediate");
36774 return const0_rtx;
36778 else if (i == nargs - 1)
36780 if (!insn_p->operand[nargs].predicate (op, SImode))
36782 error ("incorrect rounding operand");
36783 return const0_rtx;
36786 /* If there is no rounding, use the normal version of the pattern. */
36787 if (INTVAL (op) == NO_ROUND)
36788 redundant_embed_rnd = 1;
36790 else
36792 if (VECTOR_MODE_P (mode))
36793 op = safe_vector_operand (op, mode);
36795 op = fixup_modeless_constant (op, mode);
36797 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36799 if (optimize || !match)
36800 op = copy_to_mode_reg (mode, op);
36802 else
36804 op = copy_to_reg (op);
36805 op = lowpart_subreg (mode, op, GET_MODE (op));
36809 args[i].op = op;
36810 args[i].mode = mode;
36813 switch (nargs)
36815 case 1:
36816 pat = GEN_FCN (icode) (target, args[0].op);
36817 break;
36818 case 2:
36819 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36820 break;
36821 case 3:
36822 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36823 args[2].op);
36824 break;
36825 case 4:
36826 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36827 args[2].op, args[3].op);
36828 break;
36829 case 5:
36830 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36831 args[2].op, args[3].op, args[4].op);
36832 break;
36833 case 6:
36834 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36835 args[2].op, args[3].op, args[4].op,
36836 args[5].op);
36837 break;
36838 default:
36839 gcc_unreachable ();
36842 if (!pat)
36843 return 0;
36845 if (redundant_embed_rnd)
36846 pat = ix86_erase_embedded_rounding (pat);
36848 emit_insn (pat);
36849 return target;
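/* Editor's illustrative note (not part of the original source): builtins
   that reach ix86_expand_round_builtin carry an explicit rounding operand
   as their last argument, e.g. (a sketch assuming <immintrin.h>)

       #include <immintrin.h>

       __m512d
       add_round_to_zero (__m512d a, __m512d b)
       {
         return _mm512_add_round_pd (a, b,
                                     _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
       }

   When the rounding operand is NO_ROUND, redundant_embed_rnd is set and
   the embedded-rounding unspec is stripped by
   ix86_erase_embedded_rounding.  */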
36852 /* Subroutine of ix86_expand_builtin to take care of special insns
36853 with a variable number of operands. */
36855 static rtx
36856 ix86_expand_special_args_builtin (const struct builtin_description *d,
36857 tree exp, rtx target)
36859 tree arg;
36860 rtx pat, op;
36861 unsigned int i, nargs, arg_adjust, memory;
36862 bool aligned_mem = false;
36863 struct
36865 rtx op;
36866 machine_mode mode;
36867 } args[3];
36868 enum insn_code icode = d->icode;
36869 bool last_arg_constant = false;
36870 const struct insn_data_d *insn_p = &insn_data[icode];
36871 machine_mode tmode = insn_p->operand[0].mode;
36872 enum { load, store } klass;
36874 switch ((enum ix86_builtin_func_type) d->flag)
36876 case VOID_FTYPE_VOID:
36877 emit_insn (GEN_FCN (icode) (target));
36878 return 0;
36879 case VOID_FTYPE_UINT64:
36880 case VOID_FTYPE_UNSIGNED:
36881 nargs = 0;
36882 klass = store;
36883 memory = 0;
36884 break;
36886 case INT_FTYPE_VOID:
36887 case USHORT_FTYPE_VOID:
36888 case UINT64_FTYPE_VOID:
36889 case UNSIGNED_FTYPE_VOID:
36890 nargs = 0;
36891 klass = load;
36892 memory = 0;
36893 break;
36894 case UINT64_FTYPE_PUNSIGNED:
36895 case V2DI_FTYPE_PV2DI:
36896 case V4DI_FTYPE_PV4DI:
36897 case V32QI_FTYPE_PCCHAR:
36898 case V16QI_FTYPE_PCCHAR:
36899 case V8SF_FTYPE_PCV4SF:
36900 case V8SF_FTYPE_PCFLOAT:
36901 case V4SF_FTYPE_PCFLOAT:
36902 case V4DF_FTYPE_PCV2DF:
36903 case V4DF_FTYPE_PCDOUBLE:
36904 case V2DF_FTYPE_PCDOUBLE:
36905 case VOID_FTYPE_PVOID:
36906 case V8DI_FTYPE_PV8DI:
36907 nargs = 1;
36908 klass = load;
36909 memory = 0;
36910 switch (icode)
36912 case CODE_FOR_sse4_1_movntdqa:
36913 case CODE_FOR_avx2_movntdqa:
36914 case CODE_FOR_avx512f_movntdqa:
36915 aligned_mem = true;
36916 break;
36917 default:
36918 break;
36920 break;
36921 case VOID_FTYPE_PV2SF_V4SF:
36922 case VOID_FTYPE_PV8DI_V8DI:
36923 case VOID_FTYPE_PV4DI_V4DI:
36924 case VOID_FTYPE_PV2DI_V2DI:
36925 case VOID_FTYPE_PCHAR_V32QI:
36926 case VOID_FTYPE_PCHAR_V16QI:
36927 case VOID_FTYPE_PFLOAT_V16SF:
36928 case VOID_FTYPE_PFLOAT_V8SF:
36929 case VOID_FTYPE_PFLOAT_V4SF:
36930 case VOID_FTYPE_PDOUBLE_V8DF:
36931 case VOID_FTYPE_PDOUBLE_V4DF:
36932 case VOID_FTYPE_PDOUBLE_V2DF:
36933 case VOID_FTYPE_PLONGLONG_LONGLONG:
36934 case VOID_FTYPE_PULONGLONG_ULONGLONG:
36935 case VOID_FTYPE_PINT_INT:
36936 nargs = 1;
36937 klass = store;
36938 /* Reserve memory operand for target. */
36939 memory = ARRAY_SIZE (args);
36940 switch (icode)
36942 /* These builtins and instructions require the memory
36943 to be properly aligned. */
36944 case CODE_FOR_avx_movntv4di:
36945 case CODE_FOR_sse2_movntv2di:
36946 case CODE_FOR_avx_movntv8sf:
36947 case CODE_FOR_sse_movntv4sf:
36948 case CODE_FOR_sse4a_vmmovntv4sf:
36949 case CODE_FOR_avx_movntv4df:
36950 case CODE_FOR_sse2_movntv2df:
36951 case CODE_FOR_sse4a_vmmovntv2df:
36952 case CODE_FOR_sse2_movntidi:
36953 case CODE_FOR_sse_movntq:
36954 case CODE_FOR_sse2_movntisi:
36955 case CODE_FOR_avx512f_movntv16sf:
36956 case CODE_FOR_avx512f_movntv8df:
36957 case CODE_FOR_avx512f_movntv8di:
36958 aligned_mem = true;
36959 break;
36960 default:
36961 break;
36963 break;
36964 case V4SF_FTYPE_V4SF_PCV2SF:
36965 case V2DF_FTYPE_V2DF_PCDOUBLE:
36966 nargs = 2;
36967 klass = load;
36968 memory = 1;
36969 break;
36970 case V8SF_FTYPE_PCV8SF_V8SI:
36971 case V4DF_FTYPE_PCV4DF_V4DI:
36972 case V4SF_FTYPE_PCV4SF_V4SI:
36973 case V2DF_FTYPE_PCV2DF_V2DI:
36974 case V8SI_FTYPE_PCV8SI_V8SI:
36975 case V4DI_FTYPE_PCV4DI_V4DI:
36976 case V4SI_FTYPE_PCV4SI_V4SI:
36977 case V2DI_FTYPE_PCV2DI_V2DI:
36978 case VOID_FTYPE_INT_INT64:
36979 nargs = 2;
36980 klass = load;
36981 memory = 0;
36982 break;
36983 case VOID_FTYPE_PV8DF_V8DF_UQI:
36984 case VOID_FTYPE_PV4DF_V4DF_UQI:
36985 case VOID_FTYPE_PV2DF_V2DF_UQI:
36986 case VOID_FTYPE_PV16SF_V16SF_UHI:
36987 case VOID_FTYPE_PV8SF_V8SF_UQI:
36988 case VOID_FTYPE_PV4SF_V4SF_UQI:
36989 case VOID_FTYPE_PV8DI_V8DI_UQI:
36990 case VOID_FTYPE_PV4DI_V4DI_UQI:
36991 case VOID_FTYPE_PV2DI_V2DI_UQI:
36992 case VOID_FTYPE_PV16SI_V16SI_UHI:
36993 case VOID_FTYPE_PV8SI_V8SI_UQI:
36994 case VOID_FTYPE_PV4SI_V4SI_UQI:
36995 switch (icode)
36997 /* These builtins and instructions require the memory
36998 to be properly aligned. */
36999 case CODE_FOR_avx512f_storev16sf_mask:
37000 case CODE_FOR_avx512f_storev16si_mask:
37001 case CODE_FOR_avx512f_storev8df_mask:
37002 case CODE_FOR_avx512f_storev8di_mask:
37003 case CODE_FOR_avx512vl_storev8sf_mask:
37004 case CODE_FOR_avx512vl_storev8si_mask:
37005 case CODE_FOR_avx512vl_storev4df_mask:
37006 case CODE_FOR_avx512vl_storev4di_mask:
37007 case CODE_FOR_avx512vl_storev4sf_mask:
37008 case CODE_FOR_avx512vl_storev4si_mask:
37009 case CODE_FOR_avx512vl_storev2df_mask:
37010 case CODE_FOR_avx512vl_storev2di_mask:
37011 aligned_mem = true;
37012 break;
37013 default:
37014 break;
37016 /* FALLTHRU */
37017 case VOID_FTYPE_PV8SF_V8SI_V8SF:
37018 case VOID_FTYPE_PV4DF_V4DI_V4DF:
37019 case VOID_FTYPE_PV4SF_V4SI_V4SF:
37020 case VOID_FTYPE_PV2DF_V2DI_V2DF:
37021 case VOID_FTYPE_PV8SI_V8SI_V8SI:
37022 case VOID_FTYPE_PV4DI_V4DI_V4DI:
37023 case VOID_FTYPE_PV4SI_V4SI_V4SI:
37024 case VOID_FTYPE_PV2DI_V2DI_V2DI:
37025 case VOID_FTYPE_PV8SI_V8DI_UQI:
37026 case VOID_FTYPE_PV8HI_V8DI_UQI:
37027 case VOID_FTYPE_PV16HI_V16SI_UHI:
37028 case VOID_FTYPE_PV16QI_V8DI_UQI:
37029 case VOID_FTYPE_PV16QI_V16SI_UHI:
37030 case VOID_FTYPE_PV4SI_V4DI_UQI:
37031 case VOID_FTYPE_PV4SI_V2DI_UQI:
37032 case VOID_FTYPE_PV8HI_V4DI_UQI:
37033 case VOID_FTYPE_PV8HI_V2DI_UQI:
37034 case VOID_FTYPE_PV8HI_V8SI_UQI:
37035 case VOID_FTYPE_PV8HI_V4SI_UQI:
37036 case VOID_FTYPE_PV16QI_V4DI_UQI:
37037 case VOID_FTYPE_PV16QI_V2DI_UQI:
37038 case VOID_FTYPE_PV16QI_V8SI_UQI:
37039 case VOID_FTYPE_PV16QI_V4SI_UQI:
37040 case VOID_FTYPE_PCHAR_V64QI_UDI:
37041 case VOID_FTYPE_PCHAR_V32QI_USI:
37042 case VOID_FTYPE_PCHAR_V16QI_UHI:
37043 case VOID_FTYPE_PSHORT_V32HI_USI:
37044 case VOID_FTYPE_PSHORT_V16HI_UHI:
37045 case VOID_FTYPE_PSHORT_V8HI_UQI:
37046 case VOID_FTYPE_PINT_V16SI_UHI:
37047 case VOID_FTYPE_PINT_V8SI_UQI:
37048 case VOID_FTYPE_PINT_V4SI_UQI:
37049 case VOID_FTYPE_PINT64_V8DI_UQI:
37050 case VOID_FTYPE_PINT64_V4DI_UQI:
37051 case VOID_FTYPE_PINT64_V2DI_UQI:
37052 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
37053 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
37054 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
37055 case VOID_FTYPE_PFLOAT_V16SF_UHI:
37056 case VOID_FTYPE_PFLOAT_V8SF_UQI:
37057 case VOID_FTYPE_PFLOAT_V4SF_UQI:
37058 case VOID_FTYPE_PV32QI_V32HI_USI:
37059 case VOID_FTYPE_PV16QI_V16HI_UHI:
37060 case VOID_FTYPE_PV8QI_V8HI_UQI:
37061 nargs = 2;
37062 klass = store;
37063 /* Reserve memory operand for target. */
37064 memory = ARRAY_SIZE (args);
37065 break;
37066 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
37067 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
37068 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
37069 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
37070 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
37071 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
37072 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
37073 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
37074 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
37075 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
37076 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
37077 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
37078 switch (icode)
37080 /* These builtins and instructions require the memory
37081 to be properly aligned. */
37082 case CODE_FOR_avx512f_loadv16sf_mask:
37083 case CODE_FOR_avx512f_loadv16si_mask:
37084 case CODE_FOR_avx512f_loadv8df_mask:
37085 case CODE_FOR_avx512f_loadv8di_mask:
37086 case CODE_FOR_avx512vl_loadv8sf_mask:
37087 case CODE_FOR_avx512vl_loadv8si_mask:
37088 case CODE_FOR_avx512vl_loadv4df_mask:
37089 case CODE_FOR_avx512vl_loadv4di_mask:
37090 case CODE_FOR_avx512vl_loadv4sf_mask:
37091 case CODE_FOR_avx512vl_loadv4si_mask:
37092 case CODE_FOR_avx512vl_loadv2df_mask:
37093 case CODE_FOR_avx512vl_loadv2di_mask:
37094 case CODE_FOR_avx512bw_loadv64qi_mask:
37095 case CODE_FOR_avx512vl_loadv32qi_mask:
37096 case CODE_FOR_avx512vl_loadv16qi_mask:
37097 case CODE_FOR_avx512bw_loadv32hi_mask:
37098 case CODE_FOR_avx512vl_loadv16hi_mask:
37099 case CODE_FOR_avx512vl_loadv8hi_mask:
37100 aligned_mem = true;
37101 break;
37102 default:
37103 break;
37105 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
37106 case V32QI_FTYPE_PCCHAR_V32QI_USI:
37107 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
37108 case V32HI_FTYPE_PCSHORT_V32HI_USI:
37109 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
37110 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
37111 case V16SI_FTYPE_PCINT_V16SI_UHI:
37112 case V8SI_FTYPE_PCINT_V8SI_UQI:
37113 case V4SI_FTYPE_PCINT_V4SI_UQI:
37114 case V8DI_FTYPE_PCINT64_V8DI_UQI:
37115 case V4DI_FTYPE_PCINT64_V4DI_UQI:
37116 case V2DI_FTYPE_PCINT64_V2DI_UQI:
37117 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
37118 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
37119 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
37120 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
37121 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
37122 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
37123 nargs = 3;
37124 klass = load;
37125 memory = 0;
37126 break;
37127 case VOID_FTYPE_UINT_UINT_UINT:
37128 case VOID_FTYPE_UINT64_UINT_UINT:
37129 case UCHAR_FTYPE_UINT_UINT_UINT:
37130 case UCHAR_FTYPE_UINT64_UINT_UINT:
37131 nargs = 3;
37132 klass = load;
37133 memory = ARRAY_SIZE (args);
37134 last_arg_constant = true;
37135 break;
37136 default:
37137 gcc_unreachable ();
37140 gcc_assert (nargs <= ARRAY_SIZE (args));
37142 if (klass == store)
37144 arg = CALL_EXPR_ARG (exp, 0);
37145 op = expand_normal (arg);
37146 gcc_assert (target == 0);
37147 if (memory)
37149 op = ix86_zero_extend_to_Pmode (op);
37150 target = gen_rtx_MEM (tmode, op);
37151 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
37152 on it. Try to improve it using get_pointer_alignment,
37153 and if the special builtin is one that requires strict
37154 mode alignment, also from its GET_MODE_ALIGNMENT.
37155 Failure to do so could lead to ix86_legitimate_combined_insn
37156 rejecting all changes to such insns. */
37157 unsigned int align = get_pointer_alignment (arg);
37158 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
37159 align = GET_MODE_ALIGNMENT (tmode);
37160 if (MEM_ALIGN (target) < align)
37161 set_mem_align (target, align);
37163 else
37164 target = force_reg (tmode, op);
37165 arg_adjust = 1;
37167 else
37169 arg_adjust = 0;
37170 if (optimize
37171 || target == 0
37172 || !register_operand (target, tmode)
37173 || GET_MODE (target) != tmode)
37174 target = gen_reg_rtx (tmode);
37177 for (i = 0; i < nargs; i++)
37179 machine_mode mode = insn_p->operand[i + 1].mode;
37180 bool match;
37182 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
37183 op = expand_normal (arg);
37184 match = insn_p->operand[i + 1].predicate (op, mode);
37186 if (last_arg_constant && (i + 1) == nargs)
37188 if (!match)
37190 if (icode == CODE_FOR_lwp_lwpvalsi3
37191 || icode == CODE_FOR_lwp_lwpinssi3
37192 || icode == CODE_FOR_lwp_lwpvaldi3
37193 || icode == CODE_FOR_lwp_lwpinsdi3)
37194 error ("the last argument must be a 32-bit immediate");
37195 else
37196 error ("the last argument must be an 8-bit immediate");
37197 return const0_rtx;
37200 else
37202 if (i == memory)
37204 /* This must be the memory operand. */
37205 op = ix86_zero_extend_to_Pmode (op);
37206 op = gen_rtx_MEM (mode, op);
37207 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
37208 on it. Try to improve it using get_pointer_alignment,
37209 and if the special builtin is one that requires strict
37210 mode alignment, also from its GET_MODE_ALIGNMENT.
37211 Failure to do so could lead to ix86_legitimate_combined_insn
37212 rejecting all changes to such insns. */
37213 unsigned int align = get_pointer_alignment (arg);
37214 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
37215 align = GET_MODE_ALIGNMENT (mode);
37216 if (MEM_ALIGN (op) < align)
37217 set_mem_align (op, align);
37219 else
37221 /* This must be a register. */
37222 if (VECTOR_MODE_P (mode))
37223 op = safe_vector_operand (op, mode);
37225 op = fixup_modeless_constant (op, mode);
37227 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
37228 op = copy_to_mode_reg (mode, op);
37229 else
37231 op = copy_to_reg (op);
37232 op = lowpart_subreg (mode, op, GET_MODE (op));
37237 args[i].op = op;
37238 args[i].mode = mode;
37241 switch (nargs)
37243 case 0:
37244 pat = GEN_FCN (icode) (target);
37245 break;
37246 case 1:
37247 pat = GEN_FCN (icode) (target, args[0].op);
37248 break;
37249 case 2:
37250 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
37251 break;
37252 case 3:
37253 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
37254 break;
37255 default:
37256 gcc_unreachable ();
37259 if (! pat)
37260 return 0;
37261 emit_insn (pat);
37262 return klass == store ? 0 : target;
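/* Editor's illustrative note (not part of the original source): the
   special-args expander above handles builtins with a memory operand.
   The aligned_mem handling matters for intrinsics whose underlying
   instruction faults on misaligned addresses, e.g. (a sketch assuming
   <immintrin.h>)

       #include <immintrin.h>

       void
       stream_store (float *p, __m256 v)
       {
         _mm256_stream_ps (p, v);
       }

   vmovntps requires P to be 32-byte aligned; set_mem_align above records
   that requirement on the MEM so later RTL passes neither pessimize nor
   reject the insn.  */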
37265 /* Return the integer constant in ARG. Constrain it to be in the range
37266 of the subparts of VEC_TYPE; issue an error if not. */
37268 static int
37269 get_element_number (tree vec_type, tree arg)
37271 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
37273 if (!tree_fits_uhwi_p (arg)
37274 || (elt = tree_to_uhwi (arg), elt > max))
37276 error ("selector must be an integer constant in the range 0..%wi", max);
37277 return 0;
37280 return elt;
37283 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37284 ix86_expand_vector_init. We DO have language-level syntax for this, in
37285 the form of (type){ init-list }. Except that since we can't place emms
37286 instructions from inside the compiler, we can't allow the use of MMX
37287 registers unless the user explicitly asks for it. So we do *not* define
37288 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
37289 we have builtins invoked by mmintrin.h that give us license to emit
37290 these sorts of instructions. */
37292 static rtx
37293 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
37295 machine_mode tmode = TYPE_MODE (type);
37296 machine_mode inner_mode = GET_MODE_INNER (tmode);
37297 int i, n_elt = GET_MODE_NUNITS (tmode);
37298 rtvec v = rtvec_alloc (n_elt);
37300 gcc_assert (VECTOR_MODE_P (tmode));
37301 gcc_assert (call_expr_nargs (exp) == n_elt);
37303 for (i = 0; i < n_elt; ++i)
37305 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
37306 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
37309 if (!target || !register_operand (target, tmode))
37310 target = gen_reg_rtx (tmode);
37312 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
37313 return target;
37316 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37317 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
37318 had a language-level syntax for referencing vector elements. */
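/* For example (a sketch; the emmintrin.h wrapper is assumed, not quoted),
   _mm_extract_epi16 (v, n) expands through __builtin_ia32_vec_ext_v8hi and
   ends up in ix86_expand_vector_extract via this routine.  */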
37320 static rtx
37321 ix86_expand_vec_ext_builtin (tree exp, rtx target)
37323 machine_mode tmode, mode0;
37324 tree arg0, arg1;
37325 int elt;
37326 rtx op0;
37328 arg0 = CALL_EXPR_ARG (exp, 0);
37329 arg1 = CALL_EXPR_ARG (exp, 1);
37331 op0 = expand_normal (arg0);
37332 elt = get_element_number (TREE_TYPE (arg0), arg1);
37334 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37335 mode0 = TYPE_MODE (TREE_TYPE (arg0));
37336 gcc_assert (VECTOR_MODE_P (mode0));
37338 op0 = force_reg (mode0, op0);
37340 if (optimize || !target || !register_operand (target, tmode))
37341 target = gen_reg_rtx (tmode);
37343 ix86_expand_vector_extract (true, target, op0, elt);
37345 return target;
37348 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37349 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
37350 a language-level syntax for referencing vector elements. */
37352 static rtx
37353 ix86_expand_vec_set_builtin (tree exp)
37355 machine_mode tmode, mode1;
37356 tree arg0, arg1, arg2;
37357 int elt;
37358 rtx op0, op1, target;
37360 arg0 = CALL_EXPR_ARG (exp, 0);
37361 arg1 = CALL_EXPR_ARG (exp, 1);
37362 arg2 = CALL_EXPR_ARG (exp, 2);
37364 tmode = TYPE_MODE (TREE_TYPE (arg0));
37365 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37366 gcc_assert (VECTOR_MODE_P (tmode));
37368 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
37369 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
37370 elt = get_element_number (TREE_TYPE (arg0), arg2);
37372 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
37373 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
37375 op0 = force_reg (tmode, op0);
37376 op1 = force_reg (mode1, op1);
37378 /* OP0 is the source of these builtin functions and shouldn't be
37379 modified.  Create a copy, use it, and return it as the target. */
37380 target = gen_reg_rtx (tmode);
37381 emit_move_insn (target, op0);
37382 ix86_expand_vector_set (true, target, op1, elt);
37384 return target;
37387 /* Emit conditional move of SRC to DST with condition
37388 OP1 CODE OP2. */
37389 static void
37390 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
37392 rtx t;
37394 if (TARGET_CMOVE)
37396 t = ix86_expand_compare (code, op1, op2);
37397 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
37398 src, dst)));
37400 else
37402 rtx_code_label *nomove = gen_label_rtx ();
37403 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
37404 const0_rtx, GET_MODE (op1), 1, nomove);
37405 emit_move_insn (dst, src);
37406 emit_label (nomove);
37410 /* Choose the unsigned max of DST and SRC and put it in DST. */
37411 static void
37412 ix86_emit_move_max (rtx dst, rtx src)
37414 ix86_emit_cmove (dst, src, LTU, dst, src);
37417 /* Expand an expression EXP that calls a built-in function,
37418 with result going to TARGET if that's convenient
37419 (and in mode MODE if that's convenient).
37420 SUBTARGET may be used as the target for computing one of EXP's operands.
37421 IGNORE is nonzero if the value is to be ignored. */
37423 static rtx
37424 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
37425 machine_mode mode, int ignore)
37427 size_t i;
37428 enum insn_code icode;
37429 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
37430 tree arg0, arg1, arg2, arg3, arg4;
37431 rtx op0, op1, op2, op3, op4, pat, insn;
37432 machine_mode mode0, mode1, mode2, mode3, mode4;
37433 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
37435 /* For CPU builtins that can be folded, fold first and expand the fold. */
37436 switch (fcode)
37438 case IX86_BUILTIN_CPU_INIT:
37440 /* Make it call __cpu_indicator_init in libgcc. */
37441 tree call_expr, fndecl, type;
37442 type = build_function_type_list (integer_type_node, NULL_TREE);
37443 fndecl = build_fn_decl ("__cpu_indicator_init", type);
37444 call_expr = build_call_expr (fndecl, 0);
37445 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
37447 case IX86_BUILTIN_CPU_IS:
37448 case IX86_BUILTIN_CPU_SUPPORTS:
37450 tree arg0 = CALL_EXPR_ARG (exp, 0);
37451 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
37452 gcc_assert (fold_expr != NULL_TREE);
37453 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
37457 /* Determine whether the builtin function is available under the current ISA.
37458 Originally the builtin was not created if it wasn't applicable to the
37459 current ISA based on the command line switches. With function specific
37460 options, we need to check in the context of the function making the call
37461 whether it is supported. Treat AVX512VL specially. For other flags,
37462 if isa includes more than one ISA bit, treat those as requiring any
37463 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
37464 ISAs. Similarly for 64BIT, but we shouldn't be building such builtins
37465 at all, since -m64 is a whole-TU option. */
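/* For example, a 128/256-bit masked AVX-512 builtin is typically recorded
   with OPTION_MASK_ISA_AVX512VL plus the ISA bit of the underlying operation
   (say OPTION_MASK_ISA_AVX512BW); per the rule above it is usable only when
   both bits are enabled, whereas a builtin recorded with several
   non-AVX512VL bits needs just one of them.  */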
37466 if (((ix86_builtins_isa[fcode].isa
37467 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT))
37468 && !(ix86_builtins_isa[fcode].isa
37469 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT)
37470 & ix86_isa_flags))
37471 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
37472 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
37473 || (ix86_builtins_isa[fcode].isa2
37474 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
37476 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
37477 ix86_builtins_isa[fcode].isa2, 0, 0,
37478 NULL, NULL, (enum fpmath_unit) 0,
37479 false);
37480 if (!opts)
37481 error ("%qE needs unknown isa option", fndecl);
37482 else
37484 gcc_assert (opts != NULL);
37485 error ("%qE needs isa option %s", fndecl, opts);
37486 free (opts);
37488 return expand_call (exp, target, ignore);
37491 switch (fcode)
37493 case IX86_BUILTIN_BNDMK:
37494 if (!target
37495 || GET_MODE (target) != BNDmode
37496 || !register_operand (target, BNDmode))
37497 target = gen_reg_rtx (BNDmode);
37499 arg0 = CALL_EXPR_ARG (exp, 0);
37500 arg1 = CALL_EXPR_ARG (exp, 1);
37502 op0 = expand_normal (arg0);
37503 op1 = expand_normal (arg1);
37505 if (!register_operand (op0, Pmode))
37506 op0 = ix86_zero_extend_to_Pmode (op0);
37507 if (!register_operand (op1, Pmode))
37508 op1 = ix86_zero_extend_to_Pmode (op1);
37510 /* Builtin arg1 is the size of the block, but instruction op1 should
37511 be (size - 1). */
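/* E.g. for bounds covering a 10-byte object at P, arg1 is 10 and the
   instruction operand becomes 9, so the created bounds are [P, P + 9].  */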
37512 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
37513 NULL_RTX, 1, OPTAB_DIRECT);
37515 emit_insn (BNDmode == BND64mode
37516 ? gen_bnd64_mk (target, op0, op1)
37517 : gen_bnd32_mk (target, op0, op1));
37518 return target;
37520 case IX86_BUILTIN_BNDSTX:
37521 arg0 = CALL_EXPR_ARG (exp, 0);
37522 arg1 = CALL_EXPR_ARG (exp, 1);
37523 arg2 = CALL_EXPR_ARG (exp, 2);
37525 op0 = expand_normal (arg0);
37526 op1 = expand_normal (arg1);
37527 op2 = expand_normal (arg2);
37529 if (!register_operand (op0, Pmode))
37530 op0 = ix86_zero_extend_to_Pmode (op0);
37531 if (!register_operand (op1, BNDmode))
37532 op1 = copy_to_mode_reg (BNDmode, op1);
37533 if (!register_operand (op2, Pmode))
37534 op2 = ix86_zero_extend_to_Pmode (op2);
37536 emit_insn (BNDmode == BND64mode
37537 ? gen_bnd64_stx (op2, op0, op1)
37538 : gen_bnd32_stx (op2, op0, op1));
37539 return 0;
37541 case IX86_BUILTIN_BNDLDX:
37542 if (!target
37543 || GET_MODE (target) != BNDmode
37544 || !register_operand (target, BNDmode))
37545 target = gen_reg_rtx (BNDmode);
37547 arg0 = CALL_EXPR_ARG (exp, 0);
37548 arg1 = CALL_EXPR_ARG (exp, 1);
37550 op0 = expand_normal (arg0);
37551 op1 = expand_normal (arg1);
37553 if (!register_operand (op0, Pmode))
37554 op0 = ix86_zero_extend_to_Pmode (op0);
37555 if (!register_operand (op1, Pmode))
37556 op1 = ix86_zero_extend_to_Pmode (op1);
37558 emit_insn (BNDmode == BND64mode
37559 ? gen_bnd64_ldx (target, op0, op1)
37560 : gen_bnd32_ldx (target, op0, op1));
37561 return target;
37563 case IX86_BUILTIN_BNDCL:
37564 arg0 = CALL_EXPR_ARG (exp, 0);
37565 arg1 = CALL_EXPR_ARG (exp, 1);
37567 op0 = expand_normal (arg0);
37568 op1 = expand_normal (arg1);
37570 if (!register_operand (op0, Pmode))
37571 op0 = ix86_zero_extend_to_Pmode (op0);
37572 if (!register_operand (op1, BNDmode))
37573 op1 = copy_to_mode_reg (BNDmode, op1);
37575 emit_insn (BNDmode == BND64mode
37576 ? gen_bnd64_cl (op1, op0)
37577 : gen_bnd32_cl (op1, op0));
37578 return 0;
37580 case IX86_BUILTIN_BNDCU:
37581 arg0 = CALL_EXPR_ARG (exp, 0);
37582 arg1 = CALL_EXPR_ARG (exp, 1);
37584 op0 = expand_normal (arg0);
37585 op1 = expand_normal (arg1);
37587 if (!register_operand (op0, Pmode))
37588 op0 = ix86_zero_extend_to_Pmode (op0);
37589 if (!register_operand (op1, BNDmode))
37590 op1 = copy_to_mode_reg (BNDmode, op1);
37592 emit_insn (BNDmode == BND64mode
37593 ? gen_bnd64_cu (op1, op0)
37594 : gen_bnd32_cu (op1, op0));
37595 return 0;
37597 case IX86_BUILTIN_BNDRET:
37598 arg0 = CALL_EXPR_ARG (exp, 0);
37599 target = chkp_get_rtl_bounds (arg0);
37601 /* If no bounds were specified for the returned value,
37602 then use INIT bounds (lower bound 0, upper bound ~0, i.e. allow
37603 any access).  This usually happens when some built-in function is expanded. */
37604 if (!target)
37606 rtx t1 = gen_reg_rtx (Pmode);
37607 rtx t2 = gen_reg_rtx (Pmode);
37608 target = gen_reg_rtx (BNDmode);
37609 emit_move_insn (t1, const0_rtx);
37610 emit_move_insn (t2, constm1_rtx);
37611 emit_insn (BNDmode == BND64mode
37612 ? gen_bnd64_mk (target, t1, t2)
37613 : gen_bnd32_mk (target, t1, t2));
37616 gcc_assert (target && REG_P (target));
37617 return target;
37619 case IX86_BUILTIN_BNDNARROW:
37621 rtx m1, m1h1, m1h2, lb, ub, t1;
37623 /* Return value and lb. */
37624 arg0 = CALL_EXPR_ARG (exp, 0);
37625 /* Bounds. */
37626 arg1 = CALL_EXPR_ARG (exp, 1);
37627 /* Size. */
37628 arg2 = CALL_EXPR_ARG (exp, 2);
37630 lb = expand_normal (arg0);
37631 op1 = expand_normal (arg1);
37632 op2 = expand_normal (arg2);
37634 /* Size was passed but we need to use (size - 1) as for bndmk. */
37635 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
37636 NULL_RTX, 1, OPTAB_DIRECT);
37638 /* Add LB to size and invert to get UB. */
37639 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
37640 op2, 1, OPTAB_DIRECT);
37641 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
37643 if (!register_operand (lb, Pmode))
37644 lb = ix86_zero_extend_to_Pmode (lb);
37645 if (!register_operand (ub, Pmode))
37646 ub = ix86_zero_extend_to_Pmode (ub);
37648 /* We need to move bounds to memory before any computations. */
37649 if (MEM_P (op1))
37650 m1 = op1;
37651 else
37653 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
37654 emit_move_insn (m1, op1);
37657 /* Generate mem expression to be used for access to LB and UB. */
37658 m1h1 = adjust_address (m1, Pmode, 0);
37659 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
37661 t1 = gen_reg_rtx (Pmode);
37663 /* Compute LB. */
37664 emit_move_insn (t1, m1h1);
37665 ix86_emit_move_max (t1, lb);
37666 emit_move_insn (m1h1, t1);
37668 /* Compute UB.  UB is stored in 1's complement form, so taking the max
37669 of the complemented values selects the smaller (tighter) upper bound. */
37670 emit_move_insn (t1, m1h2);
37671 ix86_emit_move_max (t1, ub);
37672 emit_move_insn (m1h2, t1);
37674 op2 = gen_reg_rtx (BNDmode);
37675 emit_move_insn (op2, m1);
37677 return chkp_join_splitted_slot (lb, op2);
37680 case IX86_BUILTIN_BNDINT:
37682 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
37684 if (!target
37685 || GET_MODE (target) != BNDmode
37686 || !register_operand (target, BNDmode))
37687 target = gen_reg_rtx (BNDmode);
37689 arg0 = CALL_EXPR_ARG (exp, 0);
37690 arg1 = CALL_EXPR_ARG (exp, 1);
37692 op0 = expand_normal (arg0);
37693 op1 = expand_normal (arg1);
37695 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
37696 rh1 = adjust_address (res, Pmode, 0);
37697 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
37699 /* Put the first bounds into temporaries. */
37700 lb1 = gen_reg_rtx (Pmode);
37701 ub1 = gen_reg_rtx (Pmode);
37702 if (MEM_P (op0))
37704 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
37705 emit_move_insn (ub1, adjust_address (op0, Pmode,
37706 GET_MODE_SIZE (Pmode)));
37708 else
37710 emit_move_insn (res, op0);
37711 emit_move_insn (lb1, rh1);
37712 emit_move_insn (ub1, rh2);
37715 /* Put the second bounds into temporaries. */
37716 lb2 = gen_reg_rtx (Pmode);
37717 ub2 = gen_reg_rtx (Pmode);
37718 if (MEM_P (op1))
37720 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
37721 emit_move_insn (ub2, adjust_address (op1, Pmode,
37722 GET_MODE_SIZE (Pmode)));
37724 else
37726 emit_move_insn (res, op1);
37727 emit_move_insn (lb2, rh1);
37728 emit_move_insn (ub2, rh2);
37731 /* Compute LB. */
37732 ix86_emit_move_max (lb1, lb2);
37733 emit_move_insn (rh1, lb1);
37735 /* Compute UB.  UB is stored in 1's complement form, so max again
37736 selects the smaller upper bound. */
37737 ix86_emit_move_max (ub1, ub2);
37738 emit_move_insn (rh2, ub1);
37740 emit_move_insn (target, res);
37742 return target;
37745 case IX86_BUILTIN_SIZEOF:
37747 tree name;
37748 rtx symbol;
37750 if (!target
37751 || GET_MODE (target) != Pmode
37752 || !register_operand (target, Pmode))
37753 target = gen_reg_rtx (Pmode);
37755 arg0 = CALL_EXPR_ARG (exp, 0);
37756 gcc_assert (VAR_P (arg0));
37758 name = DECL_ASSEMBLER_NAME (arg0);
37759 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
37761 emit_insn (Pmode == SImode
37762 ? gen_move_size_reloc_si (target, symbol)
37763 : gen_move_size_reloc_di (target, symbol));
37765 return target;
37768 case IX86_BUILTIN_BNDLOWER:
37770 rtx mem, hmem;
37772 if (!target
37773 || GET_MODE (target) != Pmode
37774 || !register_operand (target, Pmode))
37775 target = gen_reg_rtx (Pmode);
37777 arg0 = CALL_EXPR_ARG (exp, 0);
37778 op0 = expand_normal (arg0);
37780 /* We need to move bounds to memory first. */
37781 if (MEM_P (op0))
37782 mem = op0;
37783 else
37785 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37786 emit_move_insn (mem, op0);
37789 /* Generate mem expression to access LB and load it. */
37790 hmem = adjust_address (mem, Pmode, 0);
37791 emit_move_insn (target, hmem);
37793 return target;
37796 case IX86_BUILTIN_BNDUPPER:
37798 rtx mem, hmem, res;
37800 if (!target
37801 || GET_MODE (target) != Pmode
37802 || !register_operand (target, Pmode))
37803 target = gen_reg_rtx (Pmode);
37805 arg0 = CALL_EXPR_ARG (exp, 0);
37806 op0 = expand_normal (arg0);
37808 /* We need to move bounds to memory first. */
37809 if (MEM_P (op0))
37810 mem = op0;
37811 else
37813 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37814 emit_move_insn (mem, op0);
37817 /* Generate mem expression to access UB. */
37818 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
37821 /* We need to invert all bits of UB. */
37821 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
37823 if (res != target)
37824 emit_move_insn (target, res);
37826 return target;
37829 case IX86_BUILTIN_MASKMOVQ:
37830 case IX86_BUILTIN_MASKMOVDQU:
37831 icode = (fcode == IX86_BUILTIN_MASKMOVQ
37832 ? CODE_FOR_mmx_maskmovq
37833 : CODE_FOR_sse2_maskmovdqu);
37834 /* Note the arg order is different from the operand order. */
37835 arg1 = CALL_EXPR_ARG (exp, 0);
37836 arg2 = CALL_EXPR_ARG (exp, 1);
37837 arg0 = CALL_EXPR_ARG (exp, 2);
37838 op0 = expand_normal (arg0);
37839 op1 = expand_normal (arg1);
37840 op2 = expand_normal (arg2);
37841 mode0 = insn_data[icode].operand[0].mode;
37842 mode1 = insn_data[icode].operand[1].mode;
37843 mode2 = insn_data[icode].operand[2].mode;
37845 op0 = ix86_zero_extend_to_Pmode (op0);
37846 op0 = gen_rtx_MEM (mode1, op0);
37848 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37849 op0 = copy_to_mode_reg (mode0, op0);
37850 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37851 op1 = copy_to_mode_reg (mode1, op1);
37852 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37853 op2 = copy_to_mode_reg (mode2, op2);
37854 pat = GEN_FCN (icode) (op0, op1, op2);
37855 if (! pat)
37856 return 0;
37857 emit_insn (pat);
37858 return 0;
37860 case IX86_BUILTIN_LDMXCSR:
37861 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
37862 target = assign_386_stack_local (SImode, SLOT_TEMP);
37863 emit_move_insn (target, op0);
37864 emit_insn (gen_sse_ldmxcsr (target));
37865 return 0;
37867 case IX86_BUILTIN_STMXCSR:
37868 target = assign_386_stack_local (SImode, SLOT_TEMP);
37869 emit_insn (gen_sse_stmxcsr (target));
37870 return copy_to_mode_reg (SImode, target);
37872 case IX86_BUILTIN_CLFLUSH:
37873 arg0 = CALL_EXPR_ARG (exp, 0);
37874 op0 = expand_normal (arg0);
37875 icode = CODE_FOR_sse2_clflush;
37876 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37877 op0 = ix86_zero_extend_to_Pmode (op0);
37879 emit_insn (gen_sse2_clflush (op0));
37880 return 0;
37882 case IX86_BUILTIN_CLWB:
37883 arg0 = CALL_EXPR_ARG (exp, 0);
37884 op0 = expand_normal (arg0);
37885 icode = CODE_FOR_clwb;
37886 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37887 op0 = ix86_zero_extend_to_Pmode (op0);
37889 emit_insn (gen_clwb (op0));
37890 return 0;
37892 case IX86_BUILTIN_CLFLUSHOPT:
37893 arg0 = CALL_EXPR_ARG (exp, 0);
37894 op0 = expand_normal (arg0);
37895 icode = CODE_FOR_clflushopt;
37896 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37897 op0 = ix86_zero_extend_to_Pmode (op0);
37899 emit_insn (gen_clflushopt (op0));
37900 return 0;
37902 case IX86_BUILTIN_MONITOR:
37903 case IX86_BUILTIN_MONITORX:
37904 arg0 = CALL_EXPR_ARG (exp, 0);
37905 arg1 = CALL_EXPR_ARG (exp, 1);
37906 arg2 = CALL_EXPR_ARG (exp, 2);
37907 op0 = expand_normal (arg0);
37908 op1 = expand_normal (arg1);
37909 op2 = expand_normal (arg2);
37910 if (!REG_P (op0))
37911 op0 = ix86_zero_extend_to_Pmode (op0);
37912 if (!REG_P (op1))
37913 op1 = copy_to_mode_reg (SImode, op1);
37914 if (!REG_P (op2))
37915 op2 = copy_to_mode_reg (SImode, op2);
37917 emit_insn (fcode == IX86_BUILTIN_MONITOR
37918 ? ix86_gen_monitor (op0, op1, op2)
37919 : ix86_gen_monitorx (op0, op1, op2));
37920 return 0;
37922 case IX86_BUILTIN_MWAIT:
37923 arg0 = CALL_EXPR_ARG (exp, 0);
37924 arg1 = CALL_EXPR_ARG (exp, 1);
37925 op0 = expand_normal (arg0);
37926 op1 = expand_normal (arg1);
37927 if (!REG_P (op0))
37928 op0 = copy_to_mode_reg (SImode, op0);
37929 if (!REG_P (op1))
37930 op1 = copy_to_mode_reg (SImode, op1);
37931 emit_insn (gen_sse3_mwait (op0, op1));
37932 return 0;
37934 case IX86_BUILTIN_MWAITX:
37935 arg0 = CALL_EXPR_ARG (exp, 0);
37936 arg1 = CALL_EXPR_ARG (exp, 1);
37937 arg2 = CALL_EXPR_ARG (exp, 2);
37938 op0 = expand_normal (arg0);
37939 op1 = expand_normal (arg1);
37940 op2 = expand_normal (arg2);
37941 if (!REG_P (op0))
37942 op0 = copy_to_mode_reg (SImode, op0);
37943 if (!REG_P (op1))
37944 op1 = copy_to_mode_reg (SImode, op1);
37945 if (!REG_P (op2))
37946 op2 = copy_to_mode_reg (SImode, op2);
37947 emit_insn (gen_mwaitx (op0, op1, op2));
37948 return 0;
37950 case IX86_BUILTIN_CLZERO:
37951 arg0 = CALL_EXPR_ARG (exp, 0);
37952 op0 = expand_normal (arg0);
37953 if (!REG_P (op0))
37954 op0 = ix86_zero_extend_to_Pmode (op0);
37955 emit_insn (ix86_gen_clzero (op0));
37956 return 0;
37958 case IX86_BUILTIN_VEC_INIT_V2SI:
37959 case IX86_BUILTIN_VEC_INIT_V4HI:
37960 case IX86_BUILTIN_VEC_INIT_V8QI:
37961 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
37963 case IX86_BUILTIN_VEC_EXT_V2DF:
37964 case IX86_BUILTIN_VEC_EXT_V2DI:
37965 case IX86_BUILTIN_VEC_EXT_V4SF:
37966 case IX86_BUILTIN_VEC_EXT_V4SI:
37967 case IX86_BUILTIN_VEC_EXT_V8HI:
37968 case IX86_BUILTIN_VEC_EXT_V2SI:
37969 case IX86_BUILTIN_VEC_EXT_V4HI:
37970 case IX86_BUILTIN_VEC_EXT_V16QI:
37971 return ix86_expand_vec_ext_builtin (exp, target);
37973 case IX86_BUILTIN_VEC_SET_V2DI:
37974 case IX86_BUILTIN_VEC_SET_V4SF:
37975 case IX86_BUILTIN_VEC_SET_V4SI:
37976 case IX86_BUILTIN_VEC_SET_V8HI:
37977 case IX86_BUILTIN_VEC_SET_V4HI:
37978 case IX86_BUILTIN_VEC_SET_V16QI:
37979 return ix86_expand_vec_set_builtin (exp);
37981 case IX86_BUILTIN_NANQ:
37982 case IX86_BUILTIN_NANSQ:
37983 return expand_call (exp, target, ignore);
37985 case IX86_BUILTIN_RDPMC:
37986 case IX86_BUILTIN_RDTSC:
37987 case IX86_BUILTIN_RDTSCP:
37988 case IX86_BUILTIN_XGETBV:
37990 op0 = gen_reg_rtx (DImode);
37991 op1 = gen_reg_rtx (DImode);
37993 if (fcode == IX86_BUILTIN_RDPMC)
37995 arg0 = CALL_EXPR_ARG (exp, 0);
37996 op2 = expand_normal (arg0);
37997 if (!register_operand (op2, SImode))
37998 op2 = copy_to_mode_reg (SImode, op2);
38000 insn = (TARGET_64BIT
38001 ? gen_rdpmc_rex64 (op0, op1, op2)
38002 : gen_rdpmc (op0, op2));
38003 emit_insn (insn);
38005 else if (fcode == IX86_BUILTIN_XGETBV)
38007 arg0 = CALL_EXPR_ARG (exp, 0);
38008 op2 = expand_normal (arg0);
38009 if (!register_operand (op2, SImode))
38010 op2 = copy_to_mode_reg (SImode, op2);
38012 insn = (TARGET_64BIT
38013 ? gen_xgetbv_rex64 (op0, op1, op2)
38014 : gen_xgetbv (op0, op2));
38015 emit_insn (insn);
38017 else if (fcode == IX86_BUILTIN_RDTSC)
38019 insn = (TARGET_64BIT
38020 ? gen_rdtsc_rex64 (op0, op1)
38021 : gen_rdtsc (op0));
38022 emit_insn (insn);
38024 else
38026 op2 = gen_reg_rtx (SImode);
38028 insn = (TARGET_64BIT
38029 ? gen_rdtscp_rex64 (op0, op1, op2)
38030 : gen_rdtscp (op0, op2));
38031 emit_insn (insn);
38033 arg0 = CALL_EXPR_ARG (exp, 0);
38034 op4 = expand_normal (arg0);
38035 if (!address_operand (op4, VOIDmode))
38037 op4 = convert_memory_address (Pmode, op4);
38038 op4 = copy_addr_to_reg (op4);
38040 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
38043 if (target == 0)
38045 /* mode is VOIDmode if __builtin_rd* has been called
38046 without an lhs. */
38047 if (mode == VOIDmode)
38048 return target;
38049 target = gen_reg_rtx (mode);
38052 if (TARGET_64BIT)
38054 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
38055 op1, 1, OPTAB_DIRECT);
38056 op0 = expand_simple_binop (DImode, IOR, op0, op1,
38057 op0, 1, OPTAB_DIRECT);
38060 emit_move_insn (target, op0);
38061 return target;
38063 case IX86_BUILTIN_FXSAVE:
38064 case IX86_BUILTIN_FXRSTOR:
38065 case IX86_BUILTIN_FXSAVE64:
38066 case IX86_BUILTIN_FXRSTOR64:
38067 case IX86_BUILTIN_FNSTENV:
38068 case IX86_BUILTIN_FLDENV:
38069 mode0 = BLKmode;
38070 switch (fcode)
38072 case IX86_BUILTIN_FXSAVE:
38073 icode = CODE_FOR_fxsave;
38074 break;
38075 case IX86_BUILTIN_FXRSTOR:
38076 icode = CODE_FOR_fxrstor;
38077 break;
38078 case IX86_BUILTIN_FXSAVE64:
38079 icode = CODE_FOR_fxsave64;
38080 break;
38081 case IX86_BUILTIN_FXRSTOR64:
38082 icode = CODE_FOR_fxrstor64;
38083 break;
38084 case IX86_BUILTIN_FNSTENV:
38085 icode = CODE_FOR_fnstenv;
38086 break;
38087 case IX86_BUILTIN_FLDENV:
38088 icode = CODE_FOR_fldenv;
38089 break;
38090 default:
38091 gcc_unreachable ();
38094 arg0 = CALL_EXPR_ARG (exp, 0);
38095 op0 = expand_normal (arg0);
38097 if (!address_operand (op0, VOIDmode))
38099 op0 = convert_memory_address (Pmode, op0);
38100 op0 = copy_addr_to_reg (op0);
38102 op0 = gen_rtx_MEM (mode0, op0);
38104 pat = GEN_FCN (icode) (op0);
38105 if (pat)
38106 emit_insn (pat);
38107 return 0;
38109 case IX86_BUILTIN_XSETBV:
38110 arg0 = CALL_EXPR_ARG (exp, 0);
38111 arg1 = CALL_EXPR_ARG (exp, 1);
38112 op0 = expand_normal (arg0);
38113 op1 = expand_normal (arg1);
38115 if (!REG_P (op0))
38116 op0 = copy_to_mode_reg (SImode, op0);
38118 if (TARGET_64BIT)
38120 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38121 NULL, 1, OPTAB_DIRECT);
38123 op2 = gen_lowpart (SImode, op2);
38124 op1 = gen_lowpart (SImode, op1);
38125 if (!REG_P (op1))
38126 op1 = copy_to_mode_reg (SImode, op1);
38127 if (!REG_P (op2))
38128 op2 = copy_to_mode_reg (SImode, op2);
38129 icode = CODE_FOR_xsetbv_rex64;
38130 pat = GEN_FCN (icode) (op0, op1, op2);
38132 else
38134 if (!REG_P (op1))
38135 op1 = copy_to_mode_reg (DImode, op1);
38136 icode = CODE_FOR_xsetbv;
38137 pat = GEN_FCN (icode) (op0, op1);
38139 if (pat)
38140 emit_insn (pat);
38141 return 0;
38143 case IX86_BUILTIN_XSAVE:
38144 case IX86_BUILTIN_XRSTOR:
38145 case IX86_BUILTIN_XSAVE64:
38146 case IX86_BUILTIN_XRSTOR64:
38147 case IX86_BUILTIN_XSAVEOPT:
38148 case IX86_BUILTIN_XSAVEOPT64:
38149 case IX86_BUILTIN_XSAVES:
38150 case IX86_BUILTIN_XRSTORS:
38151 case IX86_BUILTIN_XSAVES64:
38152 case IX86_BUILTIN_XRSTORS64:
38153 case IX86_BUILTIN_XSAVEC:
38154 case IX86_BUILTIN_XSAVEC64:
38155 arg0 = CALL_EXPR_ARG (exp, 0);
38156 arg1 = CALL_EXPR_ARG (exp, 1);
38157 op0 = expand_normal (arg0);
38158 op1 = expand_normal (arg1);
38160 if (!address_operand (op0, VOIDmode))
38162 op0 = convert_memory_address (Pmode, op0);
38163 op0 = copy_addr_to_reg (op0);
38165 op0 = gen_rtx_MEM (BLKmode, op0);
38167 op1 = force_reg (DImode, op1);
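/* These instructions take their save/restore mask in edx:eax, so on 64-bit
   targets the DImode mask is split below into two SImode halves for the
   *_rex64 patterns; 32-bit targets hand the DImode mask to the pattern
   directly.  */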
38169 if (TARGET_64BIT)
38171 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38172 NULL, 1, OPTAB_DIRECT);
38173 switch (fcode)
38175 case IX86_BUILTIN_XSAVE:
38176 icode = CODE_FOR_xsave_rex64;
38177 break;
38178 case IX86_BUILTIN_XRSTOR:
38179 icode = CODE_FOR_xrstor_rex64;
38180 break;
38181 case IX86_BUILTIN_XSAVE64:
38182 icode = CODE_FOR_xsave64;
38183 break;
38184 case IX86_BUILTIN_XRSTOR64:
38185 icode = CODE_FOR_xrstor64;
38186 break;
38187 case IX86_BUILTIN_XSAVEOPT:
38188 icode = CODE_FOR_xsaveopt_rex64;
38189 break;
38190 case IX86_BUILTIN_XSAVEOPT64:
38191 icode = CODE_FOR_xsaveopt64;
38192 break;
38193 case IX86_BUILTIN_XSAVES:
38194 icode = CODE_FOR_xsaves_rex64;
38195 break;
38196 case IX86_BUILTIN_XRSTORS:
38197 icode = CODE_FOR_xrstors_rex64;
38198 break;
38199 case IX86_BUILTIN_XSAVES64:
38200 icode = CODE_FOR_xsaves64;
38201 break;
38202 case IX86_BUILTIN_XRSTORS64:
38203 icode = CODE_FOR_xrstors64;
38204 break;
38205 case IX86_BUILTIN_XSAVEC:
38206 icode = CODE_FOR_xsavec_rex64;
38207 break;
38208 case IX86_BUILTIN_XSAVEC64:
38209 icode = CODE_FOR_xsavec64;
38210 break;
38211 default:
38212 gcc_unreachable ();
38215 op2 = gen_lowpart (SImode, op2);
38216 op1 = gen_lowpart (SImode, op1);
38217 pat = GEN_FCN (icode) (op0, op1, op2);
38219 else
38221 switch (fcode)
38223 case IX86_BUILTIN_XSAVE:
38224 icode = CODE_FOR_xsave;
38225 break;
38226 case IX86_BUILTIN_XRSTOR:
38227 icode = CODE_FOR_xrstor;
38228 break;
38229 case IX86_BUILTIN_XSAVEOPT:
38230 icode = CODE_FOR_xsaveopt;
38231 break;
38232 case IX86_BUILTIN_XSAVES:
38233 icode = CODE_FOR_xsaves;
38234 break;
38235 case IX86_BUILTIN_XRSTORS:
38236 icode = CODE_FOR_xrstors;
38237 break;
38238 case IX86_BUILTIN_XSAVEC:
38239 icode = CODE_FOR_xsavec;
38240 break;
38241 default:
38242 gcc_unreachable ();
38244 pat = GEN_FCN (icode) (op0, op1);
38247 if (pat)
38248 emit_insn (pat);
38249 return 0;
38251 case IX86_BUILTIN_LLWPCB:
38252 arg0 = CALL_EXPR_ARG (exp, 0);
38253 op0 = expand_normal (arg0);
38254 icode = CODE_FOR_lwp_llwpcb;
38255 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38256 op0 = ix86_zero_extend_to_Pmode (op0);
38257 emit_insn (gen_lwp_llwpcb (op0));
38258 return 0;
38260 case IX86_BUILTIN_SLWPCB:
38261 icode = CODE_FOR_lwp_slwpcb;
38262 if (!target
38263 || !insn_data[icode].operand[0].predicate (target, Pmode))
38264 target = gen_reg_rtx (Pmode);
38265 emit_insn (gen_lwp_slwpcb (target));
38266 return target;
38268 case IX86_BUILTIN_BEXTRI32:
38269 case IX86_BUILTIN_BEXTRI64:
38270 arg0 = CALL_EXPR_ARG (exp, 0);
38271 arg1 = CALL_EXPR_ARG (exp, 1);
38272 op0 = expand_normal (arg0);
38273 op1 = expand_normal (arg1);
38274 icode = (fcode == IX86_BUILTIN_BEXTRI32
38275 ? CODE_FOR_tbm_bextri_si
38276 : CODE_FOR_tbm_bextri_di);
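/* The control immediate packs the starting bit position in its low byte and
   the field length in the next byte; it is split below into the two separate
   operands the bextri pattern expects.  */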
38277 if (!CONST_INT_P (op1))
38279 error ("last argument must be an immediate");
38280 return const0_rtx;
38282 else
38284 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
38285 unsigned char lsb_index = INTVAL (op1) & 0xFF;
38286 op1 = GEN_INT (length);
38287 op2 = GEN_INT (lsb_index);
38288 pat = GEN_FCN (icode) (target, op0, op1, op2);
38289 if (pat)
38290 emit_insn (pat);
38291 return target;
38294 case IX86_BUILTIN_RDRAND16_STEP:
38295 icode = CODE_FOR_rdrandhi_1;
38296 mode0 = HImode;
38297 goto rdrand_step;
38299 case IX86_BUILTIN_RDRAND32_STEP:
38300 icode = CODE_FOR_rdrandsi_1;
38301 mode0 = SImode;
38302 goto rdrand_step;
38304 case IX86_BUILTIN_RDRAND64_STEP:
38305 icode = CODE_FOR_rdranddi_1;
38306 mode0 = DImode;
38308 rdrand_step:
38309 arg0 = CALL_EXPR_ARG (exp, 0);
38310 op1 = expand_normal (arg0);
38311 if (!address_operand (op1, VOIDmode))
38313 op1 = convert_memory_address (Pmode, op1);
38314 op1 = copy_addr_to_reg (op1);
38317 op0 = gen_reg_rtx (mode0);
38318 emit_insn (GEN_FCN (icode) (op0));
38320 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38322 op1 = gen_reg_rtx (SImode);
38323 emit_move_insn (op1, CONST1_RTX (SImode));
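/* A note on the return value (based on the documented rdrand behaviour):
   the instruction sets CF and stores a random value on success, and clears
   CF while zeroing its destination on failure, so the conditional move
   below yields 1 on success and 0 otherwise.  */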
38325 /* Emit SImode conditional move. */
38326 if (mode0 == HImode)
38328 if (TARGET_ZERO_EXTEND_WITH_AND
38329 && optimize_function_for_speed_p (cfun))
38331 op2 = force_reg (SImode, const0_rtx);
38333 emit_insn (gen_movstricthi
38334 (gen_lowpart (HImode, op2), op0));
38336 else
38338 op2 = gen_reg_rtx (SImode);
38340 emit_insn (gen_zero_extendhisi2 (op2, op0));
38343 else if (mode0 == SImode)
38344 op2 = op0;
38345 else
38346 op2 = gen_rtx_SUBREG (SImode, op0, 0);
38348 if (target == 0
38349 || !register_operand (target, SImode))
38350 target = gen_reg_rtx (SImode);
38352 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
38353 const0_rtx);
38354 emit_insn (gen_rtx_SET (target,
38355 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
38356 return target;
38358 case IX86_BUILTIN_RDSEED16_STEP:
38359 icode = CODE_FOR_rdseedhi_1;
38360 mode0 = HImode;
38361 goto rdseed_step;
38363 case IX86_BUILTIN_RDSEED32_STEP:
38364 icode = CODE_FOR_rdseedsi_1;
38365 mode0 = SImode;
38366 goto rdseed_step;
38368 case IX86_BUILTIN_RDSEED64_STEP:
38369 icode = CODE_FOR_rdseeddi_1;
38370 mode0 = DImode;
38372 rdseed_step:
38373 arg0 = CALL_EXPR_ARG (exp, 0);
38374 op1 = expand_normal (arg0);
38375 if (!address_operand (op1, VOIDmode))
38377 op1 = convert_memory_address (Pmode, op1);
38378 op1 = copy_addr_to_reg (op1);
38381 op0 = gen_reg_rtx (mode0);
38382 emit_insn (GEN_FCN (icode) (op0));
38384 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38386 op2 = gen_reg_rtx (QImode);
38388 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
38389 const0_rtx);
38390 emit_insn (gen_rtx_SET (op2, pat));
38392 if (target == 0
38393 || !register_operand (target, SImode))
38394 target = gen_reg_rtx (SImode);
38396 emit_insn (gen_zero_extendqisi2 (target, op2));
38397 return target;
38399 case IX86_BUILTIN_SBB32:
38400 icode = CODE_FOR_subborrowsi;
38401 mode0 = SImode;
38402 goto handlecarry;
38404 case IX86_BUILTIN_SBB64:
38405 icode = CODE_FOR_subborrowdi;
38406 mode0 = DImode;
38407 goto handlecarry;
38409 case IX86_BUILTIN_ADDCARRYX32:
38410 icode = CODE_FOR_addcarrysi;
38411 mode0 = SImode;
38412 goto handlecarry;
38414 case IX86_BUILTIN_ADDCARRYX64:
38415 icode = CODE_FOR_addcarrydi;
38416 mode0 = DImode;
38418 handlecarry:
38419 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
38420 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
38421 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
38422 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
38424 op1 = expand_normal (arg0);
38425 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
38427 op2 = expand_normal (arg1);
38428 if (!register_operand (op2, mode0))
38429 op2 = copy_to_mode_reg (mode0, op2);
38431 op3 = expand_normal (arg2);
38432 if (!register_operand (op3, mode0))
38433 op3 = copy_to_mode_reg (mode0, op3);
38435 op4 = expand_normal (arg3);
38436 if (!address_operand (op4, VOIDmode))
38438 op4 = convert_memory_address (Pmode, op4);
38439 op4 = copy_addr_to_reg (op4);
38442 /* Generate CF from the input operand: adding -1 (0xff) to c_in
sets CF whenever c_in is nonzero. */
38443 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
38445 /* Generate instruction that consumes CF. */
38446 op0 = gen_reg_rtx (mode0);
38448 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
38449 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
38450 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
38452 /* Return current CF value. */
38453 if (target == 0)
38454 target = gen_reg_rtx (QImode);
38456 PUT_MODE (pat, QImode);
38457 emit_insn (gen_rtx_SET (target, pat));
38459 /* Store the result. */
38460 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
38462 return target;
38464 case IX86_BUILTIN_READ_FLAGS:
38465 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
38467 if (optimize
38468 || target == NULL_RTX
38469 || !nonimmediate_operand (target, word_mode)
38470 || GET_MODE (target) != word_mode)
38471 target = gen_reg_rtx (word_mode);
38473 emit_insn (gen_pop (target));
38474 return target;
38476 case IX86_BUILTIN_WRITE_FLAGS:
38478 arg0 = CALL_EXPR_ARG (exp, 0);
38479 op0 = expand_normal (arg0);
38480 if (!general_no_elim_operand (op0, word_mode))
38481 op0 = copy_to_mode_reg (word_mode, op0);
38483 emit_insn (gen_push (op0));
38484 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
38485 return 0;
38487 case IX86_BUILTIN_KTESTC8:
38488 icode = CODE_FOR_ktestqi;
38489 mode3 = CCCmode;
38490 goto kortest;
38492 case IX86_BUILTIN_KTESTZ8:
38493 icode = CODE_FOR_ktestqi;
38494 mode3 = CCZmode;
38495 goto kortest;
38497 case IX86_BUILTIN_KTESTC16:
38498 icode = CODE_FOR_ktesthi;
38499 mode3 = CCCmode;
38500 goto kortest;
38502 case IX86_BUILTIN_KTESTZ16:
38503 icode = CODE_FOR_ktesthi;
38504 mode3 = CCZmode;
38505 goto kortest;
38507 case IX86_BUILTIN_KTESTC32:
38508 icode = CODE_FOR_ktestsi;
38509 mode3 = CCCmode;
38510 goto kortest;
38512 case IX86_BUILTIN_KTESTZ32:
38513 icode = CODE_FOR_ktestsi;
38514 mode3 = CCZmode;
38515 goto kortest;
38517 case IX86_BUILTIN_KTESTC64:
38518 icode = CODE_FOR_ktestdi;
38519 mode3 = CCCmode;
38520 goto kortest;
38522 case IX86_BUILTIN_KTESTZ64:
38523 icode = CODE_FOR_ktestdi;
38524 mode3 = CCZmode;
38525 goto kortest;
38527 case IX86_BUILTIN_KORTESTC8:
38528 icode = CODE_FOR_kortestqi;
38529 mode3 = CCCmode;
38530 goto kortest;
38532 case IX86_BUILTIN_KORTESTZ8:
38533 icode = CODE_FOR_kortestqi;
38534 mode3 = CCZmode;
38535 goto kortest;
38537 case IX86_BUILTIN_KORTESTC16:
38538 icode = CODE_FOR_kortesthi;
38539 mode3 = CCCmode;
38540 goto kortest;
38542 case IX86_BUILTIN_KORTESTZ16:
38543 icode = CODE_FOR_kortesthi;
38544 mode3 = CCZmode;
38545 goto kortest;
38547 case IX86_BUILTIN_KORTESTC32:
38548 icode = CODE_FOR_kortestsi;
38549 mode3 = CCCmode;
38550 goto kortest;
38552 case IX86_BUILTIN_KORTESTZ32:
38553 icode = CODE_FOR_kortestsi;
38554 mode3 = CCZmode;
38555 goto kortest;
38557 case IX86_BUILTIN_KORTESTC64:
38558 icode = CODE_FOR_kortestdi;
38559 mode3 = CCCmode;
38560 goto kortest;
38562 case IX86_BUILTIN_KORTESTZ64:
38563 icode = CODE_FOR_kortestdi;
38564 mode3 = CCZmode;
38566 kortest:
38567 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
38568 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
38569 op0 = expand_normal (arg0);
38570 op1 = expand_normal (arg1);
38572 mode0 = insn_data[icode].operand[0].mode;
38573 mode1 = insn_data[icode].operand[1].mode;
38575 if (GET_MODE (op0) != VOIDmode)
38576 op0 = force_reg (GET_MODE (op0), op0);
38578 op0 = gen_lowpart (mode0, op0);
38580 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38581 op0 = copy_to_mode_reg (mode0, op0);
38583 if (GET_MODE (op1) != VOIDmode)
38584 op1 = force_reg (GET_MODE (op1), op1);
38586 op1 = gen_lowpart (mode1, op1);
38588 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38589 op1 = copy_to_mode_reg (mode1, op1);
38591 target = gen_reg_rtx (QImode);
38593 /* Emit kortest. */
38594 emit_insn (GEN_FCN (icode) (op0, op1));
38595 /* And use setcc to return result from flags. */
38596 ix86_expand_setcc (target, EQ,
38597 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
38598 return target;
38600 case IX86_BUILTIN_GATHERSIV2DF:
38601 icode = CODE_FOR_avx2_gathersiv2df;
38602 goto gather_gen;
38603 case IX86_BUILTIN_GATHERSIV4DF:
38604 icode = CODE_FOR_avx2_gathersiv4df;
38605 goto gather_gen;
38606 case IX86_BUILTIN_GATHERDIV2DF:
38607 icode = CODE_FOR_avx2_gatherdiv2df;
38608 goto gather_gen;
38609 case IX86_BUILTIN_GATHERDIV4DF:
38610 icode = CODE_FOR_avx2_gatherdiv4df;
38611 goto gather_gen;
38612 case IX86_BUILTIN_GATHERSIV4SF:
38613 icode = CODE_FOR_avx2_gathersiv4sf;
38614 goto gather_gen;
38615 case IX86_BUILTIN_GATHERSIV8SF:
38616 icode = CODE_FOR_avx2_gathersiv8sf;
38617 goto gather_gen;
38618 case IX86_BUILTIN_GATHERDIV4SF:
38619 icode = CODE_FOR_avx2_gatherdiv4sf;
38620 goto gather_gen;
38621 case IX86_BUILTIN_GATHERDIV8SF:
38622 icode = CODE_FOR_avx2_gatherdiv8sf;
38623 goto gather_gen;
38624 case IX86_BUILTIN_GATHERSIV2DI:
38625 icode = CODE_FOR_avx2_gathersiv2di;
38626 goto gather_gen;
38627 case IX86_BUILTIN_GATHERSIV4DI:
38628 icode = CODE_FOR_avx2_gathersiv4di;
38629 goto gather_gen;
38630 case IX86_BUILTIN_GATHERDIV2DI:
38631 icode = CODE_FOR_avx2_gatherdiv2di;
38632 goto gather_gen;
38633 case IX86_BUILTIN_GATHERDIV4DI:
38634 icode = CODE_FOR_avx2_gatherdiv4di;
38635 goto gather_gen;
38636 case IX86_BUILTIN_GATHERSIV4SI:
38637 icode = CODE_FOR_avx2_gathersiv4si;
38638 goto gather_gen;
38639 case IX86_BUILTIN_GATHERSIV8SI:
38640 icode = CODE_FOR_avx2_gathersiv8si;
38641 goto gather_gen;
38642 case IX86_BUILTIN_GATHERDIV4SI:
38643 icode = CODE_FOR_avx2_gatherdiv4si;
38644 goto gather_gen;
38645 case IX86_BUILTIN_GATHERDIV8SI:
38646 icode = CODE_FOR_avx2_gatherdiv8si;
38647 goto gather_gen;
38648 case IX86_BUILTIN_GATHERALTSIV4DF:
38649 icode = CODE_FOR_avx2_gathersiv4df;
38650 goto gather_gen;
38651 case IX86_BUILTIN_GATHERALTDIV8SF:
38652 icode = CODE_FOR_avx2_gatherdiv8sf;
38653 goto gather_gen;
38654 case IX86_BUILTIN_GATHERALTSIV4DI:
38655 icode = CODE_FOR_avx2_gathersiv4di;
38656 goto gather_gen;
38657 case IX86_BUILTIN_GATHERALTDIV8SI:
38658 icode = CODE_FOR_avx2_gatherdiv8si;
38659 goto gather_gen;
38660 case IX86_BUILTIN_GATHER3SIV16SF:
38661 icode = CODE_FOR_avx512f_gathersiv16sf;
38662 goto gather_gen;
38663 case IX86_BUILTIN_GATHER3SIV8DF:
38664 icode = CODE_FOR_avx512f_gathersiv8df;
38665 goto gather_gen;
38666 case IX86_BUILTIN_GATHER3DIV16SF:
38667 icode = CODE_FOR_avx512f_gatherdiv16sf;
38668 goto gather_gen;
38669 case IX86_BUILTIN_GATHER3DIV8DF:
38670 icode = CODE_FOR_avx512f_gatherdiv8df;
38671 goto gather_gen;
38672 case IX86_BUILTIN_GATHER3SIV16SI:
38673 icode = CODE_FOR_avx512f_gathersiv16si;
38674 goto gather_gen;
38675 case IX86_BUILTIN_GATHER3SIV8DI:
38676 icode = CODE_FOR_avx512f_gathersiv8di;
38677 goto gather_gen;
38678 case IX86_BUILTIN_GATHER3DIV16SI:
38679 icode = CODE_FOR_avx512f_gatherdiv16si;
38680 goto gather_gen;
38681 case IX86_BUILTIN_GATHER3DIV8DI:
38682 icode = CODE_FOR_avx512f_gatherdiv8di;
38683 goto gather_gen;
38684 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38685 icode = CODE_FOR_avx512f_gathersiv8df;
38686 goto gather_gen;
38687 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38688 icode = CODE_FOR_avx512f_gatherdiv16sf;
38689 goto gather_gen;
38690 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38691 icode = CODE_FOR_avx512f_gathersiv8di;
38692 goto gather_gen;
38693 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38694 icode = CODE_FOR_avx512f_gatherdiv16si;
38695 goto gather_gen;
38696 case IX86_BUILTIN_GATHER3SIV2DF:
38697 icode = CODE_FOR_avx512vl_gathersiv2df;
38698 goto gather_gen;
38699 case IX86_BUILTIN_GATHER3SIV4DF:
38700 icode = CODE_FOR_avx512vl_gathersiv4df;
38701 goto gather_gen;
38702 case IX86_BUILTIN_GATHER3DIV2DF:
38703 icode = CODE_FOR_avx512vl_gatherdiv2df;
38704 goto gather_gen;
38705 case IX86_BUILTIN_GATHER3DIV4DF:
38706 icode = CODE_FOR_avx512vl_gatherdiv4df;
38707 goto gather_gen;
38708 case IX86_BUILTIN_GATHER3SIV4SF:
38709 icode = CODE_FOR_avx512vl_gathersiv4sf;
38710 goto gather_gen;
38711 case IX86_BUILTIN_GATHER3SIV8SF:
38712 icode = CODE_FOR_avx512vl_gathersiv8sf;
38713 goto gather_gen;
38714 case IX86_BUILTIN_GATHER3DIV4SF:
38715 icode = CODE_FOR_avx512vl_gatherdiv4sf;
38716 goto gather_gen;
38717 case IX86_BUILTIN_GATHER3DIV8SF:
38718 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38719 goto gather_gen;
38720 case IX86_BUILTIN_GATHER3SIV2DI:
38721 icode = CODE_FOR_avx512vl_gathersiv2di;
38722 goto gather_gen;
38723 case IX86_BUILTIN_GATHER3SIV4DI:
38724 icode = CODE_FOR_avx512vl_gathersiv4di;
38725 goto gather_gen;
38726 case IX86_BUILTIN_GATHER3DIV2DI:
38727 icode = CODE_FOR_avx512vl_gatherdiv2di;
38728 goto gather_gen;
38729 case IX86_BUILTIN_GATHER3DIV4DI:
38730 icode = CODE_FOR_avx512vl_gatherdiv4di;
38731 goto gather_gen;
38732 case IX86_BUILTIN_GATHER3SIV4SI:
38733 icode = CODE_FOR_avx512vl_gathersiv4si;
38734 goto gather_gen;
38735 case IX86_BUILTIN_GATHER3SIV8SI:
38736 icode = CODE_FOR_avx512vl_gathersiv8si;
38737 goto gather_gen;
38738 case IX86_BUILTIN_GATHER3DIV4SI:
38739 icode = CODE_FOR_avx512vl_gatherdiv4si;
38740 goto gather_gen;
38741 case IX86_BUILTIN_GATHER3DIV8SI:
38742 icode = CODE_FOR_avx512vl_gatherdiv8si;
38743 goto gather_gen;
38744 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38745 icode = CODE_FOR_avx512vl_gathersiv4df;
38746 goto gather_gen;
38747 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38748 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38749 goto gather_gen;
38750 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38751 icode = CODE_FOR_avx512vl_gathersiv4di;
38752 goto gather_gen;
38753 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38754 icode = CODE_FOR_avx512vl_gatherdiv8si;
38755 goto gather_gen;
38756 case IX86_BUILTIN_SCATTERSIV16SF:
38757 icode = CODE_FOR_avx512f_scattersiv16sf;
38758 goto scatter_gen;
38759 case IX86_BUILTIN_SCATTERSIV8DF:
38760 icode = CODE_FOR_avx512f_scattersiv8df;
38761 goto scatter_gen;
38762 case IX86_BUILTIN_SCATTERDIV16SF:
38763 icode = CODE_FOR_avx512f_scatterdiv16sf;
38764 goto scatter_gen;
38765 case IX86_BUILTIN_SCATTERDIV8DF:
38766 icode = CODE_FOR_avx512f_scatterdiv8df;
38767 goto scatter_gen;
38768 case IX86_BUILTIN_SCATTERSIV16SI:
38769 icode = CODE_FOR_avx512f_scattersiv16si;
38770 goto scatter_gen;
38771 case IX86_BUILTIN_SCATTERSIV8DI:
38772 icode = CODE_FOR_avx512f_scattersiv8di;
38773 goto scatter_gen;
38774 case IX86_BUILTIN_SCATTERDIV16SI:
38775 icode = CODE_FOR_avx512f_scatterdiv16si;
38776 goto scatter_gen;
38777 case IX86_BUILTIN_SCATTERDIV8DI:
38778 icode = CODE_FOR_avx512f_scatterdiv8di;
38779 goto scatter_gen;
38780 case IX86_BUILTIN_SCATTERSIV8SF:
38781 icode = CODE_FOR_avx512vl_scattersiv8sf;
38782 goto scatter_gen;
38783 case IX86_BUILTIN_SCATTERSIV4SF:
38784 icode = CODE_FOR_avx512vl_scattersiv4sf;
38785 goto scatter_gen;
38786 case IX86_BUILTIN_SCATTERSIV4DF:
38787 icode = CODE_FOR_avx512vl_scattersiv4df;
38788 goto scatter_gen;
38789 case IX86_BUILTIN_SCATTERSIV2DF:
38790 icode = CODE_FOR_avx512vl_scattersiv2df;
38791 goto scatter_gen;
38792 case IX86_BUILTIN_SCATTERDIV8SF:
38793 icode = CODE_FOR_avx512vl_scatterdiv8sf;
38794 goto scatter_gen;
38795 case IX86_BUILTIN_SCATTERDIV4SF:
38796 icode = CODE_FOR_avx512vl_scatterdiv4sf;
38797 goto scatter_gen;
38798 case IX86_BUILTIN_SCATTERDIV4DF:
38799 icode = CODE_FOR_avx512vl_scatterdiv4df;
38800 goto scatter_gen;
38801 case IX86_BUILTIN_SCATTERDIV2DF:
38802 icode = CODE_FOR_avx512vl_scatterdiv2df;
38803 goto scatter_gen;
38804 case IX86_BUILTIN_SCATTERSIV8SI:
38805 icode = CODE_FOR_avx512vl_scattersiv8si;
38806 goto scatter_gen;
38807 case IX86_BUILTIN_SCATTERSIV4SI:
38808 icode = CODE_FOR_avx512vl_scattersiv4si;
38809 goto scatter_gen;
38810 case IX86_BUILTIN_SCATTERSIV4DI:
38811 icode = CODE_FOR_avx512vl_scattersiv4di;
38812 goto scatter_gen;
38813 case IX86_BUILTIN_SCATTERSIV2DI:
38814 icode = CODE_FOR_avx512vl_scattersiv2di;
38815 goto scatter_gen;
38816 case IX86_BUILTIN_SCATTERDIV8SI:
38817 icode = CODE_FOR_avx512vl_scatterdiv8si;
38818 goto scatter_gen;
38819 case IX86_BUILTIN_SCATTERDIV4SI:
38820 icode = CODE_FOR_avx512vl_scatterdiv4si;
38821 goto scatter_gen;
38822 case IX86_BUILTIN_SCATTERDIV4DI:
38823 icode = CODE_FOR_avx512vl_scatterdiv4di;
38824 goto scatter_gen;
38825 case IX86_BUILTIN_SCATTERDIV2DI:
38826 icode = CODE_FOR_avx512vl_scatterdiv2di;
38827 goto scatter_gen;
38828 case IX86_BUILTIN_GATHERPFDPD:
38829 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
38830 goto vec_prefetch_gen;
38831 case IX86_BUILTIN_SCATTERALTSIV8DF:
38832 icode = CODE_FOR_avx512f_scattersiv8df;
38833 goto scatter_gen;
38834 case IX86_BUILTIN_SCATTERALTDIV16SF:
38835 icode = CODE_FOR_avx512f_scatterdiv16sf;
38836 goto scatter_gen;
38837 case IX86_BUILTIN_SCATTERALTSIV8DI:
38838 icode = CODE_FOR_avx512f_scattersiv8di;
38839 goto scatter_gen;
38840 case IX86_BUILTIN_SCATTERALTDIV16SI:
38841 icode = CODE_FOR_avx512f_scatterdiv16si;
38842 goto scatter_gen;
38843 case IX86_BUILTIN_GATHERPFDPS:
38844 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
38845 goto vec_prefetch_gen;
38846 case IX86_BUILTIN_GATHERPFQPD:
38847 icode = CODE_FOR_avx512pf_gatherpfv8didf;
38848 goto vec_prefetch_gen;
38849 case IX86_BUILTIN_GATHERPFQPS:
38850 icode = CODE_FOR_avx512pf_gatherpfv8disf;
38851 goto vec_prefetch_gen;
38852 case IX86_BUILTIN_SCATTERPFDPD:
38853 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
38854 goto vec_prefetch_gen;
38855 case IX86_BUILTIN_SCATTERPFDPS:
38856 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
38857 goto vec_prefetch_gen;
38858 case IX86_BUILTIN_SCATTERPFQPD:
38859 icode = CODE_FOR_avx512pf_scatterpfv8didf;
38860 goto vec_prefetch_gen;
38861 case IX86_BUILTIN_SCATTERPFQPS:
38862 icode = CODE_FOR_avx512pf_scatterpfv8disf;
38863 goto vec_prefetch_gen;
38865 gather_gen:
38866 rtx half;
38867 rtx (*gen) (rtx, rtx);
38869 arg0 = CALL_EXPR_ARG (exp, 0);
38870 arg1 = CALL_EXPR_ARG (exp, 1);
38871 arg2 = CALL_EXPR_ARG (exp, 2);
38872 arg3 = CALL_EXPR_ARG (exp, 3);
38873 arg4 = CALL_EXPR_ARG (exp, 4);
38874 op0 = expand_normal (arg0);
38875 op1 = expand_normal (arg1);
38876 op2 = expand_normal (arg2);
38877 op3 = expand_normal (arg3);
38878 op4 = expand_normal (arg4);
38879 /* Note the arg order is different from the operand order. */
38880 mode0 = insn_data[icode].operand[1].mode;
38881 mode2 = insn_data[icode].operand[3].mode;
38882 mode3 = insn_data[icode].operand[4].mode;
38883 mode4 = insn_data[icode].operand[5].mode;
38885 if (target == NULL_RTX
38886 || GET_MODE (target) != insn_data[icode].operand[0].mode
38887 || !insn_data[icode].operand[0].predicate (target,
38888 GET_MODE (target)))
38889 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
38890 else
38891 subtarget = target;
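/* For the *ALT* gather variants the index vector and the destination differ
   in element count, so only the low half of the wider vector operand is
   used; the cases below extract that half (of the index or of the
   source/mask operand, as appropriate) before the pattern is emitted.  */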
38893 switch (fcode)
38895 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38896 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38897 half = gen_reg_rtx (V8SImode);
38898 if (!nonimmediate_operand (op2, V16SImode))
38899 op2 = copy_to_mode_reg (V16SImode, op2);
38900 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38901 op2 = half;
38902 break;
38903 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38904 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38905 case IX86_BUILTIN_GATHERALTSIV4DF:
38906 case IX86_BUILTIN_GATHERALTSIV4DI:
38907 half = gen_reg_rtx (V4SImode);
38908 if (!nonimmediate_operand (op2, V8SImode))
38909 op2 = copy_to_mode_reg (V8SImode, op2);
38910 emit_insn (gen_vec_extract_lo_v8si (half, op2));
38911 op2 = half;
38912 break;
38913 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38914 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38915 half = gen_reg_rtx (mode0);
38916 if (mode0 == V8SFmode)
38917 gen = gen_vec_extract_lo_v16sf;
38918 else
38919 gen = gen_vec_extract_lo_v16si;
38920 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38921 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38922 emit_insn (gen (half, op0));
38923 op0 = half;
38924 if (GET_MODE (op3) != VOIDmode)
38926 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38927 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38928 emit_insn (gen (half, op3));
38929 op3 = half;
38931 break;
38932 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38933 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38934 case IX86_BUILTIN_GATHERALTDIV8SF:
38935 case IX86_BUILTIN_GATHERALTDIV8SI:
38936 half = gen_reg_rtx (mode0);
38937 if (mode0 == V4SFmode)
38938 gen = gen_vec_extract_lo_v8sf;
38939 else
38940 gen = gen_vec_extract_lo_v8si;
38941 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38942 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38943 emit_insn (gen (half, op0));
38944 op0 = half;
38945 if (GET_MODE (op3) != VOIDmode)
38947 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38948 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38949 emit_insn (gen (half, op3));
38950 op3 = half;
38952 break;
38953 default:
38954 break;
38957 /* Force the memory operand to use only a base register here.  We
38958 don't want to do this for the memory operands of other builtin
38959 functions. */
38960 op1 = ix86_zero_extend_to_Pmode (op1);
38962 if (!insn_data[icode].operand[1].predicate (op0, mode0))
38963 op0 = copy_to_mode_reg (mode0, op0);
38964 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
38965 op1 = copy_to_mode_reg (Pmode, op1);
38966 if (!insn_data[icode].operand[3].predicate (op2, mode2))
38967 op2 = copy_to_mode_reg (mode2, op2);
38969 op3 = fixup_modeless_constant (op3, mode3);
38971 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
38973 if (!insn_data[icode].operand[4].predicate (op3, mode3))
38974 op3 = copy_to_mode_reg (mode3, op3);
38976 else
38978 op3 = copy_to_reg (op3);
38979 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
38981 if (!insn_data[icode].operand[5].predicate (op4, mode4))
38983 error ("the last argument must be scale 1, 2, 4, 8");
38984 return const0_rtx;
38987 /* Optimize. If mask is known to have all high bits set,
38988 replace op0 with pc_rtx to signal that the instruction
38989 overwrites the whole destination and doesn't use its
38990 previous contents. */
38991 if (optimize)
38993 if (TREE_CODE (arg3) == INTEGER_CST)
38995 if (integer_all_onesp (arg3))
38996 op0 = pc_rtx;
38998 else if (TREE_CODE (arg3) == VECTOR_CST)
39000 unsigned int negative = 0;
39001 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
39003 tree cst = VECTOR_CST_ELT (arg3, i);
39004 if (TREE_CODE (cst) == INTEGER_CST
39005 && tree_int_cst_sign_bit (cst))
39006 negative++;
39007 else if (TREE_CODE (cst) == REAL_CST
39008 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
39009 negative++;
39011 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
39012 op0 = pc_rtx;
39014 else if (TREE_CODE (arg3) == SSA_NAME
39015 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
39017 /* Also recognize when the mask is like:
39018 __v2df src = _mm_setzero_pd ();
39019 __v2df mask = _mm_cmpeq_pd (src, src);
39021 __v8sf src = _mm256_setzero_ps ();
39022 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
39023 as that is a cheaper way to load all ones into
39024 a register than having to load a constant from
39025 memory. */
39026 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
39027 if (is_gimple_call (def_stmt))
39029 tree fndecl = gimple_call_fndecl (def_stmt);
39030 if (fndecl
39031 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
39032 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
39034 case IX86_BUILTIN_CMPPD:
39035 case IX86_BUILTIN_CMPPS:
39036 case IX86_BUILTIN_CMPPD256:
39037 case IX86_BUILTIN_CMPPS256:
39038 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
39039 break;
39040 /* FALLTHRU */
39041 case IX86_BUILTIN_CMPEQPD:
39042 case IX86_BUILTIN_CMPEQPS:
39043 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
39044 && initializer_zerop (gimple_call_arg (def_stmt,
39045 1)))
39046 op0 = pc_rtx;
39047 break;
39048 default:
39049 break;
39055 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
39056 if (! pat)
39057 return const0_rtx;
39058 emit_insn (pat);
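/* For the gather variants handled below the pattern's result mode is wider
   than the builtin's return type, so only the low half of SUBTARGET is
   copied into TARGET.  */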
39060 switch (fcode)
39062 case IX86_BUILTIN_GATHER3DIV16SF:
39063 if (target == NULL_RTX)
39064 target = gen_reg_rtx (V8SFmode);
39065 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
39066 break;
39067 case IX86_BUILTIN_GATHER3DIV16SI:
39068 if (target == NULL_RTX)
39069 target = gen_reg_rtx (V8SImode);
39070 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
39071 break;
39072 case IX86_BUILTIN_GATHER3DIV8SF:
39073 case IX86_BUILTIN_GATHERDIV8SF:
39074 if (target == NULL_RTX)
39075 target = gen_reg_rtx (V4SFmode);
39076 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
39077 break;
39078 case IX86_BUILTIN_GATHER3DIV8SI:
39079 case IX86_BUILTIN_GATHERDIV8SI:
39080 if (target == NULL_RTX)
39081 target = gen_reg_rtx (V4SImode);
39082 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
39083 break;
39084 default:
39085 target = subtarget;
39086 break;
39088 return target;
39090 scatter_gen:
39091 arg0 = CALL_EXPR_ARG (exp, 0);
39092 arg1 = CALL_EXPR_ARG (exp, 1);
39093 arg2 = CALL_EXPR_ARG (exp, 2);
39094 arg3 = CALL_EXPR_ARG (exp, 3);
39095 arg4 = CALL_EXPR_ARG (exp, 4);
39096 op0 = expand_normal (arg0);
39097 op1 = expand_normal (arg1);
39098 op2 = expand_normal (arg2);
39099 op3 = expand_normal (arg3);
39100 op4 = expand_normal (arg4);
39101 mode1 = insn_data[icode].operand[1].mode;
39102 mode2 = insn_data[icode].operand[2].mode;
39103 mode3 = insn_data[icode].operand[3].mode;
39104 mode4 = insn_data[icode].operand[4].mode;
39106 /* The scatter instruction stores operand op3 to memory with
39107 indices from op2 and scale from op4 under writemask op1.
39108 If index operand op2 has more elements than source operand
39109 op3, only its low half is used, and vice versa. */
39110 switch (fcode)
39112 case IX86_BUILTIN_SCATTERALTSIV8DF:
39113 case IX86_BUILTIN_SCATTERALTSIV8DI:
39114 half = gen_reg_rtx (V8SImode);
39115 if (!nonimmediate_operand (op2, V16SImode))
39116 op2 = copy_to_mode_reg (V16SImode, op2);
39117 emit_insn (gen_vec_extract_lo_v16si (half, op2));
39118 op2 = half;
39119 break;
39120 case IX86_BUILTIN_SCATTERALTDIV16SF:
39121 case IX86_BUILTIN_SCATTERALTDIV16SI:
39122 half = gen_reg_rtx (mode3);
39123 if (mode3 == V8SFmode)
39124 gen = gen_vec_extract_lo_v16sf;
39125 else
39126 gen = gen_vec_extract_lo_v16si;
39127 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39128 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39129 emit_insn (gen (half, op3));
39130 op3 = half;
39131 break;
39132 default:
39133 break;
39136 /* Force the memory operand to use only a base register here.  We
39137 don't want to do this for the memory operands of other builtin
39138 functions. */
39139 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
39141 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
39142 op0 = copy_to_mode_reg (Pmode, op0);
39144 op1 = fixup_modeless_constant (op1, mode1);
39146 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
39148 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39149 op1 = copy_to_mode_reg (mode1, op1);
39151 else
39153 op1 = copy_to_reg (op1);
39154 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
39157 if (!insn_data[icode].operand[2].predicate (op2, mode2))
39158 op2 = copy_to_mode_reg (mode2, op2);
39160 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39161 op3 = copy_to_mode_reg (mode3, op3);
39163 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39165 error ("the last argument must be scale 1, 2, 4, 8");
39166 return const0_rtx;
39169 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39170 if (! pat)
39171 return const0_rtx;
39173 emit_insn (pat);
39174 return 0;
39176 vec_prefetch_gen:
39177 arg0 = CALL_EXPR_ARG (exp, 0);
39178 arg1 = CALL_EXPR_ARG (exp, 1);
39179 arg2 = CALL_EXPR_ARG (exp, 2);
39180 arg3 = CALL_EXPR_ARG (exp, 3);
39181 arg4 = CALL_EXPR_ARG (exp, 4);
39182 op0 = expand_normal (arg0);
39183 op1 = expand_normal (arg1);
39184 op2 = expand_normal (arg2);
39185 op3 = expand_normal (arg3);
39186 op4 = expand_normal (arg4);
39187 mode0 = insn_data[icode].operand[0].mode;
39188 mode1 = insn_data[icode].operand[1].mode;
39189 mode3 = insn_data[icode].operand[3].mode;
39190 mode4 = insn_data[icode].operand[4].mode;
39192 op0 = fixup_modeless_constant (op0, mode0);
39194 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
39196 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39197 op0 = copy_to_mode_reg (mode0, op0);
39199 else
39201 op0 = copy_to_reg (op0);
39202 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
39205 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39206 op1 = copy_to_mode_reg (mode1, op1);
39208 /* Force the memory operand to use only a base register here.  We
39209 don't want to do this for the memory operands of other builtin
39210 functions. */
39211 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
39213 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
39214 op2 = copy_to_mode_reg (Pmode, op2);
39216 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39218 error ("the forth argument must be scale 1, 2, 4, 8");
39219 return const0_rtx;
39222 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39224 error ("incorrect hint operand");
39225 return const0_rtx;
39228 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39229 if (! pat)
39230 return const0_rtx;
39232 emit_insn (pat);
39234 return 0;
39236 case IX86_BUILTIN_XABORT:
39237 icode = CODE_FOR_xabort;
39238 arg0 = CALL_EXPR_ARG (exp, 0);
39239 op0 = expand_normal (arg0);
39240 mode0 = insn_data[icode].operand[0].mode;
39241 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39243 error ("the xabort's argument must be an 8-bit immediate");
39244 return const0_rtx;
39246 emit_insn (gen_xabort (op0));
39247 return 0;
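/* Illustrative example (a hedged sketch, not compiled here): the operand
   check above only accepts a compile-time 8-bit immediate, so user code
   built with -mrtm has to pass a literal abort code.  */
#if 0
void
abort_transaction (void)
{
  /* Expands to XABORT with imm8 0xff; a variable argument would hit the
     error path above instead.  */
  __builtin_ia32_xabort (0xff);
}
#endif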
39249 default:
39250 break;
39253 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
39254 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
39256 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
39257 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
39258 target);
39261 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
39262 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
39264 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
39265 switch (fcode)
39267 case IX86_BUILTIN_FABSQ:
39268 case IX86_BUILTIN_COPYSIGNQ:
39269 if (!TARGET_SSE)
39270 /* Emit a normal call if SSE isn't available. */
39271 return expand_call (exp, target, ignore);
39272 /* FALLTHRU */
39273 default:
39274 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
39278 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
39279 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
39281 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
39282 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
39283 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
39284 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
39285 int masked = 1;
39286 machine_mode mode, wide_mode, nar_mode;
39288 nar_mode = V4SFmode;
39289 mode = V16SFmode;
39290 wide_mode = V64SFmode;
39291 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
39292 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
39294 switch (fcode)
39296 case IX86_BUILTIN_4FMAPS:
39297 fcn = gen_avx5124fmaddps_4fmaddps;
39298 masked = 0;
39299 goto v4fma_expand;
39301 case IX86_BUILTIN_4DPWSSD:
39302 nar_mode = V4SImode;
39303 mode = V16SImode;
39304 wide_mode = V64SImode;
39305 fcn = gen_avx5124vnniw_vp4dpwssd;
39306 masked = 0;
39307 goto v4fma_expand;
39309 case IX86_BUILTIN_4DPWSSDS:
39310 nar_mode = V4SImode;
39311 mode = V16SImode;
39312 wide_mode = V64SImode;
39313 fcn = gen_avx5124vnniw_vp4dpwssds;
39314 masked = 0;
39315 goto v4fma_expand;
39317 case IX86_BUILTIN_4FNMAPS:
39318 fcn = gen_avx5124fmaddps_4fnmaddps;
39319 masked = 0;
39320 goto v4fma_expand;
39322 case IX86_BUILTIN_4FNMAPS_MASK:
39323 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
39324 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
39325 goto v4fma_expand;
39327 case IX86_BUILTIN_4DPWSSD_MASK:
39328 nar_mode = V4SImode;
39329 mode = V16SImode;
39330 wide_mode = V64SImode;
39331 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
39332 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
39333 goto v4fma_expand;
39335 case IX86_BUILTIN_4DPWSSDS_MASK:
39336 nar_mode = V4SImode;
39337 mode = V16SImode;
39338 wide_mode = V64SImode;
39339 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
39340 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
39341 goto v4fma_expand;
39343 case IX86_BUILTIN_4FMAPS_MASK:
39345 tree args[4];
39346 rtx ops[4];
39347 rtx wide_reg;
39348 rtx accum;
39349 rtx addr;
39350 rtx mem;
39352 v4fma_expand:
39353 wide_reg = gen_reg_rtx (wide_mode);
39354 for (i = 0; i < 4; i++)
39356 args[i] = CALL_EXPR_ARG (exp, i);
39357 ops[i] = expand_normal (args[i]);
39359 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
39360 ops[i]);
39363 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39364 accum = force_reg (mode, accum);
39366 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39367 addr = force_reg (Pmode, addr);
39369 mem = gen_rtx_MEM (nar_mode, addr);
39371 target = gen_reg_rtx (mode);
39373 emit_move_insn (target, accum);
39375 if (! masked)
39376 emit_insn (fcn (target, accum, wide_reg, mem));
39377 else
39379 rtx merge, mask;
39380 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39382 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39384 if (CONST_INT_P (mask))
39385 mask = fixup_modeless_constant (mask, HImode);
39387 mask = force_reg (HImode, mask);
39389 if (GET_MODE (mask) != HImode)
39390 mask = gen_rtx_SUBREG (HImode, mask, 0);
39392 /* If merge is 0 then we're about to emit z-masked variant. */
39393 if (const0_operand (merge, mode))
39394 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39395 /* If merge is the same as accum then emit merge-masked variant. */
39396 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39398 merge = force_reg (mode, merge);
39399 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39401 /* Merge with something unknown might happen if we z-mask w/ -O0. */
39402 else
39404 target = gen_reg_rtx (mode);
39405 emit_move_insn (target, merge);
39406 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39409 return target;
39412 case IX86_BUILTIN_4FNMASS:
39413 fcn = gen_avx5124fmaddps_4fnmaddss;
39414 masked = 0;
39415 goto s4fma_expand;
39417 case IX86_BUILTIN_4FMASS:
39418 fcn = gen_avx5124fmaddps_4fmaddss;
39419 masked = 0;
39420 goto s4fma_expand;
39422 case IX86_BUILTIN_4FNMASS_MASK:
39423 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
39424 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
39425 goto s4fma_expand;
39427 case IX86_BUILTIN_4FMASS_MASK:
39429 tree args[4];
39430 rtx ops[4];
39431 rtx wide_reg;
39432 rtx accum;
39433 rtx addr;
39434 rtx mem;
39436 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
39437 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
39439 s4fma_expand:
39440 mode = V4SFmode;
39441 wide_reg = gen_reg_rtx (V64SFmode);
39442 for (i = 0; i < 4; i++)
39444 rtx tmp;
39445 args[i] = CALL_EXPR_ARG (exp, i);
39446 ops[i] = expand_normal (args[i]);
39448 tmp = gen_reg_rtx (SFmode);
39449 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
39451 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
39452 gen_rtx_SUBREG (V16SFmode, tmp, 0));
39455 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39456 accum = force_reg (V4SFmode, accum);
39458 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39459 addr = force_reg (Pmode, addr);
39461 mem = gen_rtx_MEM (V4SFmode, addr);
39463 target = gen_reg_rtx (V4SFmode);
39465 emit_move_insn (target, accum);
39467 if (! masked)
39468 emit_insn (fcn (target, accum, wide_reg, mem));
39469 else
39471 rtx merge, mask;
39472 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39474 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39476 if (CONST_INT_P (mask))
39477 mask = fixup_modeless_constant (mask, QImode);
39479 mask = force_reg (QImode, mask);
39481 if (GET_MODE (mask) != QImode)
39482 mask = gen_rtx_SUBREG (QImode, mask, 0);
39484 /* If merge is 0 then we're about to emit z-masked variant. */
39485 if (const0_operand (merge, mode))
39486 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39487 /* If merge is the same as accum then emit merge-masked
39488 variant. */
39489 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39491 merge = force_reg (mode, merge);
39492 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39494 /* Merge with something unknown might happen if we z-mask
39495 w/ -O0. */
39496 else
39498 target = gen_reg_rtx (mode);
39499 emit_move_insn (target, merge);
39500 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39503 return target;
39505 case IX86_BUILTIN_RDPID:
39506 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
39507 target);
39508 default:
39509 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
39513 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
39514 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
39516 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
39517 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
39520 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
39521 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
39523 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
39524 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
39527 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
39528 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
39530 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
39531 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
39534 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
39535 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
39537 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
39538 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
39541 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
39542 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
39544 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
39545 const struct builtin_description *d = bdesc_multi_arg + i;
39546 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
39547 (enum ix86_builtin_func_type)
39548 d->flag, d->comparison);
39551 gcc_unreachable ();
39554 /* This returns the target-specific builtin with code CODE if
39555 current_function_decl has visibility on this builtin, which is checked
39556 using isa flags. Returns NULL_TREE otherwise. */
39558 static tree ix86_get_builtin (enum ix86_builtins code)
39560 struct cl_target_option *opts;
39561 tree target_tree = NULL_TREE;
39563 /* Determine the isa flags of current_function_decl. */
39565 if (current_function_decl)
39566 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
39568 if (target_tree == NULL)
39569 target_tree = target_option_default_node;
39571 opts = TREE_TARGET_OPTION (target_tree);
39573 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
39574 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
39575 return ix86_builtin_decl (code, true);
39576 else
39577 return NULL_TREE;
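/* A minimal sketch of the visibility test above (illustrative only, not
   compiled; the function and parameter names are invented for the
   example).  */
#if 0
static int
builtin_visible_p (unsigned long long builtin_isa,
		   unsigned long long builtin_isa2,
		   unsigned long long fn_isa_flags,
		   unsigned long long fn_isa_flags2)
{
  /* The builtin is usable if the current function enables at least one
     ISA from either of the builtin's requirement masks.  */
  return (builtin_isa & fn_isa_flags) != 0
	 || (builtin_isa2 & fn_isa_flags2) != 0;
}
#endif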
39580 /* Return the function decl for the target specific builtin
39581 corresponding to the MPX builtin passed in FCODE. */
39582 static tree
39583 ix86_builtin_mpx_function (unsigned fcode)
39585 switch (fcode)
39587 case BUILT_IN_CHKP_BNDMK:
39588 return ix86_builtins[IX86_BUILTIN_BNDMK];
39590 case BUILT_IN_CHKP_BNDSTX:
39591 return ix86_builtins[IX86_BUILTIN_BNDSTX];
39593 case BUILT_IN_CHKP_BNDLDX:
39594 return ix86_builtins[IX86_BUILTIN_BNDLDX];
39596 case BUILT_IN_CHKP_BNDCL:
39597 return ix86_builtins[IX86_BUILTIN_BNDCL];
39599 case BUILT_IN_CHKP_BNDCU:
39600 return ix86_builtins[IX86_BUILTIN_BNDCU];
39602 case BUILT_IN_CHKP_BNDRET:
39603 return ix86_builtins[IX86_BUILTIN_BNDRET];
39605 case BUILT_IN_CHKP_INTERSECT:
39606 return ix86_builtins[IX86_BUILTIN_BNDINT];
39608 case BUILT_IN_CHKP_NARROW:
39609 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
39611 case BUILT_IN_CHKP_SIZEOF:
39612 return ix86_builtins[IX86_BUILTIN_SIZEOF];
39614 case BUILT_IN_CHKP_EXTRACT_LOWER:
39615 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
39617 case BUILT_IN_CHKP_EXTRACT_UPPER:
39618 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
39620 default:
39621 return NULL_TREE;
39624 gcc_unreachable ();
39627 /* Helper function for ix86_load_bounds and ix86_store_bounds.
39629 Return an address to be used to load/store bounds for pointer
39630 passed in SLOT.
39632 SLOT_NO is an integer constant holding number of a target
39633 dependent special slot to be used in case SLOT is not a memory.
39635 SPECIAL_BASE is a pointer to be used as a base of fake address
39636 to access special slots in Bounds Table. SPECIAL_BASE[-1],
39637 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
39639 static rtx
39640 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
39642 rtx addr = NULL;
39644 /* A NULL slot means we pass bounds for a pointer that is not passed
39645 to the function at all. A register slot means the pointer is
39646 passed in a register. In both cases the bounds are passed via the
39647 Bounds Table. Since the actual pointer is not stored in memory,
39648 we have to use fake addresses to access the Bounds Table. We
39649 start with (special_base - sizeof (void*)) and decrease this
39650 address by the pointer size to get the addresses of other slots. */
39651 if (!slot || REG_P (slot))
39653 gcc_assert (CONST_INT_P (slot_no));
39654 addr = plus_constant (Pmode, special_base,
39655 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
39657 /* If pointer is passed in a memory then its address is used to
39658 access Bounds Table. */
39659 else if (MEM_P (slot))
39661 addr = XEXP (slot, 0);
39662 if (!register_operand (addr, Pmode))
39663 addr = copy_addr_to_reg (addr);
39665 else
39666 gcc_unreachable ();
39668 return addr;
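/* A minimal sketch (illustrative only, not compiled; the helper name is
   invented) of the fake-address layout used above for special Bounds
   Table slots: slot N lives at SPECIAL_BASE - (N + 1) * sizeof (void *).  */
#if 0
static long
fake_bt_slot_offset (long slot_no, long pointer_size)
{
  /* Slot 0 -> -pointer_size, slot 1 -> -2 * pointer_size, and so on.  */
  return -(slot_no + 1) * pointer_size;
}
#endif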
39671 /* Expand pass uses this hook to load bounds for function parameter
39672 PTR passed in SLOT in case its bounds are not passed in a register.
39674 If SLOT is a memory, then bounds are loaded as for regular pointer
39675 loaded from memory. PTR may be NULL in case SLOT is a memory.
39676 In such case value of PTR (if required) may be loaded from SLOT.
39678 If SLOT is NULL or a register then SLOT_NO is an integer constant
39679 holding number of the target dependent special slot which should be
39680 used to obtain bounds.
39682 Return loaded bounds. */
39684 static rtx
39685 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
39687 rtx reg = gen_reg_rtx (BNDmode);
39688 rtx addr;
39690 /* Get address to be used to access Bounds Table. Special slots start
39691 at the location of return address of the current function. */
39692 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
39694 /* Load pointer value from a memory if we don't have it. */
39695 if (!ptr)
39697 gcc_assert (MEM_P (slot));
39698 ptr = copy_addr_to_reg (slot);
39701 if (!register_operand (ptr, Pmode))
39702 ptr = ix86_zero_extend_to_Pmode (ptr);
39704 emit_insn (BNDmode == BND64mode
39705 ? gen_bnd64_ldx (reg, addr, ptr)
39706 : gen_bnd32_ldx (reg, addr, ptr));
39708 return reg;
39711 /* Expand pass uses this hook to store BOUNDS for call argument PTR
39712 passed in SLOT in case BOUNDS are not passed in a register.
39714 If SLOT is a memory, then BOUNDS are stored as for regular pointer
39715 stored in memory. PTR may be NULL in case SLOT is a memory.
39716 In such case value of PTR (if required) may be loaded from SLOT.
39718 If SLOT is NULL or a register then SLOT_NO is an integer constant
39719 holding number of the target dependent special slot which should be
39720 used to store BOUNDS. */
39722 static void
39723 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
39725 rtx addr;
39727 /* Get address to be used to access Bounds Table. Special slots start
39728 at the location of return address of a called function. */
39729 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
39731 /* Load pointer value from a memory if we don't have it. */
39732 if (!ptr)
39734 gcc_assert (MEM_P (slot));
39735 ptr = copy_addr_to_reg (slot);
39738 if (!register_operand (ptr, Pmode))
39739 ptr = ix86_zero_extend_to_Pmode (ptr);
39741 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
39742 if (!register_operand (bounds, BNDmode))
39743 bounds = copy_to_mode_reg (BNDmode, bounds);
39745 emit_insn (BNDmode == BND64mode
39746 ? gen_bnd64_stx (addr, ptr, bounds)
39747 : gen_bnd32_stx (addr, ptr, bounds));
39750 /* Load and return bounds returned by function in SLOT. */
39752 static rtx
39753 ix86_load_returned_bounds (rtx slot)
39755 rtx res;
39757 gcc_assert (REG_P (slot));
39758 res = gen_reg_rtx (BNDmode);
39759 emit_move_insn (res, slot);
39761 return res;
39764 /* Store BOUNDS returned by function into SLOT. */
39766 static void
39767 ix86_store_returned_bounds (rtx slot, rtx bounds)
39769 gcc_assert (REG_P (slot));
39770 emit_move_insn (slot, bounds);
39773 /* Returns a function decl for a vectorized version of the combined function
39774 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
39775 if it is not available. */
39777 static tree
39778 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
39779 tree type_in)
39781 machine_mode in_mode, out_mode;
39782 int in_n, out_n;
39784 if (TREE_CODE (type_out) != VECTOR_TYPE
39785 || TREE_CODE (type_in) != VECTOR_TYPE)
39786 return NULL_TREE;
39788 out_mode = TYPE_MODE (TREE_TYPE (type_out));
39789 out_n = TYPE_VECTOR_SUBPARTS (type_out);
39790 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39791 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39793 switch (fn)
39795 CASE_CFN_EXP2:
39796 if (out_mode == SFmode && in_mode == SFmode)
39798 if (out_n == 16 && in_n == 16)
39799 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
39801 break;
39803 CASE_CFN_IFLOOR:
39804 CASE_CFN_LFLOOR:
39805 CASE_CFN_LLFLOOR:
39806 /* The round insn does not trap on denormals. */
39807 if (flag_trapping_math || !TARGET_ROUND)
39808 break;
39810 if (out_mode == SImode && in_mode == DFmode)
39812 if (out_n == 4 && in_n == 2)
39813 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
39814 else if (out_n == 8 && in_n == 4)
39815 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
39816 else if (out_n == 16 && in_n == 8)
39817 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
39819 if (out_mode == SImode && in_mode == SFmode)
39821 if (out_n == 4 && in_n == 4)
39822 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
39823 else if (out_n == 8 && in_n == 8)
39824 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
39825 else if (out_n == 16 && in_n == 16)
39826 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
39828 break;
39830 CASE_CFN_ICEIL:
39831 CASE_CFN_LCEIL:
39832 CASE_CFN_LLCEIL:
39833 /* The round insn does not trap on denormals. */
39834 if (flag_trapping_math || !TARGET_ROUND)
39835 break;
39837 if (out_mode == SImode && in_mode == DFmode)
39839 if (out_n == 4 && in_n == 2)
39840 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
39841 else if (out_n == 8 && in_n == 4)
39842 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
39843 else if (out_n == 16 && in_n == 8)
39844 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
39846 if (out_mode == SImode && in_mode == SFmode)
39848 if (out_n == 4 && in_n == 4)
39849 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
39850 else if (out_n == 8 && in_n == 8)
39851 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
39852 else if (out_n == 16 && in_n == 16)
39853 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
39855 break;
39857 CASE_CFN_IRINT:
39858 CASE_CFN_LRINT:
39859 CASE_CFN_LLRINT:
39860 if (out_mode == SImode && in_mode == DFmode)
39862 if (out_n == 4 && in_n == 2)
39863 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
39864 else if (out_n == 8 && in_n == 4)
39865 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
39866 else if (out_n == 16 && in_n == 8)
39867 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
39869 if (out_mode == SImode && in_mode == SFmode)
39871 if (out_n == 4 && in_n == 4)
39872 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
39873 else if (out_n == 8 && in_n == 8)
39874 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
39875 else if (out_n == 16 && in_n == 16)
39876 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
39878 break;
39880 CASE_CFN_IROUND:
39881 CASE_CFN_LROUND:
39882 CASE_CFN_LLROUND:
39883 /* The round insn does not trap on denormals. */
39884 if (flag_trapping_math || !TARGET_ROUND)
39885 break;
39887 if (out_mode == SImode && in_mode == DFmode)
39889 if (out_n == 4 && in_n == 2)
39890 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
39891 else if (out_n == 8 && in_n == 4)
39892 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
39893 else if (out_n == 16 && in_n == 8)
39894 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
39896 if (out_mode == SImode && in_mode == SFmode)
39898 if (out_n == 4 && in_n == 4)
39899 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
39900 else if (out_n == 8 && in_n == 8)
39901 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
39902 else if (out_n == 16 && in_n == 16)
39903 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
39905 break;
39907 CASE_CFN_FLOOR:
39908 /* The round insn does not trap on denormals. */
39909 if (flag_trapping_math || !TARGET_ROUND)
39910 break;
39912 if (out_mode == DFmode && in_mode == DFmode)
39914 if (out_n == 2 && in_n == 2)
39915 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
39916 else if (out_n == 4 && in_n == 4)
39917 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
39918 else if (out_n == 8 && in_n == 8)
39919 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
39921 if (out_mode == SFmode && in_mode == SFmode)
39923 if (out_n == 4 && in_n == 4)
39924 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
39925 else if (out_n == 8 && in_n == 8)
39926 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
39927 else if (out_n == 16 && in_n == 16)
39928 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
39930 break;
39932 CASE_CFN_CEIL:
39933 /* The round insn does not trap on denormals. */
39934 if (flag_trapping_math || !TARGET_ROUND)
39935 break;
39937 if (out_mode == DFmode && in_mode == DFmode)
39939 if (out_n == 2 && in_n == 2)
39940 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
39941 else if (out_n == 4 && in_n == 4)
39942 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
39943 else if (out_n == 8 && in_n == 8)
39944 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
39946 if (out_mode == SFmode && in_mode == SFmode)
39948 if (out_n == 4 && in_n == 4)
39949 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
39950 else if (out_n == 8 && in_n == 8)
39951 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
39952 else if (out_n == 16 && in_n == 16)
39953 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
39955 break;
39957 CASE_CFN_TRUNC:
39958 /* The round insn does not trap on denormals. */
39959 if (flag_trapping_math || !TARGET_ROUND)
39960 break;
39962 if (out_mode == DFmode && in_mode == DFmode)
39964 if (out_n == 2 && in_n == 2)
39965 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
39966 else if (out_n == 4 && in_n == 4)
39967 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
39968 else if (out_n == 8 && in_n == 8)
39969 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
39971 if (out_mode == SFmode && in_mode == SFmode)
39973 if (out_n == 4 && in_n == 4)
39974 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
39975 else if (out_n == 8 && in_n == 8)
39976 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
39977 else if (out_n == 16 && in_n == 16)
39978 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
39980 break;
39982 CASE_CFN_RINT:
39983 /* The round insn does not trap on denormals. */
39984 if (flag_trapping_math || !TARGET_ROUND)
39985 break;
39987 if (out_mode == DFmode && in_mode == DFmode)
39989 if (out_n == 2 && in_n == 2)
39990 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
39991 else if (out_n == 4 && in_n == 4)
39992 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
39994 if (out_mode == SFmode && in_mode == SFmode)
39996 if (out_n == 4 && in_n == 4)
39997 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
39998 else if (out_n == 8 && in_n == 8)
39999 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
40001 break;
40003 CASE_CFN_FMA:
40004 if (out_mode == DFmode && in_mode == DFmode)
40006 if (out_n == 2 && in_n == 2)
40007 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
40008 if (out_n == 4 && in_n == 4)
40009 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
40011 if (out_mode == SFmode && in_mode == SFmode)
40013 if (out_n == 4 && in_n == 4)
40014 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
40015 if (out_n == 8 && in_n == 8)
40016 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
40018 break;
40020 default:
40021 break;
40024 /* Dispatch to a handler for a vectorization library. */
40025 if (ix86_veclib_handler)
40026 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
40028 return NULL_TREE;
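/* Illustrative example (not compiled here): a loop of this shape is the
   kind of request the hook above answers.  Built with -O3 -mavx
   -fno-trapping-math, the vectorizer may ask for a V4DF floor, and the
   hook would then return the decl for IX86_BUILTIN_FLOORPD256.  */
#if 0
#include <math.h>

void
floor_array (double *restrict out, const double *restrict in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = floor (in[i]);
}
#endif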
40031 /* Handler for an SVML-style interface to
40032 a library with vectorized intrinsics. */
40034 static tree
40035 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
40037 char name[20];
40038 tree fntype, new_fndecl, args;
40039 unsigned arity;
40040 const char *bname;
40041 machine_mode el_mode, in_mode;
40042 int n, in_n;
40044 /* SVML is suitable for unsafe math only. */
40045 if (!flag_unsafe_math_optimizations)
40046 return NULL_TREE;
40048 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40049 n = TYPE_VECTOR_SUBPARTS (type_out);
40050 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40051 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40052 if (el_mode != in_mode
40053 || n != in_n)
40054 return NULL_TREE;
40056 switch (fn)
40058 CASE_CFN_EXP:
40059 CASE_CFN_LOG:
40060 CASE_CFN_LOG10:
40061 CASE_CFN_POW:
40062 CASE_CFN_TANH:
40063 CASE_CFN_TAN:
40064 CASE_CFN_ATAN:
40065 CASE_CFN_ATAN2:
40066 CASE_CFN_ATANH:
40067 CASE_CFN_CBRT:
40068 CASE_CFN_SINH:
40069 CASE_CFN_SIN:
40070 CASE_CFN_ASINH:
40071 CASE_CFN_ASIN:
40072 CASE_CFN_COSH:
40073 CASE_CFN_COS:
40074 CASE_CFN_ACOSH:
40075 CASE_CFN_ACOS:
40076 if ((el_mode != DFmode || n != 2)
40077 && (el_mode != SFmode || n != 4))
40078 return NULL_TREE;
40079 break;
40081 default:
40082 return NULL_TREE;
40085 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40086 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40088 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
40089 strcpy (name, "vmlsLn4");
40090 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
40091 strcpy (name, "vmldLn2");
40092 else if (n == 4)
40094 sprintf (name, "vmls%s", bname+10);
40095 name[strlen (name)-1] = '4';
40097 else
40098 sprintf (name, "vmld%s2", bname+10);
40100 /* Convert to uppercase. */
40101 name[4] &= ~0x20;
40103 arity = 0;
40104 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40105 arity++;
40107 if (arity == 1)
40108 fntype = build_function_type_list (type_out, type_in, NULL);
40109 else
40110 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40112 /* Build a function declaration for the vectorized function. */
40113 new_fndecl = build_decl (BUILTINS_LOCATION,
40114 FUNCTION_DECL, get_identifier (name), fntype);
40115 TREE_PUBLIC (new_fndecl) = 1;
40116 DECL_EXTERNAL (new_fndecl) = 1;
40117 DECL_IS_NOVOPS (new_fndecl) = 1;
40118 TREE_READONLY (new_fndecl) = 1;
40120 return new_fndecl;
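/* Illustrative example (not compiled here) of the SVML name mangling
   performed above: "__builtin_sinf" over 4 floats becomes "vmlsSin4",
   "__builtin_sin" over 2 doubles becomes "vmldSin2", and log is
   special-cased to "vmlsLn4"/"vmldLn2".  */
#if 0
#include <stdio.h>
#include <string.h>

int
main (void)
{
  char name[20];
  const char *bname = "__builtin_sinf";	/* scalar builtin name */

  /* Single precision, 4 lanes: "vmls" + name without "__builtin_",
     then the trailing character is replaced by the lane count.  */
  sprintf (name, "vmls%s", bname + 10);
  name[strlen (name) - 1] = '4';
  /* Uppercase the first letter of the math function name.  */
  name[4] &= ~0x20;
  printf ("%s\n", name);	/* prints "vmlsSin4" */
  return 0;
}
#endif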
40123 /* Handler for an ACML-style interface to
40124 a library with vectorized intrinsics. */
40126 static tree
40127 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
40129 char name[20] = "__vr.._";
40130 tree fntype, new_fndecl, args;
40131 unsigned arity;
40132 const char *bname;
40133 machine_mode el_mode, in_mode;
40134 int n, in_n;
40136 /* ACML is 64-bit only and suitable for unsafe math only, as it
40137 does not correctly support parts of IEEE with the required
40138 precision, such as denormals. */
40139 if (!TARGET_64BIT
40140 || !flag_unsafe_math_optimizations)
40141 return NULL_TREE;
40143 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40144 n = TYPE_VECTOR_SUBPARTS (type_out);
40145 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40146 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40147 if (el_mode != in_mode
40148 || n != in_n)
40149 return NULL_TREE;
40151 switch (fn)
40153 CASE_CFN_SIN:
40154 CASE_CFN_COS:
40155 CASE_CFN_EXP:
40156 CASE_CFN_LOG:
40157 CASE_CFN_LOG2:
40158 CASE_CFN_LOG10:
40159 if (el_mode == DFmode && n == 2)
40161 name[4] = 'd';
40162 name[5] = '2';
40164 else if (el_mode == SFmode && n == 4)
40166 name[4] = 's';
40167 name[5] = '4';
40169 else
40170 return NULL_TREE;
40171 break;
40173 default:
40174 return NULL_TREE;
40177 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40178 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40179 sprintf (name + 7, "%s", bname+10);
40181 arity = 0;
40182 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40183 arity++;
40185 if (arity == 1)
40186 fntype = build_function_type_list (type_out, type_in, NULL);
40187 else
40188 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40190 /* Build a function declaration for the vectorized function. */
40191 new_fndecl = build_decl (BUILTINS_LOCATION,
40192 FUNCTION_DECL, get_identifier (name), fntype);
40193 TREE_PUBLIC (new_fndecl) = 1;
40194 DECL_EXTERNAL (new_fndecl) = 1;
40195 DECL_IS_NOVOPS (new_fndecl) = 1;
40196 TREE_READONLY (new_fndecl) = 1;
40198 return new_fndecl;
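/* Illustrative example (not compiled here) of the ACML mangling above:
   "__builtin_sin" on V2DF becomes "__vrd2_sin" and "__builtin_sinf" on
   V4SF becomes "__vrs4_sinf".  */
#if 0
#include <stdio.h>

int
main (void)
{
  char name[20] = "__vr.._";
  const char *bname = "__builtin_sin";	/* scalar builtin name */

  name[4] = 'd';			/* double precision */
  name[5] = '2';			/* two lanes */
  sprintf (name + 7, "%s", bname + 10);
  printf ("%s\n", name);		/* prints "__vrd2_sin" */
  return 0;
}
#endif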
40201 /* Returns a decl of a function that implements gather load with
40202 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
40203 Return NULL_TREE if it is not available. */
40205 static tree
40206 ix86_vectorize_builtin_gather (const_tree mem_vectype,
40207 const_tree index_type, int scale)
40209 bool si;
40210 enum ix86_builtins code;
40212 if (! TARGET_AVX2)
40213 return NULL_TREE;
40215 if ((TREE_CODE (index_type) != INTEGER_TYPE
40216 && !POINTER_TYPE_P (index_type))
40217 || (TYPE_MODE (index_type) != SImode
40218 && TYPE_MODE (index_type) != DImode))
40219 return NULL_TREE;
40221 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40222 return NULL_TREE;
40224 /* v*gather* insn sign extends index to pointer mode. */
40225 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40226 && TYPE_UNSIGNED (index_type))
40227 return NULL_TREE;
40229 if (scale <= 0
40230 || scale > 8
40231 || (scale & (scale - 1)) != 0)
40232 return NULL_TREE;
40234 si = TYPE_MODE (index_type) == SImode;
40235 switch (TYPE_MODE (mem_vectype))
40237 case V2DFmode:
40238 if (TARGET_AVX512VL)
40239 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
40240 else
40241 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
40242 break;
40243 case V4DFmode:
40244 if (TARGET_AVX512VL)
40245 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
40246 else
40247 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
40248 break;
40249 case V2DImode:
40250 if (TARGET_AVX512VL)
40251 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
40252 else
40253 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
40254 break;
40255 case V4DImode:
40256 if (TARGET_AVX512VL)
40257 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
40258 else
40259 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
40260 break;
40261 case V4SFmode:
40262 if (TARGET_AVX512VL)
40263 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
40264 else
40265 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
40266 break;
40267 case V8SFmode:
40268 if (TARGET_AVX512VL)
40269 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
40270 else
40271 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
40272 break;
40273 case V4SImode:
40274 if (TARGET_AVX512VL)
40275 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
40276 else
40277 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
40278 break;
40279 case V8SImode:
40280 if (TARGET_AVX512VL)
40281 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
40282 else
40283 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
40284 break;
40285 case V8DFmode:
40286 if (TARGET_AVX512F)
40287 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
40288 else
40289 return NULL_TREE;
40290 break;
40291 case V8DImode:
40292 if (TARGET_AVX512F)
40293 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
40294 else
40295 return NULL_TREE;
40296 break;
40297 case V16SFmode:
40298 if (TARGET_AVX512F)
40299 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
40300 else
40301 return NULL_TREE;
40302 break;
40303 case V16SImode:
40304 if (TARGET_AVX512F)
40305 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
40306 else
40307 return NULL_TREE;
40308 break;
40309 default:
40310 return NULL_TREE;
40313 return ix86_get_builtin (code);
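/* Illustrative example (not compiled here): an indexed-load loop like
   this is what the vectorizer presents to the hook above.  With
   -O3 -mavx2, a DFmode load indexed by a 32-bit index may be mapped to
   one of the GATHER*SIV4DF builtins selected above.  */
#if 0
void
gather_example (double *restrict out, const double *restrict data,
		const int *restrict idx, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = data[idx[i]];
}
#endif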
40316 /* Returns a decl of a function that implements scatter store with
40317 register type VECTYPE and index type INDEX_TYPE and SCALE.
40318 Return NULL_TREE if it is not available. */
40320 static tree
40321 ix86_vectorize_builtin_scatter (const_tree vectype,
40322 const_tree index_type, int scale)
40324 bool si;
40325 enum ix86_builtins code;
40327 if (!TARGET_AVX512F)
40328 return NULL_TREE;
40330 if ((TREE_CODE (index_type) != INTEGER_TYPE
40331 && !POINTER_TYPE_P (index_type))
40332 || (TYPE_MODE (index_type) != SImode
40333 && TYPE_MODE (index_type) != DImode))
40334 return NULL_TREE;
40336 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40337 return NULL_TREE;
40339 /* v*scatter* insn sign extends index to pointer mode. */
40340 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40341 && TYPE_UNSIGNED (index_type))
40342 return NULL_TREE;
40344 /* Scale can be 1, 2, 4 or 8. */
40345 if (scale <= 0
40346 || scale > 8
40347 || (scale & (scale - 1)) != 0)
40348 return NULL_TREE;
40350 si = TYPE_MODE (index_type) == SImode;
40351 switch (TYPE_MODE (vectype))
40353 case V8DFmode:
40354 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
40355 break;
40356 case V8DImode:
40357 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
40358 break;
40359 case V16SFmode:
40360 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
40361 break;
40362 case V16SImode:
40363 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
40364 break;
40365 default:
40366 return NULL_TREE;
40369 return ix86_builtins[code];
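/* Illustrative example (not compiled here): the scatter counterpart of
   the gather case.  With -O3 -mavx512f an indexed store like this may be
   vectorized through one of the SCATTER* builtins chosen above.  */
#if 0
void
scatter_example (double *restrict data, const int *restrict idx,
		 const double *restrict in, int n)
{
  for (int i = 0; i < n; i++)
    data[idx[i]] = in[i];
}
#endif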
40372 /* Return true if it is safe to use the rsqrt optabs to optimize
40373 1.0/sqrt. */
40375 static bool
40376 use_rsqrt_p ()
40378 return (TARGET_SSE_MATH
40379 && flag_finite_math_only
40380 && !flag_trapping_math
40381 && flag_unsafe_math_optimizations);
40384 /* Returns a code for a target-specific builtin that implements
40385 reciprocal of the function, or NULL_TREE if not available. */
40387 static tree
40388 ix86_builtin_reciprocal (tree fndecl)
40390 switch (DECL_FUNCTION_CODE (fndecl))
40392 /* Vectorized version of sqrt to rsqrt conversion. */
40393 case IX86_BUILTIN_SQRTPS_NR:
40394 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
40396 case IX86_BUILTIN_SQRTPS_NR256:
40397 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
40399 default:
40400 return NULL_TREE;
40404 /* Helper for avx_vpermilps256_operand et al. This is also used by
40405 the expansion functions to turn the parallel back into a mask.
40406 The return value is 0 for no match and the imm8+1 for a match. */
40409 avx_vpermilp_parallel (rtx par, machine_mode mode)
40411 unsigned i, nelt = GET_MODE_NUNITS (mode);
40412 unsigned mask = 0;
40413 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
40415 if (XVECLEN (par, 0) != (int) nelt)
40416 return 0;
40418 /* Validate that all of the elements are constants, and not totally
40419 out of range. Copy the data into an integral array to make the
40420 subsequent checks easier. */
40421 for (i = 0; i < nelt; ++i)
40423 rtx er = XVECEXP (par, 0, i);
40424 unsigned HOST_WIDE_INT ei;
40426 if (!CONST_INT_P (er))
40427 return 0;
40428 ei = INTVAL (er);
40429 if (ei >= nelt)
40430 return 0;
40431 ipar[i] = ei;
40434 switch (mode)
40436 case V8DFmode:
40437 /* In the 512-bit DFmode case, we can only move elements within
40438 a 128-bit lane. First fill the second part of the mask,
40439 then fallthru. */
40440 for (i = 4; i < 6; ++i)
40442 if (ipar[i] < 4 || ipar[i] >= 6)
40443 return 0;
40444 mask |= (ipar[i] - 4) << i;
40446 for (i = 6; i < 8; ++i)
40448 if (ipar[i] < 6)
40449 return 0;
40450 mask |= (ipar[i] - 6) << i;
40452 /* FALLTHRU */
40454 case V4DFmode:
40455 /* In the 256-bit DFmode case, we can only move elements within
40456 a 128-bit lane. */
40457 for (i = 0; i < 2; ++i)
40459 if (ipar[i] >= 2)
40460 return 0;
40461 mask |= ipar[i] << i;
40463 for (i = 2; i < 4; ++i)
40465 if (ipar[i] < 2)
40466 return 0;
40467 mask |= (ipar[i] - 2) << i;
40469 break;
40471 case V16SFmode:
40472 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
40473 must mirror the permutation in the lower 256 bits. */
40474 for (i = 0; i < 8; ++i)
40475 if (ipar[i] + 8 != ipar[i + 8])
40476 return 0;
40477 /* FALLTHRU */
40479 case V8SFmode:
40480 /* In the 256-bit SFmode case, we have full freedom of
40481 movement within the low 128-bit lane, but the high 128-bit
40482 lane must mirror the exact same pattern. */
40483 for (i = 0; i < 4; ++i)
40484 if (ipar[i] + 4 != ipar[i + 4])
40485 return 0;
40486 nelt = 4;
40487 /* FALLTHRU */
40489 case V2DFmode:
40490 case V4SFmode:
40491 /* In the 128-bit case, we have full freedom in the placement of
40492 the elements from the source operand. */
40493 for (i = 0; i < nelt; ++i)
40494 mask |= ipar[i] << (i * (nelt / 2));
40495 break;
40497 default:
40498 gcc_unreachable ();
40501 /* Make sure success has a non-zero value by adding one. */
40502 return mask + 1;
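/* A minimal sketch (illustrative only, not compiled; the function name is
   invented) of the 128-bit SFmode branch above.  For the parallel
   (2 3 0 1) it yields 0x4e, exactly the vpermilps immediate.  */
#if 0
static unsigned
v4sf_vpermilp_mask (const unsigned char ipar[4])
{
  unsigned mask = 0;

  /* Each of the four 2-bit immediate fields selects a source element.  */
  for (unsigned i = 0; i < 4; i++)
    mask |= ipar[i] << (i * 2);
  return mask;
}
#endif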
40505 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
40506 the expansion functions to turn the parallel back into a mask.
40507 The return value is 0 for no match and the imm8+1 for a match. */
40510 avx_vperm2f128_parallel (rtx par, machine_mode mode)
40512 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
40513 unsigned mask = 0;
40514 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
40516 if (XVECLEN (par, 0) != (int) nelt)
40517 return 0;
40519 /* Validate that all of the elements are constants, and not totally
40520 out of range. Copy the data into an integral array to make the
40521 subsequent checks easier. */
40522 for (i = 0; i < nelt; ++i)
40524 rtx er = XVECEXP (par, 0, i);
40525 unsigned HOST_WIDE_INT ei;
40527 if (!CONST_INT_P (er))
40528 return 0;
40529 ei = INTVAL (er);
40530 if (ei >= 2 * nelt)
40531 return 0;
40532 ipar[i] = ei;
40535 /* Validate that the halves of the permute are halves. */
40536 for (i = 0; i < nelt2 - 1; ++i)
40537 if (ipar[i] + 1 != ipar[i + 1])
40538 return 0;
40539 for (i = nelt2; i < nelt - 1; ++i)
40540 if (ipar[i] + 1 != ipar[i + 1])
40541 return 0;
40543 /* Reconstruct the mask. */
40544 for (i = 0; i < 2; ++i)
40546 unsigned e = ipar[i * nelt2];
40547 if (e % nelt2)
40548 return 0;
40549 e /= nelt2;
40550 mask |= e << (i * 4);
40553 /* Make sure success has a non-zero value by adding one. */
40554 return mask + 1;
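/* A minimal sketch (illustrative only, not compiled; the function name is
   invented) of the mask reconstruction above for a V4DFmode parallel.
   The parallel (2 3 4 5) selects the high lane of the first operand and
   the low lane of the second, giving the vperm2f128 immediate 0x21.  */
#if 0
static unsigned
v4df_vperm2f128_mask (const unsigned char ipar[4])
{
  /* Each half is a run of two consecutive elements; its first element,
     divided by the lane width (2), becomes one 4-bit selector.  */
  unsigned mask = 0;

  for (unsigned i = 0; i < 2; i++)
    mask |= (ipar[i * 2] / 2) << (i * 4);
  return mask;
}
#endif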
40557 /* Return a register priority for hard reg REGNO. */
40558 static int
40559 ix86_register_priority (int hard_regno)
40561 /* ebp and r13 as the base always want a displacement, and r12 as the
40562 base always wants an index. So discourage their usage in an
40563 address. */
40564 if (hard_regno == R12_REG || hard_regno == R13_REG)
40565 return 0;
40566 if (hard_regno == BP_REG)
40567 return 1;
40568 /* New x86-64 int registers result in bigger code size. Discourage
40569 them. */
40570 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
40571 return 2;
40572 /* New x86-64 SSE registers result in bigger code size. Discourage
40573 them. */
40574 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
40575 return 2;
40576 /* Usage of AX register results in smaller code. Prefer it. */
40577 if (hard_regno == AX_REG)
40578 return 4;
40579 return 3;
40582 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
40584 Put float CONST_DOUBLE in the constant pool instead of fp regs.
40585 QImode must go into class Q_REGS.
40586 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
40587 movdf to do mem-to-mem moves through integer regs. */
40589 static reg_class_t
40590 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
40592 machine_mode mode = GET_MODE (x);
40594 /* We're only allowed to return a subclass of CLASS. Many of the
40595 following checks fail for NO_REGS, so eliminate that early. */
40596 if (regclass == NO_REGS)
40597 return NO_REGS;
40599 /* All classes can load zeros. */
40600 if (x == CONST0_RTX (mode))
40601 return regclass;
40603 /* Force constants into memory if we are loading a (nonzero) constant into
40604 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
40605 instructions to load from a constant. */
40606 if (CONSTANT_P (x)
40607 && (MAYBE_MMX_CLASS_P (regclass)
40608 || MAYBE_SSE_CLASS_P (regclass)
40609 || MAYBE_MASK_CLASS_P (regclass)))
40610 return NO_REGS;
40612 /* Floating-point constants need more complex checks. */
40613 if (CONST_DOUBLE_P (x))
40615 /* General regs can load everything. */
40616 if (INTEGER_CLASS_P (regclass))
40617 return regclass;
40619 /* Floats can load 0 and 1 plus some others. Note that we eliminated
40620 zero above. We only want to wind up preferring 80387 registers if
40621 we plan on doing computation with them. */
40622 if (IS_STACK_MODE (mode)
40623 && standard_80387_constant_p (x) > 0)
40625 /* Limit class to FP regs. */
40626 if (FLOAT_CLASS_P (regclass))
40627 return FLOAT_REGS;
40628 else if (regclass == FP_TOP_SSE_REGS)
40629 return FP_TOP_REG;
40630 else if (regclass == FP_SECOND_SSE_REGS)
40631 return FP_SECOND_REG;
40634 return NO_REGS;
40637 /* Prefer SSE regs only, if we can use them for math. */
40638 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40639 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
40641 /* Generally when we see PLUS here, it's the function invariant
40642 (plus soft-fp const_int), which can only be computed into general
40643 regs. */
40644 if (GET_CODE (x) == PLUS)
40645 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
40647 /* QImode constants are easy to load, but non-constant QImode data
40648 must go into Q_REGS. */
40649 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
40651 if (Q_CLASS_P (regclass))
40652 return regclass;
40653 else if (reg_class_subset_p (Q_REGS, regclass))
40654 return Q_REGS;
40655 else
40656 return NO_REGS;
40659 return regclass;
40662 /* Discourage putting floating-point values in SSE registers unless
40663 SSE math is being used, and likewise for the 387 registers. */
40664 static reg_class_t
40665 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
40667 machine_mode mode = GET_MODE (x);
40669 /* Restrict the output reload class to the register bank that we are doing
40670 math on. If we would like not to return a subset of CLASS, reject this
40671 alternative: if reload cannot do this, it will still use its choice. */
40672 mode = GET_MODE (x);
40673 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40674 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
40676 if (IS_STACK_MODE (mode))
40678 if (regclass == FP_TOP_SSE_REGS)
40679 return FP_TOP_REG;
40680 else if (regclass == FP_SECOND_SSE_REGS)
40681 return FP_SECOND_REG;
40682 else
40683 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
40686 return regclass;
40689 static reg_class_t
40690 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
40691 machine_mode mode, secondary_reload_info *sri)
40693 /* Double-word spills from general registers to non-offsettable memory
40694 references (zero-extended addresses) require special handling. */
40695 if (TARGET_64BIT
40696 && MEM_P (x)
40697 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
40698 && INTEGER_CLASS_P (rclass)
40699 && !offsettable_memref_p (x))
40701 sri->icode = (in_p
40702 ? CODE_FOR_reload_noff_load
40703 : CODE_FOR_reload_noff_store);
40704 /* Add the cost of moving address to a temporary. */
40705 sri->extra_cost = 1;
40707 return NO_REGS;
40710 /* QImode spills from non-QI registers require
40711 intermediate register on 32bit targets. */
40712 if (mode == QImode
40713 && ((!TARGET_64BIT && !in_p
40714 && INTEGER_CLASS_P (rclass)
40715 && MAYBE_NON_Q_CLASS_P (rclass))
40716 || (!TARGET_AVX512DQ
40717 && MAYBE_MASK_CLASS_P (rclass))))
40719 int regno = true_regnum (x);
40721 /* Return Q_REGS if the operand is in memory. */
40722 if (regno == -1)
40723 return Q_REGS;
40725 return NO_REGS;
40728 /* This condition handles corner case where an expression involving
40729 pointers gets vectorized. We're trying to use the address of a
40730 stack slot as a vector initializer.
40732 (set (reg:V2DI 74 [ vect_cst_.2 ])
40733 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
40735 Eventually frame gets turned into sp+offset like this:
40737 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40738 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40739 (const_int 392 [0x188]))))
40741 That later gets turned into:
40743 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40744 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40745 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
40747 We'll have the following reload recorded:
40749 Reload 0: reload_in (DI) =
40750 (plus:DI (reg/f:DI 7 sp)
40751 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
40752 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40753 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
40754 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
40755 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40756 reload_reg_rtx: (reg:V2DI 22 xmm1)
40758 Which isn't going to work since SSE instructions can't handle scalar
40759 additions. Returning GENERAL_REGS forces the addition into integer
40760 register and reload can handle subsequent reloads without problems. */
40762 if (in_p && GET_CODE (x) == PLUS
40763 && SSE_CLASS_P (rclass)
40764 && SCALAR_INT_MODE_P (mode))
40765 return GENERAL_REGS;
40767 return NO_REGS;
40770 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
40772 static bool
40773 ix86_class_likely_spilled_p (reg_class_t rclass)
40775 switch (rclass)
40777 case AREG:
40778 case DREG:
40779 case CREG:
40780 case BREG:
40781 case AD_REGS:
40782 case SIREG:
40783 case DIREG:
40784 case SSE_FIRST_REG:
40785 case FP_TOP_REG:
40786 case FP_SECOND_REG:
40787 case BND_REGS:
40788 return true;
40790 default:
40791 break;
40794 return false;
40797 /* If we are copying between registers from different register sets
40798 (e.g. FP and integer), we may need a memory location.
40800 The function can't work reliably when one of the CLASSES is a class
40801 containing registers from multiple sets. We avoid this by never combining
40802 different sets in a single alternative in the machine description.
40803 Ensure that this constraint holds to avoid unexpected surprises.
40805 When STRICT is false, we are being called from REGISTER_MOVE_COST,
40806 so do not enforce these sanity checks.
40808 To optimize register_move_cost performance, define an inline variant. */
40810 static inline bool
40811 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40812 machine_mode mode, int strict)
40814 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
40815 return false;
40817 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
40818 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
40819 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
40820 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
40821 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
40822 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
40823 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
40824 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
40826 gcc_assert (!strict || lra_in_progress);
40827 return true;
40830 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
40831 return true;
40833 /* Between mask and general, we have moves no larger than word size. */
40834 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
40835 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
40836 return true;
40838 /* ??? This is a lie. We do have moves between mmx/general, and for
40839 mmx/sse2. But by saying we need secondary memory we discourage the
40840 register allocator from using the mmx registers unless needed. */
40841 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
40842 return true;
40844 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40846 /* SSE1 doesn't have any direct moves from other classes. */
40847 if (!TARGET_SSE2)
40848 return true;
40850 /* If the target says that inter-unit moves are more expensive
40851 than moving through memory, then don't generate them. */
40852 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
40853 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
40854 return true;
40856 /* Between SSE and general, we have moves no larger than word size. */
40857 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40858 return true;
40861 return false;
40864 bool
40865 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40866 machine_mode mode, int strict)
40868 return inline_secondary_memory_needed (class1, class2, mode, strict);
40871 /* Implement the TARGET_CLASS_MAX_NREGS hook.
40873 On the 80386, this is the size of MODE in words,
40874 except in the FP regs, where a single reg is always enough. */
40876 static unsigned char
40877 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
40879 if (MAYBE_INTEGER_CLASS_P (rclass))
40881 if (mode == XFmode)
40882 return (TARGET_64BIT ? 2 : 3);
40883 else if (mode == XCmode)
40884 return (TARGET_64BIT ? 4 : 6);
40885 else
40886 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40888 else
40890 if (COMPLEX_MODE_P (mode))
40891 return 2;
40892 else
40893 return 1;
40897 /* Return true if the registers in CLASS cannot represent the change from
40898 modes FROM to TO. */
40900 bool
40901 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
40902 enum reg_class regclass)
40904 if (from == to)
40905 return false;
40907 /* x87 registers can't do subreg at all, as all values are reformatted
40908 to extended precision. */
40909 if (MAYBE_FLOAT_CLASS_P (regclass))
40910 return true;
40912 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
40914 /* Vector registers do not support QI or HImode loads. If we don't
40915 disallow a change to these modes, reload will assume it's ok to
40916 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
40917 the vec_dupv4hi pattern. */
40918 if (GET_MODE_SIZE (from) < 4)
40919 return true;
40922 return false;
40925 /* Return the cost of moving data of mode M between a
40926 register and memory. A value of 2 is the default; this cost is
40927 relative to those in `REGISTER_MOVE_COST'.
40929 This function is used extensively by register_move_cost, which is used to
40930 build tables at startup. Make it inline in this case.
40931 When IN is 2, return the maximum of the in and out move costs.
40933 If moving between registers and memory is more expensive than
40934 between two registers, you should define this macro to express the
40935 relative cost.
40937 Also model the increased cost of moving QImode registers in non
40938 Q_REGS classes.
40940 static inline int
40941 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
40942 int in)
40944 int cost;
40945 if (FLOAT_CLASS_P (regclass))
40947 int index;
40948 switch (mode)
40950 case SFmode:
40951 index = 0;
40952 break;
40953 case DFmode:
40954 index = 1;
40955 break;
40956 case XFmode:
40957 index = 2;
40958 break;
40959 default:
40960 return 100;
40962 if (in == 2)
40963 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
40964 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
40966 if (SSE_CLASS_P (regclass))
40968 int index;
40969 switch (GET_MODE_SIZE (mode))
40971 case 4:
40972 index = 0;
40973 break;
40974 case 8:
40975 index = 1;
40976 break;
40977 case 16:
40978 index = 2;
40979 break;
40980 default:
40981 return 100;
40983 if (in == 2)
40984 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
40985 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
40987 if (MMX_CLASS_P (regclass))
40989 int index;
40990 switch (GET_MODE_SIZE (mode))
40992 case 4:
40993 index = 0;
40994 break;
40995 case 8:
40996 index = 1;
40997 break;
40998 default:
40999 return 100;
41001 if (in == 2)
41002 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
41003 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
41005 switch (GET_MODE_SIZE (mode))
41007 case 1:
41008 if (Q_CLASS_P (regclass) || TARGET_64BIT)
41010 if (!in)
41011 return ix86_cost->int_store[0];
41012 if (TARGET_PARTIAL_REG_DEPENDENCY
41013 && optimize_function_for_speed_p (cfun))
41014 cost = ix86_cost->movzbl_load;
41015 else
41016 cost = ix86_cost->int_load[0];
41017 if (in == 2)
41018 return MAX (cost, ix86_cost->int_store[0]);
41019 return cost;
41021 else
41023 if (in == 2)
41024 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
41025 if (in)
41026 return ix86_cost->movzbl_load;
41027 else
41028 return ix86_cost->int_store[0] + 4;
41030 break;
41031 case 2:
41032 if (in == 2)
41033 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
41034 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
41035 default:
41036 /* Compute the number of 32-bit moves needed. TFmode is moved as XFmode. */
41037 if (mode == TFmode)
41038 mode = XFmode;
41039 if (in == 2)
41040 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
41041 else if (in)
41042 cost = ix86_cost->int_load[2];
41043 else
41044 cost = ix86_cost->int_store[2];
41045 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
41049 static int
41050 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
41051 bool in)
41053 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
41057 /* Return the cost of moving data from a register in class CLASS1 to
41058 one in class CLASS2.
41060 It is not required that the cost always equal 2 when FROM is the same as TO;
41061 on some machines it is expensive to move between registers if they are not
41062 general registers. */
41064 static int
41065 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
41066 reg_class_t class2_i)
41068 enum reg_class class1 = (enum reg_class) class1_i;
41069 enum reg_class class2 = (enum reg_class) class2_i;
41071 /* In case we require secondary memory, compute cost of the store followed
41072 by load. In order to avoid bad register allocation choices, we need
41073 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
41075 if (inline_secondary_memory_needed (class1, class2, mode, 0))
41077 int cost = 1;
41079 cost += inline_memory_move_cost (mode, class1, 2);
41080 cost += inline_memory_move_cost (mode, class2, 2);
41082 /* In the case of copying from a general purpose register we may emit
41083 multiple stores followed by a single load, causing a memory size
41084 mismatch stall. Count this as an arbitrarily high cost of 20. */
41085 if (targetm.class_max_nregs (class1, mode)
41086 > targetm.class_max_nregs (class2, mode))
41087 cost += 20;
41089 /* In the case of FP/MMX moves, the registers actually overlap, and we
41090 have to switch modes in order to treat them differently. */
41091 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
41092 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
41093 cost += 20;
41095 return cost;
41098 /* Moves between SSE/MMX and integer unit are expensive. */
41099 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
41100 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
41102 /* ??? By keeping the returned value relatively high, we limit the number
41103 of moves between integer and MMX/SSE registers for all targets.
41104 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
41105 where integer modes in MMX/SSE registers are not tieable
41106 because of missing QImode and HImode moves to, from or between
41107 MMX/SSE registers. */
41108 return MAX (8, ix86_cost->mmxsse_to_integer);
41110 if (MAYBE_FLOAT_CLASS_P (class1))
41111 return ix86_cost->fp_move;
41112 if (MAYBE_SSE_CLASS_P (class1))
41113 return ix86_cost->sse_move;
41114 if (MAYBE_MMX_CLASS_P (class1))
41115 return ix86_cost->mmx_move;
41116 return 2;
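/* A minimal sketch (illustrative only, not compiled; the function and
   parameter names are invented) of the cost shape computed above when a
   secondary memory location is needed: one base unit, the two memory-move
   costs, plus the two 20-point penalties.  */
#if 0
static int
secondary_memory_move_cost (int mem_cost_class1, int mem_cost_class2,
			    int class1_needs_more_regs, int fp_mmx_overlap)
{
  int cost = 1 + mem_cost_class1 + mem_cost_class2;

  if (class1_needs_more_regs)
    cost += 20;		/* several stores followed by one load */
  if (fp_mmx_overlap)
    cost += 20;		/* must switch x87/MMX register-file modes */
  return cost;
}
#endif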
41119 /* Return TRUE if hard register REGNO can hold a value of machine-mode
41120 MODE. */
41122 bool
41123 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
41125 /* Flags and only flags can only hold CCmode values. */
41126 if (CC_REGNO_P (regno))
41127 return GET_MODE_CLASS (mode) == MODE_CC;
41128 if (GET_MODE_CLASS (mode) == MODE_CC
41129 || GET_MODE_CLASS (mode) == MODE_RANDOM
41130 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
41131 return false;
41132 if (STACK_REGNO_P (regno))
41133 return VALID_FP_MODE_P (mode);
41134 if (MASK_REGNO_P (regno))
41135 return (VALID_MASK_REG_MODE (mode)
41136 || (TARGET_AVX512BW
41137 && VALID_MASK_AVX512BW_MODE (mode)));
41138 if (BND_REGNO_P (regno))
41139 return VALID_BND_REG_MODE (mode);
41140 if (SSE_REGNO_P (regno))
41142 /* We implement the move patterns for all vector modes into and
41143 out of SSE registers, even when no operation instructions
41144 are available. */
41146 /* For AVX-512 we allow, regardless of regno:
41147 - XI mode
41148 - any of 512-bit wide vector mode
41149 - any scalar mode. */
41150 if (TARGET_AVX512F
41151 && (mode == XImode
41152 || VALID_AVX512F_REG_MODE (mode)
41153 || VALID_AVX512F_SCALAR_MODE (mode)))
41154 return true;
41156 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
41157 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41158 && MOD4_SSE_REGNO_P (regno)
41159 && mode == V64SFmode)
41160 return true;
41162 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
41163 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41164 && MOD4_SSE_REGNO_P (regno)
41165 && mode == V64SImode)
41166 return true;
41168 /* TODO check for QI/HI scalars. */
41169 /* AVX512VL allows SSE regs 16+ for 128/256-bit modes. */
41170 if (TARGET_AVX512VL
41171 && (mode == OImode
41172 || mode == TImode
41173 || VALID_AVX256_REG_MODE (mode)
41174 || VALID_AVX512VL_128_REG_MODE (mode)))
41175 return true;
41177 /* xmm16-xmm31 are only available for AVX-512. */
41178 if (EXT_REX_SSE_REGNO_P (regno))
41179 return false;
41181 /* OImode and AVX modes are available only when AVX is enabled. */
41182 return ((TARGET_AVX
41183 && VALID_AVX256_REG_OR_OI_MODE (mode))
41184 || VALID_SSE_REG_MODE (mode)
41185 || VALID_SSE2_REG_MODE (mode)
41186 || VALID_MMX_REG_MODE (mode)
41187 || VALID_MMX_REG_MODE_3DNOW (mode));
41189 if (MMX_REGNO_P (regno))
41191 /* We implement the move patterns for 3DNOW modes even in MMX mode,
41192 so if the register is available at all, then we can move data of
41193 the given mode into or out of it. */
41194 return (VALID_MMX_REG_MODE (mode)
41195 || VALID_MMX_REG_MODE_3DNOW (mode));
41198 if (mode == QImode)
41200 /* Take care with QImode values - they can live in non-QI regs,
41201 but then they do cause partial register stalls. */
41202 if (ANY_QI_REGNO_P (regno))
41203 return true;
41204 if (!TARGET_PARTIAL_REG_STALL)
41205 return true;
41206 /* LRA checks if the hard register is OK for the given mode.
41207 QImode values can live in non-QI regs, so we allow all
41208 registers here. */
41209 if (lra_in_progress)
41210 return true;
41211 return !can_create_pseudo_p ();
41213 /* We handle both integers and floats in the general purpose registers. */
41214 else if (VALID_INT_MODE_P (mode))
41215 return true;
41216 else if (VALID_FP_MODE_P (mode))
41217 return true;
41218 else if (VALID_DFP_MODE_P (mode))
41219 return true;
41220 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
41221 on to use that value in smaller contexts, this can easily force a
41222 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
41223 supporting DImode, allow it. */
41224 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
41225 return true;
41227 return false;
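/* A few illustrative consequences of the checks above: V16QImode is
   accepted in %xmm0-%xmm15, but in %xmm16-%xmm31 only with AVX512VL;
   V8DFmode (a 512-bit vector) needs AVX512F; DImode fits in any general
   register in 64-bit mode; and a QImode value in %esi or %edi is normally
   refused before reload on 32-bit targets tuned to avoid partial register
   stalls.  */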
41230 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
41231 tieable integer mode. */
41233 static bool
41234 ix86_tieable_integer_mode_p (machine_mode mode)
41236 switch (mode)
41238 case HImode:
41239 case SImode:
41240 return true;
41242 case QImode:
41243 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
41245 case DImode:
41246 return TARGET_64BIT;
41248 default:
41249 return false;
41253 /* Return true if MODE1 is accessible in a register that can hold MODE2
41254 without copying. That is, all register classes that can hold MODE2
41255 can also hold MODE1. */
41257 bool
41258 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
41260 if (mode1 == mode2)
41261 return true;
41263 if (ix86_tieable_integer_mode_p (mode1)
41264 && ix86_tieable_integer_mode_p (mode2))
41265 return true;
41267 /* MODE2 being XFmode implies fp stack or general regs, which means we
41268 can tie any smaller floating point modes to it. Note that we do not
41269 tie this with TFmode. */
41270 if (mode2 == XFmode)
41271 return mode1 == SFmode || mode1 == DFmode;
41273 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
41274 that we can tie it with SFmode. */
41275 if (mode2 == DFmode)
41276 return mode1 == SFmode;
41278 /* If MODE2 is only appropriate for an SSE register, then tie with
41279 any other mode acceptable to SSE registers. */
41280 if (GET_MODE_SIZE (mode2) == 32
41281 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41282 return (GET_MODE_SIZE (mode1) == 32
41283 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41284 if (GET_MODE_SIZE (mode2) == 16
41285 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41286 return (GET_MODE_SIZE (mode1) == 16
41287 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41289 /* If MODE2 is appropriate for an MMX register, then tie
41290 with any other mode acceptable to MMX registers. */
41291 if (GET_MODE_SIZE (mode2) == 8
41292 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
41293 return (GET_MODE_SIZE (mode1) == 8
41294 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
41296 return false;
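/* For example, on 64-bit targets SImode and DImode tie with each other
   (both are tieable integer modes there), XFmode ties with SFmode and
   DFmode but not with TFmode, and V4SFmode ties with V2DImode because both
   are 16-byte modes acceptable to SSE registers.  */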
41299 /* Return the cost of moving between two registers of mode MODE. */
41301 static int
41302 ix86_set_reg_reg_cost (machine_mode mode)
41304 unsigned int units = UNITS_PER_WORD;
41306 switch (GET_MODE_CLASS (mode))
41308 default:
41309 break;
41311 case MODE_CC:
41312 units = GET_MODE_SIZE (CCmode);
41313 break;
41315 case MODE_FLOAT:
41316 if ((TARGET_SSE && mode == TFmode)
41317 || (TARGET_80387 && mode == XFmode)
41318 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
41319 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
41320 units = GET_MODE_SIZE (mode);
41321 break;
41323 case MODE_COMPLEX_FLOAT:
41324 if ((TARGET_SSE && mode == TCmode)
41325 || (TARGET_80387 && mode == XCmode)
41326 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
41327 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
41328 units = GET_MODE_SIZE (mode);
41329 break;
41331 case MODE_VECTOR_INT:
41332 case MODE_VECTOR_FLOAT:
41333 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
41334 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
41335 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
41336 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
41337 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
41338 units = GET_MODE_SIZE (mode);
41341 /* Return the cost of moving between two registers of mode MODE,
41342 assuming that the move will be in pieces of at most UNITS bytes. */
41343 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
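/* Worked example, assuming a 64-bit target where UNITS_PER_WORD is 8: a
   V8SFmode (32-byte) register-to-register set costs COSTS_N_INSNS (1) when
   AVX is enabled, because UNITS is raised to the full mode size, but
   COSTS_N_INSNS (4) when only SSE is available and the copy has to be done
   in word-sized pieces.  */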
41346 /* Compute a (partial) cost for rtx X. Return true if the complete
41347 cost has been computed, and false if subexpressions should be
41348 scanned. In either case, *TOTAL contains the cost result. */
41350 static bool
41351 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
41352 int *total, bool speed)
41354 rtx mask;
41355 enum rtx_code code = GET_CODE (x);
41356 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
41357 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
41358 int src_cost;
41360 switch (code)
41362 case SET:
41363 if (register_operand (SET_DEST (x), VOIDmode)
41364 && reg_or_0_operand (SET_SRC (x), VOIDmode))
41366 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
41367 return true;
41370 if (register_operand (SET_SRC (x), VOIDmode))
41371 /* Avoid potentially incorrect high cost from rtx_costs
41372 for non-tieable SUBREGs. */
41373 src_cost = 0;
41374 else
41376 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
41378 if (CONSTANT_P (SET_SRC (x)))
41379 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
41380 a small value, possibly zero for cheap constants. */
41381 src_cost += COSTS_N_INSNS (1);
41384 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
41385 return true;
41387 case CONST_INT:
41388 case CONST:
41389 case LABEL_REF:
41390 case SYMBOL_REF:
41391 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
41392 *total = 3;
41393 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
41394 *total = 2;
41395 else if (flag_pic && SYMBOLIC_CONST (x)
41396 && !(TARGET_64BIT
41397 && (GET_CODE (x) == LABEL_REF
41398 || (GET_CODE (x) == SYMBOL_REF
41399 && SYMBOL_REF_LOCAL_P (x))))
41400 /* Use 0 cost for CONST to improve its propagation. */
41401 && (TARGET_64BIT || GET_CODE (x) != CONST))
41402 *total = 1;
41403 else
41404 *total = 0;
41405 return true;
41407 case CONST_DOUBLE:
41408 if (IS_STACK_MODE (mode))
41409 switch (standard_80387_constant_p (x))
41411 case -1:
41412 case 0:
41413 break;
41414 case 1: /* 0.0 */
41415 *total = 1;
41416 return true;
41417 default: /* Other constants */
41418 *total = 2;
41419 return true;
41421 /* FALLTHRU */
41423 case CONST_VECTOR:
41424 switch (standard_sse_constant_p (x, mode))
41426 case 0:
41427 break;
41428 case 1: /* 0: xor eliminates false dependency */
41429 *total = 0;
41430 return true;
41431 default: /* -1: cmp contains false dependency */
41432 *total = 1;
41433 return true;
41435 /* FALLTHRU */
41437 case CONST_WIDE_INT:
41438 /* Fall back to (MEM (SYMBOL_REF)), since that's where
41439 it'll probably end up. Add a penalty for size. */
41440 *total = (COSTS_N_INSNS (1)
41441 + (!TARGET_64BIT && flag_pic)
41442 + (GET_MODE_SIZE (mode) <= 4
41443 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
41444 return true;
41446 case ZERO_EXTEND:
41447 /* Zero extension is often completely free on x86_64, so make
41448 it as cheap as possible. */
41449 if (TARGET_64BIT && mode == DImode
41450 && GET_MODE (XEXP (x, 0)) == SImode)
41451 *total = 1;
41452 else if (TARGET_ZERO_EXTEND_WITH_AND)
41453 *total = cost->add;
41454 else
41455 *total = cost->movzx;
41456 return false;
41458 case SIGN_EXTEND:
41459 *total = cost->movsx;
41460 return false;
41462 case ASHIFT:
41463 if (SCALAR_INT_MODE_P (mode)
41464 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
41465 && CONST_INT_P (XEXP (x, 1)))
41467 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41468 if (value == 1)
41470 *total = cost->add;
41471 return false;
41473 if ((value == 2 || value == 3)
41474 && cost->lea <= cost->shift_const)
41476 *total = cost->lea;
41477 return false;
41480 /* FALLTHRU */
41482 case ROTATE:
41483 case ASHIFTRT:
41484 case LSHIFTRT:
41485 case ROTATERT:
41486 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41488 /* ??? Should be SSE vector operation cost. */
41489 /* At least for published AMD latencies, this really is the same
41490 as the latency for a simple fpu operation like fabs. */
41491 /* V*QImode is emulated with 1-11 insns. */
41492 if (mode == V16QImode || mode == V32QImode)
41494 int count = 11;
41495 if (TARGET_XOP && mode == V16QImode)
41497 /* For XOP we use vpshab, which requires a broadcast of the
41498 value to the variable shift insn.  For constants this
41499 means a V16QI constant in memory; even when we could perform the
41500 shift with one insn, set the cost to prefer paddb. */
41501 if (CONSTANT_P (XEXP (x, 1)))
41503 *total = (cost->fabs
41504 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
41505 + (speed ? 2 : COSTS_N_BYTES (16)));
41506 return true;
41508 count = 3;
41510 else if (TARGET_SSSE3)
41511 count = 7;
41512 *total = cost->fabs * count;
41514 else
41515 *total = cost->fabs;
41517 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41519 if (CONST_INT_P (XEXP (x, 1)))
41521 if (INTVAL (XEXP (x, 1)) > 32)
41522 *total = cost->shift_const + COSTS_N_INSNS (2);
41523 else
41524 *total = cost->shift_const * 2;
41526 else
41528 if (GET_CODE (XEXP (x, 1)) == AND)
41529 *total = cost->shift_var * 2;
41530 else
41531 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
41534 else
41536 if (CONST_INT_P (XEXP (x, 1)))
41537 *total = cost->shift_const;
41538 else if (SUBREG_P (XEXP (x, 1))
41539 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
41541 /* Return the cost after shift-and truncation. */
41542 *total = cost->shift_var;
41543 return true;
41545 else
41546 *total = cost->shift_var;
41548 return false;
41550 case FMA:
41552 rtx sub;
41554 gcc_assert (FLOAT_MODE_P (mode));
41555 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
41557 /* ??? SSE scalar/vector cost should be used here. */
41558 /* ??? Bald assumption that fma has the same cost as fmul. */
41559 *total = cost->fmul;
41560 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
41562 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
41563 sub = XEXP (x, 0);
41564 if (GET_CODE (sub) == NEG)
41565 sub = XEXP (sub, 0);
41566 *total += rtx_cost (sub, mode, FMA, 0, speed);
41568 sub = XEXP (x, 2);
41569 if (GET_CODE (sub) == NEG)
41570 sub = XEXP (sub, 0);
41571 *total += rtx_cost (sub, mode, FMA, 2, speed);
41572 return true;
41575 case MULT:
41576 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41578 /* ??? SSE scalar cost should be used here. */
41579 *total = cost->fmul;
41580 return false;
41582 else if (X87_FLOAT_MODE_P (mode))
41584 *total = cost->fmul;
41585 return false;
41587 else if (FLOAT_MODE_P (mode))
41589 /* ??? SSE vector cost should be used here. */
41590 *total = cost->fmul;
41591 return false;
41593 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41595 /* V*QImode is emulated with 7-13 insns. */
41596 if (mode == V16QImode || mode == V32QImode)
41598 int extra = 11;
41599 if (TARGET_XOP && mode == V16QImode)
41600 extra = 5;
41601 else if (TARGET_SSSE3)
41602 extra = 6;
41603 *total = cost->fmul * 2 + cost->fabs * extra;
41605 /* V*DImode is emulated with 5-8 insns. */
41606 else if (mode == V2DImode || mode == V4DImode)
41608 if (TARGET_XOP && mode == V2DImode)
41609 *total = cost->fmul * 2 + cost->fabs * 3;
41610 else
41611 *total = cost->fmul * 3 + cost->fabs * 5;
41613 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
41614 insns, including two PMULUDQ. */
41615 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
41616 *total = cost->fmul * 2 + cost->fabs * 5;
41617 else
41618 *total = cost->fmul;
41619 return false;
41621 else
41623 rtx op0 = XEXP (x, 0);
41624 rtx op1 = XEXP (x, 1);
41625 int nbits;
41626 if (CONST_INT_P (XEXP (x, 1)))
41628 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41629 for (nbits = 0; value != 0; value &= value - 1)
41630 nbits++;
41632 else
41633 /* This is arbitrary. */
41634 nbits = 7;
41636 /* Compute costs correctly for widening multiplication. */
41637 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
41638 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
41639 == GET_MODE_SIZE (mode))
41641 int is_mulwiden = 0;
41642 machine_mode inner_mode = GET_MODE (op0);
41644 if (GET_CODE (op0) == GET_CODE (op1))
41645 is_mulwiden = 1, op1 = XEXP (op1, 0);
41646 else if (CONST_INT_P (op1))
41648 if (GET_CODE (op0) == SIGN_EXTEND)
41649 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
41650 == INTVAL (op1);
41651 else
41652 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
41655 if (is_mulwiden)
41656 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
41659 *total = (cost->mult_init[MODE_INDEX (mode)]
41660 + nbits * cost->mult_bit
41661 + rtx_cost (op0, mode, outer_code, opno, speed)
41662 + rtx_cost (op1, mode, outer_code, opno, speed));
41664 return true;
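/* For instance, multiplying by the constant 10 (binary 1010) has two bits
   set, so NBITS is 2 and the cost is mult_init[MODE_INDEX (mode)]
   + 2 * mult_bit plus the operand costs; a multiplication by a
   non-constant uses the arbitrary NBITS of 7 chosen above.  */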
41667 case DIV:
41668 case UDIV:
41669 case MOD:
41670 case UMOD:
41671 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41672 /* ??? SSE cost should be used here. */
41673 *total = cost->fdiv;
41674 else if (X87_FLOAT_MODE_P (mode))
41675 *total = cost->fdiv;
41676 else if (FLOAT_MODE_P (mode))
41677 /* ??? SSE vector cost should be used here. */
41678 *total = cost->fdiv;
41679 else
41680 *total = cost->divide[MODE_INDEX (mode)];
41681 return false;
41683 case PLUS:
41684 if (GET_MODE_CLASS (mode) == MODE_INT
41685 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
41687 if (GET_CODE (XEXP (x, 0)) == PLUS
41688 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
41689 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
41690 && CONSTANT_P (XEXP (x, 1)))
41692 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
41693 if (val == 2 || val == 4 || val == 8)
41695 *total = cost->lea;
41696 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41697 outer_code, opno, speed);
41698 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
41699 outer_code, opno, speed);
41700 *total += rtx_cost (XEXP (x, 1), mode,
41701 outer_code, opno, speed);
41702 return true;
41705 else if (GET_CODE (XEXP (x, 0)) == MULT
41706 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
41708 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
41709 if (val == 2 || val == 4 || val == 8)
41711 *total = cost->lea;
41712 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41713 outer_code, opno, speed);
41714 *total += rtx_cost (XEXP (x, 1), mode,
41715 outer_code, opno, speed);
41716 return true;
41719 else if (GET_CODE (XEXP (x, 0)) == PLUS)
41721 /* Add with carry, ignore the cost of adding a carry flag. */
41722 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
41723 *total = cost->add;
41724 else
41726 *total = cost->lea;
41727 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41728 outer_code, opno, speed);
41731 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41732 outer_code, opno, speed);
41733 *total += rtx_cost (XEXP (x, 1), mode,
41734 outer_code, opno, speed);
41735 return true;
41738 /* FALLTHRU */
41740 case MINUS:
41741 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
41742 if (GET_MODE_CLASS (mode) == MODE_INT
41743 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
41744 && GET_CODE (XEXP (x, 0)) == MINUS
41745 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
41747 *total = cost->add;
41748 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41749 outer_code, opno, speed);
41750 *total += rtx_cost (XEXP (x, 1), mode,
41751 outer_code, opno, speed);
41752 return true;
41755 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41757 /* ??? SSE cost should be used here. */
41758 *total = cost->fadd;
41759 return false;
41761 else if (X87_FLOAT_MODE_P (mode))
41763 *total = cost->fadd;
41764 return false;
41766 else if (FLOAT_MODE_P (mode))
41768 /* ??? SSE vector cost should be used here. */
41769 *total = cost->fadd;
41770 return false;
41772 /* FALLTHRU */
41774 case AND:
41775 case IOR:
41776 case XOR:
41777 if (GET_MODE_CLASS (mode) == MODE_INT
41778 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41780 *total = (cost->add * 2
41781 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
41782 << (GET_MODE (XEXP (x, 0)) != DImode))
41783 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
41784 << (GET_MODE (XEXP (x, 1)) != DImode)));
41785 return true;
41787 /* FALLTHRU */
41789 case NEG:
41790 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41792 /* ??? SSE cost should be used here. */
41793 *total = cost->fchs;
41794 return false;
41796 else if (X87_FLOAT_MODE_P (mode))
41798 *total = cost->fchs;
41799 return false;
41801 else if (FLOAT_MODE_P (mode))
41803 /* ??? SSE vector cost should be used here. */
41804 *total = cost->fchs;
41805 return false;
41807 /* FALLTHRU */
41809 case NOT:
41810 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41812 /* ??? Should be SSE vector operation cost. */
41813 /* At least for published AMD latencies, this really is the same
41814 as the latency for a simple fpu operation like fabs. */
41815 *total = cost->fabs;
41817 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41818 *total = cost->add * 2;
41819 else
41820 *total = cost->add;
41821 return false;
41823 case COMPARE:
41824 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41825 && XEXP (XEXP (x, 0), 1) == const1_rtx
41826 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41827 && XEXP (x, 1) == const0_rtx)
41829 /* This kind of construct is implemented using test[bwl].
41830 Treat it as if we had an AND. */
41831 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41832 *total = (cost->add
41833 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41834 opno, speed)
41835 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41836 return true;
41839 /* The embedded comparison operand is completely free. */
41840 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
41841 && XEXP (x, 1) == const0_rtx)
41842 *total = 0;
41844 return false;
41846 case FLOAT_EXTEND:
41847 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41848 *total = 0;
41849 return false;
41851 case ABS:
41852 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41853 /* ??? SSE cost should be used here. */
41854 *total = cost->fabs;
41855 else if (X87_FLOAT_MODE_P (mode))
41856 *total = cost->fabs;
41857 else if (FLOAT_MODE_P (mode))
41858 /* ??? SSE vector cost should be used here. */
41859 *total = cost->fabs;
41860 return false;
41862 case SQRT:
41863 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41864 /* ??? SSE cost should be used here. */
41865 *total = cost->fsqrt;
41866 else if (X87_FLOAT_MODE_P (mode))
41867 *total = cost->fsqrt;
41868 else if (FLOAT_MODE_P (mode))
41869 /* ??? SSE vector cost should be used here. */
41870 *total = cost->fsqrt;
41871 return false;
41873 case UNSPEC:
41874 if (XINT (x, 1) == UNSPEC_TP)
41875 *total = 0;
41876 return false;
41878 case VEC_SELECT:
41879 case VEC_CONCAT:
41880 case VEC_DUPLICATE:
41881 /* ??? Assume all of these vector manipulation patterns are
41882 recognizable, in which case they all pretty much have the
41883 same cost. */
41884 *total = cost->fabs;
41885 return true;
41886 case VEC_MERGE:
41887 mask = XEXP (x, 2);
41888 /* This is a masked instruction; assume the same cost
41889 as the non-masked variant. */
41890 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
41891 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
41892 else
41893 *total = cost->fabs;
41894 return true;
41896 default:
41897 return false;
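/* As an example of how the cases above combine: costing
   (set (reg:SI) (plus:SI (mult:SI (reg:SI) (const_int 4)) (reg:SI)))
   enters through the SET case, recurses into the PLUS case, matches the
   scaled-index pattern and charges cost->lea plus the costs of the inner
   register operands, which encourages passes that consult RTX costs to
   keep such addresses in lea form.  */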
41901 #if TARGET_MACHO
41903 static int current_machopic_label_num;
41905 /* Given a symbol name and its associated stub, write out the
41906 definition of the stub. */
41908 void
41909 machopic_output_stub (FILE *file, const char *symb, const char *stub)
41911 unsigned int length;
41912 char *binder_name, *symbol_name, lazy_ptr_name[32];
41913 int label = ++current_machopic_label_num;
41915 /* For 64-bit we shouldn't get here. */
41916 gcc_assert (!TARGET_64BIT);
41918 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
41919 symb = targetm.strip_name_encoding (symb);
41921 length = strlen (stub);
41922 binder_name = XALLOCAVEC (char, length + 32);
41923 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
41925 length = strlen (symb);
41926 symbol_name = XALLOCAVEC (char, length + 32);
41927 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
41929 sprintf (lazy_ptr_name, "L%d$lz", label);
41931 if (MACHOPIC_ATT_STUB)
41932 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
41933 else if (MACHOPIC_PURE)
41934 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
41935 else
41936 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
41938 fprintf (file, "%s:\n", stub);
41939 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41941 if (MACHOPIC_ATT_STUB)
41943 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
41945 else if (MACHOPIC_PURE)
41947 /* PIC stub. */
41948 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41949 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
41950 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
41951 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
41952 label, lazy_ptr_name, label);
41953 fprintf (file, "\tjmp\t*%%ecx\n");
41955 else
41956 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
41958 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
41959 it needs no stub-binding-helper. */
41960 if (MACHOPIC_ATT_STUB)
41961 return;
41963 fprintf (file, "%s:\n", binder_name);
41965 if (MACHOPIC_PURE)
41967 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
41968 fprintf (file, "\tpushl\t%%ecx\n");
41970 else
41971 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
41973 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
41975 /* N.B. Keep the correspondence of these
41976 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
41977 old-pic/new-pic/non-pic stubs; altering this will break
41978 compatibility with existing dylibs. */
41979 if (MACHOPIC_PURE)
41981 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41982 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
41984 else
41985 /* 16-byte -mdynamic-no-pic stub. */
41986 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
41988 fprintf (file, "%s:\n", lazy_ptr_name);
41989 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41990 fprintf (file, ASM_LONG "%s\n", binder_name);
41992 #endif /* TARGET_MACHO */
41994 /* Order the registers for the register allocator. */
41996 void
41997 x86_order_regs_for_local_alloc (void)
41999 int pos = 0;
42000 int i;
42002 /* First allocate the local general purpose registers. */
42003 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42004 if (GENERAL_REGNO_P (i) && call_used_regs[i])
42005 reg_alloc_order [pos++] = i;
42007 /* Global general purpose registers. */
42008 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42009 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
42010 reg_alloc_order [pos++] = i;
42012 /* x87 registers come first in case we are doing FP math
42013 using them. */
42014 if (!TARGET_SSE_MATH)
42015 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42016 reg_alloc_order [pos++] = i;
42018 /* SSE registers. */
42019 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
42020 reg_alloc_order [pos++] = i;
42021 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
42022 reg_alloc_order [pos++] = i;
42024 /* Extended REX SSE registers. */
42025 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
42026 reg_alloc_order [pos++] = i;
42028 /* Mask registers. */
42029 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
42030 reg_alloc_order [pos++] = i;
42032 /* MPX bound registers. */
42033 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
42034 reg_alloc_order [pos++] = i;
42036 /* x87 registers. */
42037 if (TARGET_SSE_MATH)
42038 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42039 reg_alloc_order [pos++] = i;
42041 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
42042 reg_alloc_order [pos++] = i;
42044 /* Initialize the rest of the array, as we do not allocate some registers
42045 at all. */
42046 while (pos < FIRST_PSEUDO_REGISTER)
42047 reg_alloc_order [pos++] = 0;
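/* The resulting order therefore tries call-clobbered general registers
   (e.g. %eax, %ecx, %edx) before call-saved ones such as %ebx and %ebp,
   and places the x87 stack before or after the SSE registers depending on
   whether x87 math is in use; the remaining slots are simply filled with
   register 0.  */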
42050 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
42051 in struct attribute_spec.handler. */
42052 static tree
42053 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
42054 tree args,
42055 int,
42056 bool *no_add_attrs)
42058 if (TREE_CODE (*node) != FUNCTION_TYPE
42059 && TREE_CODE (*node) != METHOD_TYPE
42060 && TREE_CODE (*node) != FIELD_DECL
42061 && TREE_CODE (*node) != TYPE_DECL)
42063 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42064 name);
42065 *no_add_attrs = true;
42066 return NULL_TREE;
42068 if (TARGET_64BIT)
42070 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
42071 name);
42072 *no_add_attrs = true;
42073 return NULL_TREE;
42075 if (is_attribute_p ("callee_pop_aggregate_return", name))
42077 tree cst;
42079 cst = TREE_VALUE (args);
42080 if (TREE_CODE (cst) != INTEGER_CST)
42082 warning (OPT_Wattributes,
42083 "%qE attribute requires an integer constant argument",
42084 name);
42085 *no_add_attrs = true;
42087 else if (compare_tree_int (cst, 0) != 0
42088 && compare_tree_int (cst, 1) != 0)
42090 warning (OPT_Wattributes,
42091 "argument to %qE attribute is neither zero, nor one",
42092 name);
42093 *no_add_attrs = true;
42096 return NULL_TREE;
42099 return NULL_TREE;
42102 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
42103 struct attribute_spec.handler. */
42104 static tree
42105 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
42106 bool *no_add_attrs)
42108 if (TREE_CODE (*node) != FUNCTION_TYPE
42109 && TREE_CODE (*node) != METHOD_TYPE
42110 && TREE_CODE (*node) != FIELD_DECL
42111 && TREE_CODE (*node) != TYPE_DECL)
42113 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42114 name);
42115 *no_add_attrs = true;
42116 return NULL_TREE;
42119 /* Can combine regparm with all attributes but fastcall. */
42120 if (is_attribute_p ("ms_abi", name))
42122 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
42124 error ("ms_abi and sysv_abi attributes are not compatible");
42127 return NULL_TREE;
42129 else if (is_attribute_p ("sysv_abi", name))
42131 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
42133 error ("ms_abi and sysv_abi attributes are not compatible");
42136 return NULL_TREE;
42139 return NULL_TREE;
42142 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
42143 struct attribute_spec.handler. */
42144 static tree
42145 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
42146 bool *no_add_attrs)
42148 tree *type = NULL;
42149 if (DECL_P (*node))
42151 if (TREE_CODE (*node) == TYPE_DECL)
42152 type = &TREE_TYPE (*node);
42154 else
42155 type = node;
42157 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
42159 warning (OPT_Wattributes, "%qE attribute ignored",
42160 name);
42161 *no_add_attrs = true;
42164 else if ((is_attribute_p ("ms_struct", name)
42165 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
42166 || ((is_attribute_p ("gcc_struct", name)
42167 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
42169 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
42170 name);
42171 *no_add_attrs = true;
42174 return NULL_TREE;
42177 static tree
42178 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
42179 bool *no_add_attrs)
42181 if (TREE_CODE (*node) != FUNCTION_DECL)
42183 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42184 name);
42185 *no_add_attrs = true;
42187 return NULL_TREE;
42190 static tree
42191 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
42192 int, bool *)
42194 return NULL_TREE;
42197 static tree
42198 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
42200 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
42201 but the function type contains args and return type data. */
42202 tree func_type = *node;
42203 tree return_type = TREE_TYPE (func_type);
42205 int nargs = 0;
42206 tree current_arg_type = TYPE_ARG_TYPES (func_type);
42207 while (current_arg_type
42208 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
42210 if (nargs == 0)
42212 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
42213 error ("interrupt service routine should have a pointer "
42214 "as the first argument");
42216 else if (nargs == 1)
42218 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
42219 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
42220 error ("interrupt service routine should have unsigned %s"
42221 "int as the second argument",
42222 TARGET_64BIT
42223 ? (TARGET_X32 ? "long long " : "long ")
42224 : "");
42226 nargs++;
42227 current_arg_type = TREE_CHAIN (current_arg_type);
42229 if (!nargs || nargs > 2)
42230 error ("interrupt service routine can only have a pointer argument "
42231 "and an optional integer argument");
42232 if (! VOID_TYPE_P (return_type))
42233 error ("interrupt service routine can't have non-void return value");
42235 return NULL_TREE;
42238 static bool
42239 ix86_ms_bitfield_layout_p (const_tree record_type)
42241 return ((TARGET_MS_BITFIELD_LAYOUT
42242 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
42243 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
42246 /* Returns an expression indicating where the this parameter is
42247 located on entry to the FUNCTION. */
42249 static rtx
42250 x86_this_parameter (tree function)
42252 tree type = TREE_TYPE (function);
42253 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
42254 int nregs;
42256 if (TARGET_64BIT)
42258 const int *parm_regs;
42260 if (ix86_function_type_abi (type) == MS_ABI)
42261 parm_regs = x86_64_ms_abi_int_parameter_registers;
42262 else
42263 parm_regs = x86_64_int_parameter_registers;
42264 return gen_rtx_REG (Pmode, parm_regs[aggr]);
42267 nregs = ix86_function_regparm (type, function);
42269 if (nregs > 0 && !stdarg_p (type))
42271 int regno;
42272 unsigned int ccvt = ix86_get_callcvt (type);
42274 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42275 regno = aggr ? DX_REG : CX_REG;
42276 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42278 regno = CX_REG;
42279 if (aggr)
42280 return gen_rtx_MEM (SImode,
42281 plus_constant (Pmode, stack_pointer_rtx, 4));
42283 else
42285 regno = AX_REG;
42286 if (aggr)
42288 regno = DX_REG;
42289 if (nregs == 1)
42290 return gen_rtx_MEM (SImode,
42291 plus_constant (Pmode,
42292 stack_pointer_rtx, 4));
42295 return gen_rtx_REG (SImode, regno);
42298 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
42299 aggr ? 8 : 4));
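/* For illustration: under the 64-bit SysV ABI `this' arrives in %rdi, or
   in %rsi when the function returns an aggregate in memory (the hidden
   return pointer takes the first slot); under the 64-bit MS ABI the same
   two cases use %rcx and %rdx.  On 32-bit targets without usable register
   parameters it is read from the stack at 4(%esp), or 8(%esp) in the
   aggregate-return case.  */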
42302 /* Determine whether x86_output_mi_thunk can succeed. */
42304 static bool
42305 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
42306 const_tree function)
42308 /* 64-bit can handle anything. */
42309 if (TARGET_64BIT)
42310 return true;
42312 /* For 32-bit, everything's fine if we have one free register. */
42313 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
42314 return true;
42316 /* Need a free register for vcall_offset. */
42317 if (vcall_offset)
42318 return false;
42320 /* Need a free register for GOT references. */
42321 if (flag_pic && !targetm.binds_local_p (function))
42322 return false;
42324 /* Otherwise ok. */
42325 return true;
42328 /* Output the assembler code for a thunk function. THUNK_DECL is the
42329 declaration for the thunk function itself, FUNCTION is the decl for
42330 the target function. DELTA is an immediate constant offset to be
42331 added to THIS. If VCALL_OFFSET is nonzero, the word at
42332 *(*this + vcall_offset) should be added to THIS. */
42334 static void
42335 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
42336 HOST_WIDE_INT vcall_offset, tree function)
42338 rtx this_param = x86_this_parameter (function);
42339 rtx this_reg, tmp, fnaddr;
42340 unsigned int tmp_regno;
42341 rtx_insn *insn;
42343 if (TARGET_64BIT)
42344 tmp_regno = R10_REG;
42345 else
42347 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
42348 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42349 tmp_regno = AX_REG;
42350 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42351 tmp_regno = DX_REG;
42352 else
42353 tmp_regno = CX_REG;
42356 emit_note (NOTE_INSN_PROLOGUE_END);
42358 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
42359 pull it in now and let DELTA benefit. */
42360 if (REG_P (this_param))
42361 this_reg = this_param;
42362 else if (vcall_offset)
42364 /* Put the this parameter into %eax. */
42365 this_reg = gen_rtx_REG (Pmode, AX_REG);
42366 emit_move_insn (this_reg, this_param);
42368 else
42369 this_reg = NULL_RTX;
42371 /* Adjust the this parameter by a fixed constant. */
42372 if (delta)
42374 rtx delta_rtx = GEN_INT (delta);
42375 rtx delta_dst = this_reg ? this_reg : this_param;
42377 if (TARGET_64BIT)
42379 if (!x86_64_general_operand (delta_rtx, Pmode))
42381 tmp = gen_rtx_REG (Pmode, tmp_regno);
42382 emit_move_insn (tmp, delta_rtx);
42383 delta_rtx = tmp;
42387 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
42390 /* Adjust the this parameter by a value stored in the vtable. */
42391 if (vcall_offset)
42393 rtx vcall_addr, vcall_mem, this_mem;
42395 tmp = gen_rtx_REG (Pmode, tmp_regno);
42397 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
42398 if (Pmode != ptr_mode)
42399 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
42400 emit_move_insn (tmp, this_mem);
42402 /* Adjust the this parameter. */
42403 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
42404 if (TARGET_64BIT
42405 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
42407 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
42408 emit_move_insn (tmp2, GEN_INT (vcall_offset));
42409 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
42412 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
42413 if (Pmode != ptr_mode)
42414 emit_insn (gen_addsi_1_zext (this_reg,
42415 gen_rtx_REG (ptr_mode,
42416 REGNO (this_reg)),
42417 vcall_mem));
42418 else
42419 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
42422 /* If necessary, drop THIS back to its stack slot. */
42423 if (this_reg && this_reg != this_param)
42424 emit_move_insn (this_param, this_reg);
42426 fnaddr = XEXP (DECL_RTL (function), 0);
42427 if (TARGET_64BIT)
42429 if (!flag_pic || targetm.binds_local_p (function)
42430 || TARGET_PECOFF)
42432 else
42434 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
42435 tmp = gen_rtx_CONST (Pmode, tmp);
42436 fnaddr = gen_const_mem (Pmode, tmp);
42439 else
42441 if (!flag_pic || targetm.binds_local_p (function))
42443 #if TARGET_MACHO
42444 else if (TARGET_MACHO)
42446 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
42447 fnaddr = XEXP (fnaddr, 0);
42449 #endif /* TARGET_MACHO */
42450 else
42452 tmp = gen_rtx_REG (Pmode, CX_REG);
42453 output_set_got (tmp, NULL_RTX);
42455 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
42456 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
42457 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
42458 fnaddr = gen_const_mem (Pmode, fnaddr);
42462 /* Our sibling call patterns do not allow memories, because we have no
42463 predicate that can distinguish between frame and non-frame memory.
42464 For our purposes here, we can get away with (ab)using a jump pattern,
42465 because we're going to do no optimization. */
42466 if (MEM_P (fnaddr))
42468 if (sibcall_insn_operand (fnaddr, word_mode))
42470 fnaddr = XEXP (DECL_RTL (function), 0);
42471 tmp = gen_rtx_MEM (QImode, fnaddr);
42472 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42473 tmp = emit_call_insn (tmp);
42474 SIBLING_CALL_P (tmp) = 1;
42476 else
42477 emit_jump_insn (gen_indirect_jump (fnaddr));
42479 else
42481 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
42483 // CM_LARGE_PIC always uses pseudo PIC register which is
42484 // uninitialized. Since FUNCTION is local and calling it
42485 // doesn't go through PLT, we use scratch register %r11 as
42486 // PIC register and initialize it here.
42487 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
42488 ix86_init_large_pic_reg (tmp_regno);
42489 fnaddr = legitimize_pic_address (fnaddr,
42490 gen_rtx_REG (Pmode, tmp_regno));
42493 if (!sibcall_insn_operand (fnaddr, word_mode))
42495 tmp = gen_rtx_REG (word_mode, tmp_regno);
42496 if (GET_MODE (fnaddr) != word_mode)
42497 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
42498 emit_move_insn (tmp, fnaddr);
42499 fnaddr = tmp;
42502 tmp = gen_rtx_MEM (QImode, fnaddr);
42503 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42504 tmp = emit_call_insn (tmp);
42505 SIBLING_CALL_P (tmp) = 1;
42507 emit_barrier ();
42509 /* Emit just enough of rest_of_compilation to get the insns emitted.
42510 Note that use_thunk calls assemble_start_function et al. */
42511 insn = get_insns ();
42512 shorten_branches (insn);
42513 final_start_function (insn, file, 1);
42514 final (insn, file, 1);
42515 final_end_function ();
42518 static void
42519 x86_file_start (void)
42521 default_file_start ();
42522 if (TARGET_16BIT)
42523 fputs ("\t.code16gcc\n", asm_out_file);
42524 #if TARGET_MACHO
42525 darwin_file_start ();
42526 #endif
42527 if (X86_FILE_START_VERSION_DIRECTIVE)
42528 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
42529 if (X86_FILE_START_FLTUSED)
42530 fputs ("\t.global\t__fltused\n", asm_out_file);
42531 if (ix86_asm_dialect == ASM_INTEL)
42532 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
42535 int
42536 x86_field_alignment (tree type, int computed)
42538 machine_mode mode;
42540 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
42541 return computed;
42542 if (TARGET_IAMCU)
42543 return iamcu_alignment (type, computed);
42544 mode = TYPE_MODE (strip_array_types (type));
42545 if (mode == DFmode || mode == DCmode
42546 || GET_MODE_CLASS (mode) == MODE_INT
42547 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
42548 return MIN (32, computed);
42549 return computed;
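/* Example of the effect above: on a 32-bit target without -malign-double,
   a `double' or `long long' structure field is capped at 32-bit alignment,
   matching the traditional ix86 ABI, while 64-bit targets keep the natural
   alignment computed by the front end.  */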
42552 /* Print call to TARGET to FILE. */
42554 static void
42555 x86_print_call_or_nop (FILE *file, const char *target)
42557 if (flag_nop_mcount)
42558 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
42559 else
42560 fprintf (file, "1:\tcall\t%s\n", target);
42563 /* Output assembler code to FILE to increment profiler label # LABELNO
42564 for profiling a function entry. */
42565 void
42566 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
42568 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
42569 : MCOUNT_NAME);
42570 if (TARGET_64BIT)
42572 #ifndef NO_PROFILE_COUNTERS
42573 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
42574 #endif
42576 if (!TARGET_PECOFF && flag_pic)
42577 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
42578 else
42579 x86_print_call_or_nop (file, mcount_name);
42581 else if (flag_pic)
42583 #ifndef NO_PROFILE_COUNTERS
42584 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
42585 LPREFIX, labelno);
42586 #endif
42587 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
42589 else
42591 #ifndef NO_PROFILE_COUNTERS
42592 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
42593 LPREFIX, labelno);
42594 #endif
42595 x86_print_call_or_nop (file, mcount_name);
42598 if (flag_record_mcount)
42600 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
42601 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
42602 fprintf (file, "\t.previous\n");
42606 /* We don't have exact information about the insn sizes, but we may assume
42607 quite safely that we are informed about all 1 byte insns and memory
42608 address sizes. This is enough to eliminate unnecessary padding in
42609 99% of cases. */
42611 static int
42612 min_insn_size (rtx_insn *insn)
42614 int l = 0, len;
42616 if (!INSN_P (insn) || !active_insn_p (insn))
42617 return 0;
42619 /* Discard alignments we've emitted, and jump instructions. */
42620 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
42621 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
42622 return 0;
42624 /* Important case - calls are always 5 bytes.
42625 It is common to have many calls in a row. */
42626 if (CALL_P (insn)
42627 && symbolic_reference_mentioned_p (PATTERN (insn))
42628 && !SIBLING_CALL_P (insn))
42629 return 5;
42630 len = get_attr_length (insn);
42631 if (len <= 1)
42632 return 1;
42634 /* For normal instructions we rely on get_attr_length being exact,
42635 with a few exceptions. */
42636 if (!JUMP_P (insn))
42638 enum attr_type type = get_attr_type (insn);
42640 switch (type)
42642 case TYPE_MULTI:
42643 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
42644 || asm_noperands (PATTERN (insn)) >= 0)
42645 return 0;
42646 break;
42647 case TYPE_OTHER:
42648 case TYPE_FCMP:
42649 break;
42650 default:
42651 /* Otherwise trust get_attr_length. */
42652 return len;
42655 l = get_attr_length_address (insn);
42656 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
42657 l = 4;
42659 if (l)
42660 return 1+l;
42661 else
42662 return 2;
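/* For example, a call to a named function is counted as 5 bytes, an insn
   whose address part mentions a symbol is counted as at least 1 + 4 bytes,
   and inline asm statements are conservatively counted as 0 so the padding
   logic below never relies on their (unknown) size.  */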
42665 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42667 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a
42668 16 byte window. */
42670 static void
42671 ix86_avoid_jump_mispredicts (void)
42673 rtx_insn *insn, *start = get_insns ();
42674 int nbytes = 0, njumps = 0;
42675 bool isjump = false;
42677 /* Look for all minimal intervals of instructions containing 4 jumps.
42678 The intervals are bounded by START and INSN. NBYTES is the total
42679 size of instructions in the interval including INSN and not including
42680 START.  When NBYTES is smaller than 16 bytes, it is possible
42681 that the ends of START and INSN land in the same 16 byte page.
42683 The smallest offset in the page at which INSN can start is the case where
42684 START ends at offset 0.  The offset of INSN is then NBYTES - sizeof (INSN).
42685 We add a p2align to the 16 byte window with maxskip 15 - NBYTES + sizeof (INSN).
42687 Don't consider an asm goto as a jump; while it can contain a jump, it doesn't
42688 have to - control transfer to its label(s) can be performed through other
42689 means - and we also estimate the minimum length of all asm stmts as 0. */
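/* Roughly: when the current insn would become the fourth jump or call
   within fewer than 16 bytes, a pad of 15 - NBYTES + sizeof (INSN) bytes
   is emitted before it, which is just enough to guarantee that the four
   control transfers cannot share a single 16 byte window.  */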
42690 for (insn = start; insn; insn = NEXT_INSN (insn))
42692 int min_size;
42694 if (LABEL_P (insn))
42696 int align = label_to_alignment (insn);
42697 int max_skip = label_to_max_skip (insn);
42699 if (max_skip > 15)
42700 max_skip = 15;
42701 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
42702 already in the current 16 byte page, because otherwise
42703 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
42704 bytes to reach 16 byte boundary. */
42705 if (align <= 0
42706 || (align <= 3 && max_skip != (1 << align) - 1))
42707 max_skip = 0;
42708 if (dump_file)
42709 fprintf (dump_file, "Label %i with max_skip %i\n",
42710 INSN_UID (insn), max_skip);
42711 if (max_skip)
42713 while (nbytes + max_skip >= 16)
42715 start = NEXT_INSN (start);
42716 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42717 || CALL_P (start))
42718 njumps--, isjump = true;
42719 else
42720 isjump = false;
42721 nbytes -= min_insn_size (start);
42724 continue;
42727 min_size = min_insn_size (insn);
42728 nbytes += min_size;
42729 if (dump_file)
42730 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
42731 INSN_UID (insn), min_size);
42732 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
42733 || CALL_P (insn))
42734 njumps++;
42735 else
42736 continue;
42738 while (njumps > 3)
42740 start = NEXT_INSN (start);
42741 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42742 || CALL_P (start))
42743 njumps--, isjump = true;
42744 else
42745 isjump = false;
42746 nbytes -= min_insn_size (start);
42748 gcc_assert (njumps >= 0);
42749 if (dump_file)
42750 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
42751 INSN_UID (start), INSN_UID (insn), nbytes);
42753 if (njumps == 3 && isjump && nbytes < 16)
42755 int padsize = 15 - nbytes + min_insn_size (insn);
42757 if (dump_file)
42758 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
42759 INSN_UID (insn), padsize);
42760 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
42764 #endif
42766 /* AMD Athlon works faster
42767 when RET is not the destination of a conditional jump or directly preceded
42768 by another jump instruction.  We avoid the penalty by inserting a NOP just
42769 before the RET instruction in such cases. */
42770 static void
42771 ix86_pad_returns (void)
42773 edge e;
42774 edge_iterator ei;
42776 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42778 basic_block bb = e->src;
42779 rtx_insn *ret = BB_END (bb);
42780 rtx_insn *prev;
42781 bool replace = false;
42783 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
42784 || optimize_bb_for_size_p (bb))
42785 continue;
42786 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
42787 if (active_insn_p (prev) || LABEL_P (prev))
42788 break;
42789 if (prev && LABEL_P (prev))
42791 edge e;
42792 edge_iterator ei;
42794 FOR_EACH_EDGE (e, ei, bb->preds)
42795 if (EDGE_FREQUENCY (e) && e->src->index >= 0
42796 && !(e->flags & EDGE_FALLTHRU))
42798 replace = true;
42799 break;
42802 if (!replace)
42804 prev = prev_active_insn (ret);
42805 if (prev
42806 && ((JUMP_P (prev) && any_condjump_p (prev))
42807 || CALL_P (prev)))
42808 replace = true;
42809 /* Empty functions get a branch mispredict even when
42810 the jump destination is not visible to us. */
42811 if (!prev && !optimize_function_for_size_p (cfun))
42812 replace = true;
42814 if (replace)
42816 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42817 delete_insn (ret);
42822 /* Count the minimum number of instructions in BB. Return 4 if the
42823 number of instructions >= 4. */
42825 static int
42826 ix86_count_insn_bb (basic_block bb)
42828 rtx_insn *insn;
42829 int insn_count = 0;
42831 /* Count number of instructions in this block. Return 4 if the number
42832 of instructions >= 4. */
42833 FOR_BB_INSNS (bb, insn)
42835 /* This only happens in exit blocks. */
42836 if (JUMP_P (insn)
42837 && ANY_RETURN_P (PATTERN (insn)))
42838 break;
42840 if (NONDEBUG_INSN_P (insn)
42841 && GET_CODE (PATTERN (insn)) != USE
42842 && GET_CODE (PATTERN (insn)) != CLOBBER)
42844 insn_count++;
42845 if (insn_count >= 4)
42846 return insn_count;
42850 return insn_count;
42854 /* Count the minimum number of instructions in code path in BB.
42855 Return 4 if the number of instructions >= 4. */
42857 static int
42858 ix86_count_insn (basic_block bb)
42860 edge e;
42861 edge_iterator ei;
42862 int min_prev_count;
42864 /* Only bother counting instructions along paths with no
42865 more than 2 basic blocks between entry and exit. Given
42866 that BB has an edge to exit, determine if a predecessor
42867 of BB has an edge from entry. If so, compute the number
42868 of instructions in the predecessor block. If there
42869 happen to be multiple such blocks, compute the minimum. */
42870 min_prev_count = 4;
42871 FOR_EACH_EDGE (e, ei, bb->preds)
42873 edge prev_e;
42874 edge_iterator prev_ei;
42876 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42878 min_prev_count = 0;
42879 break;
42881 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
42883 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42885 int count = ix86_count_insn_bb (e->src);
42886 if (count < min_prev_count)
42887 min_prev_count = count;
42888 break;
42893 if (min_prev_count < 4)
42894 min_prev_count += ix86_count_insn_bb (bb);
42896 return min_prev_count;
42899 /* Pad short functions to 4 instructions. */
42901 static void
42902 ix86_pad_short_function (void)
42904 edge e;
42905 edge_iterator ei;
42907 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42909 rtx_insn *ret = BB_END (e->src);
42910 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
42912 int insn_count = ix86_count_insn (e->src);
42914 /* Pad short function. */
42915 if (insn_count < 4)
42917 rtx_insn *insn = ret;
42919 /* Find epilogue. */
42920 while (insn
42921 && (!NOTE_P (insn)
42922 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
42923 insn = PREV_INSN (insn);
42925 if (!insn)
42926 insn = ret;
42928 /* Two NOPs count as one instruction. */
42929 insn_count = 2 * (4 - insn_count);
42930 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
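/* For instance, a return path containing only two real insns gets
   gen_nops (GEN_INT (4)) emitted before its epilogue: two NOPs count as
   one instruction, so 2 * (4 - 2) NOPs bring the path up to the minimum
   of four.  */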
42936 /* Fix up a Windows system unwinder issue. If an EH region falls through into
42937 the epilogue, the Windows system unwinder will apply epilogue logic and
42938 produce incorrect offsets. This can be avoided by adding a nop between
42939 the last insn that can throw and the first insn of the epilogue. */
42941 static void
42942 ix86_seh_fixup_eh_fallthru (void)
42944 edge e;
42945 edge_iterator ei;
42947 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42949 rtx_insn *insn, *next;
42951 /* Find the beginning of the epilogue. */
42952 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42953 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42954 break;
42955 if (insn == NULL)
42956 continue;
42958 /* We only care about preceding insns that can throw. */
42959 insn = prev_active_insn (insn);
42960 if (insn == NULL || !can_throw_internal (insn))
42961 continue;
42963 /* Do not separate calls from their debug information. */
42964 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42965 if (NOTE_P (next)
42966 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
42967 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
42968 insn = next;
42969 else
42970 break;
42972 emit_insn_after (gen_nops (const1_rtx), insn);
42976 /* Given a register number BASE, the lowest of a group of registers, update
42977 regsets IN and OUT with the registers that should be avoided in input
42978 and output operands respectively when trying to avoid generating a modr/m
42979 byte for -fmitigate-rop. */
42981 static void
42982 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
42984 SET_HARD_REG_BIT (out, base);
42985 SET_HARD_REG_BIT (out, base + 1);
42986 SET_HARD_REG_BIT (in, base + 2);
42987 SET_HARD_REG_BIT (in, base + 3);
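/* For example, calling this with FIRST_SSE_REG marks %xmm0 and %xmm1 as
   registers to avoid in output operands and %xmm2 and %xmm3 as registers
   to avoid in input operands, mirroring the hard-coded treatment of
   %eax/%ecx (outputs) and %ebx/%edx (inputs) in ix86_mitigate_rop below.  */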
42990 /* Called if -fmitigate-rop is in effect.  Try to rewrite instructions so
42991 that certain encodings of modr/m bytes do not occur. */
42992 static void
42993 ix86_mitigate_rop (void)
42995 HARD_REG_SET input_risky;
42996 HARD_REG_SET output_risky;
42997 HARD_REG_SET inout_risky;
42999 CLEAR_HARD_REG_SET (output_risky);
43000 CLEAR_HARD_REG_SET (input_risky);
43001 SET_HARD_REG_BIT (output_risky, AX_REG);
43002 SET_HARD_REG_BIT (output_risky, CX_REG);
43003 SET_HARD_REG_BIT (input_risky, BX_REG);
43004 SET_HARD_REG_BIT (input_risky, DX_REG);
43005 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
43006 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
43007 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
43008 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
43009 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
43010 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
43011 COPY_HARD_REG_SET (inout_risky, input_risky);
43012 IOR_HARD_REG_SET (inout_risky, output_risky);
43014 df_note_add_problem ();
43015 /* Fix up what stack-regs did. */
43016 df_insn_rescan_all ();
43017 df_analyze ();
43019 regrename_init (true);
43020 regrename_analyze (NULL);
43022 auto_vec<du_head_p> cands;
43024 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
43026 if (!NONDEBUG_INSN_P (insn))
43027 continue;
43029 if (GET_CODE (PATTERN (insn)) == USE
43030 || GET_CODE (PATTERN (insn)) == CLOBBER)
43031 continue;
43033 extract_insn (insn);
43035 int opno0, opno1;
43036 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43037 recog_data.n_operands, &opno0,
43038 &opno1);
43040 if (!ix86_rop_should_change_byte_p (modrm))
43041 continue;
43043 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
43045 /* This happens when regrename has to fail a block. */
43046 if (!info->op_info)
43047 continue;
43049 if (info->op_info[opno0].n_chains != 0)
43051 gcc_assert (info->op_info[opno0].n_chains == 1);
43052 du_head_p op0c;
43053 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
43054 if (op0c->target_data_1 + op0c->target_data_2 == 0
43055 && !op0c->cannot_rename)
43056 cands.safe_push (op0c);
43058 op0c->target_data_1++;
43060 if (info->op_info[opno1].n_chains != 0)
43062 gcc_assert (info->op_info[opno1].n_chains == 1);
43063 du_head_p op1c;
43064 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
43065 if (op1c->target_data_1 + op1c->target_data_2 == 0
43066 && !op1c->cannot_rename)
43067 cands.safe_push (op1c);
43069 op1c->target_data_2++;
43073 int i;
43074 du_head_p head;
43075 FOR_EACH_VEC_ELT (cands, i, head)
43077 int old_reg, best_reg;
43078 HARD_REG_SET unavailable;
43080 CLEAR_HARD_REG_SET (unavailable);
43081 if (head->target_data_1)
43082 IOR_HARD_REG_SET (unavailable, output_risky);
43083 if (head->target_data_2)
43084 IOR_HARD_REG_SET (unavailable, input_risky);
43086 int n_uses;
43087 reg_class superclass = regrename_find_superclass (head, &n_uses,
43088 &unavailable);
43089 old_reg = head->regno;
43090 best_reg = find_rename_reg (head, superclass, &unavailable,
43091 old_reg, false);
43092 bool ok = regrename_do_replace (head, best_reg);
43093 gcc_assert (ok);
43094 if (dump_file)
43095 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
43096 reg_names[best_reg], reg_class_names[superclass]);
43100 regrename_finish ();
43102 df_analyze ();
43104 basic_block bb;
43105 regset_head live;
43107 INIT_REG_SET (&live);
43109 FOR_EACH_BB_FN (bb, cfun)
43111 rtx_insn *insn;
43113 COPY_REG_SET (&live, DF_LR_OUT (bb));
43114 df_simulate_initialize_backwards (bb, &live);
43116 FOR_BB_INSNS_REVERSE (bb, insn)
43118 if (!NONDEBUG_INSN_P (insn))
43119 continue;
43121 df_simulate_one_insn_backwards (bb, insn, &live);
43123 if (GET_CODE (PATTERN (insn)) == USE
43124 || GET_CODE (PATTERN (insn)) == CLOBBER)
43125 continue;
43127 extract_insn (insn);
43128 constrain_operands_cached (insn, reload_completed);
43129 int opno0, opno1;
43130 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43131 recog_data.n_operands, &opno0,
43132 &opno1);
43133 if (modrm < 0
43134 || !ix86_rop_should_change_byte_p (modrm)
43135 || opno0 == opno1)
43136 continue;
43138 rtx oldreg = recog_data.operand[opno1];
43139 preprocess_constraints (insn);
43140 const operand_alternative *alt = which_op_alt ();
43142 int i;
43143 for (i = 0; i < recog_data.n_operands; i++)
43144 if (i != opno1
43145 && alt[i].earlyclobber
43146 && reg_overlap_mentioned_p (recog_data.operand[i],
43147 oldreg))
43148 break;
43150 if (i < recog_data.n_operands)
43151 continue;
43153 if (dump_file)
43154 fprintf (dump_file,
43155 "attempting to fix modrm byte in insn %d:"
43156 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
43157 reg_class_names[alt[opno1].cl]);
43159 HARD_REG_SET unavailable;
43160 REG_SET_TO_HARD_REG_SET (unavailable, &live);
43161 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
43162 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
43163 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
43164 IOR_HARD_REG_SET (unavailable, output_risky);
43165 IOR_COMPL_HARD_REG_SET (unavailable,
43166 reg_class_contents[alt[opno1].cl]);
43168 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
43169 if (!TEST_HARD_REG_BIT (unavailable, i))
43170 break;
43171 if (i == FIRST_PSEUDO_REGISTER)
43173 if (dump_file)
43174 fprintf (dump_file, ", none available\n");
43175 continue;
43177 if (dump_file)
43178 fprintf (dump_file, " -> %d\n", i);
43179 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
43180 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
43181 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
43186 /* Implement machine specific optimizations. We implement padding of returns
43187 for K8 CPUs and a pass to avoid four jumps in a single 16-byte window. */
43188 static void
43189 ix86_reorg (void)
43191 /* We are freeing block_for_insn in the toplev to keep compatibility
43192 with old MDEP_REORGS that are not CFG based. Recompute it now. */
43193 compute_bb_for_insn ();
43195 if (flag_mitigate_rop)
43196 ix86_mitigate_rop ();
43198 if (TARGET_SEH && current_function_has_exception_handlers ())
43199 ix86_seh_fixup_eh_fallthru ();
43201 if (optimize && optimize_function_for_speed_p (cfun))
43203 if (TARGET_PAD_SHORT_FUNCTION)
43204 ix86_pad_short_function ();
43205 else if (TARGET_PAD_RETURNS)
43206 ix86_pad_returns ();
43207 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
43208 if (TARGET_FOUR_JUMP_LIMIT)
43209 ix86_avoid_jump_mispredicts ();
43210 #endif
43214 /* Return nonzero when a QImode register that must be represented via a REX
43215 prefix is used. */
43216 bool
43217 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
43219 int i;
43220 extract_insn_cached (insn);
43221 for (i = 0; i < recog_data.n_operands; i++)
43222 if (GENERAL_REG_P (recog_data.operand[i])
43223 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
43224 return true;
43225 return false;
43228 /* Return true when INSN mentions a register that must be encoded using a
43229 REX prefix. */
43230 bool
43231 x86_extended_reg_mentioned_p (rtx insn)
43233 subrtx_iterator::array_type array;
43234 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
43236 const_rtx x = *iter;
43237 if (REG_P (x)
43238 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
43239 return true;
43241 return false;
43244 /* If profitable, negate (without causing overflow) integer constant
43245 of mode MODE at location LOC. Return true in this case. */
43246 bool
43247 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
43249 HOST_WIDE_INT val;
43251 if (!CONST_INT_P (*loc))
43252 return false;
43254 switch (mode)
43256 case DImode:
43257 /* DImode x86_64 constants must fit in 32 bits. */
43258 gcc_assert (x86_64_immediate_operand (*loc, mode));
43260 mode = SImode;
43261 break;
43263 case SImode:
43264 case HImode:
43265 case QImode:
43266 break;
43268 default:
43269 gcc_unreachable ();
43272 /* Avoid overflows. */
43273 if (mode_signbit_p (mode, *loc))
43274 return false;
43276 val = INTVAL (*loc);
43278 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
43279 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
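/* For example, `addl $128, %eax' needs a 32-bit immediate, while the
   equivalent `subl $-128, %eax' fits in a sign-extended 8-bit one.  */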
43280 if ((val < 0 && val != -128)
43281 || val == 128)
43283 *loc = GEN_INT (-val);
43284 return true;
43287 return false;
43290 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
43291 optabs would emit if we didn't have TFmode patterns. */
43293 void
43294 x86_emit_floatuns (rtx operands[2])
43296 rtx_code_label *neglab, *donelab;
43297 rtx i0, i1, f0, in, out;
43298 machine_mode mode, inmode;
43300 inmode = GET_MODE (operands[1]);
43301 gcc_assert (inmode == SImode || inmode == DImode);
43303 out = operands[0];
43304 in = force_reg (inmode, operands[1]);
43305 mode = GET_MODE (out);
43306 neglab = gen_label_rtx ();
43307 donelab = gen_label_rtx ();
43308 f0 = gen_reg_rtx (mode);
43310 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
43312 expand_float (out, in, 0);
43314 emit_jump_insn (gen_jump (donelab));
43315 emit_barrier ();
43317 emit_label (neglab);
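/* The input has its high bit set, so a signed conversion would treat it as
   negative.  Halve it, folding the shifted-out low bit back in so the final
   doubling rounds the same way, convert the halved value, and then double
   the result.  */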
43319 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
43320 1, OPTAB_DIRECT);
43321 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
43322 1, OPTAB_DIRECT);
43323 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
43325 expand_float (f0, i0, 0);
43327 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
43329 emit_label (donelab);
43332 static bool canonicalize_perm (struct expand_vec_perm_d *d);
43333 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
43334 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
43335 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
43337 /* Get a vector mode of the same size as the original but with elements
43338 twice as wide. This is only guaranteed to apply to integral vectors. */
43340 static inline machine_mode
43341 get_mode_wider_vector (machine_mode o)
43343 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
43344 machine_mode n = GET_MODE_WIDER_MODE (o);
43345 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
43346 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
43347 return n;
43350 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
43351 fill target with val via vec_duplicate. */
43353 static bool
43354 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
43356 bool ok;
43357 rtx_insn *insn;
43358 rtx dup;
43360 /* First attempt to recognize VAL as-is. */
43361 dup = gen_rtx_VEC_DUPLICATE (mode, val);
43362 insn = emit_insn (gen_rtx_SET (target, dup));
43363 if (recog_memoized (insn) < 0)
43365 rtx_insn *seq;
43366 machine_mode innermode = GET_MODE_INNER (mode);
43367 rtx reg;
43369 /* If that fails, force VAL into a register. */
43371 start_sequence ();
43372 reg = force_reg (innermode, val);
43373 if (GET_MODE (reg) != innermode)
43374 reg = gen_lowpart (innermode, reg);
43375 XEXP (dup, 0) = reg;
43376 seq = get_insns ();
43377 end_sequence ();
43378 if (seq)
43379 emit_insn_before (seq, insn);
43381 ok = recog_memoized (insn) >= 0;
43382 gcc_assert (ok);
43384 return true;
43387 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43388 with all elements equal to VAL. Return true if successful. */
43390 static bool
43391 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
43392 rtx target, rtx val)
43394 bool ok;
43396 switch (mode)
43398 case V2SImode:
43399 case V2SFmode:
43400 if (!mmx_ok)
43401 return false;
43402 /* FALLTHRU */
43404 case V4DFmode:
43405 case V4DImode:
43406 case V8SFmode:
43407 case V8SImode:
43408 case V2DFmode:
43409 case V2DImode:
43410 case V4SFmode:
43411 case V4SImode:
43412 case V16SImode:
43413 case V8DImode:
43414 case V16SFmode:
43415 case V8DFmode:
43416 return ix86_vector_duplicate_value (mode, target, val);
43418 case V4HImode:
43419 if (!mmx_ok)
43420 return false;
43421 if (TARGET_SSE || TARGET_3DNOW_A)
43423 rtx x;
43425 val = gen_lowpart (SImode, val);
43426 x = gen_rtx_TRUNCATE (HImode, val);
43427 x = gen_rtx_VEC_DUPLICATE (mode, x);
43428 emit_insn (gen_rtx_SET (target, x));
43429 return true;
43431 goto widen;
43433 case V8QImode:
43434 if (!mmx_ok)
43435 return false;
43436 goto widen;
43438 case V8HImode:
43439 if (TARGET_AVX2)
43440 return ix86_vector_duplicate_value (mode, target, val);
43442 if (TARGET_SSE2)
43444 struct expand_vec_perm_d dperm;
43445 rtx tmp1, tmp2;
43447 permute:
43448 memset (&dperm, 0, sizeof (dperm));
43449 dperm.target = target;
43450 dperm.vmode = mode;
43451 dperm.nelt = GET_MODE_NUNITS (mode);
43452 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
43453 dperm.one_operand_p = true;
43455 /* Extend to SImode using a paradoxical SUBREG. */
43456 tmp1 = gen_reg_rtx (SImode);
43457 emit_move_insn (tmp1, gen_lowpart (SImode, val));
43459 /* Insert the SImode value as low element of a V4SImode vector. */
43460 tmp2 = gen_reg_rtx (V4SImode);
43461 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
43462 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
43464 ok = (expand_vec_perm_1 (&dperm)
43465 || expand_vec_perm_broadcast_1 (&dperm));
43466 gcc_assert (ok);
43467 return ok;
43469 goto widen;
43471 case V16QImode:
43472 if (TARGET_AVX2)
43473 return ix86_vector_duplicate_value (mode, target, val);
43475 if (TARGET_SSE2)
43476 goto permute;
43477 goto widen;
43479 widen:
43480 /* Replicate the value once into the next wider mode and recurse. */
43482 machine_mode smode, wsmode, wvmode;
43483 rtx x;
43485 smode = GET_MODE_INNER (mode);
43486 wvmode = get_mode_wider_vector (mode);
43487 wsmode = GET_MODE_INNER (wvmode);
43489 val = convert_modes (wsmode, smode, val, true);
43490 x = expand_simple_binop (wsmode, ASHIFT, val,
43491 GEN_INT (GET_MODE_BITSIZE (smode)),
43492 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43493 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
43495 x = gen_reg_rtx (wvmode);
43496 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
43497 gcc_assert (ok);
43498 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
43499 return ok;
43502 case V16HImode:
43503 case V32QImode:
43504 if (TARGET_AVX2)
43505 return ix86_vector_duplicate_value (mode, target, val);
43506 else
43508 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
43509 rtx x = gen_reg_rtx (hvmode);
43511 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43512 gcc_assert (ok);
43514 x = gen_rtx_VEC_CONCAT (mode, x, x);
43515 emit_insn (gen_rtx_SET (target, x));
43517 return true;
43519 case V64QImode:
43520 case V32HImode:
43521 if (TARGET_AVX512BW)
43522 return ix86_vector_duplicate_value (mode, target, val);
43523 else
43525 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
43526 rtx x = gen_reg_rtx (hvmode);
43528 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43529 gcc_assert (ok);
43531 x = gen_rtx_VEC_CONCAT (mode, x, x);
43532 emit_insn (gen_rtx_SET (target, x));
43534 return true;
43536 default:
43537 return false;
43541 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43542 whose ONE_VAR element is VAR, and other elements are zero. Return true
43543 if successful. */
43545 static bool
43546 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
43547 rtx target, rtx var, int one_var)
43549 machine_mode vsimode;
43550 rtx new_target;
43551 rtx x, tmp;
43552 bool use_vector_set = false;
43554 switch (mode)
43556 case V2DImode:
43557 /* For SSE4.1, we normally use vector set. But if the second
43558 element is zero and inter-unit moves are OK, we use movq
43559 instead. */
43560 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
43561 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
43562 && one_var == 0));
43563 break;
43564 case V16QImode:
43565 case V4SImode:
43566 case V4SFmode:
43567 use_vector_set = TARGET_SSE4_1;
43568 break;
43569 case V8HImode:
43570 use_vector_set = TARGET_SSE2;
43571 break;
43572 case V4HImode:
43573 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
43574 break;
43575 case V32QImode:
43576 case V16HImode:
43577 case V8SImode:
43578 case V8SFmode:
43579 case V4DFmode:
43580 use_vector_set = TARGET_AVX;
43581 break;
43582 case V4DImode:
43583 /* Use ix86_expand_vector_set in 64bit mode only. */
43584 use_vector_set = TARGET_AVX && TARGET_64BIT;
43585 break;
43586 default:
43587 break;
43590 if (use_vector_set)
43592 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
43593 var = force_reg (GET_MODE_INNER (mode), var);
43594 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43595 return true;
43598 switch (mode)
43600 case V2SFmode:
43601 case V2SImode:
43602 if (!mmx_ok)
43603 return false;
43604 /* FALLTHRU */
43606 case V2DFmode:
43607 case V2DImode:
43608 if (one_var != 0)
43609 return false;
43610 var = force_reg (GET_MODE_INNER (mode), var);
43611 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
43612 emit_insn (gen_rtx_SET (target, x));
43613 return true;
43615 case V4SFmode:
43616 case V4SImode:
43617 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
43618 new_target = gen_reg_rtx (mode);
43619 else
43620 new_target = target;
43621 var = force_reg (GET_MODE_INNER (mode), var);
43622 x = gen_rtx_VEC_DUPLICATE (mode, var);
43623 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
43624 emit_insn (gen_rtx_SET (new_target, x));
43625 if (one_var != 0)
43627 /* We need to shuffle the value to the correct position, so
43628 create a new pseudo to store the intermediate result. */
43630 /* With SSE2, we can use the integer shuffle insns. */
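/* The selectors below move the value from element 0 to element ONE_VAR and
   fill every other slot from the zeroed element 1.  */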
43631 if (mode != V4SFmode && TARGET_SSE2)
43633 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
43634 const1_rtx,
43635 GEN_INT (one_var == 1 ? 0 : 1),
43636 GEN_INT (one_var == 2 ? 0 : 1),
43637 GEN_INT (one_var == 3 ? 0 : 1)));
43638 if (target != new_target)
43639 emit_move_insn (target, new_target);
43640 return true;
43643 /* Otherwise convert the intermediate result to V4SFmode and
43644 use the SSE1 shuffle instructions. */
43645 if (mode != V4SFmode)
43647 tmp = gen_reg_rtx (V4SFmode);
43648 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
43650 else
43651 tmp = new_target;
43653 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
43654 const1_rtx,
43655 GEN_INT (one_var == 1 ? 0 : 1),
43656 GEN_INT (one_var == 2 ? 0+4 : 1+4),
43657 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
43659 if (mode != V4SFmode)
43660 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
43661 else if (tmp != target)
43662 emit_move_insn (target, tmp);
43664 else if (target != new_target)
43665 emit_move_insn (target, new_target);
43666 return true;
43668 case V8HImode:
43669 case V16QImode:
43670 vsimode = V4SImode;
43671 goto widen;
43672 case V4HImode:
43673 case V8QImode:
43674 if (!mmx_ok)
43675 return false;
43676 vsimode = V2SImode;
43677 goto widen;
43678 widen:
43679 if (one_var != 0)
43680 return false;
43682 /* Zero extend the variable element to SImode and recurse. */
43683 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
43685 x = gen_reg_rtx (vsimode);
43686 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
43687 var, one_var))
43688 gcc_unreachable ();
43690 emit_move_insn (target, gen_lowpart (mode, x));
43691 return true;
43693 default:
43694 return false;
43698 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43699 consisting of the values in VALS. It is known that all elements
43700 except ONE_VAR are constants. Return true if successful. */
43702 static bool
43703 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
43704 rtx target, rtx vals, int one_var)
43706 rtx var = XVECEXP (vals, 0, one_var);
43707 machine_mode wmode;
43708 rtx const_vec, x;
43710 const_vec = copy_rtx (vals);
43711 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
43712 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
43714 switch (mode)
43716 case V2DFmode:
43717 case V2DImode:
43718 case V2SFmode:
43719 case V2SImode:
43720 /* For the two element vectors, it's just as easy to use
43721 the general case. */
43722 return false;
43724 case V4DImode:
43725 /* Use ix86_expand_vector_set in 64bit mode only. */
43726 if (!TARGET_64BIT)
43727 return false;
43728 /* FALLTHRU */
43729 case V4DFmode:
43730 case V8SFmode:
43731 case V8SImode:
43732 case V16HImode:
43733 case V32QImode:
43734 case V4SFmode:
43735 case V4SImode:
43736 case V8HImode:
43737 case V4HImode:
43738 break;
43740 case V16QImode:
43741 if (TARGET_SSE4_1)
43742 break;
43743 wmode = V8HImode;
43744 goto widen;
43745 case V8QImode:
43746 wmode = V4HImode;
43747 goto widen;
43748 widen:
43749 /* There's no way to set one QImode entry easily. Combine
43750 the variable value with its adjacent constant value, and
43751 promote to an HImode set. */
43752 x = XVECEXP (vals, 0, one_var ^ 1);
43753 if (one_var & 1)
43755 var = convert_modes (HImode, QImode, var, true);
43756 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
43757 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43758 x = GEN_INT (INTVAL (x) & 0xff);
43760 else
43762 var = convert_modes (HImode, QImode, var, true);
43763 x = gen_int_mode (INTVAL (x) << 8, HImode);
43765 if (x != const0_rtx)
43766 var = expand_simple_binop (HImode, IOR, var, x, var,
43767 1, OPTAB_LIB_WIDEN);
43769 x = gen_reg_rtx (wmode);
43770 emit_move_insn (x, gen_lowpart (wmode, const_vec));
43771 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
43773 emit_move_insn (target, gen_lowpart (mode, x));
43774 return true;
43776 default:
43777 return false;
43780 emit_move_insn (target, const_vec);
43781 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43782 return true;
43785 /* A subroutine of ix86_expand_vector_init_general. Use vector
43786 concatenate to handle the most general case: all values variable,
43787 and none identical. */
43789 static void
43790 ix86_expand_vector_init_concat (machine_mode mode,
43791 rtx target, rtx *ops, int n)
43793 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
43794 rtx first[16], second[8], third[4];
43795 rtvec v;
43796 int i, j;
43798 switch (n)
43800 case 2:
43801 switch (mode)
43803 case V16SImode:
43804 cmode = V8SImode;
43805 break;
43806 case V16SFmode:
43807 cmode = V8SFmode;
43808 break;
43809 case V8DImode:
43810 cmode = V4DImode;
43811 break;
43812 case V8DFmode:
43813 cmode = V4DFmode;
43814 break;
43815 case V8SImode:
43816 cmode = V4SImode;
43817 break;
43818 case V8SFmode:
43819 cmode = V4SFmode;
43820 break;
43821 case V4DImode:
43822 cmode = V2DImode;
43823 break;
43824 case V4DFmode:
43825 cmode = V2DFmode;
43826 break;
43827 case V4SImode:
43828 cmode = V2SImode;
43829 break;
43830 case V4SFmode:
43831 cmode = V2SFmode;
43832 break;
43833 case V2DImode:
43834 cmode = DImode;
43835 break;
43836 case V2SImode:
43837 cmode = SImode;
43838 break;
43839 case V2DFmode:
43840 cmode = DFmode;
43841 break;
43842 case V2SFmode:
43843 cmode = SFmode;
43844 break;
43845 default:
43846 gcc_unreachable ();
43849 if (!register_operand (ops[1], cmode))
43850 ops[1] = force_reg (cmode, ops[1]);
43851 if (!register_operand (ops[0], cmode))
43852 ops[0] = force_reg (cmode, ops[0]);
43853 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
43854 ops[1])));
43855 break;
43857 case 4:
43858 switch (mode)
43860 case V4DImode:
43861 cmode = V2DImode;
43862 break;
43863 case V4DFmode:
43864 cmode = V2DFmode;
43865 break;
43866 case V4SImode:
43867 cmode = V2SImode;
43868 break;
43869 case V4SFmode:
43870 cmode = V2SFmode;
43871 break;
43872 default:
43873 gcc_unreachable ();
43875 goto half;
43877 case 8:
43878 switch (mode)
43880 case V8DImode:
43881 cmode = V2DImode;
43882 hmode = V4DImode;
43883 break;
43884 case V8DFmode:
43885 cmode = V2DFmode;
43886 hmode = V4DFmode;
43887 break;
43888 case V8SImode:
43889 cmode = V2SImode;
43890 hmode = V4SImode;
43891 break;
43892 case V8SFmode:
43893 cmode = V2SFmode;
43894 hmode = V4SFmode;
43895 break;
43896 default:
43897 gcc_unreachable ();
43899 goto half;
43901 case 16:
43902 switch (mode)
43904 case V16SImode:
43905 cmode = V2SImode;
43906 hmode = V4SImode;
43907 gmode = V8SImode;
43908 break;
43909 case V16SFmode:
43910 cmode = V2SFmode;
43911 hmode = V4SFmode;
43912 gmode = V8SFmode;
43913 break;
43914 default:
43915 gcc_unreachable ();
43917 goto half;
43919 half:
43920 /* FIXME: We process inputs backward to help RA. PR 36222. */
43921 i = n - 1;
43922 j = (n >> 1) - 1;
43923 for (; i > 0; i -= 2, j--)
43925 first[j] = gen_reg_rtx (cmode);
43926 v = gen_rtvec (2, ops[i - 1], ops[i]);
43927 ix86_expand_vector_init (false, first[j],
43928 gen_rtx_PARALLEL (cmode, v));
43931 n >>= 1;
43932 if (n > 4)
43934 gcc_assert (hmode != VOIDmode);
43935 gcc_assert (gmode != VOIDmode);
43936 for (i = j = 0; i < n; i += 2, j++)
43938 second[j] = gen_reg_rtx (hmode);
43939 ix86_expand_vector_init_concat (hmode, second [j],
43940 &first [i], 2);
43942 n >>= 1;
43943 for (i = j = 0; i < n; i += 2, j++)
43945 third[j] = gen_reg_rtx (gmode);
43946 ix86_expand_vector_init_concat (gmode, third[j],
43947 &second[i], 2);
43949 n >>= 1;
43950 ix86_expand_vector_init_concat (mode, target, third, n);
43952 else if (n > 2)
43954 gcc_assert (hmode != VOIDmode);
43955 for (i = j = 0; i < n; i += 2, j++)
43957 second[j] = gen_reg_rtx (hmode);
43958 ix86_expand_vector_init_concat (hmode, second [j],
43959 &first [i], 2);
43961 n >>= 1;
43962 ix86_expand_vector_init_concat (mode, target, second, n);
43964 else
43965 ix86_expand_vector_init_concat (mode, target, first, n);
43966 break;
43968 default:
43969 gcc_unreachable ();
43973 /* A subroutine of ix86_expand_vector_init_general. Use vector
43974 interleave to handle the most general case: all values variable,
43975 and none identical. */
43977 static void
43978 ix86_expand_vector_init_interleave (machine_mode mode,
43979 rtx target, rtx *ops, int n)
43981 machine_mode first_imode, second_imode, third_imode, inner_mode;
43982 int i, j;
43983 rtx op0, op1;
43984 rtx (*gen_load_even) (rtx, rtx, rtx);
43985 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
43986 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
43988 switch (mode)
43990 case V8HImode:
43991 gen_load_even = gen_vec_setv8hi;
43992 gen_interleave_first_low = gen_vec_interleave_lowv4si;
43993 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43994 inner_mode = HImode;
43995 first_imode = V4SImode;
43996 second_imode = V2DImode;
43997 third_imode = VOIDmode;
43998 break;
43999 case V16QImode:
44000 gen_load_even = gen_vec_setv16qi;
44001 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
44002 gen_interleave_second_low = gen_vec_interleave_lowv4si;
44003 inner_mode = QImode;
44004 first_imode = V8HImode;
44005 second_imode = V4SImode;
44006 third_imode = V2DImode;
44007 break;
44008 default:
44009 gcc_unreachable ();
44012 for (i = 0; i < n; i++)
44014 /* Extend the odd element to SImode using a paradoxical SUBREG. */
44015 op0 = gen_reg_rtx (SImode);
44016 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
44018 /* Insert the SImode value as low element of V4SImode vector. */
44019 op1 = gen_reg_rtx (V4SImode);
44020 op0 = gen_rtx_VEC_MERGE (V4SImode,
44021 gen_rtx_VEC_DUPLICATE (V4SImode,
44022 op0),
44023 CONST0_RTX (V4SImode),
44024 const1_rtx);
44025 emit_insn (gen_rtx_SET (op1, op0));
44027 /* Cast the V4SImode vector back to a vector in the original mode. */
44028 op0 = gen_reg_rtx (mode);
44029 emit_move_insn (op0, gen_lowpart (mode, op1));
44031 /* Load even elements into the second position. */
44032 emit_insn (gen_load_even (op0,
44033 force_reg (inner_mode,
44034 ops [i + i + 1]),
44035 const1_rtx));
44037 /* Cast vector to FIRST_IMODE vector. */
44038 ops[i] = gen_reg_rtx (first_imode);
44039 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
44042 /* Interleave low FIRST_IMODE vectors. */
44043 for (i = j = 0; i < n; i += 2, j++)
44045 op0 = gen_reg_rtx (first_imode);
44046 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
44048 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
44049 ops[j] = gen_reg_rtx (second_imode);
44050 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
44053 /* Interleave low SECOND_IMODE vectors. */
44054 switch (second_imode)
44056 case V4SImode:
44057 for (i = j = 0; i < n / 2; i += 2, j++)
44059 op0 = gen_reg_rtx (second_imode);
44060 emit_insn (gen_interleave_second_low (op0, ops[i],
44061 ops[i + 1]));
44063 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
44064 vector. */
44065 ops[j] = gen_reg_rtx (third_imode);
44066 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
44068 second_imode = V2DImode;
44069 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44070 /* FALLTHRU */
44072 case V2DImode:
44073 op0 = gen_reg_rtx (second_imode);
44074 emit_insn (gen_interleave_second_low (op0, ops[0],
44075 ops[1]));
44077 /* Cast the SECOND_IMODE vector back to a vector in the original
44078 mode. */
44079 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
44080 break;
44082 default:
44083 gcc_unreachable ();
44087 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
44088 all values variable, and none identical. */
44090 static void
44091 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
44092 rtx target, rtx vals)
44094 rtx ops[64], op0, op1, op2, op3, op4, op5;
44095 machine_mode half_mode = VOIDmode;
44096 machine_mode quarter_mode = VOIDmode;
44097 int n, i;
44099 switch (mode)
44101 case V2SFmode:
44102 case V2SImode:
44103 if (!mmx_ok && !TARGET_SSE)
44104 break;
44105 /* FALLTHRU */
44107 case V16SImode:
44108 case V16SFmode:
44109 case V8DFmode:
44110 case V8DImode:
44111 case V8SFmode:
44112 case V8SImode:
44113 case V4DFmode:
44114 case V4DImode:
44115 case V4SFmode:
44116 case V4SImode:
44117 case V2DFmode:
44118 case V2DImode:
44119 n = GET_MODE_NUNITS (mode);
44120 for (i = 0; i < n; i++)
44121 ops[i] = XVECEXP (vals, 0, i);
44122 ix86_expand_vector_init_concat (mode, target, ops, n);
44123 return;
44125 case V2TImode:
44126 for (i = 0; i < 2; i++)
44127 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44128 op0 = gen_reg_rtx (V4DImode);
44129 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
44130 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44131 return;
44133 case V4TImode:
44134 for (i = 0; i < 4; i++)
44135 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44136 ops[4] = gen_reg_rtx (V4DImode);
44137 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
44138 ops[5] = gen_reg_rtx (V4DImode);
44139 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
44140 op0 = gen_reg_rtx (V8DImode);
44141 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
44142 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44143 return;
44145 case V32QImode:
44146 half_mode = V16QImode;
44147 goto half;
44149 case V16HImode:
44150 half_mode = V8HImode;
44151 goto half;
44153 half:
44154 n = GET_MODE_NUNITS (mode);
44155 for (i = 0; i < n; i++)
44156 ops[i] = XVECEXP (vals, 0, i);
44157 op0 = gen_reg_rtx (half_mode);
44158 op1 = gen_reg_rtx (half_mode);
44159 ix86_expand_vector_init_interleave (half_mode, op0, ops,
44160 n >> 2);
44161 ix86_expand_vector_init_interleave (half_mode, op1,
44162 &ops [n >> 1], n >> 2);
44163 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
44164 return;
44166 case V64QImode:
44167 quarter_mode = V16QImode;
44168 half_mode = V32QImode;
44169 goto quarter;
44171 case V32HImode:
44172 quarter_mode = V8HImode;
44173 half_mode = V16HImode;
44174 goto quarter;
44176 quarter:
44177 n = GET_MODE_NUNITS (mode);
44178 for (i = 0; i < n; i++)
44179 ops[i] = XVECEXP (vals, 0, i);
44180 op0 = gen_reg_rtx (quarter_mode);
44181 op1 = gen_reg_rtx (quarter_mode);
44182 op2 = gen_reg_rtx (quarter_mode);
44183 op3 = gen_reg_rtx (quarter_mode);
44184 op4 = gen_reg_rtx (half_mode);
44185 op5 = gen_reg_rtx (half_mode);
44186 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
44187 n >> 3);
44188 ix86_expand_vector_init_interleave (quarter_mode, op1,
44189 &ops [n >> 2], n >> 3);
44190 ix86_expand_vector_init_interleave (quarter_mode, op2,
44191 &ops [n >> 1], n >> 3);
44192 ix86_expand_vector_init_interleave (quarter_mode, op3,
44193 &ops [(n >> 1) | (n >> 2)], n >> 3);
44194 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
44195 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
44196 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
44197 return;
44199 case V16QImode:
44200 if (!TARGET_SSE4_1)
44201 break;
44202 /* FALLTHRU */
44204 case V8HImode:
44205 if (!TARGET_SSE2)
44206 break;
44208 /* Don't use ix86_expand_vector_init_interleave if we can't
44209 move from GPR to SSE register directly. */
44210 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
44211 break;
44213 n = GET_MODE_NUNITS (mode);
44214 for (i = 0; i < n; i++)
44215 ops[i] = XVECEXP (vals, 0, i);
44216 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
44217 return;
44219 case V4HImode:
44220 case V8QImode:
44221 break;
44223 default:
44224 gcc_unreachable ();
44228 int i, j, n_elts, n_words, n_elt_per_word;
44229 machine_mode inner_mode;
44230 rtx words[4], shift;
44232 inner_mode = GET_MODE_INNER (mode);
44233 n_elts = GET_MODE_NUNITS (mode);
44234 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
44235 n_elt_per_word = n_elts / n_words;
44236 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
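/* Pack the elements into word_mode integers one word at a time.  Each word
   is built starting from its highest-numbered element, shifting left and
   OR-ing in the next lower element, so element 0 ends up in the least
   significant bits.  */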
44238 for (i = 0; i < n_words; ++i)
44240 rtx word = NULL_RTX;
44242 for (j = 0; j < n_elt_per_word; ++j)
44244 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
44245 elt = convert_modes (word_mode, inner_mode, elt, true);
44247 if (j == 0)
44248 word = elt;
44249 else
44251 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
44252 word, 1, OPTAB_LIB_WIDEN);
44253 word = expand_simple_binop (word_mode, IOR, word, elt,
44254 word, 1, OPTAB_LIB_WIDEN);
44258 words[i] = word;
44261 if (n_words == 1)
44262 emit_move_insn (target, gen_lowpart (mode, words[0]));
44263 else if (n_words == 2)
44265 rtx tmp = gen_reg_rtx (mode);
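/* Clobber TMP up front so that filling it one word at a time is not treated
   as a partial write to an uninitialized register.  */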
44266 emit_clobber (tmp);
44267 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
44268 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
44269 emit_move_insn (target, tmp);
44271 else if (n_words == 4)
44273 rtx tmp = gen_reg_rtx (V4SImode);
44274 gcc_assert (word_mode == SImode);
44275 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
44276 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
44277 emit_move_insn (target, gen_lowpart (mode, tmp));
44279 else
44280 gcc_unreachable ();
44284 /* Initialize vector TARGET via VALS. Suppress the use of MMX
44285 instructions unless MMX_OK is true. */
44287 void
44288 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
44290 machine_mode mode = GET_MODE (target);
44291 machine_mode inner_mode = GET_MODE_INNER (mode);
44292 int n_elts = GET_MODE_NUNITS (mode);
44293 int n_var = 0, one_var = -1;
44294 bool all_same = true, all_const_zero = true;
44295 int i;
44296 rtx x;
44298 for (i = 0; i < n_elts; ++i)
44300 x = XVECEXP (vals, 0, i);
44301 if (!(CONST_SCALAR_INT_P (x)
44302 || CONST_DOUBLE_P (x)
44303 || CONST_FIXED_P (x)))
44304 n_var++, one_var = i;
44305 else if (x != CONST0_RTX (inner_mode))
44306 all_const_zero = false;
44307 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
44308 all_same = false;
44311 /* Constants are best loaded from the constant pool. */
44312 if (n_var == 0)
44314 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
44315 return;
44318 /* If all values are identical, broadcast the value. */
44319 if (all_same
44320 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
44321 XVECEXP (vals, 0, 0)))
44322 return;
44324 /* Values where only one field is non-constant are best loaded from
44325 the pool and overwritten via move later. */
44326 if (n_var == 1)
44328 if (all_const_zero
44329 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
44330 XVECEXP (vals, 0, one_var),
44331 one_var))
44332 return;
44334 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
44335 return;
44338 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
44341 void
44342 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
44344 machine_mode mode = GET_MODE (target);
44345 machine_mode inner_mode = GET_MODE_INNER (mode);
44346 machine_mode half_mode;
44347 bool use_vec_merge = false;
44348 rtx tmp;
44349 static rtx (*gen_extract[6][2]) (rtx, rtx)
44351 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
44352 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
44353 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
44354 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
44355 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
44356 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
44358 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
44360 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
44361 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
44362 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
44363 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
44364 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
44365 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
44367 int i, j, n;
44368 machine_mode mmode = VOIDmode;
44369 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
44371 switch (mode)
44373 case V2SFmode:
44374 case V2SImode:
44375 if (mmx_ok)
44377 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44378 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
44379 if (elt == 0)
44380 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44381 else
44382 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44383 emit_insn (gen_rtx_SET (target, tmp));
44384 return;
44386 break;
44388 case V2DImode:
44389 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
44390 if (use_vec_merge)
44391 break;
44393 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44394 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
44395 if (elt == 0)
44396 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44397 else
44398 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44399 emit_insn (gen_rtx_SET (target, tmp));
44400 return;
44402 case V2DFmode:
44404 rtx op0, op1;
44406 /* For the two element vectors, we implement a VEC_CONCAT with
44407 the extraction of the other element. */
44409 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
44410 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
44412 if (elt == 0)
44413 op0 = val, op1 = tmp;
44414 else
44415 op0 = tmp, op1 = val;
44417 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
44418 emit_insn (gen_rtx_SET (target, tmp));
44420 return;
44422 case V4SFmode:
44423 use_vec_merge = TARGET_SSE4_1;
44424 if (use_vec_merge)
44425 break;
44427 switch (elt)
44429 case 0:
44430 use_vec_merge = true;
44431 break;
44433 case 1:
44434 /* tmp = target = A B C D */
44435 tmp = copy_to_reg (target);
44436 /* target = A A B B */
44437 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
44438 /* target = X A B B */
44439 ix86_expand_vector_set (false, target, val, 0);
44440 /* target = A X C D */
44441 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44442 const1_rtx, const0_rtx,
44443 GEN_INT (2+4), GEN_INT (3+4)));
44444 return;
44446 case 2:
44447 /* tmp = target = A B C D */
44448 tmp = copy_to_reg (target);
44449 /* tmp = X B C D */
44450 ix86_expand_vector_set (false, tmp, val, 0);
44451 /* target = A B X D */
44452 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44453 const0_rtx, const1_rtx,
44454 GEN_INT (0+4), GEN_INT (3+4)));
44455 return;
44457 case 3:
44458 /* tmp = target = A B C D */
44459 tmp = copy_to_reg (target);
44460 /* tmp = X B C D */
44461 ix86_expand_vector_set (false, tmp, val, 0);
44462 /* target = A B C X */
44463 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44464 const0_rtx, const1_rtx,
44465 GEN_INT (2+4), GEN_INT (0+4)));
44466 return;
44468 default:
44469 gcc_unreachable ();
44471 break;
44473 case V4SImode:
44474 use_vec_merge = TARGET_SSE4_1;
44475 if (use_vec_merge)
44476 break;
44478 /* Element 0 handled by vec_merge below. */
44479 if (elt == 0)
44481 use_vec_merge = true;
44482 break;
44485 if (TARGET_SSE2)
44487 /* With SSE2, use integer shuffles to swap element 0 and ELT,
44488 store into element 0, then shuffle them back. */
44490 rtx order[4];
44492 order[0] = GEN_INT (elt);
44493 order[1] = const1_rtx;
44494 order[2] = const2_rtx;
44495 order[3] = GEN_INT (3);
44496 order[elt] = const0_rtx;
44498 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44499 order[1], order[2], order[3]));
44501 ix86_expand_vector_set (false, target, val, 0);
44503 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44504 order[1], order[2], order[3]));
44506 else
44508 /* For SSE1, we have to reuse the V4SF code. */
44509 rtx t = gen_reg_rtx (V4SFmode);
44510 emit_move_insn (t, gen_lowpart (V4SFmode, target));
44511 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
44512 emit_move_insn (target, gen_lowpart (mode, t));
44514 return;
44516 case V8HImode:
44517 use_vec_merge = TARGET_SSE2;
44518 break;
44519 case V4HImode:
44520 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44521 break;
44523 case V16QImode:
44524 use_vec_merge = TARGET_SSE4_1;
44525 break;
44527 case V8QImode:
44528 break;
44530 case V32QImode:
44531 half_mode = V16QImode;
44532 j = 0;
44533 n = 16;
44534 goto half;
44536 case V16HImode:
44537 half_mode = V8HImode;
44538 j = 1;
44539 n = 8;
44540 goto half;
44542 case V8SImode:
44543 half_mode = V4SImode;
44544 j = 2;
44545 n = 4;
44546 goto half;
44548 case V4DImode:
44549 half_mode = V2DImode;
44550 j = 3;
44551 n = 2;
44552 goto half;
44554 case V8SFmode:
44555 half_mode = V4SFmode;
44556 j = 4;
44557 n = 4;
44558 goto half;
44560 case V4DFmode:
44561 half_mode = V2DFmode;
44562 j = 5;
44563 n = 2;
44564 goto half;
44566 half:
44567 /* Compute offset. */
44568 i = elt / n;
44569 elt %= n;
44571 gcc_assert (i <= 1);
44573 /* Extract the half. */
44574 tmp = gen_reg_rtx (half_mode);
44575 emit_insn (gen_extract[j][i] (tmp, target));
44577 /* Put val in tmp at elt. */
44578 ix86_expand_vector_set (false, tmp, val, elt);
44580 /* Put it back. */
44581 emit_insn (gen_insert[j][i] (target, target, tmp));
44582 return;
44584 case V8DFmode:
44585 if (TARGET_AVX512F)
44587 mmode = QImode;
44588 gen_blendm = gen_avx512f_blendmv8df;
44590 break;
44592 case V8DImode:
44593 if (TARGET_AVX512F)
44595 mmode = QImode;
44596 gen_blendm = gen_avx512f_blendmv8di;
44598 break;
44600 case V16SFmode:
44601 if (TARGET_AVX512F)
44603 mmode = HImode;
44604 gen_blendm = gen_avx512f_blendmv16sf;
44606 break;
44608 case V16SImode:
44609 if (TARGET_AVX512F)
44611 mmode = HImode;
44612 gen_blendm = gen_avx512f_blendmv16si;
44614 break;
44616 case V32HImode:
44617 if (TARGET_AVX512F && TARGET_AVX512BW)
44619 mmode = SImode;
44620 gen_blendm = gen_avx512bw_blendmv32hi;
44622 break;
44624 case V64QImode:
44625 if (TARGET_AVX512F && TARGET_AVX512BW)
44627 mmode = DImode;
44628 gen_blendm = gen_avx512bw_blendmv64qi;
44630 break;
44632 default:
44633 break;
44636 if (mmode != VOIDmode)
44638 tmp = gen_reg_rtx (mode);
44639 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
44640 /* The avx512*_blendm<mode> expanders have different operand order
44641 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
44642 elements where the mask is set and the second input operand otherwise;
44643 in {sse,avx}*_*blend* the first input operand is used for elements
44644 where the mask is clear and the second input operand otherwise. */
44645 emit_insn (gen_blendm (target, target, tmp,
44646 force_reg (mmode,
44647 gen_int_mode (1 << elt, mmode))));
44649 else if (use_vec_merge)
44651 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
44652 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
44653 emit_insn (gen_rtx_SET (target, tmp));
44655 else
44657 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44659 emit_move_insn (mem, target);
44661 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44662 emit_move_insn (tmp, val);
44664 emit_move_insn (target, mem);
44668 void
44669 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
44671 machine_mode mode = GET_MODE (vec);
44672 machine_mode inner_mode = GET_MODE_INNER (mode);
44673 bool use_vec_extr = false;
44674 rtx tmp;
44676 switch (mode)
44678 case V2SImode:
44679 case V2SFmode:
44680 if (!mmx_ok)
44681 break;
44682 /* FALLTHRU */
44684 case V2DFmode:
44685 case V2DImode:
44686 case V2TImode:
44687 case V4TImode:
44688 use_vec_extr = true;
44689 break;
44691 case V4SFmode:
44692 use_vec_extr = TARGET_SSE4_1;
44693 if (use_vec_extr)
44694 break;
44696 switch (elt)
44698 case 0:
44699 tmp = vec;
44700 break;
44702 case 1:
44703 case 3:
44704 tmp = gen_reg_rtx (mode);
44705 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
44706 GEN_INT (elt), GEN_INT (elt),
44707 GEN_INT (elt+4), GEN_INT (elt+4)));
44708 break;
44710 case 2:
44711 tmp = gen_reg_rtx (mode);
44712 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
44713 break;
44715 default:
44716 gcc_unreachable ();
44718 vec = tmp;
44719 use_vec_extr = true;
44720 elt = 0;
44721 break;
44723 case V4SImode:
44724 use_vec_extr = TARGET_SSE4_1;
44725 if (use_vec_extr)
44726 break;
44728 if (TARGET_SSE2)
44730 switch (elt)
44732 case 0:
44733 tmp = vec;
44734 break;
44736 case 1:
44737 case 3:
44738 tmp = gen_reg_rtx (mode);
44739 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
44740 GEN_INT (elt), GEN_INT (elt),
44741 GEN_INT (elt), GEN_INT (elt)));
44742 break;
44744 case 2:
44745 tmp = gen_reg_rtx (mode);
44746 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
44747 break;
44749 default:
44750 gcc_unreachable ();
44752 vec = tmp;
44753 use_vec_extr = true;
44754 elt = 0;
44756 else
44758 /* For SSE1, we have to reuse the V4SF code. */
44759 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
44760 gen_lowpart (V4SFmode, vec), elt);
44761 return;
44763 break;
44765 case V8HImode:
44766 use_vec_extr = TARGET_SSE2;
44767 break;
44768 case V4HImode:
44769 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44770 break;
44772 case V16QImode:
44773 use_vec_extr = TARGET_SSE4_1;
44774 break;
44776 case V8SFmode:
44777 if (TARGET_AVX)
44779 tmp = gen_reg_rtx (V4SFmode);
44780 if (elt < 4)
44781 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
44782 else
44783 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
44784 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44785 return;
44787 break;
44789 case V4DFmode:
44790 if (TARGET_AVX)
44792 tmp = gen_reg_rtx (V2DFmode);
44793 if (elt < 2)
44794 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44795 else
44796 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44797 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44798 return;
44800 break;
44802 case V32QImode:
44803 if (TARGET_AVX)
44805 tmp = gen_reg_rtx (V16QImode);
44806 if (elt < 16)
44807 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44808 else
44809 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
44810 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44811 return;
44813 break;
44815 case V16HImode:
44816 if (TARGET_AVX)
44818 tmp = gen_reg_rtx (V8HImode);
44819 if (elt < 8)
44820 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
44821 else
44822 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
44823 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44824 return;
44826 break;
44828 case V8SImode:
44829 if (TARGET_AVX)
44831 tmp = gen_reg_rtx (V4SImode);
44832 if (elt < 4)
44833 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
44834 else
44835 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
44836 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44837 return;
44839 break;
44841 case V4DImode:
44842 if (TARGET_AVX)
44844 tmp = gen_reg_rtx (V2DImode);
44845 if (elt < 2)
44846 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
44847 else
44848 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
44849 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44850 return;
44852 break;
44854 case V32HImode:
44855 if (TARGET_AVX512BW)
44857 tmp = gen_reg_rtx (V16HImode);
44858 if (elt < 16)
44859 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
44860 else
44861 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
44862 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44863 return;
44865 break;
44867 case V64QImode:
44868 if (TARGET_AVX512BW)
44870 tmp = gen_reg_rtx (V32QImode);
44871 if (elt < 32)
44872 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
44873 else
44874 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
44875 ix86_expand_vector_extract (false, target, tmp, elt & 31);
44876 return;
44878 break;
44880 case V16SFmode:
44881 tmp = gen_reg_rtx (V8SFmode);
44882 if (elt < 8)
44883 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
44884 else
44885 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
44886 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44887 return;
44889 case V8DFmode:
44890 tmp = gen_reg_rtx (V4DFmode);
44891 if (elt < 4)
44892 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44893 else
44894 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44895 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44896 return;
44898 case V16SImode:
44899 tmp = gen_reg_rtx (V8SImode);
44900 if (elt < 8)
44901 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44902 else
44903 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44904 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44905 return;
44907 case V8DImode:
44908 tmp = gen_reg_rtx (V4DImode);
44909 if (elt < 4)
44910 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44911 else
44912 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44913 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44914 return;
44916 case V8QImode:
44917 /* ??? Could extract the appropriate HImode element and shift. */
44918 default:
44919 break;
44922 if (use_vec_extr)
44924 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44925 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44927 /* Let the rtl optimizers know about the zero extension performed. */
44928 if (inner_mode == QImode || inner_mode == HImode)
44930 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44931 target = gen_lowpart (SImode, target);
44934 emit_insn (gen_rtx_SET (target, tmp));
44936 else
44938 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44940 emit_move_insn (mem, vec);
44942 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44943 emit_move_insn (target, tmp);
44947 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
44948 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
44949 The upper bits of DEST are undefined, though they shouldn't cause
44950 exceptions (some bits from src or all zeros are ok). */
44952 static void
44953 emit_reduc_half (rtx dest, rtx src, int i)
44955 rtx tem, d = dest;
44956 switch (GET_MODE (src))
44958 case V4SFmode:
44959 if (i == 128)
44960 tem = gen_sse_movhlps (dest, src, src);
44961 else
44962 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
44963 GEN_INT (1 + 4), GEN_INT (1 + 4));
44964 break;
44965 case V2DFmode:
44966 tem = gen_vec_interleave_highv2df (dest, src, src);
44967 break;
44968 case V16QImode:
44969 case V8HImode:
44970 case V4SImode:
44971 case V2DImode:
44972 d = gen_reg_rtx (V1TImode);
44973 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
44974 GEN_INT (i / 2));
44975 break;
44976 case V8SFmode:
44977 if (i == 256)
44978 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
44979 else
44980 tem = gen_avx_shufps256 (dest, src, src,
44981 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
44982 break;
44983 case V4DFmode:
44984 if (i == 256)
44985 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
44986 else
44987 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
44988 break;
44989 case V32QImode:
44990 case V16HImode:
44991 case V8SImode:
44992 case V4DImode:
44993 if (i == 256)
44995 if (GET_MODE (dest) != V4DImode)
44996 d = gen_reg_rtx (V4DImode);
44997 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
44998 gen_lowpart (V4DImode, src),
44999 const1_rtx);
45001 else
45003 d = gen_reg_rtx (V2TImode);
45004 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
45005 GEN_INT (i / 2));
45007 break;
45008 case V64QImode:
45009 case V32HImode:
45010 case V16SImode:
45011 case V16SFmode:
45012 case V8DImode:
45013 case V8DFmode:
45014 if (i > 128)
45015 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
45016 gen_lowpart (V16SImode, src),
45017 gen_lowpart (V16SImode, src),
45018 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
45019 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
45020 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
45021 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
45022 GEN_INT (0xC), GEN_INT (0xD),
45023 GEN_INT (0xE), GEN_INT (0xF),
45024 GEN_INT (0x10), GEN_INT (0x11),
45025 GEN_INT (0x12), GEN_INT (0x13),
45026 GEN_INT (0x14), GEN_INT (0x15),
45027 GEN_INT (0x16), GEN_INT (0x17));
45028 else
45029 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
45030 gen_lowpart (V16SImode, src),
45031 GEN_INT (i == 128 ? 0x2 : 0x1),
45032 GEN_INT (0x3),
45033 GEN_INT (0x3),
45034 GEN_INT (0x3),
45035 GEN_INT (i == 128 ? 0x6 : 0x5),
45036 GEN_INT (0x7),
45037 GEN_INT (0x7),
45038 GEN_INT (0x7),
45039 GEN_INT (i == 128 ? 0xA : 0x9),
45040 GEN_INT (0xB),
45041 GEN_INT (0xB),
45042 GEN_INT (0xB),
45043 GEN_INT (i == 128 ? 0xE : 0xD),
45044 GEN_INT (0xF),
45045 GEN_INT (0xF),
45046 GEN_INT (0xF));
45047 break;
45048 default:
45049 gcc_unreachable ();
45051 emit_insn (tem);
45052 if (d != dest)
45053 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
45056 /* Expand a vector reduction. FN is the binary pattern to reduce;
45057 DEST is the destination; IN is the input vector. */
45059 void
45060 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
45062 rtx half, dst, vec = in;
45063 machine_mode mode = GET_MODE (in);
45064 int i;
45066 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
45067 if (TARGET_SSE4_1
45068 && mode == V8HImode
45069 && fn == gen_uminv8hi3)
45071 emit_insn (gen_sse4_1_phminposuw (dest, in));
45072 return;
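/* Otherwise reduce by repeatedly folding the upper half of the active bits
   onto the lower half with emit_reduc_half and combining the two halves
   with FN, halving the width until one element remains.  */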
45075 for (i = GET_MODE_BITSIZE (mode);
45076 i > GET_MODE_UNIT_BITSIZE (mode);
45077 i >>= 1)
45079 half = gen_reg_rtx (mode);
45080 emit_reduc_half (half, vec, i);
45081 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
45082 dst = dest;
45083 else
45084 dst = gen_reg_rtx (mode);
45085 emit_insn (fn (dst, half, vec));
45086 vec = dst;
45090 /* Target hook for scalar_mode_supported_p. */
45091 static bool
45092 ix86_scalar_mode_supported_p (machine_mode mode)
45094 if (DECIMAL_FLOAT_MODE_P (mode))
45095 return default_decimal_float_supported_p ();
45096 else if (mode == TFmode)
45097 return true;
45098 else
45099 return default_scalar_mode_supported_p (mode);
45102 /* Implements target hook vector_mode_supported_p. */
45103 static bool
45104 ix86_vector_mode_supported_p (machine_mode mode)
45106 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
45107 return true;
45108 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
45109 return true;
45110 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
45111 return true;
45112 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
45113 return true;
45114 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
45115 return true;
45116 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
45117 return true;
45118 return false;
45121 /* Target hook for c_mode_for_suffix. */
45122 static machine_mode
45123 ix86_c_mode_for_suffix (char suffix)
45125 if (suffix == 'q')
45126 return TFmode;
45127 if (suffix == 'w')
45128 return XFmode;
45130 return VOIDmode;
45133 /* Worker function for TARGET_MD_ASM_ADJUST.
45135 We implement asm flag outputs, and maintain source compatibility
45136 with the old cc0-based compiler. */
45138 static rtx_insn *
45139 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
45140 vec<const char *> &constraints,
45141 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
45143 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
45144 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
45146 bool saw_asm_flag = false;
45148 start_sequence ();
45149 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
45151 const char *con = constraints[i];
45152 if (strncmp (con, "=@cc", 4) != 0)
45153 continue;
45154 con += 4;
45155 if (strchr (con, ',') != NULL)
45157 error ("alternatives not allowed in asm flag output");
45158 continue;
45161 bool invert = false;
45162 if (con[0] == 'n')
45163 invert = true, con++;
45165 machine_mode mode = CCmode;
45166 rtx_code code = UNKNOWN;
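/* Map the remaining condition string onto a flags mode and comparison;
   e.g. "=@ccz" yields (eq (reg:CCZ flags) 0), and a leading 'n' (as in
   "=@ccnz") reverses the comparison below.  */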
45168 switch (con[0])
45170 case 'a':
45171 if (con[1] == 0)
45172 mode = CCAmode, code = EQ;
45173 else if (con[1] == 'e' && con[2] == 0)
45174 mode = CCCmode, code = NE;
45175 break;
45176 case 'b':
45177 if (con[1] == 0)
45178 mode = CCCmode, code = EQ;
45179 else if (con[1] == 'e' && con[2] == 0)
45180 mode = CCAmode, code = NE;
45181 break;
45182 case 'c':
45183 if (con[1] == 0)
45184 mode = CCCmode, code = EQ;
45185 break;
45186 case 'e':
45187 if (con[1] == 0)
45188 mode = CCZmode, code = EQ;
45189 break;
45190 case 'g':
45191 if (con[1] == 0)
45192 mode = CCGCmode, code = GT;
45193 else if (con[1] == 'e' && con[2] == 0)
45194 mode = CCGCmode, code = GE;
45195 break;
45196 case 'l':
45197 if (con[1] == 0)
45198 mode = CCGCmode, code = LT;
45199 else if (con[1] == 'e' && con[2] == 0)
45200 mode = CCGCmode, code = LE;
45201 break;
45202 case 'o':
45203 if (con[1] == 0)
45204 mode = CCOmode, code = EQ;
45205 break;
45206 case 'p':
45207 if (con[1] == 0)
45208 mode = CCPmode, code = EQ;
45209 break;
45210 case 's':
45211 if (con[1] == 0)
45212 mode = CCSmode, code = EQ;
45213 break;
45214 case 'z':
45215 if (con[1] == 0)
45216 mode = CCZmode, code = EQ;
45217 break;
45219 if (code == UNKNOWN)
45221 error ("unknown asm flag output %qs", constraints[i]);
45222 continue;
45224 if (invert)
45225 code = reverse_condition (code);
45227 rtx dest = outputs[i];
45228 if (!saw_asm_flag)
45230 /* This is the first asm flag output. Here we put the flags
45231 register in as the real output and adjust the condition to
45232 allow it. */
45233 constraints[i] = "=Bf";
45234 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
45235 saw_asm_flag = true;
45237 else
45239 /* We don't need the flags register as output twice. */
45240 constraints[i] = "=X";
45241 outputs[i] = gen_rtx_SCRATCH (SImode);
45244 rtx x = gen_rtx_REG (mode, FLAGS_REG);
45245 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
45247 machine_mode dest_mode = GET_MODE (dest);
45248 if (!SCALAR_INT_MODE_P (dest_mode))
45250 error ("invalid type for asm flag output");
45251 continue;
45254 if (dest_mode == DImode && !TARGET_64BIT)
45255 dest_mode = SImode;
45257 if (dest_mode != QImode)
45259 rtx destqi = gen_reg_rtx (QImode);
45260 emit_insn (gen_rtx_SET (destqi, x));
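/* Widen the QImode flag value to the destination mode, either by clearing
   the destination and inserting the low byte when and-based zero extension
   is preferred, or with an explicit zero_extend.  */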
45262 if (TARGET_ZERO_EXTEND_WITH_AND
45263 && optimize_function_for_speed_p (cfun))
45265 x = force_reg (dest_mode, const0_rtx);
45267 emit_insn (gen_movstrictqi
45268 (gen_lowpart (QImode, x), destqi));
45270 else
45271 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
45274 if (dest_mode != GET_MODE (dest))
45276 rtx tmp = gen_reg_rtx (SImode);
45278 emit_insn (gen_rtx_SET (tmp, x));
45279 emit_insn (gen_zero_extendsidi2 (dest, tmp));
45281 else
45282 emit_insn (gen_rtx_SET (dest, x));
45284 rtx_insn *seq = get_insns ();
45285 end_sequence ();
45287 if (saw_asm_flag)
45288 return seq;
45289 else
45291 /* If we had no asm flag outputs, clobber the flags. */
45292 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
45293 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
45294 return NULL;
45298 /* Implements the target hook targetm.asm.encode_section_info. */
45300 static void ATTRIBUTE_UNUSED
45301 ix86_encode_section_info (tree decl, rtx rtl, int first)
45303 default_encode_section_info (decl, rtl, first);
45305 if (ix86_in_large_data_p (decl))
45306 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
45309 /* Worker function for REVERSE_CONDITION. */
45311 enum rtx_code
45312 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
45314 return (mode != CCFPmode && mode != CCFPUmode
45315 ? reverse_condition (code)
45316 : reverse_condition_maybe_unordered (code));
45319 /* Output code to perform an x87 FP register move, from OPERANDS[1]
45320 to OPERANDS[0]. */
45322 const char *
45323 output_387_reg_move (rtx_insn *insn, rtx *operands)
45325 if (REG_P (operands[0]))
45327 if (REG_P (operands[1])
45328 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45330 if (REGNO (operands[0]) == FIRST_STACK_REG)
45331 return output_387_ffreep (operands, 0);
45332 return "fstp\t%y0";
45334 if (STACK_TOP_P (operands[0]))
45335 return "fld%Z1\t%y1";
45336 return "fst\t%y0";
45338 else if (MEM_P (operands[0]))
45340 gcc_assert (REG_P (operands[1]));
45341 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45342 return "fstp%Z0\t%y0";
45343 else
45345 /* There is no non-popping store to memory for XFmode.
45346 So if we need one, follow the store with a load. */
45347 if (GET_MODE (operands[0]) == XFmode)
45348 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
45349 else
45350 return "fst%Z0\t%y0";
45353 else
45354 gcc_unreachable ();
45357 /* Output code to perform a conditional jump to LABEL, if C2 flag in
45358 FP status register is set. */
45360 void
45361 ix86_emit_fp_unordered_jump (rtx label)
45363 rtx reg = gen_reg_rtx (HImode);
45364 rtx temp;
45366 emit_insn (gen_x86_fnstsw_1 (reg));
45368 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
45370 emit_insn (gen_x86_sahf_1 (reg));
45372 temp = gen_rtx_REG (CCmode, FLAGS_REG);
45373 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
45375 else
45377 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
45379 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
45380 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
45383 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
45384 gen_rtx_LABEL_REF (VOIDmode, label),
45385 pc_rtx);
45386 temp = gen_rtx_SET (pc_rtx, temp);
45388 emit_jump_insn (temp);
45389 predict_jump (REG_BR_PROB_BASE * 10 / 100);
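/* A sketch of why the two branches above test the same condition,
   assuming REG holds the status word stored by fnstsw: C2 is bit 10 of
   the FPU status word, i.e. bit 2 (mask 0x04) of the high byte checked
   on the TEST path, and sahf copies that same bit into PF, which is
   what the UNORDERED comparison examines on the SAHF path.

     int fp_c2_set (unsigned short fpsw)
     {
       return (fpsw & 0x0400) != 0;
     }
*/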
45392 /* Output code to perform a log1p XFmode calculation. */
45394 void ix86_emit_i387_log1p (rtx op0, rtx op1)
45396 rtx_code_label *label1 = gen_label_rtx ();
45397 rtx_code_label *label2 = gen_label_rtx ();
45399 rtx tmp = gen_reg_rtx (XFmode);
45400 rtx tmp2 = gen_reg_rtx (XFmode);
45401 rtx test;
45403 emit_insn (gen_absxf2 (tmp, op1));
45404 test = gen_rtx_GE (VOIDmode, tmp,
45405 const_double_from_real_value (
45406 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
45407 XFmode));
45408 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
45410 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45411 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
45412 emit_jump (label2);
45414 emit_label (label1);
45415 emit_move_insn (tmp, CONST1_RTX (XFmode));
45416 emit_insn (gen_addxf3 (tmp, op1, tmp));
45417 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45418 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
45420 emit_label (label2);
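/* A scalar sketch of the dispatch above, assuming fyl2xp1 is only
   usable for |x| < 1 - sqrt(2)/2 and writing log2_1p and log2_of as
   placeholders for the fyl2xp1 and fyl2x operations:

     long double log1p_sketch (long double x)
     {
       const long double ln2 = 0.6931471805599453094L;
       if (fabsl (x) < 0.29289321881345247561810596348408353L)
         return ln2 * log2_1p (x);
       return ln2 * log2_of (1.0L + x);
     }
*/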
45423 /* Emit code for round calculation. */
45424 void ix86_emit_i387_round (rtx op0, rtx op1)
45426 machine_mode inmode = GET_MODE (op1);
45427 machine_mode outmode = GET_MODE (op0);
45428 rtx e1, e2, res, tmp, tmp1, half;
45429 rtx scratch = gen_reg_rtx (HImode);
45430 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
45431 rtx_code_label *jump_label = gen_label_rtx ();
45432 rtx insn;
45433 rtx (*gen_abs) (rtx, rtx);
45434 rtx (*gen_neg) (rtx, rtx);
45436 switch (inmode)
45438 case SFmode:
45439 gen_abs = gen_abssf2;
45440 break;
45441 case DFmode:
45442 gen_abs = gen_absdf2;
45443 break;
45444 case XFmode:
45445 gen_abs = gen_absxf2;
45446 break;
45447 default:
45448 gcc_unreachable ();
45451 switch (outmode)
45453 case SFmode:
45454 gen_neg = gen_negsf2;
45455 break;
45456 case DFmode:
45457 gen_neg = gen_negdf2;
45458 break;
45459 case XFmode:
45460 gen_neg = gen_negxf2;
45461 break;
45462 case HImode:
45463 gen_neg = gen_neghi2;
45464 break;
45465 case SImode:
45466 gen_neg = gen_negsi2;
45467 break;
45468 case DImode:
45469 gen_neg = gen_negdi2;
45470 break;
45471 default:
45472 gcc_unreachable ();
45475 e1 = gen_reg_rtx (inmode);
45476 e2 = gen_reg_rtx (inmode);
45477 res = gen_reg_rtx (outmode);
45479 half = const_double_from_real_value (dconsthalf, inmode);
45481 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
45483 /* scratch = fxam(op1) */
45484 emit_insn (gen_rtx_SET (scratch,
45485 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
45486 UNSPEC_FXAM)));
45487 /* e1 = fabs(op1) */
45488 emit_insn (gen_abs (e1, op1));
45490 /* e2 = e1 + 0.5 */
45491 half = force_reg (inmode, half);
45492 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
45494 /* res = floor(e2) */
45495 if (inmode != XFmode)
45497 tmp1 = gen_reg_rtx (XFmode);
45499 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
45501 else
45502 tmp1 = e2;
45504 switch (outmode)
45506 case SFmode:
45507 case DFmode:
45509 rtx tmp0 = gen_reg_rtx (XFmode);
45511 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
45513 emit_insn (gen_rtx_SET (res,
45514 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
45515 UNSPEC_TRUNC_NOOP)));
45517 break;
45518 case XFmode:
45519 emit_insn (gen_frndintxf2_floor (res, tmp1));
45520 break;
45521 case HImode:
45522 emit_insn (gen_lfloorxfhi2 (res, tmp1));
45523 break;
45524 case SImode:
45525 emit_insn (gen_lfloorxfsi2 (res, tmp1));
45526 break;
45527 case DImode:
45528 emit_insn (gen_lfloorxfdi2 (res, tmp1));
45529 break;
45530 default:
45531 gcc_unreachable ();
45534 /* flags = signbit(a) */
45535 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
45537 /* if (flags) then res = -res */
45538 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
45539 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
45540 gen_rtx_LABEL_REF (VOIDmode, jump_label),
45541 pc_rtx);
45542 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45543 predict_jump (REG_BR_PROB_BASE * 50 / 100);
45544 JUMP_LABEL (insn) = jump_label;
45546 emit_insn (gen_neg (res, res));
45548 emit_label (jump_label);
45549 LABEL_NUSES (jump_label) = 1;
45551 emit_move_insn (op0, res);
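/* The sign of OP1 is recovered above from the fxam result rather than
   from OP1 itself: fxam sets C1 to the sign of ST(0), and C1 is bit 9
   of the status word, i.e. bit 1 (mask 0x02) of the byte tested by
   gen_testqi_ext_1_ccno. Ignoring that signed-zero detail, the
   computation is roughly:

     double round_sketch (double x)
     {
       double r = floor (fabs (x) + 0.5);
       return x < 0.0 ? -r : r;
     }
*/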
45554 /* Output code to perform a Newton-Raphson approximation of a single-precision
45555 floating-point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
45557 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
45559 rtx x0, x1, e0, e1;
45561 x0 = gen_reg_rtx (mode);
45562 e0 = gen_reg_rtx (mode);
45563 e1 = gen_reg_rtx (mode);
45564 x1 = gen_reg_rtx (mode);
45566 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
45568 b = force_reg (mode, b);
45570 /* x0 = rcp(b) estimate */
45571 if (mode == V16SFmode || mode == V8DFmode)
45573 if (TARGET_AVX512ER)
45575 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45576 UNSPEC_RCP28)));
45577 /* res = a * x0 */
45578 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
45579 return;
45581 else
45582 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45583 UNSPEC_RCP14)));
45585 else
45586 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45587 UNSPEC_RCP)));
45589 /* e0 = x0 * b */
45590 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
45592 /* e0 = x0 * e0 */
45593 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
45595 /* e1 = x0 + x0 */
45596 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
45598 /* x1 = e1 - e0 */
45599 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
45601 /* res = a * x1 */
45602 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
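/* A scalar sketch of the refinement above, with rcp_estimate standing
   in for the rcpss/rcp14 approximation of 1/b; the e0/e1/x1 temporaries
   mirror the RTL emitted above:

     float swdiv_sketch (float a, float b)
     {
       float x0 = rcp_estimate (b);
       float e0 = x0 * b;
       e0 = x0 * e0;
       float e1 = x0 + x0;
       float x1 = e1 - e0;
       return a * x1;
     }
*/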
45605 /* Output code to perform a Newton-Raphson approximation of a
45606 single-precision floating-point [reciprocal] square root. */
45608 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
45610 rtx x0, e0, e1, e2, e3, mthree, mhalf;
45611 REAL_VALUE_TYPE r;
45612 int unspec;
45614 x0 = gen_reg_rtx (mode);
45615 e0 = gen_reg_rtx (mode);
45616 e1 = gen_reg_rtx (mode);
45617 e2 = gen_reg_rtx (mode);
45618 e3 = gen_reg_rtx (mode);
45620 if (TARGET_AVX512ER && mode == V16SFmode)
45622 if (recip)
45623 /* res = rsqrt28(a) estimate */
45624 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45625 UNSPEC_RSQRT28)));
45626 else
45628 /* x0 = rsqrt28(a) estimate */
45629 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45630 UNSPEC_RSQRT28)));
45631 /* res = rcp28(x0) estimate */
45632 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
45633 UNSPEC_RCP28)));
45635 return;
45638 real_from_integer (&r, VOIDmode, -3, SIGNED);
45639 mthree = const_double_from_real_value (r, SFmode);
45641 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
45642 mhalf = const_double_from_real_value (r, SFmode);
45643 unspec = UNSPEC_RSQRT;
45645 if (VECTOR_MODE_P (mode))
45647 mthree = ix86_build_const_vector (mode, true, mthree);
45648 mhalf = ix86_build_const_vector (mode, true, mhalf);
45649 /* There is no 512-bit rsqrt. There is, however, rsqrt14. */
45650 if (GET_MODE_SIZE (mode) == 64)
45651 unspec = UNSPEC_RSQRT14;
45654 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
45655 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
45657 a = force_reg (mode, a);
45659 /* x0 = rsqrt(a) estimate */
45660 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45661 unspec)));
45663 /* If a == 0.0, filter out the infinite rsqrt (a) estimate to prevent NaN for sqrt (0.0). */
45664 if (!recip)
45666 rtx zero = force_reg (mode, CONST0_RTX(mode));
45667 rtx mask;
45669 /* Handle masked compare. */
45670 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
45672 mask = gen_reg_rtx (HImode);
45673 /* Imm value 0x4 corresponds to not-equal comparison. */
45674 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
45675 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
45677 else
45679 mask = gen_reg_rtx (mode);
45680 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
45681 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
45685 /* e0 = x0 * a */
45686 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
45687 /* e1 = e0 * x0 */
45688 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
45690 /* e2 = e1 - 3. */
45691 mthree = force_reg (mode, mthree);
45692 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
45694 mhalf = force_reg (mode, mhalf);
45695 if (recip)
45696 /* e3 = -.5 * x0 */
45697 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
45698 else
45699 /* e3 = -.5 * e0 */
45700 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
45701 /* ret = e2 * e3 */
45702 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
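/* A scalar sketch of the refinement above (omitting the a == 0.0
   masking), with rsqrt_estimate standing in for the rsqrtss/rsqrt14
   approximation of 1/sqrt(a):

     float swsqrt_sketch (float a, int recip)
     {
       float x0 = rsqrt_estimate (a);
       float e0 = x0 * a;
       float e1 = e0 * x0;
       float e2 = e1 - 3.0f;
       float e3 = (recip ? x0 : e0) * -0.5f;
       return e2 * e3;
     }
*/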
45705 #ifdef TARGET_SOLARIS
45706 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
45708 static void
45709 i386_solaris_elf_named_section (const char *name, unsigned int flags,
45710 tree decl)
45712 /* With Binutils 2.15, the "@unwind" marker must be specified on
45713 every occurrence of the ".eh_frame" section, not just the first
45714 one. */
45715 if (TARGET_64BIT
45716 && strcmp (name, ".eh_frame") == 0)
45718 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
45719 flags & SECTION_WRITE ? "aw" : "a");
45720 return;
45723 #ifndef USE_GAS
45724 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
45726 solaris_elf_asm_comdat_section (name, flags, decl);
45727 return;
45729 #endif
45731 default_elf_asm_named_section (name, flags, decl);
45733 #endif /* TARGET_SOLARIS */
45735 /* Return the mangling of TYPE if it is an extended fundamental type. */
45737 static const char *
45738 ix86_mangle_type (const_tree type)
45740 type = TYPE_MAIN_VARIANT (type);
45742 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
45743 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
45744 return NULL;
45746 switch (TYPE_MODE (type))
45748 case TFmode:
45749 /* __float128 is "g". */
45750 return "g";
45751 case XFmode:
45752 /* "long double" or __float80 is "e". */
45753 return "e";
45754 default:
45755 return NULL;
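/* For example, under the Itanium C++ ABI a declaration such as
   void f (__float128, long double) is expected to mangle as _Z1fge,
   using the "g" and "e" codes returned above (illustrative; the
   mangled name itself is produced elsewhere). */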
45759 #ifdef TARGET_THREAD_SSP_OFFSET
45760 /* If using TLS guards, don't waste time creating and expanding
45761 __stack_chk_guard decl and MEM as we are going to ignore it. */
45762 static tree
45763 ix86_stack_protect_guard (void)
45765 if (TARGET_SSP_TLS_GUARD)
45766 return NULL_TREE;
45767 return default_stack_protect_guard ();
45769 #endif
45771 /* For 32-bit code we can save PIC register setup by using
45772 __stack_chk_fail_local hidden function instead of calling
45773 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
45774 register, so it is better to call __stack_chk_fail directly. */
45776 static tree ATTRIBUTE_UNUSED
45777 ix86_stack_protect_fail (void)
45779 return TARGET_64BIT
45780 ? default_external_stack_protect_fail ()
45781 : default_hidden_stack_protect_fail ();
45784 /* Select a format to encode pointers in exception handling data. CODE
45785 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
45786 true if the symbol may be affected by dynamic relocations.
45788 ??? All x86 object file formats are capable of representing this.
45789 After all, the relocation needed is the same as for the call insn.
45790 Whether or not a particular assembler allows us to enter such, I
45791 guess we'll have to see. */
45792 int
45793 asm_preferred_eh_data_format (int code, int global)
45795 if (flag_pic)
45797 int type = DW_EH_PE_sdata8;
45798 if (!TARGET_64BIT
45799 || ix86_cmodel == CM_SMALL_PIC
45800 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
45801 type = DW_EH_PE_sdata4;
45802 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
45804 if (ix86_cmodel == CM_SMALL
45805 || (ix86_cmodel == CM_MEDIUM && code))
45806 return DW_EH_PE_udata4;
45807 return DW_EH_PE_absptr;
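/* Worked examples of the selection above: 32-bit PIC code encodes a
   global symbol as DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4
   and a local one as DW_EH_PE_pcrel | DW_EH_PE_sdata4; non-PIC code
   uses DW_EH_PE_udata4 for the small model (and for code labels in the
   medium model) and DW_EH_PE_absptr otherwise. */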
45810 /* Expand copysign from SIGN to the positive value ABS_VALUE
45811 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
45812 the sign-bit. */
45813 static void
45814 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
45816 machine_mode mode = GET_MODE (sign);
45817 rtx sgn = gen_reg_rtx (mode);
45818 if (mask == NULL_RTX)
45820 machine_mode vmode;
45822 if (mode == SFmode)
45823 vmode = V4SFmode;
45824 else if (mode == DFmode)
45825 vmode = V2DFmode;
45826 else
45827 vmode = mode;
45829 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
45830 if (!VECTOR_MODE_P (mode))
45832 /* We need to generate a scalar mode mask in this case. */
45833 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45834 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45835 mask = gen_reg_rtx (mode);
45836 emit_insn (gen_rtx_SET (mask, tmp));
45839 else
45840 mask = gen_rtx_NOT (mode, mask);
45841 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
45842 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
45845 /* Expand fabs (OP0) and return a new rtx that holds the result. The
45846 mask for masking out the sign-bit is stored in *SMASK, if that is
45847 non-null. */
45848 static rtx
45849 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45851 machine_mode vmode, mode = GET_MODE (op0);
45852 rtx xa, mask;
45854 xa = gen_reg_rtx (mode);
45855 if (mode == SFmode)
45856 vmode = V4SFmode;
45857 else if (mode == DFmode)
45858 vmode = V2DFmode;
45859 else
45860 vmode = mode;
45861 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45862 if (!VECTOR_MODE_P (mode))
45864 /* We need to generate a scalar mode mask in this case. */
45865 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45866 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45867 mask = gen_reg_rtx (mode);
45868 emit_insn (gen_rtx_SET (mask, tmp));
45870 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45872 if (smask)
45873 *smask = mask;
45875 return xa;
45878 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45879 swapping the operands if SWAP_OPERANDS is true. The expanded
45880 code is a forward jump to a newly created label in case the
45881 comparison is true. The generated label rtx is returned. */
45882 static rtx_code_label *
45883 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45884 bool swap_operands)
45886 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
45887 rtx_code_label *label;
45888 rtx tmp;
45890 if (swap_operands)
45891 std::swap (op0, op1);
45893 label = gen_label_rtx ();
45894 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
45895 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
45896 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
45897 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45898 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45899 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45900 JUMP_LABEL (tmp) = label;
45902 return label;
45905 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
45906 using comparison code CODE. Operands are swapped for the comparison if
45907 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
45908 static rtx
45909 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45910 bool swap_operands)
45912 rtx (*insn)(rtx, rtx, rtx, rtx);
45913 machine_mode mode = GET_MODE (op0);
45914 rtx mask = gen_reg_rtx (mode);
45916 if (swap_operands)
45917 std::swap (op0, op1);
45919 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45921 emit_insn (insn (mask, op0, op1,
45922 gen_rtx_fmt_ee (code, mode, op0, op1)));
45923 return mask;
45926 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45927 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
45928 static rtx
45929 ix86_gen_TWO52 (machine_mode mode)
45931 REAL_VALUE_TYPE TWO52r;
45932 rtx TWO52;
45934 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45935 TWO52 = const_double_from_real_value (TWO52r, mode);
45936 TWO52 = force_reg (mode, TWO52);
45938 return TWO52;
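/* The expanders below use this constant for the classic add-and-subtract
   trick: adding 2**52 (2**23 for SFmode) pushes all fraction bits out of
   the mantissa, so the addition itself rounds to an integer in the
   current rounding mode. A scalar sketch, valid for 0 <= x < 2**52 and
   assuming the compiler does not fold the arithmetic away:

     double rint_by_two52 (double x)
     {
       volatile double two52 = 4503599627370496.0;
       return (x + two52) - two52;
     }
*/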
45941 /* Expand SSE sequence for computing lround from OP1 storing
45942 into OP0. */
45943 void
45944 ix86_expand_lround (rtx op0, rtx op1)
45946 /* C code for the stuff we're doing below:
45947 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
45948 return (long)tmp;
45950 machine_mode mode = GET_MODE (op1);
45951 const struct real_format *fmt;
45952 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45953 rtx adj;
45955 /* load nextafter (0.5, 0.0) */
45956 fmt = REAL_MODE_FORMAT (mode);
45957 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45958 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45960 /* adj = copysign (0.5, op1) */
45961 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
45962 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
45964 /* adj = op1 + adj */
45965 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
45967 /* op0 = (imode)adj */
45968 expand_fix (op0, adj, 0);
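/* Using nextafter (0.5, 0.0) rather than 0.5 matters for inputs just
   below a rounding boundary: for the largest double smaller than 0.5,
   adding 0.5 would round up to exactly 1.0 and lround would return 1,
   whereas adding the predecessor of 0.5 keeps the sum below 1.0 so the
   truncation correctly yields 0. */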
45971 /* Expand SSE2 sequence for computing lfloor or lceil (depending on DO_FLOOR)
45972 from OPERAND1, storing the result into OPERAND0. */
45973 void
45974 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
45976 /* C code for the stuff we're doing below (for do_floor):
45977 xi = (long)op1;
45978 xi -= (double)xi > op1 ? 1 : 0;
45979 return xi;
45981 machine_mode fmode = GET_MODE (op1);
45982 machine_mode imode = GET_MODE (op0);
45983 rtx ireg, freg, tmp;
45984 rtx_code_label *label;
45986 /* reg = (long)op1 */
45987 ireg = gen_reg_rtx (imode);
45988 expand_fix (ireg, op1, 0);
45990 /* freg = (double)reg */
45991 freg = gen_reg_rtx (fmode);
45992 expand_float (freg, ireg, 0);
45994 /* ireg = (freg > op1) ? ireg - 1 : ireg */
45995 label = ix86_expand_sse_compare_and_jump (UNLE,
45996 freg, op1, !do_floor);
45997 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
45998 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
45999 emit_move_insn (ireg, tmp);
46001 emit_label (label);
46002 LABEL_NUSES (label) = 1;
46004 emit_move_insn (op0, ireg);
46007 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
46008 result in OPERAND0. */
46009 void
46010 ix86_expand_rint (rtx operand0, rtx operand1)
46012 /* C code for the stuff we're doing below:
46013 xa = fabs (operand1);
46014 if (!isless (xa, 2**52))
46015 return operand1;
46016 xa = xa + 2**52 - 2**52;
46017 return copysign (xa, operand1);
46019 machine_mode mode = GET_MODE (operand0);
46020 rtx res, xa, TWO52, mask;
46021 rtx_code_label *label;
46023 res = gen_reg_rtx (mode);
46024 emit_move_insn (res, operand1);
46026 /* xa = abs (operand1) */
46027 xa = ix86_expand_sse_fabs (res, &mask);
46029 /* if (!isless (xa, TWO52)) goto label; */
46030 TWO52 = ix86_gen_TWO52 (mode);
46031 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46033 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46034 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46036 ix86_sse_copysign_to_positive (res, xa, res, mask);
46038 emit_label (label);
46039 LABEL_NUSES (label) = 1;
46041 emit_move_insn (operand0, res);
46044 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46045 into OPERAND0. */
46046 void
46047 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
46049 /* C code for the stuff we expand below.
46050 double xa = fabs (x), x2;
46051 if (!isless (xa, TWO52))
46052 return x;
46053 xa = xa + TWO52 - TWO52;
46054 x2 = copysign (xa, x);
46055 Compensate. Floor:
46056 if (x2 > x)
46057 x2 -= 1;
46058 Compensate. Ceil:
46059 if (x2 < x)
46060 x2 -= -1;
46061 return x2;
46063 machine_mode mode = GET_MODE (operand0);
46064 rtx xa, TWO52, tmp, one, res, mask;
46065 rtx_code_label *label;
46067 TWO52 = ix86_gen_TWO52 (mode);
46069 /* Temporary for holding the result, initialized to the input
46070 operand to ease control flow. */
46071 res = gen_reg_rtx (mode);
46072 emit_move_insn (res, operand1);
46074 /* xa = abs (operand1) */
46075 xa = ix86_expand_sse_fabs (res, &mask);
46077 /* if (!isless (xa, TWO52)) goto label; */
46078 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46080 /* xa = xa + TWO52 - TWO52; */
46081 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46082 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46084 /* xa = copysign (xa, operand1) */
46085 ix86_sse_copysign_to_positive (xa, xa, res, mask);
46087 /* generate 1.0 or -1.0 */
46088 one = force_reg (mode,
46089 const_double_from_real_value (do_floor
46090 ? dconst1 : dconstm1, mode));
46092 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46093 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46094 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46095 /* We always need to subtract here to preserve signed zero. */
46096 tmp = expand_simple_binop (mode, MINUS,
46097 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46098 emit_move_insn (res, tmp);
46100 emit_label (label);
46101 LABEL_NUSES (label) = 1;
46103 emit_move_insn (operand0, res);
46106 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46107 into OPERAND0. */
46108 void
46109 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
46111 /* C code for the stuff we expand below.
46112 double xa = fabs (x), x2;
46113 if (!isless (xa, TWO52))
46114 return x;
46115 x2 = (double)(long)x;
46116 Compensate. Floor:
46117 if (x2 > x)
46118 x2 -= 1;
46119 Compensate. Ceil:
46120 if (x2 < x)
46121 x2 += 1;
46122 if (HONOR_SIGNED_ZEROS (mode))
46123 return copysign (x2, x);
46124 return x2;
46126 machine_mode mode = GET_MODE (operand0);
46127 rtx xa, xi, TWO52, tmp, one, res, mask;
46128 rtx_code_label *label;
46130 TWO52 = ix86_gen_TWO52 (mode);
46132 /* Temporary for holding the result, initialized to the input
46133 operand to ease control flow. */
46134 res = gen_reg_rtx (mode);
46135 emit_move_insn (res, operand1);
46137 /* xa = abs (operand1) */
46138 xa = ix86_expand_sse_fabs (res, &mask);
46140 /* if (!isless (xa, TWO52)) goto label; */
46141 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46143 /* xa = (double)(long)x */
46144 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46145 expand_fix (xi, res, 0);
46146 expand_float (xa, xi, 0);
46148 /* generate 1.0 */
46149 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46151 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46152 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46153 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46154 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
46155 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46156 emit_move_insn (res, tmp);
46158 if (HONOR_SIGNED_ZEROS (mode))
46159 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46161 emit_label (label);
46162 LABEL_NUSES (label) = 1;
46164 emit_move_insn (operand0, res);
46167 /* Expand SSE sequence for computing round from OPERAND1 storing
46168 into OPERAND0. The sequence works without relying on DImode truncation
46169 via cvttsd2siq, which is only available on 64-bit targets. */
46170 void
46171 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
46173 /* C code for the stuff we expand below.
46174 double xa = fabs (x), xa2, x2;
46175 if (!isless (xa, TWO52))
46176 return x;
46177 Using the absolute value and copying back sign makes
46178 -0.0 -> -0.0 correct.
46179 xa2 = xa + TWO52 - TWO52;
46180 Compensate.
46181 dxa = xa2 - xa;
46182 if (dxa <= -0.5)
46183 xa2 += 1;
46184 else if (dxa > 0.5)
46185 xa2 -= 1;
46186 x2 = copysign (xa2, x);
46187 return x2;
46189 machine_mode mode = GET_MODE (operand0);
46190 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
46191 rtx_code_label *label;
46193 TWO52 = ix86_gen_TWO52 (mode);
46195 /* Temporary for holding the result, initialized to the input
46196 operand to ease control flow. */
46197 res = gen_reg_rtx (mode);
46198 emit_move_insn (res, operand1);
46200 /* xa = abs (operand1) */
46201 xa = ix86_expand_sse_fabs (res, &mask);
46203 /* if (!isless (xa, TWO52)) goto label; */
46204 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46206 /* xa2 = xa + TWO52 - TWO52; */
46207 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46208 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
46210 /* dxa = xa2 - xa; */
46211 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
46213 /* generate 0.5, 1.0 and -0.5 */
46214 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
46215 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
46216 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
46217 0, OPTAB_DIRECT);
46219 /* Compensate. */
46220 tmp = gen_reg_rtx (mode);
46221 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
46222 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
46223 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46224 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46225 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
46226 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
46227 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46228 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46230 /* res = copysign (xa2, operand1) */
46231 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
46233 emit_label (label);
46234 LABEL_NUSES (label) = 1;
46236 emit_move_insn (operand0, res);
46239 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46240 into OPERAND0. */
46241 void
46242 ix86_expand_trunc (rtx operand0, rtx operand1)
46244 /* C code for SSE variant we expand below.
46245 double xa = fabs (x), x2;
46246 if (!isless (xa, TWO52))
46247 return x;
46248 x2 = (double)(long)x;
46249 if (HONOR_SIGNED_ZEROS (mode))
46250 return copysign (x2, x);
46251 return x2;
46253 machine_mode mode = GET_MODE (operand0);
46254 rtx xa, xi, TWO52, res, mask;
46255 rtx_code_label *label;
46257 TWO52 = ix86_gen_TWO52 (mode);
46259 /* Temporary for holding the result, initialized to the input
46260 operand to ease control flow. */
46261 res = gen_reg_rtx (mode);
46262 emit_move_insn (res, operand1);
46264 /* xa = abs (operand1) */
46265 xa = ix86_expand_sse_fabs (res, &mask);
46267 /* if (!isless (xa, TWO52)) goto label; */
46268 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46270 /* x = (double)(long)x */
46271 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46272 expand_fix (xi, res, 0);
46273 expand_float (res, xi, 0);
46275 if (HONOR_SIGNED_ZEROS (mode))
46276 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46278 emit_label (label);
46279 LABEL_NUSES (label) = 1;
46281 emit_move_insn (operand0, res);
46284 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46285 into OPERAND0. */
46286 void
46287 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
46289 machine_mode mode = GET_MODE (operand0);
46290 rtx xa, mask, TWO52, one, res, smask, tmp;
46291 rtx_code_label *label;
46293 /* C code for SSE variant we expand below.
46294 double xa = fabs (x), x2;
46295 if (!isless (xa, TWO52))
46296 return x;
46297 xa2 = xa + TWO52 - TWO52;
46298 Compensate:
46299 if (xa2 > xa)
46300 xa2 -= 1.0;
46301 x2 = copysign (xa2, x);
46302 return x2;
46305 TWO52 = ix86_gen_TWO52 (mode);
46307 /* Temporary for holding the result, initialized to the input
46308 operand to ease control flow. */
46309 res = gen_reg_rtx (mode);
46310 emit_move_insn (res, operand1);
46312 /* xa = abs (operand1) */
46313 xa = ix86_expand_sse_fabs (res, &smask);
46315 /* if (!isless (xa, TWO52)) goto label; */
46316 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46318 /* res = xa + TWO52 - TWO52; */
46319 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46320 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
46321 emit_move_insn (res, tmp);
46323 /* generate 1.0 */
46324 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46326 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
46327 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
46328 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
46329 tmp = expand_simple_binop (mode, MINUS,
46330 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
46331 emit_move_insn (res, tmp);
46333 /* res = copysign (res, operand1) */
46334 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
46336 emit_label (label);
46337 LABEL_NUSES (label) = 1;
46339 emit_move_insn (operand0, res);
46342 /* Expand SSE sequence for computing round from OPERAND1 storing
46343 into OPERAND0. */
46344 void
46345 ix86_expand_round (rtx operand0, rtx operand1)
46347 /* C code for the stuff we're doing below:
46348 double xa = fabs (x);
46349 if (!isless (xa, TWO52))
46350 return x;
46351 xa = (double)(long)(xa + nextafter (0.5, 0.0));
46352 return copysign (xa, x);
46354 machine_mode mode = GET_MODE (operand0);
46355 rtx res, TWO52, xa, xi, half, mask;
46356 rtx_code_label *label;
46357 const struct real_format *fmt;
46358 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46360 /* Temporary for holding the result, initialized to the input
46361 operand to ease control flow. */
46362 res = gen_reg_rtx (mode);
46363 emit_move_insn (res, operand1);
46365 TWO52 = ix86_gen_TWO52 (mode);
46366 xa = ix86_expand_sse_fabs (res, &mask);
46367 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46369 /* load nextafter (0.5, 0.0) */
46370 fmt = REAL_MODE_FORMAT (mode);
46371 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46372 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46374 /* xa = xa + 0.5 */
46375 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
46376 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
46378 /* xa = (double)(int64_t)xa */
46379 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46380 expand_fix (xi, xa, 0);
46381 expand_float (xa, xi, 0);
46383 /* res = copysign (xa, operand1) */
46384 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
46386 emit_label (label);
46387 LABEL_NUSES (label) = 1;
46389 emit_move_insn (operand0, res);
46392 /* Expand SSE sequence for computing round
46393 from OP1 storing into OP0 using sse4 round insn. */
46394 void
46395 ix86_expand_round_sse4 (rtx op0, rtx op1)
46397 machine_mode mode = GET_MODE (op0);
46398 rtx e1, e2, res, half;
46399 const struct real_format *fmt;
46400 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46401 rtx (*gen_copysign) (rtx, rtx, rtx);
46402 rtx (*gen_round) (rtx, rtx, rtx);
46404 switch (mode)
46406 case SFmode:
46407 gen_copysign = gen_copysignsf3;
46408 gen_round = gen_sse4_1_roundsf2;
46409 break;
46410 case DFmode:
46411 gen_copysign = gen_copysigndf3;
46412 gen_round = gen_sse4_1_rounddf2;
46413 break;
46414 default:
46415 gcc_unreachable ();
46418 /* round (a) = trunc (a + copysign (0.5, a)) */
46420 /* load nextafter (0.5, 0.0) */
46421 fmt = REAL_MODE_FORMAT (mode);
46422 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46423 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46424 half = const_double_from_real_value (pred_half, mode);
46426 /* e1 = copysign (0.5, op1) */
46427 e1 = gen_reg_rtx (mode);
46428 emit_insn (gen_copysign (e1, half, op1));
46430 /* e2 = op1 + e1 */
46431 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
46433 /* res = trunc (e2) */
46434 res = gen_reg_rtx (mode);
46435 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
46437 emit_move_insn (op0, res);
46441 /* Table of valid machine attributes. */
46442 static const struct attribute_spec ix86_attribute_table[] =
46444 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
46445 affects_type_identity } */
46446 /* Stdcall attribute says callee is responsible for popping arguments
46447 if they are not variable. */
46448 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46449 true },
46450 /* Fastcall attribute says callee is responsible for popping arguments
46451 if they are not variable. */
46452 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46453 true },
46454 /* Thiscall attribute says callee is responsible for popping arguments
46455 if they are not variable. */
46456 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46457 true },
46458 /* Cdecl attribute says the callee is a normal C declaration */
46459 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46460 true },
46461 /* Regparm attribute specifies how many integer arguments are to be
46462 passed in registers. */
46463 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
46464 true },
46465 /* Sseregparm attribute says we are using x86_64 calling conventions
46466 for FP arguments. */
46467 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46468 true },
46469 /* The transactional memory builtins are implicitly regparm or fastcall
46470 depending on the ABI. Override the generic do-nothing attribute that
46471 these builtins were declared with. */
46472 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
46473 true },
46474 /* force_align_arg_pointer says this function realigns the stack at entry. */
46475 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
46476 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
46477 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
46478 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
46479 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
46480 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
46481 false },
46482 #endif
46483 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46484 false },
46485 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46486 false },
46487 #ifdef SUBTARGET_ATTRIBUTE_TABLE
46488 SUBTARGET_ATTRIBUTE_TABLE,
46489 #endif
46490 /* ms_abi and sysv_abi calling convention function attributes. */
46491 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46492 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46493 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
46494 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
46495 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
46496 false },
46497 { "callee_pop_aggregate_return", 1, 1, false, true, true,
46498 ix86_handle_callee_pop_aggregate_return, true },
46499 { "interrupt", 0, 0, false, true, true,
46500 ix86_handle_interrupt_attribute, false },
46501 { "no_caller_saved_registers", 0, 0, false, true, true,
46502 ix86_handle_no_caller_saved_registers_attribute, false },
46504 /* End element. */
46505 { NULL, 0, 0, false, false, false, NULL, false }
46508 /* Implement targetm.vectorize.builtin_vectorization_cost. */
46509 static int
46510 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
46511 tree vectype, int)
46513 switch (type_of_cost)
46515 case scalar_stmt:
46516 return ix86_cost->scalar_stmt_cost;
46518 case scalar_load:
46519 return ix86_cost->scalar_load_cost;
46521 case scalar_store:
46522 return ix86_cost->scalar_store_cost;
46524 case vector_stmt:
46525 return ix86_cost->vec_stmt_cost;
46527 case vector_load:
46528 return ix86_cost->vec_align_load_cost;
46530 case vector_store:
46531 return ix86_cost->vec_store_cost;
46533 case vec_to_scalar:
46534 return ix86_cost->vec_to_scalar_cost;
46536 case scalar_to_vec:
46537 return ix86_cost->scalar_to_vec_cost;
46539 case unaligned_load:
46540 case unaligned_store:
46541 return ix86_cost->vec_unalign_load_cost;
46543 case cond_branch_taken:
46544 return ix86_cost->cond_taken_branch_cost;
46546 case cond_branch_not_taken:
46547 return ix86_cost->cond_not_taken_branch_cost;
46549 case vec_perm:
46550 case vec_promote_demote:
46551 return ix86_cost->vec_stmt_cost;
46553 case vec_construct:
46554 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
46556 default:
46557 gcc_unreachable ();
46561 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
46562 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
46563 insn every time. */
46565 static GTY(()) rtx_insn *vselect_insn;
46567 /* Initialize vselect_insn. */
46569 static void
46570 init_vselect_insn (void)
46572 unsigned i;
46573 rtx x;
46575 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
46576 for (i = 0; i < MAX_VECT_LEN; ++i)
46577 XVECEXP (x, 0, i) = const0_rtx;
46578 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
46579 const0_rtx), x);
46580 x = gen_rtx_SET (const0_rtx, x);
46581 start_sequence ();
46582 vselect_insn = emit_insn (x);
46583 end_sequence ();
46586 /* Construct (set target (vec_select op0 (parallel perm))) and
46587 return true if that's a valid instruction in the active ISA. */
46589 static bool
46590 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
46591 unsigned nelt, bool testing_p)
46593 unsigned int i;
46594 rtx x, save_vconcat;
46595 int icode;
46597 if (vselect_insn == NULL_RTX)
46598 init_vselect_insn ();
46600 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
46601 PUT_NUM_ELEM (XVEC (x, 0), nelt);
46602 for (i = 0; i < nelt; ++i)
46603 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
46604 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46605 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
46606 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
46607 SET_DEST (PATTERN (vselect_insn)) = target;
46608 icode = recog_memoized (vselect_insn);
46610 if (icode >= 0 && !testing_p)
46611 emit_insn (copy_rtx (PATTERN (vselect_insn)));
46613 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
46614 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
46615 INSN_CODE (vselect_insn) = -1;
46617 return icode >= 0;
46620 /* Similar, but generate a vec_concat from op0 and op1 as well. */
46622 static bool
46623 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
46624 const unsigned char *perm, unsigned nelt,
46625 bool testing_p)
46627 machine_mode v2mode;
46628 rtx x;
46629 bool ok;
46631 if (vselect_insn == NULL_RTX)
46632 init_vselect_insn ();
46634 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
46635 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46636 PUT_MODE (x, v2mode);
46637 XEXP (x, 0) = op0;
46638 XEXP (x, 1) = op1;
46639 ok = expand_vselect (target, x, perm, nelt, testing_p);
46640 XEXP (x, 0) = const0_rtx;
46641 XEXP (x, 1) = const0_rtx;
46642 return ok;
46645 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46646 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
46648 static bool
46649 expand_vec_perm_blend (struct expand_vec_perm_d *d)
46651 machine_mode mmode, vmode = d->vmode;
46652 unsigned i, mask, nelt = d->nelt;
46653 rtx target, op0, op1, maskop, x;
46654 rtx rperm[32], vperm;
46656 if (d->one_operand_p)
46657 return false;
46658 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
46659 && (TARGET_AVX512BW
46660 || GET_MODE_UNIT_SIZE (vmode) >= 4))
46662 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46664 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46666 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46668 else
46669 return false;
46671 /* This is a blend, not a permute. Elements must stay in their
46672 respective lanes. */
46673 for (i = 0; i < nelt; ++i)
46675 unsigned e = d->perm[i];
46676 if (!(e == i || e == i + nelt))
46677 return false;
46680 if (d->testing_p)
46681 return true;
46683 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
46684 decision should be extracted elsewhere, so that we only try that
46685 sequence once all budget==3 options have been tried. */
46686 target = d->target;
46687 op0 = d->op0;
46688 op1 = d->op1;
46689 mask = 0;
46691 switch (vmode)
46693 case V8DFmode:
46694 case V16SFmode:
46695 case V4DFmode:
46696 case V8SFmode:
46697 case V2DFmode:
46698 case V4SFmode:
46699 case V8HImode:
46700 case V8SImode:
46701 case V32HImode:
46702 case V64QImode:
46703 case V16SImode:
46704 case V8DImode:
46705 for (i = 0; i < nelt; ++i)
46706 mask |= (d->perm[i] >= nelt) << i;
46707 break;
46709 case V2DImode:
46710 for (i = 0; i < 2; ++i)
46711 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
46712 vmode = V8HImode;
46713 goto do_subreg;
46715 case V4SImode:
46716 for (i = 0; i < 4; ++i)
46717 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46718 vmode = V8HImode;
46719 goto do_subreg;
46721 case V16QImode:
46722 /* See if bytes move in pairs so we can use pblendw with
46723 an immediate argument, rather than pblendvb with a vector
46724 argument. */
46725 for (i = 0; i < 16; i += 2)
46726 if (d->perm[i] + 1 != d->perm[i + 1])
46728 use_pblendvb:
46729 for (i = 0; i < nelt; ++i)
46730 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
46732 finish_pblendvb:
46733 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
46734 vperm = force_reg (vmode, vperm);
46736 if (GET_MODE_SIZE (vmode) == 16)
46737 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
46738 else
46739 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
46740 if (target != d->target)
46741 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46742 return true;
46745 for (i = 0; i < 8; ++i)
46746 mask |= (d->perm[i * 2] >= 16) << i;
46747 vmode = V8HImode;
46748 /* FALLTHRU */
46750 do_subreg:
46751 target = gen_reg_rtx (vmode);
46752 op0 = gen_lowpart (vmode, op0);
46753 op1 = gen_lowpart (vmode, op1);
46754 break;
46756 case V32QImode:
46757 /* See if bytes move in pairs. If not, vpblendvb must be used. */
46758 for (i = 0; i < 32; i += 2)
46759 if (d->perm[i] + 1 != d->perm[i + 1])
46760 goto use_pblendvb;
46761 /* See if bytes move in quadruplets. If yes, vpblendd
46762 with immediate can be used. */
46763 for (i = 0; i < 32; i += 4)
46764 if (d->perm[i] + 2 != d->perm[i + 2])
46765 break;
46766 if (i < 32)
46768 /* See if bytes move the same in both lanes. If yes,
46769 vpblendw with immediate can be used. */
46770 for (i = 0; i < 16; i += 2)
46771 if (d->perm[i] + 16 != d->perm[i + 16])
46772 goto use_pblendvb;
46774 /* Use vpblendw. */
46775 for (i = 0; i < 16; ++i)
46776 mask |= (d->perm[i * 2] >= 32) << i;
46777 vmode = V16HImode;
46778 goto do_subreg;
46781 /* Use vpblendd. */
46782 for (i = 0; i < 8; ++i)
46783 mask |= (d->perm[i * 4] >= 32) << i;
46784 vmode = V8SImode;
46785 goto do_subreg;
46787 case V16HImode:
46788 /* See if words move in pairs. If yes, vpblendd can be used. */
46789 for (i = 0; i < 16; i += 2)
46790 if (d->perm[i] + 1 != d->perm[i + 1])
46791 break;
46792 if (i < 16)
46794 /* See if words move the same in both lanes. If not,
46795 vpblendvb must be used. */
46796 for (i = 0; i < 8; i++)
46797 if (d->perm[i] + 8 != d->perm[i + 8])
46799 /* Use vpblendvb. */
46800 for (i = 0; i < 32; ++i)
46801 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
46803 vmode = V32QImode;
46804 nelt = 32;
46805 target = gen_reg_rtx (vmode);
46806 op0 = gen_lowpart (vmode, op0);
46807 op1 = gen_lowpart (vmode, op1);
46808 goto finish_pblendvb;
46811 /* Use vpblendw. */
46812 for (i = 0; i < 16; ++i)
46813 mask |= (d->perm[i] >= 16) << i;
46814 break;
46817 /* Use vpblendd. */
46818 for (i = 0; i < 8; ++i)
46819 mask |= (d->perm[i * 2] >= 16) << i;
46820 vmode = V8SImode;
46821 goto do_subreg;
46823 case V4DImode:
46824 /* Use vpblendd. */
46825 for (i = 0; i < 4; ++i)
46826 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46827 vmode = V8SImode;
46828 goto do_subreg;
46830 default:
46831 gcc_unreachable ();
46834 switch (vmode)
46836 case V8DFmode:
46837 case V8DImode:
46838 mmode = QImode;
46839 break;
46840 case V16SFmode:
46841 case V16SImode:
46842 mmode = HImode;
46843 break;
46844 case V32HImode:
46845 mmode = SImode;
46846 break;
46847 case V64QImode:
46848 mmode = DImode;
46849 break;
46850 default:
46851 mmode = VOIDmode;
46854 if (mmode != VOIDmode)
46855 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46856 else
46857 maskop = GEN_INT (mask);
46859 /* This matches five different patterns with the different modes. */
46860 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46861 x = gen_rtx_SET (target, x);
46862 emit_insn (x);
46863 if (target != d->target)
46864 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46866 return true;
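/* A worked example of the mask construction above: for a V4SImode blend
   with perm = { 0, 5, 2, 7 } (elements 1 and 3 taken from op1), the
   V8HImode path sets mask = (3 << 2) | (3 << 6) = 0xcc, selecting one
   pair of 16-bit halves from op1 for every doubleword that comes from
   the second operand. */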
46869 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46870 in terms of the variable form of vpermilps.
46872 Note that we will have already failed the immediate input vpermilps,
46873 which requires that the high and low part shuffle be identical; the
46874 variable form doesn't require that. */
46876 static bool
46877 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46879 rtx rperm[8], vperm;
46880 unsigned i;
46882 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46883 return false;
46885 /* We can only permute within the 128-bit lane. */
46886 for (i = 0; i < 8; ++i)
46888 unsigned e = d->perm[i];
46889 if (i < 4 ? e >= 4 : e < 4)
46890 return false;
46893 if (d->testing_p)
46894 return true;
46896 for (i = 0; i < 8; ++i)
46898 unsigned e = d->perm[i];
46900 /* Within each 128-bit lane, the elements of op0 are numbered
46901 from 0 and the elements of op1 are numbered from 4. */
46902 if (e >= 8 + 4)
46903 e -= 8;
46904 else if (e >= 4)
46905 e -= 4;
46907 rperm[i] = GEN_INT (e);
46910 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46911 vperm = force_reg (V8SImode, vperm);
46912 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46914 return true;
46917 /* Return true if permutation D can be performed as VMODE permutation
46918 instead. */
46920 static bool
46921 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46923 unsigned int i, j, chunk;
46925 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46926 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46927 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46928 return false;
46930 if (GET_MODE_NUNITS (vmode) >= d->nelt)
46931 return true;
46933 chunk = d->nelt / GET_MODE_NUNITS (vmode);
46934 for (i = 0; i < d->nelt; i += chunk)
46935 if (d->perm[i] & (chunk - 1))
46936 return false;
46937 else
46938 for (j = 1; j < chunk; ++j)
46939 if (d->perm[i] + j != d->perm[i + j])
46940 return false;
46942 return true;
46945 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46946 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
46948 static bool
46949 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
46951 unsigned i, nelt, eltsz, mask;
46952 unsigned char perm[64];
46953 machine_mode vmode = V16QImode;
46954 rtx rperm[64], vperm, target, op0, op1;
46956 nelt = d->nelt;
46958 if (!d->one_operand_p)
46960 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
46962 if (TARGET_AVX2
46963 && valid_perm_using_mode_p (V2TImode, d))
46965 if (d->testing_p)
46966 return true;
46968 /* Use vperm2i128 insn. The pattern uses
46969 V4DImode instead of V2TImode. */
46970 target = d->target;
46971 if (d->vmode != V4DImode)
46972 target = gen_reg_rtx (V4DImode);
46973 op0 = gen_lowpart (V4DImode, d->op0);
46974 op1 = gen_lowpart (V4DImode, d->op1);
46975 rperm[0]
46976 = GEN_INT ((d->perm[0] / (nelt / 2))
46977 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
46978 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
46979 if (target != d->target)
46980 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46981 return true;
46983 return false;
46986 else
46988 if (GET_MODE_SIZE (d->vmode) == 16)
46990 if (!TARGET_SSSE3)
46991 return false;
46993 else if (GET_MODE_SIZE (d->vmode) == 32)
46995 if (!TARGET_AVX2)
46996 return false;
46998 /* V4DImode should be already handled through
46999 expand_vselect by vpermq instruction. */
47000 gcc_assert (d->vmode != V4DImode);
47002 vmode = V32QImode;
47003 if (d->vmode == V8SImode
47004 || d->vmode == V16HImode
47005 || d->vmode == V32QImode)
47007 /* First see if vpermq can be used for
47008 V8SImode/V16HImode/V32QImode. */
47009 if (valid_perm_using_mode_p (V4DImode, d))
47011 for (i = 0; i < 4; i++)
47012 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
47013 if (d->testing_p)
47014 return true;
47015 target = gen_reg_rtx (V4DImode);
47016 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
47017 perm, 4, false))
47019 emit_move_insn (d->target,
47020 gen_lowpart (d->vmode, target));
47021 return true;
47023 return false;
47026 /* Next see if vpermd can be used. */
47027 if (valid_perm_using_mode_p (V8SImode, d))
47028 vmode = V8SImode;
47030 /* Or if vpermps can be used. */
47031 else if (d->vmode == V8SFmode)
47032 vmode = V8SImode;
47034 if (vmode == V32QImode)
47036 /* vpshufb only works within 128-bit lanes; it cannot
47037 shuffle bytes across the lane boundary. */
47038 for (i = 0; i < nelt; ++i)
47039 if ((d->perm[i] ^ i) & (nelt / 2))
47040 return false;
47043 else if (GET_MODE_SIZE (d->vmode) == 64)
47045 if (!TARGET_AVX512BW)
47046 return false;
47048 /* If vpermq didn't work, vpshufb won't work either. */
47049 if (d->vmode == V8DFmode || d->vmode == V8DImode)
47050 return false;
47052 vmode = V64QImode;
47053 if (d->vmode == V16SImode
47054 || d->vmode == V32HImode
47055 || d->vmode == V64QImode)
47057 /* First see if vpermq can be used for
47058 V16SImode/V32HImode/V64QImode. */
47059 if (valid_perm_using_mode_p (V8DImode, d))
47061 for (i = 0; i < 8; i++)
47062 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
47063 if (d->testing_p)
47064 return true;
47065 target = gen_reg_rtx (V8DImode);
47066 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
47067 perm, 8, false))
47069 emit_move_insn (d->target,
47070 gen_lowpart (d->vmode, target));
47071 return true;
47073 return false;
47076 /* Next see if vpermd can be used. */
47077 if (valid_perm_using_mode_p (V16SImode, d))
47078 vmode = V16SImode;
47080 /* Or if vpermps can be used. */
47081 else if (d->vmode == V16SFmode)
47082 vmode = V16SImode;
47083 if (vmode == V64QImode)
47085 /* vpshufb only works within 128-bit lanes; it cannot
47086 shuffle bytes across the lane boundary. */
47087 for (i = 0; i < nelt; ++i)
47088 if ((d->perm[i] ^ i) & (nelt / 4))
47089 return false;
47092 else
47093 return false;
47096 if (d->testing_p)
47097 return true;
47099 if (vmode == V8SImode)
47100 for (i = 0; i < 8; ++i)
47101 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
47102 else if (vmode == V16SImode)
47103 for (i = 0; i < 16; ++i)
47104 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
47105 else
47107 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47108 if (!d->one_operand_p)
47109 mask = 2 * nelt - 1;
47110 else if (vmode == V16QImode)
47111 mask = nelt - 1;
47112 else if (vmode == V64QImode)
47113 mask = nelt / 4 - 1;
47114 else
47115 mask = nelt / 2 - 1;
47117 for (i = 0; i < nelt; ++i)
47119 unsigned j, e = d->perm[i] & mask;
47120 for (j = 0; j < eltsz; ++j)
47121 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
47125 vperm = gen_rtx_CONST_VECTOR (vmode,
47126 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
47127 vperm = force_reg (vmode, vperm);
47129 target = d->target;
47130 if (d->vmode != vmode)
47131 target = gen_reg_rtx (vmode);
47132 op0 = gen_lowpart (vmode, d->op0);
47133 if (d->one_operand_p)
47135 if (vmode == V16QImode)
47136 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
47137 else if (vmode == V32QImode)
47138 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
47139 else if (vmode == V64QImode)
47140 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
47141 else if (vmode == V8SFmode)
47142 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
47143 else if (vmode == V8SImode)
47144 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
47145 else if (vmode == V16SFmode)
47146 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
47147 else if (vmode == V16SImode)
47148 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
47149 else
47150 gcc_unreachable ();
47152 else
47154 op1 = gen_lowpart (vmode, d->op1);
47155 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
47157 if (target != d->target)
47158 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47160 return true;
47163 /* For V*[QHS]Imode permutations, check whether the same permutation
47164 can be performed in a 2x, 4x or 8x wider inner mode, and if so describe it in ND. */
47166 static bool
47167 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
47168 struct expand_vec_perm_d *nd)
47170 int i;
47171 machine_mode mode = VOIDmode;
47173 switch (d->vmode)
47175 case V16QImode: mode = V8HImode; break;
47176 case V32QImode: mode = V16HImode; break;
47177 case V64QImode: mode = V32HImode; break;
47178 case V8HImode: mode = V4SImode; break;
47179 case V16HImode: mode = V8SImode; break;
47180 case V32HImode: mode = V16SImode; break;
47181 case V4SImode: mode = V2DImode; break;
47182 case V8SImode: mode = V4DImode; break;
47183 case V16SImode: mode = V8DImode; break;
47184 default: return false;
47186 for (i = 0; i < d->nelt; i += 2)
47187 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
47188 return false;
47189 nd->vmode = mode;
47190 nd->nelt = d->nelt / 2;
47191 for (i = 0; i < nd->nelt; i++)
47192 nd->perm[i] = d->perm[2 * i] / 2;
47193 if (GET_MODE_INNER (mode) != DImode)
47194 canonicalize_vector_int_perm (nd, nd);
47195 if (nd != d)
47197 nd->one_operand_p = d->one_operand_p;
47198 nd->testing_p = d->testing_p;
47199 if (d->op0 == d->op1)
47200 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
47201 else
47203 nd->op0 = gen_lowpart (nd->vmode, d->op0);
47204 nd->op1 = gen_lowpart (nd->vmode, d->op1);
47206 if (d->testing_p)
47207 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
47208 else
47209 nd->target = gen_reg_rtx (nd->vmode);
47211 return true;
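/* For example, a V16QImode permutation beginning { 2, 3, 0, 1, 6, 7, 4, 5, ... }
   moves bytes in aligned pairs, so it is rewritten here as the V8HImode
   permutation { 1, 0, 3, 2, ... } and possibly widened again by the
   recursive call above. */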
47214 /* Try to expand one-operand permutation with constant mask. */
47216 static bool
47217 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
47219 machine_mode mode = GET_MODE (d->op0);
47220 machine_mode maskmode = mode;
47221 rtx (*gen) (rtx, rtx, rtx) = NULL;
47222 rtx target, op0, mask;
47223 rtx vec[64];
47225 if (!rtx_equal_p (d->op0, d->op1))
47226 return false;
47228 if (!TARGET_AVX512F)
47229 return false;
47231 switch (mode)
47233 case V16SImode:
47234 gen = gen_avx512f_permvarv16si;
47235 break;
47236 case V16SFmode:
47237 gen = gen_avx512f_permvarv16sf;
47238 maskmode = V16SImode;
47239 break;
47240 case V8DImode:
47241 gen = gen_avx512f_permvarv8di;
47242 break;
47243 case V8DFmode:
47244 gen = gen_avx512f_permvarv8df;
47245 maskmode = V8DImode;
47246 break;
47247 default:
47248 return false;
47251 target = d->target;
47252 op0 = d->op0;
47253 for (int i = 0; i < d->nelt; ++i)
47254 vec[i] = GEN_INT (d->perm[i]);
47255 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
47256 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
47257 return true;
47260 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
47261 in a single instruction. */
47263 static bool
47264 expand_vec_perm_1 (struct expand_vec_perm_d *d)
47266 unsigned i, nelt = d->nelt;
47267 struct expand_vec_perm_d nd;
47269 /* Check plain VEC_SELECT first, because AVX has instructions that could
47270 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
47271 input where SEL+CONCAT may not. */
47272 if (d->one_operand_p)
47274 int mask = nelt - 1;
47275 bool identity_perm = true;
47276 bool broadcast_perm = true;
47278 for (i = 0; i < nelt; i++)
47280 nd.perm[i] = d->perm[i] & mask;
47281 if (nd.perm[i] != i)
47282 identity_perm = false;
47283 if (nd.perm[i])
47284 broadcast_perm = false;
47287 if (identity_perm)
47289 if (!d->testing_p)
47290 emit_move_insn (d->target, d->op0);
47291 return true;
47293 else if (broadcast_perm && TARGET_AVX2)
47295 /* Use vpbroadcast{b,w,d}. */
47296 rtx (*gen) (rtx, rtx) = NULL;
47297 switch (d->vmode)
47299 case V64QImode:
47300 if (TARGET_AVX512BW)
47301 gen = gen_avx512bw_vec_dupv64qi_1;
47302 break;
47303 case V32QImode:
47304 gen = gen_avx2_pbroadcastv32qi_1;
47305 break;
47306 case V32HImode:
47307 if (TARGET_AVX512BW)
47308 gen = gen_avx512bw_vec_dupv32hi_1;
47309 break;
47310 case V16HImode:
47311 gen = gen_avx2_pbroadcastv16hi_1;
47312 break;
47313 case V16SImode:
47314 if (TARGET_AVX512F)
47315 gen = gen_avx512f_vec_dupv16si_1;
47316 break;
47317 case V8SImode:
47318 gen = gen_avx2_pbroadcastv8si_1;
47319 break;
47320 case V16QImode:
47321 gen = gen_avx2_pbroadcastv16qi;
47322 break;
47323 case V8HImode:
47324 gen = gen_avx2_pbroadcastv8hi;
47325 break;
47326 case V16SFmode:
47327 if (TARGET_AVX512F)
47328 gen = gen_avx512f_vec_dupv16sf_1;
47329 break;
47330 case V8SFmode:
47331 gen = gen_avx2_vec_dupv8sf_1;
47332 break;
47333 case V8DFmode:
47334 if (TARGET_AVX512F)
47335 gen = gen_avx512f_vec_dupv8df_1;
47336 break;
47337 case V8DImode:
47338 if (TARGET_AVX512F)
47339 gen = gen_avx512f_vec_dupv8di_1;
47340 break;
47341 /* For other modes prefer other shuffles this function creates. */
47342 default: break;
47344 if (gen != NULL)
47346 if (!d->testing_p)
47347 emit_insn (gen (d->target, d->op0));
47348 return true;
47352 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
47353 return true;
47355 /* There are plenty of patterns in sse.md that are written for
47356 SEL+CONCAT and are not replicated for a single op. Perhaps
47357 that should be changed, to avoid the nastiness here. */
47359 /* Recognize interleave style patterns, which means incrementing
47360 every other permutation operand. */
47361 for (i = 0; i < nelt; i += 2)
47363 nd.perm[i] = d->perm[i] & mask;
47364 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
47366 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47367 d->testing_p))
47368 return true;
47370 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
47371 if (nelt >= 4)
47373 for (i = 0; i < nelt; i += 4)
47375 nd.perm[i + 0] = d->perm[i + 0] & mask;
47376 nd.perm[i + 1] = d->perm[i + 1] & mask;
47377 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
47378 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
47381 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47382 d->testing_p))
47383 return true;
47387 /* Finally, try the fully general two operand permute. */
47388 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
47389 d->testing_p))
47390 return true;
47392 /* Recognize interleave style patterns with reversed operands. */
47393 if (!d->one_operand_p)
47395 for (i = 0; i < nelt; ++i)
47397 unsigned e = d->perm[i];
47398 if (e >= nelt)
47399 e -= nelt;
47400 else
47401 e += nelt;
47402 nd.perm[i] = e;
47405 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
47406 d->testing_p))
47407 return true;
47410 /* Try the SSE4.1 blend variable merge instructions. */
47411 if (expand_vec_perm_blend (d))
47412 return true;
47414 /* Try one of the AVX vpermil variable permutations. */
47415 if (expand_vec_perm_vpermil (d))
47416 return true;
47418 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
47419 vpshufb, vpermd, vpermps or vpermq variable permutation. */
47420 if (expand_vec_perm_pshufb (d))
47421 return true;
47423 /* Try the AVX2 vpalignr instruction. */
47424 if (expand_vec_perm_palignr (d, true))
47425 return true;
47427 /* Try the AVX512F vperm{s,d} instructions. */
47428 if (ix86_expand_vec_one_operand_perm_avx512 (d))
47429 return true;
47431 /* Try the AVX512F vpermi2 instructions. */
47432 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
47433 return true;
47435 /* See if we can get the same permutation in a different vector integer
47436 mode. */
47437 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47439 if (!d->testing_p)
47440 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47441 return true;
47443 return false;
47446 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
47447 in terms of a pair of pshuflw + pshufhw instructions. */
47449 static bool
47450 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
47452 unsigned char perm2[MAX_VECT_LEN];
47453 unsigned i;
47454 bool ok;
47456 if (d->vmode != V8HImode || !d->one_operand_p)
47457 return false;
47459 /* The two permutations only operate in 64-bit lanes. */
47460 for (i = 0; i < 4; ++i)
47461 if (d->perm[i] >= 4)
47462 return false;
47463 for (i = 4; i < 8; ++i)
47464 if (d->perm[i] < 4)
47465 return false;
47467 if (d->testing_p)
47468 return true;
47470 /* Emit the pshuflw. */
47471 memcpy (perm2, d->perm, 4);
47472 for (i = 4; i < 8; ++i)
47473 perm2[i] = i;
47474 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
47475 gcc_assert (ok);
47477 /* Emit the pshufhw. */
47478 memcpy (perm2 + 4, d->perm + 4, 4);
47479 for (i = 0; i < 4; ++i)
47480 perm2[i] = i;
47481 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
47482 gcc_assert (ok);
47484 return true;
47487 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47488 the permutation using the SSSE3 palignr instruction. This succeeds
47489 when all of the elements in PERM fit within one vector and we merely
47490 need to shift them down so that a single vector permutation has a
47491 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
47492 the vpalignr instruction itself can perform the requested permutation. */
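/* Illustrative example: with two V8HImode operands and
   perm = { 2 3 4 5 6 7 8 9 }, min is 2 and max is 9, so the
   concatenation is shifted down by 2 elements (min * 16 = 32 bits)
   and the remaining permutation is the identity, i.e. the palignr
   alone suffices.  */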
47494 static bool
47495 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
47497 unsigned i, nelt = d->nelt;
47498 unsigned min, max, minswap, maxswap;
47499 bool in_order, ok, swap = false;
47500 rtx shift, target;
47501 struct expand_vec_perm_d dcopy;
47503 /* Even with AVX, palignr operates only on 128-bit vectors;
47504 with AVX2, palignr operates on both 128-bit lanes. */
47505 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47506 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
47507 return false;
47509 min = 2 * nelt;
47510 max = 0;
47511 minswap = 2 * nelt;
47512 maxswap = 0;
47513 for (i = 0; i < nelt; ++i)
47515 unsigned e = d->perm[i];
47516 unsigned eswap = d->perm[i] ^ nelt;
47517 if (GET_MODE_SIZE (d->vmode) == 32)
47519 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
47520 eswap = e ^ (nelt / 2);
47522 if (e < min)
47523 min = e;
47524 if (e > max)
47525 max = e;
47526 if (eswap < minswap)
47527 minswap = eswap;
47528 if (eswap > maxswap)
47529 maxswap = eswap;
47531 if (min == 0
47532 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
47534 if (d->one_operand_p
47535 || minswap == 0
47536 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
47537 ? nelt / 2 : nelt))
47538 return false;
47539 swap = true;
47540 min = minswap;
47541 max = maxswap;
47544 /* Given that we have SSSE3, we know we'll be able to implement the
47545 single operand permutation after the palignr with pshufb for
47546 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
47547 first. */
47548 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
47549 return true;
47551 dcopy = *d;
47552 if (swap)
47554 dcopy.op0 = d->op1;
47555 dcopy.op1 = d->op0;
47556 for (i = 0; i < nelt; ++i)
47557 dcopy.perm[i] ^= nelt;
47560 in_order = true;
47561 for (i = 0; i < nelt; ++i)
47563 unsigned e = dcopy.perm[i];
47564 if (GET_MODE_SIZE (d->vmode) == 32
47565 && e >= nelt
47566 && (e & (nelt / 2 - 1)) < min)
47567 e = e - min - (nelt / 2);
47568 else
47569 e = e - min;
47570 if (e != i)
47571 in_order = false;
47572 dcopy.perm[i] = e;
47574 dcopy.one_operand_p = true;
47576 if (single_insn_only_p && !in_order)
47577 return false;
47579 /* For AVX2, test whether we can permute the result in one instruction. */
47580 if (d->testing_p)
47582 if (in_order)
47583 return true;
47584 dcopy.op1 = dcopy.op0;
47585 return expand_vec_perm_1 (&dcopy);
47588 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
47589 if (GET_MODE_SIZE (d->vmode) == 16)
47591 target = gen_reg_rtx (TImode);
47592 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
47593 gen_lowpart (TImode, dcopy.op0), shift));
47595 else
47597 target = gen_reg_rtx (V2TImode);
47598 emit_insn (gen_avx2_palignrv2ti (target,
47599 gen_lowpart (V2TImode, dcopy.op1),
47600 gen_lowpart (V2TImode, dcopy.op0),
47601 shift));
47604 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
47606 /* Test for the degenerate case where the alignment by itself
47607 produces the desired permutation. */
47608 if (in_order)
47610 emit_move_insn (d->target, dcopy.op0);
47611 return true;
47614 ok = expand_vec_perm_1 (&dcopy);
47615 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
47617 return ok;
47620 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47621 the permutation using the SSE4_1 pblendv instruction. Potentially
47622 reduces the permutation from 2 pshufb and an or to 1 pshufb and a pblendv. */
47624 static bool
47625 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
47627 unsigned i, which, nelt = d->nelt;
47628 struct expand_vec_perm_d dcopy, dcopy1;
47629 machine_mode vmode = d->vmode;
47630 bool ok;
47632 /* Use the same checks as in expand_vec_perm_blend. */
47633 if (d->one_operand_p)
47634 return false;
47635 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47637 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47639 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47641 else
47642 return false;
47644 /* Figure out which permutation elements are not in their
47645 respective lanes. */
47646 for (i = 0, which = 0; i < nelt; ++i)
47648 unsigned e = d->perm[i];
47649 if (e != i)
47650 which |= (e < nelt ? 1 : 2);
47652 /* We can pblend the part where elements are not in their
47653 respective lanes only when these elements all come from one
47654 half of the permutation.
47655 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not at their respective
47656 lanes, but both 8 and 9 >= 8;
47657 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not at their
47658 respective lanes and 8 >= 8, but 2 is not. */
47659 if (which != 1 && which != 2)
47660 return false;
47661 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
47662 return true;
47664 /* First we apply a one-operand permutation to the part whose
47665 elements are not in their respective lanes. */
47666 dcopy = *d;
47667 if (which == 2)
47668 dcopy.op0 = dcopy.op1 = d->op1;
47669 else
47670 dcopy.op0 = dcopy.op1 = d->op0;
47671 if (!d->testing_p)
47672 dcopy.target = gen_reg_rtx (vmode);
47673 dcopy.one_operand_p = true;
47675 for (i = 0; i < nelt; ++i)
47676 dcopy.perm[i] = d->perm[i] & (nelt - 1);
47678 ok = expand_vec_perm_1 (&dcopy);
47679 if (GET_MODE_SIZE (vmode) != 16 && !ok)
47680 return false;
47681 else
47682 gcc_assert (ok);
47683 if (d->testing_p)
47684 return true;
47686 /* Next we put permuted elements into their positions. */
47687 dcopy1 = *d;
47688 if (which == 2)
47689 dcopy1.op1 = dcopy.target;
47690 else
47691 dcopy1.op0 = dcopy.target;
47693 for (i = 0; i < nelt; ++i)
47694 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
47696 ok = expand_vec_perm_blend (&dcopy1);
47697 gcc_assert (ok);
47699 return true;
47702 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
47704 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47705 a two vector permutation into a single vector permutation by using
47706 an interleave operation to merge the vectors. */
47708 static bool
47709 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
47711 struct expand_vec_perm_d dremap, dfinal;
47712 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
47713 unsigned HOST_WIDE_INT contents;
47714 unsigned char remap[2 * MAX_VECT_LEN];
47715 rtx_insn *seq;
47716 bool ok, same_halves = false;
47718 if (GET_MODE_SIZE (d->vmode) == 16)
47720 if (d->one_operand_p)
47721 return false;
47723 else if (GET_MODE_SIZE (d->vmode) == 32)
47725 if (!TARGET_AVX)
47726 return false;
47727 /* For 32-byte modes allow this even when d->one_operand_p.
47728 The lack of cross-lane shuffling in some instructions
47729 might prevent a single insn shuffle. */
47730 dfinal = *d;
47731 dfinal.testing_p = true;
47732 /* If expand_vec_perm_interleave3 can expand this into
47733 a 3-insn sequence, give up and let it be expanded as
47734 a 3-insn sequence. While that is one insn longer,
47735 it doesn't need a memory operand, and in the common
47736 case where the interleave-low and interleave-high permutations
47737 with the same operands are adjacent it needs only 4 insns
47738 for both after CSE. */
47739 if (expand_vec_perm_interleave3 (&dfinal))
47740 return false;
47742 else
47743 return false;
47745 /* Examine from whence the elements come. */
47746 contents = 0;
47747 for (i = 0; i < nelt; ++i)
47748 contents |= HOST_WIDE_INT_1U << d->perm[i];
47750 memset (remap, 0xff, sizeof (remap));
47751 dremap = *d;
47753 if (GET_MODE_SIZE (d->vmode) == 16)
47755 unsigned HOST_WIDE_INT h1, h2, h3, h4;
47757 /* Split the two input vectors into 4 halves. */
47758 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47759 h2 = h1 << nelt2;
47760 h3 = h2 << nelt2;
47761 h4 = h3 << nelt2;
47763 /* If the elements come from the low halves, use interleave low; similarly
47764 for interleave high. If the elements are from mis-matched halves, we
47765 can use shufps for V4SF/V4SI or do a DImode shuffle. */
47766 if ((contents & (h1 | h3)) == contents)
47768 /* punpckl* */
47769 for (i = 0; i < nelt2; ++i)
47771 remap[i] = i * 2;
47772 remap[i + nelt] = i * 2 + 1;
47773 dremap.perm[i * 2] = i;
47774 dremap.perm[i * 2 + 1] = i + nelt;
47776 if (!TARGET_SSE2 && d->vmode == V4SImode)
47777 dremap.vmode = V4SFmode;
47779 else if ((contents & (h2 | h4)) == contents)
47781 /* punpckh* */
47782 for (i = 0; i < nelt2; ++i)
47784 remap[i + nelt2] = i * 2;
47785 remap[i + nelt + nelt2] = i * 2 + 1;
47786 dremap.perm[i * 2] = i + nelt2;
47787 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
47789 if (!TARGET_SSE2 && d->vmode == V4SImode)
47790 dremap.vmode = V4SFmode;
47792 else if ((contents & (h1 | h4)) == contents)
47794 /* shufps */
47795 for (i = 0; i < nelt2; ++i)
47797 remap[i] = i;
47798 remap[i + nelt + nelt2] = i + nelt2;
47799 dremap.perm[i] = i;
47800 dremap.perm[i + nelt2] = i + nelt + nelt2;
47802 if (nelt != 4)
47804 /* shufpd */
47805 dremap.vmode = V2DImode;
47806 dremap.nelt = 2;
47807 dremap.perm[0] = 0;
47808 dremap.perm[1] = 3;
47811 else if ((contents & (h2 | h3)) == contents)
47813 /* shufps */
47814 for (i = 0; i < nelt2; ++i)
47816 remap[i + nelt2] = i;
47817 remap[i + nelt] = i + nelt2;
47818 dremap.perm[i] = i + nelt2;
47819 dremap.perm[i + nelt2] = i + nelt;
47821 if (nelt != 4)
47823 /* shufpd */
47824 dremap.vmode = V2DImode;
47825 dremap.nelt = 2;
47826 dremap.perm[0] = 1;
47827 dremap.perm[1] = 2;
47830 else
47831 return false;
47833 else
47835 unsigned int nelt4 = nelt / 4, nzcnt = 0;
47836 unsigned HOST_WIDE_INT q[8];
47837 unsigned int nonzero_halves[4];
47839 /* Split the two input vectors into 8 quarters. */
47840 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47841 for (i = 1; i < 8; ++i)
47842 q[i] = q[0] << (nelt4 * i);
47843 for (i = 0; i < 4; ++i)
47844 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47846 nonzero_halves[nzcnt] = i;
47847 ++nzcnt;
47850 if (nzcnt == 1)
47852 gcc_assert (d->one_operand_p);
47853 nonzero_halves[1] = nonzero_halves[0];
47854 same_halves = true;
47856 else if (d->one_operand_p)
47858 gcc_assert (nonzero_halves[0] == 0);
47859 gcc_assert (nonzero_halves[1] == 1);
47862 if (nzcnt <= 2)
47864 if (d->perm[0] / nelt2 == nonzero_halves[1])
47866 /* Attempt to increase the likelihood that dfinal
47867 shuffle will be intra-lane. */
47868 std::swap (nonzero_halves[0], nonzero_halves[1]);
47871 /* vperm2f128 or vperm2i128. */
47872 for (i = 0; i < nelt2; ++i)
47874 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47875 remap[i + nonzero_halves[0] * nelt2] = i;
47876 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47877 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47880 if (d->vmode != V8SFmode
47881 && d->vmode != V4DFmode
47882 && d->vmode != V8SImode)
47884 dremap.vmode = V8SImode;
47885 dremap.nelt = 8;
47886 for (i = 0; i < 4; ++i)
47888 dremap.perm[i] = i + nonzero_halves[0] * 4;
47889 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47893 else if (d->one_operand_p)
47894 return false;
47895 else if (TARGET_AVX2
47896 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47898 /* vpunpckl* */
47899 for (i = 0; i < nelt4; ++i)
47901 remap[i] = i * 2;
47902 remap[i + nelt] = i * 2 + 1;
47903 remap[i + nelt2] = i * 2 + nelt2;
47904 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47905 dremap.perm[i * 2] = i;
47906 dremap.perm[i * 2 + 1] = i + nelt;
47907 dremap.perm[i * 2 + nelt2] = i + nelt2;
47908 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47911 else if (TARGET_AVX2
47912 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47914 /* vpunpckh* */
47915 for (i = 0; i < nelt4; ++i)
47917 remap[i + nelt4] = i * 2;
47918 remap[i + nelt + nelt4] = i * 2 + 1;
47919 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47920 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47921 dremap.perm[i * 2] = i + nelt4;
47922 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47923 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47924 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47927 else
47928 return false;
47931 /* Use the remapping array set up above to move the elements from their
47932 swizzled locations into their final destinations. */
47933 dfinal = *d;
47934 for (i = 0; i < nelt; ++i)
47936 unsigned e = remap[d->perm[i]];
47937 gcc_assert (e < nelt);
47938 /* If same_halves is true, both halves of the remapped vector are the
47939 same. Avoid cross-lane accesses if possible. */
47940 if (same_halves && i >= nelt2)
47942 gcc_assert (e < nelt2);
47943 dfinal.perm[i] = e + nelt2;
47945 else
47946 dfinal.perm[i] = e;
47948 if (!d->testing_p)
47950 dremap.target = gen_reg_rtx (dremap.vmode);
47951 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47953 dfinal.op1 = dfinal.op0;
47954 dfinal.one_operand_p = true;
47956 /* Test if the final remap can be done with a single insn. For V4SFmode or
47957 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
47958 start_sequence ();
47959 ok = expand_vec_perm_1 (&dfinal);
47960 seq = get_insns ();
47961 end_sequence ();
47963 if (!ok)
47964 return false;
47966 if (d->testing_p)
47967 return true;
47969 if (dremap.vmode != dfinal.vmode)
47971 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
47972 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
47975 ok = expand_vec_perm_1 (&dremap);
47976 gcc_assert (ok);
47978 emit_insn (seq);
47979 return true;
47982 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47983 a single vector cross-lane permutation into vpermq followed
47984 by any of the single insn permutations. */
47986 static bool
47987 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
47989 struct expand_vec_perm_d dremap, dfinal;
47990 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
47991 unsigned contents[2];
47992 bool ok;
47994 if (!(TARGET_AVX2
47995 && (d->vmode == V32QImode || d->vmode == V16HImode)
47996 && d->one_operand_p))
47997 return false;
47999 contents[0] = 0;
48000 contents[1] = 0;
48001 for (i = 0; i < nelt2; ++i)
48003 contents[0] |= 1u << (d->perm[i] / nelt4);
48004 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
48007 for (i = 0; i < 2; ++i)
48009 unsigned int cnt = 0;
48010 for (j = 0; j < 4; ++j)
48011 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
48012 return false;
48015 if (d->testing_p)
48016 return true;
48018 dremap = *d;
48019 dremap.vmode = V4DImode;
48020 dremap.nelt = 4;
48021 dremap.target = gen_reg_rtx (V4DImode);
48022 dremap.op0 = gen_lowpart (V4DImode, d->op0);
48023 dremap.op1 = dremap.op0;
48024 dremap.one_operand_p = true;
48025 for (i = 0; i < 2; ++i)
48027 unsigned int cnt = 0;
48028 for (j = 0; j < 4; ++j)
48029 if ((contents[i] & (1u << j)) != 0)
48030 dremap.perm[2 * i + cnt++] = j;
48031 for (; cnt < 2; ++cnt)
48032 dremap.perm[2 * i + cnt] = 0;
48035 dfinal = *d;
48036 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48037 dfinal.op1 = dfinal.op0;
48038 dfinal.one_operand_p = true;
48039 for (i = 0, j = 0; i < nelt; ++i)
48041 if (i == nelt2)
48042 j = 2;
48043 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
48044 if ((d->perm[i] / nelt4) == dremap.perm[j])
48046 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
48047 dfinal.perm[i] |= nelt4;
48048 else
48049 gcc_unreachable ();
48052 ok = expand_vec_perm_1 (&dremap);
48053 gcc_assert (ok);
48055 ok = expand_vec_perm_1 (&dfinal);
48056 gcc_assert (ok);
48058 return true;
48061 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
48062 a vector permutation using two instructions, vperm2f128 or
48063 vperm2i128 followed by any single in-lane permutation. */
48065 static bool
48066 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
48068 struct expand_vec_perm_d dfirst, dsecond;
48069 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
48070 bool ok;
48072 if (!TARGET_AVX
48073 || GET_MODE_SIZE (d->vmode) != 32
48074 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
48075 return false;
48077 dsecond = *d;
48078 dsecond.one_operand_p = false;
48079 dsecond.testing_p = true;
48081 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
48082 immediate. For perm < 16 the second permutation uses
48083 d->op0 as first operand, for perm >= 16 it uses d->op1
48084 as first operand. The second operand is the result of
48085 vperm2[fi]128. */
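/* Illustrative example: for perm = 6 the immediate is
   ((6 << 2) | 6) & 0x33 = 0x12, i.e. the vperm2[fi]128 result takes
   its low lane from the low lane of d->op1 and its high lane from
   the high lane of d->op0.  */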
48086 for (perm = 0; perm < 32; perm++)
48088 /* Ignore permutations which do not move anything cross-lane. */
48089 if (perm < 16)
48091 /* The second shuffle for e.g. V4DFmode has
48092 0123 and ABCD operands.
48093 Ignore AB23, as 23 is already in the second lane
48094 of the first operand. */
48095 if ((perm & 0xc) == (1 << 2)) continue;
48096 /* And 01CD, as 01 is in the first lane of the first
48097 operand. */
48098 if ((perm & 3) == 0) continue;
48099 /* And 4567, as then the vperm2[fi]128 doesn't change
48100 anything on the original 4567 second operand. */
48101 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
48103 else
48105 /* The second shuffle for e.g. V4DFmode has
48106 4567 and ABCD operands.
48107 Ignore AB67, as 67 is already in the second lane
48108 of the first operand. */
48109 if ((perm & 0xc) == (3 << 2)) continue;
48110 /* And 45CD, as 45 is in the first lane of the first
48111 operand. */
48112 if ((perm & 3) == 2) continue;
48113 /* And 0123, as then the vperm2[fi]128 doesn't change
48114 anything on the original 0123 first operand. */
48115 if ((perm & 0xf) == (1 << 2)) continue;
48118 for (i = 0; i < nelt; i++)
48120 j = d->perm[i] / nelt2;
48121 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
48122 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
48123 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
48124 dsecond.perm[i] = d->perm[i] & (nelt - 1);
48125 else
48126 break;
48129 if (i == nelt)
48131 start_sequence ();
48132 ok = expand_vec_perm_1 (&dsecond);
48133 end_sequence ();
48135 else
48136 ok = false;
48138 if (ok)
48140 if (d->testing_p)
48141 return true;
48143 /* Found a usable second shuffle. dfirst will be
48144 vperm2f128 on d->op0 and d->op1. */
48145 dsecond.testing_p = false;
48146 dfirst = *d;
48147 dfirst.target = gen_reg_rtx (d->vmode);
48148 for (i = 0; i < nelt; i++)
48149 dfirst.perm[i] = (i & (nelt2 - 1))
48150 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
48152 canonicalize_perm (&dfirst);
48153 ok = expand_vec_perm_1 (&dfirst);
48154 gcc_assert (ok);
48156 /* And dsecond is some single insn shuffle, taking
48157 d->op0 and result of vperm2f128 (if perm < 16) or
48158 d->op1 and result of vperm2f128 (otherwise). */
48159 if (perm >= 16)
48160 dsecond.op0 = dsecond.op1;
48161 dsecond.op1 = dfirst.target;
48163 ok = expand_vec_perm_1 (&dsecond);
48164 gcc_assert (ok);
48166 return true;
48169 /* For one operand, the only useful vperm2f128 permutation is 0x01
48170 aka lanes swap. */
48171 if (d->one_operand_p)
48172 return false;
48175 return false;
48178 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
48179 a two vector permutation using 2 intra-lane interleave insns
48180 and cross-lane shuffle for 32-byte vectors. */
48182 static bool
48183 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
48185 unsigned i, nelt;
48186 rtx (*gen) (rtx, rtx, rtx);
48188 if (d->one_operand_p)
48189 return false;
48190 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
48192 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
48194 else
48195 return false;
48197 nelt = d->nelt;
48198 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
48199 return false;
48200 for (i = 0; i < nelt; i += 2)
48201 if (d->perm[i] != d->perm[0] + i / 2
48202 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
48203 return false;
48205 if (d->testing_p)
48206 return true;
48208 switch (d->vmode)
48210 case V32QImode:
48211 if (d->perm[0])
48212 gen = gen_vec_interleave_highv32qi;
48213 else
48214 gen = gen_vec_interleave_lowv32qi;
48215 break;
48216 case V16HImode:
48217 if (d->perm[0])
48218 gen = gen_vec_interleave_highv16hi;
48219 else
48220 gen = gen_vec_interleave_lowv16hi;
48221 break;
48222 case V8SImode:
48223 if (d->perm[0])
48224 gen = gen_vec_interleave_highv8si;
48225 else
48226 gen = gen_vec_interleave_lowv8si;
48227 break;
48228 case V4DImode:
48229 if (d->perm[0])
48230 gen = gen_vec_interleave_highv4di;
48231 else
48232 gen = gen_vec_interleave_lowv4di;
48233 break;
48234 case V8SFmode:
48235 if (d->perm[0])
48236 gen = gen_vec_interleave_highv8sf;
48237 else
48238 gen = gen_vec_interleave_lowv8sf;
48239 break;
48240 case V4DFmode:
48241 if (d->perm[0])
48242 gen = gen_vec_interleave_highv4df;
48243 else
48244 gen = gen_vec_interleave_lowv4df;
48245 break;
48246 default:
48247 gcc_unreachable ();
48250 emit_insn (gen (d->target, d->op0, d->op1));
48251 return true;
48254 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
48255 a single vector permutation using a single intra-lane vector
48256 permutation, vperm2f128 swapping the lanes and vblend* insn blending
48257 the non-swapped and swapped vectors together. */
48259 static bool
48260 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
48262 struct expand_vec_perm_d dfirst, dsecond;
48263 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
48264 rtx_insn *seq;
48265 bool ok;
48266 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
48268 if (!TARGET_AVX
48269 || TARGET_AVX2
48270 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
48271 || !d->one_operand_p)
48272 return false;
48274 dfirst = *d;
48275 for (i = 0; i < nelt; i++)
48276 dfirst.perm[i] = 0xff;
48277 for (i = 0, msk = 0; i < nelt; i++)
48279 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
48280 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
48281 return false;
48282 dfirst.perm[j] = d->perm[i];
48283 if (j != i)
48284 msk |= (1 << i);
48286 for (i = 0; i < nelt; i++)
48287 if (dfirst.perm[i] == 0xff)
48288 dfirst.perm[i] = i;
48290 if (!d->testing_p)
48291 dfirst.target = gen_reg_rtx (dfirst.vmode);
48293 start_sequence ();
48294 ok = expand_vec_perm_1 (&dfirst);
48295 seq = get_insns ();
48296 end_sequence ();
48298 if (!ok)
48299 return false;
48301 if (d->testing_p)
48302 return true;
48304 emit_insn (seq);
48306 dsecond = *d;
48307 dsecond.op0 = dfirst.target;
48308 dsecond.op1 = dfirst.target;
48309 dsecond.one_operand_p = true;
48310 dsecond.target = gen_reg_rtx (dsecond.vmode);
48311 for (i = 0; i < nelt; i++)
48312 dsecond.perm[i] = i ^ nelt2;
48314 ok = expand_vec_perm_1 (&dsecond);
48315 gcc_assert (ok);
48317 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
48318 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
48319 return true;
48322 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
48323 permutation using two vperm2f128, followed by a vshufpd insn blending
48324 the two vectors together. */
48326 static bool
48327 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
48329 struct expand_vec_perm_d dfirst, dsecond, dthird;
48330 bool ok;
48332 if (!TARGET_AVX || (d->vmode != V4DFmode))
48333 return false;
48335 if (d->testing_p)
48336 return true;
48338 dfirst = *d;
48339 dsecond = *d;
48340 dthird = *d;
48342 dfirst.perm[0] = (d->perm[0] & ~1);
48343 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
48344 dfirst.perm[2] = (d->perm[2] & ~1);
48345 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
48346 dsecond.perm[0] = (d->perm[1] & ~1);
48347 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
48348 dsecond.perm[2] = (d->perm[3] & ~1);
48349 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
48350 dthird.perm[0] = (d->perm[0] % 2);
48351 dthird.perm[1] = (d->perm[1] % 2) + 4;
48352 dthird.perm[2] = (d->perm[2] % 2) + 2;
48353 dthird.perm[3] = (d->perm[3] % 2) + 6;
48355 dfirst.target = gen_reg_rtx (dfirst.vmode);
48356 dsecond.target = gen_reg_rtx (dsecond.vmode);
48357 dthird.op0 = dfirst.target;
48358 dthird.op1 = dsecond.target;
48359 dthird.one_operand_p = false;
48361 canonicalize_perm (&dfirst);
48362 canonicalize_perm (&dsecond);
48364 ok = expand_vec_perm_1 (&dfirst)
48365 && expand_vec_perm_1 (&dsecond)
48366 && expand_vec_perm_1 (&dthird);
48368 gcc_assert (ok);
48370 return true;
48373 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
48374 permutation with two pshufb insns and an ior. We should have already
48375 failed all two instruction sequences. */
48377 static bool
48378 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
48380 rtx rperm[2][16], vperm, l, h, op, m128;
48381 unsigned int i, nelt, eltsz;
48383 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
48384 return false;
48385 gcc_assert (!d->one_operand_p);
48387 if (d->testing_p)
48388 return true;
48390 nelt = d->nelt;
48391 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48393 /* Generate two permutation masks. If the required element is within
48394 the given vector it is shuffled into the proper lane. If the required
48395 element is in the other vector, force a zero into the lane by setting
48396 bit 7 in the permutation mask. */
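/* Illustrative example: interleaving two V16QImode operands with
   perm = { 0 16 1 17 ... } would give rperm[0] = { 0 -128 1 -128 ... }
   and rperm[1] = { -128 0 -128 1 ... }; each pshufb zeroes the bytes
   owned by the other operand and the final ior merges the results.  */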
48397 m128 = GEN_INT (-128);
48398 for (i = 0; i < nelt; ++i)
48400 unsigned j, e = d->perm[i];
48401 unsigned which = (e >= nelt);
48402 if (e >= nelt)
48403 e -= nelt;
48405 for (j = 0; j < eltsz; ++j)
48407 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
48408 rperm[1-which][i*eltsz + j] = m128;
48412 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
48413 vperm = force_reg (V16QImode, vperm);
48415 l = gen_reg_rtx (V16QImode);
48416 op = gen_lowpart (V16QImode, d->op0);
48417 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
48419 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
48420 vperm = force_reg (V16QImode, vperm);
48422 h = gen_reg_rtx (V16QImode);
48423 op = gen_lowpart (V16QImode, d->op1);
48424 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
48426 op = d->target;
48427 if (d->vmode != V16QImode)
48428 op = gen_reg_rtx (V16QImode);
48429 emit_insn (gen_iorv16qi3 (op, l, h));
48430 if (op != d->target)
48431 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48433 return true;
48436 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
48437 with two vpshufb insns, vpermq and vpor. We should have already failed
48438 all two or three instruction sequences. */
48440 static bool
48441 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
48443 rtx rperm[2][32], vperm, l, h, hp, op, m128;
48444 unsigned int i, nelt, eltsz;
48446 if (!TARGET_AVX2
48447 || !d->one_operand_p
48448 || (d->vmode != V32QImode && d->vmode != V16HImode))
48449 return false;
48451 if (d->testing_p)
48452 return true;
48454 nelt = d->nelt;
48455 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48457 /* Generate two permutation masks. If the required element is within
48458 the same lane, it is shuffled in. If the required element is from the
48459 other lane, force a zero by setting bit 7 in the permutation mask.
48460 The other mask has a non-negative element when the element
48461 is requested from the other lane, but it is also moved to the other lane,
48462 so that the result of vpshufb can have the two V2TImode halves
48463 swapped. */
48464 m128 = GEN_INT (-128);
48465 for (i = 0; i < nelt; ++i)
48467 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48468 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48470 for (j = 0; j < eltsz; ++j)
48472 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
48473 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
48477 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48478 vperm = force_reg (V32QImode, vperm);
48480 h = gen_reg_rtx (V32QImode);
48481 op = gen_lowpart (V32QImode, d->op0);
48482 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48484 /* Swap the 128-bit lanes of h into hp. */
48485 hp = gen_reg_rtx (V4DImode);
48486 op = gen_lowpart (V4DImode, h);
48487 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
48488 const1_rtx));
48490 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48491 vperm = force_reg (V32QImode, vperm);
48493 l = gen_reg_rtx (V32QImode);
48494 op = gen_lowpart (V32QImode, d->op0);
48495 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48497 op = d->target;
48498 if (d->vmode != V32QImode)
48499 op = gen_reg_rtx (V32QImode);
48500 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
48501 if (op != d->target)
48502 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48504 return true;
48507 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48508 and extract-odd permutations of two V32QImode or V16HImode operands
48509 with two vpshufb insns, vpor and vpermq. We should have already
48510 failed all two or three instruction sequences. */
48512 static bool
48513 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
48515 rtx rperm[2][32], vperm, l, h, ior, op, m128;
48516 unsigned int i, nelt, eltsz;
48518 if (!TARGET_AVX2
48519 || d->one_operand_p
48520 || (d->vmode != V32QImode && d->vmode != V16HImode))
48521 return false;
48523 for (i = 0; i < d->nelt; ++i)
48524 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
48525 return false;
48527 if (d->testing_p)
48528 return true;
48530 nelt = d->nelt;
48531 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48533 /* Generate two permutation masks. In the first permutation mask
48534 the first quarter will contain indexes for the first half
48535 of the op0, the second quarter will contain bit 7 set, third quarter
48536 will contain indexes for the second half of the op0 and the
48537 last quarter bit 7 set. In the second permutation mask
48538 the first quarter will contain bit 7 set, the second quarter
48539 indexes for the first half of the op1, the third quarter bit 7 set
48540 and last quarter indexes for the second half of the op1.
48541 I.e. the first mask e.g. for V32QImode extract even will be:
48542 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
48543 (all values masked with 0xf except for -128) and second mask
48544 for extract even will be
48545 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
48546 m128 = GEN_INT (-128);
48547 for (i = 0; i < nelt; ++i)
48549 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48550 unsigned which = d->perm[i] >= nelt;
48551 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
48553 for (j = 0; j < eltsz; ++j)
48555 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
48556 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
48560 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48561 vperm = force_reg (V32QImode, vperm);
48563 l = gen_reg_rtx (V32QImode);
48564 op = gen_lowpart (V32QImode, d->op0);
48565 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48567 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48568 vperm = force_reg (V32QImode, vperm);
48570 h = gen_reg_rtx (V32QImode);
48571 op = gen_lowpart (V32QImode, d->op1);
48572 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48574 ior = gen_reg_rtx (V32QImode);
48575 emit_insn (gen_iorv32qi3 (ior, l, h));
48577 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
48578 op = gen_reg_rtx (V4DImode);
48579 ior = gen_lowpart (V4DImode, ior);
48580 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
48581 const1_rtx, GEN_INT (3)));
48582 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48584 return true;
48587 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48588 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
48589 with two "and" and "pack" or two "shift" and "pack" insns. We should
48590 have already failed all two instruction sequences. */
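/* Illustrative example: to extract the even bytes of two V16QImode
   operands, each operand is ANDed with a vector of 0x00ff words so only
   the even bytes survive, and packuswb packs the two results into the
   destination; for the odd bytes a logical right shift by 8 replaces
   the AND.  */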
48592 static bool
48593 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
48595 rtx op, dop0, dop1, t, rperm[16];
48596 unsigned i, odd, c, s, nelt = d->nelt;
48597 bool end_perm = false;
48598 machine_mode half_mode;
48599 rtx (*gen_and) (rtx, rtx, rtx);
48600 rtx (*gen_pack) (rtx, rtx, rtx);
48601 rtx (*gen_shift) (rtx, rtx, rtx);
48603 if (d->one_operand_p)
48604 return false;
48606 switch (d->vmode)
48608 case V8HImode:
48609 /* Required for "pack". */
48610 if (!TARGET_SSE4_1)
48611 return false;
48612 c = 0xffff;
48613 s = 16;
48614 half_mode = V4SImode;
48615 gen_and = gen_andv4si3;
48616 gen_pack = gen_sse4_1_packusdw;
48617 gen_shift = gen_lshrv4si3;
48618 break;
48619 case V16QImode:
48620 /* No check as all instructions are SSE2. */
48621 c = 0xff;
48622 s = 8;
48623 half_mode = V8HImode;
48624 gen_and = gen_andv8hi3;
48625 gen_pack = gen_sse2_packuswb;
48626 gen_shift = gen_lshrv8hi3;
48627 break;
48628 case V16HImode:
48629 if (!TARGET_AVX2)
48630 return false;
48631 c = 0xffff;
48632 s = 16;
48633 half_mode = V8SImode;
48634 gen_and = gen_andv8si3;
48635 gen_pack = gen_avx2_packusdw;
48636 gen_shift = gen_lshrv8si3;
48637 end_perm = true;
48638 break;
48639 case V32QImode:
48640 if (!TARGET_AVX2)
48641 return false;
48642 c = 0xff;
48643 s = 8;
48644 half_mode = V16HImode;
48645 gen_and = gen_andv16hi3;
48646 gen_pack = gen_avx2_packuswb;
48647 gen_shift = gen_lshrv16hi3;
48648 end_perm = true;
48649 break;
48650 default:
48651 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
48652 general shuffles. */
48653 return false;
48656 /* Check that permutation is even or odd. */
48657 odd = d->perm[0];
48658 if (odd > 1)
48659 return false;
48661 for (i = 1; i < nelt; ++i)
48662 if (d->perm[i] != 2 * i + odd)
48663 return false;
48665 if (d->testing_p)
48666 return true;
48668 dop0 = gen_reg_rtx (half_mode);
48669 dop1 = gen_reg_rtx (half_mode);
48670 if (odd == 0)
48672 for (i = 0; i < nelt / 2; i++)
48673 rperm[i] = GEN_INT (c);
48674 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
48675 t = force_reg (half_mode, t);
48676 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
48677 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
48679 else
48681 emit_insn (gen_shift (dop0,
48682 gen_lowpart (half_mode, d->op0),
48683 GEN_INT (s)));
48684 emit_insn (gen_shift (dop1,
48685 gen_lowpart (half_mode, d->op1),
48686 GEN_INT (s)));
48688 /* In the AVX2 256-bit case we need to permute the pack result. */
48689 if (TARGET_AVX2 && end_perm)
48691 op = gen_reg_rtx (d->vmode);
48692 t = gen_reg_rtx (V4DImode);
48693 emit_insn (gen_pack (op, dop0, dop1));
48694 emit_insn (gen_avx2_permv4di_1 (t,
48695 gen_lowpart (V4DImode, op),
48696 const0_rtx,
48697 const2_rtx,
48698 const1_rtx,
48699 GEN_INT (3)));
48700 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
48702 else
48703 emit_insn (gen_pack (d->target, dop0, dop1));
48705 return true;
48708 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48709 and extract-odd permutations of two V64QI operands
48710 with two "shifts", two "truncs" and one "concat" insns for "odd"
48711 and two "truncs" and one concat insn for "even."
48712 We should have already failed all two instruction sequences. */
48714 static bool
48715 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
48717 rtx t1, t2, t3, t4;
48718 unsigned i, odd, nelt = d->nelt;
48720 if (!TARGET_AVX512BW
48721 || d->one_operand_p
48722 || d->vmode != V64QImode)
48723 return false;
48725 /* Check that permutation is even or odd. */
48726 odd = d->perm[0];
48727 if (odd > 1)
48728 return false;
48730 for (i = 1; i < nelt; ++i)
48731 if (d->perm[i] != 2 * i + odd)
48732 return false;
48734 if (d->testing_p)
48735 return true;
48738 if (odd)
48740 t1 = gen_reg_rtx (V32HImode);
48741 t2 = gen_reg_rtx (V32HImode);
48742 emit_insn (gen_lshrv32hi3 (t1,
48743 gen_lowpart (V32HImode, d->op0),
48744 GEN_INT (8)));
48745 emit_insn (gen_lshrv32hi3 (t2,
48746 gen_lowpart (V32HImode, d->op1),
48747 GEN_INT (8)));
48749 else
48751 t1 = gen_lowpart (V32HImode, d->op0);
48752 t2 = gen_lowpart (V32HImode, d->op1);
48755 t3 = gen_reg_rtx (V32QImode);
48756 t4 = gen_reg_rtx (V32QImode);
48757 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
48758 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
48759 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48761 return true;
48764 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
48765 and extract-odd permutations. */
48767 static bool
48768 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
48770 rtx t1, t2, t3, t4, t5;
48772 switch (d->vmode)
48774 case V4DFmode:
48775 if (d->testing_p)
48776 break;
48777 t1 = gen_reg_rtx (V4DFmode);
48778 t2 = gen_reg_rtx (V4DFmode);
48780 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48781 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
48782 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
48784 /* Now an unpck[lh]pd will produce the result required. */
48785 if (odd)
48786 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
48787 else
48788 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
48789 emit_insn (t3);
48790 break;
48792 case V8SFmode:
48794 int mask = odd ? 0xdd : 0x88;
48796 if (d->testing_p)
48797 break;
48798 t1 = gen_reg_rtx (V8SFmode);
48799 t2 = gen_reg_rtx (V8SFmode);
48800 t3 = gen_reg_rtx (V8SFmode);
48802 /* Shuffle within the 128-bit lanes to produce:
48803 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
48804 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
48805 GEN_INT (mask)));
48807 /* Shuffle the lanes around to produce:
48808 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
48809 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
48810 GEN_INT (0x3)));
48812 /* Shuffle within the 128-bit lanes to produce:
48813 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
48814 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
48816 /* Shuffle within the 128-bit lanes to produce:
48817 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
48818 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
48820 /* Shuffle the lanes around to produce:
48821 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
48822 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
48823 GEN_INT (0x20)));
48825 break;
48827 case V2DFmode:
48828 case V4SFmode:
48829 case V2DImode:
48830 case V4SImode:
48831 /* These are always directly implementable by expand_vec_perm_1. */
48832 gcc_unreachable ();
48834 case V8HImode:
48835 if (TARGET_SSE4_1)
48836 return expand_vec_perm_even_odd_pack (d);
48837 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48838 return expand_vec_perm_pshufb2 (d);
48839 else
48841 if (d->testing_p)
48842 break;
48843 /* We need 2*log2(N)-1 operations to achieve odd/even
48844 with interleave. */
48845 t1 = gen_reg_rtx (V8HImode);
48846 t2 = gen_reg_rtx (V8HImode);
48847 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48848 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48849 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48850 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
48851 if (odd)
48852 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48853 else
48854 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48855 emit_insn (t3);
48857 break;
48859 case V16QImode:
48860 return expand_vec_perm_even_odd_pack (d);
48862 case V16HImode:
48863 case V32QImode:
48864 return expand_vec_perm_even_odd_pack (d);
48866 case V64QImode:
48867 return expand_vec_perm_even_odd_trunc (d);
48869 case V4DImode:
48870 if (!TARGET_AVX2)
48872 struct expand_vec_perm_d d_copy = *d;
48873 d_copy.vmode = V4DFmode;
48874 if (d->testing_p)
48875 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48876 else
48877 d_copy.target = gen_reg_rtx (V4DFmode);
48878 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48879 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48880 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48882 if (!d->testing_p)
48883 emit_move_insn (d->target,
48884 gen_lowpart (V4DImode, d_copy.target));
48885 return true;
48887 return false;
48890 if (d->testing_p)
48891 break;
48893 t1 = gen_reg_rtx (V4DImode);
48894 t2 = gen_reg_rtx (V4DImode);
48896 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48897 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48898 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48900 /* Now a vpunpck[lh]qdq will produce the result required. */
48901 if (odd)
48902 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48903 else
48904 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48905 emit_insn (t3);
48906 break;
48908 case V8SImode:
48909 if (!TARGET_AVX2)
48911 struct expand_vec_perm_d d_copy = *d;
48912 d_copy.vmode = V8SFmode;
48913 if (d->testing_p)
48914 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48915 else
48916 d_copy.target = gen_reg_rtx (V8SFmode);
48917 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48918 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48919 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48921 if (!d->testing_p)
48922 emit_move_insn (d->target,
48923 gen_lowpart (V8SImode, d_copy.target));
48924 return true;
48926 return false;
48929 if (d->testing_p)
48930 break;
48932 t1 = gen_reg_rtx (V8SImode);
48933 t2 = gen_reg_rtx (V8SImode);
48934 t3 = gen_reg_rtx (V4DImode);
48935 t4 = gen_reg_rtx (V4DImode);
48936 t5 = gen_reg_rtx (V4DImode);
48938 /* Shuffle the lanes around into
48939 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
48940 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
48941 gen_lowpart (V4DImode, d->op1),
48942 GEN_INT (0x20)));
48943 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
48944 gen_lowpart (V4DImode, d->op1),
48945 GEN_INT (0x31)));
48947 /* Swap the 2nd and 3rd position in each lane into
48948 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
48949 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
48950 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48951 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
48952 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48954 /* Now a vpunpck[lh]qdq will produce
48955 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
48956 if (odd)
48957 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
48958 gen_lowpart (V4DImode, t2));
48959 else
48960 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
48961 gen_lowpart (V4DImode, t2));
48962 emit_insn (t3);
48963 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
48964 break;
48966 default:
48967 gcc_unreachable ();
48970 return true;
48973 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
48974 extract-even and extract-odd permutations. */
48976 static bool
48977 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
48979 unsigned i, odd, nelt = d->nelt;
48981 odd = d->perm[0];
48982 if (odd != 0 && odd != 1)
48983 return false;
48985 for (i = 1; i < nelt; ++i)
48986 if (d->perm[i] != 2 * i + odd)
48987 return false;
48989 return expand_vec_perm_even_odd_1 (d, odd);
48992 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
48993 permutations. We assume that expand_vec_perm_1 has already failed. */
48995 static bool
48996 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
48998 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
48999 machine_mode vmode = d->vmode;
49000 unsigned char perm2[4];
49001 rtx op0 = d->op0, dest;
49002 bool ok;
49004 switch (vmode)
49006 case V4DFmode:
49007 case V8SFmode:
49008 /* These are special-cased in sse.md so that we can optionally
49009 use the vbroadcast instruction. They expand to two insns
49010 if the input happens to be in a register. */
49011 gcc_unreachable ();
49013 case V2DFmode:
49014 case V2DImode:
49015 case V4SFmode:
49016 case V4SImode:
49017 /* These are always implementable using standard shuffle patterns. */
49018 gcc_unreachable ();
49020 case V8HImode:
49021 case V16QImode:
49022 /* These can be implemented via interleave. We save one insn by
49023 stopping once we have promoted to V4SImode and then use pshufd. */
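/* Illustrative example: broadcasting element 5 of a V16QImode vector
   interleaves low (byte 5 becomes word 5), then interleaves high
   (word 5 becomes dword 1), and finally pshufd broadcasts dword 1.  */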
49024 if (d->testing_p)
49025 return true;
49028 rtx dest;
49029 rtx (*gen) (rtx, rtx, rtx)
49030 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
49031 : gen_vec_interleave_lowv8hi;
49033 if (elt >= nelt2)
49035 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
49036 : gen_vec_interleave_highv8hi;
49037 elt -= nelt2;
49039 nelt2 /= 2;
49041 dest = gen_reg_rtx (vmode);
49042 emit_insn (gen (dest, op0, op0));
49043 vmode = get_mode_wider_vector (vmode);
49044 op0 = gen_lowpart (vmode, dest);
49046 while (vmode != V4SImode);
49048 memset (perm2, elt, 4);
49049 dest = gen_reg_rtx (V4SImode);
49050 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
49051 gcc_assert (ok);
49052 if (!d->testing_p)
49053 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
49054 return true;
49056 case V64QImode:
49057 case V32QImode:
49058 case V16HImode:
49059 case V8SImode:
49060 case V4DImode:
49061 /* For AVX2 broadcasts of the first element vpbroadcast* or
49062 vpermq should be used by expand_vec_perm_1. */
49063 gcc_assert (!TARGET_AVX2 || d->perm[0]);
49064 return false;
49066 default:
49067 gcc_unreachable ();
49071 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
49072 broadcast permutations. */
49074 static bool
49075 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
49077 unsigned i, elt, nelt = d->nelt;
49079 if (!d->one_operand_p)
49080 return false;
49082 elt = d->perm[0];
49083 for (i = 1; i < nelt; ++i)
49084 if (d->perm[i] != elt)
49085 return false;
49087 return expand_vec_perm_broadcast_1 (d);
49090 /* Implement arbitrary permutations of two V64QImode operands
49091 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
49092 static bool
49093 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
49095 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
49096 return false;
49098 if (d->testing_p)
49099 return true;
49101 struct expand_vec_perm_d ds[2];
49102 rtx rperm[128], vperm, target0, target1;
49103 unsigned int i, nelt;
49104 machine_mode vmode;
49106 nelt = d->nelt;
49107 vmode = V64QImode;
49109 for (i = 0; i < 2; i++)
49111 ds[i] = *d;
49112 ds[i].vmode = V32HImode;
49113 ds[i].nelt = 32;
49114 ds[i].target = gen_reg_rtx (V32HImode);
49115 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
49116 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
49119 /* Prepare permutations such that the first one takes care of
49120 putting the even bytes into the right positions or one position
49121 higher (ds[0]) and the second one takes care of
49122 putting the odd bytes into the right positions or one position
49123 lower (ds[1]). */
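/* Illustrative example: if d->perm[0] is 5, ds[0] moves word 2 of the
   input (bytes 4 and 5) into word 0, and the matching vpshufb mask
   entry (0 & 14) + (5 & 1) = 1 then picks the odd byte of that word,
   i.e. the original byte 5, for result byte 0.  */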
49125 for (i = 0; i < nelt; i++)
49127 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
49128 if (i & 1)
49130 rperm[i] = constm1_rtx;
49131 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49133 else
49135 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49136 rperm[i + 64] = constm1_rtx;
49140 bool ok = expand_vec_perm_1 (&ds[0]);
49141 gcc_assert (ok);
49142 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
49144 ok = expand_vec_perm_1 (&ds[1]);
49145 gcc_assert (ok);
49146 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
49148 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
49149 vperm = force_reg (vmode, vperm);
49150 target0 = gen_reg_rtx (V64QImode);
49151 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
49153 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
49154 vperm = force_reg (vmode, vperm);
49155 target1 = gen_reg_rtx (V64QImode);
49156 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
49158 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
49159 return true;
49162 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
49163 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
49164 all the shorter instruction sequences. */
49166 static bool
49167 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
49169 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
49170 unsigned int i, nelt, eltsz;
49171 bool used[4];
49173 if (!TARGET_AVX2
49174 || d->one_operand_p
49175 || (d->vmode != V32QImode && d->vmode != V16HImode))
49176 return false;
49178 if (d->testing_p)
49179 return true;
49181 nelt = d->nelt;
49182 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
49184 /* Generate 4 permutation masks. If the required element is within
49185 the same lane, it is shuffled in. If the required element is from the
49186 other lane, force a zero by setting bit 7 in the permutation mask.
49187 The other mask has a non-negative element when the element
49188 is requested from the other lane, but it is also moved to the other lane,
49189 so that the result of vpshufb can have the two V2TImode halves
49190 swapped. */
49191 m128 = GEN_INT (-128);
49192 for (i = 0; i < 32; ++i)
49194 rperm[0][i] = m128;
49195 rperm[1][i] = m128;
49196 rperm[2][i] = m128;
49197 rperm[3][i] = m128;
49199 used[0] = false;
49200 used[1] = false;
49201 used[2] = false;
49202 used[3] = false;
49203 for (i = 0; i < nelt; ++i)
49205 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
49206 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
49207 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
49209 for (j = 0; j < eltsz; ++j)
49210 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
49211 used[which] = true;
49214 for (i = 0; i < 2; ++i)
49216 if (!used[2 * i + 1])
49218 h[i] = NULL_RTX;
49219 continue;
49221 vperm = gen_rtx_CONST_VECTOR (V32QImode,
49222 gen_rtvec_v (32, rperm[2 * i + 1]));
49223 vperm = force_reg (V32QImode, vperm);
49224 h[i] = gen_reg_rtx (V32QImode);
49225 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49226 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
49229 /* Swap the 128-bit lanes of h[X]. */
49230 for (i = 0; i < 2; ++i)
49232 if (h[i] == NULL_RTX)
49233 continue;
49234 op = gen_reg_rtx (V4DImode);
49235 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
49236 const2_rtx, GEN_INT (3), const0_rtx,
49237 const1_rtx));
49238 h[i] = gen_lowpart (V32QImode, op);
49241 for (i = 0; i < 2; ++i)
49243 if (!used[2 * i])
49245 l[i] = NULL_RTX;
49246 continue;
49248 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
49249 vperm = force_reg (V32QImode, vperm);
49250 l[i] = gen_reg_rtx (V32QImode);
49251 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49252 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
49255 for (i = 0; i < 2; ++i)
49257 if (h[i] && l[i])
49259 op = gen_reg_rtx (V32QImode);
49260 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
49261 l[i] = op;
49263 else if (h[i])
49264 l[i] = h[i];
49267 gcc_assert (l[0] && l[1]);
49268 op = d->target;
49269 if (d->vmode != V32QImode)
49270 op = gen_reg_rtx (V32QImode);
49271 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
49272 if (op != d->target)
49273 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
49274 return true;
49277 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
49278 With all of the interface bits taken care of, perform the expansion
49279 in D and return true on success. */
49281 static bool
49282 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
49284 /* Try a single instruction expansion. */
49285 if (expand_vec_perm_1 (d))
49286 return true;
49288 /* Try sequences of two instructions. */
49290 if (expand_vec_perm_pshuflw_pshufhw (d))
49291 return true;
49293 if (expand_vec_perm_palignr (d, false))
49294 return true;
49296 if (expand_vec_perm_interleave2 (d))
49297 return true;
49299 if (expand_vec_perm_broadcast (d))
49300 return true;
49302 if (expand_vec_perm_vpermq_perm_1 (d))
49303 return true;
49305 if (expand_vec_perm_vperm2f128 (d))
49306 return true;
49308 if (expand_vec_perm_pblendv (d))
49309 return true;
49311 /* Try sequences of three instructions. */
49313 if (expand_vec_perm_even_odd_pack (d))
49314 return true;
49316 if (expand_vec_perm_2vperm2f128_vshuf (d))
49317 return true;
49319 if (expand_vec_perm_pshufb2 (d))
49320 return true;
49322 if (expand_vec_perm_interleave3 (d))
49323 return true;
49325 if (expand_vec_perm_vperm2f128_vblend (d))
49326 return true;
49328 /* Try sequences of four instructions. */
49330 if (expand_vec_perm_even_odd_trunc (d))
49331 return true;
49332 if (expand_vec_perm_vpshufb2_vpermq (d))
49333 return true;
49335 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
49336 return true;
49338 if (expand_vec_perm_vpermi2_vpshub2 (d))
49339 return true;
49341 /* ??? Look for narrow permutations whose element orderings would
49342 allow the promotion to a wider mode. */
49344 /* ??? Look for sequences of interleave or a wider permute that place
49345 the data into the correct lanes for a half-vector shuffle like
49346 pshuf[lh]w or vpermilps. */
49348 /* ??? Look for sequences of interleave that produce the desired results.
49349 The combinatorics of punpck[lh] get pretty ugly... */
49351 if (expand_vec_perm_even_odd (d))
49352 return true;
49354 /* Even longer sequences. */
49355 if (expand_vec_perm_vpshufb4_vpermq2 (d))
49356 return true;
49358 /* See if we can get the same permutation in different vector integer
49359 mode. */
49360 struct expand_vec_perm_d nd;
49361 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
49363 if (!d->testing_p)
49364 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
49365 return true;
49368 return false;
49371 /* If a permutation only uses one operand, make it clear. Returns true
49372 if the permutation references both operands. */
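/* For example, with nelt == 4: perm {0,1,2,3} gives which == 1 (only op0),
   perm {4,5,6,7} gives which == 2 (only op1, folded below to {0,1,2,3}),
   and perm {0,5,2,7} gives which == 3 unless op0 and op1 are the same rtx.  */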
49374 static bool
49375 canonicalize_perm (struct expand_vec_perm_d *d)
49377 int i, which, nelt = d->nelt;
49379 for (i = which = 0; i < nelt; ++i)
49380 which |= (d->perm[i] < nelt ? 1 : 2);
49382 d->one_operand_p = true;
49383 switch (which)
49385 default:
49386 gcc_unreachable();
49388 case 3:
49389 if (!rtx_equal_p (d->op0, d->op1))
49391 d->one_operand_p = false;
49392 break;
49394 /* The elements of PERM do not suggest that only the first operand
49395 is used, but both operands are identical. Allow easier matching
49396 of the permutation by folding the permutation into the single
49397 input vector. */
49398 /* FALLTHRU */
49400 case 2:
49401 for (i = 0; i < nelt; ++i)
49402 d->perm[i] &= nelt - 1;
49403 d->op0 = d->op1;
49404 break;
49406 case 1:
49407 d->op1 = d->op0;
49408 break;
49411 return (which == 3);
49414 bool
49415 ix86_expand_vec_perm_const (rtx operands[4])
49417 struct expand_vec_perm_d d;
49418 unsigned char perm[MAX_VECT_LEN];
49419 int i, nelt;
49420 bool two_args;
49421 rtx sel;
49423 d.target = operands[0];
49424 d.op0 = operands[1];
49425 d.op1 = operands[2];
49426 sel = operands[3];
49428 d.vmode = GET_MODE (d.target);
49429 gcc_assert (VECTOR_MODE_P (d.vmode));
49430 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49431 d.testing_p = false;
49433 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
49434 gcc_assert (XVECLEN (sel, 0) == nelt);
49435 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
49437 for (i = 0; i < nelt; ++i)
49439 rtx e = XVECEXP (sel, 0, i);
49440 int ei = INTVAL (e) & (2 * nelt - 1);
49441 d.perm[i] = ei;
49442 perm[i] = ei;
49445 two_args = canonicalize_perm (&d);
49447 if (ix86_expand_vec_perm_const_1 (&d))
49448 return true;
49450 /* If the selector says both arguments are needed, but the operands are the
49451 same, the above tried to expand with one_operand_p and flattened selector.
49452 If that didn't work, retry without one_operand_p; we succeeded with that
49453 during testing. */
49454 if (two_args && d.one_operand_p)
49456 d.one_operand_p = false;
49457 memcpy (d.perm, perm, sizeof (perm));
49458 return ix86_expand_vec_perm_const_1 (&d);
49461 return false;
49464 /* Implement targetm.vectorize.vec_perm_const_ok. */
49466 static bool
49467 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
49468 const unsigned char *sel)
49470 struct expand_vec_perm_d d;
49471 unsigned int i, nelt, which;
49472 bool ret;
49474 d.vmode = vmode;
49475 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49476 d.testing_p = true;
49478 /* Given sufficient ISA support we can just return true here
49479 for selected vector modes. */
49480 switch (d.vmode)
49482 case V16SFmode:
49483 case V16SImode:
49484 case V8DImode:
49485 case V8DFmode:
49486 if (TARGET_AVX512F)
49487 /* All implementable with a single vpermi2 insn. */
49488 return true;
49489 break;
49490 case V32HImode:
49491 if (TARGET_AVX512BW)
49492 /* All implementable with a single vpermi2 insn. */
49493 return true;
49494 break;
49495 case V64QImode:
49496 if (TARGET_AVX512BW)
49497 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
49498 return true;
49499 break;
49500 case V8SImode:
49501 case V8SFmode:
49502 case V4DFmode:
49503 case V4DImode:
49504 if (TARGET_AVX512VL)
49505 /* All implementable with a single vpermi2 insn. */
49506 return true;
49507 break;
49508 case V16HImode:
49509 if (TARGET_AVX2)
49510 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49511 return true;
49512 break;
49513 case V32QImode:
49514 if (TARGET_AVX2)
49515 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49516 return true;
49517 break;
49518 case V4SImode:
49519 case V4SFmode:
49520 case V8HImode:
49521 case V16QImode:
49522 /* All implementable with a single vpperm insn. */
49523 if (TARGET_XOP)
49524 return true;
49525 /* All implementable with 2 pshufb + 1 ior. */
49526 if (TARGET_SSSE3)
49527 return true;
49528 break;
49529 case V2DImode:
49530 case V2DFmode:
49531 /* All implementable with shufpd or unpck[lh]pd. */
49532 return true;
49533 default:
49534 return false;
49537 /* Extract the values from the vector CST into the permutation
49538 array in D. */
49539 memcpy (d.perm, sel, nelt);
49540 for (i = which = 0; i < nelt; ++i)
49542 unsigned char e = d.perm[i];
49543 gcc_assert (e < 2 * nelt);
49544 which |= (e < nelt ? 1 : 2);
49547 /* If all elements are from the second vector, fold them to the first. */
49548 if (which == 2)
49549 for (i = 0; i < nelt; ++i)
49550 d.perm[i] -= nelt;
49552 /* Check whether the mask can be applied to the vector type. */
49553 d.one_operand_p = (which != 3);
49555 /* Implementable with shufps or pshufd. */
49556 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
49557 return true;
49559 /* Otherwise we have to go through the motions and see if we can
49560 figure out how to generate the requested permutation. */
49561 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
49562 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
49563 if (!d.one_operand_p)
49564 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
49566 start_sequence ();
49567 ret = ix86_expand_vec_perm_const_1 (&d);
49568 end_sequence ();
49570 return ret;
49573 void
49574 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
49576 struct expand_vec_perm_d d;
49577 unsigned i, nelt;
49579 d.target = targ;
49580 d.op0 = op0;
49581 d.op1 = op1;
49582 d.vmode = GET_MODE (targ);
49583 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49584 d.one_operand_p = false;
49585 d.testing_p = false;
49587 for (i = 0; i < nelt; ++i)
49588 d.perm[i] = i * 2 + odd;
49590 /* We'll either be able to implement the permutation directly... */
49591 if (expand_vec_perm_1 (&d))
49592 return;
49594 /* ... or we use the special-case patterns. */
49595 expand_vec_perm_even_odd_1 (&d, odd);
49598 static void
49599 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
49601 struct expand_vec_perm_d d;
49602 unsigned i, nelt, base;
49603 bool ok;
49605 d.target = targ;
49606 d.op0 = op0;
49607 d.op1 = op1;
49608 d.vmode = GET_MODE (targ);
49609 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49610 d.one_operand_p = false;
49611 d.testing_p = false;
49613 base = high_p ? nelt / 2 : 0;
49614 for (i = 0; i < nelt / 2; ++i)
49616 d.perm[i * 2] = i + base;
49617 d.perm[i * 2 + 1] = i + base + nelt;
49620 /* Note that for AVX this isn't one instruction. */
49621 ok = ix86_expand_vec_perm_const_1 (&d);
49622 gcc_assert (ok);
49626 /* Expand a vector operation CODE for a V*QImode in terms of the
49627 same operation on V*HImode. */
49629 void
49630 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
49632 machine_mode qimode = GET_MODE (dest);
49633 machine_mode himode;
49634 rtx (*gen_il) (rtx, rtx, rtx);
49635 rtx (*gen_ih) (rtx, rtx, rtx);
49636 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
49637 struct expand_vec_perm_d d;
49638 bool ok, full_interleave;
49639 bool uns_p = false;
49640 int i;
49642 switch (qimode)
49644 case V16QImode:
49645 himode = V8HImode;
49646 gen_il = gen_vec_interleave_lowv16qi;
49647 gen_ih = gen_vec_interleave_highv16qi;
49648 break;
49649 case V32QImode:
49650 himode = V16HImode;
49651 gen_il = gen_avx2_interleave_lowv32qi;
49652 gen_ih = gen_avx2_interleave_highv32qi;
49653 break;
49654 case V64QImode:
49655 himode = V32HImode;
49656 gen_il = gen_avx512bw_interleave_lowv64qi;
49657 gen_ih = gen_avx512bw_interleave_highv64qi;
49658 break;
49659 default:
49660 gcc_unreachable ();
49663 op2_l = op2_h = op2;
49664 switch (code)
49666 case MULT:
49667 /* Unpack data such that we've got a source byte in each low byte of
49668 each word. We don't care what goes into the high byte of each word.
49669 Rather than trying to get zero in there, most convenient is to let
49670 it be a copy of the low byte. */
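/* This works because the low byte of each 16-bit product depends only on
   the low bytes of the factors: (a + 256*x) * (b + 256*y) == a*b (mod 256)
   whatever x and y are, and only those low bytes survive the final
   permutation below.  */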
49671 op2_l = gen_reg_rtx (qimode);
49672 op2_h = gen_reg_rtx (qimode);
49673 emit_insn (gen_il (op2_l, op2, op2));
49674 emit_insn (gen_ih (op2_h, op2, op2));
49675 /* FALLTHRU */
49677 op1_l = gen_reg_rtx (qimode);
49678 op1_h = gen_reg_rtx (qimode);
49679 emit_insn (gen_il (op1_l, op1, op1));
49680 emit_insn (gen_ih (op1_h, op1, op1));
49681 full_interleave = qimode == V16QImode;
49682 break;
49684 case ASHIFT:
49685 case LSHIFTRT:
49686 uns_p = true;
49687 /* FALLTHRU */
49688 case ASHIFTRT:
49689 op1_l = gen_reg_rtx (himode);
49690 op1_h = gen_reg_rtx (himode);
49691 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
49692 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
49693 full_interleave = true;
49694 break;
49695 default:
49696 gcc_unreachable ();
49699 /* Perform the operation. */
49700 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
49701 1, OPTAB_DIRECT);
49702 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
49703 1, OPTAB_DIRECT);
49704 gcc_assert (res_l && res_h);
49706 /* Merge the data back into the right place. */
49707 d.target = dest;
49708 d.op0 = gen_lowpart (qimode, res_l);
49709 d.op1 = gen_lowpart (qimode, res_h);
49710 d.vmode = qimode;
49711 d.nelt = GET_MODE_NUNITS (qimode);
49712 d.one_operand_p = false;
49713 d.testing_p = false;
49715 if (full_interleave)
49717 /* For SSE2, we used a full interleave, so the desired
49718 results are in the even elements. */
49719 for (i = 0; i < d.nelt; ++i)
49720 d.perm[i] = i * 2;
49722 else
49724 /* For AVX, the interleave used above was not cross-lane. So the
49725 extraction is evens but with the second and third quarter swapped.
49726 Happily, that is even one insn shorter than even extraction.
49727 For AVX512BW we have 4 lanes. We extract evens from within a lane,
49728 always first from the first and then from the second source operand;
49729 the index bits above the low 4 bits remain the same.
49730 Thus, for d.nelt == 32 we want permutation
49731 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
49732 and for d.nelt == 64 we want permutation
49733 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
49734 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
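/* Spot-checking the formula against the lists above: for d.nelt == 32,
   i == 7 gives 14, i == 8 gives 32, i == 16 gives 16 and i == 24 gives 48;
   for d.nelt == 64, i == 8 gives 64 and i == 24 gives 80.  */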
49735 for (i = 0; i < d.nelt; ++i)
49736 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
49739 ok = ix86_expand_vec_perm_const_1 (&d);
49740 gcc_assert (ok);
49742 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49743 gen_rtx_fmt_ee (code, qimode, op1, op2));
49746 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
49747 if op is CONST_VECTOR with all odd elements equal to their
49748 preceding element. */
49750 static bool
49751 const_vector_equal_evenodd_p (rtx op)
49753 machine_mode mode = GET_MODE (op);
49754 int i, nunits = GET_MODE_NUNITS (mode);
49755 if (GET_CODE (op) != CONST_VECTOR
49756 || nunits != CONST_VECTOR_NUNITS (op))
49757 return false;
49758 for (i = 0; i < nunits; i += 2)
49759 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
49760 return false;
49761 return true;
49764 void
49765 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
49766 bool uns_p, bool odd_p)
49768 machine_mode mode = GET_MODE (op1);
49769 machine_mode wmode = GET_MODE (dest);
49770 rtx x;
49771 rtx orig_op1 = op1, orig_op2 = op2;
49773 if (!nonimmediate_operand (op1, mode))
49774 op1 = force_reg (mode, op1);
49775 if (!nonimmediate_operand (op2, mode))
49776 op2 = force_reg (mode, op2);
49778 /* We only play even/odd games with vectors of SImode. */
49779 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
49781 /* If we're looking for the odd results, shift those members down to
49782 the even slots. For some cpus this is faster than a PSHUFD. */
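/* E.g. a V4SImode operand is viewed as V2DImode and each DImode element is
   shifted right by 32 bits, moving SImode elements 1 and 3 down into the
   even positions 0 and 2.  */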
49783 if (odd_p)
49785 /* For XOP use vpmacsdqh, but only for smult, as it is only
49786 signed. */
49787 if (TARGET_XOP && mode == V4SImode && !uns_p)
49789 x = force_reg (wmode, CONST0_RTX (wmode));
49790 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
49791 return;
49794 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
49795 if (!const_vector_equal_evenodd_p (orig_op1))
49796 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
49797 x, NULL, 1, OPTAB_DIRECT);
49798 if (!const_vector_equal_evenodd_p (orig_op2))
49799 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
49800 x, NULL, 1, OPTAB_DIRECT);
49801 op1 = gen_lowpart (mode, op1);
49802 op2 = gen_lowpart (mode, op2);
49805 if (mode == V16SImode)
49807 if (uns_p)
49808 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
49809 else
49810 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
49812 else if (mode == V8SImode)
49814 if (uns_p)
49815 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
49816 else
49817 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
49819 else if (uns_p)
49820 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
49821 else if (TARGET_SSE4_1)
49822 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
49823 else
49825 rtx s1, s2, t0, t1, t2;
49827 /* The easiest way to implement this without PMULDQ is to go through
49828 the motions as if we are performing a full 64-bit multiply, except
49829 that we need to do less shuffling of the elements. */
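/* Writing a_s == a_u - 2^32 * (a < 0), the signed product satisfies
   a_s * b_s == a_u * b_u - 2^32 * ((a < 0) * b_u + (b < 0) * a_u) (mod 2^64).
   The masks s1/s2 below are ~0 exactly where an operand is negative, so the
   low 32 bits of t1 + t2 equal the negated correction term; shifting that
   left by 32 and adding the unsigned product t0 yields the signed result.  */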
49831 /* Compute the sign-extension, aka highparts, of the two operands. */
49832 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49833 op1, pc_rtx, pc_rtx);
49834 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49835 op2, pc_rtx, pc_rtx);
49837 /* Multiply LO(A) * HI(B), and vice-versa. */
49838 t1 = gen_reg_rtx (wmode);
49839 t2 = gen_reg_rtx (wmode);
49840 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49841 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49843 /* Multiply LO(A) * LO(B). */
49844 t0 = gen_reg_rtx (wmode);
49845 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49847 /* Combine and shift the highparts into place. */
49848 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49849 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49850 1, OPTAB_DIRECT);
49852 /* Combine high and low parts. */
49853 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49854 return;
49856 emit_insn (x);
49859 void
49860 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49861 bool uns_p, bool high_p)
49863 machine_mode wmode = GET_MODE (dest);
49864 machine_mode mode = GET_MODE (op1);
49865 rtx t1, t2, t3, t4, mask;
49867 switch (mode)
49869 case V4SImode:
49870 t1 = gen_reg_rtx (mode);
49871 t2 = gen_reg_rtx (mode);
49872 if (TARGET_XOP && !uns_p)
49874 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
49875 shuffle the elements once so that all elements are in the right
49876 place for immediate use: { A C B D }. */
49877 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49878 const1_rtx, GEN_INT (3)));
49879 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49880 const1_rtx, GEN_INT (3)));
49882 else
49884 /* Put the elements into place for the multiply. */
49885 ix86_expand_vec_interleave (t1, op1, op1, high_p);
49886 ix86_expand_vec_interleave (t2, op2, op2, high_p);
49887 high_p = false;
49889 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49890 break;
49892 case V8SImode:
49893 /* Shuffle the elements between the lanes. After this we
49894 have { A B E F | C D G H } for each operand. */
49895 t1 = gen_reg_rtx (V4DImode);
49896 t2 = gen_reg_rtx (V4DImode);
49897 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49898 const0_rtx, const2_rtx,
49899 const1_rtx, GEN_INT (3)));
49900 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49901 const0_rtx, const2_rtx,
49902 const1_rtx, GEN_INT (3)));
49904 /* Shuffle the elements within the lanes. After this we
49905 have { A A B B | C C D D } or { E E F F | G G H H }. */
49906 t3 = gen_reg_rtx (V8SImode);
49907 t4 = gen_reg_rtx (V8SImode);
49908 mask = GEN_INT (high_p
49909 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49910 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
49911 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49912 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49914 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49915 break;
49917 case V8HImode:
49918 case V16HImode:
49919 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49920 uns_p, OPTAB_DIRECT);
49921 t2 = expand_binop (mode,
49922 uns_p ? umul_highpart_optab : smul_highpart_optab,
49923 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49924 gcc_assert (t1 && t2);
49926 t3 = gen_reg_rtx (mode);
49927 ix86_expand_vec_interleave (t3, t1, t2, high_p);
49928 emit_move_insn (dest, gen_lowpart (wmode, t3));
49929 break;
49931 case V16QImode:
49932 case V32QImode:
49933 case V32HImode:
49934 case V16SImode:
49935 case V64QImode:
49936 t1 = gen_reg_rtx (wmode);
49937 t2 = gen_reg_rtx (wmode);
49938 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
49939 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
49941 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
49942 break;
49944 default:
49945 gcc_unreachable ();
49949 void
49950 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
49952 rtx res_1, res_2, res_3, res_4;
49954 res_1 = gen_reg_rtx (V4SImode);
49955 res_2 = gen_reg_rtx (V4SImode);
49956 res_3 = gen_reg_rtx (V2DImode);
49957 res_4 = gen_reg_rtx (V2DImode);
49958 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
49959 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
49961 /* Move the results in element 2 down to element 1; we don't care
49962 what goes in elements 2 and 3. Then we can merge the parts
49963 back together with an interleave.
49965 Note that two other sequences were tried:
49966 (1) Use interleaves at the start instead of psrldq, which allows
49967 us to use a single shufps to merge things back at the end.
49968 (2) Use shufps here to combine the two vectors, then pshufd to
49969 put the elements in the correct order.
49970 In both cases the cost of the reformatting stall was too high
49971 and the overall sequence slower. */
49973 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
49974 const0_rtx, const2_rtx,
49975 const0_rtx, const0_rtx));
49976 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
49977 const0_rtx, const2_rtx,
49978 const0_rtx, const0_rtx));
49979 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
49981 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
49984 void
49985 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
49987 machine_mode mode = GET_MODE (op0);
49988 rtx t1, t2, t3, t4, t5, t6;
49990 if (TARGET_AVX512DQ && mode == V8DImode)
49991 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
49992 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
49993 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
49994 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
49995 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
49996 else if (TARGET_XOP && mode == V2DImode)
49998 /* op1: A,B,C,D, op2: E,F,G,H */
49999 op1 = gen_lowpart (V4SImode, op1);
50000 op2 = gen_lowpart (V4SImode, op2);
50002 t1 = gen_reg_rtx (V4SImode);
50003 t2 = gen_reg_rtx (V4SImode);
50004 t3 = gen_reg_rtx (V2DImode);
50005 t4 = gen_reg_rtx (V2DImode);
50007 /* t1: B,A,D,C */
50008 emit_insn (gen_sse2_pshufd_1 (t1, op1,
50009 GEN_INT (1),
50010 GEN_INT (0),
50011 GEN_INT (3),
50012 GEN_INT (2)));
50014 /* t2: (B*E),(A*F),(D*G),(C*H) */
50015 emit_insn (gen_mulv4si3 (t2, t1, op2));
50017 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
50018 emit_insn (gen_xop_phadddq (t3, t2));
50020 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
50021 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
50023 /* Multiply lower parts and add all */
50024 t5 = gen_reg_rtx (V2DImode);
50025 emit_insn (gen_vec_widen_umult_even_v4si (t5,
50026 gen_lowpart (V4SImode, op1),
50027 gen_lowpart (V4SImode, op2)));
50028 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
50031 else
50033 machine_mode nmode;
50034 rtx (*umul) (rtx, rtx, rtx);
50036 if (mode == V2DImode)
50038 umul = gen_vec_widen_umult_even_v4si;
50039 nmode = V4SImode;
50041 else if (mode == V4DImode)
50043 umul = gen_vec_widen_umult_even_v8si;
50044 nmode = V8SImode;
50046 else if (mode == V8DImode)
50048 umul = gen_vec_widen_umult_even_v16si;
50049 nmode = V16SImode;
50051 else
50052 gcc_unreachable ();
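/* The sequence below relies on
     (a_lo + 2^32 * a_hi) * (b_lo + 2^32 * b_hi)
       == a_lo * b_lo + 2^32 * (a_lo * b_hi + a_hi * b_lo)  (mod 2^64),
   so one widening multiply of the low halves plus two shifted cross
   products reconstruct each 64-bit product.  */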
50055 /* Multiply low parts. */
50056 t1 = gen_reg_rtx (mode);
50057 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
50059 /* Shift input vectors right 32 bits so we can multiply high parts. */
50060 t6 = GEN_INT (32);
50061 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
50062 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
50064 /* Multiply high parts by low parts. */
50065 t4 = gen_reg_rtx (mode);
50066 t5 = gen_reg_rtx (mode);
50067 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
50068 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
50070 /* Combine and shift the highparts back. */
50071 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
50072 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
50074 /* Combine high and low parts. */
50075 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
50078 set_unique_reg_note (get_last_insn (), REG_EQUAL,
50079 gen_rtx_MULT (mode, op1, op2));
50082 /* Return 1 if control transfer instruction INSN
50083 should be encoded with bnd prefix.
50084 If insn is NULL then return 1 when control
50085 transfer instructions should be prefixed with
50086 bnd by default for current function. */
50088 bool
50089 ix86_bnd_prefixed_insn_p (rtx insn)
50091 /* For call insns check special flag. */
50092 if (insn && CALL_P (insn))
50094 rtx call = get_call_rtx_from (insn);
50095 if (call)
50096 return CALL_EXPR_WITH_BOUNDS_P (call);
50099 /* All other insns are prefixed only if function is instrumented. */
50100 return chkp_function_instrumented_p (current_function_decl);
50103 /* Calculate integer abs() using only SSE2 instructions. */
50105 void
50106 ix86_expand_sse2_abs (rtx target, rtx input)
50108 machine_mode mode = GET_MODE (target);
50109 rtx tmp0, tmp1, x;
50111 switch (mode)
50113 /* For 32-bit signed integer X, the best way to calculate the absolute
50114 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
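/* E.g. for X == -5 and W == 32: X >> 31 == -1, (X ^ -1) == 4 and
   4 - (-1) == 5; for non-negative X the shift yields 0 and X is returned
   unchanged.  */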
50115 case V4SImode:
50116 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
50117 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
50118 NULL, 0, OPTAB_DIRECT);
50119 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
50120 NULL, 0, OPTAB_DIRECT);
50121 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
50122 target, 0, OPTAB_DIRECT);
50123 break;
50125 /* For 16-bit signed integer X, the best way to calculate the absolute
50126 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
50127 case V8HImode:
50128 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50130 x = expand_simple_binop (mode, SMAX, tmp0, input,
50131 target, 0, OPTAB_DIRECT);
50132 break;
50134 /* For 8-bit signed integer X, the best way to calculate the absolute
50135 value of X is min ((unsigned char) X, (unsigned char) (-X)),
50136 as SSE2 provides the PMINUB insn. */
50137 case V16QImode:
50138 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50140 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
50141 target, 0, OPTAB_DIRECT);
50142 break;
50144 default:
50145 gcc_unreachable ();
50148 if (x != target)
50149 emit_move_insn (target, x);
50152 /* Expand an extract from a vector register through pextr insn.
50153 Return true if successful. */
50155 bool
50156 ix86_expand_pextr (rtx *operands)
50158 rtx dst = operands[0];
50159 rtx src = operands[1];
50161 unsigned int size = INTVAL (operands[2]);
50162 unsigned int pos = INTVAL (operands[3]);
50164 if (SUBREG_P (dst))
50166 /* Reject non-lowpart subregs. */
50167 if (SUBREG_BYTE (dst) > 0)
50168 return false;
50169 dst = SUBREG_REG (dst);
50172 if (SUBREG_P (src))
50174 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
50175 src = SUBREG_REG (src);
50178 switch (GET_MODE (src))
50180 case V16QImode:
50181 case V8HImode:
50182 case V4SImode:
50183 case V2DImode:
50184 case V1TImode:
50185 case TImode:
50187 machine_mode srcmode, dstmode;
50188 rtx d, pat;
50190 dstmode = mode_for_size (size, MODE_INT, 0);
50192 switch (dstmode)
50194 case QImode:
50195 if (!TARGET_SSE4_1)
50196 return false;
50197 srcmode = V16QImode;
50198 break;
50200 case HImode:
50201 if (!TARGET_SSE2)
50202 return false;
50203 srcmode = V8HImode;
50204 break;
50206 case SImode:
50207 if (!TARGET_SSE4_1)
50208 return false;
50209 srcmode = V4SImode;
50210 break;
50212 case DImode:
50213 gcc_assert (TARGET_64BIT);
50214 if (!TARGET_SSE4_1)
50215 return false;
50216 srcmode = V2DImode;
50217 break;
50219 default:
50220 return false;
50223 /* Reject extractions from misaligned positions. */
50224 if (pos & (size-1))
50225 return false;
50227 if (GET_MODE (dst) == dstmode)
50228 d = dst;
50229 else
50230 d = gen_reg_rtx (dstmode);
50232 /* Construct insn pattern. */
50233 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
50234 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
50236 /* Let the rtl optimizers know about the zero extension performed. */
50237 if (dstmode == QImode || dstmode == HImode)
50239 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
50240 d = gen_lowpart (SImode, d);
50243 emit_insn (gen_rtx_SET (d, pat));
50245 if (d != dst)
50246 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50247 return true;
50250 default:
50251 return false;
50255 /* Expand an insert into a vector register through pinsr insn.
50256 Return true if successful. */
50258 bool
50259 ix86_expand_pinsr (rtx *operands)
50261 rtx dst = operands[0];
50262 rtx src = operands[3];
50264 unsigned int size = INTVAL (operands[1]);
50265 unsigned int pos = INTVAL (operands[2]);
50267 if (SUBREG_P (dst))
50269 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
50270 dst = SUBREG_REG (dst);
50273 switch (GET_MODE (dst))
50275 case V16QImode:
50276 case V8HImode:
50277 case V4SImode:
50278 case V2DImode:
50279 case V1TImode:
50280 case TImode:
50282 machine_mode srcmode, dstmode;
50283 rtx (*pinsr)(rtx, rtx, rtx, rtx);
50284 rtx d;
50286 srcmode = mode_for_size (size, MODE_INT, 0);
50288 switch (srcmode)
50290 case QImode:
50291 if (!TARGET_SSE4_1)
50292 return false;
50293 dstmode = V16QImode;
50294 pinsr = gen_sse4_1_pinsrb;
50295 break;
50297 case HImode:
50298 if (!TARGET_SSE2)
50299 return false;
50300 dstmode = V8HImode;
50301 pinsr = gen_sse2_pinsrw;
50302 break;
50304 case SImode:
50305 if (!TARGET_SSE4_1)
50306 return false;
50307 dstmode = V4SImode;
50308 pinsr = gen_sse4_1_pinsrd;
50309 break;
50311 case DImode:
50312 gcc_assert (TARGET_64BIT);
50313 if (!TARGET_SSE4_1)
50314 return false;
50315 dstmode = V2DImode;
50316 pinsr = gen_sse4_1_pinsrq;
50317 break;
50319 default:
50320 return false;
50323 /* Reject insertions to misaligned positions. */
50324 if (pos & (size-1))
50325 return false;
50327 if (SUBREG_P (src))
50329 unsigned int srcpos = SUBREG_BYTE (src);
50331 if (srcpos > 0)
50333 rtx extr_ops[4];
50335 extr_ops[0] = gen_reg_rtx (srcmode);
50336 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
50337 extr_ops[2] = GEN_INT (size);
50338 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
50340 if (!ix86_expand_pextr (extr_ops))
50341 return false;
50343 src = extr_ops[0];
50345 else
50346 src = gen_lowpart (srcmode, SUBREG_REG (src));
50349 if (GET_MODE (dst) == dstmode)
50350 d = dst;
50351 else
50352 d = gen_reg_rtx (dstmode);
50354 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
50355 gen_lowpart (srcmode, src),
50356 GEN_INT (1 << (pos / size))));
50357 if (d != dst)
50358 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50359 return true;
50362 default:
50363 return false;
50367 /* This function returns the calling abi specific va_list type node.
50368 It returns the FNDECL specific va_list type. */
50370 static tree
50371 ix86_fn_abi_va_list (tree fndecl)
50373 if (!TARGET_64BIT)
50374 return va_list_type_node;
50375 gcc_assert (fndecl != NULL_TREE);
50377 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
50378 return ms_va_list_type_node;
50379 else
50380 return sysv_va_list_type_node;
50383 /* Returns the canonical va_list type specified by TYPE. If there
50384 is no valid TYPE provided, it returns NULL_TREE. */
50386 static tree
50387 ix86_canonical_va_list_type (tree type)
50389 if (TARGET_64BIT)
50391 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
50392 return ms_va_list_type_node;
50394 if ((TREE_CODE (type) == ARRAY_TYPE
50395 && integer_zerop (array_type_nelts (type)))
50396 || POINTER_TYPE_P (type))
50398 tree elem_type = TREE_TYPE (type);
50399 if (TREE_CODE (elem_type) == RECORD_TYPE
50400 && lookup_attribute ("sysv_abi va_list",
50401 TYPE_ATTRIBUTES (elem_type)))
50402 return sysv_va_list_type_node;
50405 return NULL_TREE;
50408 return std_canonical_va_list_type (type);
50411 /* Iterate through the target-specific builtin types for va_list.
50412 IDX denotes the iterator, *PTREE is set to the result type of
50413 the va_list builtin, and *PNAME to its internal type.
50414 Returns zero if there is no element for this index, otherwise
50415 IDX should be increased upon the next call.
50416 Note, do not iterate a base builtin's name like __builtin_va_list.
50417 Used from c_common_nodes_and_builtins. */
50419 static int
50420 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
50422 if (TARGET_64BIT)
50424 switch (idx)
50426 default:
50427 break;
50429 case 0:
50430 *ptree = ms_va_list_type_node;
50431 *pname = "__builtin_ms_va_list";
50432 return 1;
50434 case 1:
50435 *ptree = sysv_va_list_type_node;
50436 *pname = "__builtin_sysv_va_list";
50437 return 1;
50441 return 0;
50444 #undef TARGET_SCHED_DISPATCH
50445 #define TARGET_SCHED_DISPATCH has_dispatch
50446 #undef TARGET_SCHED_DISPATCH_DO
50447 #define TARGET_SCHED_DISPATCH_DO do_dispatch
50448 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50449 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50450 #undef TARGET_SCHED_REORDER
50451 #define TARGET_SCHED_REORDER ix86_sched_reorder
50452 #undef TARGET_SCHED_ADJUST_PRIORITY
50453 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50454 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50455 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50456 ix86_dependencies_evaluation_hook
50458 /* The size of the dispatch window is the total number of bytes of
50459 object code allowed in a window. */
50460 #define DISPATCH_WINDOW_SIZE 16
50462 /* Number of dispatch windows considered for scheduling. */
50463 #define MAX_DISPATCH_WINDOWS 3
50465 /* Maximum number of instructions in a window. */
50466 #define MAX_INSN 4
50468 /* Maximum number of immediate operands in a window. */
50469 #define MAX_IMM 4
50471 /* Maximum number of immediate bits allowed in a window. */
50472 #define MAX_IMM_SIZE 128
50474 /* Maximum number of 32 bit immediates allowed in a window. */
50475 #define MAX_IMM_32 4
50477 /* Maximum number of 64 bit immediates allowed in a window. */
50478 #define MAX_IMM_64 2
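/* In count_num_restricted below a 64 bit immediate is weighted as two 32 bit
   immediates when an insn is checked against MAX_IMM_32; e.g. an insn
   carrying one 64 bit immediate cannot join a window that already holds
   three 32 bit immediates.  */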
50480 /* Maximum total of loads or prefetches allowed in a window. */
50481 #define MAX_LOAD 2
50483 /* Maximum total of stores allowed in a window. */
50484 #define MAX_STORE 1
50486 #undef BIG
50487 #define BIG 100
50490 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
50491 enum dispatch_group {
50492 disp_no_group = 0,
50493 disp_load,
50494 disp_store,
50495 disp_load_store,
50496 disp_prefetch,
50497 disp_imm,
50498 disp_imm_32,
50499 disp_imm_64,
50500 disp_branch,
50501 disp_cmp,
50502 disp_jcc,
50503 disp_last
50506 /* Number of allowable groups in a dispatch window. It is an array
50507 indexed by dispatch_group enum. 100 is used as a big number,
50508 because the number of these kind of operations does not have any
50509 effect in dispatch window, but we need them for other reasons in
50510 the table. */
50511 static unsigned int num_allowable_groups[disp_last] = {
50512 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
50515 char group_name[disp_last + 1][16] = {
50516 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
50517 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
50518 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
50521 /* Instruction path. */
50522 enum insn_path {
50523 no_path = 0,
50524 path_single, /* Single micro op. */
50525 path_double, /* Double micro op. */
50526 path_multi, /* Instructions with more than 2 micro ops. */
50527 last_path
50530 /* sched_insn_info defines a window to the instructions scheduled in
50531 the basic block. It contains a pointer to the insn_info table and
50532 the instruction scheduled.
50534 Windows are allocated for each basic block and are linked
50535 together. */
50536 typedef struct sched_insn_info_s {
50537 rtx insn;
50538 enum dispatch_group group;
50539 enum insn_path path;
50540 int byte_len;
50541 int imm_bytes;
50542 } sched_insn_info;
50544 /* Linked list of dispatch windows. This is a two way list of
50545 dispatch windows of a basic block. It contains information about
50546 the number of uops in the window and the total number of
50547 instructions and of bytes in the object code for this dispatch
50548 window. */
50549 typedef struct dispatch_windows_s {
50550 int num_insn; /* Number of insn in the window. */
50551 int num_uops; /* Number of uops in the window. */
50552 int window_size; /* Number of bytes in the window. */
50553 int window_num; /* Window number, either 0 or 1. */
50554 int num_imm; /* Number of immediates in an insn. */
50555 int num_imm_32; /* Number of 32 bit immediates in an insn. */
50556 int num_imm_64; /* Number of 64 bit immediates in an insn. */
50557 int imm_size; /* Total immediates in the window. */
50558 int num_loads; /* Total memory loads in the window. */
50559 int num_stores; /* Total memory stores in the window. */
50560 int violation; /* Violation exists in window. */
50561 sched_insn_info *window; /* Pointer to the window. */
50562 struct dispatch_windows_s *next;
50563 struct dispatch_windows_s *prev;
50564 } dispatch_windows;
50566 /* Immediate values used in an insn. */
50567 typedef struct imm_info_s
50569 int imm;
50570 int imm32;
50571 int imm64;
50572 } imm_info;
50574 static dispatch_windows *dispatch_window_list;
50575 static dispatch_windows *dispatch_window_list1;
50577 /* Get dispatch group of insn. */
50579 static enum dispatch_group
50580 get_mem_group (rtx_insn *insn)
50582 enum attr_memory memory;
50584 if (INSN_CODE (insn) < 0)
50585 return disp_no_group;
50586 memory = get_attr_memory (insn);
50587 if (memory == MEMORY_STORE)
50588 return disp_store;
50590 if (memory == MEMORY_LOAD)
50591 return disp_load;
50593 if (memory == MEMORY_BOTH)
50594 return disp_load_store;
50596 return disp_no_group;
50599 /* Return true if insn is a compare instruction. */
50601 static bool
50602 is_cmp (rtx_insn *insn)
50604 enum attr_type type;
50606 type = get_attr_type (insn);
50607 return (type == TYPE_TEST
50608 || type == TYPE_ICMP
50609 || type == TYPE_FCMP
50610 || GET_CODE (PATTERN (insn)) == COMPARE);
50613 /* Return true if a dispatch violation was encountered. */
50615 static bool
50616 dispatch_violation (void)
50618 if (dispatch_window_list->next)
50619 return dispatch_window_list->next->violation;
50620 return dispatch_window_list->violation;
50623 /* Return true if insn is a branch instruction. */
50625 static bool
50626 is_branch (rtx_insn *insn)
50628 return (CALL_P (insn) || JUMP_P (insn));
50631 /* Return true if insn is a prefetch instruction. */
50633 static bool
50634 is_prefetch (rtx_insn *insn)
50636 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
50639 /* This function initializes a dispatch window and the list container holding a
50640 pointer to the window. */
50642 static void
50643 init_window (int window_num)
50645 int i;
50646 dispatch_windows *new_list;
50648 if (window_num == 0)
50649 new_list = dispatch_window_list;
50650 else
50651 new_list = dispatch_window_list1;
50653 new_list->num_insn = 0;
50654 new_list->num_uops = 0;
50655 new_list->window_size = 0;
50656 new_list->next = NULL;
50657 new_list->prev = NULL;
50658 new_list->window_num = window_num;
50659 new_list->num_imm = 0;
50660 new_list->num_imm_32 = 0;
50661 new_list->num_imm_64 = 0;
50662 new_list->imm_size = 0;
50663 new_list->num_loads = 0;
50664 new_list->num_stores = 0;
50665 new_list->violation = false;
50667 for (i = 0; i < MAX_INSN; i++)
50669 new_list->window[i].insn = NULL;
50670 new_list->window[i].group = disp_no_group;
50671 new_list->window[i].path = no_path;
50672 new_list->window[i].byte_len = 0;
50673 new_list->window[i].imm_bytes = 0;
50675 return;
50678 /* This function allocates and initializes a dispatch window and the
50679 list container holding a pointer to the window. */
50681 static dispatch_windows *
50682 allocate_window (void)
50684 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
50685 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
50687 return new_list;
50690 /* This routine initializes the dispatch scheduling information. It
50691 initiates building dispatch scheduler tables and constructs the
50692 first dispatch window. */
50694 static void
50695 init_dispatch_sched (void)
50697 /* Allocate a dispatch list and a window. */
50698 dispatch_window_list = allocate_window ();
50699 dispatch_window_list1 = allocate_window ();
50700 init_window (0);
50701 init_window (1);
50704 /* This function returns true if a branch is detected. End of a basic block
50705 does not have to be a branch, but here we assume only branches end a
50706 window. */
50708 static bool
50709 is_end_basic_block (enum dispatch_group group)
50711 return group == disp_branch;
50714 /* This function is called when the end of a window processing is reached. */
50716 static void
50717 process_end_window (void)
50719 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
50720 if (dispatch_window_list->next)
50722 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
50723 gcc_assert (dispatch_window_list->window_size
50724 + dispatch_window_list1->window_size <= 48);
50725 init_window (1);
50727 init_window (0);
50730 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
50731 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
50732 for 48 bytes of instructions. Note that these windows are not dispatch
50733 windows whose size is DISPATCH_WINDOW_SIZE. */
50735 static dispatch_windows *
50736 allocate_next_window (int window_num)
50738 if (window_num == 0)
50740 if (dispatch_window_list->next)
50741 init_window (1);
50742 init_window (0);
50743 return dispatch_window_list;
50746 dispatch_window_list->next = dispatch_window_list1;
50747 dispatch_window_list1->prev = dispatch_window_list;
50749 return dispatch_window_list1;
50752 /* Compute number of immediate operands of an instruction. */
50754 static void
50755 find_constant (rtx in_rtx, imm_info *imm_values)
50757 if (INSN_P (in_rtx))
50758 in_rtx = PATTERN (in_rtx);
50759 subrtx_iterator::array_type array;
50760 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
50761 if (const_rtx x = *iter)
50762 switch (GET_CODE (x))
50764 case CONST:
50765 case SYMBOL_REF:
50766 case CONST_INT:
50767 (imm_values->imm)++;
50768 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
50769 (imm_values->imm32)++;
50770 else
50771 (imm_values->imm64)++;
50772 break;
50774 case CONST_DOUBLE:
50775 case CONST_WIDE_INT:
50776 (imm_values->imm)++;
50777 (imm_values->imm64)++;
50778 break;
50780 case CODE_LABEL:
50781 if (LABEL_KIND (x) == LABEL_NORMAL)
50783 (imm_values->imm)++;
50784 (imm_values->imm32)++;
50786 break;
50788 default:
50789 break;
50793 /* Return total size of immediate operands of an instruction along with number
50794 of corresponding immediate-operands. It initializes its parameters to zero
50795 before calling FIND_CONSTANT.
50796 INSN is the input instruction. IMM is the total of immediates.
50797 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
50798 bit immediates. */
50800 static int
50801 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
50803 imm_info imm_values = {0, 0, 0};
50805 find_constant (insn, &imm_values);
50806 *imm = imm_values.imm;
50807 *imm32 = imm_values.imm32;
50808 *imm64 = imm_values.imm64;
50809 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
50812 /* This function indicates whether an instruction has any immediate
50813 operands. */
50815 static bool
50816 has_immediate (rtx_insn *insn)
50818 int num_imm_operand;
50819 int num_imm32_operand;
50820 int num_imm64_operand;
50822 if (insn)
50823 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50824 &num_imm64_operand);
50825 return false;
50828 /* Return single or double path for instructions. */
50830 static enum insn_path
50831 get_insn_path (rtx_insn *insn)
50833 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
50835 if ((int)path == 0)
50836 return path_single;
50838 if ((int)path == 1)
50839 return path_double;
50841 return path_multi;
50844 /* Return insn dispatch group. */
50846 static enum dispatch_group
50847 get_insn_group (rtx_insn *insn)
50849 enum dispatch_group group = get_mem_group (insn);
50850 if (group)
50851 return group;
50853 if (is_branch (insn))
50854 return disp_branch;
50856 if (is_cmp (insn))
50857 return disp_cmp;
50859 if (has_immediate (insn))
50860 return disp_imm;
50862 if (is_prefetch (insn))
50863 return disp_prefetch;
50865 return disp_no_group;
50868 /* Count number of GROUP restricted instructions in a dispatch
50869 window WINDOW_LIST. */
50871 static int
50872 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
50874 enum dispatch_group group = get_insn_group (insn);
50875 int imm_size;
50876 int num_imm_operand;
50877 int num_imm32_operand;
50878 int num_imm64_operand;
50880 if (group == disp_no_group)
50881 return 0;
50883 if (group == disp_imm)
50885 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50886 &num_imm64_operand);
50887 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
50888 || num_imm_operand + window_list->num_imm > MAX_IMM
50889 || (num_imm32_operand > 0
50890 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
50891 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
50892 || (num_imm64_operand > 0
50893 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
50894 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
50895 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
50896 && num_imm64_operand > 0
50897 && ((window_list->num_imm_64 > 0
50898 && window_list->num_insn >= 2)
50899 || window_list->num_insn >= 3)))
50900 return BIG;
50902 return 1;
50905 if ((group == disp_load_store
50906 && (window_list->num_loads >= MAX_LOAD
50907 || window_list->num_stores >= MAX_STORE))
50908 || ((group == disp_load
50909 || group == disp_prefetch)
50910 && window_list->num_loads >= MAX_LOAD)
50911 || (group == disp_store
50912 && window_list->num_stores >= MAX_STORE))
50913 return BIG;
50915 return 1;
50918 /* This function returns true if insn satisfies dispatch rules on the
50919 last window scheduled. */
50921 static bool
50922 fits_dispatch_window (rtx_insn *insn)
50924 dispatch_windows *window_list = dispatch_window_list;
50925 dispatch_windows *window_list_next = dispatch_window_list->next;
50926 unsigned int num_restrict;
50927 enum dispatch_group group = get_insn_group (insn);
50928 enum insn_path path = get_insn_path (insn);
50929 int sum;
50931 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
50932 instructions should be given the lowest priority in the
50933 scheduling process in Haifa scheduler to make sure they will be
50934 scheduled in the same dispatch window as the reference to them. */
50935 if (group == disp_jcc || group == disp_cmp)
50936 return false;
50938 /* Check nonrestricted. */
50939 if (group == disp_no_group || group == disp_branch)
50940 return true;
50942 /* Get last dispatch window. */
50943 if (window_list_next)
50944 window_list = window_list_next;
50946 if (window_list->window_num == 1)
50948 sum = window_list->prev->window_size + window_list->window_size;
50950 if (sum == 32
50951 || (min_insn_size (insn) + sum) >= 48)
50952 /* Window 1 is full. Go for next window. */
50953 return true;
50956 num_restrict = count_num_restricted (insn, window_list);
50958 if (num_restrict > num_allowable_groups[group])
50959 return false;
50961 /* See if it fits in the first window. */
50962 if (window_list->window_num == 0)
50964 /* The first window should have only single and double path
50965 uops. */
50966 if (path == path_double
50967 && (window_list->num_uops + 2) > MAX_INSN)
50968 return false;
50969 else if (path != path_single)
50970 return false;
50972 return true;
50975 /* Add an instruction INSN with NUM_UOPS micro-operations to the
50976 dispatch window WINDOW_LIST. */
50978 static void
50979 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
50981 int byte_len = min_insn_size (insn);
50982 int num_insn = window_list->num_insn;
50983 int imm_size;
50984 sched_insn_info *window = window_list->window;
50985 enum dispatch_group group = get_insn_group (insn);
50986 enum insn_path path = get_insn_path (insn);
50987 int num_imm_operand;
50988 int num_imm32_operand;
50989 int num_imm64_operand;
50991 if (!window_list->violation && group != disp_cmp
50992 && !fits_dispatch_window (insn))
50993 window_list->violation = true;
50995 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50996 &num_imm64_operand);
50998 /* Initialize window with new instruction. */
50999 window[num_insn].insn = insn;
51000 window[num_insn].byte_len = byte_len;
51001 window[num_insn].group = group;
51002 window[num_insn].path = path;
51003 window[num_insn].imm_bytes = imm_size;
51005 window_list->window_size += byte_len;
51006 window_list->num_insn = num_insn + 1;
51007 window_list->num_uops = window_list->num_uops + num_uops;
51008 window_list->imm_size += imm_size;
51009 window_list->num_imm += num_imm_operand;
51010 window_list->num_imm_32 += num_imm32_operand;
51011 window_list->num_imm_64 += num_imm64_operand;
51013 if (group == disp_store)
51014 window_list->num_stores += 1;
51015 else if (group == disp_load
51016 || group == disp_prefetch)
51017 window_list->num_loads += 1;
51018 else if (group == disp_load_store)
51020 window_list->num_stores += 1;
51021 window_list->num_loads += 1;
51025 /* Adds a scheduled instruction, INSN, to the current dispatch window.
51026 If the total bytes of instructions or the number of instructions in
51027 the window exceed the allowed maximum, it allocates a new window. */
51029 static void
51030 add_to_dispatch_window (rtx_insn *insn)
51032 int byte_len;
51033 dispatch_windows *window_list;
51034 dispatch_windows *next_list;
51035 dispatch_windows *window0_list;
51036 enum insn_path path;
51037 enum dispatch_group insn_group;
51038 bool insn_fits;
51039 int num_insn;
51040 int num_uops;
51041 int window_num;
51042 int insn_num_uops;
51043 int sum;
51045 if (INSN_CODE (insn) < 0)
51046 return;
51048 byte_len = min_insn_size (insn);
51049 window_list = dispatch_window_list;
51050 next_list = window_list->next;
51051 path = get_insn_path (insn);
51052 insn_group = get_insn_group (insn);
51054 /* Get the last dispatch window. */
51055 if (next_list)
51056 window_list = dispatch_window_list->next;
51058 if (path == path_single)
51059 insn_num_uops = 1;
51060 else if (path == path_double)
51061 insn_num_uops = 2;
51062 else
51063 insn_num_uops = (int) path;
51065 /* If current window is full, get a new window.
51066 Window number zero is full, if MAX_INSN uops are scheduled in it.
51067 Window number one is full, if window zero's bytes plus window
51068 one's bytes sum to 32, or if the bytes of the new instruction added
51069 to the total make it 48 or more, or if it already has MAX_INSN
51070 instructions in it. */
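/* E.g. if window zero holds 20 bytes and window one holds 12 bytes, their
   sum is 32 and window one is full; likewise a 44-byte total plus a 5-byte
   insn reaches 48 and window one is treated as full.  */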
51071 num_insn = window_list->num_insn;
51072 num_uops = window_list->num_uops;
51073 window_num = window_list->window_num;
51074 insn_fits = fits_dispatch_window (insn);
51076 if (num_insn >= MAX_INSN
51077 || num_uops + insn_num_uops > MAX_INSN
51078 || !(insn_fits))
51080 window_num = ~window_num & 1;
51081 window_list = allocate_next_window (window_num);
51084 if (window_num == 0)
51086 add_insn_window (insn, window_list, insn_num_uops);
51087 if (window_list->num_insn >= MAX_INSN
51088 && insn_group == disp_branch)
51090 process_end_window ();
51091 return;
51094 else if (window_num == 1)
51096 window0_list = window_list->prev;
51097 sum = window0_list->window_size + window_list->window_size;
51098 if (sum == 32
51099 || (byte_len + sum) >= 48)
51101 process_end_window ();
51102 window_list = dispatch_window_list;
51105 add_insn_window (insn, window_list, insn_num_uops);
51107 else
51108 gcc_unreachable ();
51110 if (is_end_basic_block (insn_group))
51112 /* End of basic block is reached; do end-basic-block processing. */
51113 process_end_window ();
51114 return;
51118 /* Print the dispatch window, WINDOW_NUM, to FILE. */
51120 DEBUG_FUNCTION static void
51121 debug_dispatch_window_file (FILE *file, int window_num)
51123 dispatch_windows *list;
51124 int i;
51126 if (window_num == 0)
51127 list = dispatch_window_list;
51128 else
51129 list = dispatch_window_list1;
51131 fprintf (file, "Window #%d:\n", list->window_num);
51132 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
51133 list->num_insn, list->num_uops, list->window_size);
51134 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51135 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
51137 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
51138 list->num_stores);
51139 fprintf (file, " insn info:\n");
51141 for (i = 0; i < MAX_INSN; i++)
51143 if (!list->window[i].insn)
51144 break;
51145 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
51146 i, group_name[list->window[i].group],
51147 i, (void *)list->window[i].insn,
51148 i, list->window[i].path,
51149 i, list->window[i].byte_len,
51150 i, list->window[i].imm_bytes);
51154 /* Print to stdout a dispatch window. */
51156 DEBUG_FUNCTION void
51157 debug_dispatch_window (int window_num)
51159 debug_dispatch_window_file (stdout, window_num);
51162 /* Print INSN dispatch information to FILE. */
51164 DEBUG_FUNCTION static void
51165 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
51167 int byte_len;
51168 enum insn_path path;
51169 enum dispatch_group group;
51170 int imm_size;
51171 int num_imm_operand;
51172 int num_imm32_operand;
51173 int num_imm64_operand;
51175 if (INSN_CODE (insn) < 0)
51176 return;
51178 byte_len = min_insn_size (insn);
51179 path = get_insn_path (insn);
51180 group = get_insn_group (insn);
51181 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51182 &num_imm64_operand);
51184 fprintf (file, " insn info:\n");
51185 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
51186 group_name[group], path, byte_len);
51187 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51188 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
51191 /* Print to STDOUT the status of the ready list with respect to
51192 dispatch windows. */
51194 DEBUG_FUNCTION void
51195 debug_ready_dispatch (void)
51197 int i;
51198 int no_ready = number_in_ready ();
51200 fprintf (stdout, "Number of ready: %d\n", no_ready);
51202 for (i = 0; i < no_ready; i++)
51203 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
51206 /* This routine is the driver of the dispatch scheduler. */
51208 static void
51209 do_dispatch (rtx_insn *insn, int mode)
51211 if (mode == DISPATCH_INIT)
51212 init_dispatch_sched ();
51213 else if (mode == ADD_TO_DISPATCH_WINDOW)
51214 add_to_dispatch_window (insn);
51217 /* Return TRUE if Dispatch Scheduling is supported. */
51219 static bool
51220 has_dispatch (rtx_insn *insn, int action)
51222 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
51223 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
51224 switch (action)
51226 default:
51227 return false;
51229 case IS_DISPATCH_ON:
51230 return true;
51232 case IS_CMP:
51233 return is_cmp (insn);
51235 case DISPATCH_VIOLATION:
51236 return dispatch_violation ();
51238 case FITS_DISPATCH_WINDOW:
51239 return fits_dispatch_window (insn);
51242 return false;
51245 /* Implementation of reassociation_width target hook used by
51246 reassoc phase to identify parallelism level in reassociated
51247 tree. The statement's tree_code is passed in OPC; the argument type
51248 is passed in MODE.
51250 Currently parallel reassociation is enabled for Atom
51251 processors only and we set reassociation width to be 2
51252 because Atom may issue up to 2 instructions per cycle.
51254 Return value should be fixed if parallel reassociation is
51255 enabled for other processors. */
51257 static int
51258 ix86_reassociation_width (unsigned int, machine_mode mode)
51260 /* Vector part. */
51261 if (VECTOR_MODE_P (mode))
51263 if (TARGET_VECTOR_PARALLEL_EXECUTION)
51264 return 2;
51265 else
51266 return 1;
51269 /* Scalar part. */
51270 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
51271 return 2;
51272 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
51273 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
51274 else
51275 return 1;
51278 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
51279 place emms and femms instructions. */
51281 static machine_mode
51282 ix86_preferred_simd_mode (machine_mode mode)
51284 if (!TARGET_SSE)
51285 return word_mode;
51287 switch (mode)
51289 case QImode:
51290 return TARGET_AVX512BW ? V64QImode :
51291 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
51292 case HImode:
51293 return TARGET_AVX512BW ? V32HImode :
51294 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
51295 case SImode:
51296 return TARGET_AVX512F ? V16SImode :
51297 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
51298 case DImode:
51299 return TARGET_AVX512F ? V8DImode :
51300 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
51302 case SFmode:
51303 if (TARGET_AVX512F)
51304 return V16SFmode;
51305 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51306 return V8SFmode;
51307 else
51308 return V4SFmode;
51310 case DFmode:
51311 if (TARGET_AVX512F)
51312 return V8DFmode;
51313 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51314 return V4DFmode;
51315 else if (TARGET_SSE2)
51316 return V2DFmode;
51317 /* FALLTHRU */
51319 default:
51320 return word_mode;
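/* For example (illustrative): with -mavx2 and without -mprefer-avx128,
   SFmode data is vectorized in V8SFmode; with plain -msse2 the choice is
   V4SFmode; and with -mavx512f it becomes V16SFmode.  */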
51324 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
51325 vectors. If AVX512F is enabled then try vectorizing with 512bit,
51326 256bit and 128bit vectors. */
51328 static unsigned int
51329 ix86_autovectorize_vector_sizes (void)
51331 return TARGET_AVX512F ? 64 | 32 | 16 :
51332 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
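/* The return value is a bitmask of vector sizes in bytes; e.g. with
   AVX-512F it is 64 | 32 | 16, so the vectorizer tries zmm-, ymm- and
   xmm-sized vectors in turn, while 0 means only the preferred SIMD mode
   is tried.  */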
51335 /* Implementation of targetm.vectorize.get_mask_mode. */
51337 static machine_mode
51338 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
51340 unsigned elem_size = vector_size / nunits;
51342 /* Scalar mask case. */
51343 if ((TARGET_AVX512F && vector_size == 64)
51344 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
51346 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
51347 return smallest_mode_for_size (nunits, MODE_INT);
51350 machine_mode elem_mode
51351 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
51353 gcc_assert (elem_size * nunits == vector_size);
51355 return mode_for_vector (elem_mode, nunits);
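/* Worked examples of the hook above (illustrative): with AVX-512F and a
   64-byte vector of 16 SImode elements, elem_size is 4, so a scalar HImode
   mask (one bit per element) is returned; with only AVX2, an 8-element
   32-byte SFmode vector falls through to the vector-mask case and yields
   V8SImode.  */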
51360 /* Return class of registers which could be used for pseudo of MODE
51361 and of class RCLASS for spilling instead of memory. Return NO_REGS
51362 if it is not possible or non-profitable. */
51364 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
51366 static reg_class_t
51367 ix86_spill_class (reg_class_t rclass, machine_mode mode)
51369 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
51370 && TARGET_SSE2
51371 && TARGET_INTER_UNIT_MOVES_TO_VEC
51372 && TARGET_INTER_UNIT_MOVES_FROM_VEC
51373 && (mode == SImode || (TARGET_64BIT && mode == DImode))
51374 && INTEGER_CLASS_P (rclass))
51375 return ALL_SSE_REGS;
51376 return NO_REGS;
51379 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
51380 but returns a lower bound. */
51382 static unsigned int
51383 ix86_max_noce_ifcvt_seq_cost (edge e)
51385 bool predictable_p = predictable_edge_p (e);
51387 enum compiler_param param
51388 = (predictable_p
51389 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
51390 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
51392 /* If we have a parameter set, use that, otherwise take a guess using
51393 BRANCH_COST. */
51394 if (global_options_set.x_param_values[param])
51395 return PARAM_VALUE (param);
51396 else
51397 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
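/* A worked example of the fallback above (illustrative, assuming
   COSTS_N_INSNS (N) expands to N * 4): if the relevant --param is not set
   and BRANCH_COST (true, predictable_p) is 3, the returned bound is
   3 * COSTS_N_INSNS (2) == 24.  */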
51400 /* Return true if SEQ is a good candidate as a replacement for the
51401 if-convertible sequence described in IF_INFO. */
51403 static bool
51404 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
51406 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
51408 int cmov_cnt = 0;
51409 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
51410 Maybe we should allow even more conditional moves as long as they
51411 are spaced far enough apart not to stall the CPU, or also consider
51412 IF_INFO->TEST_BB successor edge probabilities. */
51413 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
51415 rtx set = single_set (insn);
51416 if (!set)
51417 continue;
51418 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
51419 continue;
51420 rtx src = SET_SRC (set);
51421 machine_mode mode = GET_MODE (src);
51422 if (GET_MODE_CLASS (mode) != MODE_INT
51423 && GET_MODE_CLASS (mode) != MODE_FLOAT)
51424 continue;
51425 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
51426 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
51427 continue;
51428 /* insn is CMOV or FCMOV. */
51429 if (++cmov_cnt > 1)
51430 return false;
51433 return default_noce_conversion_profitable_p (seq, if_info);
51436 /* Implement targetm.vectorize.init_cost. */
51438 static void *
51439 ix86_init_cost (struct loop *)
51441 unsigned *cost = XNEWVEC (unsigned, 3);
51442 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
51443 return cost;
51446 /* Implement targetm.vectorize.add_stmt_cost. */
51448 static unsigned
51449 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
51450 struct _stmt_vec_info *stmt_info, int misalign,
51451 enum vect_cost_model_location where)
51453 unsigned *cost = (unsigned *) data;
51454 unsigned retval = 0;
51456 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
51457 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
51459 /* Penalize DFmode vector operations for Bonnell. */
51460 if (TARGET_BONNELL && kind == vector_stmt
51461 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
51462 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
51464 /* Statements in an inner loop relative to the loop being
51465 vectorized are weighted more heavily. The value here is
51466 arbitrary and could potentially be improved with analysis. */
51467 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
51468 count *= 50; /* FIXME. */
51470 retval = (unsigned) (count * stmt_cost);
51472 /* We need to multiply all vector stmt cost by 1.7 (estimated cost)
51473 for Silvermont as it has an out-of-order integer pipeline and can execute
51474 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
51475 if ((TARGET_SILVERMONT || TARGET_INTEL)
51476 && stmt_info && stmt_info->stmt)
51478 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
51479 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
51480 retval = (retval * 17) / 10;
51483 cost[where] += retval;
51485 return retval;
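/* A worked example of the accumulation above (illustrative): a vector_stmt
   of cost 1 in the loop body, on a Silvermont target and with an integer
   lhs, yields retval = (1 * 1 * 17) / 10 == 1 added to cost[vect_body];
   had the statement been in an inner loop relative to the loop being
   vectorized, COUNT would first have been multiplied by 50.  */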
51488 /* Implement targetm.vectorize.finish_cost. */
51490 static void
51491 ix86_finish_cost (void *data, unsigned *prologue_cost,
51492 unsigned *body_cost, unsigned *epilogue_cost)
51494 unsigned *cost = (unsigned *) data;
51495 *prologue_cost = cost[vect_prologue];
51496 *body_cost = cost[vect_body];
51497 *epilogue_cost = cost[vect_epilogue];
51500 /* Implement targetm.vectorize.destroy_cost_data. */
51502 static void
51503 ix86_destroy_cost_data (void *data)
51505 free (data);
51508 /* Validate target specific memory model bits in VAL. */
51510 static unsigned HOST_WIDE_INT
51511 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
51513 enum memmodel model = memmodel_from_int (val);
51514 bool strong;
51516 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
51517 |MEMMODEL_MASK)
51518 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
51520 warning (OPT_Winvalid_memory_model,
51521 "Unknown architecture specific memory model");
51522 return MEMMODEL_SEQ_CST;
51524 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
51525 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
51527 warning (OPT_Winvalid_memory_model,
51528 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
51529 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
51531 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
51533 warning (OPT_Winvalid_memory_model,
51534 "HLE_RELEASE not used with RELEASE or stronger memory model");
51535 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
51537 return val;
51540 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
51541 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
51542 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
51543 or number of vecsize_mangle variants that should be emitted. */
51545 static int
51546 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
51547 struct cgraph_simd_clone *clonei,
51548 tree base_type, int num)
51550 int ret = 1;
51552 if (clonei->simdlen
51553 && (clonei->simdlen < 2
51554 || clonei->simdlen > 1024
51555 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
51557 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51558 "unsupported simdlen %d", clonei->simdlen);
51559 return 0;
51562 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
51563 if (TREE_CODE (ret_type) != VOID_TYPE)
51564 switch (TYPE_MODE (ret_type))
51566 case QImode:
51567 case HImode:
51568 case SImode:
51569 case DImode:
51570 case SFmode:
51571 case DFmode:
51572 /* case SCmode: */
51573 /* case DCmode: */
51574 break;
51575 default:
51576 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51577 "unsupported return type %qT for simd\n", ret_type);
51578 return 0;
51581 tree t;
51582 int i;
51584 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
51585 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
51586 switch (TYPE_MODE (TREE_TYPE (t)))
51588 case QImode:
51589 case HImode:
51590 case SImode:
51591 case DImode:
51592 case SFmode:
51593 case DFmode:
51594 /* case SCmode: */
51595 /* case DCmode: */
51596 break;
51597 default:
51598 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51599 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
51600 return 0;
51603 if (clonei->cilk_elemental)
51605 /* Parse the processor clause here. If not present, default to 'b'. */
51606 clonei->vecsize_mangle = 'b';
51608 else if (!TREE_PUBLIC (node->decl))
51610 /* If the function isn't exported, we can pick up just one ISA
51611 for the clones. */
51612 if (TARGET_AVX512F)
51613 clonei->vecsize_mangle = 'e';
51614 else if (TARGET_AVX2)
51615 clonei->vecsize_mangle = 'd';
51616 else if (TARGET_AVX)
51617 clonei->vecsize_mangle = 'c';
51618 else
51619 clonei->vecsize_mangle = 'b';
51620 ret = 1;
51622 else
51624 clonei->vecsize_mangle = "bcde"[num];
51625 ret = 4;
51627 clonei->mask_mode = VOIDmode;
51628 switch (clonei->vecsize_mangle)
51630 case 'b':
51631 clonei->vecsize_int = 128;
51632 clonei->vecsize_float = 128;
51633 break;
51634 case 'c':
51635 clonei->vecsize_int = 128;
51636 clonei->vecsize_float = 256;
51637 break;
51638 case 'd':
51639 clonei->vecsize_int = 256;
51640 clonei->vecsize_float = 256;
51641 break;
51642 case 'e':
51643 clonei->vecsize_int = 512;
51644 clonei->vecsize_float = 512;
51645 if (TYPE_MODE (base_type) == QImode)
51646 clonei->mask_mode = DImode;
51647 else
51648 clonei->mask_mode = SImode;
51649 break;
51651 if (clonei->simdlen == 0)
51653 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
51654 clonei->simdlen = clonei->vecsize_int;
51655 else
51656 clonei->simdlen = clonei->vecsize_float;
51657 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
51659 else if (clonei->simdlen > 16)
51661 /* For compatibility with ICC, use the same upper bounds
51662 for simdlen. In particular, for CTYPE below, use the return type,
51663 unless the function returns void, in which case use the characteristic
51664 type. If it is possible for the given SIMDLEN to pass a CTYPE value
51665 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
51666 for 64-bit code), accept that SIMDLEN; otherwise warn and don't
51667 emit the corresponding clone. */
51668 tree ctype = ret_type;
51669 if (TREE_CODE (ret_type) == VOID_TYPE)
51670 ctype = base_type;
51671 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
51672 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
51673 cnt /= clonei->vecsize_int;
51674 else
51675 cnt /= clonei->vecsize_float;
51676 if (cnt > (TARGET_64BIT ? 16 : 8))
51678 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51679 "unsupported simdlen %d", clonei->simdlen);
51680 return 0;
51683 return ret;
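/* Illustrative example (user code, not part of this file):

     #pragma omp declare simd
     float scale (float x);

   For an exported function like this, compiled for generic x86-64, the hook
   above asks for ret == 4 clones mangled 'b', 'c', 'd' and 'e'.  The 'c'
   (AVX) variant has vecsize_float == 256, so with the float characteristic
   type its simdlen becomes 256 / 32 == 8.  */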
51686 /* Add target attribute to SIMD clone NODE if needed. */
51688 static void
51689 ix86_simd_clone_adjust (struct cgraph_node *node)
51691 const char *str = NULL;
51692 gcc_assert (node->decl == cfun->decl);
51693 switch (node->simdclone->vecsize_mangle)
51695 case 'b':
51696 if (!TARGET_SSE2)
51697 str = "sse2";
51698 break;
51699 case 'c':
51700 if (!TARGET_AVX)
51701 str = "avx";
51702 break;
51703 case 'd':
51704 if (!TARGET_AVX2)
51705 str = "avx2";
51706 break;
51707 case 'e':
51708 if (!TARGET_AVX512F)
51709 str = "avx512f";
51710 break;
51711 default:
51712 gcc_unreachable ();
51714 if (str == NULL)
51715 return;
51716 push_cfun (NULL);
51717 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
51718 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
51719 gcc_assert (ok);
51720 pop_cfun ();
51721 ix86_reset_previous_fndecl ();
51722 ix86_set_current_function (node->decl);
51725 /* If SIMD clone NODE can't be used in a vectorized loop
51726 in current function, return -1, otherwise return a badness of using it
51727 (0 if it is most desirable from vecsize_mangle point of view, 1
51728 slightly less desirable, etc.). */
51730 static int
51731 ix86_simd_clone_usable (struct cgraph_node *node)
51733 switch (node->simdclone->vecsize_mangle)
51735 case 'b':
51736 if (!TARGET_SSE2)
51737 return -1;
51738 if (!TARGET_AVX)
51739 return 0;
51740 return TARGET_AVX2 ? 2 : 1;
51741 case 'c':
51742 if (!TARGET_AVX)
51743 return -1;
51744 return TARGET_AVX2 ? 1 : 0;
51745 case 'd':
51746 if (!TARGET_AVX2)
51747 return -1;
51748 return 0;
51749 case 'e':
51750 if (!TARGET_AVX512F)
51751 return -1;
51752 return 0;
51753 default:
51754 gcc_unreachable ();
51758 /* This function adjusts the unroll factor based on
51759 hardware capabilities. For example, bdver3 has
51760 a loop buffer which makes unrolling of smaller
51761 loops less important. This function decides the
51762 unroll factor using the number of memory references
51763 in the loop body (with 32 as the cap) as a heuristic. */
51765 static unsigned
51766 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
51768 basic_block *bbs;
51769 rtx_insn *insn;
51770 unsigned i;
51771 unsigned mem_count = 0;
51773 if (!TARGET_ADJUST_UNROLL)
51774 return nunroll;
51776 /* Count the number of memory references within the loop body.
51777 This value determines the unrolling factor for bdver3 and bdver4
51778 architectures. */
51779 subrtx_iterator::array_type array;
51780 bbs = get_loop_body (loop);
51781 for (i = 0; i < loop->num_nodes; i++)
51782 FOR_BB_INSNS (bbs[i], insn)
51783 if (NONDEBUG_INSN_P (insn))
51784 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
51785 if (const_rtx x = *iter)
51786 if (MEM_P (x))
51788 machine_mode mode = GET_MODE (x);
51789 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
51790 if (n_words > 4)
51791 mem_count += 2;
51792 else
51793 mem_count += 1;
51795 free (bbs);
51797 if (mem_count && mem_count <= 32)
51798 return 32 / mem_count;
51800 return nunroll;
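/* A worked example of the heuristic above (illustrative): a loop body
   containing 8 word-sized memory references gives mem_count == 8, so the
   unroll factor is capped at 32 / 8 == 4 even if NUNROLL is larger; a
   reference wider than 4 words counts double.  */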
51804 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
51806 static bool
51807 ix86_float_exceptions_rounding_supported_p (void)
51809 /* For x87 floating point with standard excess precision handling,
51810 there is no adddf3 pattern (since x87 floating point only has
51811 XFmode operations) so the default hook implementation gets this
51812 wrong. */
51813 return TARGET_80387 || TARGET_SSE_MATH;
51816 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
51818 static void
51819 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
51821 if (!TARGET_80387 && !TARGET_SSE_MATH)
51822 return;
51823 tree exceptions_var = create_tmp_var_raw (integer_type_node);
51824 if (TARGET_80387)
51826 tree fenv_index_type = build_index_type (size_int (6));
51827 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
51828 tree fenv_var = create_tmp_var_raw (fenv_type);
51829 TREE_ADDRESSABLE (fenv_var) = 1;
51830 tree fenv_ptr = build_pointer_type (fenv_type);
51831 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
51832 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
51833 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
51834 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
51835 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
51836 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
51837 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
51838 tree hold_fnclex = build_call_expr (fnclex, 0);
51839 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
51840 NULL_TREE, NULL_TREE);
51841 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
51842 hold_fnclex);
51843 *clear = build_call_expr (fnclex, 0);
51844 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
51845 tree fnstsw_call = build_call_expr (fnstsw, 0);
51846 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
51847 sw_var, fnstsw_call);
51848 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
51849 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
51850 exceptions_var, exceptions_x87);
51851 *update = build2 (COMPOUND_EXPR, integer_type_node,
51852 sw_mod, update_mod);
51853 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
51854 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
51856 if (TARGET_SSE_MATH)
51858 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
51859 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
51860 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
51861 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
51862 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
51863 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
51864 mxcsr_orig_var, stmxcsr_hold_call);
51865 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
51866 mxcsr_orig_var,
51867 build_int_cst (unsigned_type_node, 0x1f80));
51868 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
51869 build_int_cst (unsigned_type_node, 0xffffffc0));
51870 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
51871 mxcsr_mod_var, hold_mod_val);
51872 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
51873 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
51874 hold_assign_orig, hold_assign_mod);
51875 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
51876 ldmxcsr_hold_call);
51877 if (*hold)
51878 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
51879 else
51880 *hold = hold_all;
51881 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
51882 if (*clear)
51883 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
51884 ldmxcsr_clear_call);
51885 else
51886 *clear = ldmxcsr_clear_call;
51887 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
51888 tree exceptions_sse = fold_convert (integer_type_node,
51889 stxmcsr_update_call);
51890 if (*update)
51892 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
51893 exceptions_var, exceptions_sse);
51894 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
51895 exceptions_var, exceptions_mod);
51896 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
51897 exceptions_assign);
51899 else
51900 *update = build2 (MODIFY_EXPR, integer_type_node,
51901 exceptions_var, exceptions_sse);
51902 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
51903 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51904 ldmxcsr_update_call);
51906 tree atomic_feraiseexcept
51907 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
51908 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
51909 1, exceptions_var);
51910 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51911 atomic_feraiseexcept_call);
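/* Illustrative trigger for the hook above (user code, not part of this
   file): a compound assignment to an atomic floating-point object, e.g.

     _Atomic double d;
     d += 1.0;

   The front end wraps the compare-and-exchange loop with the trees built
   here: *HOLD saves and masks the FP environment, *CLEAR is run when the
   exchange fails and the loop retries, and *UPDATE restores the saved
   environment and re-raises the deferred exceptions via
   __atomic_feraiseexcept once the exchange succeeds.  */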
51914 /* Return the mode to be used for bounds, or VOIDmode
51915 if bounds are not supported. */
51917 static machine_mode
51918 ix86_mpx_bound_mode ()
51920 /* Do not support pointer checker if MPX
51921 is not enabled. */
51922 if (!TARGET_MPX)
51924 if (flag_check_pointer_bounds)
51925 warning (0, "Pointer Checker requires MPX support on this target."
51926 " Use the -mmpx option to enable MPX.");
51927 return VOIDmode;
51930 return BNDmode;
51933 /* Return constant used to statically initialize constant bounds.
51935 This function is used to create special bound values. For now
51936 only INIT bounds and NONE bounds are expected. More special
51937 values may be added later. */
51939 static tree
51940 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
51942 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
51943 : build_zero_cst (pointer_sized_int_node);
51944 tree high = ub ? build_zero_cst (pointer_sized_int_node)
51945 : build_minus_one_cst (pointer_sized_int_node);
51947 /* This function is supposed to be used to create INIT and
51948 NONE bounds only. */
51949 gcc_assert ((lb == 0 && ub == -1)
51950 || (lb == -1 && ub == 0));
51952 return build_complex (NULL, low, high);
51955 /* Generate a list of statements STMTS to initialize pointer bounds
51956 variable VAR with bounds LB and UB. Return the number of generated
51957 statements. */
51959 static int
51960 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
51962 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
51963 tree lhs, modify, var_p;
51965 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
51966 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
51968 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
51969 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
51970 append_to_statement_list (modify, stmts);
51972 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
51973 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
51974 TYPE_SIZE_UNIT (pointer_sized_int_node)));
51975 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
51976 append_to_statement_list (modify, stmts);
51978 return 2;
51981 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
51982 /* For i386, a common symbol is local only for non-PIE binaries. For
51983 x86-64, a common symbol is local only for non-PIE binaries or when the
51984 linker supports copy relocations in PIE binaries. */
51986 static bool
51987 ix86_binds_local_p (const_tree exp)
51989 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
51990 (!flag_pic
51991 || (TARGET_64BIT
51992 && HAVE_LD_PIE_COPYRELOC != 0)));
51994 #endif
51996 /* If MEM is in the form of [base+offset], extract the two parts
51997 of the address into BASE and OFFSET and return true; otherwise return false. */
51999 static bool
52000 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
52002 rtx addr;
52004 gcc_assert (MEM_P (mem));
52006 addr = XEXP (mem, 0);
52008 if (GET_CODE (addr) == CONST)
52009 addr = XEXP (addr, 0);
52011 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
52013 *base = addr;
52014 *offset = const0_rtx;
52015 return true;
52018 if (GET_CODE (addr) == PLUS
52019 && (REG_P (XEXP (addr, 0))
52020 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
52021 && CONST_INT_P (XEXP (addr, 1)))
52023 *base = XEXP (addr, 0);
52024 *offset = XEXP (addr, 1);
52025 return true;
52028 return false;
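/* For example, an address of the form (plus:DI (reg:DI bx) (const_int 8))
   is decomposed into *BASE == (reg:DI bx) and *OFFSET == (const_int 8),
   while a bare (reg:DI bx) or a (symbol_ref) yields *OFFSET == const0_rtx.  */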
52031 /* Given OPERANDS of consecutive load/store, check if we can merge
52032 them into move multiple. LOAD is true if they are load instructions.
52033 MODE is the mode of memory operands. */
52035 bool
52036 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
52037 machine_mode mode)
52039 HOST_WIDE_INT offval_1, offval_2, msize;
52040 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
52042 if (load)
52044 mem_1 = operands[1];
52045 mem_2 = operands[3];
52046 reg_1 = operands[0];
52047 reg_2 = operands[2];
52049 else
52051 mem_1 = operands[0];
52052 mem_2 = operands[2];
52053 reg_1 = operands[1];
52054 reg_2 = operands[3];
52057 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
52059 if (REGNO (reg_1) != REGNO (reg_2))
52060 return false;
52062 /* Check if the addresses are in the form of [base+offset]. */
52063 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
52064 return false;
52065 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
52066 return false;
52068 /* Check if the bases are the same. */
52069 if (!rtx_equal_p (base_1, base_2))
52070 return false;
52072 offval_1 = INTVAL (offset_1);
52073 offval_2 = INTVAL (offset_2);
52074 msize = GET_MODE_SIZE (mode);
52075 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
52076 if (offval_1 + msize != offval_2)
52077 return false;
52079 return true;
52082 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
52084 static bool
52085 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
52086 optimization_type opt_type)
52088 switch (op)
52090 case asin_optab:
52091 case acos_optab:
52092 case log1p_optab:
52093 case exp_optab:
52094 case exp10_optab:
52095 case exp2_optab:
52096 case expm1_optab:
52097 case ldexp_optab:
52098 case scalb_optab:
52099 case round_optab:
52100 return opt_type == OPTIMIZE_FOR_SPEED;
52102 case rint_optab:
52103 if (SSE_FLOAT_MODE_P (mode1)
52104 && TARGET_SSE_MATH
52105 && !flag_trapping_math
52106 && !TARGET_ROUND)
52107 return opt_type == OPTIMIZE_FOR_SPEED;
52108 return true;
52110 case floor_optab:
52111 case ceil_optab:
52112 case btrunc_optab:
52113 if (SSE_FLOAT_MODE_P (mode1)
52114 && TARGET_SSE_MATH
52115 && !flag_trapping_math
52116 && TARGET_ROUND)
52117 return true;
52118 return opt_type == OPTIMIZE_FOR_SPEED;
52120 case rsqrt_optab:
52121 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
52123 default:
52124 return true;
52128 /* Address space support.
52130 This is not "far pointers" in the 16-bit sense, but an easy way
52131 to use %fs and %gs segment prefixes. Therefore:
52133 (a) All address spaces have the same modes,
52134 (b) All address spaces have the same address forms,
52135 (c) While %fs and %gs are technically subsets of the generic
52136 address space, they are probably not subsets of each other.
52137 (d) Since we have no access to the segment base register values
52138 without resorting to a system call, we cannot convert a
52139 non-default address space to a default address space.
52140 Therefore we do not claim %fs or %gs are subsets of generic.
52142 Therefore we can (mostly) use the default hooks. */
52144 /* All use of segmentation is assumed to make address 0 valid. */
52146 static bool
52147 ix86_addr_space_zero_address_valid (addr_space_t as)
52149 return as != ADDR_SPACE_GENERIC;
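/* Illustrative user-level sketch (assuming the __seg_fs / __seg_gs named
   address space keywords available on x86):

     int __seg_gs *p = (int __seg_gs *) 0;
     int v = *p;

   This loads from %gs:0, so address 0 must be treated as valid in the
   non-generic spaces, which is what the hook above expresses.  */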
52152 static void
52153 ix86_init_libfuncs (void)
52155 if (TARGET_64BIT)
52157 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
52158 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
52160 else
52162 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
52163 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
52166 #if TARGET_MACHO
52167 darwin_rename_builtins ();
52168 #endif
52171 /* Generate call to __divmoddi4. */
52173 static void
52174 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
52175 rtx op0, rtx op1,
52176 rtx *quot_p, rtx *rem_p)
52178 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
52180 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
52181 mode, 3,
52182 op0, GET_MODE (op0),
52183 op1, GET_MODE (op1),
52184 XEXP (rem, 0), Pmode);
52185 *quot_p = quot;
52186 *rem_p = rem;
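/* The expansion above is roughly equivalent to the C-level call
     quot = __divmoddi4 (op0, op1, &rem);
   (or __divmodti4 for TImode in 64-bit mode), with the remainder returned
   through a stack slot.  */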
52189 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
52190 FPU, assume that the fpcw is set to extended precision; when using
52191 only SSE, rounding is correct; when using both SSE and the FPU,
52192 the rounding precision is indeterminate, since either may be chosen
52193 apparently at random. */
52195 static enum flt_eval_method
52196 ix86_excess_precision (enum excess_precision_type type)
52198 switch (type)
52200 case EXCESS_PRECISION_TYPE_FAST:
52201 /* The fastest type to promote to will always be the native type,
52202 whether that occurs with implicit excess precision or
52203 otherwise. */
52204 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52205 case EXCESS_PRECISION_TYPE_STANDARD:
52206 case EXCESS_PRECISION_TYPE_IMPLICIT:
52207 /* Otherwise, the excess precision we want when we are
52208 in a standards compliant mode, and the implicit precision we
52209 provide would be identical were it not for the unpredictable
52210 cases. */
52211 if (!TARGET_80387)
52212 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52213 else if (!TARGET_MIX_SSE_I387)
52215 if (!TARGET_SSE_MATH)
52216 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
52217 else if (TARGET_SSE2)
52218 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52221 /* If we are in standards compliant mode, but we know we will
52222 calculate in unpredictable precision, return
52223 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
52224 excess precision if the target can't guarantee it will honor
52225 it. */
52226 return (type == EXCESS_PRECISION_TYPE_STANDARD
52227 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
52228 : FLT_EVAL_METHOD_UNPREDICTABLE);
52229 default:
52230 gcc_unreachable ();
52233 return FLT_EVAL_METHOD_UNPREDICTABLE;
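/* For example (illustrative): 32-bit code with -mfpmath=387 and
   -fexcess-precision=standard gets FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE,
   i.e. float and double expressions are evaluated in the 80-bit x87 format,
   while x86-64 with SSE math gets FLT_EVAL_METHOD_PROMOTE_TO_FLOAT.  */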
52236 /* Target-specific selftests. */
52238 #if CHECKING_P
52240 namespace selftest {
52242 /* Verify that hard regs are dumped as expected (in compact mode). */
52244 static void
52245 ix86_test_dumping_hard_regs ()
52247 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
52248 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
52251 /* Test dumping an insn with repeated references to the same SCRATCH,
52252 to verify the rtx_reuse code. */
52254 static void
52255 ix86_test_dumping_memory_blockage ()
52257 set_new_first_and_last_insn (NULL, NULL);
52259 rtx pat = gen_memory_blockage ();
52260 rtx_reuse_manager r;
52261 r.preprocess (pat);
52263 /* Verify that the repeated references to the SCRATCH use
52264 reuse IDs. The first should be prefixed with a reuse ID,
52265 and the second should be dumped as a "reuse_rtx" of that ID.
52266 The expected string assumes Pmode == DImode. */
52267 if (Pmode == DImode)
52268 ASSERT_RTL_DUMP_EQ_WITH_REUSE
52269 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
52270 " (unspec:BLK [\n"
52271 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
52272 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
52275 /* Verify loading an RTL dump; specifically a dump of copying
52276 a param on x86_64 from a hard reg into the frame.
52277 This test is target-specific since the dump contains target-specific
52278 hard reg names. */
52280 static void
52281 ix86_test_loading_dump_fragment_1 ()
52283 rtl_dump_test t (SELFTEST_LOCATION,
52284 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
52286 rtx_insn *insn = get_insn_by_uid (1);
52288 /* The block structure and indentation here is purely for
52289 readability; it mirrors the structure of the rtx. */
52290 tree mem_expr;
52292 rtx pat = PATTERN (insn);
52293 ASSERT_EQ (SET, GET_CODE (pat));
52295 rtx dest = SET_DEST (pat);
52296 ASSERT_EQ (MEM, GET_CODE (dest));
52297 /* Verify the "/c" was parsed. */
52298 ASSERT_TRUE (RTX_FLAG (dest, call));
52299 ASSERT_EQ (SImode, GET_MODE (dest));
52301 rtx addr = XEXP (dest, 0);
52302 ASSERT_EQ (PLUS, GET_CODE (addr));
52303 ASSERT_EQ (DImode, GET_MODE (addr));
52305 rtx lhs = XEXP (addr, 0);
52306 /* Verify that the "frame" REG was consolidated. */
52307 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
52310 rtx rhs = XEXP (addr, 1);
52311 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
52312 ASSERT_EQ (-4, INTVAL (rhs));
52315 /* Verify the "[1 i+0 S4 A32]" was parsed. */
52316 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
52317 /* "i" should have been handled by synthesizing a global int
52318 variable named "i". */
52319 mem_expr = MEM_EXPR (dest);
52320 ASSERT_NE (mem_expr, NULL);
52321 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
52322 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
52323 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
52324 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
52325 /* "+0". */
52326 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
52327 ASSERT_EQ (0, MEM_OFFSET (dest));
52328 /* "S4". */
52329 ASSERT_EQ (4, MEM_SIZE (dest));
52330 /* "A32. */
52331 ASSERT_EQ (32, MEM_ALIGN (dest));
52334 rtx src = SET_SRC (pat);
52335 ASSERT_EQ (REG, GET_CODE (src));
52336 ASSERT_EQ (SImode, GET_MODE (src));
52337 ASSERT_EQ (5, REGNO (src));
52338 tree reg_expr = REG_EXPR (src);
52339 /* "i" here should point to the same var as for the MEM_EXPR. */
52340 ASSERT_EQ (reg_expr, mem_expr);
52345 /* Verify that the RTL loader copes with a call_insn dump.
52346 This test is target-specific since the dump contains a target-specific
52347 hard reg name. */
52349 static void
52350 ix86_test_loading_call_insn ()
52352 /* The test dump includes register "xmm0", which requires TARGET_SSE
52353 to exist. */
52354 if (!TARGET_SSE)
52355 return;
52357 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
52359 rtx_insn *insn = get_insns ();
52360 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
52362 /* "/j". */
52363 ASSERT_TRUE (RTX_FLAG (insn, jump));
52365 rtx pat = PATTERN (insn);
52366 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
52368 /* Verify REG_NOTES. */
52370 /* "(expr_list:REG_CALL_DECL". */
52371 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
52372 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
52373 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
52375 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
52376 rtx_expr_list *note1 = note0->next ();
52377 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
52379 ASSERT_EQ (NULL, note1->next ());
52382 /* Verify CALL_INSN_FUNCTION_USAGE. */
52384 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
52385 rtx_expr_list *usage
52386 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
52387 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
52388 ASSERT_EQ (DFmode, GET_MODE (usage));
52389 ASSERT_EQ (USE, GET_CODE (usage->element ()));
52390 ASSERT_EQ (NULL, usage->next ());
52394 /* Verify that the RTL loader copes with a dump from print_rtx_function.
52395 This test is target-specific since the dump contains target-specific
52396 hard reg names. */
52398 static void
52399 ix86_test_loading_full_dump ()
52401 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
52403 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52405 rtx_insn *insn_1 = get_insn_by_uid (1);
52406 ASSERT_EQ (NOTE, GET_CODE (insn_1));
52408 rtx_insn *insn_7 = get_insn_by_uid (7);
52409 ASSERT_EQ (INSN, GET_CODE (insn_7));
52410 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
52412 rtx_insn *insn_15 = get_insn_by_uid (15);
52413 ASSERT_EQ (INSN, GET_CODE (insn_15));
52414 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
52416 /* Verify crtl->return_rtx. */
52417 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
52418 ASSERT_EQ (0, REGNO (crtl->return_rtx));
52419 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
52422 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
52423 In particular, verify that it correctly loads the 2nd operand.
52424 This test is target-specific since these are machine-specific
52425 operands (and enums). */
52427 static void
52428 ix86_test_loading_unspec ()
52430 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
52432 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52434 ASSERT_TRUE (cfun);
52436 /* Test of an UNSPEC. */
52437 rtx_insn *insn = get_insns ();
52438 ASSERT_EQ (INSN, GET_CODE (insn));
52439 rtx set = single_set (insn);
52440 ASSERT_NE (NULL, set);
52441 rtx dst = SET_DEST (set);
52442 ASSERT_EQ (MEM, GET_CODE (dst));
52443 rtx src = SET_SRC (set);
52444 ASSERT_EQ (UNSPEC, GET_CODE (src));
52445 ASSERT_EQ (BLKmode, GET_MODE (src));
52446 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
52448 rtx v0 = XVECEXP (src, 0, 0);
52450 /* Verify that the two uses of the first SCRATCH have pointer
52451 equality. */
52452 rtx scratch_a = XEXP (dst, 0);
52453 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
52455 rtx scratch_b = XEXP (v0, 0);
52456 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
52458 ASSERT_EQ (scratch_a, scratch_b);
52460 /* Verify that the two mems are thus treated as equal. */
52461 ASSERT_TRUE (rtx_equal_p (dst, v0));
52463 /* Verify that the insn is recognized. */
52464 ASSERT_NE (-1, recog_memoized (insn));
52466 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
52467 insn = NEXT_INSN (insn);
52468 ASSERT_EQ (INSN, GET_CODE (insn));
52470 set = single_set (insn);
52471 ASSERT_NE (NULL, set);
52473 src = SET_SRC (set);
52474 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
52475 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
52478 /* Run all target-specific selftests. */
52480 static void
52481 ix86_run_selftests (void)
52483 ix86_test_dumping_hard_regs ();
52484 ix86_test_dumping_memory_blockage ();
52486 /* Various tests of loading RTL dumps, here because they contain
52487 ix86-isms (e.g. names of hard regs). */
52488 ix86_test_loading_dump_fragment_1 ();
52489 ix86_test_loading_call_insn ();
52490 ix86_test_loading_full_dump ();
52491 ix86_test_loading_unspec ();
52494 } // namespace selftest
52496 #endif /* CHECKING_P */
52498 /* Initialize the GCC target structure. */
52499 #undef TARGET_RETURN_IN_MEMORY
52500 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
52502 #undef TARGET_LEGITIMIZE_ADDRESS
52503 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
52505 #undef TARGET_ATTRIBUTE_TABLE
52506 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
52507 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
52508 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
52509 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52510 # undef TARGET_MERGE_DECL_ATTRIBUTES
52511 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
52512 #endif
52514 #undef TARGET_COMP_TYPE_ATTRIBUTES
52515 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
52517 #undef TARGET_INIT_BUILTINS
52518 #define TARGET_INIT_BUILTINS ix86_init_builtins
52519 #undef TARGET_BUILTIN_DECL
52520 #define TARGET_BUILTIN_DECL ix86_builtin_decl
52521 #undef TARGET_EXPAND_BUILTIN
52522 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
52524 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
52525 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
52526 ix86_builtin_vectorized_function
52528 #undef TARGET_VECTORIZE_BUILTIN_GATHER
52529 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
52531 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
52532 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
52534 #undef TARGET_BUILTIN_RECIPROCAL
52535 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
52537 #undef TARGET_ASM_FUNCTION_EPILOGUE
52538 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
52540 #undef TARGET_ENCODE_SECTION_INFO
52541 #ifndef SUBTARGET_ENCODE_SECTION_INFO
52542 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
52543 #else
52544 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
52545 #endif
52547 #undef TARGET_ASM_OPEN_PAREN
52548 #define TARGET_ASM_OPEN_PAREN ""
52549 #undef TARGET_ASM_CLOSE_PAREN
52550 #define TARGET_ASM_CLOSE_PAREN ""
52552 #undef TARGET_ASM_BYTE_OP
52553 #define TARGET_ASM_BYTE_OP ASM_BYTE
52555 #undef TARGET_ASM_ALIGNED_HI_OP
52556 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
52557 #undef TARGET_ASM_ALIGNED_SI_OP
52558 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
52559 #ifdef ASM_QUAD
52560 #undef TARGET_ASM_ALIGNED_DI_OP
52561 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
52562 #endif
52564 #undef TARGET_PROFILE_BEFORE_PROLOGUE
52565 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
52567 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
52568 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
52570 #undef TARGET_ASM_UNALIGNED_HI_OP
52571 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
52572 #undef TARGET_ASM_UNALIGNED_SI_OP
52573 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
52574 #undef TARGET_ASM_UNALIGNED_DI_OP
52575 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
52577 #undef TARGET_PRINT_OPERAND
52578 #define TARGET_PRINT_OPERAND ix86_print_operand
52579 #undef TARGET_PRINT_OPERAND_ADDRESS
52580 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
52581 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
52582 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
52583 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
52584 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
52586 #undef TARGET_SCHED_INIT_GLOBAL
52587 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
52588 #undef TARGET_SCHED_ADJUST_COST
52589 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
52590 #undef TARGET_SCHED_ISSUE_RATE
52591 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
52592 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
52593 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
52594 ia32_multipass_dfa_lookahead
52595 #undef TARGET_SCHED_MACRO_FUSION_P
52596 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
52597 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
52598 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
52600 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
52601 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
52603 #undef TARGET_MEMMODEL_CHECK
52604 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
52606 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
52607 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
52609 #ifdef HAVE_AS_TLS
52610 #undef TARGET_HAVE_TLS
52611 #define TARGET_HAVE_TLS true
52612 #endif
52613 #undef TARGET_CANNOT_FORCE_CONST_MEM
52614 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
52615 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
52616 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
52618 #undef TARGET_DELEGITIMIZE_ADDRESS
52619 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
52621 #undef TARGET_MS_BITFIELD_LAYOUT_P
52622 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
52624 #if TARGET_MACHO
52625 #undef TARGET_BINDS_LOCAL_P
52626 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
52627 #else
52628 #undef TARGET_BINDS_LOCAL_P
52629 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
52630 #endif
52631 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52632 #undef TARGET_BINDS_LOCAL_P
52633 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
52634 #endif
52636 #undef TARGET_ASM_OUTPUT_MI_THUNK
52637 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
52638 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
52639 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
52641 #undef TARGET_ASM_FILE_START
52642 #define TARGET_ASM_FILE_START x86_file_start
52644 #undef TARGET_OPTION_OVERRIDE
52645 #define TARGET_OPTION_OVERRIDE ix86_option_override
52647 #undef TARGET_REGISTER_MOVE_COST
52648 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
52649 #undef TARGET_MEMORY_MOVE_COST
52650 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
52651 #undef TARGET_RTX_COSTS
52652 #define TARGET_RTX_COSTS ix86_rtx_costs
52653 #undef TARGET_ADDRESS_COST
52654 #define TARGET_ADDRESS_COST ix86_address_cost
52656 #undef TARGET_FLAGS_REGNUM
52657 #define TARGET_FLAGS_REGNUM FLAGS_REG
52658 #undef TARGET_FIXED_CONDITION_CODE_REGS
52659 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
52660 #undef TARGET_CC_MODES_COMPATIBLE
52661 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
52663 #undef TARGET_MACHINE_DEPENDENT_REORG
52664 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
52666 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
52667 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
52669 #undef TARGET_BUILD_BUILTIN_VA_LIST
52670 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
52672 #undef TARGET_FOLD_BUILTIN
52673 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
52675 #undef TARGET_GIMPLE_FOLD_BUILTIN
52676 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
52678 #undef TARGET_COMPARE_VERSION_PRIORITY
52679 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
52681 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
52682 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
52683 ix86_generate_version_dispatcher_body
52685 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
52686 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
52687 ix86_get_function_versions_dispatcher
52689 #undef TARGET_ENUM_VA_LIST_P
52690 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
52692 #undef TARGET_FN_ABI_VA_LIST
52693 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
52695 #undef TARGET_CANONICAL_VA_LIST_TYPE
52696 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
52698 #undef TARGET_EXPAND_BUILTIN_VA_START
52699 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
52701 #undef TARGET_MD_ASM_ADJUST
52702 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
52704 #undef TARGET_C_EXCESS_PRECISION
52705 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
52706 #undef TARGET_PROMOTE_PROTOTYPES
52707 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
52708 #undef TARGET_SETUP_INCOMING_VARARGS
52709 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
52710 #undef TARGET_MUST_PASS_IN_STACK
52711 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
52712 #undef TARGET_FUNCTION_ARG_ADVANCE
52713 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
52714 #undef TARGET_FUNCTION_ARG
52715 #define TARGET_FUNCTION_ARG ix86_function_arg
52716 #undef TARGET_INIT_PIC_REG
52717 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
52718 #undef TARGET_USE_PSEUDO_PIC_REG
52719 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
52720 #undef TARGET_FUNCTION_ARG_BOUNDARY
52721 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
52722 #undef TARGET_PASS_BY_REFERENCE
52723 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
52724 #undef TARGET_INTERNAL_ARG_POINTER
52725 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
52726 #undef TARGET_UPDATE_STACK_BOUNDARY
52727 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
52728 #undef TARGET_GET_DRAP_RTX
52729 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
52730 #undef TARGET_STRICT_ARGUMENT_NAMING
52731 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
52732 #undef TARGET_STATIC_CHAIN
52733 #define TARGET_STATIC_CHAIN ix86_static_chain
52734 #undef TARGET_TRAMPOLINE_INIT
52735 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
52736 #undef TARGET_RETURN_POPS_ARGS
52737 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
52739 #undef TARGET_LEGITIMATE_COMBINED_INSN
52740 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
52742 #undef TARGET_ASAN_SHADOW_OFFSET
52743 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
52745 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
52746 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
52748 #undef TARGET_SCALAR_MODE_SUPPORTED_P
52749 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
52751 #undef TARGET_VECTOR_MODE_SUPPORTED_P
52752 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
52754 #undef TARGET_C_MODE_FOR_SUFFIX
52755 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
52757 #ifdef HAVE_AS_TLS
52758 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
52759 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
52760 #endif
52762 #ifdef SUBTARGET_INSERT_ATTRIBUTES
52763 #undef TARGET_INSERT_ATTRIBUTES
52764 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
52765 #endif
52767 #undef TARGET_MANGLE_TYPE
52768 #define TARGET_MANGLE_TYPE ix86_mangle_type
52770 #ifdef TARGET_THREAD_SSP_OFFSET
52771 #undef TARGET_STACK_PROTECT_GUARD
52772 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
52773 #endif
52775 #if !TARGET_MACHO
52776 #undef TARGET_STACK_PROTECT_FAIL
52777 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
52778 #endif
52780 #undef TARGET_FUNCTION_VALUE
52781 #define TARGET_FUNCTION_VALUE ix86_function_value
52783 #undef TARGET_FUNCTION_VALUE_REGNO_P
52784 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
52786 #undef TARGET_PROMOTE_FUNCTION_MODE
52787 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
52789 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
52790 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
52792 #undef TARGET_MEMBER_TYPE_FORCES_BLK
52793 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
52795 #undef TARGET_INSTANTIATE_DECLS
52796 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
52798 #undef TARGET_SECONDARY_RELOAD
52799 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
52801 #undef TARGET_CLASS_MAX_NREGS
52802 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
52804 #undef TARGET_PREFERRED_RELOAD_CLASS
52805 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
52806 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
52807 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
52808 #undef TARGET_CLASS_LIKELY_SPILLED_P
52809 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
52811 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
52812 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
52813 ix86_builtin_vectorization_cost
52814 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
52815 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
52816 ix86_vectorize_vec_perm_const_ok
52817 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
52818 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
52819 ix86_preferred_simd_mode
52820 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
52821 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
52822 ix86_autovectorize_vector_sizes
52823 #undef TARGET_VECTORIZE_GET_MASK_MODE
52824 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
52825 #undef TARGET_VECTORIZE_INIT_COST
52826 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
52827 #undef TARGET_VECTORIZE_ADD_STMT_COST
52828 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
52829 #undef TARGET_VECTORIZE_FINISH_COST
52830 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
52831 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
52832 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
52834 #undef TARGET_SET_CURRENT_FUNCTION
52835 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
52837 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
52838 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
52840 #undef TARGET_OPTION_SAVE
52841 #define TARGET_OPTION_SAVE ix86_function_specific_save
52843 #undef TARGET_OPTION_RESTORE
52844 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
52846 #undef TARGET_OPTION_POST_STREAM_IN
52847 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
52849 #undef TARGET_OPTION_PRINT
52850 #define TARGET_OPTION_PRINT ix86_function_specific_print
52852 #undef TARGET_OPTION_FUNCTION_VERSIONS
52853 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
52855 #undef TARGET_CAN_INLINE_P
52856 #define TARGET_CAN_INLINE_P ix86_can_inline_p
52858 #undef TARGET_LEGITIMATE_ADDRESS_P
52859 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
52861 #undef TARGET_REGISTER_PRIORITY
52862 #define TARGET_REGISTER_PRIORITY ix86_register_priority
52864 #undef TARGET_REGISTER_USAGE_LEVELING_P
52865 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
52867 #undef TARGET_LEGITIMATE_CONSTANT_P
52868 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
52870 #undef TARGET_COMPUTE_FRAME_LAYOUT
52871 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
52873 #undef TARGET_FRAME_POINTER_REQUIRED
52874 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
52876 #undef TARGET_CAN_ELIMINATE
52877 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
52879 #undef TARGET_EXTRA_LIVE_ON_ENTRY
52880 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
52882 #undef TARGET_ASM_CODE_END
52883 #define TARGET_ASM_CODE_END ix86_code_end
52885 #undef TARGET_CONDITIONAL_REGISTER_USAGE
52886 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
52888 #undef TARGET_LOOP_UNROLL_ADJUST
52889 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
52891 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
52892 #undef TARGET_SPILL_CLASS
52893 #define TARGET_SPILL_CLASS ix86_spill_class
52895 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
52896 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
52897 ix86_simd_clone_compute_vecsize_and_simdlen
52899 #undef TARGET_SIMD_CLONE_ADJUST
52900 #define TARGET_SIMD_CLONE_ADJUST \
52901 ix86_simd_clone_adjust
52903 #undef TARGET_SIMD_CLONE_USABLE
52904 #define TARGET_SIMD_CLONE_USABLE \
52905 ix86_simd_clone_usable
52907 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
52908 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
52909 ix86_float_exceptions_rounding_supported_p
52911 #undef TARGET_MODE_EMIT
52912 #define TARGET_MODE_EMIT ix86_emit_mode_set
52914 #undef TARGET_MODE_NEEDED
52915 #define TARGET_MODE_NEEDED ix86_mode_needed
52917 #undef TARGET_MODE_AFTER
52918 #define TARGET_MODE_AFTER ix86_mode_after
52920 #undef TARGET_MODE_ENTRY
52921 #define TARGET_MODE_ENTRY ix86_mode_entry
52923 #undef TARGET_MODE_EXIT
52924 #define TARGET_MODE_EXIT ix86_mode_exit
52926 #undef TARGET_MODE_PRIORITY
52927 #define TARGET_MODE_PRIORITY ix86_mode_priority
52929 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
52930 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
52932 #undef TARGET_LOAD_BOUNDS_FOR_ARG
52933 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
52935 #undef TARGET_STORE_BOUNDS_FOR_ARG
52936 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
52938 #undef TARGET_LOAD_RETURNED_BOUNDS
52939 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
52941 #undef TARGET_STORE_RETURNED_BOUNDS
52942 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
52944 #undef TARGET_CHKP_BOUND_MODE
52945 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
52947 #undef TARGET_BUILTIN_CHKP_FUNCTION
52948 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
52950 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
52951 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
52953 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
52954 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
52956 #undef TARGET_CHKP_INITIALIZE_BOUNDS
52957 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
52959 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
52960 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
52962 #undef TARGET_OFFLOAD_OPTIONS
52963 #define TARGET_OFFLOAD_OPTIONS \
52964 ix86_offload_options
52966 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
52967 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
52969 #undef TARGET_OPTAB_SUPPORTED_P
52970 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
52972 #undef TARGET_HARD_REGNO_SCRATCH_OK
52973 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
52975 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
52976 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
52978 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
52979 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
52981 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
52982 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
52984 #undef TARGET_INIT_LIBFUNCS
52985 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
52987 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
52988 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
52990 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
52991 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
52993 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
52994 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
52996 #if CHECKING_P
52997 #undef TARGET_RUN_TARGET_SELFTESTS
52998 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
52999 #endif /* #if CHECKING_P */
53001 struct gcc_target targetm = TARGET_INITIALIZER;
53003 #include "gt-i386.h"