gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
88 /* This file should be included last. */
89 #include "target-def.h"
91 static rtx legitimize_dllimport_symbol (rtx, bool);
92 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
93 static rtx legitimize_pe_coff_symbol (rtx, bool);
94 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
96 #ifndef CHECK_STACK_LIMIT
97 #define CHECK_STACK_LIMIT (-1)
98 #endif
100 /* Return the index of the given mode in the mult and division cost tables. */
101 #define MODE_INDEX(mode) \
102 ((mode) == QImode ? 0 \
103 : (mode) == HImode ? 1 \
104 : (mode) == SImode ? 2 \
105 : (mode) == DImode ? 3 \
106 : 4)
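/* Illustrative usage sketch (added; not part of the original source): the
   per-mode cost arrays below hold one entry each for QImode, HImode, SImode,
   DImode and "other", so a lookup is a plain array index.  Assuming the
   usual processor_costs field names, the SImode multiply start-up cost would
   be read roughly as

       ix86_cost->mult_init[MODE_INDEX (SImode)]    -- index 2

   and any mode wider than DImode falls through to the "other" slot at
   index 4.  */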
108 /* Processor costs (relative to an add) */
109 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
110 #define COSTS_N_BYTES(N) ((N) * 2)
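/* Worked example (added for clarity): given that COSTS_N_INSNS (N) is
   assumed to be (N) * 4 and an addition is 2 bytes, COSTS_N_BYTES (2) == 4
   == COSTS_N_INSNS (1), so the byte-based size costs below stay on the same
   scale as the insn-count costs used by the speed-tuned tables.  */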
112 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
114 static stringop_algs ix86_size_memcpy[2] = {
115 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
116 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
117 static stringop_algs ix86_size_memset[2] = {
118 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
119 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
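/* Reader's note (an added sketch, not from the original file): each
   stringop_algs initializer names a fallback algorithm for blocks of
   unknown size, followed by {max_size, algorithm, noalign} entries that
   are scanned in order; a max_size of -1 terminates the list and covers
   all larger sizes.  The two array elements are assumed to be the 32-bit
   and 64-bit variants of the tuning, which is why the 32-bit-only
   processors below use DUMMY_STRINGOP_ALGS for the second element.  */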
121 const
122 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
123 COSTS_N_BYTES (2), /* cost of an add instruction */
124 COSTS_N_BYTES (3), /* cost of a lea instruction */
125 COSTS_N_BYTES (2), /* variable shift costs */
126 COSTS_N_BYTES (3), /* constant shift costs */
127 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
128 COSTS_N_BYTES (3), /* HI */
129 COSTS_N_BYTES (3), /* SI */
130 COSTS_N_BYTES (3), /* DI */
131 COSTS_N_BYTES (5)}, /* other */
132 0, /* cost of multiply per each bit set */
133 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
134 COSTS_N_BYTES (3), /* HI */
135 COSTS_N_BYTES (3), /* SI */
136 COSTS_N_BYTES (3), /* DI */
137 COSTS_N_BYTES (5)}, /* other */
138 COSTS_N_BYTES (3), /* cost of movsx */
139 COSTS_N_BYTES (3), /* cost of movzx */
140 0, /* "large" insn */
141 2, /* MOVE_RATIO */
142 2, /* cost for loading QImode using movzbl */
143 {2, 2, 2}, /* cost of loading integer registers
144 in QImode, HImode and SImode.
145 Relative to reg-reg move (2). */
146 {2, 2, 2}, /* cost of storing integer registers */
147 2, /* cost of reg,reg fld/fst */
148 {2, 2, 2}, /* cost of loading fp registers
149 in SFmode, DFmode and XFmode */
150 {2, 2, 2}, /* cost of storing fp registers
151 in SFmode, DFmode and XFmode */
152 3, /* cost of moving MMX register */
153 {3, 3}, /* cost of loading MMX registers
154 in SImode and DImode */
155 {3, 3}, /* cost of storing MMX registers
156 in SImode and DImode */
157 3, /* cost of moving SSE register */
158 {3, 3, 3}, /* cost of loading SSE registers
159 in SImode, DImode and TImode */
160 {3, 3, 3}, /* cost of storing SSE registers
161 in SImode, DImode and TImode */
162 3, /* MMX or SSE register to integer */
163 0, /* size of l1 cache */
164 0, /* size of l2 cache */
165 0, /* size of prefetch block */
166 0, /* number of parallel prefetches */
167 2, /* Branch cost */
168 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
169 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
170 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
171 COSTS_N_BYTES (2), /* cost of FABS instruction. */
172 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
173 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
174 ix86_size_memcpy,
175 ix86_size_memset,
176 1, /* scalar_stmt_cost. */
177 1, /* scalar load_cost. */
178 1, /* scalar_store_cost. */
179 1, /* vec_stmt_cost. */
180 1, /* vec_to_scalar_cost. */
181 1, /* scalar_to_vec_cost. */
182 1, /* vec_align_load_cost. */
183 1, /* vec_unalign_load_cost. */
184 1, /* vec_store_cost. */
185 1, /* cond_taken_branch_cost. */
186 1, /* cond_not_taken_branch_cost. */
187 };
189 /* Processor costs (relative to an add) */
190 static stringop_algs i386_memcpy[2] = {
191 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
192 DUMMY_STRINGOP_ALGS};
193 static stringop_algs i386_memset[2] = {
194 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
195 DUMMY_STRINGOP_ALGS};
197 static const
198 struct processor_costs i386_cost = { /* 386 specific costs */
199 COSTS_N_INSNS (1), /* cost of an add instruction */
200 COSTS_N_INSNS (1), /* cost of a lea instruction */
201 COSTS_N_INSNS (3), /* variable shift costs */
202 COSTS_N_INSNS (2), /* constant shift costs */
203 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
204 COSTS_N_INSNS (6), /* HI */
205 COSTS_N_INSNS (6), /* SI */
206 COSTS_N_INSNS (6), /* DI */
207 COSTS_N_INSNS (6)}, /* other */
208 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
209 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
210 COSTS_N_INSNS (23), /* HI */
211 COSTS_N_INSNS (23), /* SI */
212 COSTS_N_INSNS (23), /* DI */
213 COSTS_N_INSNS (23)}, /* other */
214 COSTS_N_INSNS (3), /* cost of movsx */
215 COSTS_N_INSNS (2), /* cost of movzx */
216 15, /* "large" insn */
217 3, /* MOVE_RATIO */
218 4, /* cost for loading QImode using movzbl */
219 {2, 4, 2}, /* cost of loading integer registers
220 in QImode, HImode and SImode.
221 Relative to reg-reg move (2). */
222 {2, 4, 2}, /* cost of storing integer registers */
223 2, /* cost of reg,reg fld/fst */
224 {8, 8, 8}, /* cost of loading fp registers
225 in SFmode, DFmode and XFmode */
226 {8, 8, 8}, /* cost of storing fp registers
227 in SFmode, DFmode and XFmode */
228 2, /* cost of moving MMX register */
229 {4, 8}, /* cost of loading MMX registers
230 in SImode and DImode */
231 {4, 8}, /* cost of storing MMX registers
232 in SImode and DImode */
233 2, /* cost of moving SSE register */
234 {4, 8, 16}, /* cost of loading SSE registers
235 in SImode, DImode and TImode */
236 {4, 8, 16}, /* cost of storing SSE registers
237 in SImode, DImode and TImode */
238 3, /* MMX or SSE register to integer */
239 0, /* size of l1 cache */
240 0, /* size of l2 cache */
241 0, /* size of prefetch block */
242 0, /* number of parallel prefetches */
243 1, /* Branch cost */
244 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
245 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
246 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
247 COSTS_N_INSNS (22), /* cost of FABS instruction. */
248 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
249 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
250 i386_memcpy,
251 i386_memset,
252 1, /* scalar_stmt_cost. */
253 1, /* scalar load_cost. */
254 1, /* scalar_store_cost. */
255 1, /* vec_stmt_cost. */
256 1, /* vec_to_scalar_cost. */
257 1, /* scalar_to_vec_cost. */
258 1, /* vec_align_load_cost. */
259 2, /* vec_unalign_load_cost. */
260 1, /* vec_store_cost. */
261 3, /* cond_taken_branch_cost. */
262 1, /* cond_not_taken_branch_cost. */
263 };
265 static stringop_algs i486_memcpy[2] = {
266 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
267 DUMMY_STRINGOP_ALGS};
268 static stringop_algs i486_memset[2] = {
269 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
270 DUMMY_STRINGOP_ALGS};
272 static const
273 struct processor_costs i486_cost = { /* 486 specific costs */
274 COSTS_N_INSNS (1), /* cost of an add instruction */
275 COSTS_N_INSNS (1), /* cost of a lea instruction */
276 COSTS_N_INSNS (3), /* variable shift costs */
277 COSTS_N_INSNS (2), /* constant shift costs */
278 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
279 COSTS_N_INSNS (12), /* HI */
280 COSTS_N_INSNS (12), /* SI */
281 COSTS_N_INSNS (12), /* DI */
282 COSTS_N_INSNS (12)}, /* other */
283 1, /* cost of multiply per each bit set */
284 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
285 COSTS_N_INSNS (40), /* HI */
286 COSTS_N_INSNS (40), /* SI */
287 COSTS_N_INSNS (40), /* DI */
288 COSTS_N_INSNS (40)}, /* other */
289 COSTS_N_INSNS (3), /* cost of movsx */
290 COSTS_N_INSNS (2), /* cost of movzx */
291 15, /* "large" insn */
292 3, /* MOVE_RATIO */
293 4, /* cost for loading QImode using movzbl */
294 {2, 4, 2}, /* cost of loading integer registers
295 in QImode, HImode and SImode.
296 Relative to reg-reg move (2). */
297 {2, 4, 2}, /* cost of storing integer registers */
298 2, /* cost of reg,reg fld/fst */
299 {8, 8, 8}, /* cost of loading fp registers
300 in SFmode, DFmode and XFmode */
301 {8, 8, 8}, /* cost of storing fp registers
302 in SFmode, DFmode and XFmode */
303 2, /* cost of moving MMX register */
304 {4, 8}, /* cost of loading MMX registers
305 in SImode and DImode */
306 {4, 8}, /* cost of storing MMX registers
307 in SImode and DImode */
308 2, /* cost of moving SSE register */
309 {4, 8, 16}, /* cost of loading SSE registers
310 in SImode, DImode and TImode */
311 {4, 8, 16}, /* cost of storing SSE registers
312 in SImode, DImode and TImode */
313 3, /* MMX or SSE register to integer */
314 4, /* size of l1 cache. 486 has 8kB cache
315 shared for code and data, so 4kB is
316 not really precise. */
317 4, /* size of l2 cache */
318 0, /* size of prefetch block */
319 0, /* number of parallel prefetches */
320 1, /* Branch cost */
321 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
322 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
323 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
324 COSTS_N_INSNS (3), /* cost of FABS instruction. */
325 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
326 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
327 i486_memcpy,
328 i486_memset,
329 1, /* scalar_stmt_cost. */
330 1, /* scalar load_cost. */
331 1, /* scalar_store_cost. */
332 1, /* vec_stmt_cost. */
333 1, /* vec_to_scalar_cost. */
334 1, /* scalar_to_vec_cost. */
335 1, /* vec_align_load_cost. */
336 2, /* vec_unalign_load_cost. */
337 1, /* vec_store_cost. */
338 3, /* cond_taken_branch_cost. */
339 1, /* cond_not_taken_branch_cost. */
340 };
342 static stringop_algs pentium_memcpy[2] = {
343 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
344 DUMMY_STRINGOP_ALGS};
345 static stringop_algs pentium_memset[2] = {
346 {libcall, {{-1, rep_prefix_4_byte, false}}},
347 DUMMY_STRINGOP_ALGS};
349 static const
350 struct processor_costs pentium_cost = {
351 COSTS_N_INSNS (1), /* cost of an add instruction */
352 COSTS_N_INSNS (1), /* cost of a lea instruction */
353 COSTS_N_INSNS (4), /* variable shift costs */
354 COSTS_N_INSNS (1), /* constant shift costs */
355 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
356 COSTS_N_INSNS (11), /* HI */
357 COSTS_N_INSNS (11), /* SI */
358 COSTS_N_INSNS (11), /* DI */
359 COSTS_N_INSNS (11)}, /* other */
360 0, /* cost of multiply per each bit set */
361 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
362 COSTS_N_INSNS (25), /* HI */
363 COSTS_N_INSNS (25), /* SI */
364 COSTS_N_INSNS (25), /* DI */
365 COSTS_N_INSNS (25)}, /* other */
366 COSTS_N_INSNS (3), /* cost of movsx */
367 COSTS_N_INSNS (2), /* cost of movzx */
368 8, /* "large" insn */
369 6, /* MOVE_RATIO */
370 6, /* cost for loading QImode using movzbl */
371 {2, 4, 2}, /* cost of loading integer registers
372 in QImode, HImode and SImode.
373 Relative to reg-reg move (2). */
374 {2, 4, 2}, /* cost of storing integer registers */
375 2, /* cost of reg,reg fld/fst */
376 {2, 2, 6}, /* cost of loading fp registers
377 in SFmode, DFmode and XFmode */
378 {4, 4, 6}, /* cost of storing fp registers
379 in SFmode, DFmode and XFmode */
380 8, /* cost of moving MMX register */
381 {8, 8}, /* cost of loading MMX registers
382 in SImode and DImode */
383 {8, 8}, /* cost of storing MMX registers
384 in SImode and DImode */
385 2, /* cost of moving SSE register */
386 {4, 8, 16}, /* cost of loading SSE registers
387 in SImode, DImode and TImode */
388 {4, 8, 16}, /* cost of storing SSE registers
389 in SImode, DImode and TImode */
390 3, /* MMX or SSE register to integer */
391 8, /* size of l1 cache. */
392 8, /* size of l2 cache */
393 0, /* size of prefetch block */
394 0, /* number of parallel prefetches */
395 2, /* Branch cost */
396 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
397 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
398 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
399 COSTS_N_INSNS (1), /* cost of FABS instruction. */
400 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
401 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
402 pentium_memcpy,
403 pentium_memset,
404 1, /* scalar_stmt_cost. */
405 1, /* scalar load_cost. */
406 1, /* scalar_store_cost. */
407 1, /* vec_stmt_cost. */
408 1, /* vec_to_scalar_cost. */
409 1, /* scalar_to_vec_cost. */
410 1, /* vec_align_load_cost. */
411 2, /* vec_unalign_load_cost. */
412 1, /* vec_store_cost. */
413 3, /* cond_taken_branch_cost. */
414 1, /* cond_not_taken_branch_cost. */
415 };
417 static const
418 struct processor_costs lakemont_cost = {
419 COSTS_N_INSNS (1), /* cost of an add instruction */
420 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
421 COSTS_N_INSNS (1), /* variable shift costs */
422 COSTS_N_INSNS (1), /* constant shift costs */
423 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
424 COSTS_N_INSNS (11), /* HI */
425 COSTS_N_INSNS (11), /* SI */
426 COSTS_N_INSNS (11), /* DI */
427 COSTS_N_INSNS (11)}, /* other */
428 0, /* cost of multiply per each bit set */
429 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
430 COSTS_N_INSNS (25), /* HI */
431 COSTS_N_INSNS (25), /* SI */
432 COSTS_N_INSNS (25), /* DI */
433 COSTS_N_INSNS (25)}, /* other */
434 COSTS_N_INSNS (3), /* cost of movsx */
435 COSTS_N_INSNS (2), /* cost of movzx */
436 8, /* "large" insn */
437 17, /* MOVE_RATIO */
438 6, /* cost for loading QImode using movzbl */
439 {2, 4, 2}, /* cost of loading integer registers
440 in QImode, HImode and SImode.
441 Relative to reg-reg move (2). */
442 {2, 4, 2}, /* cost of storing integer registers */
443 2, /* cost of reg,reg fld/fst */
444 {2, 2, 6}, /* cost of loading fp registers
445 in SFmode, DFmode and XFmode */
446 {4, 4, 6}, /* cost of storing fp registers
447 in SFmode, DFmode and XFmode */
448 8, /* cost of moving MMX register */
449 {8, 8}, /* cost of loading MMX registers
450 in SImode and DImode */
451 {8, 8}, /* cost of storing MMX registers
452 in SImode and DImode */
453 2, /* cost of moving SSE register */
454 {4, 8, 16}, /* cost of loading SSE registers
455 in SImode, DImode and TImode */
456 {4, 8, 16}, /* cost of storing SSE registers
457 in SImode, DImode and TImode */
458 3, /* MMX or SSE register to integer */
459 8, /* size of l1 cache. */
460 8, /* size of l2 cache */
461 0, /* size of prefetch block */
462 0, /* number of parallel prefetches */
463 2, /* Branch cost */
464 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
465 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
466 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
467 COSTS_N_INSNS (1), /* cost of FABS instruction. */
468 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
469 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
470 pentium_memcpy,
471 pentium_memset,
472 1, /* scalar_stmt_cost. */
473 1, /* scalar load_cost. */
474 1, /* scalar_store_cost. */
475 1, /* vec_stmt_cost. */
476 1, /* vec_to_scalar_cost. */
477 1, /* scalar_to_vec_cost. */
478 1, /* vec_align_load_cost. */
479 2, /* vec_unalign_load_cost. */
480 1, /* vec_store_cost. */
481 3, /* cond_taken_branch_cost. */
482 1, /* cond_not_taken_branch_cost. */
483 };
485 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
486 (we ensure the alignment). For small blocks an inline loop is still a
487 noticeable win; for bigger blocks either rep movsl or rep movsb is the
488 way to go. Rep movsb apparently has a more expensive startup time in the
489 CPU, but after 4K the difference is down in the noise. */
490 static stringop_algs pentiumpro_memcpy[2] = {
491 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
492 {8192, rep_prefix_4_byte, false},
493 {-1, rep_prefix_1_byte, false}}},
494 DUMMY_STRINGOP_ALGS};
495 static stringop_algs pentiumpro_memset[2] = {
496 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
497 {8192, rep_prefix_4_byte, false},
498 {-1, libcall, false}}},
499 DUMMY_STRINGOP_ALGS};
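/* Worked example (added; assumes the entry semantics sketched near the top
   of the file): with pentiumpro_memcpy, a known 64-byte copy selects loop
   (64 <= 128), a 512-byte copy selects unrolled_loop (<= 1024), a 4K copy
   selects rep_prefix_4_byte (<= 8192), and anything larger falls through
   to the rep_prefix_1_byte terminator; copies of unknown size use the
   leading rep_prefix_4_byte algorithm.  */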
500 static const
501 struct processor_costs pentiumpro_cost = {
502 COSTS_N_INSNS (1), /* cost of an add instruction */
503 COSTS_N_INSNS (1), /* cost of a lea instruction */
504 COSTS_N_INSNS (1), /* variable shift costs */
505 COSTS_N_INSNS (1), /* constant shift costs */
506 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
507 COSTS_N_INSNS (4), /* HI */
508 COSTS_N_INSNS (4), /* SI */
509 COSTS_N_INSNS (4), /* DI */
510 COSTS_N_INSNS (4)}, /* other */
511 0, /* cost of multiply per each bit set */
512 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
513 COSTS_N_INSNS (17), /* HI */
514 COSTS_N_INSNS (17), /* SI */
515 COSTS_N_INSNS (17), /* DI */
516 COSTS_N_INSNS (17)}, /* other */
517 COSTS_N_INSNS (1), /* cost of movsx */
518 COSTS_N_INSNS (1), /* cost of movzx */
519 8, /* "large" insn */
520 6, /* MOVE_RATIO */
521 2, /* cost for loading QImode using movzbl */
522 {4, 4, 4}, /* cost of loading integer registers
523 in QImode, HImode and SImode.
524 Relative to reg-reg move (2). */
525 {2, 2, 2}, /* cost of storing integer registers */
526 2, /* cost of reg,reg fld/fst */
527 {2, 2, 6}, /* cost of loading fp registers
528 in SFmode, DFmode and XFmode */
529 {4, 4, 6}, /* cost of storing fp registers
530 in SFmode, DFmode and XFmode */
531 2, /* cost of moving MMX register */
532 {2, 2}, /* cost of loading MMX registers
533 in SImode and DImode */
534 {2, 2}, /* cost of storing MMX registers
535 in SImode and DImode */
536 2, /* cost of moving SSE register */
537 {2, 2, 8}, /* cost of loading SSE registers
538 in SImode, DImode and TImode */
539 {2, 2, 8}, /* cost of storing SSE registers
540 in SImode, DImode and TImode */
541 3, /* MMX or SSE register to integer */
542 8, /* size of l1 cache. */
543 256, /* size of l2 cache */
544 32, /* size of prefetch block */
545 6, /* number of parallel prefetches */
546 2, /* Branch cost */
547 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
548 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
549 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
550 COSTS_N_INSNS (2), /* cost of FABS instruction. */
551 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
552 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
553 pentiumpro_memcpy,
554 pentiumpro_memset,
555 1, /* scalar_stmt_cost. */
556 1, /* scalar load_cost. */
557 1, /* scalar_store_cost. */
558 1, /* vec_stmt_cost. */
559 1, /* vec_to_scalar_cost. */
560 1, /* scalar_to_vec_cost. */
561 1, /* vec_align_load_cost. */
562 2, /* vec_unalign_load_cost. */
563 1, /* vec_store_cost. */
564 3, /* cond_taken_branch_cost. */
565 1, /* cond_not_taken_branch_cost. */
566 };
568 static stringop_algs geode_memcpy[2] = {
569 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
570 DUMMY_STRINGOP_ALGS};
571 static stringop_algs geode_memset[2] = {
572 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
573 DUMMY_STRINGOP_ALGS};
574 static const
575 struct processor_costs geode_cost = {
576 COSTS_N_INSNS (1), /* cost of an add instruction */
577 COSTS_N_INSNS (1), /* cost of a lea instruction */
578 COSTS_N_INSNS (2), /* variable shift costs */
579 COSTS_N_INSNS (1), /* constant shift costs */
580 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
581 COSTS_N_INSNS (4), /* HI */
582 COSTS_N_INSNS (7), /* SI */
583 COSTS_N_INSNS (7), /* DI */
584 COSTS_N_INSNS (7)}, /* other */
585 0, /* cost of multiply per each bit set */
586 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
587 COSTS_N_INSNS (23), /* HI */
588 COSTS_N_INSNS (39), /* SI */
589 COSTS_N_INSNS (39), /* DI */
590 COSTS_N_INSNS (39)}, /* other */
591 COSTS_N_INSNS (1), /* cost of movsx */
592 COSTS_N_INSNS (1), /* cost of movzx */
593 8, /* "large" insn */
594 4, /* MOVE_RATIO */
595 1, /* cost for loading QImode using movzbl */
596 {1, 1, 1}, /* cost of loading integer registers
597 in QImode, HImode and SImode.
598 Relative to reg-reg move (2). */
599 {1, 1, 1}, /* cost of storing integer registers */
600 1, /* cost of reg,reg fld/fst */
601 {1, 1, 1}, /* cost of loading fp registers
602 in SFmode, DFmode and XFmode */
603 {4, 6, 6}, /* cost of storing fp registers
604 in SFmode, DFmode and XFmode */
606 2, /* cost of moving MMX register */
607 {2, 2}, /* cost of loading MMX registers
608 in SImode and DImode */
609 {2, 2}, /* cost of storing MMX registers
610 in SImode and DImode */
611 2, /* cost of moving SSE register */
612 {2, 2, 8}, /* cost of loading SSE registers
613 in SImode, DImode and TImode */
614 {2, 2, 8}, /* cost of storing SSE registers
615 in SImode, DImode and TImode */
616 3, /* MMX or SSE register to integer */
617 64, /* size of l1 cache. */
618 128, /* size of l2 cache. */
619 32, /* size of prefetch block */
620 1, /* number of parallel prefetches */
621 1, /* Branch cost */
622 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
623 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
624 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
625 COSTS_N_INSNS (1), /* cost of FABS instruction. */
626 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
627 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
628 geode_memcpy,
629 geode_memset,
630 1, /* scalar_stmt_cost. */
631 1, /* scalar load_cost. */
632 1, /* scalar_store_cost. */
633 1, /* vec_stmt_cost. */
634 1, /* vec_to_scalar_cost. */
635 1, /* scalar_to_vec_cost. */
636 1, /* vec_align_load_cost. */
637 2, /* vec_unalign_load_cost. */
638 1, /* vec_store_cost. */
639 3, /* cond_taken_branch_cost. */
640 1, /* cond_not_taken_branch_cost. */
641 };
643 static stringop_algs k6_memcpy[2] = {
644 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
645 DUMMY_STRINGOP_ALGS};
646 static stringop_algs k6_memset[2] = {
647 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
648 DUMMY_STRINGOP_ALGS};
649 static const
650 struct processor_costs k6_cost = {
651 COSTS_N_INSNS (1), /* cost of an add instruction */
652 COSTS_N_INSNS (2), /* cost of a lea instruction */
653 COSTS_N_INSNS (1), /* variable shift costs */
654 COSTS_N_INSNS (1), /* constant shift costs */
655 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
656 COSTS_N_INSNS (3), /* HI */
657 COSTS_N_INSNS (3), /* SI */
658 COSTS_N_INSNS (3), /* DI */
659 COSTS_N_INSNS (3)}, /* other */
660 0, /* cost of multiply per each bit set */
661 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
662 COSTS_N_INSNS (18), /* HI */
663 COSTS_N_INSNS (18), /* SI */
664 COSTS_N_INSNS (18), /* DI */
665 COSTS_N_INSNS (18)}, /* other */
666 COSTS_N_INSNS (2), /* cost of movsx */
667 COSTS_N_INSNS (2), /* cost of movzx */
668 8, /* "large" insn */
669 4, /* MOVE_RATIO */
670 3, /* cost for loading QImode using movzbl */
671 {4, 5, 4}, /* cost of loading integer registers
672 in QImode, HImode and SImode.
673 Relative to reg-reg move (2). */
674 {2, 3, 2}, /* cost of storing integer registers */
675 4, /* cost of reg,reg fld/fst */
676 {6, 6, 6}, /* cost of loading fp registers
677 in SFmode, DFmode and XFmode */
678 {4, 4, 4}, /* cost of storing fp registers
679 in SFmode, DFmode and XFmode */
680 2, /* cost of moving MMX register */
681 {2, 2}, /* cost of loading MMX registers
682 in SImode and DImode */
683 {2, 2}, /* cost of storing MMX registers
684 in SImode and DImode */
685 2, /* cost of moving SSE register */
686 {2, 2, 8}, /* cost of loading SSE registers
687 in SImode, DImode and TImode */
688 {2, 2, 8}, /* cost of storing SSE registers
689 in SImode, DImode and TImode */
690 6, /* MMX or SSE register to integer */
691 32, /* size of l1 cache. */
692 32, /* size of l2 cache. Some models
693 have integrated l2 cache, but
694 optimizing for k6 is not important
695 enough to worry about that. */
696 32, /* size of prefetch block */
697 1, /* number of parallel prefetches */
698 1, /* Branch cost */
699 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
700 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
701 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
702 COSTS_N_INSNS (2), /* cost of FABS instruction. */
703 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
704 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
705 k6_memcpy,
706 k6_memset,
707 1, /* scalar_stmt_cost. */
708 1, /* scalar load_cost. */
709 1, /* scalar_store_cost. */
710 1, /* vec_stmt_cost. */
711 1, /* vec_to_scalar_cost. */
712 1, /* scalar_to_vec_cost. */
713 1, /* vec_align_load_cost. */
714 2, /* vec_unalign_load_cost. */
715 1, /* vec_store_cost. */
716 3, /* cond_taken_branch_cost. */
717 1, /* cond_not_taken_branch_cost. */
718 };
720 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
721 compared to K8. Alignment becomes important after 8 bytes for memcpy and
722 128 bytes for memset. */
723 static stringop_algs athlon_memcpy[2] = {
724 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
725 DUMMY_STRINGOP_ALGS};
726 static stringop_algs athlon_memset[2] = {
727 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
728 DUMMY_STRINGOP_ALGS};
729 static const
730 struct processor_costs athlon_cost = {
731 COSTS_N_INSNS (1), /* cost of an add instruction */
732 COSTS_N_INSNS (2), /* cost of a lea instruction */
733 COSTS_N_INSNS (1), /* variable shift costs */
734 COSTS_N_INSNS (1), /* constant shift costs */
735 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
736 COSTS_N_INSNS (5), /* HI */
737 COSTS_N_INSNS (5), /* SI */
738 COSTS_N_INSNS (5), /* DI */
739 COSTS_N_INSNS (5)}, /* other */
740 0, /* cost of multiply per each bit set */
741 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
742 COSTS_N_INSNS (26), /* HI */
743 COSTS_N_INSNS (42), /* SI */
744 COSTS_N_INSNS (74), /* DI */
745 COSTS_N_INSNS (74)}, /* other */
746 COSTS_N_INSNS (1), /* cost of movsx */
747 COSTS_N_INSNS (1), /* cost of movzx */
748 8, /* "large" insn */
749 9, /* MOVE_RATIO */
750 4, /* cost for loading QImode using movzbl */
751 {3, 4, 3}, /* cost of loading integer registers
752 in QImode, HImode and SImode.
753 Relative to reg-reg move (2). */
754 {3, 4, 3}, /* cost of storing integer registers */
755 4, /* cost of reg,reg fld/fst */
756 {4, 4, 12}, /* cost of loading fp registers
757 in SFmode, DFmode and XFmode */
758 {6, 6, 8}, /* cost of storing fp registers
759 in SFmode, DFmode and XFmode */
760 2, /* cost of moving MMX register */
761 {4, 4}, /* cost of loading MMX registers
762 in SImode and DImode */
763 {4, 4}, /* cost of storing MMX registers
764 in SImode and DImode */
765 2, /* cost of moving SSE register */
766 {4, 4, 6}, /* cost of loading SSE registers
767 in SImode, DImode and TImode */
768 {4, 4, 5}, /* cost of storing SSE registers
769 in SImode, DImode and TImode */
770 5, /* MMX or SSE register to integer */
771 64, /* size of l1 cache. */
772 256, /* size of l2 cache. */
773 64, /* size of prefetch block */
774 6, /* number of parallel prefetches */
775 5, /* Branch cost */
776 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
777 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
778 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
779 COSTS_N_INSNS (2), /* cost of FABS instruction. */
780 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
781 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
782 athlon_memcpy,
783 athlon_memset,
784 1, /* scalar_stmt_cost. */
785 1, /* scalar load_cost. */
786 1, /* scalar_store_cost. */
787 1, /* vec_stmt_cost. */
788 1, /* vec_to_scalar_cost. */
789 1, /* scalar_to_vec_cost. */
790 1, /* vec_align_load_cost. */
791 2, /* vec_unalign_load_cost. */
792 1, /* vec_store_cost. */
793 3, /* cond_taken_branch_cost. */
794 1, /* cond_not_taken_branch_cost. */
795 };
797 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
798 small blocks it is better to use a loop. For large blocks, libcall can
799 do non-temporal accesses and beat inline expansion considerably. */
800 static stringop_algs k8_memcpy[2] = {
801 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
802 {-1, rep_prefix_4_byte, false}}},
803 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
804 {-1, libcall, false}}}};
805 static stringop_algs k8_memset[2] = {
806 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
807 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
808 {libcall, {{48, unrolled_loop, false},
809 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
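/* Added note (a sketch under the same assumptions as above): unlike the
   32-bit-only tables earlier in this file, k8_memcpy and k8_memset supply a
   real second element; for k8_memcpy a 64-bit target then uses a short loop
   for blocks of at most 16 bytes, rep_prefix_8_byte up to 8192 bytes, and a
   libcall beyond that, instead of inheriting DUMMY_STRINGOP_ALGS.  */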
810 static const
811 struct processor_costs k8_cost = {
812 COSTS_N_INSNS (1), /* cost of an add instruction */
813 COSTS_N_INSNS (2), /* cost of a lea instruction */
814 COSTS_N_INSNS (1), /* variable shift costs */
815 COSTS_N_INSNS (1), /* constant shift costs */
816 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
817 COSTS_N_INSNS (4), /* HI */
818 COSTS_N_INSNS (3), /* SI */
819 COSTS_N_INSNS (4), /* DI */
820 COSTS_N_INSNS (5)}, /* other */
821 0, /* cost of multiply per each bit set */
822 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
823 COSTS_N_INSNS (26), /* HI */
824 COSTS_N_INSNS (42), /* SI */
825 COSTS_N_INSNS (74), /* DI */
826 COSTS_N_INSNS (74)}, /* other */
827 COSTS_N_INSNS (1), /* cost of movsx */
828 COSTS_N_INSNS (1), /* cost of movzx */
829 8, /* "large" insn */
830 9, /* MOVE_RATIO */
831 4, /* cost for loading QImode using movzbl */
832 {3, 4, 3}, /* cost of loading integer registers
833 in QImode, HImode and SImode.
834 Relative to reg-reg move (2). */
835 {3, 4, 3}, /* cost of storing integer registers */
836 4, /* cost of reg,reg fld/fst */
837 {4, 4, 12}, /* cost of loading fp registers
838 in SFmode, DFmode and XFmode */
839 {6, 6, 8}, /* cost of storing fp registers
840 in SFmode, DFmode and XFmode */
841 2, /* cost of moving MMX register */
842 {3, 3}, /* cost of loading MMX registers
843 in SImode and DImode */
844 {4, 4}, /* cost of storing MMX registers
845 in SImode and DImode */
846 2, /* cost of moving SSE register */
847 {4, 3, 6}, /* cost of loading SSE registers
848 in SImode, DImode and TImode */
849 {4, 4, 5}, /* cost of storing SSE registers
850 in SImode, DImode and TImode */
851 5, /* MMX or SSE register to integer */
852 64, /* size of l1 cache. */
853 512, /* size of l2 cache. */
854 64, /* size of prefetch block */
855 /* New AMD processors never drop prefetches; if they cannot be performed
856 immediately, they are queued. We set the number of simultaneous prefetches
857 to a large constant to reflect this (it probably is not a good idea not
858 to limit the number of prefetches at all, as their execution also takes
859 some time). */
860 100, /* number of parallel prefetches */
861 3, /* Branch cost */
862 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
863 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
864 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
865 COSTS_N_INSNS (2), /* cost of FABS instruction. */
866 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
867 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
869 k8_memcpy,
870 k8_memset,
871 4, /* scalar_stmt_cost. */
872 2, /* scalar load_cost. */
873 2, /* scalar_store_cost. */
874 5, /* vec_stmt_cost. */
875 0, /* vec_to_scalar_cost. */
876 2, /* scalar_to_vec_cost. */
877 2, /* vec_align_load_cost. */
878 3, /* vec_unalign_load_cost. */
879 3, /* vec_store_cost. */
880 3, /* cond_taken_branch_cost. */
881 2, /* cond_not_taken_branch_cost. */
882 };
884 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
885 very small blocks it is better to use a loop. For large blocks, libcall can
886 do non-temporal accesses and beat inline expansion considerably. */
887 static stringop_algs amdfam10_memcpy[2] = {
888 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
889 {-1, rep_prefix_4_byte, false}}},
890 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
891 {-1, libcall, false}}}};
892 static stringop_algs amdfam10_memset[2] = {
893 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
894 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
895 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
896 {-1, libcall, false}}}};
897 struct processor_costs amdfam10_cost = {
898 COSTS_N_INSNS (1), /* cost of an add instruction */
899 COSTS_N_INSNS (2), /* cost of a lea instruction */
900 COSTS_N_INSNS (1), /* variable shift costs */
901 COSTS_N_INSNS (1), /* constant shift costs */
902 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
903 COSTS_N_INSNS (4), /* HI */
904 COSTS_N_INSNS (3), /* SI */
905 COSTS_N_INSNS (4), /* DI */
906 COSTS_N_INSNS (5)}, /* other */
907 0, /* cost of multiply per each bit set */
908 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
909 COSTS_N_INSNS (35), /* HI */
910 COSTS_N_INSNS (51), /* SI */
911 COSTS_N_INSNS (83), /* DI */
912 COSTS_N_INSNS (83)}, /* other */
913 COSTS_N_INSNS (1), /* cost of movsx */
914 COSTS_N_INSNS (1), /* cost of movzx */
915 8, /* "large" insn */
916 9, /* MOVE_RATIO */
917 4, /* cost for loading QImode using movzbl */
918 {3, 4, 3}, /* cost of loading integer registers
919 in QImode, HImode and SImode.
920 Relative to reg-reg move (2). */
921 {3, 4, 3}, /* cost of storing integer registers */
922 4, /* cost of reg,reg fld/fst */
923 {4, 4, 12}, /* cost of loading fp registers
924 in SFmode, DFmode and XFmode */
925 {6, 6, 8}, /* cost of storing fp registers
926 in SFmode, DFmode and XFmode */
927 2, /* cost of moving MMX register */
928 {3, 3}, /* cost of loading MMX registers
929 in SImode and DImode */
930 {4, 4}, /* cost of storing MMX registers
931 in SImode and DImode */
932 2, /* cost of moving SSE register */
933 {4, 4, 3}, /* cost of loading SSE registers
934 in SImode, DImode and TImode */
935 {4, 4, 5}, /* cost of storing SSE registers
936 in SImode, DImode and TImode */
937 3, /* MMX or SSE register to integer */
938 /* On K8:
939 MOVD reg64, xmmreg Double FSTORE 4
940 MOVD reg32, xmmreg Double FSTORE 4
941 On AMDFAM10:
942 MOVD reg64, xmmreg Double FADD 3
943 1/1 1/1
944 MOVD reg32, xmmreg Double FADD 3
945 1/1 1/1 */
946 64, /* size of l1 cache. */
947 512, /* size of l2 cache. */
948 64, /* size of prefetch block */
949 /* New AMD processors never drop prefetches; if they cannot be performed
950 immediately, they are queued. We set the number of simultaneous prefetches
951 to a large constant to reflect this (it probably is not a good idea not
952 to limit the number of prefetches at all, as their execution also takes
953 some time). */
954 100, /* number of parallel prefetches */
955 2, /* Branch cost */
956 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
957 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
958 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
959 COSTS_N_INSNS (2), /* cost of FABS instruction. */
960 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
961 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
963 amdfam10_memcpy,
964 amdfam10_memset,
965 4, /* scalar_stmt_cost. */
966 2, /* scalar load_cost. */
967 2, /* scalar_store_cost. */
968 6, /* vec_stmt_cost. */
969 0, /* vec_to_scalar_cost. */
970 2, /* scalar_to_vec_cost. */
971 2, /* vec_align_load_cost. */
972 2, /* vec_unalign_load_cost. */
973 2, /* vec_store_cost. */
974 2, /* cond_taken_branch_cost. */
975 1, /* cond_not_taken_branch_cost. */
976 };
978 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
979 very small blocks it is better to use a loop. For large blocks, libcall
980 can do non-temporal accesses and beat inline expansion considerably. */
981 static stringop_algs bdver1_memcpy[2] = {
982 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
983 {-1, rep_prefix_4_byte, false}}},
984 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
985 {-1, libcall, false}}}};
986 static stringop_algs bdver1_memset[2] = {
987 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
988 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
989 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
990 {-1, libcall, false}}}};
992 const struct processor_costs bdver1_cost = {
993 COSTS_N_INSNS (1), /* cost of an add instruction */
994 COSTS_N_INSNS (1), /* cost of a lea instruction */
995 COSTS_N_INSNS (1), /* variable shift costs */
996 COSTS_N_INSNS (1), /* constant shift costs */
997 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
998 COSTS_N_INSNS (4), /* HI */
999 COSTS_N_INSNS (4), /* SI */
1000 COSTS_N_INSNS (6), /* DI */
1001 COSTS_N_INSNS (6)}, /* other */
1002 0, /* cost of multiply per each bit set */
1003 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1004 COSTS_N_INSNS (35), /* HI */
1005 COSTS_N_INSNS (51), /* SI */
1006 COSTS_N_INSNS (83), /* DI */
1007 COSTS_N_INSNS (83)}, /* other */
1008 COSTS_N_INSNS (1), /* cost of movsx */
1009 COSTS_N_INSNS (1), /* cost of movzx */
1010 8, /* "large" insn */
1011 9, /* MOVE_RATIO */
1012 4, /* cost for loading QImode using movzbl */
1013 {5, 5, 4}, /* cost of loading integer registers
1014 in QImode, HImode and SImode.
1015 Relative to reg-reg move (2). */
1016 {4, 4, 4}, /* cost of storing integer registers */
1017 2, /* cost of reg,reg fld/fst */
1018 {5, 5, 12}, /* cost of loading fp registers
1019 in SFmode, DFmode and XFmode */
1020 {4, 4, 8}, /* cost of storing fp registers
1021 in SFmode, DFmode and XFmode */
1022 2, /* cost of moving MMX register */
1023 {4, 4}, /* cost of loading MMX registers
1024 in SImode and DImode */
1025 {4, 4}, /* cost of storing MMX registers
1026 in SImode and DImode */
1027 2, /* cost of moving SSE register */
1028 {4, 4, 4}, /* cost of loading SSE registers
1029 in SImode, DImode and TImode */
1030 {4, 4, 4}, /* cost of storing SSE registers
1031 in SImode, DImode and TImode */
1032 2, /* MMX or SSE register to integer */
1033 /* On K8:
1034 MOVD reg64, xmmreg Double FSTORE 4
1035 MOVD reg32, xmmreg Double FSTORE 4
1036 On AMDFAM10:
1037 MOVD reg64, xmmreg Double FADD 3
1038 1/1 1/1
1039 MOVD reg32, xmmreg Double FADD 3
1040 1/1 1/1 */
1041 16, /* size of l1 cache. */
1042 2048, /* size of l2 cache. */
1043 64, /* size of prefetch block */
1044 /* New AMD processors never drop prefetches; if they cannot be performed
1045 immediately, they are queued. We set the number of simultaneous prefetches
1046 to a large constant to reflect this (it probably is not a good idea not
1047 to limit the number of prefetches at all, as their execution also takes
1048 some time). */
1049 100, /* number of parallel prefetches */
1050 2, /* Branch cost */
1051 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1052 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1053 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1054 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1055 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1056 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1058 bdver1_memcpy,
1059 bdver1_memset,
1060 6, /* scalar_stmt_cost. */
1061 4, /* scalar load_cost. */
1062 4, /* scalar_store_cost. */
1063 6, /* vec_stmt_cost. */
1064 0, /* vec_to_scalar_cost. */
1065 2, /* scalar_to_vec_cost. */
1066 4, /* vec_align_load_cost. */
1067 4, /* vec_unalign_load_cost. */
1068 4, /* vec_store_cost. */
1069 4, /* cond_taken_branch_cost. */
1070 2, /* cond_not_taken_branch_cost. */
1071 };
1073 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1074 very small blocks it is better to use a loop. For large blocks, libcall
1075 can do non-temporal accesses and beat inline expansion considerably. */
1077 static stringop_algs bdver2_memcpy[2] = {
1078 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1079 {-1, rep_prefix_4_byte, false}}},
1080 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1081 {-1, libcall, false}}}};
1082 static stringop_algs bdver2_memset[2] = {
1083 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1084 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1085 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1086 {-1, libcall, false}}}};
1088 const struct processor_costs bdver2_cost = {
1089 COSTS_N_INSNS (1), /* cost of an add instruction */
1090 COSTS_N_INSNS (1), /* cost of a lea instruction */
1091 COSTS_N_INSNS (1), /* variable shift costs */
1092 COSTS_N_INSNS (1), /* constant shift costs */
1093 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1094 COSTS_N_INSNS (4), /* HI */
1095 COSTS_N_INSNS (4), /* SI */
1096 COSTS_N_INSNS (6), /* DI */
1097 COSTS_N_INSNS (6)}, /* other */
1098 0, /* cost of multiply per each bit set */
1099 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1100 COSTS_N_INSNS (35), /* HI */
1101 COSTS_N_INSNS (51), /* SI */
1102 COSTS_N_INSNS (83), /* DI */
1103 COSTS_N_INSNS (83)}, /* other */
1104 COSTS_N_INSNS (1), /* cost of movsx */
1105 COSTS_N_INSNS (1), /* cost of movzx */
1106 8, /* "large" insn */
1107 9, /* MOVE_RATIO */
1108 4, /* cost for loading QImode using movzbl */
1109 {5, 5, 4}, /* cost of loading integer registers
1110 in QImode, HImode and SImode.
1111 Relative to reg-reg move (2). */
1112 {4, 4, 4}, /* cost of storing integer registers */
1113 2, /* cost of reg,reg fld/fst */
1114 {5, 5, 12}, /* cost of loading fp registers
1115 in SFmode, DFmode and XFmode */
1116 {4, 4, 8}, /* cost of storing fp registers
1117 in SFmode, DFmode and XFmode */
1118 2, /* cost of moving MMX register */
1119 {4, 4}, /* cost of loading MMX registers
1120 in SImode and DImode */
1121 {4, 4}, /* cost of storing MMX registers
1122 in SImode and DImode */
1123 2, /* cost of moving SSE register */
1124 {4, 4, 4}, /* cost of loading SSE registers
1125 in SImode, DImode and TImode */
1126 {4, 4, 4}, /* cost of storing SSE registers
1127 in SImode, DImode and TImode */
1128 2, /* MMX or SSE register to integer */
1129 /* On K8:
1130 MOVD reg64, xmmreg Double FSTORE 4
1131 MOVD reg32, xmmreg Double FSTORE 4
1132 On AMDFAM10:
1133 MOVD reg64, xmmreg Double FADD 3
1134 1/1 1/1
1135 MOVD reg32, xmmreg Double FADD 3
1136 1/1 1/1 */
1137 16, /* size of l1 cache. */
1138 2048, /* size of l2 cache. */
1139 64, /* size of prefetch block */
1140 /* New AMD processors never drop prefetches; if they cannot be performed
1141 immediately, they are queued. We set the number of simultaneous prefetches
1142 to a large constant to reflect this (it probably is not a good idea not
1143 to limit the number of prefetches at all, as their execution also takes
1144 some time). */
1145 100, /* number of parallel prefetches */
1146 2, /* Branch cost */
1147 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1148 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1149 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1150 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1151 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1152 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1154 bdver2_memcpy,
1155 bdver2_memset,
1156 6, /* scalar_stmt_cost. */
1157 4, /* scalar load_cost. */
1158 4, /* scalar_store_cost. */
1159 6, /* vec_stmt_cost. */
1160 0, /* vec_to_scalar_cost. */
1161 2, /* scalar_to_vec_cost. */
1162 4, /* vec_align_load_cost. */
1163 4, /* vec_unalign_load_cost. */
1164 4, /* vec_store_cost. */
1165 4, /* cond_taken_branch_cost. */
1166 2, /* cond_not_taken_branch_cost. */
1167 };
1170 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1171 very small blocks it is better to use a loop. For large blocks, libcall
1172 can do non-temporal accesses and beat inline expansion considerably. */
1173 static stringop_algs bdver3_memcpy[2] = {
1174 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1175 {-1, rep_prefix_4_byte, false}}},
1176 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1177 {-1, libcall, false}}}};
1178 static stringop_algs bdver3_memset[2] = {
1179 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1180 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1181 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1182 {-1, libcall, false}}}};
1183 struct processor_costs bdver3_cost = {
1184 COSTS_N_INSNS (1), /* cost of an add instruction */
1185 COSTS_N_INSNS (1), /* cost of a lea instruction */
1186 COSTS_N_INSNS (1), /* variable shift costs */
1187 COSTS_N_INSNS (1), /* constant shift costs */
1188 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1189 COSTS_N_INSNS (4), /* HI */
1190 COSTS_N_INSNS (4), /* SI */
1191 COSTS_N_INSNS (6), /* DI */
1192 COSTS_N_INSNS (6)}, /* other */
1193 0, /* cost of multiply per each bit set */
1194 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1195 COSTS_N_INSNS (35), /* HI */
1196 COSTS_N_INSNS (51), /* SI */
1197 COSTS_N_INSNS (83), /* DI */
1198 COSTS_N_INSNS (83)}, /* other */
1199 COSTS_N_INSNS (1), /* cost of movsx */
1200 COSTS_N_INSNS (1), /* cost of movzx */
1201 8, /* "large" insn */
1202 9, /* MOVE_RATIO */
1203 4, /* cost for loading QImode using movzbl */
1204 {5, 5, 4}, /* cost of loading integer registers
1205 in QImode, HImode and SImode.
1206 Relative to reg-reg move (2). */
1207 {4, 4, 4}, /* cost of storing integer registers */
1208 2, /* cost of reg,reg fld/fst */
1209 {5, 5, 12}, /* cost of loading fp registers
1210 in SFmode, DFmode and XFmode */
1211 {4, 4, 8}, /* cost of storing fp registers
1212 in SFmode, DFmode and XFmode */
1213 2, /* cost of moving MMX register */
1214 {4, 4}, /* cost of loading MMX registers
1215 in SImode and DImode */
1216 {4, 4}, /* cost of storing MMX registers
1217 in SImode and DImode */
1218 2, /* cost of moving SSE register */
1219 {4, 4, 4}, /* cost of loading SSE registers
1220 in SImode, DImode and TImode */
1221 {4, 4, 4}, /* cost of storing SSE registers
1222 in SImode, DImode and TImode */
1223 2, /* MMX or SSE register to integer */
1224 16, /* size of l1 cache. */
1225 2048, /* size of l2 cache. */
1226 64, /* size of prefetch block */
1227 /* New AMD processors never drop prefetches; if they cannot be performed
1228 immediately, they are queued. We set the number of simultaneous prefetches
1229 to a large constant to reflect this (it probably is not a good idea not
1230 to limit the number of prefetches at all, as their execution also takes
1231 some time). */
1232 100, /* number of parallel prefetches */
1233 2, /* Branch cost */
1234 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1235 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1236 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1237 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1238 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1239 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1241 bdver3_memcpy,
1242 bdver3_memset,
1243 6, /* scalar_stmt_cost. */
1244 4, /* scalar load_cost. */
1245 4, /* scalar_store_cost. */
1246 6, /* vec_stmt_cost. */
1247 0, /* vec_to_scalar_cost. */
1248 2, /* scalar_to_vec_cost. */
1249 4, /* vec_align_load_cost. */
1250 4, /* vec_unalign_load_cost. */
1251 4, /* vec_store_cost. */
1252 4, /* cond_taken_branch_cost. */
1253 2, /* cond_not_taken_branch_cost. */
1254 };
1256 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1257 very small blocks it is better to use a loop. For large blocks, libcall
1258 can do non-temporal accesses and beat inline expansion considerably. */
1259 static stringop_algs bdver4_memcpy[2] = {
1260 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1261 {-1, rep_prefix_4_byte, false}}},
1262 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1263 {-1, libcall, false}}}};
1264 static stringop_algs bdver4_memset[2] = {
1265 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1266 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1267 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1268 {-1, libcall, false}}}};
1269 struct processor_costs bdver4_cost = {
1270 COSTS_N_INSNS (1), /* cost of an add instruction */
1271 COSTS_N_INSNS (1), /* cost of a lea instruction */
1272 COSTS_N_INSNS (1), /* variable shift costs */
1273 COSTS_N_INSNS (1), /* constant shift costs */
1274 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1275 COSTS_N_INSNS (4), /* HI */
1276 COSTS_N_INSNS (4), /* SI */
1277 COSTS_N_INSNS (6), /* DI */
1278 COSTS_N_INSNS (6)}, /* other */
1279 0, /* cost of multiply per each bit set */
1280 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1281 COSTS_N_INSNS (35), /* HI */
1282 COSTS_N_INSNS (51), /* SI */
1283 COSTS_N_INSNS (83), /* DI */
1284 COSTS_N_INSNS (83)}, /* other */
1285 COSTS_N_INSNS (1), /* cost of movsx */
1286 COSTS_N_INSNS (1), /* cost of movzx */
1287 8, /* "large" insn */
1288 9, /* MOVE_RATIO */
1289 4, /* cost for loading QImode using movzbl */
1290 {5, 5, 4}, /* cost of loading integer registers
1291 in QImode, HImode and SImode.
1292 Relative to reg-reg move (2). */
1293 {4, 4, 4}, /* cost of storing integer registers */
1294 2, /* cost of reg,reg fld/fst */
1295 {5, 5, 12}, /* cost of loading fp registers
1296 in SFmode, DFmode and XFmode */
1297 {4, 4, 8}, /* cost of storing fp registers
1298 in SFmode, DFmode and XFmode */
1299 2, /* cost of moving MMX register */
1300 {4, 4}, /* cost of loading MMX registers
1301 in SImode and DImode */
1302 {4, 4}, /* cost of storing MMX registers
1303 in SImode and DImode */
1304 2, /* cost of moving SSE register */
1305 {4, 4, 4}, /* cost of loading SSE registers
1306 in SImode, DImode and TImode */
1307 {4, 4, 4}, /* cost of storing SSE registers
1308 in SImode, DImode and TImode */
1309 2, /* MMX or SSE register to integer */
1310 16, /* size of l1 cache. */
1311 2048, /* size of l2 cache. */
1312 64, /* size of prefetch block */
1313 /* New AMD processors never drop prefetches; if they cannot be performed
1314 immediately, they are queued. We set the number of simultaneous prefetches
1315 to a large constant to reflect this (it probably is not a good idea not
1316 to limit the number of prefetches at all, as their execution also takes
1317 some time). */
1318 100, /* number of parallel prefetches */
1319 2, /* Branch cost */
1320 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1321 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1322 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1323 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1324 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1325 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1327 bdver4_memcpy,
1328 bdver4_memset,
1329 6, /* scalar_stmt_cost. */
1330 4, /* scalar load_cost. */
1331 4, /* scalar_store_cost. */
1332 6, /* vec_stmt_cost. */
1333 0, /* vec_to_scalar_cost. */
1334 2, /* scalar_to_vec_cost. */
1335 4, /* vec_align_load_cost. */
1336 4, /* vec_unalign_load_cost. */
1337 4, /* vec_store_cost. */
1338 4, /* cond_taken_branch_cost. */
1339 2, /* cond_not_taken_branch_cost. */
1340 };
1343 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1344 very small blocks it is better to use a loop. For large blocks, libcall
1345 can do non-temporal accesses and beat inline expansion considerably. */
1346 static stringop_algs znver1_memcpy[2] = {
1347 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1348 {-1, rep_prefix_4_byte, false}}},
1349 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1350 {-1, libcall, false}}}};
1351 static stringop_algs znver1_memset[2] = {
1352 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1353 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1354 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1355 {-1, libcall, false}}}};
1356 struct processor_costs znver1_cost = {
1357 COSTS_N_INSNS (1), /* cost of an add instruction. */
1358 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1359 COSTS_N_INSNS (1), /* variable shift costs. */
1360 COSTS_N_INSNS (1), /* constant shift costs. */
1361 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1362 COSTS_N_INSNS (3), /* HI. */
1363 COSTS_N_INSNS (3), /* SI. */
1364 COSTS_N_INSNS (4), /* DI. */
1365 COSTS_N_INSNS (4)}, /* other. */
1366 0, /* cost of multiply per each bit
1367 set. */
1368 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1369 COSTS_N_INSNS (35), /* HI. */
1370 COSTS_N_INSNS (51), /* SI. */
1371 COSTS_N_INSNS (83), /* DI. */
1372 COSTS_N_INSNS (83)}, /* other. */
1373 COSTS_N_INSNS (1), /* cost of movsx. */
1374 COSTS_N_INSNS (1), /* cost of movzx. */
1375 8, /* "large" insn. */
1376 9, /* MOVE_RATIO. */
1377 4, /* cost for loading QImode using
1378 movzbl. */
1379 {5, 5, 4}, /* cost of loading integer registers
1380 in QImode, HImode and SImode.
1381 Relative to reg-reg move (2). */
1382 {4, 4, 4}, /* cost of storing integer
1383 registers. */
1384 2, /* cost of reg,reg fld/fst. */
1385 {5, 5, 12}, /* cost of loading fp registers
1386 in SFmode, DFmode and XFmode. */
1387 {4, 4, 8}, /* cost of storing fp registers
1388 in SFmode, DFmode and XFmode. */
1389 2, /* cost of moving MMX register. */
1390 {4, 4}, /* cost of loading MMX registers
1391 in SImode and DImode. */
1392 {4, 4}, /* cost of storing MMX registers
1393 in SImode and DImode. */
1394 2, /* cost of moving SSE register. */
1395 {4, 4, 4}, /* cost of loading SSE registers
1396 in SImode, DImode and TImode. */
1397 {4, 4, 4}, /* cost of storing SSE registers
1398 in SImode, DImode and TImode. */
1399 2, /* MMX or SSE register to integer. */
1400 32, /* size of l1 cache. */
1401 512, /* size of l2 cache. */
1402 64, /* size of prefetch block. */
1403 /* New AMD processors never drop prefetches; if they cannot be performed
1404 immediately, they are queued. We set the number of simultaneous prefetches
1405 to a large constant to reflect this (it probably is not a good idea not
1406 to limit the number of prefetches at all, as their execution also takes
1407 some time). */
1408 100, /* number of parallel prefetches. */
1409 2, /* Branch cost. */
1410 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1411 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1412 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1413 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1414 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1415 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1417 znver1_memcpy,
1418 znver1_memset,
1419 6, /* scalar_stmt_cost. */
1420 4, /* scalar load_cost. */
1421 4, /* scalar_store_cost. */
1422 6, /* vec_stmt_cost. */
1423 0, /* vec_to_scalar_cost. */
1424 2, /* scalar_to_vec_cost. */
1425 4, /* vec_align_load_cost. */
1426 4, /* vec_unalign_load_cost. */
1427 4, /* vec_store_cost. */
1428 4, /* cond_taken_branch_cost. */
1429 2, /* cond_not_taken_branch_cost. */
1432 /* BTVER1 has optimized REP instruction for medium-sized blocks, but for
1433 very small blocks it is better to use a loop. For large blocks, libcall can
1434 do nontemporal accesses and beat inline considerably. */
1435 static stringop_algs btver1_memcpy[2] = {
1436 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1437 {-1, rep_prefix_4_byte, false}}},
1438 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1439 {-1, libcall, false}}}};
1440 static stringop_algs btver1_memset[2] = {
1441 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1442 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1443 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1444 {-1, libcall, false}}}};
1445 const struct processor_costs btver1_cost = {
1446 COSTS_N_INSNS (1), /* cost of an add instruction */
1447 COSTS_N_INSNS (2), /* cost of a lea instruction */
1448 COSTS_N_INSNS (1), /* variable shift costs */
1449 COSTS_N_INSNS (1), /* constant shift costs */
1450 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1451 COSTS_N_INSNS (4), /* HI */
1452 COSTS_N_INSNS (3), /* SI */
1453 COSTS_N_INSNS (4), /* DI */
1454 COSTS_N_INSNS (5)}, /* other */
1455 0, /* cost of multiply per each bit set */
1456 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1457 COSTS_N_INSNS (35), /* HI */
1458 COSTS_N_INSNS (51), /* SI */
1459 COSTS_N_INSNS (83), /* DI */
1460 COSTS_N_INSNS (83)}, /* other */
1461 COSTS_N_INSNS (1), /* cost of movsx */
1462 COSTS_N_INSNS (1), /* cost of movzx */
1463 8, /* "large" insn */
1464 9, /* MOVE_RATIO */
1465 4, /* cost for loading QImode using movzbl */
1466 {3, 4, 3}, /* cost of loading integer registers
1467 in QImode, HImode and SImode.
1468 Relative to reg-reg move (2). */
1469 {3, 4, 3}, /* cost of storing integer registers */
1470 4, /* cost of reg,reg fld/fst */
1471 {4, 4, 12}, /* cost of loading fp registers
1472 in SFmode, DFmode and XFmode */
1473 {6, 6, 8}, /* cost of storing fp registers
1474 in SFmode, DFmode and XFmode */
1475 2, /* cost of moving MMX register */
1476 {3, 3}, /* cost of loading MMX registers
1477 in SImode and DImode */
1478 {4, 4}, /* cost of storing MMX registers
1479 in SImode and DImode */
1480 2, /* cost of moving SSE register */
1481 {4, 4, 3}, /* cost of loading SSE registers
1482 in SImode, DImode and TImode */
1483 {4, 4, 5}, /* cost of storing SSE registers
1484 in SImode, DImode and TImode */
1485 3, /* MMX or SSE register to integer */
1486 /* On K8:
1487 MOVD reg64, xmmreg Double FSTORE 4
1488 MOVD reg32, xmmreg Double FSTORE 4
1489 On AMDFAM10:
1490 MOVD reg64, xmmreg Double FADD 3
1491 1/1 1/1
1492 MOVD reg32, xmmreg Double FADD 3
1493 1/1 1/1 */
1494 32, /* size of l1 cache. */
1495 512, /* size of l2 cache. */
1496 64, /* size of prefetch block */
1497 100, /* number of parallel prefetches */
1498 2, /* Branch cost */
1499 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1500 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1501 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1502 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1503 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1504 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1506 btver1_memcpy,
1507 btver1_memset,
1508 4, /* scalar_stmt_cost. */
1509 2, /* scalar load_cost. */
1510 2, /* scalar_store_cost. */
1511 6, /* vec_stmt_cost. */
1512 0, /* vec_to_scalar_cost. */
1513 2, /* scalar_to_vec_cost. */
1514 2, /* vec_align_load_cost. */
1515 2, /* vec_unalign_load_cost. */
1516 2, /* vec_store_cost. */
1517 2, /* cond_taken_branch_cost. */
1518 1, /* cond_not_taken_branch_cost. */
1521 static stringop_algs btver2_memcpy[2] = {
1522 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1523 {-1, rep_prefix_4_byte, false}}},
1524 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1525 {-1, libcall, false}}}};
1526 static stringop_algs btver2_memset[2] = {
1527 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1528 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1529 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1530 {-1, libcall, false}}}};
1531 const struct processor_costs btver2_cost = {
1532 COSTS_N_INSNS (1), /* cost of an add instruction */
1533 COSTS_N_INSNS (2), /* cost of a lea instruction */
1534 COSTS_N_INSNS (1), /* variable shift costs */
1535 COSTS_N_INSNS (1), /* constant shift costs */
1536 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1537 COSTS_N_INSNS (4), /* HI */
1538 COSTS_N_INSNS (3), /* SI */
1539 COSTS_N_INSNS (4), /* DI */
1540 COSTS_N_INSNS (5)}, /* other */
1541 0, /* cost of multiply per each bit set */
1542 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1543 COSTS_N_INSNS (35), /* HI */
1544 COSTS_N_INSNS (51), /* SI */
1545 COSTS_N_INSNS (83), /* DI */
1546 COSTS_N_INSNS (83)}, /* other */
1547 COSTS_N_INSNS (1), /* cost of movsx */
1548 COSTS_N_INSNS (1), /* cost of movzx */
1549 8, /* "large" insn */
1550 9, /* MOVE_RATIO */
1551 4, /* cost for loading QImode using movzbl */
1552 {3, 4, 3}, /* cost of loading integer registers
1553 in QImode, HImode and SImode.
1554 Relative to reg-reg move (2). */
1555 {3, 4, 3}, /* cost of storing integer registers */
1556 4, /* cost of reg,reg fld/fst */
1557 {4, 4, 12}, /* cost of loading fp registers
1558 in SFmode, DFmode and XFmode */
1559 {6, 6, 8}, /* cost of storing fp registers
1560 in SFmode, DFmode and XFmode */
1561 2, /* cost of moving MMX register */
1562 {3, 3}, /* cost of loading MMX registers
1563 in SImode and DImode */
1564 {4, 4}, /* cost of storing MMX registers
1565 in SImode and DImode */
1566 2, /* cost of moving SSE register */
1567 {4, 4, 3}, /* cost of loading SSE registers
1568 in SImode, DImode and TImode */
1569 {4, 4, 5}, /* cost of storing SSE registers
1570 in SImode, DImode and TImode */
1571 3, /* MMX or SSE register to integer */
1572 /* On K8:
1573 MOVD reg64, xmmreg Double FSTORE 4
1574 MOVD reg32, xmmreg Double FSTORE 4
1575 On AMDFAM10:
1576 MOVD reg64, xmmreg Double FADD 3
1577 1/1 1/1
1578 MOVD reg32, xmmreg Double FADD 3
1579 1/1 1/1 */
1580 32, /* size of l1 cache. */
1581 2048, /* size of l2 cache. */
1582 64, /* size of prefetch block */
1583 100, /* number of parallel prefetches */
1584 2, /* Branch cost */
1585 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1586 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1587 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1588 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1589 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1590 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1591 btver2_memcpy,
1592 btver2_memset,
1593 4, /* scalar_stmt_cost. */
1594 2, /* scalar load_cost. */
1595 2, /* scalar_store_cost. */
1596 6, /* vec_stmt_cost. */
1597 0, /* vec_to_scalar_cost. */
1598 2, /* scalar_to_vec_cost. */
1599 2, /* vec_align_load_cost. */
1600 2, /* vec_unalign_load_cost. */
1601 2, /* vec_store_cost. */
1602 2, /* cond_taken_branch_cost. */
1603 1, /* cond_not_taken_branch_cost. */
1606 static stringop_algs pentium4_memcpy[2] = {
1607 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1608 DUMMY_STRINGOP_ALGS};
1609 static stringop_algs pentium4_memset[2] = {
1610 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1611 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1612 DUMMY_STRINGOP_ALGS};
1614 static const
1615 struct processor_costs pentium4_cost = {
1616 COSTS_N_INSNS (1), /* cost of an add instruction */
1617 COSTS_N_INSNS (3), /* cost of a lea instruction */
1618 COSTS_N_INSNS (4), /* variable shift costs */
1619 COSTS_N_INSNS (4), /* constant shift costs */
1620 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1621 COSTS_N_INSNS (15), /* HI */
1622 COSTS_N_INSNS (15), /* SI */
1623 COSTS_N_INSNS (15), /* DI */
1624 COSTS_N_INSNS (15)}, /* other */
1625 0, /* cost of multiply per each bit set */
1626 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1627 COSTS_N_INSNS (56), /* HI */
1628 COSTS_N_INSNS (56), /* SI */
1629 COSTS_N_INSNS (56), /* DI */
1630 COSTS_N_INSNS (56)}, /* other */
1631 COSTS_N_INSNS (1), /* cost of movsx */
1632 COSTS_N_INSNS (1), /* cost of movzx */
1633 16, /* "large" insn */
1634 6, /* MOVE_RATIO */
1635 2, /* cost for loading QImode using movzbl */
1636 {4, 5, 4}, /* cost of loading integer registers
1637 in QImode, HImode and SImode.
1638 Relative to reg-reg move (2). */
1639 {2, 3, 2}, /* cost of storing integer registers */
1640 2, /* cost of reg,reg fld/fst */
1641 {2, 2, 6}, /* cost of loading fp registers
1642 in SFmode, DFmode and XFmode */
1643 {4, 4, 6}, /* cost of storing fp registers
1644 in SFmode, DFmode and XFmode */
1645 2, /* cost of moving MMX register */
1646 {2, 2}, /* cost of loading MMX registers
1647 in SImode and DImode */
1648 {2, 2}, /* cost of storing MMX registers
1649 in SImode and DImode */
1650 12, /* cost of moving SSE register */
1651 {12, 12, 12}, /* cost of loading SSE registers
1652 in SImode, DImode and TImode */
1653 {2, 2, 8}, /* cost of storing SSE registers
1654 in SImode, DImode and TImode */
1655 10, /* MMX or SSE register to integer */
1656 8, /* size of l1 cache. */
1657 256, /* size of l2 cache. */
1658 64, /* size of prefetch block */
1659 6, /* number of parallel prefetches */
1660 2, /* Branch cost */
1661 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1662 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1663 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1664 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1665 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1666 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1667 pentium4_memcpy,
1668 pentium4_memset,
1669 1, /* scalar_stmt_cost. */
1670 1, /* scalar load_cost. */
1671 1, /* scalar_store_cost. */
1672 1, /* vec_stmt_cost. */
1673 1, /* vec_to_scalar_cost. */
1674 1, /* scalar_to_vec_cost. */
1675 1, /* vec_align_load_cost. */
1676 2, /* vec_unalign_load_cost. */
1677 1, /* vec_store_cost. */
1678 3, /* cond_taken_branch_cost. */
1679 1, /* cond_not_taken_branch_cost. */
1682 static stringop_algs nocona_memcpy[2] = {
1683 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1684 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1685 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1687 static stringop_algs nocona_memset[2] = {
1688 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1689 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1690 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1691 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1693 static const
1694 struct processor_costs nocona_cost = {
1695 COSTS_N_INSNS (1), /* cost of an add instruction */
1696 COSTS_N_INSNS (1), /* cost of a lea instruction */
1697 COSTS_N_INSNS (1), /* variable shift costs */
1698 COSTS_N_INSNS (1), /* constant shift costs */
1699 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1700 COSTS_N_INSNS (10), /* HI */
1701 COSTS_N_INSNS (10), /* SI */
1702 COSTS_N_INSNS (10), /* DI */
1703 COSTS_N_INSNS (10)}, /* other */
1704 0, /* cost of multiply per each bit set */
1705 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1706 COSTS_N_INSNS (66), /* HI */
1707 COSTS_N_INSNS (66), /* SI */
1708 COSTS_N_INSNS (66), /* DI */
1709 COSTS_N_INSNS (66)}, /* other */
1710 COSTS_N_INSNS (1), /* cost of movsx */
1711 COSTS_N_INSNS (1), /* cost of movzx */
1712 16, /* "large" insn */
1713 17, /* MOVE_RATIO */
1714 4, /* cost for loading QImode using movzbl */
1715 {4, 4, 4}, /* cost of loading integer registers
1716 in QImode, HImode and SImode.
1717 Relative to reg-reg move (2). */
1718 {4, 4, 4}, /* cost of storing integer registers */
1719 3, /* cost of reg,reg fld/fst */
1720 {12, 12, 12}, /* cost of loading fp registers
1721 in SFmode, DFmode and XFmode */
1722 {4, 4, 4}, /* cost of storing fp registers
1723 in SFmode, DFmode and XFmode */
1724 6, /* cost of moving MMX register */
1725 {12, 12}, /* cost of loading MMX registers
1726 in SImode and DImode */
1727 {12, 12}, /* cost of storing MMX registers
1728 in SImode and DImode */
1729 6, /* cost of moving SSE register */
1730 {12, 12, 12}, /* cost of loading SSE registers
1731 in SImode, DImode and TImode */
1732 {12, 12, 12}, /* cost of storing SSE registers
1733 in SImode, DImode and TImode */
1734 8, /* MMX or SSE register to integer */
1735 8, /* size of l1 cache. */
1736 1024, /* size of l2 cache. */
1737 64, /* size of prefetch block */
1738 8, /* number of parallel prefetches */
1739 1, /* Branch cost */
1740 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1741 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1742 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1743 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1744 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1745 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1746 nocona_memcpy,
1747 nocona_memset,
1748 1, /* scalar_stmt_cost. */
1749 1, /* scalar load_cost. */
1750 1, /* scalar_store_cost. */
1751 1, /* vec_stmt_cost. */
1752 1, /* vec_to_scalar_cost. */
1753 1, /* scalar_to_vec_cost. */
1754 1, /* vec_align_load_cost. */
1755 2, /* vec_unalign_load_cost. */
1756 1, /* vec_store_cost. */
1757 3, /* cond_taken_branch_cost. */
1758 1, /* cond_not_taken_branch_cost. */
1761 static stringop_algs atom_memcpy[2] = {
1762 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1763 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1764 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1765 static stringop_algs atom_memset[2] = {
1766 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1767 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1768 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1769 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1770 static const
1771 struct processor_costs atom_cost = {
1772 COSTS_N_INSNS (1), /* cost of an add instruction */
1773 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1774 COSTS_N_INSNS (1), /* variable shift costs */
1775 COSTS_N_INSNS (1), /* constant shift costs */
1776 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1777 COSTS_N_INSNS (4), /* HI */
1778 COSTS_N_INSNS (3), /* SI */
1779 COSTS_N_INSNS (4), /* DI */
1780 COSTS_N_INSNS (2)}, /* other */
1781 0, /* cost of multiply per each bit set */
1782 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1783 COSTS_N_INSNS (26), /* HI */
1784 COSTS_N_INSNS (42), /* SI */
1785 COSTS_N_INSNS (74), /* DI */
1786 COSTS_N_INSNS (74)}, /* other */
1787 COSTS_N_INSNS (1), /* cost of movsx */
1788 COSTS_N_INSNS (1), /* cost of movzx */
1789 8, /* "large" insn */
1790 17, /* MOVE_RATIO */
1791 4, /* cost for loading QImode using movzbl */
1792 {4, 4, 4}, /* cost of loading integer registers
1793 in QImode, HImode and SImode.
1794 Relative to reg-reg move (2). */
1795 {4, 4, 4}, /* cost of storing integer registers */
1796 4, /* cost of reg,reg fld/fst */
1797 {12, 12, 12}, /* cost of loading fp registers
1798 in SFmode, DFmode and XFmode */
1799 {6, 6, 8}, /* cost of storing fp registers
1800 in SFmode, DFmode and XFmode */
1801 2, /* cost of moving MMX register */
1802 {8, 8}, /* cost of loading MMX registers
1803 in SImode and DImode */
1804 {8, 8}, /* cost of storing MMX registers
1805 in SImode and DImode */
1806 2, /* cost of moving SSE register */
1807 {8, 8, 8}, /* cost of loading SSE registers
1808 in SImode, DImode and TImode */
1809 {8, 8, 8}, /* cost of storing SSE registers
1810 in SImode, DImode and TImode */
1811 5, /* MMX or SSE register to integer */
1812 32, /* size of l1 cache. */
1813 256, /* size of l2 cache. */
1814 64, /* size of prefetch block */
1815 6, /* number of parallel prefetches */
1816 3, /* Branch cost */
1817 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1818 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1819 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1820 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1821 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1822 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1823 atom_memcpy,
1824 atom_memset,
1825 1, /* scalar_stmt_cost. */
1826 1, /* scalar load_cost. */
1827 1, /* scalar_store_cost. */
1828 1, /* vec_stmt_cost. */
1829 1, /* vec_to_scalar_cost. */
1830 1, /* scalar_to_vec_cost. */
1831 1, /* vec_align_load_cost. */
1832 2, /* vec_unalign_load_cost. */
1833 1, /* vec_store_cost. */
1834 3, /* cond_taken_branch_cost. */
1835 1, /* cond_not_taken_branch_cost. */
1838 static stringop_algs slm_memcpy[2] = {
1839 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1840 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1841 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1842 static stringop_algs slm_memset[2] = {
1843 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1844 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1845 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1846 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1847 static const
1848 struct processor_costs slm_cost = {
1849 COSTS_N_INSNS (1), /* cost of an add instruction */
1850 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1851 COSTS_N_INSNS (1), /* variable shift costs */
1852 COSTS_N_INSNS (1), /* constant shift costs */
1853 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1854 COSTS_N_INSNS (3), /* HI */
1855 COSTS_N_INSNS (3), /* SI */
1856 COSTS_N_INSNS (4), /* DI */
1857 COSTS_N_INSNS (2)}, /* other */
1858 0, /* cost of multiply per each bit set */
1859 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1860 COSTS_N_INSNS (26), /* HI */
1861 COSTS_N_INSNS (42), /* SI */
1862 COSTS_N_INSNS (74), /* DI */
1863 COSTS_N_INSNS (74)}, /* other */
1864 COSTS_N_INSNS (1), /* cost of movsx */
1865 COSTS_N_INSNS (1), /* cost of movzx */
1866 8, /* "large" insn */
1867 17, /* MOVE_RATIO */
1868 4, /* cost for loading QImode using movzbl */
1869 {4, 4, 4}, /* cost of loading integer registers
1870 in QImode, HImode and SImode.
1871 Relative to reg-reg move (2). */
1872 {4, 4, 4}, /* cost of storing integer registers */
1873 4, /* cost of reg,reg fld/fst */
1874 {12, 12, 12}, /* cost of loading fp registers
1875 in SFmode, DFmode and XFmode */
1876 {6, 6, 8}, /* cost of storing fp registers
1877 in SFmode, DFmode and XFmode */
1878 2, /* cost of moving MMX register */
1879 {8, 8}, /* cost of loading MMX registers
1880 in SImode and DImode */
1881 {8, 8}, /* cost of storing MMX registers
1882 in SImode and DImode */
1883 2, /* cost of moving SSE register */
1884 {8, 8, 8}, /* cost of loading SSE registers
1885 in SImode, DImode and TImode */
1886 {8, 8, 8}, /* cost of storing SSE registers
1887 in SImode, DImode and TImode */
1888 5, /* MMX or SSE register to integer */
1889 32, /* size of l1 cache. */
1890 256, /* size of l2 cache. */
1891 64, /* size of prefetch block */
1892 6, /* number of parallel prefetches */
1893 3, /* Branch cost */
1894 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1895 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1896 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1897 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1898 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1899 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1900 slm_memcpy,
1901 slm_memset,
1902 1, /* scalar_stmt_cost. */
1903 1, /* scalar load_cost. */
1904 1, /* scalar_store_cost. */
1905 1, /* vec_stmt_cost. */
1906 4, /* vec_to_scalar_cost. */
1907 1, /* scalar_to_vec_cost. */
1908 1, /* vec_align_load_cost. */
1909 2, /* vec_unalign_load_cost. */
1910 1, /* vec_store_cost. */
1911 3, /* cond_taken_branch_cost. */
1912 1, /* cond_not_taken_branch_cost. */
1915 static stringop_algs intel_memcpy[2] = {
1916 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1917 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1918 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1919 static stringop_algs intel_memset[2] = {
1920 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1921 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1922 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1923 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1924 static const
1925 struct processor_costs intel_cost = {
1926 COSTS_N_INSNS (1), /* cost of an add instruction */
1927 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1928 COSTS_N_INSNS (1), /* variable shift costs */
1929 COSTS_N_INSNS (1), /* constant shift costs */
1930 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1931 COSTS_N_INSNS (3), /* HI */
1932 COSTS_N_INSNS (3), /* SI */
1933 COSTS_N_INSNS (4), /* DI */
1934 COSTS_N_INSNS (2)}, /* other */
1935 0, /* cost of multiply per each bit set */
1936 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1937 COSTS_N_INSNS (26), /* HI */
1938 COSTS_N_INSNS (42), /* SI */
1939 COSTS_N_INSNS (74), /* DI */
1940 COSTS_N_INSNS (74)}, /* other */
1941 COSTS_N_INSNS (1), /* cost of movsx */
1942 COSTS_N_INSNS (1), /* cost of movzx */
1943 8, /* "large" insn */
1944 17, /* MOVE_RATIO */
1945 4, /* cost for loading QImode using movzbl */
1946 {4, 4, 4}, /* cost of loading integer registers
1947 in QImode, HImode and SImode.
1948 Relative to reg-reg move (2). */
1949 {4, 4, 4}, /* cost of storing integer registers */
1950 4, /* cost of reg,reg fld/fst */
1951 {12, 12, 12}, /* cost of loading fp registers
1952 in SFmode, DFmode and XFmode */
1953 {6, 6, 8}, /* cost of storing fp registers
1954 in SFmode, DFmode and XFmode */
1955 2, /* cost of moving MMX register */
1956 {8, 8}, /* cost of loading MMX registers
1957 in SImode and DImode */
1958 {8, 8}, /* cost of storing MMX registers
1959 in SImode and DImode */
1960 2, /* cost of moving SSE register */
1961 {8, 8, 8}, /* cost of loading SSE registers
1962 in SImode, DImode and TImode */
1963 {8, 8, 8}, /* cost of storing SSE registers
1964 in SImode, DImode and TImode */
1965 5, /* MMX or SSE register to integer */
1966 32, /* size of l1 cache. */
1967 256, /* size of l2 cache. */
1968 64, /* size of prefetch block */
1969 6, /* number of parallel prefetches */
1970 3, /* Branch cost */
1971 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1972 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1973 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1974 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1975 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1976 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1977 intel_memcpy,
1978 intel_memset,
1979 1, /* scalar_stmt_cost. */
1980 1, /* scalar load_cost. */
1981 1, /* scalar_store_cost. */
1982 1, /* vec_stmt_cost. */
1983 4, /* vec_to_scalar_cost. */
1984 1, /* scalar_to_vec_cost. */
1985 1, /* vec_align_load_cost. */
1986 2, /* vec_unalign_load_cost. */
1987 1, /* vec_store_cost. */
1988 3, /* cond_taken_branch_cost. */
1989 1, /* cond_not_taken_branch_cost. */
1992 /* Generic should produce code tuned for Core-i7 (and newer chips)
1993 and btver1 (and newer chips). */
1995 static stringop_algs generic_memcpy[2] = {
1996 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1997 {-1, libcall, false}}},
1998 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1999 {-1, libcall, false}}}};
2000 static stringop_algs generic_memset[2] = {
2001 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2002 {-1, libcall, false}}},
2003 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2004 {-1, libcall, false}}}};
2005 static const
2006 struct processor_costs generic_cost = {
2007 COSTS_N_INSNS (1), /* cost of an add instruction */
2008 /* On all chips taken into consideration, lea is 2 cycles or more. With
2009 this cost, however, our current implementation of synth_mult results in
2010 the use of unnecessary temporary registers, causing regressions on several
2011 SPECfp benchmarks. */
2012 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2013 COSTS_N_INSNS (1), /* variable shift costs */
2014 COSTS_N_INSNS (1), /* constant shift costs */
2015 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2016 COSTS_N_INSNS (4), /* HI */
2017 COSTS_N_INSNS (3), /* SI */
2018 COSTS_N_INSNS (4), /* DI */
2019 COSTS_N_INSNS (2)}, /* other */
2020 0, /* cost of multiply per each bit set */
2021 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2022 COSTS_N_INSNS (26), /* HI */
2023 COSTS_N_INSNS (42), /* SI */
2024 COSTS_N_INSNS (74), /* DI */
2025 COSTS_N_INSNS (74)}, /* other */
2026 COSTS_N_INSNS (1), /* cost of movsx */
2027 COSTS_N_INSNS (1), /* cost of movzx */
2028 8, /* "large" insn */
2029 17, /* MOVE_RATIO */
2030 4, /* cost for loading QImode using movzbl */
2031 {4, 4, 4}, /* cost of loading integer registers
2032 in QImode, HImode and SImode.
2033 Relative to reg-reg move (2). */
2034 {4, 4, 4}, /* cost of storing integer registers */
2035 4, /* cost of reg,reg fld/fst */
2036 {12, 12, 12}, /* cost of loading fp registers
2037 in SFmode, DFmode and XFmode */
2038 {6, 6, 8}, /* cost of storing fp registers
2039 in SFmode, DFmode and XFmode */
2040 2, /* cost of moving MMX register */
2041 {8, 8}, /* cost of loading MMX registers
2042 in SImode and DImode */
2043 {8, 8}, /* cost of storing MMX registers
2044 in SImode and DImode */
2045 2, /* cost of moving SSE register */
2046 {8, 8, 8}, /* cost of loading SSE registers
2047 in SImode, DImode and TImode */
2048 {8, 8, 8}, /* cost of storing SSE registers
2049 in SImode, DImode and TImode */
2050 5, /* MMX or SSE register to integer */
2051 32, /* size of l1 cache. */
2052 512, /* size of l2 cache. */
2053 64, /* size of prefetch block */
2054 6, /* number of parallel prefetches */
2055 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2056 value is increased to the perhaps more appropriate value of 5. */
2057 3, /* Branch cost */
2058 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2059 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2060 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2061 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2062 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2063 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2064 generic_memcpy,
2065 generic_memset,
2066 1, /* scalar_stmt_cost. */
2067 1, /* scalar load_cost. */
2068 1, /* scalar_store_cost. */
2069 1, /* vec_stmt_cost. */
2070 1, /* vec_to_scalar_cost. */
2071 1, /* scalar_to_vec_cost. */
2072 1, /* vec_align_load_cost. */
2073 2, /* vec_unalign_load_cost. */
2074 1, /* vec_store_cost. */
2075 3, /* cond_taken_branch_cost. */
2076 1, /* cond_not_taken_branch_cost. */
2079 /* core_cost should produce code tuned for the Core family of CPUs. */
2080 static stringop_algs core_memcpy[2] = {
2081 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2082 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2083 {-1, libcall, false}}}};
2084 static stringop_algs core_memset[2] = {
2085 {libcall, {{6, loop_1_byte, true},
2086 {24, loop, true},
2087 {8192, rep_prefix_4_byte, true},
2088 {-1, libcall, false}}},
2089 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2090 {-1, libcall, false}}}};
2092 static const
2093 struct processor_costs core_cost = {
2094 COSTS_N_INSNS (1), /* cost of an add instruction */
2095 /* On all chips taken into consideration, lea is 2 cycles or more. With
2096 this cost, however, our current implementation of synth_mult results in
2097 the use of unnecessary temporary registers, causing regressions on several
2098 SPECfp benchmarks. */
2099 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2100 COSTS_N_INSNS (1), /* variable shift costs */
2101 COSTS_N_INSNS (1), /* constant shift costs */
2102 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2103 COSTS_N_INSNS (4), /* HI */
2104 COSTS_N_INSNS (3), /* SI */
2105 COSTS_N_INSNS (4), /* DI */
2106 COSTS_N_INSNS (2)}, /* other */
2107 0, /* cost of multiply per each bit set */
2108 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2109 COSTS_N_INSNS (26), /* HI */
2110 COSTS_N_INSNS (42), /* SI */
2111 COSTS_N_INSNS (74), /* DI */
2112 COSTS_N_INSNS (74)}, /* other */
2113 COSTS_N_INSNS (1), /* cost of movsx */
2114 COSTS_N_INSNS (1), /* cost of movzx */
2115 8, /* "large" insn */
2116 17, /* MOVE_RATIO */
2117 4, /* cost for loading QImode using movzbl */
2118 {4, 4, 4}, /* cost of loading integer registers
2119 in QImode, HImode and SImode.
2120 Relative to reg-reg move (2). */
2121 {4, 4, 4}, /* cost of storing integer registers */
2122 4, /* cost of reg,reg fld/fst */
2123 {12, 12, 12}, /* cost of loading fp registers
2124 in SFmode, DFmode and XFmode */
2125 {6, 6, 8}, /* cost of storing fp registers
2126 in SFmode, DFmode and XFmode */
2127 2, /* cost of moving MMX register */
2128 {8, 8}, /* cost of loading MMX registers
2129 in SImode and DImode */
2130 {8, 8}, /* cost of storing MMX registers
2131 in SImode and DImode */
2132 2, /* cost of moving SSE register */
2133 {8, 8, 8}, /* cost of loading SSE registers
2134 in SImode, DImode and TImode */
2135 {8, 8, 8}, /* cost of storing SSE registers
2136 in SImode, DImode and TImode */
2137 5, /* MMX or SSE register to integer */
2138 64, /* size of l1 cache. */
2139 512, /* size of l2 cache. */
2140 64, /* size of prefetch block */
2141 6, /* number of parallel prefetches */
2142 /* FIXME: perhaps a more appropriate value is 5. */
2143 3, /* Branch cost */
2144 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2145 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2146 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2147 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2148 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2149 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2150 core_memcpy,
2151 core_memset,
2152 1, /* scalar_stmt_cost. */
2153 1, /* scalar load_cost. */
2154 1, /* scalar_store_cost. */
2155 1, /* vec_stmt_cost. */
2156 1, /* vec_to_scalar_cost. */
2157 1, /* scalar_to_vec_cost. */
2158 1, /* vec_align_load_cost. */
2159 2, /* vec_unalign_load_cost. */
2160 1, /* vec_store_cost. */
2161 3, /* cond_taken_branch_cost. */
2162 1, /* cond_not_taken_branch_cost. */
2166 /* Set by -mtune. */
2167 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2169 /* Set by -mtune or -Os. */
2170 const struct processor_costs *ix86_cost = &pentium_cost;
2172 /* Processor feature/optimization bitmasks. */
2173 #define m_386 (1U<<PROCESSOR_I386)
2174 #define m_486 (1U<<PROCESSOR_I486)
2175 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2176 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2177 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2178 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2179 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2180 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2181 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2182 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2183 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2184 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2185 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2186 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2187 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2188 #define m_KNL (1U<<PROCESSOR_KNL)
2189 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2190 #define m_INTEL (1U<<PROCESSOR_INTEL)
2192 #define m_GEODE (1U<<PROCESSOR_GEODE)
2193 #define m_K6 (1U<<PROCESSOR_K6)
2194 #define m_K6_GEODE (m_K6 | m_GEODE)
2195 #define m_K8 (1U<<PROCESSOR_K8)
2196 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2197 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2198 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2199 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2200 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2201 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2202 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2203 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2204 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2205 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2206 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2207 #define m_BTVER (m_BTVER1 | m_BTVER2)
2208 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2209 | m_ZNVER1)
2211 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
2213 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2214 #undef DEF_TUNE
2215 #define DEF_TUNE(tune, name, selector) name,
2216 #include "x86-tune.def"
2217 #undef DEF_TUNE
2220 /* Feature tests against the various tunings. */
2221 unsigned char ix86_tune_features[X86_TUNE_LAST];
2223 /* Feature tests against the various tunings used to create ix86_tune_features
2224 based on the processor mask. */
2225 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2226 #undef DEF_TUNE
2227 #define DEF_TUNE(tune, name, selector) selector,
2228 #include "x86-tune.def"
2229 #undef DEF_TUNE
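/* Illustrative note on the X-macro pattern above (hypothetical entry, not
   taken from x86-tune.def): an entry such as

     DEF_TUNE (X86_TUNE_EXAMPLE, "example", m_K8 | m_GENERIC)

   contributes the string "example" to ix86_tune_feature_names and the
   selector mask (m_K8 | m_GENERIC) to initial_ix86_tune_features, so the
   tuning is enabled exactly for the processors whose m_* bit is set in
   that mask.  */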
2232 /* Feature tests against the various architecture variations. */
2233 unsigned char ix86_arch_features[X86_ARCH_LAST];
2235 /* Feature tests against the various architecture variations, used to create
2236 ix86_arch_features based on the processor mask. */
2237 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2238 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2239 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2241 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2242 ~m_386,
2244 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2245 ~(m_386 | m_486),
2247 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2248 ~m_386,
2250 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2251 ~m_386,
2254 /* In case the average insn count for single function invocation is
2255 lower than this constant, emit fast (but longer) prologue and
2256 epilogue code. */
2257 #define FAST_PROLOGUE_INSN_COUNT 20
2259 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2260 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2261 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2262 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2264 /* Array of the smallest class containing reg number REGNO, indexed by
2265 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2267 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2269 /* ax, dx, cx, bx */
2270 AREG, DREG, CREG, BREG,
2271 /* si, di, bp, sp */
2272 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2273 /* FP registers */
2274 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2275 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2276 /* arg pointer */
2277 NON_Q_REGS,
2278 /* flags, fpsr, fpcr, frame */
2279 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2280 /* SSE registers */
2281 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2282 SSE_REGS, SSE_REGS,
2283 /* MMX registers */
2284 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2285 MMX_REGS, MMX_REGS,
2286 /* REX registers */
2287 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2288 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2289 /* SSE REX registers */
2290 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2291 SSE_REGS, SSE_REGS,
2292 /* AVX-512 SSE registers */
2293 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2294 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2295 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2296 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2297 /* Mask registers. */
2298 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2299 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2300 /* MPX bound registers */
2301 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2304 /* The "default" register map used in 32bit mode. */
2306 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2308 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2309 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2310 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2311 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2312 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2313 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2314 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2315 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23 */
2316 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31 */
2317 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2318 101, 102, 103, 104, /* bound registers */
2321 /* The "default" register map used in 64bit mode. */
2323 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2325 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2326 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2327 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2328 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2329 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2330 8,9,10,11,12,13,14,15, /* extended integer registers */
2331 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2332 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2333 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2334 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2335 126, 127, 128, 129, /* bound registers */
2338 /* Define the register numbers to be used in Dwarf debugging information.
2339 The SVR4 reference port C compiler uses the following register numbers
2340 in its Dwarf output code:
2341 0 for %eax (gcc regno = 0)
2342 1 for %ecx (gcc regno = 2)
2343 2 for %edx (gcc regno = 1)
2344 3 for %ebx (gcc regno = 3)
2345 4 for %esp (gcc regno = 7)
2346 5 for %ebp (gcc regno = 6)
2347 6 for %esi (gcc regno = 4)
2348 7 for %edi (gcc regno = 5)
2349 The following three DWARF register numbers are never generated by
2350 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2351 believes these numbers have these meanings.
2352 8 for %eip (no gcc equivalent)
2353 9 for %eflags (gcc regno = 17)
2354 10 for %trapno (no gcc equivalent)
2355 It is not at all clear how we should number the FP stack registers
2356 for the x86 architecture. If the version of SDB on x86/svr4 were
2357 a bit less brain dead with respect to floating-point then we would
2358 have a precedent to follow with respect to DWARF register numbers
2359 for x86 FP registers, but the SDB on x86/svr4 is so completely
2360 broken with respect to FP registers that it is hardly worth thinking
2361 of it as something to strive for compatibility with.
2362 The version of x86/svr4 SDB I have at the moment does (partially)
2363 seem to believe that DWARF register number 11 is associated with
2364 the x86 register %st(0), but that's about all. Higher DWARF
2365 register numbers don't seem to be associated with anything in
2366 particular, and even for DWARF regno 11, SDB only seems to under-
2367 stand that it should say that a variable lives in %st(0) (when
2368 asked via an `=' command) if we said it was in DWARF regno 11,
2369 but SDB still prints garbage when asked for the value of the
2370 variable in question (via a `/' command).
2371 (Also note that the labels SDB prints for various FP stack regs
2372 when doing an `x' command are all wrong.)
2373 Note that these problems generally don't affect the native SVR4
2374 C compiler because it doesn't allow the use of -O with -g and
2375 because when it is *not* optimizing, it allocates a memory
2376 location for each floating-point variable, and the memory
2377 location is what gets described in the DWARF AT_location
2378 attribute for the variable in question.
2379 Regardless of the severe mental illness of the x86/svr4 SDB, we
2380 do something sensible here and we use the following DWARF
2381 register numbers. Note that these are all stack-top-relative
2382 numbers.
2383 11 for %st(0) (gcc regno = 8)
2384 12 for %st(1) (gcc regno = 9)
2385 13 for %st(2) (gcc regno = 10)
2386 14 for %st(3) (gcc regno = 11)
2387 15 for %st(4) (gcc regno = 12)
2388 16 for %st(5) (gcc regno = 13)
2389 17 for %st(6) (gcc regno = 14)
2390 18 for %st(7) (gcc regno = 15)
2392 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2394 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2395 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2396 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2397 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2398 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2399 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2400 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2401 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23 */
2402 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31 */
2403 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2404 101, 102, 103, 104, /* bound registers */
2407 /* Define parameter passing and return registers. */
2409 static int const x86_64_int_parameter_registers[6] =
2411 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2414 static int const x86_64_ms_abi_int_parameter_registers[4] =
2416 CX_REG, DX_REG, R8_REG, R9_REG
2419 static int const x86_64_int_return_registers[4] =
2421 AX_REG, DX_REG, DI_REG, SI_REG
2424 /* Additional registers that are clobbered by SYSV calls. */
2426 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2428 SI_REG, DI_REG,
2429 XMM6_REG, XMM7_REG,
2430 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2431 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2434 /* Define the structure for the machine field in struct function. */
2436 struct GTY(()) stack_local_entry {
2437 unsigned short mode;
2438 unsigned short n;
2439 rtx rtl;
2440 struct stack_local_entry *next;
2443 /* Structure describing stack frame layout.
2444 Stack grows downward:
2446 [arguments]
2447 <- ARG_POINTER
2448 saved pc
2450 saved static chain if ix86_static_chain_on_stack
2452 saved frame pointer if frame_pointer_needed
2453 <- HARD_FRAME_POINTER
2454 [saved regs]
2455 <- regs_save_offset
2456 [padding0]
2458 [saved SSE regs]
2459 <- sse_regs_save_offset
2460 [padding1] |
2461 | <- FRAME_POINTER
2462 [va_arg registers] |
2464 [frame] |
2466 [padding2] | = to_allocate
2467 <- STACK_POINTER
2469 struct ix86_frame
2471 int nsseregs;
2472 int nregs;
2473 int va_arg_size;
2474 int red_zone_size;
2475 int outgoing_arguments_size;
2477 /* The offsets relative to ARG_POINTER. */
2478 HOST_WIDE_INT frame_pointer_offset;
2479 HOST_WIDE_INT hard_frame_pointer_offset;
2480 HOST_WIDE_INT stack_pointer_offset;
2481 HOST_WIDE_INT hfp_save_offset;
2482 HOST_WIDE_INT reg_save_offset;
2483 HOST_WIDE_INT sse_reg_save_offset;
2485 /* When save_regs_using_mov is set, emit prologue using
2486 move instead of push instructions. */
2487 bool save_regs_using_mov;
2490 /* Which CPU are we scheduling for. */
2491 enum attr_cpu ix86_schedule;
2493 /* Which CPU are we optimizing for. */
2494 enum processor_type ix86_tune;
2496 /* Which instruction set architecture to use. */
2497 enum processor_type ix86_arch;
2499 /* True if the processor has the SSE prefetch instruction. */
2500 unsigned char x86_prefetch_sse;
2502 /* -mstackrealign option */
2503 static const char ix86_force_align_arg_pointer_string[]
2504 = "force_align_arg_pointer";
2506 static rtx (*ix86_gen_leave) (void);
2507 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2508 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2509 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2510 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2511 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2512 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2513 static rtx (*ix86_gen_clzero) (rtx);
2514 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2515 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2516 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2517 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2518 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2519 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2521 /* Preferred alignment for stack boundary in bits. */
2522 unsigned int ix86_preferred_stack_boundary;
2524 /* Alignment for incoming stack boundary in bits specified at
2525 command line. */
2526 static unsigned int ix86_user_incoming_stack_boundary;
2528 /* Default alignment for incoming stack boundary in bits. */
2529 static unsigned int ix86_default_incoming_stack_boundary;
2531 /* Alignment for incoming stack boundary in bits. */
2532 unsigned int ix86_incoming_stack_boundary;
2534 /* Calling ABI-specific va_list type nodes. */
2535 static GTY(()) tree sysv_va_list_type_node;
2536 static GTY(()) tree ms_va_list_type_node;
2538 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2539 char internal_label_prefix[16];
2540 int internal_label_prefix_len;
2542 /* Fence to use after a loop using movnt. */
2543 tree x86_mfence;
2545 /* Register class used for passing a given 64bit part of the argument.
2546 These represent classes as documented by the PS ABI, with the exception
2547 of the SSESF and SSEDF classes, which are basically the SSE class; gcc
2548 just uses an SF or DFmode move instead of DImode to avoid reformatting penalties.
2550 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2551 whenever possible (the upper half does contain padding). */
2552 enum x86_64_reg_class
2554 X86_64_NO_CLASS,
2555 X86_64_INTEGER_CLASS,
2556 X86_64_INTEGERSI_CLASS,
2557 X86_64_SSE_CLASS,
2558 X86_64_SSESF_CLASS,
2559 X86_64_SSEDF_CLASS,
2560 X86_64_SSEUP_CLASS,
2561 X86_64_X87_CLASS,
2562 X86_64_X87UP_CLASS,
2563 X86_64_COMPLEX_X87_CLASS,
2564 X86_64_MEMORY_CLASS
2567 #define MAX_CLASSES 8
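/* Rough illustration of the classification above (the psABI rules are only
   sketched here, not quoted): a plain `double' argument is classed as
   X86_64_SSEDF_CLASS, a plain `int' as X86_64_INTEGERSI_CLASS, a 16-byte
   struct of two `long's as two X86_64_INTEGER_CLASS eightbytes, and an
   aggregate too large or too awkward to pass in registers ends up as
   X86_64_MEMORY_CLASS and is passed on the stack.  */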
2569 /* Table of constants used by fldpi, fldln2, etc. */
2570 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2571 static bool ext_80387_constants_init = 0;
2574 static struct machine_function * ix86_init_machine_status (void);
2575 static rtx ix86_function_value (const_tree, const_tree, bool);
2576 static bool ix86_function_value_regno_p (const unsigned int);
2577 static unsigned int ix86_function_arg_boundary (machine_mode,
2578 const_tree);
2579 static rtx ix86_static_chain (const_tree, bool);
2580 static int ix86_function_regparm (const_tree, const_tree);
2581 static void ix86_compute_frame_layout (struct ix86_frame *);
2582 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2583 rtx, rtx, int);
2584 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
2585 static tree ix86_canonical_va_list_type (tree);
2586 static void predict_jump (int);
2587 static unsigned int split_stack_prologue_scratch_regno (void);
2588 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2590 enum ix86_function_specific_strings
2592 IX86_FUNCTION_SPECIFIC_ARCH,
2593 IX86_FUNCTION_SPECIFIC_TUNE,
2594 IX86_FUNCTION_SPECIFIC_MAX
2597 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
2598 const char *, const char *, enum fpmath_unit,
2599 bool);
2600 static void ix86_function_specific_save (struct cl_target_option *,
2601 struct gcc_options *opts);
2602 static void ix86_function_specific_restore (struct gcc_options *opts,
2603 struct cl_target_option *);
2604 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2605 static void ix86_function_specific_print (FILE *, int,
2606 struct cl_target_option *);
2607 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2608 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2609 struct gcc_options *,
2610 struct gcc_options *,
2611 struct gcc_options *);
2612 static bool ix86_can_inline_p (tree, tree);
2613 static void ix86_set_current_function (tree);
2614 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2616 static enum calling_abi ix86_function_abi (const_tree);
2619 #ifndef SUBTARGET32_DEFAULT_CPU
2620 #define SUBTARGET32_DEFAULT_CPU "i386"
2621 #endif
2623 /* Whether -mtune= or -march= were specified. */
2624 static int ix86_tune_defaulted;
2625 static int ix86_arch_specified;
2627 /* Vectorization library interface and handlers. */
2628 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2630 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2631 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2633 /* Processor target table, indexed by processor number. */
2634 struct ptt
2636 const char *const name; /* processor name */
2637 const struct processor_costs *cost; /* Processor costs */
2638 const int align_loop; /* Default alignments. */
2639 const int align_loop_max_skip;
2640 const int align_jump;
2641 const int align_jump_max_skip;
2642 const int align_func;
2645 /* This table must be in sync with enum processor_type in i386.h. */
2646 static const struct ptt processor_target_table[PROCESSOR_max] =
2648 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2649 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2650 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2651 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2652 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2653 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2654 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2655 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2656 {"core2", &core_cost, 16, 10, 16, 10, 16},
2657 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2658 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2659 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2660 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2661 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2662 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2663 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2664 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2665 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2666 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2667 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2668 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2669 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2670 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2671 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2672 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2673 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2674 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2675 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2676 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
2679 static unsigned int
2680 rest_of_handle_insert_vzeroupper (void)
2682 int i;
2684 /* vzeroupper instructions are inserted immediately after reload to
2685 account for possible spills from 256bit registers. The pass
2686 reuses the mode switching infrastructure by re-running the mode insertion
2687 pass, so disable entities that have already been processed. */
2688 for (i = 0; i < MAX_386_ENTITIES; i++)
2689 ix86_optimize_mode_switching[i] = 0;
2691 ix86_optimize_mode_switching[AVX_U128] = 1;
2693 /* Call optimize_mode_switching. */
2694 g->get_passes ()->execute_pass_mode_switching ();
2695 return 0;
2698 /* Return 1 if INSN uses or defines a hard register.
2699 Hard register uses in a memory address are ignored.
2700 Clobbers and flags definitions are ignored. */
2702 static bool
2703 has_non_address_hard_reg (rtx_insn *insn)
2705 df_ref ref;
2706 FOR_EACH_INSN_DEF (ref, insn)
2707 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2708 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2709 && DF_REF_REGNO (ref) != FLAGS_REG)
2710 return true;
2712 FOR_EACH_INSN_USE (ref, insn)
2713 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2714 return true;
2716 return false;
2719 /* Check if comparison INSN may be transformed
2720 into a vector comparison. Currently we transform
2721 only zero checks which look like:
2723 (set (reg:CCZ 17 flags)
2724 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2725 (subreg:SI (reg:DI x) 0))
2726 (const_int 0 [0]))) */
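/* For illustration: such a pattern typically results from comparing a
   64-bit value against zero on a 32-bit target, e.g.

     long long x;
     if (x == 0) ...

   where the DImode compare is lowered to an IOR of the two SImode halves
   of X feeding the CCZ flags check matched above.  */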
2728 static bool
2729 convertible_comparison_p (rtx_insn *insn)
2731 if (!TARGET_SSE4_1)
2732 return false;
2734 rtx def_set = single_set (insn);
2736 gcc_assert (def_set);
2738 rtx src = SET_SRC (def_set);
2739 rtx dst = SET_DEST (def_set);
2741 gcc_assert (GET_CODE (src) == COMPARE);
2743 if (GET_CODE (dst) != REG
2744 || REGNO (dst) != FLAGS_REG
2745 || GET_MODE (dst) != CCZmode)
2746 return false;
2748 rtx op1 = XEXP (src, 0);
2749 rtx op2 = XEXP (src, 1);
2751 if (op2 != CONST0_RTX (GET_MODE (op2)))
2752 return false;
2754 if (GET_CODE (op1) != IOR)
2755 return false;
2757 op2 = XEXP (op1, 1);
2758 op1 = XEXP (op1, 0);
2760 if (!SUBREG_P (op1)
2761 || !SUBREG_P (op2)
2762 || GET_MODE (op1) != SImode
2763 || GET_MODE (op2) != SImode
2764 || ((SUBREG_BYTE (op1) != 0
2765 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
2766 && (SUBREG_BYTE (op2) != 0
2767 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
2768 return false;
2770 op1 = SUBREG_REG (op1);
2771 op2 = SUBREG_REG (op2);
2773 if (op1 != op2
2774 || !REG_P (op1)
2775 || GET_MODE (op1) != DImode)
2776 return false;
2778 return true;
2781 /* The DImode version of scalar_to_vector_candidate_p. */
2783 static bool
2784 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
2786 rtx def_set = single_set (insn);
2788 if (!def_set)
2789 return false;
2791 if (has_non_address_hard_reg (insn))
2792 return false;
2794 rtx src = SET_SRC (def_set);
2795 rtx dst = SET_DEST (def_set);
2797 if (GET_CODE (src) == COMPARE)
2798 return convertible_comparison_p (insn);
2800 /* We are interested in DImode promotion only. */
2801 if ((GET_MODE (src) != DImode
2802 && !CONST_INT_P (src))
2803 || GET_MODE (dst) != DImode)
2804 return false;
2806 if (!REG_P (dst) && !MEM_P (dst))
2807 return false;
2809 switch (GET_CODE (src))
2811 case ASHIFT:
2812 case LSHIFTRT:
2813 /* FIXME: consider also variable shifts. */
2814 if (!CONST_INT_P (XEXP (src, 1))
2815 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63))
2816 return false;
2817 break;
2819 case PLUS:
2820 case MINUS:
2821 case IOR:
2822 case XOR:
2823 case AND:
2824 if (!REG_P (XEXP (src, 1))
2825 && !MEM_P (XEXP (src, 1))
2826 && !CONST_INT_P (XEXP (src, 1)))
2827 return false;
2828 break;
2830 case NEG:
2831 case NOT:
2832 break;
2834 case REG:
2835 return true;
2837 case MEM:
2838 case CONST_INT:
2839 return REG_P (dst);
2841 default:
2842 return false;
2845 if (!REG_P (XEXP (src, 0))
2846 && !MEM_P (XEXP (src, 0))
2847 && !CONST_INT_P (XEXP (src, 0))
2848 /* Check for andnot case. */
2849 && (GET_CODE (src) != AND
2850 || GET_CODE (XEXP (src, 0)) != NOT
2851 || !REG_P (XEXP (XEXP (src, 0), 0))))
2852 return false;
2854 if ((GET_MODE (XEXP (src, 0)) != DImode
2855 && !CONST_INT_P (XEXP (src, 0)))
2856 || (GET_CODE (src) != NEG
2857 && GET_CODE (src) != NOT
2858 && GET_MODE (XEXP (src, 1)) != DImode
2859 && !CONST_INT_P (XEXP (src, 1))))
2860 return false;
2862 return true;
2865 /* The TImode version of scalar_to_vector_candidate_p. */
2867 static bool
2868 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
2870 rtx def_set = single_set (insn);
2872 if (!def_set)
2873 return false;
2875 if (has_non_address_hard_reg (insn))
2876 return false;
2878 rtx src = SET_SRC (def_set);
2879 rtx dst = SET_DEST (def_set);
2881 /* Only TImode loads and stores are allowed. */
2882 if (GET_MODE (dst) != TImode)
2883 return false;
2885 if (MEM_P (dst))
2887 /* Check for a store. Memory must be aligned or the unaligned store
2888 must be optimal. Only support stores from a register, a standard SSE
2889 constant or a CONST_WIDE_INT generated from a piecewise store.
2891 ??? Verify performance impact before enabling CONST_INT for
2892 __int128 stores. */
2893 if (misaligned_operand (dst, TImode)
2894 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
2895 return false;
2897 switch (GET_CODE (src))
2899 default:
2900 return false;
2902 case REG:
2903 case CONST_WIDE_INT:
2904 return true;
2906 case CONST_INT:
2907 return standard_sse_constant_p (src, TImode);
2910 else if (MEM_P (src))
2912 /* Check for a load. Memory must be aligned or the unaligned load
2913 must be optimal. */
2914 return (REG_P (dst)
2915 && (!misaligned_operand (src, TImode)
2916 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
2919 return false;
2922 /* Return 1 if INSN may be converted into a vector
2923 instruction. */
2925 static bool
2926 scalar_to_vector_candidate_p (rtx_insn *insn)
2928 if (TARGET_64BIT)
2929 return timode_scalar_to_vector_candidate_p (insn);
2930 else
2931 return dimode_scalar_to_vector_candidate_p (insn);
2934 /* The DImode version of remove_non_convertible_regs. */
2936 static void
2937 dimode_remove_non_convertible_regs (bitmap candidates)
2939 bitmap_iterator bi;
2940 unsigned id;
2941 bitmap regs = BITMAP_ALLOC (NULL);
2943 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
2945 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
2946 rtx reg = SET_DEST (def_set);
2948 if (!REG_P (reg)
2949 || bitmap_bit_p (regs, REGNO (reg))
2950 || HARD_REGISTER_P (reg))
2951 continue;
2953 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
2954 def;
2955 def = DF_REF_NEXT_REG (def))
2957 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2959 if (dump_file)
2960 fprintf (dump_file,
2961 "r%d has non convertible definition in insn %d\n",
2962 REGNO (reg), DF_REF_INSN_UID (def));
2964 bitmap_set_bit (regs, REGNO (reg));
2965 break;
2970 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
2972 for (df_ref def = DF_REG_DEF_CHAIN (id);
2973 def;
2974 def = DF_REF_NEXT_REG (def))
2975 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2977 if (dump_file)
2978 fprintf (dump_file, "Removing insn %d from candidates list\n",
2979 DF_REF_INSN_UID (def));
2981 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
2985 BITMAP_FREE (regs);
2988 /* For a register REGNO, scan instructions for its defs and uses.
2989 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
2991 static void
2992 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2993 unsigned int regno)
2995 for (df_ref def = DF_REG_DEF_CHAIN (regno);
2996 def;
2997 def = DF_REF_NEXT_REG (def))
2999 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3001 if (dump_file)
3002 fprintf (dump_file,
3003 "r%d has non convertible def in insn %d\n",
3004 regno, DF_REF_INSN_UID (def));
3006 bitmap_set_bit (regs, regno);
3007 break;
3011 for (df_ref ref = DF_REG_USE_CHAIN (regno);
3012 ref;
3013 ref = DF_REF_NEXT_REG (ref))
3015 /* Debug instructions are skipped. */
3016 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
3017 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3019 if (dump_file)
3020 fprintf (dump_file,
3021 "r%d has non convertible use in insn %d\n",
3022 regno, DF_REF_INSN_UID (ref));
3024 bitmap_set_bit (regs, regno);
3025 break;
3030 /* The TImode version of remove_non_convertible_regs. */
3032 static void
3033 timode_remove_non_convertible_regs (bitmap candidates)
3035 bitmap_iterator bi;
3036 unsigned id;
3037 bitmap regs = BITMAP_ALLOC (NULL);
3039 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3041 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3042 rtx dest = SET_DEST (def_set);
3043 rtx src = SET_SRC (def_set);
3045 if ((!REG_P (dest)
3046 || bitmap_bit_p (regs, REGNO (dest))
3047 || HARD_REGISTER_P (dest))
3048 && (!REG_P (src)
3049 || bitmap_bit_p (regs, REGNO (src))
3050 || HARD_REGISTER_P (src)))
3051 continue;
3053 if (REG_P (dest))
3054 timode_check_non_convertible_regs (candidates, regs,
3055 REGNO (dest));
3057 if (REG_P (src))
3058 timode_check_non_convertible_regs (candidates, regs,
3059 REGNO (src));
3062 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3064 for (df_ref def = DF_REG_DEF_CHAIN (id);
3065 def;
3066 def = DF_REF_NEXT_REG (def))
3067 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3069 if (dump_file)
3070 fprintf (dump_file, "Removing insn %d from candidates list\n",
3071 DF_REF_INSN_UID (def));
3073 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3076 for (df_ref ref = DF_REG_USE_CHAIN (id);
3077 ref;
3078 ref = DF_REF_NEXT_REG (ref))
3079 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3081 if (dump_file)
3082 fprintf (dump_file, "Removing insn %d from candidates list\n",
3083 DF_REF_INSN_UID (ref));
3085 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3089 BITMAP_FREE (regs);
3092 /* For a given bitmap of insn UIDs, scan all instructions and
3093 remove an insn from CANDIDATES if it has both convertible
3094 and non-convertible definitions.
3096 All insns in a bitmap are conversion candidates according to
3097 scalar_to_vector_candidate_p. Currently it implies all insns
3098 are single_set. */
3100 static void
3101 remove_non_convertible_regs (bitmap candidates)
3103 if (TARGET_64BIT)
3104 timode_remove_non_convertible_regs (candidates);
3105 else
3106 dimode_remove_non_convertible_regs (candidates);
3109 class scalar_chain
3111 public:
3112 scalar_chain ();
3113 virtual ~scalar_chain ();
3115 static unsigned max_id;
3117 /* ID of a chain. */
3118 unsigned int chain_id;
3119 /* A queue of instructions to be included into a chain. */
3120 bitmap queue;
3121 /* Instructions included into a chain. */
3122 bitmap insns;
3123 /* All registers defined by a chain. */
3124 bitmap defs;
3125 /* Registers used in both vector and scalar modes. */
3126 bitmap defs_conv;
3128 void build (bitmap candidates, unsigned insn_uid);
3129 virtual int compute_convert_gain () = 0;
3130 int convert ();
3132 protected:
3133 void add_to_queue (unsigned insn_uid);
3134 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3136 private:
3137 void add_insn (bitmap candidates, unsigned insn_uid);
3138 void analyze_register_chain (bitmap candidates, df_ref ref);
3139 virtual void mark_dual_mode_def (df_ref def) = 0;
3140 virtual void convert_insn (rtx_insn *insn) = 0;
3141 virtual void convert_registers () = 0;
3144 class dimode_scalar_chain : public scalar_chain
3146 public:
3147 int compute_convert_gain ();
3148 private:
3149 void mark_dual_mode_def (df_ref def);
3150 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3151 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3152 void convert_insn (rtx_insn *insn);
3153 void convert_op (rtx *op, rtx_insn *insn);
3154 void convert_reg (unsigned regno);
3155 void make_vector_copies (unsigned regno);
3156 void convert_registers ();
3157 int vector_const_cost (rtx exp);
3160 class timode_scalar_chain : public scalar_chain
3162 public:
3163 /* Converting from TImode to V1TImode is always faster. */
3164 int compute_convert_gain () { return 1; }
3166 private:
3167 void mark_dual_mode_def (df_ref def);
3168 void fix_debug_reg_uses (rtx reg);
3169 void convert_insn (rtx_insn *insn);
3170 /* We don't convert registers to a different size. */
3171 void convert_registers () {}
3174 unsigned scalar_chain::max_id = 0;
3176 /* Initialize new chain. */
3178 scalar_chain::scalar_chain ()
3180 chain_id = ++max_id;
3182 if (dump_file)
3183 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3185 bitmap_obstack_initialize (NULL);
3186 insns = BITMAP_ALLOC (NULL);
3187 defs = BITMAP_ALLOC (NULL);
3188 defs_conv = BITMAP_ALLOC (NULL);
3189 queue = NULL;
3192 /* Free chain's data. */
3194 scalar_chain::~scalar_chain ()
3196 BITMAP_FREE (insns);
3197 BITMAP_FREE (defs);
3198 BITMAP_FREE (defs_conv);
3199 bitmap_obstack_release (NULL);
3202 /* Add instruction into chains' queue. */
3204 void
3205 scalar_chain::add_to_queue (unsigned insn_uid)
3207 if (bitmap_bit_p (insns, insn_uid)
3208 || bitmap_bit_p (queue, insn_uid))
3209 return;
3211 if (dump_file)
3212 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3213 insn_uid, chain_id);
3214 bitmap_set_bit (queue, insn_uid);
3217 /* For DImode conversion, mark register defined by DEF as requiring
3218 conversion. */
3220 void
3221 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3223 gcc_assert (DF_REF_REG_DEF_P (def));
3225 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3226 return;
3228 if (dump_file)
3229 fprintf (dump_file,
3230 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3231 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3233 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3236 /* For TImode conversion, it is unused. */
3238 void
3239 timode_scalar_chain::mark_dual_mode_def (df_ref)
3241 gcc_unreachable ();
3244 /* Check REF's chain to add new insns into a queue
3245 and find registers requiring conversion. */
3247 void
3248 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3250 df_link *chain;
3252 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3253 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3254 add_to_queue (DF_REF_INSN_UID (ref));
3256 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3258 unsigned uid = DF_REF_INSN_UID (chain->ref);
3260 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3261 continue;
3263 if (!DF_REF_REG_MEM_P (chain->ref))
3265 if (bitmap_bit_p (insns, uid))
3266 continue;
3268 if (bitmap_bit_p (candidates, uid))
3270 add_to_queue (uid);
3271 continue;
3275 if (DF_REF_REG_DEF_P (chain->ref))
3277 if (dump_file)
3278 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3279 DF_REF_REGNO (chain->ref), uid);
3280 mark_dual_mode_def (chain->ref);
3282 else
3284 if (dump_file)
3285 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3286 DF_REF_REGNO (chain->ref), uid);
3287 mark_dual_mode_def (ref);
3292 /* Add instruction into a chain. */
3294 void
3295 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3297 if (bitmap_bit_p (insns, insn_uid))
3298 return;
3300 if (dump_file)
3301 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3303 bitmap_set_bit (insns, insn_uid);
3305 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3306 rtx def_set = single_set (insn);
3307 if (def_set && REG_P (SET_DEST (def_set))
3308 && !HARD_REGISTER_P (SET_DEST (def_set)))
3309 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3311 df_ref ref;
3312 df_ref def;
3313 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3314 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3315 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3316 def;
3317 def = DF_REF_NEXT_REG (def))
3318 analyze_register_chain (candidates, def);
3319 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3320 if (!DF_REF_REG_MEM_P (ref))
3321 analyze_register_chain (candidates, ref);
3324 /* Build new chain starting from insn INSN_UID recursively
3325 adding all dependent uses and definitions. */
3327 void
3328 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3330 queue = BITMAP_ALLOC (NULL);
3331 bitmap_set_bit (queue, insn_uid);
3333 if (dump_file)
3334 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3336 while (!bitmap_empty_p (queue))
3338 insn_uid = bitmap_first_set_bit (queue);
3339 bitmap_clear_bit (queue, insn_uid);
3340 bitmap_clear_bit (candidates, insn_uid);
3341 add_insn (candidates, insn_uid);
3344 if (dump_file)
3346 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3347 fprintf (dump_file, " insns: ");
3348 dump_bitmap (dump_file, insns);
3349 if (!bitmap_empty_p (defs_conv))
3351 bitmap_iterator bi;
3352 unsigned id;
3353 const char *comma = "";
3354 fprintf (dump_file, " defs to convert: ");
3355 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3357 fprintf (dump_file, "%sr%d", comma, id);
3358 comma = ", ";
3360 fprintf (dump_file, "\n");
3364 BITMAP_FREE (queue);
3367 /* Return the cost of building a vector constant
3368 instead of using a scalar one. */
3371 dimode_scalar_chain::vector_const_cost (rtx exp)
3373 gcc_assert (CONST_INT_P (exp));
3375 if (standard_sse_constant_p (exp, V2DImode))
3376 return COSTS_N_INSNS (1);
3377 return ix86_cost->sse_load[1];
3380 /* Compute a gain for chain conversion. */
3383 dimode_scalar_chain::compute_convert_gain ()
3385 bitmap_iterator bi;
3386 unsigned insn_uid;
3387 int gain = 0;
3388 int cost = 0;
3390 if (dump_file)
3391 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3393 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3395 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3396 rtx def_set = single_set (insn);
3397 rtx src = SET_SRC (def_set);
3398 rtx dst = SET_DEST (def_set);
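/* Compare the cost of the scalar DImode form of this insn with the
   cost of its V2DImode replacement. */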
3400 if (REG_P (src) && REG_P (dst))
3401 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3402 else if (REG_P (src) && MEM_P (dst))
3403 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3404 else if (MEM_P (src) && REG_P (dst))
3405 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3406 else if (GET_CODE (src) == ASHIFT
3407 || GET_CODE (src) == LSHIFTRT)
3409 gain += ix86_cost->add;
3410 if (CONST_INT_P (XEXP (src, 0)))
3411 gain -= vector_const_cost (XEXP (src, 0));
3412 if (CONST_INT_P (XEXP (src, 1))
3413 && INTVAL (XEXP (src, 1)) >= 32)
3414 gain -= COSTS_N_INSNS (1);
3416 else if (GET_CODE (src) == PLUS
3417 || GET_CODE (src) == MINUS
3418 || GET_CODE (src) == IOR
3419 || GET_CODE (src) == XOR
3420 || GET_CODE (src) == AND)
3422 gain += ix86_cost->add;
3423 /* Additional gain for andnot for targets without BMI. */
3424 if (GET_CODE (XEXP (src, 0)) == NOT
3425 && !TARGET_BMI)
3426 gain += 2 * ix86_cost->add;
3428 if (CONST_INT_P (XEXP (src, 0)))
3429 gain -= vector_const_cost (XEXP (src, 0));
3430 if (CONST_INT_P (XEXP (src, 1)))
3431 gain -= vector_const_cost (XEXP (src, 1));
3433 else if (GET_CODE (src) == NEG
3434 || GET_CODE (src) == NOT)
3435 gain += ix86_cost->add - COSTS_N_INSNS (1);
3436 else if (GET_CODE (src) == COMPARE)
3438 /* Assume comparison cost is the same. */
3440 else if (CONST_INT_P (src))
3442 if (REG_P (dst))
3443 gain += COSTS_N_INSNS (2);
3444 else if (MEM_P (dst))
3445 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3446 gain -= vector_const_cost (src);
3448 else
3449 gcc_unreachable ();
3452 if (dump_file)
3453 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3455 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3456 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3458 if (dump_file)
3459 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3461 gain -= cost;
3463 if (dump_file)
3464 fprintf (dump_file, " Total gain: %d\n", gain);
3466 return gain;
3469 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3472 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3474 if (x == reg)
3475 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3477 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3478 int i, j;
3479 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3481 if (fmt[i] == 'e')
3482 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3483 else if (fmt[i] == 'E')
3484 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3485 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3486 reg, new_reg);
3489 return x;
3492 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3494 void
3495 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3496 rtx reg, rtx new_reg)
3498 replace_with_subreg (single_set (insn), reg, new_reg);
3501 /* Insert generated conversion instruction sequence INSNS
3502 after instruction AFTER. A new BB may be required in case the
3503 instruction has an EH region attached. */
3505 void
3506 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3508 if (!control_flow_insn_p (after))
3510 emit_insn_after (insns, after);
3511 return;
3514 basic_block bb = BLOCK_FOR_INSN (after);
3515 edge e = find_fallthru_edge (bb->succs);
3516 gcc_assert (e);
3518 basic_block new_bb = split_edge (e);
3519 emit_insn_after (insns, BB_HEAD (new_bb));
3522 /* Make vector copies for all definitions of register REGNO
3523 and replace its uses in the chain. */
3525 void
3526 dimode_scalar_chain::make_vector_copies (unsigned regno)
3528 rtx reg = regno_reg_rtx[regno];
3529 rtx vreg = gen_reg_rtx (DImode);
3530 df_ref ref;
3532 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3533 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3535 rtx_insn *insn = DF_REF_INSN (ref);
3537 start_sequence ();
3538 if (TARGET_SSE4_1)
3540 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3541 CONST0_RTX (V4SImode),
3542 gen_rtx_SUBREG (SImode, reg, 0)));
3543 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3544 gen_rtx_SUBREG (V4SImode, vreg, 0),
3545 gen_rtx_SUBREG (SImode, reg, 4),
3546 GEN_INT (2)));
3548 else if (TARGET_INTER_UNIT_MOVES_TO_VEC)
3550 rtx tmp = gen_reg_rtx (DImode);
3551 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3552 CONST0_RTX (V4SImode),
3553 gen_rtx_SUBREG (SImode, reg, 0)));
3554 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3555 CONST0_RTX (V4SImode),
3556 gen_rtx_SUBREG (SImode, reg, 4)));
3557 emit_insn (gen_vec_interleave_lowv4si
3558 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3559 gen_rtx_SUBREG (V4SImode, vreg, 0),
3560 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3562 else
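/* Without SSE4.1 or direct GPR-to-XMM moves, build the value
   through a stack temporary. */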
3564 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3565 emit_move_insn (adjust_address (tmp, SImode, 0),
3566 gen_rtx_SUBREG (SImode, reg, 0));
3567 emit_move_insn (adjust_address (tmp, SImode, 4),
3568 gen_rtx_SUBREG (SImode, reg, 4));
3569 emit_move_insn (vreg, tmp);
3571 rtx_insn *seq = get_insns ();
3572 end_sequence ();
3573 emit_conversion_insns (seq, insn);
3575 if (dump_file)
3576 fprintf (dump_file,
3577 " Copied r%d to a vector register r%d for insn %d\n",
3578 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3581 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3582 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3584 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, vreg);
3586 if (dump_file)
3587 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3588 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3592 /* Convert all definitions of register REGNO
3593 and fix its uses. Scalar copies may be created
3594 in case the register is used in a non-convertible insn. */
3596 void
3597 dimode_scalar_chain::convert_reg (unsigned regno)
3599 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3600 rtx reg = regno_reg_rtx[regno];
3601 rtx scopy = NULL_RTX;
3602 df_ref ref;
3603 bitmap conv;
3605 conv = BITMAP_ALLOC (NULL);
3606 bitmap_copy (conv, insns);
3608 if (scalar_copy)
3609 scopy = gen_reg_rtx (DImode);
3611 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3613 rtx_insn *insn = DF_REF_INSN (ref);
3614 rtx def_set = single_set (insn);
3615 rtx src = SET_SRC (def_set);
3616 rtx reg = DF_REF_REG (ref);
3618 if (!MEM_P (src))
3620 replace_with_subreg_in_insn (insn, reg, reg);
3621 bitmap_clear_bit (conv, INSN_UID (insn));
3624 if (scalar_copy)
3626 start_sequence ();
3627 if (TARGET_SSE4_1)
3629 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
3630 emit_insn
3631 (gen_rtx_SET
3632 (gen_rtx_SUBREG (SImode, scopy, 0),
3633 gen_rtx_VEC_SELECT (SImode,
3634 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3636 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
3637 emit_insn
3638 (gen_rtx_SET
3639 (gen_rtx_SUBREG (SImode, scopy, 4),
3640 gen_rtx_VEC_SELECT (SImode,
3641 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3643 else if (TARGET_INTER_UNIT_MOVES_FROM_VEC)
3645 rtx vcopy = gen_reg_rtx (V2DImode);
3646 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3647 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3648 gen_rtx_SUBREG (SImode, vcopy, 0));
3649 emit_move_insn (vcopy,
3650 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3651 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3652 gen_rtx_SUBREG (SImode, vcopy, 0));
3654 else
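/* Without SSE4.1 or direct XMM-to-GPR moves, spill the value
   through a stack temporary. */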
3656 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3657 emit_move_insn (tmp, reg);
3658 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3659 adjust_address (tmp, SImode, 0));
3660 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3661 adjust_address (tmp, SImode, 4));
3663 rtx_insn *seq = get_insns ();
3664 end_sequence ();
3665 emit_conversion_insns (seq, insn);
3667 if (dump_file)
3668 fprintf (dump_file,
3669 " Copied r%d to a scalar register r%d for insn %d\n",
3670 regno, REGNO (scopy), INSN_UID (insn));
3674 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3675 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3677 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3679 rtx def_set = single_set (DF_REF_INSN (ref));
3680 if (!MEM_P (SET_DEST (def_set))
3681 || !REG_P (SET_SRC (def_set)))
3682 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, reg);
3683 bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
3686 /* Skip debug insns and uninitialized uses. */
3687 else if (DF_REF_CHAIN (ref)
3688 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
3690 gcc_assert (scopy);
3691 replace_rtx (DF_REF_INSN (ref), reg, scopy);
3692 df_insn_rescan (DF_REF_INSN (ref));
3695 BITMAP_FREE (conv);
3698 /* Convert operand OP in INSN. We should handle
3699 memory operands and uninitialized registers.
3700 All other register uses are converted during
3701 register conversion. */
3703 void
3704 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
3706 *op = copy_rtx_if_shared (*op);
3708 if (GET_CODE (*op) == NOT)
3710 convert_op (&XEXP (*op, 0), insn);
3711 PUT_MODE (*op, V2DImode);
3713 else if (MEM_P (*op))
3715 rtx tmp = gen_reg_rtx (DImode);
3717 emit_insn_before (gen_move_insn (tmp, *op), insn);
3718 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
3720 if (dump_file)
3721 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
3722 INSN_UID (insn), REGNO (tmp));
3724 else if (REG_P (*op))
3726 /* We may not have converted the register use in case
3727 this register has no definition. Otherwise it
3728 should be converted in convert_reg. */
3729 df_ref ref;
3730 FOR_EACH_INSN_USE (ref, insn)
3731 if (DF_REF_REGNO (ref) == REGNO (*op))
3733 gcc_assert (!DF_REF_CHAIN (ref));
3734 break;
3736 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
3738 else if (CONST_INT_P (*op))
3740 rtx vec_cst;
3741 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
3743 /* Prefer an all-ones vector in case of -1. */
3744 if (constm1_operand (*op, GET_MODE (*op)))
3745 vec_cst = CONSTM1_RTX (V2DImode);
3746 else
3747 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
3748 gen_rtvec (2, *op, const0_rtx));
3750 if (!standard_sse_constant_p (vec_cst, V2DImode))
3752 start_sequence ();
3753 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
3754 rtx_insn *seq = get_insns ();
3755 end_sequence ();
3756 emit_insn_before (seq, insn);
3759 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
3760 *op = tmp;
3762 else
3764 gcc_assert (SUBREG_P (*op));
3765 gcc_assert (GET_MODE (*op) == V2DImode);
3769 /* Convert INSN to vector mode. */
3771 void
3772 dimode_scalar_chain::convert_insn (rtx_insn *insn)
3774 rtx def_set = single_set (insn);
3775 rtx src = SET_SRC (def_set);
3776 rtx dst = SET_DEST (def_set);
3777 rtx subreg;
3779 if (MEM_P (dst) && !REG_P (src))
3781 /* There are no scalar integer instructions and therefore
3782 temporary register usage is required. */
3783 rtx tmp = gen_reg_rtx (DImode);
3784 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
3785 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
3788 switch (GET_CODE (src))
3790 case ASHIFT:
3791 case LSHIFTRT:
3792 convert_op (&XEXP (src, 0), insn);
3793 PUT_MODE (src, V2DImode);
3794 break;
3796 case PLUS:
3797 case MINUS:
3798 case IOR:
3799 case XOR:
3800 case AND:
3801 convert_op (&XEXP (src, 0), insn);
3802 convert_op (&XEXP (src, 1), insn);
3803 PUT_MODE (src, V2DImode);
3804 break;
3806 case NEG:
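/* Negation is expressed as a subtraction from zero. */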
3807 src = XEXP (src, 0);
3808 convert_op (&src, insn);
3809 subreg = gen_reg_rtx (V2DImode);
3810 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
3811 src = gen_rtx_MINUS (V2DImode, subreg, src);
3812 break;
3814 case NOT:
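/* One's complement is expressed as an XOR with all ones. */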
3815 src = XEXP (src, 0);
3816 convert_op (&src, insn);
3817 subreg = gen_reg_rtx (V2DImode);
3818 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
3819 src = gen_rtx_XOR (V2DImode, src, subreg);
3820 break;
3822 case MEM:
3823 if (!REG_P (dst))
3824 convert_op (&src, insn);
3825 break;
3827 case REG:
3828 if (!MEM_P (dst))
3829 convert_op (&src, insn);
3830 break;
3832 case SUBREG:
3833 gcc_assert (GET_MODE (src) == V2DImode);
3834 break;
3836 case COMPARE:
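/* Rewrite the comparison as a ptest: duplicate the low qword into
   both lanes and test the value against itself. */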
3837 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
3839 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
3840 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
3842 if (REG_P (src))
3843 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
3844 else
3845 subreg = copy_rtx_if_shared (src);
3846 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
3847 copy_rtx_if_shared (subreg),
3848 copy_rtx_if_shared (subreg)),
3849 insn);
3850 dst = gen_rtx_REG (CCmode, FLAGS_REG);
3851 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
3852 copy_rtx_if_shared (src)),
3853 UNSPEC_PTEST);
3854 break;
3856 case CONST_INT:
3857 convert_op (&src, insn);
3858 break;
3860 default:
3861 gcc_unreachable ();
3864 SET_SRC (def_set) = src;
3865 SET_DEST (def_set) = dst;
3867 /* Drop possible dead definitions. */
3868 PATTERN (insn) = def_set;
3870 INSN_CODE (insn) = -1;
3871 recog_memoized (insn);
3872 df_insn_rescan (insn);
3875 /* Fix uses of converted REG in debug insns. */
3877 void
3878 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
3880 if (!flag_var_tracking)
3881 return;
3883 df_ref ref, next;
3884 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
3886 rtx_insn *insn = DF_REF_INSN (ref);
3887 /* Make sure the next ref is for a different instruction,
3888 so that we're not affected by the rescan. */
3889 next = DF_REF_NEXT_REG (ref);
3890 while (next && DF_REF_INSN (next) == insn)
3891 next = DF_REF_NEXT_REG (next);
3893 if (DEBUG_INSN_P (insn))
3895 /* It may be a debug insn with a TImode variable in
3896 a register. */
3897 bool changed = false;
3898 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
3900 rtx *loc = DF_REF_LOC (ref);
3901 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
3903 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
3904 changed = true;
3907 if (changed)
3908 df_insn_rescan (insn);
3913 /* Convert INSN from TImode to V1TImode. */
3915 void
3916 timode_scalar_chain::convert_insn (rtx_insn *insn)
3918 rtx def_set = single_set (insn);
3919 rtx src = SET_SRC (def_set);
3920 rtx dst = SET_DEST (def_set);
3922 switch (GET_CODE (dst))
3924 case REG:
3926 rtx tmp = find_reg_equal_equiv_note (insn);
3927 if (tmp)
3928 PUT_MODE (XEXP (tmp, 0), V1TImode);
3929 PUT_MODE (dst, V1TImode);
3930 fix_debug_reg_uses (dst);
3932 break;
3933 case MEM:
3934 PUT_MODE (dst, V1TImode);
3935 break;
3937 default:
3938 gcc_unreachable ();
3941 switch (GET_CODE (src))
3943 case REG:
3944 PUT_MODE (src, V1TImode);
3945 /* Call fix_debug_reg_uses only if SRC is never defined. */
3946 if (!DF_REG_DEF_CHAIN (REGNO (src)))
3947 fix_debug_reg_uses (src);
3948 break;
3950 case MEM:
3951 PUT_MODE (src, V1TImode);
3952 break;
3954 case CONST_WIDE_INT:
3955 if (NONDEBUG_INSN_P (insn))
3957 /* Since there are no instructions to store a 128-bit constant,
3958 a temporary register is required. */
3959 rtx tmp = gen_reg_rtx (V1TImode);
3960 start_sequence ();
3961 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
3962 src = validize_mem (force_const_mem (V1TImode, src));
3963 rtx_insn *seq = get_insns ();
3964 end_sequence ();
3965 if (seq)
3966 emit_insn_before (seq, insn);
3967 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3968 dst = tmp;
3970 break;
3972 case CONST_INT:
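/* Only the standard SSE constants 0 and -1 are expected here;
   replace them with the matching vector constant. */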
3973 switch (standard_sse_constant_p (src, TImode))
3975 case 1:
3976 src = CONST0_RTX (GET_MODE (dst));
3977 break;
3978 case 2:
3979 src = CONSTM1_RTX (GET_MODE (dst));
3980 break;
3981 default:
3982 gcc_unreachable ();
3984 if (NONDEBUG_INSN_P (insn))
3986 rtx tmp = gen_reg_rtx (V1TImode);
3987 /* Since there are no instructions to store a standard SSE
3988 constant, a temporary register is required. */
3989 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3990 dst = tmp;
3992 break;
3994 default:
3995 gcc_unreachable ();
3998 SET_SRC (def_set) = src;
3999 SET_DEST (def_set) = dst;
4001 /* Drop possible dead definitions. */
4002 PATTERN (insn) = def_set;
4004 INSN_CODE (insn) = -1;
4005 recog_memoized (insn);
4006 df_insn_rescan (insn);
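/* Convert registers defined by the chain and make vector copies of
   registers that the chain uses but does not define. */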
4009 void
4010 dimode_scalar_chain::convert_registers ()
4012 bitmap_iterator bi;
4013 unsigned id;
4015 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
4016 convert_reg (id);
4018 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
4019 make_vector_copies (id);
4022 /* Convert whole chain creating required register
4023 conversions and copies. */
4026 scalar_chain::convert ()
4028 bitmap_iterator bi;
4029 unsigned id;
4030 int converted_insns = 0;
4032 if (!dbg_cnt (stv_conversion))
4033 return 0;
4035 if (dump_file)
4036 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
4038 convert_registers ();
4040 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
4042 convert_insn (DF_INSN_UID_GET (id)->insn);
4043 converted_insns++;
4046 return converted_insns;
4049 /* Main STV pass function. Find and convert scalar
4050 instructions into vector mode when profitable. */
4052 static unsigned int
4053 convert_scalars_to_vector ()
4055 basic_block bb;
4056 bitmap candidates;
4057 int converted_insns = 0;
4059 bitmap_obstack_initialize (NULL);
4060 candidates = BITMAP_ALLOC (NULL);
4062 calculate_dominance_info (CDI_DOMINATORS);
4063 df_set_flags (DF_DEFER_INSN_RESCAN);
4064 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
4065 df_md_add_problem ();
4066 df_analyze ();
4068 /* Find all instructions we want to convert into vector mode. */
4069 if (dump_file)
4070 fprintf (dump_file, "Searching for mode conversion candidates...\n");
4072 FOR_EACH_BB_FN (bb, cfun)
4074 rtx_insn *insn;
4075 FOR_BB_INSNS (bb, insn)
4076 if (scalar_to_vector_candidate_p (insn))
4078 if (dump_file)
4079 fprintf (dump_file, " insn %d is marked as a candidate\n",
4080 INSN_UID (insn));
4082 bitmap_set_bit (candidates, INSN_UID (insn));
4086 remove_non_convertible_regs (candidates);
4088 if (bitmap_empty_p (candidates))
4089 if (dump_file)
4090 fprintf (dump_file, "There are no candidates for optimization.\n");
4092 while (!bitmap_empty_p (candidates))
4094 unsigned uid = bitmap_first_set_bit (candidates);
4095 scalar_chain *chain;
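/* TImode chains are used for 64-bit code, DImode chains for 32-bit code. */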
4097 if (TARGET_64BIT)
4098 chain = new timode_scalar_chain;
4099 else
4100 chain = new dimode_scalar_chain;
4102 /* Find instructions chain we want to convert to vector mode.
4103 Check all uses and definitions to estimate all required
4104 conversions. */
4105 chain->build (candidates, uid);
4107 if (chain->compute_convert_gain () > 0)
4108 converted_insns += chain->convert ();
4109 else
4110 if (dump_file)
4111 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4112 chain->chain_id);
4114 delete chain;
4117 if (dump_file)
4118 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4120 BITMAP_FREE (candidates);
4121 bitmap_obstack_release (NULL);
4122 df_process_deferred_rescans ();
4124 /* Conversion means we may have 128-bit register spills/fills
4125 which require an aligned stack. */
4126 if (converted_insns)
4128 if (crtl->stack_alignment_needed < 128)
4129 crtl->stack_alignment_needed = 128;
4130 if (crtl->stack_alignment_estimated < 128)
4131 crtl->stack_alignment_estimated = 128;
4132 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
4133 if (TARGET_64BIT)
4134 for (tree parm = DECL_ARGUMENTS (current_function_decl);
4135 parm; parm = DECL_CHAIN (parm))
4137 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
4138 continue;
4139 if (DECL_RTL_SET_P (parm)
4140 && GET_MODE (DECL_RTL (parm)) == V1TImode)
4142 rtx r = DECL_RTL (parm);
4143 if (REG_P (r))
4144 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
4146 if (DECL_INCOMING_RTL (parm)
4147 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
4149 rtx r = DECL_INCOMING_RTL (parm);
4150 if (REG_P (r))
4151 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
4156 return 0;
4159 namespace {
4161 const pass_data pass_data_insert_vzeroupper =
4163 RTL_PASS, /* type */
4164 "vzeroupper", /* name */
4165 OPTGROUP_NONE, /* optinfo_flags */
4166 TV_MACH_DEP, /* tv_id */
4167 0, /* properties_required */
4168 0, /* properties_provided */
4169 0, /* properties_destroyed */
4170 0, /* todo_flags_start */
4171 TODO_df_finish, /* todo_flags_finish */
4174 class pass_insert_vzeroupper : public rtl_opt_pass
4176 public:
4177 pass_insert_vzeroupper(gcc::context *ctxt)
4178 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4181 /* opt_pass methods: */
4182 virtual bool gate (function *)
4184 return TARGET_AVX && !TARGET_AVX512F
4185 && TARGET_VZEROUPPER && flag_expensive_optimizations
4186 && !optimize_size;
4189 virtual unsigned int execute (function *)
4191 return rest_of_handle_insert_vzeroupper ();
4194 }; // class pass_insert_vzeroupper
4196 const pass_data pass_data_stv =
4198 RTL_PASS, /* type */
4199 "stv", /* name */
4200 OPTGROUP_NONE, /* optinfo_flags */
4201 TV_MACH_DEP, /* tv_id */
4202 0, /* properties_required */
4203 0, /* properties_provided */
4204 0, /* properties_destroyed */
4205 0, /* todo_flags_start */
4206 TODO_df_finish, /* todo_flags_finish */
4209 class pass_stv : public rtl_opt_pass
4211 public:
4212 pass_stv (gcc::context *ctxt)
4213 : rtl_opt_pass (pass_data_stv, ctxt),
4214 timode_p (false)
4217 /* opt_pass methods: */
4218 virtual bool gate (function *)
4220 return (timode_p == !!TARGET_64BIT
4221 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4224 virtual unsigned int execute (function *)
4226 return convert_scalars_to_vector ();
4229 opt_pass *clone ()
4231 return new pass_stv (m_ctxt);
4234 void set_pass_param (unsigned int n, bool param)
4236 gcc_assert (n == 0);
4237 timode_p = param;
4240 private:
4241 bool timode_p;
4242 }; // class pass_stv
4244 } // anon namespace
4246 rtl_opt_pass *
4247 make_pass_insert_vzeroupper (gcc::context *ctxt)
4249 return new pass_insert_vzeroupper (ctxt);
4252 rtl_opt_pass *
4253 make_pass_stv (gcc::context *ctxt)
4255 return new pass_stv (ctxt);
4258 /* Return true if a red-zone is in use. */
4260 bool
4261 ix86_using_red_zone (void)
4263 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4266 /* Return a string that documents the current -m options. The caller is
4267 responsible for freeing the string. */
4269 static char *
4270 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
4271 int flags, int flags2,
4272 const char *arch, const char *tune,
4273 enum fpmath_unit fpmath, bool add_nl_p)
4275 struct ix86_target_opts
4277 const char *option; /* option string */
4278 HOST_WIDE_INT mask; /* isa mask options */
4281 /* This table is ordered so that options like -msse4.2 that imply other
4282 ISAs come first. The target string will be displayed in the same order. */
4283 static struct ix86_target_opts isa2_opts[] =
4285 { "-mrdpid", OPTION_MASK_ISA_RDPID },
4286 { "-msgx", OPTION_MASK_ISA_SGX },
4287 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
4288 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
4289 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }
4291 static struct ix86_target_opts isa_opts[] =
4293 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4294 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4295 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4296 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4297 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4298 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4299 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4300 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4301 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4302 { "-mavx2", OPTION_MASK_ISA_AVX2 },
4303 { "-mfma", OPTION_MASK_ISA_FMA },
4304 { "-mxop", OPTION_MASK_ISA_XOP },
4305 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4306 { "-mf16c", OPTION_MASK_ISA_F16C },
4307 { "-mavx", OPTION_MASK_ISA_AVX },
4308 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
4309 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4310 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4311 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4312 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4313 { "-msse3", OPTION_MASK_ISA_SSE3 },
4314 { "-maes", OPTION_MASK_ISA_AES },
4315 { "-msha", OPTION_MASK_ISA_SHA },
4316 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4317 { "-msse2", OPTION_MASK_ISA_SSE2 },
4318 { "-msse", OPTION_MASK_ISA_SSE },
4319 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4320 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4321 { "-mmmx", OPTION_MASK_ISA_MMX },
4322 { "-mrtm", OPTION_MASK_ISA_RTM },
4323 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4324 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4325 { "-madx", OPTION_MASK_ISA_ADX },
4326 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4327 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4328 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4329 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4330 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4331 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4332 { "-mabm", OPTION_MASK_ISA_ABM },
4333 { "-mbmi", OPTION_MASK_ISA_BMI },
4334 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4335 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4336 { "-mtbm", OPTION_MASK_ISA_TBM },
4337 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4338 { "-mcx16", OPTION_MASK_ISA_CX16 },
4339 { "-msahf", OPTION_MASK_ISA_SAHF },
4340 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4341 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4342 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4343 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4344 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4345 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4346 { "-mpku", OPTION_MASK_ISA_PKU },
4347 { "-mlwp", OPTION_MASK_ISA_LWP },
4348 { "-mhle", OPTION_MASK_ISA_HLE },
4349 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4350 { "-mmpx", OPTION_MASK_ISA_MPX },
4351 { "-mclwb", OPTION_MASK_ISA_CLWB }
4354 /* Flag options. */
4355 static struct ix86_target_opts flag_opts[] =
4357 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4358 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4359 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4360 { "-m80387", MASK_80387 },
4361 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4362 { "-malign-double", MASK_ALIGN_DOUBLE },
4363 { "-mcld", MASK_CLD },
4364 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4365 { "-mieee-fp", MASK_IEEE_FP },
4366 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4367 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4368 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4369 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4370 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4371 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4372 { "-mno-red-zone", MASK_NO_RED_ZONE },
4373 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4374 { "-mrecip", MASK_RECIP },
4375 { "-mrtd", MASK_RTD },
4376 { "-msseregparm", MASK_SSEREGPARM },
4377 { "-mstack-arg-probe", MASK_STACK_PROBE },
4378 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4379 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4380 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4381 { "-mvzeroupper", MASK_VZEROUPPER },
4382 { "-mstv", MASK_STV },
4383 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
4384 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
4385 { "-mprefer-avx128", MASK_PREFER_AVX128 }
4388 /* Additional flag options. */
4389 static struct ix86_target_opts flag2_opts[] =
4391 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4394 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
4395 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
4397 char isa_other[40];
4398 char isa2_other[40];
4399 char flags_other[40];
4400 char flags2_other[40];
4401 unsigned num = 0;
4402 unsigned i, j;
4403 char *ret;
4404 char *ptr;
4405 size_t len;
4406 size_t line_len;
4407 size_t sep_len;
4408 const char *abi;
4410 memset (opts, '\0', sizeof (opts));
4412 /* Add -march= option. */
4413 if (arch)
4415 opts[num][0] = "-march=";
4416 opts[num++][1] = arch;
4419 /* Add -mtune= option. */
4420 if (tune)
4422 opts[num][0] = "-mtune=";
4423 opts[num++][1] = tune;
4426 /* Add -m32/-m64/-mx32. */
4427 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4429 if ((isa & OPTION_MASK_ABI_64) != 0)
4430 abi = "-m64";
4431 else
4432 abi = "-mx32";
4433 isa &= ~ (OPTION_MASK_ISA_64BIT
4434 | OPTION_MASK_ABI_64
4435 | OPTION_MASK_ABI_X32);
4437 else
4438 abi = "-m32";
4439 opts[num++][0] = abi;
4441 /* Pick out the isa2 options. */
4442 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
4444 if ((isa2 & isa2_opts[i].mask) != 0)
4446 opts[num++][0] = isa2_opts[i].option;
4447 isa2 &= ~ isa2_opts[i].mask;
4451 if (isa2 && add_nl_p)
4453 opts[num++][0] = isa2_other;
4454 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
4457 /* Pick out the isa options. */
4458 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4460 if ((isa & isa_opts[i].mask) != 0)
4462 opts[num++][0] = isa_opts[i].option;
4463 isa &= ~ isa_opts[i].mask;
4467 if (isa && add_nl_p)
4469 opts[num++][0] = isa_other;
4470 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
4473 /* Add flag options. */
4474 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4476 if ((flags & flag_opts[i].mask) != 0)
4478 opts[num++][0] = flag_opts[i].option;
4479 flags &= ~ flag_opts[i].mask;
4483 if (flags && add_nl_p)
4485 opts[num++][0] = flags_other;
4486 sprintf (flags_other, "(other flags: %#x)", flags);
4489 /* Add additional flag options. */
4490 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
4492 if ((flags2 & flag2_opts[i].mask) != 0)
4494 opts[num++][0] = flag2_opts[i].option;
4495 flags2 &= ~ flag2_opts[i].mask;
4499 if (flags2 && add_nl_p)
4501 opts[num++][0] = flags2_other;
4502 sprintf (flags2_other, "(other flags2: %#x)", flags2);
4505 /* Add -fpmath= option. */
4506 if (fpmath)
4508 opts[num][0] = "-mfpmath=";
4509 switch ((int) fpmath)
4511 case FPMATH_387:
4512 opts[num++][1] = "387";
4513 break;
4515 case FPMATH_SSE:
4516 opts[num++][1] = "sse";
4517 break;
4519 case FPMATH_387 | FPMATH_SSE:
4520 opts[num++][1] = "sse+387";
4521 break;
4523 default:
4524 gcc_unreachable ();
4528 /* Any options? */
4529 if (num == 0)
4530 return NULL;
4532 gcc_assert (num < ARRAY_SIZE (opts));
4534 /* Size the string. */
4535 len = 0;
4536 sep_len = (add_nl_p) ? 3 : 1;
4537 for (i = 0; i < num; i++)
4539 len += sep_len;
4540 for (j = 0; j < 2; j++)
4541 if (opts[i][j])
4542 len += strlen (opts[i][j]);
4545 /* Build the string. */
4546 ret = ptr = (char *) xmalloc (len);
4547 line_len = 0;
4549 for (i = 0; i < num; i++)
4551 size_t len2[2];
4553 for (j = 0; j < 2; j++)
4554 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4556 if (i != 0)
4558 *ptr++ = ' ';
4559 line_len++;
4561 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4563 *ptr++ = '\\';
4564 *ptr++ = '\n';
4565 line_len = 0;
4569 for (j = 0; j < 2; j++)
4570 if (opts[i][j])
4572 memcpy (ptr, opts[i][j], len2[j]);
4573 ptr += len2[j];
4574 line_len += len2[j];
4578 *ptr = '\0';
4579 gcc_assert (ret + len >= ptr);
4581 return ret;
4584 /* Return true if profiling code should be emitted before the
4585 prologue. Otherwise return false.
4586 Note: For x86 with "hotfix" it is sorried (i.e. rejected with sorry ()). */
4587 static bool
4588 ix86_profile_before_prologue (void)
4590 return flag_fentry != 0;
4593 /* Function that is callable from the debugger to print the current
4594 options. */
4595 void ATTRIBUTE_UNUSED
4596 ix86_debug_options (void)
4598 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
4599 target_flags, ix86_target_flags,
4600 ix86_arch_string,ix86_tune_string,
4601 ix86_fpmath, true);
4603 if (opts)
4605 fprintf (stderr, "%s\n\n", opts);
4606 free (opts);
4608 else
4609 fputs ("<no options>\n\n", stderr);
4611 return;
4614 /* Return true if T is one of the bytes we should avoid with
4615 -fmitigate-rop. */
4617 static bool
4618 ix86_rop_should_change_byte_p (int t)
4620 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
4623 static const char *stringop_alg_names[] = {
4624 #define DEF_ENUM
4625 #define DEF_ALG(alg, name) #name,
4626 #include "stringop.def"
4627 #undef DEF_ENUM
4628 #undef DEF_ALG
4631 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4632 The string is of the following form (or comma separated list of it):
4634 strategy_alg:max_size:[align|noalign]
4636 where the full size range for the strategy is either [0, max_size] or
4637 [min_size, max_size], in which min_size is the max_size + 1 of the
4638 preceding range. The last size range must have max_size == -1.
4640 Examples:
4643 -mmemcpy-strategy=libcall:-1:noalign
4645 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
4649 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
4651 This is to tell the compiler to use the following strategy for memset
4652 1) when the expected size is between [1, 16], use rep_8byte strategy;
4653 2) when the size is between [17, 2048], use vector_loop;
4654 3) when the size is > 2048, use libcall. */
4656 struct stringop_size_range
4658 int max;
4659 stringop_alg alg;
4660 bool noalign;
4663 static void
4664 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
4666 const struct stringop_algs *default_algs;
4667 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
4668 char *curr_range_str, *next_range_str;
4669 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
4670 int i = 0, n = 0;
4672 if (is_memset)
4673 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
4674 else
4675 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
4677 curr_range_str = strategy_str;
4681 int maxs;
4682 char alg_name[128];
4683 char align[16];
4684 next_range_str = strchr (curr_range_str, ',');
4685 if (next_range_str)
4686 *next_range_str++ = '\0';
4688 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
4689 alg_name, &maxs, align))
4691 error ("wrong argument %qs to option %qs", curr_range_str, opt);
4692 return;
4695 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
4697 error ("size ranges of option %qs should be increasing", opt);
4698 return;
4701 for (i = 0; i < last_alg; i++)
4702 if (!strcmp (alg_name, stringop_alg_names[i]))
4703 break;
4705 if (i == last_alg)
4707 error ("wrong strategy name %qs specified for option %qs",
4708 alg_name, opt);
4710 auto_vec <const char *> candidates;
4711 for (i = 0; i < last_alg; i++)
4712 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
4713 candidates.safe_push (stringop_alg_names[i]);
4715 char *s;
4716 const char *hint
4717 = candidates_list_and_hint (alg_name, s, candidates);
4718 if (hint)
4719 inform (input_location,
4720 "valid arguments to %qs are: %s; did you mean %qs?",
4721 opt, s, hint);
4722 else
4723 inform (input_location, "valid arguments to %qs are: %s",
4724 opt, s);
4725 XDELETEVEC (s);
4726 return;
4729 if ((stringop_alg) i == rep_prefix_8_byte
4730 && !TARGET_64BIT)
4732 /* rep; movq isn't available in 32-bit code. */
4733 error ("strategy name %qs specified for option %qs "
4734 "not supported for 32-bit code", alg_name, opt);
4735 return;
4738 input_ranges[n].max = maxs;
4739 input_ranges[n].alg = (stringop_alg) i;
4740 if (!strcmp (align, "align"))
4741 input_ranges[n].noalign = false;
4742 else if (!strcmp (align, "noalign"))
4743 input_ranges[n].noalign = true;
4744 else
4746 error ("unknown alignment %qs specified for option %qs", align, opt);
4747 return;
4749 n++;
4750 curr_range_str = next_range_str;
4752 while (curr_range_str);
4754 if (input_ranges[n - 1].max != -1)
4756 error ("the max value for the last size range should be -1"
4757 " for option %qs", opt);
4758 return;
4761 if (n > MAX_STRINGOP_ALGS)
4763 error ("too many size ranges specified in option %qs", opt);
4764 return;
4767 /* Now override the default algs array. */
4768 for (i = 0; i < n; i++)
4770 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
4771 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
4772 = input_ranges[i].alg;
4773 *const_cast<int *>(&default_algs->size[i].noalign)
4774 = input_ranges[i].noalign;
4779 /* Parse the -mtune-ctrl= option. When DUMP is true,
4780 print the features that are explicitly set. */
4782 static void
4783 parse_mtune_ctrl_str (bool dump)
4785 if (!ix86_tune_ctrl_string)
4786 return;
4788 char *next_feature_string = NULL;
4789 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
4790 char *orig = curr_feature_string;
4791 int i;
4794 bool clear = false;
4796 next_feature_string = strchr (curr_feature_string, ',');
4797 if (next_feature_string)
4798 *next_feature_string++ = '\0';
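/* A leading '^' means the feature is to be cleared rather than set. */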
4799 if (*curr_feature_string == '^')
4801 curr_feature_string++;
4802 clear = true;
4804 for (i = 0; i < X86_TUNE_LAST; i++)
4806 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
4808 ix86_tune_features[i] = !clear;
4809 if (dump)
4810 fprintf (stderr, "Explicitly %s feature %s\n",
4811 clear ? "clear" : "set", ix86_tune_feature_names[i]);
4812 break;
4815 if (i == X86_TUNE_LAST)
4816 error ("Unknown parameter to option -mtune-ctrl: %s",
4817 clear ? curr_feature_string - 1 : curr_feature_string);
4818 curr_feature_string = next_feature_string;
4820 while (curr_feature_string);
4821 free (orig);
4824 /* Helper function to set ix86_tune_features. IX86_TUNE is the
4825 processor type. */
4827 static void
4828 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
4830 unsigned int ix86_tune_mask = 1u << ix86_tune;
4831 int i;
4833 for (i = 0; i < X86_TUNE_LAST; ++i)
4835 if (ix86_tune_no_default)
4836 ix86_tune_features[i] = 0;
4837 else
4838 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4841 if (dump)
4843 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
4844 for (i = 0; i < X86_TUNE_LAST; i++)
4845 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
4846 ix86_tune_features[i] ? "on" : "off");
4849 parse_mtune_ctrl_str (dump);
4853 /* Default align_* from the processor table. */
4855 static void
4856 ix86_default_align (struct gcc_options *opts)
4858 if (opts->x_align_loops == 0)
4860 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
4861 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
4863 if (opts->x_align_jumps == 0)
4865 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
4866 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
4868 if (opts->x_align_functions == 0)
4870 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
4874 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
4876 static void
4877 ix86_override_options_after_change (void)
4879 ix86_default_align (&global_options);
4882 /* Override various settings based on options. If MAIN_ARGS_P, the
4883 options are from the command line, otherwise they are from
4884 attributes. Return true if there's an error related to the
4885 -march option. */
4887 static bool
4888 ix86_option_override_internal (bool main_args_p,
4889 struct gcc_options *opts,
4890 struct gcc_options *opts_set)
4892 int i;
4893 unsigned int ix86_arch_mask;
4894 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
4896 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
4897 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
4898 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
4899 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
4900 #define PTA_AES (HOST_WIDE_INT_1 << 4)
4901 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
4902 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
4903 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
4904 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
4905 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
4906 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
4907 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
4908 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
4909 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
4910 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
4911 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
4912 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
4913 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
4914 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
4915 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
4916 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
4917 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
4918 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
4919 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
4920 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
4921 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
4922 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
4923 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
4924 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
4925 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
4926 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
4927 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
4928 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
4929 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
4930 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
4931 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
4932 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
4933 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
4934 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
4935 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
4936 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
4937 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
4938 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
4939 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
4940 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
4941 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
4942 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
4943 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
4944 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
4945 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
4946 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
4947 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
4948 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
4949 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
4950 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
4951 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
4952 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
4953 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
4954 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
4955 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
4956 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
4957 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
4958 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
4959 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
4961 #define PTA_CORE2 \
4962 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
4963 | PTA_CX16 | PTA_FXSR)
4964 #define PTA_NEHALEM \
4965 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
4966 #define PTA_WESTMERE \
4967 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
4968 #define PTA_SANDYBRIDGE \
4969 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
4970 #define PTA_IVYBRIDGE \
4971 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
4972 #define PTA_HASWELL \
4973 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
4974 | PTA_FMA | PTA_MOVBE | PTA_HLE)
4975 #define PTA_BROADWELL \
4976 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
4977 #define PTA_SKYLAKE \
4978 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
4979 #define PTA_SKYLAKE_AVX512 \
4980 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
4981 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
4982 #define PTA_KNL \
4983 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
4984 #define PTA_BONNELL \
4985 (PTA_CORE2 | PTA_MOVBE)
4986 #define PTA_SILVERMONT \
4987 (PTA_WESTMERE | PTA_MOVBE)
4989 /* If this reaches 64, we need to widen the struct pta flags below. */
4991 static struct pta
4993 const char *const name; /* processor name or nickname. */
4994 const enum processor_type processor;
4995 const enum attr_cpu schedule;
4996 const unsigned HOST_WIDE_INT flags;
4998 const processor_alias_table[] =
5000 {"i386", PROCESSOR_I386, CPU_NONE, 0},
5001 {"i486", PROCESSOR_I486, CPU_NONE, 0},
5002 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5003 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5004 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
5005 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
5006 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
5007 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5008 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5009 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5010 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5011 PTA_MMX | PTA_SSE | PTA_FXSR},
5012 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5013 PTA_MMX | PTA_SSE | PTA_FXSR},
5014 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5015 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5016 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5017 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5018 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5019 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5020 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
5021 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5022 PTA_MMX | PTA_SSE | PTA_FXSR},
5023 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5024 PTA_MMX | PTA_SSE | PTA_FXSR},
5025 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5026 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5027 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
5028 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
5029 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
5030 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5031 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
5032 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5033 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
5034 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5035 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
5036 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
5037 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5038 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5039 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
5040 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5041 PTA_SANDYBRIDGE},
5042 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5043 PTA_SANDYBRIDGE},
5044 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5045 PTA_IVYBRIDGE},
5046 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5047 PTA_IVYBRIDGE},
5048 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5049 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5050 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
5051 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
5052 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
5053 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5054 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5055 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5056 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5057 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
5058 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
5059 {"geode", PROCESSOR_GEODE, CPU_GEODE,
5060 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5061 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
5062 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5063 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5064 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
5065 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5066 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
5067 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5068 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
5069 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5070 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
5071 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5072 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
5073 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5074 {"x86-64", PROCESSOR_K8, CPU_K8,
5075 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5076 {"eden-x2", PROCESSOR_K8, CPU_K8,
5077 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5078 {"nano", PROCESSOR_K8, CPU_K8,
5079 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5080 | PTA_SSSE3 | PTA_FXSR},
5081 {"nano-1000", PROCESSOR_K8, CPU_K8,
5082 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5083 | PTA_SSSE3 | PTA_FXSR},
5084 {"nano-2000", PROCESSOR_K8, CPU_K8,
5085 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5086 | PTA_SSSE3 | PTA_FXSR},
5087 {"nano-3000", PROCESSOR_K8, CPU_K8,
5088 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5089 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5090 {"nano-x2", PROCESSOR_K8, CPU_K8,
5091 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5092 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5093 {"eden-x4", PROCESSOR_K8, CPU_K8,
5094 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5095 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5096 {"nano-x4", PROCESSOR_K8, CPU_K8,
5097 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5098 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5099 {"k8", PROCESSOR_K8, CPU_K8,
5100 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5101 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5102 {"k8-sse3", PROCESSOR_K8, CPU_K8,
5103 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5104 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5105 {"opteron", PROCESSOR_K8, CPU_K8,
5106 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5107 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5108 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
5109 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5110 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5111 {"athlon64", PROCESSOR_K8, CPU_K8,
5112 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5113 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5114 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
5115 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5116 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5117 {"athlon-fx", PROCESSOR_K8, CPU_K8,
5118 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5119 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5120 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5121 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5122 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5123 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5124 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5125 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5126 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
5127 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5128 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5129 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5130 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5131 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
5132 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5133 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5134 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5135 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5136 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5137 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
5138 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5139 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5140 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5141 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5142 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5143 | PTA_XSAVEOPT | PTA_FSGSBASE},
5144 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5145 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5146 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5147 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5148 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5149 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5150 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5151 | PTA_MOVBE | PTA_MWAITX},
5152 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5153 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5154 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5155 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5156 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5157 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5158 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5159 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5160 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5161 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5162 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5163 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
5164 | PTA_FXSR | PTA_XSAVE},
5165 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5166 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5167 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
5168 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5169 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5170 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5172 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5173 PTA_64BIT
5174 | PTA_HLE /* flags are only used for -march switch. */ },
5177 /* -mrecip options. */
5178 static struct
5180 const char *string; /* option name */
5181 unsigned int mask; /* mask bits to set */
5183 const recip_options[] =
5185 { "all", RECIP_MASK_ALL },
5186 { "none", RECIP_MASK_NONE },
5187 { "div", RECIP_MASK_DIV },
5188 { "sqrt", RECIP_MASK_SQRT },
5189 { "vec-div", RECIP_MASK_VEC_DIV },
5190 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5193 int const pta_size = ARRAY_SIZE (processor_alias_table);
5195 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5196 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5197 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5198 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5199 #ifdef TARGET_BI_ARCH
5200 else
5202 #if TARGET_BI_ARCH == 1
5203 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5204 is on and OPTION_MASK_ABI_X32 is off. We turn off
5205 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5206 -mx32. */
5207 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5208 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5209 #else
5210 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5211 on and OPTION_MASK_ABI_64 is off. We turn off
5212 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5213 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5214 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5215 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5216 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5217 #endif
5218 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5219 && TARGET_IAMCU_P (opts->x_target_flags))
5220 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5221 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5223 #endif
5225 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5227 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5228 OPTION_MASK_ABI_64 for TARGET_X32. */
5229 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5230 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5232 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5233 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5234 | OPTION_MASK_ABI_X32
5235 | OPTION_MASK_ABI_64);
5236 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5238 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5239 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5240 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5241 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
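  /* Worked example of the ABI mask handling above (a reading of the code,
     not a spec): -mx32 keeps OPTION_MASK_ISA_64BIT set but clears
     OPTION_MASK_ABI_64 in favour of OPTION_MASK_ABI_X32; -m64 (LP64) does
     the opposite and clears OPTION_MASK_ABI_X32; -m16 clears
     OPTION_MASK_ISA_64BIT together with both ABI masks; and on a bi-arch
     compiler defaulting to 64-bit, plain -m32 clears both ABI masks as
     well.  */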
5244 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5245 SUBTARGET_OVERRIDE_OPTIONS;
5246 #endif
5248 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5249 SUBSUBTARGET_OVERRIDE_OPTIONS;
5250 #endif
5252 /* -fPIC is the default for x86_64. */
5253 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5254 opts->x_flag_pic = 2;
5256 /* Need to check -mtune=generic first. */
5257 if (opts->x_ix86_tune_string)
5259 /* As special support for cross compilers we read -mtune=native
5260 as -mtune=generic. With native compilers we won't see the
5261 -mtune=native, as it was changed by the driver. */
5262 if (!strcmp (opts->x_ix86_tune_string, "native"))
5264 opts->x_ix86_tune_string = "generic";
5266 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5267 warning (OPT_Wdeprecated,
5268 main_args_p
5269 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5270 "or %<-mtune=generic%> instead as appropriate")
5271 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
5272 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
5273 " instead as appropriate"));
5275 else
5277 if (opts->x_ix86_arch_string)
5278 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5279 if (!opts->x_ix86_tune_string)
5281 opts->x_ix86_tune_string
5282 = processor_target_table[TARGET_CPU_DEFAULT].name;
5283 ix86_tune_defaulted = 1;
5286 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5287 or defaulted. We need to use a sensible tune option. */
5288 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5290 opts->x_ix86_tune_string = "generic";
5294 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5295 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5297 /* rep; movq isn't available in 32-bit code. */
5298 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5299 opts->x_ix86_stringop_alg = no_stringop;
5302 if (!opts->x_ix86_arch_string)
5303 opts->x_ix86_arch_string
5304 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5305 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5306 else
5307 ix86_arch_specified = 1;
5309 if (opts_set->x_ix86_pmode)
5311 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5312 && opts->x_ix86_pmode == PMODE_SI)
5313 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5314 && opts->x_ix86_pmode == PMODE_DI))
5315 error ("address mode %qs not supported in the %s bit mode",
5316 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5317 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5319 else
5320 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5321 ? PMODE_DI : PMODE_SI;
5323 if (!opts_set->x_ix86_abi)
5324 opts->x_ix86_abi = DEFAULT_ABI;
5326 /* For targets using the ms ABI enable ms-extensions, if not
5327 explicitly turned off. For non-ms ABIs we turn off this
5328 option. */
5329 if (!opts_set->x_flag_ms_extensions)
5330 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
5332 if (opts_set->x_ix86_cmodel)
5334 switch (opts->x_ix86_cmodel)
5336 case CM_SMALL:
5337 case CM_SMALL_PIC:
5338 if (opts->x_flag_pic)
5339 opts->x_ix86_cmodel = CM_SMALL_PIC;
5340 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5341 error ("code model %qs not supported in the %s bit mode",
5342 "small", "32");
5343 break;
5345 case CM_MEDIUM:
5346 case CM_MEDIUM_PIC:
5347 if (opts->x_flag_pic)
5348 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5349 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5350 error ("code model %qs not supported in the %s bit mode",
5351 "medium", "32");
5352 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5353 error ("code model %qs not supported in x32 mode",
5354 "medium");
5355 break;
5357 case CM_LARGE:
5358 case CM_LARGE_PIC:
5359 if (opts->x_flag_pic)
5360 opts->x_ix86_cmodel = CM_LARGE_PIC;
5361 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5362 error ("code model %qs not supported in the %s bit mode",
5363 "large", "32");
5364 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5365 error ("code model %qs not supported in x32 mode",
5366 "large");
5367 break;
5369 case CM_32:
5370 if (opts->x_flag_pic)
5371 error ("code model %s does not support PIC mode", "32");
5372 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5373 error ("code model %qs not supported in the %s bit mode",
5374 "32", "64");
5375 break;
5377 case CM_KERNEL:
5378 if (opts->x_flag_pic)
5380 error ("code model %s does not support PIC mode", "kernel");
5381 opts->x_ix86_cmodel = CM_32;
5383 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5384 error ("code model %qs not supported in the %s bit mode",
5385 "kernel", "32");
5386 break;
5388 default:
5389 gcc_unreachable ();
5392 else
5394 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5395 use of rip-relative addressing. This eliminates fixups that
5396 would otherwise be needed if this object is to be placed in a
5397 DLL, and is essentially just as efficient as direct addressing. */
5398 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5399 && (TARGET_RDOS || TARGET_PECOFF))
5400 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5401 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5402 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5403 else
5404 opts->x_ix86_cmodel = CM_32;
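  /* Example of the resulting defaults (following the code above, not a
     full description of -mcmodel semantics): with no explicit -mcmodel,
     64-bit RDOS/PE-COFF targets get CM_MEDIUM_PIC with flag_pic forced
     to 1, other 64-bit targets get CM_SMALL or CM_SMALL_PIC depending on
     -fpic/-fPIC, and 32-bit targets always get CM_32.  An explicit
     -mcmodel=medium combined with -fPIC is promoted to CM_MEDIUM_PIC by
     the switch above.  */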
5406 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5408 error ("-masm=intel not supported in this configuration");
5409 opts->x_ix86_asm_dialect = ASM_ATT;
5411 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5412 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5413 sorry ("%i-bit mode not compiled in",
5414 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5416 for (i = 0; i < pta_size; i++)
5417 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5419 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5421 error (main_args_p
5422 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
5423 "switch")
5424 : G_("%<generic%> CPU can be used only for "
5425 "%<target(\"tune=\")%> attribute"));
5426 return false;
5428 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5430 error (main_args_p
5431 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
5432 "switch")
5433 : G_("%<intel%> CPU can be used only for "
5434 "%<target(\"tune=\")%> attribute"));
5435 return false;
5438 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5439 && !(processor_alias_table[i].flags & PTA_64BIT))
5441 error ("CPU you selected does not support x86-64 "
5442 "instruction set");
5443 return false;
5446 ix86_schedule = processor_alias_table[i].schedule;
5447 ix86_arch = processor_alias_table[i].processor;
5448 /* Default cpu tuning to the architecture. */
5449 ix86_tune = ix86_arch;
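  /* The long chain of checks below follows one pattern: a PTA_<ISA> bit
     from the -march= table turns on the corresponding
     OPTION_MASK_ISA_<ISA> bit only if the user did not set that ISA
     option explicitly (tracked in x_ix86_isa_flags_explicit).  For
     example, -march=haswell -mno-avx2 leaves AVX2 disabled even though
     PTA_HASWELL includes PTA_AVX2, because -mno-avx2 records
     OPTION_MASK_ISA_AVX2 in the explicit mask.  */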
5451 if (processor_alias_table[i].flags & PTA_MMX
5452 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5453 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5454 if (processor_alias_table[i].flags & PTA_3DNOW
5455 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5456 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5457 if (processor_alias_table[i].flags & PTA_3DNOW_A
5458 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5459 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5460 if (processor_alias_table[i].flags & PTA_SSE
5461 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5462 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5463 if (processor_alias_table[i].flags & PTA_SSE2
5464 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5465 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5466 if (processor_alias_table[i].flags & PTA_SSE3
5467 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5468 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5469 if (processor_alias_table[i].flags & PTA_SSSE3
5470 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5471 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5472 if (processor_alias_table[i].flags & PTA_SSE4_1
5473 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5474 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5475 if (processor_alias_table[i].flags & PTA_SSE4_2
5476 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5477 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5478 if (processor_alias_table[i].flags & PTA_AVX
5479 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5480 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5481 if (processor_alias_table[i].flags & PTA_AVX2
5482 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5483 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5484 if (processor_alias_table[i].flags & PTA_FMA
5485 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5486 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5487 if (processor_alias_table[i].flags & PTA_SSE4A
5488 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5489 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5490 if (processor_alias_table[i].flags & PTA_FMA4
5491 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5492 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5493 if (processor_alias_table[i].flags & PTA_XOP
5494 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5495 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5496 if (processor_alias_table[i].flags & PTA_LWP
5497 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5498 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5499 if (processor_alias_table[i].flags & PTA_ABM
5500 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5501 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5502 if (processor_alias_table[i].flags & PTA_BMI
5503 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5504 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5505 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5506 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5507 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5508 if (processor_alias_table[i].flags & PTA_TBM
5509 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5510 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5511 if (processor_alias_table[i].flags & PTA_BMI2
5512 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5513 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5514 if (processor_alias_table[i].flags & PTA_CX16
5515 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5516 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5517 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5518 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5519 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5520 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5521 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5522 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5523 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
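  /* Note on the SAHF case above: unlike the other bits, SAHF is enabled
     by default and only suppressed when compiling 64-bit code for a CPU
     whose table entry carries PTA_NO_SAHF (e.g. nocona, k8, the generic
     x86-64 entry), since early 64-bit processors lacked LAHF/SAHF in
     long mode; in 32-bit code the instruction is always available.  */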
5524 if (processor_alias_table[i].flags & PTA_MOVBE
5525 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5526 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5527 if (processor_alias_table[i].flags & PTA_AES
5528 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5529 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
5530 if (processor_alias_table[i].flags & PTA_SHA
5531 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5532 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5533 if (processor_alias_table[i].flags & PTA_PCLMUL
5534 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5535 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5536 if (processor_alias_table[i].flags & PTA_FSGSBASE
5537 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5538 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5539 if (processor_alias_table[i].flags & PTA_RDRND
5540 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5541 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5542 if (processor_alias_table[i].flags & PTA_F16C
5543 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5544 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5545 if (processor_alias_table[i].flags & PTA_RTM
5546 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5547 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5548 if (processor_alias_table[i].flags & PTA_HLE
5549 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5550 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5551 if (processor_alias_table[i].flags & PTA_PRFCHW
5552 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5553 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5554 if (processor_alias_table[i].flags & PTA_RDSEED
5555 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5556 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5557 if (processor_alias_table[i].flags & PTA_ADX
5558 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5559 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5560 if (processor_alias_table[i].flags & PTA_FXSR
5561 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5562 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5563 if (processor_alias_table[i].flags & PTA_XSAVE
5564 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5565 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5566 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5567 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5568 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5569 if (processor_alias_table[i].flags & PTA_AVX512F
5570 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5571 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5572 if (processor_alias_table[i].flags & PTA_AVX512ER
5573 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5574 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5575 if (processor_alias_table[i].flags & PTA_AVX512PF
5576 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5577 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5578 if (processor_alias_table[i].flags & PTA_AVX512CD
5579 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5580 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5581 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5582 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5583 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5584 if (processor_alias_table[i].flags & PTA_CLWB
5585 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5586 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5587 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5588 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5589 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5590 if (processor_alias_table[i].flags & PTA_CLZERO
5591 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5592 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5593 if (processor_alias_table[i].flags & PTA_XSAVEC
5594 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5595 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5596 if (processor_alias_table[i].flags & PTA_XSAVES
5597 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5598 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5599 if (processor_alias_table[i].flags & PTA_AVX512DQ
5600 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5601 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5602 if (processor_alias_table[i].flags & PTA_AVX512BW
5603 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5604 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5605 if (processor_alias_table[i].flags & PTA_AVX512VL
5606 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5607 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5608 if (processor_alias_table[i].flags & PTA_MPX
5609 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5610 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5611 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5612 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5613 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5614 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5615 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5616 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5618 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
5619 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
5620 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
5621 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
5622 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
5623 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
5624 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
5625 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
5626 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
5627 if (processor_alias_table[i].flags & PTA_SGX
5628 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
5629 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
5631 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
5632 x86_prefetch_sse = true;
5633 if (processor_alias_table[i].flags & PTA_MWAITX
5634 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
5635 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
5636 if (processor_alias_table[i].flags & PTA_PKU
5637 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
5638 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
5640 /* Don't enable x87 instructions if only
5641 general registers are allowed. */
5642 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
5643 && !(opts_set->x_target_flags & MASK_80387))
5645 if (processor_alias_table[i].flags & PTA_NO_80387)
5646 opts->x_target_flags &= ~MASK_80387;
5647 else
5648 opts->x_target_flags |= MASK_80387;
5650 break;
5653 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
5654 error ("Intel MPX does not support x32");
5659 if (i == pta_size)
5661 error (main_args_p
5662 ? G_("bad value (%qs) for %<-march=%> switch")
5663 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
5664 opts->x_ix86_arch_string);
5666 auto_vec <const char *> candidates;
5667 for (i = 0; i < pta_size; i++)
5668 if (strcmp (processor_alias_table[i].name, "generic")
5669 && strcmp (processor_alias_table[i].name, "intel")
5670 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5671 || (processor_alias_table[i].flags & PTA_64BIT)))
5672 candidates.safe_push (processor_alias_table[i].name);
5674 char *s;
5675 const char *hint
5676 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
5677 if (hint)
5678 inform (input_location,
5679 main_args_p
5680 ? G_("valid arguments to %<-march=%> switch are: "
5681 "%s; did you mean %qs?")
5682 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
5683 "%s; did you mean %qs?"), s, hint);
5684 else
5685 inform (input_location,
5686 main_args_p
5687 ? G_("valid arguments to %<-march=%> switch are: %s")
5688 : G_("valid arguments to %<target(\"arch=\")%> attribute "
5689 "are: %s"), s);
5690 XDELETEVEC (s);
5693 ix86_arch_mask = 1u << ix86_arch;
5694 for (i = 0; i < X86_ARCH_LAST; ++i)
5695 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5697 for (i = 0; i < pta_size; i++)
5698 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
5700 ix86_schedule = processor_alias_table[i].schedule;
5701 ix86_tune = processor_alias_table[i].processor;
5702 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5704 if (!(processor_alias_table[i].flags & PTA_64BIT))
5706 if (ix86_tune_defaulted)
5708 opts->x_ix86_tune_string = "x86-64";
5709 for (i = 0; i < pta_size; i++)
5710 if (! strcmp (opts->x_ix86_tune_string,
5711 processor_alias_table[i].name))
5712 break;
5713 ix86_schedule = processor_alias_table[i].schedule;
5714 ix86_tune = processor_alias_table[i].processor;
5716 else
5717 error ("CPU you selected does not support x86-64 "
5718 "instruction set");
5721 /* Intel CPUs have always interpreted SSE prefetch instructions as
5722 NOPs; so, we can enable SSE prefetch instructions even when
5723 -mtune (rather than -march) points us to a processor that has them.
5724 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
5725 higher processors. */
5726 if (TARGET_CMOV
5727 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
5728 x86_prefetch_sse = true;
5729 break;
5732 if (ix86_tune_specified && i == pta_size)
5734 error (main_args_p
5735 ? G_("bad value (%qs) for %<-mtune=%> switch")
5736 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
5737 opts->x_ix86_tune_string);
5739 auto_vec <const char *> candidates;
5740 for (i = 0; i < pta_size; i++)
5741 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5742 || (processor_alias_table[i].flags & PTA_64BIT))
5743 candidates.safe_push (processor_alias_table[i].name);
5745 char *s;
5746 const char *hint
5747 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
5748 if (hint)
5749 inform (input_location,
5750 main_args_p
5751 ? G_("valid arguments to %<-mtune=%> switch are: "
5752 "%s; did you mean %qs?")
5753 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
5754 "%s; did you mean %qs?"), s, hint);
5755 else
5756 inform (input_location,
5757 main_args_p
5758 ? G_("valid arguments to %<-mtune=%> switch are: %s")
5759 : G_("valid arguments to %<target(\"tune=\")%> attribute "
5760 "are: %s"), s);
5761 XDELETEVEC (s);
5764 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
5766 #ifndef USE_IX86_FRAME_POINTER
5767 #define USE_IX86_FRAME_POINTER 0
5768 #endif
5770 #ifndef USE_X86_64_FRAME_POINTER
5771 #define USE_X86_64_FRAME_POINTER 0
5772 #endif
5774 /* Set the default values for switches whose default depends on TARGET_64BIT
5775 in case they weren't overwritten by command line options. */
5776 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5778 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5779 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
5780 if (opts->x_flag_asynchronous_unwind_tables
5781 && !opts_set->x_flag_unwind_tables
5782 && TARGET_64BIT_MS_ABI)
5783 opts->x_flag_unwind_tables = 1;
5784 if (opts->x_flag_asynchronous_unwind_tables == 2)
5785 opts->x_flag_unwind_tables
5786 = opts->x_flag_asynchronous_unwind_tables = 1;
5787 if (opts->x_flag_pcc_struct_return == 2)
5788 opts->x_flag_pcc_struct_return = 0;
5790 else
5792 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5793 opts->x_flag_omit_frame_pointer
5794 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
5795 if (opts->x_flag_asynchronous_unwind_tables == 2)
5796 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
5797 if (opts->x_flag_pcc_struct_return == 2)
5799 /* Intel MCU psABI specifies that -freg-struct-return should
5800 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
5801 we check -miamcu so that -freg-struct-return is always
5802 turned on if -miamcu is used. */
5803 if (TARGET_IAMCU_P (opts->x_target_flags))
5804 opts->x_flag_pcc_struct_return = 0;
5805 else
5806 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
5810 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5811 /* TODO: ix86_cost should be chosen at instruction or function granularity,
5812 so for cold code we use size_cost even in !optimize_size compilation. */
5813 if (opts->x_optimize_size)
5814 ix86_cost = &ix86_size_cost;
5815 else
5816 ix86_cost = ix86_tune_cost;
5818 /* Arrange to set up i386_stack_locals for all functions. */
5819 init_machine_status = ix86_init_machine_status;
5821 /* Validate -mregparm= value. */
5822 if (opts_set->x_ix86_regparm)
5824 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5825 warning (0, "-mregparm is ignored in 64-bit mode");
5826 else if (TARGET_IAMCU_P (opts->x_target_flags))
5827 warning (0, "-mregparm is ignored for Intel MCU psABI");
5828 if (opts->x_ix86_regparm > REGPARM_MAX)
5830 error ("-mregparm=%d is not between 0 and %d",
5831 opts->x_ix86_regparm, REGPARM_MAX);
5832 opts->x_ix86_regparm = 0;
5835 if (TARGET_IAMCU_P (opts->x_target_flags)
5836 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
5837 opts->x_ix86_regparm = REGPARM_MAX;
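  /* Illustration of the -mregparm handling above: in 32-bit code
     -mregparm=N requests that up to N integer arguments be passed in
     registers, and values above REGPARM_MAX are rejected; for the Intel
     MCU psABI and for 64-bit code the option is ignored and
     ix86_regparm is simply forced to REGPARM_MAX, since those ABIs
     already define their own register-passing convention.  */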
5839 /* Default align_* from the processor table. */
5840 ix86_default_align (opts);
5842 /* Provide default for -mbranch-cost= value. */
5843 if (!opts_set->x_ix86_branch_cost)
5844 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
5846 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5848 opts->x_target_flags
5849 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
5851 /* Enable by default the SSE and MMX builtins. Do allow the user to
5852 explicitly disable any of these. In particular, disabling SSE and
5853 MMX for kernel code is extremely useful. */
5854 if (!ix86_arch_specified)
5855 opts->x_ix86_isa_flags
5856 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
5857 | TARGET_SUBTARGET64_ISA_DEFAULT)
5858 & ~opts->x_ix86_isa_flags_explicit);
5860 if (TARGET_RTD_P (opts->x_target_flags))
5861 warning (0,
5862 main_args_p
5863 ? G_("%<-mrtd%> is ignored in 64bit mode")
5864 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
5866 else
5868 opts->x_target_flags
5869 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
5871 if (!ix86_arch_specified)
5872 opts->x_ix86_isa_flags
5873 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
5875 /* The i386 ABI does not specify a red zone. It still makes sense to use
5876 one when the programmer takes care to keep the stack from being destroyed. */
5877 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
5878 opts->x_target_flags |= MASK_NO_RED_ZONE;
5881 /* Keep nonleaf frame pointers. */
5882 if (opts->x_flag_omit_frame_pointer)
5883 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
5884 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
5885 opts->x_flag_omit_frame_pointer = 1;
5887 /* If we're doing fast math, we don't care about comparison order
5888 wrt NaNs. This lets us use a shorter comparison sequence. */
5889 if (opts->x_flag_finite_math_only)
5890 opts->x_target_flags &= ~MASK_IEEE_FP;
5892 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
5893 since the insns won't need emulation. */
5894 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
5895 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
5897 /* Likewise, if the target doesn't have a 387, or we've specified
5898 software floating point, don't use 387 inline intrinsics. */
5899 if (!TARGET_80387_P (opts->x_target_flags))
5900 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
5902 /* Turn on MMX builtins for -msse. */
5903 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
5904 opts->x_ix86_isa_flags
5905 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
5907 /* Enable SSE prefetch. */
5908 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
5909 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
5910 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
5911 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
5912 x86_prefetch_sse = true;
5914 /* Enable popcnt instruction for -msse4.2 or -mabm. */
5915 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
5916 || TARGET_ABM_P (opts->x_ix86_isa_flags))
5917 opts->x_ix86_isa_flags
5918 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
5920 /* Enable lzcnt instruction for -mabm. */
5921 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
5922 opts->x_ix86_isa_flags
5923 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
5925 /* Validate -mpreferred-stack-boundary= value or default it to
5926 PREFERRED_STACK_BOUNDARY_DEFAULT. */
5927 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
5928 if (opts_set->x_ix86_preferred_stack_boundary_arg)
5930 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
5931 int max = TARGET_SEH ? 4 : 12;
5933 if (opts->x_ix86_preferred_stack_boundary_arg < min
5934 || opts->x_ix86_preferred_stack_boundary_arg > max)
5936 if (min == max)
5937 error ("-mpreferred-stack-boundary is not supported "
5938 "for this target");
5939 else
5940 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
5941 opts->x_ix86_preferred_stack_boundary_arg, min, max);
5943 else
5944 ix86_preferred_stack_boundary
5945 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
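  /* The boundary arguments are log2 values in bytes, converted to bits
     here.  For example, -mpreferred-stack-boundary=4 yields
     (1 << 4) * BITS_PER_UNIT = 128, i.e. a 16-byte aligned stack; the
     permitted minimum is 3 (8 bytes) for 64-bit code and 2 (4 bytes) for
     32-bit code, per the min/max check above.  */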
5948 /* Set the default value for -mstackrealign. */
5949 if (opts->x_ix86_force_align_arg_pointer == -1)
5950 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
5952 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
5954 /* Validate -mincoming-stack-boundary= value or default it to
5955 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
5956 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
5957 if (opts_set->x_ix86_incoming_stack_boundary_arg)
5959 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
5961 if (opts->x_ix86_incoming_stack_boundary_arg < min
5962 || opts->x_ix86_incoming_stack_boundary_arg > 12)
5963 error ("-mincoming-stack-boundary=%d is not between %d and 12",
5964 opts->x_ix86_incoming_stack_boundary_arg, min);
5965 else
5967 ix86_user_incoming_stack_boundary
5968 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
5969 ix86_incoming_stack_boundary
5970 = ix86_user_incoming_stack_boundary;
5974 #ifndef NO_PROFILE_COUNTERS
5975 if (flag_nop_mcount)
5976 error ("-mnop-mcount is not compatible with this target");
5977 #endif
5978 if (flag_nop_mcount && flag_pic)
5979 error ("-mnop-mcount is not implemented for -fPIC");
5981 /* Accept -msseregparm only if at least SSE support is enabled. */
5982 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
5983 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
5984 error (main_args_p
5985 ? G_("%<-msseregparm%> used without SSE enabled")
5986 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
5988 if (opts_set->x_ix86_fpmath)
5990 if (opts->x_ix86_fpmath & FPMATH_SSE)
5992 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
5994 if (TARGET_80387_P (opts->x_target_flags))
5996 warning (0, "SSE instruction set disabled, using 387 arithmetics");
5997 opts->x_ix86_fpmath = FPMATH_387;
6000 else if ((opts->x_ix86_fpmath & FPMATH_387)
6001 && !TARGET_80387_P (opts->x_target_flags))
6003 warning (0, "387 instruction set disabled, using SSE arithmetics");
6004 opts->x_ix86_fpmath = FPMATH_SSE;
6008 /* For all chips supporting SSE2, -mfpmath=sse performs better than
6009 -mfpmath=387. The latter is nevertheless the default on many targets,
6010 since the extra 80-bit precision of temporaries is considered to be
6011 part of the ABI. Overwrite the default at least for -ffast-math.
6012 TODO: -mfpmath=both seems to produce similarly performing code with
6013 slightly smaller binaries. It is however not clear whether register
6014 allocation is ready for this setting.
6015 Also, -mfpmath=387 codegen is overall a lot more compact (about 4-5%)
6016 than SSE codegen. We may switch to 387 with -ffast-math for
6017 size-optimized functions. */
6018 else if (fast_math_flags_set_p (&global_options)
6019 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
6020 opts->x_ix86_fpmath = FPMATH_SSE;
6021 else
6022 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
6024 /* Use external vectorized library in vectorizing intrinsics. */
6025 if (opts_set->x_ix86_veclibabi_type)
6026 switch (opts->x_ix86_veclibabi_type)
6028 case ix86_veclibabi_type_svml:
6029 ix86_veclib_handler = ix86_veclibabi_svml;
6030 break;
6032 case ix86_veclibabi_type_acml:
6033 ix86_veclib_handler = ix86_veclibabi_acml;
6034 break;
6036 default:
6037 gcc_unreachable ();
6040 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
6041 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6042 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6044 /* If stack probes are required, the space used for large function
6045 arguments on the stack must also be probed, so enable
6046 -maccumulate-outgoing-args so this happens in the prologue. */
6047 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
6048 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6050 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6051 warning (0,
6052 main_args_p
6053 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
6054 "for correctness")
6055 : G_("stack probing requires "
6056 "%<target(\"accumulate-outgoing-args\")%> for "
6057 "correctness"));
6058 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6061 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
6062 so enable -maccumulate-outgoing-args when %ebp is fixed. */
6063 if (fixed_regs[BP_REG]
6064 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6066 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6067 warning (0,
6068 main_args_p
6069 ? G_("fixed ebp register requires "
6070 "%<-maccumulate-outgoing-args%>")
6071 : G_("fixed ebp register requires "
6072 "%<target(\"accumulate-outgoing-args\")%>"));
6073 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6076 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
6078 char *p;
6079 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
6080 p = strchr (internal_label_prefix, 'X');
6081 internal_label_prefix_len = p - internal_label_prefix;
6082 *p = '\0';
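  /* For instance, on a typical ELF configuration
     ASM_GENERATE_INTERNAL_LABEL is expected to produce something like
     "*.LX0" here, so internal_label_prefix would become "*.L" with
     internal_label_prefix_len == 3 (everything up to the 'X'
     placeholder).  */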
6085 /* When the scheduling description is not available, disable the scheduler
6086 pass so it won't slow down the compilation and make x87 code slower. */
6087 if (!TARGET_SCHEDULE)
6088 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
6090 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
6091 ix86_tune_cost->simultaneous_prefetches,
6092 opts->x_param_values,
6093 opts_set->x_param_values);
6094 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
6095 ix86_tune_cost->prefetch_block,
6096 opts->x_param_values,
6097 opts_set->x_param_values);
6098 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
6099 ix86_tune_cost->l1_cache_size,
6100 opts->x_param_values,
6101 opts_set->x_param_values);
6102 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
6103 ix86_tune_cost->l2_cache_size,
6104 opts->x_param_values,
6105 opts_set->x_param_values);
6107 /* Restrict number of if-converted SET insns to 1. */
6108 if (TARGET_ONE_IF_CONV_INSN)
6109 maybe_set_param_value (PARAM_MAX_RTL_IF_CONVERSION_INSNS,
6110 1,
6111 opts->x_param_values,
6112 opts_set->x_param_values);
6114 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
6115 if (opts->x_flag_prefetch_loop_arrays < 0
6116 && HAVE_prefetch
6117 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
6118 && !opts->x_optimize_size
6119 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
6120 opts->x_flag_prefetch_loop_arrays = 1;
6122 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
6123 can be optimized to ap = __builtin_next_arg (0). */
6124 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
6125 targetm.expand_builtin_va_start = NULL;
6127 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6129 ix86_gen_leave = gen_leave_rex64;
6130 if (Pmode == DImode)
6132 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
6133 ix86_gen_tls_local_dynamic_base_64
6134 = gen_tls_local_dynamic_base_64_di;
6136 else
6138 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
6139 ix86_gen_tls_local_dynamic_base_64
6140 = gen_tls_local_dynamic_base_64_si;
6143 else
6144 ix86_gen_leave = gen_leave;
6146 if (Pmode == DImode)
6148 ix86_gen_add3 = gen_adddi3;
6149 ix86_gen_sub3 = gen_subdi3;
6150 ix86_gen_sub3_carry = gen_subdi3_carry;
6151 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
6152 ix86_gen_andsp = gen_anddi3;
6153 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
6154 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
6155 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
6156 ix86_gen_monitor = gen_sse3_monitor_di;
6157 ix86_gen_monitorx = gen_monitorx_di;
6158 ix86_gen_clzero = gen_clzero_di;
6160 else
6162 ix86_gen_add3 = gen_addsi3;
6163 ix86_gen_sub3 = gen_subsi3;
6164 ix86_gen_sub3_carry = gen_subsi3_carry;
6165 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6166 ix86_gen_andsp = gen_andsi3;
6167 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6168 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6169 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6170 ix86_gen_monitor = gen_sse3_monitor_si;
6171 ix86_gen_monitorx = gen_monitorx_si;
6172 ix86_gen_clzero = gen_clzero_si;
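  /* These ix86_gen_* hooks let later code emit pointer-mode RTL without
     checking Pmode each time.  For example, with the default 64-bit ABI
     Pmode is DImode and ix86_gen_add3 points at gen_adddi3, while under
     -mx32 Pmode defaults to SImode (see the ix86_pmode handling above)
     and the *si variants are used instead.  */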
6175 #ifdef USE_IX86_CLD
6176 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6177 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6178 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6179 #endif
6181 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
6183 if (opts->x_flag_fentry > 0)
6184 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6185 "with -fpic");
6186 opts->x_flag_fentry = 0;
6188 else if (TARGET_SEH)
6190 if (opts->x_flag_fentry == 0)
6191 sorry ("-mno-fentry isn%'t compatible with SEH");
6192 opts->x_flag_fentry = 1;
6194 else if (opts->x_flag_fentry < 0)
6196 #if defined(PROFILE_BEFORE_PROLOGUE)
6197 opts->x_flag_fentry = 1;
6198 #else
6199 opts->x_flag_fentry = 0;
6200 #endif
6203 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6204 opts->x_target_flags |= MASK_VZEROUPPER;
6205 if (!(opts_set->x_target_flags & MASK_STV))
6206 opts->x_target_flags |= MASK_STV;
6207 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6208 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
6209 stack realignment would be extra cost the pass doesn't take into
6210 account, and the pass can't realign the stack. */
6211 if (ix86_preferred_stack_boundary < 128
6212 || ix86_incoming_stack_boundary < 128
6213 || opts->x_ix86_force_align_arg_pointer)
6214 opts->x_target_flags &= ~MASK_STV;
6215 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6216 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6217 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6218 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6219 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6220 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6221 /* Enable 128-bit AVX instruction generation
6222 for the auto-vectorizer. */
6223 if (TARGET_AVX128_OPTIMAL
6224 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6225 opts->x_target_flags |= MASK_PREFER_AVX128;
6227 if (opts->x_ix86_recip_name)
6229 char *p = ASTRDUP (opts->x_ix86_recip_name);
6230 char *q;
6231 unsigned int mask, i;
6232 bool invert;
6234 while ((q = strtok (p, ",")) != NULL)
6236 p = NULL;
6237 if (*q == '!')
6239 invert = true;
6240 q++;
6242 else
6243 invert = false;
6245 if (!strcmp (q, "default"))
6246 mask = RECIP_MASK_ALL;
6247 else
6249 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6250 if (!strcmp (q, recip_options[i].string))
6252 mask = recip_options[i].mask;
6253 break;
6256 if (i == ARRAY_SIZE (recip_options))
6258 error ("unknown option for -mrecip=%s", q);
6259 invert = false;
6260 mask = RECIP_MASK_NONE;
6264 opts->x_recip_mask_explicit |= mask;
6265 if (invert)
6266 opts->x_recip_mask &= ~mask;
6267 else
6268 opts->x_recip_mask |= mask;
6272 if (TARGET_RECIP_P (opts->x_target_flags))
6273 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6274 else if (opts_set->x_target_flags & MASK_RECIP)
6275 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
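  /* Example of the -mrecip= parsing above: -mrecip=all,!sqrt first sets
     every bit of RECIP_MASK_ALL and then clears RECIP_MASK_SQRT, so
     reciprocal approximations are used everywhere except for scalar
     square roots; a bare -mrecip (handled just above) enables
     RECIP_MASK_ALL except for bits that were set or cleared explicitly
     via the string form.  */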
6277 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6278 for 64-bit Bionic. Also default long double to 64-bit for Intel
6279 MCU psABI. */
6280 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6281 && !(opts_set->x_target_flags
6282 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6283 opts->x_target_flags |= (TARGET_64BIT
6284 ? MASK_LONG_DOUBLE_128
6285 : MASK_LONG_DOUBLE_64);
6287 /* Only one of them can be active. */
6288 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6289 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6291 /* Save the initial options in case the user uses function-specific
6292 options. */
6293 if (main_args_p)
6294 target_option_default_node = target_option_current_node
6295 = build_target_option_node (opts);
6297 /* Handle stack protector */
6298 if (!opts_set->x_ix86_stack_protector_guard)
6299 opts->x_ix86_stack_protector_guard
6300 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6302 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
6303 if (opts->x_ix86_tune_memcpy_strategy)
6305 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6306 ix86_parse_stringop_strategy_string (str, false);
6307 free (str);
6310 if (opts->x_ix86_tune_memset_strategy)
6312 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6313 ix86_parse_stringop_strategy_string (str, true);
6314 free (str);
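  /* The strategy strings parsed above are expected to follow the
     documented -mmemcpy-strategy=/-mmemset-strategy= syntax, i.e. a
     comma-separated list of alg:max_size:dest_align triplets such as
     "unrolled_loop:256:noalign,libcall:-1:noalign" (an illustrative
     value, not a recommendation); ix86_parse_stringop_strategy_string
     is what actually validates the individual fields.  */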
6317 return true;
6320 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6322 static void
6323 ix86_option_override (void)
6325 ix86_option_override_internal (true, &global_options, &global_options_set);
6328 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6329 static char *
6330 ix86_offload_options (void)
6332 if (TARGET_LP64)
6333 return xstrdup ("-foffload-abi=lp64");
6334 return xstrdup ("-foffload-abi=ilp32");
6337 /* Update register usage after having seen the compiler flags. */
6339 static void
6340 ix86_conditional_register_usage (void)
6342 int i, c_mask;
6344 /* If there are no caller-saved registers, preserve all registers,
6345 except fixed_regs and the registers used for the function return
6346 value, since aggregate_value_p checks call_used_regs[regno] on the
6347 return value. */
6348 if (cfun && cfun->machine->no_caller_saved_registers)
6349 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6350 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6351 call_used_regs[i] = 0;
6353 /* For 32-bit targets, squash the REX registers. */
6354 if (! TARGET_64BIT)
6356 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6357 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6358 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6359 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6360 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6361 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6364 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6365 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
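  /* CALL_USED_REGISTERS in i386.h may mark some registers with values
     greater than 1, meaning "call-used only under certain ABIs"; c_mask
     selects which of those conditional bits apply, e.g. picking the
     64-bit MS ABI variant when TARGET_64BIT_MS_ABI.  The loop below then
     reduces each such entry to a plain 0/1 via the mask.  */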
6367 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6369 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6371 /* Set/reset conditionally defined registers from
6372 CALL_USED_REGISTERS initializer. */
6373 if (call_used_regs[i] > 1)
6374 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6376 /* Compute the CLOBBERED_REGS register set as the call-used
6377 registers from the GENERAL_REGS register set. */
6378 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6379 && call_used_regs[i])
6380 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6383 /* If MMX is disabled, squash the registers. */
6384 if (! TARGET_MMX)
6385 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6386 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6387 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6389 /* If SSE is disabled, squash the registers. */
6390 if (! TARGET_SSE)
6391 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6392 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6393 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6395 /* If the FPU is disabled, squash the registers. */
6396 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6397 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6398 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6399 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6401 /* If AVX512F is disabled, squash the registers. */
6402 if (! TARGET_AVX512F)
6404 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6405 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6407 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6408 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6411 /* If MPX is disabled, squash the registers. */
6412 if (! TARGET_MPX)
6413 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6414 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6418 /* Save the current options */
6420 static void
6421 ix86_function_specific_save (struct cl_target_option *ptr,
6422 struct gcc_options *opts)
6424 ptr->arch = ix86_arch;
6425 ptr->schedule = ix86_schedule;
6426 ptr->prefetch_sse = x86_prefetch_sse;
6427 ptr->tune = ix86_tune;
6428 ptr->branch_cost = ix86_branch_cost;
6429 ptr->tune_defaulted = ix86_tune_defaulted;
6430 ptr->arch_specified = ix86_arch_specified;
6431 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6432 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
6433 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6434 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6435 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6436 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6437 ptr->x_ix86_abi = opts->x_ix86_abi;
6438 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6439 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6440 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6441 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6442 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6443 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6444 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6445 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6446 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6447 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6448 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6449 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6450 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6451 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6452 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6453 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6454 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6455 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6456 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6457 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6459 /* The fields are char but the variables are not; make sure the
6460 values fit in the fields. */
6461 gcc_assert (ptr->arch == ix86_arch);
6462 gcc_assert (ptr->schedule == ix86_schedule);
6463 gcc_assert (ptr->tune == ix86_tune);
6464 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6467 /* Restore the current options */
6469 static void
6470 ix86_function_specific_restore (struct gcc_options *opts,
6471 struct cl_target_option *ptr)
6473 enum processor_type old_tune = ix86_tune;
6474 enum processor_type old_arch = ix86_arch;
6475 unsigned int ix86_arch_mask;
6476 int i;
6478 /* We don't change -fPIC. */
6479 opts->x_flag_pic = flag_pic;
6481 ix86_arch = (enum processor_type) ptr->arch;
6482 ix86_schedule = (enum attr_cpu) ptr->schedule;
6483 ix86_tune = (enum processor_type) ptr->tune;
6484 x86_prefetch_sse = ptr->prefetch_sse;
6485 opts->x_ix86_branch_cost = ptr->branch_cost;
6486 ix86_tune_defaulted = ptr->tune_defaulted;
6487 ix86_arch_specified = ptr->arch_specified;
6488 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6489 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
6490 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6491 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6492 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6493 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6494 opts->x_ix86_abi = ptr->x_ix86_abi;
6495 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6496 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6497 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6498 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6499 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6500 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6501 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6502 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6503 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6504 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6505 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6506 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6507 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6508 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6509 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6510 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6511 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6512 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6513 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6514 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6515 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6516 /* TODO: ix86_cost should be chosen at instruction or function granularity
6517 so for cold code we use size_cost even in !optimize_size compilation. */
6518 if (opts->x_optimize_size)
6519 ix86_cost = &ix86_size_cost;
6520 else
6521 ix86_cost = ix86_tune_cost;
6523 /* Recreate the arch feature tests if the arch changed */
6524 if (old_arch != ix86_arch)
6526 ix86_arch_mask = 1u << ix86_arch;
6527 for (i = 0; i < X86_ARCH_LAST; ++i)
6528 ix86_arch_features[i]
6529 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6532 /* Recreate the tune optimization tests */
6533 if (old_tune != ix86_tune)
6534 set_ix86_tune_features (ix86_tune, false);
6537 /* Adjust target options after streaming them in. This is mainly about
6538 reconciling them with global options. */
6540 static void
6541 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6543 /* flag_pic is a global option, but ix86_cmodel is a target saved option
6544 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6545 for PIC, or error out. */
6546 if (flag_pic)
6547 switch (ptr->x_ix86_cmodel)
6549 case CM_SMALL:
6550 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6551 break;
6553 case CM_MEDIUM:
6554 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6555 break;
6557 case CM_LARGE:
6558 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6559 break;
6561 case CM_KERNEL:
6562 error ("code model %s does not support PIC mode", "kernel");
6563 break;
6565 default:
6566 break;
6568 else
6569 switch (ptr->x_ix86_cmodel)
6571 case CM_SMALL_PIC:
6572 ptr->x_ix86_cmodel = CM_SMALL;
6573 break;
6575 case CM_MEDIUM_PIC:
6576 ptr->x_ix86_cmodel = CM_MEDIUM;
6577 break;
6579 case CM_LARGE_PIC:
6580 ptr->x_ix86_cmodel = CM_LARGE;
6581 break;
6583 default:
6584 break;
6588 /* Print the current options */
6590 static void
6591 ix86_function_specific_print (FILE *file, int indent,
6592 struct cl_target_option *ptr)
6594 char *target_string
6595 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
6596 ptr->x_target_flags, ptr->x_ix86_target_flags,
6597 NULL, NULL, ptr->x_ix86_fpmath, false);
6599 gcc_assert (ptr->arch < PROCESSOR_max);
6600 fprintf (file, "%*sarch = %d (%s)\n",
6601 indent, "",
6602 ptr->arch, processor_target_table[ptr->arch].name);
6604 gcc_assert (ptr->tune < PROCESSOR_max);
6605 fprintf (file, "%*stune = %d (%s)\n",
6606 indent, "",
6607 ptr->tune, processor_target_table[ptr->tune].name);
6609 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
6611 if (target_string)
6613 fprintf (file, "%*s%s\n", indent, "", target_string);
6614 free (target_string);
6619 /* Inner function to process the attribute((target(...))); it takes an argument
6620 and sets the current options from the argument. If we have a list, recursively
6621 go over the list. */
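/* An illustrative, hedged example (the declaration and the name "foo" are
   hypothetical, not from this file): a declaration such as
     int foo (int) __attribute__ ((target ("avx2,no-fma,arch=haswell")));
   reaches this function with ARGS holding the string
   "avx2,no-fma,arch=haswell"; the loop below splits it on commas, strips
   a leading "no-" to clear an option, and matches each piece against the
   attrs[] table.  */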
6623 static bool
6624 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
6625 struct gcc_options *opts,
6626 struct gcc_options *opts_set,
6627 struct gcc_options *enum_opts_set)
6629 char *next_optstr;
6630 bool ret = true;
6632 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
6633 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
6634 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
6635 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
6636 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
6638 enum ix86_opt_type
6640 ix86_opt_unknown,
6641 ix86_opt_yes,
6642 ix86_opt_no,
6643 ix86_opt_str,
6644 ix86_opt_enum,
6645 ix86_opt_isa
6648 static const struct
6650 const char *string;
6651 size_t len;
6652 enum ix86_opt_type type;
6653 int opt;
6654 int mask;
6655 } attrs[] = {
6656 /* isa options */
6657 IX86_ATTR_ISA ("sgx", OPT_msgx),
6658 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
6659 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
6660 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
6662 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
6663 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
6664 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
6665 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
6666 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
6667 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
6668 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
6669 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
6670 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
6671 IX86_ATTR_ISA ("avx2", OPT_mavx2),
6672 IX86_ATTR_ISA ("fma", OPT_mfma),
6673 IX86_ATTR_ISA ("xop", OPT_mxop),
6674 IX86_ATTR_ISA ("fma4", OPT_mfma4),
6675 IX86_ATTR_ISA ("f16c", OPT_mf16c),
6676 IX86_ATTR_ISA ("avx", OPT_mavx),
6677 IX86_ATTR_ISA ("sse4", OPT_msse4),
6678 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
6679 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
6680 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
6681 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
6682 IX86_ATTR_ISA ("sse3", OPT_msse3),
6683 IX86_ATTR_ISA ("aes", OPT_maes),
6684 IX86_ATTR_ISA ("sha", OPT_msha),
6685 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
6686 IX86_ATTR_ISA ("sse2", OPT_msse2),
6687 IX86_ATTR_ISA ("sse", OPT_msse),
6688 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
6689 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
6690 IX86_ATTR_ISA ("mmx", OPT_mmmx),
6691 IX86_ATTR_ISA ("rtm", OPT_mrtm),
6692 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
6693 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
6694 IX86_ATTR_ISA ("adx", OPT_madx),
6695 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
6696 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
6697 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
6698 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
6699 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
6700 IX86_ATTR_ISA ("xsave", OPT_mxsave),
6701 IX86_ATTR_ISA ("abm", OPT_mabm),
6702 IX86_ATTR_ISA ("bmi", OPT_mbmi),
6703 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
6704 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
6705 IX86_ATTR_ISA ("tbm", OPT_mtbm),
6706 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
6707 IX86_ATTR_ISA ("cx16", OPT_mcx16),
6708 IX86_ATTR_ISA ("sahf", OPT_msahf),
6709 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
6710 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
6711 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
6712 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
6713 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
6714 IX86_ATTR_ISA ("clzero", OPT_mclzero),
6715 IX86_ATTR_ISA ("pku", OPT_mpku),
6716 IX86_ATTR_ISA ("lwp", OPT_mlwp),
6717 IX86_ATTR_ISA ("hle", OPT_mhle),
6718 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
6719 IX86_ATTR_ISA ("mpx", OPT_mmpx),
6720 IX86_ATTR_ISA ("clwb", OPT_mclwb),
6721 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
6723 /* enum options */
6724 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
6726 /* string options */
6727 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
6728 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
6730 /* flag options */
6731 IX86_ATTR_YES ("cld",
6732 OPT_mcld,
6733 MASK_CLD),
6735 IX86_ATTR_NO ("fancy-math-387",
6736 OPT_mfancy_math_387,
6737 MASK_NO_FANCY_MATH_387),
6739 IX86_ATTR_YES ("ieee-fp",
6740 OPT_mieee_fp,
6741 MASK_IEEE_FP),
6743 IX86_ATTR_YES ("inline-all-stringops",
6744 OPT_minline_all_stringops,
6745 MASK_INLINE_ALL_STRINGOPS),
6747 IX86_ATTR_YES ("inline-stringops-dynamically",
6748 OPT_minline_stringops_dynamically,
6749 MASK_INLINE_STRINGOPS_DYNAMICALLY),
6751 IX86_ATTR_NO ("align-stringops",
6752 OPT_mno_align_stringops,
6753 MASK_NO_ALIGN_STRINGOPS),
6755 IX86_ATTR_YES ("recip",
6756 OPT_mrecip,
6757 MASK_RECIP),
6761 /* If this is a list, recurse to get the options. */
6762 if (TREE_CODE (args) == TREE_LIST)
6764 bool ret = true;
6766 for (; args; args = TREE_CHAIN (args))
6767 if (TREE_VALUE (args)
6768 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
6769 p_strings, opts, opts_set,
6770 enum_opts_set))
6771 ret = false;
6773 return ret;
6776 else if (TREE_CODE (args) != STRING_CST)
6778 error ("attribute %<target%> argument not a string");
6779 return false;
6782 /* Handle multiple arguments separated by commas. */
6783 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
6785 while (next_optstr && *next_optstr != '\0')
6787 char *p = next_optstr;
6788 char *orig_p = p;
6789 char *comma = strchr (next_optstr, ',');
6790 const char *opt_string;
6791 size_t len, opt_len;
6792 int opt;
6793 bool opt_set_p;
6794 char ch;
6795 unsigned i;
6796 enum ix86_opt_type type = ix86_opt_unknown;
6797 int mask = 0;
6799 if (comma)
6801 *comma = '\0';
6802 len = comma - next_optstr;
6803 next_optstr = comma + 1;
6805 else
6807 len = strlen (p);
6808 next_optstr = NULL;
6811 /* Recognize no-xxx. */
6812 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
6814 opt_set_p = false;
6815 p += 3;
6816 len -= 3;
6818 else
6819 opt_set_p = true;
6821 /* Find the option. */
6822 ch = *p;
6823 opt = N_OPTS;
6824 for (i = 0; i < ARRAY_SIZE (attrs); i++)
6826 type = attrs[i].type;
6827 opt_len = attrs[i].len;
6828 if (ch == attrs[i].string[0]
6829 && ((type != ix86_opt_str && type != ix86_opt_enum)
6830 ? len == opt_len
6831 : len > opt_len)
6832 && memcmp (p, attrs[i].string, opt_len) == 0)
6834 opt = attrs[i].opt;
6835 mask = attrs[i].mask;
6836 opt_string = attrs[i].string;
6837 break;
6841 /* Process the option. */
6842 if (opt == N_OPTS)
6844 error ("attribute(target(\"%s\")) is unknown", orig_p);
6845 ret = false;
6848 else if (type == ix86_opt_isa)
6850 struct cl_decoded_option decoded;
6852 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
6853 ix86_handle_option (opts, opts_set,
6854 &decoded, input_location);
6857 else if (type == ix86_opt_yes || type == ix86_opt_no)
6859 if (type == ix86_opt_no)
6860 opt_set_p = !opt_set_p;
6862 if (opt_set_p)
6863 opts->x_target_flags |= mask;
6864 else
6865 opts->x_target_flags &= ~mask;
6868 else if (type == ix86_opt_str)
6870 if (p_strings[opt])
6872 error ("option(\"%s\") was already specified", opt_string);
6873 ret = false;
6875 else
6876 p_strings[opt] = xstrdup (p + opt_len);
6879 else if (type == ix86_opt_enum)
6881 bool arg_ok;
6882 int value;
6884 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
6885 if (arg_ok)
6886 set_option (opts, enum_opts_set, opt, value,
6887 p + opt_len, DK_UNSPECIFIED, input_location,
6888 global_dc);
6889 else
6891 error ("attribute(target(\"%s\")) is unknown", orig_p);
6892 ret = false;
6896 else
6897 gcc_unreachable ();
6900 return ret;
6903 /* Release allocated strings. */
6904 static void
6905 release_options_strings (char **option_strings)
6907 /* Free up memory allocated to hold the strings */
6908 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
6909 free (option_strings[i]);
6912 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
6914 tree
6915 ix86_valid_target_attribute_tree (tree args,
6916 struct gcc_options *opts,
6917 struct gcc_options *opts_set)
6919 const char *orig_arch_string = opts->x_ix86_arch_string;
6920 const char *orig_tune_string = opts->x_ix86_tune_string;
6921 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
6922 int orig_tune_defaulted = ix86_tune_defaulted;
6923 int orig_arch_specified = ix86_arch_specified;
6924 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
6925 tree t = NULL_TREE;
6926 struct cl_target_option *def
6927 = TREE_TARGET_OPTION (target_option_default_node);
6928 struct gcc_options enum_opts_set;
6930 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
6932 /* Process each of the options on the chain. */
6933 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
6934 opts_set, &enum_opts_set))
6935 return error_mark_node;
6937 /* If the changed options are different from the default, rerun
6938 ix86_option_override_internal, and then save the options away.
6939 The string options are attribute options, and will be undone
6940 when we copy the save structure. */
6941 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
6942 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
6943 || opts->x_target_flags != def->x_target_flags
6944 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
6945 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
6946 || enum_opts_set.x_ix86_fpmath)
6948 /* If we are using the default tune= or arch=, undo the string assigned,
6949 and use the default. */
6950 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
6952 opts->x_ix86_arch_string
6953 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
6955 /* If arch= is set, clear all bits in x_ix86_isa_flags,
6956 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
6957 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
6958 | OPTION_MASK_ABI_64
6959 | OPTION_MASK_ABI_X32
6960 | OPTION_MASK_CODE16);
6961 opts->x_ix86_isa_flags2 = 0;
6963 else if (!orig_arch_specified)
6964 opts->x_ix86_arch_string = NULL;
6966 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
6967 opts->x_ix86_tune_string
6968 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
6969 else if (orig_tune_defaulted)
6970 opts->x_ix86_tune_string = NULL;
6972 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
6973 if (enum_opts_set.x_ix86_fpmath)
6974 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6975 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6976 && TARGET_SSE_P (opts->x_ix86_isa_flags))
6978 if (TARGET_80387_P (opts->x_target_flags))
6979 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE
6980 | FPMATH_387);
6981 else
6982 opts->x_ix86_fpmath = (enum fpmath_unit) FPMATH_SSE;
6983 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6986 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
6987 bool r = ix86_option_override_internal (false, opts, opts_set);
6988 if (!r)
6990 release_options_strings (option_strings);
6991 return error_mark_node;
6994 /* Add any builtin functions with the new isa if any. */
6995 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
6997 /* Save the current options unless we are validating options for
6998 #pragma. */
6999 t = build_target_option_node (opts);
7001 opts->x_ix86_arch_string = orig_arch_string;
7002 opts->x_ix86_tune_string = orig_tune_string;
7003 opts_set->x_ix86_fpmath = orig_fpmath_set;
7005 release_options_strings (option_strings);
7008 return t;
7011 /* Hook to validate attribute((target("string"))). */
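/* Illustration only (hypothetical declarations): with function
   multi-versioning a translation unit may provide
     __attribute__ ((target ("default"))) int foo (void);
     __attribute__ ((target ("avx2")))    int foo (void);
   The "default" version is accepted immediately below without building a
   new target node; the other versions go through
   ix86_valid_target_attribute_tree.  */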
7013 static bool
7014 ix86_valid_target_attribute_p (tree fndecl,
7015 tree ARG_UNUSED (name),
7016 tree args,
7017 int ARG_UNUSED (flags))
7019 struct gcc_options func_options;
7020 tree new_target, new_optimize;
7021 bool ret = true;
7023 /* attribute((target("default"))) does nothing, beyond
7024 affecting multi-versioning. */
7025 if (TREE_VALUE (args)
7026 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
7027 && TREE_CHAIN (args) == NULL_TREE
7028 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
7029 return true;
7031 tree old_optimize = build_optimization_node (&global_options);
7033 /* Get the optimization options of the current function. */
7034 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
7036 if (!func_optimize)
7037 func_optimize = old_optimize;
7039 /* Init func_options. */
7040 memset (&func_options, 0, sizeof (func_options));
7041 init_options_struct (&func_options, NULL);
7042 lang_hooks.init_options_struct (&func_options);
7044 cl_optimization_restore (&func_options,
7045 TREE_OPTIMIZATION (func_optimize));
7047 /* Initialize func_options to the default before its target options can
7048 be set. */
7049 cl_target_option_restore (&func_options,
7050 TREE_TARGET_OPTION (target_option_default_node));
7052 new_target = ix86_valid_target_attribute_tree (args, &func_options,
7053 &global_options_set);
7055 new_optimize = build_optimization_node (&func_options);
7057 if (new_target == error_mark_node)
7058 ret = false;
7060 else if (fndecl && new_target)
7062 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
7064 if (old_optimize != new_optimize)
7065 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
7068 finalize_options_struct (&func_options);
7070 return ret;
7074 /* Hook to determine if one function can safely inline another. */
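/* A rough example of the rule implemented below: a caller built with
   -mavx2 may inline a callee declared __attribute__ ((target ("sse4.2"))),
   since the callee's ISA flags are a subset of the caller's, but the
   reverse combination is rejected.  */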
7076 static bool
7077 ix86_can_inline_p (tree caller, tree callee)
7079 bool ret = false;
7080 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
7081 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
7083 /* If callee has no option attributes, then it is ok to inline. */
7084 if (!callee_tree)
7085 ret = true;
7087 /* If caller has no option attributes, but callee does, then it is not ok to
7088 inline. */
7089 else if (!caller_tree)
7090 ret = false;
7092 else
7094 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
7095 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
7097 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
7098 function can inline an SSE2 function but an SSE2 function can't inline
7099 an SSE4 function. */
7100 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
7101 != callee_opts->x_ix86_isa_flags)
7102 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
7103 != callee_opts->x_ix86_isa_flags2))
7104 ret = false;
7106 /* See if we have the same non-isa options. */
7107 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
7108 ret = false;
7110 /* See if arch, tune, etc. are the same. */
7111 else if (caller_opts->arch != callee_opts->arch)
7112 ret = false;
7114 else if (caller_opts->tune != callee_opts->tune)
7115 ret = false;
7117 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
7118 ret = false;
7120 else if (caller_opts->branch_cost != callee_opts->branch_cost)
7121 ret = false;
7123 else
7124 ret = true;
7127 return ret;
7131 /* Remember the last target of ix86_set_current_function. */
7132 static GTY(()) tree ix86_previous_fndecl;
7134 /* Set target globals to the default (or current #pragma GCC target
7135 if active). Invalidate ix86_previous_fndecl cache. */
7137 void
7138 ix86_reset_previous_fndecl (void)
7140 tree new_tree = target_option_current_node;
7141 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7142 if (TREE_TARGET_GLOBALS (new_tree))
7143 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7144 else if (new_tree == target_option_default_node)
7145 restore_target_globals (&default_target_globals);
7146 else
7147 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7148 ix86_previous_fndecl = NULL_TREE;
7151 /* Set the func_type field from the function FNDECL. */
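/* Illustrative prototypes, following the documented "interrupt" attribute
   (the handler names are placeholders):
     void __attribute__ ((interrupt))
     int_handler (struct interrupt_frame *frame);               one argument  -> TYPE_INTERRUPT
     void __attribute__ ((interrupt))
     exc_handler (struct interrupt_frame *frame, uword_t code); two arguments -> TYPE_EXCEPTION  */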
7153 static void
7154 ix86_set_func_type (tree fndecl)
7156 if (cfun->machine->func_type == TYPE_UNKNOWN)
7158 if (lookup_attribute ("interrupt",
7159 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7161 int nargs = 0;
7162 for (tree arg = DECL_ARGUMENTS (fndecl);
7163 arg;
7164 arg = TREE_CHAIN (arg))
7165 nargs++;
7166 cfun->machine->no_caller_saved_registers = true;
7167 cfun->machine->func_type
7168 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7170 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7172 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7173 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7174 sorry ("Only DWARF debug format is supported for interrupt "
7175 "service routine.");
7177 else
7179 cfun->machine->func_type = TYPE_NORMAL;
7180 if (lookup_attribute ("no_caller_saved_registers",
7181 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7182 cfun->machine->no_caller_saved_registers = true;
7187 /* Establish appropriate back-end context for processing the function
7188 FNDECL. The argument might be NULL to indicate processing at top
7189 level, outside of any function scope. */
7190 static void
7191 ix86_set_current_function (tree fndecl)
7193 /* Only change the context if the function changes. This hook is called
7194 several times in the course of compiling a function, and we don't want to
7195 slow things down too much or call target_reinit when it isn't safe. */
7196 if (fndecl == ix86_previous_fndecl)
7198 /* There may be 2 function bodies for the same function FNDECL,
7199 one is extern inline and one isn't. Call ix86_set_func_type
7200 to set the func_type field. */
7201 if (fndecl != NULL_TREE)
7202 ix86_set_func_type (fndecl);
7203 return;
7206 tree old_tree;
7207 if (ix86_previous_fndecl == NULL_TREE)
7208 old_tree = target_option_current_node;
7209 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7210 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7211 else
7212 old_tree = target_option_default_node;
7214 if (fndecl == NULL_TREE)
7216 if (old_tree != target_option_current_node)
7217 ix86_reset_previous_fndecl ();
7218 return;
7221 ix86_set_func_type (fndecl);
7223 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7224 if (new_tree == NULL_TREE)
7225 new_tree = target_option_default_node;
7227 if (old_tree != new_tree)
7229 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7230 if (TREE_TARGET_GLOBALS (new_tree))
7231 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7232 else if (new_tree == target_option_default_node)
7233 restore_target_globals (&default_target_globals);
7234 else
7235 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7237 ix86_previous_fndecl = fndecl;
7239 static bool prev_no_caller_saved_registers;
7241 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7242 Avoid expensive re-initialization of init_regs each time we switch
7243 function context. */
7244 if (TARGET_64BIT
7245 && (call_used_regs[SI_REG]
7246 == (cfun->machine->call_abi == MS_ABI)))
7247 reinit_regs ();
7248 /* Need to re-initialize init_regs if caller-saved registers are
7249 changed. */
7250 else if (prev_no_caller_saved_registers
7251 != cfun->machine->no_caller_saved_registers)
7252 reinit_regs ();
7254 if (cfun->machine->func_type != TYPE_NORMAL
7255 || cfun->machine->no_caller_saved_registers)
7257 /* Don't allow MPX, SSE, MMX or x87 instructions since they
7258 may change processor state. */
7259 const char *isa;
7260 if (TARGET_MPX)
7261 isa = "MPX";
7262 else if (TARGET_SSE)
7263 isa = "SSE";
7264 else if (TARGET_MMX)
7265 isa = "MMX/3Dnow";
7266 else if (TARGET_80387)
7267 isa = "80387";
7268 else
7269 isa = NULL;
7270 if (isa != NULL)
7272 if (cfun->machine->func_type != TYPE_NORMAL)
7273 sorry ("%s instructions aren't allowed in %s service routine",
7274 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7275 ? "exception" : "interrupt"));
7276 else
7277 sorry ("%s instructions aren't allowed in function with "
7278 "no_caller_saved_registers attribute", isa);
7279 /* Don't issue the same error twice. */
7280 cfun->machine->func_type = TYPE_NORMAL;
7281 cfun->machine->no_caller_saved_registers = false;
7285 prev_no_caller_saved_registers
7286 = cfun->machine->no_caller_saved_registers;
7290 /* Return true if this goes in large data/bss. */
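/* Informal summary of the test below: only the medium code models use the
   large data sections; global objects bigger than the -mlarge-data-threshold
   value (ix86_section_threshold), or of unknown size, go into .ldata/.lbss,
   while everything else stays in the ordinary sections.  */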
7292 static bool
7293 ix86_in_large_data_p (tree exp)
7295 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7296 return false;
7298 if (exp == NULL_TREE)
7299 return false;
7301 /* Functions are never large data. */
7302 if (TREE_CODE (exp) == FUNCTION_DECL)
7303 return false;
7305 /* Automatic variables are never large data. */
7306 if (VAR_P (exp) && !is_global_var (exp))
7307 return false;
7309 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
7311 const char *section = DECL_SECTION_NAME (exp);
7312 if (strcmp (section, ".ldata") == 0
7313 || strcmp (section, ".lbss") == 0)
7314 return true;
7315 return false;
7317 else
7319 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7321 /* If this is an incomplete type with size 0, then we can't put it
7322 in data because it might be too big when completed. Also,
7323 int_size_in_bytes returns -1 if the size can vary or is larger than
7324 an integer, in which case it is also safer to assume that it goes in
7325 large data. */
7326 if (size <= 0 || size > ix86_section_threshold)
7327 return true;
7330 return false;
7333 /* i386-specific section flag to mark large sections. */
7334 #define SECTION_LARGE SECTION_MACH_DEP
7336 /* Switch to the appropriate section for output of DECL.
7337 DECL is either a `VAR_DECL' node or a constant of some sort.
7338 RELOC indicates whether forming the initial value of DECL requires
7339 link-time relocations. */
7341 ATTRIBUTE_UNUSED static section *
7342 x86_64_elf_select_section (tree decl, int reloc,
7343 unsigned HOST_WIDE_INT align)
7345 if (ix86_in_large_data_p (decl))
7347 const char *sname = NULL;
7348 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7349 switch (categorize_decl_for_section (decl, reloc))
7351 case SECCAT_DATA:
7352 sname = ".ldata";
7353 break;
7354 case SECCAT_DATA_REL:
7355 sname = ".ldata.rel";
7356 break;
7357 case SECCAT_DATA_REL_LOCAL:
7358 sname = ".ldata.rel.local";
7359 break;
7360 case SECCAT_DATA_REL_RO:
7361 sname = ".ldata.rel.ro";
7362 break;
7363 case SECCAT_DATA_REL_RO_LOCAL:
7364 sname = ".ldata.rel.ro.local";
7365 break;
7366 case SECCAT_BSS:
7367 sname = ".lbss";
7368 flags |= SECTION_BSS;
7369 break;
7370 case SECCAT_RODATA:
7371 case SECCAT_RODATA_MERGE_STR:
7372 case SECCAT_RODATA_MERGE_STR_INIT:
7373 case SECCAT_RODATA_MERGE_CONST:
7374 sname = ".lrodata";
7375 flags &= ~SECTION_WRITE;
7376 break;
7377 case SECCAT_SRODATA:
7378 case SECCAT_SDATA:
7379 case SECCAT_SBSS:
7380 gcc_unreachable ();
7381 case SECCAT_TEXT:
7382 case SECCAT_TDATA:
7383 case SECCAT_TBSS:
7384 /* We don't split these for the medium model. Place them into
7385 default sections and hope for the best. */
7386 break;
7388 if (sname)
7390 /* We might get called with string constants, but get_named_section
7391 doesn't like them as they are not DECLs. Also, we need to set
7392 flags in that case. */
7393 if (!DECL_P (decl))
7394 return get_section (sname, flags, NULL);
7395 return get_named_section (decl, sname, reloc);
7398 return default_elf_select_section (decl, reloc, align);
7401 /* Select a set of attributes for section NAME based on the properties
7402 of DECL and whether or not RELOC indicates that DECL's initializer
7403 might contain runtime relocations. */
7405 static unsigned int ATTRIBUTE_UNUSED
7406 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7408 unsigned int flags = default_section_type_flags (decl, name, reloc);
7410 if (ix86_in_large_data_p (decl))
7411 flags |= SECTION_LARGE;
7413 if (decl == NULL_TREE
7414 && (strcmp (name, ".ldata.rel.ro") == 0
7415 || strcmp (name, ".ldata.rel.ro.local") == 0))
7416 flags |= SECTION_RELRO;
7418 if (strcmp (name, ".lbss") == 0
7419 || strncmp (name, ".lbss.", 5) == 0
7420 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
7421 flags |= SECTION_BSS;
7423 return flags;
7426 /* Build up a unique section name, expressed as a
7427 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7428 RELOC indicates whether the initial value of EXP requires
7429 link-time relocations. */
7431 static void ATTRIBUTE_UNUSED
7432 x86_64_elf_unique_section (tree decl, int reloc)
7434 if (ix86_in_large_data_p (decl))
7436 const char *prefix = NULL;
7437 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7438 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7440 switch (categorize_decl_for_section (decl, reloc))
7442 case SECCAT_DATA:
7443 case SECCAT_DATA_REL:
7444 case SECCAT_DATA_REL_LOCAL:
7445 case SECCAT_DATA_REL_RO:
7446 case SECCAT_DATA_REL_RO_LOCAL:
7447 prefix = one_only ? ".ld" : ".ldata";
7448 break;
7449 case SECCAT_BSS:
7450 prefix = one_only ? ".lb" : ".lbss";
7451 break;
7452 case SECCAT_RODATA:
7453 case SECCAT_RODATA_MERGE_STR:
7454 case SECCAT_RODATA_MERGE_STR_INIT:
7455 case SECCAT_RODATA_MERGE_CONST:
7456 prefix = one_only ? ".lr" : ".lrodata";
7457 break;
7458 case SECCAT_SRODATA:
7459 case SECCAT_SDATA:
7460 case SECCAT_SBSS:
7461 gcc_unreachable ();
7462 case SECCAT_TEXT:
7463 case SECCAT_TDATA:
7464 case SECCAT_TBSS:
7465 /* We don't split these for the medium model. Place them into
7466 default sections and hope for the best. */
7467 break;
7469 if (prefix)
7471 const char *name, *linkonce;
7472 char *string;
7474 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7475 name = targetm.strip_name_encoding (name);
7477 /* If we're using one_only, then there needs to be a .gnu.linkonce
7478 prefix to the section name. */
7479 linkonce = one_only ? ".gnu.linkonce" : "";
7481 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7483 set_decl_section_name (decl, string);
7484 return;
7487 default_unique_section (decl, reloc);
7490 #ifdef COMMON_ASM_OP
7492 #ifndef LARGECOMM_SECTION_ASM_OP
7493 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7494 #endif
7496 /* This says how to output assembler code to declare an
7497 uninitialized external linkage data object.
7499 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive
7500 for large objects. */
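/* A hedged sketch of the resulting assembly for a hypothetical 400-byte
   object "buf" aligned to 32 bytes:
       .comm      buf,400,32
   or, under the medium code model once the object exceeds the large-data
   threshold,
       .largecomm buf,400,32
   (COMMON_ASM_OP / LARGECOMM_SECTION_ASM_OP supply the directive text).  */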
7501 void
7502 x86_elf_aligned_decl_common (FILE *file, tree decl,
7503 const char *name, unsigned HOST_WIDE_INT size,
7504 int align)
7506 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7507 && size > (unsigned int)ix86_section_threshold)
7509 switch_to_section (get_named_section (decl, ".lbss", 0));
7510 fputs (LARGECOMM_SECTION_ASM_OP, file);
7512 else
7513 fputs (COMMON_ASM_OP, file);
7514 assemble_name (file, name);
7515 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7516 size, align / BITS_PER_UNIT);
7518 #endif
7520 /* Utility function for targets to use in implementing
7521 ASM_OUTPUT_ALIGNED_BSS. */
7523 void
7524 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7525 unsigned HOST_WIDE_INT size, int align)
7527 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7528 && size > (unsigned int)ix86_section_threshold)
7529 switch_to_section (get_named_section (decl, ".lbss", 0));
7530 else
7531 switch_to_section (bss_section);
7532 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7533 #ifdef ASM_DECLARE_OBJECT_NAME
7534 last_assemble_variable_decl = decl;
7535 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7536 #else
7537 /* The standard thing is just to output a label for the object. */
7538 ASM_OUTPUT_LABEL (file, name);
7539 #endif /* ASM_DECLARE_OBJECT_NAME */
7540 ASM_OUTPUT_SKIP (file, size ? size : 1);
7543 /* Decide whether we must probe the stack before any space allocation
7544 on this target. It's essentially TARGET_STACK_PROBE except when
7545 -fstack-check causes the stack to be already probed differently. */
7547 bool
7548 ix86_target_stack_probe (void)
7550 /* Do not probe the stack twice if static stack checking is enabled. */
7551 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7552 return false;
7554 return TARGET_STACK_PROBE;
7557 /* Decide whether we can make a sibling call to a function. DECL is the
7558 declaration of the function being targeted by the call and EXP is the
7559 CALL_EXPR representing the call. */
7561 static bool
7562 ix86_function_ok_for_sibcall (tree decl, tree exp)
7564 tree type, decl_or_type;
7565 rtx a, b;
7566 bool bind_global = decl && !targetm.binds_local_p (decl);
7568 /* Sibling call isn't OK if there are no caller-saved registers
7569 since all registers must be preserved before return. */
7570 if (cfun->machine->no_caller_saved_registers)
7571 return false;
7573 /* If we are generating position-independent code, we cannot sibcall
7574 optimize direct calls to global functions, as the PLT requires
7575 %ebx be live. (Darwin does not have a PLT.) */
7576 if (!TARGET_MACHO
7577 && !TARGET_64BIT
7578 && flag_pic
7579 && flag_plt
7580 && bind_global)
7581 return false;
7583 /* If we need to align the outgoing stack, then sibcalling would
7584 unalign the stack, which may break the called function. */
7585 if (ix86_minimum_incoming_stack_boundary (true)
7586 < PREFERRED_STACK_BOUNDARY)
7587 return false;
7589 if (decl)
7591 decl_or_type = decl;
7592 type = TREE_TYPE (decl);
7594 else
7596 /* We're looking at the CALL_EXPR, we need the type of the function. */
7597 type = CALL_EXPR_FN (exp); /* pointer expression */
7598 type = TREE_TYPE (type); /* pointer type */
7599 type = TREE_TYPE (type); /* function type */
7600 decl_or_type = type;
7603 /* Check that the return value locations are the same. For example,
7604 if we are returning floats on the 80387 register stack, we cannot
7605 make a sibcall from a function that doesn't return a float to a
7606 function that does or, conversely, from a function that does return
7607 a float to a function that doesn't; the necessary stack adjustment
7608 would not be executed. This is also the place we notice
7609 differences in the return value ABI. Note that it is ok for one
7610 of the functions to have void return type as long as the return
7611 value of the other is passed in a register. */
7612 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
7613 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
7614 cfun->decl, false);
7615 if (STACK_REG_P (a) || STACK_REG_P (b))
7617 if (!rtx_equal_p (a, b))
7618 return false;
7620 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
7622 else if (!rtx_equal_p (a, b))
7623 return false;
7625 if (TARGET_64BIT)
7627 /* The SYSV ABI has more call-clobbered registers;
7628 disallow sibcalls from MS to SYSV. */
7629 if (cfun->machine->call_abi == MS_ABI
7630 && ix86_function_type_abi (type) == SYSV_ABI)
7631 return false;
7633 else
7635 /* If this call is indirect, we'll need to be able to use a
7636 call-clobbered register for the address of the target function.
7637 Make sure that all such registers are not used for passing
7638 parameters. Note that DLLIMPORT functions and calls to global
7639 functions via the GOT slot are indirect. */
7640 if (!decl
7641 || (bind_global && flag_pic && !flag_plt)
7642 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
7644 /* Check if regparm >= 3 since arg_reg_available is set to
7645 false if regparm == 0. If regparm is 1 or 2, there is
7646 always a call-clobbered register available.
7648 ??? The symbol indirect call doesn't need a call-clobbered
7649 register. But we don't know if this is a symbol indirect
7650 call or not here. */
7651 if (ix86_function_regparm (type, NULL) >= 3
7652 && !cfun->machine->arg_reg_available)
7653 return false;
7657 /* Otherwise okay. That also includes certain types of indirect calls. */
7658 return true;
7661 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
7662 and "sseregparm" calling convention attributes;
7663 arguments as in struct attribute_spec.handler. */
7665 static tree
7666 ix86_handle_cconv_attribute (tree *node, tree name,
7667 tree args,
7668 int,
7669 bool *no_add_attrs)
7671 if (TREE_CODE (*node) != FUNCTION_TYPE
7672 && TREE_CODE (*node) != METHOD_TYPE
7673 && TREE_CODE (*node) != FIELD_DECL
7674 && TREE_CODE (*node) != TYPE_DECL)
7676 warning (OPT_Wattributes, "%qE attribute only applies to functions",
7677 name);
7678 *no_add_attrs = true;
7679 return NULL_TREE;
7682 /* Can combine regparm with all attributes but fastcall and thiscall. */
7683 if (is_attribute_p ("regparm", name))
7685 tree cst;
7687 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7689 error ("fastcall and regparm attributes are not compatible");
7692 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7694 error ("regparm and thiscall attributes are not compatible");
7697 cst = TREE_VALUE (args);
7698 if (TREE_CODE (cst) != INTEGER_CST)
7700 warning (OPT_Wattributes,
7701 "%qE attribute requires an integer constant argument",
7702 name);
7703 *no_add_attrs = true;
7705 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
7707 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
7708 name, REGPARM_MAX);
7709 *no_add_attrs = true;
7712 return NULL_TREE;
7715 if (TARGET_64BIT)
7717 /* Do not warn when emulating the MS ABI. */
7718 if ((TREE_CODE (*node) != FUNCTION_TYPE
7719 && TREE_CODE (*node) != METHOD_TYPE)
7720 || ix86_function_type_abi (*node) != MS_ABI)
7721 warning (OPT_Wattributes, "%qE attribute ignored",
7722 name);
7723 *no_add_attrs = true;
7724 return NULL_TREE;
7727 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
7728 if (is_attribute_p ("fastcall", name))
7730 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7732 error ("fastcall and cdecl attributes are not compatible");
7734 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7736 error ("fastcall and stdcall attributes are not compatible");
7738 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
7740 error ("fastcall and regparm attributes are not compatible");
7742 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7744 error ("fastcall and thiscall attributes are not compatible");
7748 /* Can combine stdcall with fastcall (redundant), regparm and
7749 sseregparm. */
7750 else if (is_attribute_p ("stdcall", name))
7752 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7754 error ("stdcall and cdecl attributes are not compatible");
7756 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7758 error ("stdcall and fastcall attributes are not compatible");
7760 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7762 error ("stdcall and thiscall attributes are not compatible");
7766 /* Can combine cdecl with regparm and sseregparm. */
7767 else if (is_attribute_p ("cdecl", name))
7769 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7771 error ("stdcall and cdecl attributes are not compatible");
7773 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7775 error ("fastcall and cdecl attributes are not compatible");
7777 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7779 error ("cdecl and thiscall attributes are not compatible");
7782 else if (is_attribute_p ("thiscall", name))
7784 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
7785 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
7786 name);
7787 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7789 error ("stdcall and thiscall attributes are not compatible");
7791 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7793 error ("fastcall and thiscall attributes are not compatible");
7795 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7797 error ("cdecl and thiscall attributes are not compatible");
7801 /* Can combine sseregparm with all attributes. */
7803 return NULL_TREE;
7806 /* The transactional memory builtins are implicitly regparm or fastcall
7807 depending on the ABI. Override the generic do-nothing attribute that
7808 these builtins were declared with, and replace it with one of the two
7809 attributes that we expect elsewhere. */
7811 static tree
7812 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
7813 int flags, bool *no_add_attrs)
7815 tree alt;
7817 /* In no case do we want to add the placeholder attribute. */
7818 *no_add_attrs = true;
7820 /* The 64-bit ABI is unchanged for transactional memory. */
7821 if (TARGET_64BIT)
7822 return NULL_TREE;
7824 /* ??? Is there a better way to validate 32-bit windows? We have
7825 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
7826 if (CHECK_STACK_LIMIT > 0)
7827 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
7828 else
7830 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
7831 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
7833 decl_attributes (node, alt, flags);
7835 return NULL_TREE;
7838 /* This function determines the calling convention from TYPE. */
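/* Roughly: the result is a bit mask holding exactly one of the
   IX86_CALLCVT_CDECL / _STDCALL / _FASTCALL / _THISCALL base conventions,
   possibly combined with IX86_CALLCVT_REGPARM and/or
   IX86_CALLCVT_SSEREGPARM.  */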
7840 unsigned int
7841 ix86_get_callcvt (const_tree type)
7843 unsigned int ret = 0;
7844 bool is_stdarg;
7845 tree attrs;
7847 if (TARGET_64BIT)
7848 return IX86_CALLCVT_CDECL;
7850 attrs = TYPE_ATTRIBUTES (type);
7851 if (attrs != NULL_TREE)
7853 if (lookup_attribute ("cdecl", attrs))
7854 ret |= IX86_CALLCVT_CDECL;
7855 else if (lookup_attribute ("stdcall", attrs))
7856 ret |= IX86_CALLCVT_STDCALL;
7857 else if (lookup_attribute ("fastcall", attrs))
7858 ret |= IX86_CALLCVT_FASTCALL;
7859 else if (lookup_attribute ("thiscall", attrs))
7860 ret |= IX86_CALLCVT_THISCALL;
7862 /* Regparm isn't allowed for thiscall and fastcall. */
7863 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
7865 if (lookup_attribute ("regparm", attrs))
7866 ret |= IX86_CALLCVT_REGPARM;
7867 if (lookup_attribute ("sseregparm", attrs))
7868 ret |= IX86_CALLCVT_SSEREGPARM;
7871 if (IX86_BASE_CALLCVT(ret) != 0)
7872 return ret;
7875 is_stdarg = stdarg_p (type);
7876 if (TARGET_RTD && !is_stdarg)
7877 return IX86_CALLCVT_STDCALL | ret;
7879 if (ret != 0
7880 || is_stdarg
7881 || TREE_CODE (type) != METHOD_TYPE
7882 || ix86_function_type_abi (type) != MS_ABI)
7883 return IX86_CALLCVT_CDECL | ret;
7885 return IX86_CALLCVT_THISCALL;
7888 /* Return 0 if the attributes for two types are incompatible, 1 if they
7889 are compatible, and 2 if they are nearly compatible (which causes a
7890 warning to be generated). */
7892 static int
7893 ix86_comp_type_attributes (const_tree type1, const_tree type2)
7895 unsigned int ccvt1, ccvt2;
7897 if (TREE_CODE (type1) != FUNCTION_TYPE
7898 && TREE_CODE (type1) != METHOD_TYPE)
7899 return 1;
7901 ccvt1 = ix86_get_callcvt (type1);
7902 ccvt2 = ix86_get_callcvt (type2);
7903 if (ccvt1 != ccvt2)
7904 return 0;
7905 if (ix86_function_regparm (type1, NULL)
7906 != ix86_function_regparm (type2, NULL))
7907 return 0;
7909 return 1;
7912 /* Return the regparm value for a function with the indicated TYPE and DECL.
7913 DECL may be NULL when calling the function indirectly
7914 or considering a libcall. */
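/* Simple illustrations (the declarations are hypothetical):
     void f (int, int) __attribute__ ((regparm (3)));   returns 3
     void g (int, int) __attribute__ ((fastcall));      returns 2
     void h (int)      __attribute__ ((thiscall));      returns 1
   Without such an attribute the -mregparm value (ix86_regparm) is used,
   and local optimized functions may be bumped to a higher value below.  */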
7916 static int
7917 ix86_function_regparm (const_tree type, const_tree decl)
7919 tree attr;
7920 int regparm;
7921 unsigned int ccvt;
7923 if (TARGET_64BIT)
7924 return (ix86_function_type_abi (type) == SYSV_ABI
7925 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
7926 ccvt = ix86_get_callcvt (type);
7927 regparm = ix86_regparm;
7929 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
7931 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
7932 if (attr)
7934 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
7935 return regparm;
7938 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7939 return 2;
7940 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7941 return 1;
7943 /* Use register calling convention for local functions when possible. */
7944 if (decl
7945 && TREE_CODE (decl) == FUNCTION_DECL)
7947 cgraph_node *target = cgraph_node::get (decl);
7948 if (target)
7949 target = target->function_symbol ();
7951 /* Caller and callee must agree on the calling convention, so
7952 checking just the global optimize flag here would mean that with
7953 __attribute__((optimize (...))) the caller could use the regparm
7954 convention and the callee not, or vice versa. Instead look at
7955 whether the callee itself is optimized or not. */
7956 if (target && opt_for_fn (target->decl, optimize)
7957 && !(profile_flag && !flag_fentry))
7959 cgraph_local_info *i = &target->local;
7960 if (i && i->local && i->can_change_signature)
7962 int local_regparm, globals = 0, regno;
7964 /* Make sure no regparm register is taken by a
7965 fixed register variable. */
7966 for (local_regparm = 0; local_regparm < REGPARM_MAX;
7967 local_regparm++)
7968 if (fixed_regs[local_regparm])
7969 break;
7971 /* We don't want to use regparm(3) for nested functions as
7972 these use a static chain pointer in the third argument. */
7973 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
7974 local_regparm = 2;
7976 /* Save a register for the split stack. */
7977 if (flag_split_stack)
7979 if (local_regparm == 3)
7980 local_regparm = 2;
7981 else if (local_regparm == 2
7982 && DECL_STATIC_CHAIN (target->decl))
7983 local_regparm = 1;
7986 /* Each fixed register usage increases register pressure,
7987 so fewer registers should be used for argument passing.
7988 This functionality can be overridden by an explicit
7989 regparm value. */
7990 for (regno = AX_REG; regno <= DI_REG; regno++)
7991 if (fixed_regs[regno])
7992 globals++;
7994 local_regparm
7995 = globals < local_regparm ? local_regparm - globals : 0;
7997 if (local_regparm > regparm)
7998 regparm = local_regparm;
8003 return regparm;
8006 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
8007 DFmode (2) arguments in SSE registers for a function with the
8008 indicated TYPE and DECL. DECL may be NULL when calling function
8009 indirectly or considering a libcall. Return -1 if any FP parameter
8010 should be rejected by error. This is used in the situation where we imply
8011 the SSE calling convention but the function is called from another function
8012 with SSE disabled. Otherwise return 0. */
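/* For example (informal): with the "sseregparm" attribute and SSE enabled
   this returns 2, i.e. both SFmode and DFmode arguments may be passed in
   SSE registers; for a local optimized function compiled with -mfpmath=sse
   the result is 2 when SSE2 is available and 1 with plain SSE.  */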
8014 static int
8015 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
8017 gcc_assert (!TARGET_64BIT);
8019 /* Use SSE registers to pass SFmode and DFmode arguments if requested
8020 by the sseregparm attribute. */
8021 if (TARGET_SSEREGPARM
8022 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
8024 if (!TARGET_SSE)
8026 if (warn)
8028 if (decl)
8029 error ("calling %qD with attribute sseregparm without "
8030 "SSE/SSE2 enabled", decl);
8031 else
8032 error ("calling %qT with attribute sseregparm without "
8033 "SSE/SSE2 enabled", type);
8035 return 0;
8038 return 2;
8041 if (!decl)
8042 return 0;
8044 cgraph_node *target = cgraph_node::get (decl);
8045 if (target)
8046 target = target->function_symbol ();
8048 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
8049 (and DFmode for SSE2) arguments in SSE registers. */
8050 if (target
8051 /* TARGET_SSE_MATH */
8052 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
8053 && opt_for_fn (target->decl, optimize)
8054 && !(profile_flag && !flag_fentry))
8056 cgraph_local_info *i = &target->local;
8057 if (i && i->local && i->can_change_signature)
8059 /* Refuse to produce wrong code when a local function with SSE enabled
8060 is called from an SSE-disabled function.
8061 FIXME: We need a way to detect these cases across ltrans partitions
8062 and avoid using SSE calling conventions on local functions called
8063 from functions with SSE disabled. For now at least delay the
8064 warning until we know we are going to produce wrong code.
8065 See PR66047. */
8066 if (!TARGET_SSE && warn)
8067 return -1;
8068 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
8069 ->x_ix86_isa_flags) ? 2 : 1;
8073 return 0;
8076 /* Return true if EAX is live at the start of the function. Used by
8077 ix86_expand_prologue to determine if we need special help before
8078 calling allocate_stack_worker. */
8080 static bool
8081 ix86_eax_live_at_start_p (void)
8083 /* Cheat. Don't bother working forward from ix86_function_regparm
8084 to the function type to whether an actual argument is located in
8085 eax. Instead just look at cfg info, which is still close enough
8086 to correct at this point. This gives false positives for broken
8087 functions that might use uninitialized data that happens to be
8088 allocated in eax, but who cares? */
8089 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
8092 static bool
8093 ix86_keep_aggregate_return_pointer (tree fntype)
8095 tree attr;
8097 if (!TARGET_64BIT)
8099 attr = lookup_attribute ("callee_pop_aggregate_return",
8100 TYPE_ATTRIBUTES (fntype));
8101 if (attr)
8102 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
8104 /* For the 32-bit MS ABI the default is to keep the aggregate
8105 return pointer. */
8106 if (ix86_function_type_abi (fntype) == MS_ABI)
8107 return true;
8109 return KEEP_AGGREGATE_RETURN_POINTER != 0;
8112 /* Value is the number of bytes of arguments automatically
8113 popped when returning from a subroutine call.
8114 FUNDECL is the declaration node of the function (as a tree),
8115 FUNTYPE is the data type of the function (as a tree),
8116 or for a library call it is an identifier node for the subroutine name.
8117 SIZE is the number of bytes of arguments passed on the stack.
8119 On the 80386, the RTD insn may be used to pop them if the number
8120 of args is fixed, but if the number is variable then the caller
8121 must pop them all. RTD can't be used for library calls now
8122 because the library is compiled with the Unix compiler.
8123 Use of RTD is a selectable option, since it is incompatible with
8124 standard Unix calling sequences. If the option is not selected,
8125 the caller must always pop the args.
8127 The attribute stdcall is equivalent to RTD on a per module basis. */
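/* For instance (hypothetical numbers): a non-variadic stdcall function
   taking 12 bytes of stack arguments makes the callee pop them, so this
   returns 12 and the epilogue becomes "ret $12"; a plain cdecl function
   returns 0 and the caller adjusts the stack instead.  */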
8129 static int
8130 ix86_return_pops_args (tree fundecl, tree funtype, int size)
8132 unsigned int ccvt;
8134 /* None of the 64-bit ABIs pop arguments. */
8135 if (TARGET_64BIT)
8136 return 0;
8138 ccvt = ix86_get_callcvt (funtype);
8140 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
8141 | IX86_CALLCVT_THISCALL)) != 0
8142 && ! stdarg_p (funtype))
8143 return size;
8145 /* Lose any fake structure return argument if it is passed on the stack. */
8146 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
8147 && !ix86_keep_aggregate_return_pointer (funtype))
8149 int nregs = ix86_function_regparm (funtype, fundecl);
8150 if (nregs == 0)
8151 return GET_MODE_SIZE (Pmode);
8154 return 0;
8157 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
8159 static bool
8160 ix86_legitimate_combined_insn (rtx_insn *insn)
8162 int i;
8164 /* Check operand constraints in case hard registers were propagated
8165 into insn pattern. This check prevents combine pass from
8166 generating insn patterns with invalid hard register operands.
8167 These invalid insns can eventually confuse reload to error out
8168 with a spill failure. See also PRs 46829 and 46843. */
8170 gcc_assert (INSN_CODE (insn) >= 0);
8172 extract_insn (insn);
8173 preprocess_constraints (insn);
8175 int n_operands = recog_data.n_operands;
8176 int n_alternatives = recog_data.n_alternatives;
8177 for (i = 0; i < n_operands; i++)
8179 rtx op = recog_data.operand[i];
8180 machine_mode mode = GET_MODE (op);
8181 const operand_alternative *op_alt;
8182 int offset = 0;
8183 bool win;
8184 int j;
8186 /* A unary operator may be accepted by the predicate, but it
8187 is irrelevant for matching constraints. */
8188 if (UNARY_P (op))
8189 op = XEXP (op, 0);
8191 if (SUBREG_P (op))
8193 if (REG_P (SUBREG_REG (op))
8194 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8195 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8196 GET_MODE (SUBREG_REG (op)),
8197 SUBREG_BYTE (op),
8198 GET_MODE (op));
8199 op = SUBREG_REG (op);
8202 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8203 continue;
8205 op_alt = recog_op_alt;
8207 /* Operand has no constraints, anything is OK. */
8208 win = !n_alternatives;
8210 alternative_mask preferred = get_preferred_alternatives (insn);
8211 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8213 if (!TEST_BIT (preferred, j))
8214 continue;
8215 if (op_alt[i].anything_ok
8216 || (op_alt[i].matches != -1
8217 && operands_match_p
8218 (recog_data.operand[i],
8219 recog_data.operand[op_alt[i].matches]))
8220 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8222 win = true;
8223 break;
8227 if (!win)
8228 return false;
8231 return true;
8234 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
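/* Background (general AddressSanitizer convention, not specific to this
   file): each 8 bytes of application memory map to one shadow byte,
   roughly
       shadow = (addr >> 3) + ix86_asan_shadow_offset ();
   hence the different constants for LP64 Linux, LP64 Mach-O and 32-bit
   targets below.  */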
8236 static unsigned HOST_WIDE_INT
8237 ix86_asan_shadow_offset (void)
8239 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8240 : HOST_WIDE_INT_C (0x7fff8000))
8241 : (HOST_WIDE_INT_1 << 29);
8244 /* Argument support functions. */
8246 /* Return true when a register may be used to pass function parameters. */
8247 bool
8248 ix86_function_arg_regno_p (int regno)
8250 int i;
8251 enum calling_abi call_abi;
8252 const int *parm_regs;
8254 if (TARGET_MPX && BND_REGNO_P (regno))
8255 return true;
8257 if (!TARGET_64BIT)
8259 if (TARGET_MACHO)
8260 return (regno < REGPARM_MAX
8261 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8262 else
8263 return (regno < REGPARM_MAX
8264 || (TARGET_MMX && MMX_REGNO_P (regno)
8265 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8266 || (TARGET_SSE && SSE_REGNO_P (regno)
8267 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8270 if (TARGET_SSE && SSE_REGNO_P (regno)
8271 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8272 return true;
8274 /* TODO: The function should depend on current function ABI but
8275 builtins.c would need updating then. Therefore we use the
8276 default ABI. */
8277 call_abi = ix86_cfun_abi ();
8279 /* RAX is used as hidden argument to va_arg functions. */
8280 if (call_abi == SYSV_ABI && regno == AX_REG)
8281 return true;
8283 if (call_abi == MS_ABI)
8284 parm_regs = x86_64_ms_abi_int_parameter_registers;
8285 else
8286 parm_regs = x86_64_int_parameter_registers;
8288 for (i = 0; i < (call_abi == MS_ABI
8289 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8290 if (regno == parm_regs[i])
8291 return true;
8292 return false;
8295 /* Return true if we do not know how to pass TYPE solely in registers. */
8297 static bool
8298 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8300 if (must_pass_in_stack_var_size_or_pad (mode, type))
8301 return true;
8303 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8304 The layout_type routine is crafty and tries to trick us into passing
8305 currently unsupported vector types on the stack by using TImode. */
8306 return (!TARGET_64BIT && mode == TImode
8307 && type && TREE_CODE (type) != VECTOR_TYPE);
8310 /* It returns the size, in bytes, of the area reserved for arguments passed
8311 in registers for the function represented by FNDECL, depending on the
8312 ABI format used. */
8313 int
8314 ix86_reg_parm_stack_space (const_tree fndecl)
8316 enum calling_abi call_abi = SYSV_ABI;
8317 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8318 call_abi = ix86_function_abi (fndecl);
8319 else
8320 call_abi = ix86_function_type_abi (fndecl);
8321 if (TARGET_64BIT && call_abi == MS_ABI)
8322 return 32;
8323 return 0;
8326 /* We add this as a workaround in order to use libc_has_function
8327 hook in i386.md. */
8328 bool
8329 ix86_libc_has_function (enum function_class fn_class)
8331 return targetm.libc_has_function (fn_class);
8334 /* Returns SYSV_ABI or MS_ABI, depending on FNTYPE,
8335 specifying the call ABI used. */
8336 enum calling_abi
8337 ix86_function_type_abi (const_tree fntype)
8339 enum calling_abi abi = ix86_abi;
8341 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8342 return abi;
8344 if (abi == SYSV_ABI
8345 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8347 if (TARGET_X32)
8348 error ("X32 does not support ms_abi attribute");
8350 abi = MS_ABI;
8352 else if (abi == MS_ABI
8353 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8354 abi = SYSV_ABI;
8356 return abi;
8359 static enum calling_abi
8360 ix86_function_abi (const_tree fndecl)
8362 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8365 /* Returns SYSV_ABI or MS_ABI, depending on cfun,
8366 specifying the call ABI used. */
8367 enum calling_abi
8368 ix86_cfun_abi (void)
8370 return cfun ? cfun->machine->call_abi : ix86_abi;
8373 static bool
8374 ix86_function_ms_hook_prologue (const_tree fn)
8376 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8378 if (decl_function_context (fn) != NULL_TREE)
8379 error_at (DECL_SOURCE_LOCATION (fn),
8380 "ms_hook_prologue is not compatible with nested function");
8381 else
8382 return true;
8384 return false;
8387 /* Write the extra assembler code needed to declare a function properly. */
8389 void
8390 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8391 tree decl)
8393 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8395 if (is_ms_hook)
8397 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8398 unsigned int filler_cc = 0xcccccccc;
8400 for (i = 0; i < filler_count; i += 4)
8401 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8404 #ifdef SUBTARGET_ASM_UNWIND_INIT
8405 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8406 #endif
8408 ASM_OUTPUT_LABEL (asm_out_file, fname);
8410 /* Output magic byte marker, if hot-patch attribute is set. */
8411 if (is_ms_hook)
8413 if (TARGET_64BIT)
8415 /* leaq [%rsp + 0], %rsp */
8416 asm_fprintf (asm_out_file, ASM_BYTE
8417 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
8419 else
8421 /* movl.s %edi, %edi
8422 push %ebp
8423 movl.s %esp, %ebp */
8424 asm_fprintf (asm_out_file, ASM_BYTE
8425 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
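/* Added explanatory note (a sketch): the 0xcc filler bytes emitted before
   the label, together with the patchable first instructions above (the
   8-byte "leaq 0(%rsp), %rsp" on 64-bit, or the 2-byte "movl.s %edi, %edi"
   followed by the normal frame setup on 32-bit), implement the usual
   Windows hot-patch scheme: a runtime patcher can overwrite the entry
   with a short backward jump into the filler area, which in turn holds a
   long jump to the replacement function.  */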
8430 /* regclass.c */
8431 extern void init_regs (void);
8433 /* Implementation of the call ABI switching target hook. The call
8434 register sets specific to FNDECL are set up here. See also
8435 ix86_conditional_register_usage for more details. */
8436 void
8437 ix86_call_abi_override (const_tree fndecl)
8439 cfun->machine->call_abi = ix86_function_abi (fndecl);
8442 /* Return true if a pseudo register should be created and used to hold
8443 the GOT address for PIC code. */
8444 bool
8445 ix86_use_pseudo_pic_reg (void)
8447 if ((TARGET_64BIT
8448 && (ix86_cmodel == CM_SMALL_PIC
8449 || TARGET_PECOFF))
8450 || !flag_pic)
8451 return false;
8452 return true;
8455 /* Initialize large model PIC register. */
8457 static void
8458 ix86_init_large_pic_reg (unsigned int tmp_regno)
8460 rtx_code_label *label;
8461 rtx tmp_reg;
8463 gcc_assert (Pmode == DImode);
8464 label = gen_label_rtx ();
8465 emit_label (label);
8466 LABEL_PRESERVE_P (label) = 1;
8467 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8468 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8469 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8470 label));
8471 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8472 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8473 pic_offset_table_rtx, tmp_reg));
8476 /* Create and initialize PIC register if required. */
8477 static void
8478 ix86_init_pic_reg (void)
8480 edge entry_edge;
8481 rtx_insn *seq;
8483 if (!ix86_use_pseudo_pic_reg ())
8484 return;
8486 start_sequence ();
8488 if (TARGET_64BIT)
8490 if (ix86_cmodel == CM_LARGE_PIC)
8491 ix86_init_large_pic_reg (R11_REG);
8492 else
8493 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8495 else
8497 /* If there is a future mcount call in the function, it is more profitable
8498 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8499 rtx reg = crtl->profile
8500 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8501 : pic_offset_table_rtx;
8502 rtx_insn *insn = emit_insn (gen_set_got (reg));
8503 RTX_FRAME_RELATED_P (insn) = 1;
8504 if (crtl->profile)
8505 emit_move_insn (pic_offset_table_rtx, reg);
8506 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8509 seq = get_insns ();
8510 end_sequence ();
8512 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8513 insert_insn_on_edge (seq, entry_edge);
8514 commit_one_edge_insertion (entry_edge);
8517 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8518 for a call to a function whose data type is FNTYPE.
8519 For a library call, FNTYPE is 0. */
8521 void
8522 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8523 tree fntype, /* tree ptr for function decl */
8524 rtx libname, /* SYMBOL_REF of library name or 0 */
8525 tree fndecl,
8526 int caller)
8528 struct cgraph_local_info *i = NULL;
8529 struct cgraph_node *target = NULL;
8531 memset (cum, 0, sizeof (*cum));
8533 if (fndecl)
8535 target = cgraph_node::get (fndecl);
8536 if (target)
8538 target = target->function_symbol ();
8539 i = cgraph_node::local_info (target->decl);
8540 cum->call_abi = ix86_function_abi (target->decl);
8542 else
8543 cum->call_abi = ix86_function_abi (fndecl);
8545 else
8546 cum->call_abi = ix86_function_type_abi (fntype);
8548 cum->caller = caller;
8550 /* Set up the number of registers to use for passing arguments. */
8551 cum->nregs = ix86_regparm;
8552 if (TARGET_64BIT)
8554 cum->nregs = (cum->call_abi == SYSV_ABI
8555 ? X86_64_REGPARM_MAX
8556 : X86_64_MS_REGPARM_MAX);
8558 if (TARGET_SSE)
8560 cum->sse_nregs = SSE_REGPARM_MAX;
8561 if (TARGET_64BIT)
8563 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8564 ? X86_64_SSE_REGPARM_MAX
8565 : X86_64_MS_SSE_REGPARM_MAX);
8568 if (TARGET_MMX)
8569 cum->mmx_nregs = MMX_REGPARM_MAX;
8570 cum->warn_avx512f = true;
8571 cum->warn_avx = true;
8572 cum->warn_sse = true;
8573 cum->warn_mmx = true;
8575 /* Because the type might mismatch between caller and callee, we need to
8576 use the actual type of the function for local calls.
8577 FIXME: cgraph_analyze can be told to actually record if the function uses
8578 va_start so that for local functions maybe_vaarg can be made aggressive,
8579 helping K&R code.
8580 FIXME: once the type system is fixed, we won't need this code anymore. */
8581 if (i && i->local && i->can_change_signature)
8582 fntype = TREE_TYPE (target->decl);
8583 cum->stdarg = stdarg_p (fntype);
8584 cum->maybe_vaarg = (fntype
8585 ? (!prototype_p (fntype) || stdarg_p (fntype))
8586 : !libname);
8588 cum->bnd_regno = FIRST_BND_REG;
8589 cum->bnds_in_bt = 0;
8590 cum->force_bnd_pass = 0;
8591 cum->decl = fndecl;
8593 if (!TARGET_64BIT)
8595 /* If there are variable arguments, then we won't pass anything
8596 in registers in 32-bit mode. */
8597 if (stdarg_p (fntype))
8599 cum->nregs = 0;
8600 /* Since in 32-bit mode variable arguments are always passed on
8601 the stack, there is a scratch register available for an
8602 indirect sibcall. */
8603 cfun->machine->arg_reg_available = true;
8604 cum->sse_nregs = 0;
8605 cum->mmx_nregs = 0;
8606 cum->warn_avx512f = false;
8607 cum->warn_avx = false;
8608 cum->warn_sse = false;
8609 cum->warn_mmx = false;
8610 return;
8613 /* Use ecx and edx registers if the function has the fastcall attribute,
8614 else look for regparm information. */
8615 if (fntype)
8617 unsigned int ccvt = ix86_get_callcvt (fntype);
8618 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8620 cum->nregs = 1;
8621 cum->fastcall = 1; /* Same first register as in fastcall. */
8623 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8625 cum->nregs = 2;
8626 cum->fastcall = 1;
8628 else
8629 cum->nregs = ix86_function_regparm (fntype, fndecl);
8632 /* Set up the number of SSE registers used for passing SFmode
8633 and DFmode arguments. Warn for mismatching ABI. */
8634 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
8637 cfun->machine->arg_reg_available = (cum->nregs > 0);
8640 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
8641 But in the case of vector types, it is some vector mode.
8643 When we have only some of our vector isa extensions enabled, then there
8644 are some modes for which vector_mode_supported_p is false. For these
8645 modes, the generic vector support in gcc will choose some non-vector mode
8646 in order to implement the type. By computing the natural mode, we'll
8647 select the proper ABI location for the operand and not depend on whatever
8648 the middle-end decides to do with these vector types.
8650 The middle-end can't deal with vector types > 16 bytes. In this
8651 case, we return the original mode and warn about the ABI change if
8652 CUM isn't NULL.
8654 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
8655 available for the function return value. */
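/* Added illustrative example (a sketch): given the hypothetical type

       typedef double v4df __attribute__ ((vector_size (32)));

   the natural mode found below is V4DFmode when AVX is enabled; without
   AVX (and outside IAMCU) the code falls back to TYPE_MODE and, when CUM
   is non-NULL, emits the -Wpsabi "AVX vector argument ... changes the
   ABI" warning.  */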
8657 static machine_mode
8658 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
8659 bool in_return)
8661 machine_mode mode = TYPE_MODE (type);
8663 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
8665 HOST_WIDE_INT size = int_size_in_bytes (type);
8666 if ((size == 8 || size == 16 || size == 32 || size == 64)
8667 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
8668 && TYPE_VECTOR_SUBPARTS (type) > 1)
8670 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
8672 /* There are no XFmode vector modes. */
8673 if (innermode == XFmode)
8674 return mode;
8676 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
8677 mode = MIN_MODE_VECTOR_FLOAT;
8678 else
8679 mode = MIN_MODE_VECTOR_INT;
8681 /* Get the mode which has this inner mode and number of units. */
8682 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
8683 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
8684 && GET_MODE_INNER (mode) == innermode)
8686 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
8688 static bool warnedavx512f;
8689 static bool warnedavx512f_ret;
8691 if (cum && cum->warn_avx512f && !warnedavx512f)
8693 if (warning (OPT_Wpsabi, "AVX512F vector argument "
8694 "without AVX512F enabled changes the ABI"))
8695 warnedavx512f = true;
8697 else if (in_return && !warnedavx512f_ret)
8699 if (warning (OPT_Wpsabi, "AVX512F vector return "
8700 "without AVX512F enabled changes the ABI"))
8701 warnedavx512f_ret = true;
8704 return TYPE_MODE (type);
8706 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
8708 static bool warnedavx;
8709 static bool warnedavx_ret;
8711 if (cum && cum->warn_avx && !warnedavx)
8713 if (warning (OPT_Wpsabi, "AVX vector argument "
8714 "without AVX enabled changes the ABI"))
8715 warnedavx = true;
8717 else if (in_return && !warnedavx_ret)
8719 if (warning (OPT_Wpsabi, "AVX vector return "
8720 "without AVX enabled changes the ABI"))
8721 warnedavx_ret = true;
8724 return TYPE_MODE (type);
8726 else if (((size == 8 && TARGET_64BIT) || size == 16)
8727 && !TARGET_SSE
8728 && !TARGET_IAMCU)
8730 static bool warnedsse;
8731 static bool warnedsse_ret;
8733 if (cum && cum->warn_sse && !warnedsse)
8735 if (warning (OPT_Wpsabi, "SSE vector argument "
8736 "without SSE enabled changes the ABI"))
8737 warnedsse = true;
8739 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
8741 if (warning (OPT_Wpsabi, "SSE vector return "
8742 "without SSE enabled changes the ABI"))
8743 warnedsse_ret = true;
8746 else if ((size == 8 && !TARGET_64BIT)
8747 && (!cfun
8748 || cfun->machine->func_type == TYPE_NORMAL)
8749 && !TARGET_MMX
8750 && !TARGET_IAMCU)
8752 static bool warnedmmx;
8753 static bool warnedmmx_ret;
8755 if (cum && cum->warn_mmx && !warnedmmx)
8757 if (warning (OPT_Wpsabi, "MMX vector argument "
8758 "without MMX enabled changes the ABI"))
8759 warnedmmx = true;
8761 else if (in_return && !warnedmmx_ret)
8763 if (warning (OPT_Wpsabi, "MMX vector return "
8764 "without MMX enabled changes the ABI"))
8765 warnedmmx_ret = true;
8768 return mode;
8771 gcc_unreachable ();
8775 return mode;
8778 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
8779 this may not agree with the mode that the type system has chosen for the
8780 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
8781 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
8783 static rtx
8784 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
8785 unsigned int regno)
8787 rtx tmp;
8789 if (orig_mode != BLKmode)
8790 tmp = gen_rtx_REG (orig_mode, regno);
8791 else
8793 tmp = gen_rtx_REG (mode, regno);
8794 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
8795 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
8798 return tmp;
8801 /* x86-64 register passing implementation. See the x86-64 ABI for details.
8802 The goal of this code is to classify each eightbyte of an incoming argument
8803 by register class and assign registers accordingly. */
8805 /* Return the union class of CLASS1 and CLASS2.
8806 See the x86-64 PS ABI for details. */
8808 static enum x86_64_reg_class
8809 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
8811 /* Rule #1: If both classes are equal, this is the resulting class. */
8812 if (class1 == class2)
8813 return class1;
8815 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
8816 the other class. */
8817 if (class1 == X86_64_NO_CLASS)
8818 return class2;
8819 if (class2 == X86_64_NO_CLASS)
8820 return class1;
8822 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
8823 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
8824 return X86_64_MEMORY_CLASS;
8826 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
8827 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
8828 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
8829 return X86_64_INTEGERSI_CLASS;
8830 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
8831 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
8832 return X86_64_INTEGER_CLASS;
8834 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
8835 MEMORY is used. */
8836 if (class1 == X86_64_X87_CLASS
8837 || class1 == X86_64_X87UP_CLASS
8838 || class1 == X86_64_COMPLEX_X87_CLASS
8839 || class2 == X86_64_X87_CLASS
8840 || class2 == X86_64_X87UP_CLASS
8841 || class2 == X86_64_COMPLEX_X87_CLASS)
8842 return X86_64_MEMORY_CLASS;
8844 /* Rule #6: Otherwise class SSE is used. */
8845 return X86_64_SSE_CLASS;
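/* Added illustrative example (a sketch): for the first eightbyte of the
   hypothetical

       union u { float f; int i; };

   the float member yields X86_64_SSESF_CLASS and the int member yields
   X86_64_INTEGERSI_CLASS; merging them under rule #4 above gives
   X86_64_INTEGERSI_CLASS, so the union is passed in an integer
   register.  */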
8848 /* Classify the argument of type TYPE and mode MODE.
8849 CLASSES will be filled by the register class used to pass each word
8850 of the operand. The number of words is returned. In case the parameter
8851 should be passed in memory, 0 is returned. As a special case for zero
8852 sized containers, classes[0] will be NO_CLASS and 1 is returned.
8854 BIT_OFFSET is used internally for handling records and specifies the
8855 offset in bits, modulo 512, to avoid overflow cases.
8857 See the x86-64 PS ABI for details. */
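/* Added illustrative example (a sketch): for the hypothetical

       struct s { double d; int i; };

   this returns 2 with classes[0] = X86_64_SSEDF_CLASS and classes[1] =
   X86_64_INTEGER_CLASS, so the struct travels in one SSE register and
   one integer register, whereas a 24-byte struct of three longs fails
   the SSE-only check applied to sizes above 16 bytes and is passed in
   memory (return value 0).  */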
8860 static int
8861 classify_argument (machine_mode mode, const_tree type,
8862 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
8864 HOST_WIDE_INT bytes =
8865 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8866 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
8868 /* Variable sized entities are always passed/returned in memory. */
8869 if (bytes < 0)
8870 return 0;
8872 if (mode != VOIDmode
8873 && targetm.calls.must_pass_in_stack (mode, type))
8874 return 0;
8876 if (type && AGGREGATE_TYPE_P (type))
8878 int i;
8879 tree field;
8880 enum x86_64_reg_class subclasses[MAX_CLASSES];
8882 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
8883 if (bytes > 64)
8884 return 0;
8886 for (i = 0; i < words; i++)
8887 classes[i] = X86_64_NO_CLASS;
8889 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
8890 signal the memory class, so handle it as a special case. */
8891 if (!words)
8893 classes[0] = X86_64_NO_CLASS;
8894 return 1;
8897 /* Classify each field of record and merge classes. */
8898 switch (TREE_CODE (type))
8900 case RECORD_TYPE:
8901 /* And now merge the fields of structure. */
8902 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8904 if (TREE_CODE (field) == FIELD_DECL)
8906 int num;
8908 if (TREE_TYPE (field) == error_mark_node)
8909 continue;
8911 /* Bitfields are always classified as integer. Handle them
8912 early, since later code would consider them to be
8913 misaligned integers. */
8914 if (DECL_BIT_FIELD (field))
8916 for (i = (int_bit_position (field)
8917 + (bit_offset % 64)) / 8 / 8;
8918 i < ((int_bit_position (field) + (bit_offset % 64))
8919 + tree_to_shwi (DECL_SIZE (field))
8920 + 63) / 8 / 8; i++)
8921 classes[i] =
8922 merge_classes (X86_64_INTEGER_CLASS,
8923 classes[i]);
8925 else
8927 int pos;
8929 type = TREE_TYPE (field);
8931 /* Flexible array member is ignored. */
8932 if (TYPE_MODE (type) == BLKmode
8933 && TREE_CODE (type) == ARRAY_TYPE
8934 && TYPE_SIZE (type) == NULL_TREE
8935 && TYPE_DOMAIN (type) != NULL_TREE
8936 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
8937 == NULL_TREE))
8939 static bool warned;
8941 if (!warned && warn_psabi)
8943 warned = true;
8944 inform (input_location,
8945 "the ABI of passing struct with"
8946 " a flexible array member has"
8947 " changed in GCC 4.4");
8949 continue;
8951 num = classify_argument (TYPE_MODE (type), type,
8952 subclasses,
8953 (int_bit_position (field)
8954 + bit_offset) % 512);
8955 if (!num)
8956 return 0;
8957 pos = (int_bit_position (field)
8958 + (bit_offset % 64)) / 8 / 8;
8959 for (i = 0; i < num && (i + pos) < words; i++)
8960 classes[i + pos] =
8961 merge_classes (subclasses[i], classes[i + pos]);
8965 break;
8967 case ARRAY_TYPE:
8968 /* Arrays are handled as small records. */
8970 int num;
8971 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
8972 TREE_TYPE (type), subclasses, bit_offset);
8973 if (!num)
8974 return 0;
8976 /* The partial classes are now full classes. */
8977 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
8978 subclasses[0] = X86_64_SSE_CLASS;
8979 if (subclasses[0] == X86_64_INTEGERSI_CLASS
8980 && !((bit_offset % 64) == 0 && bytes == 4))
8981 subclasses[0] = X86_64_INTEGER_CLASS;
8983 for (i = 0; i < words; i++)
8984 classes[i] = subclasses[i % num];
8986 break;
8988 case UNION_TYPE:
8989 case QUAL_UNION_TYPE:
8990 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
8992 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8994 if (TREE_CODE (field) == FIELD_DECL)
8996 int num;
8998 if (TREE_TYPE (field) == error_mark_node)
8999 continue;
9001 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
9002 TREE_TYPE (field), subclasses,
9003 bit_offset);
9004 if (!num)
9005 return 0;
9006 for (i = 0; i < num && i < words; i++)
9007 classes[i] = merge_classes (subclasses[i], classes[i]);
9010 break;
9012 default:
9013 gcc_unreachable ();
9016 if (words > 2)
9018 /* When size > 16 bytes, if the first eightbyte isn't
9019 X86_64_SSE_CLASS or any other eightbyte isn't
9020 X86_64_SSEUP_CLASS, everything should be passed in
9021 memory. */
9022 if (classes[0] != X86_64_SSE_CLASS)
9023 return 0;
9025 for (i = 1; i < words; i++)
9026 if (classes[i] != X86_64_SSEUP_CLASS)
9027 return 0;
9030 /* Final merger cleanup. */
9031 for (i = 0; i < words; i++)
9033 /* If one class is MEMORY, everything should be passed in
9034 memory. */
9035 if (classes[i] == X86_64_MEMORY_CLASS)
9036 return 0;
9038 /* X86_64_SSEUP_CLASS should always be preceded by
9039 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
9040 if (classes[i] == X86_64_SSEUP_CLASS
9041 && classes[i - 1] != X86_64_SSE_CLASS
9042 && classes[i - 1] != X86_64_SSEUP_CLASS)
9044 /* The first one should never be X86_64_SSEUP_CLASS. */
9045 gcc_assert (i != 0);
9046 classes[i] = X86_64_SSE_CLASS;
9049 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
9050 everything should be passed in memory. */
9051 if (classes[i] == X86_64_X87UP_CLASS
9052 && (classes[i - 1] != X86_64_X87_CLASS))
9054 static bool warned;
9056 /* The first one should never be X86_64_X87UP_CLASS. */
9057 gcc_assert (i != 0);
9058 if (!warned && warn_psabi)
9060 warned = true;
9061 inform (input_location,
9062 "the ABI of passing union with long double"
9063 " has changed in GCC 4.4");
9065 return 0;
9068 return words;
9071 /* Compute the alignment needed. We align all types to natural boundaries
9072 with the exception of XFmode, which is aligned to 64 bits. */
9073 if (mode != VOIDmode && mode != BLKmode)
9075 int mode_alignment = GET_MODE_BITSIZE (mode);
9077 if (mode == XFmode)
9078 mode_alignment = 128;
9079 else if (mode == XCmode)
9080 mode_alignment = 256;
9081 if (COMPLEX_MODE_P (mode))
9082 mode_alignment /= 2;
9083 /* Misaligned fields are always returned in memory. */
9084 if (bit_offset % mode_alignment)
9085 return 0;
9088 /* For V1xx modes, just use the base mode. */
9089 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
9090 && GET_MODE_UNIT_SIZE (mode) == bytes)
9091 mode = GET_MODE_INNER (mode);
9093 /* Classification of atomic types. */
9094 switch (mode)
9096 case SDmode:
9097 case DDmode:
9098 classes[0] = X86_64_SSE_CLASS;
9099 return 1;
9100 case TDmode:
9101 classes[0] = X86_64_SSE_CLASS;
9102 classes[1] = X86_64_SSEUP_CLASS;
9103 return 2;
9104 case DImode:
9105 case SImode:
9106 case HImode:
9107 case QImode:
9108 case CSImode:
9109 case CHImode:
9110 case CQImode:
9112 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
9114 /* Analyze last 128 bits only. */
9115 size = (size - 1) & 0x7f;
9117 if (size < 32)
9119 classes[0] = X86_64_INTEGERSI_CLASS;
9120 return 1;
9122 else if (size < 64)
9124 classes[0] = X86_64_INTEGER_CLASS;
9125 return 1;
9127 else if (size < 64+32)
9129 classes[0] = X86_64_INTEGER_CLASS;
9130 classes[1] = X86_64_INTEGERSI_CLASS;
9131 return 2;
9133 else if (size < 64+64)
9135 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9136 return 2;
9138 else
9139 gcc_unreachable ();
9141 case CDImode:
9142 case TImode:
9143 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9144 return 2;
9145 case COImode:
9146 case OImode:
9147 /* OImode shouldn't be used directly. */
9148 gcc_unreachable ();
9149 case CTImode:
9150 return 0;
9151 case SFmode:
9152 if (!(bit_offset % 64))
9153 classes[0] = X86_64_SSESF_CLASS;
9154 else
9155 classes[0] = X86_64_SSE_CLASS;
9156 return 1;
9157 case DFmode:
9158 classes[0] = X86_64_SSEDF_CLASS;
9159 return 1;
9160 case XFmode:
9161 classes[0] = X86_64_X87_CLASS;
9162 classes[1] = X86_64_X87UP_CLASS;
9163 return 2;
9164 case TFmode:
9165 classes[0] = X86_64_SSE_CLASS;
9166 classes[1] = X86_64_SSEUP_CLASS;
9167 return 2;
9168 case SCmode:
9169 classes[0] = X86_64_SSE_CLASS;
9170 if (!(bit_offset % 64))
9171 return 1;
9172 else
9174 static bool warned;
9176 if (!warned && warn_psabi)
9178 warned = true;
9179 inform (input_location,
9180 "the ABI of passing structure with complex float"
9181 " member has changed in GCC 4.4");
9183 classes[1] = X86_64_SSESF_CLASS;
9184 return 2;
9186 case DCmode:
9187 classes[0] = X86_64_SSEDF_CLASS;
9188 classes[1] = X86_64_SSEDF_CLASS;
9189 return 2;
9190 case XCmode:
9191 classes[0] = X86_64_COMPLEX_X87_CLASS;
9192 return 1;
9193 case TCmode:
9194 /* This mode is larger than 16 bytes. */
9195 return 0;
9196 case V8SFmode:
9197 case V8SImode:
9198 case V32QImode:
9199 case V16HImode:
9200 case V4DFmode:
9201 case V4DImode:
9202 classes[0] = X86_64_SSE_CLASS;
9203 classes[1] = X86_64_SSEUP_CLASS;
9204 classes[2] = X86_64_SSEUP_CLASS;
9205 classes[3] = X86_64_SSEUP_CLASS;
9206 return 4;
9207 case V8DFmode:
9208 case V16SFmode:
9209 case V8DImode:
9210 case V16SImode:
9211 case V32HImode:
9212 case V64QImode:
9213 classes[0] = X86_64_SSE_CLASS;
9214 classes[1] = X86_64_SSEUP_CLASS;
9215 classes[2] = X86_64_SSEUP_CLASS;
9216 classes[3] = X86_64_SSEUP_CLASS;
9217 classes[4] = X86_64_SSEUP_CLASS;
9218 classes[5] = X86_64_SSEUP_CLASS;
9219 classes[6] = X86_64_SSEUP_CLASS;
9220 classes[7] = X86_64_SSEUP_CLASS;
9221 return 8;
9222 case V4SFmode:
9223 case V4SImode:
9224 case V16QImode:
9225 case V8HImode:
9226 case V2DFmode:
9227 case V2DImode:
9228 classes[0] = X86_64_SSE_CLASS;
9229 classes[1] = X86_64_SSEUP_CLASS;
9230 return 2;
9231 case V1TImode:
9232 case V1DImode:
9233 case V2SFmode:
9234 case V2SImode:
9235 case V4HImode:
9236 case V8QImode:
9237 classes[0] = X86_64_SSE_CLASS;
9238 return 1;
9239 case BLKmode:
9240 case VOIDmode:
9241 return 0;
9242 default:
9243 gcc_assert (VECTOR_MODE_P (mode));
9245 if (bytes > 16)
9246 return 0;
9248 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9250 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9251 classes[0] = X86_64_INTEGERSI_CLASS;
9252 else
9253 classes[0] = X86_64_INTEGER_CLASS;
9254 classes[1] = X86_64_INTEGER_CLASS;
9255 return 1 + (bytes > 8);
9259 /* Examine the argument and set the number of registers required in each
9260 class. Return true iff the parameter should be passed in memory. */
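/* Added illustrative example (a sketch): a plain "double" argument sets
   *int_nregs = 0 and *sse_nregs = 1 and returns false, while a
   "long double" (XFmode) argument is classified X87/X87UP and makes this
   return true unless IN_RETURN is set, i.e. the value goes to memory
   when passed but may still be returned in %st(0).  */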
9262 static bool
9263 examine_argument (machine_mode mode, const_tree type, int in_return,
9264 int *int_nregs, int *sse_nregs)
9266 enum x86_64_reg_class regclass[MAX_CLASSES];
9267 int n = classify_argument (mode, type, regclass, 0);
9269 *int_nregs = 0;
9270 *sse_nregs = 0;
9272 if (!n)
9273 return true;
9274 for (n--; n >= 0; n--)
9275 switch (regclass[n])
9277 case X86_64_INTEGER_CLASS:
9278 case X86_64_INTEGERSI_CLASS:
9279 (*int_nregs)++;
9280 break;
9281 case X86_64_SSE_CLASS:
9282 case X86_64_SSESF_CLASS:
9283 case X86_64_SSEDF_CLASS:
9284 (*sse_nregs)++;
9285 break;
9286 case X86_64_NO_CLASS:
9287 case X86_64_SSEUP_CLASS:
9288 break;
9289 case X86_64_X87_CLASS:
9290 case X86_64_X87UP_CLASS:
9291 case X86_64_COMPLEX_X87_CLASS:
9292 if (!in_return)
9293 return true;
9294 break;
9295 case X86_64_MEMORY_CLASS:
9296 gcc_unreachable ();
9299 return false;
9302 /* Construct a container for the argument used by the GCC interface. See
9303 FUNCTION_ARG for the detailed description. */
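/* Added illustrative example (a sketch): for the hypothetical

       struct s { long l; double d; };

   the classes are INTEGER followed by SSEDF, so the code below builds a
   PARALLEL of two EXPR_LISTs: a DImode integer register at offset 0 and
   a DFmode SSE register at offset 8. A __int128 return value, whose two
   integer registers (%rax, %rdx) are consecutive, instead takes the
   n == 2 INTEGER/INTEGER shortcut and becomes a single TImode REG.  */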
9305 static rtx
9306 construct_container (machine_mode mode, machine_mode orig_mode,
9307 const_tree type, int in_return, int nintregs, int nsseregs,
9308 const int *intreg, int sse_regno)
9310 /* The following variables hold the static issued_error state. */
9311 static bool issued_sse_arg_error;
9312 static bool issued_sse_ret_error;
9313 static bool issued_x87_ret_error;
9315 machine_mode tmpmode;
9316 int bytes =
9317 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9318 enum x86_64_reg_class regclass[MAX_CLASSES];
9319 int n;
9320 int i;
9321 int nexps = 0;
9322 int needed_sseregs, needed_intregs;
9323 rtx exp[MAX_CLASSES];
9324 rtx ret;
9326 n = classify_argument (mode, type, regclass, 0);
9327 if (!n)
9328 return NULL;
9329 if (examine_argument (mode, type, in_return, &needed_intregs,
9330 &needed_sseregs))
9331 return NULL;
9332 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9333 return NULL;
9335 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9336 some less clueful developer tries to use floating-point anyway. */
9337 if (needed_sseregs && !TARGET_SSE)
9339 if (in_return)
9341 if (!issued_sse_ret_error)
9343 error ("SSE register return with SSE disabled");
9344 issued_sse_ret_error = true;
9347 else if (!issued_sse_arg_error)
9349 error ("SSE register argument with SSE disabled");
9350 issued_sse_arg_error = true;
9352 return NULL;
9355 /* Likewise, error if the ABI requires us to return values in the
9356 x87 registers and the user specified -mno-80387. */
9357 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9358 for (i = 0; i < n; i++)
9359 if (regclass[i] == X86_64_X87_CLASS
9360 || regclass[i] == X86_64_X87UP_CLASS
9361 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9363 if (!issued_x87_ret_error)
9365 error ("x87 register return with x87 disabled");
9366 issued_x87_ret_error = true;
9368 return NULL;
9371 /* First construct simple cases. Avoid SCmode, since we want to use
9372 a single register to pass this type. */
9373 if (n == 1 && mode != SCmode)
9374 switch (regclass[0])
9376 case X86_64_INTEGER_CLASS:
9377 case X86_64_INTEGERSI_CLASS:
9378 return gen_rtx_REG (mode, intreg[0]);
9379 case X86_64_SSE_CLASS:
9380 case X86_64_SSESF_CLASS:
9381 case X86_64_SSEDF_CLASS:
9382 if (mode != BLKmode)
9383 return gen_reg_or_parallel (mode, orig_mode,
9384 SSE_REGNO (sse_regno));
9385 break;
9386 case X86_64_X87_CLASS:
9387 case X86_64_COMPLEX_X87_CLASS:
9388 return gen_rtx_REG (mode, FIRST_STACK_REG);
9389 case X86_64_NO_CLASS:
9390 /* Zero sized array, struct or class. */
9391 return NULL;
9392 default:
9393 gcc_unreachable ();
9395 if (n == 2
9396 && regclass[0] == X86_64_SSE_CLASS
9397 && regclass[1] == X86_64_SSEUP_CLASS
9398 && mode != BLKmode)
9399 return gen_reg_or_parallel (mode, orig_mode,
9400 SSE_REGNO (sse_regno));
9401 if (n == 4
9402 && regclass[0] == X86_64_SSE_CLASS
9403 && regclass[1] == X86_64_SSEUP_CLASS
9404 && regclass[2] == X86_64_SSEUP_CLASS
9405 && regclass[3] == X86_64_SSEUP_CLASS
9406 && mode != BLKmode)
9407 return gen_reg_or_parallel (mode, orig_mode,
9408 SSE_REGNO (sse_regno));
9409 if (n == 8
9410 && regclass[0] == X86_64_SSE_CLASS
9411 && regclass[1] == X86_64_SSEUP_CLASS
9412 && regclass[2] == X86_64_SSEUP_CLASS
9413 && regclass[3] == X86_64_SSEUP_CLASS
9414 && regclass[4] == X86_64_SSEUP_CLASS
9415 && regclass[5] == X86_64_SSEUP_CLASS
9416 && regclass[6] == X86_64_SSEUP_CLASS
9417 && regclass[7] == X86_64_SSEUP_CLASS
9418 && mode != BLKmode)
9419 return gen_reg_or_parallel (mode, orig_mode,
9420 SSE_REGNO (sse_regno));
9421 if (n == 2
9422 && regclass[0] == X86_64_X87_CLASS
9423 && regclass[1] == X86_64_X87UP_CLASS)
9424 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9426 if (n == 2
9427 && regclass[0] == X86_64_INTEGER_CLASS
9428 && regclass[1] == X86_64_INTEGER_CLASS
9429 && (mode == CDImode || mode == TImode)
9430 && intreg[0] + 1 == intreg[1])
9431 return gen_rtx_REG (mode, intreg[0]);
9433 /* Otherwise figure out the entries of the PARALLEL. */
9434 for (i = 0; i < n; i++)
9436 int pos;
9438 switch (regclass[i])
9440 case X86_64_NO_CLASS:
9441 break;
9442 case X86_64_INTEGER_CLASS:
9443 case X86_64_INTEGERSI_CLASS:
9444 /* Merge TImodes on aligned occasions here too. */
9445 if (i * 8 + 8 > bytes)
9446 tmpmode
9447 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9448 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9449 tmpmode = SImode;
9450 else
9451 tmpmode = DImode;
9452 /* We've requested 24 bytes for which we
9453 don't have a mode. Use DImode. */
9454 if (tmpmode == BLKmode)
9455 tmpmode = DImode;
9456 exp [nexps++]
9457 = gen_rtx_EXPR_LIST (VOIDmode,
9458 gen_rtx_REG (tmpmode, *intreg),
9459 GEN_INT (i*8));
9460 intreg++;
9461 break;
9462 case X86_64_SSESF_CLASS:
9463 exp [nexps++]
9464 = gen_rtx_EXPR_LIST (VOIDmode,
9465 gen_rtx_REG (SFmode,
9466 SSE_REGNO (sse_regno)),
9467 GEN_INT (i*8));
9468 sse_regno++;
9469 break;
9470 case X86_64_SSEDF_CLASS:
9471 exp [nexps++]
9472 = gen_rtx_EXPR_LIST (VOIDmode,
9473 gen_rtx_REG (DFmode,
9474 SSE_REGNO (sse_regno)),
9475 GEN_INT (i*8));
9476 sse_regno++;
9477 break;
9478 case X86_64_SSE_CLASS:
9479 pos = i;
9480 switch (n)
9482 case 1:
9483 tmpmode = DImode;
9484 break;
9485 case 2:
9486 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9488 tmpmode = TImode;
9489 i++;
9491 else
9492 tmpmode = DImode;
9493 break;
9494 case 4:
9495 gcc_assert (i == 0
9496 && regclass[1] == X86_64_SSEUP_CLASS
9497 && regclass[2] == X86_64_SSEUP_CLASS
9498 && regclass[3] == X86_64_SSEUP_CLASS);
9499 tmpmode = OImode;
9500 i += 3;
9501 break;
9502 case 8:
9503 gcc_assert (i == 0
9504 && regclass[1] == X86_64_SSEUP_CLASS
9505 && regclass[2] == X86_64_SSEUP_CLASS
9506 && regclass[3] == X86_64_SSEUP_CLASS
9507 && regclass[4] == X86_64_SSEUP_CLASS
9508 && regclass[5] == X86_64_SSEUP_CLASS
9509 && regclass[6] == X86_64_SSEUP_CLASS
9510 && regclass[7] == X86_64_SSEUP_CLASS);
9511 tmpmode = XImode;
9512 i += 7;
9513 break;
9514 default:
9515 gcc_unreachable ();
9517 exp [nexps++]
9518 = gen_rtx_EXPR_LIST (VOIDmode,
9519 gen_rtx_REG (tmpmode,
9520 SSE_REGNO (sse_regno)),
9521 GEN_INT (pos*8));
9522 sse_regno++;
9523 break;
9524 default:
9525 gcc_unreachable ();
9529 /* Empty aligned struct, union or class. */
9530 if (nexps == 0)
9531 return NULL;
9533 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9534 for (i = 0; i < nexps; i++)
9535 XVECEXP (ret, 0, i) = exp [i];
9536 return ret;
9539 /* Update the data in CUM to advance over an argument of mode MODE
9540 and data type TYPE. (TYPE is null for libcalls where that information
9541 may not be available.)
9543 Return the number of integer registers advanced over. */
9545 static int
9546 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9547 const_tree type, HOST_WIDE_INT bytes,
9548 HOST_WIDE_INT words)
9550 int res = 0;
9551 bool error_p = false;
9553 if (TARGET_IAMCU)
9555 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9556 bytes in registers. */
9557 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9558 goto pass_in_reg;
9559 return res;
9562 switch (mode)
9564 default:
9565 break;
9567 case BLKmode:
9568 if (bytes < 0)
9569 break;
9570 /* FALLTHRU */
9572 case DImode:
9573 case SImode:
9574 case HImode:
9575 case QImode:
9576 pass_in_reg:
9577 cum->words += words;
9578 cum->nregs -= words;
9579 cum->regno += words;
9580 if (cum->nregs >= 0)
9581 res = words;
9582 if (cum->nregs <= 0)
9584 cum->nregs = 0;
9585 cfun->machine->arg_reg_available = false;
9586 cum->regno = 0;
9588 break;
9590 case OImode:
9591 /* OImode shouldn't be used directly. */
9592 gcc_unreachable ();
9594 case DFmode:
9595 if (cum->float_in_sse == -1)
9596 error_p = 1;
9597 if (cum->float_in_sse < 2)
9598 break;
9599 /* FALLTHRU */
9600 case SFmode:
9601 if (cum->float_in_sse == -1)
9602 error_p = 1;
9603 if (cum->float_in_sse < 1)
9604 break;
9605 /* FALLTHRU */
9607 case V8SFmode:
9608 case V8SImode:
9609 case V64QImode:
9610 case V32HImode:
9611 case V16SImode:
9612 case V8DImode:
9613 case V16SFmode:
9614 case V8DFmode:
9615 case V32QImode:
9616 case V16HImode:
9617 case V4DFmode:
9618 case V4DImode:
9619 case TImode:
9620 case V16QImode:
9621 case V8HImode:
9622 case V4SImode:
9623 case V2DImode:
9624 case V4SFmode:
9625 case V2DFmode:
9626 if (!type || !AGGREGATE_TYPE_P (type))
9628 cum->sse_words += words;
9629 cum->sse_nregs -= 1;
9630 cum->sse_regno += 1;
9631 if (cum->sse_nregs <= 0)
9633 cum->sse_nregs = 0;
9634 cum->sse_regno = 0;
9637 break;
9639 case V8QImode:
9640 case V4HImode:
9641 case V2SImode:
9642 case V2SFmode:
9643 case V1TImode:
9644 case V1DImode:
9645 if (!type || !AGGREGATE_TYPE_P (type))
9647 cum->mmx_words += words;
9648 cum->mmx_nregs -= 1;
9649 cum->mmx_regno += 1;
9650 if (cum->mmx_nregs <= 0)
9652 cum->mmx_nregs = 0;
9653 cum->mmx_regno = 0;
9656 break;
9658 if (error_p)
9660 cum->float_in_sse = 0;
9661 error ("calling %qD with SSE calling convention without "
9662 "SSE/SSE2 enabled", cum->decl);
9663 sorry ("this is a GCC bug that can be worked around by adding "
9664 "attribute used to function called");
9667 return res;
9670 static int
9671 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
9672 const_tree type, HOST_WIDE_INT words, bool named)
9674 int int_nregs, sse_nregs;
9676 /* Unnamed 512 and 256 bit vector mode parameters are passed on the stack. */
9677 if (!named && (VALID_AVX512F_REG_MODE (mode)
9678 || VALID_AVX256_REG_MODE (mode)))
9679 return 0;
9681 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
9682 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
9684 cum->nregs -= int_nregs;
9685 cum->sse_nregs -= sse_nregs;
9686 cum->regno += int_nregs;
9687 cum->sse_regno += sse_nregs;
9688 return int_nregs;
9690 else
9692 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
9693 cum->words = ROUND_UP (cum->words, align);
9694 cum->words += words;
9695 return 0;
9699 static int
9700 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
9701 HOST_WIDE_INT words)
9703 /* Otherwise, this should be passed indirectly. */
9704 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
9706 cum->words += words;
9707 if (cum->nregs > 0)
9709 cum->nregs -= 1;
9710 cum->regno += 1;
9711 return 1;
9713 return 0;
9716 /* Update the data in CUM to advance over an argument of mode MODE and
9717 data type TYPE. (TYPE is null for libcalls where that information
9718 may not be available.) */
9720 static void
9721 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
9722 const_tree type, bool named)
9724 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9725 HOST_WIDE_INT bytes, words;
9726 int nregs;
9728 /* The argument of an interrupt handler is a special case and is
9729 handled in ix86_function_arg. */
9730 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
9731 return;
9733 if (mode == BLKmode)
9734 bytes = int_size_in_bytes (type);
9735 else
9736 bytes = GET_MODE_SIZE (mode);
9737 words = CEIL (bytes, UNITS_PER_WORD);
9739 if (type)
9740 mode = type_natural_mode (type, NULL, false);
9742 if ((type && POINTER_BOUNDS_TYPE_P (type))
9743 || POINTER_BOUNDS_MODE_P (mode))
9745 /* If we pass bounds in the BT then just update the remaining bounds count. */
9746 if (cum->bnds_in_bt)
9748 cum->bnds_in_bt--;
9749 return;
9752 /* Update the remaining number of bounds to force. */
9753 if (cum->force_bnd_pass)
9754 cum->force_bnd_pass--;
9756 cum->bnd_regno++;
9758 return;
9761 /* The first arg not going to Bounds Tables resets this counter. */
9762 cum->bnds_in_bt = 0;
9763 /* For unnamed args we always pass bounds to avoid a bounds mess when
9764 the passed and received types do not match. If bounds do not follow an
9765 unnamed arg, still pretend the required number of bounds were passed. */
9766 if (cum->force_bnd_pass)
9768 cum->bnd_regno += cum->force_bnd_pass;
9769 cum->force_bnd_pass = 0;
9772 if (TARGET_64BIT)
9774 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9776 if (call_abi == MS_ABI)
9777 nregs = function_arg_advance_ms_64 (cum, bytes, words);
9778 else
9779 nregs = function_arg_advance_64 (cum, mode, type, words, named);
9781 else
9782 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
9784 /* For stdarg we expect bounds to be passed for each value passed
9785 in a register. */
9786 if (cum->stdarg)
9787 cum->force_bnd_pass = nregs;
9788 /* For pointers passed in memory we expect bounds to be passed in the Bounds
9789 Table. */
9790 if (!nregs)
9791 cum->bnds_in_bt = chkp_type_bounds_count (type);
9794 /* Define where to put the arguments to a function.
9795 Value is zero to push the argument on the stack,
9796 or a hard register in which to store the argument.
9798 MODE is the argument's machine mode.
9799 TYPE is the data type of the argument (as a tree).
9800 This is null for libcalls where that information may
9801 not be available.
9802 CUM is a variable of type CUMULATIVE_ARGS which gives info about
9803 the preceding args and about the function being called.
9804 NAMED is nonzero if this argument is a named parameter
9805 (otherwise it is an extra parameter matching an ellipsis). */
9807 static rtx
9808 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9809 machine_mode orig_mode, const_tree type,
9810 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
9812 bool error_p = false;
9813 /* Avoid the AL settings for the Unix64 ABI. */
9814 if (mode == VOIDmode)
9815 return constm1_rtx;
9817 if (TARGET_IAMCU)
9819 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9820 bytes in registers. */
9821 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9822 goto pass_in_reg;
9823 return NULL_RTX;
9826 switch (mode)
9828 default:
9829 break;
9831 case BLKmode:
9832 if (bytes < 0)
9833 break;
9834 /* FALLTHRU */
9835 case DImode:
9836 case SImode:
9837 case HImode:
9838 case QImode:
9839 pass_in_reg:
9840 if (words <= cum->nregs)
9842 int regno = cum->regno;
9844 /* Fastcall allocates the first two DWORD (SImode) or
9845 smaller arguments to ECX and EDX if it isn't an
9846 aggregate type. */
9847 if (cum->fastcall)
9849 if (mode == BLKmode
9850 || mode == DImode
9851 || (type && AGGREGATE_TYPE_P (type)))
9852 break;
9854 /* ECX, not EAX, is the first allocated register. */
9855 if (regno == AX_REG)
9856 regno = CX_REG;
9858 return gen_rtx_REG (mode, regno);
9860 break;
9862 case DFmode:
9863 if (cum->float_in_sse == -1)
9864 error_p = 1;
9865 if (cum->float_in_sse < 2)
9866 break;
9867 /* FALLTHRU */
9868 case SFmode:
9869 if (cum->float_in_sse == -1)
9870 error_p = 1;
9871 if (cum->float_in_sse < 1)
9872 break;
9873 /* FALLTHRU */
9874 case TImode:
9875 /* In 32-bit mode, we pass TImode in xmm registers. */
9876 case V16QImode:
9877 case V8HImode:
9878 case V4SImode:
9879 case V2DImode:
9880 case V4SFmode:
9881 case V2DFmode:
9882 if (!type || !AGGREGATE_TYPE_P (type))
9884 if (cum->sse_nregs)
9885 return gen_reg_or_parallel (mode, orig_mode,
9886 cum->sse_regno + FIRST_SSE_REG);
9888 break;
9890 case OImode:
9891 case XImode:
9892 /* OImode and XImode shouldn't be used directly. */
9893 gcc_unreachable ();
9895 case V64QImode:
9896 case V32HImode:
9897 case V16SImode:
9898 case V8DImode:
9899 case V16SFmode:
9900 case V8DFmode:
9901 case V8SFmode:
9902 case V8SImode:
9903 case V32QImode:
9904 case V16HImode:
9905 case V4DFmode:
9906 case V4DImode:
9907 if (!type || !AGGREGATE_TYPE_P (type))
9909 if (cum->sse_nregs)
9910 return gen_reg_or_parallel (mode, orig_mode,
9911 cum->sse_regno + FIRST_SSE_REG);
9913 break;
9915 case V8QImode:
9916 case V4HImode:
9917 case V2SImode:
9918 case V2SFmode:
9919 case V1TImode:
9920 case V1DImode:
9921 if (!type || !AGGREGATE_TYPE_P (type))
9923 if (cum->mmx_nregs)
9924 return gen_reg_or_parallel (mode, orig_mode,
9925 cum->mmx_regno + FIRST_MMX_REG);
9927 break;
9929 if (error_p)
9931 cum->float_in_sse = 0;
9932 error ("calling %qD with SSE calling convention without "
9933 "SSE/SSE2 enabled", cum->decl);
9934 sorry ("this is a GCC bug that can be worked around by adding "
9935 "attribute used to function called");
9938 return NULL_RTX;
9941 static rtx
9942 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9943 machine_mode orig_mode, const_tree type, bool named)
9945 /* Handle a hidden AL argument containing the number of registers
9946 for varargs x86-64 functions. */
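/* Added illustrative note (a sketch): for a SysV varargs call such as
   printf ("%f\n", 1.0) the caller must load %al with an upper bound on
   the number of vector registers used (here 1); the GEN_INT below is
   what eventually materializes that value from cum->sse_regno, or
   X86_64_SSE_REGPARM_MAX when the count is unknown, or -1 for
   non-variadic prototyped calls.  */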
9947 if (mode == VOIDmode)
9948 return GEN_INT (cum->maybe_vaarg
9949 ? (cum->sse_nregs < 0
9950 ? X86_64_SSE_REGPARM_MAX
9951 : cum->sse_regno)
9952 : -1);
9954 switch (mode)
9956 default:
9957 break;
9959 case V8SFmode:
9960 case V8SImode:
9961 case V32QImode:
9962 case V16HImode:
9963 case V4DFmode:
9964 case V4DImode:
9965 case V16SFmode:
9966 case V16SImode:
9967 case V64QImode:
9968 case V32HImode:
9969 case V8DFmode:
9970 case V8DImode:
9971 /* Unnamed 256 and 512 bit vector mode parameters are passed on the stack. */
9972 if (!named)
9973 return NULL;
9974 break;
9977 return construct_container (mode, orig_mode, type, 0, cum->nregs,
9978 cum->sse_nregs,
9979 &x86_64_int_parameter_registers [cum->regno],
9980 cum->sse_regno);
9983 static rtx
9984 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9985 machine_mode orig_mode, bool named,
9986 HOST_WIDE_INT bytes)
9988 unsigned int regno;
9990 /* We need to add a clobber for MS_ABI -> SYSV ABI calls in expand_call.
9991 We use a value of -2 to specify that the current function call is MS_ABI. */
9992 if (mode == VOIDmode)
9993 return GEN_INT (-2);
9995 /* If we've run out of registers, it goes on the stack. */
9996 if (cum->nregs == 0)
9997 return NULL_RTX;
9999 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
10001 /* Only floating point modes are passed in anything but integer regs. */
10002 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
10004 if (named)
10005 regno = cum->regno + FIRST_SSE_REG;
10006 else
10008 rtx t1, t2;
10010 /* Unnamed floating parameters are passed in both the
10011 SSE and integer registers. */
10012 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
10013 t2 = gen_rtx_REG (mode, regno);
10014 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
10015 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
10016 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
10019 /* Handle aggregate types passed in a register. */
10020 if (orig_mode == BLKmode)
10022 if (bytes > 0 && bytes <= 8)
10023 mode = (bytes > 4 ? DImode : SImode);
10024 if (mode == BLKmode)
10025 mode = DImode;
10028 return gen_reg_or_parallel (mode, orig_mode, regno);
10031 /* Return where to put the arguments to a function.
10032 Return zero to push the argument on the stack, or a hard register in which to store the argument.
10034 MODE is the argument's machine mode. TYPE is the data type of the
10035 argument. It is null for libcalls where that information may not be
10036 available. CUM gives information about the preceding args and about
10037 the function being called. NAMED is nonzero if this argument is a
10038 named parameter (otherwise it is an extra parameter matching an
10039 ellipsis). */
10041 static rtx
10042 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
10043 const_tree type, bool named)
10045 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10046 machine_mode mode = omode;
10047 HOST_WIDE_INT bytes, words;
10048 rtx arg;
10050 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10052 gcc_assert (type != NULL_TREE);
10053 if (POINTER_TYPE_P (type))
10055 /* This is the pointer argument. */
10056 gcc_assert (TYPE_MODE (type) == Pmode);
10057 if (cfun->machine->func_type == TYPE_INTERRUPT)
10058 /* -WORD(AP) in the current frame in interrupt handler. */
10059 arg = plus_constant (Pmode, arg_pointer_rtx,
10060 -UNITS_PER_WORD);
10061 else
10062 /* (AP) in the current frame in exception handler. */
10063 arg = arg_pointer_rtx;
10065 else
10067 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
10068 && TREE_CODE (type) == INTEGER_TYPE
10069 && TYPE_MODE (type) == word_mode);
10070 /* The integer argument is the error code at -WORD(AP) in
10071 the current frame in exception handler. */
10072 arg = gen_rtx_MEM (word_mode,
10073 plus_constant (Pmode,
10074 arg_pointer_rtx,
10075 -UNITS_PER_WORD));
10077 return arg;
10080 /* All pointer bounds arguments are handled separately here. */
10081 if ((type && POINTER_BOUNDS_TYPE_P (type))
10082 || POINTER_BOUNDS_MODE_P (mode))
10084 /* Return NULL if bounds are forced to go in Bounds Table. */
10085 if (cum->bnds_in_bt)
10086 arg = NULL;
10087 /* Return the next available bound reg if any. */
10088 else if (cum->bnd_regno <= LAST_BND_REG)
10089 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
10090 /* Return the next special slot number otherwise. */
10091 else
10092 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
10094 return arg;
10097 if (mode == BLKmode)
10098 bytes = int_size_in_bytes (type);
10099 else
10100 bytes = GET_MODE_SIZE (mode);
10101 words = CEIL (bytes, UNITS_PER_WORD);
10103 /* To simplify the code below, represent vector types with a vector mode
10104 even if MMX/SSE are not active. */
10105 if (type && TREE_CODE (type) == VECTOR_TYPE)
10106 mode = type_natural_mode (type, cum, false);
10108 if (TARGET_64BIT)
10110 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10112 if (call_abi == MS_ABI)
10113 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
10114 else
10115 arg = function_arg_64 (cum, mode, omode, type, named);
10117 else
10118 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
10120 return arg;
10123 /* A C expression that indicates when an argument must be passed by
10124 reference. If nonzero for an argument, a copy of that argument is
10125 made in memory and a pointer to the argument is passed instead of
10126 the argument itself. The pointer is passed in whatever way is
10127 appropriate for passing a pointer to that type. */
10129 static bool
10130 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
10131 const_tree type, bool)
10133 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10135 /* Bounds are never passed by reference. */
10136 if ((type && POINTER_BOUNDS_TYPE_P (type))
10137 || POINTER_BOUNDS_MODE_P (mode))
10138 return false;
10140 if (TARGET_64BIT)
10142 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10144 /* See Windows x64 Software Convention. */
10145 if (call_abi == MS_ABI)
10147 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
10149 if (type)
10151 /* Arrays are passed by reference. */
10152 if (TREE_CODE (type) == ARRAY_TYPE)
10153 return true;
10155 if (RECORD_OR_UNION_TYPE_P (type))
10157 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
10158 are passed by reference. */
10159 msize = int_size_in_bytes (type);
10163 /* __m128 is passed by reference. */
10164 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
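/* Added illustrative example (a sketch): under MS_ABI a hypothetical
   struct { char c[3]; } (3 bytes) fails the size test above and is
   passed by reference, struct { int a, b; } (8 bytes) is passed by
   value in a single register, and a 16-byte __m128 is passed by
   reference as noted above.  */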
10166 else if (type && int_size_in_bytes (type) == -1)
10167 return true;
10170 return false;
10173 /* Return true when TYPE should be 128-bit aligned for the 32-bit argument
10174 passing ABI. XXX: This function is obsolete and is only used for
10175 checking psABI compatibility with previous versions of GCC. */
10177 static bool
10178 ix86_compat_aligned_value_p (const_tree type)
10180 machine_mode mode = TYPE_MODE (type);
10181 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10182 || mode == TDmode
10183 || mode == TFmode
10184 || mode == TCmode)
10185 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10186 return true;
10187 if (TYPE_ALIGN (type) < 128)
10188 return false;
10190 if (AGGREGATE_TYPE_P (type))
10192 /* Walk the aggregates recursively. */
10193 switch (TREE_CODE (type))
10195 case RECORD_TYPE:
10196 case UNION_TYPE:
10197 case QUAL_UNION_TYPE:
10199 tree field;
10201 /* Walk all the structure fields. */
10202 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10204 if (TREE_CODE (field) == FIELD_DECL
10205 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10206 return true;
10208 break;
10211 case ARRAY_TYPE:
10212 /* Just for use if some language passes arrays by value. */
10213 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10214 return true;
10215 break;
10217 default:
10218 gcc_unreachable ();
10221 return false;
10224 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10225 XXX: This function is obsolete and is only used for checking psABI
10226 compatibility with previous versions of GCC. */
10228 static unsigned int
10229 ix86_compat_function_arg_boundary (machine_mode mode,
10230 const_tree type, unsigned int align)
10232 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10233 natural boundaries. */
10234 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10236 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10237 make an exception for SSE modes since these require 128bit
10238 alignment.
10240 The handling here differs from field_alignment. ICC aligns MMX
10241 arguments to 4 byte boundaries, while structure fields are aligned
10242 to 8 byte boundaries. */
10243 if (!type)
10245 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10246 align = PARM_BOUNDARY;
10248 else
10250 if (!ix86_compat_aligned_value_p (type))
10251 align = PARM_BOUNDARY;
10254 if (align > BIGGEST_ALIGNMENT)
10255 align = BIGGEST_ALIGNMENT;
10256 return align;
10259 /* Return true when TYPE should be 128-bit aligned for the 32-bit argument
10260 passing ABI. */
10262 static bool
10263 ix86_contains_aligned_value_p (const_tree type)
10265 machine_mode mode = TYPE_MODE (type);
10267 if (mode == XFmode || mode == XCmode)
10268 return false;
10270 if (TYPE_ALIGN (type) < 128)
10271 return false;
10273 if (AGGREGATE_TYPE_P (type))
10275 /* Walk the aggregates recursively. */
10276 switch (TREE_CODE (type))
10278 case RECORD_TYPE:
10279 case UNION_TYPE:
10280 case QUAL_UNION_TYPE:
10282 tree field;
10284 /* Walk all the structure fields. */
10285 for (field = TYPE_FIELDS (type);
10286 field;
10287 field = DECL_CHAIN (field))
10289 if (TREE_CODE (field) == FIELD_DECL
10290 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10291 return true;
10293 break;
10296 case ARRAY_TYPE:
10297 /* Just for use if some language passes arrays by value. */
10298 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10299 return true;
10300 break;
10302 default:
10303 gcc_unreachable ();
10306 else
10307 return TYPE_ALIGN (type) >= 128;
10309 return false;
10312 /* Gives the alignment boundary, in bits, of an argument with the
10313 specified mode and type. */
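/* Added illustrative example (a sketch): with the 32-bit ABI an __m128
   argument keeps its 128-bit (16-byte) alignment while a plain int
   argument uses PARM_BOUNDARY (32 bits); whenever the result differs
   from what the pre-4.6 compatibility routine above would have chosen,
   the -Wpsabi note about the GCC 4.6 alignment change is emitted
   below.  */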
10315 static unsigned int
10316 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10318 unsigned int align;
10319 if (type)
10321 /* Since the main variant type is used for the call, we convert
10322 TYPE to its main variant. */
10323 type = TYPE_MAIN_VARIANT (type);
10324 align = TYPE_ALIGN (type);
10326 else
10327 align = GET_MODE_ALIGNMENT (mode);
10328 if (align < PARM_BOUNDARY)
10329 align = PARM_BOUNDARY;
10330 else
10332 static bool warned;
10333 unsigned int saved_align = align;
10335 if (!TARGET_64BIT)
10337 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10338 if (!type)
10340 if (mode == XFmode || mode == XCmode)
10341 align = PARM_BOUNDARY;
10343 else if (!ix86_contains_aligned_value_p (type))
10344 align = PARM_BOUNDARY;
10346 if (align < 128)
10347 align = PARM_BOUNDARY;
10350 if (warn_psabi
10351 && !warned
10352 && align != ix86_compat_function_arg_boundary (mode, type,
10353 saved_align))
10355 warned = true;
10356 inform (input_location,
10357 "The ABI for passing parameters with %d-byte"
10358 " alignment has changed in GCC 4.6",
10359 align / BITS_PER_UNIT);
10363 return align;
10366 /* Return true if N is a possible register number of function value. */
10368 static bool
10369 ix86_function_value_regno_p (const unsigned int regno)
10371 switch (regno)
10373 case AX_REG:
10374 return true;
10375 case DX_REG:
10376 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10377 case DI_REG:
10378 case SI_REG:
10379 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10381 case BND0_REG:
10382 case BND1_REG:
10383 return chkp_function_instrumented_p (current_function_decl);
10385 /* Complex values are returned in %st(0)/%st(1) pair. */
10386 case ST0_REG:
10387 case ST1_REG:
10388 /* TODO: The function should depend on current function ABI but
10389 builtins.c would need updating then. Therefore we use the
10390 default ABI. */
10391 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10392 return false;
10393 return TARGET_FLOAT_RETURNS_IN_80387;
10395 /* Complex values are returned in %xmm0/%xmm1 pair. */
10396 case XMM0_REG:
10397 case XMM1_REG:
10398 return TARGET_SSE;
10400 case MM0_REG:
10401 if (TARGET_MACHO || TARGET_64BIT)
10402 return false;
10403 return TARGET_MMX;
10406 return false;
10409 /* Define how to find the value returned by a function.
10410 VALTYPE is the data type of the value (as a tree).
10411 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10412 otherwise, FUNC is 0. */
10414 static rtx
10415 function_value_32 (machine_mode orig_mode, machine_mode mode,
10416 const_tree fntype, const_tree fn)
10418 unsigned int regno;
10420 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10421 we normally prevent this case when mmx is not available. However
10422 some ABIs may require the result to be returned like DImode. */
10423 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10424 regno = FIRST_MMX_REG;
10426 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10427 we prevent this case when sse is not available. However some ABIs
10428 may require the result to be returned like integer TImode. */
10429 else if (mode == TImode
10430 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10431 regno = FIRST_SSE_REG;
10433 /* 32-byte vector modes in %ymm0. */
10434 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10435 regno = FIRST_SSE_REG;
10437 /* 64-byte vector modes in %zmm0. */
10438 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10439 regno = FIRST_SSE_REG;
10441 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10442 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10443 regno = FIRST_FLOAT_REG;
10444 else
10445 /* Most things go in %eax. */
10446 regno = AX_REG;
10448 /* Override FP return register with %xmm0 for local functions when
10449 SSE math is enabled or for functions with sseregparm attribute. */
10450 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10452 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10453 if (sse_level == -1)
10455 error ("calling %qD with SSE calling convention without "
10456 "SSE/SSE2 enabled", fn);
10457 sorry ("this is a GCC bug that can be worked around by adding "
10458 "attribute used to function called");
10460 else if ((sse_level >= 1 && mode == SFmode)
10461 || (sse_level == 2 && mode == DFmode))
10462 regno = FIRST_SSE_REG;
10465 /* OImode shouldn't be used directly. */
10466 gcc_assert (mode != OImode);
10468 return gen_rtx_REG (orig_mode, regno);
10471 static rtx
10472 function_value_64 (machine_mode orig_mode, machine_mode mode,
10473 const_tree valtype)
10475 rtx ret;
10477 /* Handle libcalls, which don't provide a type node. */
10478 if (valtype == NULL)
10480 unsigned int regno;
10482 switch (mode)
10484 case SFmode:
10485 case SCmode:
10486 case DFmode:
10487 case DCmode:
10488 case TFmode:
10489 case SDmode:
10490 case DDmode:
10491 case TDmode:
10492 regno = FIRST_SSE_REG;
10493 break;
10494 case XFmode:
10495 case XCmode:
10496 regno = FIRST_FLOAT_REG;
10497 break;
10498 case TCmode:
10499 return NULL;
10500 default:
10501 regno = AX_REG;
10504 return gen_rtx_REG (mode, regno);
10506 else if (POINTER_TYPE_P (valtype))
10508 /* Pointers are always returned in word_mode. */
10509 mode = word_mode;
10512 ret = construct_container (mode, orig_mode, valtype, 1,
10513 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10514 x86_64_int_return_registers, 0);
10516 /* For zero-sized structures, construct_container returns NULL, but we
10517 need to keep the rest of the compiler happy by returning a meaningful value. */
10518 if (!ret)
10519 ret = gen_rtx_REG (orig_mode, AX_REG);
10521 return ret;
10524 static rtx
10525 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10526 const_tree valtype)
10528 unsigned int regno = AX_REG;
10530 if (TARGET_SSE)
10532 switch (GET_MODE_SIZE (mode))
10534 case 16:
10535 if (valtype != NULL_TREE
10536 && !VECTOR_INTEGER_TYPE_P (valtype)
10538 && !INTEGRAL_TYPE_P (valtype)
10539 && !VECTOR_FLOAT_TYPE_P (valtype))
10540 break;
10541 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10542 && !COMPLEX_MODE_P (mode))
10543 regno = FIRST_SSE_REG;
10544 break;
10545 case 8:
10546 case 4:
10547 if (mode == SFmode || mode == DFmode)
10548 regno = FIRST_SSE_REG;
10549 break;
10550 default:
10551 break;
10554 return gen_rtx_REG (orig_mode, regno);
10557 static rtx
10558 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10559 machine_mode orig_mode, machine_mode mode)
10561 const_tree fn, fntype;
10563 fn = NULL_TREE;
10564 if (fntype_or_decl && DECL_P (fntype_or_decl))
10565 fn = fntype_or_decl;
10566 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
10568 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
10569 || POINTER_BOUNDS_MODE_P (mode))
10570 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
10571 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
10572 return function_value_ms_64 (orig_mode, mode, valtype);
10573 else if (TARGET_64BIT)
10574 return function_value_64 (orig_mode, mode, valtype);
10575 else
10576 return function_value_32 (orig_mode, mode, fntype, fn);
10579 static rtx
10580 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
10582 machine_mode mode, orig_mode;
10584 orig_mode = TYPE_MODE (valtype);
10585 mode = type_natural_mode (valtype, NULL, true);
10586 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
10589 /* Return an RTX representing a place where a function returns
10590 or receives pointer bounds or NULL if no bounds are returned.
10592 VALTYPE is a data type of a value returned by the function.
10594 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
10595 or FUNCTION_TYPE of the function.
10597 If OUTGOING is false, return a place in which the caller will
10598 see the return value. Otherwise, return a place where a
10599 function returns a value. */
10601 static rtx
10602 ix86_function_value_bounds (const_tree valtype,
10603 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
10604 bool outgoing ATTRIBUTE_UNUSED)
10606 rtx res = NULL_RTX;
10608 if (BOUNDED_TYPE_P (valtype))
10609 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
10610 else if (chkp_type_has_pointer (valtype))
10612 bitmap slots;
10613 rtx bounds[2];
10614 bitmap_iterator bi;
10615 unsigned i, bnd_no = 0;
10617 bitmap_obstack_initialize (NULL);
10618 slots = BITMAP_ALLOC (NULL);
10619 chkp_find_bound_slots (valtype, slots);
10621 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
10623 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
10624 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
10625 gcc_assert (bnd_no < 2);
10626 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
10629 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
10631 BITMAP_FREE (slots);
10632 bitmap_obstack_release (NULL);
10634 else
10635 res = NULL_RTX;
10637 return res;
10640 /* Pointer function arguments and return values are promoted to
10641 word_mode for normal functions. */
10643 static machine_mode
10644 ix86_promote_function_mode (const_tree type, machine_mode mode,
10645 int *punsignedp, const_tree fntype,
10646 int for_return)
10648 if (cfun->machine->func_type == TYPE_NORMAL
10649 && type != NULL_TREE
10650 && POINTER_TYPE_P (type))
10652 *punsignedp = POINTERS_EXTEND_UNSIGNED;
10653 return word_mode;
10655 return default_promote_function_mode (type, mode, punsignedp, fntype,
10656 for_return);
10659 /* Return true if a structure, union or array with MODE containing FIELD
10660 should be accessed using BLKmode. */
10662 static bool
10663 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
10665 /* Union with XFmode must be in BLKmode. */
10666 return (mode == XFmode
10667 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
10668 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
10672 ix86_libcall_value (machine_mode mode)
10674 return ix86_function_value_1 (NULL, NULL, mode, mode);
10677 /* Return true iff type is returned in memory. */
10679 static bool
10680 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
10682 #ifdef SUBTARGET_RETURN_IN_MEMORY
10683 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
10684 #else
10685 const machine_mode mode = type_natural_mode (type, NULL, true);
10686 HOST_WIDE_INT size;
10688 if (POINTER_BOUNDS_TYPE_P (type))
10689 return false;
10691 if (TARGET_64BIT)
10693 if (ix86_function_type_abi (fntype) == MS_ABI)
10695 size = int_size_in_bytes (type);
10697 /* __m128 is returned in xmm0. */
10698 if ((!type || VECTOR_INTEGER_TYPE_P (type)
10699 || INTEGRAL_TYPE_P (type)
10700 || VECTOR_FLOAT_TYPE_P (type))
10701 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10702 && !COMPLEX_MODE_P (mode)
10703 && (GET_MODE_SIZE (mode) == 16 || size == 16))
10704 return false;
10706 /* Otherwise, the size must be exactly 1, 2, 4 or 8. */
10707 return size != 1 && size != 2 && size != 4 && size != 8;
10709 else
10711 int needed_intregs, needed_sseregs;
10713 return examine_argument (mode, type, 1,
10714 &needed_intregs, &needed_sseregs);
10717 else
10719 size = int_size_in_bytes (type);
10721 /* Intel MCU psABI returns scalars and aggregates no larger than 8
10722 bytes in registers. */
10723 if (TARGET_IAMCU)
10724 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
10726 if (mode == BLKmode)
10727 return true;
10729 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
10730 return false;
10732 if (VECTOR_MODE_P (mode) || mode == TImode)
10734 /* User-created vectors small enough to fit in EAX. */
10735 if (size < 8)
10736 return false;
10738 /* Unless the ABI prescribes otherwise,
10739 MMX/3dNow values are returned in MM0 if available. */
10741 if (size == 8)
10742 return TARGET_VECT8_RETURNS || !TARGET_MMX;
10744 /* SSE values are returned in XMM0 if available. */
10745 if (size == 16)
10746 return !TARGET_SSE;
10748 /* AVX values are returned in YMM0 if available. */
10749 if (size == 32)
10750 return !TARGET_AVX;
10752 /* AVX512F values are returned in ZMM0 if available. */
10753 if (size == 64)
10754 return !TARGET_AVX512F;
10757 if (mode == XFmode)
10758 return false;
10760 if (size > 12)
10761 return true;
10763 /* OImode shouldn't be used directly. */
10764 gcc_assert (mode != OImode);
10766 return false;
10768 #endif
10772 /* Create the va_list data type. */
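/* For reference, a sketch of what the record built below amounts to on
   the 64-bit SysV ABI (field names match the FIELD_DECLs created here):

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];

   gp_offset and fp_offset are byte offsets into reg_save_area for the
   next unconsumed GP and SSE argument register; overflow_arg_area points
   at the stack-passed arguments.  */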
10774 static tree
10775 ix86_build_builtin_va_list_64 (void)
10777 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
10779 record = lang_hooks.types.make_type (RECORD_TYPE);
10780 type_decl = build_decl (BUILTINS_LOCATION,
10781 TYPE_DECL, get_identifier ("__va_list_tag"), record);
10783 f_gpr = build_decl (BUILTINS_LOCATION,
10784 FIELD_DECL, get_identifier ("gp_offset"),
10785 unsigned_type_node);
10786 f_fpr = build_decl (BUILTINS_LOCATION,
10787 FIELD_DECL, get_identifier ("fp_offset"),
10788 unsigned_type_node);
10789 f_ovf = build_decl (BUILTINS_LOCATION,
10790 FIELD_DECL, get_identifier ("overflow_arg_area"),
10791 ptr_type_node);
10792 f_sav = build_decl (BUILTINS_LOCATION,
10793 FIELD_DECL, get_identifier ("reg_save_area"),
10794 ptr_type_node);
10796 va_list_gpr_counter_field = f_gpr;
10797 va_list_fpr_counter_field = f_fpr;
10799 DECL_FIELD_CONTEXT (f_gpr) = record;
10800 DECL_FIELD_CONTEXT (f_fpr) = record;
10801 DECL_FIELD_CONTEXT (f_ovf) = record;
10802 DECL_FIELD_CONTEXT (f_sav) = record;
10804 TYPE_STUB_DECL (record) = type_decl;
10805 TYPE_NAME (record) = type_decl;
10806 TYPE_FIELDS (record) = f_gpr;
10807 DECL_CHAIN (f_gpr) = f_fpr;
10808 DECL_CHAIN (f_fpr) = f_ovf;
10809 DECL_CHAIN (f_ovf) = f_sav;
10811 layout_type (record);
10813 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
10814 NULL_TREE, TYPE_ATTRIBUTES (record));
10816 /* The correct type is an array type of one element. */
10817 return build_array_type (record, build_index_type (size_zero_node));
10820 /* Set up the builtin va_list data type and, for 64-bit, the additional
10821 calling convention specific va_list data types. */
10823 static tree
10824 ix86_build_builtin_va_list (void)
10826 if (TARGET_64BIT)
10828 /* Initialize ABI specific va_list builtin types.
10830 In lto1, we can encounter two va_list types:
10831 - one as a result of the type-merge across TUs, and
10832 - the one constructed here.
10833 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
10834 a type identity check in canonical_va_list_type based on
10835 TYPE_MAIN_VARIANT (which we used to have) will not work.
10836 Instead, we tag each va_list_type_node with its unique attribute, and
10837 look for the attribute in the type identity check in
10838 canonical_va_list_type.
10840 Tagging sysv_va_list_type_node directly with the attribute is
10841 problematic since it's an array of one record, which will degrade into a
10842 pointer to record when used as parameter (see build_va_arg comments for
10843 an example), dropping the attribute in the process. So we tag the
10844 record instead. */
10846 /* For SYSV_ABI we use an array of one record. */
10847 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
10849 /* For MS_ABI we use plain pointer to argument area. */
10850 tree char_ptr_type = build_pointer_type (char_type_node);
10851 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
10852 TYPE_ATTRIBUTES (char_ptr_type));
10853 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
10855 return ((ix86_abi == MS_ABI)
10856 ? ms_va_list_type_node
10857 : sysv_va_list_type_node);
10859 else
10861 /* For i386 we use plain pointer to argument area. */
10862 return build_pointer_type (char_type_node);
10866 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
10868 static void
10869 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
10871 rtx save_area, mem;
10872 alias_set_type set;
10873 int i, max;
10875 /* GPR size of varargs save area. */
10876 if (cfun->va_list_gpr_size)
10877 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
10878 else
10879 ix86_varargs_gpr_size = 0;
10881 /* FPR size of varargs save area. We don't need it if we don't pass
10882 anything in SSE registers. */
10883 if (TARGET_SSE && cfun->va_list_fpr_size)
10884 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
10885 else
10886 ix86_varargs_fpr_size = 0;
10888 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
10889 return;
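/* A sketch of the save area filled in below, addressed relative to
   save_area (the frame pointer), assuming both parts are needed:

     [ X86_64_REGPARM_MAX (6) GP slots, UNITS_PER_WORD bytes each ]
     [ X86_64_SSE_REGPARM_MAX (8) SSE slots, 16 bytes each        ]

   gp_offset/fp_offset in the va_list index into this area.  Only the
   registers from cum->regno / cum->sse_regno onwards can still carry
   unnamed arguments, so only those are actually stored.  */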
10891 save_area = frame_pointer_rtx;
10892 set = get_varargs_alias_set ();
10894 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
10895 if (max > X86_64_REGPARM_MAX)
10896 max = X86_64_REGPARM_MAX;
10898 for (i = cum->regno; i < max; i++)
10900 mem = gen_rtx_MEM (word_mode,
10901 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
10902 MEM_NOTRAP_P (mem) = 1;
10903 set_mem_alias_set (mem, set);
10904 emit_move_insn (mem,
10905 gen_rtx_REG (word_mode,
10906 x86_64_int_parameter_registers[i]));
10909 if (ix86_varargs_fpr_size)
10911 machine_mode smode;
10912 rtx_code_label *label;
10913 rtx test;
10915 /* Now emit code to save SSE registers. The AX parameter contains number
10916 of SSE parameter registers used to call this function, though all we
10917 actually check here is the zero/non-zero status. */
10919 label = gen_label_rtx ();
10920 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
10921 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
10922 label));
10924 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
10925 we used movdqa (i.e. TImode) instead? Perhaps even better would
10926 be if we could determine the real mode of the data, via a hook
10927 into pass_stdarg. Ignore all that for now. */
10928 smode = V4SFmode;
10929 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
10930 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
10932 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
10933 if (max > X86_64_SSE_REGPARM_MAX)
10934 max = X86_64_SSE_REGPARM_MAX;
10936 for (i = cum->sse_regno; i < max; ++i)
10938 mem = plus_constant (Pmode, save_area,
10939 i * 16 + ix86_varargs_gpr_size);
10940 mem = gen_rtx_MEM (smode, mem);
10941 MEM_NOTRAP_P (mem) = 1;
10942 set_mem_alias_set (mem, set);
10943 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
10945 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
10948 emit_label (label);
10952 static void
10953 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
10955 alias_set_type set = get_varargs_alias_set ();
10956 int i;
10958 /* Reset to zero, as there might be a sysv va_arg used
10959 before. */
10960 ix86_varargs_gpr_size = 0;
10961 ix86_varargs_fpr_size = 0;
10963 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
10965 rtx reg, mem;
10967 mem = gen_rtx_MEM (Pmode,
10968 plus_constant (Pmode, virtual_incoming_args_rtx,
10969 i * UNITS_PER_WORD));
10970 MEM_NOTRAP_P (mem) = 1;
10971 set_mem_alias_set (mem, set);
10973 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
10974 emit_move_insn (mem, reg);
10978 static void
10979 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10980 tree type, int *, int no_rtl)
10982 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10983 CUMULATIVE_ARGS next_cum;
10984 tree fntype;
10986 /* This argument doesn't appear to be used anymore. Which is good,
10987 because the old code here didn't suppress rtl generation. */
10988 gcc_assert (!no_rtl);
10990 if (!TARGET_64BIT)
10991 return;
10993 fntype = TREE_TYPE (current_function_decl);
10995 /* For varargs, we do not want to skip the dummy va_dcl argument.
10996 For stdargs, we do want to skip the last named argument. */
10997 next_cum = *cum;
10998 if (stdarg_p (fntype))
10999 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11000 true);
11002 if (cum->call_abi == MS_ABI)
11003 setup_incoming_varargs_ms_64 (&next_cum);
11004 else
11005 setup_incoming_varargs_64 (&next_cum);
11008 static void
11009 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
11010 enum machine_mode mode,
11011 tree type,
11012 int *pretend_size ATTRIBUTE_UNUSED,
11013 int no_rtl)
11015 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11016 CUMULATIVE_ARGS next_cum;
11017 tree fntype;
11018 rtx save_area;
11019 int bnd_reg, i, max;
11021 gcc_assert (!no_rtl);
11023 /* Do nothing if we use plain pointer to argument area. */
11024 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
11025 return;
11027 fntype = TREE_TYPE (current_function_decl);
11029 /* For varargs, we do not want to skip the dummy va_dcl argument.
11030 For stdargs, we do want to skip the last named argument. */
11031 next_cum = *cum;
11032 if (stdarg_p (fntype))
11033 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11034 true);
11035 save_area = frame_pointer_rtx;
11037 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11038 if (max > X86_64_REGPARM_MAX)
11039 max = X86_64_REGPARM_MAX;
11041 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
11042 if (chkp_function_instrumented_p (current_function_decl))
11043 for (i = cum->regno; i < max; i++)
11045 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
11046 rtx ptr = gen_rtx_REG (Pmode,
11047 x86_64_int_parameter_registers[i]);
11048 rtx bounds;
11050 if (bnd_reg <= LAST_BND_REG)
11051 bounds = gen_rtx_REG (BNDmode, bnd_reg);
11052 else
11054 rtx ldx_addr =
11055 plus_constant (Pmode, arg_pointer_rtx,
11056 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
11057 bounds = gen_reg_rtx (BNDmode);
11058 emit_insn (BNDmode == BND64mode
11059 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
11060 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
11063 emit_insn (BNDmode == BND64mode
11064 ? gen_bnd64_stx (addr, ptr, bounds)
11065 : gen_bnd32_stx (addr, ptr, bounds));
11067 bnd_reg++;
11072 /* Checks if TYPE is of kind va_list char *. */
11074 static bool
11075 is_va_list_char_pointer (tree type)
11077 tree canonic;
11079 /* For 32-bit it is always true. */
11080 if (!TARGET_64BIT)
11081 return true;
11082 canonic = ix86_canonical_va_list_type (type);
11083 return (canonic == ms_va_list_type_node
11084 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
11087 /* Implement va_start. */
11089 static void
11090 ix86_va_start (tree valist, rtx nextarg)
11092 HOST_WIDE_INT words, n_gpr, n_fpr;
11093 tree f_gpr, f_fpr, f_ovf, f_sav;
11094 tree gpr, fpr, ovf, sav, t;
11095 tree type;
11096 rtx ovf_rtx;
11098 if (flag_split_stack
11099 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11101 unsigned int scratch_regno;
11103 /* When we are splitting the stack, we can't refer to the stack
11104 arguments using internal_arg_pointer, because they may be on
11105 the old stack. The split stack prologue will arrange to
11106 leave a pointer to the old stack arguments in a scratch
11107 register, which we here copy to a pseudo-register. The split
11108 stack prologue can't set the pseudo-register directly because
11109 it (the prologue) runs before any registers have been saved. */
11111 scratch_regno = split_stack_prologue_scratch_regno ();
11112 if (scratch_regno != INVALID_REGNUM)
11114 rtx reg;
11115 rtx_insn *seq;
11117 reg = gen_reg_rtx (Pmode);
11118 cfun->machine->split_stack_varargs_pointer = reg;
11120 start_sequence ();
11121 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
11122 seq = get_insns ();
11123 end_sequence ();
11125 push_topmost_sequence ();
11126 emit_insn_after (seq, entry_of_function ());
11127 pop_topmost_sequence ();
11131 /* Only 64bit target needs something special. */
11132 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11134 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11135 std_expand_builtin_va_start (valist, nextarg);
11136 else
11138 rtx va_r, next;
11140 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
11141 next = expand_binop (ptr_mode, add_optab,
11142 cfun->machine->split_stack_varargs_pointer,
11143 crtl->args.arg_offset_rtx,
11144 NULL_RTX, 0, OPTAB_LIB_WIDEN);
11145 convert_move (va_r, next, 0);
11147 /* Store zero bounds for va_list. */
11148 if (chkp_function_instrumented_p (current_function_decl))
11149 chkp_expand_bounds_reset_for_mem (valist,
11150 make_tree (TREE_TYPE (valist),
11151 next));
11154 return;
11157 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11158 f_fpr = DECL_CHAIN (f_gpr);
11159 f_ovf = DECL_CHAIN (f_fpr);
11160 f_sav = DECL_CHAIN (f_ovf);
11162 valist = build_simple_mem_ref (valist);
11163 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
11164 /* The following should be folded into the MEM_REF offset. */
11165 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11166 f_gpr, NULL_TREE);
11167 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11168 f_fpr, NULL_TREE);
11169 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11170 f_ovf, NULL_TREE);
11171 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11172 f_sav, NULL_TREE);
11174 /* Count number of gp and fp argument registers used. */
11175 words = crtl->args.info.words;
11176 n_gpr = crtl->args.info.regno;
11177 n_fpr = crtl->args.info.sse_regno;
11179 if (cfun->va_list_gpr_size)
11181 type = TREE_TYPE (gpr);
11182 t = build2 (MODIFY_EXPR, type,
11183 gpr, build_int_cst (type, n_gpr * 8));
11184 TREE_SIDE_EFFECTS (t) = 1;
11185 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
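/* fp_offset is also measured from reg_save_area; the 16-byte SSE slots
   sit just after the 8 * X86_64_REGPARM_MAX bytes of GP slots, hence
   the bias added below.  */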
11188 if (TARGET_SSE && cfun->va_list_fpr_size)
11190 type = TREE_TYPE (fpr);
11191 t = build2 (MODIFY_EXPR, type, fpr,
11192 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11193 TREE_SIDE_EFFECTS (t) = 1;
11194 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11197 /* Find the overflow area. */
11198 type = TREE_TYPE (ovf);
11199 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11200 ovf_rtx = crtl->args.internal_arg_pointer;
11201 else
11202 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11203 t = make_tree (type, ovf_rtx);
11204 if (words != 0)
11205 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11207 /* Store zero bounds for overflow area pointer. */
11208 if (chkp_function_instrumented_p (current_function_decl))
11209 chkp_expand_bounds_reset_for_mem (ovf, t);
11211 t = build2 (MODIFY_EXPR, type, ovf, t);
11212 TREE_SIDE_EFFECTS (t) = 1;
11213 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11215 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11217 /* Find the register save area.
11218 The prologue of the function saves it right above the stack frame. */
11219 type = TREE_TYPE (sav);
11220 t = make_tree (type, frame_pointer_rtx);
11221 if (!ix86_varargs_gpr_size)
11222 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11224 /* Store zero bounds for save area pointer. */
11225 if (chkp_function_instrumented_p (current_function_decl))
11226 chkp_expand_bounds_reset_for_mem (sav, t);
11228 t = build2 (MODIFY_EXPR, type, sav, t);
11229 TREE_SIDE_EFFECTS (t) = 1;
11230 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11234 /* Implement va_arg. */
11236 static tree
11237 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11238 gimple_seq *post_p)
11240 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11241 tree f_gpr, f_fpr, f_ovf, f_sav;
11242 tree gpr, fpr, ovf, sav, t;
11243 int size, rsize;
11244 tree lab_false, lab_over = NULL_TREE;
11245 tree addr, t2;
11246 rtx container;
11247 int indirect_p = 0;
11248 tree ptrtype;
11249 machine_mode nat_mode;
11250 unsigned int arg_boundary;
11252 /* Only 64bit target needs something special. */
11253 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11254 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11256 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11257 f_fpr = DECL_CHAIN (f_gpr);
11258 f_ovf = DECL_CHAIN (f_fpr);
11259 f_sav = DECL_CHAIN (f_ovf);
11261 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11262 valist, f_gpr, NULL_TREE);
11264 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11265 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11266 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11268 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11269 if (indirect_p)
11270 type = build_pointer_type (type);
11271 size = int_size_in_bytes (type);
11272 rsize = CEIL (size, UNITS_PER_WORD);
11274 nat_mode = type_natural_mode (type, NULL, false);
11275 switch (nat_mode)
11277 case V8SFmode:
11278 case V8SImode:
11279 case V32QImode:
11280 case V16HImode:
11281 case V4DFmode:
11282 case V4DImode:
11283 case V16SFmode:
11284 case V16SImode:
11285 case V64QImode:
11286 case V32HImode:
11287 case V8DFmode:
11288 case V8DImode:
11289 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
11290 if (!TARGET_64BIT_MS_ABI)
11292 container = NULL;
11293 break;
11295 /* FALLTHRU */
11297 default:
11298 container = construct_container (nat_mode, TYPE_MODE (type),
11299 type, 0, X86_64_REGPARM_MAX,
11300 X86_64_SSE_REGPARM_MAX, intreg,
11302 break;
11305 /* Pull the value out of the saved registers. */
11307 addr = create_tmp_var (ptr_type_node, "addr");
11309 if (container)
11311 int needed_intregs, needed_sseregs;
11312 bool need_temp;
11313 tree int_addr, sse_addr;
11315 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11316 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11318 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11320 need_temp = (!REG_P (container)
11321 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11322 || TYPE_ALIGN (type) > 128));
11324 /* In case we are passing a structure, verify that it is a consecutive block
11325 on the register save area. If not, we need to do moves. */
11326 if (!need_temp && !REG_P (container))
11328 /* Verify that all registers are strictly consecutive. */
11329 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11331 int i;
11333 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11335 rtx slot = XVECEXP (container, 0, i);
11336 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11337 || INTVAL (XEXP (slot, 1)) != i * 16)
11338 need_temp = true;
11341 else
11343 int i;
11345 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11347 rtx slot = XVECEXP (container, 0, i);
11348 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11349 || INTVAL (XEXP (slot, 1)) != i * 8)
11350 need_temp = true;
11354 if (!need_temp)
11356 int_addr = addr;
11357 sse_addr = addr;
11359 else
11361 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11362 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11365 /* First ensure that we fit completely in registers. */
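/* For example: gp_offset grows in steps of 8 up to X86_64_REGPARM_MAX * 8
   == 48.  With needed_intregs == 2 the test below fires once gp_offset
   >= (6 - 2 + 1) * 8 == 40, i.e. when fewer than two GP registers remain,
   and we jump to lab_false to take the overflow (stack) path instead.
   The fpr test works the same way with 16-byte SSE slots, biased by the
   48 bytes of GP slots.  */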
11366 if (needed_intregs)
11368 t = build_int_cst (TREE_TYPE (gpr),
11369 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11370 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11371 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11372 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11373 gimplify_and_add (t, pre_p);
11375 if (needed_sseregs)
11377 t = build_int_cst (TREE_TYPE (fpr),
11378 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11379 + X86_64_REGPARM_MAX * 8);
11380 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11381 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11382 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11383 gimplify_and_add (t, pre_p);
11386 /* Compute index to start of area used for integer regs. */
11387 if (needed_intregs)
11389 /* int_addr = gpr + sav; */
11390 t = fold_build_pointer_plus (sav, gpr);
11391 gimplify_assign (int_addr, t, pre_p);
11393 if (needed_sseregs)
11395 /* sse_addr = fpr + sav; */
11396 t = fold_build_pointer_plus (sav, fpr);
11397 gimplify_assign (sse_addr, t, pre_p);
11399 if (need_temp)
11401 int i, prev_size = 0;
11402 tree temp = create_tmp_var (type, "va_arg_tmp");
11404 /* addr = &temp; */
11405 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11406 gimplify_assign (addr, t, pre_p);
11408 for (i = 0; i < XVECLEN (container, 0); i++)
11410 rtx slot = XVECEXP (container, 0, i);
11411 rtx reg = XEXP (slot, 0);
11412 machine_mode mode = GET_MODE (reg);
11413 tree piece_type;
11414 tree addr_type;
11415 tree daddr_type;
11416 tree src_addr, src;
11417 int src_offset;
11418 tree dest_addr, dest;
11419 int cur_size = GET_MODE_SIZE (mode);
11421 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11422 prev_size = INTVAL (XEXP (slot, 1));
11423 if (prev_size + cur_size > size)
11425 cur_size = size - prev_size;
11426 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11427 if (mode == BLKmode)
11428 mode = QImode;
11430 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11431 if (mode == GET_MODE (reg))
11432 addr_type = build_pointer_type (piece_type);
11433 else
11434 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11435 true);
11436 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11437 true);
11439 if (SSE_REGNO_P (REGNO (reg)))
11441 src_addr = sse_addr;
11442 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11444 else
11446 src_addr = int_addr;
11447 src_offset = REGNO (reg) * 8;
11449 src_addr = fold_convert (addr_type, src_addr);
11450 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11452 dest_addr = fold_convert (daddr_type, addr);
11453 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11454 if (cur_size == GET_MODE_SIZE (mode))
11456 src = build_va_arg_indirect_ref (src_addr);
11457 dest = build_va_arg_indirect_ref (dest_addr);
11459 gimplify_assign (dest, src, pre_p);
11461 else
11463 tree copy
11464 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11465 3, dest_addr, src_addr,
11466 size_int (cur_size));
11467 gimplify_and_add (copy, pre_p);
11469 prev_size += cur_size;
11473 if (needed_intregs)
11475 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11476 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11477 gimplify_assign (gpr, t, pre_p);
11480 if (needed_sseregs)
11482 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11483 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11484 gimplify_assign (unshare_expr (fpr), t, pre_p);
11487 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11489 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11492 /* ... otherwise out of the overflow area. */
11494 /* When we align a parameter on the stack for the caller, if its
11495 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
11496 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Here we match the callee
11497 with the caller. */
11498 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11499 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11500 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11502 /* Care for on-stack alignment if needed. */
11503 if (arg_boundary <= 64 || size == 0)
11504 t = ovf;
11505 else
11507 HOST_WIDE_INT align = arg_boundary / 8;
11508 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11509 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11510 build_int_cst (TREE_TYPE (t), -align));
11513 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11514 gimplify_assign (addr, t, pre_p);
11516 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11517 gimplify_assign (unshare_expr (ovf), t, pre_p);
11519 if (container)
11520 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11522 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11523 addr = fold_convert (ptrtype, addr);
11525 if (indirect_p)
11526 addr = build_va_arg_indirect_ref (addr);
11527 return build_va_arg_indirect_ref (addr);
11530 /* Return true if OPNUM's MEM should be matched
11531 in movabs* patterns. */
11533 bool
11534 ix86_check_movabs (rtx insn, int opnum)
11536 rtx set, mem;
11538 set = PATTERN (insn);
11539 if (GET_CODE (set) == PARALLEL)
11540 set = XVECEXP (set, 0, 0);
11541 gcc_assert (GET_CODE (set) == SET);
11542 mem = XEXP (set, opnum);
11543 while (SUBREG_P (mem))
11544 mem = SUBREG_REG (mem);
11545 gcc_assert (MEM_P (mem));
11546 return volatile_ok || !MEM_VOLATILE_P (mem);
11549 /* Return false if INSN contains a MEM with a non-default address space. */
11550 bool
11551 ix86_check_no_addr_space (rtx insn)
11553 subrtx_var_iterator::array_type array;
11554 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11556 rtx x = *iter;
11557 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11558 return false;
11560 return true;
11563 /* Initialize the table of extra 80387 mathematical constants. */
11565 static void
11566 init_ext_80387_constants (void)
11568 static const char * cst[5] =
11570 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
11571 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
11572 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
11573 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
11574 "3.1415926535897932385128089594061862044", /* 4: fldpi */
11576 int i;
11578 for (i = 0; i < 5; i++)
11580 real_from_string (&ext_80387_constants_table[i], cst[i]);
11581 /* Ensure each constant is rounded to XFmode precision. */
11582 real_convert (&ext_80387_constants_table[i],
11583 XFmode, &ext_80387_constants_table[i]);
11586 ext_80387_constants_init = 1;
11589 /* Return non-zero if the constant is something that
11590 can be loaded with a special instruction. */
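/* The returned value selects the instruction (sequence), as consumed by
   standard_80387_constant_opcode below: -1/0 mean no special instruction,
   1 fldz, 2 fld1, 3..7 the fldlg2/fldln2/fldl2e/fldl2t/fldpi constants
   from ext_80387_constants_table, and 8/9 the -0.0/-1.0 loads that are
   later split into fldz;fchs and fld1;fchs.  */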
11593 standard_80387_constant_p (rtx x)
11595 machine_mode mode = GET_MODE (x);
11597 const REAL_VALUE_TYPE *r;
11599 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
11600 return -1;
11602 if (x == CONST0_RTX (mode))
11603 return 1;
11604 if (x == CONST1_RTX (mode))
11605 return 2;
11607 r = CONST_DOUBLE_REAL_VALUE (x);
11609 /* For XFmode constants, try to find a special 80387 instruction when
11610 optimizing for size or on those CPUs that benefit from them. */
11611 if (mode == XFmode
11612 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
11614 int i;
11616 if (! ext_80387_constants_init)
11617 init_ext_80387_constants ();
11619 for (i = 0; i < 5; i++)
11620 if (real_identical (r, &ext_80387_constants_table[i]))
11621 return i + 3;
11624 /* A load of the constant -0.0 or -1.0 will be split into an
11625 fldz;fchs or fld1;fchs sequence. */
11626 if (real_isnegzero (r))
11627 return 8;
11628 if (real_identical (r, &dconstm1))
11629 return 9;
11631 return 0;
11634 /* Return the opcode of the special instruction to be used to load
11635 the constant X. */
11637 const char *
11638 standard_80387_constant_opcode (rtx x)
11640 switch (standard_80387_constant_p (x))
11642 case 1:
11643 return "fldz";
11644 case 2:
11645 return "fld1";
11646 case 3:
11647 return "fldlg2";
11648 case 4:
11649 return "fldln2";
11650 case 5:
11651 return "fldl2e";
11652 case 6:
11653 return "fldl2t";
11654 case 7:
11655 return "fldpi";
11656 case 8:
11657 case 9:
11658 return "#";
11659 default:
11660 gcc_unreachable ();
11664 /* Return the CONST_DOUBLE representing the 80387 constant that is
11665 loaded by the specified special instruction. The argument IDX
11666 matches the return value from standard_80387_constant_p. */
11669 standard_80387_constant_rtx (int idx)
11671 int i;
11673 if (! ext_80387_constants_init)
11674 init_ext_80387_constants ();
11676 switch (idx)
11678 case 3:
11679 case 4:
11680 case 5:
11681 case 6:
11682 case 7:
11683 i = idx - 3;
11684 break;
11686 default:
11687 gcc_unreachable ();
11690 return const_double_from_real_value (ext_80387_constants_table[i],
11691 XFmode);
11694 /* Return 1 if X is all bits 0 and 2 if X is all bits 1
11695 in supported SSE/AVX vector mode. */
11698 standard_sse_constant_p (rtx x, machine_mode pred_mode)
11700 machine_mode mode;
11702 if (!TARGET_SSE)
11703 return 0;
11705 mode = GET_MODE (x);
11707 if (x == const0_rtx || const0_operand (x, mode))
11708 return 1;
11710 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11712 /* VOIDmode integer constant, get mode from the predicate. */
11713 if (mode == VOIDmode)
11714 mode = pred_mode;
11716 switch (GET_MODE_SIZE (mode))
11718 case 64:
11719 if (TARGET_AVX512F)
11720 return 2;
11721 break;
11722 case 32:
11723 if (TARGET_AVX2)
11724 return 2;
11725 break;
11726 case 16:
11727 if (TARGET_SSE2)
11728 return 2;
11729 break;
11730 case 0:
11731 /* VOIDmode */
11732 gcc_unreachable ();
11733 default:
11734 break;
11738 return 0;
11741 /* Return the opcode of the special instruction to be used to load
11742 the constant X. */
11744 const char *
11745 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
11747 machine_mode mode;
11749 gcc_assert (TARGET_SSE);
11751 mode = GET_MODE (x);
11753 if (x == const0_rtx || const0_operand (x, mode))
11755 switch (get_attr_mode (insn))
11757 case MODE_XI:
11758 return "vpxord\t%g0, %g0, %g0";
11759 case MODE_OI:
11760 return (TARGET_AVX512VL
11761 ? "vpxord\t%x0, %x0, %x0"
11762 : "vpxor\t%x0, %x0, %x0");
11763 case MODE_TI:
11764 return (TARGET_AVX512VL
11765 ? "vpxord\t%t0, %t0, %t0"
11766 : "%vpxor\t%0, %d0");
11768 case MODE_V8DF:
11769 return (TARGET_AVX512DQ
11770 ? "vxorpd\t%g0, %g0, %g0"
11771 : "vpxorq\t%g0, %g0, %g0");
11772 case MODE_V4DF:
11773 return "vxorpd\t%x0, %x0, %x0";
11774 case MODE_V2DF:
11775 return "%vxorpd\t%0, %d0";
11777 case MODE_V16SF:
11778 return (TARGET_AVX512DQ
11779 ? "vxorps\t%g0, %g0, %g0"
11780 : "vpxord\t%g0, %g0, %g0");
11781 case MODE_V8SF:
11782 return "vxorps\t%x0, %x0, %x0";
11783 case MODE_V4SF:
11784 return "%vxorps\t%0, %d0";
11786 default:
11787 gcc_unreachable ();
11790 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11792 enum attr_mode insn_mode = get_attr_mode (insn);
11794 switch (insn_mode)
11796 case MODE_XI:
11797 case MODE_V8DF:
11798 case MODE_V16SF:
11799 gcc_assert (TARGET_AVX512F);
11800 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
11802 case MODE_OI:
11803 case MODE_V4DF:
11804 case MODE_V8SF:
11805 gcc_assert (TARGET_AVX2);
11806 /* FALLTHRU */
11807 case MODE_TI:
11808 case MODE_V2DF:
11809 case MODE_V4SF:
11810 gcc_assert (TARGET_SSE2);
11811 return (TARGET_AVX
11812 ? "vpcmpeqd\t%0, %0, %0"
11813 : "pcmpeqd\t%0, %0");
11815 default:
11816 gcc_unreachable ();
11820 gcc_unreachable ();
11823 /* Returns true if INSN can be transformed from a memory load
11824 to a supported FP constant load. */
11826 bool
11827 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
11829 rtx src = find_constant_src (insn);
11831 gcc_assert (REG_P (dst));
11833 if (src == NULL
11834 || (SSE_REGNO_P (REGNO (dst))
11835 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
11836 || (STACK_REGNO_P (REGNO (dst))
11837 && standard_80387_constant_p (src) < 1))
11838 return false;
11840 return true;
11843 /* Returns true if OP contains a symbol reference. */
11845 bool
11846 symbolic_reference_mentioned_p (rtx op)
11848 const char *fmt;
11849 int i;
11851 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
11852 return true;
11854 fmt = GET_RTX_FORMAT (GET_CODE (op));
11855 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
11857 if (fmt[i] == 'E')
11859 int j;
11861 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
11862 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
11863 return true;
11866 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
11867 return true;
11870 return false;
11873 /* Return true if it is appropriate to emit `ret' instructions in the
11874 body of a function. Do this only if the epilogue is simple, needing a
11875 couple of insns. Prior to reloading, we can't tell how many registers
11876 must be saved, so return false then. Return false if there is no frame
11877 marker to de-allocate. */
11879 bool
11880 ix86_can_use_return_insn_p (void)
11882 struct ix86_frame frame;
11884 /* Don't use `ret' instruction in interrupt handler. */
11885 if (! reload_completed
11886 || frame_pointer_needed
11887 || cfun->machine->func_type != TYPE_NORMAL)
11888 return 0;
11890 /* Don't allow more than 32k pop, since that's all we can do
11891 with one instruction. */
11892 if (crtl->args.pops_args && crtl->args.size >= 32768)
11893 return 0;
11895 ix86_compute_frame_layout (&frame);
11896 return (frame.stack_pointer_offset == UNITS_PER_WORD
11897 && (frame.nregs + frame.nsseregs) == 0);
11900 /* Value should be nonzero if functions must have frame pointers.
11901 Zero means the frame pointer need not be set up (and parms may
11902 be accessed via the stack pointer) in functions that seem suitable. */
11904 static bool
11905 ix86_frame_pointer_required (void)
11907 /* If we accessed previous frames, then the generated code expects
11908 to be able to access the saved ebp value in our frame. */
11909 if (cfun->machine->accesses_prev_frame)
11910 return true;
11912 /* Several x86 OSes need a frame pointer for other reasons,
11913 usually pertaining to setjmp. */
11914 if (SUBTARGET_FRAME_POINTER_REQUIRED)
11915 return true;
11917 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
11918 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
11919 return true;
11921 /* Win64 SEH: very large frames need a frame pointer, as the maximum stack
11922 allocation is 4GB. */
11923 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
11924 return true;
11926 /* SSE saves require a frame pointer when the stack is misaligned. */
11927 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
11928 return true;
11930 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
11931 turns off the frame pointer by default. Turn it back on now if
11932 we've not got a leaf function. */
11933 if (TARGET_OMIT_LEAF_FRAME_POINTER
11934 && (!crtl->is_leaf
11935 || ix86_current_function_calls_tls_descriptor))
11936 return true;
11938 if (crtl->profile && !flag_fentry)
11939 return true;
11941 return false;
11944 /* Record that the current function accesses previous call frames. */
11946 void
11947 ix86_setup_frame_addresses (void)
11949 cfun->machine->accesses_prev_frame = 1;
11952 #ifndef USE_HIDDEN_LINKONCE
11953 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
11954 # define USE_HIDDEN_LINKONCE 1
11955 # else
11956 # define USE_HIDDEN_LINKONCE 0
11957 # endif
11958 #endif
11960 static int pic_labels_used;
11962 /* Fills in the label name that should be used for a pc thunk for
11963 the given register. */
11965 static void
11966 get_pc_thunk_name (char name[32], unsigned int regno)
11968 gcc_assert (!TARGET_64BIT);
11970 if (USE_HIDDEN_LINKONCE)
11971 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11972 else
11973 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
11977 /* This function generates, for -fpic, the pc thunks that load the
11978 selected register with the return address of the caller and then return. */
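/* For instance, the thunk emitted for %ebx is essentially:
     __x86.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
   */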
11980 static void
11981 ix86_code_end (void)
11983 rtx xops[2];
11984 int regno;
11986 for (regno = AX_REG; regno <= SP_REG; regno++)
11988 char name[32];
11989 tree decl;
11991 if (!(pic_labels_used & (1 << regno)))
11992 continue;
11994 get_pc_thunk_name (name, regno);
11996 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11997 get_identifier (name),
11998 build_function_type_list (void_type_node, NULL_TREE));
11999 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
12000 NULL_TREE, void_type_node);
12001 TREE_PUBLIC (decl) = 1;
12002 TREE_STATIC (decl) = 1;
12003 DECL_IGNORED_P (decl) = 1;
12005 #if TARGET_MACHO
12006 if (TARGET_MACHO)
12008 switch_to_section (darwin_sections[picbase_thunk_section]);
12009 fputs ("\t.weak_definition\t", asm_out_file);
12010 assemble_name (asm_out_file, name);
12011 fputs ("\n\t.private_extern\t", asm_out_file);
12012 assemble_name (asm_out_file, name);
12013 putc ('\n', asm_out_file);
12014 ASM_OUTPUT_LABEL (asm_out_file, name);
12015 DECL_WEAK (decl) = 1;
12017 else
12018 #endif
12019 if (USE_HIDDEN_LINKONCE)
12021 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
12023 targetm.asm_out.unique_section (decl, 0);
12024 switch_to_section (get_named_section (decl, NULL, 0));
12026 targetm.asm_out.globalize_label (asm_out_file, name);
12027 fputs ("\t.hidden\t", asm_out_file);
12028 assemble_name (asm_out_file, name);
12029 putc ('\n', asm_out_file);
12030 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
12032 else
12034 switch_to_section (text_section);
12035 ASM_OUTPUT_LABEL (asm_out_file, name);
12038 DECL_INITIAL (decl) = make_node (BLOCK);
12039 current_function_decl = decl;
12040 allocate_struct_function (decl, false);
12041 init_function_start (decl);
12042 /* We're about to hide the function body from callees of final_* by
12043 emitting it directly; tell them we're a thunk, if they care. */
12044 cfun->is_thunk = true;
12045 first_function_block_is_cold = false;
12046 /* Make sure unwind info is emitted for the thunk if needed. */
12047 final_start_function (emit_barrier (), asm_out_file, 1);
12049 /* Pad stack IP move with 4 instructions (two NOPs count
12050 as one instruction). */
12051 if (TARGET_PAD_SHORT_FUNCTION)
12053 int i = 8;
12055 while (i--)
12056 fputs ("\tnop\n", asm_out_file);
12059 xops[0] = gen_rtx_REG (Pmode, regno);
12060 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
12061 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
12062 output_asm_insn ("%!ret", NULL);
12063 final_end_function ();
12064 init_insn_lengths ();
12065 free_after_compilation (cfun);
12066 set_cfun (NULL);
12067 current_function_decl = NULL;
12070 if (flag_split_stack)
12071 file_end_indicate_split_stack ();
12074 /* Emit code for the SET_GOT patterns. */
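/* In the common (non-VxWorks, non-Darwin) -fpic case the sequence emitted
   for a GOT pointer in %ebx is roughly:

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
   */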
12076 const char *
12077 output_set_got (rtx dest, rtx label)
12079 rtx xops[3];
12081 xops[0] = dest;
12083 if (TARGET_VXWORKS_RTP && flag_pic)
12085 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
12086 xops[2] = gen_rtx_MEM (Pmode,
12087 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
12088 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
12090 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
12091 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
12092 an unadorned address. */
12093 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
12094 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
12095 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
12096 return "";
12099 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
12101 if (flag_pic)
12103 char name[32];
12104 get_pc_thunk_name (name, REGNO (dest));
12105 pic_labels_used |= 1 << REGNO (dest);
12107 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
12108 xops[2] = gen_rtx_MEM (QImode, xops[2]);
12109 output_asm_insn ("%!call\t%X2", xops);
12111 #if TARGET_MACHO
12112 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
12113 This is what will be referenced by the Mach-O PIC subsystem. */
12114 if (machopic_should_output_picbase_label () || !label)
12115 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
12117 /* When we are restoring the pic base at the site of a nonlocal label,
12118 and we decided to emit the pic base above, we will still output a
12119 local label used for calculating the correction offset (even though
12120 the offset will be 0 in that case). */
12121 if (label)
12122 targetm.asm_out.internal_label (asm_out_file, "L",
12123 CODE_LABEL_NUMBER (label));
12124 #endif
12126 else
12128 if (TARGET_MACHO)
12129 /* We don't need a pic base, we're not producing pic. */
12130 gcc_unreachable ();
12132 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
12133 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
12134 targetm.asm_out.internal_label (asm_out_file, "L",
12135 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
12138 if (!TARGET_MACHO)
12139 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
12141 return "";
12144 /* Generate a "push" pattern for input ARG. */
12146 static rtx
12147 gen_push (rtx arg)
12149 struct machine_function *m = cfun->machine;
12151 if (m->fs.cfa_reg == stack_pointer_rtx)
12152 m->fs.cfa_offset += UNITS_PER_WORD;
12153 m->fs.sp_offset += UNITS_PER_WORD;
12155 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12156 arg = gen_rtx_REG (word_mode, REGNO (arg));
12158 return gen_rtx_SET (gen_rtx_MEM (word_mode,
12159 gen_rtx_PRE_DEC (Pmode,
12160 stack_pointer_rtx)),
12161 arg);
12164 /* Generate a "pop" pattern for input ARG. */
12166 static rtx
12167 gen_pop (rtx arg)
12169 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12170 arg = gen_rtx_REG (word_mode, REGNO (arg));
12172 return gen_rtx_SET (arg,
12173 gen_rtx_MEM (word_mode,
12174 gen_rtx_POST_INC (Pmode,
12175 stack_pointer_rtx)));
12178 /* Return >= 0 if there is an unused call-clobbered register available
12179 for the entire function. */
12181 static unsigned int
12182 ix86_select_alt_pic_regnum (void)
12184 if (ix86_use_pseudo_pic_reg ())
12185 return INVALID_REGNUM;
12187 if (crtl->is_leaf
12188 && !crtl->profile
12189 && !ix86_current_function_calls_tls_descriptor)
12191 int i, drap;
12192 /* Can't use the same register for both PIC and DRAP. */
12193 if (crtl->drap_reg)
12194 drap = REGNO (crtl->drap_reg);
12195 else
12196 drap = -1;
12197 for (i = 2; i >= 0; --i)
12198 if (i != drap && !df_regs_ever_live_p (i))
12199 return i;
12202 return INVALID_REGNUM;
12205 /* Return true if REGNO is used by the epilogue. */
12207 bool
12208 ix86_epilogue_uses (int regno)
12210 /* If there are no caller-saved registers, we preserve all registers,
12211 except for MMX and x87 registers which aren't supported when saving
12212 and restoring registers. Don't explicitly save SP register since
12213 it is always preserved. */
12214 return (epilogue_completed
12215 && cfun->machine->no_caller_saved_registers
12216 && !fixed_regs[regno]
12217 && !STACK_REGNO_P (regno)
12218 && !MMX_REGNO_P (regno));
12221 /* Return nonzero if register REGNO can be used as a scratch register
12222 in peephole2. */
12224 static bool
12225 ix86_hard_regno_scratch_ok (unsigned int regno)
12227 /* If there are no caller-saved registers, we can't use any register
12228 as a scratch register after the epilogue, and can use REGNO as a scratch
12229 register only if it has been used before, to avoid saving and
12230 restoring it.
12231 return (!cfun->machine->no_caller_saved_registers
12232 || (!epilogue_completed
12233 && df_regs_ever_live_p (regno)));
12236 /* Return true if register class CL should be an additional allocno
12237 class. */
12239 static bool
12240 ix86_additional_allocno_class_p (reg_class_t cl)
12242 return cl == MOD4_SSE_REGS;
12245 /* Return TRUE if we need to save REGNO. */
12247 static bool
12248 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
12250 /* If there are no caller-saved registers, we preserve all registers,
12251 except for MMX and x87 registers which aren't supported when saving
12252 and restoring registers. Don't explicitly save SP register since
12253 it is always preserved. */
12254 if (cfun->machine->no_caller_saved_registers)
12256 /* Don't preserve registers used for function return value. */
12257 rtx reg = crtl->return_rtx;
12258 if (reg)
12260 unsigned int i = REGNO (reg);
12261 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12262 while (nregs-- > 0)
12263 if ((i + nregs) == regno)
12264 return false;
12266 reg = crtl->return_bnd;
12267 if (reg)
12269 i = REGNO (reg);
12270 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12271 while (nregs-- > 0)
12272 if ((i + nregs) == regno)
12273 return false;
12277 return (df_regs_ever_live_p (regno)
12278 && !fixed_regs[regno]
12279 && !STACK_REGNO_P (regno)
12280 && !MMX_REGNO_P (regno)
12281 && (regno != HARD_FRAME_POINTER_REGNUM
12282 || !frame_pointer_needed));
12285 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12286 && pic_offset_table_rtx)
12288 if (ix86_use_pseudo_pic_reg ())
12290 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12291 _mcount in prologue. */
12292 if (!TARGET_64BIT && flag_pic && crtl->profile)
12293 return true;
12295 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12296 || crtl->profile
12297 || crtl->calls_eh_return
12298 || crtl->uses_const_pool
12299 || cfun->has_nonlocal_label)
12300 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12303 if (crtl->calls_eh_return && maybe_eh_return)
12305 unsigned i;
12306 for (i = 0; ; i++)
12308 unsigned test = EH_RETURN_DATA_REGNO (i);
12309 if (test == INVALID_REGNUM)
12310 break;
12311 if (test == regno)
12312 return true;
12316 if (crtl->drap_reg
12317 && regno == REGNO (crtl->drap_reg)
12318 && !cfun->machine->no_drap_save_restore)
12319 return true;
12321 return (df_regs_ever_live_p (regno)
12322 && !call_used_regs[regno]
12323 && !fixed_regs[regno]
12324 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12327 /* Return number of saved general purpose registers. */
12329 static int
12330 ix86_nsaved_regs (void)
12332 int nregs = 0;
12333 int regno;
12335 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12336 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12337 nregs ++;
12338 return nregs;
12341 /* Return number of saved SSE registers. */
12343 static int
12344 ix86_nsaved_sseregs (void)
12346 int nregs = 0;
12347 int regno;
12349 if (!TARGET_64BIT_MS_ABI)
12350 return 0;
12351 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12352 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12353 nregs ++;
12354 return nregs;
12357 /* Given FROM and TO register numbers, say whether this elimination is
12358 allowed. If stack alignment is needed, we can only replace argument
12359 pointer with hard frame pointer, or replace frame pointer with stack
12360 pointer. Otherwise, frame pointer elimination is automatically
12361 handled and all other eliminations are valid. */
12363 static bool
12364 ix86_can_eliminate (const int from, const int to)
12366 if (stack_realign_fp)
12367 return ((from == ARG_POINTER_REGNUM
12368 && to == HARD_FRAME_POINTER_REGNUM)
12369 || (from == FRAME_POINTER_REGNUM
12370 && to == STACK_POINTER_REGNUM));
12371 else
12372 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12375 /* Return the offset between two registers, one to be eliminated, and the other
12376 its replacement, at the start of a routine. */
12378 HOST_WIDE_INT
12379 ix86_initial_elimination_offset (int from, int to)
12381 struct ix86_frame frame;
12382 ix86_compute_frame_layout (&frame);
12384 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12385 return frame.hard_frame_pointer_offset;
12386 else if (from == FRAME_POINTER_REGNUM
12387 && to == HARD_FRAME_POINTER_REGNUM)
12388 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12389 else
12391 gcc_assert (to == STACK_POINTER_REGNUM);
12393 if (from == ARG_POINTER_REGNUM)
12394 return frame.stack_pointer_offset;
12396 gcc_assert (from == FRAME_POINTER_REGNUM);
12397 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12401 /* In a dynamically-aligned function, we can't know the offset from
12402 stack pointer to frame pointer, so we must ensure that setjmp
12403 eliminates fp against the hard fp (%ebp) rather than trying to
12404 index from %esp up to the top of the frame across a gap that is
12405 of unknown (at compile-time) size. */
12406 static rtx
12407 ix86_builtin_setjmp_frame_value (void)
12409 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12412 /* When using -fsplit-stack, the allocation routines set a field in
12413 the TCB to the bottom of the stack plus this much space, measured
12414 in bytes. */
12416 #define SPLIT_STACK_AVAILABLE 256
12418 /* Fill in the ix86_frame structure describing the frame of the current function. */
12420 static void
12421 ix86_compute_frame_layout (struct ix86_frame *frame)
12423 unsigned HOST_WIDE_INT stack_alignment_needed;
12424 HOST_WIDE_INT offset;
12425 unsigned HOST_WIDE_INT preferred_alignment;
12426 HOST_WIDE_INT size = get_frame_size ();
12427 HOST_WIDE_INT to_allocate;
12429 frame->nregs = ix86_nsaved_regs ();
12430 frame->nsseregs = ix86_nsaved_sseregs ();
12432 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
12433 except for function prologues, leaf functions and when the default
12434 incoming stack boundary is overridden at the command line or via the
12435 force_align_arg_pointer attribute. */
12436 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12437 && (!crtl->is_leaf || cfun->calls_alloca != 0
12438 || ix86_current_function_calls_tls_descriptor
12439 || ix86_incoming_stack_boundary < 128))
12441 crtl->preferred_stack_boundary = 128;
12442 crtl->stack_alignment_needed = 128;
12445 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12446 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12448 gcc_assert (!size || stack_alignment_needed);
12449 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12450 gcc_assert (preferred_alignment <= stack_alignment_needed);
12452 /* For SEH we have to limit the amount of code movement into the prologue.
12453 At present we do this via a BLOCKAGE, at which point there's very little
12454 scheduling that can be done, which means that there's very little point
12455 in doing anything except PUSHs. */
12456 if (TARGET_SEH)
12457 cfun->machine->use_fast_prologue_epilogue = false;
12459 /* During reload iteration the number of registers saved can change.
12460 Recompute the value as needed. Do not recompute when the number of registers
12461 didn't change, as reload does multiple calls to the function and does not
12462 expect the decision to change within a single iteration.
12463 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))
12464 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
12466 int count = frame->nregs;
12467 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12469 cfun->machine->use_fast_prologue_epilogue_nregs = count;
12471 /* The fast prologue uses move instead of push to save registers. This
12472 is significantly longer, but also executes faster as modern hardware
12473 can execute the moves in parallel, but can't do that for push/pop.
12475 Be careful about choosing what prologue to emit: when the function takes
12476 many instructions to execute, we may use the slow version, as well as
12477 when the function is known to be outside a hot spot (this is known with
12478 feedback only). Weight the size of the function by the number of registers
12479 to save, as it is cheap to use one or two push instructions but very
12480 slow to use many of them. */
12481 if (count)
12482 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12483 if (node->frequency < NODE_FREQUENCY_NORMAL
12484 || (flag_branch_probabilities
12485 && node->frequency < NODE_FREQUENCY_HOT))
12486 cfun->machine->use_fast_prologue_epilogue = false;
12487 else
12488 cfun->machine->use_fast_prologue_epilogue
12489 = !expensive_function_p (count);
12492 frame->save_regs_using_mov
12493 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
12494 /* If static stack checking is enabled and done with probes,
12495 the registers need to be saved before allocating the frame. */
12496 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
12498 /* Skip return address. */
12499 offset = UNITS_PER_WORD;
12501 /* Skip pushed static chain. */
12502 if (ix86_static_chain_on_stack)
12503 offset += UNITS_PER_WORD;
12505 /* Skip saved base pointer. */
12506 if (frame_pointer_needed)
12507 offset += UNITS_PER_WORD;
12508 frame->hfp_save_offset = offset;
12510 /* The traditional frame pointer location is at the top of the frame. */
12511 frame->hard_frame_pointer_offset = offset;
12513 /* Register save area */
12514 offset += frame->nregs * UNITS_PER_WORD;
12515 frame->reg_save_offset = offset;
12517 /* On SEH target, registers are pushed just before the frame pointer
12518 location. */
12519 if (TARGET_SEH)
12520 frame->hard_frame_pointer_offset = offset;
12522 /* Align and set SSE register save area. */
12523 if (frame->nsseregs)
12525 /* The only ABI that has saved SSE registers (Win64) also has a
12526 16-byte aligned default stack, and thus we don't need to be
12527 within the re-aligned local stack frame to save them. In case
12528 the incoming stack boundary is aligned to less than 16 bytes,
12529 an unaligned move of the SSE registers will be emitted, so there is
12530 no point in rounding up the SSE register save area outside the
12531 re-aligned local stack frame to 16 bytes. */
12532 if (ix86_incoming_stack_boundary >= 128)
12533 offset = ROUND_UP (offset, 16);
12534 offset += frame->nsseregs * 16;
12536 frame->sse_reg_save_offset = offset;
12538 /* The re-aligned stack starts here. Values before this point are not
12539 directly comparable with values below this point. In order to make
12540 sure that no value happens to be the same before and after, force
12541 the alignment computation below to add a non-zero value. */
12542 if (stack_realign_fp)
12543 offset = ROUND_UP (offset, stack_alignment_needed);
12545 /* Va-arg area */
12546 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
12547 offset += frame->va_arg_size;
12549 /* Align start of frame for local function. */
12550 if (stack_realign_fp
12551 || offset != frame->sse_reg_save_offset
12552 || size != 0
12553 || !crtl->is_leaf
12554 || cfun->calls_alloca
12555 || ix86_current_function_calls_tls_descriptor)
12556 offset = ROUND_UP (offset, stack_alignment_needed);
12558 /* Frame pointer points here. */
12559 frame->frame_pointer_offset = offset;
12561 offset += size;
12563 /* Add the outgoing arguments area. It can be skipped if we eliminated
12564 all the function calls as dead code.
12565 Skipping is however impossible when the function calls alloca, as the
12566 alloca expander assumes that the last crtl->outgoing_args_size bytes
12567 of the stack frame are unused. */
12568 if (ACCUMULATE_OUTGOING_ARGS
12569 && (!crtl->is_leaf || cfun->calls_alloca
12570 || ix86_current_function_calls_tls_descriptor))
12572 offset += crtl->outgoing_args_size;
12573 frame->outgoing_arguments_size = crtl->outgoing_args_size;
12575 else
12576 frame->outgoing_arguments_size = 0;
12578 /* Align stack boundary. Only needed if we're calling another function
12579 or using alloca. */
12580 if (!crtl->is_leaf || cfun->calls_alloca
12581 || ix86_current_function_calls_tls_descriptor)
12582 offset = ROUND_UP (offset, preferred_alignment);
12584 /* We've reached end of stack frame. */
12585 frame->stack_pointer_offset = offset;
12587 /* Size prologue needs to allocate. */
12588 to_allocate = offset - frame->sse_reg_save_offset;
12590 if ((!to_allocate && frame->nregs <= 1)
12591 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
12592 frame->save_regs_using_mov = false;
12594 if (ix86_using_red_zone ()
12595 && crtl->sp_is_unchanging
12596 && crtl->is_leaf
12597 && !ix86_pc_thunk_call_expanded
12598 && !ix86_current_function_calls_tls_descriptor)
12600 frame->red_zone_size = to_allocate;
12601 if (frame->save_regs_using_mov)
12602 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
12603 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
12604 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
12606 else
12607 frame->red_zone_size = 0;
12608 frame->stack_pointer_offset -= frame->red_zone_size;
12610 /* The SEH frame pointer location is near the bottom of the frame.
12611 This is enforced by the fact that the difference between the
12612 stack pointer and the frame pointer is limited to 240 bytes in
12613 the unwind data structure. */
12614 if (TARGET_SEH)
12616 HOST_WIDE_INT diff;
12618 /* If we can leave the frame pointer where it is, do so. Also, this
12619 returns the establisher frame for __builtin_frame_address (0). */
12620 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
12621 if (diff <= SEH_MAX_FRAME_SIZE
12622 && (diff > 240 || (diff & 15) != 0)
12623 && !crtl->accesses_prior_frames)
12625 /* Ideally we'd determine what portion of the local stack frame
12626 (within the constraint of the lowest 240) is most heavily used.
12627 But without that complication, simply bias the frame pointer
12628 by 128 bytes so as to maximize the amount of the local stack
12629 frame that is addressable with 8-bit offsets. */
12630 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
12635 /* This is semi-inlined memory_address_length, but simplified
12636 since we know that we're always dealing with reg+offset, and
12637 to avoid having to create and discard all that rtl. */
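/* For illustration (derived from the logic below, not exhaustive):
	(%eax)      -> 0   no displacement byte needed
	(%ebp)      -> 1   EBP/R13 always need a displacement
	16(%esp)    -> 2   disp8 plus the SIB byte required for ESP/R12
	1024(%ecx)  -> 4   disp32  */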
12639 static inline int
12640 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
12642 int len = 4;
12644 if (offset == 0)
12646 /* EBP and R13 cannot be encoded without an offset. */
12647 len = (regno == BP_REG || regno == R13_REG);
12649 else if (IN_RANGE (offset, -128, 127))
12650 len = 1;
12652 /* ESP and R12 must be encoded with a SIB byte. */
12653 if (regno == SP_REG || regno == R12_REG)
12654 len++;
12656 return len;
12659 /* Return an RTX that points to CFA_OFFSET within the stack frame.
12660 The valid base registers are taken from CFUN->MACHINE->FS. */
12662 static rtx
12663 choose_baseaddr (HOST_WIDE_INT cfa_offset)
12665 const struct machine_function *m = cfun->machine;
12666 rtx base_reg = NULL;
12667 HOST_WIDE_INT base_offset = 0;
12669 if (m->use_fast_prologue_epilogue)
12671 /* Choose the base register most likely to allow the most scheduling
12672 opportunities. Generally FP is valid throughout the function,
12673 while DRAP must be reloaded within the epilogue. But choose either
12674 over the SP due to increased encoding size. */
12676 if (m->fs.fp_valid)
12678 base_reg = hard_frame_pointer_rtx;
12679 base_offset = m->fs.fp_offset - cfa_offset;
12681 else if (m->fs.drap_valid)
12683 base_reg = crtl->drap_reg;
12684 base_offset = 0 - cfa_offset;
12686 else if (m->fs.sp_valid)
12688 base_reg = stack_pointer_rtx;
12689 base_offset = m->fs.sp_offset - cfa_offset;
12692 else
12694 HOST_WIDE_INT toffset;
12695 int len = 16, tlen;
12697 /* Choose the base register with the smallest address encoding.
12698 With a tie, choose FP > DRAP > SP. */
12699 if (m->fs.sp_valid)
12701 base_reg = stack_pointer_rtx;
12702 base_offset = m->fs.sp_offset - cfa_offset;
12703 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
12705 if (m->fs.drap_valid)
12707 toffset = 0 - cfa_offset;
12708 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
12709 if (tlen <= len)
12711 base_reg = crtl->drap_reg;
12712 base_offset = toffset;
12713 len = tlen;
12716 if (m->fs.fp_valid)
12718 toffset = m->fs.fp_offset - cfa_offset;
12719 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
12720 if (tlen <= len)
12722 base_reg = hard_frame_pointer_rtx;
12723 base_offset = toffset;
12724 len = tlen;
12728 gcc_assert (base_reg != NULL);
12730 return plus_constant (Pmode, base_reg, base_offset);
12733 /* Emit code to save registers in the prologue. */
12735 static void
12736 ix86_emit_save_regs (void)
12738 unsigned int regno;
12739 rtx_insn *insn;
12741 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12742 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12744 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12745 RTX_FRAME_RELATED_P (insn) = 1;
12749 /* Emit a single register save at CFA - CFA_OFFSET. */
12751 static void
12752 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12753 HOST_WIDE_INT cfa_offset)
12755 struct machine_function *m = cfun->machine;
12756 rtx reg = gen_rtx_REG (mode, regno);
12757 rtx mem, addr, base, insn;
12758 unsigned int align;
12760 addr = choose_baseaddr (cfa_offset);
12761 mem = gen_frame_mem (mode, addr);
12763 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
12764 align = MIN (GET_MODE_ALIGNMENT (mode), INCOMING_STACK_BOUNDARY);
12765 set_mem_align (mem, align);
12767 insn = emit_insn (gen_rtx_SET (mem, reg));
12768 RTX_FRAME_RELATED_P (insn) = 1;
12770 base = addr;
12771 if (GET_CODE (base) == PLUS)
12772 base = XEXP (base, 0);
12773 gcc_checking_assert (REG_P (base));
12775 /* When saving registers into a re-aligned local stack frame, avoid
12776 any tricky guessing by dwarf2out. */
12777 if (m->fs.realigned)
12779 gcc_checking_assert (stack_realign_drap);
12781 if (regno == REGNO (crtl->drap_reg))
12783 /* A bit of a hack. We force the DRAP register to be saved in
12784 the re-aligned stack frame, which provides us with a copy
12785 of the CFA that will last past the prologue. Install it. */
12786 gcc_checking_assert (cfun->machine->fs.fp_valid);
12787 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12788 cfun->machine->fs.fp_offset - cfa_offset);
12789 mem = gen_rtx_MEM (mode, addr);
12790 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12792 else
12794 /* The frame pointer is a stable reference within the
12795 aligned frame. Use it. */
12796 gcc_checking_assert (cfun->machine->fs.fp_valid);
12797 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12798 cfun->machine->fs.fp_offset - cfa_offset);
12799 mem = gen_rtx_MEM (mode, addr);
12800 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12804 /* The memory may not be relative to the current CFA register,
12805 which means that we may need to generate a new pattern for
12806 use by the unwind info. */
12807 else if (base != m->fs.cfa_reg)
12809 addr = plus_constant (Pmode, m->fs.cfa_reg,
12810 m->fs.cfa_offset - cfa_offset);
12811 mem = gen_rtx_MEM (mode, addr);
12812 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12816 /* Emit code to save registers using MOV insns.
12817 First register is stored at CFA - CFA_OFFSET. */
12818 static void
12819 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12821 unsigned int regno;
12823 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12824 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12826 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12827 cfa_offset -= UNITS_PER_WORD;
12831 /* Emit code to save SSE registers using MOV insns.
12832 First register is stored at CFA - CFA_OFFSET. */
12833 static void
12834 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12836 unsigned int regno;
12838 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12839 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12841 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12842 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12846 static GTY(()) rtx queued_cfa_restores;
12848 /* Add a REG_CFA_RESTORE REG note to INSN or queue it until the next stack
12849 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
12850 Don't add the note if the previously saved value will be left untouched
12851 within the stack red zone until return, as unwinders can find the same value
12852 in the register and on the stack. */
12854 static void
12855 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12857 if (!crtl->shrink_wrapped
12858 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12859 return;
12861 if (insn)
12863 add_reg_note (insn, REG_CFA_RESTORE, reg);
12864 RTX_FRAME_RELATED_P (insn) = 1;
12866 else
12867 queued_cfa_restores
12868 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12871 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12873 static void
12874 ix86_add_queued_cfa_restore_notes (rtx insn)
12876 rtx last;
12877 if (!queued_cfa_restores)
12878 return;
12879 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12881 XEXP (last, 1) = REG_NOTES (insn);
12882 REG_NOTES (insn) = queued_cfa_restores;
12883 queued_cfa_restores = NULL_RTX;
12884 RTX_FRAME_RELATED_P (insn) = 1;
12887 /* Expand prologue or epilogue stack adjustment.
12888 The pattern exists to put a dependency on all ebp-based memory accesses.
12889 STYLE should be negative if instructions should be marked as frame related,
12890 zero if %r11 register is live and cannot be freely used and positive
12891 otherwise. */
12893 static void
12894 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12895 int style, bool set_cfa)
12897 struct machine_function *m = cfun->machine;
12898 rtx insn;
12899 bool add_frame_related_expr = false;
12901 if (Pmode == SImode)
12902 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12903 else if (x86_64_immediate_operand (offset, DImode))
12904 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12905 else
12907 rtx tmp;
12908 /* r11 is used by indirect sibcall return as well, set before the
12909 epilogue and used after the epilogue. */
12910 if (style)
12911 tmp = gen_rtx_REG (DImode, R11_REG);
12912 else
12914 gcc_assert (src != hard_frame_pointer_rtx
12915 && dest != hard_frame_pointer_rtx);
12916 tmp = hard_frame_pointer_rtx;
12918 insn = emit_insn (gen_rtx_SET (tmp, offset));
12919 if (style < 0)
12920 add_frame_related_expr = true;
12922 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12925 insn = emit_insn (insn);
12926 if (style >= 0)
12927 ix86_add_queued_cfa_restore_notes (insn);
12929 if (set_cfa)
12931 rtx r;
12933 gcc_assert (m->fs.cfa_reg == src);
12934 m->fs.cfa_offset += INTVAL (offset);
12935 m->fs.cfa_reg = dest;
12937 r = gen_rtx_PLUS (Pmode, src, offset);
12938 r = gen_rtx_SET (dest, r);
12939 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12940 RTX_FRAME_RELATED_P (insn) = 1;
12942 else if (style < 0)
12944 RTX_FRAME_RELATED_P (insn) = 1;
12945 if (add_frame_related_expr)
12947 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12948 r = gen_rtx_SET (dest, r);
12949 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12953 if (dest == stack_pointer_rtx)
12955 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12956 bool valid = m->fs.sp_valid;
12958 if (src == hard_frame_pointer_rtx)
12960 valid = m->fs.fp_valid;
12961 ooffset = m->fs.fp_offset;
12963 else if (src == crtl->drap_reg)
12965 valid = m->fs.drap_valid;
12966 ooffset = 0;
12968 else
12970 /* Else there are two possibilities: SP itself, which we set
12971 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
12972 taken care of by hand along the eh_return path. */
12973 gcc_checking_assert (src == stack_pointer_rtx
12974 || offset == const0_rtx);
12977 m->fs.sp_offset = ooffset - INTVAL (offset);
12978 m->fs.sp_valid = valid;
12982 /* Find an available register to be used as the dynamic realign argument
12983 pointer register. Such a register will be written in the prologue and
12984 used at the beginning of the body, so it must not be
12985 1. a parameter passing register.
12986 2. the GOT pointer.
12987 We reuse the static-chain register if it is available. Otherwise, we
12988 use DI for i386 and R13 for x86-64. We chose R13 since it has a
12989 shorter encoding.
12991 Return: the regno of the chosen register. */
12993 static unsigned int
12994 find_drap_reg (void)
12996 tree decl = cfun->decl;
12998 /* Always use callee-saved register if there are no caller-saved
12999 registers. */
13000 if (TARGET_64BIT)
13002 /* Use R13 for a nested function or a function that needs a static
13003 chain. Since a function with a tail call may use any caller-saved
13004 register in the epilogue, DRAP must not use a caller-saved
13005 register in such a case. */
13006 if (DECL_STATIC_CHAIN (decl)
13007 || cfun->machine->no_caller_saved_registers
13008 || crtl->tail_call_emit)
13009 return R13_REG;
13011 return R10_REG;
13013 else
13015 /* Use DI for a nested function or a function that needs a static
13016 chain. Since a function with a tail call may use any caller-saved
13017 register in the epilogue, DRAP must not use a caller-saved
13018 register in such a case. */
13019 if (DECL_STATIC_CHAIN (decl)
13020 || cfun->machine->no_caller_saved_registers
13021 || crtl->tail_call_emit)
13022 return DI_REG;
13024 /* Reuse static chain register if it isn't used for parameter
13025 passing. */
13026 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
13028 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
13029 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
13030 return CX_REG;
13032 return DI_REG;
13036 /* Handle a "force_align_arg_pointer" attribute. */
13038 static tree
13039 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
13040 tree, int, bool *no_add_attrs)
13042 if (TREE_CODE (*node) != FUNCTION_TYPE
13043 && TREE_CODE (*node) != METHOD_TYPE
13044 && TREE_CODE (*node) != FIELD_DECL
13045 && TREE_CODE (*node) != TYPE_DECL)
13047 warning (OPT_Wattributes, "%qE attribute only applies to functions",
13048 name);
13049 *no_add_attrs = true;
13052 return NULL_TREE;
13055 /* Return minimum incoming stack alignment. */
13057 static unsigned int
13058 ix86_minimum_incoming_stack_boundary (bool sibcall)
13060 unsigned int incoming_stack_boundary;
13062 /* The stack of an interrupt handler is aligned to 128 bits in 64-bit
13063 mode. */
13064 if (cfun->machine->func_type != TYPE_NORMAL)
13065 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
13066 /* Prefer the one specified at command line. */
13067 else if (ix86_user_incoming_stack_boundary)
13068 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
13069 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
13070 if -mstackrealign is used, this isn't a sibcall check and the
13071 estimated stack alignment is 128 bits. */
13072 else if (!sibcall
13073 && ix86_force_align_arg_pointer
13074 && crtl->stack_alignment_estimated == 128)
13075 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13076 else
13077 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
13079 /* Incoming stack alignment can be changed on individual functions
13080 via force_align_arg_pointer attribute. We use the smallest
13081 incoming stack boundary. */
13082 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
13083 && lookup_attribute (ix86_force_align_arg_pointer_string,
13084 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
13085 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13087 /* The incoming stack frame has to be aligned at least at
13088 parm_stack_boundary. */
13089 if (incoming_stack_boundary < crtl->parm_stack_boundary)
13090 incoming_stack_boundary = crtl->parm_stack_boundary;
13092 /* The stack at the entry of main is aligned by the runtime. We use the
13093 smallest incoming stack boundary. */
13094 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
13095 && DECL_NAME (current_function_decl)
13096 && MAIN_NAME_P (DECL_NAME (current_function_decl))
13097 && DECL_FILE_SCOPE_P (current_function_decl))
13098 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
13100 return incoming_stack_boundary;
13103 /* Update incoming stack boundary and estimated stack alignment. */
13105 static void
13106 ix86_update_stack_boundary (void)
13108 ix86_incoming_stack_boundary
13109 = ix86_minimum_incoming_stack_boundary (false);
13111 /* x86_64 varargs need 16-byte stack alignment for the register save
13112 area. */
13113 if (TARGET_64BIT
13114 && cfun->stdarg
13115 && crtl->stack_alignment_estimated < 128)
13116 crtl->stack_alignment_estimated = 128;
13118 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
13119 if (ix86_tls_descriptor_calls_expanded_in_cfun
13120 && crtl->preferred_stack_boundary < 128)
13121 crtl->preferred_stack_boundary = 128;
13124 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
13125 needed or an rtx for DRAP otherwise. */
13127 static rtx
13128 ix86_get_drap_rtx (void)
13130 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
13131 crtl->need_drap = true;
13133 if (stack_realign_drap)
13135 /* Assign DRAP to vDRAP and return vDRAP. */
13136 unsigned int regno = find_drap_reg ();
13137 rtx drap_vreg;
13138 rtx arg_ptr;
13139 rtx_insn *seq, *insn;
13141 arg_ptr = gen_rtx_REG (Pmode, regno);
13142 crtl->drap_reg = arg_ptr;
13144 start_sequence ();
13145 drap_vreg = copy_to_reg (arg_ptr);
13146 seq = get_insns ();
13147 end_sequence ();
13149 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
13150 if (!optimize)
13152 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
13153 RTX_FRAME_RELATED_P (insn) = 1;
13155 return drap_vreg;
13157 else
13158 return NULL;
13161 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
13163 static rtx
13164 ix86_internal_arg_pointer (void)
13166 return virtual_incoming_args_rtx;
13169 struct scratch_reg {
13170 rtx reg;
13171 bool saved;
13174 /* Return a short-lived scratch register for use on function entry.
13175 In 32-bit mode, it is valid only after the registers are saved
13176 in the prologue. This register must be released by means of
13177 release_scratch_register_on_entry once it is dead. */
13179 static void
13180 get_scratch_register_on_entry (struct scratch_reg *sr)
13182 int regno;
13184 sr->saved = false;
13186 if (TARGET_64BIT)
13188 /* We always use R11 in 64-bit mode. */
13189 regno = R11_REG;
13191 else
13193 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13194 bool fastcall_p
13195 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13196 bool thiscall_p
13197 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13198 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13199 int regparm = ix86_function_regparm (fntype, decl);
13200 int drap_regno
13201 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13203 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13204 for the static chain register. */
13205 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13206 && drap_regno != AX_REG)
13207 regno = AX_REG;
13208 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13209 for the static chain register. */
13210 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13211 regno = AX_REG;
13212 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13213 regno = DX_REG;
13214 /* ecx is the static chain register. */
13215 else if (regparm < 3 && !fastcall_p && !thiscall_p
13216 && !static_chain_p
13217 && drap_regno != CX_REG)
13218 regno = CX_REG;
13219 else if (ix86_save_reg (BX_REG, true))
13220 regno = BX_REG;
13221 /* esi is the static chain register. */
13222 else if (!(regparm == 3 && static_chain_p)
13223 && ix86_save_reg (SI_REG, true))
13224 regno = SI_REG;
13225 else if (ix86_save_reg (DI_REG, true))
13226 regno = DI_REG;
13227 else
13229 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13230 sr->saved = true;
13234 sr->reg = gen_rtx_REG (Pmode, regno);
13235 if (sr->saved)
13237 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13238 RTX_FRAME_RELATED_P (insn) = 1;
13242 /* Release a scratch register obtained from the preceding function. */
13244 static void
13245 release_scratch_register_on_entry (struct scratch_reg *sr)
13247 if (sr->saved)
13249 struct machine_function *m = cfun->machine;
13250 rtx x, insn = emit_insn (gen_pop (sr->reg));
13252 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13253 RTX_FRAME_RELATED_P (insn) = 1;
13254 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13255 x = gen_rtx_SET (stack_pointer_rtx, x);
13256 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13257 m->fs.sp_offset -= UNITS_PER_WORD;
13261 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
13263 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13265 static void
13266 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13268 /* We skip the probe for the first interval + a small dope of 4 words and
13269 probe that many bytes past the specified size to maintain a protection
13270 area at the bottom of the stack. */
13271 const int dope = 4 * UNITS_PER_WORD;
13272 rtx size_rtx = GEN_INT (size), last;
13274 /* See if we have a constant small number of probes to generate. If so,
13275 that's the easy case. The run-time loop is made up of 9 insns in the
13276 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13277 for n # of intervals. */
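/* For instance, assuming the usual 4 KiB PROBE_INTERVAL, a 10 KiB
   allocation is handled by the short unrolled sequence below, whereas
   a 64 KiB allocation falls through to the run-time loop emitted
   further down.  */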
13278 if (size <= 4 * PROBE_INTERVAL)
13280 HOST_WIDE_INT i, adjust;
13281 bool first_probe = true;
13283 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13284 values of N from 1 until it exceeds SIZE. If only one probe is
13285 needed, this will not generate any code. Then adjust and probe
13286 to PROBE_INTERVAL + SIZE. */
13287 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13289 if (first_probe)
13291 adjust = 2 * PROBE_INTERVAL + dope;
13292 first_probe = false;
13294 else
13295 adjust = PROBE_INTERVAL;
13297 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13298 plus_constant (Pmode, stack_pointer_rtx,
13299 -adjust)));
13300 emit_stack_probe (stack_pointer_rtx);
13303 if (first_probe)
13304 adjust = size + PROBE_INTERVAL + dope;
13305 else
13306 adjust = size + PROBE_INTERVAL - i;
13308 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13309 plus_constant (Pmode, stack_pointer_rtx,
13310 -adjust)));
13311 emit_stack_probe (stack_pointer_rtx);
13313 /* Adjust back to account for the additional first interval. */
13314 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13315 plus_constant (Pmode, stack_pointer_rtx,
13316 PROBE_INTERVAL + dope)));
13319 /* Otherwise, do the same as above, but in a loop. Note that we must be
13320 extra careful with variables wrapping around because we might be at
13321 the very top (or the very bottom) of the address space and we have
13322 to be able to handle this case properly; in particular, we use an
13323 equality test for the loop condition. */
13324 else
13326 HOST_WIDE_INT rounded_size;
13327 struct scratch_reg sr;
13329 get_scratch_register_on_entry (&sr);
13332 /* Step 1: round SIZE to the previous multiple of the interval. */
13334 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13337 /* Step 2: compute initial and final value of the loop counter. */
13339 /* SP = SP_0 + PROBE_INTERVAL. */
13340 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13341 plus_constant (Pmode, stack_pointer_rtx,
13342 - (PROBE_INTERVAL + dope))));
13344 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13345 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13346 emit_insn (gen_rtx_SET (sr.reg,
13347 plus_constant (Pmode, stack_pointer_rtx,
13348 -rounded_size)));
13349 else
13351 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13352 emit_insn (gen_rtx_SET (sr.reg,
13353 gen_rtx_PLUS (Pmode, sr.reg,
13354 stack_pointer_rtx)));
13358 /* Step 3: the loop
13362 SP = SP + PROBE_INTERVAL
13363 probe at SP
13365 while (SP != LAST_ADDR)
13367 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13368 values of N from 1 until it is equal to ROUNDED_SIZE. */
13370 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13373 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13374 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13376 if (size != rounded_size)
13378 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13379 plus_constant (Pmode, stack_pointer_rtx,
13380 rounded_size - size)));
13381 emit_stack_probe (stack_pointer_rtx);
13384 /* Adjust back to account for the additional first interval. */
13385 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13386 plus_constant (Pmode, stack_pointer_rtx,
13387 PROBE_INTERVAL + dope)));
13389 release_scratch_register_on_entry (&sr);
13392 /* Even if the stack pointer isn't the CFA register, we need to correctly
13393 describe the adjustments made to it, in particular differentiate the
13394 frame-related ones from the frame-unrelated ones. */
13395 if (size > 0)
13397 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13398 XVECEXP (expr, 0, 0)
13399 = gen_rtx_SET (stack_pointer_rtx,
13400 plus_constant (Pmode, stack_pointer_rtx, -size));
13401 XVECEXP (expr, 0, 1)
13402 = gen_rtx_SET (stack_pointer_rtx,
13403 plus_constant (Pmode, stack_pointer_rtx,
13404 PROBE_INTERVAL + dope + size));
13405 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13406 RTX_FRAME_RELATED_P (last) = 1;
13408 cfun->machine->fs.sp_offset += size;
13411 /* Make sure nothing is scheduled before we are done. */
13412 emit_insn (gen_blockage ());
13415 /* Adjust the stack pointer up to REG while probing it. */
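/* Roughly, assuming a 4 KiB probe interval and 32-bit AT&T syntax, the
   loop printed below looks like:

	.LPSRL0:
		subl	$4096, %esp
		orl	$0, (%esp)
		cmpl	%<scratch>, %esp
		jne	.LPSRL0

   where the scratch register holds LAST_ADDR.  */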
13417 const char *
13418 output_adjust_stack_and_probe (rtx reg)
13420 static int labelno = 0;
13421 char loop_lab[32];
13422 rtx xops[2];
13424 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13426 /* Loop. */
13427 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13429 /* SP = SP + PROBE_INTERVAL. */
13430 xops[0] = stack_pointer_rtx;
13431 xops[1] = GEN_INT (PROBE_INTERVAL);
13432 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13434 /* Probe at SP. */
13435 xops[1] = const0_rtx;
13436 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13438 /* Test if SP == LAST_ADDR. */
13439 xops[0] = stack_pointer_rtx;
13440 xops[1] = reg;
13441 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13443 /* Branch. */
13444 fputs ("\tjne\t", asm_out_file);
13445 assemble_name_raw (asm_out_file, loop_lab);
13446 fputc ('\n', asm_out_file);
13448 return "";
13451 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13452 inclusive. These are offsets from the current stack pointer. */
13454 static void
13455 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
13457 /* See if we have a constant small number of probes to generate. If so,
13458 that's the easy case. The run-time loop is made up of 6 insns in the
13459 generic case while the compile-time loop is made up of n insns for n #
13460 of intervals. */
13461 if (size <= 6 * PROBE_INTERVAL)
13463 HOST_WIDE_INT i;
13465 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13466 it exceeds SIZE. If only one probe is needed, this will not
13467 generate any code. Then probe at FIRST + SIZE. */
13468 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13469 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13470 -(first + i)));
13472 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13473 -(first + size)));
13476 /* Otherwise, do the same as above, but in a loop. Note that we must be
13477 extra careful with variables wrapping around because we might be at
13478 the very top (or the very bottom) of the address space and we have
13479 to be able to handle this case properly; in particular, we use an
13480 equality test for the loop condition. */
13481 else
13483 HOST_WIDE_INT rounded_size, last;
13484 struct scratch_reg sr;
13486 get_scratch_register_on_entry (&sr);
13489 /* Step 1: round SIZE to the previous multiple of the interval. */
13491 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13494 /* Step 2: compute initial and final value of the loop counter. */
13496 /* TEST_OFFSET = FIRST. */
13497 emit_move_insn (sr.reg, GEN_INT (-first));
13499 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13500 last = first + rounded_size;
13503 /* Step 3: the loop
13507 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13508 probe at TEST_ADDR
13510 while (TEST_ADDR != LAST_ADDR)
13512 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13513 until it is equal to ROUNDED_SIZE. */
13515 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13518 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13519 that SIZE is equal to ROUNDED_SIZE. */
13521 if (size != rounded_size)
13522 emit_stack_probe (plus_constant (Pmode,
13523 gen_rtx_PLUS (Pmode,
13524 stack_pointer_rtx,
13525 sr.reg),
13526 rounded_size - size));
13528 release_scratch_register_on_entry (&sr);
13531 /* Make sure nothing is scheduled before we are done. */
13532 emit_insn (gen_blockage ());
13535 /* Probe a range of stack addresses from REG to END, inclusive. These are
13536 offsets from the current stack pointer. */
13538 const char *
13539 output_probe_stack_range (rtx reg, rtx end)
13541 static int labelno = 0;
13542 char loop_lab[32];
13543 rtx xops[3];
13545 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13547 /* Loop. */
13548 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13550 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13551 xops[0] = reg;
13552 xops[1] = GEN_INT (PROBE_INTERVAL);
13553 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13555 /* Probe at TEST_ADDR. */
13556 xops[0] = stack_pointer_rtx;
13557 xops[1] = reg;
13558 xops[2] = const0_rtx;
13559 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13561 /* Test if TEST_ADDR == LAST_ADDR. */
13562 xops[0] = reg;
13563 xops[1] = end;
13564 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13566 /* Branch. */
13567 fputs ("\tjne\t", asm_out_file);
13568 assemble_name_raw (asm_out_file, loop_lab);
13569 fputc ('\n', asm_out_file);
13571 return "";
13574 /* Finalize the stack_realign_needed flag, which will guide the prologue
13575 and epilogue to be generated in the correct form. */
13576 static void
13577 ix86_finalize_stack_realign_flags (void)
13579 /* Check if stack realignment is really needed after reload, and
13580 store the result in cfun. */
13581 unsigned int incoming_stack_boundary
13582 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13583 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13584 unsigned int stack_realign
13585 = (incoming_stack_boundary
13586 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13587 ? crtl->max_used_stack_slot_alignment
13588 : crtl->stack_alignment_needed));
13590 if (crtl->stack_realign_finalized)
13592 /* After stack_realign_needed is finalized, we can no longer
13593 change it. */
13594 gcc_assert (crtl->stack_realign_needed == stack_realign);
13595 return;
13598 /* If the only reason for frame_pointer_needed is that we conservatively
13599 assumed stack realignment might be needed, but in the end nothing that
13600 needed the stack alignment had been spilled, clear frame_pointer_needed
13601 and say we don't need stack realignment. */
13602 if (stack_realign
13603 && frame_pointer_needed
13604 && crtl->is_leaf
13605 && flag_omit_frame_pointer
13606 && crtl->sp_is_unchanging
13607 && !ix86_current_function_calls_tls_descriptor
13608 && !crtl->accesses_prior_frames
13609 && !cfun->calls_alloca
13610 && !crtl->calls_eh_return
13611 /* See ira_setup_eliminable_regset for the rationale. */
13612 && !(STACK_CHECK_MOVING_SP
13613 && flag_stack_check
13614 && flag_exceptions
13615 && cfun->can_throw_non_call_exceptions)
13616 && !ix86_frame_pointer_required ()
13617 && get_frame_size () == 0
13618 && ix86_nsaved_sseregs () == 0
13619 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13621 HARD_REG_SET set_up_by_prologue, prologue_used;
13622 basic_block bb;
13624 CLEAR_HARD_REG_SET (prologue_used);
13625 CLEAR_HARD_REG_SET (set_up_by_prologue);
13626 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13627 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13628 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13629 HARD_FRAME_POINTER_REGNUM);
13630 FOR_EACH_BB_FN (bb, cfun)
13632 rtx_insn *insn;
13633 FOR_BB_INSNS (bb, insn)
13634 if (NONDEBUG_INSN_P (insn)
13635 && requires_stack_frame_p (insn, prologue_used,
13636 set_up_by_prologue))
13638 crtl->stack_realign_needed = stack_realign;
13639 crtl->stack_realign_finalized = true;
13640 return;
13644 /* If drap has been set, but it actually isn't live at the start
13645 of the function, there is no reason to set it up. */
13646 if (crtl->drap_reg)
13648 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13649 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
13651 crtl->drap_reg = NULL_RTX;
13652 crtl->need_drap = false;
13655 else
13656 cfun->machine->no_drap_save_restore = true;
13658 frame_pointer_needed = false;
13659 stack_realign = false;
13660 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13661 crtl->stack_alignment_needed = incoming_stack_boundary;
13662 crtl->stack_alignment_estimated = incoming_stack_boundary;
13663 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13664 crtl->preferred_stack_boundary = incoming_stack_boundary;
13665 df_finish_pass (true);
13666 df_scan_alloc (NULL);
13667 df_scan_blocks ();
13668 df_compute_regs_ever_live (true);
13669 df_analyze ();
13672 crtl->stack_realign_needed = stack_realign;
13673 crtl->stack_realign_finalized = true;
13676 /* Delete SET_GOT right after entry block if it is allocated to reg. */
13678 static void
13679 ix86_elim_entry_set_got (rtx reg)
13681 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13682 rtx_insn *c_insn = BB_HEAD (bb);
13683 if (!NONDEBUG_INSN_P (c_insn))
13684 c_insn = next_nonnote_nondebug_insn (c_insn);
13685 if (c_insn && NONJUMP_INSN_P (c_insn))
13687 rtx pat = PATTERN (c_insn);
13688 if (GET_CODE (pat) == PARALLEL)
13690 rtx vec = XVECEXP (pat, 0, 0);
13691 if (GET_CODE (vec) == SET
13692 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13693 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13694 delete_insn (c_insn);
13699 /* Expand the prologue into a bunch of separate insns. */
13701 void
13702 ix86_expand_prologue (void)
13704 struct machine_function *m = cfun->machine;
13705 rtx insn, t;
13706 struct ix86_frame frame;
13707 HOST_WIDE_INT allocate;
13708 bool int_registers_saved;
13709 bool sse_registers_saved;
13710 rtx static_chain = NULL_RTX;
13712 ix86_finalize_stack_realign_flags ();
13714 /* DRAP should not coexist with stack_realign_fp */
13715 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13717 memset (&m->fs, 0, sizeof (m->fs));
13719 /* Initialize CFA state for before the prologue. */
13720 m->fs.cfa_reg = stack_pointer_rtx;
13721 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13723 /* Track the SP offset to the CFA. We continue tracking this after we've
13724 swapped the CFA register away from SP. In the case of re-alignment
13725 this is fudged; we're interested in offsets within the local frame. */
13726 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13727 m->fs.sp_valid = true;
13729 ix86_compute_frame_layout (&frame);
13731 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13733 /* We should have already generated an error for any use of
13734 ms_hook on a nested function. */
13735 gcc_checking_assert (!ix86_static_chain_on_stack);
13737 /* Check if profiling is active and we shall use the
13738 profiling-before-prologue variant. If so, sorry. */
13739 if (crtl->profile && flag_fentry != 0)
13740 sorry ("ms_hook_prologue attribute isn%'t compatible "
13741 "with -mfentry for 32-bit");
13743 /* In ix86_asm_output_function_label we emitted:
13744 8b ff movl.s %edi,%edi
13745 55 push %ebp
13746 8b ec movl.s %esp,%ebp
13748 This matches the hookable function prologue in Win32 API
13749 functions in Microsoft Windows XP Service Pack 2 and newer.
13750 Wine uses this to enable Windows apps to hook the Win32 API
13751 functions provided by Wine.
13753 What that means is that we've already set up the frame pointer. */
13755 if (frame_pointer_needed
13756 && !(crtl->drap_reg && crtl->stack_realign_needed))
13758 rtx push, mov;
13760 /* We've decided to use the frame pointer already set up.
13761 Describe this to the unwinder by pretending that both
13762 push and mov insns happen right here.
13764 Putting the unwind info here at the end of the ms_hook
13765 is done so that we can make absolutely certain we get
13766 the required byte sequence at the start of the function,
13767 rather than relying on an assembler that can produce
13768 the exact encoding required.
13770 However it does mean (in the unpatched case) that we have
13771 a 1 insn window where the asynchronous unwind info is
13772 incorrect. However, if we placed the unwind info at
13773 its correct location we would have incorrect unwind info
13774 in the patched case. Which is probably all moot since
13775 I don't expect Wine generates dwarf2 unwind info for the
13776 system libraries that use this feature. */
13778 insn = emit_insn (gen_blockage ());
13780 push = gen_push (hard_frame_pointer_rtx);
13781 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13782 stack_pointer_rtx);
13783 RTX_FRAME_RELATED_P (push) = 1;
13784 RTX_FRAME_RELATED_P (mov) = 1;
13786 RTX_FRAME_RELATED_P (insn) = 1;
13787 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13788 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13790 /* Note that gen_push incremented m->fs.cfa_offset, even
13791 though we didn't emit the push insn here. */
13792 m->fs.cfa_reg = hard_frame_pointer_rtx;
13793 m->fs.fp_offset = m->fs.cfa_offset;
13794 m->fs.fp_valid = true;
13796 else
13798 /* The frame pointer is not needed so pop %ebp again.
13799 This leaves us with a pristine state. */
13800 emit_insn (gen_pop (hard_frame_pointer_rtx));
13804 /* The first insn of a function that accepts its static chain on the
13805 stack is to push the register that would be filled in by a direct
13806 call. This insn will be skipped by the trampoline. */
13807 else if (ix86_static_chain_on_stack)
13809 static_chain = ix86_static_chain (cfun->decl, false);
13810 insn = emit_insn (gen_push (static_chain));
13811 emit_insn (gen_blockage ());
13813 /* We don't want to interpret this push insn as a register save,
13814 only as a stack adjustment. The real copy of the register as
13815 a save will be done later, if needed. */
13816 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13817 t = gen_rtx_SET (stack_pointer_rtx, t);
13818 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13819 RTX_FRAME_RELATED_P (insn) = 1;
13822 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13823 DRAP is needed and stack realignment is really needed after reload. */
13824 if (stack_realign_drap)
13826 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13828 /* Can't use DRAP in interrupt function. */
13829 if (cfun->machine->func_type != TYPE_NORMAL)
13830 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13831 "in interrupt service routine. This may be worked "
13832 "around by avoiding functions with aggregate return.");
13834 /* Only need to push parameter pointer reg if it is caller saved. */
13835 if (!call_used_regs[REGNO (crtl->drap_reg)])
13837 /* Push arg pointer reg */
13838 insn = emit_insn (gen_push (crtl->drap_reg));
13839 RTX_FRAME_RELATED_P (insn) = 1;
13842 /* Grab the argument pointer. */
13843 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13844 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13845 RTX_FRAME_RELATED_P (insn) = 1;
13846 m->fs.cfa_reg = crtl->drap_reg;
13847 m->fs.cfa_offset = 0;
13849 /* Align the stack. */
13850 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13851 stack_pointer_rtx,
13852 GEN_INT (-align_bytes)));
13853 RTX_FRAME_RELATED_P (insn) = 1;
13855 /* Replicate the return address on the stack so that the return
13856 address can be reached via the (argp - 1) slot. This is needed
13857 to implement the macro RETURN_ADDR_RTX and the intrinsic function
13858 expand_builtin_return_addr, etc. */
13859 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13860 t = gen_frame_mem (word_mode, t);
13861 insn = emit_insn (gen_push (t));
13862 RTX_FRAME_RELATED_P (insn) = 1;
13864 /* For the purposes of frame and register save area addressing,
13865 we've started over with a new frame. */
13866 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13867 m->fs.realigned = true;
13869 if (static_chain)
13871 /* Replicate the static chain on the stack so that the static chain
13872 can be reached via the (argp - 2) slot. This is needed for
13873 nested functions with stack realignment. */
13874 insn = emit_insn (gen_push (static_chain));
13875 RTX_FRAME_RELATED_P (insn) = 1;
13879 int_registers_saved = (frame.nregs == 0);
13880 sse_registers_saved = (frame.nsseregs == 0);
13882 if (frame_pointer_needed && !m->fs.fp_valid)
13884 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13885 slower on all targets. Also sdb doesn't like it. */
13886 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13887 RTX_FRAME_RELATED_P (insn) = 1;
13889 /* Push registers now, before setting the frame pointer
13890 on SEH target. */
13891 if (!int_registers_saved
13892 && TARGET_SEH
13893 && !frame.save_regs_using_mov)
13895 ix86_emit_save_regs ();
13896 int_registers_saved = true;
13897 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13900 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13902 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13903 RTX_FRAME_RELATED_P (insn) = 1;
13905 if (m->fs.cfa_reg == stack_pointer_rtx)
13906 m->fs.cfa_reg = hard_frame_pointer_rtx;
13907 m->fs.fp_offset = m->fs.sp_offset;
13908 m->fs.fp_valid = true;
13912 if (!int_registers_saved)
13914 /* If saving registers via PUSH, do so now. */
13915 if (!frame.save_regs_using_mov)
13917 ix86_emit_save_regs ();
13918 int_registers_saved = true;
13919 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13922 /* When using the red zone we may start saving registers before allocating
13923 the stack frame, saving one cycle of the prologue. However, avoid
13924 doing this if we have to probe the stack; at least on x86_64 the
13925 stack probe can turn into a call that clobbers a red zone location. */
13926 else if (ix86_using_red_zone ()
13927 && (! TARGET_STACK_PROBE
13928 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13930 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13931 int_registers_saved = true;
13935 if (stack_realign_fp)
13937 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13938 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13940 /* The computation of the size of the re-aligned stack frame means
13941 that we must allocate the size of the register save area before
13942 performing the actual alignment. Otherwise we cannot guarantee
13943 that there's enough storage above the realignment point. */
13944 if (m->fs.sp_offset != frame.sse_reg_save_offset)
13945 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13946 GEN_INT (m->fs.sp_offset
13947 - frame.sse_reg_save_offset),
13948 -1, false);
13950 /* Align the stack. */
13951 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13952 stack_pointer_rtx,
13953 GEN_INT (-align_bytes)));
13955 /* For the purposes of register save area addressing, the stack
13956 pointer is no longer valid. As for the value of sp_offset,
13957 see ix86_compute_frame_layout, which we need to match in order
13958 to pass verification of stack_pointer_offset at the end. */
13959 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13960 m->fs.sp_valid = false;
13963 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13965 if (flag_stack_usage_info)
13967 /* We start to count from ARG_POINTER. */
13968 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13970 /* If it was realigned, take into account the fake frame. */
13971 if (stack_realign_drap)
13973 if (ix86_static_chain_on_stack)
13974 stack_size += UNITS_PER_WORD;
13976 if (!call_used_regs[REGNO (crtl->drap_reg)])
13977 stack_size += UNITS_PER_WORD;
13979 /* This over-estimates by 1 minimal-stack-alignment-unit but
13980 mitigates that by counting in the new return address slot. */
13981 current_function_dynamic_stack_size
13982 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13985 current_function_static_stack_size = stack_size;
13988 /* On SEH target with very large frame size, allocate an area to save
13989 SSE registers (as the very large allocation won't be described). */
13990 if (TARGET_SEH
13991 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13992 && !sse_registers_saved)
13994 HOST_WIDE_INT sse_size =
13995 frame.sse_reg_save_offset - frame.reg_save_offset;
13997 gcc_assert (int_registers_saved);
13999 /* No need to do stack checking as the area will be immediately
14000 written. */
14001 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14002 GEN_INT (-sse_size), -1,
14003 m->fs.cfa_reg == stack_pointer_rtx);
14004 allocate -= sse_size;
14005 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14006 sse_registers_saved = true;
14009 /* The stack has already been decremented by the instruction calling us
14010 so probe if the size is non-negative to preserve the protection area. */
14011 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
14013 /* We expect the registers to be saved when probes are used. */
14014 gcc_assert (int_registers_saved);
14016 if (STACK_CHECK_MOVING_SP)
14018 if (!(crtl->is_leaf && !cfun->calls_alloca
14019 && allocate <= PROBE_INTERVAL))
14021 ix86_adjust_stack_and_probe (allocate);
14022 allocate = 0;
14025 else
14027 HOST_WIDE_INT size = allocate;
14029 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
14030 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
14032 if (TARGET_STACK_PROBE)
14034 if (crtl->is_leaf && !cfun->calls_alloca)
14036 if (size > PROBE_INTERVAL)
14037 ix86_emit_probe_stack_range (0, size);
14039 else
14040 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
14042 else
14044 if (crtl->is_leaf && !cfun->calls_alloca)
14046 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
14047 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
14048 size - STACK_CHECK_PROTECT);
14050 else
14051 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
14056 if (allocate == 0)
14058 else if (!ix86_target_stack_probe ()
14059 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
14061 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14062 GEN_INT (-allocate), -1,
14063 m->fs.cfa_reg == stack_pointer_rtx);
14065 else
14067 rtx eax = gen_rtx_REG (Pmode, AX_REG);
14068 rtx r10 = NULL;
14069 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
14070 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
14071 bool eax_live = ix86_eax_live_at_start_p ();
14072 bool r10_live = false;
14074 if (TARGET_64BIT)
14075 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
14077 if (eax_live)
14079 insn = emit_insn (gen_push (eax));
14080 allocate -= UNITS_PER_WORD;
14081 /* Note that SEH directives need to continue tracking the stack
14082 pointer even after the frame pointer has been set up. */
14083 if (sp_is_cfa_reg || TARGET_SEH)
14085 if (sp_is_cfa_reg)
14086 m->fs.cfa_offset += UNITS_PER_WORD;
14087 RTX_FRAME_RELATED_P (insn) = 1;
14088 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14089 gen_rtx_SET (stack_pointer_rtx,
14090 plus_constant (Pmode, stack_pointer_rtx,
14091 -UNITS_PER_WORD)));
14095 if (r10_live)
14097 r10 = gen_rtx_REG (Pmode, R10_REG);
14098 insn = emit_insn (gen_push (r10));
14099 allocate -= UNITS_PER_WORD;
14100 if (sp_is_cfa_reg || TARGET_SEH)
14102 if (sp_is_cfa_reg)
14103 m->fs.cfa_offset += UNITS_PER_WORD;
14104 RTX_FRAME_RELATED_P (insn) = 1;
14105 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14106 gen_rtx_SET (stack_pointer_rtx,
14107 plus_constant (Pmode, stack_pointer_rtx,
14108 -UNITS_PER_WORD)));
14112 emit_move_insn (eax, GEN_INT (allocate));
14113 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
14115 /* Use the fact that AX still contains ALLOCATE. */
14116 adjust_stack_insn = (Pmode == DImode
14117 ? gen_pro_epilogue_adjust_stack_di_sub
14118 : gen_pro_epilogue_adjust_stack_si_sub);
14120 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14121 stack_pointer_rtx, eax));
14123 if (sp_is_cfa_reg || TARGET_SEH)
14125 if (sp_is_cfa_reg)
14126 m->fs.cfa_offset += allocate;
14127 RTX_FRAME_RELATED_P (insn) = 1;
14128 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14129 gen_rtx_SET (stack_pointer_rtx,
14130 plus_constant (Pmode, stack_pointer_rtx,
14131 -allocate)));
14133 m->fs.sp_offset += allocate;
14135 /* Use stack_pointer_rtx for relative addressing so that code
14136 works for realigned stack, too. */
14137 if (r10_live && eax_live)
14139 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14140 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14141 gen_frame_mem (word_mode, t));
14142 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14143 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14144 gen_frame_mem (word_mode, t));
14146 else if (eax_live || r10_live)
14148 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14149 emit_move_insn (gen_rtx_REG (word_mode,
14150 (eax_live ? AX_REG : R10_REG)),
14151 gen_frame_mem (word_mode, t));
14154 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
14156 /* If we haven't already set up the frame pointer, do so now. */
14157 if (frame_pointer_needed && !m->fs.fp_valid)
14159 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14160 GEN_INT (frame.stack_pointer_offset
14161 - frame.hard_frame_pointer_offset));
14162 insn = emit_insn (insn);
14163 RTX_FRAME_RELATED_P (insn) = 1;
14164 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14166 if (m->fs.cfa_reg == stack_pointer_rtx)
14167 m->fs.cfa_reg = hard_frame_pointer_rtx;
14168 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14169 m->fs.fp_valid = true;
14172 if (!int_registers_saved)
14173 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14174 if (!sse_registers_saved)
14175 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14177 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
14178 in the prologue. */
14179 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14181 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14182 insn = emit_insn (gen_set_got (pic));
14183 RTX_FRAME_RELATED_P (insn) = 1;
14184 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14185 emit_insn (gen_prologue_use (pic));
14186 /* Delete an already emitted SET_GOT if it exists and is allocated to
14187 REAL_PIC_OFFSET_TABLE_REGNUM. */
14188 ix86_elim_entry_set_got (pic);
14191 if (crtl->drap_reg && !crtl->stack_realign_needed)
14193 /* vDRAP is set up, but after reload it turns out stack realignment
14194 isn't necessary; here we emit the prologue to set up DRAP
14195 without the stack realignment adjustment. */
14196 t = choose_baseaddr (0);
14197 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14200 /* Prevent instructions from being scheduled into the register save push
14201 sequence when access to the red-zone area is done through the frame pointer.
14202 The offset between the frame pointer and the stack pointer is calculated
14203 relative to the value of the stack pointer at the end of the function
14204 prologue, and moving instructions that access the red-zone area via the frame
14205 pointer inside the push sequence violates this assumption. */
14206 if (frame_pointer_needed && frame.red_zone_size)
14207 emit_insn (gen_memory_blockage ());
14209 /* SEH requires that the prologue end within 256 bytes of the start of
14210 the function. Prevent instruction schedules that would extend that.
14211 Further, prevent alloca modifications to the stack pointer from being
14212 combined with prologue modifications. */
14213 if (TARGET_SEH)
14214 emit_insn (gen_prologue_use (stack_pointer_rtx));
14217 /* Emit code to restore REG using a POP insn. */
14219 static void
14220 ix86_emit_restore_reg_using_pop (rtx reg)
14222 struct machine_function *m = cfun->machine;
14223 rtx_insn *insn = emit_insn (gen_pop (reg));
14225 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14226 m->fs.sp_offset -= UNITS_PER_WORD;
14228 if (m->fs.cfa_reg == crtl->drap_reg
14229 && REGNO (reg) == REGNO (crtl->drap_reg))
14231 /* Previously we'd represented the CFA as an expression
14232 like *(%ebp - 8). We've just popped that value from
14233 the stack, which means we need to reset the CFA to
14234 the drap register. This will remain until we restore
14235 the stack pointer. */
14236 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14237 RTX_FRAME_RELATED_P (insn) = 1;
14239 /* This means that the DRAP register is valid for addressing too. */
14240 m->fs.drap_valid = true;
14241 return;
14244 if (m->fs.cfa_reg == stack_pointer_rtx)
14246 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14247 x = gen_rtx_SET (stack_pointer_rtx, x);
14248 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14249 RTX_FRAME_RELATED_P (insn) = 1;
14251 m->fs.cfa_offset -= UNITS_PER_WORD;
14254 /* When the frame pointer is the CFA, and we pop it, we are
14255 swapping back to the stack pointer as the CFA. This happens
14256 for stack frames that don't allocate other data, so we assume
14257 the stack pointer is now pointing at the return address, i.e.
14258 the function entry state, which makes the offset be 1 word. */
14259 if (reg == hard_frame_pointer_rtx)
14261 m->fs.fp_valid = false;
14262 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14264 m->fs.cfa_reg = stack_pointer_rtx;
14265 m->fs.cfa_offset -= UNITS_PER_WORD;
14267 add_reg_note (insn, REG_CFA_DEF_CFA,
14268 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14269 GEN_INT (m->fs.cfa_offset)));
14270 RTX_FRAME_RELATED_P (insn) = 1;
14275 /* Emit code to restore saved registers using POP insns. */
14277 static void
14278 ix86_emit_restore_regs_using_pop (void)
14280 unsigned int regno;
14282 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14283 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false))
14284 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14287 /* Emit code and notes for the LEAVE instruction. */
14289 static void
14290 ix86_emit_leave (void)
14292 struct machine_function *m = cfun->machine;
14293 rtx_insn *insn = emit_insn (ix86_gen_leave ());
14295 ix86_add_queued_cfa_restore_notes (insn);
14297 gcc_assert (m->fs.fp_valid);
14298 m->fs.sp_valid = true;
14299 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14300 m->fs.fp_valid = false;
14302 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14304 m->fs.cfa_reg = stack_pointer_rtx;
14305 m->fs.cfa_offset = m->fs.sp_offset;
14307 add_reg_note (insn, REG_CFA_DEF_CFA,
14308 plus_constant (Pmode, stack_pointer_rtx,
14309 m->fs.sp_offset));
14310 RTX_FRAME_RELATED_P (insn) = 1;
14312 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14313 m->fs.fp_offset);
14316 /* Emit code to restore saved registers using MOV insns.
14317 First register is restored from CFA - CFA_OFFSET. */
14318 static void
14319 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14320 bool maybe_eh_return)
14322 struct machine_function *m = cfun->machine;
14323 unsigned int regno;
14325 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14326 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14328 rtx reg = gen_rtx_REG (word_mode, regno);
14329 rtx mem;
14330 rtx_insn *insn;
14332 mem = choose_baseaddr (cfa_offset);
14333 mem = gen_frame_mem (word_mode, mem);
14334 insn = emit_move_insn (reg, mem);
14336 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14338 /* Previously we'd represented the CFA as an expression
14339 like *(%ebp - 8). We've just popped that value from
14340 the stack, which means we need to reset the CFA to
14341 the drap register. This will remain until we restore
14342 the stack pointer. */
14343 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14344 RTX_FRAME_RELATED_P (insn) = 1;
14346 /* This means that the DRAP register is valid for addressing. */
14347 m->fs.drap_valid = true;
14349 else
14350 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14352 cfa_offset -= UNITS_PER_WORD;
14356 /* Emit code to restore saved SSE registers using MOV insns.
14357 First register is restored from CFA - CFA_OFFSET. */
14358 static void
14359 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14360 bool maybe_eh_return)
14362 unsigned int regno;
14364 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14365 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14367 rtx reg = gen_rtx_REG (V4SFmode, regno);
14368 rtx mem;
14369 unsigned int align;
14371 mem = choose_baseaddr (cfa_offset);
14372 mem = gen_rtx_MEM (V4SFmode, mem);
14374 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
14375 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), INCOMING_STACK_BOUNDARY);
14376 set_mem_align (mem, align);
14377 emit_insn (gen_rtx_SET (reg, mem));
14379 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14381 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14385 /* Restore function stack, frame, and registers. */
14387 void
14388 ix86_expand_epilogue (int style)
14390 struct machine_function *m = cfun->machine;
14391 struct machine_frame_state frame_state_save = m->fs;
14392 struct ix86_frame frame;
14393 bool restore_regs_via_mov;
14394 bool using_drap;
14396 ix86_finalize_stack_realign_flags ();
14397 ix86_compute_frame_layout (&frame);
14399 m->fs.sp_valid = (!frame_pointer_needed
14400 || (crtl->sp_is_unchanging
14401 && !stack_realign_fp));
14402 gcc_assert (!m->fs.sp_valid
14403 || m->fs.sp_offset == frame.stack_pointer_offset);
14405 /* The FP must be valid if the frame pointer is present. */
14406 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14407 gcc_assert (!m->fs.fp_valid
14408 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14410 /* We must have *some* valid pointer to the stack frame. */
14411 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14413 /* The DRAP is never valid at this point. */
14414 gcc_assert (!m->fs.drap_valid);
14416 /* See the comment about red zone and frame
14417 pointer usage in ix86_expand_prologue. */
14418 if (frame_pointer_needed && frame.red_zone_size)
14419 emit_insn (gen_memory_blockage ());
14421 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14422 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14424 /* Determine the CFA offset of the end of the red-zone. */
14425 m->fs.red_zone_offset = 0;
14426 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14428 /* The red-zone begins below the return address. */
14429 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
14431 /* When the register save area is in the aligned portion of
14432 the stack, determine the maximum runtime displacement that
14433 matches up with the aligned frame. */
14434 if (stack_realign_drap)
14435 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14436 + UNITS_PER_WORD);
14439 /* Special care must be taken for the normal return case of a function
14440 using eh_return: the eax and edx registers are marked as saved, but
14441 not restored along this path. Adjust the save location to match. */
14442 if (crtl->calls_eh_return && style != 2)
14443 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
14445 /* EH_RETURN requires the use of moves to function properly. */
14446 if (crtl->calls_eh_return)
14447 restore_regs_via_mov = true;
14448 /* SEH requires the use of pops to identify the epilogue. */
14449 else if (TARGET_SEH)
14450 restore_regs_via_mov = false;
14451 /* If we're only restoring one register and sp is not valid then
14452 use a move instruction to restore the register, since it's
14453 less work than reloading sp and popping the register. */
14454 else if (!m->fs.sp_valid && frame.nregs <= 1)
14455 restore_regs_via_mov = true;
14456 else if (TARGET_EPILOGUE_USING_MOVE
14457 && cfun->machine->use_fast_prologue_epilogue
14458 && (frame.nregs > 1
14459 || m->fs.sp_offset != frame.reg_save_offset))
14460 restore_regs_via_mov = true;
14461 else if (frame_pointer_needed
14462 && !frame.nregs
14463 && m->fs.sp_offset != frame.reg_save_offset)
14464 restore_regs_via_mov = true;
14465 else if (frame_pointer_needed
14466 && TARGET_USE_LEAVE
14467 && cfun->machine->use_fast_prologue_epilogue
14468 && frame.nregs == 1)
14469 restore_regs_via_mov = true;
14470 else
14471 restore_regs_via_mov = false;
14473 if (restore_regs_via_mov || frame.nsseregs)
14475 /* Ensure that the entire register save area is addressable via
14476 the stack pointer, if we will restore via sp. */
14477 if (TARGET_64BIT
14478 && m->fs.sp_offset > 0x7fffffff
14479 && !(m->fs.fp_valid || m->fs.drap_valid)
14480 && (frame.nsseregs + frame.nregs) != 0)
14482 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14483 GEN_INT (m->fs.sp_offset
14484 - frame.sse_reg_save_offset),
14485 style,
14486 m->fs.cfa_reg == stack_pointer_rtx);
14490 /* If there are any SSE registers to restore, then we have to do it
14491 via moves, since there's obviously no pop for SSE regs. */
14492 if (frame.nsseregs)
14493 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14494 style == 2);
14496 if (restore_regs_via_mov)
14498 rtx t;
14500 if (frame.nregs)
14501 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
14503 /* eh_return epilogues need %ecx added to the stack pointer. */
14504 if (style == 2)
14506 rtx sa = EH_RETURN_STACKADJ_RTX;
14507 rtx_insn *insn;
14509 /* %ecx can't be used for both DRAP register and eh_return. */
14510 if (crtl->drap_reg)
14511 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14513 /* regparm nested functions don't work with eh_return. */
14514 gcc_assert (!ix86_static_chain_on_stack);
14516 if (frame_pointer_needed)
14518 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14519 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14520 emit_insn (gen_rtx_SET (sa, t));
14522 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14523 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14525 /* Note that we use SA as a temporary CFA, as the return
14526 address is at the proper place relative to it. We
14527 pretend this happens at the FP restore insn because
14528 prior to this insn the FP would be stored at the wrong
14529 offset relative to SA, and after this insn we have no
14530 other reasonable register to use for the CFA. We don't
14531 bother resetting the CFA to the SP for the duration of
14532 the return insn. */
14533 add_reg_note (insn, REG_CFA_DEF_CFA,
14534 plus_constant (Pmode, sa, UNITS_PER_WORD));
14535 ix86_add_queued_cfa_restore_notes (insn);
14536 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14537 RTX_FRAME_RELATED_P (insn) = 1;
14539 m->fs.cfa_reg = sa;
14540 m->fs.cfa_offset = UNITS_PER_WORD;
14541 m->fs.fp_valid = false;
14543 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14544 const0_rtx, style, false);
14546 else
14548 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14549 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14550 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14551 ix86_add_queued_cfa_restore_notes (insn);
14553 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14554 if (m->fs.cfa_offset != UNITS_PER_WORD)
14556 m->fs.cfa_offset = UNITS_PER_WORD;
14557 add_reg_note (insn, REG_CFA_DEF_CFA,
14558 plus_constant (Pmode, stack_pointer_rtx,
14559 UNITS_PER_WORD));
14560 RTX_FRAME_RELATED_P (insn) = 1;
14563 m->fs.sp_offset = UNITS_PER_WORD;
14564 m->fs.sp_valid = true;
14567 else
14569 /* SEH requires that the function end with (1) a stack adjustment
14570 if necessary, (2) a sequence of pops, and (3) a return or
14571 jump instruction. Prevent insns from the function body from
14572 being scheduled into this sequence. */
14573 if (TARGET_SEH)
14575 /* Prevent a catch region from being adjacent to the standard
14576 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
14577 several other flags that would be interesting to test are
14578 set up yet. */
14579 if (flag_non_call_exceptions)
14580 emit_insn (gen_nops (const1_rtx));
14581 else
14582 emit_insn (gen_blockage ());
14585 /* The first step is to deallocate the stack frame so that we can
14586 pop the registers. Also do it on SEH targets for very large
14587 frames, as the emitted instructions aren't allowed by the ABI in
14588 epilogues. */
14589 if (!m->fs.sp_valid
14590 || (TARGET_SEH
14591 && (m->fs.sp_offset - frame.reg_save_offset
14592 >= SEH_MAX_FRAME_SIZE)))
14594 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14595 GEN_INT (m->fs.fp_offset
14596 - frame.reg_save_offset),
14597 style, false);
14599 else if (m->fs.sp_offset != frame.reg_save_offset)
14601 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14602 GEN_INT (m->fs.sp_offset
14603 - frame.reg_save_offset),
14604 style,
14605 m->fs.cfa_reg == stack_pointer_rtx);
14608 ix86_emit_restore_regs_using_pop ();
14611 /* If we used a frame pointer and haven't already got rid of it,
14612 then do so now. */
14613 if (m->fs.fp_valid)
14615 /* If the stack pointer is valid and pointing at the frame
14616 pointer store address, then we only need a pop. */
14617 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
14618 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14619 /* Leave results in shorter dependency chains on CPUs that are
14620 able to grok it fast. */
14621 else if (TARGET_USE_LEAVE
14622 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14623 || !cfun->machine->use_fast_prologue_epilogue)
14624 ix86_emit_leave ();
14625 else
14627 pro_epilogue_adjust_stack (stack_pointer_rtx,
14628 hard_frame_pointer_rtx,
14629 const0_rtx, style, !using_drap);
14630 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14634 if (using_drap)
14636 int param_ptr_offset = UNITS_PER_WORD;
14637 rtx_insn *insn;
14639 gcc_assert (stack_realign_drap);
14641 if (ix86_static_chain_on_stack)
14642 param_ptr_offset += UNITS_PER_WORD;
14643 if (!call_used_regs[REGNO (crtl->drap_reg)])
14644 param_ptr_offset += UNITS_PER_WORD;
14646 insn = emit_insn (gen_rtx_SET
14647 (stack_pointer_rtx,
14648 gen_rtx_PLUS (Pmode,
14649 crtl->drap_reg,
14650 GEN_INT (-param_ptr_offset))));
14651 m->fs.cfa_reg = stack_pointer_rtx;
14652 m->fs.cfa_offset = param_ptr_offset;
14653 m->fs.sp_offset = param_ptr_offset;
14654 m->fs.realigned = false;
14656 add_reg_note (insn, REG_CFA_DEF_CFA,
14657 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14658 GEN_INT (param_ptr_offset)));
14659 RTX_FRAME_RELATED_P (insn) = 1;
14661 if (!call_used_regs[REGNO (crtl->drap_reg)])
14662 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14665 /* At this point the stack pointer must be valid, and we must have
14666 restored all of the registers. We may not have deallocated the
14667 entire stack frame. We've delayed this until now because it may
14668 be possible to merge the local stack deallocation with the
14669 deallocation forced by ix86_static_chain_on_stack. */
14670 gcc_assert (m->fs.sp_valid);
14671 gcc_assert (!m->fs.fp_valid);
14672 gcc_assert (!m->fs.realigned);
14673 if (m->fs.sp_offset != UNITS_PER_WORD)
14675 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14676 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14677 style, true);
14679 else
14680 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14682 /* Sibcall epilogues don't want a return instruction. */
14683 if (style == 0)
14685 m->fs = frame_state_save;
14686 return;
14689 if (cfun->machine->func_type != TYPE_NORMAL)
14691 /* Return with the "IRET" instruction from an interrupt handler.
14692 Pop the 'ERROR_CODE' off the stack before the 'IRET'
14693 instruction in an exception handler. */
14694 if (cfun->machine->func_type == TYPE_EXCEPTION)
14696 rtx r = plus_constant (Pmode, stack_pointer_rtx,
14697 UNITS_PER_WORD);
14698 emit_insn (gen_rtx_SET (stack_pointer_rtx, r));
14700 emit_jump_insn (gen_interrupt_return ());
14702 else if (crtl->args.pops_args && crtl->args.size)
14704 rtx popc = GEN_INT (crtl->args.pops_args);
14706 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
14707 address, do an explicit add, and jump indirectly to the caller. */
14709 if (crtl->args.pops_args >= 65536)
14711 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14712 rtx_insn *insn;
14714 /* There is no "pascal" calling convention in any 64bit ABI. */
14715 gcc_assert (!TARGET_64BIT);
14717 insn = emit_insn (gen_pop (ecx));
14718 m->fs.cfa_offset -= UNITS_PER_WORD;
14719 m->fs.sp_offset -= UNITS_PER_WORD;
14721 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14722 x = gen_rtx_SET (stack_pointer_rtx, x);
14723 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14724 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14725 RTX_FRAME_RELATED_P (insn) = 1;
14727 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14728 popc, -1, true);
14729 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14731 else
14732 emit_jump_insn (gen_simple_return_pop_internal (popc));
14734 else
14735 emit_jump_insn (gen_simple_return_internal ());
14737 /* Restore the state back to the state from the prologue,
14738 so that it's correct for the next epilogue. */
14739 m->fs = frame_state_save;
14742 /* Reset from the function's potential modifications. */
14744 static void
14745 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, HOST_WIDE_INT)
14747 if (pic_offset_table_rtx
14748 && !ix86_use_pseudo_pic_reg ())
14749 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14751 if (TARGET_MACHO)
14753 rtx_insn *insn = get_last_insn ();
14754 rtx_insn *deleted_debug_label = NULL;
14756 /* Mach-O doesn't support labels at the end of objects, so if
14757 it looks like we might want one, take special action.
14758 First, collect any sequence of deleted debug labels. */
14759 while (insn
14760 && NOTE_P (insn)
14761 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14763 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14764 notes only, instead set their CODE_LABEL_NUMBER to -1,
14765 otherwise there would be code generation differences
14766 between -g and -g0. */
14767 if (NOTE_P (insn) && NOTE_KIND (insn)
14768 == NOTE_INSN_DELETED_DEBUG_LABEL)
14769 deleted_debug_label = insn;
14770 insn = PREV_INSN (insn);
14773 /* If we have:
14774 label:
14775 barrier
14776 then this needs to be detected, so skip past the barrier. */
14778 if (insn && BARRIER_P (insn))
14779 insn = PREV_INSN (insn);
14781 /* Up to now we've only seen notes or barriers. */
14782 if (insn)
14784 if (LABEL_P (insn)
14785 || (NOTE_P (insn)
14786 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14787 /* Trailing label. */
14788 fputs ("\tnop\n", file);
14789 else if (cfun && ! cfun->is_thunk)
14791 /* See if we have a completely empty function body, skipping
14792 the special case of the picbase thunk emitted as asm. */
14793 while (insn && ! INSN_P (insn))
14794 insn = PREV_INSN (insn);
14795 /* If we don't find any insns, we've got an empty function body;
14796 i.e. completely empty, without a return or branch. This is
14797 taken as the case where a function body has been removed
14798 because it contains an inline __builtin_unreachable(). GCC
14799 declares that reaching __builtin_unreachable() means UB so
14800 we're not obliged to do anything special; however, we want
14801 non-zero-sized function bodies. To meet this, and help the
14802 user out, let's trap the case. */
14803 if (insn == NULL)
14804 fputs ("\tud2\n", file);
14807 else if (deleted_debug_label)
14808 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14809 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14810 CODE_LABEL_NUMBER (insn) = -1;
14814 /* Return a scratch register to use in the split stack prologue. The
14815 split stack prologue is used for -fsplit-stack. It is the first
14816 instructions in the function, even before the regular prologue.
14817 The scratch register can be any caller-saved register which is not
14818 used for parameters or for the static chain. */
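/* Illustrative note (not part of the original sources): in 64-bit mode
   %r11 always qualifies, since it is caller-saved, is not used for
   argument passing, and is distinct from the static chain register
   (%r10).  The 32-bit cases below have to work around %eax/%edx/%ecx
   being taken by fastcall, thiscall or regparm arguments.  */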
14820 static unsigned int
14821 split_stack_prologue_scratch_regno (void)
14823 if (TARGET_64BIT)
14824 return R11_REG;
14825 else
14827 bool is_fastcall, is_thiscall;
14828 int regparm;
14830 is_fastcall = (lookup_attribute ("fastcall",
14831 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14832 != NULL);
14833 is_thiscall = (lookup_attribute ("thiscall",
14834 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14835 != NULL);
14836 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14838 if (is_fastcall)
14840 if (DECL_STATIC_CHAIN (cfun->decl))
14842 sorry ("-fsplit-stack does not support fastcall with "
14843 "nested function");
14844 return INVALID_REGNUM;
14846 return AX_REG;
14848 else if (is_thiscall)
14850 if (!DECL_STATIC_CHAIN (cfun->decl))
14851 return DX_REG;
14852 return AX_REG;
14854 else if (regparm < 3)
14856 if (!DECL_STATIC_CHAIN (cfun->decl))
14857 return CX_REG;
14858 else
14860 if (regparm >= 2)
14862 sorry ("-fsplit-stack does not support 2 register "
14863 "parameters for a nested function");
14864 return INVALID_REGNUM;
14866 return DX_REG;
14869 else
14871 /* FIXME: We could make this work by pushing a register
14872 around the addition and comparison. */
14873 sorry ("-fsplit-stack does not support 3 register parameters");
14874 return INVALID_REGNUM;
14879 /* A SYMBOL_REF for the function which allocates new stack space for
14880 -fsplit-stack. */
14882 static GTY(()) rtx split_stack_fn;
14884 /* A SYMBOL_REF for the more stack function when using the large
14885 model. */
14887 static GTY(()) rtx split_stack_fn_large;
14889 /* Handle -fsplit-stack. These are the first instructions in the
14890 function, even before the regular prologue. */
14892 void
14893 ix86_expand_split_stack_prologue (void)
14895 struct ix86_frame frame;
14896 HOST_WIDE_INT allocate;
14897 unsigned HOST_WIDE_INT args_size;
14898 rtx_code_label *label;
14899 rtx limit, current, allocate_rtx, call_insn, call_fusage;
14900 rtx scratch_reg = NULL_RTX;
14901 rtx_code_label *varargs_label = NULL;
14902 rtx fn;
14904 gcc_assert (flag_split_stack && reload_completed);
14906 ix86_finalize_stack_realign_flags ();
14907 ix86_compute_frame_layout (&frame);
14908 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14910 /* This is the label we will branch to if we have enough stack
14911 space. We expect the basic block reordering pass to reverse this
14912 branch if optimizing, so that we branch in the unlikely case. */
14913 label = gen_label_rtx ();
14915 /* We need to compare the stack pointer minus the frame size with
14916 the stack boundary in the TCB. The stack boundary always gives
14917 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14918 can compare directly. Otherwise we need to do an addition. */
14920 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
14921 UNSPEC_STACK_CHECK);
14922 limit = gen_rtx_CONST (Pmode, limit);
14923 limit = gen_rtx_MEM (Pmode, limit);
14924 if (allocate < SPLIT_STACK_AVAILABLE)
14925 current = stack_pointer_rtx;
14926 else
14928 unsigned int scratch_regno;
14929 rtx offset;
14931 /* We need a scratch register to hold the stack pointer minus
14932 the required frame size. Since this is the very start of the
14933 function, the scratch register can be any caller-saved
14934 register which is not used for parameters. */
14935 offset = GEN_INT (- allocate);
14936 scratch_regno = split_stack_prologue_scratch_regno ();
14937 if (scratch_regno == INVALID_REGNUM)
14938 return;
14939 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14940 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14942 /* We don't use ix86_gen_add3 in this case because it will
14943 want to split to lea, but when not optimizing the insn
14944 will not be split after this point. */
14945 emit_insn (gen_rtx_SET (scratch_reg,
14946 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14947 offset)));
14949 else
14951 emit_move_insn (scratch_reg, offset);
14952 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14953 stack_pointer_rtx));
14955 current = scratch_reg;
14958 ix86_expand_branch (GEU, current, limit, label);
14959 rtx_insn *jump_insn = get_last_insn ();
14960 JUMP_LABEL (jump_insn) = label;
14962 /* Mark the jump as very likely to be taken. */
14963 add_int_reg_note (jump_insn, REG_BR_PROB,
14964 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
14966 if (split_stack_fn == NULL_RTX)
14968 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14969 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14971 fn = split_stack_fn;
14973 /* Get more stack space. We pass in the desired stack space and the
14974 size of the arguments to copy to the new stack. In 32-bit mode
14975 we push the parameters; __morestack will return on a new stack
14976 anyhow. In 64-bit mode we pass the parameters in r10 and
14977 r11. */
14978 allocate_rtx = GEN_INT (allocate);
14979 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
14980 call_fusage = NULL_RTX;
14981 rtx pop = NULL_RTX;
14982 if (TARGET_64BIT)
14984 rtx reg10, reg11;
14986 reg10 = gen_rtx_REG (Pmode, R10_REG);
14987 reg11 = gen_rtx_REG (Pmode, R11_REG);
14989 /* If this function uses a static chain, it will be in %r10.
14990 Preserve it across the call to __morestack. */
14991 if (DECL_STATIC_CHAIN (cfun->decl))
14993 rtx rax;
14995 rax = gen_rtx_REG (word_mode, AX_REG);
14996 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14997 use_reg (&call_fusage, rax);
15000 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
15001 && !TARGET_PECOFF)
15003 HOST_WIDE_INT argval;
15005 gcc_assert (Pmode == DImode);
15006 /* When using the large model we need to load the address
15007 into a register, and we've run out of registers. So we
15008 switch to a different calling convention, and we call a
15009 different function: __morestack_large. We pass the
15010 argument size in the upper 32 bits of r10 and pass the
15011 frame size in the lower 32 bits. */
15012 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15013 gcc_assert ((args_size & 0xffffffff) == args_size);
15015 if (split_stack_fn_large == NULL_RTX)
15017 split_stack_fn_large =
15018 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15019 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15021 if (ix86_cmodel == CM_LARGE_PIC)
15023 rtx_code_label *label;
15024 rtx x;
15026 label = gen_label_rtx ();
15027 emit_label (label);
15028 LABEL_PRESERVE_P (label) = 1;
15029 emit_insn (gen_set_rip_rex64 (reg10, label));
15030 emit_insn (gen_set_got_offset_rex64 (reg11, label));
15031 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15032 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15033 UNSPEC_GOT);
15034 x = gen_rtx_CONST (Pmode, x);
15035 emit_move_insn (reg11, x);
15036 x = gen_rtx_PLUS (Pmode, reg10, reg11);
15037 x = gen_const_mem (Pmode, x);
15038 emit_move_insn (reg11, x);
15040 else
15041 emit_move_insn (reg11, split_stack_fn_large);
15043 fn = reg11;
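/* Worked example of the packing described above (illustrative only):
   with args_size == 0x10 and allocate == 0x200 the value computed
   below is 0x0000001000000200, i.e. the argument size ends up in the
   upper 32 bits and the frame size in the lower 32 bits.  The shift
   is presumably done in two steps of 16 to avoid an out-of-range
   shift on hosts with a narrow HOST_WIDE_INT.  */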
15045 argval = ((args_size << 16) << 16) + allocate;
15046 emit_move_insn (reg10, GEN_INT (argval));
15048 else
15050 emit_move_insn (reg10, allocate_rtx);
15051 emit_move_insn (reg11, GEN_INT (args_size));
15052 use_reg (&call_fusage, reg11);
15055 use_reg (&call_fusage, reg10);
15057 else
15059 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15060 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15061 insn = emit_insn (gen_push (allocate_rtx));
15062 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15063 pop = GEN_INT (2 * UNITS_PER_WORD);
15065 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15066 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15067 pop, false);
15068 add_function_usage_to (call_insn, call_fusage);
15069 if (!TARGET_64BIT)
15070 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15071 /* Indicate that this function can't jump to non-local gotos. */
15072 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
15074 /* In order to make call/return prediction work right, we now need
15075 to execute a return instruction. See
15076 libgcc/config/i386/morestack.S for the details on how this works.
15078 For flow purposes gcc must not see this as a return
15079 instruction--we need control flow to continue at the subsequent
15080 label. Therefore, we use an unspec. */
15081 gcc_assert (crtl->args.pops_args < 65536);
15082 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15084 /* If we are in 64-bit mode and this function uses a static chain,
15085 we saved %r10 in %rax before calling __morestack. */
15086 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15087 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15088 gen_rtx_REG (word_mode, AX_REG));
15090 /* If this function calls va_start, we need to store a pointer to
15091 the arguments on the old stack, because they may not have been
15092 all copied to the new stack. At this point the old stack can be
15093 found at the frame pointer value used by __morestack, because
15094 __morestack has set that up before calling back to us. Here we
15095 store that pointer in a scratch register, and in
15096 ix86_expand_prologue we store the scratch register in a stack
15097 slot. */
15098 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15100 unsigned int scratch_regno;
15101 rtx frame_reg;
15102 int words;
15104 scratch_regno = split_stack_prologue_scratch_regno ();
15105 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15106 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15108 /* 64-bit:
15109 fp -> old fp value
15110 return address within this function
15111 return address of caller of this function
15112 stack arguments
15113 So we add three words to get to the stack arguments.
15115 32-bit:
15116 fp -> old fp value
15117 return address within this function
15118 first argument to __morestack
15119 second argument to __morestack
15120 return address of caller of this function
15121 stack arguments
15122 So we add five words to get to the stack arguments.
15124 words = TARGET_64BIT ? 3 : 5;
15125 emit_insn (gen_rtx_SET (scratch_reg,
15126 gen_rtx_PLUS (Pmode, frame_reg,
15127 GEN_INT (words * UNITS_PER_WORD))));
15129 varargs_label = gen_label_rtx ();
15130 emit_jump_insn (gen_jump (varargs_label));
15131 JUMP_LABEL (get_last_insn ()) = varargs_label;
15133 emit_barrier ();
15136 emit_label (label);
15137 LABEL_NUSES (label) = 1;
15139 /* If this function calls va_start, we now have to set the scratch
15140 register for the case where we do not call __morestack. In this
15141 case we need to set it based on the stack pointer. */
15142 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15144 emit_insn (gen_rtx_SET (scratch_reg,
15145 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15146 GEN_INT (UNITS_PER_WORD))));
15148 emit_label (varargs_label);
15149 LABEL_NUSES (varargs_label) = 1;
15153 /* We may have to tell the dataflow pass that the split stack prologue
15154 is initializing a scratch register. */
15156 static void
15157 ix86_live_on_entry (bitmap regs)
15159 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15161 gcc_assert (flag_split_stack);
15162 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
15166 /* Extract the parts of an RTL expression that is a valid memory address
15167 for an instruction. Return 0 if the structure of the address is
15168 grossly off. Return -1 if the address contains ASHIFT, so it is not
15169 strictly valid, but is still used for computing the length of the lea instruction. */
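/* An illustrative example (not part of the original sources): the
   address
     (plus:SI (plus:SI (mult:SI (reg:SI %ebx) (const_int 4))
                       (reg:SI %ebp))
              (const_int 8))
   decomposes into base = %ebp, index = %ebx, scale = 4 and disp = 8,
   i.e. the operand of an 8(%ebp,%ebx,4) memory reference.  */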
15172 ix86_decompose_address (rtx addr, struct ix86_address *out)
15174 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
15175 rtx base_reg, index_reg;
15176 HOST_WIDE_INT scale = 1;
15177 rtx scale_rtx = NULL_RTX;
15178 rtx tmp;
15179 int retval = 1;
15180 addr_space_t seg = ADDR_SPACE_GENERIC;
15182 /* Allow zero-extended SImode addresses,
15183 they will be emitted with addr32 prefix. */
15184 if (TARGET_64BIT && GET_MODE (addr) == DImode)
15186 if (GET_CODE (addr) == ZERO_EXTEND
15187 && GET_MODE (XEXP (addr, 0)) == SImode)
15189 addr = XEXP (addr, 0);
15190 if (CONST_INT_P (addr))
15191 return 0;
15193 else if (GET_CODE (addr) == AND
15194 && const_32bit_mask (XEXP (addr, 1), DImode))
15196 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
15197 if (addr == NULL_RTX)
15198 return 0;
15200 if (CONST_INT_P (addr))
15201 return 0;
15205 /* Allow SImode subregs of DImode addresses,
15206 they will be emitted with addr32 prefix. */
15207 if (TARGET_64BIT && GET_MODE (addr) == SImode)
15209 if (SUBREG_P (addr)
15210 && GET_MODE (SUBREG_REG (addr)) == DImode)
15212 addr = SUBREG_REG (addr);
15213 if (CONST_INT_P (addr))
15214 return 0;
15218 if (REG_P (addr))
15219 base = addr;
15220 else if (SUBREG_P (addr))
15222 if (REG_P (SUBREG_REG (addr)))
15223 base = addr;
15224 else
15225 return 0;
15227 else if (GET_CODE (addr) == PLUS)
15229 rtx addends[4], op;
15230 int n = 0, i;
15232 op = addr;
15235 if (n >= 4)
15236 return 0;
15237 addends[n++] = XEXP (op, 1);
15238 op = XEXP (op, 0);
15240 while (GET_CODE (op) == PLUS);
15241 if (n >= 4)
15242 return 0;
15243 addends[n] = op;
15245 for (i = n; i >= 0; --i)
15247 op = addends[i];
15248 switch (GET_CODE (op))
15250 case MULT:
15251 if (index)
15252 return 0;
15253 index = XEXP (op, 0);
15254 scale_rtx = XEXP (op, 1);
15255 break;
15257 case ASHIFT:
15258 if (index)
15259 return 0;
15260 index = XEXP (op, 0);
15261 tmp = XEXP (op, 1);
15262 if (!CONST_INT_P (tmp))
15263 return 0;
15264 scale = INTVAL (tmp);
15265 if ((unsigned HOST_WIDE_INT) scale > 3)
15266 return 0;
15267 scale = 1 << scale;
15268 break;
15270 case ZERO_EXTEND:
15271 op = XEXP (op, 0);
15272 if (GET_CODE (op) != UNSPEC)
15273 return 0;
15274 /* FALLTHRU */
15276 case UNSPEC:
15277 if (XINT (op, 1) == UNSPEC_TP
15278 && TARGET_TLS_DIRECT_SEG_REFS
15279 && seg == ADDR_SPACE_GENERIC)
15280 seg = DEFAULT_TLS_SEG_REG;
15281 else
15282 return 0;
15283 break;
15285 case SUBREG:
15286 if (!REG_P (SUBREG_REG (op)))
15287 return 0;
15288 /* FALLTHRU */
15290 case REG:
15291 if (!base)
15292 base = op;
15293 else if (!index)
15294 index = op;
15295 else
15296 return 0;
15297 break;
15299 case CONST:
15300 case CONST_INT:
15301 case SYMBOL_REF:
15302 case LABEL_REF:
15303 if (disp)
15304 return 0;
15305 disp = op;
15306 break;
15308 default:
15309 return 0;
15313 else if (GET_CODE (addr) == MULT)
15315 index = XEXP (addr, 0); /* index*scale */
15316 scale_rtx = XEXP (addr, 1);
15318 else if (GET_CODE (addr) == ASHIFT)
15320 /* We're called for lea too, which implements ashift on occasion. */
15321 index = XEXP (addr, 0);
15322 tmp = XEXP (addr, 1);
15323 if (!CONST_INT_P (tmp))
15324 return 0;
15325 scale = INTVAL (tmp);
15326 if ((unsigned HOST_WIDE_INT) scale > 3)
15327 return 0;
15328 scale = 1 << scale;
15329 retval = -1;
15331 else
15332 disp = addr; /* displacement */
15334 if (index)
15336 if (REG_P (index))
15338 else if (SUBREG_P (index)
15339 && REG_P (SUBREG_REG (index)))
15341 else
15342 return 0;
15345 /* Extract the integral value of scale. */
15346 if (scale_rtx)
15348 if (!CONST_INT_P (scale_rtx))
15349 return 0;
15350 scale = INTVAL (scale_rtx);
15353 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15354 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15356 /* Avoid useless 0 displacement. */
15357 if (disp == const0_rtx && (base || index))
15358 disp = NULL_RTX;
15360 /* Allow arg pointer and stack pointer as index if there is no scaling. */
15361 if (base_reg && index_reg && scale == 1
15362 && (index_reg == arg_pointer_rtx
15363 || index_reg == frame_pointer_rtx
15364 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
15366 std::swap (base, index);
15367 std::swap (base_reg, index_reg);
15370 /* Special case: %ebp cannot be encoded as a base without a displacement.
15371 Similarly %r13. */
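/* Illustrative note: in the ModR/M encoding a mod=00 base of %ebp (or
   %r13) instead means disp32 (or RIP-relative) addressing, so a plain
   (%ebp) operand has to be emitted as 0(%ebp) with an explicit zero
   displacement byte.  */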
15372 if (!disp
15373 && base_reg
15374 && (base_reg == hard_frame_pointer_rtx
15375 || base_reg == frame_pointer_rtx
15376 || base_reg == arg_pointer_rtx
15377 || (REG_P (base_reg)
15378 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
15379 || REGNO (base_reg) == R13_REG))))
15380 disp = const0_rtx;
15382 /* Special case: on K6, [%esi] makes the instruction vector decoded.
15383 Avoid this by transforming to [%esi+0].
15384 Reload calls address legitimization without cfun defined, so we need
15385 to test cfun for being non-NULL. */
15386 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15387 && base_reg && !index_reg && !disp
15388 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
15389 disp = const0_rtx;
15391 /* Special case: encode reg+reg instead of reg*2. */
15392 if (!base && index && scale == 2)
15393 base = index, base_reg = index_reg, scale = 1;
15395 /* Special case: scaling cannot be encoded without base or displacement. */
15396 if (!base && !disp && index && scale != 1)
15397 disp = const0_rtx;
15399 out->base = base;
15400 out->index = index;
15401 out->disp = disp;
15402 out->scale = scale;
15403 out->seg = seg;
15405 return retval;
15408 /* Return cost of the memory address x.
15409 For i386, it is better to use a complex address than let gcc copy
15410 the address into a reg and make a new pseudo. But not if the address
15411 requires two regs - that would mean more pseudos with longer
15412 lifetimes. */
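/* A hypothetical example of the resulting costs: an address built from
   two pseudo registers, e.g. (plus (reg 90) (reg 91)), gets cost 3
   (1 plus one increment per pseudo below), while (plus (reg 90)
   (const_int 16)) only gets cost 2, steering the RTL optimizers
   towards addresses that tie up fewer registers.  */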
15413 static int
15414 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15416 struct ix86_address parts;
15417 int cost = 1;
15418 int ok = ix86_decompose_address (x, &parts);
15420 gcc_assert (ok);
15422 if (parts.base && SUBREG_P (parts.base))
15423 parts.base = SUBREG_REG (parts.base);
15424 if (parts.index && SUBREG_P (parts.index))
15425 parts.index = SUBREG_REG (parts.index);
15427 /* Attempt to minimize the number of registers in the address by increasing
15428 the address cost for each used register. We don't increase the address cost
15429 for "pic_offset_table_rtx". When a memop with "pic_offset_table_rtx"
15430 is not invariant itself it most likely means that base or index is not
15431 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15432 which is not profitable for x86. */
15433 if (parts.base
15434 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15435 && (current_pass->type == GIMPLE_PASS
15436 || !pic_offset_table_rtx
15437 || !REG_P (parts.base)
15438 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15439 cost++;
15441 if (parts.index
15442 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15443 && (current_pass->type == GIMPLE_PASS
15444 || !pic_offset_table_rtx
15445 || !REG_P (parts.index)
15446 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15447 cost++;
15449 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15450 since its predecode logic can't detect the length of instructions
15451 and it degenerates to vector decoding. Increase the cost of such
15452 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
15453 to split such addresses or even refuse such addresses at all.
15455 Following addressing modes are affected:
15456 [base+scale*index]
15457 [scale*index+disp]
15458 [base+index]
15460 The first and last case may be avoidable by explicitly coding the zero in
15461 memory address, but I don't have an AMD-K6 machine handy to check this
15462 theory. */
15464 if (TARGET_K6
15465 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15466 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15467 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15468 cost += 10;
15470 return cost;
15473 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15474 this is used to form addresses to local data when -fPIC is in
15475 use. */
15477 static bool
15478 darwin_local_data_pic (rtx disp)
15480 return (GET_CODE (disp) == UNSPEC
15481 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15484 /* True if operand X should be loaded from GOT. */
15486 bool
15487 ix86_force_load_from_GOT_p (rtx x)
15489 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15490 && !TARGET_PECOFF && !TARGET_MACHO
15491 && !flag_plt && !flag_pic
15492 && ix86_cmodel != CM_LARGE
15493 && GET_CODE (x) == SYMBOL_REF
15494 && SYMBOL_REF_FUNCTION_P (x)
15495 && !SYMBOL_REF_LOCAL_P (x));
15498 /* Determine if a given RTX is a valid constant. We already know this
15499 satisfies CONSTANT_P. */
15501 static bool
15502 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15504 /* Pointer bounds constants are not valid. */
15505 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15506 return false;
15508 switch (GET_CODE (x))
15510 case CONST:
15511 x = XEXP (x, 0);
15513 if (GET_CODE (x) == PLUS)
15515 if (!CONST_INT_P (XEXP (x, 1)))
15516 return false;
15517 x = XEXP (x, 0);
15520 if (TARGET_MACHO && darwin_local_data_pic (x))
15521 return true;
15523 /* Only some unspecs are valid as "constants". */
15524 if (GET_CODE (x) == UNSPEC)
15525 switch (XINT (x, 1))
15527 case UNSPEC_GOT:
15528 case UNSPEC_GOTOFF:
15529 case UNSPEC_PLTOFF:
15530 return TARGET_64BIT;
15531 case UNSPEC_TPOFF:
15532 case UNSPEC_NTPOFF:
15533 x = XVECEXP (x, 0, 0);
15534 return (GET_CODE (x) == SYMBOL_REF
15535 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15536 case UNSPEC_DTPOFF:
15537 x = XVECEXP (x, 0, 0);
15538 return (GET_CODE (x) == SYMBOL_REF
15539 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15540 default:
15541 return false;
15544 /* We must have drilled down to a symbol. */
15545 if (GET_CODE (x) == LABEL_REF)
15546 return true;
15547 if (GET_CODE (x) != SYMBOL_REF)
15548 return false;
15549 /* FALLTHRU */
15551 case SYMBOL_REF:
15552 /* TLS symbols are never valid. */
15553 if (SYMBOL_REF_TLS_MODEL (x))
15554 return false;
15556 /* DLLIMPORT symbols are never valid. */
15557 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15558 && SYMBOL_REF_DLLIMPORT_P (x))
15559 return false;
15561 #if TARGET_MACHO
15562 /* mdynamic-no-pic */
15563 if (MACHO_DYNAMIC_NO_PIC_P)
15564 return machopic_symbol_defined_p (x);
15565 #endif
15567 /* External function address should be loaded
15568 via the GOT slot to avoid PLT. */
15569 if (ix86_force_load_from_GOT_p (x))
15570 return false;
15572 break;
15574 CASE_CONST_SCALAR_INT:
15575 switch (mode)
15577 case TImode:
15578 if (TARGET_64BIT)
15579 return true;
15580 /* FALLTHRU */
15581 case OImode:
15582 case XImode:
15583 if (!standard_sse_constant_p (x, mode))
15584 return false;
15585 default:
15586 break;
15588 break;
15590 case CONST_VECTOR:
15591 if (!standard_sse_constant_p (x, mode))
15592 return false;
15594 default:
15595 break;
15598 /* Otherwise we handle everything else in the move patterns. */
15599 return true;
15602 /* Determine if it's legal to put X into the constant pool. This
15603 is not possible for the address of thread-local symbols, which
15604 is checked above. */
15606 static bool
15607 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15609 /* We can put any immediate constant in memory. */
15610 switch (GET_CODE (x))
15612 CASE_CONST_ANY:
15613 return false;
15615 default:
15616 break;
15619 return !ix86_legitimate_constant_p (mode, x);
15622 /* Nonzero if the symbol is marked as dllimport, or as stub-variable,
15623 otherwise zero. */
15625 static bool
15626 is_imported_p (rtx x)
15628 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15629 || GET_CODE (x) != SYMBOL_REF)
15630 return false;
15632 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15636 /* Nonzero if the constant value X is a legitimate general operand
15637 when generating PIC code. It is given that flag_pic is on and
15638 that X satisfies CONSTANT_P. */
15640 bool
15641 legitimate_pic_operand_p (rtx x)
15643 rtx inner;
15645 switch (GET_CODE (x))
15647 case CONST:
15648 inner = XEXP (x, 0);
15649 if (GET_CODE (inner) == PLUS
15650 && CONST_INT_P (XEXP (inner, 1)))
15651 inner = XEXP (inner, 0);
15653 /* Only some unspecs are valid as "constants". */
15654 if (GET_CODE (inner) == UNSPEC)
15655 switch (XINT (inner, 1))
15657 case UNSPEC_GOT:
15658 case UNSPEC_GOTOFF:
15659 case UNSPEC_PLTOFF:
15660 return TARGET_64BIT;
15661 case UNSPEC_TPOFF:
15662 x = XVECEXP (inner, 0, 0);
15663 return (GET_CODE (x) == SYMBOL_REF
15664 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15665 case UNSPEC_MACHOPIC_OFFSET:
15666 return legitimate_pic_address_disp_p (x);
15667 default:
15668 return false;
15670 /* FALLTHRU */
15672 case SYMBOL_REF:
15673 case LABEL_REF:
15674 return legitimate_pic_address_disp_p (x);
15676 default:
15677 return true;
15681 /* Determine if a given CONST RTX is a valid memory displacement
15682 in PIC mode. */
15684 bool
15685 legitimate_pic_address_disp_p (rtx disp)
15687 bool saw_plus;
15689 /* In 64bit mode we can allow direct addresses of symbols and labels
15690 when they are not dynamic symbols. */
15691 if (TARGET_64BIT)
15693 rtx op0 = disp, op1;
15695 switch (GET_CODE (disp))
15697 case LABEL_REF:
15698 return true;
15700 case CONST:
15701 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15702 break;
15703 op0 = XEXP (XEXP (disp, 0), 0);
15704 op1 = XEXP (XEXP (disp, 0), 1);
15705 if (!CONST_INT_P (op1)
15706 || INTVAL (op1) >= 16*1024*1024
15707 || INTVAL (op1) < -16*1024*1024)
15708 break;
15709 if (GET_CODE (op0) == LABEL_REF)
15710 return true;
15711 if (GET_CODE (op0) == CONST
15712 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15713 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15714 return true;
15715 if (GET_CODE (op0) == UNSPEC
15716 && XINT (op0, 1) == UNSPEC_PCREL)
15717 return true;
15718 if (GET_CODE (op0) != SYMBOL_REF)
15719 break;
15720 /* FALLTHRU */
15722 case SYMBOL_REF:
15723 /* TLS references should always be enclosed in UNSPEC.
15724 A dllimported symbol always needs to be resolved. */
15725 if (SYMBOL_REF_TLS_MODEL (op0)
15726 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15727 return false;
15729 if (TARGET_PECOFF)
15731 if (is_imported_p (op0))
15732 return true;
15734 if (SYMBOL_REF_FAR_ADDR_P (op0)
15735 || !SYMBOL_REF_LOCAL_P (op0))
15736 break;
15738 /* Function symbols need to be resolved only for
15739 the large model.
15740 For the small model we don't need to resolve anything
15741 here. */
15742 if ((ix86_cmodel != CM_LARGE_PIC
15743 && SYMBOL_REF_FUNCTION_P (op0))
15744 || ix86_cmodel == CM_SMALL_PIC)
15745 return true;
15746 /* Non-external symbols don't need to be resolved for
15747 the large and medium models. */
15748 if ((ix86_cmodel == CM_LARGE_PIC
15749 || ix86_cmodel == CM_MEDIUM_PIC)
15750 && !SYMBOL_REF_EXTERNAL_P (op0))
15751 return true;
15753 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15754 && (SYMBOL_REF_LOCAL_P (op0)
15755 || (HAVE_LD_PIE_COPYRELOC
15756 && flag_pie
15757 && !SYMBOL_REF_WEAK (op0)
15758 && !SYMBOL_REF_FUNCTION_P (op0)))
15759 && ix86_cmodel != CM_LARGE_PIC)
15760 return true;
15761 break;
15763 default:
15764 break;
15767 if (GET_CODE (disp) != CONST)
15768 return false;
15769 disp = XEXP (disp, 0);
15771 if (TARGET_64BIT)
15773 /* It is unsafe to allow PLUS expressions. This limits the allowed distance
15774 of GOT table references. We should not need these anyway. */
15775 if (GET_CODE (disp) != UNSPEC
15776 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15777 && XINT (disp, 1) != UNSPEC_GOTOFF
15778 && XINT (disp, 1) != UNSPEC_PCREL
15779 && XINT (disp, 1) != UNSPEC_PLTOFF))
15780 return false;
15782 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15783 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15784 return false;
15785 return true;
15788 saw_plus = false;
15789 if (GET_CODE (disp) == PLUS)
15791 if (!CONST_INT_P (XEXP (disp, 1)))
15792 return false;
15793 disp = XEXP (disp, 0);
15794 saw_plus = true;
15797 if (TARGET_MACHO && darwin_local_data_pic (disp))
15798 return true;
15800 if (GET_CODE (disp) != UNSPEC)
15801 return false;
15803 switch (XINT (disp, 1))
15805 case UNSPEC_GOT:
15806 if (saw_plus)
15807 return false;
15808 /* We need to check for both symbols and labels because VxWorks loads
15809 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15810 details. */
15811 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15812 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15813 case UNSPEC_GOTOFF:
15814 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15815 While the ABI also specifies a 32bit relocation, we don't produce it in
15816 the small PIC model at all. */
15817 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15818 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15819 && !TARGET_64BIT)
15820 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15821 return false;
15822 case UNSPEC_GOTTPOFF:
15823 case UNSPEC_GOTNTPOFF:
15824 case UNSPEC_INDNTPOFF:
15825 if (saw_plus)
15826 return false;
15827 disp = XVECEXP (disp, 0, 0);
15828 return (GET_CODE (disp) == SYMBOL_REF
15829 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15830 case UNSPEC_NTPOFF:
15831 disp = XVECEXP (disp, 0, 0);
15832 return (GET_CODE (disp) == SYMBOL_REF
15833 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15834 case UNSPEC_DTPOFF:
15835 disp = XVECEXP (disp, 0, 0);
15836 return (GET_CODE (disp) == SYMBOL_REF
15837 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15840 return false;
15843 /* Determine if op is a suitable RTX for an address register.
15844 Return the naked register if a register or a register subreg is
15845 found, otherwise return NULL_RTX. */
15847 static rtx
15848 ix86_validate_address_register (rtx op)
15850 machine_mode mode = GET_MODE (op);
15852 /* Only SImode or DImode registers can form the address. */
15853 if (mode != SImode && mode != DImode)
15854 return NULL_RTX;
15856 if (REG_P (op))
15857 return op;
15858 else if (SUBREG_P (op))
15860 rtx reg = SUBREG_REG (op);
15862 if (!REG_P (reg))
15863 return NULL_RTX;
15865 mode = GET_MODE (reg);
15867 /* Don't allow SUBREGs that span more than a word. It can
15868 lead to spill failures when the register is one word out
15869 of a two word structure. */
15870 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15871 return NULL_RTX;
15873 /* Allow only SUBREGs of non-eliminable hard registers. */
15874 if (register_no_elim_operand (reg, mode))
15875 return reg;
15878 /* Op is not a register. */
15879 return NULL_RTX;
15882 /* Recognizes RTL expressions that are valid memory addresses for an
15883 instruction. The MODE argument is the machine mode for the MEM
15884 expression that wants to use this address.
15886 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15887 convert common non-canonical forms to canonical form so that they will
15888 be recognized. */
15890 static bool
15891 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15893 struct ix86_address parts;
15894 rtx base, index, disp;
15895 HOST_WIDE_INT scale;
15896 addr_space_t seg;
15898 if (ix86_decompose_address (addr, &parts) <= 0)
15899 /* Decomposition failed. */
15900 return false;
15902 base = parts.base;
15903 index = parts.index;
15904 disp = parts.disp;
15905 scale = parts.scale;
15906 seg = parts.seg;
15908 /* Validate base register. */
15909 if (base)
15911 rtx reg = ix86_validate_address_register (base);
15913 if (reg == NULL_RTX)
15914 return false;
15916 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15917 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15918 /* Base is not valid. */
15919 return false;
15922 /* Validate index register. */
15923 if (index)
15925 rtx reg = ix86_validate_address_register (index);
15927 if (reg == NULL_RTX)
15928 return false;
15930 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15931 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15932 /* Index is not valid. */
15933 return false;
15936 /* Index and base should have the same mode. */
15937 if (base && index
15938 && GET_MODE (base) != GET_MODE (index))
15939 return false;
15941 /* Address override works only on the (%reg) part of %fs:(%reg). */
15942 if (seg != ADDR_SPACE_GENERIC
15943 && ((base && GET_MODE (base) != word_mode)
15944 || (index && GET_MODE (index) != word_mode)))
15945 return false;
15947 /* Validate scale factor. */
15948 if (scale != 1)
15950 if (!index)
15951 /* Scale without index. */
15952 return false;
15954 if (scale != 2 && scale != 4 && scale != 8)
15955 /* Scale is not a valid multiplier. */
15956 return false;
15959 /* Validate displacement. */
15960 if (disp)
15962 if (GET_CODE (disp) == CONST
15963 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15964 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15965 switch (XINT (XEXP (disp, 0), 1))
15967 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15968 when used. While the ABI also specifies 32bit relocations, we
15969 don't produce them at all and use IP relative addressing instead.
15970 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15971 should be loaded via the GOT. */
15972 case UNSPEC_GOT:
15973 if (!TARGET_64BIT
15974 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15975 goto is_legitimate_pic;
15976 /* FALLTHRU */
15977 case UNSPEC_GOTOFF:
15978 gcc_assert (flag_pic);
15979 if (!TARGET_64BIT)
15980 goto is_legitimate_pic;
15982 /* 64bit address unspec. */
15983 return false;
15985 case UNSPEC_GOTPCREL:
15986 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15987 goto is_legitimate_pic;
15988 /* FALLTHRU */
15989 case UNSPEC_PCREL:
15990 gcc_assert (flag_pic);
15991 goto is_legitimate_pic;
15993 case UNSPEC_GOTTPOFF:
15994 case UNSPEC_GOTNTPOFF:
15995 case UNSPEC_INDNTPOFF:
15996 case UNSPEC_NTPOFF:
15997 case UNSPEC_DTPOFF:
15998 break;
16000 case UNSPEC_STACK_CHECK:
16001 gcc_assert (flag_split_stack);
16002 break;
16004 default:
16005 /* Invalid address unspec. */
16006 return false;
16009 else if (SYMBOLIC_CONST (disp)
16010 && (flag_pic
16011 || (TARGET_MACHO
16012 #if TARGET_MACHO
16013 && MACHOPIC_INDIRECT
16014 && !machopic_operand_p (disp)
16015 #endif
16019 is_legitimate_pic:
16020 if (TARGET_64BIT && (index || base))
16022 /* foo@dtpoff(%rX) is ok. */
16023 if (GET_CODE (disp) != CONST
16024 || GET_CODE (XEXP (disp, 0)) != PLUS
16025 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16026 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16027 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16028 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16029 /* Non-constant pic memory reference. */
16030 return false;
16032 else if ((!TARGET_MACHO || flag_pic)
16033 && ! legitimate_pic_address_disp_p (disp))
16034 /* Displacement is an invalid pic construct. */
16035 return false;
16036 #if TARGET_MACHO
16037 else if (MACHO_DYNAMIC_NO_PIC_P
16038 && !ix86_legitimate_constant_p (Pmode, disp))
16039 /* displacement must be referenced via non_lazy_pointer */
16040 return false;
16041 #endif
16043 /* This code used to verify that a symbolic pic displacement
16044 includes the pic_offset_table_rtx register.
16046 While this is a good idea, unfortunately these constructs may
16047 be created by the "adds using lea" optimization for incorrect
16048 code like:
16050 int a;
16051 int foo(int i)
16053 return *(&a+i);
16056 This code is nonsensical, but results in addressing the
16057 GOT table with a pic_offset_table_rtx base. We can't
16058 just refuse it easily, since it gets matched by the
16059 "addsi3" pattern, which later gets split to lea in case
16060 the output register differs from the input. While this
16061 can be handled by a separate addsi pattern for this case
16062 that never results in lea, disabling this test seems to be
16063 the easier and correct fix for the crash. */
16065 else if (GET_CODE (disp) != LABEL_REF
16066 && !CONST_INT_P (disp)
16067 && (GET_CODE (disp) != CONST
16068 || !ix86_legitimate_constant_p (Pmode, disp))
16069 && (GET_CODE (disp) != SYMBOL_REF
16070 || !ix86_legitimate_constant_p (Pmode, disp)))
16071 /* Displacement is not constant. */
16072 return false;
16073 else if (TARGET_64BIT
16074 && !x86_64_immediate_operand (disp, VOIDmode))
16075 /* Displacement is out of range. */
16076 return false;
16077 /* In x32 mode, constant addresses are sign extended to 64bit, so
16078 we have to prevent addresses from 0x80000000 to 0xffffffff. */
16079 else if (TARGET_X32 && !(index || base)
16080 && CONST_INT_P (disp)
16081 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16082 return false;
16085 /* Everything looks valid. */
16086 return true;
16089 /* Determine if a given RTX is a valid constant address. */
16091 bool
16092 constant_address_p (rtx x)
16094 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16097 /* Return a unique alias set for the GOT. */
16099 static alias_set_type
16100 ix86_GOT_alias_set (void)
16102 static alias_set_type set = -1;
16103 if (set == -1)
16104 set = new_alias_set ();
16105 return set;
16108 /* Return a legitimate reference for ORIG (an address) using the
16109 register REG. If REG is 0, a new pseudo is generated.
16111 There are two types of references that must be handled:
16113 1. Global data references must load the address from the GOT, via
16114 the PIC reg. An insn is emitted to do this load, and the reg is
16115 returned.
16117 2. Static data references, constant pool addresses, and code labels
16118 compute the address as an offset from the GOT, whose base is in
16119 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16120 differentiate them from global data objects. The returned
16121 address is the PIC reg + an unspec constant.
16123 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16124 reg also appears in the address. */
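/* Illustrative sketch only (the exact RTL depends on the target and code
   model handled below): a global data reference is typically rewritten
   into a GOT load such as
     (mem (plus pic_offset_table_rtx (const (unspec [sym] UNSPEC_GOT))))
   while a local reference becomes a GOT-relative offset such as
     (plus pic_offset_table_rtx (const (unspec [sym] UNSPEC_GOTOFF))).  */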
16126 static rtx
16127 legitimize_pic_address (rtx orig, rtx reg)
16129 rtx addr = orig;
16130 rtx new_rtx = orig;
16132 #if TARGET_MACHO
16133 if (TARGET_MACHO && !TARGET_64BIT)
16135 if (reg == 0)
16136 reg = gen_reg_rtx (Pmode);
16137 /* Use the generic Mach-O PIC machinery. */
16138 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16140 #endif
16142 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16144 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16145 if (tmp)
16146 return tmp;
16149 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16150 new_rtx = addr;
16151 else if ((!TARGET_64BIT
16152 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16153 && !TARGET_PECOFF
16154 && gotoff_operand (addr, Pmode))
16156 /* This symbol may be referenced via a displacement
16157 from the PIC base address (@GOTOFF). */
16158 if (GET_CODE (addr) == CONST)
16159 addr = XEXP (addr, 0);
16161 if (GET_CODE (addr) == PLUS)
16163 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16164 UNSPEC_GOTOFF);
16165 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16167 else
16168 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
16170 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16172 if (TARGET_64BIT)
16173 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16175 if (reg != 0)
16177 gcc_assert (REG_P (reg));
16178 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
16179 new_rtx, reg, 1, OPTAB_DIRECT);
16181 else
16182 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16184 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
16185 /* We can't use @GOTOFF for text labels
16186 on VxWorks, see gotoff_operand. */
16187 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
16189 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16190 if (tmp)
16191 return tmp;
16193 /* For x64 PE-COFF there is no GOT table,
16194 so we use the address directly. */
16195 if (TARGET_64BIT && TARGET_PECOFF)
16197 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
16198 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16200 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
16202 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
16203 UNSPEC_GOTPCREL);
16204 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16205 new_rtx = gen_const_mem (Pmode, new_rtx);
16206 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16208 else
16210 /* This symbol must be referenced via a load
16211 from the Global Offset Table (@GOT). */
16212 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
16213 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16214 if (TARGET_64BIT)
16215 new_rtx = force_reg (Pmode, new_rtx);
16216 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16217 new_rtx = gen_const_mem (Pmode, new_rtx);
16218 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16221 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16223 else
16225 if (CONST_INT_P (addr)
16226 && !x86_64_immediate_operand (addr, VOIDmode))
16227 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16228 else if (GET_CODE (addr) == CONST)
16230 addr = XEXP (addr, 0);
16232 /* We must match stuff we generate before. Assume the only
16233 unspecs that can get here are ours. Not that we could do
16234 anything with them anyway.... */
16235 if (GET_CODE (addr) == UNSPEC
16236 || (GET_CODE (addr) == PLUS
16237 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16238 return orig;
16239 gcc_assert (GET_CODE (addr) == PLUS);
16242 if (GET_CODE (addr) == PLUS)
16244 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16246 /* Check first to see if this is a constant
16247 offset from a @GOTOFF symbol reference. */
16248 if (!TARGET_PECOFF
16249 && gotoff_operand (op0, Pmode)
16250 && CONST_INT_P (op1))
16252 if (!TARGET_64BIT)
16254 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16255 UNSPEC_GOTOFF);
16256 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16257 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16259 if (reg != 0)
16261 gcc_assert (REG_P (reg));
16262 new_rtx = expand_simple_binop (Pmode, PLUS,
16263 pic_offset_table_rtx,
16264 new_rtx, reg, 1,
16265 OPTAB_DIRECT);
16267 else
16268 new_rtx
16269 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16271 else
16273 if (INTVAL (op1) < -16*1024*1024
16274 || INTVAL (op1) >= 16*1024*1024)
16276 if (!x86_64_immediate_operand (op1, Pmode))
16277 op1 = force_reg (Pmode, op1);
16279 new_rtx
16280 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16284 else
16286 rtx base = legitimize_pic_address (op0, reg);
16287 machine_mode mode = GET_MODE (base);
16288 new_rtx
16289 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16291 if (CONST_INT_P (new_rtx))
16293 if (INTVAL (new_rtx) < -16*1024*1024
16294 || INTVAL (new_rtx) >= 16*1024*1024)
16296 if (!x86_64_immediate_operand (new_rtx, mode))
16297 new_rtx = force_reg (mode, new_rtx);
16299 new_rtx
16300 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16302 else
16303 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16305 else
16307 /* For %rip addressing, we have to use
16308 just disp32, with neither base nor index. */
16309 if (TARGET_64BIT
16310 && (GET_CODE (base) == SYMBOL_REF
16311 || GET_CODE (base) == LABEL_REF))
16312 base = force_reg (mode, base);
16313 if (GET_CODE (new_rtx) == PLUS
16314 && CONSTANT_P (XEXP (new_rtx, 1)))
16316 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16317 new_rtx = XEXP (new_rtx, 1);
16319 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16324 return new_rtx;
16327 /* Load the thread pointer. If TO_REG is true, force it into a register. */
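/* Sketch of the behaviour below: the thread pointer is represented as
   an UNSPEC_TP unspec generated in ptr_mode, and it is zero-extended to
   DImode when TP_MODE is DImode but ptr_mode is only SImode (x32).  */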
16329 static rtx
16330 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16332 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16334 if (GET_MODE (tp) != tp_mode)
16336 gcc_assert (GET_MODE (tp) == SImode);
16337 gcc_assert (tp_mode == DImode);
16339 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16342 if (to_reg)
16343 tp = copy_to_mode_reg (tp_mode, tp);
16345 return tp;
16348 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16350 static GTY(()) rtx ix86_tls_symbol;
16352 static rtx
16353 ix86_tls_get_addr (void)
16355 if (!ix86_tls_symbol)
16357 const char *sym
16358 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16359 ? "___tls_get_addr" : "__tls_get_addr");
16361 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16364 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16366 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16367 UNSPEC_PLTOFF);
16368 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16369 gen_rtx_CONST (Pmode, unspec));
16372 return ix86_tls_symbol;
16375 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16377 static GTY(()) rtx ix86_tls_module_base_symbol;
16380 ix86_tls_module_base (void)
16382 if (!ix86_tls_module_base_symbol)
16384 ix86_tls_module_base_symbol
16385 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16387 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16388 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16391 return ix86_tls_module_base_symbol;
16394 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16395 false if we expect this to be used for a memory address and true if
16396 we expect to load the address into a register. */
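/* Rough summary of the cases handled below: global-dynamic and
   local-dynamic accesses go through __tls_get_addr (or the GNU2 TLS
   descriptor patterns), while initial-exec and local-exec accesses
   reduce to the thread pointer plus an @gottpoff/@ntpoff style offset,
   e.g. roughly (plus <thread pointer> (const (unspec [x] UNSPEC_NTPOFF)))
   for a 64-bit local-exec access.  */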
16398 static rtx
16399 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16401 rtx dest, base, off;
16402 rtx pic = NULL_RTX, tp = NULL_RTX;
16403 machine_mode tp_mode = Pmode;
16404 int type;
16406 /* Fall back to the global dynamic model if the toolchain cannot
16407 support local dynamic. */
16408 if (TARGET_SUN_TLS && !TARGET_64BIT
16409 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16410 && model == TLS_MODEL_LOCAL_DYNAMIC)
16411 model = TLS_MODEL_GLOBAL_DYNAMIC;
16413 switch (model)
16415 case TLS_MODEL_GLOBAL_DYNAMIC:
16416 dest = gen_reg_rtx (Pmode);
16418 if (!TARGET_64BIT)
16420 if (flag_pic && !TARGET_PECOFF)
16421 pic = pic_offset_table_rtx;
16422 else
16424 pic = gen_reg_rtx (Pmode);
16425 emit_insn (gen_set_got (pic));
16429 if (TARGET_GNU2_TLS)
16431 if (TARGET_64BIT)
16432 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16433 else
16434 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16436 tp = get_thread_pointer (Pmode, true);
16437 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16439 if (GET_MODE (x) != Pmode)
16440 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16442 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16444 else
16446 rtx caddr = ix86_tls_get_addr ();
16448 if (TARGET_64BIT)
16450 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16451 rtx_insn *insns;
16453 start_sequence ();
16454 emit_call_insn
16455 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16456 insns = get_insns ();
16457 end_sequence ();
16459 if (GET_MODE (x) != Pmode)
16460 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16462 RTL_CONST_CALL_P (insns) = 1;
16463 emit_libcall_block (insns, dest, rax, x);
16465 else
16466 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16468 break;
16470 case TLS_MODEL_LOCAL_DYNAMIC:
16471 base = gen_reg_rtx (Pmode);
16473 if (!TARGET_64BIT)
16475 if (flag_pic)
16476 pic = pic_offset_table_rtx;
16477 else
16479 pic = gen_reg_rtx (Pmode);
16480 emit_insn (gen_set_got (pic));
16484 if (TARGET_GNU2_TLS)
16486 rtx tmp = ix86_tls_module_base ();
16488 if (TARGET_64BIT)
16489 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16490 else
16491 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16493 tp = get_thread_pointer (Pmode, true);
16494 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16495 gen_rtx_MINUS (Pmode, tmp, tp));
16497 else
16499 rtx caddr = ix86_tls_get_addr ();
16501 if (TARGET_64BIT)
16503 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16504 rtx_insn *insns;
16505 rtx eqv;
16507 start_sequence ();
16508 emit_call_insn
16509 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16510 insns = get_insns ();
16511 end_sequence ();
16513 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16514 share the LD_BASE result with other LD model accesses. */
16515 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16516 UNSPEC_TLS_LD_BASE);
16518 RTL_CONST_CALL_P (insns) = 1;
16519 emit_libcall_block (insns, base, rax, eqv);
16521 else
16522 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16525 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16526 off = gen_rtx_CONST (Pmode, off);
16528 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16530 if (TARGET_GNU2_TLS)
16532 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16534 if (GET_MODE (x) != Pmode)
16535 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16537 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16539 break;
16541 case TLS_MODEL_INITIAL_EXEC:
16542 if (TARGET_64BIT)
16544 if (TARGET_SUN_TLS && !TARGET_X32)
16546 /* The Sun linker took the AMD64 TLS spec literally
16547 and can only handle %rax as the destination of the
16548 initial executable code sequence. */
16550 dest = gen_reg_rtx (DImode);
16551 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16552 return dest;
16555 /* Generate DImode references to avoid %fs:(%reg32)
16556 problems and the linker IE->LE relaxation bug. */
16557 tp_mode = DImode;
16558 pic = NULL;
16559 type = UNSPEC_GOTNTPOFF;
16561 else if (flag_pic)
16563 pic = pic_offset_table_rtx;
16564 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16566 else if (!TARGET_ANY_GNU_TLS)
16568 pic = gen_reg_rtx (Pmode);
16569 emit_insn (gen_set_got (pic));
16570 type = UNSPEC_GOTTPOFF;
16572 else
16574 pic = NULL;
16575 type = UNSPEC_INDNTPOFF;
16578 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16579 off = gen_rtx_CONST (tp_mode, off);
16580 if (pic)
16581 off = gen_rtx_PLUS (tp_mode, pic, off);
16582 off = gen_const_mem (tp_mode, off);
16583 set_mem_alias_set (off, ix86_GOT_alias_set ());
16585 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16587 base = get_thread_pointer (tp_mode,
16588 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16589 off = force_reg (tp_mode, off);
16590 dest = gen_rtx_PLUS (tp_mode, base, off);
16591 if (tp_mode != Pmode)
16592 dest = convert_to_mode (Pmode, dest, 1);
16594 else
16596 base = get_thread_pointer (Pmode, true);
16597 dest = gen_reg_rtx (Pmode);
16598 emit_insn (ix86_gen_sub3 (dest, base, off));
16600 break;
16602 case TLS_MODEL_LOCAL_EXEC:
16603 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16604 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16605 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16606 off = gen_rtx_CONST (Pmode, off);
16608 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16610 base = get_thread_pointer (Pmode,
16611 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16612 return gen_rtx_PLUS (Pmode, base, off);
16614 else
16616 base = get_thread_pointer (Pmode, true);
16617 dest = gen_reg_rtx (Pmode);
16618 emit_insn (ix86_gen_sub3 (dest, base, off));
16620 break;
16622 default:
16623 gcc_unreachable ();
16626 return dest;
16629 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16630 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16631 unique refptr-DECL symbol corresponding to symbol DECL. */
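/* For example, when the user label prefix is empty, a dllimport
   reference to a DECL named "foo" goes through the "*__imp_foo" stub
   symbol, while a refptr reference goes through "*.refptr.foo"; see the
   prefix selection in get_dllimport_decl below.  */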
16633 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16635 static inline hashval_t hash (tree_map *m) { return m->hash; }
16636 static inline bool
16637 equal (tree_map *a, tree_map *b)
16639 return a->base.from == b->base.from;
16642 static int
16643 keep_cache_entry (tree_map *&m)
16645 return ggc_marked_p (m->base.from);
16649 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16651 static tree
16652 get_dllimport_decl (tree decl, bool beimport)
16654 struct tree_map *h, in;
16655 const char *name;
16656 const char *prefix;
16657 size_t namelen, prefixlen;
16658 char *imp_name;
16659 tree to;
16660 rtx rtl;
16662 if (!dllimport_map)
16663 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16665 in.hash = htab_hash_pointer (decl);
16666 in.base.from = decl;
16667 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16668 h = *loc;
16669 if (h)
16670 return h->to;
16672 *loc = h = ggc_alloc<tree_map> ();
16673 h->hash = in.hash;
16674 h->base.from = decl;
16675 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16676 VAR_DECL, NULL, ptr_type_node);
16677 DECL_ARTIFICIAL (to) = 1;
16678 DECL_IGNORED_P (to) = 1;
16679 DECL_EXTERNAL (to) = 1;
16680 TREE_READONLY (to) = 1;
16682 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16683 name = targetm.strip_name_encoding (name);
16684 if (beimport)
16685 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16686 ? "*__imp_" : "*__imp__";
16687 else
16688 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16689 namelen = strlen (name);
16690 prefixlen = strlen (prefix);
16691 imp_name = (char *) alloca (namelen + prefixlen + 1);
16692 memcpy (imp_name, prefix, prefixlen);
16693 memcpy (imp_name + prefixlen, name, namelen + 1);
16695 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16696 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16697 SET_SYMBOL_REF_DECL (rtl, to);
16698 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16699 if (!beimport)
16701 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16702 #ifdef SUB_TARGET_RECORD_STUB
16703 SUB_TARGET_RECORD_STUB (name);
16704 #endif
16707 rtl = gen_const_mem (Pmode, rtl);
16708 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16710 SET_DECL_RTL (to, rtl);
16711 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16713 return to;
16716 /* Expand SYMBOL into its corresponding far-address symbol.
16717 WANT_REG is true if we require the result be a register. */
16719 static rtx
16720 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16722 tree imp_decl;
16723 rtx x;
16725 gcc_assert (SYMBOL_REF_DECL (symbol));
16726 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16728 x = DECL_RTL (imp_decl);
16729 if (want_reg)
16730 x = force_reg (Pmode, x);
16731 return x;
16734 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16735 true if we require the result be a register. */
16737 static rtx
16738 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16740 tree imp_decl;
16741 rtx x;
16743 gcc_assert (SYMBOL_REF_DECL (symbol));
16744 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16746 x = DECL_RTL (imp_decl);
16747 if (want_reg)
16748 x = force_reg (Pmode, x);
16749 return x;
16752 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
16753 is true if we require the result be a register. */
16755 static rtx
16756 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16758 if (!TARGET_PECOFF)
16759 return NULL_RTX;
16761 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16763 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16764 return legitimize_dllimport_symbol (addr, inreg);
16765 if (GET_CODE (addr) == CONST
16766 && GET_CODE (XEXP (addr, 0)) == PLUS
16767 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16768 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16770 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16771 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16775 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16776 return NULL_RTX;
16777 if (GET_CODE (addr) == SYMBOL_REF
16778 && !is_imported_p (addr)
16779 && SYMBOL_REF_EXTERNAL_P (addr)
16780 && SYMBOL_REF_DECL (addr))
16781 return legitimize_pe_coff_extern_decl (addr, inreg);
16783 if (GET_CODE (addr) == CONST
16784 && GET_CODE (XEXP (addr, 0)) == PLUS
16785 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16786 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16787 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16788 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16790 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16791 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16793 return NULL_RTX;
16796 /* Try machine-dependent ways of modifying an illegitimate address
16797 to be legitimate. If we find one, return the new, valid address.
16798 This macro is used in only one place: `memory_address' in explow.c.
16800 OLDX is the address as it was before break_out_memory_refs was called.
16801 In some cases it is useful to look at this to decide what needs to be done.
16803 It is always safe for this macro to do nothing. It exists to recognize
16804 opportunities to optimize the output.
16806 For the 80386, we handle X+REG by loading X into a register R and
16807 using R+REG. R will go in a general reg and indexing will be used.
16808 However, if REG is a broken-out memory address or multiplication,
16809 nothing needs to be done because REG can certainly go in a general reg.
16811 When -fpic is used, special handling is needed for symbolic references.
16812 See comments by legitimize_pic_address in i386.c for details. */
16814 static rtx
16815 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16817 bool changed = false;
16818 unsigned log;
16820 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16821 if (log)
16822 return legitimize_tls_address (x, (enum tls_model) log, false);
16823 if (GET_CODE (x) == CONST
16824 && GET_CODE (XEXP (x, 0)) == PLUS
16825 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16826 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16828 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16829 (enum tls_model) log, false);
16830 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16833 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16835 rtx tmp = legitimize_pe_coff_symbol (x, true);
16836 if (tmp)
16837 return tmp;
16840 if (flag_pic && SYMBOLIC_CONST (x))
16841 return legitimize_pic_address (x, 0);
16843 #if TARGET_MACHO
16844 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16845 return machopic_indirect_data_reference (x, 0);
16846 #endif
16848 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
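/* For example, (ashift reg 3) becomes (mult reg 8), matching the
   scaled-index form of x86 addresses.  */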
16849 if (GET_CODE (x) == ASHIFT
16850 && CONST_INT_P (XEXP (x, 1))
16851 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16853 changed = true;
16854 log = INTVAL (XEXP (x, 1));
16855 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16856 GEN_INT (1 << log));
16859 if (GET_CODE (x) == PLUS)
16861 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16863 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16864 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16865 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16867 changed = true;
16868 log = INTVAL (XEXP (XEXP (x, 0), 1));
16869 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16870 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16871 GEN_INT (1 << log));
16874 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16875 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16876 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16878 changed = true;
16879 log = INTVAL (XEXP (XEXP (x, 1), 1));
16880 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16881 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16882 GEN_INT (1 << log));
16885 /* Put multiply first if it isn't already. */
16886 if (GET_CODE (XEXP (x, 1)) == MULT)
16888 std::swap (XEXP (x, 0), XEXP (x, 1));
16889 changed = true;
16892 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16893 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16894 created by virtual register instantiation, register elimination, and
16895 similar optimizations. */
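/* E.g. (plus (mult r1 4) (plus r2 20)) is rewritten as
   (plus (plus (mult r1 4) r2) 20), i.e. index*scale and base first,
   displacement last.  */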
16896 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16898 changed = true;
16899 x = gen_rtx_PLUS (Pmode,
16900 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16901 XEXP (XEXP (x, 1), 0)),
16902 XEXP (XEXP (x, 1), 1));
16905 /* Canonicalize
16906 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16907 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16908 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16909 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16910 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16911 && CONSTANT_P (XEXP (x, 1)))
16913 rtx constant;
16914 rtx other = NULL_RTX;
16916 if (CONST_INT_P (XEXP (x, 1)))
16918 constant = XEXP (x, 1);
16919 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16921 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16923 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16924 other = XEXP (x, 1);
16926 else
16927 constant = 0;
16929 if (constant)
16931 changed = true;
16932 x = gen_rtx_PLUS (Pmode,
16933 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16934 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16935 plus_constant (Pmode, other,
16936 INTVAL (constant)));
16940 if (changed && ix86_legitimate_address_p (mode, x, false))
16941 return x;
16943 if (GET_CODE (XEXP (x, 0)) == MULT)
16945 changed = true;
16946 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16949 if (GET_CODE (XEXP (x, 1)) == MULT)
16951 changed = true;
16952 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16955 if (changed
16956 && REG_P (XEXP (x, 1))
16957 && REG_P (XEXP (x, 0)))
16958 return x;
16960 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16962 changed = true;
16963 x = legitimize_pic_address (x, 0);
16966 if (changed && ix86_legitimate_address_p (mode, x, false))
16967 return x;
16969 if (REG_P (XEXP (x, 0)))
16971 rtx temp = gen_reg_rtx (Pmode);
16972 rtx val = force_operand (XEXP (x, 1), temp);
16973 if (val != temp)
16975 val = convert_to_mode (Pmode, val, 1);
16976 emit_move_insn (temp, val);
16979 XEXP (x, 1) = temp;
16980 return x;
16983 else if (REG_P (XEXP (x, 1)))
16985 rtx temp = gen_reg_rtx (Pmode);
16986 rtx val = force_operand (XEXP (x, 0), temp);
16987 if (val != temp)
16989 val = convert_to_mode (Pmode, val, 1);
16990 emit_move_insn (temp, val);
16993 XEXP (x, 0) = temp;
16994 return x;
16998 return x;
17001 /* Print an integer constant expression in assembler syntax. Addition
17002 and subtraction are the only arithmetic that may appear in these
17003 expressions. FILE is the stdio stream to write to, X is the rtx, and
17004 CODE is the operand print code from the output string. */
17006 static void
17007 output_pic_addr_const (FILE *file, rtx x, int code)
17009 char buf[256];
17011 switch (GET_CODE (x))
17013 case PC:
17014 gcc_assert (flag_pic);
17015 putc ('.', file);
17016 break;
17018 case SYMBOL_REF:
17019 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17020 output_addr_const (file, x);
17021 else
17023 const char *name = XSTR (x, 0);
17025 /* Mark the decl as referenced so that cgraph will
17026 output the function. */
17027 if (SYMBOL_REF_DECL (x))
17028 mark_decl_referenced (SYMBOL_REF_DECL (x));
17030 #if TARGET_MACHO
17031 if (MACHOPIC_INDIRECT
17032 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17033 name = machopic_indirection_name (x, /*stub_p=*/true);
17034 #endif
17035 assemble_name (file, name);
17037 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17038 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17039 fputs ("@PLT", file);
17040 break;
17042 case LABEL_REF:
17043 x = XEXP (x, 0);
17044 /* FALLTHRU */
17045 case CODE_LABEL:
17046 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17047 assemble_name (asm_out_file, buf);
17048 break;
17050 case CONST_INT:
17051 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17052 break;
17054 case CONST:
17055 /* This used to output parentheses around the expression,
17056 but that does not work on the 386 (either ATT or BSD assembler). */
17057 output_pic_addr_const (file, XEXP (x, 0), code);
17058 break;
17060 case CONST_DOUBLE:
17061 /* We can't handle floating point constants;
17062 TARGET_PRINT_OPERAND must handle them. */
17063 output_operand_lossage ("floating constant misused");
17064 break;
17066 case PLUS:
17067 /* Some assemblers need integer constants to appear first. */
17068 if (CONST_INT_P (XEXP (x, 0)))
17070 output_pic_addr_const (file, XEXP (x, 0), code);
17071 putc ('+', file);
17072 output_pic_addr_const (file, XEXP (x, 1), code);
17074 else
17076 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17077 output_pic_addr_const (file, XEXP (x, 1), code);
17078 putc ('+', file);
17079 output_pic_addr_const (file, XEXP (x, 0), code);
17081 break;
17083 case MINUS:
17084 if (!TARGET_MACHO)
17085 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17086 output_pic_addr_const (file, XEXP (x, 0), code);
17087 putc ('-', file);
17088 output_pic_addr_const (file, XEXP (x, 1), code);
17089 if (!TARGET_MACHO)
17090 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17091 break;
17093 case UNSPEC:
17094 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
17096 bool f = i386_asm_output_addr_const_extra (file, x);
17097 gcc_assert (f);
17098 break;
17101 gcc_assert (XVECLEN (x, 0) == 1);
17102 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17103 switch (XINT (x, 1))
17105 case UNSPEC_GOT:
17106 fputs ("@GOT", file);
17107 break;
17108 case UNSPEC_GOTOFF:
17109 fputs ("@GOTOFF", file);
17110 break;
17111 case UNSPEC_PLTOFF:
17112 fputs ("@PLTOFF", file);
17113 break;
17114 case UNSPEC_PCREL:
17115 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17116 "(%rip)" : "[rip]", file);
17117 break;
17118 case UNSPEC_GOTPCREL:
17119 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17120 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17121 break;
17122 case UNSPEC_GOTTPOFF:
17123 /* FIXME: This might be @TPOFF in Sun ld too. */
17124 fputs ("@gottpoff", file);
17125 break;
17126 case UNSPEC_TPOFF:
17127 fputs ("@tpoff", file);
17128 break;
17129 case UNSPEC_NTPOFF:
17130 if (TARGET_64BIT)
17131 fputs ("@tpoff", file);
17132 else
17133 fputs ("@ntpoff", file);
17134 break;
17135 case UNSPEC_DTPOFF:
17136 fputs ("@dtpoff", file);
17137 break;
17138 case UNSPEC_GOTNTPOFF:
17139 if (TARGET_64BIT)
17140 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17141 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17142 else
17143 fputs ("@gotntpoff", file);
17144 break;
17145 case UNSPEC_INDNTPOFF:
17146 fputs ("@indntpoff", file);
17147 break;
17148 #if TARGET_MACHO
17149 case UNSPEC_MACHOPIC_OFFSET:
17150 putc ('-', file);
17151 machopic_output_function_base_name (file);
17152 break;
17153 #endif
17154 default:
17155 output_operand_lossage ("invalid UNSPEC as operand");
17156 break;
17158 break;
17160 default:
17161 output_operand_lossage ("invalid expression as operand");
17165 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17166 We need to emit DTP-relative relocations. */
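/* Sketch of the expected output, assuming ASM_LONG expands to ".long":
   a 4-byte entry is emitted as ".long sym@dtpoff" and an 8-byte entry
   as ".long sym@dtpoff, 0".  */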
17168 static void ATTRIBUTE_UNUSED
17169 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
17171 fputs (ASM_LONG, file);
17172 output_addr_const (file, x);
17173 fputs ("@dtpoff", file);
17174 switch (size)
17176 case 4:
17177 break;
17178 case 8:
17179 fputs (", 0", file);
17180 break;
17181 default:
17182 gcc_unreachable ();
17186 /* Return true if X is a representation of the PIC register. This copes
17187 with calls from ix86_find_base_term, where the register might have
17188 been replaced by a cselib value. */
17190 static bool
17191 ix86_pic_register_p (rtx x)
17193 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
17194 return (pic_offset_table_rtx
17195 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
17196 else if (!REG_P (x))
17197 return false;
17198 else if (pic_offset_table_rtx)
17200 if (REGNO (x) == REGNO (pic_offset_table_rtx))
17201 return true;
17202 if (HARD_REGISTER_P (x)
17203 && !HARD_REGISTER_P (pic_offset_table_rtx)
17204 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
17205 return true;
17206 return false;
17208 else
17209 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
17212 /* Helper function for ix86_delegitimize_address.
17213 Attempt to delegitimize TLS local-exec accesses. */
17215 static rtx
17216 ix86_delegitimize_tls_address (rtx orig_x)
17218 rtx x = orig_x, unspec;
17219 struct ix86_address addr;
17221 if (!TARGET_TLS_DIRECT_SEG_REFS)
17222 return orig_x;
17223 if (MEM_P (x))
17224 x = XEXP (x, 0);
17225 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17226 return orig_x;
17227 if (ix86_decompose_address (x, &addr) == 0
17228 || addr.seg != DEFAULT_TLS_SEG_REG
17229 || addr.disp == NULL_RTX
17230 || GET_CODE (addr.disp) != CONST)
17231 return orig_x;
17232 unspec = XEXP (addr.disp, 0);
17233 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17234 unspec = XEXP (unspec, 0);
17235 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17236 return orig_x;
17237 x = XVECEXP (unspec, 0, 0);
17238 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17239 if (unspec != XEXP (addr.disp, 0))
17240 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17241 if (addr.index)
17243 rtx idx = addr.index;
17244 if (addr.scale != 1)
17245 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17246 x = gen_rtx_PLUS (Pmode, idx, x);
17248 if (addr.base)
17249 x = gen_rtx_PLUS (Pmode, addr.base, x);
17250 if (MEM_P (orig_x))
17251 x = replace_equiv_address_nv (orig_x, x);
17252 return x;
17255 /* In the name of slightly smaller debug output, and to cater to
17256 general assembler lossage, recognize PIC+GOTOFF and turn it back
17257 into a direct symbol reference.
17259 On Darwin, this is necessary to avoid a crash, because Darwin
17260 has a different PIC label for each routine but the DWARF debugging
17261 information is not associated with any particular routine, so it's
17262 necessary to remove references to the PIC label from RTL stored by
17263 the DWARF output code.
17265 This helper is used in the normal ix86_delegitimize_address
17266 entrypoint (e.g. used in the target delegitimization hook) and
17267 in ix86_find_base_term. As a compile-time memory optimization, we
17268 avoid allocating rtxes that will not change anything on the outcome
17269 of the callers (find_base_value and find_base_term). */
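/* Illustrative example: a 32-bit PIC address of the form
     (plus pic_offset_table_rtx (const (unspec [foo] UNSPEC_GOTOFF)))
   is turned back into the bare SYMBOL_REF "foo" (plus any constant
   addend), which keeps the debug output readable.  */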
17271 static inline rtx
17272 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
17274 rtx orig_x = delegitimize_mem_from_attrs (x);
17275 /* addend is NULL or some rtx if x is something+GOTOFF where
17276 something doesn't include the PIC register. */
17277 rtx addend = NULL_RTX;
17278 /* reg_addend is NULL or a multiple of some register. */
17279 rtx reg_addend = NULL_RTX;
17280 /* const_addend is NULL or a const_int. */
17281 rtx const_addend = NULL_RTX;
17282 /* This is the result, or NULL. */
17283 rtx result = NULL_RTX;
17285 x = orig_x;
17287 if (MEM_P (x))
17288 x = XEXP (x, 0);
17290 if (TARGET_64BIT)
17292 if (GET_CODE (x) == CONST
17293 && GET_CODE (XEXP (x, 0)) == PLUS
17294 && GET_MODE (XEXP (x, 0)) == Pmode
17295 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17296 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17297 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17299 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
17300 base. A CONST can't be arg_pointer_rtx based. */
17301 if (base_term_p && MEM_P (orig_x))
17302 return orig_x;
17303 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17304 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17305 if (MEM_P (orig_x))
17306 x = replace_equiv_address_nv (orig_x, x);
17307 return x;
17310 if (GET_CODE (x) == CONST
17311 && GET_CODE (XEXP (x, 0)) == UNSPEC
17312 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17313 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17314 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17316 x = XVECEXP (XEXP (x, 0), 0, 0);
17317 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17319 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17320 if (x == NULL_RTX)
17321 return orig_x;
17323 return x;
17326 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17327 return ix86_delegitimize_tls_address (orig_x);
17329 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17330 and -mcmodel=medium -fpic. */
17333 if (GET_CODE (x) != PLUS
17334 || GET_CODE (XEXP (x, 1)) != CONST)
17335 return ix86_delegitimize_tls_address (orig_x);
17337 if (ix86_pic_register_p (XEXP (x, 0)))
17338 /* %ebx + GOT/GOTOFF */
17340 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17342 /* %ebx + %reg * scale + GOT/GOTOFF */
17343 reg_addend = XEXP (x, 0);
17344 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17345 reg_addend = XEXP (reg_addend, 1);
17346 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17347 reg_addend = XEXP (reg_addend, 0);
17348 else
17350 reg_addend = NULL_RTX;
17351 addend = XEXP (x, 0);
17354 else
17355 addend = XEXP (x, 0);
17357 x = XEXP (XEXP (x, 1), 0);
17358 if (GET_CODE (x) == PLUS
17359 && CONST_INT_P (XEXP (x, 1)))
17361 const_addend = XEXP (x, 1);
17362 x = XEXP (x, 0);
17365 if (GET_CODE (x) == UNSPEC
17366 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17367 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17368 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17369 && !MEM_P (orig_x) && !addend)))
17370 result = XVECEXP (x, 0, 0);
17372 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17373 && !MEM_P (orig_x))
17374 result = XVECEXP (x, 0, 0);
17376 if (! result)
17377 return ix86_delegitimize_tls_address (orig_x);
17379 /* For (PLUS something CONST_INT) both find_base_{value,term} just
17380 recurse on the first operand. */
17381 if (const_addend && !base_term_p)
17382 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17383 if (reg_addend)
17384 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17385 if (addend)
17387 /* If the rest of original X doesn't involve the PIC register, add
17388 addend and subtract pic_offset_table_rtx. This can happen e.g.
17389 for code like:
17390 leal (%ebx, %ecx, 4), %ecx
17392 movl foo@GOTOFF(%ecx), %edx
17393 in which case we return (%ecx - %ebx) + foo
17394 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17395 and reload has completed. */
17396 if (pic_offset_table_rtx
17397 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17398 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17399 pic_offset_table_rtx),
17400 result);
17401 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
17403 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17404 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17405 result = gen_rtx_PLUS (Pmode, tmp, result);
17407 else
17408 return orig_x;
17410 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17412 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17413 if (result == NULL_RTX)
17414 return orig_x;
17416 return result;
17419 /* The normal instantiation of the above template. */
17421 static rtx
17422 ix86_delegitimize_address (rtx x)
17424 return ix86_delegitimize_address_1 (x, false);
17427 /* If X is a machine specific address (i.e. a symbol or label being
17428 referenced as a displacement from the GOT implemented using an
17429 UNSPEC), then return the base term. Otherwise return X. */
17432 ix86_find_base_term (rtx x)
17434 rtx term;
17436 if (TARGET_64BIT)
17438 if (GET_CODE (x) != CONST)
17439 return x;
17440 term = XEXP (x, 0);
17441 if (GET_CODE (term) == PLUS
17442 && CONST_INT_P (XEXP (term, 1)))
17443 term = XEXP (term, 0);
17444 if (GET_CODE (term) != UNSPEC
17445 || (XINT (term, 1) != UNSPEC_GOTPCREL
17446 && XINT (term, 1) != UNSPEC_PCREL))
17447 return x;
17449 return XVECEXP (term, 0, 0);
17452 return ix86_delegitimize_address_1 (x, true);
17455 static void
17456 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17457 bool fp, FILE *file)
17459 const char *suffix;
17461 if (mode == CCFPmode || mode == CCFPUmode)
17463 code = ix86_fp_compare_code_to_integer (code);
17464 mode = CCmode;
17466 if (reverse)
17467 code = reverse_condition (code);
17469 switch (code)
17471 case EQ:
17472 switch (mode)
17474 case CCAmode:
17475 suffix = "a";
17476 break;
17477 case CCCmode:
17478 suffix = "c";
17479 break;
17480 case CCOmode:
17481 suffix = "o";
17482 break;
17483 case CCPmode:
17484 suffix = "p";
17485 break;
17486 case CCSmode:
17487 suffix = "s";
17488 break;
17489 default:
17490 suffix = "e";
17491 break;
17493 break;
17494 case NE:
17495 switch (mode)
17497 case CCAmode:
17498 suffix = "na";
17499 break;
17500 case CCCmode:
17501 suffix = "nc";
17502 break;
17503 case CCOmode:
17504 suffix = "no";
17505 break;
17506 case CCPmode:
17507 suffix = "np";
17508 break;
17509 case CCSmode:
17510 suffix = "ns";
17511 break;
17512 default:
17513 suffix = "ne";
17514 break;
17516 break;
17517 case GT:
17518 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17519 suffix = "g";
17520 break;
17521 case GTU:
17522 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17523 Those same assemblers have the same but opposite lossage on cmov. */
17524 if (mode == CCmode)
17525 suffix = fp ? "nbe" : "a";
17526 else
17527 gcc_unreachable ();
17528 break;
17529 case LT:
17530 switch (mode)
17532 case CCNOmode:
17533 case CCGOCmode:
17534 suffix = "s";
17535 break;
17537 case CCmode:
17538 case CCGCmode:
17539 suffix = "l";
17540 break;
17542 default:
17543 gcc_unreachable ();
17545 break;
17546 case LTU:
17547 if (mode == CCmode)
17548 suffix = "b";
17549 else if (mode == CCCmode)
17550 suffix = fp ? "b" : "c";
17551 else
17552 gcc_unreachable ();
17553 break;
17554 case GE:
17555 switch (mode)
17557 case CCNOmode:
17558 case CCGOCmode:
17559 suffix = "ns";
17560 break;
17562 case CCmode:
17563 case CCGCmode:
17564 suffix = "ge";
17565 break;
17567 default:
17568 gcc_unreachable ();
17570 break;
17571 case GEU:
17572 if (mode == CCmode)
17573 suffix = "nb";
17574 else if (mode == CCCmode)
17575 suffix = fp ? "nb" : "nc";
17576 else
17577 gcc_unreachable ();
17578 break;
17579 case LE:
17580 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17581 suffix = "le";
17582 break;
17583 case LEU:
17584 if (mode == CCmode)
17585 suffix = "be";
17586 else
17587 gcc_unreachable ();
17588 break;
17589 case UNORDERED:
17590 suffix = fp ? "u" : "p";
17591 break;
17592 case ORDERED:
17593 suffix = fp ? "nu" : "np";
17594 break;
17595 default:
17596 gcc_unreachable ();
17598 fputs (suffix, file);
17601 /* Print the name of register X to FILE based on its machine mode and number.
17602 If CODE is 'w', pretend the mode is HImode.
17603 If CODE is 'b', pretend the mode is QImode.
17604 If CODE is 'k', pretend the mode is SImode.
17605 If CODE is 'q', pretend the mode is DImode.
17606 If CODE is 'x', pretend the mode is V4SFmode.
17607 If CODE is 't', pretend the mode is V8SFmode.
17608 If CODE is 'g', pretend the mode is V16SFmode.
17609 If CODE is 'h', pretend the reg is the 'high' byte register.
17610 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
17611 If CODE is 'd', duplicate the operand for AVX instruction.
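For example, given the SImode register operand for %eax, code 'b'
prints "al", code 'w' prints "ax", code 'k' prints "eax" and code
'q' prints "rax"; the AT&T '%' is written separately below.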
17614 void
17615 print_reg (rtx x, int code, FILE *file)
17617 const char *reg;
17618 int msize;
17619 unsigned int regno;
17620 bool duplicated;
17622 if (ASSEMBLER_DIALECT == ASM_ATT)
17623 putc ('%', file);
17625 if (x == pc_rtx)
17627 gcc_assert (TARGET_64BIT);
17628 fputs ("rip", file);
17629 return;
17632 if (code == 'y' && STACK_TOP_P (x))
17634 fputs ("st(0)", file);
17635 return;
17638 if (code == 'w')
17639 msize = 2;
17640 else if (code == 'b')
17641 msize = 1;
17642 else if (code == 'k')
17643 msize = 4;
17644 else if (code == 'q')
17645 msize = 8;
17646 else if (code == 'h')
17647 msize = 0;
17648 else if (code == 'x')
17649 msize = 16;
17650 else if (code == 't')
17651 msize = 32;
17652 else if (code == 'g')
17653 msize = 64;
17654 else
17655 msize = GET_MODE_SIZE (GET_MODE (x));
17657 regno = REGNO (x);
17659 gcc_assert (regno != ARG_POINTER_REGNUM
17660 && regno != FRAME_POINTER_REGNUM
17661 && regno != FPSR_REG
17662 && regno != FPCR_REG);
17664 if (regno == FLAGS_REG)
17666 output_operand_lossage ("invalid use of asm flag output");
17667 return;
17670 duplicated = code == 'd' && TARGET_AVX;
17672 switch (msize)
17674 case 16:
17675 case 12:
17676 case 8:
17677 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17678 warning (0, "unsupported size for integer register");
17679 /* FALLTHRU */
17680 case 4:
17681 if (LEGACY_INT_REGNO_P (regno))
17682 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17683 /* FALLTHRU */
17684 case 2:
17685 normal:
17686 reg = hi_reg_name[regno];
17687 break;
17688 case 1:
17689 if (regno >= ARRAY_SIZE (qi_reg_name))
17690 goto normal;
17691 if (!ANY_QI_REGNO_P (regno))
17692 error ("unsupported size for integer register");
17693 reg = qi_reg_name[regno];
17694 break;
17695 case 0:
17696 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17697 goto normal;
17698 reg = qi_high_reg_name[regno];
17699 break;
17700 case 32:
17701 case 64:
17702 if (SSE_REGNO_P (regno))
17704 gcc_assert (!duplicated);
17705 putc (msize == 32 ? 'y' : 'z', file);
17706 reg = hi_reg_name[regno] + 1;
17707 break;
17709 goto normal;
17710 default:
17711 gcc_unreachable ();
17714 fputs (reg, file);
17716 /* Irritatingly, AMD extended registers use
17717 a different naming convention: "r%d[bwd]". */
17718 if (REX_INT_REGNO_P (regno))
17720 gcc_assert (TARGET_64BIT);
17721 switch (msize)
17723 case 0:
17724 error ("extended registers have no high halves");
17725 break;
17726 case 1:
17727 putc ('b', file);
17728 break;
17729 case 2:
17730 putc ('w', file);
17731 break;
17732 case 4:
17733 putc ('d', file);
17734 break;
17735 case 8:
17736 /* no suffix */
17737 break;
17738 default:
17739 error ("unsupported operand size for extended register");
17740 break;
17742 return;
17745 if (duplicated)
17747 if (ASSEMBLER_DIALECT == ASM_ATT)
17748 fprintf (file, ", %%%s", reg);
17749 else
17750 fprintf (file, ", %s", reg);
17754 /* Meaning of CODE:
17755 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17756 C -- print opcode suffix for set/cmov insn.
17757 c -- like C, but print reversed condition
17758 F,f -- likewise, but for floating-point.
17759 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17760 otherwise nothing
17761 R -- print embedded rounding and sae.
17762 r -- print only sae.
17763 z -- print the opcode suffix for the size of the current operand.
17764 Z -- likewise, with special suffixes for x87 instructions.
17765 * -- print a star (in certain assembler syntax)
17766 A -- print an absolute memory reference.
17767 E -- print address with DImode register names if TARGET_64BIT.
17768 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17769 s -- print a shift double count, followed by the assembler's argument
17770 delimiter.
17771 b -- print the QImode name of the register for the indicated operand.
17772 %b0 would print %al if operands[0] is reg 0.
17773 w -- likewise, print the HImode name of the register.
17774 k -- likewise, print the SImode name of the register.
17775 q -- likewise, print the DImode name of the register.
17776 x -- likewise, print the V4SFmode name of the register.
17777 t -- likewise, print the V8SFmode name of the register.
17778 g -- likewise, print the V16SFmode name of the register.
17779 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17780 y -- print "st(0)" instead of "st" as a register.
17781 d -- print duplicated register operand for AVX instruction.
17782 D -- print condition for SSE cmp instruction.
17783 P -- if PIC, print an @PLT suffix.
17784 p -- print raw symbol name.
17785 X -- don't print any sort of PIC '@' suffix for a symbol.
17786 & -- print some in-use local-dynamic symbol name.
17787 H -- print a memory address offset by 8; used for sse high-parts
17788 Y -- print condition for XOP pcom* instruction.
17789 + -- print a branch hint as 'cs' or 'ds' prefix
17790 ; -- print a semicolon (after prefixes due to a bug in older gas).
17791 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17792 @ -- print a segment register of thread base pointer load
17793 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17794 ! -- print MPX prefix for jxx/call/ret instructions if required.
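As illustrative examples of the handling below: "%z0" appends the
integer size suffix of operand 0 (e.g. 'l' for an SImode operand in
AT&T syntax), and "%k1" prints the SImode register name of operand 1.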
17797 void
17798 ix86_print_operand (FILE *file, rtx x, int code)
17800 if (code)
17802 switch (code)
17804 case 'A':
17805 switch (ASSEMBLER_DIALECT)
17807 case ASM_ATT:
17808 putc ('*', file);
17809 break;
17811 case ASM_INTEL:
17812 /* Intel syntax. For absolute addresses, registers should not
17813 be surrounded by braces. */
17814 if (!REG_P (x))
17816 putc ('[', file);
17817 ix86_print_operand (file, x, 0);
17818 putc (']', file);
17819 return;
17821 break;
17823 default:
17824 gcc_unreachable ();
17827 ix86_print_operand (file, x, 0);
17828 return;
17830 case 'E':
17831 /* Wrap address in an UNSPEC to declare special handling. */
17832 if (TARGET_64BIT)
17833 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17835 output_address (VOIDmode, x);
17836 return;
17838 case 'L':
17839 if (ASSEMBLER_DIALECT == ASM_ATT)
17840 putc ('l', file);
17841 return;
17843 case 'W':
17844 if (ASSEMBLER_DIALECT == ASM_ATT)
17845 putc ('w', file);
17846 return;
17848 case 'B':
17849 if (ASSEMBLER_DIALECT == ASM_ATT)
17850 putc ('b', file);
17851 return;
17853 case 'Q':
17854 if (ASSEMBLER_DIALECT == ASM_ATT)
17855 putc ('l', file);
17856 return;
17858 case 'S':
17859 if (ASSEMBLER_DIALECT == ASM_ATT)
17860 putc ('s', file);
17861 return;
17863 case 'T':
17864 if (ASSEMBLER_DIALECT == ASM_ATT)
17865 putc ('t', file);
17866 return;
17868 case 'O':
17869 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17870 if (ASSEMBLER_DIALECT != ASM_ATT)
17871 return;
17873 switch (GET_MODE_SIZE (GET_MODE (x)))
17875 case 2:
17876 putc ('w', file);
17877 break;
17879 case 4:
17880 putc ('l', file);
17881 break;
17883 case 8:
17884 putc ('q', file);
17885 break;
17887 default:
17888 output_operand_lossage ("invalid operand size for operand "
17889 "code 'O'");
17890 return;
17893 putc ('.', file);
17894 #endif
17895 return;
17897 case 'z':
17898 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17900 /* Opcodes don't get size suffixes if using Intel opcodes. */
17901 if (ASSEMBLER_DIALECT == ASM_INTEL)
17902 return;
17904 switch (GET_MODE_SIZE (GET_MODE (x)))
17906 case 1:
17907 putc ('b', file);
17908 return;
17910 case 2:
17911 putc ('w', file);
17912 return;
17914 case 4:
17915 putc ('l', file);
17916 return;
17918 case 8:
17919 putc ('q', file);
17920 return;
17922 default:
17923 output_operand_lossage ("invalid operand size for operand "
17924 "code 'z'");
17925 return;
17929 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17930 warning (0, "non-integer operand used with operand code 'z'");
17931 /* FALLTHRU */
17933 case 'Z':
17934 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
17935 if (ASSEMBLER_DIALECT == ASM_INTEL)
17936 return;
17938 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17940 switch (GET_MODE_SIZE (GET_MODE (x)))
17942 case 2:
17943 #ifdef HAVE_AS_IX86_FILDS
17944 putc ('s', file);
17945 #endif
17946 return;
17948 case 4:
17949 putc ('l', file);
17950 return;
17952 case 8:
17953 #ifdef HAVE_AS_IX86_FILDQ
17954 putc ('q', file);
17955 #else
17956 fputs ("ll", file);
17957 #endif
17958 return;
17960 default:
17961 break;
17964 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17966 /* 387 opcodes don't get size suffixes
17967 if the operands are registers. */
17968 if (STACK_REG_P (x))
17969 return;
17971 switch (GET_MODE_SIZE (GET_MODE (x)))
17973 case 4:
17974 putc ('s', file);
17975 return;
17977 case 8:
17978 putc ('l', file);
17979 return;
17981 case 12:
17982 case 16:
17983 putc ('t', file);
17984 return;
17986 default:
17987 break;
17990 else
17992 output_operand_lossage ("invalid operand type used with "
17993 "operand code 'Z'");
17994 return;
17997 output_operand_lossage ("invalid operand size for operand code 'Z'");
17998 return;
18000 case 'd':
18001 case 'b':
18002 case 'w':
18003 case 'k':
18004 case 'q':
18005 case 'h':
18006 case 't':
18007 case 'g':
18008 case 'y':
18009 case 'x':
18010 case 'X':
18011 case 'P':
18012 case 'p':
18013 break;
18015 case 's':
18016 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
18018 ix86_print_operand (file, x, 0);
18019 fputs (", ", file);
18021 return;
18023 case 'Y':
18024 switch (GET_CODE (x))
18026 case NE:
18027 fputs ("neq", file);
18028 break;
18029 case EQ:
18030 fputs ("eq", file);
18031 break;
18032 case GE:
18033 case GEU:
18034 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
18035 break;
18036 case GT:
18037 case GTU:
18038 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18039 break;
18040 case LE:
18041 case LEU:
18042 fputs ("le", file);
18043 break;
18044 case LT:
18045 case LTU:
18046 fputs ("lt", file);
18047 break;
18048 case UNORDERED:
18049 fputs ("unord", file);
18050 break;
18051 case ORDERED:
18052 fputs ("ord", file);
18053 break;
18054 case UNEQ:
18055 fputs ("ueq", file);
18056 break;
18057 case UNGE:
18058 fputs ("nlt", file);
18059 break;
18060 case UNGT:
18061 fputs ("nle", file);
18062 break;
18063 case UNLE:
18064 fputs ("ule", file);
18065 break;
18066 case UNLT:
18067 fputs ("ult", file);
18068 break;
18069 case LTGT:
18070 fputs ("une", file);
18071 break;
18072 default:
18073 output_operand_lossage ("operand is not a condition code, "
18074 "invalid operand code 'Y'");
18075 return;
18077 return;
18079 case 'D':
18080 /* Little bit of braindamage here. The SSE compare instructions
18081 use completely different names for the comparisons than the
18082 fp conditional moves do. */
18083 switch (GET_CODE (x))
18085 case UNEQ:
18086 if (TARGET_AVX)
18088 fputs ("eq_us", file);
18089 break;
18091 /* FALLTHRU */
18092 case EQ:
18093 fputs ("eq", file);
18094 break;
18095 case UNLT:
18096 if (TARGET_AVX)
18098 fputs ("nge", file);
18099 break;
18101 /* FALLTHRU */
18102 case LT:
18103 fputs ("lt", file);
18104 break;
18105 case UNLE:
18106 if (TARGET_AVX)
18108 fputs ("ngt", file);
18109 break;
18111 /* FALLTHRU */
18112 case LE:
18113 fputs ("le", file);
18114 break;
18115 case UNORDERED:
18116 fputs ("unord", file);
18117 break;
18118 case LTGT:
18119 if (TARGET_AVX)
18121 fputs ("neq_oq", file);
18122 break;
18124 /* FALLTHRU */
18125 case NE:
18126 fputs ("neq", file);
18127 break;
18128 case GE:
18129 if (TARGET_AVX)
18131 fputs ("ge", file);
18132 break;
18134 /* FALLTHRU */
18135 case UNGE:
18136 fputs ("nlt", file);
18137 break;
18138 case GT:
18139 if (TARGET_AVX)
18141 fputs ("gt", file);
18142 break;
18144 /* FALLTHRU */
18145 case UNGT:
18146 fputs ("nle", file);
18147 break;
18148 case ORDERED:
18149 fputs ("ord", file);
18150 break;
18151 default:
18152 output_operand_lossage ("operand is not a condition code, "
18153 "invalid operand code 'D'");
18154 return;
18156 return;
18158 case 'F':
18159 case 'f':
18160 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18161 if (ASSEMBLER_DIALECT == ASM_ATT)
18162 putc ('.', file);
18163 gcc_fallthrough ();
18164 #endif
18166 case 'C':
18167 case 'c':
18168 if (!COMPARISON_P (x))
18170 output_operand_lossage ("operand is not a condition code, "
18171 "invalid operand code '%c'", code);
18172 return;
18174 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
18175 code == 'c' || code == 'f',
18176 code == 'F' || code == 'f',
18177 file);
18178 return;
18180 case 'H':
18181 if (!offsettable_memref_p (x))
18183 output_operand_lossage ("operand is not an offsettable memory "
18184 "reference, invalid operand code 'H'");
18185 return;
18187 /* It doesn't actually matter what mode we use here, as we're
18188 only going to use this for printing. */
18189 x = adjust_address_nv (x, DImode, 8);
18190 /* Output 'qword ptr' for intel assembler dialect. */
18191 if (ASSEMBLER_DIALECT == ASM_INTEL)
18192 code = 'q';
18193 break;
18195 case 'K':
18196 if (!CONST_INT_P (x))
18198 output_operand_lossage ("operand is not an integer, invalid "
18199 "operand code 'K'");
18200 return;
18203 if (INTVAL (x) & IX86_HLE_ACQUIRE)
18204 #ifdef HAVE_AS_IX86_HLE
18205 fputs ("xacquire ", file);
18206 #else
18207 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
18208 #endif
18209 else if (INTVAL (x) & IX86_HLE_RELEASE)
18210 #ifdef HAVE_AS_IX86_HLE
18211 fputs ("xrelease ", file);
18212 #else
18213 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
18214 #endif
18215 /* We do not want to print the value of the operand. */
18216 return;
18218 case 'N':
18219 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
18220 fputs ("{z}", file);
18221 return;
18223 case 'r':
18224 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
18226 output_operand_lossage ("operand is not a specific integer, "
18227 "invalid operand code 'r'");
18228 return;
18231 if (ASSEMBLER_DIALECT == ASM_INTEL)
18232 fputs (", ", file);
18234 fputs ("{sae}", file);
18236 if (ASSEMBLER_DIALECT == ASM_ATT)
18237 fputs (", ", file);
18239 return;
18241 case 'R':
18242 if (!CONST_INT_P (x))
18244 output_operand_lossage ("operand is not an integer, invalid "
18245 "operand code 'R'");
18246 return;
18249 if (ASSEMBLER_DIALECT == ASM_INTEL)
18250 fputs (", ", file);
18252 switch (INTVAL (x))
18254 case ROUND_NEAREST_INT | ROUND_SAE:
18255 fputs ("{rn-sae}", file);
18256 break;
18257 case ROUND_NEG_INF | ROUND_SAE:
18258 fputs ("{rd-sae}", file);
18259 break;
18260 case ROUND_POS_INF | ROUND_SAE:
18261 fputs ("{ru-sae}", file);
18262 break;
18263 case ROUND_ZERO | ROUND_SAE:
18264 fputs ("{rz-sae}", file);
18265 break;
18266 default:
18267 output_operand_lossage ("operand is not a specific integer, "
18268 "invalid operand code 'R'");
18271 if (ASSEMBLER_DIALECT == ASM_ATT)
18272 fputs (", ", file);
18274 return;
18276 case '*':
18277 if (ASSEMBLER_DIALECT == ASM_ATT)
18278 putc ('*', file);
18279 return;
18281 case '&':
18283 const char *name = get_some_local_dynamic_name ();
18284 if (name == NULL)
18285 output_operand_lossage ("'%%&' used without any "
18286 "local dynamic TLS references");
18287 else
18288 assemble_name (file, name);
18289 return;
18292 case '+':
18294 rtx x;
18296 if (!optimize
18297 || optimize_function_for_size_p (cfun)
18298 || !TARGET_BRANCH_PREDICTION_HINTS)
18299 return;
18301 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18302 if (x)
18304 int pred_val = XINT (x, 0);
18306 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18307 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18309 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18310 bool cputaken
18311 = final_forward_branch_p (current_output_insn) == 0;
18313 /* Emit hints only in the case the default branch prediction
18314 heuristics would fail. */
18315 if (taken != cputaken)
18317 /* We use 3e (DS) prefix for taken branches and
18318 2e (CS) prefix for not taken branches. */
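/* For instance, a forward conditional jump that profile data says is
   usually taken (so the static forward-not-taken rule would guess
   wrong) comes out roughly as "ds ; jne .L3"; the assembler encodes
   the segment override as the 0x3e hint byte in front of the jump. */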
18319 if (taken)
18320 fputs ("ds ; ", file);
18321 else
18322 fputs ("cs ; ", file);
18326 return;
18329 case ';':
18330 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18331 putc (';', file);
18332 #endif
18333 return;
18335 case '@':
18336 if (ASSEMBLER_DIALECT == ASM_ATT)
18337 putc ('%', file);
18339 /* The kernel uses a different segment register for performance
18340 reasons; a system call would not have to trash the userspace
18341 segment register, which would be expensive. */
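/* The stack-protector patterns, for instance, use '%@' to name the TLS
   segment: a 64-bit userspace canary access ends up as something like
   "movq %fs:<offset>, %reg", while -mcmodel=kernel and 32-bit code go
   through %gs instead; the offset itself comes from other operands. */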
18342 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
18343 fputs ("fs", file);
18344 else
18345 fputs ("gs", file);
18346 return;
18348 case '~':
18349 putc (TARGET_AVX2 ? 'i' : 'f', file);
18350 return;
18352 case '^':
18353 if (TARGET_64BIT && Pmode != word_mode)
18354 fputs ("addr32 ", file);
18355 return;
18357 case '!':
18358 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18359 fputs ("bnd ", file);
18360 return;
18362 default:
18363 output_operand_lossage ("invalid operand code '%c'", code);
18367 if (REG_P (x))
18368 print_reg (x, code, file);
18370 else if (MEM_P (x))
18372 rtx addr = XEXP (x, 0);
18374 /* No `byte ptr' prefix for call instructions ... */
18375 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18377 machine_mode mode = GET_MODE (x);
18378 const char *size;
18380 /* Check for explicit size override codes. */
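/* For instance, a "%b0" modifier forces a BYTE PTR access in Intel
   syntax regardless of the operand's width; without an override the
   PTR size is derived from the operand's machine mode below. */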
18381 if (code == 'b')
18382 size = "BYTE";
18383 else if (code == 'w')
18384 size = "WORD";
18385 else if (code == 'k')
18386 size = "DWORD";
18387 else if (code == 'q')
18388 size = "QWORD";
18389 else if (code == 'x')
18390 size = "XMMWORD";
18391 else if (code == 't')
18392 size = "YMMWORD";
18393 else if (code == 'g')
18394 size = "ZMMWORD";
18395 else if (mode == BLKmode)
18396 /* ... or BLKmode operands, when not overridden. */
18397 size = NULL;
18398 else
18399 switch (GET_MODE_SIZE (mode))
18401 case 1: size = "BYTE"; break;
18402 case 2: size = "WORD"; break;
18403 case 4: size = "DWORD"; break;
18404 case 8: size = "QWORD"; break;
18405 case 12: size = "TBYTE"; break;
18406 case 16:
18407 if (mode == XFmode)
18408 size = "TBYTE";
18409 else
18410 size = "XMMWORD";
18411 break;
18412 case 32: size = "YMMWORD"; break;
18413 case 64: size = "ZMMWORD"; break;
18414 default:
18415 gcc_unreachable ();
18417 if (size)
18419 fputs (size, file);
18420 fputs (" PTR ", file);
18424 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18425 output_operand_lossage ("invalid constraints for operand");
18426 else
18427 ix86_print_operand_address_as
18428 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18431 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18433 long l;
18435 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18437 if (ASSEMBLER_DIALECT == ASM_ATT)
18438 putc ('$', file);
18439 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18440 if (code == 'q')
18441 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18442 (unsigned long long) (int) l);
18443 else
18444 fprintf (file, "0x%08x", (unsigned int) l);
18447 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18449 long l[2];
18451 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18453 if (ASSEMBLER_DIALECT == ASM_ATT)
18454 putc ('$', file);
18455 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18458 /* These float cases don't actually occur as immediate operands. */
18459 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18461 char dstr[30];
18463 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18464 fputs (dstr, file);
18467 else
18469 /* We have patterns that allow zero sets of memory, for instance.
18470 In 64-bit mode, we should probably support all 8-byte vectors,
18471 since we can in fact encode that into an immediate. */
18472 if (GET_CODE (x) == CONST_VECTOR)
18474 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18475 x = const0_rtx;
18478 if (code != 'P' && code != 'p')
18480 if (CONST_INT_P (x))
18482 if (ASSEMBLER_DIALECT == ASM_ATT)
18483 putc ('$', file);
18485 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18486 || GET_CODE (x) == LABEL_REF)
18488 if (ASSEMBLER_DIALECT == ASM_ATT)
18489 putc ('$', file);
18490 else
18491 fputs ("OFFSET FLAT:", file);
18494 if (CONST_INT_P (x))
18495 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18496 else if (flag_pic || MACHOPIC_INDIRECT)
18497 output_pic_addr_const (file, x, code);
18498 else
18499 output_addr_const (file, x);
18503 static bool
18504 ix86_print_operand_punct_valid_p (unsigned char code)
18506 return (code == '@' || code == '*' || code == '+' || code == '&'
18507 || code == ';' || code == '~' || code == '^' || code == '!');
18510 /* Print a memory operand whose address is ADDR. */
18512 static void
18513 ix86_print_operand_address_as (FILE *file, rtx addr,
18514 addr_space_t as, bool no_rip)
18516 struct ix86_address parts;
18517 rtx base, index, disp;
18518 int scale;
18519 int ok;
18520 bool vsib = false;
18521 int code = 0;
18523 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18525 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18526 gcc_assert (parts.index == NULL_RTX);
18527 parts.index = XVECEXP (addr, 0, 1);
18528 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18529 addr = XVECEXP (addr, 0, 0);
18530 vsib = true;
18532 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18534 gcc_assert (TARGET_64BIT);
18535 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18536 code = 'q';
18538 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18540 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18541 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18542 if (parts.base != NULL_RTX)
18544 parts.index = parts.base;
18545 parts.scale = 1;
18547 parts.base = XVECEXP (addr, 0, 0);
18548 addr = XVECEXP (addr, 0, 0);
18550 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18552 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18553 gcc_assert (parts.index == NULL_RTX);
18554 parts.index = XVECEXP (addr, 0, 1);
18555 addr = XVECEXP (addr, 0, 0);
18557 else
18558 ok = ix86_decompose_address (addr, &parts);
18560 gcc_assert (ok);
18562 base = parts.base;
18563 index = parts.index;
18564 disp = parts.disp;
18565 scale = parts.scale;
18567 if (ADDR_SPACE_GENERIC_P (as))
18568 as = parts.seg;
18569 else
18570 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18572 if (!ADDR_SPACE_GENERIC_P (as))
18574 const char *string;
18576 if (as == ADDR_SPACE_SEG_FS)
18577 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18578 else if (as == ADDR_SPACE_SEG_GS)
18579 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18580 else
18581 gcc_unreachable ();
18582 fputs (string, file);
18585 /* Use the one-byte-shorter RIP-relative addressing for 64-bit mode. */
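/* E.g. "movl foo(%rip), %eax" needs no SIB byte, while the equivalent
   absolute form does in 64-bit mode, so the RIP-relative encoding is
   one byte shorter. */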
18586 if (TARGET_64BIT && !base && !index && !no_rip)
18588 rtx symbol = disp;
18590 if (GET_CODE (disp) == CONST
18591 && GET_CODE (XEXP (disp, 0)) == PLUS
18592 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18593 symbol = XEXP (XEXP (disp, 0), 0);
18595 if (GET_CODE (symbol) == LABEL_REF
18596 || (GET_CODE (symbol) == SYMBOL_REF
18597 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18598 base = pc_rtx;
18601 if (!base && !index)
18603 /* A displacement-only address requires special attention. */
18604 if (CONST_INT_P (disp))
18606 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == ADDR_SPACE_GENERIC)
18607 fputs ("ds:", file);
18608 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18610 /* Load the external function address via the GOT slot to avoid PLT. */
18611 else if (GET_CODE (disp) == CONST
18612 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18613 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18614 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18615 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18616 output_pic_addr_const (file, disp, 0);
18617 else if (flag_pic)
18618 output_pic_addr_const (file, disp, 0);
18619 else
18620 output_addr_const (file, disp);
18622 else
18624 /* Print SImode register names to force the addr32 prefix. */
18625 if (SImode_address_operand (addr, VOIDmode))
18627 if (flag_checking)
18629 gcc_assert (TARGET_64BIT);
18630 switch (GET_CODE (addr))
18632 case SUBREG:
18633 gcc_assert (GET_MODE (addr) == SImode);
18634 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18635 break;
18636 case ZERO_EXTEND:
18637 case AND:
18638 gcc_assert (GET_MODE (addr) == DImode);
18639 break;
18640 default:
18641 gcc_unreachable ();
18644 gcc_assert (!code);
18645 code = 'k';
18647 else if (code == 0
18648 && TARGET_X32
18649 && disp
18650 && CONST_INT_P (disp)
18651 && INTVAL (disp) < -16*1024*1024)
18653 /* X32 runs in 64-bit mode, where displacement, DISP, in
18654 address DISP(%r64), is encoded as 32-bit immediate sign-
18655 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18656 address is %r64 + 0xffffffffbffffd00. When %r64 <
18657 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18658 which is invalid for x32. The correct address is %r64
18659 - 0x40000300 == 0xf7ffdd64. To properly encode
18660 -0x40000300(%r64) for x32, we zero-extend the negative
18661 displacement by forcing the addr32 prefix, which truncates
18662 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18663 zero-extend all negative displacements, including -1(%rsp).
18664 However, for small negative displacements, sign-extension
18665 won't cause overflow. We only zero-extend negative
18666 displacements if they are < -16*1024*1024, which is also the
18667 bound used to check legitimate address displacements for PIC. */
18668 code = 'k';
18671 if (ASSEMBLER_DIALECT == ASM_ATT)
18673 if (disp)
18675 if (flag_pic)
18676 output_pic_addr_const (file, disp, 0);
18677 else if (GET_CODE (disp) == LABEL_REF)
18678 output_asm_label (disp);
18679 else
18680 output_addr_const (file, disp);
18683 putc ('(', file);
18684 if (base)
18685 print_reg (base, code, file);
18686 if (index)
18688 putc (',', file);
18689 print_reg (index, vsib ? 0 : code, file);
18690 if (scale != 1 || vsib)
18691 fprintf (file, ",%d", scale);
18693 putc (')', file);
18695 else
18697 rtx offset = NULL_RTX;
18699 if (disp)
18701 /* Pull out the offset of a symbol; print any symbol itself. */
18702 if (GET_CODE (disp) == CONST
18703 && GET_CODE (XEXP (disp, 0)) == PLUS
18704 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18706 offset = XEXP (XEXP (disp, 0), 1);
18707 disp = gen_rtx_CONST (VOIDmode,
18708 XEXP (XEXP (disp, 0), 0));
18711 if (flag_pic)
18712 output_pic_addr_const (file, disp, 0);
18713 else if (GET_CODE (disp) == LABEL_REF)
18714 output_asm_label (disp);
18715 else if (CONST_INT_P (disp))
18716 offset = disp;
18717 else
18718 output_addr_const (file, disp);
18721 putc ('[', file);
18722 if (base)
18724 print_reg (base, code, file);
18725 if (offset)
18727 if (INTVAL (offset) >= 0)
18728 putc ('+', file);
18729 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18732 else if (offset)
18733 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18734 else
18735 putc ('0', file);
18737 if (index)
18739 putc ('+', file);
18740 print_reg (index, vsib ? 0 : code, file);
18741 if (scale != 1 || vsib)
18742 fprintf (file, "*%d", scale);
18744 putc (']', file);
18749 static void
18750 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18752 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18755 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18757 static bool
18758 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18760 rtx op;
18762 if (GET_CODE (x) != UNSPEC)
18763 return false;
18765 op = XVECEXP (x, 0, 0);
18766 switch (XINT (x, 1))
18768 case UNSPEC_GOTTPOFF:
18769 output_addr_const (file, op);
18770 /* FIXME: This might be @TPOFF in Sun ld. */
18771 fputs ("@gottpoff", file);
18772 break;
18773 case UNSPEC_TPOFF:
18774 output_addr_const (file, op);
18775 fputs ("@tpoff", file);
18776 break;
18777 case UNSPEC_NTPOFF:
18778 output_addr_const (file, op);
18779 if (TARGET_64BIT)
18780 fputs ("@tpoff", file);
18781 else
18782 fputs ("@ntpoff", file);
18783 break;
18784 case UNSPEC_DTPOFF:
18785 output_addr_const (file, op);
18786 fputs ("@dtpoff", file);
18787 break;
18788 case UNSPEC_GOTNTPOFF:
18789 output_addr_const (file, op);
18790 if (TARGET_64BIT)
18791 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18792 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18793 else
18794 fputs ("@gotntpoff", file);
18795 break;
18796 case UNSPEC_INDNTPOFF:
18797 output_addr_const (file, op);
18798 fputs ("@indntpoff", file);
18799 break;
18800 #if TARGET_MACHO
18801 case UNSPEC_MACHOPIC_OFFSET:
18802 output_addr_const (file, op);
18803 putc ('-', file);
18804 machopic_output_function_base_name (file);
18805 break;
18806 #endif
18808 case UNSPEC_STACK_CHECK:
18810 int offset;
18812 gcc_assert (flag_split_stack);
18814 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
18815 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
18816 #else
18817 gcc_unreachable ();
18818 #endif
18820 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
18822 break;
18824 default:
18825 return false;
18828 return true;
18831 /* Split one or more double-mode RTL references into pairs of half-mode
18832 references. The RTL can be REG, offsettable MEM, integer constant, or
18833 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18834 split and "num" is its length. lo_half and hi_half are output arrays
18835 that parallel "operands". */
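/* For instance, splitting the DImode constant 0x1122334455667788 on
   this little-endian target yields a lo_half of 0x55667788 and a
   hi_half of 0x11223344, while a DImode MEM is split into two SImode
   MEMs at offsets 0 and 4. */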
18837 void
18838 split_double_mode (machine_mode mode, rtx operands[],
18839 int num, rtx lo_half[], rtx hi_half[])
18841 machine_mode half_mode;
18842 unsigned int byte;
18844 switch (mode)
18846 case TImode:
18847 half_mode = DImode;
18848 break;
18849 case DImode:
18850 half_mode = SImode;
18851 break;
18852 default:
18853 gcc_unreachable ();
18856 byte = GET_MODE_SIZE (half_mode);
18858 while (num--)
18860 rtx op = operands[num];
18862 /* simplify_subreg refuses to split volatile memory addresses,
18863 but we still have to handle them. */
18864 if (MEM_P (op))
18866 lo_half[num] = adjust_address (op, half_mode, 0);
18867 hi_half[num] = adjust_address (op, half_mode, byte);
18869 else
18871 lo_half[num] = simplify_gen_subreg (half_mode, op,
18872 GET_MODE (op) == VOIDmode
18873 ? mode : GET_MODE (op), 0);
18874 hi_half[num] = simplify_gen_subreg (half_mode, op,
18875 GET_MODE (op) == VOIDmode
18876 ? mode : GET_MODE (op), byte);
18881 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18882 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18883 is the expression of the binary operation. The output may either be
18884 emitted here, or returned to the caller, like all output_* functions.
18886 There is no guarantee that the operands are the same mode, as they
18887 might be within FLOAT or FLOAT_EXTEND expressions. */
18889 #ifndef SYSV386_COMPAT
18890 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18891 wants to fix the assemblers because that causes incompatibility
18892 with gcc. No-one wants to fix gcc because that causes
18893 incompatibility with assemblers... You can use the option of
18894 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18895 #define SYSV386_COMPAT 1
18896 #endif
18898 const char *
18899 output_387_binary_op (rtx_insn *insn, rtx *operands)
18901 static char buf[40];
18902 const char *p;
18903 const char *ssep;
18904 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
18906 /* Even if we do not want to check the inputs, this documents the input
18907 constraints, which helps in understanding the following code. */
18908 if (flag_checking)
18910 if (STACK_REG_P (operands[0])
18911 && ((REG_P (operands[1])
18912 && REGNO (operands[0]) == REGNO (operands[1])
18913 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18914 || (REG_P (operands[2])
18915 && REGNO (operands[0]) == REGNO (operands[2])
18916 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18917 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18918 ; /* ok */
18919 else
18920 gcc_assert (is_sse);
18923 switch (GET_CODE (operands[3]))
18925 case PLUS:
18926 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18927 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18928 p = "fiadd";
18929 else
18930 p = "fadd";
18931 ssep = "vadd";
18932 break;
18934 case MINUS:
18935 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18936 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18937 p = "fisub";
18938 else
18939 p = "fsub";
18940 ssep = "vsub";
18941 break;
18943 case MULT:
18944 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18945 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18946 p = "fimul";
18947 else
18948 p = "fmul";
18949 ssep = "vmul";
18950 break;
18952 case DIV:
18953 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18954 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18955 p = "fidiv";
18956 else
18957 p = "fdiv";
18958 ssep = "vdiv";
18959 break;
18961 default:
18962 gcc_unreachable ();
18965 if (is_sse)
18967 if (TARGET_AVX)
18969 strcpy (buf, ssep);
18970 if (GET_MODE (operands[0]) == SFmode)
18971 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
18972 else
18973 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
18975 else
18977 strcpy (buf, ssep + 1);
18978 if (GET_MODE (operands[0]) == SFmode)
18979 strcat (buf, "ss\t{%2, %0|%0, %2}");
18980 else
18981 strcat (buf, "sd\t{%2, %0|%0, %2}");
18983 return buf;
18985 strcpy (buf, p);
18987 switch (GET_CODE (operands[3]))
18989 case MULT:
18990 case PLUS:
18991 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18992 std::swap (operands[1], operands[2]);
18994 /* We now know operands[0] == operands[1]. */
18996 if (MEM_P (operands[2]))
18998 p = "%Z2\t%2";
18999 break;
19002 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19004 if (STACK_TOP_P (operands[0]))
19005 /* How is it that we are storing to a dead operand[2]?
19006 Well, presumably operands[1] is dead too. We can't
19007 store the result to st(0) as st(0) gets popped on this
19008 instruction. Instead store to operands[2] (which I
19009 think has to be st(1)). st(1) will be popped later.
19010 gcc <= 2.8.1 didn't have this check and generated
19011 assembly code that the Unixware assembler rejected. */
19012 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19013 else
19014 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19015 break;
19018 if (STACK_TOP_P (operands[0]))
19019 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19020 else
19021 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19022 break;
19024 case MINUS:
19025 case DIV:
19026 if (MEM_P (operands[1]))
19028 p = "r%Z1\t%1";
19029 break;
19032 if (MEM_P (operands[2]))
19034 p = "%Z2\t%2";
19035 break;
19038 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19040 #if SYSV386_COMPAT
19041 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19042 derived assemblers, confusingly reverse the direction of
19043 the operation for fsub{r} and fdiv{r} when the
19044 destination register is not st(0). The Intel assembler
19045 doesn't have this brain damage. Read !SYSV386_COMPAT to
19046 figure out what the hardware really does. */
19047 if (STACK_TOP_P (operands[0]))
19048 p = "{p\t%0, %2|rp\t%2, %0}";
19049 else
19050 p = "{rp\t%2, %0|p\t%0, %2}";
19051 #else
19052 if (STACK_TOP_P (operands[0]))
19053 /* As above for fmul/fadd, we can't store to st(0). */
19054 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19055 else
19056 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19057 #endif
19058 break;
19061 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19063 #if SYSV386_COMPAT
19064 if (STACK_TOP_P (operands[0]))
19065 p = "{rp\t%0, %1|p\t%1, %0}";
19066 else
19067 p = "{p\t%1, %0|rp\t%0, %1}";
19068 #else
19069 if (STACK_TOP_P (operands[0]))
19070 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
19071 else
19072 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
19073 #endif
19074 break;
19077 if (STACK_TOP_P (operands[0]))
19079 if (STACK_TOP_P (operands[1]))
19080 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19081 else
19082 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
19083 break;
19085 else if (STACK_TOP_P (operands[1]))
19087 #if SYSV386_COMPAT
19088 p = "{\t%1, %0|r\t%0, %1}";
19089 #else
19090 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
19091 #endif
19093 else
19095 #if SYSV386_COMPAT
19096 p = "{r\t%2, %0|\t%0, %2}";
19097 #else
19098 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19099 #endif
19101 break;
19103 default:
19104 gcc_unreachable ();
19107 strcat (buf, p);
19108 return buf;
19111 /* Return needed mode for entity in optimize_mode_switching pass. */
19113 static int
19114 ix86_dirflag_mode_needed (rtx_insn *insn)
19116 if (CALL_P (insn))
19118 if (cfun->machine->func_type == TYPE_NORMAL)
19119 return X86_DIRFLAG_ANY;
19120 else
19121 /* No need to emit CLD in an interrupt handler for TARGET_CLD. */
19122 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19125 if (recog_memoized (insn) < 0)
19126 return X86_DIRFLAG_ANY;
19128 if (get_attr_type (insn) == TYPE_STR)
19130 /* Emit a cld instruction if stringops are used in the function. */
19131 if (cfun->machine->func_type == TYPE_NORMAL)
19132 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19133 else
19134 return X86_DIRFLAG_RESET;
19137 return X86_DIRFLAG_ANY;
19142 /* Check if a 256bit AVX register is referenced inside EXP. */
19142 static bool
19143 ix86_check_avx256_register (const_rtx exp)
19145 if (SUBREG_P (exp))
19146 exp = SUBREG_REG (exp);
19148 return (REG_P (exp)
19149 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
19152 /* Return needed mode for entity in optimize_mode_switching pass. */
19154 static int
19155 ix86_avx_u128_mode_needed (rtx_insn *insn)
19157 if (CALL_P (insn))
19159 rtx link;
19161 /* The needed mode is set to AVX_U128_CLEAN if no 256bit modes
19162 are used in the function arguments. */
19163 for (link = CALL_INSN_FUNCTION_USAGE (insn);
19164 link;
19165 link = XEXP (link, 1))
19167 if (GET_CODE (XEXP (link, 0)) == USE)
19169 rtx arg = XEXP (XEXP (link, 0), 0);
19171 if (ix86_check_avx256_register (arg))
19172 return AVX_U128_DIRTY;
19176 return AVX_U128_CLEAN;
19179 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
19180 changes state only when a 256bit register is written to, but we need
19181 to prevent the compiler from moving the optimal insertion point above
19182 an eventual read from a 256bit register. */
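/* For example, an insn that only reads %ymm0 is still marked
   AVX_U128_DIRTY here, even though the hardware upper state only
   becomes dirty on a 256bit write; otherwise the vzeroupper insertion
   point could be hoisted above that read and zero the upper halves it
   still needs. */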
19183 subrtx_iterator::array_type array;
19184 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
19185 if (ix86_check_avx256_register (*iter))
19186 return AVX_U128_DIRTY;
19188 return AVX_U128_ANY;
19191 /* Return mode that i387 must be switched into
19192 prior to the execution of insn. */
19194 static int
19195 ix86_i387_mode_needed (int entity, rtx_insn *insn)
19197 enum attr_i387_cw mode;
19199 /* The mode UNINITIALIZED is used to store the control word after a
19200 function call or ASM pattern. The mode ANY specifies that the function
19201 has no requirements on the control word and makes no changes in the
19202 bits we are interested in. */
19204 if (CALL_P (insn)
19205 || (NONJUMP_INSN_P (insn)
19206 && (asm_noperands (PATTERN (insn)) >= 0
19207 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
19208 return I387_CW_UNINITIALIZED;
19210 if (recog_memoized (insn) < 0)
19211 return I387_CW_ANY;
19213 mode = get_attr_i387_cw (insn);
19215 switch (entity)
19217 case I387_TRUNC:
19218 if (mode == I387_CW_TRUNC)
19219 return mode;
19220 break;
19222 case I387_FLOOR:
19223 if (mode == I387_CW_FLOOR)
19224 return mode;
19225 break;
19227 case I387_CEIL:
19228 if (mode == I387_CW_CEIL)
19229 return mode;
19230 break;
19232 case I387_MASK_PM:
19233 if (mode == I387_CW_MASK_PM)
19234 return mode;
19235 break;
19237 default:
19238 gcc_unreachable ();
19241 return I387_CW_ANY;
19244 /* Return mode that entity must be switched into
19245 prior to the execution of insn. */
19247 static int
19248 ix86_mode_needed (int entity, rtx_insn *insn)
19250 switch (entity)
19252 case X86_DIRFLAG:
19253 return ix86_dirflag_mode_needed (insn);
19254 case AVX_U128:
19255 return ix86_avx_u128_mode_needed (insn);
19256 case I387_TRUNC:
19257 case I387_FLOOR:
19258 case I387_CEIL:
19259 case I387_MASK_PM:
19260 return ix86_i387_mode_needed (entity, insn);
19261 default:
19262 gcc_unreachable ();
19264 return 0;
19267 /* Check if a 256bit AVX register is referenced in stores. */
19269 static void
19270 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
19272 if (ix86_check_avx256_register (dest))
19274 bool *used = (bool *) data;
19275 *used = true;
19279 /* Calculate mode of upper 128bit AVX registers after the insn. */
19281 static int
19282 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19284 rtx pat = PATTERN (insn);
19286 if (vzeroupper_operation (pat, VOIDmode)
19287 || vzeroall_operation (pat, VOIDmode))
19288 return AVX_U128_CLEAN;
19290 /* We know that the state is clean after a CALL insn if no 256bit
19291 registers are used in the function return register. */
19292 if (CALL_P (insn))
19294 bool avx_reg256_found = false;
19295 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
19297 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19300 /* Otherwise, return the current mode. Remember that if the insn
19301 references AVX 256bit registers, the mode was already changed
19302 to DIRTY by MODE_NEEDED. */
19303 return mode;
19306 /* Return the mode that an insn results in. */
19308 static int
19309 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19311 switch (entity)
19313 case X86_DIRFLAG:
19314 return mode;
19315 case AVX_U128:
19316 return ix86_avx_u128_mode_after (mode, insn);
19317 case I387_TRUNC:
19318 case I387_FLOOR:
19319 case I387_CEIL:
19320 case I387_MASK_PM:
19321 return mode;
19322 default:
19323 gcc_unreachable ();
19327 static int
19328 ix86_dirflag_mode_entry (void)
19330 /* For TARGET_CLD, or in an interrupt handler, we can't assume the
19331 direction flag state at function entry. */
19332 if (TARGET_CLD
19333 || cfun->machine->func_type != TYPE_NORMAL)
19334 return X86_DIRFLAG_ANY;
19336 return X86_DIRFLAG_RESET;
19339 static int
19340 ix86_avx_u128_mode_entry (void)
19342 tree arg;
19344 /* The entry mode is set to AVX_U128_DIRTY if 256bit modes
19345 are used in the function arguments. */
19346 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19347 arg = TREE_CHAIN (arg))
19349 rtx incoming = DECL_INCOMING_RTL (arg);
19351 if (incoming && ix86_check_avx256_register (incoming))
19352 return AVX_U128_DIRTY;
19355 return AVX_U128_CLEAN;
19358 /* Return a mode that ENTITY is assumed to be
19359 switched to at function entry. */
19361 static int
19362 ix86_mode_entry (int entity)
19364 switch (entity)
19366 case X86_DIRFLAG:
19367 return ix86_dirflag_mode_entry ();
19368 case AVX_U128:
19369 return ix86_avx_u128_mode_entry ();
19370 case I387_TRUNC:
19371 case I387_FLOOR:
19372 case I387_CEIL:
19373 case I387_MASK_PM:
19374 return I387_CW_ANY;
19375 default:
19376 gcc_unreachable ();
19380 static int
19381 ix86_avx_u128_mode_exit (void)
19383 rtx reg = crtl->return_rtx;
19385 /* The exit mode is set to AVX_U128_DIRTY if 256bit modes
19386 are used in the function return register. */
19387 if (reg && ix86_check_avx256_register (reg))
19388 return AVX_U128_DIRTY;
19390 return AVX_U128_CLEAN;
19393 /* Return a mode that ENTITY is assumed to be
19394 switched to at function exit. */
19396 static int
19397 ix86_mode_exit (int entity)
19399 switch (entity)
19401 case X86_DIRFLAG:
19402 return X86_DIRFLAG_ANY;
19403 case AVX_U128:
19404 return ix86_avx_u128_mode_exit ();
19405 case I387_TRUNC:
19406 case I387_FLOOR:
19407 case I387_CEIL:
19408 case I387_MASK_PM:
19409 return I387_CW_ANY;
19410 default:
19411 gcc_unreachable ();
19415 static int
19416 ix86_mode_priority (int, int n)
19418 return n;
19421 /* Output code to initialize the control word copies used by the trunc?f?i
19422 and rounding patterns. CURRENT_MODE is set to the current control word,
19423 while NEW_MODE is set to the new control word. */
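/* As a reminder of the x87 control word layout relied on below: the
   rounding control field is bits 11:10 (00 = to nearest, 01 = down,
   10 = up, 11 = toward zero, hence the 0x0400/0x0800/0x0c00 masks) and
   bit 5 (0x0020) is the precision exception mask. */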
19425 static void
19426 emit_i387_cw_initialization (int mode)
19428 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19429 rtx new_mode;
19431 enum ix86_stack_slot slot;
19433 rtx reg = gen_reg_rtx (HImode);
19435 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19436 emit_move_insn (reg, copy_rtx (stored_mode));
19438 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19439 || optimize_insn_for_size_p ())
19441 switch (mode)
19443 case I387_CW_TRUNC:
19444 /* round toward zero (truncate) */
19445 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19446 slot = SLOT_CW_TRUNC;
19447 break;
19449 case I387_CW_FLOOR:
19450 /* round down toward -oo */
19451 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19452 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19453 slot = SLOT_CW_FLOOR;
19454 break;
19456 case I387_CW_CEIL:
19457 /* round up toward +oo */
19458 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19459 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19460 slot = SLOT_CW_CEIL;
19461 break;
19463 case I387_CW_MASK_PM:
19464 /* mask precision exception for nearbyint() */
19465 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19466 slot = SLOT_CW_MASK_PM;
19467 break;
19469 default:
19470 gcc_unreachable ();
19473 else
19475 switch (mode)
19477 case I387_CW_TRUNC:
19478 /* round toward zero (truncate) */
19479 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19480 slot = SLOT_CW_TRUNC;
19481 break;
19483 case I387_CW_FLOOR:
19484 /* round down toward -oo */
19485 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19486 slot = SLOT_CW_FLOOR;
19487 break;
19489 case I387_CW_CEIL:
19490 /* round up toward +oo */
19491 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19492 slot = SLOT_CW_CEIL;
19493 break;
19495 case I387_CW_MASK_PM:
19496 /* mask precision exception for nearbyint() */
19497 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19498 slot = SLOT_CW_MASK_PM;
19499 break;
19501 default:
19502 gcc_unreachable ();
19506 gcc_assert (slot < MAX_386_STACK_LOCALS);
19508 new_mode = assign_386_stack_local (HImode, slot);
19509 emit_move_insn (new_mode, reg);
19512 /* Emit vzeroupper. */
19514 void
19515 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19517 int i;
19519 /* Cancel automatic vzeroupper insertion if there are
19520 live call-saved SSE registers at the insertion point. */
19522 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19523 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19524 return;
19526 if (TARGET_64BIT)
19527 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19528 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19529 return;
19531 emit_insn (gen_avx_vzeroupper ());
19536 /* Generate one or more insns to set ENTITY to MODE. REGS_LIVE
19537 is the set of hard registers live at the point where the insn(s)
19538 are to be inserted. */
19540 static void
19541 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19542 HARD_REG_SET regs_live)
19544 switch (entity)
19546 case X86_DIRFLAG:
19547 if (mode == X86_DIRFLAG_RESET)
19548 emit_insn (gen_cld ());
19549 break;
19550 case AVX_U128:
19551 if (mode == AVX_U128_CLEAN)
19552 ix86_avx_emit_vzeroupper (regs_live);
19553 break;
19554 case I387_TRUNC:
19555 case I387_FLOOR:
19556 case I387_CEIL:
19557 case I387_MASK_PM:
19558 if (mode != I387_CW_ANY
19559 && mode != I387_CW_UNINITIALIZED)
19560 emit_i387_cw_initialization (mode);
19561 break;
19562 default:
19563 gcc_unreachable ();
19567 /* Output code for INSN to convert a float to a signed int. OPERANDS
19568 are the insn operands. The output may be [HSD]Imode and the input
19569 operand may be [SDX]Fmode. */
19571 const char *
19572 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19574 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19575 int dimode_p = GET_MODE (operands[0]) == DImode;
19576 int round_mode = get_attr_i387_cw (insn);
19578 /* Jump through a hoop or two for DImode, since the hardware has no
19579 non-popping instruction. We used to do this a different way, but
19580 that was somewhat fragile and broke with post-reload splitters. */
19581 if ((dimode_p || fisttp) && !stack_top_dies)
19582 output_asm_insn ("fld\t%y1", operands);
19584 gcc_assert (STACK_TOP_P (operands[1]));
19585 gcc_assert (MEM_P (operands[0]));
19586 gcc_assert (GET_MODE (operands[1]) != TFmode);
19588 if (fisttp)
19589 output_asm_insn ("fisttp%Z0\t%0", operands);
19590 else
19592 if (round_mode != I387_CW_ANY)
19593 output_asm_insn ("fldcw\t%3", operands);
19594 if (stack_top_dies || dimode_p)
19595 output_asm_insn ("fistp%Z0\t%0", operands);
19596 else
19597 output_asm_insn ("fist%Z0\t%0", operands);
19598 if (round_mode != I387_CW_ANY)
19599 output_asm_insn ("fldcw\t%2", operands);
19602 return "";
19605 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19606 have the values zero or one, indicates the ffreep insn's operand
19607 from the OPERANDS array. */
19609 static const char *
19610 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19612 if (TARGET_USE_FFREEP)
19613 #ifdef HAVE_AS_IX86_FFREEP
19614 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19615 #else
19617 static char retval[32];
19618 int regno = REGNO (operands[opno]);
19620 gcc_assert (STACK_REGNO_P (regno));
19622 regno -= FIRST_STACK_REG;
19624 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19625 return retval;
19627 #endif
19629 return opno ? "fstp\t%y1" : "fstp\t%y0";
19633 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19634 should be used. UNORDERED_P is true when fucom should be used. */
19636 const char *
19637 output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p)
19639 int stack_top_dies;
19640 rtx cmp_op0, cmp_op1;
19641 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
19643 if (eflags_p)
19645 cmp_op0 = operands[0];
19646 cmp_op1 = operands[1];
19648 else
19650 cmp_op0 = operands[1];
19651 cmp_op1 = operands[2];
19654 if (is_sse)
19656 if (GET_MODE (operands[0]) == SFmode)
19657 if (unordered_p)
19658 return "%vucomiss\t{%1, %0|%0, %1}";
19659 else
19660 return "%vcomiss\t{%1, %0|%0, %1}";
19661 else
19662 if (unordered_p)
19663 return "%vucomisd\t{%1, %0|%0, %1}";
19664 else
19665 return "%vcomisd\t{%1, %0|%0, %1}";
19668 gcc_assert (STACK_TOP_P (cmp_op0));
19670 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19672 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
19674 if (stack_top_dies)
19676 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
19677 return output_387_ffreep (operands, 1);
19679 else
19680 return "ftst\n\tfnstsw\t%0";
19683 if (STACK_REG_P (cmp_op1)
19684 && stack_top_dies
19685 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
19686 && REGNO (cmp_op1) != FIRST_STACK_REG)
19688 /* If the top of the 387 stack dies, and the other operand is also
19689 a stack register that dies, then this must be an
19690 `fcompp' float compare. */
19692 if (eflags_p)
19694 /* There is no double-popping fcomi variant. Fortunately,
19695 eflags is immune to the fstp's cc clobbering. */
19696 if (unordered_p)
19697 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
19698 else
19699 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
19700 return output_387_ffreep (operands, 0);
19702 else
19704 if (unordered_p)
19705 return "fucompp\n\tfnstsw\t%0";
19706 else
19707 return "fcompp\n\tfnstsw\t%0";
19710 else
19712 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
19714 static const char * const alt[16] =
19716 "fcom%Z2\t%y2\n\tfnstsw\t%0",
19717 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
19718 "fucom%Z2\t%y2\n\tfnstsw\t%0",
19719 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
19721 "ficom%Z2\t%y2\n\tfnstsw\t%0",
19722 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
19723 NULL,
19724 NULL,
19726 "fcomi\t{%y1, %0|%0, %y1}",
19727 "fcomip\t{%y1, %0|%0, %y1}",
19728 "fucomi\t{%y1, %0|%0, %y1}",
19729 "fucomip\t{%y1, %0|%0, %y1}",
19731 NULL,
19732 NULL,
19733 NULL,
19734 NULL
19737 int mask;
19738 const char *ret;
19740 mask = eflags_p << 3;
19741 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
19742 mask |= unordered_p << 1;
19743 mask |= stack_top_dies;
19745 gcc_assert (mask < 16);
19746 ret = alt[mask];
19747 gcc_assert (ret);
19749 return ret;
19753 void
19754 ix86_output_addr_vec_elt (FILE *file, int value)
19756 const char *directive = ASM_LONG;
19758 #ifdef ASM_QUAD
19759 if (TARGET_LP64)
19760 directive = ASM_QUAD;
19761 #else
19762 gcc_assert (!TARGET_64BIT);
19763 #endif
19765 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19768 void
19769 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19771 const char *directive = ASM_LONG;
19773 #ifdef ASM_QUAD
19774 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19775 directive = ASM_QUAD;
19776 #else
19777 gcc_assert (!TARGET_64BIT);
19778 #endif
19779 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19780 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19781 fprintf (file, "%s%s%d-%s%d\n",
19782 directive, LPREFIX, value, LPREFIX, rel);
19783 else if (HAVE_AS_GOTOFF_IN_DATA)
19784 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19785 #if TARGET_MACHO
19786 else if (TARGET_MACHO)
19788 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19789 machopic_output_function_base_name (file);
19790 putc ('\n', file);
19792 #endif
19793 else
19794 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19795 GOT_SYMBOL_NAME, LPREFIX, value);
19798 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19799 for the target. */
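/* "xor %eax, %eax" is two bytes against five for "mov $0, %eax", but
   XOR clobbers the flags, so that form is emitted below as a PARALLEL
   with an explicit FLAGS_REG clobber. */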
19801 void
19802 ix86_expand_clear (rtx dest)
19804 rtx tmp;
19806 /* We play register width games, which are only valid after reload. */
19807 gcc_assert (reload_completed);
19809 /* Avoid HImode and its attendant prefix byte. */
19810 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19811 dest = gen_rtx_REG (SImode, REGNO (dest));
19812 tmp = gen_rtx_SET (dest, const0_rtx);
19814 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19816 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19817 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19820 emit_insn (tmp);
19823 /* X is an unchanging MEM. If it is a constant pool reference, return
19824 the constant pool rtx, else NULL. */
18826 rtx
18827 maybe_get_pool_constant (rtx x)
19829 x = ix86_delegitimize_address (XEXP (x, 0));
19831 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
19832 return get_pool_constant (x);
19834 return NULL_RTX;
19837 void
19838 ix86_expand_move (machine_mode mode, rtx operands[])
19840 rtx op0, op1;
19841 rtx tmp, addend = NULL_RTX;
19842 enum tls_model model;
19844 op0 = operands[0];
19845 op1 = operands[1];
19847 switch (GET_CODE (op1))
19849 case CONST:
19850 tmp = XEXP (op1, 0);
19852 if (GET_CODE (tmp) != PLUS
19853 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19854 break;
19856 op1 = XEXP (tmp, 0);
19857 addend = XEXP (tmp, 1);
19858 /* FALLTHRU */
19860 case SYMBOL_REF:
19861 model = SYMBOL_REF_TLS_MODEL (op1);
19863 if (model)
19864 op1 = legitimize_tls_address (op1, model, true);
19865 else if (ix86_force_load_from_GOT_p (op1))
19867 /* Load the external function address via GOT slot to avoid PLT. */
19868 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19869 (TARGET_64BIT
19870 ? UNSPEC_GOTPCREL
19871 : UNSPEC_GOT));
19872 op1 = gen_rtx_CONST (Pmode, op1);
19873 op1 = gen_const_mem (Pmode, op1);
19874 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19876 else
19878 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19879 if (tmp)
19881 op1 = tmp;
19882 if (!addend)
19883 break;
19885 else
19887 op1 = operands[1];
19888 break;
19892 if (addend)
19894 op1 = force_operand (op1, NULL_RTX);
19895 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19896 op0, 1, OPTAB_DIRECT);
19898 else
19899 op1 = force_operand (op1, op0);
19901 if (op1 == op0)
19902 return;
19904 op1 = convert_to_mode (mode, op1, 1);
19906 default:
19907 break;
19910 if ((flag_pic || MACHOPIC_INDIRECT)
19911 && symbolic_operand (op1, mode))
19913 if (TARGET_MACHO && !TARGET_64BIT)
19915 #if TARGET_MACHO
19916 /* dynamic-no-pic */
19917 if (MACHOPIC_INDIRECT)
19919 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19920 ? op0 : gen_reg_rtx (Pmode);
19921 op1 = machopic_indirect_data_reference (op1, temp);
19922 if (MACHOPIC_PURE)
19923 op1 = machopic_legitimize_pic_address (op1, mode,
19924 temp == op1 ? 0 : temp);
19926 if (op0 != op1 && GET_CODE (op0) != MEM)
19928 rtx insn = gen_rtx_SET (op0, op1);
19929 emit_insn (insn);
19930 return;
19932 if (GET_CODE (op0) == MEM)
19933 op1 = force_reg (Pmode, op1);
19934 else
19936 rtx temp = op0;
19937 if (GET_CODE (temp) != REG)
19938 temp = gen_reg_rtx (Pmode);
19939 temp = legitimize_pic_address (op1, temp);
19940 if (temp == op0)
19941 return;
19942 op1 = temp;
19944 /* dynamic-no-pic */
19945 #endif
19947 else
19949 if (MEM_P (op0))
19950 op1 = force_reg (mode, op1);
19951 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19953 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19954 op1 = legitimize_pic_address (op1, reg);
19955 if (op0 == op1)
19956 return;
19957 op1 = convert_to_mode (mode, op1, 1);
19961 else
19963 if (MEM_P (op0)
19964 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19965 || !push_operand (op0, mode))
19966 && MEM_P (op1))
19967 op1 = force_reg (mode, op1);
19969 if (push_operand (op0, mode)
19970 && ! general_no_elim_operand (op1, mode))
19971 op1 = copy_to_mode_reg (mode, op1);
19973 /* Force large constants in 64bit compilation into a register
19974 to get them CSEd. */
19975 if (can_create_pseudo_p ()
19976 && (mode == DImode) && TARGET_64BIT
19977 && immediate_operand (op1, mode)
19978 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19979 && !register_operand (op0, mode)
19980 && optimize)
19981 op1 = copy_to_mode_reg (mode, op1);
19983 if (can_create_pseudo_p ()
19984 && CONST_DOUBLE_P (op1))
19986 /* If we are loading a floating point constant into a register,
19987 force the value to memory now, since we'll get better code
19988 out of the back end. */
19990 op1 = validize_mem (force_const_mem (mode, op1));
19991 if (!register_operand (op0, mode))
19993 rtx temp = gen_reg_rtx (mode);
19994 emit_insn (gen_rtx_SET (temp, op1));
19995 emit_move_insn (op0, temp);
19996 return;
20001 emit_insn (gen_rtx_SET (op0, op1));
20004 void
20005 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20007 rtx op0 = operands[0], op1 = operands[1];
20008 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
20009 psABI, since its biggest alignment is 4 bytes. */
20010 unsigned int align = (TARGET_IAMCU
20011 ? GET_MODE_BITSIZE (mode)
20012 : GET_MODE_ALIGNMENT (mode));
20014 if (push_operand (op0, VOIDmode))
20015 op0 = emit_move_resolve_push (mode, op0);
20017 /* Force constants other than zero into memory. We do not know how
20018 the instructions used to build constants modify the upper 64 bits
20019 of the register; once we have that information we may be able
20020 to handle some of them more efficiently. */
20021 if (can_create_pseudo_p ()
20022 && (CONSTANT_P (op1)
20023 || (SUBREG_P (op1)
20024 && CONSTANT_P (SUBREG_REG (op1))))
20025 && ((register_operand (op0, mode)
20026 && !standard_sse_constant_p (op1, mode))
20027 /* ix86_expand_vector_move_misalign() does not like constants. */
20028 || (SSE_REG_MODE_P (mode)
20029 && MEM_P (op0)
20030 && MEM_ALIGN (op0) < align)))
20032 if (SUBREG_P (op1))
20034 machine_mode imode = GET_MODE (SUBREG_REG (op1));
20035 rtx r = force_const_mem (imode, SUBREG_REG (op1));
20036 if (r)
20037 r = validize_mem (r);
20038 else
20039 r = force_reg (imode, SUBREG_REG (op1));
20040 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20042 else
20043 op1 = validize_mem (force_const_mem (mode, op1));
20046 /* We need to check memory alignment for SSE mode since an attribute
20047 can make operands unaligned. */
20048 if (can_create_pseudo_p ()
20049 && SSE_REG_MODE_P (mode)
20050 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20051 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20053 rtx tmp[2];
20055 /* ix86_expand_vector_move_misalign() does not like both
20056 arguments in memory. */
20057 if (!register_operand (op0, mode)
20058 && !register_operand (op1, mode))
20059 op1 = force_reg (mode, op1);
20061 tmp[0] = op0; tmp[1] = op1;
20062 ix86_expand_vector_move_misalign (mode, tmp);
20063 return;
20066 /* Make operand1 a register if it isn't already. */
20067 if (can_create_pseudo_p ()
20068 && !register_operand (op0, mode)
20069 && !register_operand (op1, mode))
20071 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20072 return;
20075 emit_insn (gen_rtx_SET (op0, op1));
20078 /* Split 32-byte AVX unaligned load and store if needed. */
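/* Roughly: an unaligned 256-bit load becomes a 128-bit load of the low
   half followed by a VEC_CONCAT with the high half (which the vec_concat
   patterns emit as vinsertf128), and a store becomes two 128-bit
   vextractf128 stores. */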
20080 static void
20081 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20083 rtx m;
20084 rtx (*extract) (rtx, rtx, rtx);
20085 machine_mode mode;
20087 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20088 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20090 emit_insn (gen_rtx_SET (op0, op1));
20091 return;
20094 rtx orig_op0 = NULL_RTX;
20095 mode = GET_MODE (op0);
20096 switch (GET_MODE_CLASS (mode))
20098 case MODE_VECTOR_INT:
20099 case MODE_INT:
20100 if (mode != V32QImode)
20102 if (!MEM_P (op0))
20104 orig_op0 = op0;
20105 op0 = gen_reg_rtx (V32QImode);
20107 else
20108 op0 = gen_lowpart (V32QImode, op0);
20109 op1 = gen_lowpart (V32QImode, op1);
20110 mode = V32QImode;
20112 break;
20113 case MODE_VECTOR_FLOAT:
20114 break;
20115 default:
20116 gcc_unreachable ();
20119 switch (mode)
20121 default:
20122 gcc_unreachable ();
20123 case V32QImode:
20124 extract = gen_avx_vextractf128v32qi;
20125 mode = V16QImode;
20126 break;
20127 case V8SFmode:
20128 extract = gen_avx_vextractf128v8sf;
20129 mode = V4SFmode;
20130 break;
20131 case V4DFmode:
20132 extract = gen_avx_vextractf128v4df;
20133 mode = V2DFmode;
20134 break;
20137 if (MEM_P (op1))
20139 rtx r = gen_reg_rtx (mode);
20140 m = adjust_address (op1, mode, 0);
20141 emit_move_insn (r, m);
20142 m = adjust_address (op1, mode, 16);
20143 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20144 emit_move_insn (op0, r);
20146 else if (MEM_P (op0))
20148 m = adjust_address (op0, mode, 0);
20149 emit_insn (extract (m, op1, const0_rtx));
20150 m = adjust_address (op0, mode, 16);
20151 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20153 else
20154 gcc_unreachable ();
20156 if (orig_op0)
20157 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
20160 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
20161 straight to ix86_expand_vector_move. */
20162 /* Code generation for scalar reg-reg moves of single and double precision data:
20163 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
20164 movaps reg, reg
20165 else
20166 movss reg, reg
20167 if (x86_sse_partial_reg_dependency == true)
20168 movapd reg, reg
20169 else
20170 movsd reg, reg
20172 Code generation for scalar loads of double precision data:
20173 if (x86_sse_split_regs == true)
20174 movlpd mem, reg (gas syntax)
20175 else
20176 movsd mem, reg
20178 Code generation for unaligned packed loads of single precision data
20179 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
20180 if (x86_sse_unaligned_move_optimal)
20181 movups mem, reg
20183 if (x86_sse_partial_reg_dependency == true)
20185 xorps reg, reg
20186 movlps mem, reg
20187 movhps mem+8, reg
20189 else
20191 movlps mem, reg
20192 movhps mem+8, reg
20195 Code generation for unaligned packed loads of double precision data
20196 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
20197 if (x86_sse_unaligned_move_optimal)
20198 movupd mem, reg
20200 if (x86_sse_split_regs == true)
20202 movlpd mem, reg
20203 movhpd mem+8, reg
20205 else
20207 movsd mem, reg
20208 movhpd mem+8, reg
20212 void
20213 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
20215 rtx op0, op1, m;
20217 op0 = operands[0];
20218 op1 = operands[1];
20220 /* Use unaligned load/store for AVX512 or when optimizing for size. */
20221 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
20223 emit_insn (gen_rtx_SET (op0, op1));
20224 return;
20227 if (TARGET_AVX)
20229 if (GET_MODE_SIZE (mode) == 32)
20230 ix86_avx256_split_vector_move_misalign (op0, op1);
20231 else
20232 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
20233 emit_insn (gen_rtx_SET (op0, op1));
20234 return;
20237 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
20238 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
20240 emit_insn (gen_rtx_SET (op0, op1));
20241 return;
20244 /* ??? If we have typed data, then it would appear that using
20245 movdqu is the only way to get unaligned data loaded with
20246 integer type. */
20247 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
20249 emit_insn (gen_rtx_SET (op0, op1));
20250 return;
20253 if (MEM_P (op1))
20255 if (TARGET_SSE2 && mode == V2DFmode)
20257 rtx zero;
20259 /* When SSE registers are split into halves, we can avoid
20260 writing to the top half twice. */
20261 if (TARGET_SSE_SPLIT_REGS)
20263 emit_clobber (op0);
20264 zero = op0;
20266 else
20268 /* ??? Not sure about the best option for the Intel chips.
20269 The following would seem to satisfy; the register is
20270 entirely cleared, breaking the dependency chain. We
20271 then store to the upper half, with a dependency depth
20272 of one. A rumor has it that Intel recommends two movsd
20273 followed by an unpacklpd, but this is unconfirmed. And
20274 given that the dependency depth of the unpacklpd would
20275 still be one, I'm not sure why this would be better. */
20276 zero = CONST0_RTX (V2DFmode);
20279 m = adjust_address (op1, DFmode, 0);
20280 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20281 m = adjust_address (op1, DFmode, 8);
20282 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20284 else
20286 rtx t;
20288 if (mode != V4SFmode)
20289 t = gen_reg_rtx (V4SFmode);
20290 else
20291 t = op0;
20293 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20294 emit_move_insn (t, CONST0_RTX (V4SFmode));
20295 else
20296 emit_clobber (t);
20298 m = adjust_address (op1, V2SFmode, 0);
20299 emit_insn (gen_sse_loadlps (t, t, m));
20300 m = adjust_address (op1, V2SFmode, 8);
20301 emit_insn (gen_sse_loadhps (t, t, m));
20302 if (mode != V4SFmode)
20303 emit_move_insn (op0, gen_lowpart (mode, t));
20306 else if (MEM_P (op0))
20308 if (TARGET_SSE2 && mode == V2DFmode)
20310 m = adjust_address (op0, DFmode, 0);
20311 emit_insn (gen_sse2_storelpd (m, op1));
20312 m = adjust_address (op0, DFmode, 8);
20313 emit_insn (gen_sse2_storehpd (m, op1));
20315 else
20317 if (mode != V4SFmode)
20318 op1 = gen_lowpart (V4SFmode, op1);
20320 m = adjust_address (op0, V2SFmode, 0);
20321 emit_insn (gen_sse_storelps (m, op1));
20322 m = adjust_address (op0, V2SFmode, 8);
20323 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20326 else
20327 gcc_unreachable ();
20330 /* Helper function of ix86_fixup_binary_operands to canonicalize
20331 operand order. Returns true if the operands should be swapped. */
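/* For example, for a commutative PLUS where operands[0] and operands[2]
   are the same register, swapping the sources lets the two-address
   "add src, dst" form be used directly instead of copying src1 into
   the destination first. */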
20333 static bool
20334 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20335 rtx operands[])
20337 rtx dst = operands[0];
20338 rtx src1 = operands[1];
20339 rtx src2 = operands[2];
20341 /* If the operation is not commutative, we can't do anything. */
20342 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
20343 return false;
20345 /* Highest priority is that src1 should match dst. */
20346 if (rtx_equal_p (dst, src1))
20347 return false;
20348 if (rtx_equal_p (dst, src2))
20349 return true;
20351 /* Next highest priority is that immediate constants come second. */
20352 if (immediate_operand (src2, mode))
20353 return false;
20354 if (immediate_operand (src1, mode))
20355 return true;
20357 /* Lowest priority is that memory references should come second. */
20358 if (MEM_P (src2))
20359 return false;
20360 if (MEM_P (src1))
20361 return true;
20363 return false;
20367 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20368 destination to use for the operation. If different from the true
20369 destination in operands[0], a copy operation will be required. */
20371 rtx
20372 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20373 rtx operands[])
20375 rtx dst = operands[0];
20376 rtx src1 = operands[1];
20377 rtx src2 = operands[2];
20379 /* Canonicalize operand order. */
20380 if (ix86_swap_binary_operands_p (code, mode, operands))
20382 /* It is invalid to swap operands of different modes. */
20383 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20385 std::swap (src1, src2);
20388 /* Both source operands cannot be in memory. */
20389 if (MEM_P (src1) && MEM_P (src2))
20391 /* Optimization: Only read from memory once. */
20392 if (rtx_equal_p (src1, src2))
20394 src2 = force_reg (mode, src2);
20395 src1 = src2;
20397 else if (rtx_equal_p (dst, src1))
20398 src2 = force_reg (mode, src2);
20399 else
20400 src1 = force_reg (mode, src1);
20403 /* If the destination is memory, and we do not have matching source
20404 operands, do things in registers. */
20405 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20406 dst = gen_reg_rtx (mode);
20408 /* Source 1 cannot be a constant. */
20409 if (CONSTANT_P (src1))
20410 src1 = force_reg (mode, src1);
20412 /* Source 1 cannot be a non-matching memory. */
20413 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20414 src1 = force_reg (mode, src1);
20416 /* Improve address combine. */
20417 if (code == PLUS
20418 && GET_MODE_CLASS (mode) == MODE_INT
20419 && MEM_P (src2))
20420 src2 = force_reg (mode, src2);
20422 operands[1] = src1;
20423 operands[2] = src2;
20424 return dst;
20427 /* Similarly, but assume that the destination has already been
20428 set up properly. */
20430 void
20431 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20432 machine_mode mode, rtx operands[])
20434 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20435 gcc_assert (dst == operands[0]);
20438 /* Attempt to expand a binary operator. Make the expansion closer to the
20439 actual machine than just general_operand, which would allow 3 separate
20440 memory references (one output, two input) in a single insn. */
20442 void
20443 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20444 rtx operands[])
20446 rtx src1, src2, dst, op, clob;
20448 dst = ix86_fixup_binary_operands (code, mode, operands);
20449 src1 = operands[1];
20450 src2 = operands[2];
20452 /* Emit the instruction. */
20454 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20456 if (reload_completed
20457 && code == PLUS
20458 && !rtx_equal_p (dst, src1))
20460 /* This is going to be an LEA; avoid splitting it later. */
20461 emit_insn (op);
20463 else
20465 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20466 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20469 /* Fix up the destination if needed. */
20470 if (dst != operands[0])
20471 emit_move_insn (operands[0], dst);
20474 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20475 the given OPERANDS. */
20477 void
20478 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20479 rtx operands[])
20481 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20482 if (SUBREG_P (operands[1]))
20484 op1 = operands[1];
20485 op2 = operands[2];
20487 else if (SUBREG_P (operands[2]))
20489 op1 = operands[2];
20490 op2 = operands[1];
20492 /* Optimize (__m128i) d | (__m128i) e and similar code
20493 when d and e are float vectors into a float vector logical
20494 insn. In C/C++, without using intrinsics there is no other way
20495 to express a vector logical operation on float vectors than
20496 to cast them temporarily to integer vectors. */
20497 if (op1
20498 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20499 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20500 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20501 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20502 && SUBREG_BYTE (op1) == 0
20503 && (GET_CODE (op2) == CONST_VECTOR
20504 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20505 && SUBREG_BYTE (op2) == 0))
20506 && can_create_pseudo_p ())
20508 rtx dst;
20509 switch (GET_MODE (SUBREG_REG (op1)))
20511 case V4SFmode:
20512 case V8SFmode:
20513 case V16SFmode:
20514 case V2DFmode:
20515 case V4DFmode:
20516 case V8DFmode:
20517 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20518 if (GET_CODE (op2) == CONST_VECTOR)
20520 op2 = gen_lowpart (GET_MODE (dst), op2);
20521 op2 = force_reg (GET_MODE (dst), op2);
20523 else
20525 op1 = operands[1];
20526 op2 = SUBREG_REG (operands[2]);
20527 if (!vector_operand (op2, GET_MODE (dst)))
20528 op2 = force_reg (GET_MODE (dst), op2);
20530 op1 = SUBREG_REG (op1);
20531 if (!vector_operand (op1, GET_MODE (dst)))
20532 op1 = force_reg (GET_MODE (dst), op1);
20533 emit_insn (gen_rtx_SET (dst,
20534 gen_rtx_fmt_ee (code, GET_MODE (dst),
20535 op1, op2)));
20536 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20537 return;
20538 default:
20539 break;
20542 if (!vector_operand (operands[1], mode))
20543 operands[1] = force_reg (mode, operands[1]);
20544 if (!vector_operand (operands[2], mode))
20545 operands[2] = force_reg (mode, operands[2]);
20546 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20547 emit_insn (gen_rtx_SET (operands[0],
20548 gen_rtx_fmt_ee (code, mode, operands[1],
20549 operands[2])));
20552 /* Return TRUE or FALSE depending on whether the binary operator meets the
20553 appropriate constraints. */
20555 bool
20556 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20557 rtx operands[3])
20559 rtx dst = operands[0];
20560 rtx src1 = operands[1];
20561 rtx src2 = operands[2];
20563 /* Both source operands cannot be in memory. */
20564 if (MEM_P (src1) && MEM_P (src2))
20565 return false;
20567 /* Canonicalize operand order for commutative operators. */
20568 if (ix86_swap_binary_operands_p (code, mode, operands))
20569 std::swap (src1, src2);
20571 /* If the destination is memory, we must have a matching source operand. */
20572 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20573 return false;
20575 /* Source 1 cannot be a constant. */
20576 if (CONSTANT_P (src1))
20577 return false;
20579 /* Source 1 cannot be a non-matching memory. */
20580 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20581 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20582 return (code == AND
20583 && (mode == HImode
20584 || mode == SImode
20585 || (TARGET_64BIT && mode == DImode))
20586 && satisfies_constraint_L (src2));
20588 return true;
20591 /* Attempt to expand a unary operator. Make the expansion closer to the
20592 actual machine, than just general_operand, which will allow 2 separate
20593 memory references (one output, one input) in a single insn. */
20595 void
20596 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20597 rtx operands[])
20599 bool matching_memory = false;
20600 rtx src, dst, op, clob;
20602 dst = operands[0];
20603 src = operands[1];
20605 /* If the destination is memory, and we do not have matching source
20606 operands, do things in registers. */
20607 if (MEM_P (dst))
20609 if (rtx_equal_p (dst, src))
20610 matching_memory = true;
20611 else
20612 dst = gen_reg_rtx (mode);
20615 /* When source operand is memory, destination must match. */
20616 if (MEM_P (src) && !matching_memory)
20617 src = force_reg (mode, src);
20619 /* Emit the instruction. */
20621 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20623 if (code == NOT)
20624 emit_insn (op);
20625 else
20627 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20628 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20631 /* Fix up the destination if needed. */
20632 if (dst != operands[0])
20633 emit_move_insn (operands[0], dst);
20636 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20637 divisor are within the range [0-255]. */
20639 void
20640 ix86_split_idivmod (machine_mode mode, rtx operands[],
20641 bool signed_p)
20643 rtx_code_label *end_label, *qimode_label;
20644 rtx div, mod;
20645 rtx_insn *insn;
20646 rtx scratch, tmp0, tmp1, tmp2;
20647 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20648 rtx (*gen_zero_extend) (rtx, rtx);
20649 rtx (*gen_test_ccno_1) (rtx, rtx);
20651 switch (mode)
20653 case SImode:
20654 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20655 gen_test_ccno_1 = gen_testsi_ccno_1;
20656 gen_zero_extend = gen_zero_extendqisi2;
20657 break;
20658 case DImode:
20659 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20660 gen_test_ccno_1 = gen_testdi_ccno_1;
20661 gen_zero_extend = gen_zero_extendqidi2;
20662 break;
20663 default:
20664 gcc_unreachable ();
20667 end_label = gen_label_rtx ();
20668 qimode_label = gen_label_rtx ();
20670 scratch = gen_reg_rtx (mode);
20672 /* Use 8bit unsigned divmod if the dividend and divisor are within
20673 the range [0-255]. */
20674 emit_move_insn (scratch, operands[2]);
20675 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20676 scratch, 1, OPTAB_DIRECT);
20677 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20678 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20679 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20680 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20681 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20682 pc_rtx);
20683 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20684 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20685 JUMP_LABEL (insn) = qimode_label;
20687 /* Generate the original signed/unsigned divmod. */
20688 div = gen_divmod4_1 (operands[0], operands[1],
20689 operands[2], operands[3]);
20690 emit_insn (div);
20692 /* Branch to the end. */
20693 emit_jump_insn (gen_jump (end_label));
20694 emit_barrier ();
20696 /* Generate 8bit unsigned divide. */
20697 emit_label (qimode_label);
20698 /* Don't use operands[0] for result of 8bit divide since not all
20699 registers support QImode ZERO_EXTRACT. */
20700 tmp0 = lowpart_subreg (HImode, scratch, mode);
20701 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20702 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20703 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20705 if (signed_p)
20707 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
20708 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
20710 else
20712 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
20713 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
20716 /* Extract remainder from AH. */
20717 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
20718 if (REG_P (operands[1]))
20719 insn = emit_move_insn (operands[1], tmp1);
20720 else
20722 /* Need a new scratch register since the old one has result
20723 of 8bit divide. */
20724 scratch = gen_reg_rtx (mode);
20725 emit_move_insn (scratch, tmp1);
20726 insn = emit_move_insn (operands[1], scratch);
20728 set_unique_reg_note (insn, REG_EQUAL, mod);
20730 /* Zero extend quotient from AL. */
20731 tmp1 = gen_lowpart (QImode, tmp0);
20732 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20733 set_unique_reg_note (insn, REG_EQUAL, div);
20735 emit_label (end_label);
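/* A rough sketch of the sequence emitted above (illustrative only,
   AT&T-style pseudo assembly):
       mov    dividend, scratch
       or     divisor, scratch
       test   $-0x100, scratch      ; any bits set above the low byte?
       je     .Lqimode              ; both operands fit in [0-255]
       (i)div ...                   ; full-width signed/unsigned divmod
       jmp    .Lend
   .Lqimode:
       divb   ...                   ; 8-bit divide: AL = quotient, AH = remainder
   .Lend:  */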
20738 #define LEA_MAX_STALL (3)
20739 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20741 /* Increase given DISTANCE in half-cycles according to
20742 dependencies between PREV and NEXT instructions.
20743 Add 1 half-cycle if there is no dependency and
20744 go to the next cycle if there is some dependency. */
20746 static unsigned int
20747 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20749 df_ref def, use;
20751 if (!prev || !next)
20752 return distance + (distance & 1) + 2;
20754 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20755 return distance + 1;
20757 FOR_EACH_INSN_USE (use, next)
20758 FOR_EACH_INSN_DEF (def, prev)
20759 if (!DF_REF_IS_ARTIFICIAL (def)
20760 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20761 return distance + (distance & 1) + 2;
20763 return distance + 1;
20766 /* Function checks if instruction INSN defines register number
20767 REGNO1 or REGNO2. */
20769 static bool
20770 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20771 rtx_insn *insn)
20773 df_ref def;
20775 FOR_EACH_INSN_DEF (def, insn)
20776 if (DF_REF_REG_DEF_P (def)
20777 && !DF_REF_IS_ARTIFICIAL (def)
20778 && (regno1 == DF_REF_REGNO (def)
20779 || regno2 == DF_REF_REGNO (def)))
20780 return true;
20782 return false;
20785 /* Function checks if instruction INSN uses register number
20786 REGNO as a part of address expression. */
20788 static bool
20789 insn_uses_reg_mem (unsigned int regno, rtx insn)
20791 df_ref use;
20793 FOR_EACH_INSN_USE (use, insn)
20794 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20795 return true;
20797 return false;
20800 /* Search backward for non-agu definition of register number REGNO1
20801 or register number REGNO2 in basic block starting from instruction
20802 START up to head of basic block or instruction INSN.
20804 Function puts true value into *FOUND var if definition was found
20805 and false otherwise.
20807 Distance in half-cycles between START and found instruction or head
20808 of BB is added to DISTANCE and returned. */
20810 static int
20811 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20812 rtx_insn *insn, int distance,
20813 rtx_insn *start, bool *found)
20815 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20816 rtx_insn *prev = start;
20817 rtx_insn *next = NULL;
20819 *found = false;
20821 while (prev
20822 && prev != insn
20823 && distance < LEA_SEARCH_THRESHOLD)
20825 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20827 distance = increase_distance (prev, next, distance);
20828 if (insn_defines_reg (regno1, regno2, prev))
20830 if (recog_memoized (prev) < 0
20831 || get_attr_type (prev) != TYPE_LEA)
20833 *found = true;
20834 return distance;
20838 next = prev;
20840 if (prev == BB_HEAD (bb))
20841 break;
20843 prev = PREV_INSN (prev);
20846 return distance;
20849 /* Search backward for non-agu definition of register number REGNO1
20850 or register number REGNO2 in INSN's basic block until
20851 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20852 2. Reach neighbor BBs boundary, or
20853 3. Reach agu definition.
20854 Returns the distance between the non-agu definition point and INSN.
20855 If no definition point, returns -1. */
20857 static int
20858 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20859 rtx_insn *insn)
20861 basic_block bb = BLOCK_FOR_INSN (insn);
20862 int distance = 0;
20863 bool found = false;
20865 if (insn != BB_HEAD (bb))
20866 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20867 distance, PREV_INSN (insn),
20868 &found);
20870 if (!found && distance < LEA_SEARCH_THRESHOLD)
20872 edge e;
20873 edge_iterator ei;
20874 bool simple_loop = false;
20876 FOR_EACH_EDGE (e, ei, bb->preds)
20877 if (e->src == bb)
20879 simple_loop = true;
20880 break;
20883 if (simple_loop)
20884 distance = distance_non_agu_define_in_bb (regno1, regno2,
20885 insn, distance,
20886 BB_END (bb), &found);
20887 else
20889 int shortest_dist = -1;
20890 bool found_in_bb = false;
20892 FOR_EACH_EDGE (e, ei, bb->preds)
20894 int bb_dist
20895 = distance_non_agu_define_in_bb (regno1, regno2,
20896 insn, distance,
20897 BB_END (e->src),
20898 &found_in_bb);
20899 if (found_in_bb)
20901 if (shortest_dist < 0)
20902 shortest_dist = bb_dist;
20903 else if (bb_dist > 0)
20904 shortest_dist = MIN (bb_dist, shortest_dist);
20906 found = true;
20910 distance = shortest_dist;
20914 /* get_attr_type may modify recog data. We want to make sure
20915 that recog data is valid for instruction INSN, on which
20916 distance_non_agu_define is called. INSN is unchanged here. */
20917 extract_insn_cached (insn);
20919 if (!found)
20920 return -1;
20922 return distance >> 1;
20925 /* Return the distance in half-cycles between INSN and the next
20926 insn that uses register number REGNO in memory address added
20927 to DISTANCE. Return -1 if REGNO is set.
20929 Put true value into *FOUND if register usage was found and
20930 false otherwise.
20931 Put true value into *REDEFINED if register redefinition was
20932 found and false otherwise. */
20934 static int
20935 distance_agu_use_in_bb (unsigned int regno,
20936 rtx_insn *insn, int distance, rtx_insn *start,
20937 bool *found, bool *redefined)
20939 basic_block bb = NULL;
20940 rtx_insn *next = start;
20941 rtx_insn *prev = NULL;
20943 *found = false;
20944 *redefined = false;
20946 if (start != NULL_RTX)
20948 bb = BLOCK_FOR_INSN (start);
20949 if (start != BB_HEAD (bb))
20950 /* If insn and start belong to the same bb, set prev to insn,
20951 so the call to increase_distance will increase the distance
20952 between insns by 1. */
20953 prev = insn;
20956 while (next
20957 && next != insn
20958 && distance < LEA_SEARCH_THRESHOLD)
20960 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20962 distance = increase_distance(prev, next, distance);
20963 if (insn_uses_reg_mem (regno, next))
20965 /* Return DISTANCE if OP0 is used in memory
20966 address in NEXT. */
20967 *found = true;
20968 return distance;
20971 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20973 /* Return -1 if OP0 is set in NEXT. */
20974 *redefined = true;
20975 return -1;
20978 prev = next;
20981 if (next == BB_END (bb))
20982 break;
20984 next = NEXT_INSN (next);
20987 return distance;
20990 /* Return the distance between INSN and the next insn that uses
20991 register number REGNO0 in memory address. Return -1 if no such
20992 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
20994 static int
20995 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20997 basic_block bb = BLOCK_FOR_INSN (insn);
20998 int distance = 0;
20999 bool found = false;
21000 bool redefined = false;
21002 if (insn != BB_END (bb))
21003 distance = distance_agu_use_in_bb (regno0, insn, distance,
21004 NEXT_INSN (insn),
21005 &found, &redefined);
21007 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21009 edge e;
21010 edge_iterator ei;
21011 bool simple_loop = false;
21013 FOR_EACH_EDGE (e, ei, bb->succs)
21014 if (e->dest == bb)
21016 simple_loop = true;
21017 break;
21020 if (simple_loop)
21021 distance = distance_agu_use_in_bb (regno0, insn,
21022 distance, BB_HEAD (bb),
21023 &found, &redefined);
21024 else
21026 int shortest_dist = -1;
21027 bool found_in_bb = false;
21028 bool redefined_in_bb = false;
21030 FOR_EACH_EDGE (e, ei, bb->succs)
21032 int bb_dist
21033 = distance_agu_use_in_bb (regno0, insn,
21034 distance, BB_HEAD (e->dest),
21035 &found_in_bb, &redefined_in_bb);
21036 if (found_in_bb)
21038 if (shortest_dist < 0)
21039 shortest_dist = bb_dist;
21040 else if (bb_dist > 0)
21041 shortest_dist = MIN (bb_dist, shortest_dist);
21043 found = true;
21047 distance = shortest_dist;
21051 if (!found || redefined)
21052 return -1;
21054 return distance >> 1;
21057 /* Define this macro to tune LEA priority vs ADD; it takes effect when
21058 there is a dilemma of choosing LEA or ADD.
21059 Negative value: ADD is preferred over LEA
21060 Zero: Neutral
21061 Positive value: LEA is preferred over ADD. */
21062 #define IX86_LEA_PRIORITY 0
21064 /* Return true if using lea INSN has a performance advantage
21065 over a sequence of instructions. The instruction sequence has
21066 SPLIT_COST cycles higher latency than the lea latency. */
21068 static bool
21069 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21070 unsigned int regno2, int split_cost, bool has_scale)
21072 int dist_define, dist_use;
21074 /* For Silvermont, if using a 2-source or 3-source LEA for
21075 non-destructive destination purposes, or due to wanting the
21076 ability to use SCALE, the use of LEA is justified. */
21077 if (TARGET_SILVERMONT || TARGET_INTEL)
21079 if (has_scale)
21080 return true;
21081 if (split_cost < 1)
21082 return false;
21083 if (regno0 == regno1 || regno0 == regno2)
21084 return false;
21085 return true;
21088 dist_define = distance_non_agu_define (regno1, regno2, insn);
21089 dist_use = distance_agu_use (regno0, insn);
21091 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21093 /* If there is no non-AGU operand definition, no AGU
21094 operand usage, and the split cost is 0, then both the lea
21095 and non-lea variants have the same priority. Currently
21096 we prefer lea for 64-bit code and non-lea for 32-bit
21097 code. */
21098 if (dist_use < 0 && split_cost == 0)
21099 return TARGET_64BIT || IX86_LEA_PRIORITY;
21100 else
21101 return true;
21104 /* The longer the distance to the definition, the more preferable lea is.
21105 Here we adjust it to take into account the splitting cost and
21106 lea priority. */
21107 dist_define += split_cost + IX86_LEA_PRIORITY;
21109 /* If there is no use in a memory address then we just check
21110 that the split cost exceeds the AGU stall. */
21111 if (dist_use < 0)
21112 return dist_define > LEA_MAX_STALL;
21114 /* If this insn has both a backward non-agu dependence and a forward
21115 agu dependence, the one with the shorter distance takes effect. */
21116 return dist_define >= dist_use;
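/* Illustrative reading of the result (hypothetical numbers): with
   split_cost == 1 and a definition distance of 1, dist_define becomes 2
   after the adjustment above (IX86_LEA_PRIORITY is 0); if the next AGU
   use is 3 away, 2 >= 3 is false, so the lea does not outperform and the
   caller will split it.  */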
21119 /* Return true if it is legal to clobber flags by INSN and
21120 false otherwise. */
21122 static bool
21123 ix86_ok_to_clobber_flags (rtx_insn *insn)
21125 basic_block bb = BLOCK_FOR_INSN (insn);
21126 df_ref use;
21127 bitmap live;
21129 while (insn)
21131 if (NONDEBUG_INSN_P (insn))
21133 FOR_EACH_INSN_USE (use, insn)
21134 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21135 return false;
21137 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21138 return true;
21141 if (insn == BB_END (bb))
21142 break;
21144 insn = NEXT_INSN (insn);
21147 live = df_get_live_out(bb);
21148 return !REGNO_REG_SET_P (live, FLAGS_REG);
21151 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21152 move and add to avoid AGU stalls. */
21154 bool
21155 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21157 unsigned int regno0, regno1, regno2;
21159 /* Check if we need to optimize. */
21160 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21161 return false;
21163 /* Check it is correct to split here. */
21164 if (!ix86_ok_to_clobber_flags(insn))
21165 return false;
21167 regno0 = true_regnum (operands[0]);
21168 regno1 = true_regnum (operands[1]);
21169 regno2 = true_regnum (operands[2]);
21171 /* We need to split only adds with a non-destructive
21172 destination operand. */
21173 if (regno0 == regno1 || regno0 == regno2)
21174 return false;
21175 else
21176 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
21179 /* Return true if we should emit lea instruction instead of mov
21180 instruction. */
21182 bool
21183 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
21185 unsigned int regno0, regno1;
21187 /* Check if we need to optimize. */
21188 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21189 return false;
21191 /* Use lea for reg to reg moves only. */
21192 if (!REG_P (operands[0]) || !REG_P (operands[1]))
21193 return false;
21195 regno0 = true_regnum (operands[0]);
21196 regno1 = true_regnum (operands[1]);
21198 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
21201 /* Return true if we need to split lea into a sequence of
21202 instructions to avoid AGU stalls. */
21204 bool
21205 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
21207 unsigned int regno0, regno1, regno2;
21208 int split_cost;
21209 struct ix86_address parts;
21210 int ok;
21212 /* Check if we need to optimize. */
21213 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
21214 return false;
21216 /* The "at least two components" test below might not catch simple
21217 move or zero extension insns if parts.base is non-NULL and parts.disp
21218 is const0_rtx as the only components in the address, e.g. if the
21219 register is %rbp or %r13. As this test is much cheaper and moves or
21220 zero extensions are the common case, do this check first. */
21221 if (REG_P (operands[1])
21222 || (SImode_address_operand (operands[1], VOIDmode)
21223 && REG_P (XEXP (operands[1], 0))))
21224 return false;
21226 /* Check if it is OK to split here. */
21227 if (!ix86_ok_to_clobber_flags (insn))
21228 return false;
21230 ok = ix86_decompose_address (operands[1], &parts);
21231 gcc_assert (ok);
21233 /* There should be at least two components in the address. */
21234 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
21235 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
21236 return false;
21238 /* We should not split into add if a non-legitimate pic
21239 operand is used as the displacement. */
21240 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
21241 return false;
21243 regno0 = true_regnum (operands[0]);
21244 regno1 = INVALID_REGNUM;
21245 regno2 = INVALID_REGNUM;
21247 if (parts.base)
21248 regno1 = true_regnum (parts.base);
21249 if (parts.index)
21250 regno2 = true_regnum (parts.index);
21252 split_cost = 0;
21254 /* Compute how many cycles we will add to the execution time
21255 if we split the lea into a sequence of instructions. */
21256 if (parts.base || parts.index)
21258 /* Have to use a mov instruction if the non-destructive
21259 destination form is used. */
21260 if (regno1 != regno0 && regno2 != regno0)
21261 split_cost += 1;
21263 /* Have to add index to base if both exist. */
21264 if (parts.base && parts.index)
21265 split_cost += 1;
21267 /* Have to use shift and adds if scale is 2 or greater. */
21268 if (parts.scale > 1)
21270 if (regno0 != regno1)
21271 split_cost += 1;
21272 else if (regno2 == regno0)
21273 split_cost += 4;
21274 else
21275 split_cost += parts.scale;
21278 /* Have to use an add instruction with an immediate if
21279 disp is non-zero. */
21280 if (parts.disp && parts.disp != const0_rtx)
21281 split_cost += 1;
21283 /* Subtract the price of lea. */
21284 split_cost -= 1;
21287 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21288 parts.scale > 1);
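/* Worked example of the cost computed above (hypothetical operands): for
   "lea 0x4(%rbx,%rcx,2), %rax", where the destination differs from both
   sources, split_cost is 1 (mov) + 1 (add of base and index) + 1 (shift
   for the scale) + 1 (add of disp) - 1 (the lea itself) = 3.  */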
21291 /* Emit x86 binary operator CODE in mode MODE, where the first operand
21292 matches the destination. The RTX includes a clobber of FLAGS_REG. */
21294 static void
21295 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21296 rtx dst, rtx src)
21298 rtx op, clob;
21300 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21301 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21303 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21306 /* Return true if regno1 def is nearest to the insn. */
21308 static bool
21309 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21311 rtx_insn *prev = insn;
21312 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21314 if (insn == start)
21315 return false;
21316 while (prev && prev != start)
21318 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21320 prev = PREV_INSN (prev);
21321 continue;
21323 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21324 return true;
21325 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21326 return false;
21327 prev = PREV_INSN (prev);
21330 /* None of the regs is defined in the bb. */
21331 return false;
21334 /* Split lea instructions into a sequence of instructions
21335 which are executed on the ALU to avoid AGU stalls.
21336 It is assumed that it is allowed to clobber the flags register
21337 at the lea position. */
21339 void
21340 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21342 unsigned int regno0, regno1, regno2;
21343 struct ix86_address parts;
21344 rtx target, tmp;
21345 int ok, adds;
21347 ok = ix86_decompose_address (operands[1], &parts);
21348 gcc_assert (ok);
21350 target = gen_lowpart (mode, operands[0]);
21352 regno0 = true_regnum (target);
21353 regno1 = INVALID_REGNUM;
21354 regno2 = INVALID_REGNUM;
21356 if (parts.base)
21358 parts.base = gen_lowpart (mode, parts.base);
21359 regno1 = true_regnum (parts.base);
21362 if (parts.index)
21364 parts.index = gen_lowpart (mode, parts.index);
21365 regno2 = true_regnum (parts.index);
21368 if (parts.disp)
21369 parts.disp = gen_lowpart (mode, parts.disp);
21371 if (parts.scale > 1)
21373 /* Case r1 = r1 + ... */
21374 if (regno1 == regno0)
21376 /* If we have a case r1 = r1 + C * r2 then we
21377 should use multiplication which is very
21378 expensive. Assume the cost model is wrong if we
21379 have such a case here. */
21380 gcc_assert (regno2 != regno0);
21382 for (adds = parts.scale; adds > 0; adds--)
21383 ix86_emit_binop (PLUS, mode, target, parts.index);
21385 else
21387 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21388 if (regno0 != regno2)
21389 emit_insn (gen_rtx_SET (target, parts.index));
21391 /* Use shift for scaling. */
21392 ix86_emit_binop (ASHIFT, mode, target,
21393 GEN_INT (exact_log2 (parts.scale)));
21395 if (parts.base)
21396 ix86_emit_binop (PLUS, mode, target, parts.base);
21398 if (parts.disp && parts.disp != const0_rtx)
21399 ix86_emit_binop (PLUS, mode, target, parts.disp);
21402 else if (!parts.base && !parts.index)
21404 gcc_assert(parts.disp);
21405 emit_insn (gen_rtx_SET (target, parts.disp));
21407 else
21409 if (!parts.base)
21411 if (regno0 != regno2)
21412 emit_insn (gen_rtx_SET (target, parts.index));
21414 else if (!parts.index)
21416 if (regno0 != regno1)
21417 emit_insn (gen_rtx_SET (target, parts.base));
21419 else
21421 if (regno0 == regno1)
21422 tmp = parts.index;
21423 else if (regno0 == regno2)
21424 tmp = parts.base;
21425 else
21427 rtx tmp1;
21429 /* Find better operand for SET instruction, depending
21430 on which definition is farther from the insn. */
21431 if (find_nearest_reg_def (insn, regno1, regno2))
21432 tmp = parts.index, tmp1 = parts.base;
21433 else
21434 tmp = parts.base, tmp1 = parts.index;
21436 emit_insn (gen_rtx_SET (target, tmp));
21438 if (parts.disp && parts.disp != const0_rtx)
21439 ix86_emit_binop (PLUS, mode, target, parts.disp);
21441 ix86_emit_binop (PLUS, mode, target, tmp1);
21442 return;
21445 ix86_emit_binop (PLUS, mode, target, tmp);
21448 if (parts.disp && parts.disp != const0_rtx)
21449 ix86_emit_binop (PLUS, mode, target, parts.disp);
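/* For example (hypothetical registers), for the address 0x8(%rbx,%rcx,4)
   with destination %rax the code above emits roughly:
       mov  %rcx, %rax        ; move the index into the target
       shl  $2, %rax          ; scale by 4 using a shift
       add  %rbx, %rax        ; add the base
       add  $0x8, %rax        ; add the displacement  */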
21453 /* Return true if it is ok to optimize an ADD operation to an LEA
21454 operation to avoid flag register consumption. For most processors,
21455 ADD is faster than LEA. For processors like BONNELL, if the
21456 destination register of the LEA holds an actual address which will be
21457 used soon, LEA is better; otherwise ADD is better. */
21459 bool
21460 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21462 unsigned int regno0 = true_regnum (operands[0]);
21463 unsigned int regno1 = true_regnum (operands[1]);
21464 unsigned int regno2 = true_regnum (operands[2]);
21466 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21467 if (regno0 != regno1 && regno0 != regno2)
21468 return true;
21470 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21471 return false;
21473 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21476 /* Return true if destination reg of SET_BODY is shift count of
21477 USE_BODY. */
21479 static bool
21480 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21482 rtx set_dest;
21483 rtx shift_rtx;
21484 int i;
21486 /* Retrieve destination of SET_BODY. */
21487 switch (GET_CODE (set_body))
21489 case SET:
21490 set_dest = SET_DEST (set_body);
21491 if (!set_dest || !REG_P (set_dest))
21492 return false;
21493 break;
21494 case PARALLEL:
21495 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21496 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21497 use_body))
21498 return true;
21499 /* FALLTHROUGH */
21500 default:
21501 return false;
21504 /* Retrieve shift count of USE_BODY. */
21505 switch (GET_CODE (use_body))
21507 case SET:
21508 shift_rtx = XEXP (use_body, 1);
21509 break;
21510 case PARALLEL:
21511 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21512 if (ix86_dep_by_shift_count_body (set_body,
21513 XVECEXP (use_body, 0, i)))
21514 return true;
21515 /* FALLTHROUGH */
21516 default:
21517 return false;
21520 if (shift_rtx
21521 && (GET_CODE (shift_rtx) == ASHIFT
21522 || GET_CODE (shift_rtx) == LSHIFTRT
21523 || GET_CODE (shift_rtx) == ASHIFTRT
21524 || GET_CODE (shift_rtx) == ROTATE
21525 || GET_CODE (shift_rtx) == ROTATERT))
21527 rtx shift_count = XEXP (shift_rtx, 1);
21529 /* Return true if shift count is dest of SET_BODY. */
21530 if (REG_P (shift_count))
21532 /* Add this check since it can be invoked before register
21533 allocation in the pre-reload scheduler. */
21534 if (reload_completed
21535 && true_regnum (set_dest) == true_regnum (shift_count))
21536 return true;
21537 else if (REGNO(set_dest) == REGNO(shift_count))
21538 return true;
21542 return false;
21545 /* Return true if destination reg of SET_INSN is shift count of
21546 USE_INSN. */
21548 bool
21549 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21551 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21552 PATTERN (use_insn));
21555 /* Return TRUE or FALSE depending on whether the unary operator meets the
21556 appropriate constraints. */
21558 bool
21559 ix86_unary_operator_ok (enum rtx_code,
21560 machine_mode,
21561 rtx operands[2])
21563 /* If one of the operands is memory, the source and destination must match. */
21564 if ((MEM_P (operands[0])
21565 || MEM_P (operands[1]))
21566 && ! rtx_equal_p (operands[0], operands[1]))
21567 return false;
21568 return true;
21571 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21572 are ok, keeping in mind the possible movddup alternative. */
21574 bool
21575 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21577 if (MEM_P (operands[0]))
21578 return rtx_equal_p (operands[0], operands[1 + high]);
21579 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21580 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21581 return true;
21584 /* Post-reload splitter for converting an SF or DFmode value in an
21585 SSE register into an unsigned SImode. */
21587 void
21588 ix86_split_convert_uns_si_sse (rtx operands[])
21590 machine_mode vecmode;
21591 rtx value, large, zero_or_two31, input, two31, x;
21593 large = operands[1];
21594 zero_or_two31 = operands[2];
21595 input = operands[3];
21596 two31 = operands[4];
21597 vecmode = GET_MODE (large);
21598 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21600 /* Load up the value into the low element. We must ensure that the other
21601 elements are valid floats -- zero is the easiest such value. */
21602 if (MEM_P (input))
21604 if (vecmode == V4SFmode)
21605 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21606 else
21607 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21609 else
21611 input = gen_rtx_REG (vecmode, REGNO (input));
21612 emit_move_insn (value, CONST0_RTX (vecmode));
21613 if (vecmode == V4SFmode)
21614 emit_insn (gen_sse_movss (value, value, input));
21615 else
21616 emit_insn (gen_sse2_movsd (value, value, input));
21619 emit_move_insn (large, two31);
21620 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21622 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21623 emit_insn (gen_rtx_SET (large, x));
21625 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21626 emit_insn (gen_rtx_SET (zero_or_two31, x));
21628 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21629 emit_insn (gen_rtx_SET (value, x));
21631 large = gen_rtx_REG (V4SImode, REGNO (large));
21632 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21634 x = gen_rtx_REG (V4SImode, REGNO (value));
21635 if (vecmode == V4SFmode)
21636 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21637 else
21638 emit_insn (gen_sse2_cvttpd2dq (x, value));
21639 value = x;
21641 emit_insn (gen_xorv4si3 (value, value, large));
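/* In short (illustrative): inputs below 0x1p31 are converted directly by
   the signed cvtt* instruction; inputs at or above 0x1p31 first have 0x1p31
   subtracted so the signed conversion stays in range, and the missing bit
   is restored afterwards by xoring 0x80000000 into the integer result.  */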
21644 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21645 Expects the 64-bit DImode to be supplied in a pair of integral
21646 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21647 -mfpmath=sse, !optimize_size only. */
21649 void
21650 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21652 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21653 rtx int_xmm, fp_xmm;
21654 rtx biases, exponents;
21655 rtx x;
21657 int_xmm = gen_reg_rtx (V4SImode);
21658 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21659 emit_insn (gen_movdi_to_sse (int_xmm, input));
21660 else if (TARGET_SSE_SPLIT_REGS)
21662 emit_clobber (int_xmm);
21663 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21665 else
21667 x = gen_reg_rtx (V2DImode);
21668 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21669 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21672 x = gen_rtx_CONST_VECTOR (V4SImode,
21673 gen_rtvec (4, GEN_INT (0x43300000UL),
21674 GEN_INT (0x45300000UL),
21675 const0_rtx, const0_rtx));
21676 exponents = validize_mem (force_const_mem (V4SImode, x));
21678 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21679 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21681 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21682 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21683 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21684 (0x1.0p84 + double(fp_value_hi_xmm)).
21685 Note these exponents differ by 32. */
21687 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21689 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21690 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21691 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21692 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21693 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21694 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21695 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21696 biases = validize_mem (force_const_mem (V2DFmode, biases));
21697 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21699 /* Add the upper and lower DFmode values together. */
21700 if (TARGET_SSE3)
21701 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21702 else
21704 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21705 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21706 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21709 ix86_expand_vector_extract (false, target, fp_xmm, 0);
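/* Worked example (hypothetical input): for input = 2^33 + 5 the low word is
   5 and the high word is 2.  Gluing the exponents on gives the doubles
   0x1.0p52 + 5 and 0x1.0p84 + 2*2^32; subtracting the 0x1.0p52 and 0x1.0p84
   biases and adding the two lanes recovers 2^33 + 5 exactly.  */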
21712 /* Not used, but eases macroization of patterns. */
21713 void
21714 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21716 gcc_unreachable ();
21719 /* Convert an unsigned SImode value into a DFmode. Only currently used
21720 for SSE, but applicable anywhere. */
21722 void
21723 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21725 REAL_VALUE_TYPE TWO31r;
21726 rtx x, fp;
21728 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21729 NULL, 1, OPTAB_DIRECT);
21731 fp = gen_reg_rtx (DFmode);
21732 emit_insn (gen_floatsidf2 (fp, x));
21734 real_ldexp (&TWO31r, &dconst1, 31);
21735 x = const_double_from_real_value (TWO31r, DFmode);
21737 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21738 if (x != target)
21739 emit_move_insn (target, x);
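/* The trick (illustrative): adding -2^31 maps the unsigned range [0, 2^32)
   onto the signed range [-2^31, 2^31), which floatsidf2 handles, and adding
   0x1.0p31 back afterwards restores the value.  E.g. input 0xffffffff
   becomes 0x7fffffff = 2147483647, converts to 2147483647.0, and
   2147483647.0 + 0x1.0p31 == 4294967295.0 exactly.  */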
21742 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21743 32-bit mode; otherwise we have a direct convert instruction. */
21745 void
21746 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21748 REAL_VALUE_TYPE TWO32r;
21749 rtx fp_lo, fp_hi, x;
21751 fp_lo = gen_reg_rtx (DFmode);
21752 fp_hi = gen_reg_rtx (DFmode);
21754 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21756 real_ldexp (&TWO32r, &dconst1, 32);
21757 x = const_double_from_real_value (TWO32r, DFmode);
21758 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21760 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21762 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21763 0, OPTAB_DIRECT);
21764 if (x != target)
21765 emit_move_insn (target, x);
21768 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21769 For x86_32, -mfpmath=sse, !optimize_size only. */
21770 void
21771 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21773 REAL_VALUE_TYPE ONE16r;
21774 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21776 real_ldexp (&ONE16r, &dconst1, 16);
21777 x = const_double_from_real_value (ONE16r, SFmode);
21778 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
21779 NULL, 0, OPTAB_DIRECT);
21780 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
21781 NULL, 0, OPTAB_DIRECT);
21782 fp_hi = gen_reg_rtx (SFmode);
21783 fp_lo = gen_reg_rtx (SFmode);
21784 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21785 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21786 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21787 0, OPTAB_DIRECT);
21788 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21789 0, OPTAB_DIRECT);
21790 if (!rtx_equal_p (target, fp_hi))
21791 emit_move_insn (target, fp_hi);
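/* Illustrative note: each 16-bit half is below 2^16 and therefore converts
   to SFmode exactly, and the multiply by 0x1p16 is exact as well, so only
   the final addition rounds, giving a correctly rounded single-precision
   result for any 32-bit unsigned input.  */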
21794 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21795 a vector of unsigned ints VAL to vector of floats TARGET. */
21797 void
21798 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21800 rtx tmp[8];
21801 REAL_VALUE_TYPE TWO16r;
21802 machine_mode intmode = GET_MODE (val);
21803 machine_mode fltmode = GET_MODE (target);
21804 rtx (*cvt) (rtx, rtx);
21806 if (intmode == V4SImode)
21807 cvt = gen_floatv4siv4sf2;
21808 else
21809 cvt = gen_floatv8siv8sf2;
21810 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21811 tmp[0] = force_reg (intmode, tmp[0]);
21812 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21813 OPTAB_DIRECT);
21814 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21815 NULL_RTX, 1, OPTAB_DIRECT);
21816 tmp[3] = gen_reg_rtx (fltmode);
21817 emit_insn (cvt (tmp[3], tmp[1]));
21818 tmp[4] = gen_reg_rtx (fltmode);
21819 emit_insn (cvt (tmp[4], tmp[2]));
21820 real_ldexp (&TWO16r, &dconst1, 16);
21821 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21822 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21823 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21824 OPTAB_DIRECT);
21825 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21826 OPTAB_DIRECT);
21827 if (tmp[7] != target)
21828 emit_move_insn (target, tmp[7]);
21831 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21832 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21833 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21834 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21837 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21839 REAL_VALUE_TYPE TWO31r;
21840 rtx two31r, tmp[4];
21841 machine_mode mode = GET_MODE (val);
21842 machine_mode scalarmode = GET_MODE_INNER (mode);
21843 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21844 rtx (*cmp) (rtx, rtx, rtx, rtx);
21845 int i;
21847 for (i = 0; i < 3; i++)
21848 tmp[i] = gen_reg_rtx (mode);
21849 real_ldexp (&TWO31r, &dconst1, 31);
21850 two31r = const_double_from_real_value (TWO31r, scalarmode);
21851 two31r = ix86_build_const_vector (mode, 1, two31r);
21852 two31r = force_reg (mode, two31r);
21853 switch (mode)
21855 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21856 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21857 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21858 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21859 default: gcc_unreachable ();
21861 tmp[3] = gen_rtx_LE (mode, two31r, val);
21862 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21863 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21864 0, OPTAB_DIRECT);
21865 if (intmode == V4SImode || TARGET_AVX2)
21866 *xorp = expand_simple_binop (intmode, ASHIFT,
21867 gen_lowpart (intmode, tmp[0]),
21868 GEN_INT (31), NULL_RTX, 0,
21869 OPTAB_DIRECT);
21870 else
21872 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21873 two31 = ix86_build_const_vector (intmode, 1, two31);
21874 *xorp = expand_simple_binop (intmode, AND,
21875 gen_lowpart (intmode, tmp[0]),
21876 two31, NULL_RTX, 0,
21877 OPTAB_DIRECT);
21879 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21880 0, OPTAB_DIRECT);
21883 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21884 then replicate the value for all elements of the vector
21885 register. */
21888 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21890 int i, n_elt;
21891 rtvec v;
21892 machine_mode scalar_mode;
21894 switch (mode)
21896 case V64QImode:
21897 case V32QImode:
21898 case V16QImode:
21899 case V32HImode:
21900 case V16HImode:
21901 case V8HImode:
21902 case V16SImode:
21903 case V8SImode:
21904 case V4SImode:
21905 case V8DImode:
21906 case V4DImode:
21907 case V2DImode:
21908 gcc_assert (vect);
21909 /* FALLTHRU */
21910 case V16SFmode:
21911 case V8SFmode:
21912 case V4SFmode:
21913 case V8DFmode:
21914 case V4DFmode:
21915 case V2DFmode:
21916 n_elt = GET_MODE_NUNITS (mode);
21917 v = rtvec_alloc (n_elt);
21918 scalar_mode = GET_MODE_INNER (mode);
21920 RTVEC_ELT (v, 0) = value;
21922 for (i = 1; i < n_elt; ++i)
21923 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21925 return gen_rtx_CONST_VECTOR (mode, v);
21927 default:
21928 gcc_unreachable ();
21932 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21933 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21934 for an SSE register. If VECT is true, then replicate the mask for
21935 all elements of the vector register. If INVERT is true, then create
21936 a mask excluding the sign bit. */
21939 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21941 machine_mode vec_mode, imode;
21942 wide_int w;
21943 rtx mask, v;
21945 switch (mode)
21947 case V16SImode:
21948 case V16SFmode:
21949 case V8SImode:
21950 case V4SImode:
21951 case V8SFmode:
21952 case V4SFmode:
21953 vec_mode = mode;
21954 imode = SImode;
21955 break;
21957 case V8DImode:
21958 case V4DImode:
21959 case V2DImode:
21960 case V8DFmode:
21961 case V4DFmode:
21962 case V2DFmode:
21963 vec_mode = mode;
21964 imode = DImode;
21965 break;
21967 case TImode:
21968 case TFmode:
21969 vec_mode = VOIDmode;
21970 imode = TImode;
21971 break;
21973 default:
21974 gcc_unreachable ();
21977 machine_mode inner_mode = GET_MODE_INNER (mode);
21978 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21979 GET_MODE_BITSIZE (inner_mode));
21980 if (invert)
21981 w = wi::bit_not (w);
21983 /* Force this value into the low part of a fp vector constant. */
21984 mask = immed_wide_int_const (w, imode);
21985 mask = gen_lowpart (inner_mode, mask);
21987 if (vec_mode == VOIDmode)
21988 return force_reg (inner_mode, mask);
21990 v = ix86_build_const_vector (vec_mode, vect, mask);
21991 return force_reg (vec_mode, v);
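/* For example (illustrative): for V4SFmode with VECT set and INVERT clear
   this returns a register holding { -0.0f, -0.0f, -0.0f, -0.0f }, i.e.
   0x80000000 in every lane; with INVERT set each lane is 0x7fffffff.  */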
21994 /* Generate code for floating point ABS or NEG. */
21996 void
21997 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21998 rtx operands[])
22000 rtx mask, set, dst, src;
22001 bool use_sse = false;
22002 bool vector_mode = VECTOR_MODE_P (mode);
22003 machine_mode vmode = mode;
22005 if (vector_mode)
22006 use_sse = true;
22007 else if (mode == TFmode)
22008 use_sse = true;
22009 else if (TARGET_SSE_MATH)
22011 use_sse = SSE_FLOAT_MODE_P (mode);
22012 if (mode == SFmode)
22013 vmode = V4SFmode;
22014 else if (mode == DFmode)
22015 vmode = V2DFmode;
22018 /* NEG and ABS performed with SSE use bitwise mask operations.
22019 Create the appropriate mask now. */
22020 if (use_sse)
22021 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22022 else
22023 mask = NULL_RTX;
22025 dst = operands[0];
22026 src = operands[1];
22028 set = gen_rtx_fmt_e (code, mode, src);
22029 set = gen_rtx_SET (dst, set);
22031 if (mask)
22033 rtx use, clob;
22034 rtvec par;
22036 use = gen_rtx_USE (VOIDmode, mask);
22037 if (vector_mode)
22038 par = gen_rtvec (2, set, use);
22039 else
22041 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22042 par = gen_rtvec (3, set, use, clob);
22044 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22046 else
22047 emit_insn (set);
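/* Note (an assumption about the matching patterns, not spelled out here):
   the mask built above has only the sign bit set for NEG and everything but
   the sign bit set for ABS, so the insn that matches this parallel is
   expected to apply it as an XOR resp. AND of the operand.  */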
22050 /* Expand a copysign operation. Special case operand 0 being a constant. */
22052 void
22053 ix86_expand_copysign (rtx operands[])
22055 machine_mode mode, vmode;
22056 rtx dest, op0, op1, mask, nmask;
22058 dest = operands[0];
22059 op0 = operands[1];
22060 op1 = operands[2];
22062 mode = GET_MODE (dest);
22064 if (mode == SFmode)
22065 vmode = V4SFmode;
22066 else if (mode == DFmode)
22067 vmode = V2DFmode;
22068 else
22069 vmode = mode;
22071 if (CONST_DOUBLE_P (op0))
22073 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22075 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22076 op0 = simplify_unary_operation (ABS, mode, op0, mode);
22078 if (mode == SFmode || mode == DFmode)
22080 if (op0 == CONST0_RTX (mode))
22081 op0 = CONST0_RTX (vmode);
22082 else
22084 rtx v = ix86_build_const_vector (vmode, false, op0);
22086 op0 = force_reg (vmode, v);
22089 else if (op0 != CONST0_RTX (mode))
22090 op0 = force_reg (mode, op0);
22092 mask = ix86_build_signbit_mask (vmode, 0, 0);
22094 if (mode == SFmode)
22095 copysign_insn = gen_copysignsf3_const;
22096 else if (mode == DFmode)
22097 copysign_insn = gen_copysigndf3_const;
22098 else
22099 copysign_insn = gen_copysigntf3_const;
22101 emit_insn (copysign_insn (dest, op0, op1, mask));
22103 else
22105 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22107 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22108 mask = ix86_build_signbit_mask (vmode, 0, 0);
22110 if (mode == SFmode)
22111 copysign_insn = gen_copysignsf3_var;
22112 else if (mode == DFmode)
22113 copysign_insn = gen_copysigndf3_var;
22114 else
22115 copysign_insn = gen_copysigntf3_var;
22117 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22121 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22122 be a constant, and so has already been expanded into a vector constant. */
22124 void
22125 ix86_split_copysign_const (rtx operands[])
22127 machine_mode mode, vmode;
22128 rtx dest, op0, mask, x;
22130 dest = operands[0];
22131 op0 = operands[1];
22132 mask = operands[3];
22134 mode = GET_MODE (dest);
22135 vmode = GET_MODE (mask);
22137 dest = lowpart_subreg (vmode, dest, mode);
22138 x = gen_rtx_AND (vmode, dest, mask);
22139 emit_insn (gen_rtx_SET (dest, x));
22141 if (op0 != CONST0_RTX (vmode))
22143 x = gen_rtx_IOR (vmode, dest, op0);
22144 emit_insn (gen_rtx_SET (dest, x));
22148 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22149 so we have to do two masks. */
22151 void
22152 ix86_split_copysign_var (rtx operands[])
22154 machine_mode mode, vmode;
22155 rtx dest, scratch, op0, op1, mask, nmask, x;
22157 dest = operands[0];
22158 scratch = operands[1];
22159 op0 = operands[2];
22160 op1 = operands[3];
22161 nmask = operands[4];
22162 mask = operands[5];
22164 mode = GET_MODE (dest);
22165 vmode = GET_MODE (mask);
22167 if (rtx_equal_p (op0, op1))
22169 /* Shouldn't happen often (it's useless, obviously), but when it does
22170 we'd generate incorrect code if we continue below. */
22171 emit_move_insn (dest, op0);
22172 return;
22175 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
22177 gcc_assert (REGNO (op1) == REGNO (scratch));
22179 x = gen_rtx_AND (vmode, scratch, mask);
22180 emit_insn (gen_rtx_SET (scratch, x));
22182 dest = mask;
22183 op0 = lowpart_subreg (vmode, op0, mode);
22184 x = gen_rtx_NOT (vmode, dest);
22185 x = gen_rtx_AND (vmode, x, op0);
22186 emit_insn (gen_rtx_SET (dest, x));
22188 else
22190 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
22192 x = gen_rtx_AND (vmode, scratch, mask);
22194 else /* alternative 2,4 */
22196 gcc_assert (REGNO (mask) == REGNO (scratch));
22197 op1 = lowpart_subreg (vmode, op1, mode);
22198 x = gen_rtx_AND (vmode, scratch, op1);
22200 emit_insn (gen_rtx_SET (scratch, x));
22202 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
22204 dest = lowpart_subreg (vmode, op0, mode);
22205 x = gen_rtx_AND (vmode, dest, nmask);
22207 else /* alternative 3,4 */
22209 gcc_assert (REGNO (nmask) == REGNO (dest));
22210 dest = nmask;
22211 op0 = lowpart_subreg (vmode, op0, mode);
22212 x = gen_rtx_AND (vmode, dest, op0);
22214 emit_insn (gen_rtx_SET (dest, x));
22217 x = gen_rtx_IOR (vmode, dest, scratch);
22218 emit_insn (gen_rtx_SET (dest, x));
22221 /* Return TRUE or FALSE depending on whether the first SET in INSN
22222 has source and destination with matching CC modes and whether the
22223 CC mode is at least as constrained as REQ_MODE. */
22225 bool
22226 ix86_match_ccmode (rtx insn, machine_mode req_mode)
22228 rtx set;
22229 machine_mode set_mode;
22231 set = PATTERN (insn);
22232 if (GET_CODE (set) == PARALLEL)
22233 set = XVECEXP (set, 0, 0);
22234 gcc_assert (GET_CODE (set) == SET);
22235 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
22237 set_mode = GET_MODE (SET_DEST (set));
22238 switch (set_mode)
22240 case CCNOmode:
22241 if (req_mode != CCNOmode
22242 && (req_mode != CCmode
22243 || XEXP (SET_SRC (set), 1) != const0_rtx))
22244 return false;
22245 break;
22246 case CCmode:
22247 if (req_mode == CCGCmode)
22248 return false;
22249 /* FALLTHRU */
22250 case CCGCmode:
22251 if (req_mode == CCGOCmode || req_mode == CCNOmode)
22252 return false;
22253 /* FALLTHRU */
22254 case CCGOCmode:
22255 if (req_mode == CCZmode)
22256 return false;
22257 /* FALLTHRU */
22258 case CCZmode:
22259 break;
22261 case CCAmode:
22262 case CCCmode:
22263 case CCOmode:
22264 case CCPmode:
22265 case CCSmode:
22266 if (set_mode != req_mode)
22267 return false;
22268 break;
22270 default:
22271 gcc_unreachable ();
22274 return GET_MODE (SET_SRC (set)) == set_mode;
22277 /* Generate insn patterns to do an integer compare of OPERANDS. */
22279 static rtx
22280 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22282 machine_mode cmpmode;
22283 rtx tmp, flags;
22285 cmpmode = SELECT_CC_MODE (code, op0, op1);
22286 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22288 /* This is very simple, but making the interface the same as in the
22289 FP case makes the rest of the code easier. */
22290 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22291 emit_insn (gen_rtx_SET (flags, tmp));
22293 /* Return the test that should be put into the flags user, i.e.
22294 the bcc, scc, or cmov instruction. */
22295 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22298 /* Figure out whether to use ordered or unordered fp comparisons.
22299 Return the appropriate mode to use. */
22301 machine_mode
22302 ix86_fp_compare_mode (enum rtx_code)
22304 /* ??? In order to make all comparisons reversible, we do all comparisons
22305 non-trapping when compiling for IEEE. Once gcc is able to distinguish
22306 all forms of trapping and nontrapping comparisons, we can make inequality
22307 comparisons trapping again, since it results in better code when using
22308 FCOM based compares. */
22309 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
22312 machine_mode
22313 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22315 machine_mode mode = GET_MODE (op0);
22317 if (SCALAR_FLOAT_MODE_P (mode))
22319 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22320 return ix86_fp_compare_mode (code);
22323 switch (code)
22325 /* Only zero flag is needed. */
22326 case EQ: /* ZF=0 */
22327 case NE: /* ZF!=0 */
22328 return CCZmode;
22329 /* Codes needing carry flag. */
22330 case GEU: /* CF=0 */
22331 case LTU: /* CF=1 */
22332 /* Detect overflow checks. They need just the carry flag. */
22333 if (GET_CODE (op0) == PLUS
22334 && (rtx_equal_p (op1, XEXP (op0, 0))
22335 || rtx_equal_p (op1, XEXP (op0, 1))))
22336 return CCCmode;
22337 else
22338 return CCmode;
22339 case GTU: /* CF=0 & ZF=0 */
22340 case LEU: /* CF=1 | ZF=1 */
22341 return CCmode;
22342 /* Codes possibly doable only with the sign flag when
22343 comparing against zero. */
22344 case GE: /* SF=OF or SF=0 */
22345 case LT: /* SF<>OF or SF=1 */
22346 if (op1 == const0_rtx)
22347 return CCGOCmode;
22348 else
22349 /* For other cases the Carry flag is not required. */
22350 return CCGCmode;
22351 /* Codes doable only with the sign flag when comparing
22352 against zero, but we lack the jump instruction for it,
22353 so we need to use relational tests against overflow,
22354 which thus needs to be zero. */
22355 case GT: /* ZF=0 & SF=OF */
22356 case LE: /* ZF=1 | SF<>OF */
22357 if (op1 == const0_rtx)
22358 return CCNOmode;
22359 else
22360 return CCGCmode;
22361 /* The strcmp pattern does (use flags) and combine may ask us for the proper
22362 mode. */
22363 case USE:
22364 return CCmode;
22365 default:
22366 gcc_unreachable ();
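/* Note on the CCCmode case above (illustrative): an overflow check such as
   "if (a + b < a)" compares the result of the PLUS against one of its own
   operands, which is exactly the shape matched there, and only the carry
   flag is needed to decide it.  */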
22370 /* Return the fixed registers used for condition codes. */
22372 static bool
22373 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22375 *p1 = FLAGS_REG;
22376 *p2 = FPSR_REG;
22377 return true;
22380 /* If two condition code modes are compatible, return a condition code
22381 mode which is compatible with both. Otherwise, return
22382 VOIDmode. */
22384 static machine_mode
22385 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22387 if (m1 == m2)
22388 return m1;
22390 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22391 return VOIDmode;
22393 if ((m1 == CCGCmode && m2 == CCGOCmode)
22394 || (m1 == CCGOCmode && m2 == CCGCmode))
22395 return CCGCmode;
22397 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
22398 return m2;
22399 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
22400 return m1;
22402 switch (m1)
22404 default:
22405 gcc_unreachable ();
22407 case CCmode:
22408 case CCGCmode:
22409 case CCGOCmode:
22410 case CCNOmode:
22411 case CCAmode:
22412 case CCCmode:
22413 case CCOmode:
22414 case CCPmode:
22415 case CCSmode:
22416 case CCZmode:
22417 switch (m2)
22419 default:
22420 return VOIDmode;
22422 case CCmode:
22423 case CCGCmode:
22424 case CCGOCmode:
22425 case CCNOmode:
22426 case CCAmode:
22427 case CCCmode:
22428 case CCOmode:
22429 case CCPmode:
22430 case CCSmode:
22431 case CCZmode:
22432 return CCmode;
22435 case CCFPmode:
22436 case CCFPUmode:
22437 /* These are only compatible with themselves, which we already
22438 checked above. */
22439 return VOIDmode;
22444 /* Return a comparison we can do that is equivalent to
22445 swap_condition (code), apart possibly from orderedness.
22446 But never change orderedness if TARGET_IEEE_FP, returning
22447 UNKNOWN in that case if necessary. */
22449 static enum rtx_code
22450 ix86_fp_swap_condition (enum rtx_code code)
22452 switch (code)
22454 case GT: /* GTU - CF=0 & ZF=0 */
22455 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22456 case GE: /* GEU - CF=0 */
22457 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22458 case UNLT: /* LTU - CF=1 */
22459 return TARGET_IEEE_FP ? UNKNOWN : GT;
22460 case UNLE: /* LEU - CF=1 | ZF=1 */
22461 return TARGET_IEEE_FP ? UNKNOWN : GE;
22462 default:
22463 return swap_condition (code);
22467 /* Return the cost of comparison CODE using the best strategy for performance.
22468 All following functions use the number of instructions as a cost metric.
22469 In the future this should be tweaked to compute bytes for optimize_size and
22470 take into account the performance of various instructions on various CPUs. */
22472 static int
22473 ix86_fp_comparison_cost (enum rtx_code code)
22475 int arith_cost;
22477 /* The cost of code using bit-twiddling on %ah. */
22478 switch (code)
22480 case UNLE:
22481 case UNLT:
22482 case LTGT:
22483 case GT:
22484 case GE:
22485 case UNORDERED:
22486 case ORDERED:
22487 case UNEQ:
22488 arith_cost = 4;
22489 break;
22490 case LT:
22491 case NE:
22492 case EQ:
22493 case UNGE:
22494 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22495 break;
22496 case LE:
22497 case UNGT:
22498 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22499 break;
22500 default:
22501 gcc_unreachable ();
22504 switch (ix86_fp_comparison_strategy (code))
22506 case IX86_FPCMP_COMI:
22507 return arith_cost > 4 ? 3 : 2;
22508 case IX86_FPCMP_SAHF:
22509 return arith_cost > 4 ? 4 : 3;
22510 default:
22511 return arith_cost;
22515 /* Return strategy to use for floating-point. We assume that fcomi is always
22516 preferable where available, since that is also true when looking at size
22517 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22519 enum ix86_fpcmp_strategy
22520 ix86_fp_comparison_strategy (enum rtx_code)
22522 /* Do fcomi/sahf based test when profitable. */
22524 if (TARGET_CMOVE)
22525 return IX86_FPCMP_COMI;
22527 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22528 return IX86_FPCMP_SAHF;
22530 return IX86_FPCMP_ARITH;
22533 /* Swap, force into registers, or otherwise massage the two operands
22534 to a fp comparison. The operands are updated in place; the new
22535 comparison code is returned. */
22537 static enum rtx_code
22538 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22540 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
22541 rtx op0 = *pop0, op1 = *pop1;
22542 machine_mode op_mode = GET_MODE (op0);
22543 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22545 /* All of the unordered compare instructions only work on registers.
22546 The same is true of the fcomi compare instructions. The XFmode
22547 compare instructions require registers except when comparing
22548 against zero or when converting operand 1 from fixed point to
22549 floating point. */
22551 if (!is_sse
22552 && (fpcmp_mode == CCFPUmode
22553 || (op_mode == XFmode
22554 && ! (standard_80387_constant_p (op0) == 1
22555 || standard_80387_constant_p (op1) == 1)
22556 && GET_CODE (op1) != FLOAT)
22557 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22559 op0 = force_reg (op_mode, op0);
22560 op1 = force_reg (op_mode, op1);
22562 else
22564 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22565 things around if they appear profitable, otherwise force op0
22566 into a register. */
22568 if (standard_80387_constant_p (op0) == 0
22569 || (MEM_P (op0)
22570 && ! (standard_80387_constant_p (op1) == 0
22571 || MEM_P (op1))))
22573 enum rtx_code new_code = ix86_fp_swap_condition (code);
22574 if (new_code != UNKNOWN)
22576 std::swap (op0, op1);
22577 code = new_code;
22581 if (!REG_P (op0))
22582 op0 = force_reg (op_mode, op0);
22584 if (CONSTANT_P (op1))
22586 int tmp = standard_80387_constant_p (op1);
22587 if (tmp == 0)
22588 op1 = validize_mem (force_const_mem (op_mode, op1));
22589 else if (tmp == 1)
22591 if (TARGET_CMOVE)
22592 op1 = force_reg (op_mode, op1);
22594 else
22595 op1 = force_reg (op_mode, op1);
22599 /* Try to rearrange the comparison to make it cheaper. */
22600 if (ix86_fp_comparison_cost (code)
22601 > ix86_fp_comparison_cost (swap_condition (code))
22602 && (REG_P (op1) || can_create_pseudo_p ()))
22604 std::swap (op0, op1);
22605 code = swap_condition (code);
22606 if (!REG_P (op0))
22607 op0 = force_reg (op_mode, op0);
22610 *pop0 = op0;
22611 *pop1 = op1;
22612 return code;
22615 /* Convert comparison codes we use to represent FP comparison to integer
22616 code that will result in proper branch. Return UNKNOWN if no such code
22617 is available. */
22619 enum rtx_code
22620 ix86_fp_compare_code_to_integer (enum rtx_code code)
22622 switch (code)
22624 case GT:
22625 return GTU;
22626 case GE:
22627 return GEU;
22628 case ORDERED:
22629 case UNORDERED:
22630 return code;
22631 case UNEQ:
22632 return EQ;
22633 case UNLT:
22634 return LTU;
22635 case UNLE:
22636 return LEU;
22637 case LTGT:
22638 return NE;
22639 default:
22640 return UNKNOWN;
22644 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22646 static rtx
22647 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22649 machine_mode fpcmp_mode, intcmp_mode;
22650 rtx tmp, tmp2;
22652 fpcmp_mode = ix86_fp_compare_mode (code);
22653 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22655 /* Do fcomi/sahf based test when profitable. */
22656 switch (ix86_fp_comparison_strategy (code))
22658 case IX86_FPCMP_COMI:
22659 intcmp_mode = fpcmp_mode;
22660 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22661 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22662 emit_insn (tmp);
22663 break;
22665 case IX86_FPCMP_SAHF:
22666 intcmp_mode = fpcmp_mode;
22667 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22668 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22670 if (!scratch)
22671 scratch = gen_reg_rtx (HImode);
22672 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22673 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22674 break;
22676 case IX86_FPCMP_ARITH:
22677 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22678 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22679 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22680 if (!scratch)
22681 scratch = gen_reg_rtx (HImode);
22682 emit_insn (gen_rtx_SET (scratch, tmp2));
22684 /* In the unordered case, we have to check C2 for NaNs, which
22685 doesn't happen to work out to anything nice combination-wise.
22686 So do some bit twiddling on the value we've got in AH to come
22687 up with an appropriate set of condition codes. */
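/* After fnstsw %ax the x87 condition bits land in %ah as C0 = 0x01,
   C2 = 0x04 and C3 = 0x40, so the 0x45 masks below select C0|C2|C3;
   fcom sets all three for unordered operands, C0 for <, C3 for ==,
   and none of them for >.  */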
22689 intcmp_mode = CCNOmode;
22690 switch (code)
22692 case GT:
22693 case UNGT:
22694 if (code == GT || !TARGET_IEEE_FP)
22696 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22697 code = EQ;
22699 else
22701 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22702 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22703 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22704 intcmp_mode = CCmode;
22705 code = GEU;
22707 break;
22708 case LT:
22709 case UNLT:
22710 if (code == LT && TARGET_IEEE_FP)
22712 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22713 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22714 intcmp_mode = CCmode;
22715 code = EQ;
22717 else
22719 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22720 code = NE;
22722 break;
22723 case GE:
22724 case UNGE:
22725 if (code == GE || !TARGET_IEEE_FP)
22727 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22728 code = EQ;
22730 else
22732 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22733 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22734 code = NE;
22736 break;
22737 case LE:
22738 case UNLE:
22739 if (code == LE && TARGET_IEEE_FP)
22741 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22742 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22743 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22744 intcmp_mode = CCmode;
22745 code = LTU;
22747 else
22749 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22750 code = NE;
22752 break;
22753 case EQ:
22754 case UNEQ:
22755 if (code == EQ && TARGET_IEEE_FP)
22757 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22758 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22759 intcmp_mode = CCmode;
22760 code = EQ;
22762 else
22764 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22765 code = NE;
22767 break;
22768 case NE:
22769 case LTGT:
22770 if (code == NE && TARGET_IEEE_FP)
22772 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22773 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22774 GEN_INT (0x40)));
22775 code = NE;
22777 else
22779 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22780 code = EQ;
22782 break;
22784 case UNORDERED:
22785 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22786 code = NE;
22787 break;
22788 case ORDERED:
22789 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22790 code = EQ;
22791 break;
22793 default:
22794 gcc_unreachable ();
22796 break;
22798 default:
22799 gcc_unreachable();
22802 /* Return the test that should be put into the flags user, i.e.
22803 the bcc, scc, or cmov instruction. */
22804 return gen_rtx_fmt_ee (code, VOIDmode,
22805 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22806 const0_rtx);
22809 static rtx
22810 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22812 rtx ret;
22814 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22815 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22817 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22819 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22820 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22822 else
22823 ret = ix86_expand_int_compare (code, op0, op1);
22825 return ret;
22828 void
22829 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22831 machine_mode mode = GET_MODE (op0);
22832 rtx tmp;
22834 /* Handle the special case of a vector comparison with a boolean result;
22835 transform it using the ptest instruction. */
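/* Illustrative idea: (op0 eq/ne op1) is computed as tmp = op0 ^ op1 followed
   by ptest tmp, tmp, which sets ZF exactly when every bit of tmp is zero,
   i.e. when the operands are equal.  */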
22836 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22838 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22839 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22841 gcc_assert (code == EQ || code == NE);
22842 /* Generate XOR since we can't check that one operand is zero vector. */
22843 tmp = gen_reg_rtx (mode);
22844 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22845 tmp = gen_lowpart (p_mode, tmp);
22846 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22847 gen_rtx_UNSPEC (CCmode,
22848 gen_rtvec (2, tmp, tmp),
22849 UNSPEC_PTEST)));
22850 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22851 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22852 gen_rtx_LABEL_REF (VOIDmode, label),
22853 pc_rtx);
22854 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22855 return;
22858 switch (mode)
22860 case SFmode:
22861 case DFmode:
22862 case XFmode:
22863 case QImode:
22864 case HImode:
22865 case SImode:
22866 simple:
22867 tmp = ix86_expand_compare (code, op0, op1);
22868 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22869 gen_rtx_LABEL_REF (VOIDmode, label),
22870 pc_rtx);
22871 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22872 return;
22874 case DImode:
22875 if (TARGET_64BIT)
22876 goto simple;
22877 /* For 32-bit targets a DImode comparison may be performed in
22878 SSE registers. To allow this we should avoid the split
22879 into SImode, which is achieved by doing the xor in DImode
22880 and then comparing with zero (which is recognized by the
22881 STV pass). We don't compare using xor when optimizing
22882 for size. */
22883 if (!optimize_insn_for_size_p ()
22884 && TARGET_STV
22885 && (code == EQ || code == NE))
22887 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22888 op1 = const0_rtx;
22890 /* FALLTHRU */
22891 case TImode:
22892 /* Expand DImode branch into multiple compare+branch. */
22894 rtx lo[2], hi[2];
22895 rtx_code_label *label2;
22896 enum rtx_code code1, code2, code3;
22897 machine_mode submode;
22899 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22901 std::swap (op0, op1);
22902 code = swap_condition (code);
22905 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22906 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22908 submode = mode == DImode ? SImode : DImode;
22910 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22911 avoid two branches. This costs one extra insn, so disable when
22912 optimizing for size. */
22914 if ((code == EQ || code == NE)
22915 && (!optimize_insn_for_size_p ()
22916 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22918 rtx xor0, xor1;
22920 xor1 = hi[0];
22921 if (hi[1] != const0_rtx)
22922 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22923 NULL_RTX, 0, OPTAB_WIDEN);
22925 xor0 = lo[0];
22926 if (lo[1] != const0_rtx)
22927 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22928 NULL_RTX, 0, OPTAB_WIDEN);
22930 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22931 NULL_RTX, 0, OPTAB_WIDEN);
22933 ix86_expand_branch (code, tmp, const0_rtx, label);
22934 return;
22937 /* Otherwise, if we are doing less-than or greater-or-equal-than,
22938 op1 is a constant and the low word is zero, then we can just
22939 examine the high word. Similarly for low word -1 and
22940 less-or-equal-than or greater-than. */
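/* For example (unsigned, illustrative): x < 0x700000000 (low word 0) holds
   iff hi(x) < 7, and x <= 0x6ffffffff (low word all ones) holds iff
   hi(x) <= 6.  */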
22942 if (CONST_INT_P (hi[1]))
22943 switch (code)
22945 case LT: case LTU: case GE: case GEU:
22946 if (lo[1] == const0_rtx)
22948 ix86_expand_branch (code, hi[0], hi[1], label);
22949 return;
22951 break;
22952 case LE: case LEU: case GT: case GTU:
22953 if (lo[1] == constm1_rtx)
22955 ix86_expand_branch (code, hi[0], hi[1], label);
22956 return;
22958 break;
22959 default:
22960 break;
22963 /* Otherwise, we need two or three jumps. */
22965 label2 = gen_label_rtx ();
22967 code1 = code;
22968 code2 = swap_condition (code);
22969 code3 = unsigned_condition (code);
22971 switch (code)
22973 case LT: case GT: case LTU: case GTU:
22974 break;
22976 case LE: code1 = LT; code2 = GT; break;
22977 case GE: code1 = GT; code2 = LT; break;
22978 case LEU: code1 = LTU; code2 = GTU; break;
22979 case GEU: code1 = GTU; code2 = LTU; break;
22981 case EQ: code1 = UNKNOWN; code2 = NE; break;
22982 case NE: code2 = UNKNOWN; break;
22984 default:
22985 gcc_unreachable ();
22989 * a < b =>
22990 * if (hi(a) < hi(b)) goto true;
22991 * if (hi(a) > hi(b)) goto false;
22992 * if (lo(a) < lo(b)) goto true;
22993 * false:
22996 if (code1 != UNKNOWN)
22997 ix86_expand_branch (code1, hi[0], hi[1], label);
22998 if (code2 != UNKNOWN)
22999 ix86_expand_branch (code2, hi[0], hi[1], label2);
23001 ix86_expand_branch (code3, lo[0], lo[1], label);
23003 if (code2 != UNKNOWN)
23004 emit_label (label2);
23005 return;
23008 default:
23009 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23010 goto simple;
23014 /* Split branch based on floating point condition. */
23015 void
23016 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
23017 rtx target1, rtx target2, rtx tmp)
23019 rtx condition;
23020 rtx_insn *i;
23022 if (target2 != pc_rtx)
23024 std::swap (target1, target2);
23025 code = reverse_condition_maybe_unordered (code);
23028 condition = ix86_expand_fp_compare (code, op1, op2,
23029 tmp);
23031 i = emit_jump_insn (gen_rtx_SET
23032 (pc_rtx,
23033 gen_rtx_IF_THEN_ELSE (VOIDmode,
23034 condition, target1, target2)));
23035 if (split_branch_probability >= 0)
23036 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
23039 void
23040 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23042 rtx ret;
23044 gcc_assert (GET_MODE (dest) == QImode);
23046 ret = ix86_expand_compare (code, op0, op1);
23047 PUT_MODE (ret, QImode);
23048 emit_insn (gen_rtx_SET (dest, ret));
23051 /* Expand comparison setting or clearing carry flag. Return true when
23052 successful and set pop for the operation. */
23053 static bool
23054 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23056 machine_mode mode =
23057 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23059 /* Do not handle double-mode compares that go through special path. */
23060 if (mode == (TARGET_64BIT ? TImode : DImode))
23061 return false;
23063 if (SCALAR_FLOAT_MODE_P (mode))
23065 rtx compare_op;
23066 rtx_insn *compare_seq;
23068 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23070 /* Shortcut: following common codes never translate
23071 into carry flag compares. */
23072 if (code == EQ || code == NE || code == UNEQ || code == LTGT
23073 || code == ORDERED || code == UNORDERED)
23074 return false;
23076 /* These comparisons require zero flag; swap operands so they won't. */
23077 if ((code == GT || code == UNLE || code == LE || code == UNGT)
23078 && !TARGET_IEEE_FP)
23080 std::swap (op0, op1);
23081 code = swap_condition (code);
23084 /* Try to expand the comparison and verify that we end up with
23085 a carry-flag-based comparison. This fails to be true only when
23086 we decide to expand the comparison using arithmetic, which is
23087 not a common scenario. */
23088 start_sequence ();
23089 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23090 compare_seq = get_insns ();
23091 end_sequence ();
23093 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
23094 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
23095 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23096 else
23097 code = GET_CODE (compare_op);
23099 if (code != LTU && code != GEU)
23100 return false;
23102 emit_insn (compare_seq);
23103 *pop = compare_op;
23104 return true;
23107 if (!INTEGRAL_MODE_P (mode))
23108 return false;
23110 switch (code)
23112 case LTU:
23113 case GEU:
23114 break;
23116 /* Convert a==0 into (unsigned)a<1. */
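/* (unsigned) a < 1 sets the carry flag exactly when a is zero, so the
   comparison below degenerates into a carry-flag test.  */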
23117 case EQ:
23118 case NE:
23119 if (op1 != const0_rtx)
23120 return false;
23121 op1 = const1_rtx;
23122 code = (code == EQ ? LTU : GEU);
23123 break;
23125 /* Convert a>b into b<a or a>=b+1. */
23126 case GTU:
23127 case LEU:
23128 if (CONST_INT_P (op1))
23130 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23131 /* Bail out on overflow. We can still swap the operands, but that
23132 would force loading of the constant into a register. */
23133 if (op1 == const0_rtx
23134 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23135 return false;
23136 code = (code == GTU ? GEU : LTU);
23138 else
23140 std::swap (op0, op1);
23141 code = (code == GTU ? LTU : GEU);
23143 break;
23145 /* Convert a>=0 into (unsigned)a<0x80000000. */
23146 case LT:
23147 case GE:
23148 if (mode == DImode || op1 != const0_rtx)
23149 return false;
23150 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23151 code = (code == LT ? GEU : LTU);
23152 break;
23153 case LE:
23154 case GT:
23155 if (mode == DImode || op1 != constm1_rtx)
23156 return false;
23157 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23158 code = (code == LE ? GEU : LTU);
23159 break;
23161 default:
23162 return false;
23164 /* Swapping operands may cause the constant to appear as the first operand. */
23165 if (!nonimmediate_operand (op0, VOIDmode))
23167 if (!can_create_pseudo_p ())
23168 return false;
23169 op0 = force_reg (mode, op0);
23171 *pop = ix86_expand_compare (code, op0, op1);
23172 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
23173 return true;
23176 bool
23177 ix86_expand_int_movcc (rtx operands[])
23179 enum rtx_code code = GET_CODE (operands[1]), compare_code;
23180 rtx_insn *compare_seq;
23181 rtx compare_op;
23182 machine_mode mode = GET_MODE (operands[0]);
23183 bool sign_bit_compare_p = false;
23184 rtx op0 = XEXP (operands[1], 0);
23185 rtx op1 = XEXP (operands[1], 1);
23187 if (GET_MODE (op0) == TImode
23188 || (GET_MODE (op0) == DImode
23189 && !TARGET_64BIT))
23190 return false;
23192 start_sequence ();
23193 compare_op = ix86_expand_compare (code, op0, op1);
23194 compare_seq = get_insns ();
23195 end_sequence ();
23197 compare_code = GET_CODE (compare_op);
23199 if ((op1 == const0_rtx && (code == GE || code == LT))
23200 || (op1 == constm1_rtx && (code == GT || code == LE)))
23201 sign_bit_compare_p = true;
23203 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
23204 HImode insns, we'd be swallowed in word prefix ops. */
23206 if ((mode != HImode || TARGET_FAST_PREFIX)
23207 && (mode != (TARGET_64BIT ? TImode : DImode))
23208 && CONST_INT_P (operands[2])
23209 && CONST_INT_P (operands[3]))
23211 rtx out = operands[0];
23212 HOST_WIDE_INT ct = INTVAL (operands[2]);
23213 HOST_WIDE_INT cf = INTVAL (operands[3]);
23214 HOST_WIDE_INT diff;
23216 diff = ct - cf;
23217 /* Sign bit compares are better done using shifts than by using
23218 sbb. */
23219 if (sign_bit_compare_p
23220 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
23222 /* Detect overlap between destination and compare sources. */
23223 rtx tmp = out;
23225 if (!sign_bit_compare_p)
23227 rtx flags;
23228 bool fpcmp = false;
23230 compare_code = GET_CODE (compare_op);
23232 flags = XEXP (compare_op, 0);
23234 if (GET_MODE (flags) == CCFPmode
23235 || GET_MODE (flags) == CCFPUmode)
23237 fpcmp = true;
23238 compare_code
23239 = ix86_fp_compare_code_to_integer (compare_code);
23242 /* To simplify rest of code, restrict to the GEU case. */
23243 if (compare_code == LTU)
23245 std::swap (ct, cf);
23246 compare_code = reverse_condition (compare_code);
23247 code = reverse_condition (code);
23249 else
23251 if (fpcmp)
23252 PUT_CODE (compare_op,
23253 reverse_condition_maybe_unordered
23254 (GET_CODE (compare_op)));
23255 else
23256 PUT_CODE (compare_op,
23257 reverse_condition (GET_CODE (compare_op)));
23259 diff = ct - cf;
23261 if (reg_overlap_mentioned_p (out, op0)
23262 || reg_overlap_mentioned_p (out, op1))
23263 tmp = gen_reg_rtx (mode);
23265 if (mode == DImode)
23266 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23267 else
23268 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
23269 flags, compare_op));
23271 else
23273 if (code == GT || code == GE)
23274 code = reverse_condition (code);
23275 else
23277 std::swap (ct, cf);
23278 diff = ct - cf;
23280 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23283 if (diff == 1)
23286 * cmpl op0,op1
23287 * sbbl dest,dest
23288 * [addl dest, ct]
23290 * Size 5 - 8.
23292 if (ct)
23293 tmp = expand_simple_binop (mode, PLUS,
23294 tmp, GEN_INT (ct),
23295 copy_rtx (tmp), 1, OPTAB_DIRECT);
23297 else if (cf == -1)
23300 * cmpl op0,op1
23301 * sbbl dest,dest
23302 * orl $ct, dest
23304 * Size 8.
23306 tmp = expand_simple_binop (mode, IOR,
23307 tmp, GEN_INT (ct),
23308 copy_rtx (tmp), 1, OPTAB_DIRECT);
23310 else if (diff == -1 && ct)
23313 * cmpl op0,op1
23314 * sbbl dest,dest
23315 * notl dest
23316 * [addl dest, cf]
23318 * Size 8 - 11.
23320 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23321 if (cf)
23322 tmp = expand_simple_binop (mode, PLUS,
23323 copy_rtx (tmp), GEN_INT (cf),
23324 copy_rtx (tmp), 1, OPTAB_DIRECT);
23326 else
23329 * cmpl op0,op1
23330 * sbbl dest,dest
23331 * [notl dest]
23332 * andl cf - ct, dest
23333 * [addl dest, ct]
23335 * Size 8 - 11.
23338 if (cf == 0)
23340 cf = ct;
23341 ct = 0;
23342 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23345 tmp = expand_simple_binop (mode, AND,
23346 copy_rtx (tmp),
23347 gen_int_mode (cf - ct, mode),
23348 copy_rtx (tmp), 1, OPTAB_DIRECT);
23349 if (ct)
23350 tmp = expand_simple_binop (mode, PLUS,
23351 copy_rtx (tmp), GEN_INT (ct),
23352 copy_rtx (tmp), 1, OPTAB_DIRECT);
23355 if (!rtx_equal_p (tmp, out))
23356 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23358 return true;
23361 if (diff < 0)
23363 machine_mode cmp_mode = GET_MODE (op0);
23364 enum rtx_code new_code;
23366 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23368 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23370 /* We may be reversing an unordered compare to a normal compare, which
23371 is not valid in general (we may convert a non-trapping condition
23372 into a trapping one); however, on i386 we currently emit all
23373 comparisons unordered. */
23374 new_code = reverse_condition_maybe_unordered (code);
23376 else
23377 new_code = ix86_reverse_condition (code, cmp_mode);
23378 if (new_code != UNKNOWN)
23380 std::swap (ct, cf);
23381 diff = -diff;
23382 code = new_code;
23386 compare_code = UNKNOWN;
23387 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23388 && CONST_INT_P (op1))
23390 if (op1 == const0_rtx
23391 && (code == LT || code == GE))
23392 compare_code = code;
23393 else if (op1 == constm1_rtx)
23395 if (code == LE)
23396 compare_code = LT;
23397 else if (code == GT)
23398 compare_code = GE;
23402 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23403 if (compare_code != UNKNOWN
23404 && GET_MODE (op0) == GET_MODE (out)
23405 && (cf == -1 || ct == -1))
23407 /* If lea code below could be used, only optimize
23408 if it results in a 2 insn sequence. */
23410 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23411 || diff == 3 || diff == 5 || diff == 9)
23412 || (compare_code == LT && ct == -1)
23413 || (compare_code == GE && cf == -1))
23416 * notl op1 (if necessary)
23417 * sarl $31, op1
23418 * orl cf, op1
23420 if (ct != -1)
23422 cf = ct;
23423 ct = -1;
23424 code = reverse_condition (code);
23427 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23429 out = expand_simple_binop (mode, IOR,
23430 out, GEN_INT (cf),
23431 out, 1, OPTAB_DIRECT);
23432 if (out != operands[0])
23433 emit_move_insn (operands[0], out);
23435 return true;
23440 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23441 || diff == 3 || diff == 5 || diff == 9)
23442 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23443 && (mode != DImode
23444 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23447 * xorl dest,dest
23448 * cmpl op1,op2
23449 * setcc dest
23450 * lea cf(dest*(ct-cf)),dest
23452 * Size 14.
23454 * This also catches the degenerate setcc-only case.
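/* E.g. with ct = 5 and cf = 2 (diff = 3) this becomes roughly
   setcc %al; lea 2(%eax,%eax,2), %eax, i.e. dest = cf + dest * (ct - cf)
   (illustrative only, assuming dest lives in %eax).  */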
23457 rtx tmp;
23458 int nops;
23460 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23462 nops = 0;
23463 /* On x86_64 the lea instruction operates on Pmode, so we need
23464 to get the arithmetic done in the proper mode to match. */
23465 if (diff == 1)
23466 tmp = copy_rtx (out);
23467 else
23469 rtx out1;
23470 out1 = copy_rtx (out);
23471 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23472 nops++;
23473 if (diff & 1)
23475 tmp = gen_rtx_PLUS (mode, tmp, out1);
23476 nops++;
23479 if (cf != 0)
23481 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23482 nops++;
23484 if (!rtx_equal_p (tmp, out))
23486 if (nops == 1)
23487 out = force_operand (tmp, copy_rtx (out));
23488 else
23489 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23491 if (!rtx_equal_p (out, operands[0]))
23492 emit_move_insn (operands[0], copy_rtx (out));
23494 return true;
23498 * General case: Jumpful:
23499 * xorl dest,dest cmpl op1, op2
23500 * cmpl op1, op2 movl ct, dest
23501 * setcc dest jcc 1f
23502 * decl dest movl cf, dest
23503 * andl (cf-ct),dest 1:
23504 * addl ct,dest
23506 * Size 20. Size 14.
23508 * This is reasonably steep, but branch mispredict costs are
23509 * high on modern cpus, so consider failing only if optimizing
23510 * for space.
23513 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23514 && BRANCH_COST (optimize_insn_for_speed_p (),
23515 false) >= 2)
23517 if (cf == 0)
23519 machine_mode cmp_mode = GET_MODE (op0);
23520 enum rtx_code new_code;
23522 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23524 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23526 /* We may be reversing an unordered compare to a normal compare,
23527 which is not valid in general (we may convert a non-trapping
23528 condition into a trapping one); however, on i386 we currently
23529 emit all comparisons unordered. */
23530 new_code = reverse_condition_maybe_unordered (code);
23532 else
23534 new_code = ix86_reverse_condition (code, cmp_mode);
23535 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23536 compare_code = reverse_condition (compare_code);
23539 if (new_code != UNKNOWN)
23541 cf = ct;
23542 ct = 0;
23543 code = new_code;
23547 if (compare_code != UNKNOWN)
23549 /* notl op1 (if needed)
23550 sarl $31, op1
23551 andl (cf-ct), op1
23552 addl ct, op1
23554 For x < 0 (resp. x <= -1) there will be no notl,
23555 so if possible swap the constants to get rid of the
23556 complement.
23557 True/false will be -1/0 while code below (store flag
23558 followed by decrement) is 0/-1, so the constants need
23559 to be exchanged once more. */
23561 if (compare_code == GE || !cf)
23563 code = reverse_condition (code);
23564 compare_code = LT;
23566 else
23567 std::swap (ct, cf);
23569 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23571 else
23573 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23575 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23576 constm1_rtx,
23577 copy_rtx (out), 1, OPTAB_DIRECT);
23580 out = expand_simple_binop (mode, AND, copy_rtx (out),
23581 gen_int_mode (cf - ct, mode),
23582 copy_rtx (out), 1, OPTAB_DIRECT);
23583 if (ct)
23584 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23585 copy_rtx (out), 1, OPTAB_DIRECT);
23586 if (!rtx_equal_p (out, operands[0]))
23587 emit_move_insn (operands[0], copy_rtx (out));
23589 return true;
23593 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23595 /* Try a few things more with specific constants and a variable. */
23597 optab op;
23598 rtx var, orig_out, out, tmp;
23600 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23601 return false;
23603 /* If one of the two operands is an interesting constant, load a
23604 constant with the above and mask it in with a logical operation. */
23606 if (CONST_INT_P (operands[2]))
23608 var = operands[3];
23609 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23610 operands[3] = constm1_rtx, op = and_optab;
23611 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23612 operands[3] = const0_rtx, op = ior_optab;
23613 else
23614 return false;
23616 else if (CONST_INT_P (operands[3]))
23618 var = operands[2];
23619 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23620 operands[2] = constm1_rtx, op = and_optab;
23621 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23622 operands[2] = const0_rtx, op = ior_optab;
23623 else
23624 return false;
23626 else
23627 return false;
23629 orig_out = operands[0];
23630 tmp = gen_reg_rtx (mode);
23631 operands[0] = tmp;
23633 /* Recurse to get the constant loaded. */
23634 if (!ix86_expand_int_movcc (operands))
23635 return false;
23637 /* Mask in the interesting variable. */
23638 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23639 OPTAB_WIDEN);
23640 if (!rtx_equal_p (out, orig_out))
23641 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23643 return true;
23647 * For comparison with above,
23649 * movl cf,dest
23650 * movl ct,tmp
23651 * cmpl op1,op2
23652 * cmovcc tmp,dest
23654 * Size 15.
23657 if (! nonimmediate_operand (operands[2], mode))
23658 operands[2] = force_reg (mode, operands[2]);
23659 if (! nonimmediate_operand (operands[3], mode))
23660 operands[3] = force_reg (mode, operands[3]);
23662 if (! register_operand (operands[2], VOIDmode)
23663 && (mode == QImode
23664 || ! register_operand (operands[3], VOIDmode)))
23665 operands[2] = force_reg (mode, operands[2]);
23667 if (mode == QImode
23668 && ! register_operand (operands[3], VOIDmode))
23669 operands[3] = force_reg (mode, operands[3]);
23671 emit_insn (compare_seq);
23672 emit_insn (gen_rtx_SET (operands[0],
23673 gen_rtx_IF_THEN_ELSE (mode,
23674 compare_op, operands[2],
23675 operands[3])));
23676 return true;
23679 /* Swap, force into registers, or otherwise massage the two operands
23680 to an sse comparison with a mask result. Thus we differ a bit from
23681 ix86_prepare_fp_compare_args which expects to produce a flags result.
23683 The DEST operand exists to help determine whether to commute commutative
23684 operators. The POP0/POP1 operands are updated in place. The new
23685 comparison code is returned, or UNKNOWN if not implementable. */
23687 static enum rtx_code
23688 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23689 rtx *pop0, rtx *pop1)
23691 switch (code)
23693 case LTGT:
23694 case UNEQ:
23695 /* AVX supports all the needed comparisons. */
23696 if (TARGET_AVX)
23697 break;
23698 /* We have no LTGT as an operator. We could implement it with
23699 NE & ORDERED, but this requires an extra temporary. It's
23700 not clear that it's worth it. */
23701 return UNKNOWN;
23703 case LT:
23704 case LE:
23705 case UNGT:
23706 case UNGE:
23707 /* These are supported directly. */
23708 break;
23710 case EQ:
23711 case NE:
23712 case UNORDERED:
23713 case ORDERED:
23714 /* AVX has 3 operand comparisons, no need to swap anything. */
23715 if (TARGET_AVX)
23716 break;
23717 /* For commutative operators, try to canonicalize the destination
23718 operand to be first in the comparison - this helps reload to
23719 avoid extra moves. */
23720 if (!dest || !rtx_equal_p (dest, *pop1))
23721 break;
23722 /* FALLTHRU */
23724 case GE:
23725 case GT:
23726 case UNLE:
23727 case UNLT:
23728 /* These are not supported directly before AVX, and furthermore
23729 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23730 comparison operands to transform into something that is
23731 supported. */
23732 std::swap (*pop0, *pop1);
23733 code = swap_condition (code);
23734 break;
23736 default:
23737 gcc_unreachable ();
23740 return code;
23743 /* Detect conditional moves that exactly match min/max operational
23744 semantics. Note that this is IEEE safe, as long as we don't
23745 interchange the operands.
23747 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23748 and TRUE if the operation is successful and instructions are emitted. */
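/* This works because minss/minps (a, b) returns B whenever the operands are
   unordered or compare equal (e.g. -0.0 vs. +0.0), so a < b ? a : b maps
   onto the instruction only with the operands in this order, hence the care
   above not to interchange them.  */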
23750 static bool
23751 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23752 rtx cmp_op1, rtx if_true, rtx if_false)
23754 machine_mode mode;
23755 bool is_min;
23756 rtx tmp;
23758 if (code == LT)
23760 else if (code == UNGE)
23761 std::swap (if_true, if_false);
23762 else
23763 return false;
23765 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23766 is_min = true;
23767 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23768 is_min = false;
23769 else
23770 return false;
23772 mode = GET_MODE (dest);
23774 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23775 but MODE may be a vector mode and thus not appropriate. */
23776 if (!flag_finite_math_only || flag_signed_zeros)
23778 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23779 rtvec v;
23781 if_true = force_reg (mode, if_true);
23782 v = gen_rtvec (2, if_true, if_false);
23783 tmp = gen_rtx_UNSPEC (mode, v, u);
23785 else
23787 code = is_min ? SMIN : SMAX;
23788 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23791 emit_insn (gen_rtx_SET (dest, tmp));
23792 return true;
23795 /* Expand an sse vector comparison. Return the register with the result. */
23797 static rtx
23798 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23799 rtx op_true, rtx op_false)
23801 machine_mode mode = GET_MODE (dest);
23802 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23804 /* In the general case the result of a comparison can differ from the operands' type. */
23805 machine_mode cmp_mode;
23807 /* In AVX512F the result of comparison is an integer mask. */
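/* E.g. comparing two V16SImode operands under AVX512F yields a 16-bit
   (HImode) mask, one bit per element, rather than a V16SImode vector.  */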
23808 bool maskcmp = false;
23809 rtx x;
23811 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23813 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
23814 gcc_assert (cmp_mode != BLKmode);
23816 maskcmp = true;
23818 else
23819 cmp_mode = cmp_ops_mode;
23822 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23823 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23824 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23826 if (optimize
23827 || (maskcmp && cmp_mode != mode)
23828 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23829 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23830 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23832 /* Compare patterns for int modes are unspec in AVX512F only. */
23833 if (maskcmp && (code == GT || code == EQ))
23835 rtx (*gen)(rtx, rtx, rtx);
23837 switch (cmp_ops_mode)
23839 case V64QImode:
23840 gcc_assert (TARGET_AVX512BW);
23841 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23842 break;
23843 case V32HImode:
23844 gcc_assert (TARGET_AVX512BW);
23845 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23846 break;
23847 case V16SImode:
23848 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23849 break;
23850 case V8DImode:
23851 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23852 break;
23853 default:
23854 gen = NULL;
23857 if (gen)
23859 emit_insn (gen (dest, cmp_op0, cmp_op1));
23860 return dest;
23863 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23865 if (cmp_mode != mode && !maskcmp)
23867 x = force_reg (cmp_ops_mode, x);
23868 convert_move (dest, x, false);
23870 else
23871 emit_insn (gen_rtx_SET (dest, x));
23873 return dest;
23876 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23877 operations. This is used for both scalar and vector conditional moves. */
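/* When no blend instruction applies, the fallback at the end of this function
   materializes dest = (cmp & op_true) | (~cmp & op_false) using AND, NOT+AND
   and IOR.  */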
23879 void
23880 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23882 machine_mode mode = GET_MODE (dest);
23883 machine_mode cmpmode = GET_MODE (cmp);
23885 /* In AVX512F the result of comparison is an integer mask. */
23886 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23888 rtx t2, t3, x;
23890 /* If we have an integer mask and an FP value then we need
23891 to cast the mask to FP mode. */
23892 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23894 cmp = force_reg (cmpmode, cmp);
23895 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23898 if (vector_all_ones_operand (op_true, mode)
23899 && rtx_equal_p (op_false, CONST0_RTX (mode))
23900 && !maskcmp)
23902 emit_insn (gen_rtx_SET (dest, cmp));
23904 else if (op_false == CONST0_RTX (mode)
23905 && !maskcmp)
23907 op_true = force_reg (mode, op_true);
23908 x = gen_rtx_AND (mode, cmp, op_true);
23909 emit_insn (gen_rtx_SET (dest, x));
23911 else if (op_true == CONST0_RTX (mode)
23912 && !maskcmp)
23914 op_false = force_reg (mode, op_false);
23915 x = gen_rtx_NOT (mode, cmp);
23916 x = gen_rtx_AND (mode, x, op_false);
23917 emit_insn (gen_rtx_SET (dest, x));
23919 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23920 && !maskcmp)
23922 op_false = force_reg (mode, op_false);
23923 x = gen_rtx_IOR (mode, cmp, op_false);
23924 emit_insn (gen_rtx_SET (dest, x));
23926 else if (TARGET_XOP
23927 && !maskcmp)
23929 op_true = force_reg (mode, op_true);
23931 if (!nonimmediate_operand (op_false, mode))
23932 op_false = force_reg (mode, op_false);
23934 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23935 op_true,
23936 op_false)));
23938 else
23940 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23941 rtx d = dest;
23943 if (!nonimmediate_operand (op_true, mode))
23944 op_true = force_reg (mode, op_true);
23946 op_false = force_reg (mode, op_false);
23948 switch (mode)
23950 case V4SFmode:
23951 if (TARGET_SSE4_1)
23952 gen = gen_sse4_1_blendvps;
23953 break;
23954 case V2DFmode:
23955 if (TARGET_SSE4_1)
23956 gen = gen_sse4_1_blendvpd;
23957 break;
23958 case V16QImode:
23959 case V8HImode:
23960 case V4SImode:
23961 case V2DImode:
23962 if (TARGET_SSE4_1)
23964 gen = gen_sse4_1_pblendvb;
23965 if (mode != V16QImode)
23966 d = gen_reg_rtx (V16QImode);
23967 op_false = gen_lowpart (V16QImode, op_false);
23968 op_true = gen_lowpart (V16QImode, op_true);
23969 cmp = gen_lowpart (V16QImode, cmp);
23971 break;
23972 case V8SFmode:
23973 if (TARGET_AVX)
23974 gen = gen_avx_blendvps256;
23975 break;
23976 case V4DFmode:
23977 if (TARGET_AVX)
23978 gen = gen_avx_blendvpd256;
23979 break;
23980 case V32QImode:
23981 case V16HImode:
23982 case V8SImode:
23983 case V4DImode:
23984 if (TARGET_AVX2)
23986 gen = gen_avx2_pblendvb;
23987 if (mode != V32QImode)
23988 d = gen_reg_rtx (V32QImode);
23989 op_false = gen_lowpart (V32QImode, op_false);
23990 op_true = gen_lowpart (V32QImode, op_true);
23991 cmp = gen_lowpart (V32QImode, cmp);
23993 break;
23995 case V64QImode:
23996 gen = gen_avx512bw_blendmv64qi;
23997 break;
23998 case V32HImode:
23999 gen = gen_avx512bw_blendmv32hi;
24000 break;
24001 case V16SImode:
24002 gen = gen_avx512f_blendmv16si;
24003 break;
24004 case V8DImode:
24005 gen = gen_avx512f_blendmv8di;
24006 break;
24007 case V8DFmode:
24008 gen = gen_avx512f_blendmv8df;
24009 break;
24010 case V16SFmode:
24011 gen = gen_avx512f_blendmv16sf;
24012 break;
24014 default:
24015 break;
24018 if (gen != NULL)
24020 emit_insn (gen (d, op_false, op_true, cmp));
24021 if (d != dest)
24022 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
24024 else
24026 op_true = force_reg (mode, op_true);
24028 t2 = gen_reg_rtx (mode);
24029 if (optimize)
24030 t3 = gen_reg_rtx (mode);
24031 else
24032 t3 = dest;
24034 x = gen_rtx_AND (mode, op_true, cmp);
24035 emit_insn (gen_rtx_SET (t2, x));
24037 x = gen_rtx_NOT (mode, cmp);
24038 x = gen_rtx_AND (mode, x, op_false);
24039 emit_insn (gen_rtx_SET (t3, x));
24041 x = gen_rtx_IOR (mode, t3, t2);
24042 emit_insn (gen_rtx_SET (dest, x));
24047 /* Expand a floating-point conditional move. Return true if successful. */
24049 bool
24050 ix86_expand_fp_movcc (rtx operands[])
24052 machine_mode mode = GET_MODE (operands[0]);
24053 enum rtx_code code = GET_CODE (operands[1]);
24054 rtx tmp, compare_op;
24055 rtx op0 = XEXP (operands[1], 0);
24056 rtx op1 = XEXP (operands[1], 1);
24058 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24060 machine_mode cmode;
24062 /* Since we've no cmove for sse registers, don't force bad register
24063 allocation just to gain access to it. Deny movcc when the
24064 comparison mode doesn't match the move mode. */
24065 cmode = GET_MODE (op0);
24066 if (cmode == VOIDmode)
24067 cmode = GET_MODE (op1);
24068 if (cmode != mode)
24069 return false;
24071 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24072 if (code == UNKNOWN)
24073 return false;
24075 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24076 operands[2], operands[3]))
24077 return true;
24079 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24080 operands[2], operands[3]);
24081 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24082 return true;
24085 if (GET_MODE (op0) == TImode
24086 || (GET_MODE (op0) == DImode
24087 && !TARGET_64BIT))
24088 return false;
24090 /* The floating point conditional move instructions don't directly
24091 support conditions resulting from a signed integer comparison. */
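/* The fcmov variants only test CF/ZF/PF (b, e, be, u and their negations),
   so e.g. a signed GE, which needs SF/OF, is first reduced to a setcc byte
   below and then re-tested against zero.  */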
24093 compare_op = ix86_expand_compare (code, op0, op1);
24094 if (!fcmov_comparison_operator (compare_op, VOIDmode))
24096 tmp = gen_reg_rtx (QImode);
24097 ix86_expand_setcc (tmp, code, op0, op1);
24099 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24102 emit_insn (gen_rtx_SET (operands[0],
24103 gen_rtx_IF_THEN_ELSE (mode, compare_op,
24104 operands[2], operands[3])));
24106 return true;
24109 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24111 static int
24112 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24114 switch (code)
24116 case EQ:
24117 return 0;
24118 case LT:
24119 case LTU:
24120 return 1;
24121 case LE:
24122 case LEU:
24123 return 2;
24124 case NE:
24125 return 4;
24126 case GE:
24127 case GEU:
24128 return 5;
24129 case GT:
24130 case GTU:
24131 return 6;
24132 default:
24133 gcc_unreachable ();
24137 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
24139 static int
24140 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24142 switch (code)
24144 case EQ:
24145 return 0x00;
24146 case NE:
24147 return 0x04;
24148 case GT:
24149 return 0x0e;
24150 case LE:
24151 return 0x02;
24152 case GE:
24153 return 0x0d;
24154 case LT:
24155 return 0x01;
24156 case UNLE:
24157 return 0x0a;
24158 case UNLT:
24159 return 0x09;
24160 case UNGE:
24161 return 0x05;
24162 case UNGT:
24163 return 0x06;
24164 case UNEQ:
24165 return 0x18;
24166 case LTGT:
24167 return 0x0c;
24168 case ORDERED:
24169 return 0x07;
24170 case UNORDERED:
24171 return 0x03;
24172 default:
24173 gcc_unreachable ();
24177 /* Return immediate value to be used in UNSPEC_PCMP
24178 for comparison CODE in MODE. */
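/* These values follow the vpcmp / vcmp predicate encodings; e.g. for the
   integer forms 0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT, 6 = NLE, and for
   the FP forms 0x00 = EQ_OQ, 0x01 = LT_OS, 0x0d = GE_OS, 0x0e = GT_OS
   (illustrative subset).  */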
24180 static int
24181 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
24183 if (FLOAT_MODE_P (mode))
24184 return ix86_fp_cmp_code_to_pcmp_immediate (code);
24185 return ix86_int_cmp_code_to_pcmp_immediate (code);
24188 /* Expand AVX-512 vector comparison. */
24190 bool
24191 ix86_expand_mask_vec_cmp (rtx operands[])
24193 machine_mode mask_mode = GET_MODE (operands[0]);
24194 machine_mode cmp_mode = GET_MODE (operands[2]);
24195 enum rtx_code code = GET_CODE (operands[1]);
24196 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
24197 int unspec_code;
24198 rtx unspec;
24200 switch (code)
24202 case LEU:
24203 case GTU:
24204 case GEU:
24205 case LTU:
24206 unspec_code = UNSPEC_UNSIGNED_PCMP;
24207 break;
24209 default:
24210 unspec_code = UNSPEC_PCMP;
24213 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
24214 operands[3], imm),
24215 unspec_code);
24216 emit_insn (gen_rtx_SET (operands[0], unspec));
24218 return true;
24221 /* Expand fp vector comparison. */
24223 bool
24224 ix86_expand_fp_vec_cmp (rtx operands[])
24226 enum rtx_code code = GET_CODE (operands[1]);
24227 rtx cmp;
24229 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24230 &operands[2], &operands[3]);
24231 if (code == UNKNOWN)
24233 rtx temp;
24234 switch (GET_CODE (operands[1]))
24236 case LTGT:
24237 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
24238 operands[3], NULL, NULL);
24239 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
24240 operands[3], NULL, NULL);
24241 code = AND;
24242 break;
24243 case UNEQ:
24244 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
24245 operands[3], NULL, NULL);
24246 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
24247 operands[3], NULL, NULL);
24248 code = IOR;
24249 break;
24250 default:
24251 gcc_unreachable ();
24253 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24254 OPTAB_DIRECT);
24256 else
24257 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
24258 operands[1], operands[2]);
24260 if (operands[0] != cmp)
24261 emit_move_insn (operands[0], cmp);
24263 return true;
24266 static rtx
24267 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24268 rtx op_true, rtx op_false, bool *negate)
24270 machine_mode data_mode = GET_MODE (dest);
24271 machine_mode mode = GET_MODE (cop0);
24272 rtx x;
24274 *negate = false;
24276 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24277 if (TARGET_XOP
24278 && (mode == V16QImode || mode == V8HImode
24279 || mode == V4SImode || mode == V2DImode))
24281 else
24283 /* Canonicalize the comparison to EQ, GT, GTU. */
24284 switch (code)
24286 case EQ:
24287 case GT:
24288 case GTU:
24289 break;
24291 case NE:
24292 case LE:
24293 case LEU:
24294 code = reverse_condition (code);
24295 *negate = true;
24296 break;
24298 case GE:
24299 case GEU:
24300 code = reverse_condition (code);
24301 *negate = true;
24302 /* FALLTHRU */
24304 case LT:
24305 case LTU:
24306 std::swap (cop0, cop1);
24307 code = swap_condition (code);
24308 break;
24310 default:
24311 gcc_unreachable ();
24314 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24315 if (mode == V2DImode)
24317 switch (code)
24319 case EQ:
24320 /* SSE4.1 supports EQ. */
24321 if (!TARGET_SSE4_1)
24322 return NULL;
24323 break;
24325 case GT:
24326 case GTU:
24327 /* SSE4.2 supports GT/GTU. */
24328 if (!TARGET_SSE4_2)
24329 return NULL;
24330 break;
24332 default:
24333 gcc_unreachable ();
24337 /* Unsigned parallel compare is not supported by the hardware.
24338 Play some tricks to turn this into a signed comparison
24339 against 0. */
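/* The trick: a >u b is equivalent to (a - 0x80..0) >s (b - 0x80..0);
   subtracting the sign-bit mask flips the sign bit of each element and
   turns unsigned order into signed order.  */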
24340 if (code == GTU)
24342 cop0 = force_reg (mode, cop0);
24344 switch (mode)
24346 case V16SImode:
24347 case V8DImode:
24348 case V8SImode:
24349 case V4DImode:
24350 case V4SImode:
24351 case V2DImode:
24353 rtx t1, t2, mask;
24354 rtx (*gen_sub3) (rtx, rtx, rtx);
24356 switch (mode)
24358 case V16SImode: gen_sub3 = gen_subv16si3; break;
24359 case V8DImode: gen_sub3 = gen_subv8di3; break;
24360 case V8SImode: gen_sub3 = gen_subv8si3; break;
24361 case V4DImode: gen_sub3 = gen_subv4di3; break;
24362 case V4SImode: gen_sub3 = gen_subv4si3; break;
24363 case V2DImode: gen_sub3 = gen_subv2di3; break;
24364 default:
24365 gcc_unreachable ();
24367 /* Subtract (-(INT MAX) - 1) from both operands to make
24368 them signed. */
24369 mask = ix86_build_signbit_mask (mode, true, false);
24370 t1 = gen_reg_rtx (mode);
24371 emit_insn (gen_sub3 (t1, cop0, mask));
24373 t2 = gen_reg_rtx (mode);
24374 emit_insn (gen_sub3 (t2, cop1, mask));
24376 cop0 = t1;
24377 cop1 = t2;
24378 code = GT;
24380 break;
24382 case V64QImode:
24383 case V32HImode:
24384 case V32QImode:
24385 case V16HImode:
24386 case V16QImode:
24387 case V8HImode:
24388 /* Perform a parallel unsigned saturating subtraction. */
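/* There is no unsigned greater-than for these element sizes; instead use
   a >u b  <==>  (a -us b) != 0, i.e. compare the saturating difference
   against zero for equality and negate the result.  */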
24389 x = gen_reg_rtx (mode);
24390 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24391 cop1)));
24393 cop0 = x;
24394 cop1 = CONST0_RTX (mode);
24395 code = EQ;
24396 *negate = !*negate;
24397 break;
24399 default:
24400 gcc_unreachable ();
24405 if (*negate)
24406 std::swap (op_true, op_false);
24408 /* Allow the comparison to be done in one mode, but the movcc to
24409 happen in another mode. */
24410 if (data_mode == mode)
24412 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24413 op_true, op_false);
24415 else
24417 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24418 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24419 op_true, op_false);
24420 if (GET_MODE (x) == mode)
24421 x = gen_lowpart (data_mode, x);
24424 return x;
24427 /* Expand integer vector comparison. */
24429 bool
24430 ix86_expand_int_vec_cmp (rtx operands[])
24432 rtx_code code = GET_CODE (operands[1]);
24433 bool negate = false;
24434 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24435 operands[3], NULL, NULL, &negate);
24437 if (!cmp)
24438 return false;
24440 if (negate)
24441 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24442 CONST0_RTX (GET_MODE (cmp)),
24443 NULL, NULL, &negate);
24445 gcc_assert (!negate);
24447 if (operands[0] != cmp)
24448 emit_move_insn (operands[0], cmp);
24450 return true;
24453 /* Expand a floating-point vector conditional move; a vcond operation
24454 rather than a movcc operation. */
24456 bool
24457 ix86_expand_fp_vcond (rtx operands[])
24459 enum rtx_code code = GET_CODE (operands[3]);
24460 rtx cmp;
24462 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24463 &operands[4], &operands[5]);
24464 if (code == UNKNOWN)
24466 rtx temp;
24467 switch (GET_CODE (operands[3]))
24469 case LTGT:
24470 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24471 operands[5], operands[0], operands[0]);
24472 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24473 operands[5], operands[1], operands[2]);
24474 code = AND;
24475 break;
24476 case UNEQ:
24477 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24478 operands[5], operands[0], operands[0]);
24479 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24480 operands[5], operands[1], operands[2]);
24481 code = IOR;
24482 break;
24483 default:
24484 gcc_unreachable ();
24486 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24487 OPTAB_DIRECT);
24488 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24489 return true;
24492 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24493 operands[5], operands[1], operands[2]))
24494 return true;
24496 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24497 operands[1], operands[2]);
24498 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24499 return true;
24502 /* Expand a signed/unsigned integral vector conditional move. */
24504 bool
24505 ix86_expand_int_vcond (rtx operands[])
24507 machine_mode data_mode = GET_MODE (operands[0]);
24508 machine_mode mode = GET_MODE (operands[4]);
24509 enum rtx_code code = GET_CODE (operands[3]);
24510 bool negate = false;
24511 rtx x, cop0, cop1;
24513 cop0 = operands[4];
24514 cop1 = operands[5];
24516 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24517 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
24518 if ((code == LT || code == GE)
24519 && data_mode == mode
24520 && cop1 == CONST0_RTX (mode)
24521 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24522 && GET_MODE_UNIT_SIZE (data_mode) > 1
24523 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24524 && (GET_MODE_SIZE (data_mode) == 16
24525 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24527 rtx negop = operands[2 - (code == LT)];
24528 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24529 if (negop == CONST1_RTX (data_mode))
24531 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24532 operands[0], 1, OPTAB_DIRECT);
24533 if (res != operands[0])
24534 emit_move_insn (operands[0], res);
24535 return true;
24537 else if (GET_MODE_INNER (data_mode) != DImode
24538 && vector_all_ones_operand (negop, data_mode))
24540 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24541 operands[0], 0, OPTAB_DIRECT);
24542 if (res != operands[0])
24543 emit_move_insn (operands[0], res);
24544 return true;
24548 if (!nonimmediate_operand (cop1, mode))
24549 cop1 = force_reg (mode, cop1);
24550 if (!general_operand (operands[1], data_mode))
24551 operands[1] = force_reg (data_mode, operands[1]);
24552 if (!general_operand (operands[2], data_mode))
24553 operands[2] = force_reg (data_mode, operands[2]);
24555 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24556 operands[1], operands[2], &negate);
24558 if (!x)
24559 return false;
24561 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24562 operands[2-negate]);
24563 return true;
24566 /* AVX512F does support 64-byte integer vector operations,
24567 thus the longest vector we are faced with is V64QImode. */
24568 #define MAX_VECT_LEN 64
24570 struct expand_vec_perm_d
24572 rtx target, op0, op1;
24573 unsigned char perm[MAX_VECT_LEN];
24574 machine_mode vmode;
24575 unsigned char nelt;
24576 bool one_operand_p;
24577 bool testing_p;
24580 static bool
24581 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
24582 struct expand_vec_perm_d *d)
24584 /* ix86_expand_vec_perm_vpermi2 is called from both the const and non-const
24585 expanders, so the arguments are either in D, or in OP0, OP1 etc. */
24586 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24587 machine_mode maskmode = mode;
24588 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24590 switch (mode)
24592 case V8HImode:
24593 if (TARGET_AVX512VL && TARGET_AVX512BW)
24594 gen = gen_avx512vl_vpermi2varv8hi3;
24595 break;
24596 case V16HImode:
24597 if (TARGET_AVX512VL && TARGET_AVX512BW)
24598 gen = gen_avx512vl_vpermi2varv16hi3;
24599 break;
24600 case V64QImode:
24601 if (TARGET_AVX512VBMI)
24602 gen = gen_avx512bw_vpermi2varv64qi3;
24603 break;
24604 case V32HImode:
24605 if (TARGET_AVX512BW)
24606 gen = gen_avx512bw_vpermi2varv32hi3;
24607 break;
24608 case V4SImode:
24609 if (TARGET_AVX512VL)
24610 gen = gen_avx512vl_vpermi2varv4si3;
24611 break;
24612 case V8SImode:
24613 if (TARGET_AVX512VL)
24614 gen = gen_avx512vl_vpermi2varv8si3;
24615 break;
24616 case V16SImode:
24617 if (TARGET_AVX512F)
24618 gen = gen_avx512f_vpermi2varv16si3;
24619 break;
24620 case V4SFmode:
24621 if (TARGET_AVX512VL)
24623 gen = gen_avx512vl_vpermi2varv4sf3;
24624 maskmode = V4SImode;
24626 break;
24627 case V8SFmode:
24628 if (TARGET_AVX512VL)
24630 gen = gen_avx512vl_vpermi2varv8sf3;
24631 maskmode = V8SImode;
24633 break;
24634 case V16SFmode:
24635 if (TARGET_AVX512F)
24637 gen = gen_avx512f_vpermi2varv16sf3;
24638 maskmode = V16SImode;
24640 break;
24641 case V2DImode:
24642 if (TARGET_AVX512VL)
24643 gen = gen_avx512vl_vpermi2varv2di3;
24644 break;
24645 case V4DImode:
24646 if (TARGET_AVX512VL)
24647 gen = gen_avx512vl_vpermi2varv4di3;
24648 break;
24649 case V8DImode:
24650 if (TARGET_AVX512F)
24651 gen = gen_avx512f_vpermi2varv8di3;
24652 break;
24653 case V2DFmode:
24654 if (TARGET_AVX512VL)
24656 gen = gen_avx512vl_vpermi2varv2df3;
24657 maskmode = V2DImode;
24659 break;
24660 case V4DFmode:
24661 if (TARGET_AVX512VL)
24663 gen = gen_avx512vl_vpermi2varv4df3;
24664 maskmode = V4DImode;
24666 break;
24667 case V8DFmode:
24668 if (TARGET_AVX512F)
24670 gen = gen_avx512f_vpermi2varv8df3;
24671 maskmode = V8DImode;
24673 break;
24674 default:
24675 break;
24678 if (gen == NULL)
24679 return false;
24681 /* ix86_expand_vec_perm_vpermi2 is called from both the const and non-const
24682 expanders, so the arguments are either in D, or in OP0, OP1 etc. */
24683 if (d)
24685 rtx vec[64];
24686 target = d->target;
24687 op0 = d->op0;
24688 op1 = d->op1;
24689 for (int i = 0; i < d->nelt; ++i)
24690 vec[i] = GEN_INT (d->perm[i]);
24691 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24694 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
24695 return true;
24698 /* Expand a variable vector permutation. */
24700 void
24701 ix86_expand_vec_perm (rtx operands[])
24703 rtx target = operands[0];
24704 rtx op0 = operands[1];
24705 rtx op1 = operands[2];
24706 rtx mask = operands[3];
24707 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24708 machine_mode mode = GET_MODE (op0);
24709 machine_mode maskmode = GET_MODE (mask);
24710 int w, e, i;
24711 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24713 /* Number of elements in the vector. */
24714 w = GET_MODE_NUNITS (mode);
24715 e = GET_MODE_UNIT_SIZE (mode);
24716 gcc_assert (w <= 64);
24718 if (TARGET_AVX512F && one_operand_shuffle)
24720 rtx (*gen) (rtx, rtx, rtx) = NULL;
24721 switch (mode)
24723 case V16SImode:
24724 gen = gen_avx512f_permvarv16si;
24725 break;
24726 case V16SFmode:
24727 gen = gen_avx512f_permvarv16sf;
24728 break;
24729 case V8DImode:
24730 gen = gen_avx512f_permvarv8di;
24731 break;
24732 case V8DFmode:
24733 gen = gen_avx512f_permvarv8df;
24734 break;
24735 default:
24736 break;
24738 if (gen != NULL)
24740 emit_insn (gen (target, op0, mask));
24741 return;
24745 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
24746 return;
24748 if (TARGET_AVX2)
24750 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24752 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24753 an constant shuffle operand. With a tiny bit of effort we can
24754 use VPERMD instead. A re-interpretation stall for V4DFmode is
24755 unfortunate but there's no avoiding it.
24756 Similarly for V16HImode we don't have instructions for variable
24757 shuffling, while for V32QImode we can use vpshufb; vpshufb;
24758 vpermq; vpor after preparing suitable masks. */
24760 if (mode == V16HImode)
24762 maskmode = mode = V32QImode;
24763 w = 32;
24764 e = 1;
24766 else
24768 maskmode = mode = V8SImode;
24769 w = 8;
24770 e = 4;
24772 t1 = gen_reg_rtx (maskmode);
24774 /* Replicate the low bits of the V4DImode mask into V8SImode:
24775 mask = { A B C D }
24776 t1 = { A A B B C C D D }. */
24777 for (i = 0; i < w / 2; ++i)
24778 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24779 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24780 vt = force_reg (maskmode, vt);
24781 mask = gen_lowpart (maskmode, mask);
24782 if (maskmode == V8SImode)
24783 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24784 else
24785 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24787 /* Multiply the shuffle indices by two. */
24788 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24789 OPTAB_DIRECT);
24791 /* Add one to the odd shuffle indices:
24792 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24793 for (i = 0; i < w / 2; ++i)
24795 vec[i * 2] = const0_rtx;
24796 vec[i * 2 + 1] = const1_rtx;
24798 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24799 vt = validize_mem (force_const_mem (maskmode, vt));
24800 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24801 OPTAB_DIRECT);
24803 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24804 operands[3] = mask = t1;
24805 target = gen_reg_rtx (mode);
24806 op0 = gen_lowpart (mode, op0);
24807 op1 = gen_lowpart (mode, op1);
24810 switch (mode)
24812 case V8SImode:
24813 /* The VPERMD and VPERMPS instructions already properly ignore
24814 the high bits of the shuffle elements. No need for us to
24815 perform an AND ourselves. */
24816 if (one_operand_shuffle)
24818 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24819 if (target != operands[0])
24820 emit_move_insn (operands[0],
24821 gen_lowpart (GET_MODE (operands[0]), target));
24823 else
24825 t1 = gen_reg_rtx (V8SImode);
24826 t2 = gen_reg_rtx (V8SImode);
24827 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24828 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24829 goto merge_two;
24831 return;
24833 case V8SFmode:
24834 mask = gen_lowpart (V8SImode, mask);
24835 if (one_operand_shuffle)
24836 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24837 else
24839 t1 = gen_reg_rtx (V8SFmode);
24840 t2 = gen_reg_rtx (V8SFmode);
24841 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24842 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24843 goto merge_two;
24845 return;
24847 case V4SImode:
24848 /* By combining the two 128-bit input vectors into one 256-bit
24849 input vector, we can use VPERMD and VPERMPS for the full
24850 two-operand shuffle. */
24851 t1 = gen_reg_rtx (V8SImode);
24852 t2 = gen_reg_rtx (V8SImode);
24853 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24854 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24855 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24856 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24857 return;
24859 case V4SFmode:
24860 t1 = gen_reg_rtx (V8SFmode);
24861 t2 = gen_reg_rtx (V8SImode);
24862 mask = gen_lowpart (V4SImode, mask);
24863 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24864 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24865 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24866 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24867 return;
24869 case V32QImode:
24870 t1 = gen_reg_rtx (V32QImode);
24871 t2 = gen_reg_rtx (V32QImode);
24872 t3 = gen_reg_rtx (V32QImode);
24873 vt2 = GEN_INT (-128);
24874 for (i = 0; i < 32; i++)
24875 vec[i] = vt2;
24876 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24877 vt = force_reg (V32QImode, vt);
24878 for (i = 0; i < 32; i++)
24879 vec[i] = i < 16 ? vt2 : const0_rtx;
24880 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24881 vt2 = force_reg (V32QImode, vt2);
24882 /* From mask create two adjusted masks, which contain the same
24883 bits as mask in the low 7 bits of each vector element.
24884 The first mask will have the most significant bit clear
24885 if it requests an element from the same 128-bit lane
24886 and MSB set if it requests an element from the other 128-bit lane.
24887 The second mask will have the opposite values of the MSB,
24888 and additionally will have its 128-bit lanes swapped.
24889 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24890 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24891 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24892 stands for the other 12 bytes. */
24893 /* The bit that tells whether an element is from the same lane or the
24894 other lane is bit 4, so shift it up by 3 to the MSB position. */
24895 t5 = gen_reg_rtx (V4DImode);
24896 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24897 GEN_INT (3)));
24898 /* Clear MSB bits from the mask just in case it had them set. */
24899 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24900 /* After this t1 will have MSB set for elements from the other lane. */
24901 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24902 /* Clear bits other than MSB. */
24903 emit_insn (gen_andv32qi3 (t1, t1, vt));
24904 /* Or in the lower bits from mask into t3. */
24905 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24906 /* And invert MSB bits in t1, so MSB is set for elements from the same
24907 lane. */
24908 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24909 /* Swap 128-bit lanes in t3. */
24910 t6 = gen_reg_rtx (V4DImode);
24911 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24912 const2_rtx, GEN_INT (3),
24913 const0_rtx, const1_rtx));
24914 /* And or in the lower bits from mask into t1. */
24915 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24916 if (one_operand_shuffle)
24918 /* Each of these shuffles will put 0s in places where
24919 an element from the other 128-bit lane is needed; otherwise it
24920 will shuffle in the requested value. */
24921 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24922 gen_lowpart (V32QImode, t6)));
24923 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24924 /* For t3 the 128-bit lanes are swapped again. */
24925 t7 = gen_reg_rtx (V4DImode);
24926 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24927 const2_rtx, GEN_INT (3),
24928 const0_rtx, const1_rtx));
24929 /* And ORing both together leads to the result. */
24930 emit_insn (gen_iorv32qi3 (target, t1,
24931 gen_lowpart (V32QImode, t7)));
24932 if (target != operands[0])
24933 emit_move_insn (operands[0],
24934 gen_lowpart (GET_MODE (operands[0]), target));
24935 return;
24938 t4 = gen_reg_rtx (V32QImode);
24939 /* Similarly to the above one_operand_shuffle code,
24940 just repeated twice for each operand. The merge_two:
24941 code will merge the two results together. */
24942 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24943 gen_lowpart (V32QImode, t6)));
24944 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24945 gen_lowpart (V32QImode, t6)));
24946 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24947 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24948 t7 = gen_reg_rtx (V4DImode);
24949 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24950 const2_rtx, GEN_INT (3),
24951 const0_rtx, const1_rtx));
24952 t8 = gen_reg_rtx (V4DImode);
24953 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24954 const2_rtx, GEN_INT (3),
24955 const0_rtx, const1_rtx));
24956 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24957 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24958 t1 = t4;
24959 t2 = t3;
24960 goto merge_two;
24962 default:
24963 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24964 break;
24968 if (TARGET_XOP)
24970 /* The XOP VPPERM insn supports three inputs. By ignoring the
24971 one_operand_shuffle special case, we avoid creating another
24972 set of constant vectors in memory. */
24973 one_operand_shuffle = false;
24975 /* mask = mask & {2*w-1, ...} */
24976 vt = GEN_INT (2*w - 1);
24978 else
24980 /* mask = mask & {w-1, ...} */
24981 vt = GEN_INT (w - 1);
24984 for (i = 0; i < w; i++)
24985 vec[i] = vt;
24986 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24987 mask = expand_simple_binop (maskmode, AND, mask, vt,
24988 NULL_RTX, 0, OPTAB_DIRECT);
24990 /* For non-QImode operations, convert the word permutation control
24991 into a byte permutation control. */
24992 if (mode != V16QImode)
24994 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24995 GEN_INT (exact_log2 (e)),
24996 NULL_RTX, 0, OPTAB_DIRECT);
24998 /* Convert mask to vector of chars. */
24999 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25001 /* Replicate each of the input bytes into byte positions:
25002 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25003 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25004 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25005 for (i = 0; i < 16; ++i)
25006 vec[i] = GEN_INT (i/e * e);
25007 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25008 vt = validize_mem (force_const_mem (V16QImode, vt));
25009 if (TARGET_XOP)
25010 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25011 else
25012 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25014 /* Convert it into the byte positions by doing
25015 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
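/* For example, with V4SImode elements (e == 4) a word index of 3 was
   scaled to 12 and replicated above, so adding {0,1,2,3} turns it into
   the byte indices {12,13,14,15}.  */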
25016 for (i = 0; i < 16; ++i)
25017 vec[i] = GEN_INT (i % e);
25018 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25019 vt = validize_mem (force_const_mem (V16QImode, vt));
25020 emit_insn (gen_addv16qi3 (mask, mask, vt));
25023 /* The actual shuffle operations all operate on V16QImode. */
25024 op0 = gen_lowpart (V16QImode, op0);
25025 op1 = gen_lowpart (V16QImode, op1);
25027 if (TARGET_XOP)
25029 if (GET_MODE (target) != V16QImode)
25030 target = gen_reg_rtx (V16QImode);
25031 emit_insn (gen_xop_pperm (target, op0, op1, mask));
25032 if (target != operands[0])
25033 emit_move_insn (operands[0],
25034 gen_lowpart (GET_MODE (operands[0]), target));
25036 else if (one_operand_shuffle)
25038 if (GET_MODE (target) != V16QImode)
25039 target = gen_reg_rtx (V16QImode);
25040 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25041 if (target != operands[0])
25042 emit_move_insn (operands[0],
25043 gen_lowpart (GET_MODE (operands[0]), target));
25045 else
25047 rtx xops[6];
25048 bool ok;
25050 /* Shuffle the two input vectors independently. */
25051 t1 = gen_reg_rtx (V16QImode);
25052 t2 = gen_reg_rtx (V16QImode);
25053 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25054 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25056 merge_two:
25057 /* Then merge them together. The key is whether any given control
25058 element contained a bit set that indicates the second word. */
25059 mask = operands[3];
25060 vt = GEN_INT (w);
25061 if (maskmode == V2DImode && !TARGET_SSE4_1)
25063 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
25064 more shuffle to convert the V2DI input mask into a V4SI
25065 input mask. At which point the masking done by expand_int_vcond
25066 will work as desired. */
25067 rtx t3 = gen_reg_rtx (V4SImode);
25068 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25069 const0_rtx, const0_rtx,
25070 const2_rtx, const2_rtx));
25071 mask = t3;
25072 maskmode = V4SImode;
25073 e = w = 4;
25076 for (i = 0; i < w; i++)
25077 vec[i] = vt;
25078 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25079 vt = force_reg (maskmode, vt);
25080 mask = expand_simple_binop (maskmode, AND, mask, vt,
25081 NULL_RTX, 0, OPTAB_DIRECT);
25083 if (GET_MODE (target) != mode)
25084 target = gen_reg_rtx (mode);
25085 xops[0] = target;
25086 xops[1] = gen_lowpart (mode, t2);
25087 xops[2] = gen_lowpart (mode, t1);
25088 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25089 xops[4] = mask;
25090 xops[5] = vt;
25091 ok = ix86_expand_int_vcond (xops);
25092 gcc_assert (ok);
25093 if (target != operands[0])
25094 emit_move_insn (operands[0],
25095 gen_lowpart (GET_MODE (operands[0]), target));
25099 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
25100 true if we should do zero extension, else sign extension. HIGH_P is
25101 true if we want the N/2 high elements, else the low elements. */
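/* For example, with SRC in V8HImode and HIGH_P set, the four high HImode
   elements of SRC are sign- or zero-extended into the four SImode
   elements of DEST.  */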
25103 void
25104 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25106 machine_mode imode = GET_MODE (src);
25107 rtx tmp;
25109 if (TARGET_SSE4_1)
25111 rtx (*unpack)(rtx, rtx);
25112 rtx (*extract)(rtx, rtx) = NULL;
25113 machine_mode halfmode = BLKmode;
25115 switch (imode)
25117 case V64QImode:
25118 if (unsigned_p)
25119 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25120 else
25121 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25122 halfmode = V32QImode;
25123 extract
25124 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25125 break;
25126 case V32QImode:
25127 if (unsigned_p)
25128 unpack = gen_avx2_zero_extendv16qiv16hi2;
25129 else
25130 unpack = gen_avx2_sign_extendv16qiv16hi2;
25131 halfmode = V16QImode;
25132 extract
25133 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25134 break;
25135 case V32HImode:
25136 if (unsigned_p)
25137 unpack = gen_avx512f_zero_extendv16hiv16si2;
25138 else
25139 unpack = gen_avx512f_sign_extendv16hiv16si2;
25140 halfmode = V16HImode;
25141 extract
25142 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25143 break;
25144 case V16HImode:
25145 if (unsigned_p)
25146 unpack = gen_avx2_zero_extendv8hiv8si2;
25147 else
25148 unpack = gen_avx2_sign_extendv8hiv8si2;
25149 halfmode = V8HImode;
25150 extract
25151 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25152 break;
25153 case V16SImode:
25154 if (unsigned_p)
25155 unpack = gen_avx512f_zero_extendv8siv8di2;
25156 else
25157 unpack = gen_avx512f_sign_extendv8siv8di2;
25158 halfmode = V8SImode;
25159 extract
25160 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
25161 break;
25162 case V8SImode:
25163 if (unsigned_p)
25164 unpack = gen_avx2_zero_extendv4siv4di2;
25165 else
25166 unpack = gen_avx2_sign_extendv4siv4di2;
25167 halfmode = V4SImode;
25168 extract
25169 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
25170 break;
25171 case V16QImode:
25172 if (unsigned_p)
25173 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
25174 else
25175 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
25176 break;
25177 case V8HImode:
25178 if (unsigned_p)
25179 unpack = gen_sse4_1_zero_extendv4hiv4si2;
25180 else
25181 unpack = gen_sse4_1_sign_extendv4hiv4si2;
25182 break;
25183 case V4SImode:
25184 if (unsigned_p)
25185 unpack = gen_sse4_1_zero_extendv2siv2di2;
25186 else
25187 unpack = gen_sse4_1_sign_extendv2siv2di2;
25188 break;
25189 default:
25190 gcc_unreachable ();
25193 if (GET_MODE_SIZE (imode) >= 32)
25195 tmp = gen_reg_rtx (halfmode);
25196 emit_insn (extract (tmp, src));
25198 else if (high_p)
25200 /* Shift higher 8 bytes to lower 8 bytes. */
25201 tmp = gen_reg_rtx (V1TImode);
25202 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
25203 GEN_INT (64)));
25204 tmp = gen_lowpart (imode, tmp);
25206 else
25207 tmp = src;
25209 emit_insn (unpack (dest, tmp));
25211 else
25213 rtx (*unpack)(rtx, rtx, rtx);
25215 switch (imode)
25217 case V16QImode:
25218 if (high_p)
25219 unpack = gen_vec_interleave_highv16qi;
25220 else
25221 unpack = gen_vec_interleave_lowv16qi;
25222 break;
25223 case V8HImode:
25224 if (high_p)
25225 unpack = gen_vec_interleave_highv8hi;
25226 else
25227 unpack = gen_vec_interleave_lowv8hi;
25228 break;
25229 case V4SImode:
25230 if (high_p)
25231 unpack = gen_vec_interleave_highv4si;
25232 else
25233 unpack = gen_vec_interleave_lowv4si;
25234 break;
25235 default:
25236 gcc_unreachable ();
25239 if (unsigned_p)
25240 tmp = force_reg (imode, CONST0_RTX (imode));
25241 else
25242 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
25243 src, pc_rtx, pc_rtx);
25245 rtx tmp2 = gen_reg_rtx (imode);
25246 emit_insn (unpack (tmp2, src, tmp));
25247 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
25251 /* Expand conditional increment or decrement using adc/sbb instructions.
25252 The default case using setcc followed by the conditional move can be
25253 done by generic code. */
25254 bool
25255 ix86_expand_int_addcc (rtx operands[])
25257 enum rtx_code code = GET_CODE (operands[1]);
25258 rtx flags;
25259 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
25260 rtx compare_op;
25261 rtx val = const0_rtx;
25262 bool fpcmp = false;
25263 machine_mode mode;
25264 rtx op0 = XEXP (operands[1], 0);
25265 rtx op1 = XEXP (operands[1], 1);
25267 if (operands[3] != const1_rtx
25268 && operands[3] != constm1_rtx)
25269 return false;
25270 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25271 return false;
25272 code = GET_CODE (compare_op);
25274 flags = XEXP (compare_op, 0);
25276 if (GET_MODE (flags) == CCFPmode
25277 || GET_MODE (flags) == CCFPUmode)
25279 fpcmp = true;
25280 code = ix86_fp_compare_code_to_integer (code);
25283 if (code != LTU)
25285 val = constm1_rtx;
25286 if (fpcmp)
25287 PUT_CODE (compare_op,
25288 reverse_condition_maybe_unordered
25289 (GET_CODE (compare_op)));
25290 else
25291 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25294 mode = GET_MODE (operands[0]);
25296 /* Construct either adc or sbb insn. */
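/* The conditional +/-1 is folded into the carry flag: when the carry is
   set exactly for the LTU condition, x + (cond ? 1 : 0) becomes
   adc x, 0 and x - (cond ? 1 : 0) becomes sbb x, 0; the VAL == -1 cases
   below handle the inverted carry sense.  */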
25297 if ((code == LTU) == (operands[3] == constm1_rtx))
25299 switch (mode)
25301 case QImode:
25302 insn = gen_subqi3_carry;
25303 break;
25304 case HImode:
25305 insn = gen_subhi3_carry;
25306 break;
25307 case SImode:
25308 insn = gen_subsi3_carry;
25309 break;
25310 case DImode:
25311 insn = gen_subdi3_carry;
25312 break;
25313 default:
25314 gcc_unreachable ();
25317 else
25319 switch (mode)
25321 case QImode:
25322 insn = gen_addqi3_carry;
25323 break;
25324 case HImode:
25325 insn = gen_addhi3_carry;
25326 break;
25327 case SImode:
25328 insn = gen_addsi3_carry;
25329 break;
25330 case DImode:
25331 insn = gen_adddi3_carry;
25332 break;
25333 default:
25334 gcc_unreachable ();
25337 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25339 return true;
25343 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
25344 but works for floating point parameters and nonoffsettable memories.
25345 For pushes, it returns just stack offsets; the values will be saved
25346 in the right order. Maximally four parts are generated. */
25348 static int
25349 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25351 int size;
25353 if (!TARGET_64BIT)
25354 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25355 else
25356 size = (GET_MODE_SIZE (mode) + 4) / 8;
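/* E.g. on 32-bit targets DImode and DFmode split into 2 SImode parts,
   XFmode into 3 and TFmode into 4; on 64-bit targets TImode, XFmode and
   TFmode each split into 2 parts.  */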
25358 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25359 gcc_assert (size >= 2 && size <= 4);
25361 /* Optimize constant pool reference to immediates. This is used by fp
25362 moves, that force all constants to memory to allow combining. */
25363 if (MEM_P (operand) && MEM_READONLY_P (operand))
25365 rtx tmp = maybe_get_pool_constant (operand);
25366 if (tmp)
25367 operand = tmp;
25370 if (MEM_P (operand) && !offsettable_memref_p (operand))
25372 /* The only non-offsettable memories we handle are pushes. */
25373 int ok = push_operand (operand, VOIDmode);
25375 gcc_assert (ok);
25377 operand = copy_rtx (operand);
25378 PUT_MODE (operand, word_mode);
25379 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25380 return size;
25383 if (GET_CODE (operand) == CONST_VECTOR)
25385 machine_mode imode = int_mode_for_mode (mode);
25386 /* Caution: if we looked through a constant pool memory above,
25387 the operand may actually have a different mode now. That's
25388 ok, since we want to pun this all the way back to an integer. */
25389 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25390 gcc_assert (operand != NULL);
25391 mode = imode;
25394 if (!TARGET_64BIT)
25396 if (mode == DImode)
25397 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25398 else
25400 int i;
25402 if (REG_P (operand))
25404 gcc_assert (reload_completed);
25405 for (i = 0; i < size; i++)
25406 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25408 else if (offsettable_memref_p (operand))
25410 operand = adjust_address (operand, SImode, 0);
25411 parts[0] = operand;
25412 for (i = 1; i < size; i++)
25413 parts[i] = adjust_address (operand, SImode, 4 * i);
25415 else if (CONST_DOUBLE_P (operand))
25417 const REAL_VALUE_TYPE *r;
25418 long l[4];
25420 r = CONST_DOUBLE_REAL_VALUE (operand);
25421 switch (mode)
25423 case TFmode:
25424 real_to_target (l, r, mode);
25425 parts[3] = gen_int_mode (l[3], SImode);
25426 parts[2] = gen_int_mode (l[2], SImode);
25427 break;
25428 case XFmode:
25429 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25430 long double may not be 80-bit. */
25431 real_to_target (l, r, mode);
25432 parts[2] = gen_int_mode (l[2], SImode);
25433 break;
25434 case DFmode:
25435 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25436 break;
25437 default:
25438 gcc_unreachable ();
25440 parts[1] = gen_int_mode (l[1], SImode);
25441 parts[0] = gen_int_mode (l[0], SImode);
25443 else
25444 gcc_unreachable ();
25447 else
25449 if (mode == TImode)
25450 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25451 if (mode == XFmode || mode == TFmode)
25453 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25454 if (REG_P (operand))
25456 gcc_assert (reload_completed);
25457 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25458 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25460 else if (offsettable_memref_p (operand))
25462 operand = adjust_address (operand, DImode, 0);
25463 parts[0] = operand;
25464 parts[1] = adjust_address (operand, upper_mode, 8);
25466 else if (CONST_DOUBLE_P (operand))
25468 long l[4];
25470 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25472 /* real_to_target puts 32-bit pieces in each long. */
25473 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25474 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25475 << 32), DImode);
25477 if (upper_mode == SImode)
25478 parts[1] = gen_int_mode (l[2], SImode);
25479 else
25480 parts[1]
25481 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25482 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25483 << 32), DImode);
25485 else
25486 gcc_unreachable ();
25490 return size;
25493 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25494 Return false when normal moves are needed; true when all required
25495 insns have been emitted. Operands 2-4 contain the input values
25496 in the correct order; operands 5-7 contain the output values. */
25498 void
25499 ix86_split_long_move (rtx operands[])
25501 rtx part[2][4];
25502 int nparts, i, j;
25503 int push = 0;
25504 int collisions = 0;
25505 machine_mode mode = GET_MODE (operands[0]);
25506 bool collisionparts[4];
25508 /* The DFmode expanders may ask us to move double.
25509 For 64bit targets this is a single move. By hiding the fact
25510 here we simplify i386.md splitters. */
25511 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25513 /* Optimize constant pool reference to immediates. This is used by
25514 fp moves, that force all constants to memory to allow combining. */
25516 if (MEM_P (operands[1])
25517 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25518 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25519 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25520 if (push_operand (operands[0], VOIDmode))
25522 operands[0] = copy_rtx (operands[0]);
25523 PUT_MODE (operands[0], word_mode);
25525 else
25526 operands[0] = gen_lowpart (DImode, operands[0]);
25527 operands[1] = gen_lowpart (DImode, operands[1]);
25528 emit_move_insn (operands[0], operands[1]);
25529 return;
25532 /* The only non-offsettable memory we handle is push. */
25533 if (push_operand (operands[0], VOIDmode))
25534 push = 1;
25535 else
25536 gcc_assert (!MEM_P (operands[0])
25537 || offsettable_memref_p (operands[0]));
25539 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25540 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25542 /* When emitting a push, take care of source operands on the stack. */
25543 if (push && MEM_P (operands[1])
25544 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25546 rtx src_base = XEXP (part[1][nparts - 1], 0);
25548 /* Compensate for the stack decrement by 4. */
25549 if (!TARGET_64BIT && nparts == 3
25550 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25551 src_base = plus_constant (Pmode, src_base, 4);
25553 /* src_base refers to the stack pointer and is
25554 automatically decreased by emitted push. */
25555 for (i = 0; i < nparts; i++)
25556 part[1][i] = change_address (part[1][i],
25557 GET_MODE (part[1][i]), src_base);
25560 /* We need to do the copy in the right order in case an address register
25561 of the source overlaps the destination. */
25562 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25564 rtx tmp;
25566 for (i = 0; i < nparts; i++)
25568 collisionparts[i]
25569 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25570 if (collisionparts[i])
25571 collisions++;
25574 /* Collision in the middle part can be handled by reordering. */
25575 if (collisions == 1 && nparts == 3 && collisionparts [1])
25577 std::swap (part[0][1], part[0][2]);
25578 std::swap (part[1][1], part[1][2]);
25580 else if (collisions == 1
25581 && nparts == 4
25582 && (collisionparts [1] || collisionparts [2]))
25584 if (collisionparts [1])
25586 std::swap (part[0][1], part[0][2]);
25587 std::swap (part[1][1], part[1][2]);
25589 else
25591 std::swap (part[0][2], part[0][3]);
25592 std::swap (part[1][2], part[1][3]);
25596 /* If there are more collisions, we can't handle it by reordering.
25597 Do an lea to the last part and use only one colliding move. */
25598 else if (collisions > 1)
25600 rtx base, addr, tls_base = NULL_RTX;
25602 collisions = 1;
25604 base = part[0][nparts - 1];
25606 /* Handle the case when the last part isn't valid for lea.
25607 Happens in 64-bit mode storing the 12-byte XFmode. */
25608 if (GET_MODE (base) != Pmode)
25609 base = gen_rtx_REG (Pmode, REGNO (base));
25611 addr = XEXP (part[1][0], 0);
25612 if (TARGET_TLS_DIRECT_SEG_REFS)
25614 struct ix86_address parts;
25615 int ok = ix86_decompose_address (addr, &parts);
25616 gcc_assert (ok);
25617 if (parts.seg == DEFAULT_TLS_SEG_REG)
25619 /* It is not valid to use %gs: or %fs: in
25620 lea though, so we need to remove it from the
25621 address used for lea and add it to each individual
25622 memory load instead. */
25623 addr = copy_rtx (addr);
25624 rtx *x = &addr;
25625 while (GET_CODE (*x) == PLUS)
25627 for (i = 0; i < 2; i++)
25629 rtx u = XEXP (*x, i);
25630 if (GET_CODE (u) == ZERO_EXTEND)
25631 u = XEXP (u, 0);
25632 if (GET_CODE (u) == UNSPEC
25633 && XINT (u, 1) == UNSPEC_TP)
25635 tls_base = XEXP (*x, i);
25636 *x = XEXP (*x, 1 - i);
25637 break;
25640 if (tls_base)
25641 break;
25642 x = &XEXP (*x, 0);
25644 gcc_assert (tls_base);
25647 emit_insn (gen_rtx_SET (base, addr));
25648 if (tls_base)
25649 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
25650 part[1][0] = replace_equiv_address (part[1][0], base);
25651 for (i = 1; i < nparts; i++)
25653 if (tls_base)
25654 base = copy_rtx (base);
25655 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25656 part[1][i] = replace_equiv_address (part[1][i], tmp);
25661 if (push)
25663 if (!TARGET_64BIT)
25665 if (nparts == 3)
25667 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25668 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25669 stack_pointer_rtx, GEN_INT (-4)));
25670 emit_move_insn (part[0][2], part[1][2]);
25672 else if (nparts == 4)
25674 emit_move_insn (part[0][3], part[1][3]);
25675 emit_move_insn (part[0][2], part[1][2]);
25678 else
25680 /* In 64bit mode we don't have 32bit push available. In case this is
25681 register, it is OK - we will just use the larger counterpart. We also
25682 retype memory - this comes from an attempt to avoid a REX prefix on
25683 moving the second half of a TFmode value. */
25684 if (GET_MODE (part[1][1]) == SImode)
25686 switch (GET_CODE (part[1][1]))
25688 case MEM:
25689 part[1][1] = adjust_address (part[1][1], DImode, 0);
25690 break;
25692 case REG:
25693 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25694 break;
25696 default:
25697 gcc_unreachable ();
25700 if (GET_MODE (part[1][0]) == SImode)
25701 part[1][0] = part[1][1];
25704 emit_move_insn (part[0][1], part[1][1]);
25705 emit_move_insn (part[0][0], part[1][0]);
25706 return;
25709 /* Choose correct order to not overwrite the source before it is copied. */
25710 if ((REG_P (part[0][0])
25711 && REG_P (part[1][1])
25712 && (REGNO (part[0][0]) == REGNO (part[1][1])
25713 || (nparts == 3
25714 && REGNO (part[0][0]) == REGNO (part[1][2]))
25715 || (nparts == 4
25716 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25717 || (collisions > 0
25718 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25720 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25722 operands[2 + i] = part[0][j];
25723 operands[6 + i] = part[1][j];
25726 else
25728 for (i = 0; i < nparts; i++)
25730 operands[2 + i] = part[0][i];
25731 operands[6 + i] = part[1][i];
25735 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25736 if (optimize_insn_for_size_p ())
25738 for (j = 0; j < nparts - 1; j++)
25739 if (CONST_INT_P (operands[6 + j])
25740 && operands[6 + j] != const0_rtx
25741 && REG_P (operands[2 + j]))
25742 for (i = j; i < nparts - 1; i++)
25743 if (CONST_INT_P (operands[7 + i])
25744 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25745 operands[7 + i] = operands[2 + j];
25748 for (i = 0; i < nparts; i++)
25749 emit_move_insn (operands[2 + i], operands[6 + i]);
25751 return;
25754 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25755 left shift by a constant, either using a single shift or
25756 a sequence of add instructions. */
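/* For instance, a shift by 1 is always emitted as an add of the operand
   to itself, and a small count may become a short sequence of such adds
   when that is no more costly than a single shift.  */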
25758 static void
25759 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25761 rtx (*insn)(rtx, rtx, rtx);
25763 if (count == 1
25764 || (count * ix86_cost->add <= ix86_cost->shift_const
25765 && !optimize_insn_for_size_p ()))
25767 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25768 while (count-- > 0)
25769 emit_insn (insn (operand, operand, operand));
25771 else
25773 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25774 emit_insn (insn (operand, operand, GEN_INT (count)));
25778 void
25779 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25781 rtx (*gen_ashl3)(rtx, rtx, rtx);
25782 rtx (*gen_shld)(rtx, rtx, rtx);
25783 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25785 rtx low[2], high[2];
25786 int count;
25788 if (CONST_INT_P (operands[2]))
25790 split_double_mode (mode, operands, 2, low, high);
25791 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25793 if (count >= half_width)
25795 emit_move_insn (high[0], low[1]);
25796 emit_move_insn (low[0], const0_rtx);
25798 if (count > half_width)
25799 ix86_expand_ashl_const (high[0], count - half_width, mode);
25801 else
25803 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25805 if (!rtx_equal_p (operands[0], operands[1]))
25806 emit_move_insn (operands[0], operands[1]);
25808 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25809 ix86_expand_ashl_const (low[0], count, mode);
25811 return;
25814 split_double_mode (mode, operands, 1, low, high);
25816 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25818 if (operands[1] == const1_rtx)
25820 /* Assuming we've chosen QImode-capable registers, 1 << N
25821 can be done with two 32/64-bit shifts, no branches, no cmoves. */
25822 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25824 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25826 ix86_expand_clear (low[0]);
25827 ix86_expand_clear (high[0]);
25828 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25830 d = gen_lowpart (QImode, low[0]);
25831 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25832 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25833 emit_insn (gen_rtx_SET (d, s));
25835 d = gen_lowpart (QImode, high[0]);
25836 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25837 s = gen_rtx_NE (QImode, flags, const0_rtx);
25838 emit_insn (gen_rtx_SET (d, s));
25841 /* Otherwise, we can get the same results by manually performing
25842 a bit extract operation on bit 5/6, and then performing the two
25843 shifts. The two methods of getting 0/1 into low/high are exactly
25844 the same size. Avoiding the shift in the bit extract case helps
25845 pentium4 a bit; no one else seems to care much either way. */
25846 else
25848 machine_mode half_mode;
25849 rtx (*gen_lshr3)(rtx, rtx, rtx);
25850 rtx (*gen_and3)(rtx, rtx, rtx);
25851 rtx (*gen_xor3)(rtx, rtx, rtx);
25852 HOST_WIDE_INT bits;
25853 rtx x;
25855 if (mode == DImode)
25857 half_mode = SImode;
25858 gen_lshr3 = gen_lshrsi3;
25859 gen_and3 = gen_andsi3;
25860 gen_xor3 = gen_xorsi3;
25861 bits = 5;
25863 else
25865 half_mode = DImode;
25866 gen_lshr3 = gen_lshrdi3;
25867 gen_and3 = gen_anddi3;
25868 gen_xor3 = gen_xordi3;
25869 bits = 6;
25872 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25873 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25874 else
25875 x = gen_lowpart (half_mode, operands[2]);
25876 emit_insn (gen_rtx_SET (high[0], x));
25878 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25879 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25880 emit_move_insn (low[0], high[0]);
25881 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25884 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25885 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25886 return;
25889 if (operands[1] == constm1_rtx)
25891 /* For -1 << N, we can avoid the shld instruction, because we
25892 know that we're shifting 0...31/63 ones into a -1. */
25893 emit_move_insn (low[0], constm1_rtx);
25894 if (optimize_insn_for_size_p ())
25895 emit_move_insn (high[0], low[0]);
25896 else
25897 emit_move_insn (high[0], constm1_rtx);
25899 else
25901 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25903 if (!rtx_equal_p (operands[0], operands[1]))
25904 emit_move_insn (operands[0], operands[1]);
25906 split_double_mode (mode, operands, 1, low, high);
25907 emit_insn (gen_shld (high[0], low[0], operands[2]));
25910 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25912 if (TARGET_CMOVE && scratch)
25914 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25915 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25917 ix86_expand_clear (scratch);
25918 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25920 else
25922 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25923 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25925 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
25929 void
25930 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25932 rtx (*gen_ashr3)(rtx, rtx, rtx)
25933 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25934 rtx (*gen_shrd)(rtx, rtx, rtx);
25935 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25937 rtx low[2], high[2];
25938 int count;
25940 if (CONST_INT_P (operands[2]))
25942 split_double_mode (mode, operands, 2, low, high);
25943 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25945 if (count == GET_MODE_BITSIZE (mode) - 1)
25947 emit_move_insn (high[0], high[1]);
25948 emit_insn (gen_ashr3 (high[0], high[0],
25949 GEN_INT (half_width - 1)));
25950 emit_move_insn (low[0], high[0]);
25953 else if (count >= half_width)
25955 emit_move_insn (low[0], high[1]);
25956 emit_move_insn (high[0], low[0]);
25957 emit_insn (gen_ashr3 (high[0], high[0],
25958 GEN_INT (half_width - 1)));
25960 if (count > half_width)
25961 emit_insn (gen_ashr3 (low[0], low[0],
25962 GEN_INT (count - half_width)));
25964 else
25966 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25968 if (!rtx_equal_p (operands[0], operands[1]))
25969 emit_move_insn (operands[0], operands[1]);
25971 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25972 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25975 else
25977 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25979 if (!rtx_equal_p (operands[0], operands[1]))
25980 emit_move_insn (operands[0], operands[1]);
25982 split_double_mode (mode, operands, 1, low, high);
25984 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25985 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25987 if (TARGET_CMOVE && scratch)
25989 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25990 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25992 emit_move_insn (scratch, high[0]);
25993 emit_insn (gen_ashr3 (scratch, scratch,
25994 GEN_INT (half_width - 1)));
25995 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25996 scratch));
25998 else
26000 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26001 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26003 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
26008 void
26009 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26011 rtx (*gen_lshr3)(rtx, rtx, rtx)
26012 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26013 rtx (*gen_shrd)(rtx, rtx, rtx);
26014 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26016 rtx low[2], high[2];
26017 int count;
26019 if (CONST_INT_P (operands[2]))
26021 split_double_mode (mode, operands, 2, low, high);
26022 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26024 if (count >= half_width)
26026 emit_move_insn (low[0], high[1]);
26027 ix86_expand_clear (high[0]);
26029 if (count > half_width)
26030 emit_insn (gen_lshr3 (low[0], low[0],
26031 GEN_INT (count - half_width)));
26033 else
26035 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26037 if (!rtx_equal_p (operands[0], operands[1]))
26038 emit_move_insn (operands[0], operands[1]);
26040 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26041 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26044 else
26046 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26048 if (!rtx_equal_p (operands[0], operands[1]))
26049 emit_move_insn (operands[0], operands[1]);
26051 split_double_mode (mode, operands, 1, low, high);
26053 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26054 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26056 if (TARGET_CMOVE && scratch)
26058 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26059 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26061 ix86_expand_clear (scratch);
26062 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26063 scratch));
26065 else
26067 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26068 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26070 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26075 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
26076 static void
26077 predict_jump (int prob)
26079 rtx_insn *insn = get_last_insn ();
26080 gcc_assert (JUMP_P (insn));
26081 add_int_reg_note (insn, REG_BR_PROB, prob);
26084 /* Helper function for the string operations below. Tests whether VARIABLE
26085 is aligned to VALUE bytes. If true, jump to the label. */
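/* Concretely this emits (VARIABLE & VALUE) == 0 ? goto label : fall
   through, predicting the jump as 50% taken for epilogue tests and 90%
   taken otherwise.  */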
26086 static rtx_code_label *
26087 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26089 rtx_code_label *label = gen_label_rtx ();
26090 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26091 if (GET_MODE (variable) == DImode)
26092 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26093 else
26094 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26095 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26096 1, label);
26097 if (epilogue)
26098 predict_jump (REG_BR_PROB_BASE * 50 / 100);
26099 else
26100 predict_jump (REG_BR_PROB_BASE * 90 / 100);
26101 return label;
26104 /* Adjust COUNTER by the VALUE. */
26105 static void
26106 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26108 rtx (*gen_add)(rtx, rtx, rtx)
26109 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26111 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26114 /* Zero extend possibly SImode EXP to Pmode register. */
26116 ix86_zero_extend_to_Pmode (rtx exp)
26118 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26121 /* Divide COUNTREG by SCALE. */
26122 static rtx
26123 scale_counter (rtx countreg, int scale)
26125 rtx sc;
26127 if (scale == 1)
26128 return countreg;
26129 if (CONST_INT_P (countreg))
26130 return GEN_INT (INTVAL (countreg) / scale);
26131 gcc_assert (REG_P (countreg));
26133 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26134 GEN_INT (exact_log2 (scale)),
26135 NULL, 1, OPTAB_DIRECT);
26136 return sc;
26139 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26140 DImode for constant loop counts. */
26142 static machine_mode
26143 counter_mode (rtx count_exp)
26145 if (GET_MODE (count_exp) != VOIDmode)
26146 return GET_MODE (count_exp);
26147 if (!CONST_INT_P (count_exp))
26148 return Pmode;
26149 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26150 return DImode;
26151 return SImode;
26154 /* Copy the address to a Pmode register. This is used for x32 to
26155 truncate DImode TLS address to a SImode register. */
26157 static rtx
26158 ix86_copy_addr_to_reg (rtx addr)
26160 rtx reg;
26161 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
26163 reg = copy_addr_to_reg (addr);
26164 REG_POINTER (reg) = 1;
26165 return reg;
26167 else
26169 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
26170 reg = copy_to_mode_reg (DImode, addr);
26171 REG_POINTER (reg) = 1;
26172 return gen_rtx_SUBREG (SImode, reg, 0);
26176 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by SRCPTR
26177 to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size is COUNT,
26178 specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set
26179 memory by VALUE (supposed to be in MODE).
26181 The size is rounded down to a whole number of chunks moved at once.
26182 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
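/* As a rough sketch, for the copy (!ISSETMEM) case the emitted code
   behaves like

     size = count & ~(piece_size - 1);
     if (size == 0) goto done;   (this guard is only emitted when piece_size == 1)
     iter = 0;
     do
       {
         copy one MODE chunk at dest + iter and src + iter, UNROLL times;
         iter += piece_size;
       }
     while (iter < size);
     destptr += iter;
     srcptr += iter;
   done:

   where piece_size is GET_MODE_SIZE (MODE) * UNROLL.  */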
26185 static void
26186 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
26187 rtx destptr, rtx srcptr, rtx value,
26188 rtx count, machine_mode mode, int unroll,
26189 int expected_size, bool issetmem)
26191 rtx_code_label *out_label, *top_label;
26192 rtx iter, tmp;
26193 machine_mode iter_mode = counter_mode (count);
26194 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
26195 rtx piece_size = GEN_INT (piece_size_n);
26196 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
26197 rtx size;
26198 int i;
26200 top_label = gen_label_rtx ();
26201 out_label = gen_label_rtx ();
26202 iter = gen_reg_rtx (iter_mode);
26204 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
26205 NULL, 1, OPTAB_DIRECT);
26206 /* Those two should combine. */
26207 if (piece_size == const1_rtx)
26209 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
26210 true, out_label);
26211 predict_jump (REG_BR_PROB_BASE * 10 / 100);
26213 emit_move_insn (iter, const0_rtx);
26215 emit_label (top_label);
26217 tmp = convert_modes (Pmode, iter_mode, iter, true);
26219 /* This assert could be relaxed - in this case we'll need to compute
26220 the smallest power of two contained in PIECE_SIZE_N and pass it to
26221 offset_address. */
26222 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
26223 destmem = offset_address (destmem, tmp, piece_size_n);
26224 destmem = adjust_address (destmem, mode, 0);
26226 if (!issetmem)
26228 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
26229 srcmem = adjust_address (srcmem, mode, 0);
26231 /* When unrolling for chips that reorder memory reads and writes,
26232 we can save registers by using a single temporary.
26233 Also, using 4 temporaries is overkill in 32bit mode. */
26234 if (!TARGET_64BIT && 0)
26236 for (i = 0; i < unroll; i++)
26238 if (i)
26240 destmem =
26241 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26242 srcmem =
26243 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26245 emit_move_insn (destmem, srcmem);
26248 else
26250 rtx tmpreg[4];
26251 gcc_assert (unroll <= 4);
26252 for (i = 0; i < unroll; i++)
26254 tmpreg[i] = gen_reg_rtx (mode);
26255 if (i)
26257 srcmem =
26258 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26260 emit_move_insn (tmpreg[i], srcmem);
26262 for (i = 0; i < unroll; i++)
26264 if (i)
26266 destmem =
26267 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26269 emit_move_insn (destmem, tmpreg[i]);
26273 else
26274 for (i = 0; i < unroll; i++)
26276 if (i)
26277 destmem =
26278 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26279 emit_move_insn (destmem, value);
26282 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26283 true, OPTAB_LIB_WIDEN);
26284 if (tmp != iter)
26285 emit_move_insn (iter, tmp);
26287 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26288 true, top_label);
26289 if (expected_size != -1)
26291 expected_size /= GET_MODE_SIZE (mode) * unroll;
26292 if (expected_size == 0)
26293 predict_jump (0);
26294 else if (expected_size > REG_BR_PROB_BASE)
26295 predict_jump (REG_BR_PROB_BASE - 1);
26296 else
26297 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26299 else
26300 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26301 iter = ix86_zero_extend_to_Pmode (iter);
26302 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26303 true, OPTAB_LIB_WIDEN);
26304 if (tmp != destptr)
26305 emit_move_insn (destptr, tmp);
26306 if (!issetmem)
26308 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26309 true, OPTAB_LIB_WIDEN);
26310 if (tmp != srcptr)
26311 emit_move_insn (srcptr, tmp);
26313 emit_label (out_label);
26316 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
26317 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26318 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26319 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26320 ORIG_VALUE is the original value passed to memset to fill the memory with.
26321 Other arguments have the same meaning as for the previous function. */
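/* DESTEXP and SRCEXP built below are the final pointer values,
   i.e. pointer + count * chunk size, passed to the rep_stos/rep_mov
   patterns together with the scaled count register.  */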
26323 static void
26324 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26325 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26326 rtx count,
26327 machine_mode mode, bool issetmem)
26329 rtx destexp;
26330 rtx srcexp;
26331 rtx countreg;
26332 HOST_WIDE_INT rounded_count;
26334 /* If possible, it is shorter to use rep movs.
26335 TODO: Maybe it is better to move this logic to decide_alg. */
26336 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26337 && (!issetmem || orig_value == const0_rtx))
26338 mode = SImode;
26340 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26341 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26343 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26344 GET_MODE_SIZE (mode)));
26345 if (mode != QImode)
26347 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26348 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26349 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26351 else
26352 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26353 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26355 rounded_count
26356 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26357 destmem = shallow_copy_rtx (destmem);
26358 set_mem_size (destmem, rounded_count);
26360 else if (MEM_SIZE_KNOWN_P (destmem))
26361 clear_mem_size (destmem);
26363 if (issetmem)
26365 value = force_reg (mode, gen_lowpart (mode, value));
26366 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26368 else
26370 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26371 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26372 if (mode != QImode)
26374 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26375 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26376 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26378 else
26379 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26380 if (CONST_INT_P (count))
26382 rounded_count
26383 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26384 srcmem = shallow_copy_rtx (srcmem);
26385 set_mem_size (srcmem, rounded_count);
26387 else
26389 if (MEM_SIZE_KNOWN_P (srcmem))
26390 clear_mem_size (srcmem);
26392 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26393 destexp, srcexp));
26397 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26398 DESTMEM.
26399 SRC is passed by pointer to be updated on return.
26400 Return value is updated DST. */
26401 static rtx
26402 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26403 HOST_WIDE_INT size_to_move)
26405 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26406 enum insn_code code;
26407 machine_mode move_mode;
26408 int piece_size, i;
26410 /* Find the widest mode in which we could perform moves.
26411 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
26412 it until a move of such size is supported. */
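/* For instance, SIZE_TO_MOVE of 24 starts with 16-byte pieces (TImode
   here, possibly turned into a same-sized vector mode below) and keeps
   halving until a supported move mode is found.  */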
26413 piece_size = 1 << floor_log2 (size_to_move);
26414 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26415 code = optab_handler (mov_optab, move_mode);
26416 while (code == CODE_FOR_nothing && piece_size > 1)
26418 piece_size >>= 1;
26419 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26420 code = optab_handler (mov_optab, move_mode);
26423 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26424 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26425 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26427 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26428 move_mode = mode_for_vector (word_mode, nunits);
26429 code = optab_handler (mov_optab, move_mode);
26430 if (code == CODE_FOR_nothing)
26432 move_mode = word_mode;
26433 piece_size = GET_MODE_SIZE (move_mode);
26434 code = optab_handler (mov_optab, move_mode);
26437 gcc_assert (code != CODE_FOR_nothing);
26439 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26440 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26442 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26443 gcc_assert (size_to_move % piece_size == 0);
26444 adjust = GEN_INT (piece_size);
26445 for (i = 0; i < size_to_move; i += piece_size)
26447 /* We move from memory to memory, so we'll need to do it via
26448 a temporary register. */
26449 tempreg = gen_reg_rtx (move_mode);
26450 emit_insn (GEN_FCN (code) (tempreg, src));
26451 emit_insn (GEN_FCN (code) (dst, tempreg));
26453 emit_move_insn (destptr,
26454 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26455 emit_move_insn (srcptr,
26456 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26458 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26459 piece_size);
26460 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26461 piece_size);
26464 /* Update DST and SRC rtx. */
26465 *srcmem = src;
26466 return dst;
26469 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
26470 static void
26471 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26472 rtx destptr, rtx srcptr, rtx count, int max_size)
26474 rtx src, dest;
26475 if (CONST_INT_P (count))
26477 HOST_WIDE_INT countval = INTVAL (count);
26478 HOST_WIDE_INT epilogue_size = countval % max_size;
26479 int i;
26481 /* For now MAX_SIZE should be a power of 2. This assert could be
26482 relaxed, but it'll require a bit more complicated epilogue
26483 expanding. */
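/* The epilogue size is decomposed into its set bits; e.g. a remainder of
   7 with MAX_SIZE 8 is copied as one 4-byte, one 2-byte and one 1-byte
   move.  */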
26484 gcc_assert ((max_size & (max_size - 1)) == 0);
26485 for (i = max_size; i >= 1; i >>= 1)
26487 if (epilogue_size & i)
26488 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26490 return;
26492 if (max_size > 8)
26494 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26495 count, 1, OPTAB_DIRECT);
26496 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26497 count, QImode, 1, 4, false);
26498 return;
26501 /* When there are stringops, we can cheaply increase dest and src pointers.
26502 Otherwise we save code size by maintaining offset (zero is readily
26503 available from preceding rep operation) and using x86 addressing modes.
26505 if (TARGET_SINGLE_STRINGOP)
26507 if (max_size > 4)
26509 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26510 src = change_address (srcmem, SImode, srcptr);
26511 dest = change_address (destmem, SImode, destptr);
26512 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26513 emit_label (label);
26514 LABEL_NUSES (label) = 1;
26516 if (max_size > 2)
26518 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26519 src = change_address (srcmem, HImode, srcptr);
26520 dest = change_address (destmem, HImode, destptr);
26521 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26522 emit_label (label);
26523 LABEL_NUSES (label) = 1;
26525 if (max_size > 1)
26527 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26528 src = change_address (srcmem, QImode, srcptr);
26529 dest = change_address (destmem, QImode, destptr);
26530 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26531 emit_label (label);
26532 LABEL_NUSES (label) = 1;
26535 else
26537 rtx offset = force_reg (Pmode, const0_rtx);
26538 rtx tmp;
26540 if (max_size > 4)
26542 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26543 src = change_address (srcmem, SImode, srcptr);
26544 dest = change_address (destmem, SImode, destptr);
26545 emit_move_insn (dest, src);
26546 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26547 true, OPTAB_LIB_WIDEN);
26548 if (tmp != offset)
26549 emit_move_insn (offset, tmp);
26550 emit_label (label);
26551 LABEL_NUSES (label) = 1;
26553 if (max_size > 2)
26555 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26556 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26557 src = change_address (srcmem, HImode, tmp);
26558 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26559 dest = change_address (destmem, HImode, tmp);
26560 emit_move_insn (dest, src);
26561 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26562 true, OPTAB_LIB_WIDEN);
26563 if (tmp != offset)
26564 emit_move_insn (offset, tmp);
26565 emit_label (label);
26566 LABEL_NUSES (label) = 1;
26568 if (max_size > 1)
26570 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26571 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26572 src = change_address (srcmem, QImode, tmp);
26573 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26574 dest = change_address (destmem, QImode, tmp);
26575 emit_move_insn (dest, src);
26576 emit_label (label);
26577 LABEL_NUSES (label) = 1;
26582 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26583 with value PROMOTED_VAL.
26585 Return value is updated DST. */
26586 static rtx
26587 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26588 HOST_WIDE_INT size_to_move)
26590 rtx dst = destmem, adjust;
26591 enum insn_code code;
26592 machine_mode move_mode;
26593 int piece_size, i;
26595 /* Find the widest mode in which we could perform moves.
26596 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
26597 it until a move of such size is supported. */
26598 move_mode = GET_MODE (promoted_val);
26599 if (move_mode == VOIDmode)
26600 move_mode = QImode;
26601 if (size_to_move < GET_MODE_SIZE (move_mode))
26603 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
26604 promoted_val = gen_lowpart (move_mode, promoted_val);
26606 piece_size = GET_MODE_SIZE (move_mode);
26607 code = optab_handler (mov_optab, move_mode);
26608 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26610 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26612 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26613 gcc_assert (size_to_move % piece_size == 0);
26614 adjust = GEN_INT (piece_size);
26615 for (i = 0; i < size_to_move; i += piece_size)
26617 if (piece_size <= GET_MODE_SIZE (word_mode))
26619 emit_insn (gen_strset (destptr, dst, promoted_val));
26620 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26621 piece_size);
26622 continue;
26625 emit_insn (GEN_FCN (code) (dst, promoted_val));
26627 emit_move_insn (destptr,
26628 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26630 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26631 piece_size);
26634 /* Update DST rtx. */
26635 return dst;
26637 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26638 static void
26639 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26640 rtx count, int max_size)
26642 count =
26643 expand_simple_binop (counter_mode (count), AND, count,
26644 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26645 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26646 gen_lowpart (QImode, value), count, QImode,
26647 1, max_size / 2, true);
26650 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26651 static void
26652 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26653 rtx count, int max_size)
26655 rtx dest;
26657 if (CONST_INT_P (count))
26659 HOST_WIDE_INT countval = INTVAL (count);
26660 HOST_WIDE_INT epilogue_size = countval % max_size;
26661 int i;
26663 /* For now MAX_SIZE should be a power of 2. This assert could be
26664 relaxed, but it'll require a bit more complicated epilogue
26665 expanding. */
26666 gcc_assert ((max_size & (max_size - 1)) == 0);
26667 for (i = max_size; i >= 1; i >>= 1)
26669 if (epilogue_size & i)
26671 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26672 destmem = emit_memset (destmem, destptr, vec_value, i);
26673 else
26674 destmem = emit_memset (destmem, destptr, value, i);
26677 return;
26679 if (max_size > 32)
26681 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26682 return;
26684 if (max_size > 16)
26686 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26687 if (TARGET_64BIT)
26689 dest = change_address (destmem, DImode, destptr);
26690 emit_insn (gen_strset (destptr, dest, value));
26691 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26692 emit_insn (gen_strset (destptr, dest, value));
26694 else
26696 dest = change_address (destmem, SImode, destptr);
26697 emit_insn (gen_strset (destptr, dest, value));
26698 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26699 emit_insn (gen_strset (destptr, dest, value));
26700 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26701 emit_insn (gen_strset (destptr, dest, value));
26702 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26703 emit_insn (gen_strset (destptr, dest, value));
26705 emit_label (label);
26706 LABEL_NUSES (label) = 1;
26708 if (max_size > 8)
26710 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26711 if (TARGET_64BIT)
26713 dest = change_address (destmem, DImode, destptr);
26714 emit_insn (gen_strset (destptr, dest, value));
26716 else
26718 dest = change_address (destmem, SImode, destptr);
26719 emit_insn (gen_strset (destptr, dest, value));
26720 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26721 emit_insn (gen_strset (destptr, dest, value));
26723 emit_label (label);
26724 LABEL_NUSES (label) = 1;
26726 if (max_size > 4)
26728 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26729 dest = change_address (destmem, SImode, destptr);
26730 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26731 emit_label (label);
26732 LABEL_NUSES (label) = 1;
26734 if (max_size > 2)
26736 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26737 dest = change_address (destmem, HImode, destptr);
26738 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26739 emit_label (label);
26740 LABEL_NUSES (label) = 1;
26742 if (max_size > 1)
26744 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26745 dest = change_address (destmem, QImode, destptr);
26746 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26747 emit_label (label);
26748 LABEL_NUSES (label) = 1;
26752 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
26753 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26754 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26755 ignored.
26756 Return value is updated DESTMEM. */
26757 static rtx
26758 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26759 rtx destptr, rtx srcptr, rtx value,
26760 rtx vec_value, rtx count, int align,
26761 int desired_alignment, bool issetmem)
26763 int i;
26764 for (i = 1; i < desired_alignment; i <<= 1)
26766 if (align <= i)
26768 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26769 if (issetmem)
26771 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26772 destmem = emit_memset (destmem, destptr, vec_value, i);
26773 else
26774 destmem = emit_memset (destmem, destptr, value, i);
26776 else
26777 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26778 ix86_adjust_counter (count, i);
26779 emit_label (label);
26780 LABEL_NUSES (label) = 1;
26781 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26784 return destmem;
26787 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
26788 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26789 and jump to DONE_LABEL. */
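/* The emitted sequence stores or copies the first SIZE bytes of the block
   and then the last SIZE bytes ending at DESTPTR + COUNT; for lengths in
   SIZE..2*SIZE-1 the two ranges overlap but together cover the whole
   block.  */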
26790 static void
26791 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26792 rtx destptr, rtx srcptr,
26793 rtx value, rtx vec_value,
26794 rtx count, int size,
26795 rtx done_label, bool issetmem)
26797 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26798 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
26799 rtx modesize;
26800 int n;
26802 /* If we do not have a vector value to copy, we must reduce the size. */
26803 if (issetmem)
26805 if (!vec_value)
26807 if (GET_MODE (value) == VOIDmode && size > 8)
26808 mode = Pmode;
26809 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26810 mode = GET_MODE (value);
26812 else
26813 mode = GET_MODE (vec_value), value = vec_value;
26815 else
26817 /* Choose appropriate vector mode. */
26818 if (size >= 32)
26819 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26820 else if (size >= 16)
26821 mode = TARGET_SSE ? V16QImode : DImode;
26822 srcmem = change_address (srcmem, mode, srcptr);
26824 destmem = change_address (destmem, mode, destptr);
26825 modesize = GEN_INT (GET_MODE_SIZE (mode));
26826 gcc_assert (GET_MODE_SIZE (mode) <= size);
26827 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26829 if (issetmem)
26830 emit_move_insn (destmem, gen_lowpart (mode, value));
26831 else
26833 emit_move_insn (destmem, srcmem);
26834 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26836 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26839 destmem = offset_address (destmem, count, 1);
26840 destmem = offset_address (destmem, GEN_INT (-2 * size),
26841 GET_MODE_SIZE (mode));
26842 if (!issetmem)
26844 srcmem = offset_address (srcmem, count, 1);
26845 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26846 GET_MODE_SIZE (mode));
26848 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26850 if (issetmem)
26851 emit_move_insn (destmem, gen_lowpart (mode, value));
26852 else
26854 emit_move_insn (destmem, srcmem);
26855 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26857 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26859 emit_jump_insn (gen_jump (done_label));
26860 emit_barrier ();
26862 emit_label (label);
26863 LABEL_NUSES (label) = 1;
26866 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
26867 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26868 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
26869 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26870 DONE_LABEL is a label after the whole copying sequence. The label is created
26871 on demand if *DONE_LABEL is NULL.
26872 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
26873 bounds after the initial copies.
26875 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26876 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26877 we will dispatch to a library call for large blocks.
26879 In pseudocode we do:
26881 if (COUNT < SIZE)
26883 Assume that SIZE is 4. Bigger sizes are handled analogously
26884 if (COUNT & 4)
26886 copy 4 bytes from SRCPTR to DESTPTR
26887 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26888 goto done_label
26890 if (!COUNT)
26891 goto done_label;
26892 copy 1 byte from SRCPTR to DESTPTR
26893 if (COUNT & 2)
26895 copy 2 bytes from SRCPTR to DESTPTR
26896 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26899 else
26901 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26902 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26904 OLD_DESTPTR = DESTPTR;
26905 Align DESTPTR up to DESIRED_ALIGN
26906 SRCPTR += DESTPTR - OLD_DESTPTR
26907 COUNT -= DEST_PTR - OLD_DESTPTR
26908 if (DYNAMIC_CHECK)
26909 Round COUNT down to multiple of SIZE
26910 << optional caller supplied zero size guard is here >>
26911 << optional caller supplied dynamic check is here >>
26912 << caller supplied main copy loop is here >>
26914 done_label:
26916 static void
26917 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26918 rtx *destptr, rtx *srcptr,
26919 machine_mode mode,
26920 rtx value, rtx vec_value,
26921 rtx *count,
26922 rtx_code_label **done_label,
26923 int size,
26924 int desired_align,
26925 int align,
26926 unsigned HOST_WIDE_INT *min_size,
26927 bool dynamic_check,
26928 bool issetmem)
26930 rtx_code_label *loop_label = NULL, *label;
26931 int n;
26932 rtx modesize;
26933 int prolog_size = 0;
26934 rtx mode_value;
26936 /* Choose the proper value to copy. */
26937 if (issetmem && VECTOR_MODE_P (mode))
26938 mode_value = vec_value;
26939 else
26940 mode_value = value;
26941 gcc_assert (GET_MODE_SIZE (mode) <= size);
26943 /* See if block is big or small, handle small blocks. */
26944 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26946 int size2 = size;
26947 loop_label = gen_label_rtx ();
26949 if (!*done_label)
26950 *done_label = gen_label_rtx ();
26952 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26953 1, loop_label);
26954 size2 >>= 1;
26956 /* Handle sizes > 3. */
26957 for (;size2 > 2; size2 >>= 1)
26958 expand_small_movmem_or_setmem (destmem, srcmem,
26959 *destptr, *srcptr,
26960 value, vec_value,
26961 *count,
26962 size2, *done_label, issetmem);
26963 /* Nothing to copy? Jump to DONE_LABEL if so. */
26964 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26965 1, *done_label);
26967 /* Do a byte copy. */
26968 destmem = change_address (destmem, QImode, *destptr);
26969 if (issetmem)
26970 emit_move_insn (destmem, gen_lowpart (QImode, value));
26971 else
26973 srcmem = change_address (srcmem, QImode, *srcptr);
26974 emit_move_insn (destmem, srcmem);
26977 /* Handle sizes 2 and 3. */
26978 label = ix86_expand_aligntest (*count, 2, false);
26979 destmem = change_address (destmem, HImode, *destptr);
26980 destmem = offset_address (destmem, *count, 1);
26981 destmem = offset_address (destmem, GEN_INT (-2), 2);
26982 if (issetmem)
26983 emit_move_insn (destmem, gen_lowpart (HImode, value));
26984 else
26986 srcmem = change_address (srcmem, HImode, *srcptr);
26987 srcmem = offset_address (srcmem, *count, 1);
26988 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26989 emit_move_insn (destmem, srcmem);
26992 emit_label (label);
26993 LABEL_NUSES (label) = 1;
26994 emit_jump_insn (gen_jump (*done_label));
26995 emit_barrier ();
26997 else
26998 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26999 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27001 /* Start memcpy for COUNT >= SIZE. */
27002 if (loop_label)
27004 emit_label (loop_label);
27005 LABEL_NUSES (loop_label) = 1;
27008 /* Copy first desired_align bytes. */
27009 if (!issetmem)
27010 srcmem = change_address (srcmem, mode, *srcptr);
27011 destmem = change_address (destmem, mode, *destptr);
27012 modesize = GEN_INT (GET_MODE_SIZE (mode));
27013 for (n = 0; prolog_size < desired_align - align; n++)
27015 if (issetmem)
27016 emit_move_insn (destmem, mode_value);
27017 else
27019 emit_move_insn (destmem, srcmem);
27020 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27022 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27023 prolog_size += GET_MODE_SIZE (mode);
27027 /* Copy last SIZE bytes. */
27028 destmem = offset_address (destmem, *count, 1);
27029 destmem = offset_address (destmem,
27030 GEN_INT (-size - prolog_size),
27032 if (issetmem)
27033 emit_move_insn (destmem, mode_value);
27034 else
27036 srcmem = offset_address (srcmem, *count, 1);
27037 srcmem = offset_address (srcmem,
27038 GEN_INT (-size - prolog_size),
27040 emit_move_insn (destmem, srcmem);
27042 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27044 destmem = offset_address (destmem, modesize, 1);
27045 if (issetmem)
27046 emit_move_insn (destmem, mode_value);
27047 else
27049 srcmem = offset_address (srcmem, modesize, 1);
27050 emit_move_insn (destmem, srcmem);
27054 /* Align destination. */
27055 if (desired_align > 1 && desired_align > align)
27057 rtx saveddest = *destptr;
27059 gcc_assert (desired_align <= size);
27060 /* Align destptr up, place it in a new register. */
27061 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27062 GEN_INT (prolog_size),
27063 NULL_RTX, 1, OPTAB_DIRECT);
27064 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27065 REG_POINTER (*destptr) = 1;
27066 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27067 GEN_INT (-desired_align),
27068 *destptr, 1, OPTAB_DIRECT);
27069 /* See how many bytes we skipped. */
27070 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27071 *destptr,
27072 saveddest, 1, OPTAB_DIRECT);
27073 /* Adjust srcptr and count. */
27074 if (!issetmem)
27075 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27076 saveddest, *srcptr, 1, OPTAB_DIRECT);
27077 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27078 saveddest, *count, 1, OPTAB_DIRECT);
27079 /* We copied at most size + prolog_size. */
27080 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27081 *min_size
27082 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27083 else
27084 *min_size = 0;
27086 /* Our loops always round down the block size, but for dispatch to
27087 the library we need the precise value. */
27088 if (dynamic_check)
27089 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
27090 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27092 else
27094 gcc_assert (prolog_size == 0);
27095 /* Decrease count, so we won't end up copying the last word twice. */
27096 if (!CONST_INT_P (*count))
27097 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27098 constm1_rtx, *count, 1, OPTAB_DIRECT);
27099 else
27100 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27101 (unsigned HOST_WIDE_INT)size));
27102 if (*min_size)
27103 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
27108 /* This function is like the previous one, except here we know how many bytes
27109 need to be copied. That allows us to update alignment not only of DST, which
27110 is returned, but also of SRC, which is passed as a pointer for that
27111 reason. */
27112 static rtx
27113 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27114 rtx srcreg, rtx value, rtx vec_value,
27115 int desired_align, int align_bytes,
27116 bool issetmem)
27118 rtx src = NULL;
27119 rtx orig_dst = dst;
27120 rtx orig_src = NULL;
27121 int piece_size = 1;
27122 int copied_bytes = 0;
27124 if (!issetmem)
27126 gcc_assert (srcp != NULL);
27127 src = *srcp;
27128 orig_src = src;
27131 for (piece_size = 1;
27132 piece_size <= desired_align && copied_bytes < align_bytes;
27133 piece_size <<= 1)
27135 if (align_bytes & piece_size)
27137 if (issetmem)
27139 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27140 dst = emit_memset (dst, destreg, vec_value, piece_size);
27141 else
27142 dst = emit_memset (dst, destreg, value, piece_size);
27144 else
27145 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27146 copied_bytes += piece_size;
27149 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27150 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27151 if (MEM_SIZE_KNOWN_P (orig_dst))
27152 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27154 if (!issetmem)
27156 int src_align_bytes = get_mem_align_offset (src, desired_align
27157 * BITS_PER_UNIT);
27158 if (src_align_bytes >= 0)
27159 src_align_bytes = desired_align - src_align_bytes;
27160 if (src_align_bytes >= 0)
27162 unsigned int src_align;
27163 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
27165 if ((src_align_bytes & (src_align - 1))
27166 == (align_bytes & (src_align - 1)))
27167 break;
27169 if (src_align > (unsigned int) desired_align)
27170 src_align = desired_align;
27171 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
27172 set_mem_align (src, src_align * BITS_PER_UNIT);
27174 if (MEM_SIZE_KNOWN_P (orig_src))
27175 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
27176 *srcp = src;
27179 return dst;
27182 /* Return true if ALG can be used in current context.
27183 Assume we expand memset if MEMSET is true. */
27184 static bool
27185 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
27187 if (alg == no_stringop)
27188 return false;
27189 if (alg == vector_loop)
27190 return TARGET_SSE || TARGET_AVX;
27191 /* Algorithms using the rep prefix want at least edi and ecx;
27192 additionally, memset wants eax and memcpy wants esi. Don't
27193 consider such algorithms if the user has appropriated those
27194 registers for their own purposes, or if we have a non-default
27195 address space, since some string insns cannot override the segment. */
27196 if (alg == rep_prefix_1_byte
27197 || alg == rep_prefix_4_byte
27198 || alg == rep_prefix_8_byte)
27200 if (have_as)
27201 return false;
27202 if (fixed_regs[CX_REG]
27203 || fixed_regs[DI_REG]
27204 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
27205 return false;
27207 return true;
27210 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
27211 static enum stringop_alg
27212 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
27213 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
27214 bool memset, bool zero_memset, bool have_as,
27215 int *dynamic_check, bool *noalign, bool recur)
27217 const struct stringop_algs *algs;
27218 bool optimize_for_speed;
27219 int max = 0;
27220 const struct processor_costs *cost;
27221 int i;
27222 bool any_alg_usable_p = false;
27224 *noalign = false;
27225 *dynamic_check = -1;
27227 /* Even if the string operation call is cold, we still might spend a lot
27228 of time processing large blocks. */
27229 if (optimize_function_for_size_p (cfun)
27230 || (optimize_insn_for_size_p ()
27231 && (max_size < 256
27232 || (expected_size != -1 && expected_size < 256))))
27233 optimize_for_speed = false;
27234 else
27235 optimize_for_speed = true;
27237 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
27238 if (memset)
27239 algs = &cost->memset[TARGET_64BIT != 0];
27240 else
27241 algs = &cost->memcpy[TARGET_64BIT != 0];
27243 /* See maximal size for user defined algorithm. */
27244 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27246 enum stringop_alg candidate = algs->size[i].alg;
27247 bool usable = alg_usable_p (candidate, memset, have_as);
27248 any_alg_usable_p |= usable;
27250 if (candidate != libcall && candidate && usable)
27251 max = algs->size[i].max;
27254 /* If the expected size is not known but the max size is small enough
27255 that the inline version is a win, set the expected size into
27256 the range. */
27257 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
27258 && expected_size == -1)
27259 expected_size = min_size / 2 + max_size / 2;
27261 /* If user specified the algorithm, honor it if possible. */
27262 if (ix86_stringop_alg != no_stringop
27263 && alg_usable_p (ix86_stringop_alg, memset, have_as))
27264 return ix86_stringop_alg;
27265 /* rep; movq or rep; movl is the smallest variant. */
27266 else if (!optimize_for_speed)
27268 *noalign = true;
27269 if (!count || (count & 3) || (memset && !zero_memset))
27270 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27271 ? rep_prefix_1_byte : loop_1_byte;
27272 else
27273 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27274 ? rep_prefix_4_byte : loop;
27276 /* Very tiny blocks are best handled via the loop; REP is expensive to
27277 set up. */
27278 else if (expected_size != -1 && expected_size < 4)
27279 return loop_1_byte;
27280 else if (expected_size != -1)
27282 enum stringop_alg alg = libcall;
27283 bool alg_noalign = false;
27284 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27286 /* We get here if the algorithms that were not libcall-based
27287 were rep-prefix based and we are unable to use rep prefixes
27288 based on global register usage. Break out of the loop and
27289 use the heuristic below. */
27290 if (algs->size[i].max == 0)
27291 break;
27292 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27294 enum stringop_alg candidate = algs->size[i].alg;
27296 if (candidate != libcall
27297 && alg_usable_p (candidate, memset, have_as))
27299 alg = candidate;
27300 alg_noalign = algs->size[i].noalign;
27302 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
27303 last non-libcall inline algorithm. */
27304 if (TARGET_INLINE_ALL_STRINGOPS)
27306 /* When the current size is best copied by a libcall,
27307 but we are still forced to inline, run the heuristic below
27308 that will pick code for medium sized blocks. */
27309 if (alg != libcall)
27311 *noalign = alg_noalign;
27312 return alg;
27314 else if (!any_alg_usable_p)
27315 break;
27317 else if (alg_usable_p (candidate, memset, have_as))
27319 *noalign = algs->size[i].noalign;
27320 return candidate;
27325 /* When asked to inline the call anyway, try to pick a meaningful choice.
27326 We look for the maximal size of a block that is faster to copy by hand and
27327 take blocks of at most that size, guessing that the average size will
27328 be roughly half of the block.
27330 If this turns out to be bad, we might simply specify the preferred
27331 choice in ix86_costs. */
27332 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27333 && (algs->unknown_size == libcall
27334 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27336 enum stringop_alg alg;
27337 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27339 /* If there aren't any usable algorithms or if recursing already,
27340 then recursing on smaller sizes or same size isn't going to
27341 find anything. Just return the simple byte-at-a-time copy loop. */
27342 if (!any_alg_usable_p || recur)
27344 /* Pick something reasonable. */
27345 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27346 *dynamic_check = 128;
27347 return loop_1_byte;
27349 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27350 zero_memset, have_as, dynamic_check, noalign, true);
27351 gcc_assert (*dynamic_check == -1);
27352 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27353 *dynamic_check = max;
27354 else
27355 gcc_assert (alg != libcall);
27356 return alg;
27358 return (alg_usable_p (algs->unknown_size, memset, have_as)
27359 ? algs->unknown_size : libcall);
27362 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27363 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
27364 static int
27365 decide_alignment (int align,
27366 enum stringop_alg alg,
27367 int expected_size,
27368 machine_mode move_mode)
27370 int desired_align = 0;
27372 gcc_assert (alg != no_stringop);
27374 if (alg == libcall)
27375 return 0;
27376 if (move_mode == VOIDmode)
27377 return 0;
27379 desired_align = GET_MODE_SIZE (move_mode);
27380 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
27381 copying a whole cacheline at once. */
27382 if (TARGET_PENTIUMPRO
27383 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27384 desired_align = 8;
27386 if (optimize_size)
27387 desired_align = 1;
27388 if (desired_align < align)
27389 desired_align = align;
27390 if (expected_size != -1 && expected_size < 4)
27391 desired_align = align;
27393 return desired_align;
27397 /* Helper function for memset. For QImode value 0xXY produce
27398 0xXYXYXYXY of the width specified by MODE. This is essentially
27399 a * 0x01010101, but we can do slightly better than
27400 synth_mult by unwinding the sequence by hand on CPUs with
27401 slow multiply. */
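/* For instance, promoting the byte 0xAB to SImode yields 0xABABABAB, either
   via the multiplication above or via the explicit shift-and-IOR steps
   emitted below.  */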
27402 static rtx
27403 promote_duplicated_reg (machine_mode mode, rtx val)
27405 machine_mode valmode = GET_MODE (val);
27406 rtx tmp;
27407 int nops = mode == DImode ? 3 : 2;
27409 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27410 if (val == const0_rtx)
27411 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27412 if (CONST_INT_P (val))
27414 HOST_WIDE_INT v = INTVAL (val) & 255;
27416 v |= v << 8;
27417 v |= v << 16;
27418 if (mode == DImode)
27419 v |= (v << 16) << 16;
27420 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27423 if (valmode == VOIDmode)
27424 valmode = QImode;
27425 if (valmode != QImode)
27426 val = gen_lowpart (QImode, val);
27427 if (mode == QImode)
27428 return val;
27429 if (!TARGET_PARTIAL_REG_STALL)
27430 nops--;
27431 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27432 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27433 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27434 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27436 rtx reg = convert_modes (mode, QImode, val, true);
27437 tmp = promote_duplicated_reg (mode, const1_rtx);
27438 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27439 OPTAB_DIRECT);
27441 else
27443 rtx reg = convert_modes (mode, QImode, val, true);
27445 if (!TARGET_PARTIAL_REG_STALL)
27446 if (mode == SImode)
27447 emit_insn (gen_insvsi_1 (reg, reg));
27448 else
27449 emit_insn (gen_insvdi_1 (reg, reg));
27450 else
27452 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27453 NULL, 1, OPTAB_DIRECT);
27454 reg =
27455 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27457 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27458 NULL, 1, OPTAB_DIRECT);
27459 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27460 if (mode == SImode)
27461 return reg;
27462 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27463 NULL, 1, OPTAB_DIRECT);
27464 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27465 return reg;
27469 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
27470 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
27471 getting alignment from ALIGN to DESIRED_ALIGN. */
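/* E.g. a 64-bit memset whose main loop stores DImode chunks gets VAL
   replicated into a DImode register; smaller chunk sizes only need SImode or
   HImode replication.  */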
27472 static rtx
27473 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27474 int align)
27476 rtx promoted_val;
27478 if (TARGET_64BIT
27479 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27480 promoted_val = promote_duplicated_reg (DImode, val);
27481 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27482 promoted_val = promote_duplicated_reg (SImode, val);
27483 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27484 promoted_val = promote_duplicated_reg (HImode, val);
27485 else
27486 promoted_val = val;
27488 return promoted_val;
27491 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27492 operations when profitable. The code depends upon architecture, block size
27493 and alignment, but always has one of the following overall structures:
27495 Aligned move sequence:
27497 1) Prologue guard: Conditional that jumps up to epilogues for small
27498 blocks that can be handled by the epilogue alone. This is faster
27499 but also needed for correctness, since the prologue assumes the block
27500 is larger than the desired alignment.
27502 Optional dynamic check for size and libcall for large
27503 blocks is emitted here too, with -minline-stringops-dynamically.
27505 2) Prologue: copy first few bytes in order to get destination
27506 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27507 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27508 copied. We emit either a jump tree on power of two sized
27509 blocks, or a byte loop.
27511 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27512 with specified algorithm.
27514 4) Epilogue: code copying tail of the block that is too small to be
27515 handled by main body (or up to size guarded by prologue guard).
27517 Misaligned move sequence
27519 1) misaligned move prologue/epilogue containing:
27520 a) Prologue handling small memory blocks and jumping to done_label
27521 (skipped if blocks are known to be large enough)
27522 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
27523 needed by single possibly misaligned move
27524 (skipped if alignment is not needed)
27525 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27527 2) Zero size guard dispatching to done_label, if needed
27529 3) dispatch to library call, if needed,
27531 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27532 with specified algorithm. */
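/* Return true if the operation was expanded inline; a false return value
   leaves the expansion to the caller (typically a library call).  */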
27533 bool
27534 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27535 rtx align_exp, rtx expected_align_exp,
27536 rtx expected_size_exp, rtx min_size_exp,
27537 rtx max_size_exp, rtx probable_max_size_exp,
27538 bool issetmem)
27540 rtx destreg;
27541 rtx srcreg = NULL;
27542 rtx_code_label *label = NULL;
27543 rtx tmp;
27544 rtx_code_label *jump_around_label = NULL;
27545 HOST_WIDE_INT align = 1;
27546 unsigned HOST_WIDE_INT count = 0;
27547 HOST_WIDE_INT expected_size = -1;
27548 int size_needed = 0, epilogue_size_needed;
27549 int desired_align = 0, align_bytes = 0;
27550 enum stringop_alg alg;
27551 rtx promoted_val = NULL;
27552 rtx vec_promoted_val = NULL;
27553 bool force_loopy_epilogue = false;
27554 int dynamic_check;
27555 bool need_zero_guard = false;
27556 bool noalign;
27557 machine_mode move_mode = VOIDmode;
27558 int unroll_factor = 1;
27559 /* TODO: Once value ranges are available, fill in proper data. */
27560 unsigned HOST_WIDE_INT min_size = 0;
27561 unsigned HOST_WIDE_INT max_size = -1;
27562 unsigned HOST_WIDE_INT probable_max_size = -1;
27563 bool misaligned_prologue_used = false;
27564 bool have_as;
27566 if (CONST_INT_P (align_exp))
27567 align = INTVAL (align_exp);
27568 /* i386 can do misaligned access at a reasonably increased cost. */
27569 if (CONST_INT_P (expected_align_exp)
27570 && INTVAL (expected_align_exp) > align)
27571 align = INTVAL (expected_align_exp);
27572 /* ALIGN is the minimum of destination and source alignment, but we care here
27573 just about destination alignment. */
27574 else if (!issetmem
27575 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27576 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27578 if (CONST_INT_P (count_exp))
27580 min_size = max_size = probable_max_size = count = expected_size
27581 = INTVAL (count_exp);
27582 /* When COUNT is 0, there is nothing to do. */
27583 if (!count)
27584 return true;
27586 else
27588 if (min_size_exp)
27589 min_size = INTVAL (min_size_exp);
27590 if (max_size_exp)
27591 max_size = INTVAL (max_size_exp);
27592 if (probable_max_size_exp)
27593 probable_max_size = INTVAL (probable_max_size_exp);
27594 if (CONST_INT_P (expected_size_exp))
27595 expected_size = INTVAL (expected_size_exp);
27598 /* Make sure we don't need to care about overflow later on. */
27599 if (count > (HOST_WIDE_INT_1U << 30))
27600 return false;
27602 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27603 if (!issetmem)
27604 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27606 /* Step 0: Decide on preferred algorithm, desired alignment and
27607 size of chunks to be copied by main loop. */
27608 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27609 issetmem,
27610 issetmem && val_exp == const0_rtx, have_as,
27611 &dynamic_check, &noalign, false);
27612 if (alg == libcall)
27613 return false;
27614 gcc_assert (alg != no_stringop);
27616 /* For now the vector version of memset is generated only for memory zeroing,
27617 as creating a promoted vector value is very cheap in this case. */
27618 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27619 alg = unrolled_loop;
27621 if (!count)
27622 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27623 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27624 if (!issetmem)
27625 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27627 unroll_factor = 1;
27628 move_mode = word_mode;
27629 switch (alg)
27631 case libcall:
27632 case no_stringop:
27633 case last_alg:
27634 gcc_unreachable ();
27635 case loop_1_byte:
27636 need_zero_guard = true;
27637 move_mode = QImode;
27638 break;
27639 case loop:
27640 need_zero_guard = true;
27641 break;
27642 case unrolled_loop:
27643 need_zero_guard = true;
27644 unroll_factor = (TARGET_64BIT ? 4 : 2);
27645 break;
27646 case vector_loop:
27647 need_zero_guard = true;
27648 unroll_factor = 4;
27649 /* Find the widest supported mode. */
27650 move_mode = word_mode;
27651 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
27652 != CODE_FOR_nothing)
27653 move_mode = GET_MODE_WIDER_MODE (move_mode);
27655 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27656 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27657 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27659 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27660 move_mode = mode_for_vector (word_mode, nunits);
27661 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27662 move_mode = word_mode;
27664 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27665 break;
27666 case rep_prefix_8_byte:
27667 move_mode = DImode;
27668 break;
27669 case rep_prefix_4_byte:
27670 move_mode = SImode;
27671 break;
27672 case rep_prefix_1_byte:
27673 move_mode = QImode;
27674 break;
27676 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27677 epilogue_size_needed = size_needed;
27679 /* If we are going to call any library calls conditionally, make sure any
27680 pending stack adjustment happen before the first conditional branch,
27681 otherwise they will be emitted before the library call only and won't
27682 happen from the other branches. */
27683 if (dynamic_check != -1)
27684 do_pending_stack_adjust ();
27686 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27687 if (!TARGET_ALIGN_STRINGOPS || noalign)
27688 align = desired_align;
27690 /* Step 1: Prologue guard. */
27692 /* Alignment code needs count to be in register. */
27693 if (CONST_INT_P (count_exp) && desired_align > align)
27695 if (INTVAL (count_exp) > desired_align
27696 && INTVAL (count_exp) > size_needed)
27698 align_bytes
27699 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27700 if (align_bytes <= 0)
27701 align_bytes = 0;
27702 else
27703 align_bytes = desired_align - align_bytes;
27705 if (align_bytes == 0)
27706 count_exp = force_reg (counter_mode (count_exp), count_exp);
27708 gcc_assert (desired_align >= 1 && align >= 1);
27710 /* Misaligned move sequences handle both prologue and epilogue at once.
27711 Default code generation results in smaller code for large alignments
27712 and also avoids redundant work when sizes are known precisely. */
27713 misaligned_prologue_used
27714 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27715 && MAX (desired_align, epilogue_size_needed) <= 32
27716 && desired_align <= epilogue_size_needed
27717 && ((desired_align > align && !align_bytes)
27718 || (!count && epilogue_size_needed > 1)));
27720 /* Do the cheap promotion to allow better CSE across the
27721 main loop and epilogue (i.e. one load of the big constant in
27722 front of all code).
27723 For now the misaligned move sequences do not have a fast path
27724 without broadcasting. */
27725 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27727 if (alg == vector_loop)
27729 gcc_assert (val_exp == const0_rtx);
27730 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27731 promoted_val = promote_duplicated_reg_to_size (val_exp,
27732 GET_MODE_SIZE (word_mode),
27733 desired_align, align);
27735 else
27737 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27738 desired_align, align);
27741 /* Misaligned move sequences handle both prologues and epilogues at once.
27742 Default code generation results in smaller code for large alignments and
27743 also avoids redundant work when sizes are known precisely. */
27744 if (misaligned_prologue_used)
27746 /* Misaligned move prologue handles small blocks by itself. */
27747 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27748 (dst, src, &destreg, &srcreg,
27749 move_mode, promoted_val, vec_promoted_val,
27750 &count_exp,
27751 &jump_around_label,
27752 desired_align < align
27753 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27754 desired_align, align, &min_size, dynamic_check, issetmem);
27755 if (!issetmem)
27756 src = change_address (src, BLKmode, srcreg);
27757 dst = change_address (dst, BLKmode, destreg);
27758 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27759 epilogue_size_needed = 0;
27760 if (need_zero_guard
27761 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27763 /* It is possible that we copied enough so the main loop will not
27764 execute. */
27765 gcc_assert (size_needed > 1);
27766 if (jump_around_label == NULL_RTX)
27767 jump_around_label = gen_label_rtx ();
27768 emit_cmp_and_jump_insns (count_exp,
27769 GEN_INT (size_needed),
27770 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27771 if (expected_size == -1
27772 || expected_size < (desired_align - align) / 2 + size_needed)
27773 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27774 else
27775 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27778 /* Ensure that the alignment prologue won't copy past the end of the block. */
27779 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27781 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27782 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27783 Make sure it is a power of 2. */
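/* For instance, with size_needed == 12 (and desired_align - align smaller
   than that) the statement below yields epilogue_size_needed == 16.  */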
27784 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27786 /* To improve performance of small blocks, we jump around the VAL
27787 promoting mode. This means that if the promoted VAL is not constant,
27788 we might not use it in the epilogue and have to use the byte
27789 loop variant. */
27790 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27791 force_loopy_epilogue = true;
27792 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27793 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27795 /* If main algorithm works on QImode, no epilogue is needed.
27796 For small sizes just don't align anything. */
27797 if (size_needed == 1)
27798 desired_align = align;
27799 else
27800 goto epilogue;
27802 else if (!count
27803 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27805 label = gen_label_rtx ();
27806 emit_cmp_and_jump_insns (count_exp,
27807 GEN_INT (epilogue_size_needed),
27808 LTU, 0, counter_mode (count_exp), 1, label);
27809 if (expected_size == -1 || expected_size < epilogue_size_needed)
27810 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27811 else
27812 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27816 /* Emit code to decide on runtime whether library call or inline should be
27817 used. */
27818 if (dynamic_check != -1)
27820 if (!issetmem && CONST_INT_P (count_exp))
27822 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27824 emit_block_copy_via_libcall (dst, src, count_exp);
27825 count_exp = const0_rtx;
27826 goto epilogue;
27829 else
27831 rtx_code_label *hot_label = gen_label_rtx ();
27832 if (jump_around_label == NULL_RTX)
27833 jump_around_label = gen_label_rtx ();
27834 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27835 LEU, 0, counter_mode (count_exp),
27836 1, hot_label);
27837 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27838 if (issetmem)
27839 set_storage_via_libcall (dst, count_exp, val_exp);
27840 else
27841 emit_block_copy_via_libcall (dst, src, count_exp);
27842 emit_jump (jump_around_label);
27843 emit_label (hot_label);
27847 /* Step 2: Alignment prologue. */
27848 /* Do the expensive promotion once we branched off the small blocks. */
27849 if (issetmem && !promoted_val)
27850 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27851 desired_align, align);
27853 if (desired_align > align && !misaligned_prologue_used)
27855 if (align_bytes == 0)
27857 /* Except for the first move in the prologue, we no longer know
27858 the constant offset in aliasing info. It doesn't seem worth
27859 the pain to maintain it for the first move, so throw away
27860 the info early. */
27861 dst = change_address (dst, BLKmode, destreg);
27862 if (!issetmem)
27863 src = change_address (src, BLKmode, srcreg);
27864 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27865 promoted_val, vec_promoted_val,
27866 count_exp, align, desired_align,
27867 issetmem);
27868 /* At most desired_align - align bytes are copied. */
27869 if (min_size < (unsigned)(desired_align - align))
27870 min_size = 0;
27871 else
27872 min_size -= desired_align - align;
27874 else
27876 /* If we know how many bytes need to be stored before dst is
27877 sufficiently aligned, maintain aliasing info accurately. */
27878 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27879 srcreg,
27880 promoted_val,
27881 vec_promoted_val,
27882 desired_align,
27883 align_bytes,
27884 issetmem);
27886 count_exp = plus_constant (counter_mode (count_exp),
27887 count_exp, -align_bytes);
27888 count -= align_bytes;
27889 min_size -= align_bytes;
27890 max_size -= align_bytes;
27892 if (need_zero_guard
27893 && min_size < (unsigned HOST_WIDE_INT) size_needed
27894 && (count < (unsigned HOST_WIDE_INT) size_needed
27895 || (align_bytes == 0
27896 && count < ((unsigned HOST_WIDE_INT) size_needed
27897 + desired_align - align))))
27899 /* It is possible that we copied enough so the main loop will not
27900 execute. */
27901 gcc_assert (size_needed > 1);
27902 if (label == NULL_RTX)
27903 label = gen_label_rtx ();
27904 emit_cmp_and_jump_insns (count_exp,
27905 GEN_INT (size_needed),
27906 LTU, 0, counter_mode (count_exp), 1, label);
27907 if (expected_size == -1
27908 || expected_size < (desired_align - align) / 2 + size_needed)
27909 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27910 else
27911 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27914 if (label && size_needed == 1)
27916 emit_label (label);
27917 LABEL_NUSES (label) = 1;
27918 label = NULL;
27919 epilogue_size_needed = 1;
27920 if (issetmem)
27921 promoted_val = val_exp;
27923 else if (label == NULL_RTX && !misaligned_prologue_used)
27924 epilogue_size_needed = size_needed;
27926 /* Step 3: Main loop. */
27928 switch (alg)
27930 case libcall:
27931 case no_stringop:
27932 case last_alg:
27933 gcc_unreachable ();
27934 case loop_1_byte:
27935 case loop:
27936 case unrolled_loop:
27937 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27938 count_exp, move_mode, unroll_factor,
27939 expected_size, issetmem);
27940 break;
27941 case vector_loop:
27942 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27943 vec_promoted_val, count_exp, move_mode,
27944 unroll_factor, expected_size, issetmem);
27945 break;
27946 case rep_prefix_8_byte:
27947 case rep_prefix_4_byte:
27948 case rep_prefix_1_byte:
27949 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27950 val_exp, count_exp, move_mode, issetmem);
27951 break;
27953 /* Properly adjust the offset of src and dest memory for aliasing. */
27954 if (CONST_INT_P (count_exp))
27956 if (!issetmem)
27957 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27958 (count / size_needed) * size_needed);
27959 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27960 (count / size_needed) * size_needed);
27962 else
27964 if (!issetmem)
27965 src = change_address (src, BLKmode, srcreg);
27966 dst = change_address (dst, BLKmode, destreg);
27969 /* Step 4: Epilogue to copy the remaining bytes. */
27970 epilogue:
27971 if (label)
27973 /* When the main loop is done, COUNT_EXP might hold original count,
27974 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27975 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27976 bytes. Compensate if needed. */
27978 if (size_needed < epilogue_size_needed)
27980 tmp =
27981 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27982 GEN_INT (size_needed - 1), count_exp, 1,
27983 OPTAB_DIRECT);
27984 if (tmp != count_exp)
27985 emit_move_insn (count_exp, tmp);
27987 emit_label (label);
27988 LABEL_NUSES (label) = 1;
27991 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27993 if (force_loopy_epilogue)
27994 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27995 epilogue_size_needed);
27996 else
27998 if (issetmem)
27999 expand_setmem_epilogue (dst, destreg, promoted_val,
28000 vec_promoted_val, count_exp,
28001 epilogue_size_needed);
28002 else
28003 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28004 epilogue_size_needed);
28007 if (jump_around_label)
28008 emit_label (jump_around_label);
28009 return true;
28013 /* Expand the appropriate insns for doing strlen if not just doing
28014 repnz; scasb
28016 out = result, initialized with the start address
28017 align_rtx = alignment of the address.
28018 scratch = scratch register, initialized with the start address when
28019 not aligned, otherwise undefined
28021 This is just the body. It needs the initializations mentioned above and
28022 some address computing at the end. These things are done in i386.md. */
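/* Roughly, the code below checks up to three leading bytes one at a time
   until OUT is 4-byte aligned, then scans SImode words using a bit trick
   that detects a zero byte, and finally adjusts OUT to point at the
   terminating zero.  */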
28024 static void
28025 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
28027 int align;
28028 rtx tmp;
28029 rtx_code_label *align_2_label = NULL;
28030 rtx_code_label *align_3_label = NULL;
28031 rtx_code_label *align_4_label = gen_label_rtx ();
28032 rtx_code_label *end_0_label = gen_label_rtx ();
28033 rtx mem;
28034 rtx tmpreg = gen_reg_rtx (SImode);
28035 rtx scratch = gen_reg_rtx (SImode);
28036 rtx cmp;
28038 align = 0;
28039 if (CONST_INT_P (align_rtx))
28040 align = INTVAL (align_rtx);
28042 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
28044 /* Is there a known alignment and is it less than 4? */
28045 if (align < 4)
28047 rtx scratch1 = gen_reg_rtx (Pmode);
28048 emit_move_insn (scratch1, out);
28049 /* Is there a known alignment and is it not 2? */
28050 if (align != 2)
28052 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28053 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28055 /* Leave just the 3 lower bits. */
28056 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28057 NULL_RTX, 0, OPTAB_WIDEN);
28059 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28060 Pmode, 1, align_4_label);
28061 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28062 Pmode, 1, align_2_label);
28063 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28064 Pmode, 1, align_3_label);
28066 else
28068 /* Since the alignment is 2, we have to check 2 or 0 bytes;
28069 check if it is aligned to 4 bytes. */
28071 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28072 NULL_RTX, 0, OPTAB_WIDEN);
28074 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28075 Pmode, 1, align_4_label);
28078 mem = change_address (src, QImode, out);
28080 /* Now compare the bytes. */
28082 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
28083 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28084 QImode, 1, end_0_label);
28086 /* Increment the address. */
28087 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28089 /* Not needed with an alignment of 2 */
28090 if (align != 2)
28092 emit_label (align_2_label);
28094 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28095 end_0_label);
28097 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28099 emit_label (align_3_label);
28102 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28103 end_0_label);
28105 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28108 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
28109 align this loop; it only makes the program larger and does not help to
28110 speed it up. */
28111 emit_label (align_4_label);
28113 mem = change_address (src, SImode, out);
28114 emit_move_insn (scratch, mem);
28115 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28117 /* This formula yields a nonzero result iff one of the bytes is zero.
28118 This saves three branches inside the loop and many cycles. */
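/* The computation below is the classic zero-byte test
   (v - 0x01010101) & ~v & 0x80808080, which is nonzero exactly when some
   byte of the loaded word v is zero.  */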
28120 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28121 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28122 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28123 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28124 gen_int_mode (0x80808080, SImode)));
28125 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28126 align_4_label);
28128 if (TARGET_CMOVE)
28130 rtx reg = gen_reg_rtx (SImode);
28131 rtx reg2 = gen_reg_rtx (Pmode);
28132 emit_move_insn (reg, tmpreg);
28133 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28135 /* If zero is not in the first two bytes, move two bytes forward. */
28136 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28137 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28138 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28139 emit_insn (gen_rtx_SET (tmpreg,
28140 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28141 reg,
28142 tmpreg)));
28143 /* Emit lea manually to avoid clobbering of flags. */
28144 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28146 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28147 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28148 emit_insn (gen_rtx_SET (out,
28149 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28150 reg2,
28151 out)));
28153 else
28155 rtx_code_label *end_2_label = gen_label_rtx ();
28156 /* Is zero in the first two bytes? */
28158 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28159 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28160 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
28161 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28162 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
28163 pc_rtx);
28164 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
28165 JUMP_LABEL (tmp) = end_2_label;
28167 /* Not in the first two. Move two bytes forward. */
28168 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
28169 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
28171 emit_label (end_2_label);
28175 /* Avoid branch in fixing the byte. */
28176 tmpreg = gen_lowpart (QImode, tmpreg);
28177 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
28178 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
28179 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
28180 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
28182 emit_label (end_0_label);
28185 /* Expand strlen. */
28187 bool
28188 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
28190 rtx addr, scratch1, scratch2, scratch3, scratch4;
28192 /* The generic case of the strlen expander is long. Avoid its
28193 expansion unless TARGET_INLINE_ALL_STRINGOPS. */
28195 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28196 && !TARGET_INLINE_ALL_STRINGOPS
28197 && !optimize_insn_for_size_p ()
28198 && (!CONST_INT_P (align) || INTVAL (align) < 4))
28199 return false;
28201 addr = force_reg (Pmode, XEXP (src, 0));
28202 scratch1 = gen_reg_rtx (Pmode);
28204 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28205 && !optimize_insn_for_size_p ())
28207 /* Well it seems that some optimizer does not combine a call like
28208 foo(strlen(bar), strlen(bar));
28209 when the move and the subtraction are done here. It does calculate
28210 the length just once when these instructions are done inside
28211 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
28212 often used and I use one fewer register for the lifetime of
28213 output_strlen_unroll(), this is better. */
28215 emit_move_insn (out, addr);
28217 ix86_expand_strlensi_unroll_1 (out, src, align);
28219 /* strlensi_unroll_1 returns the address of the zero at the end of
28220 the string, like memchr(), so compute the length by subtracting
28221 the start address. */
28222 emit_insn (ix86_gen_sub3 (out, out, addr));
28224 else
28226 rtx unspec;
28228 /* Can't use this if the user has appropriated eax, ecx, or edi. */
28229 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
28230 return false;
28231 /* Can't use this for non-default address spaces. */
28232 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
28233 return false;
28235 scratch2 = gen_reg_rtx (Pmode);
28236 scratch3 = gen_reg_rtx (Pmode);
28237 scratch4 = force_reg (Pmode, constm1_rtx);
28239 emit_move_insn (scratch3, addr);
28240 eoschar = force_reg (QImode, eoschar);
28242 src = replace_equiv_address_nv (src, scratch3);
28244 /* If .md starts supporting :P, this can be done in .md. */
28245 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
28246 scratch4), UNSPEC_SCAS);
28247 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
28248 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
28249 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
28251 return true;
28254 /* For a given symbol (function) construct code to compute the address of its
28255 PLT entry in the large x86-64 PIC model. */
28256 static rtx
28257 construct_plt_address (rtx symbol)
28259 rtx tmp, unspec;
28261 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
28262 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
28263 gcc_assert (Pmode == DImode);
28265 tmp = gen_reg_rtx (Pmode);
28266 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28268 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28269 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28270 return tmp;
28274 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28275 rtx callarg2,
28276 rtx pop, bool sibcall)
28278 rtx vec[3];
28279 rtx use = NULL, call;
28280 unsigned int vec_len = 0;
28281 tree fndecl;
28283 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28285 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28286 if (fndecl
28287 && (lookup_attribute ("interrupt",
28288 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28289 error ("interrupt service routine can't be called directly");
28291 else
28292 fndecl = NULL_TREE;
28294 if (pop == const0_rtx)
28295 pop = NULL;
28296 gcc_assert (!TARGET_64BIT || !pop);
28298 if (TARGET_MACHO && !TARGET_64BIT)
28300 #if TARGET_MACHO
28301 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28302 fnaddr = machopic_indirect_call_target (fnaddr);
28303 #endif
28305 else
28307 /* Static functions and indirect calls don't need the pic register. Also,
28308 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
28309 it an indirect call. */
28310 rtx addr = XEXP (fnaddr, 0);
28311 if (flag_pic
28312 && GET_CODE (addr) == SYMBOL_REF
28313 && !SYMBOL_REF_LOCAL_P (addr))
28315 if (flag_plt
28316 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28317 || !lookup_attribute ("noplt",
28318 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28320 if (!TARGET_64BIT
28321 || (ix86_cmodel == CM_LARGE_PIC
28322 && DEFAULT_ABI != MS_ABI))
28324 use_reg (&use, gen_rtx_REG (Pmode,
28325 REAL_PIC_OFFSET_TABLE_REGNUM));
28326 if (ix86_use_pseudo_pic_reg ())
28327 emit_move_insn (gen_rtx_REG (Pmode,
28328 REAL_PIC_OFFSET_TABLE_REGNUM),
28329 pic_offset_table_rtx);
28332 else if (!TARGET_PECOFF && !TARGET_MACHO)
28334 if (TARGET_64BIT)
28336 fnaddr = gen_rtx_UNSPEC (Pmode,
28337 gen_rtvec (1, addr),
28338 UNSPEC_GOTPCREL);
28339 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28341 else
28343 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28344 UNSPEC_GOT);
28345 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28346 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28347 fnaddr);
28349 fnaddr = gen_const_mem (Pmode, fnaddr);
28350 /* Pmode may not be the same as word_mode for x32, which
28351 doesn't support indirect branch via 32-bit memory slot.
28352 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28353 indirect branch via x32 GOT slot is OK. */
28354 if (GET_MODE (fnaddr) != word_mode)
28355 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28356 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28361 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28362 parameters passed in vector registers. */
28363 if (TARGET_64BIT
28364 && (INTVAL (callarg2) > 0
28365 || (INTVAL (callarg2) == 0
28366 && (TARGET_SSE || !flag_skip_rax_setup))))
28368 rtx al = gen_rtx_REG (QImode, AX_REG);
28369 emit_move_insn (al, callarg2);
28370 use_reg (&use, al);
28373 if (ix86_cmodel == CM_LARGE_PIC
28374 && !TARGET_PECOFF
28375 && MEM_P (fnaddr)
28376 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28377 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28378 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28379 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28380 branch via x32 GOT slot is OK. */
28381 else if (!(TARGET_X32
28382 && MEM_P (fnaddr)
28383 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28384 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28385 && (sibcall
28386 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28387 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28389 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28390 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28393 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28395 if (retval)
28397 /* We should add bounds as a destination register in case
28398 a pointer with bounds may be returned.  */
28399 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28401 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28402 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28403 if (GET_CODE (retval) == PARALLEL)
28405 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28406 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28407 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28408 retval = chkp_join_splitted_slot (retval, par);
28410 else
28412 retval = gen_rtx_PARALLEL (VOIDmode,
28413 gen_rtvec (3, retval, b0, b1));
28414 chkp_put_regs_to_expr_list (retval);
28418 call = gen_rtx_SET (retval, call);
28420 vec[vec_len++] = call;
28422 if (pop)
28424 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28425 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28426 vec[vec_len++] = pop;
28429 if (cfun->machine->no_caller_saved_registers
28430 && (!fndecl
28431 || (!TREE_THIS_VOLATILE (fndecl)
28432 && !lookup_attribute ("no_caller_saved_registers",
28433 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28435 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28436 bool is_64bit_ms_abi = (TARGET_64BIT
28437 && ix86_function_abi (fndecl) == MS_ABI);
28438 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28440 /* If there are no caller-saved registers, add all registers
28441 that are clobbered by the call which returns. */
28442 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28443 if (!fixed_regs[i]
28444 && (ix86_call_used_regs[i] == 1
28445 || (ix86_call_used_regs[i] & c_mask))
28446 && !STACK_REGNO_P (i)
28447 && !MMX_REGNO_P (i))
28448 clobber_reg (&use,
28449 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28451 else if (TARGET_64BIT_MS_ABI
28452 && (!callarg2 || INTVAL (callarg2) != -2))
28454 int const cregs_size
28455 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
28456 int i;
28458 for (i = 0; i < cregs_size; i++)
28460 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28461 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28463 clobber_reg (&use, gen_rtx_REG (mode, regno));
28467 if (vec_len > 1)
28468 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28469 call = emit_call_insn (call);
28470 if (use)
28471 CALL_INSN_FUNCTION_USAGE (call) = use;
28473 return call;
28476 /* Return true if the function being called was marked with attribute
28477 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28478 to handle the non-PIC case in the backend because there is no easy
28479 interface for the front-end to force non-PLT calls to use the GOT.
28480 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28481 to call the function marked "noplt" indirectly. */
28483 static bool
28484 ix86_nopic_noplt_attribute_p (rtx call_op)
28486 if (flag_pic || ix86_cmodel == CM_LARGE
28487 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28488 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28489 || SYMBOL_REF_LOCAL_P (call_op))
28490 return false;
28492 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28494 if (!flag_plt
28495 || (symbol_decl != NULL_TREE
28496 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28497 return true;
28499 return false;
28502 /* Output the assembly for a call instruction. */
28504 const char *
28505 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28507 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28508 bool seh_nop_p = false;
28509 const char *xasm;
28511 if (SIBLING_CALL_P (insn))
28513 if (direct_p)
28515 if (ix86_nopic_noplt_attribute_p (call_op))
28517 if (TARGET_64BIT)
28518 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28519 else
28520 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28522 else
28523 xasm = "%!jmp\t%P0";
28525 /* SEH epilogue detection requires the indirect branch case
28526 to include REX.W. */
28527 else if (TARGET_SEH)
28528 xasm = "%!rex.W jmp\t%A0";
28529 else
28530 xasm = "%!jmp\t%A0";
28532 output_asm_insn (xasm, &call_op);
28533 return "";
28536 /* SEH unwinding can require an extra nop to be emitted in several
28537 circumstances. Determine if we have one of those. */
28538 if (TARGET_SEH)
28540 rtx_insn *i;
28542 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28544 /* If we get to another real insn, we don't need the nop. */
28545 if (INSN_P (i))
28546 break;
28548 /* If we get to the epilogue note, prevent a catch region from
28549 being adjacent to the standard epilogue sequence. If non-
28550 call-exceptions, we'll have done this during epilogue emission. */
28551 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28552 && !flag_non_call_exceptions
28553 && !can_throw_internal (insn))
28555 seh_nop_p = true;
28556 break;
28560 /* If we didn't find a real insn following the call, prevent the
28561 unwinder from looking into the next function. */
28562 if (i == NULL)
28563 seh_nop_p = true;
28566 if (direct_p)
28568 if (ix86_nopic_noplt_attribute_p (call_op))
28570 if (TARGET_64BIT)
28571 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28572 else
28573 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28575 else
28576 xasm = "%!call\t%P0";
28578 else
28579 xasm = "%!call\t%A0";
28581 output_asm_insn (xasm, &call_op);
28583 if (seh_nop_p)
28584 return "nop";
28586 return "";
28589 /* Clear stack slot assignments remembered from previous functions.
28590 This is called from INIT_EXPANDERS once before RTL is emitted for each
28591 function. */
28593 static struct machine_function *
28594 ix86_init_machine_status (void)
28596 struct machine_function *f;
28598 f = ggc_cleared_alloc<machine_function> ();
28599 f->use_fast_prologue_epilogue_nregs = -1;
28600 f->call_abi = ix86_abi;
28602 return f;
28605 /* Return a MEM corresponding to a stack slot with mode MODE.
28606 Allocate a new slot if necessary.
28608 The RTL for a function can have several slots available: N is
28609 which slot to use. */
28612 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28614 struct stack_local_entry *s;
28616 gcc_assert (n < MAX_386_STACK_LOCALS);
28618 for (s = ix86_stack_locals; s; s = s->next)
28619 if (s->mode == mode && s->n == n)
28620 return validize_mem (copy_rtx (s->rtl));
28622 s = ggc_alloc<stack_local_entry> ();
28623 s->n = n;
28624 s->mode = mode;
28625 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28627 s->next = ix86_stack_locals;
28628 ix86_stack_locals = s;
28629 return validize_mem (copy_rtx (s->rtl));
28632 static void
28633 ix86_instantiate_decls (void)
28635 struct stack_local_entry *s;
28637 for (s = ix86_stack_locals; s; s = s->next)
28638 if (s->rtl != NULL_RTX)
28639 instantiate_decl_rtl (s->rtl);
28642 /* Return the number used for encoding REG, in the range 0..7. */
28644 static int
28645 reg_encoded_number (rtx reg)
28647 unsigned regno = REGNO (reg);
28648 switch (regno)
28650 case AX_REG:
28651 return 0;
28652 case CX_REG:
28653 return 1;
28654 case DX_REG:
28655 return 2;
28656 case BX_REG:
28657 return 3;
28658 case SP_REG:
28659 return 4;
28660 case BP_REG:
28661 return 5;
28662 case SI_REG:
28663 return 6;
28664 case DI_REG:
28665 return 7;
28666 default:
28667 break;
28669 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28670 return regno - FIRST_STACK_REG;
28671 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28672 return regno - FIRST_SSE_REG;
28673 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28674 return regno - FIRST_MMX_REG;
28675 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28676 return regno - FIRST_REX_SSE_REG;
28677 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28678 return regno - FIRST_REX_INT_REG;
28679 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28680 return regno - FIRST_MASK_REG;
28681 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28682 return regno - FIRST_BND_REG;
28683 return -1;
28686 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
28687 in its encoding if it could be relevant for ROP mitigation, otherwise
28688 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28689 used for calculating it into them. */
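/* Illustration of the byte computed below (a sketch; under this function's
   convention operand 0 supplies the r/m field and the other operand the reg
   field): for a register-register insn with op0 = %ecx (encoding 1) and
   op1 = %edx (encoding 2), the result is 0xc0 + (2 << 3) + 1 = 0xd1.  */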
28691 static int
28692 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28693 int *popno0 = 0, int *popno1 = 0)
28695 if (asm_noperands (PATTERN (insn)) >= 0)
28696 return -1;
28697 int has_modrm = get_attr_modrm (insn);
28698 if (!has_modrm)
28699 return -1;
28700 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28701 rtx op0, op1;
28702 switch (cls)
28704 case MODRM_CLASS_OP02:
28705 gcc_assert (noperands >= 3);
28706 if (popno0)
28708 *popno0 = 0;
28709 *popno1 = 2;
28711 op0 = operands[0];
28712 op1 = operands[2];
28713 break;
28714 case MODRM_CLASS_OP01:
28715 gcc_assert (noperands >= 2);
28716 if (popno0)
28718 *popno0 = 0;
28719 *popno1 = 1;
28721 op0 = operands[0];
28722 op1 = operands[1];
28723 break;
28724 default:
28725 return -1;
28727 if (REG_P (op0) && REG_P (op1))
28729 int enc0 = reg_encoded_number (op0);
28730 int enc1 = reg_encoded_number (op1);
28731 return 0xc0 + (enc1 << 3) + enc0;
28733 return -1;
28736 /* Check whether x86 address PARTS is a pc-relative address. */
28738 static bool
28739 rip_relative_addr_p (struct ix86_address *parts)
28741 rtx base, index, disp;
28743 base = parts->base;
28744 index = parts->index;
28745 disp = parts->disp;
28747 if (disp && !base && !index)
28749 if (TARGET_64BIT)
28751 rtx symbol = disp;
28753 if (GET_CODE (disp) == CONST)
28754 symbol = XEXP (disp, 0);
28755 if (GET_CODE (symbol) == PLUS
28756 && CONST_INT_P (XEXP (symbol, 1)))
28757 symbol = XEXP (symbol, 0);
28759 if (GET_CODE (symbol) == LABEL_REF
28760 || (GET_CODE (symbol) == SYMBOL_REF
28761 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28762 || (GET_CODE (symbol) == UNSPEC
28763 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28764 || XINT (symbol, 1) == UNSPEC_PCREL
28765 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28766 return true;
28769 return false;
28772 /* Calculate the length of the memory address in the instruction encoding.
28773 Includes addr32 prefix, does not include the one-byte modrm, opcode,
28774 or other prefixes. We never generate addr32 prefix for LEA insn. */
28777 memory_address_length (rtx addr, bool lea)
28779 struct ix86_address parts;
28780 rtx base, index, disp;
28781 int len;
28782 int ok;
28784 if (GET_CODE (addr) == PRE_DEC
28785 || GET_CODE (addr) == POST_INC
28786 || GET_CODE (addr) == PRE_MODIFY
28787 || GET_CODE (addr) == POST_MODIFY)
28788 return 0;
28790 ok = ix86_decompose_address (addr, &parts);
28791 gcc_assert (ok);
28793 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28795 /* If this is not a LEA instruction, add the length of the addr32 prefix.  */
28796 if (TARGET_64BIT && !lea
28797 && (SImode_address_operand (addr, VOIDmode)
28798 || (parts.base && GET_MODE (parts.base) == SImode)
28799 || (parts.index && GET_MODE (parts.index) == SImode)))
28800 len++;
28802 base = parts.base;
28803 index = parts.index;
28804 disp = parts.disp;
28806 if (base && SUBREG_P (base))
28807 base = SUBREG_REG (base);
28808 if (index && SUBREG_P (index))
28809 index = SUBREG_REG (index);
28811 gcc_assert (base == NULL_RTX || REG_P (base));
28812 gcc_assert (index == NULL_RTX || REG_P (index));
28814 /* Rule of thumb:
28815 - esp as the base always wants an index,
28816 - ebp as the base always wants a displacement,
28817 - r12 as the base always wants an index,
28818 - r13 as the base always wants a displacement. */
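/* Hypothetical examples of the byte counts below (modrm and opcode bytes are
   excluded): (%eax) contributes 0, (%ebp) and (%esp) each contribute 1
   (disp8 resp. SIB), 4(%eax) contributes 1 (disp8), 0x1000(%eax) contributes
   4 (disp32), and any index register adds one SIB byte.  */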
28820 /* Register Indirect. */
28821 if (base && !index && !disp)
28823 /* esp (for its index) and ebp (for its displacement) need
28824 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28825 code. */
28826 if (base == arg_pointer_rtx
28827 || base == frame_pointer_rtx
28828 || REGNO (base) == SP_REG
28829 || REGNO (base) == BP_REG
28830 || REGNO (base) == R12_REG
28831 || REGNO (base) == R13_REG)
28832 len++;
28835 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28836 is not disp32, but disp32(%rip), so for disp32
28837 SIB byte is needed, unless print_operand_address
28838 optimizes it into disp32(%rip) or (%rip) is implied
28839 by UNSPEC. */
28840 else if (disp && !base && !index)
28842 len += 4;
28843 if (!rip_relative_addr_p (&parts))
28844 len++;
28846 else
28848 /* Find the length of the displacement constant. */
28849 if (disp)
28851 if (base && satisfies_constraint_K (disp))
28852 len += 1;
28853 else
28854 len += 4;
28856 /* ebp always wants a displacement. Similarly r13. */
28857 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28858 len++;
28860 /* An index requires the two-byte modrm form.... */
28861 if (index
28862 /* ...like esp (or r12), which always wants an index. */
28863 || base == arg_pointer_rtx
28864 || base == frame_pointer_rtx
28865 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28866 len++;
28869 return len;
28872 /* Compute default value for "length_immediate" attribute. When SHORTFORM
28873 is set, expect that the insn has an 8-bit immediate alternative.  */
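/* Hedged examples of the values computed below: with SHORTFORM, an SImode
   insn whose immediate is 5 can use the imm8 alternative and counts 1 byte,
   while an immediate of 300 needs the full 4 bytes; DImode immediates are
   encoded as 32-bit sign-extended values, so they also count 4.  */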
28875 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28877 int len = 0;
28878 int i;
28879 extract_insn_cached (insn);
28880 for (i = recog_data.n_operands - 1; i >= 0; --i)
28881 if (CONSTANT_P (recog_data.operand[i]))
28883 enum attr_mode mode = get_attr_mode (insn);
28885 gcc_assert (!len);
28886 if (shortform && CONST_INT_P (recog_data.operand[i]))
28888 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28889 switch (mode)
28891 case MODE_QI:
28892 len = 1;
28893 continue;
28894 case MODE_HI:
28895 ival = trunc_int_for_mode (ival, HImode);
28896 break;
28897 case MODE_SI:
28898 ival = trunc_int_for_mode (ival, SImode);
28899 break;
28900 default:
28901 break;
28903 if (IN_RANGE (ival, -128, 127))
28905 len = 1;
28906 continue;
28909 switch (mode)
28911 case MODE_QI:
28912 len = 1;
28913 break;
28914 case MODE_HI:
28915 len = 2;
28916 break;
28917 case MODE_SI:
28918 len = 4;
28919 break;
28920 /* Immediates for DImode instructions are encoded
28921 as 32-bit sign-extended values.  */
28922 case MODE_DI:
28923 len = 4;
28924 break;
28925 default:
28926 fatal_insn ("unknown insn mode", insn);
28929 return len;
28932 /* Compute default value for "length_address" attribute. */
28934 ix86_attr_length_address_default (rtx_insn *insn)
28936 int i;
28938 if (get_attr_type (insn) == TYPE_LEA)
28940 rtx set = PATTERN (insn), addr;
28942 if (GET_CODE (set) == PARALLEL)
28943 set = XVECEXP (set, 0, 0);
28945 gcc_assert (GET_CODE (set) == SET);
28947 addr = SET_SRC (set);
28949 return memory_address_length (addr, true);
28952 extract_insn_cached (insn);
28953 for (i = recog_data.n_operands - 1; i >= 0; --i)
28955 rtx op = recog_data.operand[i];
28956 if (MEM_P (op))
28958 constrain_operands_cached (insn, reload_completed);
28959 if (which_alternative != -1)
28961 const char *constraints = recog_data.constraints[i];
28962 int alt = which_alternative;
28964 while (*constraints == '=' || *constraints == '+')
28965 constraints++;
28966 while (alt-- > 0)
28967 while (*constraints++ != ',')
28969 /* Skip ignored operands. */
28970 if (*constraints == 'X')
28971 continue;
28974 int len = memory_address_length (XEXP (op, 0), false);
28976 /* Account for segment prefix for non-default addr spaces. */
28977 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28978 len++;
28980 return len;
28983 return 0;
28986 /* Compute default value for "length_vex" attribute. It includes
28987 2 or 3 byte VEX prefix and 1 opcode byte. */
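/* A sketch of the possible results: an AVX insn that only needs the 0f
   opcode map and no VEX.W, extended-index, or extended-base bits gets the
   2-byte VEX prefix, so 2 + 1 = 3; needing VEX.W, a DImode general register,
   or a memory operand that mentions an extended register forces the 3-byte
   form, so 3 + 1 = 4.  */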
28990 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28991 bool has_vex_w)
28993 int i;
28995 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit requires
28996 the 3-byte VEX prefix.  */
28997 if (!has_0f_opcode || has_vex_w)
28998 return 3 + 1;
29000 /* We can always use 2 byte VEX prefix in 32bit. */
29001 if (!TARGET_64BIT)
29002 return 2 + 1;
29004 extract_insn_cached (insn);
29006 for (i = recog_data.n_operands - 1; i >= 0; --i)
29007 if (REG_P (recog_data.operand[i]))
29009 /* REX.W bit uses 3 byte VEX prefix. */
29010 if (GET_MODE (recog_data.operand[i]) == DImode
29011 && GENERAL_REG_P (recog_data.operand[i]))
29012 return 3 + 1;
29014 else
29016 /* REX.X or REX.B bits use 3 byte VEX prefix. */
29017 if (MEM_P (recog_data.operand[i])
29018 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
29019 return 3 + 1;
29022 return 2 + 1;
29025 /* Return the maximum number of instructions a cpu can issue. */
29027 static int
29028 ix86_issue_rate (void)
29030 switch (ix86_tune)
29032 case PROCESSOR_PENTIUM:
29033 case PROCESSOR_LAKEMONT:
29034 case PROCESSOR_BONNELL:
29035 case PROCESSOR_SILVERMONT:
29036 case PROCESSOR_KNL:
29037 case PROCESSOR_INTEL:
29038 case PROCESSOR_K6:
29039 case PROCESSOR_BTVER2:
29040 case PROCESSOR_PENTIUM4:
29041 case PROCESSOR_NOCONA:
29042 return 2;
29044 case PROCESSOR_PENTIUMPRO:
29045 case PROCESSOR_ATHLON:
29046 case PROCESSOR_K8:
29047 case PROCESSOR_AMDFAM10:
29048 case PROCESSOR_GENERIC:
29049 case PROCESSOR_BTVER1:
29050 return 3;
29052 case PROCESSOR_BDVER1:
29053 case PROCESSOR_BDVER2:
29054 case PROCESSOR_BDVER3:
29055 case PROCESSOR_BDVER4:
29056 case PROCESSOR_ZNVER1:
29057 case PROCESSOR_CORE2:
29058 case PROCESSOR_NEHALEM:
29059 case PROCESSOR_SANDYBRIDGE:
29060 case PROCESSOR_HASWELL:
29061 return 4;
29063 default:
29064 return 1;
29068 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
29069 by DEP_INSN and nothing else set by DEP_INSN.  */
29071 static bool
29072 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
29074 rtx set, set2;
29076 /* Simplify the test for uninteresting insns. */
29077 if (insn_type != TYPE_SETCC
29078 && insn_type != TYPE_ICMOV
29079 && insn_type != TYPE_FCMOV
29080 && insn_type != TYPE_IBR)
29081 return false;
29083 if ((set = single_set (dep_insn)) != 0)
29085 set = SET_DEST (set);
29086 set2 = NULL_RTX;
29088 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
29089 && XVECLEN (PATTERN (dep_insn), 0) == 2
29090 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
29091 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
29093 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
29094 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
29096 else
29097 return false;
29099 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
29100 return false;
29102 /* This test is true if the dependent insn reads the flags but
29103 not any other potentially set register. */
29104 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
29105 return false;
29107 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
29108 return false;
29110 return true;
29113 /* Return true iff USE_INSN has a memory address with operands set by
29114 SET_INSN. */
29116 bool
29117 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
29119 int i;
29120 extract_insn_cached (use_insn);
29121 for (i = recog_data.n_operands - 1; i >= 0; --i)
29122 if (MEM_P (recog_data.operand[i]))
29124 rtx addr = XEXP (recog_data.operand[i], 0);
29125 return modified_in_p (addr, set_insn) != 0;
29127 return false;
29130 /* Helper function for exact_store_load_dependency.
29131 Return true if addr is found in insn. */
29132 static bool
29133 exact_dependency_1 (rtx addr, rtx insn)
29135 enum rtx_code code;
29136 const char *format_ptr;
29137 int i, j;
29139 code = GET_CODE (insn);
29140 switch (code)
29142 case MEM:
29143 if (rtx_equal_p (addr, insn))
29144 return true;
29145 break;
29146 case REG:
29147 CASE_CONST_ANY:
29148 case SYMBOL_REF:
29149 case CODE_LABEL:
29150 case PC:
29151 case CC0:
29152 case EXPR_LIST:
29153 return false;
29154 default:
29155 break;
29158 format_ptr = GET_RTX_FORMAT (code);
29159 for (i = 0; i < GET_RTX_LENGTH (code); i++)
29161 switch (*format_ptr++)
29163 case 'e':
29164 if (exact_dependency_1 (addr, XEXP (insn, i)))
29165 return true;
29166 break;
29167 case 'E':
29168 for (j = 0; j < XVECLEN (insn, i); j++)
29169 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
29170 return true;
29171 break;
29174 return false;
29177 /* Return true if there exists exact dependency for store & load, i.e.
29178 the same memory address is used in them. */
29179 static bool
29180 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
29182 rtx set1, set2;
29184 set1 = single_set (store);
29185 if (!set1)
29186 return false;
29187 if (!MEM_P (SET_DEST (set1)))
29188 return false;
29189 set2 = single_set (load);
29190 if (!set2)
29191 return false;
29192 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
29193 return true;
29194 return false;
29197 static int
29198 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
29199 unsigned int)
29201 enum attr_type insn_type, dep_insn_type;
29202 enum attr_memory memory;
29203 rtx set, set2;
29204 int dep_insn_code_number;
29206 /* Anti and output dependencies have zero cost on all CPUs. */
29207 if (dep_type != 0)
29208 return 0;
29210 dep_insn_code_number = recog_memoized (dep_insn);
29212 /* If we can't recognize the insns, we can't really do anything. */
29213 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
29214 return cost;
29216 insn_type = get_attr_type (insn);
29217 dep_insn_type = get_attr_type (dep_insn);
29219 switch (ix86_tune)
29221 case PROCESSOR_PENTIUM:
29222 case PROCESSOR_LAKEMONT:
29223 /* Address Generation Interlock adds a cycle of latency. */
29224 if (insn_type == TYPE_LEA)
29226 rtx addr = PATTERN (insn);
29228 if (GET_CODE (addr) == PARALLEL)
29229 addr = XVECEXP (addr, 0, 0);
29231 gcc_assert (GET_CODE (addr) == SET);
29233 addr = SET_SRC (addr);
29234 if (modified_in_p (addr, dep_insn))
29235 cost += 1;
29237 else if (ix86_agi_dependent (dep_insn, insn))
29238 cost += 1;
29240 /* ??? Compares pair with jump/setcc. */
29241 if (ix86_flags_dependent (insn, dep_insn, insn_type))
29242 cost = 0;
29244 /* Floating point stores require value to be ready one cycle earlier. */
29245 if (insn_type == TYPE_FMOV
29246 && get_attr_memory (insn) == MEMORY_STORE
29247 && !ix86_agi_dependent (dep_insn, insn))
29248 cost += 1;
29249 break;
29251 case PROCESSOR_PENTIUMPRO:
29252 /* INT->FP conversion is expensive. */
29253 if (get_attr_fp_int_src (dep_insn))
29254 cost += 5;
29256 /* There is one cycle extra latency between an FP op and a store. */
29257 if (insn_type == TYPE_FMOV
29258 && (set = single_set (dep_insn)) != NULL_RTX
29259 && (set2 = single_set (insn)) != NULL_RTX
29260 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
29261 && MEM_P (SET_DEST (set2)))
29262 cost += 1;
29264 memory = get_attr_memory (insn);
29266 /* Show ability of reorder buffer to hide latency of load by executing
29267 in parallel with previous instruction in case
29268 previous instruction is not needed to compute the address. */
29269 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29270 && !ix86_agi_dependent (dep_insn, insn))
29272 /* Claim moves to take one cycle, as core can issue one load
29273 at a time and the next load can start a cycle later.  */
29274 if (dep_insn_type == TYPE_IMOV
29275 || dep_insn_type == TYPE_FMOV)
29276 cost = 1;
29277 else if (cost > 1)
29278 cost--;
29280 break;
29282 case PROCESSOR_K6:
29283 /* The esp dependency is resolved before
29284 the instruction is really finished. */
29285 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29286 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29287 return 1;
29289 /* INT->FP conversion is expensive. */
29290 if (get_attr_fp_int_src (dep_insn))
29291 cost += 5;
29293 memory = get_attr_memory (insn);
29295 /* Show ability of reorder buffer to hide latency of load by executing
29296 in parallel with previous instruction in case
29297 previous instruction is not needed to compute the address. */
29298 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29299 && !ix86_agi_dependent (dep_insn, insn))
29301 /* Claim moves to take one cycle, as core can issue one load
29302 at a time and the next load can start a cycle later.  */
29303 if (dep_insn_type == TYPE_IMOV
29304 || dep_insn_type == TYPE_FMOV)
29305 cost = 1;
29306 else if (cost > 2)
29307 cost -= 2;
29308 else
29309 cost = 1;
29311 break;
29313 case PROCESSOR_AMDFAM10:
29314 case PROCESSOR_BDVER1:
29315 case PROCESSOR_BDVER2:
29316 case PROCESSOR_BDVER3:
29317 case PROCESSOR_BDVER4:
29318 case PROCESSOR_ZNVER1:
29319 case PROCESSOR_BTVER1:
29320 case PROCESSOR_BTVER2:
29321 case PROCESSOR_GENERIC:
29322 /* The stack engine allows push&pop instructions to execute in parallel.  */
29323 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29324 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29325 return 0;
29326 /* FALLTHRU */
29328 case PROCESSOR_ATHLON:
29329 case PROCESSOR_K8:
29330 memory = get_attr_memory (insn);
29332 /* Show ability of reorder buffer to hide latency of load by executing
29333 in parallel with previous instruction in case
29334 previous instruction is not needed to compute the address. */
29335 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29336 && !ix86_agi_dependent (dep_insn, insn))
29338 enum attr_unit unit = get_attr_unit (insn);
29339 int loadcost = 3;
29341 /* Because of the difference between the length of integer and
29342 floating unit pipeline preparation stages, the memory operands
29343 for floating point are cheaper.
29345 ??? For Athlon the difference is most probably 2.  */
29346 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
29347 loadcost = 3;
29348 else
29349 loadcost = TARGET_ATHLON ? 2 : 0;
29351 if (cost >= loadcost)
29352 cost -= loadcost;
29353 else
29354 cost = 0;
29356 break;
29358 case PROCESSOR_CORE2:
29359 case PROCESSOR_NEHALEM:
29360 case PROCESSOR_SANDYBRIDGE:
29361 case PROCESSOR_HASWELL:
29362 /* The stack engine allows push&pop instructions to execute in parallel.  */
29363 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29364 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29365 return 0;
29367 memory = get_attr_memory (insn);
29369 /* Show ability of reorder buffer to hide latency of load by executing
29370 in parallel with previous instruction in case
29371 previous instruction is not needed to compute the address. */
29372 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29373 && !ix86_agi_dependent (dep_insn, insn))
29375 if (cost >= 4)
29376 cost -= 4;
29377 else
29378 cost = 0;
29380 break;
29382 case PROCESSOR_SILVERMONT:
29383 case PROCESSOR_KNL:
29384 case PROCESSOR_INTEL:
29385 if (!reload_completed)
29386 return cost;
29388 /* Increase cost of integer loads. */
29389 memory = get_attr_memory (dep_insn);
29390 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29392 enum attr_unit unit = get_attr_unit (dep_insn);
29393 if (unit == UNIT_INTEGER && cost == 1)
29395 if (memory == MEMORY_LOAD)
29396 cost = 3;
29397 else
29399 /* Increase cost of ld/st for short int types only
29400 because of store forwarding issue. */
29401 rtx set = single_set (dep_insn);
29402 if (set && (GET_MODE (SET_DEST (set)) == QImode
29403 || GET_MODE (SET_DEST (set)) == HImode))
29405 /* Increase cost of store/load insn if exact
29406 dependence exists and it is load insn. */
29407 enum attr_memory insn_memory = get_attr_memory (insn);
29408 if (insn_memory == MEMORY_LOAD
29409 && exact_store_load_dependency (dep_insn, insn))
29410 cost = 3;
29416 default:
29417 break;
29420 return cost;
29423 /* How many alternative schedules to try. This should be as wide as the
29424 scheduling freedom in the DFA, but no wider. Making this value too
29425 large results in extra work for the scheduler.  */
29427 static int
29428 ia32_multipass_dfa_lookahead (void)
29430 switch (ix86_tune)
29432 case PROCESSOR_PENTIUM:
29433 case PROCESSOR_LAKEMONT:
29434 return 2;
29436 case PROCESSOR_PENTIUMPRO:
29437 case PROCESSOR_K6:
29438 return 1;
29440 case PROCESSOR_BDVER1:
29441 case PROCESSOR_BDVER2:
29442 case PROCESSOR_BDVER3:
29443 case PROCESSOR_BDVER4:
29444 /* We use lookahead value 4 for BD both before and after reload
29445 schedules.  The plan is to have value 8 included for O3.  */
29446 return 4;
29448 case PROCESSOR_CORE2:
29449 case PROCESSOR_NEHALEM:
29450 case PROCESSOR_SANDYBRIDGE:
29451 case PROCESSOR_HASWELL:
29452 case PROCESSOR_BONNELL:
29453 case PROCESSOR_SILVERMONT:
29454 case PROCESSOR_KNL:
29455 case PROCESSOR_INTEL:
29456 /* Generally, we want haifa-sched:max_issue() to look ahead as far as the
29457 number of instructions that can be executed in a cycle, i.e.,
29458 issue_rate. I wonder why tuning for many CPUs does not do this. */
29459 if (reload_completed)
29460 return ix86_issue_rate ();
29461 /* Don't use lookahead for pre-reload schedule to save compile time. */
29462 return 0;
29464 default:
29465 return 0;
29469 /* Return true if target platform supports macro-fusion. */
29471 static bool
29472 ix86_macro_fusion_p ()
29474 return TARGET_FUSE_CMP_AND_BRANCH;
29477 /* Check whether the current microarchitecture supports macro fusion
29478 for insn pair "CONDGEN + CONDJMP". Refer to
29479 "Intel Architectures Optimization Reference Manual". */
29481 static bool
29482 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
29484 rtx src, dest;
29485 enum rtx_code ccode;
29486 rtx compare_set = NULL_RTX, test_if, cond;
29487 rtx alu_set = NULL_RTX, addr = NULL_RTX;
29489 if (!any_condjump_p (condjmp))
29490 return false;
29492 if (get_attr_type (condgen) != TYPE_TEST
29493 && get_attr_type (condgen) != TYPE_ICMP
29494 && get_attr_type (condgen) != TYPE_INCDEC
29495 && get_attr_type (condgen) != TYPE_ALU)
29496 return false;
29498 compare_set = single_set (condgen);
29499 if (compare_set == NULL_RTX
29500 && !TARGET_FUSE_ALU_AND_BRANCH)
29501 return false;
29503 if (compare_set == NULL_RTX)
29505 int i;
29506 rtx pat = PATTERN (condgen);
29507 for (i = 0; i < XVECLEN (pat, 0); i++)
29508 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
29510 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
29511 if (GET_CODE (set_src) == COMPARE)
29512 compare_set = XVECEXP (pat, 0, i);
29513 else
29514 alu_set = XVECEXP (pat, 0, i);
29517 if (compare_set == NULL_RTX)
29518 return false;
29519 src = SET_SRC (compare_set);
29520 if (GET_CODE (src) != COMPARE)
29521 return false;
29523 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
29524 supported. */
29525 if ((MEM_P (XEXP (src, 0))
29526 && CONST_INT_P (XEXP (src, 1)))
29527 || (MEM_P (XEXP (src, 1))
29528 && CONST_INT_P (XEXP (src, 0))))
29529 return false;
29531 /* No fusion for RIP-relative address. */
29532 if (MEM_P (XEXP (src, 0)))
29533 addr = XEXP (XEXP (src, 0), 0);
29534 else if (MEM_P (XEXP (src, 1)))
29535 addr = XEXP (XEXP (src, 1), 0);
29537 if (addr) {
29538 ix86_address parts;
29539 int ok = ix86_decompose_address (addr, &parts);
29540 gcc_assert (ok);
29542 if (rip_relative_addr_p (&parts))
29543 return false;
29546 test_if = SET_SRC (pc_set (condjmp));
29547 cond = XEXP (test_if, 0);
29548 ccode = GET_CODE (cond);
29549 /* Check whether the conditional jump uses the Sign or Overflow flags.  */
29550 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
29551 && (ccode == GE
29552 || ccode == GT
29553 || ccode == LE
29554 || ccode == LT))
29555 return false;
29557 /* Return true for TYPE_TEST and TYPE_ICMP. */
29558 if (get_attr_type (condgen) == TYPE_TEST
29559 || get_attr_type (condgen) == TYPE_ICMP)
29560 return true;
29562 /* The following handles the case of macro-fusion for alu + jmp.  */
29563 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
29564 return false;
29566 /* No fusion for alu op with memory destination operand. */
29567 dest = SET_DEST (alu_set);
29568 if (MEM_P (dest))
29569 return false;
29571 /* Macro-fusion for inc/dec + unsigned conditional jump is not
29572 supported. */
29573 if (get_attr_type (condgen) == TYPE_INCDEC
29574 && (ccode == GEU
29575 || ccode == GTU
29576 || ccode == LEU
29577 || ccode == LTU))
29578 return false;
29580 return true;
29583 /* Try to reorder ready list to take advantage of Atom pipelined IMUL
29584 execution. It is applied if
29585 (1) An IMUL instruction is at the top of the list;
29586 (2) There exists exactly one producer of an independent IMUL instruction
29587 in the ready list.
29588 Return index of IMUL producer if it was found and -1 otherwise. */
29589 static int
29590 do_reorder_for_imul (rtx_insn **ready, int n_ready)
29592 rtx_insn *insn;
29593 rtx set, insn1, insn2;
29594 sd_iterator_def sd_it;
29595 dep_t dep;
29596 int index = -1;
29597 int i;
29599 if (!TARGET_BONNELL)
29600 return index;
29602 /* Check that an IMUL instruction is at the top of the ready list.  */
29603 insn = ready[n_ready - 1];
29604 set = single_set (insn);
29605 if (!set)
29606 return index;
29607 if (!(GET_CODE (SET_SRC (set)) == MULT
29608 && GET_MODE (SET_SRC (set)) == SImode))
29609 return index;
29611 /* Search for producer of independent IMUL instruction. */
29612 for (i = n_ready - 2; i >= 0; i--)
29614 insn = ready[i];
29615 if (!NONDEBUG_INSN_P (insn))
29616 continue;
29617 /* Skip IMUL instruction. */
29618 insn2 = PATTERN (insn);
29619 if (GET_CODE (insn2) == PARALLEL)
29620 insn2 = XVECEXP (insn2, 0, 0);
29621 if (GET_CODE (insn2) == SET
29622 && GET_CODE (SET_SRC (insn2)) == MULT
29623 && GET_MODE (SET_SRC (insn2)) == SImode)
29624 continue;
29626 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
29628 rtx con;
29629 con = DEP_CON (dep);
29630 if (!NONDEBUG_INSN_P (con))
29631 continue;
29632 insn1 = PATTERN (con);
29633 if (GET_CODE (insn1) == PARALLEL)
29634 insn1 = XVECEXP (insn1, 0, 0);
29636 if (GET_CODE (insn1) == SET
29637 && GET_CODE (SET_SRC (insn1)) == MULT
29638 && GET_MODE (SET_SRC (insn1)) == SImode)
29640 sd_iterator_def sd_it1;
29641 dep_t dep1;
29642 /* Check if there is no other dependee for IMUL. */
29643 index = i;
29644 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
29646 rtx pro;
29647 pro = DEP_PRO (dep1);
29648 if (!NONDEBUG_INSN_P (pro))
29649 continue;
29650 if (pro != insn)
29651 index = -1;
29653 if (index >= 0)
29654 break;
29657 if (index >= 0)
29658 break;
29660 return index;
29663 /* Try to find the best candidate on the top of ready list if two insns
29664 have the same priority - candidate is best if its dependees were
29665 scheduled earlier. Applied for Silvermont only.
29666 Return true if top 2 insns must be interchanged. */
29667 static bool
29668 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
29670 rtx_insn *top = ready[n_ready - 1];
29671 rtx_insn *next = ready[n_ready - 2];
29672 rtx set;
29673 sd_iterator_def sd_it;
29674 dep_t dep;
29675 int clock1 = -1;
29676 int clock2 = -1;
29677 #define INSN_TICK(INSN) (HID (INSN)->tick)
29679 if (!TARGET_SILVERMONT && !TARGET_INTEL)
29680 return false;
29682 if (!NONDEBUG_INSN_P (top))
29683 return false;
29684 if (!NONJUMP_INSN_P (top))
29685 return false;
29686 if (!NONDEBUG_INSN_P (next))
29687 return false;
29688 if (!NONJUMP_INSN_P (next))
29689 return false;
29690 set = single_set (top);
29691 if (!set)
29692 return false;
29693 set = single_set (next);
29694 if (!set)
29695 return false;
29697 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
29699 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
29700 return false;
29701 /* Determine the winner more precisely.  */
29702 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
29704 rtx pro;
29705 pro = DEP_PRO (dep);
29706 if (!NONDEBUG_INSN_P (pro))
29707 continue;
29708 if (INSN_TICK (pro) > clock1)
29709 clock1 = INSN_TICK (pro);
29711 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
29713 rtx pro;
29714 pro = DEP_PRO (dep);
29715 if (!NONDEBUG_INSN_P (pro))
29716 continue;
29717 if (INSN_TICK (pro) > clock2)
29718 clock2 = INSN_TICK (pro);
29721 if (clock1 == clock2)
29723 /* Determine winner - load must win. */
29724 enum attr_memory memory1, memory2;
29725 memory1 = get_attr_memory (top);
29726 memory2 = get_attr_memory (next);
29727 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
29728 return true;
29730 return (bool) (clock2 < clock1);
29732 return false;
29733 #undef INSN_TICK
29736 /* Perform possible reordering of the ready list for Atom/Silvermont only.
29737 Return issue rate. */
29738 static int
29739 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
29740 int *pn_ready, int clock_var)
29742 int issue_rate = -1;
29743 int n_ready = *pn_ready;
29744 int i;
29745 rtx_insn *insn;
29746 int index = -1;
29748 /* Set up issue rate. */
29749 issue_rate = ix86_issue_rate ();
29751 /* Do reordering for BONNELL/SILVERMONT only.  */
29752 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
29753 return issue_rate;
29755 /* Nothing to do if ready list contains only 1 instruction. */
29756 if (n_ready <= 1)
29757 return issue_rate;
29759 /* Do reordering for the post-reload scheduler only.  */
29760 if (!reload_completed)
29761 return issue_rate;
29763 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
29765 if (sched_verbose > 1)
29766 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
29767 INSN_UID (ready[index]));
29769 /* Put IMUL producer (ready[index]) at the top of ready list. */
29770 insn = ready[index];
29771 for (i = index; i < n_ready - 1; i++)
29772 ready[i] = ready[i + 1];
29773 ready[n_ready - 1] = insn;
29774 return issue_rate;
29777 /* Skip selective scheduling since HID is not populated in it. */
29778 if (clock_var != 0
29779 && !sel_sched_p ()
29780 && swap_top_of_ready_list (ready, n_ready))
29782 if (sched_verbose > 1)
29783 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
29784 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
29785 /* Swap 2 top elements of ready list. */
29786 insn = ready[n_ready - 1];
29787 ready[n_ready - 1] = ready[n_ready - 2];
29788 ready[n_ready - 2] = insn;
29790 return issue_rate;
29793 static bool
29794 ix86_class_likely_spilled_p (reg_class_t);
29796 /* Return true if the lhs of insn is a HW function argument register and set
29797 is_spilled to true if it is a likely spilled HW register.  */
29798 static bool
29799 insn_is_function_arg (rtx insn, bool* is_spilled)
29801 rtx dst;
29803 if (!NONDEBUG_INSN_P (insn))
29804 return false;
29805 /* Call instructions are not movable, ignore them.  */
29806 if (CALL_P (insn))
29807 return false;
29808 insn = PATTERN (insn);
29809 if (GET_CODE (insn) == PARALLEL)
29810 insn = XVECEXP (insn, 0, 0);
29811 if (GET_CODE (insn) != SET)
29812 return false;
29813 dst = SET_DEST (insn);
29814 if (REG_P (dst) && HARD_REGISTER_P (dst)
29815 && ix86_function_arg_regno_p (REGNO (dst)))
29817 /* Is it likely spilled HW register? */
29818 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29819 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29820 *is_spilled = true;
29821 return true;
29823 return false;
29826 /* Add output dependencies for a chain of adjacent function arguments, but only
29827 if there is a move to a likely spilled HW register.  Return the first argument
29828 if at least one dependence was added or NULL otherwise.  */
29829 static rtx_insn *
29830 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29832 rtx_insn *insn;
29833 rtx_insn *last = call;
29834 rtx_insn *first_arg = NULL;
29835 bool is_spilled = false;
29837 head = PREV_INSN (head);
29839 /* Find nearest to call argument passing instruction. */
29840 while (true)
29842 last = PREV_INSN (last);
29843 if (last == head)
29844 return NULL;
29845 if (!NONDEBUG_INSN_P (last))
29846 continue;
29847 if (insn_is_function_arg (last, &is_spilled))
29848 break;
29849 return NULL;
29852 first_arg = last;
29853 while (true)
29855 insn = PREV_INSN (last);
29856 if (!INSN_P (insn))
29857 break;
29858 if (insn == head)
29859 break;
29860 if (!NONDEBUG_INSN_P (insn))
29862 last = insn;
29863 continue;
29865 if (insn_is_function_arg (insn, &is_spilled))
29867 /* Add an output dependence between two function arguments if the chain
29868 of output arguments contains likely spilled HW registers.  */
29869 if (is_spilled)
29870 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29871 first_arg = last = insn;
29873 else
29874 break;
29876 if (!is_spilled)
29877 return NULL;
29878 return first_arg;
29881 /* Add output or anti dependency from insn to first_arg to restrict its code
29882 motion. */
29883 static void
29884 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29886 rtx set;
29887 rtx tmp;
29889 /* Add anti dependencies for bounds stores. */
29890 if (INSN_P (insn)
29891 && GET_CODE (PATTERN (insn)) == PARALLEL
29892 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29893 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29895 add_dependence (first_arg, insn, REG_DEP_ANTI);
29896 return;
29899 set = single_set (insn);
29900 if (!set)
29901 return;
29902 tmp = SET_DEST (set);
29903 if (REG_P (tmp))
29905 /* Add output dependency to the first function argument. */
29906 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29907 return;
29909 /* Add anti dependency. */
29910 add_dependence (first_arg, insn, REG_DEP_ANTI);
29913 /* Avoid cross-block motion of a function argument by adding a dependency
29914 from the first non-jump instruction in bb.  */
29915 static void
29916 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29918 rtx_insn *insn = BB_END (bb);
29920 while (insn)
29922 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29924 rtx set = single_set (insn);
29925 if (set)
29927 avoid_func_arg_motion (arg, insn);
29928 return;
29931 if (insn == BB_HEAD (bb))
29932 return;
29933 insn = PREV_INSN (insn);
29937 /* Hook for pre-reload schedule - avoid motion of function arguments
29938 passed in likely spilled HW registers. */
29939 static void
29940 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29942 rtx_insn *insn;
29943 rtx_insn *first_arg = NULL;
29944 if (reload_completed)
29945 return;
29946 while (head != tail && DEBUG_INSN_P (head))
29947 head = NEXT_INSN (head);
29948 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29949 if (INSN_P (insn) && CALL_P (insn))
29951 first_arg = add_parameter_dependencies (insn, head);
29952 if (first_arg)
29954 /* Add a dependee for the first argument to predecessors, but only
29955 if the region contains more than one block.  */
29956 basic_block bb = BLOCK_FOR_INSN (insn);
29957 int rgn = CONTAINING_RGN (bb->index);
29958 int nr_blks = RGN_NR_BLOCKS (rgn);
29959 /* Skip trivial regions and region head blocks that can have
29960 predecessors outside of region. */
29961 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29963 edge e;
29964 edge_iterator ei;
29966 /* Regions are SCCs with the exception of selective
29967 scheduling with pipelining of outer blocks enabled.
29968 So also check that immediate predecessors of a non-head
29969 block are in the same region. */
29970 FOR_EACH_EDGE (e, ei, bb->preds)
29972 /* Avoid creating loop-carried dependencies by using
29973 the topological ordering in the region.  */
29974 if (rgn == CONTAINING_RGN (e->src->index)
29975 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29976 add_dependee_for_func_arg (first_arg, e->src);
29979 insn = first_arg;
29980 if (insn == head)
29981 break;
29984 else if (first_arg)
29985 avoid_func_arg_motion (first_arg, insn);
29988 /* Hook for pre-reload schedule - set priority of moves from likely spilled
29989 HW registers to maximum, to schedule them as soon as possible.  These are
29990 moves from function argument registers at the top of the function entry
29991 and moves from function return value registers after call. */
29992 static int
29993 ix86_adjust_priority (rtx_insn *insn, int priority)
29995 rtx set;
29997 if (reload_completed)
29998 return priority;
30000 if (!NONDEBUG_INSN_P (insn))
30001 return priority;
30003 set = single_set (insn);
30004 if (set)
30006 rtx tmp = SET_SRC (set);
30007 if (REG_P (tmp)
30008 && HARD_REGISTER_P (tmp)
30009 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
30010 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
30011 return current_sched_info->sched_max_insns_priority;
30014 return priority;
30017 /* Model decoder of Core 2/i7.
30018 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
30019 track the instruction fetch block boundaries and make sure that long
30020 (9+ bytes) instructions are assigned to D0. */
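/* A worked example of the model (a sketch using the Core 2/i7 parameters set
   below): the ifetch block is 16 bytes and at most 6 insns decode per cycle,
   so after issuing insns of 7 and 6 bytes in one cycle a further 5-byte insn
   no longer fits in the window and is filtered from the ready list, and an
   insn longer than 8 bytes is only accepted as the first insn of the cycle
   (it must go to decoder D0).  */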
30022 /* Maximum length of an insn that can be handled by
30023 a secondary decoder unit. '8' for Core 2/i7. */
30024 static int core2i7_secondary_decoder_max_insn_size;
30026 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
30027 '16' for Core 2/i7. */
30028 static int core2i7_ifetch_block_size;
30030 /* Maximum number of instructions decoder can handle per cycle.
30031 '6' for Core 2/i7. */
30032 static int core2i7_ifetch_block_max_insns;
30034 typedef struct ix86_first_cycle_multipass_data_ *
30035 ix86_first_cycle_multipass_data_t;
30036 typedef const struct ix86_first_cycle_multipass_data_ *
30037 const_ix86_first_cycle_multipass_data_t;
30039 /* A variable to store target state across calls to max_issue within
30040 one cycle. */
30041 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
30042 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
30044 /* Initialize DATA. */
30045 static void
30046 core2i7_first_cycle_multipass_init (void *_data)
30048 ix86_first_cycle_multipass_data_t data
30049 = (ix86_first_cycle_multipass_data_t) _data;
30051 data->ifetch_block_len = 0;
30052 data->ifetch_block_n_insns = 0;
30053 data->ready_try_change = NULL;
30054 data->ready_try_change_size = 0;
30057 /* Advancing the cycle; reset ifetch block counts. */
30058 static void
30059 core2i7_dfa_post_advance_cycle (void)
30061 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
30063 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
30065 data->ifetch_block_len = 0;
30066 data->ifetch_block_n_insns = 0;
30069 static int min_insn_size (rtx_insn *);
30071 /* Filter out insns from ready_try that the core will not be able to issue
30072 on current cycle due to decoder. */
30073 static void
30074 core2i7_first_cycle_multipass_filter_ready_try
30075 (const_ix86_first_cycle_multipass_data_t data,
30076 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
30078 while (n_ready--)
30080 rtx_insn *insn;
30081 int insn_size;
30083 if (ready_try[n_ready])
30084 continue;
30086 insn = get_ready_element (n_ready);
30087 insn_size = min_insn_size (insn);
30089 if (/* If this is too long an insn for a secondary decoder ... */
30090 (!first_cycle_insn_p
30091 && insn_size > core2i7_secondary_decoder_max_insn_size)
30092 /* ... or it would not fit into the ifetch block ... */
30093 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
30094 /* ... or the decoder is full already ... */
30095 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
30096 /* ... mask the insn out. */
30098 ready_try[n_ready] = 1;
30100 if (data->ready_try_change)
30101 bitmap_set_bit (data->ready_try_change, n_ready);
30106 /* Prepare for a new round of multipass lookahead scheduling. */
30107 static void
30108 core2i7_first_cycle_multipass_begin (void *_data,
30109 signed char *ready_try, int n_ready,
30110 bool first_cycle_insn_p)
30112 ix86_first_cycle_multipass_data_t data
30113 = (ix86_first_cycle_multipass_data_t) _data;
30114 const_ix86_first_cycle_multipass_data_t prev_data
30115 = ix86_first_cycle_multipass_data;
30117 /* Restore the state from the end of the previous round. */
30118 data->ifetch_block_len = prev_data->ifetch_block_len;
30119 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
30121 /* Filter instructions that cannot be issued on current cycle due to
30122 decoder restrictions. */
30123 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
30124 first_cycle_insn_p);
30127 /* INSN is being issued in current solution. Account for its impact on
30128 the decoder model. */
30129 static void
30130 core2i7_first_cycle_multipass_issue (void *_data,
30131 signed char *ready_try, int n_ready,
30132 rtx_insn *insn, const void *_prev_data)
30134 ix86_first_cycle_multipass_data_t data
30135 = (ix86_first_cycle_multipass_data_t) _data;
30136 const_ix86_first_cycle_multipass_data_t prev_data
30137 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
30139 int insn_size = min_insn_size (insn);
30141 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
30142 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
30143 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
30144 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
30146 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
30147 if (!data->ready_try_change)
30149 data->ready_try_change = sbitmap_alloc (n_ready);
30150 data->ready_try_change_size = n_ready;
30152 else if (data->ready_try_change_size < n_ready)
30154 data->ready_try_change = sbitmap_resize (data->ready_try_change,
30155 n_ready, 0);
30156 data->ready_try_change_size = n_ready;
30158 bitmap_clear (data->ready_try_change);
30160 /* Filter out insns from ready_try that the core will not be able to issue
30161 on current cycle due to decoder. */
30162 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
30163 false);
30166 /* Revert the effect on ready_try. */
30167 static void
30168 core2i7_first_cycle_multipass_backtrack (const void *_data,
30169 signed char *ready_try,
30170 int n_ready ATTRIBUTE_UNUSED)
30172 const_ix86_first_cycle_multipass_data_t data
30173 = (const_ix86_first_cycle_multipass_data_t) _data;
30174 unsigned int i = 0;
30175 sbitmap_iterator sbi;
30177 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
30178 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
30180 ready_try[i] = 0;
30184 /* Save the result of multipass lookahead scheduling for the next round. */
30185 static void
30186 core2i7_first_cycle_multipass_end (const void *_data)
30188 const_ix86_first_cycle_multipass_data_t data
30189 = (const_ix86_first_cycle_multipass_data_t) _data;
30190 ix86_first_cycle_multipass_data_t next_data
30191 = ix86_first_cycle_multipass_data;
30193 if (data != NULL)
30195 next_data->ifetch_block_len = data->ifetch_block_len;
30196 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
30200 /* Deallocate target data. */
30201 static void
30202 core2i7_first_cycle_multipass_fini (void *_data)
30204 ix86_first_cycle_multipass_data_t data
30205 = (ix86_first_cycle_multipass_data_t) _data;
30207 if (data->ready_try_change)
30209 sbitmap_free (data->ready_try_change);
30210 data->ready_try_change = NULL;
30211 data->ready_try_change_size = 0;
30215 /* Prepare for scheduling pass. */
30216 static void
30217 ix86_sched_init_global (FILE *, int, int)
30219 /* Install scheduling hooks for current CPU. Some of these hooks are used
30220 in time-critical parts of the scheduler, so we only set them up when
30221 they are actually used. */
30222 switch (ix86_tune)
30224 case PROCESSOR_CORE2:
30225 case PROCESSOR_NEHALEM:
30226 case PROCESSOR_SANDYBRIDGE:
30227 case PROCESSOR_HASWELL:
30228 /* Do not perform multipass scheduling for pre-reload schedule
30229 to save compile time. */
30230 if (reload_completed)
30232 targetm.sched.dfa_post_advance_cycle
30233 = core2i7_dfa_post_advance_cycle;
30234 targetm.sched.first_cycle_multipass_init
30235 = core2i7_first_cycle_multipass_init;
30236 targetm.sched.first_cycle_multipass_begin
30237 = core2i7_first_cycle_multipass_begin;
30238 targetm.sched.first_cycle_multipass_issue
30239 = core2i7_first_cycle_multipass_issue;
30240 targetm.sched.first_cycle_multipass_backtrack
30241 = core2i7_first_cycle_multipass_backtrack;
30242 targetm.sched.first_cycle_multipass_end
30243 = core2i7_first_cycle_multipass_end;
30244 targetm.sched.first_cycle_multipass_fini
30245 = core2i7_first_cycle_multipass_fini;
30247 /* Set decoder parameters. */
30248 core2i7_secondary_decoder_max_insn_size = 8;
30249 core2i7_ifetch_block_size = 16;
30250 core2i7_ifetch_block_max_insns = 6;
30251 break;
30253 /* Fall through. */
30254 default:
30255 targetm.sched.dfa_post_advance_cycle = NULL;
30256 targetm.sched.first_cycle_multipass_init = NULL;
30257 targetm.sched.first_cycle_multipass_begin = NULL;
30258 targetm.sched.first_cycle_multipass_issue = NULL;
30259 targetm.sched.first_cycle_multipass_backtrack = NULL;
30260 targetm.sched.first_cycle_multipass_end = NULL;
30261 targetm.sched.first_cycle_multipass_fini = NULL;
30262 break;
30267 /* Compute the alignment given to a constant that is being placed in memory.
30268 EXP is the constant and ALIGN is the alignment that the object would
30269 ordinarily have.
30270 The value of this function is used instead of that alignment to align
30271 the object. */
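/* Hedged examples of the result: a DFmode (double) constant is given 64-bit
   alignment, a 128-bit vector constant gets 128-bit alignment, and, unless
   optimizing for size, a string constant of length 31 or more is aligned to
   a word boundary.  */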
30274 ix86_constant_alignment (tree exp, int align)
30276 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
30277 || TREE_CODE (exp) == INTEGER_CST)
30279 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
30280 return 64;
30281 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
30282 return 128;
30284 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
30285 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
30286 return BITS_PER_WORD;
30288 return align;
30291 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
30292 the data type, and ALIGN is the alignment that the object would
30293 ordinarily have. */
30295 static int
30296 iamcu_alignment (tree type, int align)
30298 enum machine_mode mode;
30300 if (align < 32 || TYPE_USER_ALIGN (type))
30301 return align;
30303 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
30304 bytes. */
30305 mode = TYPE_MODE (strip_array_types (type));
30306 switch (GET_MODE_CLASS (mode))
30308 case MODE_INT:
30309 case MODE_COMPLEX_INT:
30310 case MODE_COMPLEX_FLOAT:
30311 case MODE_FLOAT:
30312 case MODE_DECIMAL_FLOAT:
30313 return 32;
30314 default:
30315 return align;
30319 /* Compute the alignment for a static variable.
30320 TYPE is the data type, and ALIGN is the alignment that
30321 the object would ordinarily have. The value of this function is used
30322 instead of that alignment to align the object. */
30325 ix86_data_alignment (tree type, int align, bool opt)
30327 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
30328 for symbols from other compilation units or symbols that don't need
30329 to bind locally. In order to preserve some ABI compatibility with
30330 those compilers, ensure we don't decrease alignment from what we
30331 used to assume. */
30333 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
30335 /* A data structure, equal or greater than the size of a cache line
30336 (64 bytes in the Pentium 4 and other recent Intel processors, including
30337 processors based on Intel Core microarchitecture) should be aligned
30338 so that its base address is a multiple of a cache line size. */
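/* For example (a sketch, assuming a 64-byte prefetch block): max_align below
   becomes 512 bits, so with OPT an aggregate of at least 64 bytes is aligned
   to a full cache line, while max_align_compat above keeps aggregates of at
   least 32 bytes aligned to 256 bits, matching what GCC 4.8 assumed.  */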
30340 int max_align
30341 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
30343 if (max_align < BITS_PER_WORD)
30344 max_align = BITS_PER_WORD;
30346 switch (ix86_align_data_type)
30348 case ix86_align_data_type_abi: opt = false; break;
30349 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
30350 case ix86_align_data_type_cacheline: break;
30353 if (TARGET_IAMCU)
30354 align = iamcu_alignment (type, align);
30356 if (opt
30357 && AGGREGATE_TYPE_P (type)
30358 && TYPE_SIZE (type)
30359 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
30361 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
30362 && align < max_align_compat)
30363 align = max_align_compat;
30364 if (wi::geu_p (TYPE_SIZE (type), max_align)
30365 && align < max_align)
30366 align = max_align;
30369 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
30370 to a 16-byte boundary. */
30371 if (TARGET_64BIT)
30373 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
30374 && TYPE_SIZE (type)
30375 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30376 && wi::geu_p (TYPE_SIZE (type), 128)
30377 && align < 128)
30378 return 128;
30381 if (!opt)
30382 return align;
30384 if (TREE_CODE (type) == ARRAY_TYPE)
30386 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30387 return 64;
30388 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30389 return 128;
30391 else if (TREE_CODE (type) == COMPLEX_TYPE)
30394 if (TYPE_MODE (type) == DCmode && align < 64)
30395 return 64;
30396 if ((TYPE_MODE (type) == XCmode
30397 || TYPE_MODE (type) == TCmode) && align < 128)
30398 return 128;
30400 else if ((TREE_CODE (type) == RECORD_TYPE
30401 || TREE_CODE (type) == UNION_TYPE
30402 || TREE_CODE (type) == QUAL_UNION_TYPE)
30403 && TYPE_FIELDS (type))
30405 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30406 return 64;
30407 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30408 return 128;
30410 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30411 || TREE_CODE (type) == INTEGER_TYPE)
30413 if (TYPE_MODE (type) == DFmode && align < 64)
30414 return 64;
30415 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30416 return 128;
30419 return align;
30422 /* Compute the alignment for a local variable or a stack slot. EXP is
30423 the data type or decl itself, MODE is the widest mode available and
30424 ALIGN is the alignment that the object would ordinarily have. The
30425 value of this macro is used instead of that alignment to align the
30426 object. */
30428 unsigned int
30429 ix86_local_alignment (tree exp, machine_mode mode,
30430 unsigned int align)
30432 tree type, decl;
30434 if (exp && DECL_P (exp))
30436 type = TREE_TYPE (exp);
30437 decl = exp;
30439 else
30441 type = exp;
30442 decl = NULL;
30445 /* Don't do dynamic stack realignment for long long objects with
30446 -mpreferred-stack-boundary=2. */
30447 if (!TARGET_64BIT
30448 && align == 64
30449 && ix86_preferred_stack_boundary < 64
30450 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
30451 && (!type || !TYPE_USER_ALIGN (type))
30452 && (!decl || !DECL_USER_ALIGN (decl)))
30453 align = 32;
30455 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
30456 register in MODE. Return the larger of the XF and DF
30457 alignments. */
30458 if (!type)
30460 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
30461 align = GET_MODE_ALIGNMENT (DFmode);
30462 return align;
30465 /* Don't increase alignment for Intel MCU psABI. */
30466 if (TARGET_IAMCU)
30467 return align;
30469 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
30470 to a 16-byte boundary. The exact wording is:
30472 An array uses the same alignment as its elements, except that a local or
30473 global array variable of length at least 16 bytes or
30474 a C99 variable-length array variable always has alignment of at least 16 bytes.
30476 This was added to allow use of aligned SSE instructions on arrays. The
30477 rule is meant for static storage (where the compiler cannot do the analysis
30478 by itself). We follow it for automatic variables only when convenient.
30479 We fully control everything in the function being compiled, and functions
30480 from other units cannot rely on the alignment.
30482 Exclude the va_list type. It is the common case of a local array where
30483 we cannot benefit from the alignment.
30485 TODO: Probably one should optimize for size only when the variable does not escape. */
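/* A sketch of the effect (illustrative values): in a 64-bit, SSE-enabled
   function optimized for speed, a local 'double buf[4]' (32 bytes) is
   bumped to 128-bit alignment by the check below, whereas a va_list
   object is deliberately skipped.  */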
30486 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30487 && TARGET_SSE)
30489 if (AGGREGATE_TYPE_P (type)
30490 && (va_list_type_node == NULL_TREE
30491 || (TYPE_MAIN_VARIANT (type)
30492 != TYPE_MAIN_VARIANT (va_list_type_node)))
30493 && TYPE_SIZE (type)
30494 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30495 && wi::geu_p (TYPE_SIZE (type), 128)
30496 && align < 128)
30497 return 128;
30499 if (TREE_CODE (type) == ARRAY_TYPE)
30501 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30502 return 64;
30503 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30504 return 128;
30506 else if (TREE_CODE (type) == COMPLEX_TYPE)
30508 if (TYPE_MODE (type) == DCmode && align < 64)
30509 return 64;
30510 if ((TYPE_MODE (type) == XCmode
30511 || TYPE_MODE (type) == TCmode) && align < 128)
30512 return 128;
30514 else if ((TREE_CODE (type) == RECORD_TYPE
30515 || TREE_CODE (type) == UNION_TYPE
30516 || TREE_CODE (type) == QUAL_UNION_TYPE)
30517 && TYPE_FIELDS (type))
30519 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30520 return 64;
30521 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30522 return 128;
30524 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30525 || TREE_CODE (type) == INTEGER_TYPE)
30528 if (TYPE_MODE (type) == DFmode && align < 64)
30529 return 64;
30530 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30531 return 128;
30533 return align;
30536 /* Compute the minimum required alignment for dynamic stack realignment
30537 purposes for a local variable, parameter or a stack slot. EXP is
30538 the data type or decl itself, MODE is its mode and ALIGN is the
30539 alignment that the object would ordinarily have. */
30541 unsigned int
30542 ix86_minimum_alignment (tree exp, machine_mode mode,
30543 unsigned int align)
30545 tree type, decl;
30547 if (exp && DECL_P (exp))
30549 type = TREE_TYPE (exp);
30550 decl = exp;
30552 else
30554 type = exp;
30555 decl = NULL;
30558 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30559 return align;
30561 /* Don't do dynamic stack realignment for long long objects with
30562 -mpreferred-stack-boundary=2. */
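/* For example (illustrative only): with -m32 -mpreferred-stack-boundary=2,
   a 'long long' local needs just 32-bit alignment here, so by itself it
   does not force dynamic realignment of the stack frame.  */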
30563 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30564 && (!type || !TYPE_USER_ALIGN (type))
30565 && (!decl || !DECL_USER_ALIGN (decl)))
30567 gcc_checking_assert (!TARGET_STV);
30568 return 32;
30571 return align;
30574 /* Find a location for the static chain incoming to a nested function.
30575 This is a register, unless all free registers are used by arguments. */
30577 static rtx
30578 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30580 unsigned regno;
30582 /* While this function won't be called by the middle-end when a static
30583 chain isn't needed, it's also used throughout the backend so it's
30584 easiest to keep this check centralized. */
30585 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
30586 return NULL;
30588 if (TARGET_64BIT)
30590 /* We always use R10 in 64-bit mode. */
30591 regno = R10_REG;
30593 else
30595 const_tree fntype, fndecl;
30596 unsigned int ccvt;
30598 /* By default in 32-bit mode we use ECX to pass the static chain. */
30599 regno = CX_REG;
30601 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30603 fntype = TREE_TYPE (fndecl_or_type);
30604 fndecl = fndecl_or_type;
30606 else
30608 fntype = fndecl_or_type;
30609 fndecl = NULL;
30612 ccvt = ix86_get_callcvt (fntype);
30613 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30615 /* Fastcall functions use ecx/edx for arguments, which leaves
30616 us with EAX for the static chain.
30617 Thiscall functions use ecx for arguments, which also
30618 leaves us with EAX for the static chain. */
30619 regno = AX_REG;
30621 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30623 /* Thiscall functions use ecx for arguments, which leaves
30624 us with EAX and EDX for the static chain.
30625 For ABI compatibility we use EAX. */
30626 regno = AX_REG;
30628 else if (ix86_function_regparm (fntype, fndecl) == 3)
30630 /* For regparm 3, we have no free call-clobbered registers in
30631 which to store the static chain. In order to implement this,
30632 we have the trampoline push the static chain to the stack.
30633 However, we can't push a value below the return address when
30634 we call the nested function directly, so we have to use an
30635 alternate entry point. For this we use ESI, and have the
30636 alternate entry point push ESI, so that things appear the
30637 same once we're executing the nested function. */
30638 if (incoming_p)
30640 if (fndecl == current_function_decl)
30641 ix86_static_chain_on_stack = true;
30642 return gen_frame_mem (SImode,
30643 plus_constant (Pmode,
30644 arg_pointer_rtx, -8));
30646 regno = SI_REG;
30650 return gen_rtx_REG (Pmode, regno);
30653 /* Emit RTL insns to initialize the variable parts of a trampoline.
30654 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30655 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30656 to be passed to the target function. */
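/* As a rough sketch of what the code below emits (byte values for
   illustration; immediates shown symbolically):

     64-bit:  49 bb <imm64>   movabs $fnaddr, %r11  (41 bb <imm32> when movl fits)
              49 ba <imm64>   movabs $chain,  %r10  (41 ba <imm32> for SImode)
              49 ff e3        jmp    *%r11
              90              nop  (pads the final 32-bit store)

     32-bit:  b8/b9 <imm32>   movl $chain, %eax/%ecx  (or 68 <imm32>: pushl $chain)
              e9 <rel32>      jmp  <fnaddr>  */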
30658 static void
30659 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30661 rtx mem, fnaddr;
30662 int opcode;
30663 int offset = 0;
30665 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30667 if (TARGET_64BIT)
30669 int size;
30671 /* Load the function address into r11. Try to load the address using
30672 the shorter movl instead of movabs. We may want to support
30673 movq for kernel mode, but the kernel does not use trampolines at
30674 the moment. FNADDR is a 32-bit address and may not be in
30675 DImode when ptr_mode == SImode. Always use movl in this
30676 case. */
30677 if (ptr_mode == SImode
30678 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30680 fnaddr = copy_addr_to_reg (fnaddr);
30682 mem = adjust_address (m_tramp, HImode, offset);
30683 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30685 mem = adjust_address (m_tramp, SImode, offset + 2);
30686 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30687 offset += 6;
30689 else
30691 mem = adjust_address (m_tramp, HImode, offset);
30692 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30694 mem = adjust_address (m_tramp, DImode, offset + 2);
30695 emit_move_insn (mem, fnaddr);
30696 offset += 10;
30699 /* Load the static chain into r10 using movabs. Use the shorter movl
30700 instead of movabs when ptr_mode == SImode. */
30701 if (ptr_mode == SImode)
30703 opcode = 0xba41;
30704 size = 6;
30706 else
30708 opcode = 0xba49;
30709 size = 10;
30712 mem = adjust_address (m_tramp, HImode, offset);
30713 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30715 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30716 emit_move_insn (mem, chain_value);
30717 offset += size;
30719 /* Jump to r11; the last (unused) byte is a nop, only there to
30720 pad the write out to a single 32-bit store. */
30721 mem = adjust_address (m_tramp, SImode, offset);
30722 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30723 offset += 4;
30725 else
30727 rtx disp, chain;
30729 /* Depending on the static chain location, either load a register
30730 with a constant, or push the constant to the stack. All of the
30731 instructions are the same size. */
30732 chain = ix86_static_chain (fndecl, true);
30733 if (REG_P (chain))
30735 switch (REGNO (chain))
30737 case AX_REG:
30738 opcode = 0xb8; break;
30739 case CX_REG:
30740 opcode = 0xb9; break;
30741 default:
30742 gcc_unreachable ();
30745 else
30746 opcode = 0x68;
30748 mem = adjust_address (m_tramp, QImode, offset);
30749 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30751 mem = adjust_address (m_tramp, SImode, offset + 1);
30752 emit_move_insn (mem, chain_value);
30753 offset += 5;
30755 mem = adjust_address (m_tramp, QImode, offset);
30756 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30758 mem = adjust_address (m_tramp, SImode, offset + 1);
30760 /* Compute offset from the end of the jmp to the target function.
30761 In the case in which the trampoline stores the static chain on
30762 the stack, we need to skip the first insn which pushes the
30763 (call-saved) register static chain; this push is 1 byte. */
30764 offset += 5;
30765 disp = expand_binop (SImode, sub_optab, fnaddr,
30766 plus_constant (Pmode, XEXP (m_tramp, 0),
30767 offset - (MEM_P (chain) ? 1 : 0)),
30768 NULL_RTX, 1, OPTAB_DIRECT);
30769 emit_move_insn (mem, disp);
30772 gcc_assert (offset <= TRAMPOLINE_SIZE);
30774 #ifdef HAVE_ENABLE_EXECUTE_STACK
30775 #ifdef CHECK_EXECUTE_STACK_ENABLED
30776 if (CHECK_EXECUTE_STACK_ENABLED)
30777 #endif
30778 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30779 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
30780 #endif
30783 /* The following file contains several enumerations and data structures
30784 built from the definitions in i386-builtin-types.def. */
30786 #include "i386-builtin-types.inc"
30788 /* Table for the ix86 builtin non-function types. */
30789 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30791 /* Retrieve an element from the above table, building some of
30792 the types lazily. */
30794 static tree
30795 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30797 unsigned int index;
30798 tree type, itype;
30800 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30802 type = ix86_builtin_type_tab[(int) tcode];
30803 if (type != NULL)
30804 return type;
30806 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30807 if (tcode <= IX86_BT_LAST_VECT)
30809 machine_mode mode;
30811 index = tcode - IX86_BT_LAST_PRIM - 1;
30812 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30813 mode = ix86_builtin_type_vect_mode[index];
30815 type = build_vector_type_for_mode (itype, mode);
30817 else
30819 int quals;
30821 index = tcode - IX86_BT_LAST_VECT - 1;
30822 if (tcode <= IX86_BT_LAST_PTR)
30823 quals = TYPE_UNQUALIFIED;
30824 else
30825 quals = TYPE_QUAL_CONST;
30827 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30828 if (quals != TYPE_UNQUALIFIED)
30829 itype = build_qualified_type (itype, quals);
30831 type = build_pointer_type (itype);
30834 ix86_builtin_type_tab[(int) tcode] = type;
30835 return type;
30838 /* Table for the ix86 builtin function types. */
30839 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30841 /* Retrieve an element from the above table, building some of
30842 the types lazily. */
30844 static tree
30845 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30847 tree type;
30849 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30851 type = ix86_builtin_func_type_tab[(int) tcode];
30852 if (type != NULL)
30853 return type;
30855 if (tcode <= IX86_BT_LAST_FUNC)
30857 unsigned start = ix86_builtin_func_start[(int) tcode];
30858 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30859 tree rtype, atype, args = void_list_node;
30860 unsigned i;
30862 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30863 for (i = after - 1; i > start; --i)
30865 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30866 args = tree_cons (NULL, atype, args);
30869 type = build_function_type (rtype, args);
30871 else
30873 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30874 enum ix86_builtin_func_type icode;
30876 icode = ix86_builtin_func_alias_base[index];
30877 type = ix86_get_builtin_func_type (icode);
30880 ix86_builtin_func_type_tab[(int) tcode] = type;
30881 return type;
30885 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30886 bdesc_* arrays below should come first, then builtins for each bdesc_*
30887 array in ascending order, so that we can use direct array accesses. */
30888 enum ix86_builtins
30890 IX86_BUILTIN_MASKMOVQ,
30891 IX86_BUILTIN_LDMXCSR,
30892 IX86_BUILTIN_STMXCSR,
30893 IX86_BUILTIN_MASKMOVDQU,
30894 IX86_BUILTIN_PSLLDQ128,
30895 IX86_BUILTIN_CLFLUSH,
30896 IX86_BUILTIN_MONITOR,
30897 IX86_BUILTIN_MWAIT,
30898 IX86_BUILTIN_CLZERO,
30899 IX86_BUILTIN_VEC_INIT_V2SI,
30900 IX86_BUILTIN_VEC_INIT_V4HI,
30901 IX86_BUILTIN_VEC_INIT_V8QI,
30902 IX86_BUILTIN_VEC_EXT_V2DF,
30903 IX86_BUILTIN_VEC_EXT_V2DI,
30904 IX86_BUILTIN_VEC_EXT_V4SF,
30905 IX86_BUILTIN_VEC_EXT_V4SI,
30906 IX86_BUILTIN_VEC_EXT_V8HI,
30907 IX86_BUILTIN_VEC_EXT_V2SI,
30908 IX86_BUILTIN_VEC_EXT_V4HI,
30909 IX86_BUILTIN_VEC_EXT_V16QI,
30910 IX86_BUILTIN_VEC_SET_V2DI,
30911 IX86_BUILTIN_VEC_SET_V4SF,
30912 IX86_BUILTIN_VEC_SET_V4SI,
30913 IX86_BUILTIN_VEC_SET_V8HI,
30914 IX86_BUILTIN_VEC_SET_V4HI,
30915 IX86_BUILTIN_VEC_SET_V16QI,
30916 IX86_BUILTIN_GATHERSIV2DF,
30917 IX86_BUILTIN_GATHERSIV4DF,
30918 IX86_BUILTIN_GATHERDIV2DF,
30919 IX86_BUILTIN_GATHERDIV4DF,
30920 IX86_BUILTIN_GATHERSIV4SF,
30921 IX86_BUILTIN_GATHERSIV8SF,
30922 IX86_BUILTIN_GATHERDIV4SF,
30923 IX86_BUILTIN_GATHERDIV8SF,
30924 IX86_BUILTIN_GATHERSIV2DI,
30925 IX86_BUILTIN_GATHERSIV4DI,
30926 IX86_BUILTIN_GATHERDIV2DI,
30927 IX86_BUILTIN_GATHERDIV4DI,
30928 IX86_BUILTIN_GATHERSIV4SI,
30929 IX86_BUILTIN_GATHERSIV8SI,
30930 IX86_BUILTIN_GATHERDIV4SI,
30931 IX86_BUILTIN_GATHERDIV8SI,
30932 IX86_BUILTIN_VFMSUBSD3_MASK3,
30933 IX86_BUILTIN_VFMSUBSS3_MASK3,
30934 IX86_BUILTIN_GATHER3SIV8SF,
30935 IX86_BUILTIN_GATHER3SIV4SF,
30936 IX86_BUILTIN_GATHER3SIV4DF,
30937 IX86_BUILTIN_GATHER3SIV2DF,
30938 IX86_BUILTIN_GATHER3DIV8SF,
30939 IX86_BUILTIN_GATHER3DIV4SF,
30940 IX86_BUILTIN_GATHER3DIV4DF,
30941 IX86_BUILTIN_GATHER3DIV2DF,
30942 IX86_BUILTIN_GATHER3SIV8SI,
30943 IX86_BUILTIN_GATHER3SIV4SI,
30944 IX86_BUILTIN_GATHER3SIV4DI,
30945 IX86_BUILTIN_GATHER3SIV2DI,
30946 IX86_BUILTIN_GATHER3DIV8SI,
30947 IX86_BUILTIN_GATHER3DIV4SI,
30948 IX86_BUILTIN_GATHER3DIV4DI,
30949 IX86_BUILTIN_GATHER3DIV2DI,
30950 IX86_BUILTIN_SCATTERSIV8SF,
30951 IX86_BUILTIN_SCATTERSIV4SF,
30952 IX86_BUILTIN_SCATTERSIV4DF,
30953 IX86_BUILTIN_SCATTERSIV2DF,
30954 IX86_BUILTIN_SCATTERDIV8SF,
30955 IX86_BUILTIN_SCATTERDIV4SF,
30956 IX86_BUILTIN_SCATTERDIV4DF,
30957 IX86_BUILTIN_SCATTERDIV2DF,
30958 IX86_BUILTIN_SCATTERSIV8SI,
30959 IX86_BUILTIN_SCATTERSIV4SI,
30960 IX86_BUILTIN_SCATTERSIV4DI,
30961 IX86_BUILTIN_SCATTERSIV2DI,
30962 IX86_BUILTIN_SCATTERDIV8SI,
30963 IX86_BUILTIN_SCATTERDIV4SI,
30964 IX86_BUILTIN_SCATTERDIV4DI,
30965 IX86_BUILTIN_SCATTERDIV2DI,
30966 /* Alternate 4- and 8-element gather/scatter for the vectorizer,
30967 where all operands are 32 or 64 bytes wide, respectively. */
30968 IX86_BUILTIN_GATHERALTSIV4DF,
30969 IX86_BUILTIN_GATHERALTDIV8SF,
30970 IX86_BUILTIN_GATHERALTSIV4DI,
30971 IX86_BUILTIN_GATHERALTDIV8SI,
30972 IX86_BUILTIN_GATHER3ALTDIV16SF,
30973 IX86_BUILTIN_GATHER3ALTDIV16SI,
30974 IX86_BUILTIN_GATHER3ALTSIV4DF,
30975 IX86_BUILTIN_GATHER3ALTDIV8SF,
30976 IX86_BUILTIN_GATHER3ALTSIV4DI,
30977 IX86_BUILTIN_GATHER3ALTDIV8SI,
30978 IX86_BUILTIN_GATHER3ALTSIV8DF,
30979 IX86_BUILTIN_GATHER3ALTSIV8DI,
30980 IX86_BUILTIN_GATHER3DIV16SF,
30981 IX86_BUILTIN_GATHER3DIV16SI,
30982 IX86_BUILTIN_GATHER3DIV8DF,
30983 IX86_BUILTIN_GATHER3DIV8DI,
30984 IX86_BUILTIN_GATHER3SIV16SF,
30985 IX86_BUILTIN_GATHER3SIV16SI,
30986 IX86_BUILTIN_GATHER3SIV8DF,
30987 IX86_BUILTIN_GATHER3SIV8DI,
30988 IX86_BUILTIN_SCATTERALTSIV8DF,
30989 IX86_BUILTIN_SCATTERALTDIV16SF,
30990 IX86_BUILTIN_SCATTERALTSIV8DI,
30991 IX86_BUILTIN_SCATTERALTDIV16SI,
30992 IX86_BUILTIN_SCATTERDIV16SF,
30993 IX86_BUILTIN_SCATTERDIV16SI,
30994 IX86_BUILTIN_SCATTERDIV8DF,
30995 IX86_BUILTIN_SCATTERDIV8DI,
30996 IX86_BUILTIN_SCATTERSIV16SF,
30997 IX86_BUILTIN_SCATTERSIV16SI,
30998 IX86_BUILTIN_SCATTERSIV8DF,
30999 IX86_BUILTIN_SCATTERSIV8DI,
31000 IX86_BUILTIN_GATHERPFQPD,
31001 IX86_BUILTIN_GATHERPFDPS,
31002 IX86_BUILTIN_GATHERPFDPD,
31003 IX86_BUILTIN_GATHERPFQPS,
31004 IX86_BUILTIN_SCATTERPFDPD,
31005 IX86_BUILTIN_SCATTERPFDPS,
31006 IX86_BUILTIN_SCATTERPFQPD,
31007 IX86_BUILTIN_SCATTERPFQPS,
31008 IX86_BUILTIN_CLWB,
31009 IX86_BUILTIN_CLFLUSHOPT,
31010 IX86_BUILTIN_INFQ,
31011 IX86_BUILTIN_HUGE_VALQ,
31012 IX86_BUILTIN_NANQ,
31013 IX86_BUILTIN_NANSQ,
31014 IX86_BUILTIN_XABORT,
31015 IX86_BUILTIN_ADDCARRYX32,
31016 IX86_BUILTIN_ADDCARRYX64,
31017 IX86_BUILTIN_SBB32,
31018 IX86_BUILTIN_SBB64,
31019 IX86_BUILTIN_RDRAND16_STEP,
31020 IX86_BUILTIN_RDRAND32_STEP,
31021 IX86_BUILTIN_RDRAND64_STEP,
31022 IX86_BUILTIN_RDSEED16_STEP,
31023 IX86_BUILTIN_RDSEED32_STEP,
31024 IX86_BUILTIN_RDSEED64_STEP,
31025 IX86_BUILTIN_MONITORX,
31026 IX86_BUILTIN_MWAITX,
31027 IX86_BUILTIN_CFSTRING,
31028 IX86_BUILTIN_CPU_INIT,
31029 IX86_BUILTIN_CPU_IS,
31030 IX86_BUILTIN_CPU_SUPPORTS,
31031 IX86_BUILTIN_READ_FLAGS,
31032 IX86_BUILTIN_WRITE_FLAGS,
31034 /* All the remaining builtins are tracked in bdesc_* arrays in
31035 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
31036 this point. */
31037 #define BDESC(mask, icode, name, code, comparison, flag) \
31038 code,
31039 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31040 code, \
31041 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
31042 #define BDESC_END(kind, next_kind)
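/* Sketch of the intended expansion (IX86_BUILTIN_FOO below is purely
   illustrative):

     BDESC_FIRST (args, ARGS, mask, icode, "name", IX86_BUILTIN_FOO, cmp, flag)

   expands to

     IX86_BUILTIN_FOO,
     IX86_BUILTIN__BDESC_ARGS_FIRST = IX86_BUILTIN_FOO,

   and every plain BDESC line contributes just its CODE enumerator.  */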
31044 #include "i386-builtin.def"
31046 #undef BDESC
31047 #undef BDESC_FIRST
31048 #undef BDESC_END
31050 IX86_BUILTIN_MAX,
31052 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
31054 /* Now just the aliases for bdesc_* start/end. */
31055 #define BDESC(mask, icode, name, code, comparison, flag)
31056 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
31057 #define BDESC_END(kind, next_kind) \
31058 IX86_BUILTIN__BDESC_##kind##_LAST \
31059 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
31061 #include "i386-builtin.def"
31063 #undef BDESC
31064 #undef BDESC_FIRST
31065 #undef BDESC_END
31067 /* Just to make sure there is no comma after the last enumerator. */
31068 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
31071 /* Table for the ix86 builtin decls. */
31072 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
31074 /* Table of all of the builtin functions that are possible with different ISAs
31075 but are waiting to be built until a function is declared to use that
31076 ISA. */
31077 struct builtin_isa {
31078 const char *name; /* function name */
31079 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
31080 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
31081 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
31082 bool const_p; /* true if the declaration is constant */
31083 bool leaf_p; /* true if the declaration has leaf attribute */
31084 bool nothrow_p; /* true if the declaration has nothrow attribute */
31085 bool set_and_not_built_p;
31088 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
31090 /* Bits that can still enable any inclusion of a builtin. */
31091 static HOST_WIDE_INT deferred_isa_values = 0;
31092 static HOST_WIDE_INT deferred_isa_values2 = 0;
31094 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
31095 of isa_flags to use in the ix86_builtins_isa array. Store the
31096 function decl in the ix86_builtins array. Return the function decl or
31097 NULL_TREE if the builtin was not added.
31099 If the front end has a special hook for builtin functions, delay adding
31100 builtin functions that aren't in the current ISA until the ISA is changed
31101 with function specific optimization. Doing so can save about 300K for the
31102 default compiler. When the builtin is expanded, check at that time whether
31103 it is valid.
31105 If the front end doesn't have a special hook, record all builtins, even if
31106 they aren't in the current ISA, in case the user uses function specific
31107 options for a different ISA, so that we don't get scope errors if a
31108 builtin is added in the middle of a function scope. */
31110 static inline tree
31111 def_builtin (HOST_WIDE_INT mask, const char *name,
31112 enum ix86_builtin_func_type tcode,
31113 enum ix86_builtins code)
31115 tree decl = NULL_TREE;
31117 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
31119 ix86_builtins_isa[(int) code].isa = mask;
31121 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
31122 where any set bit means that the built-in is enabled, this bit must be *and-ed*
31123 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
31124 means that *both* cpuid bits must be set for the built-in to be available.
31125 Handle this here. */
31126 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
31127 mask &= ~OPTION_MASK_ISA_AVX512VL;
31129 mask &= ~OPTION_MASK_ISA_64BIT;
31130 if (mask == 0
31131 || (mask & ix86_isa_flags) != 0
31132 || (lang_hooks.builtin_function
31133 == lang_hooks.builtin_function_ext_scope))
31136 tree type = ix86_get_builtin_func_type (tcode);
31137 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
31138 NULL, NULL_TREE);
31139 ix86_builtins[(int) code] = decl;
31140 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
31142 else
31144 /* Just a MASK where set_and_not_built_p == true can potentially
31145 include a builtin. */
31146 deferred_isa_values |= mask;
31147 ix86_builtins[(int) code] = NULL_TREE;
31148 ix86_builtins_isa[(int) code].tcode = tcode;
31149 ix86_builtins_isa[(int) code].name = name;
31150 ix86_builtins_isa[(int) code].leaf_p = false;
31151 ix86_builtins_isa[(int) code].nothrow_p = false;
31152 ix86_builtins_isa[(int) code].const_p = false;
31153 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
31157 return decl;
31160 /* Like def_builtin, but also marks the function decl "const". */
31162 static inline tree
31163 def_builtin_const (HOST_WIDE_INT mask, const char *name,
31164 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
31166 tree decl = def_builtin (mask, name, tcode, code);
31167 if (decl)
31168 TREE_READONLY (decl) = 1;
31169 else
31170 ix86_builtins_isa[(int) code].const_p = true;
31172 return decl;
31175 /* Like def_builtin, but for additional isa2 flags. */
31177 static inline tree
31178 def_builtin2 (HOST_WIDE_INT mask, const char *name,
31179 enum ix86_builtin_func_type tcode,
31180 enum ix86_builtins code)
31182 tree decl = NULL_TREE;
31184 ix86_builtins_isa[(int) code].isa2 = mask;
31186 if (mask == 0
31187 || (mask & ix86_isa_flags2) != 0
31188 || (lang_hooks.builtin_function
31189 == lang_hooks.builtin_function_ext_scope))
31192 tree type = ix86_get_builtin_func_type (tcode);
31193 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
31194 NULL, NULL_TREE);
31195 ix86_builtins[(int) code] = decl;
31196 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
31198 else
31200 /* Just a MASK where set_and_not_built_p == true can potentially
31201 include a builtin. */
31202 deferred_isa_values2 |= mask;
31203 ix86_builtins[(int) code] = NULL_TREE;
31204 ix86_builtins_isa[(int) code].tcode = tcode;
31205 ix86_builtins_isa[(int) code].name = name;
31206 ix86_builtins_isa[(int) code].leaf_p = false;
31207 ix86_builtins_isa[(int) code].nothrow_p = false;
31208 ix86_builtins_isa[(int) code].const_p = false;
31209 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
31212 return decl;
31215 /* Like def_builtin, but also marks the function decl "const". */
31217 static inline tree
31218 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
31219 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
31221 tree decl = def_builtin2 (mask, name, tcode, code);
31222 if (decl)
31223 TREE_READONLY (decl) = 1;
31224 else
31225 ix86_builtins_isa[(int) code].const_p = true;
31227 return decl;
31230 /* Add any new builtin functions for a given ISA that may not have been
31231 declared. This saves a bit of space compared to adding all of the
31232 declarations to the tree, even if we didn't use them. */
31234 static void
31235 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
31237 if ((isa & deferred_isa_values) == 0
31238 && (isa2 & deferred_isa_values2) == 0)
31239 return;
31241 /* Bits in ISA value can be removed from potential isa values. */
31242 deferred_isa_values &= ~isa;
31243 deferred_isa_values2 &= ~isa2;
31245 int i;
31246 tree saved_current_target_pragma = current_target_pragma;
31247 current_target_pragma = NULL_TREE;
31249 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
31251 if (((ix86_builtins_isa[i].isa & isa) != 0
31252 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
31253 && ix86_builtins_isa[i].set_and_not_built_p)
31255 tree decl, type;
31257 /* Don't define the builtin again. */
31258 ix86_builtins_isa[i].set_and_not_built_p = false;
31260 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
31261 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
31262 type, i, BUILT_IN_MD, NULL,
31263 NULL_TREE);
31265 ix86_builtins[i] = decl;
31266 if (ix86_builtins_isa[i].const_p)
31267 TREE_READONLY (decl) = 1;
31268 if (ix86_builtins_isa[i].leaf_p)
31269 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31270 NULL_TREE);
31271 if (ix86_builtins_isa[i].nothrow_p)
31272 TREE_NOTHROW (decl) = 1;
31276 current_target_pragma = saved_current_target_pragma;
31279 /* Bits for builtin_description.flag. */
31281 /* Set when we don't support the comparison natively, and should
31282 swap_comparison in order to support it. */
31283 #define BUILTIN_DESC_SWAP_OPERANDS 1
31285 struct builtin_description
31287 const HOST_WIDE_INT mask;
31288 const enum insn_code icode;
31289 const char *const name;
31290 const enum ix86_builtins code;
31291 const enum rtx_code comparison;
31292 const int flag;
31295 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
31296 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
31297 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
31298 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
31299 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
31300 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
31301 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
31302 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
31303 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
31304 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
31305 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
31306 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
31307 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
31308 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
31309 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
31310 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
31311 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
31312 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
31313 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
31314 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
31315 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
31316 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
31317 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
31318 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
31319 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
31320 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
31321 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
31322 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
31323 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
31324 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
31325 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
31326 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
31327 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
31328 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
31329 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
31330 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
31331 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
31332 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
31333 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
31334 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
31335 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
31336 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
31337 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
31338 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
31339 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
31340 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
31341 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
31342 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
31343 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
31344 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
31345 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
31346 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
31348 #define BDESC(mask, icode, name, code, comparison, flag) \
31349 { mask, icode, name, code, comparison, flag },
31350 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31351 static const struct builtin_description bdesc_##kind[] = \
31353 BDESC (mask, icode, name, code, comparison, flag)
31354 #define BDESC_END(kind, next_kind) \
31357 #include "i386-builtin.def"
31359 #undef BDESC
31360 #undef BDESC_FIRST
31361 #undef BDESC_END
31363 /* TM vector builtins. */
31365 /* Reuse the existing x86-specific `struct builtin_description' because
31366 we're lazy. Add casts to make them fit. */
31367 static const struct builtin_description bdesc_tm[] =
31369 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31370 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31371 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31372 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31373 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31374 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31375 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31377 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31378 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31379 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31380 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31381 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31382 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31383 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31385 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31386 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31387 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31388 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31389 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31390 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31391 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31393 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
31394 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31395 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31398 /* Initialize the transactional memory vector load/store builtins. */
31400 static void
31401 ix86_init_tm_builtins (void)
31403 enum ix86_builtin_func_type ftype;
31404 const struct builtin_description *d;
31405 size_t i;
31406 tree decl;
31407 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31408 tree attrs_log, attrs_type_log;
31410 if (!flag_tm)
31411 return;
31413 /* If there are no builtins defined, we must be compiling in a
31414 language without trans-mem support. */
31415 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31416 return;
31418 /* Use whatever attributes a normal TM load has. */
31419 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31420 attrs_load = DECL_ATTRIBUTES (decl);
31421 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31422 /* Use whatever attributes a normal TM store has. */
31423 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31424 attrs_store = DECL_ATTRIBUTES (decl);
31425 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31426 /* Use whatever attributes a normal TM log has. */
31427 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31428 attrs_log = DECL_ATTRIBUTES (decl);
31429 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31431 for (i = 0, d = bdesc_tm;
31432 i < ARRAY_SIZE (bdesc_tm);
31433 i++, d++)
31435 if ((d->mask & ix86_isa_flags) != 0
31436 || (lang_hooks.builtin_function
31437 == lang_hooks.builtin_function_ext_scope))
31439 tree type, attrs, attrs_type;
31440 enum built_in_function code = (enum built_in_function) d->code;
31442 ftype = (enum ix86_builtin_func_type) d->flag;
31443 type = ix86_get_builtin_func_type (ftype);
31445 if (BUILTIN_TM_LOAD_P (code))
31447 attrs = attrs_load;
31448 attrs_type = attrs_type_load;
31450 else if (BUILTIN_TM_STORE_P (code))
31452 attrs = attrs_store;
31453 attrs_type = attrs_type_store;
31455 else
31457 attrs = attrs_log;
31458 attrs_type = attrs_type_log;
31460 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31461 /* The builtin without the prefix for
31462 calling it directly. */
31463 d->name + strlen ("__builtin_"),
31464 attrs);
31465 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31466 set the TYPE_ATTRIBUTES. */
31467 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31469 set_builtin_decl (code, decl, false);
31474 /* Macros for verification of enum ix86_builtins order. */
31475 #define BDESC_VERIFY(x, y, z) \
31476 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31477 #define BDESC_VERIFYS(x, y, z) \
31478 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31480 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31481 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31482 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31483 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31484 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31485 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31486 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31487 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31488 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31489 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31490 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
31491 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31492 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31493 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
31494 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31495 IX86_BUILTIN__BDESC_MPX_LAST, 1);
31496 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31497 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31498 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31499 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31501 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
31502 in the current target ISA, to allow the user to compile particular modules
31503 with target specific options that differ from the command line
31504 options. */
31505 static void
31506 ix86_init_mmx_sse_builtins (void)
31508 const struct builtin_description * d;
31509 enum ix86_builtin_func_type ftype;
31510 size_t i;
31512 /* Add all special builtins with variable number of operands. */
31513 for (i = 0, d = bdesc_special_args;
31514 i < ARRAY_SIZE (bdesc_special_args);
31515 i++, d++)
31517 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31518 if (d->name == 0)
31519 continue;
31521 ftype = (enum ix86_builtin_func_type) d->flag;
31522 def_builtin (d->mask, d->name, ftype, d->code);
31524 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31525 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31526 ARRAY_SIZE (bdesc_special_args) - 1);
31528 /* Add all builtins with variable number of operands. */
31529 for (i = 0, d = bdesc_args;
31530 i < ARRAY_SIZE (bdesc_args);
31531 i++, d++)
31533 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31534 if (d->name == 0)
31535 continue;
31537 ftype = (enum ix86_builtin_func_type) d->flag;
31538 def_builtin_const (d->mask, d->name, ftype, d->code);
31540 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31541 IX86_BUILTIN__BDESC_ARGS_FIRST,
31542 ARRAY_SIZE (bdesc_args) - 1);
31544 /* Add all builtins with variable number of operands. */
31545 for (i = 0, d = bdesc_args2;
31546 i < ARRAY_SIZE (bdesc_args2);
31547 i++, d++)
31549 if (d->name == 0)
31550 continue;
31552 ftype = (enum ix86_builtin_func_type) d->flag;
31553 def_builtin_const2 (d->mask, d->name, ftype, d->code);
31556 /* Add all builtins with rounding. */
31557 for (i = 0, d = bdesc_round_args;
31558 i < ARRAY_SIZE (bdesc_round_args);
31559 i++, d++)
31561 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31562 if (d->name == 0)
31563 continue;
31565 ftype = (enum ix86_builtin_func_type) d->flag;
31566 def_builtin_const (d->mask, d->name, ftype, d->code);
31568 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31569 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31570 ARRAY_SIZE (bdesc_round_args) - 1);
31572 /* pcmpestr[im] insns. */
31573 for (i = 0, d = bdesc_pcmpestr;
31574 i < ARRAY_SIZE (bdesc_pcmpestr);
31575 i++, d++)
31577 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31578 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31579 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31580 else
31581 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31582 def_builtin_const (d->mask, d->name, ftype, d->code);
31584 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31585 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31586 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31588 /* pcmpistr[im] insns. */
31589 for (i = 0, d = bdesc_pcmpistr;
31590 i < ARRAY_SIZE (bdesc_pcmpistr);
31591 i++, d++)
31593 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31594 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31595 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31596 else
31597 ftype = INT_FTYPE_V16QI_V16QI_INT;
31598 def_builtin_const (d->mask, d->name, ftype, d->code);
31600 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31601 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31602 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31604 /* comi/ucomi insns. */
31605 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31607 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31608 if (d->mask == OPTION_MASK_ISA_SSE2)
31609 ftype = INT_FTYPE_V2DF_V2DF;
31610 else
31611 ftype = INT_FTYPE_V4SF_V4SF;
31612 def_builtin_const (d->mask, d->name, ftype, d->code);
31614 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31615 IX86_BUILTIN__BDESC_COMI_FIRST,
31616 ARRAY_SIZE (bdesc_comi) - 1);
31618 /* SSE */
31619 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31620 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31621 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31622 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31624 /* SSE or 3DNow!A */
31625 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31626 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31627 IX86_BUILTIN_MASKMOVQ);
31629 /* SSE2 */
31630 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31631 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31633 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31634 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31635 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31636 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31638 /* SSE3. */
31639 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31640 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31641 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31642 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31644 /* AES */
31645 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
31646 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31647 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
31648 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31649 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
31650 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31651 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
31652 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31653 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
31654 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31655 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
31656 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31658 /* PCLMUL */
31659 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
31660 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31662 /* RDRND */
31663 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31664 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31665 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31666 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31667 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31668 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31669 IX86_BUILTIN_RDRAND64_STEP);
31671 /* AVX2 */
31672 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31673 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31674 IX86_BUILTIN_GATHERSIV2DF);
31676 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31677 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31678 IX86_BUILTIN_GATHERSIV4DF);
31680 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31681 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31682 IX86_BUILTIN_GATHERDIV2DF);
31684 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31685 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31686 IX86_BUILTIN_GATHERDIV4DF);
31688 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31689 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31690 IX86_BUILTIN_GATHERSIV4SF);
31692 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31693 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31694 IX86_BUILTIN_GATHERSIV8SF);
31696 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31697 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31698 IX86_BUILTIN_GATHERDIV4SF);
31700 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31701 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31702 IX86_BUILTIN_GATHERDIV8SF);
31704 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31705 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31706 IX86_BUILTIN_GATHERSIV2DI);
31708 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31709 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31710 IX86_BUILTIN_GATHERSIV4DI);
31712 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31713 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31714 IX86_BUILTIN_GATHERDIV2DI);
31716 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31717 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31718 IX86_BUILTIN_GATHERDIV4DI);
31720 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31721 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31722 IX86_BUILTIN_GATHERSIV4SI);
31724 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31725 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31726 IX86_BUILTIN_GATHERSIV8SI);
31728 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31729 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31730 IX86_BUILTIN_GATHERDIV4SI);
31732 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31733 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31734 IX86_BUILTIN_GATHERDIV8SI);
31736 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31737 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31738 IX86_BUILTIN_GATHERALTSIV4DF);
31740 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31741 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31742 IX86_BUILTIN_GATHERALTDIV8SF);
31744 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31745 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31746 IX86_BUILTIN_GATHERALTSIV4DI);
31748 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31749 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31750 IX86_BUILTIN_GATHERALTDIV8SI);
31752 /* AVX512F */
31753 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31754 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
31755 IX86_BUILTIN_GATHER3SIV16SF);
31757 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31758 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
31759 IX86_BUILTIN_GATHER3SIV8DF);
31761 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31762 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
31763 IX86_BUILTIN_GATHER3DIV16SF);
31765 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31766 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
31767 IX86_BUILTIN_GATHER3DIV8DF);
31769 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31770 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
31771 IX86_BUILTIN_GATHER3SIV16SI);
31773 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31774 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
31775 IX86_BUILTIN_GATHER3SIV8DI);
31777 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31778 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
31779 IX86_BUILTIN_GATHER3DIV16SI);
31781 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31782 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
31783 IX86_BUILTIN_GATHER3DIV8DI);
31785 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31786 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31787 IX86_BUILTIN_GATHER3ALTSIV8DF);
31789 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31790 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31791 IX86_BUILTIN_GATHER3ALTDIV16SF);
31793 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31794 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31795 IX86_BUILTIN_GATHER3ALTSIV8DI);
31797 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31798 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31799 IX86_BUILTIN_GATHER3ALTDIV16SI);
31801 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31802 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
31803 IX86_BUILTIN_SCATTERSIV16SF);
31805 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31806 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
31807 IX86_BUILTIN_SCATTERSIV8DF);
31809 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31810 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
31811 IX86_BUILTIN_SCATTERDIV16SF);
31813 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31814 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
31815 IX86_BUILTIN_SCATTERDIV8DF);
31817 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31818 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
31819 IX86_BUILTIN_SCATTERSIV16SI);
31821 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31822 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
31823 IX86_BUILTIN_SCATTERSIV8DI);
31825 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31826 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
31827 IX86_BUILTIN_SCATTERDIV16SI);
31829 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31830 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
31831 IX86_BUILTIN_SCATTERDIV8DI);
31833 /* AVX512VL */
31834 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31835 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
31836 IX86_BUILTIN_GATHER3SIV2DF);
31838 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31839 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
31840 IX86_BUILTIN_GATHER3SIV4DF);
31842 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31843 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
31844 IX86_BUILTIN_GATHER3DIV2DF);
31846 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31847 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
31848 IX86_BUILTIN_GATHER3DIV4DF);
31850 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31851 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
31852 IX86_BUILTIN_GATHER3SIV4SF);
31854 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31855 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
31856 IX86_BUILTIN_GATHER3SIV8SF);
31858 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31859 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
31860 IX86_BUILTIN_GATHER3DIV4SF);
31862 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31863 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
31864 IX86_BUILTIN_GATHER3DIV8SF);
31866 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31867 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
31868 IX86_BUILTIN_GATHER3SIV2DI);
31870 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31871 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
31872 IX86_BUILTIN_GATHER3SIV4DI);
31874 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31875 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
31876 IX86_BUILTIN_GATHER3DIV2DI);
31878 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31879 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
31880 IX86_BUILTIN_GATHER3DIV4DI);
31882 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31883 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
31884 IX86_BUILTIN_GATHER3SIV4SI);
31886 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31887 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
31888 IX86_BUILTIN_GATHER3SIV8SI);
31890 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31891 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
31892 IX86_BUILTIN_GATHER3DIV4SI);
31894 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31895 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
31896 IX86_BUILTIN_GATHER3DIV8SI);
31898 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31899 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31900 IX86_BUILTIN_GATHER3ALTSIV4DF);
31902 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31903 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31904 IX86_BUILTIN_GATHER3ALTDIV8SF);
31906 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31907 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31908 IX86_BUILTIN_GATHER3ALTSIV4DI);
31910 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31911 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31912 IX86_BUILTIN_GATHER3ALTDIV8SI);
31914 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31915 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
31916 IX86_BUILTIN_SCATTERSIV8SF);
31918 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31919 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
31920 IX86_BUILTIN_SCATTERSIV4SF);
31922 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31923 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
31924 IX86_BUILTIN_SCATTERSIV4DF);
31926 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31927 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
31928 IX86_BUILTIN_SCATTERSIV2DF);
31930 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31931 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
31932 IX86_BUILTIN_SCATTERDIV8SF);
31934 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31935 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
31936 IX86_BUILTIN_SCATTERDIV4SF);
31938 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31939 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
31940 IX86_BUILTIN_SCATTERDIV4DF);
31942 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31943 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
31944 IX86_BUILTIN_SCATTERDIV2DF);
31946 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31947 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
31948 IX86_BUILTIN_SCATTERSIV8SI);
31950 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31951 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
31952 IX86_BUILTIN_SCATTERSIV4SI);
31954 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31955 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
31956 IX86_BUILTIN_SCATTERSIV4DI);
31958 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31959 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
31960 IX86_BUILTIN_SCATTERSIV2DI);
31962 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31963 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
31964 IX86_BUILTIN_SCATTERDIV8SI);
31966 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31967 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
31968 IX86_BUILTIN_SCATTERDIV4SI);
31970 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31971 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
31972 IX86_BUILTIN_SCATTERDIV4DI);
31974 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31975 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
31976 IX86_BUILTIN_SCATTERDIV2DI);
31977 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df",
31978 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31979 IX86_BUILTIN_SCATTERALTSIV8DF);
31981 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf",
31982 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31983 IX86_BUILTIN_SCATTERALTDIV16SF);
31985 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di",
31986 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31987 IX86_BUILTIN_SCATTERALTSIV8DI);
31989 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si",
31990 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31991 IX86_BUILTIN_SCATTERALTDIV16SI);
31993 /* AVX512PF */
31994 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31995 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31996 IX86_BUILTIN_GATHERPFDPD);
31997 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31998 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31999 IX86_BUILTIN_GATHERPFDPS);
32000 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
32001 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32002 IX86_BUILTIN_GATHERPFQPD);
32003 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
32004 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32005 IX86_BUILTIN_GATHERPFQPS);
32006 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
32007 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
32008 IX86_BUILTIN_SCATTERPFDPD);
32009 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
32010 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
32011 IX86_BUILTIN_SCATTERPFDPS);
32012 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
32013 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32014 IX86_BUILTIN_SCATTERPFQPD);
32015 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
32016 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32017 IX86_BUILTIN_SCATTERPFQPS);
32019 /* SHA */
32020 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
32021 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
32022 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
32023 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
32024 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
32025 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
32026 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
32027 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
32028 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
32029 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
32030 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
32031 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
32032 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
32033 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
32035 /* RTM. */
32036 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
32037 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
32039 /* MMX access to the vec_init patterns. */
32040 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
32041 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
32043 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
32044 V4HI_FTYPE_HI_HI_HI_HI,
32045 IX86_BUILTIN_VEC_INIT_V4HI);
32047 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
32048 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
32049 IX86_BUILTIN_VEC_INIT_V8QI);
32051 /* Access to the vec_extract patterns. */
32052 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
32053 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
32054 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
32055 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
32056 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
32057 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
32058 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
32059 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
32060 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
32061 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
32063 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32064 "__builtin_ia32_vec_ext_v4hi",
32065 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
32067 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
32068 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
32070 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
32071 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
32073 /* Access to the vec_set patterns. */
32074 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
32075 "__builtin_ia32_vec_set_v2di",
32076 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
32078 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
32079 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
32081 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
32082 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
32084 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
32085 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
32087 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32088 "__builtin_ia32_vec_set_v4hi",
32089 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
32091 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
32092 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
32094 /* RDSEED */
32095 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
32096 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
32097 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
32098 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
32099 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
32100 "__builtin_ia32_rdseed_di_step",
32101 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
32103 /* ADCX */
32104 def_builtin (0, "__builtin_ia32_addcarryx_u32",
32105 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
32106 def_builtin (OPTION_MASK_ISA_64BIT,
32107 "__builtin_ia32_addcarryx_u64",
32108 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
32109 IX86_BUILTIN_ADDCARRYX64);
32111 /* SBB */
32112 def_builtin (0, "__builtin_ia32_sbb_u32",
32113 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
32114 def_builtin (OPTION_MASK_ISA_64BIT,
32115 "__builtin_ia32_sbb_u64",
32116 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
32117 IX86_BUILTIN_SBB64);
32119 /* Read/write FLAGS. */
32120 def_builtin (0, "__builtin_ia32_readeflags_u32",
32121 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
32122 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
32123 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
32124 def_builtin (0, "__builtin_ia32_writeeflags_u32",
32125 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
32126 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
32127 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
32129 /* CLFLUSHOPT. */
32130 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
32131 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
32133 /* CLWB. */
32134 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
32135 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
32137 /* MONITORX and MWAITX. */
32138 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
32139 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
32140 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
32141 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
32143 /* CLZERO. */
32144 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
32145 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
32147 /* Add FMA4 multi-arg instructions.  */
32148 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32150 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
32151 if (d->name == 0)
32152 continue;
32154 ftype = (enum ix86_builtin_func_type) d->flag;
32155 def_builtin_const (d->mask, d->name, ftype, d->code);
32157 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
32158 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32159 ARRAY_SIZE (bdesc_multi_arg) - 1);
32162 static void
32163 ix86_init_mpx_builtins ()
32165 const struct builtin_description * d;
32166 enum ix86_builtin_func_type ftype;
32167 tree decl;
32168 size_t i;
32170 for (i = 0, d = bdesc_mpx;
32171 i < ARRAY_SIZE (bdesc_mpx);
32172 i++, d++)
32174 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
32175 if (d->name == 0)
32176 continue;
32178 ftype = (enum ix86_builtin_func_type) d->flag;
32179 decl = def_builtin (d->mask, d->name, ftype, d->code);
32181 /* Without the leaf and nothrow flags for MPX builtins,
32182 abnormal edges may follow their calls when setjmp
32183 is present in the function.  Since we may have a lot
32184 of MPX builtin calls, this causes lots of useless
32185 edges and enormous PHI nodes.  To avoid this we mark
32186 MPX builtins as leaf and nothrow.  */
32187 if (decl)
32189 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32190 NULL_TREE);
32191 TREE_NOTHROW (decl) = 1;
32193 else
32195 ix86_builtins_isa[(int)d->code].leaf_p = true;
32196 ix86_builtins_isa[(int)d->code].nothrow_p = true;
32199 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
32200 IX86_BUILTIN__BDESC_MPX_FIRST,
32201 ARRAY_SIZE (bdesc_mpx) - 1);
32203 for (i = 0, d = bdesc_mpx_const;
32204 i < ARRAY_SIZE (bdesc_mpx_const);
32205 i++, d++)
32207 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
32208 if (d->name == 0)
32209 continue;
32211 ftype = (enum ix86_builtin_func_type) d->flag;
32212 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
32214 if (decl)
32216 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32217 NULL_TREE);
32218 TREE_NOTHROW (decl) = 1;
32220 else
32222 ix86_builtins_isa[(int)d->code].leaf_p = true;
32223 ix86_builtins_isa[(int)d->code].nothrow_p = true;
32226 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
32227 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32228 ARRAY_SIZE (bdesc_mpx_const) - 1);
32230 #undef BDESC_VERIFY
32231 #undef BDESC_VERIFYS
32233 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
32234 to return a pointer to VERSION_DECL if the outcome of the expression
32235 formed by PREDICATE_CHAIN is true. This function will be called during
32236 version dispatch to decide which function version to execute. It returns
32237 the basic block at the end, to which more conditions can be added. */
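/* Illustrative sketch of what the code below builds (exposition only,
   not literal GIMPLE): for a two-entry predicate chain P1 (ARG1),
   P2 (ARG2) guarding VERSION_DECL, the generated block behaves like

       c1 = P1 (ARG1);
       c2 = P2 (ARG2);
       c1 = MIN_EXPR <c2, c1>;     -- MIN of the 0/1 results acts as AND
       if (c1 > 0)
         return (void *) &VERSION_DECL;
       -- otherwise control reaches the returned basic block, where the
       -- next condition can be chained.  */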
32239 static basic_block
32240 add_condition_to_bb (tree function_decl, tree version_decl,
32241 tree predicate_chain, basic_block new_bb)
32243 gimple *return_stmt;
32244 tree convert_expr, result_var;
32245 gimple *convert_stmt;
32246 gimple *call_cond_stmt;
32247 gimple *if_else_stmt;
32249 basic_block bb1, bb2, bb3;
32250 edge e12, e23;
32252 tree cond_var, and_expr_var = NULL_TREE;
32253 gimple_seq gseq;
32255 tree predicate_decl, predicate_arg;
32257 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
32259 gcc_assert (new_bb != NULL);
32260 gseq = bb_seq (new_bb);
32263 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
32264 build_fold_addr_expr (version_decl));
32265 result_var = create_tmp_var (ptr_type_node);
32266 convert_stmt = gimple_build_assign (result_var, convert_expr);
32267 return_stmt = gimple_build_return (result_var);
32269 if (predicate_chain == NULL_TREE)
32271 gimple_seq_add_stmt (&gseq, convert_stmt);
32272 gimple_seq_add_stmt (&gseq, return_stmt);
32273 set_bb_seq (new_bb, gseq);
32274 gimple_set_bb (convert_stmt, new_bb);
32275 gimple_set_bb (return_stmt, new_bb);
32276 pop_cfun ();
32277 return new_bb;
32280 while (predicate_chain != NULL)
32282 cond_var = create_tmp_var (integer_type_node);
32283 predicate_decl = TREE_PURPOSE (predicate_chain);
32284 predicate_arg = TREE_VALUE (predicate_chain);
32285 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
32286 gimple_call_set_lhs (call_cond_stmt, cond_var);
32288 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
32289 gimple_set_bb (call_cond_stmt, new_bb);
32290 gimple_seq_add_stmt (&gseq, call_cond_stmt);
32292 predicate_chain = TREE_CHAIN (predicate_chain);
32294 if (and_expr_var == NULL)
32295 and_expr_var = cond_var;
32296 else
32298 gimple *assign_stmt;
32299 /* Use MIN_EXPR to check whether any integer is zero, i.e.
32300 and_expr_var = min_expr <cond_var, and_expr_var>.  */
32301 assign_stmt = gimple_build_assign (and_expr_var,
32302 build2 (MIN_EXPR, integer_type_node,
32303 cond_var, and_expr_var));
32305 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
32306 gimple_set_bb (assign_stmt, new_bb);
32307 gimple_seq_add_stmt (&gseq, assign_stmt);
32311 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
32312 integer_zero_node,
32313 NULL_TREE, NULL_TREE);
32314 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
32315 gimple_set_bb (if_else_stmt, new_bb);
32316 gimple_seq_add_stmt (&gseq, if_else_stmt);
32318 gimple_seq_add_stmt (&gseq, convert_stmt);
32319 gimple_seq_add_stmt (&gseq, return_stmt);
32320 set_bb_seq (new_bb, gseq);
32322 bb1 = new_bb;
32323 e12 = split_block (bb1, if_else_stmt);
32324 bb2 = e12->dest;
32325 e12->flags &= ~EDGE_FALLTHRU;
32326 e12->flags |= EDGE_TRUE_VALUE;
32328 e23 = split_block (bb2, return_stmt);
32330 gimple_set_bb (convert_stmt, bb2);
32331 gimple_set_bb (return_stmt, bb2);
32333 bb3 = e23->dest;
32334 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
32336 remove_edge (e23);
32337 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
32339 pop_cfun ();
32341 return bb3;
32344 /* This parses the attribute arguments to target in DECL and determines
32345 the right builtin to use to match the platform specification.
32346 It returns the priority value for this version decl. If PREDICATE_LIST
32347 is not NULL, it stores the list of cpu features that need to be checked
32348 before dispatching this function. */
32350 static unsigned int
32351 get_builtin_code_for_version (tree decl, tree *predicate_list)
32353 tree attrs;
32354 struct cl_target_option cur_target;
32355 tree target_node;
32356 struct cl_target_option *new_target;
32357 const char *arg_str = NULL;
32358 const char *attrs_str = NULL;
32359 char *tok_str = NULL;
32360 char *token;
32362 /* Priority of i386 features, greater value is higher priority. This is
32363 used to decide the order in which function dispatch must happen. For
32364 instance, a version specialized for SSE4.2 should be checked for dispatch
32365 before a version for SSE3, as SSE4.2 implies SSE3. */
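/* For illustration (a hypothetical user-level C++ example, since
   multiversioning via the target attribute is a C++ front-end feature;
   it is not part of this file):

       __attribute__ ((target ("default"))) int foo (void);
       __attribute__ ((target ("sse3")))    int foo (void);
       __attribute__ ((target ("sse4.2")))  int foo (void);

   The "sse4.2" version gets priority P_SSE4_2 > P_SSE3, so the
   generated dispatcher tests its predicate before the "sse3" one.  */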
32366 enum feature_priority
32368 P_ZERO = 0,
32369 P_MMX,
32370 P_SSE,
32371 P_SSE2,
32372 P_SSE3,
32373 P_SSSE3,
32374 P_PROC_SSSE3,
32375 P_SSE4_A,
32376 P_PROC_SSE4_A,
32377 P_SSE4_1,
32378 P_SSE4_2,
32379 P_PROC_SSE4_2,
32380 P_POPCNT,
32381 P_AES,
32382 P_PCLMUL,
32383 P_AVX,
32384 P_PROC_AVX,
32385 P_BMI,
32386 P_PROC_BMI,
32387 P_FMA4,
32388 P_XOP,
32389 P_PROC_XOP,
32390 P_FMA,
32391 P_PROC_FMA,
32392 P_BMI2,
32393 P_AVX2,
32394 P_PROC_AVX2,
32395 P_AVX512F,
32396 P_PROC_AVX512F
32399 enum feature_priority priority = P_ZERO;
32401 /* These are the target attribute strings for which a dispatcher is
32402 available, from fold_builtin_cpu. */
32404 static struct _feature_list
32406 const char *const name;
32407 const enum feature_priority priority;
32409 const feature_list[] =
32411 {"mmx", P_MMX},
32412 {"sse", P_SSE},
32413 {"sse2", P_SSE2},
32414 {"sse3", P_SSE3},
32415 {"sse4a", P_SSE4_A},
32416 {"ssse3", P_SSSE3},
32417 {"sse4.1", P_SSE4_1},
32418 {"sse4.2", P_SSE4_2},
32419 {"popcnt", P_POPCNT},
32420 {"aes", P_AES},
32421 {"pclmul", P_PCLMUL},
32422 {"avx", P_AVX},
32423 {"bmi", P_BMI},
32424 {"fma4", P_FMA4},
32425 {"xop", P_XOP},
32426 {"fma", P_FMA},
32427 {"bmi2", P_BMI2},
32428 {"avx2", P_AVX2},
32429 {"avx512f", P_AVX512F}
32433 static unsigned int NUM_FEATURES
32434 = sizeof (feature_list) / sizeof (struct _feature_list);
32436 unsigned int i;
32438 tree predicate_chain = NULL_TREE;
32439 tree predicate_decl, predicate_arg;
32441 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32442 gcc_assert (attrs != NULL);
32444 attrs = TREE_VALUE (TREE_VALUE (attrs));
32446 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32447 attrs_str = TREE_STRING_POINTER (attrs);
32449 /* Return priority zero for default function. */
32450 if (strcmp (attrs_str, "default") == 0)
32451 return 0;
32453 /* Handle arch= if specified. For priority, set it to be 1 more than
32454 the best instruction set the processor can handle. For instance, if
32455 there is a version for atom and a version for ssse3 (the highest ISA
32456 priority for atom), the atom version must be checked for dispatch
32457 before the ssse3 version. */
32458 if (strstr (attrs_str, "arch=") != NULL)
32460 cl_target_option_save (&cur_target, &global_options);
32461 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32462 &global_options_set);
32464 gcc_assert (target_node);
32465 new_target = TREE_TARGET_OPTION (target_node);
32466 gcc_assert (new_target);
32468 if (new_target->arch_specified && new_target->arch > 0)
32470 switch (new_target->arch)
32472 case PROCESSOR_CORE2:
32473 arg_str = "core2";
32474 priority = P_PROC_SSSE3;
32475 break;
32476 case PROCESSOR_NEHALEM:
32477 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32478 arg_str = "westmere";
32479 else
32480 /* We translate "arch=corei7" and "arch=nehalem" to
32481 "corei7" so that it will be mapped to M_INTEL_COREI7
32482 as cpu type to cover all M_INTEL_COREI7_XXXs. */
32483 arg_str = "corei7";
32484 priority = P_PROC_SSE4_2;
32485 break;
32486 case PROCESSOR_SANDYBRIDGE:
32487 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32488 arg_str = "ivybridge";
32489 else
32490 arg_str = "sandybridge";
32491 priority = P_PROC_AVX;
32492 break;
32493 case PROCESSOR_HASWELL:
32494 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32495 arg_str = "skylake-avx512";
32496 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
32497 arg_str = "skylake";
32498 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32499 arg_str = "broadwell";
32500 else
32501 arg_str = "haswell";
32502 priority = P_PROC_AVX2;
32503 break;
32504 case PROCESSOR_BONNELL:
32505 arg_str = "bonnell";
32506 priority = P_PROC_SSSE3;
32507 break;
32508 case PROCESSOR_KNL:
32509 arg_str = "knl";
32510 priority = P_PROC_AVX512F;
32511 break;
32512 case PROCESSOR_SILVERMONT:
32513 arg_str = "silvermont";
32514 priority = P_PROC_SSE4_2;
32515 break;
32516 case PROCESSOR_AMDFAM10:
32517 arg_str = "amdfam10h";
32518 priority = P_PROC_SSE4_A;
32519 break;
32520 case PROCESSOR_BTVER1:
32521 arg_str = "btver1";
32522 priority = P_PROC_SSE4_A;
32523 break;
32524 case PROCESSOR_BTVER2:
32525 arg_str = "btver2";
32526 priority = P_PROC_BMI;
32527 break;
32528 case PROCESSOR_BDVER1:
32529 arg_str = "bdver1";
32530 priority = P_PROC_XOP;
32531 break;
32532 case PROCESSOR_BDVER2:
32533 arg_str = "bdver2";
32534 priority = P_PROC_FMA;
32535 break;
32536 case PROCESSOR_BDVER3:
32537 arg_str = "bdver3";
32538 priority = P_PROC_FMA;
32539 break;
32540 case PROCESSOR_BDVER4:
32541 arg_str = "bdver4";
32542 priority = P_PROC_AVX2;
32543 break;
32544 case PROCESSOR_ZNVER1:
32545 arg_str = "znver1";
32546 priority = P_PROC_AVX2;
32547 break;
32551 cl_target_option_restore (&global_options, &cur_target);
32553 if (predicate_list && arg_str == NULL)
32555 error_at (DECL_SOURCE_LOCATION (decl),
32556 "No dispatcher found for the versioning attributes");
32557 return 0;
32560 if (predicate_list)
32562 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32563 /* For a C string literal the length includes the trailing NULL. */
32564 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32565 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32566 predicate_chain);
32570 /* Process feature name. */
32571 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32572 strcpy (tok_str, attrs_str);
32573 token = strtok (tok_str, ",");
32574 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32576 while (token != NULL)
32578 /* Do not process "arch=" */
32579 if (strncmp (token, "arch=", 5) == 0)
32581 token = strtok (NULL, ",");
32582 continue;
32584 for (i = 0; i < NUM_FEATURES; ++i)
32586 if (strcmp (token, feature_list[i].name) == 0)
32588 if (predicate_list)
32590 predicate_arg = build_string_literal (
32591 strlen (feature_list[i].name) + 1,
32592 feature_list[i].name);
32593 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32594 predicate_chain);
32596 /* Find the maximum priority feature. */
32597 if (feature_list[i].priority > priority)
32598 priority = feature_list[i].priority;
32600 break;
32603 if (predicate_list && i == NUM_FEATURES)
32605 error_at (DECL_SOURCE_LOCATION (decl),
32606 "No dispatcher found for %s", token);
32607 return 0;
32609 token = strtok (NULL, ",");
32611 free (tok_str);
32613 if (predicate_list && predicate_chain == NULL_TREE)
32615 error_at (DECL_SOURCE_LOCATION (decl),
32616 "No dispatcher found for the versioning attributes : %s",
32617 attrs_str);
32618 return 0;
32620 else if (predicate_list)
32622 predicate_chain = nreverse (predicate_chain);
32623 *predicate_list = predicate_chain;
32626 return priority;
32629 /* This compares the priority of target features in function DECL1
32630 and DECL2. It returns positive value if DECL1 is higher priority,
32631 negative value if DECL2 is higher priority and 0 if they are the
32632 same. */
32634 static int
32635 ix86_compare_version_priority (tree decl1, tree decl2)
32637 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32638 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32640 return (int)priority1 - (int)priority2;
32643 /* V1 and V2 point to function versions with different priorities
32644 based on the target ISA. This function compares their priorities. */
32646 static int
32647 feature_compare (const void *v1, const void *v2)
32649 typedef struct _function_version_info
32651 tree version_decl;
32652 tree predicate_chain;
32653 unsigned int dispatch_priority;
32654 } function_version_info;
32656 const function_version_info c1 = *(const function_version_info *)v1;
32657 const function_version_info c2 = *(const function_version_info *)v2;
32658 return (c2.dispatch_priority - c1.dispatch_priority);
32661 /* This function generates the dispatch function for
32662 multi-versioned functions. DISPATCH_DECL is the function which will
32663 contain the dispatch logic. FNDECLS holds the function choices for
32664 dispatch and is actually a vector of decls. EMPTY_BB is the basic block pointer
32665 in DISPATCH_DECL in which the dispatch code is generated. */
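/* Rough shape of the resolver body assembled here (exposition only;
   symbol names are placeholders, not the real mangled names):

       __builtin_cpu_init ();
       if (the __builtin_cpu_is / __builtin_cpu_supports predicates of
           the highest-priority version all hold)
         return <address of that version>;
       ...
       return <address of the default version>;  -- appended last,
                                                    unconditionally  */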
32667 static int
32668 dispatch_function_versions (tree dispatch_decl,
32669 void *fndecls_p,
32670 basic_block *empty_bb)
32672 tree default_decl;
32673 gimple *ifunc_cpu_init_stmt;
32674 gimple_seq gseq;
32675 int ix;
32676 tree ele;
32677 vec<tree> *fndecls;
32678 unsigned int num_versions = 0;
32679 unsigned int actual_versions = 0;
32680 unsigned int i;
32682 struct _function_version_info
32684 tree version_decl;
32685 tree predicate_chain;
32686 unsigned int dispatch_priority;
32687 }*function_version_info;
32689 gcc_assert (dispatch_decl != NULL
32690 && fndecls_p != NULL
32691 && empty_bb != NULL);
32693 /* fndecls_p is actually a vector.  */
32694 fndecls = static_cast<vec<tree> *> (fndecls_p);
32696 /* At least one more version other than the default. */
32697 num_versions = fndecls->length ();
32698 gcc_assert (num_versions >= 2);
32700 function_version_info = (struct _function_version_info *)
32701 XNEWVEC (struct _function_version_info, (num_versions - 1));
32703 /* The first version in the vector is the default decl. */
32704 default_decl = (*fndecls)[0];
32706 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32708 gseq = bb_seq (*empty_bb);
32709 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32710 constructors, so explicitly call __builtin_cpu_init here.  */
32711 ifunc_cpu_init_stmt = gimple_build_call_vec (
32712 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32713 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32714 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32715 set_bb_seq (*empty_bb, gseq);
32717 pop_cfun ();
32720 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32722 tree version_decl = ele;
32723 tree predicate_chain = NULL_TREE;
32724 unsigned int priority;
32725 /* Get attribute string, parse it and find the right predicate decl.
32726 The predicate function could be a lengthy combination of many
32727 features, like arch-type and various isa-variants. */
32728 priority = get_builtin_code_for_version (version_decl,
32729 &predicate_chain);
32731 if (predicate_chain == NULL_TREE)
32732 continue;
32734 function_version_info [actual_versions].version_decl = version_decl;
32735 function_version_info [actual_versions].predicate_chain
32736 = predicate_chain;
32737 function_version_info [actual_versions].dispatch_priority = priority;
32738 actual_versions++;
32741 /* Sort the versions according to descending order of dispatch priority. The
32742 priority is based on the ISA. This is not a perfect solution. There
32743 could still be ambiguity. If more than one function version is suitable
32744 to execute, which one should be dispatched?  In the future, allow the user
32745 to specify a dispatch priority next to the version. */
32746 qsort (function_version_info, actual_versions,
32747 sizeof (struct _function_version_info), feature_compare);
32749 for (i = 0; i < actual_versions; ++i)
32750 *empty_bb = add_condition_to_bb (dispatch_decl,
32751 function_version_info[i].version_decl,
32752 function_version_info[i].predicate_chain,
32753 *empty_bb);
32755 /* Dispatch the default version at the end.  */
32756 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32757 NULL, *empty_bb);
32759 free (function_version_info);
32760 return 0;
32763 /* Comparator function used by qsort to sort the attribute
32764 specification strings for "target".  */
32766 static int
32767 attr_strcmp (const void *v1, const void *v2)
32769 const char *c1 = *(char *const*)v1;
32770 const char *c2 = *(char *const*)v2;
32771 return strcmp (c1, c2);
32774 /* ARGLIST is the argument to target attribute. This function tokenizes
32775 the comma separated arguments, sorts them and returns a string which
32776 is a unique identifier for the comma separated arguments. It also
32777 replaces non-identifier characters "=,-" with "_". */
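/* For example (illustrative): the argument string "avx,arch=core2"
   becomes "avx,arch_core2" after the character replacement, tokenizes
   to { "avx", "arch_core2" }, sorts to { "arch_core2", "avx" }, and the
   returned identifier is "arch_core2_avx".  */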
32779 static char *
32780 sorted_attr_string (tree arglist)
32782 tree arg;
32783 size_t str_len_sum = 0;
32784 char **args = NULL;
32785 char *attr_str, *ret_str;
32786 char *attr = NULL;
32787 unsigned int argnum = 1;
32788 unsigned int i;
32790 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32792 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32793 size_t len = strlen (str);
32794 str_len_sum += len + 1;
32795 if (arg != arglist)
32796 argnum++;
32797 for (i = 0; i < strlen (str); i++)
32798 if (str[i] == ',')
32799 argnum++;
32802 attr_str = XNEWVEC (char, str_len_sum);
32803 str_len_sum = 0;
32804 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32806 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32807 size_t len = strlen (str);
32808 memcpy (attr_str + str_len_sum, str, len);
32809 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
32810 str_len_sum += len + 1;
32813 /* Replace "=,-" with "_". */
32814 for (i = 0; i < strlen (attr_str); i++)
32815 if (attr_str[i] == '=' || attr_str[i]== '-')
32816 attr_str[i] = '_';
32818 if (argnum == 1)
32819 return attr_str;
32821 args = XNEWVEC (char *, argnum);
32823 i = 0;
32824 attr = strtok (attr_str, ",");
32825 while (attr != NULL)
32827 args[i] = attr;
32828 i++;
32829 attr = strtok (NULL, ",");
32832 qsort (args, argnum, sizeof (char *), attr_strcmp);
32834 ret_str = XNEWVEC (char, str_len_sum);
32835 str_len_sum = 0;
32836 for (i = 0; i < argnum; i++)
32838 size_t len = strlen (args[i]);
32839 memcpy (ret_str + str_len_sum, args[i], len);
32840 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
32841 str_len_sum += len + 1;
32844 XDELETEVEC (args);
32845 XDELETEVEC (attr_str);
32846 return ret_str;
32849 /* This function changes the assembler name for functions that are
32850 versions. If DECL is a function version and has a "target"
32851 attribute, it appends the attribute string to its assembler name. */
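/* For example (hypothetical C++ version of int foo (void)): a version
   declared with target ("avx,popcnt") has the sorted string
   "avx_popcnt" appended to its already-mangled assembler name, giving
   _Z3foov.avx_popcnt.  */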
32853 static tree
32854 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32856 tree version_attr;
32857 const char *orig_name, *version_string;
32858 char *attr_str, *assembler_name;
32860 if (DECL_DECLARED_INLINE_P (decl)
32861 && lookup_attribute ("gnu_inline",
32862 DECL_ATTRIBUTES (decl)))
32863 error_at (DECL_SOURCE_LOCATION (decl),
32864 "Function versions cannot be marked as gnu_inline,"
32865 " bodies have to be generated");
32867 if (DECL_VIRTUAL_P (decl)
32868 || DECL_VINDEX (decl))
32869 sorry ("Virtual function multiversioning not supported");
32871 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32873 /* target attribute string cannot be NULL. */
32874 gcc_assert (version_attr != NULL_TREE);
32876 orig_name = IDENTIFIER_POINTER (id);
32877 version_string
32878 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32880 if (strcmp (version_string, "default") == 0)
32881 return id;
32883 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32884 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32886 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32888 /* Allow assembler name to be modified if already set. */
32889 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32890 SET_DECL_RTL (decl, NULL);
32892 tree ret = get_identifier (assembler_name);
32893 XDELETEVEC (attr_str);
32894 XDELETEVEC (assembler_name);
32895 return ret;
32898 /* This function returns true if FN1 and FN2 are versions of the same function,
32899 that is, the target strings of the function decls are different. This assumes
32900 that FN1 and FN2 have the same signature. */
32902 static bool
32903 ix86_function_versions (tree fn1, tree fn2)
32905 tree attr1, attr2;
32906 char *target1, *target2;
32907 bool result;
32909 if (TREE_CODE (fn1) != FUNCTION_DECL
32910 || TREE_CODE (fn2) != FUNCTION_DECL)
32911 return false;
32913 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
32914 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
32916 /* At least one function decl should have the target attribute specified. */
32917 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
32918 return false;
32920 /* Diagnose missing target attribute if one of the decls is already
32921 multi-versioned. */
32922 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
32924 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
32926 if (attr2 != NULL_TREE)
32928 std::swap (fn1, fn2);
32929 attr1 = attr2;
32931 error_at (DECL_SOURCE_LOCATION (fn2),
32932 "missing %<target%> attribute for multi-versioned %D",
32933 fn2);
32934 inform (DECL_SOURCE_LOCATION (fn1),
32935 "previous declaration of %D", fn1);
32936 /* Prevent diagnosing of the same error multiple times. */
32937 DECL_ATTRIBUTES (fn2)
32938 = tree_cons (get_identifier ("target"),
32939 copy_node (TREE_VALUE (attr1)),
32940 DECL_ATTRIBUTES (fn2));
32942 return false;
32945 target1 = sorted_attr_string (TREE_VALUE (attr1));
32946 target2 = sorted_attr_string (TREE_VALUE (attr2));
32948 /* The sorted target strings must be different for fn1 and fn2
32949 to be versions. */
32950 if (strcmp (target1, target2) == 0)
32951 result = false;
32952 else
32953 result = true;
32955 XDELETEVEC (target1);
32956 XDELETEVEC (target2);
32958 return result;
32961 static tree
32962 ix86_mangle_decl_assembler_name (tree decl, tree id)
32964 /* For function version, add the target suffix to the assembler name. */
32965 if (TREE_CODE (decl) == FUNCTION_DECL
32966 && DECL_FUNCTION_VERSIONED (decl))
32967 id = ix86_mangle_function_version_assembler_name (decl, id);
32968 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32969 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32970 #endif
32972 return id;
32975 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
32976 is true, append the full path name of the source file. */
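/* For illustration (hypothetical DECL whose assembler name is "foo"):

       make_name (decl, "resolver", false)  =>  "foo.resolver"
       make_name (decl, "resolver", true)   =>  "foo.<unique>.resolver"

   where <unique> is the per-file string produced by
   get_file_function_name, used to keep non-public versions from
   colliding at link time.  */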
32978 static char *
32979 make_name (tree decl, const char *suffix, bool make_unique)
32981 char *global_var_name;
32982 int name_len;
32983 const char *name;
32984 const char *unique_name = NULL;
32986 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
32988 /* Get a unique name that can be used globally without any chances
32989 of collision at link time. */
32990 if (make_unique)
32991 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
32993 name_len = strlen (name) + strlen (suffix) + 2;
32995 if (make_unique)
32996 name_len += strlen (unique_name) + 1;
32997 global_var_name = XNEWVEC (char, name_len);
32999 /* Use '.' to concatenate names as it is demangler friendly. */
33000 if (make_unique)
33001 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
33002 suffix);
33003 else
33004 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
33006 return global_var_name;
33009 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
33011 /* Make a dispatcher declaration for the multi-versioned function DECL.
33012 Calls to DECL function will be replaced with calls to the dispatcher
33013 by the front-end. Return the decl created. */
33015 static tree
33016 make_dispatcher_decl (const tree decl)
33018 tree func_decl;
33019 char *func_name;
33020 tree fn_type, func_type;
33021 bool is_uniq = false;
33023 if (TREE_PUBLIC (decl) == 0)
33024 is_uniq = true;
33026 func_name = make_name (decl, "ifunc", is_uniq);
33028 fn_type = TREE_TYPE (decl);
33029 func_type = build_function_type (TREE_TYPE (fn_type),
33030 TYPE_ARG_TYPES (fn_type));
33032 func_decl = build_fn_decl (func_name, func_type);
33033 XDELETEVEC (func_name);
33034 TREE_USED (func_decl) = 1;
33035 DECL_CONTEXT (func_decl) = NULL_TREE;
33036 DECL_INITIAL (func_decl) = error_mark_node;
33037 DECL_ARTIFICIAL (func_decl) = 1;
33038 /* Mark this func as external, the resolver will flip it again if
33039 it gets generated. */
33040 DECL_EXTERNAL (func_decl) = 1;
33041 /* IFUNCs have to be externally visible, so make the decl public.  */
33042 TREE_PUBLIC (func_decl) = 1;
33044 return func_decl;
33047 #endif
33049 /* Returns true if DECL is multi-versioned and is the default function,
33050 that is, it is not tagged with a target-specific optimization.  */
33052 static bool
33053 is_function_default_version (const tree decl)
33055 if (TREE_CODE (decl) != FUNCTION_DECL
33056 || !DECL_FUNCTION_VERSIONED (decl))
33057 return false;
33058 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33059 gcc_assert (attr);
33060 attr = TREE_VALUE (TREE_VALUE (attr));
33061 return (TREE_CODE (attr) == STRING_CST
33062 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
33065 /* Make a dispatcher declaration for the multi-versioned function DECL.
33066 Calls to DECL function will be replaced with calls to the dispatcher
33067 by the front-end. Returns the decl of the dispatcher function. */
33069 static tree
33070 ix86_get_function_versions_dispatcher (void *decl)
33072 tree fn = (tree) decl;
33073 struct cgraph_node *node = NULL;
33074 struct cgraph_node *default_node = NULL;
33075 struct cgraph_function_version_info *node_v = NULL;
33076 struct cgraph_function_version_info *first_v = NULL;
33078 tree dispatch_decl = NULL;
33080 struct cgraph_function_version_info *default_version_info = NULL;
33082 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
33084 node = cgraph_node::get (fn);
33085 gcc_assert (node != NULL);
33087 node_v = node->function_version ();
33088 gcc_assert (node_v != NULL);
33090 if (node_v->dispatcher_resolver != NULL)
33091 return node_v->dispatcher_resolver;
33093 /* Find the default version and make it the first node. */
33094 first_v = node_v;
33095 /* Go to the beginning of the chain. */
33096 while (first_v->prev != NULL)
33097 first_v = first_v->prev;
33098 default_version_info = first_v;
33099 while (default_version_info != NULL)
33101 if (is_function_default_version
33102 (default_version_info->this_node->decl))
33103 break;
33104 default_version_info = default_version_info->next;
33107 /* If there is no default node, just return NULL. */
33108 if (default_version_info == NULL)
33109 return NULL;
33111 /* Make default info the first node. */
33112 if (first_v != default_version_info)
33114 default_version_info->prev->next = default_version_info->next;
33115 if (default_version_info->next)
33116 default_version_info->next->prev = default_version_info->prev;
33117 first_v->prev = default_version_info;
33118 default_version_info->next = first_v;
33119 default_version_info->prev = NULL;
33122 default_node = default_version_info->this_node;
33124 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
33125 if (targetm.has_ifunc_p ())
33127 struct cgraph_function_version_info *it_v = NULL;
33128 struct cgraph_node *dispatcher_node = NULL;
33129 struct cgraph_function_version_info *dispatcher_version_info = NULL;
33131 /* Right now, the dispatching is done via ifunc. */
33132 dispatch_decl = make_dispatcher_decl (default_node->decl);
33134 dispatcher_node = cgraph_node::get_create (dispatch_decl);
33135 gcc_assert (dispatcher_node != NULL);
33136 dispatcher_node->dispatcher_function = 1;
33137 dispatcher_version_info
33138 = dispatcher_node->insert_new_function_version ();
33139 dispatcher_version_info->next = default_version_info;
33140 dispatcher_node->definition = 1;
33142 /* Set the dispatcher for all the versions. */
33143 it_v = default_version_info;
33144 while (it_v != NULL)
33146 it_v->dispatcher_resolver = dispatch_decl;
33147 it_v = it_v->next;
33150 else
33151 #endif
33153 error_at (DECL_SOURCE_LOCATION (default_node->decl),
33154 "multiversioning needs ifunc which is not supported "
33155 "on this target");
33158 return dispatch_decl;
33161 /* Make the resolver function decl to dispatch the versions of
33162 a multi-versioned function, DEFAULT_DECL. Create an
33163 empty basic block in the resolver and store the pointer in
33164 EMPTY_BB. Return the decl of the resolver function. */
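/* Illustrative outcome for a public versioned function "foo" (sketch
   only; exact names depend on visibility and mangling):

       foo           -- dispatcher decl, tagged attribute ifunc ("foo.resolver")
       foo.resolver  -- comdat, externally visible resolver whose body is
                        later filled in by dispatch_function_versions and
                        returns the address of the selected version.  */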
33166 static tree
33167 make_resolver_func (const tree default_decl,
33168 const tree dispatch_decl,
33169 basic_block *empty_bb)
33171 char *resolver_name;
33172 tree decl, type, decl_name, t;
33173 bool is_uniq = false;
33175 /* IFUNCs have to be globally visible.  So, if the default_decl is
33176 not, then the name of the IFUNC should be made unique. */
33177 if (TREE_PUBLIC (default_decl) == 0)
33178 is_uniq = true;
33180 /* Append the filename to the resolver function if the versions are
33181 not externally visible. This is because the resolver function has
33182 to be externally visible for the loader to find it. So, appending
33183 the filename will prevent conflicts with a resolver function from
33184 another module which is based on the same version name. */
33185 resolver_name = make_name (default_decl, "resolver", is_uniq);
33187 /* The resolver function should return a (void *). */
33188 type = build_function_type_list (ptr_type_node, NULL_TREE);
33190 decl = build_fn_decl (resolver_name, type);
33191 decl_name = get_identifier (resolver_name);
33192 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
33194 DECL_NAME (decl) = decl_name;
33195 TREE_USED (decl) = 1;
33196 DECL_ARTIFICIAL (decl) = 1;
33197 DECL_IGNORED_P (decl) = 0;
33198 /* IFUNC resolvers have to be externally visible. */
33199 TREE_PUBLIC (decl) = 1;
33200 DECL_UNINLINABLE (decl) = 1;
33202 /* Resolver is not external, body is generated. */
33203 DECL_EXTERNAL (decl) = 0;
33204 DECL_EXTERNAL (dispatch_decl) = 0;
33206 DECL_CONTEXT (decl) = NULL_TREE;
33207 DECL_INITIAL (decl) = make_node (BLOCK);
33208 DECL_STATIC_CONSTRUCTOR (decl) = 0;
33210 if (DECL_COMDAT_GROUP (default_decl)
33211 || TREE_PUBLIC (default_decl))
33213 /* In this case, each translation unit with a call to this
33214 versioned function will put out a resolver. Ensure it
33215 is comdat to keep just one copy. */
33216 DECL_COMDAT (decl) = 1;
33217 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
33219 /* Build result decl and add to function_decl. */
33220 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
33221 DECL_ARTIFICIAL (t) = 1;
33222 DECL_IGNORED_P (t) = 1;
33223 DECL_RESULT (decl) = t;
33225 gimplify_function_tree (decl);
33226 push_cfun (DECL_STRUCT_FUNCTION (decl));
33227 *empty_bb = init_lowered_empty_function (decl, false, 0);
33229 cgraph_node::add_new_function (decl, true);
33230 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
33232 pop_cfun ();
33234 gcc_assert (dispatch_decl != NULL);
33235 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
33236 DECL_ATTRIBUTES (dispatch_decl)
33237 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
33239 /* Create the alias for dispatch to resolver here. */
33240 /*cgraph_create_function_alias (dispatch_decl, decl);*/
33241 cgraph_node::create_same_body_alias (dispatch_decl, decl);
33242 XDELETEVEC (resolver_name);
33243 return decl;
33246 /* Generate the dispatching code body to dispatch multi-versioned function
33247 DECL. The target hook is called to process the "target" attributes and
33248 provide the code to dispatch the right function at run-time. NODE points
33249 to the dispatcher decl whose body will be created. */
33251 static tree
33252 ix86_generate_version_dispatcher_body (void *node_p)
33254 tree resolver_decl;
33255 basic_block empty_bb;
33256 tree default_ver_decl;
33257 struct cgraph_node *versn;
33258 struct cgraph_node *node;
33260 struct cgraph_function_version_info *node_version_info = NULL;
33261 struct cgraph_function_version_info *versn_info = NULL;
33263 node = (cgraph_node *)node_p;
33265 node_version_info = node->function_version ();
33266 gcc_assert (node->dispatcher_function
33267 && node_version_info != NULL);
33269 if (node_version_info->dispatcher_resolver)
33270 return node_version_info->dispatcher_resolver;
33272 /* The first version in the chain corresponds to the default version. */
33273 default_ver_decl = node_version_info->next->this_node->decl;
33275 /* node is going to be an alias, so remove the finalized bit. */
33276 node->definition = false;
33278 resolver_decl = make_resolver_func (default_ver_decl,
33279 node->decl, &empty_bb);
33281 node_version_info->dispatcher_resolver = resolver_decl;
33283 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
33285 auto_vec<tree, 2> fn_ver_vec;
33287 for (versn_info = node_version_info->next; versn_info;
33288 versn_info = versn_info->next)
33290 versn = versn_info->this_node;
33291 /* Check for virtual functions here again, as by this time it should
33292 have been determined if this function needs a vtable index or
33293 not. This happens for methods in derived classes that override
33294 virtual methods in base classes but are not explicitly marked as
33295 virtual. */
33296 if (DECL_VINDEX (versn->decl))
33297 sorry ("Virtual function multiversioning not supported");
33299 fn_ver_vec.safe_push (versn->decl);
33302 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
33303 cgraph_edge::rebuild_edges ();
33304 pop_cfun ();
33305 return resolver_decl;
33307 /* This builds the processor_model struct type defined in
33308 libgcc/config/i386/cpuinfo.c */
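/* For reference, the layout built below mirrors the C declaration in
   libgcc/config/i386/cpuinfo.c:

       struct __processor_model
       {
         unsigned int __cpu_vendor;
         unsigned int __cpu_type;
         unsigned int __cpu_subtype;
         unsigned int __cpu_features[1];
       };  */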
33310 static tree
33311 build_processor_model_struct (void)
33313 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
33314 "__cpu_features"};
33315 tree field = NULL_TREE, field_chain = NULL_TREE;
33316 int i;
33317 tree type = make_node (RECORD_TYPE);
33319 /* The first 3 fields are unsigned int. */
33320 for (i = 0; i < 3; ++i)
33322 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
33323 get_identifier (field_name[i]), unsigned_type_node);
33324 if (field_chain != NULL_TREE)
33325 DECL_CHAIN (field) = field_chain;
33326 field_chain = field;
33329 /* The last field is an array of unsigned integers of size one. */
33330 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
33331 get_identifier (field_name[3]),
33332 build_array_type (unsigned_type_node,
33333 build_index_type (size_one_node)));
33334 if (field_chain != NULL_TREE)
33335 DECL_CHAIN (field) = field_chain;
33336 field_chain = field;
33338 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
33339 return type;
33342 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME.  */
33344 static tree
33345 make_var_decl (tree type, const char *name)
33347 tree new_decl;
33349 new_decl = build_decl (UNKNOWN_LOCATION,
33350 VAR_DECL,
33351 get_identifier(name),
33352 type);
33354 DECL_EXTERNAL (new_decl) = 1;
33355 TREE_STATIC (new_decl) = 1;
33356 TREE_PUBLIC (new_decl) = 1;
33357 DECL_INITIAL (new_decl) = 0;
33358 DECL_ARTIFICIAL (new_decl) = 0;
33359 DECL_PRESERVE_P (new_decl) = 1;
33361 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
33362 assemble_variable (new_decl, 0, 0, 0);
33364 return new_decl;
33367 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
33368 into an integer defined in libgcc/config/i386/cpuinfo.c */
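/* Worked examples of the folding done below (illustrative; the M_* and
   F_* values come from the enums in this function):

       __builtin_cpu_is ("amd")
         => __cpu_model.__cpu_vendor == M_AMD
       __builtin_cpu_is ("westmere")
         => __cpu_model.__cpu_subtype
              == M_INTEL_COREI7_WESTMERE - M_CPU_SUBTYPE_START
       __builtin_cpu_supports ("avx2")
         => __cpu_model.__cpu_features[0] & (1 << F_AVX2)  */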
33370 static tree
33371 fold_builtin_cpu (tree fndecl, tree *args)
33373 unsigned int i;
33374 enum ix86_builtins fn_code = (enum ix86_builtins)
33375 DECL_FUNCTION_CODE (fndecl);
33376 tree param_string_cst = NULL;
33378 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
33379 enum processor_features
33381 F_CMOV = 0,
33382 F_MMX,
33383 F_POPCNT,
33384 F_SSE,
33385 F_SSE2,
33386 F_SSE3,
33387 F_SSSE3,
33388 F_SSE4_1,
33389 F_SSE4_2,
33390 F_AVX,
33391 F_AVX2,
33392 F_SSE4_A,
33393 F_FMA4,
33394 F_XOP,
33395 F_FMA,
33396 F_AVX512F,
33397 F_BMI,
33398 F_BMI2,
33399 F_AES,
33400 F_PCLMUL,
33401 F_AVX512VL,
33402 F_AVX512BW,
33403 F_AVX512DQ,
33404 F_AVX512CD,
33405 F_AVX512ER,
33406 F_AVX512PF,
33407 F_AVX512VBMI,
33408 F_AVX512IFMA,
33409 F_AVX5124VNNIW,
33410 F_AVX5124FMAPS,
33411 F_AVX512VPOPCNTDQ,
33412 F_MAX
33415 /* These are the values for vendor types and cpu types and subtypes
33416 in cpuinfo.c.  Cpu types and subtypes should have the
33417 corresponding start value subtracted.  */
33418 enum processor_model
33420 M_INTEL = 1,
33421 M_AMD,
33422 M_CPU_TYPE_START,
33423 M_INTEL_BONNELL,
33424 M_INTEL_CORE2,
33425 M_INTEL_COREI7,
33426 M_AMDFAM10H,
33427 M_AMDFAM15H,
33428 M_INTEL_SILVERMONT,
33429 M_INTEL_KNL,
33430 M_AMD_BTVER1,
33431 M_AMD_BTVER2,
33432 M_CPU_SUBTYPE_START,
33433 M_INTEL_COREI7_NEHALEM,
33434 M_INTEL_COREI7_WESTMERE,
33435 M_INTEL_COREI7_SANDYBRIDGE,
33436 M_AMDFAM10H_BARCELONA,
33437 M_AMDFAM10H_SHANGHAI,
33438 M_AMDFAM10H_ISTANBUL,
33439 M_AMDFAM15H_BDVER1,
33440 M_AMDFAM15H_BDVER2,
33441 M_AMDFAM15H_BDVER3,
33442 M_AMDFAM15H_BDVER4,
33443 M_AMDFAM17H_ZNVER1,
33444 M_INTEL_COREI7_IVYBRIDGE,
33445 M_INTEL_COREI7_HASWELL,
33446 M_INTEL_COREI7_BROADWELL,
33447 M_INTEL_COREI7_SKYLAKE,
33448 M_INTEL_COREI7_SKYLAKE_AVX512
33451 static struct _arch_names_table
33453 const char *const name;
33454 const enum processor_model model;
33456 const arch_names_table[] =
33458 {"amd", M_AMD},
33459 {"intel", M_INTEL},
33460 {"atom", M_INTEL_BONNELL},
33461 {"slm", M_INTEL_SILVERMONT},
33462 {"core2", M_INTEL_CORE2},
33463 {"corei7", M_INTEL_COREI7},
33464 {"nehalem", M_INTEL_COREI7_NEHALEM},
33465 {"westmere", M_INTEL_COREI7_WESTMERE},
33466 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
33467 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
33468 {"haswell", M_INTEL_COREI7_HASWELL},
33469 {"broadwell", M_INTEL_COREI7_BROADWELL},
33470 {"skylake", M_INTEL_COREI7_SKYLAKE},
33471 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
33472 {"bonnell", M_INTEL_BONNELL},
33473 {"silvermont", M_INTEL_SILVERMONT},
33474 {"knl", M_INTEL_KNL},
33475 {"amdfam10h", M_AMDFAM10H},
33476 {"barcelona", M_AMDFAM10H_BARCELONA},
33477 {"shanghai", M_AMDFAM10H_SHANGHAI},
33478 {"istanbul", M_AMDFAM10H_ISTANBUL},
33479 {"btver1", M_AMD_BTVER1},
33480 {"amdfam15h", M_AMDFAM15H},
33481 {"bdver1", M_AMDFAM15H_BDVER1},
33482 {"bdver2", M_AMDFAM15H_BDVER2},
33483 {"bdver3", M_AMDFAM15H_BDVER3},
33484 {"bdver4", M_AMDFAM15H_BDVER4},
33485 {"btver2", M_AMD_BTVER2},
33486 {"znver1", M_AMDFAM17H_ZNVER1},
33489 static struct _isa_names_table
33491 const char *const name;
33492 const enum processor_features feature;
33494 const isa_names_table[] =
33496 {"cmov", F_CMOV},
33497 {"mmx", F_MMX},
33498 {"popcnt", F_POPCNT},
33499 {"sse", F_SSE},
33500 {"sse2", F_SSE2},
33501 {"sse3", F_SSE3},
33502 {"ssse3", F_SSSE3},
33503 {"sse4a", F_SSE4_A},
33504 {"sse4.1", F_SSE4_1},
33505 {"sse4.2", F_SSE4_2},
33506 {"avx", F_AVX},
33507 {"fma4", F_FMA4},
33508 {"xop", F_XOP},
33509 {"fma", F_FMA},
33510 {"avx2", F_AVX2},
33511 {"avx512f", F_AVX512F},
33512 {"bmi", F_BMI},
33513 {"bmi2", F_BMI2},
33514 {"aes", F_AES},
33515 {"pclmul", F_PCLMUL},
33516 {"avx512vl",F_AVX512VL},
33517 {"avx512bw",F_AVX512BW},
33518 {"avx512dq",F_AVX512DQ},
33519 {"avx512cd",F_AVX512CD},
33520 {"avx512er",F_AVX512ER},
33521 {"avx512pf",F_AVX512PF},
33522 {"avx512vbmi",F_AVX512VBMI},
33523 {"avx512ifma",F_AVX512IFMA},
33524 {"avx5124vnniw",F_AVX5124VNNIW},
33525 {"avx5124fmaps",F_AVX5124FMAPS},
33526 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
33529 tree __processor_model_type = build_processor_model_struct ();
33530 tree __cpu_model_var = make_var_decl (__processor_model_type,
33531 "__cpu_model");
33534 varpool_node::add (__cpu_model_var);
33536 gcc_assert ((args != NULL) && (*args != NULL));
33538 param_string_cst = *args;
33539 while (param_string_cst
33540 && TREE_CODE (param_string_cst) != STRING_CST)
33542 /* *args must be an expr that can contain other EXPRs leading to a
33543 STRING_CST. */
33544 if (!EXPR_P (param_string_cst))
33546 error ("Parameter to builtin must be a string constant or literal");
33547 return integer_zero_node;
33549 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
33552 gcc_assert (param_string_cst);
33554 if (fn_code == IX86_BUILTIN_CPU_IS)
33556 tree ref;
33557 tree field;
33558 tree final;
33560 unsigned int field_val = 0;
33561 unsigned int NUM_ARCH_NAMES
33562 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33564 for (i = 0; i < NUM_ARCH_NAMES; i++)
33565 if (strcmp (arch_names_table[i].name,
33566 TREE_STRING_POINTER (param_string_cst)) == 0)
33567 break;
33569 if (i == NUM_ARCH_NAMES)
33571 error ("Parameter to builtin not valid: %s",
33572 TREE_STRING_POINTER (param_string_cst));
33573 return integer_zero_node;
33576 field = TYPE_FIELDS (__processor_model_type);
33577 field_val = arch_names_table[i].model;
33579 /* CPU types are stored in the next field. */
33580 if (field_val > M_CPU_TYPE_START
33581 && field_val < M_CPU_SUBTYPE_START)
33583 field = DECL_CHAIN (field);
33584 field_val -= M_CPU_TYPE_START;
33587 /* CPU subtypes are stored in the next field. */
33588 if (field_val > M_CPU_SUBTYPE_START)
33590 field = DECL_CHAIN (DECL_CHAIN (field));
33591 field_val -= M_CPU_SUBTYPE_START;
33594 /* Get the appropriate field in __cpu_model. */
33595 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33596 field, NULL_TREE);
33598 /* Check the value. */
33599 final = build2 (EQ_EXPR, unsigned_type_node, ref,
33600 build_int_cstu (unsigned_type_node, field_val));
33601 return build1 (CONVERT_EXPR, integer_type_node, final);
33603 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33605 tree ref;
33606 tree array_elt;
33607 tree field;
33608 tree final;
33610 unsigned int field_val = 0;
33611 unsigned int NUM_ISA_NAMES
33612 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33614 for (i = 0; i < NUM_ISA_NAMES; i++)
33615 if (strcmp (isa_names_table[i].name,
33616 TREE_STRING_POINTER (param_string_cst)) == 0)
33617 break;
33619 if (i == NUM_ISA_NAMES)
33621 error ("Parameter to builtin not valid: %s",
33622 TREE_STRING_POINTER (param_string_cst));
33623 return integer_zero_node;
33626 field = TYPE_FIELDS (__processor_model_type);
33627 /* Get the last field, which is __cpu_features. */
33628 while (DECL_CHAIN (field))
33629 field = DECL_CHAIN (field);
33631 /* Get the appropriate field: __cpu_model.__cpu_features */
33632 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33633 field, NULL_TREE);
33635 /* Access the 0th element of __cpu_features array. */
33636 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33637 integer_zero_node, NULL_TREE, NULL_TREE);
33639 field_val = (1 << isa_names_table[i].feature);
33640 /* Return __cpu_model.__cpu_features[0] & field_val */
33641 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33642 build_int_cstu (unsigned_type_node, field_val));
33643 return build1 (CONVERT_EXPR, integer_type_node, final);
33645 gcc_unreachable ();
33648 static tree
33649 ix86_fold_builtin (tree fndecl, int n_args,
33650 tree *args, bool ignore ATTRIBUTE_UNUSED)
33652 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33654 enum ix86_builtins fn_code = (enum ix86_builtins)
33655 DECL_FUNCTION_CODE (fndecl);
33656 switch (fn_code)
33658 case IX86_BUILTIN_CPU_IS:
33659 case IX86_BUILTIN_CPU_SUPPORTS:
33660 gcc_assert (n_args == 1);
33661 return fold_builtin_cpu (fndecl, args);
33663 case IX86_BUILTIN_NANQ:
33664 case IX86_BUILTIN_NANSQ:
33666 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33667 const char *str = c_getstr (*args);
33668 int quiet = fn_code == IX86_BUILTIN_NANQ;
33669 REAL_VALUE_TYPE real;
33671 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33672 return build_real (type, real);
33673 return NULL_TREE;
33676 case IX86_BUILTIN_INFQ:
33677 case IX86_BUILTIN_HUGE_VALQ:
33679 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33680 REAL_VALUE_TYPE inf;
33681 real_inf (&inf);
33682 return build_real (type, inf);
33685 case IX86_BUILTIN_TZCNT16:
33686 case IX86_BUILTIN_CTZS:
33687 case IX86_BUILTIN_TZCNT32:
33688 case IX86_BUILTIN_TZCNT64:
33689 gcc_assert (n_args == 1);
33690 if (TREE_CODE (args[0]) == INTEGER_CST)
33692 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33693 tree arg = args[0];
33694 if (fn_code == IX86_BUILTIN_TZCNT16
33695 || fn_code == IX86_BUILTIN_CTZS)
33696 arg = fold_convert (short_unsigned_type_node, arg);
33697 if (integer_zerop (arg))
33698 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33699 else
33700 return fold_const_call (CFN_CTZ, type, arg);
33702 break;
33704 case IX86_BUILTIN_LZCNT16:
33705 case IX86_BUILTIN_CLZS:
33706 case IX86_BUILTIN_LZCNT32:
33707 case IX86_BUILTIN_LZCNT64:
33708 gcc_assert (n_args == 1);
33709 if (TREE_CODE (args[0]) == INTEGER_CST)
33711 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33712 tree arg = args[0];
33713 if (fn_code == IX86_BUILTIN_LZCNT16
33714 || fn_code == IX86_BUILTIN_CLZS)
33715 arg = fold_convert (short_unsigned_type_node, arg);
33716 if (integer_zerop (arg))
33717 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33718 else
33719 return fold_const_call (CFN_CLZ, type, arg);
33721 break;
33723 case IX86_BUILTIN_BEXTR32:
33724 case IX86_BUILTIN_BEXTR64:
33725 case IX86_BUILTIN_BEXTRI32:
33726 case IX86_BUILTIN_BEXTRI64:
33727 gcc_assert (n_args == 2);
33728 if (tree_fits_uhwi_p (args[1]))
33730 unsigned HOST_WIDE_INT res = 0;
33731 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
33732 unsigned int start = tree_to_uhwi (args[1]);
33733 unsigned int len = (start & 0xff00) >> 8;
33734 start &= 0xff;
33735 if (start >= prec || len == 0)
33736 res = 0;
33737 else if (!tree_fits_uhwi_p (args[0]))
33738 break;
33739 else
33740 res = tree_to_uhwi (args[0]) >> start;
33741 if (len > prec)
33742 len = prec;
33743 if (len < HOST_BITS_PER_WIDE_INT)
33744 res &= (HOST_WIDE_INT_1U << len) - 1;
33745 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33747 break;
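/* Editorial illustration, not part of the original source: the second
   BEXTR operand packs the start bit in bits 0-7 and the length in bits
   8-15, so with the folding above a call such as

     __builtin_ia32_bextr_u32 (0x12345678, 0x0804)

   (start 4, length 8) reduces at compile time to 0x67, i.e. bits 4..11
   of the first operand.  */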
33749 case IX86_BUILTIN_BZHI32:
33750 case IX86_BUILTIN_BZHI64:
33751 gcc_assert (n_args == 2);
33752 if (tree_fits_uhwi_p (args[1]))
33754 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
33755 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
33756 return args[0];
33757 if (!tree_fits_uhwi_p (args[0]))
33758 break;
33759 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
33760 res &= ~(HOST_WIDE_INT_M1U << idx);
33761 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33763 break;
33765 case IX86_BUILTIN_PDEP32:
33766 case IX86_BUILTIN_PDEP64:
33767 gcc_assert (n_args == 2);
33768 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33770 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33771 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33772 unsigned HOST_WIDE_INT res = 0;
33773 unsigned HOST_WIDE_INT m, k = 1;
33774 for (m = 1; m; m <<= 1)
33775 if ((mask & m) != 0)
33777 if ((src & k) != 0)
33778 res |= m;
33779 k <<= 1;
33781 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33783 break;
33785 case IX86_BUILTIN_PEXT32:
33786 case IX86_BUILTIN_PEXT64:
33787 gcc_assert (n_args == 2);
33788 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33790 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33791 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33792 unsigned HOST_WIDE_INT res = 0;
33793 unsigned HOST_WIDE_INT m, k = 1;
33794 for (m = 1; m; m <<= 1)
33795 if ((mask & m) != 0)
33797 if ((src & m) != 0)
33798 res |= k;
33799 k <<= 1;
33801 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33803 break;
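/* Editorial illustration, not part of the original source: PDEP scatters
   the low bits of the source into the set positions of the mask, and
   PEXT gathers them back.  With both arguments constant, the loops above
   fold the calls completely, e.g.

     __builtin_ia32_pdep_si (0x5, 0x1a)    evaluates to 0x12
     __builtin_ia32_pext_si (0x12, 0x1a)   evaluates to 0x5

   since mask 0x1a has bits 1, 3 and 4 set, and source bits 0, 1, 2 are
   placed into, or collected from, those positions.  */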
33805 default:
33806 break;
33810 #ifdef SUBTARGET_FOLD_BUILTIN
33811 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33812 #endif
33814 return NULL_TREE;
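/* Editorial illustration, not part of the original source: with the
   cases above, fully constant calls disappear at the tree level, e.g.

     __builtin_ia32_tzcnt_u32 (0)      folds to 32
     __builtin_ia32_lzcnt_u64 (1)      folds to 63
     __builtin_ia32_bzhi_si (0xff, 4)  folds to 0xf

   because TZCNT/LZCNT of zero are defined to return the operand width,
   unlike plain __builtin_ctz/__builtin_clz.  The builtin names are the
   usual ia32 ones and are shown only for illustration.  */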
33817 /* Fold an MD builtin (use ix86_fold_builtin for folding into
33818 a constant) in GIMPLE. */
33820 bool
33821 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
33823 gimple *stmt = gsi_stmt (*gsi);
33824 tree fndecl = gimple_call_fndecl (stmt);
33825 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
33826 int n_args = gimple_call_num_args (stmt);
33827 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
33828 tree decl = NULL_TREE;
33829 tree arg0, arg1;
33831 switch (fn_code)
33833 case IX86_BUILTIN_TZCNT32:
33834 decl = builtin_decl_implicit (BUILT_IN_CTZ);
33835 goto fold_tzcnt_lzcnt;
33837 case IX86_BUILTIN_TZCNT64:
33838 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
33839 goto fold_tzcnt_lzcnt;
33841 case IX86_BUILTIN_LZCNT32:
33842 decl = builtin_decl_implicit (BUILT_IN_CLZ);
33843 goto fold_tzcnt_lzcnt;
33845 case IX86_BUILTIN_LZCNT64:
33846 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
33847 goto fold_tzcnt_lzcnt;
33849 fold_tzcnt_lzcnt:
33850 gcc_assert (n_args == 1);
33851 arg0 = gimple_call_arg (stmt, 0);
33852 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
33854 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
33855 /* If arg0 is provably non-zero, optimize into the generic
33856 __builtin_c[tl]z{,ll} function, which the middle-end handles
33857 better. */
33858 if (!expr_not_equal_to (arg0, wi::zero (prec)))
33859 return false;
33861 location_t loc = gimple_location (stmt);
33862 gimple *g = gimple_build_call (decl, 1, arg0);
33863 gimple_set_location (g, loc);
33864 tree lhs = make_ssa_name (integer_type_node);
33865 gimple_call_set_lhs (g, lhs);
33866 gsi_insert_before (gsi, g, GSI_SAME_STMT);
33867 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
33868 gimple_set_location (g, loc);
33869 gsi_replace (gsi, g, false);
33870 return true;
33872 break;
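/* Editorial sketch, not part of the original source: on GIMPLE this
   turns, say,

     r_3 = __builtin_ia32_tzcnt_u32 (n_1);

   where n_1 is known to be non-zero, into

     _5 = __builtin_ctz (n_1);
     r_3 = (unsigned int) _5;

   so that later passes can use the richer middle-end knowledge about
   __builtin_ctz.  The SSA names and the exact cast type are
   illustrative only.  */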
33874 case IX86_BUILTIN_BZHI32:
33875 case IX86_BUILTIN_BZHI64:
33876 gcc_assert (n_args == 2);
33877 arg1 = gimple_call_arg (stmt, 1);
33878 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
33880 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
33881 arg0 = gimple_call_arg (stmt, 0);
33882 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
33883 break;
33884 location_t loc = gimple_location (stmt);
33885 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33886 gimple_set_location (g, loc);
33887 gsi_replace (gsi, g, false);
33888 return true;
33890 break;
33892 case IX86_BUILTIN_PDEP32:
33893 case IX86_BUILTIN_PDEP64:
33894 case IX86_BUILTIN_PEXT32:
33895 case IX86_BUILTIN_PEXT64:
33896 gcc_assert (n_args == 2);
33897 arg1 = gimple_call_arg (stmt, 1);
33898 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
33900 location_t loc = gimple_location (stmt);
33901 arg0 = gimple_call_arg (stmt, 0);
33902 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33903 gimple_set_location (g, loc);
33904 gsi_replace (gsi, g, false);
33905 return true;
33907 break;
33909 default:
33910 break;
33913 return false;
33916 /* Make builtins to detect the CPU type and supported features. NAME is
33917 the builtin name, CODE is the builtin code, and FTYPE is the function
33918 type of the builtin. */
33920 static void
33921 make_cpu_type_builtin (const char* name, int code,
33922 enum ix86_builtin_func_type ftype, bool is_const)
33924 tree decl;
33925 tree type;
33927 type = ix86_get_builtin_func_type (ftype);
33928 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33929 NULL, NULL_TREE);
33930 gcc_assert (decl != NULL_TREE);
33931 ix86_builtins[(int) code] = decl;
33932 TREE_READONLY (decl) = is_const;
33935 /* Make builtins to get the CPU type and supported features. The created
33936 builtins are:
33938 __builtin_cpu_init (), to detect the CPU type and features,
33939 __builtin_cpu_is ("<CPUNAME>"), to check if the CPU is of type <CPUNAME>,
33940 __builtin_cpu_supports ("<FEATURE>"), to check if the CPU supports <FEATURE>. */
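/* Editorial usage sketch, not part of the original source; the function
   names in the branches are hypothetical user code:

     if (__builtin_cpu_supports ("avx2"))
       kernel_avx2 ();
     else
       kernel_generic ();

   GCC's manual notes that __builtin_cpu_init () needs to be called
   explicitly only when such checks run before the program's
   constructors, e.g. inside an IFUNC resolver.  */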
33943 static void
33944 ix86_init_platform_type_builtins (void)
33946 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33947 INT_FTYPE_VOID, false);
33948 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33949 INT_FTYPE_PCCHAR, true);
33950 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33951 INT_FTYPE_PCCHAR, true);
33954 /* Internal method for ix86_init_builtins. */
33956 static void
33957 ix86_init_builtins_va_builtins_abi (void)
33959 tree ms_va_ref, sysv_va_ref;
33960 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33961 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33962 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33963 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33965 if (!TARGET_64BIT)
33966 return;
33967 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33968 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33969 ms_va_ref = build_reference_type (ms_va_list_type_node);
33970 sysv_va_ref =
33971 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33973 fnvoid_va_end_ms =
33974 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33975 fnvoid_va_start_ms =
33976 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33977 fnvoid_va_end_sysv =
33978 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33979 fnvoid_va_start_sysv =
33980 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33981 NULL_TREE);
33982 fnvoid_va_copy_ms =
33983 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33984 NULL_TREE);
33985 fnvoid_va_copy_sysv =
33986 build_function_type_list (void_type_node, sysv_va_ref,
33987 sysv_va_ref, NULL_TREE);
33989 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33990 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33991 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33992 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33993 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33994 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33995 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33996 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33997 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33998 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33999 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
34000 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
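/* Editorial usage sketch, not part of the original source: these
   builtins let 64-bit code walk the varargs of a function that uses the
   other ABI, e.g.

     int __attribute__ ((ms_abi))
     sum_ints (int n, ...)
     {
       __builtin_ms_va_list ap;
       int i, s = 0;
       __builtin_ms_va_start (ap, n);
       for (i = 0; i < n; i++)
         s += __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return s;
     }

   sum_ints is a hypothetical example; real code normally reaches these
   builtins through stdarg.h-style wrapper macros.  */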
34003 static void
34004 ix86_init_builtin_types (void)
34006 tree float80_type_node, const_string_type_node;
34008 /* The __float80 type. */
34009 float80_type_node = long_double_type_node;
34010 if (TYPE_MODE (float80_type_node) != XFmode)
34012 if (float64x_type_node != NULL_TREE
34013 && TYPE_MODE (float64x_type_node) == XFmode)
34014 float80_type_node = float64x_type_node;
34015 else
34017 /* The __float80 type. */
34018 float80_type_node = make_node (REAL_TYPE);
34020 TYPE_PRECISION (float80_type_node) = 80;
34021 layout_type (float80_type_node);
34024 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
34026 /* The __float128 type. The node has already been created as
34027 _Float128, so we only need to register the __float128 name for
34028 it. */
34029 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
34031 const_string_type_node
34032 = build_pointer_type (build_qualified_type
34033 (char_type_node, TYPE_QUAL_CONST));
34035 /* This macro is built by i386-builtin-types.awk. */
34036 DEFINE_BUILTIN_PRIMITIVE_TYPES;
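/* Editorial illustration, not part of the original source: once these
   names are registered, x86 user code can declare, for instance,

     __float80  ext  = 1.2345L;
     __float128 quad = ext;

   __float80 normally maps to the 80-bit extended type (long double on
   most x86 configurations, as set up above), and __float128 to the IEEE
   quad type whose arithmetic libgcc's soft-float routines provide.  */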
34039 static void
34040 ix86_init_builtins (void)
34042 tree ftype, decl;
34044 ix86_init_builtin_types ();
34046 /* Builtins to get CPU type and features. */
34047 ix86_init_platform_type_builtins ();
34049 /* TFmode support builtins. */
34050 def_builtin_const (0, "__builtin_infq",
34051 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
34052 def_builtin_const (0, "__builtin_huge_valq",
34053 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
34055 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
34056 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
34057 BUILT_IN_MD, "nanq", NULL_TREE);
34058 TREE_READONLY (decl) = 1;
34059 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
34061 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
34062 BUILT_IN_MD, "nansq", NULL_TREE);
34063 TREE_READONLY (decl) = 1;
34064 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
34066 /* We will expand them to a normal call if SSE isn't available, since
34067 they are used by libgcc. */
34068 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
34069 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
34070 BUILT_IN_MD, "__fabstf2", NULL_TREE);
34071 TREE_READONLY (decl) = 1;
34072 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
34074 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
34075 decl = add_builtin_function ("__builtin_copysignq", ftype,
34076 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
34077 "__copysigntf3", NULL_TREE);
34078 TREE_READONLY (decl) = 1;
34079 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
34081 ix86_init_tm_builtins ();
34082 ix86_init_mmx_sse_builtins ();
34083 ix86_init_mpx_builtins ();
34085 if (TARGET_LP64)
34086 ix86_init_builtins_va_builtins_abi ();
34088 #ifdef SUBTARGET_INIT_BUILTINS
34089 SUBTARGET_INIT_BUILTINS;
34090 #endif
34093 /* Return the ix86 builtin for CODE. */
34095 static tree
34096 ix86_builtin_decl (unsigned code, bool)
34098 if (code >= IX86_BUILTIN_MAX)
34099 return error_mark_node;
34101 return ix86_builtins[code];
34104 /* Errors in the source file can cause expand_expr to return const0_rtx
34105 where we expect a vector. To avoid crashing, use one of the vector
34106 clear instructions. */
34107 static rtx
34108 safe_vector_operand (rtx x, machine_mode mode)
34110 if (x == const0_rtx)
34111 x = CONST0_RTX (mode);
34112 return x;
34115 /* Fix up modeless constants to fit the required mode. */
34116 static rtx
34117 fixup_modeless_constant (rtx x, machine_mode mode)
34119 if (GET_MODE (x) == VOIDmode)
34120 x = convert_to_mode (mode, x, 1);
34121 return x;
34124 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
34126 static rtx
34127 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
34129 rtx pat;
34130 tree arg0 = CALL_EXPR_ARG (exp, 0);
34131 tree arg1 = CALL_EXPR_ARG (exp, 1);
34132 rtx op0 = expand_normal (arg0);
34133 rtx op1 = expand_normal (arg1);
34134 machine_mode tmode = insn_data[icode].operand[0].mode;
34135 machine_mode mode0 = insn_data[icode].operand[1].mode;
34136 machine_mode mode1 = insn_data[icode].operand[2].mode;
34138 if (VECTOR_MODE_P (mode0))
34139 op0 = safe_vector_operand (op0, mode0);
34140 if (VECTOR_MODE_P (mode1))
34141 op1 = safe_vector_operand (op1, mode1);
34143 if (optimize || !target
34144 || GET_MODE (target) != tmode
34145 || !insn_data[icode].operand[0].predicate (target, tmode))
34146 target = gen_reg_rtx (tmode);
34148 if (GET_MODE (op1) == SImode && mode1 == TImode)
34150 rtx x = gen_reg_rtx (V4SImode);
34151 emit_insn (gen_sse2_loadd (x, op1));
34152 op1 = gen_lowpart (TImode, x);
34155 if (!insn_data[icode].operand[1].predicate (op0, mode0))
34156 op0 = copy_to_mode_reg (mode0, op0);
34157 if (!insn_data[icode].operand[2].predicate (op1, mode1))
34158 op1 = copy_to_mode_reg (mode1, op1);
34160 pat = GEN_FCN (icode) (target, op0, op1);
34161 if (! pat)
34162 return 0;
34164 emit_insn (pat);
34166 return target;
34169 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
34171 static rtx
34172 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
34173 enum ix86_builtin_func_type m_type,
34174 enum rtx_code sub_code)
34176 rtx pat;
34177 int i;
34178 int nargs;
34179 bool comparison_p = false;
34180 bool tf_p = false;
34181 bool last_arg_constant = false;
34182 int num_memory = 0;
34183 struct {
34184 rtx op;
34185 machine_mode mode;
34186 } args[4];
34188 machine_mode tmode = insn_data[icode].operand[0].mode;
34190 switch (m_type)
34192 case MULTI_ARG_4_DF2_DI_I:
34193 case MULTI_ARG_4_DF2_DI_I1:
34194 case MULTI_ARG_4_SF2_SI_I:
34195 case MULTI_ARG_4_SF2_SI_I1:
34196 nargs = 4;
34197 last_arg_constant = true;
34198 break;
34200 case MULTI_ARG_3_SF:
34201 case MULTI_ARG_3_DF:
34202 case MULTI_ARG_3_SF2:
34203 case MULTI_ARG_3_DF2:
34204 case MULTI_ARG_3_DI:
34205 case MULTI_ARG_3_SI:
34206 case MULTI_ARG_3_SI_DI:
34207 case MULTI_ARG_3_HI:
34208 case MULTI_ARG_3_HI_SI:
34209 case MULTI_ARG_3_QI:
34210 case MULTI_ARG_3_DI2:
34211 case MULTI_ARG_3_SI2:
34212 case MULTI_ARG_3_HI2:
34213 case MULTI_ARG_3_QI2:
34214 nargs = 3;
34215 break;
34217 case MULTI_ARG_2_SF:
34218 case MULTI_ARG_2_DF:
34219 case MULTI_ARG_2_DI:
34220 case MULTI_ARG_2_SI:
34221 case MULTI_ARG_2_HI:
34222 case MULTI_ARG_2_QI:
34223 nargs = 2;
34224 break;
34226 case MULTI_ARG_2_DI_IMM:
34227 case MULTI_ARG_2_SI_IMM:
34228 case MULTI_ARG_2_HI_IMM:
34229 case MULTI_ARG_2_QI_IMM:
34230 nargs = 2;
34231 last_arg_constant = true;
34232 break;
34234 case MULTI_ARG_1_SF:
34235 case MULTI_ARG_1_DF:
34236 case MULTI_ARG_1_SF2:
34237 case MULTI_ARG_1_DF2:
34238 case MULTI_ARG_1_DI:
34239 case MULTI_ARG_1_SI:
34240 case MULTI_ARG_1_HI:
34241 case MULTI_ARG_1_QI:
34242 case MULTI_ARG_1_SI_DI:
34243 case MULTI_ARG_1_HI_DI:
34244 case MULTI_ARG_1_HI_SI:
34245 case MULTI_ARG_1_QI_DI:
34246 case MULTI_ARG_1_QI_SI:
34247 case MULTI_ARG_1_QI_HI:
34248 nargs = 1;
34249 break;
34251 case MULTI_ARG_2_DI_CMP:
34252 case MULTI_ARG_2_SI_CMP:
34253 case MULTI_ARG_2_HI_CMP:
34254 case MULTI_ARG_2_QI_CMP:
34255 nargs = 2;
34256 comparison_p = true;
34257 break;
34259 case MULTI_ARG_2_SF_TF:
34260 case MULTI_ARG_2_DF_TF:
34261 case MULTI_ARG_2_DI_TF:
34262 case MULTI_ARG_2_SI_TF:
34263 case MULTI_ARG_2_HI_TF:
34264 case MULTI_ARG_2_QI_TF:
34265 nargs = 2;
34266 tf_p = true;
34267 break;
34269 default:
34270 gcc_unreachable ();
34273 if (optimize || !target
34274 || GET_MODE (target) != tmode
34275 || !insn_data[icode].operand[0].predicate (target, tmode))
34276 target = gen_reg_rtx (tmode);
34277 else if (memory_operand (target, tmode))
34278 num_memory++;
34280 gcc_assert (nargs <= 4);
34282 for (i = 0; i < nargs; i++)
34284 tree arg = CALL_EXPR_ARG (exp, i);
34285 rtx op = expand_normal (arg);
34286 int adjust = (comparison_p) ? 1 : 0;
34287 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
34289 if (last_arg_constant && i == nargs - 1)
34291 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
34293 enum insn_code new_icode = icode;
34294 switch (icode)
34296 case CODE_FOR_xop_vpermil2v2df3:
34297 case CODE_FOR_xop_vpermil2v4sf3:
34298 case CODE_FOR_xop_vpermil2v4df3:
34299 case CODE_FOR_xop_vpermil2v8sf3:
34300 error ("the last argument must be a 2-bit immediate");
34301 return gen_reg_rtx (tmode);
34302 case CODE_FOR_xop_rotlv2di3:
34303 new_icode = CODE_FOR_rotlv2di3;
34304 goto xop_rotl;
34305 case CODE_FOR_xop_rotlv4si3:
34306 new_icode = CODE_FOR_rotlv4si3;
34307 goto xop_rotl;
34308 case CODE_FOR_xop_rotlv8hi3:
34309 new_icode = CODE_FOR_rotlv8hi3;
34310 goto xop_rotl;
34311 case CODE_FOR_xop_rotlv16qi3:
34312 new_icode = CODE_FOR_rotlv16qi3;
34313 xop_rotl:
34314 if (CONST_INT_P (op))
34316 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
34317 op = GEN_INT (INTVAL (op) & mask);
34318 gcc_checking_assert
34319 (insn_data[icode].operand[i + 1].predicate (op, mode));
34321 else
34323 gcc_checking_assert
34324 (nargs == 2
34325 && insn_data[new_icode].operand[0].mode == tmode
34326 && insn_data[new_icode].operand[1].mode == tmode
34327 && insn_data[new_icode].operand[2].mode == mode
34328 && insn_data[new_icode].operand[0].predicate
34329 == insn_data[icode].operand[0].predicate
34330 && insn_data[new_icode].operand[1].predicate
34331 == insn_data[icode].operand[1].predicate);
34332 icode = new_icode;
34333 goto non_constant;
34335 break;
34336 default:
34337 gcc_unreachable ();
34341 else
34343 non_constant:
34344 if (VECTOR_MODE_P (mode))
34345 op = safe_vector_operand (op, mode);
34347 /* If we aren't optimizing, only allow one memory operand to be
34348 generated. */
34349 if (memory_operand (op, mode))
34350 num_memory++;
34352 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
34354 if (optimize
34355 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
34356 || num_memory > 1)
34357 op = force_reg (mode, op);
34360 args[i].op = op;
34361 args[i].mode = mode;
34364 switch (nargs)
34366 case 1:
34367 pat = GEN_FCN (icode) (target, args[0].op);
34368 break;
34370 case 2:
34371 if (tf_p)
34372 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34373 GEN_INT ((int)sub_code));
34374 else if (! comparison_p)
34375 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34376 else
34378 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
34379 args[0].op,
34380 args[1].op);
34382 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
34384 break;
34386 case 3:
34387 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
34388 break;
34390 case 4:
34391 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
34392 break;
34394 default:
34395 gcc_unreachable ();
34398 if (! pat)
34399 return 0;
34401 emit_insn (pat);
34402 return target;
34405 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
34406 insns with vec_merge. */
34408 static rtx
34409 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
34410 rtx target)
34412 rtx pat;
34413 tree arg0 = CALL_EXPR_ARG (exp, 0);
34414 rtx op1, op0 = expand_normal (arg0);
34415 machine_mode tmode = insn_data[icode].operand[0].mode;
34416 machine_mode mode0 = insn_data[icode].operand[1].mode;
34418 if (optimize || !target
34419 || GET_MODE (target) != tmode
34420 || !insn_data[icode].operand[0].predicate (target, tmode))
34421 target = gen_reg_rtx (tmode);
34423 if (VECTOR_MODE_P (mode0))
34424 op0 = safe_vector_operand (op0, mode0);
34426 if ((optimize && !register_operand (op0, mode0))
34427 || !insn_data[icode].operand[1].predicate (op0, mode0))
34428 op0 = copy_to_mode_reg (mode0, op0);
34430 op1 = op0;
34431 if (!insn_data[icode].operand[2].predicate (op1, mode0))
34432 op1 = copy_to_mode_reg (mode0, op1);
34434 pat = GEN_FCN (icode) (target, op0, op1);
34435 if (! pat)
34436 return 0;
34437 emit_insn (pat);
34438 return target;
34441 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
34443 static rtx
34444 ix86_expand_sse_compare (const struct builtin_description *d,
34445 tree exp, rtx target, bool swap)
34447 rtx pat;
34448 tree arg0 = CALL_EXPR_ARG (exp, 0);
34449 tree arg1 = CALL_EXPR_ARG (exp, 1);
34450 rtx op0 = expand_normal (arg0);
34451 rtx op1 = expand_normal (arg1);
34452 rtx op2;
34453 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34454 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34455 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34456 enum rtx_code comparison = d->comparison;
34458 if (VECTOR_MODE_P (mode0))
34459 op0 = safe_vector_operand (op0, mode0);
34460 if (VECTOR_MODE_P (mode1))
34461 op1 = safe_vector_operand (op1, mode1);
34463 /* Swap operands if we have a comparison that isn't available in
34464 hardware. */
34465 if (swap)
34466 std::swap (op0, op1);
34468 if (optimize || !target
34469 || GET_MODE (target) != tmode
34470 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34471 target = gen_reg_rtx (tmode);
34473 if ((optimize && !register_operand (op0, mode0))
34474 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
34475 op0 = copy_to_mode_reg (mode0, op0);
34476 if ((optimize && !register_operand (op1, mode1))
34477 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
34478 op1 = copy_to_mode_reg (mode1, op1);
34480 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
34481 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34482 if (! pat)
34483 return 0;
34484 emit_insn (pat);
34485 return target;
34488 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
34490 static rtx
34491 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
34492 rtx target)
34494 rtx pat;
34495 tree arg0 = CALL_EXPR_ARG (exp, 0);
34496 tree arg1 = CALL_EXPR_ARG (exp, 1);
34497 rtx op0 = expand_normal (arg0);
34498 rtx op1 = expand_normal (arg1);
34499 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34500 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34501 enum rtx_code comparison = d->comparison;
34503 if (VECTOR_MODE_P (mode0))
34504 op0 = safe_vector_operand (op0, mode0);
34505 if (VECTOR_MODE_P (mode1))
34506 op1 = safe_vector_operand (op1, mode1);
34508 /* Swap operands if we have a comparison that isn't available in
34509 hardware. */
34510 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
34511 std::swap (op0, op1);
34513 target = gen_reg_rtx (SImode);
34514 emit_move_insn (target, const0_rtx);
34515 target = gen_rtx_SUBREG (QImode, target, 0);
34517 if ((optimize && !register_operand (op0, mode0))
34518 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34519 op0 = copy_to_mode_reg (mode0, op0);
34520 if ((optimize && !register_operand (op1, mode1))
34521 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34522 op1 = copy_to_mode_reg (mode1, op1);
34524 pat = GEN_FCN (d->icode) (op0, op1);
34525 if (! pat)
34526 return 0;
34527 emit_insn (pat);
34528 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34529 gen_rtx_fmt_ee (comparison, QImode,
34530 SET_DEST (pat),
34531 const0_rtx)));
34533 return SUBREG_REG (target);
34536 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
34538 static rtx
34539 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
34540 rtx target)
34542 rtx pat;
34543 tree arg0 = CALL_EXPR_ARG (exp, 0);
34544 rtx op1, op0 = expand_normal (arg0);
34545 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34546 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34548 if (optimize || target == 0
34549 || GET_MODE (target) != tmode
34550 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34551 target = gen_reg_rtx (tmode);
34553 if (VECTOR_MODE_P (mode0))
34554 op0 = safe_vector_operand (op0, mode0);
34556 if ((optimize && !register_operand (op0, mode0))
34557 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34558 op0 = copy_to_mode_reg (mode0, op0);
34560 op1 = GEN_INT (d->comparison);
34562 pat = GEN_FCN (d->icode) (target, op0, op1);
34563 if (! pat)
34564 return 0;
34565 emit_insn (pat);
34566 return target;
34569 static rtx
34570 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
34571 tree exp, rtx target)
34573 rtx pat;
34574 tree arg0 = CALL_EXPR_ARG (exp, 0);
34575 tree arg1 = CALL_EXPR_ARG (exp, 1);
34576 rtx op0 = expand_normal (arg0);
34577 rtx op1 = expand_normal (arg1);
34578 rtx op2;
34579 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34580 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34581 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34583 if (optimize || target == 0
34584 || GET_MODE (target) != tmode
34585 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34586 target = gen_reg_rtx (tmode);
34588 op0 = safe_vector_operand (op0, mode0);
34589 op1 = safe_vector_operand (op1, mode1);
34591 if ((optimize && !register_operand (op0, mode0))
34592 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34593 op0 = copy_to_mode_reg (mode0, op0);
34594 if ((optimize && !register_operand (op1, mode1))
34595 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34596 op1 = copy_to_mode_reg (mode1, op1);
34598 op2 = GEN_INT (d->comparison);
34600 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34601 if (! pat)
34602 return 0;
34603 emit_insn (pat);
34604 return target;
34607 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
34609 static rtx
34610 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34611 rtx target)
34613 rtx pat;
34614 tree arg0 = CALL_EXPR_ARG (exp, 0);
34615 tree arg1 = CALL_EXPR_ARG (exp, 1);
34616 rtx op0 = expand_normal (arg0);
34617 rtx op1 = expand_normal (arg1);
34618 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34619 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34620 enum rtx_code comparison = d->comparison;
34622 if (VECTOR_MODE_P (mode0))
34623 op0 = safe_vector_operand (op0, mode0);
34624 if (VECTOR_MODE_P (mode1))
34625 op1 = safe_vector_operand (op1, mode1);
34627 target = gen_reg_rtx (SImode);
34628 emit_move_insn (target, const0_rtx);
34629 target = gen_rtx_SUBREG (QImode, target, 0);
34631 if ((optimize && !register_operand (op0, mode0))
34632 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34633 op0 = copy_to_mode_reg (mode0, op0);
34634 if ((optimize && !register_operand (op1, mode1))
34635 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34636 op1 = copy_to_mode_reg (mode1, op1);
34638 pat = GEN_FCN (d->icode) (op0, op1);
34639 if (! pat)
34640 return 0;
34641 emit_insn (pat);
34642 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34643 gen_rtx_fmt_ee (comparison, QImode,
34644 SET_DEST (pat),
34645 const0_rtx)));
34647 return SUBREG_REG (target);
34650 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
34652 static rtx
34653 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34654 tree exp, rtx target)
34656 rtx pat;
34657 tree arg0 = CALL_EXPR_ARG (exp, 0);
34658 tree arg1 = CALL_EXPR_ARG (exp, 1);
34659 tree arg2 = CALL_EXPR_ARG (exp, 2);
34660 tree arg3 = CALL_EXPR_ARG (exp, 3);
34661 tree arg4 = CALL_EXPR_ARG (exp, 4);
34662 rtx scratch0, scratch1;
34663 rtx op0 = expand_normal (arg0);
34664 rtx op1 = expand_normal (arg1);
34665 rtx op2 = expand_normal (arg2);
34666 rtx op3 = expand_normal (arg3);
34667 rtx op4 = expand_normal (arg4);
34668 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34670 tmode0 = insn_data[d->icode].operand[0].mode;
34671 tmode1 = insn_data[d->icode].operand[1].mode;
34672 modev2 = insn_data[d->icode].operand[2].mode;
34673 modei3 = insn_data[d->icode].operand[3].mode;
34674 modev4 = insn_data[d->icode].operand[4].mode;
34675 modei5 = insn_data[d->icode].operand[5].mode;
34676 modeimm = insn_data[d->icode].operand[6].mode;
34678 if (VECTOR_MODE_P (modev2))
34679 op0 = safe_vector_operand (op0, modev2);
34680 if (VECTOR_MODE_P (modev4))
34681 op2 = safe_vector_operand (op2, modev4);
34683 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34684 op0 = copy_to_mode_reg (modev2, op0);
34685 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34686 op1 = copy_to_mode_reg (modei3, op1);
34687 if ((optimize && !register_operand (op2, modev4))
34688 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34689 op2 = copy_to_mode_reg (modev4, op2);
34690 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34691 op3 = copy_to_mode_reg (modei5, op3);
34693 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34695 error ("the fifth argument must be an 8-bit immediate");
34696 return const0_rtx;
34699 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34701 if (optimize || !target
34702 || GET_MODE (target) != tmode0
34703 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34704 target = gen_reg_rtx (tmode0);
34706 scratch1 = gen_reg_rtx (tmode1);
34708 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34710 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34712 if (optimize || !target
34713 || GET_MODE (target) != tmode1
34714 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34715 target = gen_reg_rtx (tmode1);
34717 scratch0 = gen_reg_rtx (tmode0);
34719 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34721 else
34723 gcc_assert (d->flag);
34725 scratch0 = gen_reg_rtx (tmode0);
34726 scratch1 = gen_reg_rtx (tmode1);
34728 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34731 if (! pat)
34732 return 0;
34734 emit_insn (pat);
34736 if (d->flag)
34738 target = gen_reg_rtx (SImode);
34739 emit_move_insn (target, const0_rtx);
34740 target = gen_rtx_SUBREG (QImode, target, 0);
34742 emit_insn
34743 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34744 gen_rtx_fmt_ee (EQ, QImode,
34745 gen_rtx_REG ((machine_mode) d->flag,
34746 FLAGS_REG),
34747 const0_rtx)));
34748 return SUBREG_REG (target);
34750 else
34751 return target;
34755 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
34757 static rtx
34758 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34759 tree exp, rtx target)
34761 rtx pat;
34762 tree arg0 = CALL_EXPR_ARG (exp, 0);
34763 tree arg1 = CALL_EXPR_ARG (exp, 1);
34764 tree arg2 = CALL_EXPR_ARG (exp, 2);
34765 rtx scratch0, scratch1;
34766 rtx op0 = expand_normal (arg0);
34767 rtx op1 = expand_normal (arg1);
34768 rtx op2 = expand_normal (arg2);
34769 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34771 tmode0 = insn_data[d->icode].operand[0].mode;
34772 tmode1 = insn_data[d->icode].operand[1].mode;
34773 modev2 = insn_data[d->icode].operand[2].mode;
34774 modev3 = insn_data[d->icode].operand[3].mode;
34775 modeimm = insn_data[d->icode].operand[4].mode;
34777 if (VECTOR_MODE_P (modev2))
34778 op0 = safe_vector_operand (op0, modev2);
34779 if (VECTOR_MODE_P (modev3))
34780 op1 = safe_vector_operand (op1, modev3);
34782 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34783 op0 = copy_to_mode_reg (modev2, op0);
34784 if ((optimize && !register_operand (op1, modev3))
34785 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34786 op1 = copy_to_mode_reg (modev3, op1);
34788 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34790 error ("the third argument must be an 8-bit immediate");
34791 return const0_rtx;
34794 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34796 if (optimize || !target
34797 || GET_MODE (target) != tmode0
34798 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34799 target = gen_reg_rtx (tmode0);
34801 scratch1 = gen_reg_rtx (tmode1);
34803 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34805 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34807 if (optimize || !target
34808 || GET_MODE (target) != tmode1
34809 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34810 target = gen_reg_rtx (tmode1);
34812 scratch0 = gen_reg_rtx (tmode0);
34814 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34816 else
34818 gcc_assert (d->flag);
34820 scratch0 = gen_reg_rtx (tmode0);
34821 scratch1 = gen_reg_rtx (tmode1);
34823 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34826 if (! pat)
34827 return 0;
34829 emit_insn (pat);
34831 if (d->flag)
34833 target = gen_reg_rtx (SImode);
34834 emit_move_insn (target, const0_rtx);
34835 target = gen_rtx_SUBREG (QImode, target, 0);
34837 emit_insn
34838 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34839 gen_rtx_fmt_ee (EQ, QImode,
34840 gen_rtx_REG ((machine_mode) d->flag,
34841 FLAGS_REG),
34842 const0_rtx)));
34843 return SUBREG_REG (target);
34845 else
34846 return target;
34849 /* Subroutine of ix86_expand_builtin to take care of insns with
34850 a variable number of operands. */
34852 static rtx
34853 ix86_expand_args_builtin (const struct builtin_description *d,
34854 tree exp, rtx target)
34856 rtx pat, real_target;
34857 unsigned int i, nargs;
34858 unsigned int nargs_constant = 0;
34859 unsigned int mask_pos = 0;
34860 int num_memory = 0;
34861 struct
34863 rtx op;
34864 machine_mode mode;
34865 } args[6];
34866 bool last_arg_count = false;
34867 enum insn_code icode = d->icode;
34868 const struct insn_data_d *insn_p = &insn_data[icode];
34869 machine_mode tmode = insn_p->operand[0].mode;
34870 machine_mode rmode = VOIDmode;
34871 bool swap = false;
34872 enum rtx_code comparison = d->comparison;
34874 switch ((enum ix86_builtin_func_type) d->flag)
34876 case V2DF_FTYPE_V2DF_ROUND:
34877 case V4DF_FTYPE_V4DF_ROUND:
34878 case V8DF_FTYPE_V8DF_ROUND:
34879 case V4SF_FTYPE_V4SF_ROUND:
34880 case V8SF_FTYPE_V8SF_ROUND:
34881 case V16SF_FTYPE_V16SF_ROUND:
34882 case V4SI_FTYPE_V4SF_ROUND:
34883 case V8SI_FTYPE_V8SF_ROUND:
34884 case V16SI_FTYPE_V16SF_ROUND:
34885 return ix86_expand_sse_round (d, exp, target);
34886 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34887 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34888 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34889 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34890 case INT_FTYPE_V8SF_V8SF_PTEST:
34891 case INT_FTYPE_V4DI_V4DI_PTEST:
34892 case INT_FTYPE_V4DF_V4DF_PTEST:
34893 case INT_FTYPE_V4SF_V4SF_PTEST:
34894 case INT_FTYPE_V2DI_V2DI_PTEST:
34895 case INT_FTYPE_V2DF_V2DF_PTEST:
34896 return ix86_expand_sse_ptest (d, exp, target);
34897 case FLOAT128_FTYPE_FLOAT128:
34898 case FLOAT_FTYPE_FLOAT:
34899 case INT_FTYPE_INT:
34900 case UINT_FTYPE_UINT:
34901 case UINT16_FTYPE_UINT16:
34902 case UINT64_FTYPE_INT:
34903 case UINT64_FTYPE_UINT64:
34904 case INT64_FTYPE_INT64:
34905 case INT64_FTYPE_V4SF:
34906 case INT64_FTYPE_V2DF:
34907 case INT_FTYPE_V16QI:
34908 case INT_FTYPE_V8QI:
34909 case INT_FTYPE_V8SF:
34910 case INT_FTYPE_V4DF:
34911 case INT_FTYPE_V4SF:
34912 case INT_FTYPE_V2DF:
34913 case INT_FTYPE_V32QI:
34914 case V16QI_FTYPE_V16QI:
34915 case V8SI_FTYPE_V8SF:
34916 case V8SI_FTYPE_V4SI:
34917 case V8HI_FTYPE_V8HI:
34918 case V8HI_FTYPE_V16QI:
34919 case V8QI_FTYPE_V8QI:
34920 case V8SF_FTYPE_V8SF:
34921 case V8SF_FTYPE_V8SI:
34922 case V8SF_FTYPE_V4SF:
34923 case V8SF_FTYPE_V8HI:
34924 case V4SI_FTYPE_V4SI:
34925 case V4SI_FTYPE_V16QI:
34926 case V4SI_FTYPE_V4SF:
34927 case V4SI_FTYPE_V8SI:
34928 case V4SI_FTYPE_V8HI:
34929 case V4SI_FTYPE_V4DF:
34930 case V4SI_FTYPE_V2DF:
34931 case V4HI_FTYPE_V4HI:
34932 case V4DF_FTYPE_V4DF:
34933 case V4DF_FTYPE_V4SI:
34934 case V4DF_FTYPE_V4SF:
34935 case V4DF_FTYPE_V2DF:
34936 case V4SF_FTYPE_V4SF:
34937 case V4SF_FTYPE_V4SI:
34938 case V4SF_FTYPE_V8SF:
34939 case V4SF_FTYPE_V4DF:
34940 case V4SF_FTYPE_V8HI:
34941 case V4SF_FTYPE_V2DF:
34942 case V2DI_FTYPE_V2DI:
34943 case V2DI_FTYPE_V16QI:
34944 case V2DI_FTYPE_V8HI:
34945 case V2DI_FTYPE_V4SI:
34946 case V2DF_FTYPE_V2DF:
34947 case V2DF_FTYPE_V4SI:
34948 case V2DF_FTYPE_V4DF:
34949 case V2DF_FTYPE_V4SF:
34950 case V2DF_FTYPE_V2SI:
34951 case V2SI_FTYPE_V2SI:
34952 case V2SI_FTYPE_V4SF:
34953 case V2SI_FTYPE_V2SF:
34954 case V2SI_FTYPE_V2DF:
34955 case V2SF_FTYPE_V2SF:
34956 case V2SF_FTYPE_V2SI:
34957 case V32QI_FTYPE_V32QI:
34958 case V32QI_FTYPE_V16QI:
34959 case V16HI_FTYPE_V16HI:
34960 case V16HI_FTYPE_V8HI:
34961 case V8SI_FTYPE_V8SI:
34962 case V16HI_FTYPE_V16QI:
34963 case V8SI_FTYPE_V16QI:
34964 case V4DI_FTYPE_V16QI:
34965 case V8SI_FTYPE_V8HI:
34966 case V4DI_FTYPE_V8HI:
34967 case V4DI_FTYPE_V4SI:
34968 case V4DI_FTYPE_V2DI:
34969 case UQI_FTYPE_UQI:
34970 case UHI_FTYPE_UHI:
34971 case USI_FTYPE_USI:
34972 case USI_FTYPE_UQI:
34973 case USI_FTYPE_UHI:
34974 case UDI_FTYPE_UDI:
34975 case UHI_FTYPE_V16QI:
34976 case USI_FTYPE_V32QI:
34977 case UDI_FTYPE_V64QI:
34978 case V16QI_FTYPE_UHI:
34979 case V32QI_FTYPE_USI:
34980 case V64QI_FTYPE_UDI:
34981 case V8HI_FTYPE_UQI:
34982 case V16HI_FTYPE_UHI:
34983 case V32HI_FTYPE_USI:
34984 case V4SI_FTYPE_UQI:
34985 case V8SI_FTYPE_UQI:
34986 case V4SI_FTYPE_UHI:
34987 case V8SI_FTYPE_UHI:
34988 case UQI_FTYPE_V8HI:
34989 case UHI_FTYPE_V16HI:
34990 case USI_FTYPE_V32HI:
34991 case UQI_FTYPE_V4SI:
34992 case UQI_FTYPE_V8SI:
34993 case UHI_FTYPE_V16SI:
34994 case UQI_FTYPE_V2DI:
34995 case UQI_FTYPE_V4DI:
34996 case UQI_FTYPE_V8DI:
34997 case V16SI_FTYPE_UHI:
34998 case V2DI_FTYPE_UQI:
34999 case V4DI_FTYPE_UQI:
35000 case V16SI_FTYPE_INT:
35001 case V16SF_FTYPE_V8SF:
35002 case V16SI_FTYPE_V8SI:
35003 case V16SF_FTYPE_V4SF:
35004 case V16SI_FTYPE_V4SI:
35005 case V16SI_FTYPE_V16SF:
35006 case V16SI_FTYPE_V16SI:
35007 case V16SF_FTYPE_V16SF:
35008 case V8DI_FTYPE_UQI:
35009 case V8DI_FTYPE_V8DI:
35010 case V8DF_FTYPE_V4DF:
35011 case V8DF_FTYPE_V2DF:
35012 case V8DF_FTYPE_V8DF:
35013 nargs = 1;
35014 break;
35015 case V4SF_FTYPE_V4SF_VEC_MERGE:
35016 case V2DF_FTYPE_V2DF_VEC_MERGE:
35017 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
35018 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
35019 case V16QI_FTYPE_V16QI_V16QI:
35020 case V16QI_FTYPE_V8HI_V8HI:
35021 case V16SF_FTYPE_V16SF_V16SF:
35022 case V8QI_FTYPE_V8QI_V8QI:
35023 case V8QI_FTYPE_V4HI_V4HI:
35024 case V8HI_FTYPE_V8HI_V8HI:
35025 case V8HI_FTYPE_V16QI_V16QI:
35026 case V8HI_FTYPE_V4SI_V4SI:
35027 case V8SF_FTYPE_V8SF_V8SF:
35028 case V8SF_FTYPE_V8SF_V8SI:
35029 case V8DF_FTYPE_V8DF_V8DF:
35030 case V4SI_FTYPE_V4SI_V4SI:
35031 case V4SI_FTYPE_V8HI_V8HI:
35032 case V4SI_FTYPE_V2DF_V2DF:
35033 case V4HI_FTYPE_V4HI_V4HI:
35034 case V4HI_FTYPE_V8QI_V8QI:
35035 case V4HI_FTYPE_V2SI_V2SI:
35036 case V4DF_FTYPE_V4DF_V4DF:
35037 case V4DF_FTYPE_V4DF_V4DI:
35038 case V4SF_FTYPE_V4SF_V4SF:
35039 case V4SF_FTYPE_V4SF_V4SI:
35040 case V4SF_FTYPE_V4SF_V2SI:
35041 case V4SF_FTYPE_V4SF_V2DF:
35042 case V4SF_FTYPE_V4SF_UINT:
35043 case V4SF_FTYPE_V4SF_DI:
35044 case V4SF_FTYPE_V4SF_SI:
35045 case V2DI_FTYPE_V2DI_V2DI:
35046 case V2DI_FTYPE_V16QI_V16QI:
35047 case V2DI_FTYPE_V4SI_V4SI:
35048 case V2DI_FTYPE_V2DI_V16QI:
35049 case V2SI_FTYPE_V2SI_V2SI:
35050 case V2SI_FTYPE_V4HI_V4HI:
35051 case V2SI_FTYPE_V2SF_V2SF:
35052 case V2DF_FTYPE_V2DF_V2DF:
35053 case V2DF_FTYPE_V2DF_V4SF:
35054 case V2DF_FTYPE_V2DF_V2DI:
35055 case V2DF_FTYPE_V2DF_DI:
35056 case V2DF_FTYPE_V2DF_SI:
35057 case V2DF_FTYPE_V2DF_UINT:
35058 case V2SF_FTYPE_V2SF_V2SF:
35059 case V1DI_FTYPE_V1DI_V1DI:
35060 case V1DI_FTYPE_V8QI_V8QI:
35061 case V1DI_FTYPE_V2SI_V2SI:
35062 case V32QI_FTYPE_V16HI_V16HI:
35063 case V16HI_FTYPE_V8SI_V8SI:
35064 case V32QI_FTYPE_V32QI_V32QI:
35065 case V16HI_FTYPE_V32QI_V32QI:
35066 case V16HI_FTYPE_V16HI_V16HI:
35067 case V8SI_FTYPE_V4DF_V4DF:
35068 case V8SI_FTYPE_V8SI_V8SI:
35069 case V8SI_FTYPE_V16HI_V16HI:
35070 case V4DI_FTYPE_V4DI_V4DI:
35071 case V4DI_FTYPE_V8SI_V8SI:
35072 case V8DI_FTYPE_V64QI_V64QI:
35073 if (comparison == UNKNOWN)
35074 return ix86_expand_binop_builtin (icode, exp, target);
35075 nargs = 2;
35076 break;
35077 case V4SF_FTYPE_V4SF_V4SF_SWAP:
35078 case V2DF_FTYPE_V2DF_V2DF_SWAP:
35079 gcc_assert (comparison != UNKNOWN);
35080 nargs = 2;
35081 swap = true;
35082 break;
35083 case V16HI_FTYPE_V16HI_V8HI_COUNT:
35084 case V16HI_FTYPE_V16HI_SI_COUNT:
35085 case V8SI_FTYPE_V8SI_V4SI_COUNT:
35086 case V8SI_FTYPE_V8SI_SI_COUNT:
35087 case V4DI_FTYPE_V4DI_V2DI_COUNT:
35088 case V4DI_FTYPE_V4DI_INT_COUNT:
35089 case V8HI_FTYPE_V8HI_V8HI_COUNT:
35090 case V8HI_FTYPE_V8HI_SI_COUNT:
35091 case V4SI_FTYPE_V4SI_V4SI_COUNT:
35092 case V4SI_FTYPE_V4SI_SI_COUNT:
35093 case V4HI_FTYPE_V4HI_V4HI_COUNT:
35094 case V4HI_FTYPE_V4HI_SI_COUNT:
35095 case V2DI_FTYPE_V2DI_V2DI_COUNT:
35096 case V2DI_FTYPE_V2DI_SI_COUNT:
35097 case V2SI_FTYPE_V2SI_V2SI_COUNT:
35098 case V2SI_FTYPE_V2SI_SI_COUNT:
35099 case V1DI_FTYPE_V1DI_V1DI_COUNT:
35100 case V1DI_FTYPE_V1DI_SI_COUNT:
35101 nargs = 2;
35102 last_arg_count = true;
35103 break;
35104 case UINT64_FTYPE_UINT64_UINT64:
35105 case UINT_FTYPE_UINT_UINT:
35106 case UINT_FTYPE_UINT_USHORT:
35107 case UINT_FTYPE_UINT_UCHAR:
35108 case UINT16_FTYPE_UINT16_INT:
35109 case UINT8_FTYPE_UINT8_INT:
35110 case UQI_FTYPE_UQI_UQI:
35111 case UHI_FTYPE_UHI_UHI:
35112 case USI_FTYPE_USI_USI:
35113 case UDI_FTYPE_UDI_UDI:
35114 case V16SI_FTYPE_V8DF_V8DF:
35115 nargs = 2;
35116 break;
35117 case V2DI_FTYPE_V2DI_INT_CONVERT:
35118 nargs = 2;
35119 rmode = V1TImode;
35120 nargs_constant = 1;
35121 break;
35122 case V4DI_FTYPE_V4DI_INT_CONVERT:
35123 nargs = 2;
35124 rmode = V2TImode;
35125 nargs_constant = 1;
35126 break;
35127 case V8DI_FTYPE_V8DI_INT_CONVERT:
35128 nargs = 2;
35129 rmode = V4TImode;
35130 nargs_constant = 1;
35131 break;
35132 case V8HI_FTYPE_V8HI_INT:
35133 case V8HI_FTYPE_V8SF_INT:
35134 case V16HI_FTYPE_V16SF_INT:
35135 case V8HI_FTYPE_V4SF_INT:
35136 case V8SF_FTYPE_V8SF_INT:
35137 case V4SF_FTYPE_V16SF_INT:
35138 case V16SF_FTYPE_V16SF_INT:
35139 case V4SI_FTYPE_V4SI_INT:
35140 case V4SI_FTYPE_V8SI_INT:
35141 case V4HI_FTYPE_V4HI_INT:
35142 case V4DF_FTYPE_V4DF_INT:
35143 case V4DF_FTYPE_V8DF_INT:
35144 case V4SF_FTYPE_V4SF_INT:
35145 case V4SF_FTYPE_V8SF_INT:
35146 case V2DI_FTYPE_V2DI_INT:
35147 case V2DF_FTYPE_V2DF_INT:
35148 case V2DF_FTYPE_V4DF_INT:
35149 case V16HI_FTYPE_V16HI_INT:
35150 case V8SI_FTYPE_V8SI_INT:
35151 case V16SI_FTYPE_V16SI_INT:
35152 case V4SI_FTYPE_V16SI_INT:
35153 case V4DI_FTYPE_V4DI_INT:
35154 case V2DI_FTYPE_V4DI_INT:
35155 case V4DI_FTYPE_V8DI_INT:
35156 case QI_FTYPE_V4SF_INT:
35157 case QI_FTYPE_V2DF_INT:
35158 case UQI_FTYPE_UQI_UQI_CONST:
35159 case UHI_FTYPE_UHI_UQI:
35160 case USI_FTYPE_USI_UQI:
35161 case UDI_FTYPE_UDI_UQI:
35162 nargs = 2;
35163 nargs_constant = 1;
35164 break;
35165 case V16QI_FTYPE_V16QI_V16QI_V16QI:
35166 case V8SF_FTYPE_V8SF_V8SF_V8SF:
35167 case V4DF_FTYPE_V4DF_V4DF_V4DF:
35168 case V4SF_FTYPE_V4SF_V4SF_V4SF:
35169 case V2DF_FTYPE_V2DF_V2DF_V2DF:
35170 case V32QI_FTYPE_V32QI_V32QI_V32QI:
35171 case UHI_FTYPE_V16SI_V16SI_UHI:
35172 case UQI_FTYPE_V8DI_V8DI_UQI:
35173 case V16HI_FTYPE_V16SI_V16HI_UHI:
35174 case V16QI_FTYPE_V16SI_V16QI_UHI:
35175 case V16QI_FTYPE_V8DI_V16QI_UQI:
35176 case V16SF_FTYPE_V16SF_V16SF_UHI:
35177 case V16SF_FTYPE_V4SF_V16SF_UHI:
35178 case V16SI_FTYPE_SI_V16SI_UHI:
35179 case V16SI_FTYPE_V16HI_V16SI_UHI:
35180 case V16SI_FTYPE_V16QI_V16SI_UHI:
35181 case V8SF_FTYPE_V4SF_V8SF_UQI:
35182 case V4DF_FTYPE_V2DF_V4DF_UQI:
35183 case V8SI_FTYPE_V4SI_V8SI_UQI:
35184 case V8SI_FTYPE_SI_V8SI_UQI:
35185 case V4SI_FTYPE_V4SI_V4SI_UQI:
35186 case V4SI_FTYPE_SI_V4SI_UQI:
35187 case V4DI_FTYPE_V2DI_V4DI_UQI:
35188 case V4DI_FTYPE_DI_V4DI_UQI:
35189 case V2DI_FTYPE_V2DI_V2DI_UQI:
35190 case V2DI_FTYPE_DI_V2DI_UQI:
35191 case V64QI_FTYPE_V64QI_V64QI_UDI:
35192 case V64QI_FTYPE_V16QI_V64QI_UDI:
35193 case V64QI_FTYPE_QI_V64QI_UDI:
35194 case V32QI_FTYPE_V32QI_V32QI_USI:
35195 case V32QI_FTYPE_V16QI_V32QI_USI:
35196 case V32QI_FTYPE_QI_V32QI_USI:
35197 case V16QI_FTYPE_V16QI_V16QI_UHI:
35198 case V16QI_FTYPE_QI_V16QI_UHI:
35199 case V32HI_FTYPE_V8HI_V32HI_USI:
35200 case V32HI_FTYPE_HI_V32HI_USI:
35201 case V16HI_FTYPE_V8HI_V16HI_UHI:
35202 case V16HI_FTYPE_HI_V16HI_UHI:
35203 case V8HI_FTYPE_V8HI_V8HI_UQI:
35204 case V8HI_FTYPE_HI_V8HI_UQI:
35205 case V8SF_FTYPE_V8HI_V8SF_UQI:
35206 case V4SF_FTYPE_V8HI_V4SF_UQI:
35207 case V8SI_FTYPE_V8SF_V8SI_UQI:
35208 case V4SI_FTYPE_V4SF_V4SI_UQI:
35209 case V4DI_FTYPE_V4SF_V4DI_UQI:
35210 case V2DI_FTYPE_V4SF_V2DI_UQI:
35211 case V4SF_FTYPE_V4DI_V4SF_UQI:
35212 case V4SF_FTYPE_V2DI_V4SF_UQI:
35213 case V4DF_FTYPE_V4DI_V4DF_UQI:
35214 case V2DF_FTYPE_V2DI_V2DF_UQI:
35215 case V16QI_FTYPE_V8HI_V16QI_UQI:
35216 case V16QI_FTYPE_V16HI_V16QI_UHI:
35217 case V16QI_FTYPE_V4SI_V16QI_UQI:
35218 case V16QI_FTYPE_V8SI_V16QI_UQI:
35219 case V8HI_FTYPE_V4SI_V8HI_UQI:
35220 case V8HI_FTYPE_V8SI_V8HI_UQI:
35221 case V16QI_FTYPE_V2DI_V16QI_UQI:
35222 case V16QI_FTYPE_V4DI_V16QI_UQI:
35223 case V8HI_FTYPE_V2DI_V8HI_UQI:
35224 case V8HI_FTYPE_V4DI_V8HI_UQI:
35225 case V4SI_FTYPE_V2DI_V4SI_UQI:
35226 case V4SI_FTYPE_V4DI_V4SI_UQI:
35227 case V32QI_FTYPE_V32HI_V32QI_USI:
35228 case UHI_FTYPE_V16QI_V16QI_UHI:
35229 case USI_FTYPE_V32QI_V32QI_USI:
35230 case UDI_FTYPE_V64QI_V64QI_UDI:
35231 case UQI_FTYPE_V8HI_V8HI_UQI:
35232 case UHI_FTYPE_V16HI_V16HI_UHI:
35233 case USI_FTYPE_V32HI_V32HI_USI:
35234 case UQI_FTYPE_V4SI_V4SI_UQI:
35235 case UQI_FTYPE_V8SI_V8SI_UQI:
35236 case UQI_FTYPE_V2DI_V2DI_UQI:
35237 case UQI_FTYPE_V4DI_V4DI_UQI:
35238 case V4SF_FTYPE_V2DF_V4SF_UQI:
35239 case V4SF_FTYPE_V4DF_V4SF_UQI:
35240 case V16SI_FTYPE_V16SI_V16SI_UHI:
35241 case V16SI_FTYPE_V4SI_V16SI_UHI:
35242 case V2DI_FTYPE_V4SI_V2DI_UQI:
35243 case V2DI_FTYPE_V8HI_V2DI_UQI:
35244 case V2DI_FTYPE_V16QI_V2DI_UQI:
35245 case V4DI_FTYPE_V4DI_V4DI_UQI:
35246 case V4DI_FTYPE_V4SI_V4DI_UQI:
35247 case V4DI_FTYPE_V8HI_V4DI_UQI:
35248 case V4DI_FTYPE_V16QI_V4DI_UQI:
35249 case V4DI_FTYPE_V4DF_V4DI_UQI:
35250 case V2DI_FTYPE_V2DF_V2DI_UQI:
35251 case V4SI_FTYPE_V4DF_V4SI_UQI:
35252 case V4SI_FTYPE_V2DF_V4SI_UQI:
35253 case V4SI_FTYPE_V8HI_V4SI_UQI:
35254 case V4SI_FTYPE_V16QI_V4SI_UQI:
35255 case V4DI_FTYPE_V4DI_V4DI_V4DI:
35256 case V8DF_FTYPE_V2DF_V8DF_UQI:
35257 case V8DF_FTYPE_V4DF_V8DF_UQI:
35258 case V8DF_FTYPE_V8DF_V8DF_UQI:
35259 case V8SF_FTYPE_V8SF_V8SF_UQI:
35260 case V8SF_FTYPE_V8SI_V8SF_UQI:
35261 case V4DF_FTYPE_V4DF_V4DF_UQI:
35262 case V4SF_FTYPE_V4SF_V4SF_UQI:
35263 case V2DF_FTYPE_V2DF_V2DF_UQI:
35264 case V2DF_FTYPE_V4SF_V2DF_UQI:
35265 case V2DF_FTYPE_V4SI_V2DF_UQI:
35266 case V4SF_FTYPE_V4SI_V4SF_UQI:
35267 case V4DF_FTYPE_V4SF_V4DF_UQI:
35268 case V4DF_FTYPE_V4SI_V4DF_UQI:
35269 case V8SI_FTYPE_V8SI_V8SI_UQI:
35270 case V8SI_FTYPE_V8HI_V8SI_UQI:
35271 case V8SI_FTYPE_V16QI_V8SI_UQI:
35272 case V8DF_FTYPE_V8SI_V8DF_UQI:
35273 case V8DI_FTYPE_DI_V8DI_UQI:
35274 case V16SF_FTYPE_V8SF_V16SF_UHI:
35275 case V16SI_FTYPE_V8SI_V16SI_UHI:
35276 case V16HI_FTYPE_V16HI_V16HI_UHI:
35277 case V8HI_FTYPE_V16QI_V8HI_UQI:
35278 case V16HI_FTYPE_V16QI_V16HI_UHI:
35279 case V32HI_FTYPE_V32HI_V32HI_USI:
35280 case V32HI_FTYPE_V32QI_V32HI_USI:
35281 case V8DI_FTYPE_V16QI_V8DI_UQI:
35282 case V8DI_FTYPE_V2DI_V8DI_UQI:
35283 case V8DI_FTYPE_V4DI_V8DI_UQI:
35284 case V8DI_FTYPE_V8DI_V8DI_UQI:
35285 case V8DI_FTYPE_V8HI_V8DI_UQI:
35286 case V8DI_FTYPE_V8SI_V8DI_UQI:
35287 case V8HI_FTYPE_V8DI_V8HI_UQI:
35288 case V8SI_FTYPE_V8DI_V8SI_UQI:
35289 case V4SI_FTYPE_V4SI_V4SI_V4SI:
35290 nargs = 3;
35291 break;
35292 case V32QI_FTYPE_V32QI_V32QI_INT:
35293 case V16HI_FTYPE_V16HI_V16HI_INT:
35294 case V16QI_FTYPE_V16QI_V16QI_INT:
35295 case V4DI_FTYPE_V4DI_V4DI_INT:
35296 case V8HI_FTYPE_V8HI_V8HI_INT:
35297 case V8SI_FTYPE_V8SI_V8SI_INT:
35298 case V8SI_FTYPE_V8SI_V4SI_INT:
35299 case V8SF_FTYPE_V8SF_V8SF_INT:
35300 case V8SF_FTYPE_V8SF_V4SF_INT:
35301 case V4SI_FTYPE_V4SI_V4SI_INT:
35302 case V4DF_FTYPE_V4DF_V4DF_INT:
35303 case V16SF_FTYPE_V16SF_V16SF_INT:
35304 case V16SF_FTYPE_V16SF_V4SF_INT:
35305 case V16SI_FTYPE_V16SI_V4SI_INT:
35306 case V4DF_FTYPE_V4DF_V2DF_INT:
35307 case V4SF_FTYPE_V4SF_V4SF_INT:
35308 case V2DI_FTYPE_V2DI_V2DI_INT:
35309 case V4DI_FTYPE_V4DI_V2DI_INT:
35310 case V2DF_FTYPE_V2DF_V2DF_INT:
35311 case UQI_FTYPE_V8DI_V8UDI_INT:
35312 case UQI_FTYPE_V8DF_V8DF_INT:
35313 case UQI_FTYPE_V2DF_V2DF_INT:
35314 case UQI_FTYPE_V4SF_V4SF_INT:
35315 case UHI_FTYPE_V16SI_V16SI_INT:
35316 case UHI_FTYPE_V16SF_V16SF_INT:
35317 nargs = 3;
35318 nargs_constant = 1;
35319 break;
35320 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
35321 nargs = 3;
35322 rmode = V4DImode;
35323 nargs_constant = 1;
35324 break;
35325 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
35326 nargs = 3;
35327 rmode = V2DImode;
35328 nargs_constant = 1;
35329 break;
35330 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
35331 nargs = 3;
35332 rmode = DImode;
35333 nargs_constant = 1;
35334 break;
35335 case V2DI_FTYPE_V2DI_UINT_UINT:
35336 nargs = 3;
35337 nargs_constant = 2;
35338 break;
35339 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
35340 nargs = 3;
35341 rmode = V8DImode;
35342 nargs_constant = 1;
35343 break;
35344 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
35345 nargs = 5;
35346 rmode = V8DImode;
35347 mask_pos = 2;
35348 nargs_constant = 1;
35349 break;
35350 case QI_FTYPE_V8DF_INT_UQI:
35351 case QI_FTYPE_V4DF_INT_UQI:
35352 case QI_FTYPE_V2DF_INT_UQI:
35353 case HI_FTYPE_V16SF_INT_UHI:
35354 case QI_FTYPE_V8SF_INT_UQI:
35355 case QI_FTYPE_V4SF_INT_UQI:
35356 nargs = 3;
35357 mask_pos = 1;
35358 nargs_constant = 1;
35359 break;
35360 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
35361 nargs = 5;
35362 rmode = V4DImode;
35363 mask_pos = 2;
35364 nargs_constant = 1;
35365 break;
35366 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
35367 nargs = 5;
35368 rmode = V2DImode;
35369 mask_pos = 2;
35370 nargs_constant = 1;
35371 break;
35372 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
35373 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
35374 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
35375 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
35376 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
35377 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
35378 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
35379 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
35380 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
35381 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
35382 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
35383 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
35384 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
35385 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
35386 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
35387 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
35388 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
35389 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
35390 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
35391 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
35392 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
35393 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
35394 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
35395 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
35396 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
35397 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
35398 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
35399 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
35400 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
35401 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
35402 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
35403 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
35404 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
35405 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
35406 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
35407 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
35408 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
35409 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
35410 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
35411 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
35412 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
35413 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
35414 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
35415 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
35416 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
35417 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
35418 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
35419 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
35420 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
35421 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
35422 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
35423 nargs = 4;
35424 break;
35425 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
35426 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
35427 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
35428 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
35429 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
35430 nargs = 4;
35431 nargs_constant = 1;
35432 break;
35433 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
35434 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
35435 case QI_FTYPE_V4DF_V4DF_INT_UQI:
35436 case QI_FTYPE_V8SF_V8SF_INT_UQI:
35437 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
35438 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
35439 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
35440 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
35441 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
35442 case USI_FTYPE_V32QI_V32QI_INT_USI:
35443 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
35444 case USI_FTYPE_V32HI_V32HI_INT_USI:
35445 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
35446 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
35447 nargs = 4;
35448 mask_pos = 1;
35449 nargs_constant = 1;
35450 break;
35451 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
35452 nargs = 4;
35453 nargs_constant = 2;
35454 break;
35455 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
35456 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
35457 nargs = 4;
35458 break;
35459 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
35460 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
35461 mask_pos = 1;
35462 nargs = 4;
35463 nargs_constant = 1;
35464 break;
35465 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
35466 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
35467 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
35468 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
35469 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
35470 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
35471 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
35472 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
35473 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
35474 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
35475 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
35476 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
35477 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
35478 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
35479 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
35480 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
35481 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
35482 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
35483 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
35484 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
35485 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
35486 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
35487 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
35488 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
35489 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
35490 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
35491 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
35492 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
35493 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
35494 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
35495 nargs = 4;
35496 mask_pos = 2;
35497 nargs_constant = 1;
35498 break;
35499 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
35500 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
35501 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
35502 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
35503 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
35504 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
35505 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
35506 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
35507 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
35508 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
35509 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
35510 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
35511 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
35512 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
35513 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
35514 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
35515 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
35516 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
35517 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
35518 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
35519 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
35520 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
35521 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
35522 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
35523 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
35524 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
35525 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
35526 nargs = 5;
35527 mask_pos = 2;
35528 nargs_constant = 1;
35529 break;
35530 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
35531 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
35532 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
35533 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
35534 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
35535 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
35536 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
35537 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
35538 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
35539 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
35540 nargs = 5;
35541 mask_pos = 1;
35542 nargs_constant = 1;
35543 break;
35545 default:
35546 gcc_unreachable ();
35549 gcc_assert (nargs <= ARRAY_SIZE (args));
35551 if (comparison != UNKNOWN)
35553 gcc_assert (nargs == 2);
35554 return ix86_expand_sse_compare (d, exp, target, swap);
35557 if (rmode == VOIDmode || rmode == tmode)
35559 if (optimize
35560 || target == 0
35561 || GET_MODE (target) != tmode
35562 || !insn_p->operand[0].predicate (target, tmode))
35563 target = gen_reg_rtx (tmode);
35564 else if (memory_operand (target, tmode))
35565 num_memory++;
35566 real_target = target;
35568 else
35570 real_target = gen_reg_rtx (tmode);
35571 target = lowpart_subreg (rmode, real_target, tmode);
35574 for (i = 0; i < nargs; i++)
35576 tree arg = CALL_EXPR_ARG (exp, i);
35577 rtx op = expand_normal (arg);
35578 machine_mode mode = insn_p->operand[i + 1].mode;
35579 bool match = insn_p->operand[i + 1].predicate (op, mode);
35581 if (last_arg_count && (i + 1) == nargs)
35583 /* SIMD shift insns take either an 8-bit immediate or
35584 a register as count. But builtin functions take int as
35585 count. If the count doesn't match, we put it in a register. */
35586 if (!match)
35588 op = lowpart_subreg (SImode, op, GET_MODE (op));
35589 if (!insn_p->operand[i + 1].predicate (op, mode))
35590 op = copy_to_reg (op);
35593 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35594 (!mask_pos && (nargs - i) <= nargs_constant))
35596 if (!match)
35597 switch (icode)
35599 case CODE_FOR_avx_vinsertf128v4di:
35600 case CODE_FOR_avx_vextractf128v4di:
35601 error ("the last argument must be a 1-bit immediate");
35602 return const0_rtx;
35604 case CODE_FOR_avx512f_cmpv8di3_mask:
35605 case CODE_FOR_avx512f_cmpv16si3_mask:
35606 case CODE_FOR_avx512f_ucmpv8di3_mask:
35607 case CODE_FOR_avx512f_ucmpv16si3_mask:
35608 case CODE_FOR_avx512vl_cmpv4di3_mask:
35609 case CODE_FOR_avx512vl_cmpv8si3_mask:
35610 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35611 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35612 case CODE_FOR_avx512vl_cmpv2di3_mask:
35613 case CODE_FOR_avx512vl_cmpv4si3_mask:
35614 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35615 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35616 error ("the last argument must be a 3-bit immediate");
35617 return const0_rtx;
35619 case CODE_FOR_sse4_1_roundsd:
35620 case CODE_FOR_sse4_1_roundss:
35622 case CODE_FOR_sse4_1_roundpd:
35623 case CODE_FOR_sse4_1_roundps:
35624 case CODE_FOR_avx_roundpd256:
35625 case CODE_FOR_avx_roundps256:
35627 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35628 case CODE_FOR_sse4_1_roundps_sfix:
35629 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35630 case CODE_FOR_avx_roundps_sfix256:
35632 case CODE_FOR_sse4_1_blendps:
35633 case CODE_FOR_avx_blendpd256:
35634 case CODE_FOR_avx_vpermilv4df:
35635 case CODE_FOR_avx_vpermilv4df_mask:
35636 case CODE_FOR_avx512f_getmantv8df_mask:
35637 case CODE_FOR_avx512f_getmantv16sf_mask:
35638 case CODE_FOR_avx512vl_getmantv8sf_mask:
35639 case CODE_FOR_avx512vl_getmantv4df_mask:
35640 case CODE_FOR_avx512vl_getmantv4sf_mask:
35641 case CODE_FOR_avx512vl_getmantv2df_mask:
35642 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35643 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35644 case CODE_FOR_avx512dq_rangepv4df_mask:
35645 case CODE_FOR_avx512dq_rangepv8sf_mask:
35646 case CODE_FOR_avx512dq_rangepv2df_mask:
35647 case CODE_FOR_avx512dq_rangepv4sf_mask:
35648 case CODE_FOR_avx_shufpd256_mask:
35649 error ("the last argument must be a 4-bit immediate");
35650 return const0_rtx;
35652 case CODE_FOR_sha1rnds4:
35653 case CODE_FOR_sse4_1_blendpd:
35654 case CODE_FOR_avx_vpermilv2df:
35655 case CODE_FOR_avx_vpermilv2df_mask:
35656 case CODE_FOR_xop_vpermil2v2df3:
35657 case CODE_FOR_xop_vpermil2v4sf3:
35658 case CODE_FOR_xop_vpermil2v4df3:
35659 case CODE_FOR_xop_vpermil2v8sf3:
35660 case CODE_FOR_avx512f_vinsertf32x4_mask:
35661 case CODE_FOR_avx512f_vinserti32x4_mask:
35662 case CODE_FOR_avx512f_vextractf32x4_mask:
35663 case CODE_FOR_avx512f_vextracti32x4_mask:
35664 case CODE_FOR_sse2_shufpd:
35665 case CODE_FOR_sse2_shufpd_mask:
35666 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35667 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35668 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35669 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35670 error ("the last argument must be a 2-bit immediate");
35671 return const0_rtx;
35673 case CODE_FOR_avx_vextractf128v4df:
35674 case CODE_FOR_avx_vextractf128v8sf:
35675 case CODE_FOR_avx_vextractf128v8si:
35676 case CODE_FOR_avx_vinsertf128v4df:
35677 case CODE_FOR_avx_vinsertf128v8sf:
35678 case CODE_FOR_avx_vinsertf128v8si:
35679 case CODE_FOR_avx512f_vinsertf64x4_mask:
35680 case CODE_FOR_avx512f_vinserti64x4_mask:
35681 case CODE_FOR_avx512f_vextractf64x4_mask:
35682 case CODE_FOR_avx512f_vextracti64x4_mask:
35683 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35684 case CODE_FOR_avx512dq_vinserti32x8_mask:
35685 case CODE_FOR_avx512vl_vinsertv4df:
35686 case CODE_FOR_avx512vl_vinsertv4di:
35687 case CODE_FOR_avx512vl_vinsertv8sf:
35688 case CODE_FOR_avx512vl_vinsertv8si:
35689 error ("the last argument must be a 1-bit immediate");
35690 return const0_rtx;
35692 case CODE_FOR_avx_vmcmpv2df3:
35693 case CODE_FOR_avx_vmcmpv4sf3:
35694 case CODE_FOR_avx_cmpv2df3:
35695 case CODE_FOR_avx_cmpv4sf3:
35696 case CODE_FOR_avx_cmpv4df3:
35697 case CODE_FOR_avx_cmpv8sf3:
35698 case CODE_FOR_avx512f_cmpv8df3_mask:
35699 case CODE_FOR_avx512f_cmpv16sf3_mask:
35700 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35701 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35702 error ("the last argument must be a 5-bit immediate");
35703 return const0_rtx;
35705 default:
35706 switch (nargs_constant)
35708 case 2:
35709 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35710 (!mask_pos && (nargs - i) == nargs_constant))
35712 error ("the next to last argument must be an 8-bit immediate");
35713 break;
35715 /* FALLTHRU */
35716 case 1:
35717 error ("the last argument must be an 8-bit immediate");
35718 break;
35719 default:
35720 gcc_unreachable ();
35722 return const0_rtx;
35725 else
35727 if (VECTOR_MODE_P (mode))
35728 op = safe_vector_operand (op, mode);
35730 /* If we aren't optimizing, only allow one memory operand to
35731 be generated. */
35732 if (memory_operand (op, mode))
35733 num_memory++;
35735 op = fixup_modeless_constant (op, mode);
35737 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35739 if (optimize || !match || num_memory > 1)
35740 op = copy_to_mode_reg (mode, op);
35742 else
35744 op = copy_to_reg (op);
35745 op = lowpart_subreg (mode, op, GET_MODE (op));
35749 args[i].op = op;
35750 args[i].mode = mode;
35753 switch (nargs)
35755 case 1:
35756 pat = GEN_FCN (icode) (real_target, args[0].op);
35757 break;
35758 case 2:
35759 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35760 break;
35761 case 3:
35762 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35763 args[2].op);
35764 break;
35765 case 4:
35766 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35767 args[2].op, args[3].op);
35768 break;
35769 case 5:
35770 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35771 args[2].op, args[3].op, args[4].op);
35772 break;
35773 case 6:
35774 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35775 args[2].op, args[3].op, args[4].op,
35776 args[5].op);
35777 break;
35778 default:
35779 gcc_unreachable ();
35782 if (! pat)
35783 return 0;
35785 emit_insn (pat);
35786 return target;
35789 /* Transform a pattern of the following layout:
35790 (parallel [
35791 set (A B)
35792 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)])
35794 into:
35795 (set (A B))
35797 Or:
35798 (parallel [ A B
35799 ...
35800 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)
35801 ...
35802 ])
35803 into:
35804 (parallel [ A B ... ]) */
35806 static rtx
35807 ix86_erase_embedded_rounding (rtx pat)
35809 if (GET_CODE (pat) == INSN)
35810 pat = PATTERN (pat);
35812 gcc_assert (GET_CODE (pat) == PARALLEL);
35814 if (XVECLEN (pat, 0) == 2)
35816 rtx p0 = XVECEXP (pat, 0, 0);
35817 rtx p1 = XVECEXP (pat, 0, 1);
35819 gcc_assert (GET_CODE (p0) == SET
35820 && GET_CODE (p1) == UNSPEC
35821 && XINT (p1, 1) == UNSPEC_EMBEDDED_ROUNDING);
35823 return p0;
35825 else
35827 rtx *res = XALLOCAVEC (rtx, XVECLEN (pat, 0));
35828 int i = 0;
35829 int j = 0;
35831 for (; i < XVECLEN (pat, 0); ++i)
35833 rtx elem = XVECEXP (pat, 0, i);
35834 if (GET_CODE (elem) != UNSPEC
35835 || XINT (elem, 1) != UNSPEC_EMBEDDED_ROUNDING)
35836 res [j++] = elem;
35839 /* No more than 1 occurrence was removed. */
35840 gcc_assert (j >= XVECLEN (pat, 0) - 1);
35842 return gen_rtx_PARALLEL (GET_MODE (pat), gen_rtvec_v (j, res));
35846 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35847 with rounding. */
35848 static rtx
35849 ix86_expand_sse_comi_round (const struct builtin_description *d,
35850 tree exp, rtx target)
35852 rtx pat, set_dst;
35853 tree arg0 = CALL_EXPR_ARG (exp, 0);
35854 tree arg1 = CALL_EXPR_ARG (exp, 1);
35855 tree arg2 = CALL_EXPR_ARG (exp, 2);
35856 tree arg3 = CALL_EXPR_ARG (exp, 3);
35857 rtx op0 = expand_normal (arg0);
35858 rtx op1 = expand_normal (arg1);
35859 rtx op2 = expand_normal (arg2);
35860 rtx op3 = expand_normal (arg3);
35861 enum insn_code icode = d->icode;
35862 const struct insn_data_d *insn_p = &insn_data[icode];
35863 machine_mode mode0 = insn_p->operand[0].mode;
35864 machine_mode mode1 = insn_p->operand[1].mode;
35865 enum rtx_code comparison = UNEQ;
35866 bool need_ucomi = false;
35868 /* See avxintrin.h for values. */
35869 enum rtx_code comi_comparisons[32] =
35871 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35872 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35873 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35875 bool need_ucomi_values[32] =
35877 true, false, false, true, true, false, false, true,
35878 true, false, false, true, true, false, false, true,
35879 false, true, true, false, false, true, true, false,
35880 false, true, true, false, false, true, true, false
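/* The 5-bit comparison immediate (the _CMP_* values from avxintrin.h)
   indexes both tables above: comi_comparisons gives the RTX comparison
   code to emit, and need_ucomi_values says whether the quiet (ucomi)
   form of the instruction is required. */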
35883 if (!CONST_INT_P (op2))
35885 error ("the third argument must be a comparison constant");
35886 return const0_rtx;
35888 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35890 error ("incorrect comparison mode");
35891 return const0_rtx;
35894 if (!insn_p->operand[2].predicate (op3, SImode))
35896 error ("incorrect rounding operand");
35897 return const0_rtx;
35900 comparison = comi_comparisons[INTVAL (op2)];
35901 need_ucomi = need_ucomi_values[INTVAL (op2)];
35903 if (VECTOR_MODE_P (mode0))
35904 op0 = safe_vector_operand (op0, mode0);
35905 if (VECTOR_MODE_P (mode1))
35906 op1 = safe_vector_operand (op1, mode1);
35908 target = gen_reg_rtx (SImode);
35909 emit_move_insn (target, const0_rtx);
35910 target = gen_rtx_SUBREG (QImode, target, 0);
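/* The comparison result is materialized by setting only the low byte
   (via STRICT_LOW_PART below) of an SImode register that has just been
   zeroed, so SUBREG_REG (target) is already the zero-extended int
   result that the builtin returns. */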
35912 if ((optimize && !register_operand (op0, mode0))
35913 || !insn_p->operand[0].predicate (op0, mode0))
35914 op0 = copy_to_mode_reg (mode0, op0);
35915 if ((optimize && !register_operand (op1, mode1))
35916 || !insn_p->operand[1].predicate (op1, mode1))
35917 op1 = copy_to_mode_reg (mode1, op1);
35919 if (need_ucomi)
35920 icode = icode == CODE_FOR_sse_comi_round
35921 ? CODE_FOR_sse_ucomi_round
35922 : CODE_FOR_sse2_ucomi_round;
35924 pat = GEN_FCN (icode) (op0, op1, op3);
35925 if (! pat)
35926 return 0;
35928 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35929 if (INTVAL (op3) == NO_ROUND)
35931 pat = ix86_erase_embedded_rounding (pat);
35932 if (! pat)
35933 return 0;
35935 set_dst = SET_DEST (pat);
35937 else
35939 gcc_assert (GET_CODE (XVECEXP (pat, 0, 0)) == SET);
35940 set_dst = SET_DEST (XVECEXP (pat, 0, 0));
35943 emit_insn (pat);
35944 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35945 gen_rtx_fmt_ee (comparison, QImode,
35946 set_dst,
35947 const0_rtx)));
35949 return SUBREG_REG (target);
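/* Subroutine of ix86_expand_builtin to take care of insns that carry an
   explicit rounding/SAE operand as their last argument.  When that
   operand is NO_ROUND, the embedded-rounding UNSPEC is stripped again
   via ix86_erase_embedded_rounding so the normal form of the pattern
   is used. */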
35952 static rtx
35953 ix86_expand_round_builtin (const struct builtin_description *d,
35954 tree exp, rtx target)
35956 rtx pat;
35957 unsigned int i, nargs;
35958 struct
35960 rtx op;
35961 machine_mode mode;
35962 } args[6];
35963 enum insn_code icode = d->icode;
35964 const struct insn_data_d *insn_p = &insn_data[icode];
35965 machine_mode tmode = insn_p->operand[0].mode;
35966 unsigned int nargs_constant = 0;
35967 unsigned int redundant_embed_rnd = 0;
35969 switch ((enum ix86_builtin_func_type) d->flag)
35971 case UINT64_FTYPE_V2DF_INT:
35972 case UINT64_FTYPE_V4SF_INT:
35973 case UINT_FTYPE_V2DF_INT:
35974 case UINT_FTYPE_V4SF_INT:
35975 case INT64_FTYPE_V2DF_INT:
35976 case INT64_FTYPE_V4SF_INT:
35977 case INT_FTYPE_V2DF_INT:
35978 case INT_FTYPE_V4SF_INT:
35979 nargs = 2;
35980 break;
35981 case V4SF_FTYPE_V4SF_UINT_INT:
35982 case V4SF_FTYPE_V4SF_UINT64_INT:
35983 case V2DF_FTYPE_V2DF_UINT64_INT:
35984 case V4SF_FTYPE_V4SF_INT_INT:
35985 case V4SF_FTYPE_V4SF_INT64_INT:
35986 case V2DF_FTYPE_V2DF_INT64_INT:
35987 case V4SF_FTYPE_V4SF_V4SF_INT:
35988 case V2DF_FTYPE_V2DF_V2DF_INT:
35989 case V4SF_FTYPE_V4SF_V2DF_INT:
35990 case V2DF_FTYPE_V2DF_V4SF_INT:
35991 nargs = 3;
35992 break;
35993 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35994 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35995 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35996 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35997 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35998 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35999 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
36000 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
36001 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
36002 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
36003 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
36004 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
36005 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
36006 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
36007 nargs = 4;
36008 break;
36009 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
36010 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
36011 nargs_constant = 2;
36012 nargs = 4;
36013 break;
36014 case INT_FTYPE_V4SF_V4SF_INT_INT:
36015 case INT_FTYPE_V2DF_V2DF_INT_INT:
36016 return ix86_expand_sse_comi_round (d, exp, target);
36017 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
36018 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
36019 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
36020 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
36021 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
36022 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
36023 nargs = 5;
36024 break;
36025 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
36026 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
36027 nargs_constant = 4;
36028 nargs = 5;
36029 break;
36030 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
36031 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
36032 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
36033 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
36034 nargs_constant = 3;
36035 nargs = 5;
36036 break;
36037 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
36038 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
36039 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
36040 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
36041 nargs = 6;
36042 nargs_constant = 4;
36043 break;
36044 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
36045 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
36046 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
36047 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
36048 nargs = 6;
36049 nargs_constant = 3;
36050 break;
36051 default:
36052 gcc_unreachable ();
36054 gcc_assert (nargs <= ARRAY_SIZE (args));
36056 if (optimize
36057 || target == 0
36058 || GET_MODE (target) != tmode
36059 || !insn_p->operand[0].predicate (target, tmode))
36060 target = gen_reg_rtx (tmode);
36062 for (i = 0; i < nargs; i++)
36064 tree arg = CALL_EXPR_ARG (exp, i);
36065 rtx op = expand_normal (arg);
36066 machine_mode mode = insn_p->operand[i + 1].mode;
36067 bool match = insn_p->operand[i + 1].predicate (op, mode);
36069 if (i == nargs - nargs_constant)
36071 if (!match)
36073 switch (icode)
36075 case CODE_FOR_avx512f_getmantv8df_mask_round:
36076 case CODE_FOR_avx512f_getmantv16sf_mask_round:
36077 case CODE_FOR_avx512f_vgetmantv2df_round:
36078 case CODE_FOR_avx512f_vgetmantv4sf_round:
36079 error ("the immediate argument must be a 4-bit immediate");
36080 return const0_rtx;
36081 case CODE_FOR_avx512f_cmpv8df3_mask_round:
36082 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
36083 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
36084 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
36085 error ("the immediate argument must be a 5-bit immediate");
36086 return const0_rtx;
36087 default:
36088 error ("the immediate argument must be an 8-bit immediate");
36089 return const0_rtx;
36093 else if (i == nargs-1)
36095 if (!insn_p->operand[nargs].predicate (op, SImode))
36097 error ("incorrect rounding operand");
36098 return const0_rtx;
36101 /* If there is no rounding, use the normal version of the pattern. */
36102 if (INTVAL (op) == NO_ROUND)
36103 redundant_embed_rnd = 1;
36105 else
36107 if (VECTOR_MODE_P (mode))
36108 op = safe_vector_operand (op, mode);
36110 op = fixup_modeless_constant (op, mode);
36112 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36114 if (optimize || !match)
36115 op = copy_to_mode_reg (mode, op);
36117 else
36119 op = copy_to_reg (op);
36120 op = lowpart_subreg (mode, op, GET_MODE (op));
36124 args[i].op = op;
36125 args[i].mode = mode;
36128 switch (nargs)
36130 case 1:
36131 pat = GEN_FCN (icode) (target, args[0].op);
36132 break;
36133 case 2:
36134 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36135 break;
36136 case 3:
36137 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36138 args[2].op);
36139 break;
36140 case 4:
36141 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36142 args[2].op, args[3].op);
36143 break;
36144 case 5:
36145 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36146 args[2].op, args[3].op, args[4].op);
36147 break;
36148 case 6:
36149 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36150 args[2].op, args[3].op, args[4].op,
36151 args[5].op);
36152 break;
36153 default:
36154 gcc_unreachable ();
36157 if (!pat)
36158 return 0;
36160 if (redundant_embed_rnd)
36161 pat = ix86_erase_embedded_rounding (pat);
36163 emit_insn (pat);
36164 return target;
36167 /* Subroutine of ix86_expand_builtin to take care of special insns
36168 with a variable number of operands. */
36170 static rtx
36171 ix86_expand_special_args_builtin (const struct builtin_description *d,
36172 tree exp, rtx target)
36174 tree arg;
36175 rtx pat, op;
36176 unsigned int i, nargs, arg_adjust, memory;
36177 bool aligned_mem = false;
36178 struct
36180 rtx op;
36181 machine_mode mode;
36182 } args[3];
36183 enum insn_code icode = d->icode;
36184 bool last_arg_constant = false;
36185 const struct insn_data_d *insn_p = &insn_data[icode];
36186 machine_mode tmode = insn_p->operand[0].mode;
36187 enum { load, store } klass;
36189 switch ((enum ix86_builtin_func_type) d->flag)
36191 case VOID_FTYPE_VOID:
36192 emit_insn (GEN_FCN (icode) (target));
36193 return 0;
36194 case VOID_FTYPE_UINT64:
36195 case VOID_FTYPE_UNSIGNED:
36196 nargs = 0;
36197 klass = store;
36198 memory = 0;
36199 break;
36201 case INT_FTYPE_VOID:
36202 case USHORT_FTYPE_VOID:
36203 case UINT64_FTYPE_VOID:
36204 case UNSIGNED_FTYPE_VOID:
36205 nargs = 0;
36206 klass = load;
36207 memory = 0;
36208 break;
36209 case UINT64_FTYPE_PUNSIGNED:
36210 case V2DI_FTYPE_PV2DI:
36211 case V4DI_FTYPE_PV4DI:
36212 case V32QI_FTYPE_PCCHAR:
36213 case V16QI_FTYPE_PCCHAR:
36214 case V8SF_FTYPE_PCV4SF:
36215 case V8SF_FTYPE_PCFLOAT:
36216 case V4SF_FTYPE_PCFLOAT:
36217 case V4DF_FTYPE_PCV2DF:
36218 case V4DF_FTYPE_PCDOUBLE:
36219 case V2DF_FTYPE_PCDOUBLE:
36220 case VOID_FTYPE_PVOID:
36221 case V8DI_FTYPE_PV8DI:
36222 nargs = 1;
36223 klass = load;
36224 memory = 0;
36225 switch (icode)
36227 case CODE_FOR_sse4_1_movntdqa:
36228 case CODE_FOR_avx2_movntdqa:
36229 case CODE_FOR_avx512f_movntdqa:
36230 aligned_mem = true;
36231 break;
36232 default:
36233 break;
36235 break;
36236 case VOID_FTYPE_PV2SF_V4SF:
36237 case VOID_FTYPE_PV8DI_V8DI:
36238 case VOID_FTYPE_PV4DI_V4DI:
36239 case VOID_FTYPE_PV2DI_V2DI:
36240 case VOID_FTYPE_PCHAR_V32QI:
36241 case VOID_FTYPE_PCHAR_V16QI:
36242 case VOID_FTYPE_PFLOAT_V16SF:
36243 case VOID_FTYPE_PFLOAT_V8SF:
36244 case VOID_FTYPE_PFLOAT_V4SF:
36245 case VOID_FTYPE_PDOUBLE_V8DF:
36246 case VOID_FTYPE_PDOUBLE_V4DF:
36247 case VOID_FTYPE_PDOUBLE_V2DF:
36248 case VOID_FTYPE_PLONGLONG_LONGLONG:
36249 case VOID_FTYPE_PULONGLONG_ULONGLONG:
36250 case VOID_FTYPE_PINT_INT:
36251 nargs = 1;
36252 klass = store;
36253 /* Reserve memory operand for target. */
36254 memory = ARRAY_SIZE (args);
36255 switch (icode)
36257 /* These builtins and instructions require the memory
36258 to be properly aligned. */
36259 case CODE_FOR_avx_movntv4di:
36260 case CODE_FOR_sse2_movntv2di:
36261 case CODE_FOR_avx_movntv8sf:
36262 case CODE_FOR_sse_movntv4sf:
36263 case CODE_FOR_sse4a_vmmovntv4sf:
36264 case CODE_FOR_avx_movntv4df:
36265 case CODE_FOR_sse2_movntv2df:
36266 case CODE_FOR_sse4a_vmmovntv2df:
36267 case CODE_FOR_sse2_movntidi:
36268 case CODE_FOR_sse_movntq:
36269 case CODE_FOR_sse2_movntisi:
36270 case CODE_FOR_avx512f_movntv16sf:
36271 case CODE_FOR_avx512f_movntv8df:
36272 case CODE_FOR_avx512f_movntv8di:
36273 aligned_mem = true;
36274 break;
36275 default:
36276 break;
36278 break;
36279 case V4SF_FTYPE_V4SF_PCV2SF:
36280 case V2DF_FTYPE_V2DF_PCDOUBLE:
36281 nargs = 2;
36282 klass = load;
36283 memory = 1;
36284 break;
36285 case V8SF_FTYPE_PCV8SF_V8SI:
36286 case V4DF_FTYPE_PCV4DF_V4DI:
36287 case V4SF_FTYPE_PCV4SF_V4SI:
36288 case V2DF_FTYPE_PCV2DF_V2DI:
36289 case V8SI_FTYPE_PCV8SI_V8SI:
36290 case V4DI_FTYPE_PCV4DI_V4DI:
36291 case V4SI_FTYPE_PCV4SI_V4SI:
36292 case V2DI_FTYPE_PCV2DI_V2DI:
36293 nargs = 2;
36294 klass = load;
36295 memory = 0;
36296 break;
36297 case VOID_FTYPE_PV8DF_V8DF_UQI:
36298 case VOID_FTYPE_PV4DF_V4DF_UQI:
36299 case VOID_FTYPE_PV2DF_V2DF_UQI:
36300 case VOID_FTYPE_PV16SF_V16SF_UHI:
36301 case VOID_FTYPE_PV8SF_V8SF_UQI:
36302 case VOID_FTYPE_PV4SF_V4SF_UQI:
36303 case VOID_FTYPE_PV8DI_V8DI_UQI:
36304 case VOID_FTYPE_PV4DI_V4DI_UQI:
36305 case VOID_FTYPE_PV2DI_V2DI_UQI:
36306 case VOID_FTYPE_PV16SI_V16SI_UHI:
36307 case VOID_FTYPE_PV8SI_V8SI_UQI:
36308 case VOID_FTYPE_PV4SI_V4SI_UQI:
36309 switch (icode)
36311 /* These builtins and instructions require the memory
36312 to be properly aligned. */
36313 case CODE_FOR_avx512f_storev16sf_mask:
36314 case CODE_FOR_avx512f_storev16si_mask:
36315 case CODE_FOR_avx512f_storev8df_mask:
36316 case CODE_FOR_avx512f_storev8di_mask:
36317 case CODE_FOR_avx512vl_storev8sf_mask:
36318 case CODE_FOR_avx512vl_storev8si_mask:
36319 case CODE_FOR_avx512vl_storev4df_mask:
36320 case CODE_FOR_avx512vl_storev4di_mask:
36321 case CODE_FOR_avx512vl_storev4sf_mask:
36322 case CODE_FOR_avx512vl_storev4si_mask:
36323 case CODE_FOR_avx512vl_storev2df_mask:
36324 case CODE_FOR_avx512vl_storev2di_mask:
36325 aligned_mem = true;
36326 break;
36327 default:
36328 break;
36330 /* FALLTHRU */
36331 case VOID_FTYPE_PV8SF_V8SI_V8SF:
36332 case VOID_FTYPE_PV4DF_V4DI_V4DF:
36333 case VOID_FTYPE_PV4SF_V4SI_V4SF:
36334 case VOID_FTYPE_PV2DF_V2DI_V2DF:
36335 case VOID_FTYPE_PV8SI_V8SI_V8SI:
36336 case VOID_FTYPE_PV4DI_V4DI_V4DI:
36337 case VOID_FTYPE_PV4SI_V4SI_V4SI:
36338 case VOID_FTYPE_PV2DI_V2DI_V2DI:
36339 case VOID_FTYPE_PV8SI_V8DI_UQI:
36340 case VOID_FTYPE_PV8HI_V8DI_UQI:
36341 case VOID_FTYPE_PV16HI_V16SI_UHI:
36342 case VOID_FTYPE_PV16QI_V8DI_UQI:
36343 case VOID_FTYPE_PV16QI_V16SI_UHI:
36344 case VOID_FTYPE_PV4SI_V4DI_UQI:
36345 case VOID_FTYPE_PV4SI_V2DI_UQI:
36346 case VOID_FTYPE_PV8HI_V4DI_UQI:
36347 case VOID_FTYPE_PV8HI_V2DI_UQI:
36348 case VOID_FTYPE_PV8HI_V8SI_UQI:
36349 case VOID_FTYPE_PV8HI_V4SI_UQI:
36350 case VOID_FTYPE_PV16QI_V4DI_UQI:
36351 case VOID_FTYPE_PV16QI_V2DI_UQI:
36352 case VOID_FTYPE_PV16QI_V8SI_UQI:
36353 case VOID_FTYPE_PV16QI_V4SI_UQI:
36354 case VOID_FTYPE_PCHAR_V64QI_UDI:
36355 case VOID_FTYPE_PCHAR_V32QI_USI:
36356 case VOID_FTYPE_PCHAR_V16QI_UHI:
36357 case VOID_FTYPE_PSHORT_V32HI_USI:
36358 case VOID_FTYPE_PSHORT_V16HI_UHI:
36359 case VOID_FTYPE_PSHORT_V8HI_UQI:
36360 case VOID_FTYPE_PINT_V16SI_UHI:
36361 case VOID_FTYPE_PINT_V8SI_UQI:
36362 case VOID_FTYPE_PINT_V4SI_UQI:
36363 case VOID_FTYPE_PINT64_V8DI_UQI:
36364 case VOID_FTYPE_PINT64_V4DI_UQI:
36365 case VOID_FTYPE_PINT64_V2DI_UQI:
36366 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
36367 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
36368 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
36369 case VOID_FTYPE_PFLOAT_V16SF_UHI:
36370 case VOID_FTYPE_PFLOAT_V8SF_UQI:
36371 case VOID_FTYPE_PFLOAT_V4SF_UQI:
36372 nargs = 2;
36373 klass = store;
36374 /* Reserve memory operand for target. */
36375 memory = ARRAY_SIZE (args);
36376 break;
36377 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
36378 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
36379 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
36380 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
36381 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
36382 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
36383 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
36384 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
36385 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
36386 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
36387 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
36388 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
36389 switch (icode)
36391 /* These builtins and instructions require the memory
36392 to be properly aligned. */
36393 case CODE_FOR_avx512f_loadv16sf_mask:
36394 case CODE_FOR_avx512f_loadv16si_mask:
36395 case CODE_FOR_avx512f_loadv8df_mask:
36396 case CODE_FOR_avx512f_loadv8di_mask:
36397 case CODE_FOR_avx512vl_loadv8sf_mask:
36398 case CODE_FOR_avx512vl_loadv8si_mask:
36399 case CODE_FOR_avx512vl_loadv4df_mask:
36400 case CODE_FOR_avx512vl_loadv4di_mask:
36401 case CODE_FOR_avx512vl_loadv4sf_mask:
36402 case CODE_FOR_avx512vl_loadv4si_mask:
36403 case CODE_FOR_avx512vl_loadv2df_mask:
36404 case CODE_FOR_avx512vl_loadv2di_mask:
36405 case CODE_FOR_avx512bw_loadv64qi_mask:
36406 case CODE_FOR_avx512vl_loadv32qi_mask:
36407 case CODE_FOR_avx512vl_loadv16qi_mask:
36408 case CODE_FOR_avx512bw_loadv32hi_mask:
36409 case CODE_FOR_avx512vl_loadv16hi_mask:
36410 case CODE_FOR_avx512vl_loadv8hi_mask:
36411 aligned_mem = true;
36412 break;
36413 default:
36414 break;
36416 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
36417 case V32QI_FTYPE_PCCHAR_V32QI_USI:
36418 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
36419 case V32HI_FTYPE_PCSHORT_V32HI_USI:
36420 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
36421 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
36422 case V16SI_FTYPE_PCINT_V16SI_UHI:
36423 case V8SI_FTYPE_PCINT_V8SI_UQI:
36424 case V4SI_FTYPE_PCINT_V4SI_UQI:
36425 case V8DI_FTYPE_PCINT64_V8DI_UQI:
36426 case V4DI_FTYPE_PCINT64_V4DI_UQI:
36427 case V2DI_FTYPE_PCINT64_V2DI_UQI:
36428 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
36429 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
36430 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
36431 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
36432 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
36433 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
36434 nargs = 3;
36435 klass = load;
36436 memory = 0;
36437 break;
36438 case VOID_FTYPE_UINT_UINT_UINT:
36439 case VOID_FTYPE_UINT64_UINT_UINT:
36440 case UCHAR_FTYPE_UINT_UINT_UINT:
36441 case UCHAR_FTYPE_UINT64_UINT_UINT:
36442 nargs = 3;
36443 klass = load;
36444 memory = ARRAY_SIZE (args);
36445 last_arg_constant = true;
36446 break;
36447 default:
36448 gcc_unreachable ();
36451 gcc_assert (nargs <= ARRAY_SIZE (args));
36453 if (klass == store)
36455 arg = CALL_EXPR_ARG (exp, 0);
36456 op = expand_normal (arg);
36457 gcc_assert (target == 0);
36458 if (memory)
36460 op = ix86_zero_extend_to_Pmode (op);
36461 target = gen_rtx_MEM (tmode, op);
36462 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
36463 on it. Try to improve it using get_pointer_alignment,
36464 and if the special builtin is one that requires strict
36465 mode alignment, also from its GET_MODE_ALIGNMENT.
36466 Failure to do so could lead to ix86_legitimate_combined_insn
36467 rejecting all changes to such insns. */
36468 unsigned int align = get_pointer_alignment (arg);
36469 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
36470 align = GET_MODE_ALIGNMENT (tmode);
36471 if (MEM_ALIGN (target) < align)
36472 set_mem_align (target, align);
36474 else
36475 target = force_reg (tmode, op);
36476 arg_adjust = 1;
36478 else
36480 arg_adjust = 0;
36481 if (optimize
36482 || target == 0
36483 || !register_operand (target, tmode)
36484 || GET_MODE (target) != tmode)
36485 target = gen_reg_rtx (tmode);
36488 for (i = 0; i < nargs; i++)
36490 machine_mode mode = insn_p->operand[i + 1].mode;
36491 bool match;
36493 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
36494 op = expand_normal (arg);
36495 match = insn_p->operand[i + 1].predicate (op, mode);
36497 if (last_arg_constant && (i + 1) == nargs)
36499 if (!match)
36501 if (icode == CODE_FOR_lwp_lwpvalsi3
36502 || icode == CODE_FOR_lwp_lwpinssi3
36503 || icode == CODE_FOR_lwp_lwpvaldi3
36504 || icode == CODE_FOR_lwp_lwpinsdi3)
36505 error ("the last argument must be a 32-bit immediate");
36506 else
36507 error ("the last argument must be an 8-bit immediate");
36508 return const0_rtx;
36511 else
36513 if (i == memory)
36515 /* This must be the memory operand. */
36516 op = ix86_zero_extend_to_Pmode (op);
36517 op = gen_rtx_MEM (mode, op);
36518 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
36519 on it. Try to improve it using get_pointer_alignment,
36520 and if the special builtin is one that requires strict
36521 mode alignment, also from its GET_MODE_ALIGNMENT.
36522 Failure to do so could lead to ix86_legitimate_combined_insn
36523 rejecting all changes to such insns. */
36524 unsigned int align = get_pointer_alignment (arg);
36525 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
36526 align = GET_MODE_ALIGNMENT (mode);
36527 if (MEM_ALIGN (op) < align)
36528 set_mem_align (op, align);
36530 else
36532 /* This must be a register. */
36533 if (VECTOR_MODE_P (mode))
36534 op = safe_vector_operand (op, mode);
36536 op = fixup_modeless_constant (op, mode);
36538 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36539 op = copy_to_mode_reg (mode, op);
36540 else
36542 op = copy_to_reg (op);
36543 op = lowpart_subreg (mode, op, GET_MODE (op));
36548 args[i].op = op;
36549 args[i].mode = mode;
36552 switch (nargs)
36554 case 0:
36555 pat = GEN_FCN (icode) (target);
36556 break;
36557 case 1:
36558 pat = GEN_FCN (icode) (target, args[0].op);
36559 break;
36560 case 2:
36561 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36562 break;
36563 case 3:
36564 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
36565 break;
36566 default:
36567 gcc_unreachable ();
36570 if (! pat)
36571 return 0;
36572 emit_insn (pat);
36573 return klass == store ? 0 : target;
36576 /* Return the integer constant in ARG. Constrain it to be in the range
36577 of the subparts of VEC_TYPE; issue an error if not. */
36579 static int
36580 get_element_number (tree vec_type, tree arg)
36582 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
36584 if (!tree_fits_uhwi_p (arg)
36585 || (elt = tree_to_uhwi (arg), elt > max))
36587 error ("selector must be an integer constant in the range 0..%wi", max);
36588 return 0;
36591 return elt;
36594 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36595 ix86_expand_vector_init. We DO have language-level syntax for this, in
36596 the form of (type){ init-list }. Except that since we can't place emms
36597 instructions from inside the compiler, we can't allow the use of MMX
36598 registers unless the user explicitly asks for it. So we do *not* define
36599 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36600 we have builtins invoked by mmintrin.h that give us license to emit
36601 these sorts of instructions. */
36603 static rtx
36604 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36606 machine_mode tmode = TYPE_MODE (type);
36607 machine_mode inner_mode = GET_MODE_INNER (tmode);
36608 int i, n_elt = GET_MODE_NUNITS (tmode);
36609 rtvec v = rtvec_alloc (n_elt);
36611 gcc_assert (VECTOR_MODE_P (tmode));
36612 gcc_assert (call_expr_nargs (exp) == n_elt);
36614 for (i = 0; i < n_elt; ++i)
36616 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36617 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36620 if (!target || !register_operand (target, tmode))
36621 target = gen_reg_rtx (tmode);
36623 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36624 return target;
36627 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36628 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36629 had a language-level syntax for referencing vector elements. */
36631 static rtx
36632 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36634 machine_mode tmode, mode0;
36635 tree arg0, arg1;
36636 int elt;
36637 rtx op0;
36639 arg0 = CALL_EXPR_ARG (exp, 0);
36640 arg1 = CALL_EXPR_ARG (exp, 1);
36642 op0 = expand_normal (arg0);
36643 elt = get_element_number (TREE_TYPE (arg0), arg1);
36645 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36646 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36647 gcc_assert (VECTOR_MODE_P (mode0));
36649 op0 = force_reg (mode0, op0);
36651 if (optimize || !target || !register_operand (target, tmode))
36652 target = gen_reg_rtx (tmode);
36654 ix86_expand_vector_extract (true, target, op0, elt);
36656 return target;
36659 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36660 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36661 a language-level syntax for referencing vector elements. */
36663 static rtx
36664 ix86_expand_vec_set_builtin (tree exp)
36666 machine_mode tmode, mode1;
36667 tree arg0, arg1, arg2;
36668 int elt;
36669 rtx op0, op1, target;
36671 arg0 = CALL_EXPR_ARG (exp, 0);
36672 arg1 = CALL_EXPR_ARG (exp, 1);
36673 arg2 = CALL_EXPR_ARG (exp, 2);
36675 tmode = TYPE_MODE (TREE_TYPE (arg0));
36676 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36677 gcc_assert (VECTOR_MODE_P (tmode));
36679 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36680 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36681 elt = get_element_number (TREE_TYPE (arg0), arg2);
36683 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36684 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36686 op0 = force_reg (tmode, op0);
36687 op1 = force_reg (mode1, op1);
36689 /* OP0 is the source of these builtin functions and shouldn't be
36690 modified. Create a copy, use it and return it as target. */
36691 target = gen_reg_rtx (tmode);
36692 emit_move_insn (target, op0);
36693 ix86_expand_vector_set (true, target, op1, elt);
36695 return target;
36698 /* Emit conditional move of SRC to DST with condition
36699 OP1 CODE OP2. */
36700 static void
36701 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36703 rtx t;
36705 if (TARGET_CMOVE)
36707 t = ix86_expand_compare (code, op1, op2);
36708 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36709 src, dst)));
36711 else
36713 rtx_code_label *nomove = gen_label_rtx ();
36714 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36715 const0_rtx, GET_MODE (op1), 1, nomove);
36716 emit_move_insn (dst, src);
36717 emit_label (nomove);
36721 /* Choose the max of DST and SRC and put it in DST. */
36722 static void
36723 ix86_emit_move_max (rtx dst, rtx src)
36725 ix86_emit_cmove (dst, src, LTU, dst, src);
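/* The LTU condition makes this an unsigned maximum, which is what the
   MPX bound handling below relies on: lower bounds compare as unsigned
   pointers, and upper bounds are kept in one's complement form, so the
   maximum is the right operation for both. */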
36728 /* Expand an expression EXP that calls a built-in function,
36729 with result going to TARGET if that's convenient
36730 (and in mode MODE if that's convenient).
36731 SUBTARGET may be used as the target for computing one of EXP's operands.
36732 IGNORE is nonzero if the value is to be ignored. */
36734 static rtx
36735 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36736 machine_mode mode, int ignore)
36738 size_t i;
36739 enum insn_code icode;
36740 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36741 tree arg0, arg1, arg2, arg3, arg4;
36742 rtx op0, op1, op2, op3, op4, pat, insn;
36743 machine_mode mode0, mode1, mode2, mode3, mode4;
36744 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36746 /* For CPU builtins that can be folded, fold first and expand the fold. */
36747 switch (fcode)
36749 case IX86_BUILTIN_CPU_INIT:
36751 /* Make it call __cpu_indicator_init in libgcc. */
36752 tree call_expr, fndecl, type;
36753 type = build_function_type_list (integer_type_node, NULL_TREE);
36754 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36755 call_expr = build_call_expr (fndecl, 0);
36756 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36758 case IX86_BUILTIN_CPU_IS:
36759 case IX86_BUILTIN_CPU_SUPPORTS:
36761 tree arg0 = CALL_EXPR_ARG (exp, 0);
36762 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36763 gcc_assert (fold_expr != NULL_TREE);
36764 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36768 /* Determine whether the builtin function is available under the current ISA.
36769 Originally the builtin was not created if it wasn't applicable to the
36770 current ISA based on the command line switches. With function specific
36771 options, we need to check in the context of the function making the call
36772 whether it is supported. Treat AVX512VL specially. For other flags,
36773 if isa includes more than one ISA bit, treat those are requiring any
36774 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
36775 ISAs. Similarly for 64BIT, but we shouldn't be building such builtins
36776 at all, -m64 is a whole TU option. */
36777 if (((ix86_builtins_isa[fcode].isa
36778 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT))
36779 && !(ix86_builtins_isa[fcode].isa
36780 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT)
36781 & ix86_isa_flags))
36782 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
36783 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
36784 || (ix86_builtins_isa[fcode].isa2
36785 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
36787 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
36788 ix86_builtins_isa[fcode].isa2, 0, 0,
36789 NULL, NULL, (enum fpmath_unit) 0,
36790 false);
36791 if (!opts)
36792 error ("%qE needs unknown isa option", fndecl);
36793 else
36795 gcc_assert (opts != NULL);
36796 error ("%qE needs isa option %s", fndecl, opts);
36797 free (opts);
36799 return expand_call (exp, target, ignore);
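/* At this point the builtin is known to be available in the current
   context; the switch below handles the builtins with irregular
   operands explicitly. */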
36802 switch (fcode)
36804 case IX86_BUILTIN_BNDMK:
36805 if (!target
36806 || GET_MODE (target) != BNDmode
36807 || !register_operand (target, BNDmode))
36808 target = gen_reg_rtx (BNDmode);
36810 arg0 = CALL_EXPR_ARG (exp, 0);
36811 arg1 = CALL_EXPR_ARG (exp, 1);
36813 op0 = expand_normal (arg0);
36814 op1 = expand_normal (arg1);
36816 if (!register_operand (op0, Pmode))
36817 op0 = ix86_zero_extend_to_Pmode (op0);
36818 if (!register_operand (op1, Pmode))
36819 op1 = ix86_zero_extend_to_Pmode (op1);
36821 /* Builtin arg1 is the size of the block, but instruction op1 should
36822 be (size - 1). */
36823 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36824 NULL_RTX, 1, OPTAB_DIRECT);
36826 emit_insn (BNDmode == BND64mode
36827 ? gen_bnd64_mk (target, op0, op1)
36828 : gen_bnd32_mk (target, op0, op1));
36829 return target;
36831 case IX86_BUILTIN_BNDSTX:
36832 arg0 = CALL_EXPR_ARG (exp, 0);
36833 arg1 = CALL_EXPR_ARG (exp, 1);
36834 arg2 = CALL_EXPR_ARG (exp, 2);
36836 op0 = expand_normal (arg0);
36837 op1 = expand_normal (arg1);
36838 op2 = expand_normal (arg2);
36840 if (!register_operand (op0, Pmode))
36841 op0 = ix86_zero_extend_to_Pmode (op0);
36842 if (!register_operand (op1, BNDmode))
36843 op1 = copy_to_mode_reg (BNDmode, op1);
36844 if (!register_operand (op2, Pmode))
36845 op2 = ix86_zero_extend_to_Pmode (op2);
36847 emit_insn (BNDmode == BND64mode
36848 ? gen_bnd64_stx (op2, op0, op1)
36849 : gen_bnd32_stx (op2, op0, op1));
36850 return 0;
36852 case IX86_BUILTIN_BNDLDX:
36853 if (!target
36854 || GET_MODE (target) != BNDmode
36855 || !register_operand (target, BNDmode))
36856 target = gen_reg_rtx (BNDmode);
36858 arg0 = CALL_EXPR_ARG (exp, 0);
36859 arg1 = CALL_EXPR_ARG (exp, 1);
36861 op0 = expand_normal (arg0);
36862 op1 = expand_normal (arg1);
36864 if (!register_operand (op0, Pmode))
36865 op0 = ix86_zero_extend_to_Pmode (op0);
36866 if (!register_operand (op1, Pmode))
36867 op1 = ix86_zero_extend_to_Pmode (op1);
36869 emit_insn (BNDmode == BND64mode
36870 ? gen_bnd64_ldx (target, op0, op1)
36871 : gen_bnd32_ldx (target, op0, op1));
36872 return target;
36874 case IX86_BUILTIN_BNDCL:
36875 arg0 = CALL_EXPR_ARG (exp, 0);
36876 arg1 = CALL_EXPR_ARG (exp, 1);
36878 op0 = expand_normal (arg0);
36879 op1 = expand_normal (arg1);
36881 if (!register_operand (op0, Pmode))
36882 op0 = ix86_zero_extend_to_Pmode (op0);
36883 if (!register_operand (op1, BNDmode))
36884 op1 = copy_to_mode_reg (BNDmode, op1);
36886 emit_insn (BNDmode == BND64mode
36887 ? gen_bnd64_cl (op1, op0)
36888 : gen_bnd32_cl (op1, op0));
36889 return 0;
36891 case IX86_BUILTIN_BNDCU:
36892 arg0 = CALL_EXPR_ARG (exp, 0);
36893 arg1 = CALL_EXPR_ARG (exp, 1);
36895 op0 = expand_normal (arg0);
36896 op1 = expand_normal (arg1);
36898 if (!register_operand (op0, Pmode))
36899 op0 = ix86_zero_extend_to_Pmode (op0);
36900 if (!register_operand (op1, BNDmode))
36901 op1 = copy_to_mode_reg (BNDmode, op1);
36903 emit_insn (BNDmode == BND64mode
36904 ? gen_bnd64_cu (op1, op0)
36905 : gen_bnd32_cu (op1, op0));
36906 return 0;
36908 case IX86_BUILTIN_BNDRET:
36909 arg0 = CALL_EXPR_ARG (exp, 0);
36910 gcc_assert (TREE_CODE (arg0) == SSA_NAME);
36911 target = chkp_get_rtl_bounds (arg0);
36913 /* If no bounds were specified for the returned value,
36914 then use INIT bounds. It usually happens when
36915 some built-in function is expanded. */
36916 if (!target)
36918 rtx t1 = gen_reg_rtx (Pmode);
36919 rtx t2 = gen_reg_rtx (Pmode);
36920 target = gen_reg_rtx (BNDmode);
36921 emit_move_insn (t1, const0_rtx);
36922 emit_move_insn (t2, constm1_rtx);
36923 emit_insn (BNDmode == BND64mode
36924 ? gen_bnd64_mk (target, t1, t2)
36925 : gen_bnd32_mk (target, t1, t2));
36928 gcc_assert (target && REG_P (target));
36929 return target;
36931 case IX86_BUILTIN_BNDNARROW:
36933 rtx m1, m1h1, m1h2, lb, ub, t1;
36935 /* Return value and lb. */
36936 arg0 = CALL_EXPR_ARG (exp, 0);
36937 /* Bounds. */
36938 arg1 = CALL_EXPR_ARG (exp, 1);
36939 /* Size. */
36940 arg2 = CALL_EXPR_ARG (exp, 2);
36942 lb = expand_normal (arg0);
36943 op1 = expand_normal (arg1);
36944 op2 = expand_normal (arg2);
36946 /* Size was passed, but we need to use (size - 1), as for bndmk. */
36947 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36948 NULL_RTX, 1, OPTAB_DIRECT);
36950 /* Add LB to size and invert the result to get UB. */
36951 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36952 op2, 1, OPTAB_DIRECT);
36953 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
36955 if (!register_operand (lb, Pmode))
36956 lb = ix86_zero_extend_to_Pmode (lb);
36957 if (!register_operand (ub, Pmode))
36958 ub = ix86_zero_extend_to_Pmode (ub);
36960 /* We need to move bounds to memory before any computations. */
36961 if (MEM_P (op1))
36962 m1 = op1;
36963 else
36965 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36966 emit_move_insn (m1, op1);
36969 /* Generate mem expression to be used for access to LB and UB. */
36970 m1h1 = adjust_address (m1, Pmode, 0);
36971 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36973 t1 = gen_reg_rtx (Pmode);
36975 /* Compute LB. */
36976 emit_move_insn (t1, m1h1);
36977 ix86_emit_move_max (t1, lb);
36978 emit_move_insn (m1h1, t1);
36980 /* Compute UB. UB is stored in 1's complement form. Therefore
36981 we also use max here. */
36982 emit_move_insn (t1, m1h2);
36983 ix86_emit_move_max (t1, ub);
36984 emit_move_insn (m1h2, t1);
36986 op2 = gen_reg_rtx (BNDmode);
36987 emit_move_insn (op2, m1);
36989 return chkp_join_splitted_slot (lb, op2);
36992 case IX86_BUILTIN_BNDINT:
36994 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36996 if (!target
36997 || GET_MODE (target) != BNDmode
36998 || !register_operand (target, BNDmode))
36999 target = gen_reg_rtx (BNDmode);
37001 arg0 = CALL_EXPR_ARG (exp, 0);
37002 arg1 = CALL_EXPR_ARG (exp, 1);
37004 op0 = expand_normal (arg0);
37005 op1 = expand_normal (arg1);
37007 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
37008 rh1 = adjust_address (res, Pmode, 0);
37009 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
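/* RES is a BNDmode stack slot whose halves are addressed by RH1 (lower
   bound) and RH2 (upper bound).  The intersection of two bounds is the
   maximum of the lower bounds and, because upper bounds are stored in
   one's complement form, the maximum of the stored upper-bound values
   as well. */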
37011 /* Put first bounds to temporaries. */
37012 lb1 = gen_reg_rtx (Pmode);
37013 ub1 = gen_reg_rtx (Pmode);
37014 if (MEM_P (op0))
37016 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
37017 emit_move_insn (ub1, adjust_address (op0, Pmode,
37018 GET_MODE_SIZE (Pmode)));
37020 else
37022 emit_move_insn (res, op0);
37023 emit_move_insn (lb1, rh1);
37024 emit_move_insn (ub1, rh2);
37027 /* Put second bounds to temporaries. */
37028 lb2 = gen_reg_rtx (Pmode);
37029 ub2 = gen_reg_rtx (Pmode);
37030 if (MEM_P (op1))
37032 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
37033 emit_move_insn (ub2, adjust_address (op1, Pmode,
37034 GET_MODE_SIZE (Pmode)));
37036 else
37038 emit_move_insn (res, op1);
37039 emit_move_insn (lb2, rh1);
37040 emit_move_insn (ub2, rh2);
37043 /* Compute LB. */
37044 ix86_emit_move_max (lb1, lb2);
37045 emit_move_insn (rh1, lb1);
37047 /* Compute UB. UB is stored in 1's complement form. Therefore
37048 we also use max here. */
37049 ix86_emit_move_max (ub1, ub2);
37050 emit_move_insn (rh2, ub1);
37052 emit_move_insn (target, res);
37054 return target;
37057 case IX86_BUILTIN_SIZEOF:
37059 tree name;
37060 rtx symbol;
37062 if (!target
37063 || GET_MODE (target) != Pmode
37064 || !register_operand (target, Pmode))
37065 target = gen_reg_rtx (Pmode);
37067 arg0 = CALL_EXPR_ARG (exp, 0);
37068 gcc_assert (VAR_P (arg0));
37070 name = DECL_ASSEMBLER_NAME (arg0);
37071 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
37073 emit_insn (Pmode == SImode
37074 ? gen_move_size_reloc_si (target, symbol)
37075 : gen_move_size_reloc_di (target, symbol));
37077 return target;
37080 case IX86_BUILTIN_BNDLOWER:
37082 rtx mem, hmem;
37084 if (!target
37085 || GET_MODE (target) != Pmode
37086 || !register_operand (target, Pmode))
37087 target = gen_reg_rtx (Pmode);
37089 arg0 = CALL_EXPR_ARG (exp, 0);
37090 op0 = expand_normal (arg0);
37092 /* We need to move bounds to memory first. */
37093 if (MEM_P (op0))
37094 mem = op0;
37095 else
37097 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37098 emit_move_insn (mem, op0);
37101 /* Generate mem expression to access LB and load it. */
37102 hmem = adjust_address (mem, Pmode, 0);
37103 emit_move_insn (target, hmem);
37105 return target;
37108 case IX86_BUILTIN_BNDUPPER:
37110 rtx mem, hmem, res;
37112 if (!target
37113 || GET_MODE (target) != Pmode
37114 || !register_operand (target, Pmode))
37115 target = gen_reg_rtx (Pmode);
37117 arg0 = CALL_EXPR_ARG (exp, 0);
37118 op0 = expand_normal (arg0);
37120 /* We need to move bounds to memory first. */
37121 if (MEM_P (op0))
37122 mem = op0;
37123 else
37125 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37126 emit_move_insn (mem, op0);
37129 /* Generate mem expression to access UB. */
37130 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
37132 /* We need to invert all bits of UB. */
37133 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
37135 if (res != target)
37136 emit_move_insn (target, res);
37138 return target;
37141 case IX86_BUILTIN_MASKMOVQ:
37142 case IX86_BUILTIN_MASKMOVDQU:
37143 icode = (fcode == IX86_BUILTIN_MASKMOVQ
37144 ? CODE_FOR_mmx_maskmovq
37145 : CODE_FOR_sse2_maskmovdqu);
37146 /* Note the arg order is different from the operand order. */
37147 arg1 = CALL_EXPR_ARG (exp, 0);
37148 arg2 = CALL_EXPR_ARG (exp, 1);
37149 arg0 = CALL_EXPR_ARG (exp, 2);
37150 op0 = expand_normal (arg0);
37151 op1 = expand_normal (arg1);
37152 op2 = expand_normal (arg2);
37153 mode0 = insn_data[icode].operand[0].mode;
37154 mode1 = insn_data[icode].operand[1].mode;
37155 mode2 = insn_data[icode].operand[2].mode;
37157 op0 = ix86_zero_extend_to_Pmode (op0);
37158 op0 = gen_rtx_MEM (mode1, op0);
37160 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37161 op0 = copy_to_mode_reg (mode0, op0);
37162 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37163 op1 = copy_to_mode_reg (mode1, op1);
37164 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37165 op2 = copy_to_mode_reg (mode2, op2);
37166 pat = GEN_FCN (icode) (op0, op1, op2);
37167 if (! pat)
37168 return 0;
37169 emit_insn (pat);
37170 return 0;
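/* LDMXCSR and STMXCSR below go through a stack temporary (SLOT_TEMP)
   because the corresponding insn patterns only accept a memory
   operand. */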
37172 case IX86_BUILTIN_LDMXCSR:
37173 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
37174 target = assign_386_stack_local (SImode, SLOT_TEMP);
37175 emit_move_insn (target, op0);
37176 emit_insn (gen_sse_ldmxcsr (target));
37177 return 0;
37179 case IX86_BUILTIN_STMXCSR:
37180 target = assign_386_stack_local (SImode, SLOT_TEMP);
37181 emit_insn (gen_sse_stmxcsr (target));
37182 return copy_to_mode_reg (SImode, target);
37184 case IX86_BUILTIN_CLFLUSH:
37185 arg0 = CALL_EXPR_ARG (exp, 0);
37186 op0 = expand_normal (arg0);
37187 icode = CODE_FOR_sse2_clflush;
37188 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37189 op0 = ix86_zero_extend_to_Pmode (op0);
37191 emit_insn (gen_sse2_clflush (op0));
37192 return 0;
37194 case IX86_BUILTIN_CLWB:
37195 arg0 = CALL_EXPR_ARG (exp, 0);
37196 op0 = expand_normal (arg0);
37197 icode = CODE_FOR_clwb;
37198 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37199 op0 = ix86_zero_extend_to_Pmode (op0);
37201 emit_insn (gen_clwb (op0));
37202 return 0;
37204 case IX86_BUILTIN_CLFLUSHOPT:
37205 arg0 = CALL_EXPR_ARG (exp, 0);
37206 op0 = expand_normal (arg0);
37207 icode = CODE_FOR_clflushopt;
37208 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37209 op0 = ix86_zero_extend_to_Pmode (op0);
37211 emit_insn (gen_clflushopt (op0));
37212 return 0;
37214 case IX86_BUILTIN_MONITOR:
37215 case IX86_BUILTIN_MONITORX:
37216 arg0 = CALL_EXPR_ARG (exp, 0);
37217 arg1 = CALL_EXPR_ARG (exp, 1);
37218 arg2 = CALL_EXPR_ARG (exp, 2);
37219 op0 = expand_normal (arg0);
37220 op1 = expand_normal (arg1);
37221 op2 = expand_normal (arg2);
37222 if (!REG_P (op0))
37223 op0 = ix86_zero_extend_to_Pmode (op0);
37224 if (!REG_P (op1))
37225 op1 = copy_to_mode_reg (SImode, op1);
37226 if (!REG_P (op2))
37227 op2 = copy_to_mode_reg (SImode, op2);
37229 emit_insn (fcode == IX86_BUILTIN_MONITOR
37230 ? ix86_gen_monitor (op0, op1, op2)
37231 : ix86_gen_monitorx (op0, op1, op2));
37232 return 0;
37234 case IX86_BUILTIN_MWAIT:
37235 arg0 = CALL_EXPR_ARG (exp, 0);
37236 arg1 = CALL_EXPR_ARG (exp, 1);
37237 op0 = expand_normal (arg0);
37238 op1 = expand_normal (arg1);
37239 if (!REG_P (op0))
37240 op0 = copy_to_mode_reg (SImode, op0);
37241 if (!REG_P (op1))
37242 op1 = copy_to_mode_reg (SImode, op1);
37243 emit_insn (gen_sse3_mwait (op0, op1));
37244 return 0;
37246 case IX86_BUILTIN_MWAITX:
37247 arg0 = CALL_EXPR_ARG (exp, 0);
37248 arg1 = CALL_EXPR_ARG (exp, 1);
37249 arg2 = CALL_EXPR_ARG (exp, 2);
37250 op0 = expand_normal (arg0);
37251 op1 = expand_normal (arg1);
37252 op2 = expand_normal (arg2);
37253 if (!REG_P (op0))
37254 op0 = copy_to_mode_reg (SImode, op0);
37255 if (!REG_P (op1))
37256 op1 = copy_to_mode_reg (SImode, op1);
37257 if (!REG_P (op2))
37258 op2 = copy_to_mode_reg (SImode, op2);
37259 emit_insn (gen_mwaitx (op0, op1, op2));
37260 return 0;
37262 case IX86_BUILTIN_CLZERO:
37263 arg0 = CALL_EXPR_ARG (exp, 0);
37264 op0 = expand_normal (arg0);
37265 if (!REG_P (op0))
37266 op0 = ix86_zero_extend_to_Pmode (op0);
37267 emit_insn (ix86_gen_clzero (op0));
37268 return 0;
37270 case IX86_BUILTIN_VEC_INIT_V2SI:
37271 case IX86_BUILTIN_VEC_INIT_V4HI:
37272 case IX86_BUILTIN_VEC_INIT_V8QI:
37273 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
37275 case IX86_BUILTIN_VEC_EXT_V2DF:
37276 case IX86_BUILTIN_VEC_EXT_V2DI:
37277 case IX86_BUILTIN_VEC_EXT_V4SF:
37278 case IX86_BUILTIN_VEC_EXT_V4SI:
37279 case IX86_BUILTIN_VEC_EXT_V8HI:
37280 case IX86_BUILTIN_VEC_EXT_V2SI:
37281 case IX86_BUILTIN_VEC_EXT_V4HI:
37282 case IX86_BUILTIN_VEC_EXT_V16QI:
37283 return ix86_expand_vec_ext_builtin (exp, target);
37285 case IX86_BUILTIN_VEC_SET_V2DI:
37286 case IX86_BUILTIN_VEC_SET_V4SF:
37287 case IX86_BUILTIN_VEC_SET_V4SI:
37288 case IX86_BUILTIN_VEC_SET_V8HI:
37289 case IX86_BUILTIN_VEC_SET_V4HI:
37290 case IX86_BUILTIN_VEC_SET_V16QI:
37291 return ix86_expand_vec_set_builtin (exp);
37293 case IX86_BUILTIN_NANQ:
37294 case IX86_BUILTIN_NANSQ:
37295 return expand_call (exp, target, ignore);
37297 case IX86_BUILTIN_RDPMC:
37298 case IX86_BUILTIN_RDTSC:
37299 case IX86_BUILTIN_RDTSCP:
37301 op0 = gen_reg_rtx (DImode);
37302 op1 = gen_reg_rtx (DImode);
37304 if (fcode == IX86_BUILTIN_RDPMC)
37306 arg0 = CALL_EXPR_ARG (exp, 0);
37307 op2 = expand_normal (arg0);
37308 if (!register_operand (op2, SImode))
37309 op2 = copy_to_mode_reg (SImode, op2);
37311 insn = (TARGET_64BIT
37312 ? gen_rdpmc_rex64 (op0, op1, op2)
37313 : gen_rdpmc (op0, op2));
37314 emit_insn (insn);
37316 else if (fcode == IX86_BUILTIN_RDTSC)
37318 insn = (TARGET_64BIT
37319 ? gen_rdtsc_rex64 (op0, op1)
37320 : gen_rdtsc (op0));
37321 emit_insn (insn);
37323 else
37325 op2 = gen_reg_rtx (SImode);
37327 insn = (TARGET_64BIT
37328 ? gen_rdtscp_rex64 (op0, op1, op2)
37329 : gen_rdtscp (op0, op2));
37330 emit_insn (insn);
37332 arg0 = CALL_EXPR_ARG (exp, 0);
37333 op4 = expand_normal (arg0);
37334 if (!address_operand (op4, VOIDmode))
37336 op4 = convert_memory_address (Pmode, op4);
37337 op4 = copy_addr_to_reg (op4);
37339 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
37342 if (target == 0)
37344 /* mode is VOIDmode if __builtin_rd* has been called
37345 without lhs. */
37346 if (mode == VOIDmode)
37347 return target;
37348 target = gen_reg_rtx (mode);
37351 if (TARGET_64BIT)
37353 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
37354 op1, 1, OPTAB_DIRECT);
37355 op0 = expand_simple_binop (DImode, IOR, op0, op1,
37356 op0, 1, OPTAB_DIRECT);
37359 emit_move_insn (target, op0);
37360 return target;
37362 case IX86_BUILTIN_FXSAVE:
37363 case IX86_BUILTIN_FXRSTOR:
37364 case IX86_BUILTIN_FXSAVE64:
37365 case IX86_BUILTIN_FXRSTOR64:
37366 case IX86_BUILTIN_FNSTENV:
37367 case IX86_BUILTIN_FLDENV:
37368 mode0 = BLKmode;
37369 switch (fcode)
37371 case IX86_BUILTIN_FXSAVE:
37372 icode = CODE_FOR_fxsave;
37373 break;
37374 case IX86_BUILTIN_FXRSTOR:
37375 icode = CODE_FOR_fxrstor;
37376 break;
37377 case IX86_BUILTIN_FXSAVE64:
37378 icode = CODE_FOR_fxsave64;
37379 break;
37380 case IX86_BUILTIN_FXRSTOR64:
37381 icode = CODE_FOR_fxrstor64;
37382 break;
37383 case IX86_BUILTIN_FNSTENV:
37384 icode = CODE_FOR_fnstenv;
37385 break;
37386 case IX86_BUILTIN_FLDENV:
37387 icode = CODE_FOR_fldenv;
37388 break;
37389 default:
37390 gcc_unreachable ();
37393 arg0 = CALL_EXPR_ARG (exp, 0);
37394 op0 = expand_normal (arg0);
37396 if (!address_operand (op0, VOIDmode))
37398 op0 = convert_memory_address (Pmode, op0);
37399 op0 = copy_addr_to_reg (op0);
37401 op0 = gen_rtx_MEM (mode0, op0);
37403 pat = GEN_FCN (icode) (op0);
37404 if (pat)
37405 emit_insn (pat);
37406 return 0;
37408 case IX86_BUILTIN_XSAVE:
37409 case IX86_BUILTIN_XRSTOR:
37410 case IX86_BUILTIN_XSAVE64:
37411 case IX86_BUILTIN_XRSTOR64:
37412 case IX86_BUILTIN_XSAVEOPT:
37413 case IX86_BUILTIN_XSAVEOPT64:
37414 case IX86_BUILTIN_XSAVES:
37415 case IX86_BUILTIN_XRSTORS:
37416 case IX86_BUILTIN_XSAVES64:
37417 case IX86_BUILTIN_XRSTORS64:
37418 case IX86_BUILTIN_XSAVEC:
37419 case IX86_BUILTIN_XSAVEC64:
37420 arg0 = CALL_EXPR_ARG (exp, 0);
37421 arg1 = CALL_EXPR_ARG (exp, 1);
37422 op0 = expand_normal (arg0);
37423 op1 = expand_normal (arg1);
37425 if (!address_operand (op0, VOIDmode))
37427 op0 = convert_memory_address (Pmode, op0);
37428 op0 = copy_addr_to_reg (op0);
37430 op0 = gen_rtx_MEM (BLKmode, op0);
37432 op1 = force_reg (DImode, op1);
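/* OP1 is the 64-bit state-component (XCR0) mask.  The xsave family of
   instructions takes it in EDX:EAX, so on 64-bit targets it is split
   into two SImode halves for the *_rex64 patterns; the 32-bit patterns
   take the DImode mask directly. */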
37434 if (TARGET_64BIT)
37436 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37437 NULL, 1, OPTAB_DIRECT);
37438 switch (fcode)
37440 case IX86_BUILTIN_XSAVE:
37441 icode = CODE_FOR_xsave_rex64;
37442 break;
37443 case IX86_BUILTIN_XRSTOR:
37444 icode = CODE_FOR_xrstor_rex64;
37445 break;
37446 case IX86_BUILTIN_XSAVE64:
37447 icode = CODE_FOR_xsave64;
37448 break;
37449 case IX86_BUILTIN_XRSTOR64:
37450 icode = CODE_FOR_xrstor64;
37451 break;
37452 case IX86_BUILTIN_XSAVEOPT:
37453 icode = CODE_FOR_xsaveopt_rex64;
37454 break;
37455 case IX86_BUILTIN_XSAVEOPT64:
37456 icode = CODE_FOR_xsaveopt64;
37457 break;
37458 case IX86_BUILTIN_XSAVES:
37459 icode = CODE_FOR_xsaves_rex64;
37460 break;
37461 case IX86_BUILTIN_XRSTORS:
37462 icode = CODE_FOR_xrstors_rex64;
37463 break;
37464 case IX86_BUILTIN_XSAVES64:
37465 icode = CODE_FOR_xsaves64;
37466 break;
37467 case IX86_BUILTIN_XRSTORS64:
37468 icode = CODE_FOR_xrstors64;
37469 break;
37470 case IX86_BUILTIN_XSAVEC:
37471 icode = CODE_FOR_xsavec_rex64;
37472 break;
37473 case IX86_BUILTIN_XSAVEC64:
37474 icode = CODE_FOR_xsavec64;
37475 break;
37476 default:
37477 gcc_unreachable ();
37480 op2 = gen_lowpart (SImode, op2);
37481 op1 = gen_lowpart (SImode, op1);
37482 pat = GEN_FCN (icode) (op0, op1, op2);
37484 else
37486 switch (fcode)
37488 case IX86_BUILTIN_XSAVE:
37489 icode = CODE_FOR_xsave;
37490 break;
37491 case IX86_BUILTIN_XRSTOR:
37492 icode = CODE_FOR_xrstor;
37493 break;
37494 case IX86_BUILTIN_XSAVEOPT:
37495 icode = CODE_FOR_xsaveopt;
37496 break;
37497 case IX86_BUILTIN_XSAVES:
37498 icode = CODE_FOR_xsaves;
37499 break;
37500 case IX86_BUILTIN_XRSTORS:
37501 icode = CODE_FOR_xrstors;
37502 break;
37503 case IX86_BUILTIN_XSAVEC:
37504 icode = CODE_FOR_xsavec;
37505 break;
37506 default:
37507 gcc_unreachable ();
37509 pat = GEN_FCN (icode) (op0, op1);
37512 if (pat)
37513 emit_insn (pat);
37514 return 0;
37516 case IX86_BUILTIN_LLWPCB:
37517 arg0 = CALL_EXPR_ARG (exp, 0);
37518 op0 = expand_normal (arg0);
37519 icode = CODE_FOR_lwp_llwpcb;
37520 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37521 op0 = ix86_zero_extend_to_Pmode (op0);
37522 emit_insn (gen_lwp_llwpcb (op0));
37523 return 0;
37525 case IX86_BUILTIN_SLWPCB:
37526 icode = CODE_FOR_lwp_slwpcb;
37527 if (!target
37528 || !insn_data[icode].operand[0].predicate (target, Pmode))
37529 target = gen_reg_rtx (Pmode);
37530 emit_insn (gen_lwp_slwpcb (target));
37531 return target;
37533 case IX86_BUILTIN_BEXTRI32:
37534 case IX86_BUILTIN_BEXTRI64:
37535 arg0 = CALL_EXPR_ARG (exp, 0);
37536 arg1 = CALL_EXPR_ARG (exp, 1);
37537 op0 = expand_normal (arg0);
37538 op1 = expand_normal (arg1);
37539 icode = (fcode == IX86_BUILTIN_BEXTRI32
37540 ? CODE_FOR_tbm_bextri_si
37541 : CODE_FOR_tbm_bextri_di);
37542 if (!CONST_INT_P (op1))
37544 error ("last argument must be an immediate");
37545 return const0_rtx;
37547 else
37549 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
37550 unsigned char lsb_index = INTVAL (op1) & 0xFF;
37551 op1 = GEN_INT (length);
37552 op2 = GEN_INT (lsb_index);
37553 pat = GEN_FCN (icode) (target, op0, op1, op2);
37554 if (pat)
37555 emit_insn (pat);
37556 return target;
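/* Note (sketch, not from the original source): the BEXTRI control word
   packs the starting bit position in bits [7:0] and the field length in
   bits [15:8]; the code above splits the immediate into those two
   operands before emitting the insn.  A user-level call such as

     unsigned int f = __builtin_ia32_bextri_u32 (x, (8 << 8) | 4);

   would extract an 8-bit field of X starting at bit 4 (builtin name as
   exposed by tbmintrin.h; treat the spelling as an assumption here).  */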
37559 case IX86_BUILTIN_RDRAND16_STEP:
37560 icode = CODE_FOR_rdrandhi_1;
37561 mode0 = HImode;
37562 goto rdrand_step;
37564 case IX86_BUILTIN_RDRAND32_STEP:
37565 icode = CODE_FOR_rdrandsi_1;
37566 mode0 = SImode;
37567 goto rdrand_step;
37569 case IX86_BUILTIN_RDRAND64_STEP:
37570 icode = CODE_FOR_rdranddi_1;
37571 mode0 = DImode;
37573 rdrand_step:
37574 arg0 = CALL_EXPR_ARG (exp, 0);
37575 op1 = expand_normal (arg0);
37576 if (!address_operand (op1, VOIDmode))
37578 op1 = convert_memory_address (Pmode, op1);
37579 op1 = copy_addr_to_reg (op1);
37582 op0 = gen_reg_rtx (mode0);
37583 emit_insn (GEN_FCN (icode) (op0));
37585 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37587 op1 = gen_reg_rtx (SImode);
37588 emit_move_insn (op1, CONST1_RTX (SImode));
37590 /* Emit SImode conditional move. */
37591 if (mode0 == HImode)
37593 if (TARGET_ZERO_EXTEND_WITH_AND
37594 && optimize_function_for_speed_p (cfun))
37596 op2 = force_reg (SImode, const0_rtx);
37598 emit_insn (gen_movstricthi
37599 (gen_lowpart (HImode, op2), op0));
37601 else
37603 op2 = gen_reg_rtx (SImode);
37605 emit_insn (gen_zero_extendhisi2 (op2, op0));
37608 else if (mode0 == SImode)
37609 op2 = op0;
37610 else
37611 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37613 if (target == 0
37614 || !register_operand (target, SImode))
37615 target = gen_reg_rtx (SImode);
37617 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37618 const0_rtx);
37619 emit_insn (gen_rtx_SET (target,
37620 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37621 return target;
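/* Usage sketch (assumption, not part of the original source): the
   *_STEP builtins store the random value through their pointer argument
   and return 1 when the hardware produced a value, 0 otherwise; the
   carry-flag conditional move above is what implements that return
   value, e.g.

     unsigned int r;
     if (__builtin_ia32_rdrand32_step (&r))
       consume (r);   // consume() is a hypothetical user function  */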
37623 case IX86_BUILTIN_RDSEED16_STEP:
37624 icode = CODE_FOR_rdseedhi_1;
37625 mode0 = HImode;
37626 goto rdseed_step;
37628 case IX86_BUILTIN_RDSEED32_STEP:
37629 icode = CODE_FOR_rdseedsi_1;
37630 mode0 = SImode;
37631 goto rdseed_step;
37633 case IX86_BUILTIN_RDSEED64_STEP:
37634 icode = CODE_FOR_rdseeddi_1;
37635 mode0 = DImode;
37637 rdseed_step:
37638 arg0 = CALL_EXPR_ARG (exp, 0);
37639 op1 = expand_normal (arg0);
37640 if (!address_operand (op1, VOIDmode))
37642 op1 = convert_memory_address (Pmode, op1);
37643 op1 = copy_addr_to_reg (op1);
37646 op0 = gen_reg_rtx (mode0);
37647 emit_insn (GEN_FCN (icode) (op0));
37649 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37651 op2 = gen_reg_rtx (QImode);
37653 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37654 const0_rtx);
37655 emit_insn (gen_rtx_SET (op2, pat));
37657 if (target == 0
37658 || !register_operand (target, SImode))
37659 target = gen_reg_rtx (SImode);
37661 emit_insn (gen_zero_extendqisi2 (target, op2));
37662 return target;
37664 case IX86_BUILTIN_SBB32:
37665 icode = CODE_FOR_subborrowsi;
37666 mode0 = SImode;
37667 goto handlecarry;
37669 case IX86_BUILTIN_SBB64:
37670 icode = CODE_FOR_subborrowdi;
37671 mode0 = DImode;
37672 goto handlecarry;
37674 case IX86_BUILTIN_ADDCARRYX32:
37675 icode = CODE_FOR_addcarrysi;
37676 mode0 = SImode;
37677 goto handlecarry;
37679 case IX86_BUILTIN_ADDCARRYX64:
37680 icode = CODE_FOR_addcarrydi;
37681 mode0 = DImode;
37683 handlecarry:
37684 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37685 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37686 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37687 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
37689 op1 = expand_normal (arg0);
37690 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37692 op2 = expand_normal (arg1);
37693 if (!register_operand (op2, mode0))
37694 op2 = copy_to_mode_reg (mode0, op2);
37696 op3 = expand_normal (arg2);
37697 if (!register_operand (op3, mode0))
37698 op3 = copy_to_mode_reg (mode0, op3);
37700 op4 = expand_normal (arg3);
37701 if (!address_operand (op4, VOIDmode))
37703 op4 = convert_memory_address (Pmode, op4);
37704 op4 = copy_addr_to_reg (op4);
37707 /* Generate CF from input operand. */
37708 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
37710 /* Generate instruction that consumes CF. */
37711 op0 = gen_reg_rtx (mode0);
37713 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37714 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
37715 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
37717 /* Return current CF value. */
37718 if (target == 0)
37719 target = gen_reg_rtx (QImode);
37721 PUT_MODE (pat, QImode);
37722 emit_insn (gen_rtx_SET (target, pat));
37724 /* Store the result. */
37725 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37727 return target;
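/* Usage sketch (assumption): these builtins thread the carry/borrow
   through multi-word arithmetic; the incoming carry byte is first turned
   back into CF (the addqi3_cconly_overflow above), the flag-consuming
   add/sub is emitted, and the new CF is returned while the sum goes
   through the pointer argument, e.g.

     unsigned int lo, hi;
     unsigned char c = _addcarryx_u32 (0, a0, b0, &lo);
     c = _addcarryx_u32 (c, a1, b1, &hi);

   (intrinsic spelling per adxintrin.h; variable names are illustrative).  */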
37729 case IX86_BUILTIN_READ_FLAGS:
37730 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37732 if (optimize
37733 || target == NULL_RTX
37734 || !nonimmediate_operand (target, word_mode)
37735 || GET_MODE (target) != word_mode)
37736 target = gen_reg_rtx (word_mode);
37738 emit_insn (gen_pop (target));
37739 return target;
37741 case IX86_BUILTIN_WRITE_FLAGS:
37743 arg0 = CALL_EXPR_ARG (exp, 0);
37744 op0 = expand_normal (arg0);
37745 if (!general_no_elim_operand (op0, word_mode))
37746 op0 = copy_to_mode_reg (word_mode, op0);
37748 emit_insn (gen_push (op0));
37749 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37750 return 0;
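/* Note (sketch): the flags builtins have no dedicated instruction, so
   they are expanded as a push/pop through the flags register, as above.
   User-level view (builtin names per ia32intrin.h; treat as an
   assumption):

     unsigned long long f = __builtin_ia32_readeflags_u64 ();
     __builtin_ia32_writeeflags_u64 (f);  */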
37752 case IX86_BUILTIN_KTESTC8:
37753 icode = CODE_FOR_ktestqi;
37754 mode0 = QImode;
37755 mode1 = CCCmode;
37756 goto kortest;
37758 case IX86_BUILTIN_KTESTZ8:
37759 icode = CODE_FOR_ktestqi;
37760 mode0 = QImode;
37761 mode1 = CCZmode;
37762 goto kortest;
37764 case IX86_BUILTIN_KTESTC16:
37765 icode = CODE_FOR_ktesthi;
37766 mode0 = HImode;
37767 mode1 = CCCmode;
37768 goto kortest;
37770 case IX86_BUILTIN_KTESTZ16:
37771 icode = CODE_FOR_ktesthi;
37772 mode0 = HImode;
37773 mode1 = CCZmode;
37774 goto kortest;
37776 case IX86_BUILTIN_KTESTC32:
37777 icode = CODE_FOR_ktestsi;
37778 mode0 = SImode;
37779 mode1 = CCCmode;
37780 goto kortest;
37782 case IX86_BUILTIN_KTESTZ32:
37783 icode = CODE_FOR_ktestsi;
37784 mode0 = SImode;
37785 mode1 = CCZmode;
37786 goto kortest;
37788 case IX86_BUILTIN_KTESTC64:
37789 icode = CODE_FOR_ktestdi;
37790 mode0 = DImode;
37791 mode1 = CCCmode;
37792 goto kortest;
37794 case IX86_BUILTIN_KTESTZ64:
37795 icode = CODE_FOR_ktestdi;
37796 mode0 = DImode;
37797 mode1 = CCZmode;
37798 goto kortest;
37800 case IX86_BUILTIN_KORTESTC8:
37801 icode = CODE_FOR_kortestqi;
37802 mode0 = QImode;
37803 mode1 = CCCmode;
37804 goto kortest;
37806 case IX86_BUILTIN_KORTESTZ8:
37807 icode = CODE_FOR_kortestqi;
37808 mode0 = QImode;
37809 mode1 = CCZmode;
37810 goto kortest;
37812 case IX86_BUILTIN_KORTESTC16:
37813 icode = CODE_FOR_kortesthi;
37814 mode0 = HImode;
37815 mode1 = CCCmode;
37816 goto kortest;
37818 case IX86_BUILTIN_KORTESTZ16:
37819 icode = CODE_FOR_kortesthi;
37820 mode0 = HImode;
37821 mode1 = CCZmode;
37822 goto kortest;
37824 case IX86_BUILTIN_KORTESTC32:
37825 icode = CODE_FOR_kortestsi;
37826 mode0 = SImode;
37827 mode1 = CCCmode;
37828 goto kortest;
37830 case IX86_BUILTIN_KORTESTZ32:
37831 icode = CODE_FOR_kortestsi;
37832 mode0 = SImode;
37833 mode1 = CCZmode;
37834 goto kortest;
37836 case IX86_BUILTIN_KORTESTC64:
37837 icode = CODE_FOR_kortestdi;
37838 mode0 = DImode;
37839 mode1 = CCCmode;
37840 goto kortest;
37842 case IX86_BUILTIN_KORTESTZ64:
37843 icode = CODE_FOR_kortestdi;
37844 mode0 = DImode;
37845 mode1 = CCZmode;
37847 kortest:
37848 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37849 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37850 op0 = expand_normal (arg0);
37851 op1 = expand_normal (arg1);
37853 op0 = copy_to_reg (op0);
37854 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37855 op1 = copy_to_reg (op1);
37856 op1 = lowpart_subreg (mode0, op1, GET_MODE (op1));
37858 target = gen_reg_rtx (QImode);
37859 emit_insn (gen_rtx_SET (target, const0_rtx));
37861 /* Emit kortest. */
37862 emit_insn (GEN_FCN (icode) (op0, op1));
37863 /* And use setcc to return result from flags. */
37864 ix86_expand_setcc (target, EQ,
37865 gen_rtx_REG (mode1, FLAGS_REG), const0_rtx);
37866 return target;
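/* For reference (sketch, not from the original source): KORTEST ors the
   two mask operands and sets ZF when the result is all zeros and CF when
   it is all ones; KTEST performs the analogous test on the AND/ANDN of
   the masks.  The *Z/*C builtin pairs above differ only in which flag
   mode is handed to the setcc, e.g. conceptually

     zero = _kortestz_mask16_u8 (m1, m2);   // 1 iff (m1 | m2) == 0

   (intrinsic name as exposed by avx512fintrin.h; assumption here).  */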
37868 case IX86_BUILTIN_GATHERSIV2DF:
37869 icode = CODE_FOR_avx2_gathersiv2df;
37870 goto gather_gen;
37871 case IX86_BUILTIN_GATHERSIV4DF:
37872 icode = CODE_FOR_avx2_gathersiv4df;
37873 goto gather_gen;
37874 case IX86_BUILTIN_GATHERDIV2DF:
37875 icode = CODE_FOR_avx2_gatherdiv2df;
37876 goto gather_gen;
37877 case IX86_BUILTIN_GATHERDIV4DF:
37878 icode = CODE_FOR_avx2_gatherdiv4df;
37879 goto gather_gen;
37880 case IX86_BUILTIN_GATHERSIV4SF:
37881 icode = CODE_FOR_avx2_gathersiv4sf;
37882 goto gather_gen;
37883 case IX86_BUILTIN_GATHERSIV8SF:
37884 icode = CODE_FOR_avx2_gathersiv8sf;
37885 goto gather_gen;
37886 case IX86_BUILTIN_GATHERDIV4SF:
37887 icode = CODE_FOR_avx2_gatherdiv4sf;
37888 goto gather_gen;
37889 case IX86_BUILTIN_GATHERDIV8SF:
37890 icode = CODE_FOR_avx2_gatherdiv8sf;
37891 goto gather_gen;
37892 case IX86_BUILTIN_GATHERSIV2DI:
37893 icode = CODE_FOR_avx2_gathersiv2di;
37894 goto gather_gen;
37895 case IX86_BUILTIN_GATHERSIV4DI:
37896 icode = CODE_FOR_avx2_gathersiv4di;
37897 goto gather_gen;
37898 case IX86_BUILTIN_GATHERDIV2DI:
37899 icode = CODE_FOR_avx2_gatherdiv2di;
37900 goto gather_gen;
37901 case IX86_BUILTIN_GATHERDIV4DI:
37902 icode = CODE_FOR_avx2_gatherdiv4di;
37903 goto gather_gen;
37904 case IX86_BUILTIN_GATHERSIV4SI:
37905 icode = CODE_FOR_avx2_gathersiv4si;
37906 goto gather_gen;
37907 case IX86_BUILTIN_GATHERSIV8SI:
37908 icode = CODE_FOR_avx2_gathersiv8si;
37909 goto gather_gen;
37910 case IX86_BUILTIN_GATHERDIV4SI:
37911 icode = CODE_FOR_avx2_gatherdiv4si;
37912 goto gather_gen;
37913 case IX86_BUILTIN_GATHERDIV8SI:
37914 icode = CODE_FOR_avx2_gatherdiv8si;
37915 goto gather_gen;
37916 case IX86_BUILTIN_GATHERALTSIV4DF:
37917 icode = CODE_FOR_avx2_gathersiv4df;
37918 goto gather_gen;
37919 case IX86_BUILTIN_GATHERALTDIV8SF:
37920 icode = CODE_FOR_avx2_gatherdiv8sf;
37921 goto gather_gen;
37922 case IX86_BUILTIN_GATHERALTSIV4DI:
37923 icode = CODE_FOR_avx2_gathersiv4di;
37924 goto gather_gen;
37925 case IX86_BUILTIN_GATHERALTDIV8SI:
37926 icode = CODE_FOR_avx2_gatherdiv8si;
37927 goto gather_gen;
37928 case IX86_BUILTIN_GATHER3SIV16SF:
37929 icode = CODE_FOR_avx512f_gathersiv16sf;
37930 goto gather_gen;
37931 case IX86_BUILTIN_GATHER3SIV8DF:
37932 icode = CODE_FOR_avx512f_gathersiv8df;
37933 goto gather_gen;
37934 case IX86_BUILTIN_GATHER3DIV16SF:
37935 icode = CODE_FOR_avx512f_gatherdiv16sf;
37936 goto gather_gen;
37937 case IX86_BUILTIN_GATHER3DIV8DF:
37938 icode = CODE_FOR_avx512f_gatherdiv8df;
37939 goto gather_gen;
37940 case IX86_BUILTIN_GATHER3SIV16SI:
37941 icode = CODE_FOR_avx512f_gathersiv16si;
37942 goto gather_gen;
37943 case IX86_BUILTIN_GATHER3SIV8DI:
37944 icode = CODE_FOR_avx512f_gathersiv8di;
37945 goto gather_gen;
37946 case IX86_BUILTIN_GATHER3DIV16SI:
37947 icode = CODE_FOR_avx512f_gatherdiv16si;
37948 goto gather_gen;
37949 case IX86_BUILTIN_GATHER3DIV8DI:
37950 icode = CODE_FOR_avx512f_gatherdiv8di;
37951 goto gather_gen;
37952 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37953 icode = CODE_FOR_avx512f_gathersiv8df;
37954 goto gather_gen;
37955 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37956 icode = CODE_FOR_avx512f_gatherdiv16sf;
37957 goto gather_gen;
37958 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37959 icode = CODE_FOR_avx512f_gathersiv8di;
37960 goto gather_gen;
37961 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37962 icode = CODE_FOR_avx512f_gatherdiv16si;
37963 goto gather_gen;
37964 case IX86_BUILTIN_GATHER3SIV2DF:
37965 icode = CODE_FOR_avx512vl_gathersiv2df;
37966 goto gather_gen;
37967 case IX86_BUILTIN_GATHER3SIV4DF:
37968 icode = CODE_FOR_avx512vl_gathersiv4df;
37969 goto gather_gen;
37970 case IX86_BUILTIN_GATHER3DIV2DF:
37971 icode = CODE_FOR_avx512vl_gatherdiv2df;
37972 goto gather_gen;
37973 case IX86_BUILTIN_GATHER3DIV4DF:
37974 icode = CODE_FOR_avx512vl_gatherdiv4df;
37975 goto gather_gen;
37976 case IX86_BUILTIN_GATHER3SIV4SF:
37977 icode = CODE_FOR_avx512vl_gathersiv4sf;
37978 goto gather_gen;
37979 case IX86_BUILTIN_GATHER3SIV8SF:
37980 icode = CODE_FOR_avx512vl_gathersiv8sf;
37981 goto gather_gen;
37982 case IX86_BUILTIN_GATHER3DIV4SF:
37983 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37984 goto gather_gen;
37985 case IX86_BUILTIN_GATHER3DIV8SF:
37986 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37987 goto gather_gen;
37988 case IX86_BUILTIN_GATHER3SIV2DI:
37989 icode = CODE_FOR_avx512vl_gathersiv2di;
37990 goto gather_gen;
37991 case IX86_BUILTIN_GATHER3SIV4DI:
37992 icode = CODE_FOR_avx512vl_gathersiv4di;
37993 goto gather_gen;
37994 case IX86_BUILTIN_GATHER3DIV2DI:
37995 icode = CODE_FOR_avx512vl_gatherdiv2di;
37996 goto gather_gen;
37997 case IX86_BUILTIN_GATHER3DIV4DI:
37998 icode = CODE_FOR_avx512vl_gatherdiv4di;
37999 goto gather_gen;
38000 case IX86_BUILTIN_GATHER3SIV4SI:
38001 icode = CODE_FOR_avx512vl_gathersiv4si;
38002 goto gather_gen;
38003 case IX86_BUILTIN_GATHER3SIV8SI:
38004 icode = CODE_FOR_avx512vl_gathersiv8si;
38005 goto gather_gen;
38006 case IX86_BUILTIN_GATHER3DIV4SI:
38007 icode = CODE_FOR_avx512vl_gatherdiv4si;
38008 goto gather_gen;
38009 case IX86_BUILTIN_GATHER3DIV8SI:
38010 icode = CODE_FOR_avx512vl_gatherdiv8si;
38011 goto gather_gen;
38012 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38013 icode = CODE_FOR_avx512vl_gathersiv4df;
38014 goto gather_gen;
38015 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38016 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38017 goto gather_gen;
38018 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38019 icode = CODE_FOR_avx512vl_gathersiv4di;
38020 goto gather_gen;
38021 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38022 icode = CODE_FOR_avx512vl_gatherdiv8si;
38023 goto gather_gen;
38024 case IX86_BUILTIN_SCATTERSIV16SF:
38025 icode = CODE_FOR_avx512f_scattersiv16sf;
38026 goto scatter_gen;
38027 case IX86_BUILTIN_SCATTERSIV8DF:
38028 icode = CODE_FOR_avx512f_scattersiv8df;
38029 goto scatter_gen;
38030 case IX86_BUILTIN_SCATTERDIV16SF:
38031 icode = CODE_FOR_avx512f_scatterdiv16sf;
38032 goto scatter_gen;
38033 case IX86_BUILTIN_SCATTERDIV8DF:
38034 icode = CODE_FOR_avx512f_scatterdiv8df;
38035 goto scatter_gen;
38036 case IX86_BUILTIN_SCATTERSIV16SI:
38037 icode = CODE_FOR_avx512f_scattersiv16si;
38038 goto scatter_gen;
38039 case IX86_BUILTIN_SCATTERSIV8DI:
38040 icode = CODE_FOR_avx512f_scattersiv8di;
38041 goto scatter_gen;
38042 case IX86_BUILTIN_SCATTERDIV16SI:
38043 icode = CODE_FOR_avx512f_scatterdiv16si;
38044 goto scatter_gen;
38045 case IX86_BUILTIN_SCATTERDIV8DI:
38046 icode = CODE_FOR_avx512f_scatterdiv8di;
38047 goto scatter_gen;
38048 case IX86_BUILTIN_SCATTERSIV8SF:
38049 icode = CODE_FOR_avx512vl_scattersiv8sf;
38050 goto scatter_gen;
38051 case IX86_BUILTIN_SCATTERSIV4SF:
38052 icode = CODE_FOR_avx512vl_scattersiv4sf;
38053 goto scatter_gen;
38054 case IX86_BUILTIN_SCATTERSIV4DF:
38055 icode = CODE_FOR_avx512vl_scattersiv4df;
38056 goto scatter_gen;
38057 case IX86_BUILTIN_SCATTERSIV2DF:
38058 icode = CODE_FOR_avx512vl_scattersiv2df;
38059 goto scatter_gen;
38060 case IX86_BUILTIN_SCATTERDIV8SF:
38061 icode = CODE_FOR_avx512vl_scatterdiv8sf;
38062 goto scatter_gen;
38063 case IX86_BUILTIN_SCATTERDIV4SF:
38064 icode = CODE_FOR_avx512vl_scatterdiv4sf;
38065 goto scatter_gen;
38066 case IX86_BUILTIN_SCATTERDIV4DF:
38067 icode = CODE_FOR_avx512vl_scatterdiv4df;
38068 goto scatter_gen;
38069 case IX86_BUILTIN_SCATTERDIV2DF:
38070 icode = CODE_FOR_avx512vl_scatterdiv2df;
38071 goto scatter_gen;
38072 case IX86_BUILTIN_SCATTERSIV8SI:
38073 icode = CODE_FOR_avx512vl_scattersiv8si;
38074 goto scatter_gen;
38075 case IX86_BUILTIN_SCATTERSIV4SI:
38076 icode = CODE_FOR_avx512vl_scattersiv4si;
38077 goto scatter_gen;
38078 case IX86_BUILTIN_SCATTERSIV4DI:
38079 icode = CODE_FOR_avx512vl_scattersiv4di;
38080 goto scatter_gen;
38081 case IX86_BUILTIN_SCATTERSIV2DI:
38082 icode = CODE_FOR_avx512vl_scattersiv2di;
38083 goto scatter_gen;
38084 case IX86_BUILTIN_SCATTERDIV8SI:
38085 icode = CODE_FOR_avx512vl_scatterdiv8si;
38086 goto scatter_gen;
38087 case IX86_BUILTIN_SCATTERDIV4SI:
38088 icode = CODE_FOR_avx512vl_scatterdiv4si;
38089 goto scatter_gen;
38090 case IX86_BUILTIN_SCATTERDIV4DI:
38091 icode = CODE_FOR_avx512vl_scatterdiv4di;
38092 goto scatter_gen;
38093 case IX86_BUILTIN_SCATTERDIV2DI:
38094 icode = CODE_FOR_avx512vl_scatterdiv2di;
38095 goto scatter_gen;
38096 case IX86_BUILTIN_GATHERPFDPD:
38097 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
38098 goto vec_prefetch_gen;
38099 case IX86_BUILTIN_SCATTERALTSIV8DF:
38100 icode = CODE_FOR_avx512f_scattersiv8df;
38101 goto scatter_gen;
38102 case IX86_BUILTIN_SCATTERALTDIV16SF:
38103 icode = CODE_FOR_avx512f_scatterdiv16sf;
38104 goto scatter_gen;
38105 case IX86_BUILTIN_SCATTERALTSIV8DI:
38106 icode = CODE_FOR_avx512f_scattersiv8di;
38107 goto scatter_gen;
38108 case IX86_BUILTIN_SCATTERALTDIV16SI:
38109 icode = CODE_FOR_avx512f_scatterdiv16si;
38110 goto scatter_gen;
38111 case IX86_BUILTIN_GATHERPFDPS:
38112 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
38113 goto vec_prefetch_gen;
38114 case IX86_BUILTIN_GATHERPFQPD:
38115 icode = CODE_FOR_avx512pf_gatherpfv8didf;
38116 goto vec_prefetch_gen;
38117 case IX86_BUILTIN_GATHERPFQPS:
38118 icode = CODE_FOR_avx512pf_gatherpfv8disf;
38119 goto vec_prefetch_gen;
38120 case IX86_BUILTIN_SCATTERPFDPD:
38121 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
38122 goto vec_prefetch_gen;
38123 case IX86_BUILTIN_SCATTERPFDPS:
38124 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
38125 goto vec_prefetch_gen;
38126 case IX86_BUILTIN_SCATTERPFQPD:
38127 icode = CODE_FOR_avx512pf_scatterpfv8didf;
38128 goto vec_prefetch_gen;
38129 case IX86_BUILTIN_SCATTERPFQPS:
38130 icode = CODE_FOR_avx512pf_scatterpfv8disf;
38131 goto vec_prefetch_gen;
38133 gather_gen:
38134 rtx half;
38135 rtx (*gen) (rtx, rtx);
38137 arg0 = CALL_EXPR_ARG (exp, 0);
38138 arg1 = CALL_EXPR_ARG (exp, 1);
38139 arg2 = CALL_EXPR_ARG (exp, 2);
38140 arg3 = CALL_EXPR_ARG (exp, 3);
38141 arg4 = CALL_EXPR_ARG (exp, 4);
38142 op0 = expand_normal (arg0);
38143 op1 = expand_normal (arg1);
38144 op2 = expand_normal (arg2);
38145 op3 = expand_normal (arg3);
38146 op4 = expand_normal (arg4);
38147 /* Note the arg order is different from the operand order. */
38148 mode0 = insn_data[icode].operand[1].mode;
38149 mode2 = insn_data[icode].operand[3].mode;
38150 mode3 = insn_data[icode].operand[4].mode;
38151 mode4 = insn_data[icode].operand[5].mode;
38153 if (target == NULL_RTX
38154 || GET_MODE (target) != insn_data[icode].operand[0].mode
38155 || !insn_data[icode].operand[0].predicate (target,
38156 GET_MODE (target)))
38157 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
38158 else
38159 subtarget = target;
38161 switch (fcode)
38163 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38164 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38165 half = gen_reg_rtx (V8SImode);
38166 if (!nonimmediate_operand (op2, V16SImode))
38167 op2 = copy_to_mode_reg (V16SImode, op2);
38168 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38169 op2 = half;
38170 break;
38171 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38172 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38173 case IX86_BUILTIN_GATHERALTSIV4DF:
38174 case IX86_BUILTIN_GATHERALTSIV4DI:
38175 half = gen_reg_rtx (V4SImode);
38176 if (!nonimmediate_operand (op2, V8SImode))
38177 op2 = copy_to_mode_reg (V8SImode, op2);
38178 emit_insn (gen_vec_extract_lo_v8si (half, op2));
38179 op2 = half;
38180 break;
38181 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38182 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38183 half = gen_reg_rtx (mode0);
38184 if (mode0 == V8SFmode)
38185 gen = gen_vec_extract_lo_v16sf;
38186 else
38187 gen = gen_vec_extract_lo_v16si;
38188 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38189 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38190 emit_insn (gen (half, op0));
38191 op0 = half;
38192 if (GET_MODE (op3) != VOIDmode)
38194 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38195 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38196 emit_insn (gen (half, op3));
38197 op3 = half;
38199 break;
38200 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38201 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38202 case IX86_BUILTIN_GATHERALTDIV8SF:
38203 case IX86_BUILTIN_GATHERALTDIV8SI:
38204 half = gen_reg_rtx (mode0);
38205 if (mode0 == V4SFmode)
38206 gen = gen_vec_extract_lo_v8sf;
38207 else
38208 gen = gen_vec_extract_lo_v8si;
38209 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38210 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38211 emit_insn (gen (half, op0));
38212 op0 = half;
38213 if (GET_MODE (op3) != VOIDmode)
38215 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38216 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38217 emit_insn (gen (half, op3));
38218 op3 = half;
38220 break;
38221 default:
38222 break;
38225 /* Force memory operand only with base register here. But we
38226 don't want to do it on memory operand for other builtin
38227 functions. */
38228 op1 = ix86_zero_extend_to_Pmode (op1);
38230 if (!insn_data[icode].operand[1].predicate (op0, mode0))
38231 op0 = copy_to_mode_reg (mode0, op0);
38232 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
38233 op1 = copy_to_mode_reg (Pmode, op1);
38234 if (!insn_data[icode].operand[3].predicate (op2, mode2))
38235 op2 = copy_to_mode_reg (mode2, op2);
38237 op3 = fixup_modeless_constant (op3, mode3);
38239 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
38241 if (!insn_data[icode].operand[4].predicate (op3, mode3))
38242 op3 = copy_to_mode_reg (mode3, op3);
38244 else
38246 op3 = copy_to_reg (op3);
38247 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
38249 if (!insn_data[icode].operand[5].predicate (op4, mode4))
38251 error ("the last argument must be scale 1, 2, 4, 8");
38252 return const0_rtx;
38255 /* Optimize. If mask is known to have all high bits set,
38256 replace op0 with pc_rtx to signal that the instruction
38257 overwrites the whole destination and doesn't use its
38258 previous contents. */
38259 if (optimize)
38261 if (TREE_CODE (arg3) == INTEGER_CST)
38263 if (integer_all_onesp (arg3))
38264 op0 = pc_rtx;
38266 else if (TREE_CODE (arg3) == VECTOR_CST)
38268 unsigned int negative = 0;
38269 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
38271 tree cst = VECTOR_CST_ELT (arg3, i);
38272 if (TREE_CODE (cst) == INTEGER_CST
38273 && tree_int_cst_sign_bit (cst))
38274 negative++;
38275 else if (TREE_CODE (cst) == REAL_CST
38276 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
38277 negative++;
38279 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
38280 op0 = pc_rtx;
38282 else if (TREE_CODE (arg3) == SSA_NAME
38283 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
38285 /* Also recognize when the mask is like:
38286 __v2df src = _mm_setzero_pd ();
38287 __v2df mask = _mm_cmpeq_pd (src, src);
38289 __v8sf src = _mm256_setzero_ps ();
38290 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
38291 as that is a cheaper way to load all ones into
38292 a register than having to load a constant from
38293 memory. */
38294 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
38295 if (is_gimple_call (def_stmt))
38297 tree fndecl = gimple_call_fndecl (def_stmt);
38298 if (fndecl
38299 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
38300 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
38302 case IX86_BUILTIN_CMPPD:
38303 case IX86_BUILTIN_CMPPS:
38304 case IX86_BUILTIN_CMPPD256:
38305 case IX86_BUILTIN_CMPPS256:
38306 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
38307 break;
38308 /* FALLTHRU */
38309 case IX86_BUILTIN_CMPEQPD:
38310 case IX86_BUILTIN_CMPEQPS:
38311 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
38312 && initializer_zerop (gimple_call_arg (def_stmt,
38313 1)))
38314 op0 = pc_rtx;
38315 break;
38316 default:
38317 break;
38323 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
38324 if (! pat)
38325 return const0_rtx;
38326 emit_insn (pat);
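/* Note (not in the original source): for the *DIV* gather variants
   handled below, the instruction pattern produces a full-width
   destination but only its low half carries gathered elements, because
   the result has fewer elements than the 64-bit index vector; e.g.
   GATHER3DIV16SF hands back a V8SF extracted from the V16SF pattern
   destination, mirroring the low-half index extraction done for the
   *ALT* variants above.  */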
38328 switch (fcode)
38330 case IX86_BUILTIN_GATHER3DIV16SF:
38331 if (target == NULL_RTX)
38332 target = gen_reg_rtx (V8SFmode);
38333 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
38334 break;
38335 case IX86_BUILTIN_GATHER3DIV16SI:
38336 if (target == NULL_RTX)
38337 target = gen_reg_rtx (V8SImode);
38338 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
38339 break;
38340 case IX86_BUILTIN_GATHER3DIV8SF:
38341 case IX86_BUILTIN_GATHERDIV8SF:
38342 if (target == NULL_RTX)
38343 target = gen_reg_rtx (V4SFmode);
38344 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
38345 break;
38346 case IX86_BUILTIN_GATHER3DIV8SI:
38347 case IX86_BUILTIN_GATHERDIV8SI:
38348 if (target == NULL_RTX)
38349 target = gen_reg_rtx (V4SImode);
38350 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
38351 break;
38352 default:
38353 target = subtarget;
38354 break;
38356 return target;
38358 scatter_gen:
38359 arg0 = CALL_EXPR_ARG (exp, 0);
38360 arg1 = CALL_EXPR_ARG (exp, 1);
38361 arg2 = CALL_EXPR_ARG (exp, 2);
38362 arg3 = CALL_EXPR_ARG (exp, 3);
38363 arg4 = CALL_EXPR_ARG (exp, 4);
38364 op0 = expand_normal (arg0);
38365 op1 = expand_normal (arg1);
38366 op2 = expand_normal (arg2);
38367 op3 = expand_normal (arg3);
38368 op4 = expand_normal (arg4);
38369 mode1 = insn_data[icode].operand[1].mode;
38370 mode2 = insn_data[icode].operand[2].mode;
38371 mode3 = insn_data[icode].operand[3].mode;
38372 mode4 = insn_data[icode].operand[4].mode;
38374 /* Scatter instruction stores operand op3 to memory with
38375 indices from op2 and scale from op4 under writemask op1.
38376 If index operand op2 has more elements than source operand
38377 op3, one needs to use only its low half. And vice versa. */
38378 switch (fcode)
38380 case IX86_BUILTIN_SCATTERALTSIV8DF:
38381 case IX86_BUILTIN_SCATTERALTSIV8DI:
38382 half = gen_reg_rtx (V8SImode);
38383 if (!nonimmediate_operand (op2, V16SImode))
38384 op2 = copy_to_mode_reg (V16SImode, op2);
38385 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38386 op2 = half;
38387 break;
38388 case IX86_BUILTIN_SCATTERALTDIV16SF:
38389 case IX86_BUILTIN_SCATTERALTDIV16SI:
38390 half = gen_reg_rtx (mode3);
38391 if (mode3 == V8SFmode)
38392 gen = gen_vec_extract_lo_v16sf;
38393 else
38394 gen = gen_vec_extract_lo_v16si;
38395 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38396 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38397 emit_insn (gen (half, op3));
38398 op3 = half;
38399 break;
38400 default:
38401 break;
38404 /* Force memory operand only with base register here. But we
38405 don't want to do it on memory operand for other builtin
38406 functions. */
38407 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
38409 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38410 op0 = copy_to_mode_reg (Pmode, op0);
38412 op1 = fixup_modeless_constant (op1, mode1);
38414 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
38416 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38417 op1 = copy_to_mode_reg (mode1, op1);
38419 else
38421 op1 = copy_to_reg (op1);
38422 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
38425 if (!insn_data[icode].operand[2].predicate (op2, mode2))
38426 op2 = copy_to_mode_reg (mode2, op2);
38428 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38429 op3 = copy_to_mode_reg (mode3, op3);
38431 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38433 error ("the last argument must be scale 1, 2, 4, 8");
38434 return const0_rtx;
38437 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38438 if (! pat)
38439 return const0_rtx;
38441 emit_insn (pat);
38442 return 0;
38444 vec_prefetch_gen:
38445 arg0 = CALL_EXPR_ARG (exp, 0);
38446 arg1 = CALL_EXPR_ARG (exp, 1);
38447 arg2 = CALL_EXPR_ARG (exp, 2);
38448 arg3 = CALL_EXPR_ARG (exp, 3);
38449 arg4 = CALL_EXPR_ARG (exp, 4);
38450 op0 = expand_normal (arg0);
38451 op1 = expand_normal (arg1);
38452 op2 = expand_normal (arg2);
38453 op3 = expand_normal (arg3);
38454 op4 = expand_normal (arg4);
38455 mode0 = insn_data[icode].operand[0].mode;
38456 mode1 = insn_data[icode].operand[1].mode;
38457 mode3 = insn_data[icode].operand[3].mode;
38458 mode4 = insn_data[icode].operand[4].mode;
38460 op0 = fixup_modeless_constant (op0, mode0);
38462 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
38464 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38465 op0 = copy_to_mode_reg (mode0, op0);
38467 else
38469 op0 = copy_to_reg (op0);
38470 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
38473 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38474 op1 = copy_to_mode_reg (mode1, op1);
38476 /* Force memory operand only with base register here. But we
38477 don't want to do it on memory operand for other builtin
38478 functions. */
38479 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
38481 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
38482 op2 = copy_to_mode_reg (Pmode, op2);
38484 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38486 error ("the forth argument must be scale 1, 2, 4, 8");
38487 return const0_rtx;
38490 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38492 error ("incorrect hint operand");
38493 return const0_rtx;
38496 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38497 if (! pat)
38498 return const0_rtx;
38500 emit_insn (pat);
38502 return 0;
38504 case IX86_BUILTIN_XABORT:
38505 icode = CODE_FOR_xabort;
38506 arg0 = CALL_EXPR_ARG (exp, 0);
38507 op0 = expand_normal (arg0);
38508 mode0 = insn_data[icode].operand[0].mode;
38509 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38511 error ("the xabort's argument must be an 8-bit immediate");
38512 return const0_rtx;
38514 emit_insn (gen_xabort (op0));
38515 return 0;
38517 default:
38518 break;
38521 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
38522 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
38524 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
38525 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
38526 target);
38529 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
38530 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
38532 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
38533 switch (fcode)
38535 case IX86_BUILTIN_FABSQ:
38536 case IX86_BUILTIN_COPYSIGNQ:
38537 if (!TARGET_SSE)
38538 /* Emit a normal call if SSE isn't available. */
38539 return expand_call (exp, target, ignore);
38540 /* FALLTHRU */
38541 default:
38542 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
38546 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
38547 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
38549 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
38550 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
38551 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
38552 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
38553 int masked = 1;
38554 machine_mode mode, wide_mode, nar_mode;
38556 nar_mode = V4SFmode;
38557 mode = V16SFmode;
38558 wide_mode = V64SFmode;
38559 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
38560 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
38562 switch (fcode)
38564 case IX86_BUILTIN_4FMAPS:
38565 fcn = gen_avx5124fmaddps_4fmaddps;
38566 masked = 0;
38567 goto v4fma_expand;
38569 case IX86_BUILTIN_4DPWSSD:
38570 nar_mode = V4SImode;
38571 mode = V16SImode;
38572 wide_mode = V64SImode;
38573 fcn = gen_avx5124vnniw_vp4dpwssd;
38574 masked = 0;
38575 goto v4fma_expand;
38577 case IX86_BUILTIN_4DPWSSDS:
38578 nar_mode = V4SImode;
38579 mode = V16SImode;
38580 wide_mode = V64SImode;
38581 fcn = gen_avx5124vnniw_vp4dpwssds;
38582 masked = 0;
38583 goto v4fma_expand;
38585 case IX86_BUILTIN_4FNMAPS:
38586 fcn = gen_avx5124fmaddps_4fnmaddps;
38587 masked = 0;
38588 goto v4fma_expand;
38590 case IX86_BUILTIN_4FNMAPS_MASK:
38591 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
38592 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
38593 goto v4fma_expand;
38595 case IX86_BUILTIN_4DPWSSD_MASK:
38596 nar_mode = V4SImode;
38597 mode = V16SImode;
38598 wide_mode = V64SImode;
38599 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
38600 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
38601 goto v4fma_expand;
38603 case IX86_BUILTIN_4DPWSSDS_MASK:
38604 nar_mode = V4SImode;
38605 mode = V16SImode;
38606 wide_mode = V64SImode;
38607 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
38608 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
38609 goto v4fma_expand;
38611 case IX86_BUILTIN_4FMAPS_MASK:
38613 tree args[4];
38614 rtx ops[4];
38615 rtx wide_reg;
38616 rtx accum;
38617 rtx addr;
38618 rtx mem;
38620 v4fma_expand:
38621 wide_reg = gen_reg_rtx (wide_mode);
38622 for (i = 0; i < 4; i++)
38624 args[i] = CALL_EXPR_ARG (exp, i);
38625 ops[i] = expand_normal (args[i]);
38627 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
38628 ops[i]);
38631 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38632 accum = force_reg (mode, accum);
38634 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38635 addr = force_reg (Pmode, addr);
38637 mem = gen_rtx_MEM (nar_mode, addr);
38639 target = gen_reg_rtx (mode);
38641 emit_move_insn (target, accum);
38643 if (! masked)
38644 emit_insn (fcn (target, accum, wide_reg, mem));
38645 else
38647 rtx merge, mask;
38648 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38650 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38652 if (CONST_INT_P (mask))
38653 mask = fixup_modeless_constant (mask, HImode);
38655 mask = force_reg (HImode, mask);
38657 if (GET_MODE (mask) != HImode)
38658 mask = gen_rtx_SUBREG (HImode, mask, 0);
38660 /* If merge is 0 then we're about to emit z-masked variant. */
38661 if (const0_operand (merge, mode))
38662 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38663 /* If merge is the same as accum then emit merge-masked variant. */
38664 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38666 merge = force_reg (mode, merge);
38667 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38669 /* Merge with something unknown might happen if we z-mask w/ -O0. */
38670 else
38672 target = gen_reg_rtx (mode);
38673 emit_move_insn (target, merge);
38674 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38677 return target;
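/* Note (sketch): the AVX512_4FMAPS/4VNNIW instructions consume four
   consecutive vector registers as a single operand group; the expansion
   above models that by copying the four separate vector arguments into
   one wide V64SF/V64SI pseudo before emitting the insn.  Argument 4 is
   the accumulator, argument 5 the pointer that becomes the narrow
   memory operand, and for the masked variants arguments 6 and 7 supply
   the merge value and the mask.  The corresponding user intrinsic is
   along the lines of _mm512_4fmadd_ps (name per avx5124fmapsintrin.h;
   treat it as an assumption).  */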
38680 case IX86_BUILTIN_4FNMASS:
38681 fcn = gen_avx5124fmaddps_4fnmaddss;
38682 masked = 0;
38683 goto s4fma_expand;
38685 case IX86_BUILTIN_4FMASS:
38686 fcn = gen_avx5124fmaddps_4fmaddss;
38687 masked = 0;
38688 goto s4fma_expand;
38690 case IX86_BUILTIN_4FNMASS_MASK:
38691 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
38692 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
38693 goto s4fma_expand;
38695 case IX86_BUILTIN_4FMASS_MASK:
38697 tree args[4];
38698 rtx ops[4];
38699 rtx wide_reg;
38700 rtx accum;
38701 rtx addr;
38702 rtx mem;
38704 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
38705 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
38707 s4fma_expand:
38708 mode = V4SFmode;
38709 wide_reg = gen_reg_rtx (V64SFmode);
38710 for (i = 0; i < 4; i++)
38712 rtx tmp;
38713 args[i] = CALL_EXPR_ARG (exp, i);
38714 ops[i] = expand_normal (args[i]);
38716 tmp = gen_reg_rtx (SFmode);
38717 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
38719 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
38720 gen_rtx_SUBREG (V16SFmode, tmp, 0));
38723 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38724 accum = force_reg (V4SFmode, accum);
38726 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38727 addr = force_reg (Pmode, addr);
38729 mem = gen_rtx_MEM (V4SFmode, addr);
38731 target = gen_reg_rtx (V4SFmode);
38733 emit_move_insn (target, accum);
38735 if (! masked)
38736 emit_insn (fcn (target, accum, wide_reg, mem));
38737 else
38739 rtx merge, mask;
38740 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38742 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38744 if (CONST_INT_P (mask))
38745 mask = fixup_modeless_constant (mask, QImode);
38747 mask = force_reg (QImode, mask);
38749 if (GET_MODE (mask) != QImode)
38750 mask = gen_rtx_SUBREG (QImode, mask, 0);
38752 /* If merge is 0 then we're about to emit z-masked variant. */
38753 if (const0_operand (merge, mode))
38754 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38755 /* If merge is the same as accum then emit merge-masked
38756 variant. */
38757 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38759 merge = force_reg (mode, merge);
38760 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38762 /* Merge with something unknown might happen if we z-mask
38763 w/ -O0. */
38764 else
38766 target = gen_reg_rtx (mode);
38767 emit_move_insn (target, merge);
38768 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38771 return target;
38773 case IX86_BUILTIN_RDPID:
38774 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
38775 target);
38776 default:
38777 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
38781 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
38782 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
38784 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
38785 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
38788 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
38789 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
38791 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
38792 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
38795 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
38796 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
38798 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
38799 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
38802 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
38803 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
38805 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
38806 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
38809 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
38810 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
38812 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
38813 const struct builtin_description *d = bdesc_multi_arg + i;
38814 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
38815 (enum ix86_builtin_func_type)
38816 d->flag, d->comparison);
38819 gcc_unreachable ();
38822 /* This returns the target-specific builtin with code CODE if
38823 current_function_decl has visibility on this builtin, which is checked
38824 using isa flags. Returns NULL_TREE otherwise. */
38826 static tree ix86_get_builtin (enum ix86_builtins code)
38828 struct cl_target_option *opts;
38829 tree target_tree = NULL_TREE;
38831 /* Determine the isa flags of current_function_decl. */
38833 if (current_function_decl)
38834 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
38836 if (target_tree == NULL)
38837 target_tree = target_option_default_node;
38839 opts = TREE_TARGET_OPTION (target_tree);
38841 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
38842 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
38843 return ix86_builtin_decl (code, true);
38844 else
38845 return NULL_TREE;
38848 /* Return the function decl for the target-specific builtin
38849 corresponding to the MPX builtin passed in FCODE. */
38850 static tree
38851 ix86_builtin_mpx_function (unsigned fcode)
38853 switch (fcode)
38855 case BUILT_IN_CHKP_BNDMK:
38856 return ix86_builtins[IX86_BUILTIN_BNDMK];
38858 case BUILT_IN_CHKP_BNDSTX:
38859 return ix86_builtins[IX86_BUILTIN_BNDSTX];
38861 case BUILT_IN_CHKP_BNDLDX:
38862 return ix86_builtins[IX86_BUILTIN_BNDLDX];
38864 case BUILT_IN_CHKP_BNDCL:
38865 return ix86_builtins[IX86_BUILTIN_BNDCL];
38867 case BUILT_IN_CHKP_BNDCU:
38868 return ix86_builtins[IX86_BUILTIN_BNDCU];
38870 case BUILT_IN_CHKP_BNDRET:
38871 return ix86_builtins[IX86_BUILTIN_BNDRET];
38873 case BUILT_IN_CHKP_INTERSECT:
38874 return ix86_builtins[IX86_BUILTIN_BNDINT];
38876 case BUILT_IN_CHKP_NARROW:
38877 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
38879 case BUILT_IN_CHKP_SIZEOF:
38880 return ix86_builtins[IX86_BUILTIN_SIZEOF];
38882 case BUILT_IN_CHKP_EXTRACT_LOWER:
38883 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
38885 case BUILT_IN_CHKP_EXTRACT_UPPER:
38886 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
38888 default:
38889 return NULL_TREE;
38892 gcc_unreachable ();
38895 /* Helper function for ix86_load_bounds and ix86_store_bounds.
38897 Return an address to be used to load/store bounds for the pointer
38898 passed in SLOT.
38900 SLOT_NO is an integer constant holding the number of a target
38901 dependent special slot to be used in case SLOT is not a memory.
38903 SPECIAL_BASE is a pointer to be used as the base of a fake address
38904 to access special slots in the Bounds Table. SPECIAL_BASE[-1],
38905 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
38907 static rtx
38908 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
38910 rtx addr = NULL;
38912 /* A NULL slot means we pass bounds for a pointer not passed to the
38913 function at all. A register slot means we pass the pointer in a
38914 register. In both these cases bounds are passed via the Bounds
38915 Table. Since we do not have the actual pointer stored in memory,
38916 we have to use fake addresses to access the Bounds Table. We
38917 start with (special_base - sizeof (void*)) and decrease this
38918 address by pointer size to get addresses for other slots. */
38919 if (!slot || REG_P (slot))
38921 gcc_assert (CONST_INT_P (slot_no));
38922 addr = plus_constant (Pmode, special_base,
38923 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
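/* Worked example of the computation above, assuming 64-bit pointers:
   slot_no 0 maps to (special_base - 8), slot_no 1 to (special_base - 16),
   and so on, i.e. addr = special_base - (slot_no + 1) * sizeof (void *).  */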
38925 /* If pointer is passed in a memory then its address is used to
38926 access Bounds Table. */
38927 else if (MEM_P (slot))
38929 addr = XEXP (slot, 0);
38930 if (!register_operand (addr, Pmode))
38931 addr = copy_addr_to_reg (addr);
38933 else
38934 gcc_unreachable ();
38936 return addr;
38939 /* Expand pass uses this hook to load bounds for function parameter
38940 PTR passed in SLOT in case its bounds are not passed in a register.
38942 If SLOT is a memory, then bounds are loaded as for a regular pointer
38943 loaded from memory. PTR may be NULL in case SLOT is a memory.
38944 In such a case the value of PTR (if required) may be loaded from SLOT.
38946 If SLOT is NULL or a register, then SLOT_NO is an integer constant
38947 holding the number of the target-dependent special slot which should be
38948 used to obtain bounds.
38950 Return loaded bounds. */
38952 static rtx
38953 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38955 rtx reg = gen_reg_rtx (BNDmode);
38956 rtx addr;
38958 /* Get the address to be used to access the Bounds Table. Special slots
38959 start at the location of the return address of the current function. */
38960 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38962 /* Load pointer value from a memory if we don't have it. */
38963 if (!ptr)
38965 gcc_assert (MEM_P (slot));
38966 ptr = copy_addr_to_reg (slot);
38969 if (!register_operand (ptr, Pmode))
38970 ptr = ix86_zero_extend_to_Pmode (ptr);
38972 emit_insn (BNDmode == BND64mode
38973 ? gen_bnd64_ldx (reg, addr, ptr)
38974 : gen_bnd32_ldx (reg, addr, ptr));
38976 return reg;
38979 /* Expand pass uses this hook to store BOUNDS for call argument PTR
38980 passed in SLOT in case BOUNDS are not passed in a register.
38982 If SLOT is a memory, then BOUNDS are stored as for a regular pointer
38983 stored in memory. PTR may be NULL in case SLOT is a memory.
38984 In such a case the value of PTR (if required) may be loaded from SLOT.
38986 If SLOT is NULL or a register, then SLOT_NO is an integer constant
38987 holding the number of the target-dependent special slot which should be
38988 used to store BOUNDS. */
38990 static void
38991 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38993 rtx addr;
38995 /* Get the address to be used to access the Bounds Table. Special slots
38996 start at the location of the return address of the called function. */
38997 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38999 /* Load pointer value from a memory if we don't have it. */
39000 if (!ptr)
39002 gcc_assert (MEM_P (slot));
39003 ptr = copy_addr_to_reg (slot);
39006 if (!register_operand (ptr, Pmode))
39007 ptr = ix86_zero_extend_to_Pmode (ptr);
39009 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
39010 if (!register_operand (bounds, BNDmode))
39011 bounds = copy_to_mode_reg (BNDmode, bounds);
39013 emit_insn (BNDmode == BND64mode
39014 ? gen_bnd64_stx (addr, ptr, bounds)
39015 : gen_bnd32_stx (addr, ptr, bounds));
39018 /* Load and return bounds returned by function in SLOT. */
39020 static rtx
39021 ix86_load_returned_bounds (rtx slot)
39023 rtx res;
39025 gcc_assert (REG_P (slot));
39026 res = gen_reg_rtx (BNDmode);
39027 emit_move_insn (res, slot);
39029 return res;
39032 /* Store BOUNDS returned by function into SLOT. */
39034 static void
39035 ix86_store_returned_bounds (rtx slot, rtx bounds)
39037 gcc_assert (REG_P (slot));
39038 emit_move_insn (slot, bounds);
39041 /* Returns a function decl for a vectorized version of the combined function
39042 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
39043 if it is not available. */
39045 static tree
39046 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
39047 tree type_in)
39049 machine_mode in_mode, out_mode;
39050 int in_n, out_n;
39052 if (TREE_CODE (type_out) != VECTOR_TYPE
39053 || TREE_CODE (type_in) != VECTOR_TYPE)
39054 return NULL_TREE;
39056 out_mode = TYPE_MODE (TREE_TYPE (type_out));
39057 out_n = TYPE_VECTOR_SUBPARTS (type_out);
39058 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39059 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39061 switch (fn)
39063 CASE_CFN_EXP2:
39064 if (out_mode == SFmode && in_mode == SFmode)
39066 if (out_n == 16 && in_n == 16)
39067 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
39069 break;
39071 CASE_CFN_IFLOOR:
39072 CASE_CFN_LFLOOR:
39073 CASE_CFN_LLFLOOR:
39074 /* The round insn does not trap on denormals. */
39075 if (flag_trapping_math || !TARGET_ROUND)
39076 break;
39078 if (out_mode == SImode && in_mode == DFmode)
39080 if (out_n == 4 && in_n == 2)
39081 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
39082 else if (out_n == 8 && in_n == 4)
39083 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
39084 else if (out_n == 16 && in_n == 8)
39085 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
39087 if (out_mode == SImode && in_mode == SFmode)
39089 if (out_n == 4 && in_n == 4)
39090 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
39091 else if (out_n == 8 && in_n == 8)
39092 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
39093 else if (out_n == 16 && in_n == 16)
39094 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
39096 break;
39098 CASE_CFN_ICEIL:
39099 CASE_CFN_LCEIL:
39100 CASE_CFN_LLCEIL:
39101 /* The round insn does not trap on denormals. */
39102 if (flag_trapping_math || !TARGET_ROUND)
39103 break;
39105 if (out_mode == SImode && in_mode == DFmode)
39107 if (out_n == 4 && in_n == 2)
39108 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
39109 else if (out_n == 8 && in_n == 4)
39110 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
39111 else if (out_n == 16 && in_n == 8)
39112 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
39114 if (out_mode == SImode && in_mode == SFmode)
39116 if (out_n == 4 && in_n == 4)
39117 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
39118 else if (out_n == 8 && in_n == 8)
39119 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
39120 else if (out_n == 16 && in_n == 16)
39121 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
39123 break;
39125 CASE_CFN_IRINT:
39126 CASE_CFN_LRINT:
39127 CASE_CFN_LLRINT:
39128 if (out_mode == SImode && in_mode == DFmode)
39130 if (out_n == 4 && in_n == 2)
39131 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
39132 else if (out_n == 8 && in_n == 4)
39133 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
39134 else if (out_n == 16 && in_n == 8)
39135 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
39137 if (out_mode == SImode && in_mode == SFmode)
39139 if (out_n == 4 && in_n == 4)
39140 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
39141 else if (out_n == 8 && in_n == 8)
39142 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
39143 else if (out_n == 16 && in_n == 16)
39144 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
39146 break;
39148 CASE_CFN_IROUND:
39149 CASE_CFN_LROUND:
39150 CASE_CFN_LLROUND:
39151 /* The round insn does not trap on denormals. */
39152 if (flag_trapping_math || !TARGET_ROUND)
39153 break;
39155 if (out_mode == SImode && in_mode == DFmode)
39157 if (out_n == 4 && in_n == 2)
39158 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
39159 else if (out_n == 8 && in_n == 4)
39160 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
39161 else if (out_n == 16 && in_n == 8)
39162 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
39164 if (out_mode == SImode && in_mode == SFmode)
39166 if (out_n == 4 && in_n == 4)
39167 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
39168 else if (out_n == 8 && in_n == 8)
39169 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
39170 else if (out_n == 16 && in_n == 16)
39171 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
39173 break;
39175 CASE_CFN_FLOOR:
39176 /* The round insn does not trap on denormals. */
39177 if (flag_trapping_math || !TARGET_ROUND)
39178 break;
39180 if (out_mode == DFmode && in_mode == DFmode)
39182 if (out_n == 2 && in_n == 2)
39183 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
39184 else if (out_n == 4 && in_n == 4)
39185 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
39186 else if (out_n == 8 && in_n == 8)
39187 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
39189 if (out_mode == SFmode && in_mode == SFmode)
39191 if (out_n == 4 && in_n == 4)
39192 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
39193 else if (out_n == 8 && in_n == 8)
39194 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
39195 else if (out_n == 16 && in_n == 16)
39196 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
39198 break;
39200 CASE_CFN_CEIL:
39201 /* The round insn does not trap on denormals. */
39202 if (flag_trapping_math || !TARGET_ROUND)
39203 break;
39205 if (out_mode == DFmode && in_mode == DFmode)
39207 if (out_n == 2 && in_n == 2)
39208 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
39209 else if (out_n == 4 && in_n == 4)
39210 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
39211 else if (out_n == 8 && in_n == 8)
39212 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
39214 if (out_mode == SFmode && in_mode == SFmode)
39216 if (out_n == 4 && in_n == 4)
39217 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
39218 else if (out_n == 8 && in_n == 8)
39219 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
39220 else if (out_n == 16 && in_n == 16)
39221 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
39223 break;
39225 CASE_CFN_TRUNC:
39226 /* The round insn does not trap on denormals. */
39227 if (flag_trapping_math || !TARGET_ROUND)
39228 break;
39230 if (out_mode == DFmode && in_mode == DFmode)
39232 if (out_n == 2 && in_n == 2)
39233 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
39234 else if (out_n == 4 && in_n == 4)
39235 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
39236 else if (out_n == 8 && in_n == 8)
39237 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
39239 if (out_mode == SFmode && in_mode == SFmode)
39241 if (out_n == 4 && in_n == 4)
39242 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
39243 else if (out_n == 8 && in_n == 8)
39244 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
39245 else if (out_n == 16 && in_n == 16)
39246 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
39248 break;
39250 CASE_CFN_RINT:
39251 /* The round insn does not trap on denormals. */
39252 if (flag_trapping_math || !TARGET_ROUND)
39253 break;
39255 if (out_mode == DFmode && in_mode == DFmode)
39257 if (out_n == 2 && in_n == 2)
39258 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
39259 else if (out_n == 4 && in_n == 4)
39260 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
39262 if (out_mode == SFmode && in_mode == SFmode)
39264 if (out_n == 4 && in_n == 4)
39265 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
39266 else if (out_n == 8 && in_n == 8)
39267 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
39269 break;
39271 CASE_CFN_FMA:
39272 if (out_mode == DFmode && in_mode == DFmode)
39274 if (out_n == 2 && in_n == 2)
39275 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
39276 if (out_n == 4 && in_n == 4)
39277 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
39279 if (out_mode == SFmode && in_mode == SFmode)
39281 if (out_n == 4 && in_n == 4)
39282 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
39283 if (out_n == 8 && in_n == 8)
39284 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
39286 break;
39288 default:
39289 break;
39292 /* Dispatch to a handler for a vectorization library. */
39293 if (ix86_veclib_handler)
39294 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
39296 return NULL_TREE;
39299 /* Handler for an SVML-style interface to
39300 a library with vectorized intrinsics. */
39302 static tree
39303 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
39305 char name[20];
39306 tree fntype, new_fndecl, args;
39307 unsigned arity;
39308 const char *bname;
39309 machine_mode el_mode, in_mode;
39310 int n, in_n;
39312 /* The SVML is suitable for unsafe math only. */
39313 if (!flag_unsafe_math_optimizations)
39314 return NULL_TREE;
39316 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39317 n = TYPE_VECTOR_SUBPARTS (type_out);
39318 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39319 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39320 if (el_mode != in_mode
39321 || n != in_n)
39322 return NULL_TREE;
39324 switch (fn)
39326 CASE_CFN_EXP:
39327 CASE_CFN_LOG:
39328 CASE_CFN_LOG10:
39329 CASE_CFN_POW:
39330 CASE_CFN_TANH:
39331 CASE_CFN_TAN:
39332 CASE_CFN_ATAN:
39333 CASE_CFN_ATAN2:
39334 CASE_CFN_ATANH:
39335 CASE_CFN_CBRT:
39336 CASE_CFN_SINH:
39337 CASE_CFN_SIN:
39338 CASE_CFN_ASINH:
39339 CASE_CFN_ASIN:
39340 CASE_CFN_COSH:
39341 CASE_CFN_COS:
39342 CASE_CFN_ACOSH:
39343 CASE_CFN_ACOS:
39344 if ((el_mode != DFmode || n != 2)
39345 && (el_mode != SFmode || n != 4))
39346 return NULL_TREE;
39347 break;
39349 default:
39350 return NULL_TREE;
39353 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39354 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39356 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
39357 strcpy (name, "vmlsLn4");
39358 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
39359 strcpy (name, "vmldLn2");
39360 else if (n == 4)
39362 sprintf (name, "vmls%s", bname+10);
39363 name[strlen (name)-1] = '4';
39365 else
39366 sprintf (name, "vmld%s2", bname+10);
39368 /* Convert to uppercase. */
39369 name[4] &= ~0x20;
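/* Worked example of the mangling above (derived from the code; kept as
   a sketch): for a 4-wide SFmode sinf the base name "__builtin_sinf"
   yields "vmlssinf" -> "vmlssin4" -> "vmlsSin4", while a 2-wide DFmode
   sin yields "vmldSin2".  The log cases are special-cased to
   "vmlsLn4" / "vmldLn2" above.  */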
39371 arity = 0;
39372 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39373 arity++;
39375 if (arity == 1)
39376 fntype = build_function_type_list (type_out, type_in, NULL);
39377 else
39378 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39380 /* Build a function declaration for the vectorized function. */
39381 new_fndecl = build_decl (BUILTINS_LOCATION,
39382 FUNCTION_DECL, get_identifier (name), fntype);
39383 TREE_PUBLIC (new_fndecl) = 1;
39384 DECL_EXTERNAL (new_fndecl) = 1;
39385 DECL_IS_NOVOPS (new_fndecl) = 1;
39386 TREE_READONLY (new_fndecl) = 1;
39388 return new_fndecl;
39391 /* Handler for an ACML-style interface to
39392 a library with vectorized intrinsics. */
39394 static tree
39395 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
39397 char name[20] = "__vr.._";
39398 tree fntype, new_fndecl, args;
39399 unsigned arity;
39400 const char *bname;
39401 machine_mode el_mode, in_mode;
39402 int n, in_n;
39404 /* The ACML is 64-bit only and suitable for unsafe math only as
39405 it does not correctly support parts of IEEE with the required
39406 precision such as denormals. */
39407 if (!TARGET_64BIT
39408 || !flag_unsafe_math_optimizations)
39409 return NULL_TREE;
39411 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39412 n = TYPE_VECTOR_SUBPARTS (type_out);
39413 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39414 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39415 if (el_mode != in_mode
39416 || n != in_n)
39417 return NULL_TREE;
39419 switch (fn)
39421 CASE_CFN_SIN:
39422 CASE_CFN_COS:
39423 CASE_CFN_EXP:
39424 CASE_CFN_LOG:
39425 CASE_CFN_LOG2:
39426 CASE_CFN_LOG10:
39427 if (el_mode == DFmode && n == 2)
39429 name[4] = 'd';
39430 name[5] = '2';
39432 else if (el_mode == SFmode && n == 4)
39434 name[4] = 's';
39435 name[5] = '4';
39437 else
39438 return NULL_TREE;
39439 break;
39441 default:
39442 return NULL_TREE;
39445 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39446 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39447 sprintf (name + 7, "%s", bname+10);
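/* For example (an illustrative sketch, assuming the usual "__builtin_"
   prefix in BNAME): __builtin_sin for V2DF becomes "__vrd2_sin", while
   __builtin_sinf for V4SF becomes "__vrs4_sinf".  */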
39449 arity = 0;
39450 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39451 arity++;
39453 if (arity == 1)
39454 fntype = build_function_type_list (type_out, type_in, NULL);
39455 else
39456 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39458 /* Build a function declaration for the vectorized function. */
39459 new_fndecl = build_decl (BUILTINS_LOCATION,
39460 FUNCTION_DECL, get_identifier (name), fntype);
39461 TREE_PUBLIC (new_fndecl) = 1;
39462 DECL_EXTERNAL (new_fndecl) = 1;
39463 DECL_IS_NOVOPS (new_fndecl) = 1;
39464 TREE_READONLY (new_fndecl) = 1;
39466 return new_fndecl;
39469 /* Returns a decl of a function that implements gather load with
39470 memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
39471 Return NULL_TREE if it is not available. */
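/* For instance (illustrative only), a V2DF gather indexed by a SImode
   vector maps to IX86_BUILTIN_GATHERSIV2DF, or to the GATHER3 variant
   when AVX512VL is enabled; see the switch below.  */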
39473 static tree
39474 ix86_vectorize_builtin_gather (const_tree mem_vectype,
39475 const_tree index_type, int scale)
39477 bool si;
39478 enum ix86_builtins code;
39480 if (! TARGET_AVX2)
39481 return NULL_TREE;
39483 if ((TREE_CODE (index_type) != INTEGER_TYPE
39484 && !POINTER_TYPE_P (index_type))
39485 || (TYPE_MODE (index_type) != SImode
39486 && TYPE_MODE (index_type) != DImode))
39487 return NULL_TREE;
39489 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39490 return NULL_TREE;
39492 /* v*gather* insn sign extends index to pointer mode. */
39493 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39494 && TYPE_UNSIGNED (index_type))
39495 return NULL_TREE;
39497 if (scale <= 0
39498 || scale > 8
39499 || (scale & (scale - 1)) != 0)
39500 return NULL_TREE;
39502 si = TYPE_MODE (index_type) == SImode;
39503 switch (TYPE_MODE (mem_vectype))
39505 case V2DFmode:
39506 if (TARGET_AVX512VL)
39507 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
39508 else
39509 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
39510 break;
39511 case V4DFmode:
39512 if (TARGET_AVX512VL)
39513 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
39514 else
39515 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
39516 break;
39517 case V2DImode:
39518 if (TARGET_AVX512VL)
39519 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
39520 else
39521 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
39522 break;
39523 case V4DImode:
39524 if (TARGET_AVX512VL)
39525 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
39526 else
39527 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
39528 break;
39529 case V4SFmode:
39530 if (TARGET_AVX512VL)
39531 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
39532 else
39533 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
39534 break;
39535 case V8SFmode:
39536 if (TARGET_AVX512VL)
39537 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
39538 else
39539 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
39540 break;
39541 case V4SImode:
39542 if (TARGET_AVX512VL)
39543 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
39544 else
39545 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
39546 break;
39547 case V8SImode:
39548 if (TARGET_AVX512VL)
39549 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
39550 else
39551 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
39552 break;
39553 case V8DFmode:
39554 if (TARGET_AVX512F)
39555 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
39556 else
39557 return NULL_TREE;
39558 break;
39559 case V8DImode:
39560 if (TARGET_AVX512F)
39561 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
39562 else
39563 return NULL_TREE;
39564 break;
39565 case V16SFmode:
39566 if (TARGET_AVX512F)
39567 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
39568 else
39569 return NULL_TREE;
39570 break;
39571 case V16SImode:
39572 if (TARGET_AVX512F)
39573 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
39574 else
39575 return NULL_TREE;
39576 break;
39577 default:
39578 return NULL_TREE;
39581 return ix86_get_builtin (code);
39584 /* Returns a decl of a function that implements scatter store with
39585 register type VECTYPE and index type INDEX_TYPE and SCALE.
39586 Return NULL_TREE if it is not available. */
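/* For instance (illustrative only), a V16SF scatter indexed by a SImode
   vector maps to IX86_BUILTIN_SCATTERSIV16SF; see the switch below.  */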
39588 static tree
39589 ix86_vectorize_builtin_scatter (const_tree vectype,
39590 const_tree index_type, int scale)
39592 bool si;
39593 enum ix86_builtins code;
39595 if (!TARGET_AVX512F)
39596 return NULL_TREE;
39598 if ((TREE_CODE (index_type) != INTEGER_TYPE
39599 && !POINTER_TYPE_P (index_type))
39600 || (TYPE_MODE (index_type) != SImode
39601 && TYPE_MODE (index_type) != DImode))
39602 return NULL_TREE;
39604 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39605 return NULL_TREE;
39607 /* v*scatter* insn sign extends index to pointer mode. */
39608 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39609 && TYPE_UNSIGNED (index_type))
39610 return NULL_TREE;
39612 /* Scale can be 1, 2, 4 or 8. */
39613 if (scale <= 0
39614 || scale > 8
39615 || (scale & (scale - 1)) != 0)
39616 return NULL_TREE;
39618 si = TYPE_MODE (index_type) == SImode;
39619 switch (TYPE_MODE (vectype))
39621 case V8DFmode:
39622 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
39623 break;
39624 case V8DImode:
39625 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
39626 break;
39627 case V16SFmode:
39628 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
39629 break;
39630 case V16SImode:
39631 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
39632 break;
39633 default:
39634 return NULL_TREE;
39637 return ix86_builtins[code];
39640 /* Return true if it is safe to use the rsqrt optabs to optimize
39641 1.0/sqrt. */
39643 static bool
39644 use_rsqrt_p ()
39646 return (TARGET_SSE_MATH
39647 && flag_finite_math_only
39648 && !flag_trapping_math
39649 && flag_unsafe_math_optimizations);
39652 /* Returns a decl for a target-specific builtin that implements the
39653 reciprocal of the function, or NULL_TREE if not available. */
39655 static tree
39656 ix86_builtin_reciprocal (tree fndecl)
39658 switch (DECL_FUNCTION_CODE (fndecl))
39660 /* Vectorized version of sqrt to rsqrt conversion. */
39661 case IX86_BUILTIN_SQRTPS_NR:
39662 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
39664 case IX86_BUILTIN_SQRTPS_NR256:
39665 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
39667 default:
39668 return NULL_TREE;
39672 /* Helper for avx_vpermilps256_operand et al. This is also used by
39673 the expansion functions to turn the parallel back into a mask.
39674 The return value is 0 for no match and the imm8+1 for a match. */
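/* A worked example (illustrative only): for V4SFmode the parallel
   (parallel [1 0 3 2]) packs two bits per element, giving
   1 | (0 << 2) | (3 << 4) | (2 << 6) == 0xb1, so the function
   returns 0xb2, i.e. the vpermilps imm8 0xb1 plus one.  */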
39677 avx_vpermilp_parallel (rtx par, machine_mode mode)
39679 unsigned i, nelt = GET_MODE_NUNITS (mode);
39680 unsigned mask = 0;
39681 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
39683 if (XVECLEN (par, 0) != (int) nelt)
39684 return 0;
39686 /* Validate that all of the elements are constants, and not totally
39687 out of range. Copy the data into an integral array to make the
39688 subsequent checks easier. */
39689 for (i = 0; i < nelt; ++i)
39691 rtx er = XVECEXP (par, 0, i);
39692 unsigned HOST_WIDE_INT ei;
39694 if (!CONST_INT_P (er))
39695 return 0;
39696 ei = INTVAL (er);
39697 if (ei >= nelt)
39698 return 0;
39699 ipar[i] = ei;
39702 switch (mode)
39704 case V8DFmode:
39705 /* In the 512-bit DFmode case, we can only move elements within
39706 a 128-bit lane. First fill the second part of the mask,
39707 then fallthru. */
39708 for (i = 4; i < 6; ++i)
39710 if (ipar[i] < 4 || ipar[i] >= 6)
39711 return 0;
39712 mask |= (ipar[i] - 4) << i;
39714 for (i = 6; i < 8; ++i)
39716 if (ipar[i] < 6)
39717 return 0;
39718 mask |= (ipar[i] - 6) << i;
39720 /* FALLTHRU */
39722 case V4DFmode:
39723 /* In the 256-bit DFmode case, we can only move elements within
39724 a 128-bit lane. */
39725 for (i = 0; i < 2; ++i)
39727 if (ipar[i] >= 2)
39728 return 0;
39729 mask |= ipar[i] << i;
39731 for (i = 2; i < 4; ++i)
39733 if (ipar[i] < 2)
39734 return 0;
39735 mask |= (ipar[i] - 2) << i;
39737 break;
39739 case V16SFmode:
39740 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
39741 must mirror the permutation in the lower 256 bits. */
39742 for (i = 0; i < 8; ++i)
39743 if (ipar[i] + 8 != ipar[i + 8])
39744 return 0;
39745 /* FALLTHRU */
39747 case V8SFmode:
39748 /* In the 256-bit SFmode case, we have full freedom of
39749 movement within the low 128-bit lane, but the high 128-bit
39750 lane must mirror the exact same pattern. */
39751 for (i = 0; i < 4; ++i)
39752 if (ipar[i] + 4 != ipar[i + 4])
39753 return 0;
39754 nelt = 4;
39755 /* FALLTHRU */
39757 case V2DFmode:
39758 case V4SFmode:
39759 /* In the 128-bit case, we have full freedom in the placement of
39760 the elements from the source operand. */
39761 for (i = 0; i < nelt; ++i)
39762 mask |= ipar[i] << (i * (nelt / 2));
39763 break;
39765 default:
39766 gcc_unreachable ();
39769 /* Make sure success has a non-zero value by adding one. */
39770 return mask + 1;
39773 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
39774 the expansion functions to turn the parallel back into a mask.
39775 The return value is 0 for no match and the imm8+1 for a match. */
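/* A worked example (illustrative only): for V8SFmode the parallel
   (parallel [4 5 6 7 0 1 2 3]) selects the high 128-bit lane followed
   by the low one; each half starts at a multiple of nelt2, giving
   mask (4/4) | ((0/4) << 4) == 0x01, so the function returns 0x02,
   i.e. the vperm2f128 imm8 0x01 plus one.  */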
39778 avx_vperm2f128_parallel (rtx par, machine_mode mode)
39780 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
39781 unsigned mask = 0;
39782 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
39784 if (XVECLEN (par, 0) != (int) nelt)
39785 return 0;
39787 /* Validate that all of the elements are constants, and not totally
39788 out of range. Copy the data into an integral array to make the
39789 subsequent checks easier. */
39790 for (i = 0; i < nelt; ++i)
39792 rtx er = XVECEXP (par, 0, i);
39793 unsigned HOST_WIDE_INT ei;
39795 if (!CONST_INT_P (er))
39796 return 0;
39797 ei = INTVAL (er);
39798 if (ei >= 2 * nelt)
39799 return 0;
39800 ipar[i] = ei;
39803 /* Validate that each half of the permute selects consecutive elements. */
39804 for (i = 0; i < nelt2 - 1; ++i)
39805 if (ipar[i] + 1 != ipar[i + 1])
39806 return 0;
39807 for (i = nelt2; i < nelt - 1; ++i)
39808 if (ipar[i] + 1 != ipar[i + 1])
39809 return 0;
39811 /* Reconstruct the mask. */
39812 for (i = 0; i < 2; ++i)
39814 unsigned e = ipar[i * nelt2];
39815 if (e % nelt2)
39816 return 0;
39817 e /= nelt2;
39818 mask |= e << (i * 4);
39821 /* Make sure success has a non-zero value by adding one. */
39822 return mask + 1;
39825 /* Return a register priority for hard reg REGNO. */
39826 static int
39827 ix86_register_priority (int hard_regno)
39829 /* ebp and r13 as the base always want a displacement, r12 as the
39830 base always wants an index. So discourage their usage in an
39831 address. */
39832 if (hard_regno == R12_REG || hard_regno == R13_REG)
39833 return 0;
39834 if (hard_regno == BP_REG)
39835 return 1;
39836 /* New x86-64 int registers result in bigger code size. Discourage
39837 them. */
39838 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
39839 return 2;
39840 /* New x86-64 SSE registers result in bigger code size. Discourage
39841 them. */
39842 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
39843 return 2;
39844 /* Usage of the AX register results in smaller code. Prefer it. */
39845 if (hard_regno == AX_REG)
39846 return 4;
39847 return 3;
39850 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
39852 Put float CONST_DOUBLE in the constant pool instead of fp regs.
39853 QImode must go into class Q_REGS.
39854 Narrow ALL_REGS to GENERAL_REGS. This allows movsf and
39855 movdf to do mem-to-mem moves through integer regs. */
39857 static reg_class_t
39858 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
39860 machine_mode mode = GET_MODE (x);
39862 /* We're only allowed to return a subclass of CLASS. Many of the
39863 following checks fail for NO_REGS, so eliminate that early. */
39864 if (regclass == NO_REGS)
39865 return NO_REGS;
39867 /* All classes can load zeros. */
39868 if (x == CONST0_RTX (mode))
39869 return regclass;
39871 /* Force constants into memory if we are loading a (nonzero) constant into
39872 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
39873 instructions to load from a constant. */
39874 if (CONSTANT_P (x)
39875 && (MAYBE_MMX_CLASS_P (regclass)
39876 || MAYBE_SSE_CLASS_P (regclass)
39877 || MAYBE_MASK_CLASS_P (regclass)))
39878 return NO_REGS;
39880 /* Floating-point constants need more complex checks. */
39881 if (CONST_DOUBLE_P (x))
39883 /* General regs can load everything. */
39884 if (INTEGER_CLASS_P (regclass))
39885 return regclass;
39887 /* Floats can load 0 and 1 plus some others. Note that we eliminated
39888 zero above. We only want to wind up preferring 80387 registers if
39889 we plan on doing computation with them. */
39890 if (IS_STACK_MODE (mode)
39891 && standard_80387_constant_p (x) > 0)
39893 /* Limit class to FP regs. */
39894 if (FLOAT_CLASS_P (regclass))
39895 return FLOAT_REGS;
39896 else if (regclass == FP_TOP_SSE_REGS)
39897 return FP_TOP_REG;
39898 else if (regclass == FP_SECOND_SSE_REGS)
39899 return FP_SECOND_REG;
39902 return NO_REGS;
39905 /* Prefer SSE regs only, if we can use them for math. */
39906 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39907 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
39909 /* Generally when we see PLUS here, it's the function invariant
39910 (plus soft-fp const_int), which can only be computed into general
39911 regs. */
39912 if (GET_CODE (x) == PLUS)
39913 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
39915 /* QImode constants are easy to load, but non-constant QImode data
39916 must go into Q_REGS. */
39917 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39919 if (Q_CLASS_P (regclass))
39920 return regclass;
39921 else if (reg_class_subset_p (Q_REGS, regclass))
39922 return Q_REGS;
39923 else
39924 return NO_REGS;
39927 return regclass;
39930 /* Discourage putting floating-point values in SSE registers unless
39931 SSE math is being used, and likewise for the 387 registers. */
39932 static reg_class_t
39933 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39935 machine_mode mode = GET_MODE (x);
39937 /* Restrict the output reload class to the register bank that we are doing
39938 math on. If we would like not to return a subset of CLASS, reject this
39939 alternative: if reload cannot do this, it will still use its choice. */
39940 mode = GET_MODE (x);
39941 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39942 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39944 if (IS_STACK_MODE (mode))
39946 if (regclass == FP_TOP_SSE_REGS)
39947 return FP_TOP_REG;
39948 else if (regclass == FP_SECOND_SSE_REGS)
39949 return FP_SECOND_REG;
39950 else
39951 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39954 return regclass;
39957 static reg_class_t
39958 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39959 machine_mode mode, secondary_reload_info *sri)
39961 /* Double-word spills from general registers to non-offsettable memory
39962 references (zero-extended addresses) require special handling. */
39963 if (TARGET_64BIT
39964 && MEM_P (x)
39965 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39966 && INTEGER_CLASS_P (rclass)
39967 && !offsettable_memref_p (x))
39969 sri->icode = (in_p
39970 ? CODE_FOR_reload_noff_load
39971 : CODE_FOR_reload_noff_store);
39972 /* Add the cost of moving address to a temporary. */
39973 sri->extra_cost = 1;
39975 return NO_REGS;
39978 /* QImode spills from non-QI registers require
39979 an intermediate register on 32-bit targets. */
39980 if (mode == QImode
39981 && ((!TARGET_64BIT && !in_p
39982 && INTEGER_CLASS_P (rclass)
39983 && MAYBE_NON_Q_CLASS_P (rclass))
39984 || (!TARGET_AVX512DQ
39985 && MAYBE_MASK_CLASS_P (rclass))))
39987 int regno = true_regnum (x);
39989 /* Return Q_REGS if the operand is in memory. */
39990 if (regno == -1)
39991 return Q_REGS;
39993 return NO_REGS;
39996 /* This condition handles the corner case where an expression involving
39997 pointers gets vectorized. We're trying to use the address of a
39998 stack slot as a vector initializer.
40000 (set (reg:V2DI 74 [ vect_cst_.2 ])
40001 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
40003 Eventually frame gets turned into sp+offset like this:
40005 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40006 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40007 (const_int 392 [0x188]))))
40009 That later gets turned into:
40011 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40012 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40013 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
40015 We'll have the following reload recorded:
40017 Reload 0: reload_in (DI) =
40018 (plus:DI (reg/f:DI 7 sp)
40019 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
40020 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40021 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
40022 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
40023 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40024 reload_reg_rtx: (reg:V2DI 22 xmm1)
40026 That isn't going to work, since SSE instructions can't handle scalar
40027 additions. Returning GENERAL_REGS forces the addition into an integer
40028 register, and reload can handle subsequent reloads without problems. */
40030 if (in_p && GET_CODE (x) == PLUS
40031 && SSE_CLASS_P (rclass)
40032 && SCALAR_INT_MODE_P (mode))
40033 return GENERAL_REGS;
40035 return NO_REGS;
40038 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
40040 static bool
40041 ix86_class_likely_spilled_p (reg_class_t rclass)
40043 switch (rclass)
40045 case AREG:
40046 case DREG:
40047 case CREG:
40048 case BREG:
40049 case AD_REGS:
40050 case SIREG:
40051 case DIREG:
40052 case SSE_FIRST_REG:
40053 case FP_TOP_REG:
40054 case FP_SECOND_REG:
40055 case BND_REGS:
40056 return true;
40058 default:
40059 break;
40062 return false;
40065 /* If we are copying between registers from different register sets
40066 (e.g. FP and integer), we may need a memory location.
40068 The function can't work reliably when one of the CLASSES is a class
40069 containing registers from multiple sets. We avoid this by never combining
40070 different sets in a single alternative in the machine description.
40071 Ensure that this constraint holds to avoid unexpected surprises.
40073 When STRICT is false, we are being called from REGISTER_MOVE_COST,
40074 so do not enforce these sanity checks.
40076 To optimize register_move_cost performance, define inline variant. */
40078 static inline bool
40079 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40080 machine_mode mode, int strict)
40082 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
40083 return false;
40085 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
40086 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
40087 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
40088 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
40089 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
40090 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
40091 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
40092 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
40094 gcc_assert (!strict || lra_in_progress);
40095 return true;
40098 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
40099 return true;
40101 /* Between mask and general, we have moves no larger than word size. */
40102 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
40103 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
40104 return true;
40106 /* ??? This is a lie. We do have moves between mmx/general, and between
40107 mmx/sse2. But by saying we need secondary memory we discourage the
40108 register allocator from using the mmx registers unless needed. */
40109 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
40110 return true;
40112 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40114 /* SSE1 doesn't have any direct moves from other classes. */
40115 if (!TARGET_SSE2)
40116 return true;
40118 /* If the target says that inter-unit moves are more expensive
40119 than moving through memory, then don't generate them. */
40120 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
40121 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
40122 return true;
40124 /* Between SSE and general, we have moves no larger than word size. */
40125 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40126 return true;
40129 return false;
40132 bool
40133 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40134 machine_mode mode, int strict)
40136 return inline_secondary_memory_needed (class1, class2, mode, strict);
40139 /* Implement the TARGET_CLASS_MAX_NREGS hook.
40141 On the 80386, this is the size of MODE in words,
40142 except in the FP regs, where a single reg is always enough. */
40144 static unsigned char
40145 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
40147 if (MAYBE_INTEGER_CLASS_P (rclass))
40149 if (mode == XFmode)
40150 return (TARGET_64BIT ? 2 : 3);
40151 else if (mode == XCmode)
40152 return (TARGET_64BIT ? 4 : 6);
40153 else
40154 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40156 else
40158 if (COMPLEX_MODE_P (mode))
40159 return 2;
40160 else
40161 return 1;
40165 /* Return true if the registers in CLASS cannot represent the change from
40166 modes FROM to TO. */
40168 bool
40169 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
40170 enum reg_class regclass)
40172 if (from == to)
40173 return false;
40175 /* x87 registers can't do subreg at all, as all values are reformatted
40176 to extended precision. */
40177 if (MAYBE_FLOAT_CLASS_P (regclass))
40178 return true;
40180 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
40182 /* Vector registers do not support QI or HImode loads. If we don't
40183 disallow a change to these modes, reload will assume it's ok to
40184 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
40185 the vec_dupv4hi pattern. */
40186 if (GET_MODE_SIZE (from) < 4)
40187 return true;
40190 return false;
40193 /* Return the cost of moving data of mode M between a
40194 register and memory. A value of 2 is the default; this cost is
40195 relative to those in `REGISTER_MOVE_COST'.
40197 This function is used extensively by register_move_cost, which is used to
40198 build tables at startup. Make it inline in this case.
40199 When IN is 2, return the maximum of the in and out move costs.
40201 If moving between registers and memory is more expensive than
40202 between two registers, you should define this macro to express the
40203 relative cost.
40205 Also model the increased cost of moving QImode registers in
40206 non-Q_REGS classes.
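   For example (illustrative), with IN == 2 and an SFmode value in
   FLOAT_REGS this returns MAX (fp_load[0], fp_store[0]) from the
   active cost table.  */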
40208 static inline int
40209 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
40210 int in)
40212 int cost;
40213 if (FLOAT_CLASS_P (regclass))
40215 int index;
40216 switch (mode)
40218 case SFmode:
40219 index = 0;
40220 break;
40221 case DFmode:
40222 index = 1;
40223 break;
40224 case XFmode:
40225 index = 2;
40226 break;
40227 default:
40228 return 100;
40230 if (in == 2)
40231 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
40232 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
40234 if (SSE_CLASS_P (regclass))
40236 int index;
40237 switch (GET_MODE_SIZE (mode))
40239 case 4:
40240 index = 0;
40241 break;
40242 case 8:
40243 index = 1;
40244 break;
40245 case 16:
40246 index = 2;
40247 break;
40248 default:
40249 return 100;
40251 if (in == 2)
40252 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
40253 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
40255 if (MMX_CLASS_P (regclass))
40257 int index;
40258 switch (GET_MODE_SIZE (mode))
40260 case 4:
40261 index = 0;
40262 break;
40263 case 8:
40264 index = 1;
40265 break;
40266 default:
40267 return 100;
40269 if (in == 2)
40270 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
40271 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
40273 switch (GET_MODE_SIZE (mode))
40275 case 1:
40276 if (Q_CLASS_P (regclass) || TARGET_64BIT)
40278 if (!in)
40279 return ix86_cost->int_store[0];
40280 if (TARGET_PARTIAL_REG_DEPENDENCY
40281 && optimize_function_for_speed_p (cfun))
40282 cost = ix86_cost->movzbl_load;
40283 else
40284 cost = ix86_cost->int_load[0];
40285 if (in == 2)
40286 return MAX (cost, ix86_cost->int_store[0]);
40287 return cost;
40289 else
40291 if (in == 2)
40292 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
40293 if (in)
40294 return ix86_cost->movzbl_load;
40295 else
40296 return ix86_cost->int_store[0] + 4;
40298 break;
40299 case 2:
40300 if (in == 2)
40301 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
40302 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
40303 default:
40304 /* Compute the number of 32-bit moves needed. TFmode is moved as XFmode. */
40305 if (mode == TFmode)
40306 mode = XFmode;
40307 if (in == 2)
40308 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
40309 else if (in)
40310 cost = ix86_cost->int_load[2];
40311 else
40312 cost = ix86_cost->int_store[2];
40313 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
40317 static int
40318 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
40319 bool in)
40321 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
40325 /* Return the cost of moving data from a register in class CLASS1 to
40326 one in class CLASS2.
40328 It is not required that the cost always equal 2 when FROM is the same as TO;
40329 on some machines it is expensive to move between registers if they are not
40330 general registers. */
40332 static int
40333 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
40334 reg_class_t class2_i)
40336 enum reg_class class1 = (enum reg_class) class1_i;
40337 enum reg_class class2 = (enum reg_class) class2_i;
40339 /* In case we require secondary memory, compute the cost of the store followed
40340 by load. In order to avoid bad register allocation choices, we need
40341 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
40343 if (inline_secondary_memory_needed (class1, class2, mode, 0))
40345 int cost = 1;
40347 cost += inline_memory_move_cost (mode, class1, 2);
40348 cost += inline_memory_move_cost (mode, class2, 2);
40350 /* In case of copying from a general purpose register we may emit multiple
40351 stores followed by a single load, causing a memory size mismatch stall.
40352 Count this as an arbitrarily high cost of 20. */
40353 if (targetm.class_max_nregs (class1, mode)
40354 > targetm.class_max_nregs (class2, mode))
40355 cost += 20;
40357 /* In the case of FP/MMX moves, the registers actually overlap, and we
40358 have to switch modes in order to treat them differently. */
40359 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
40360 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
40361 cost += 20;
40363 return cost;
40366 /* Moves between SSE/MMX and integer unit are expensive. */
40367 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
40368 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40370 /* ??? By keeping the returned value relatively high, we limit the number
40371 of moves between integer and MMX/SSE registers for all targets.
40372 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
40373 where integer modes in MMX/SSE registers are not tieable
40374 because of missing QImode and HImode moves to, from or between
40375 MMX/SSE registers. */
40376 return MAX (8, ix86_cost->mmxsse_to_integer);
40378 if (MAYBE_FLOAT_CLASS_P (class1))
40379 return ix86_cost->fp_move;
40380 if (MAYBE_SSE_CLASS_P (class1))
40381 return ix86_cost->sse_move;
40382 if (MAYBE_MMX_CLASS_P (class1))
40383 return ix86_cost->mmx_move;
40384 return 2;
40387 /* Return TRUE if hard register REGNO can hold a value of machine-mode
40388 MODE. */
40390 bool
40391 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
40393 /* Flags, and only flags, can hold CCmode values. */
40394 if (CC_REGNO_P (regno))
40395 return GET_MODE_CLASS (mode) == MODE_CC;
40396 if (GET_MODE_CLASS (mode) == MODE_CC
40397 || GET_MODE_CLASS (mode) == MODE_RANDOM
40398 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
40399 return false;
40400 if (STACK_REGNO_P (regno))
40401 return VALID_FP_MODE_P (mode);
40402 if (MASK_REGNO_P (regno))
40403 return (VALID_MASK_REG_MODE (mode)
40404 || (TARGET_AVX512BW
40405 && VALID_MASK_AVX512BW_MODE (mode)));
40406 if (BND_REGNO_P (regno))
40407 return VALID_BND_REG_MODE (mode);
40408 if (SSE_REGNO_P (regno))
40410 /* We implement the move patterns for all vector modes into and
40411 out of SSE registers, even when no operation instructions
40412 are available. */
40414 /* For AVX-512 we allow, regardless of regno:
40415 - XI mode
40416 - any 512-bit wide vector mode
40417 - any scalar mode. */
40418 if (TARGET_AVX512F
40419 && (mode == XImode
40420 || VALID_AVX512F_REG_MODE (mode)
40421 || VALID_AVX512F_SCALAR_MODE (mode)))
40422 return true;
40424 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
40425 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40426 && MOD4_SSE_REGNO_P (regno)
40427 && mode == V64SFmode)
40428 return true;
40430 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
40431 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40432 && MOD4_SSE_REGNO_P (regno)
40433 && mode == V64SImode)
40434 return true;
40436 /* TODO check for QI/HI scalars. */
40437 /* AVX512VL allows SSE regs 16+ for 128/256-bit modes. */
40438 if (TARGET_AVX512VL
40439 && (mode == OImode
40440 || mode == TImode
40441 || VALID_AVX256_REG_MODE (mode)
40442 || VALID_AVX512VL_128_REG_MODE (mode)))
40443 return true;
40445 /* xmm16-xmm31 are only available for AVX-512. */
40446 if (EXT_REX_SSE_REGNO_P (regno))
40447 return false;
40449 /* OImode and AVX modes are available only when AVX is enabled. */
40450 return ((TARGET_AVX
40451 && VALID_AVX256_REG_OR_OI_MODE (mode))
40452 || VALID_SSE_REG_MODE (mode)
40453 || VALID_SSE2_REG_MODE (mode)
40454 || VALID_MMX_REG_MODE (mode)
40455 || VALID_MMX_REG_MODE_3DNOW (mode));
40457 if (MMX_REGNO_P (regno))
40459 /* We implement the move patterns for 3DNOW modes even in MMX mode,
40460 so if the register is available at all, then we can move data of
40461 the given mode into or out of it. */
40462 return (VALID_MMX_REG_MODE (mode)
40463 || VALID_MMX_REG_MODE_3DNOW (mode));
40466 if (mode == QImode)
40468 /* Take care with QImode values - they can be in non-QI regs,
40469 but then they do cause partial register stalls. */
40470 if (ANY_QI_REGNO_P (regno))
40471 return true;
40472 if (!TARGET_PARTIAL_REG_STALL)
40473 return true;
40474 /* LRA checks if the hard register is OK for the given mode.
40475 QImode values can live in non-QI regs, so we allow all
40476 registers here. */
40477 if (lra_in_progress)
40478 return true;
40479 return !can_create_pseudo_p ();
40481 /* We handle both integer and floats in the general purpose registers. */
40482 else if (VALID_INT_MODE_P (mode))
40483 return true;
40484 else if (VALID_FP_MODE_P (mode))
40485 return true;
40486 else if (VALID_DFP_MODE_P (mode))
40487 return true;
40488 /* Lots of MMX code casts 8-byte vector modes to DImode. If we then go
40489 on to use that value in smaller contexts, this can easily force a
40490 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
40491 supporting DImode, allow it. */
40492 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
40493 return true;
40495 return false;
40498 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
40499 tieable integer mode. */
40501 static bool
40502 ix86_tieable_integer_mode_p (machine_mode mode)
40504 switch (mode)
40506 case HImode:
40507 case SImode:
40508 return true;
40510 case QImode:
40511 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
40513 case DImode:
40514 return TARGET_64BIT;
40516 default:
40517 return false;
40521 /* Return true if MODE1 is accessible in a register that can hold MODE2
40522 without copying. That is, all register classes that can hold MODE2
40523 can also hold MODE1. */
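/* For instance (per ix86_tieable_integer_mode_p above), SImode and
   HImode are always tieable with each other, while DImode ties with
   them only in 64-bit mode.  */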
40525 bool
40526 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
40528 if (mode1 == mode2)
40529 return true;
40531 if (ix86_tieable_integer_mode_p (mode1)
40532 && ix86_tieable_integer_mode_p (mode2))
40533 return true;
40535 /* MODE2 being XFmode implies fp stack or general regs, which means we
40536 can tie any smaller floating point modes to it. Note that we do not
40537 tie this with TFmode. */
40538 if (mode2 == XFmode)
40539 return mode1 == SFmode || mode1 == DFmode;
40541 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
40542 that we can tie it with SFmode. */
40543 if (mode2 == DFmode)
40544 return mode1 == SFmode;
40546 /* If MODE2 is only appropriate for an SSE register, then tie with
40547 any other mode acceptable to SSE registers. */
40548 if (GET_MODE_SIZE (mode2) == 32
40549 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40550 return (GET_MODE_SIZE (mode1) == 32
40551 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40552 if (GET_MODE_SIZE (mode2) == 16
40553 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40554 return (GET_MODE_SIZE (mode1) == 16
40555 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40557 /* If MODE2 is appropriate for an MMX register, then tie
40558 with any other mode acceptable to MMX registers. */
40559 if (GET_MODE_SIZE (mode2) == 8
40560 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
40561 return (GET_MODE_SIZE (mode1) == 8
40562 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
40564 return false;
40567 /* Return the cost of moving between two registers of mode MODE. */
40569 static int
40570 ix86_set_reg_reg_cost (machine_mode mode)
40572 unsigned int units = UNITS_PER_WORD;
40574 switch (GET_MODE_CLASS (mode))
40576 default:
40577 break;
40579 case MODE_CC:
40580 units = GET_MODE_SIZE (CCmode);
40581 break;
40583 case MODE_FLOAT:
40584 if ((TARGET_SSE && mode == TFmode)
40585 || (TARGET_80387 && mode == XFmode)
40586 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
40587 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
40588 units = GET_MODE_SIZE (mode);
40589 break;
40591 case MODE_COMPLEX_FLOAT:
40592 if ((TARGET_SSE && mode == TCmode)
40593 || (TARGET_80387 && mode == XCmode)
40594 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
40595 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
40596 units = GET_MODE_SIZE (mode);
40597 break;
40599 case MODE_VECTOR_INT:
40600 case MODE_VECTOR_FLOAT:
40601 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
40602 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
40603 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
40604 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
40605 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
40606 units = GET_MODE_SIZE (mode);
40609 /* Return the cost of moving between two registers of mode MODE,
40610 assuming that the move will be in pieces of at most UNITS bytes. */
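/* For example (illustrative), a plain DImode copy on a 32-bit target
   has GET_MODE_SIZE == 8 and UNITS == UNITS_PER_WORD == 4, so this
   evaluates to COSTS_N_INSNS (2), i.e. two word-sized moves.  */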
40611 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
40614 /* Compute a (partial) cost for rtx X. Return true if the complete
40615 cost has been computed, and false if subexpressions should be
40616 scanned. In either case, *TOTAL contains the cost result. */
40618 static bool
40619 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
40620 int *total, bool speed)
40622 rtx mask;
40623 enum rtx_code code = GET_CODE (x);
40624 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
40625 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
40626 int src_cost;
40628 switch (code)
40630 case SET:
40631 if (register_operand (SET_DEST (x), VOIDmode)
40632 && reg_or_0_operand (SET_SRC (x), VOIDmode))
40634 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
40635 return true;
40638 if (register_operand (SET_SRC (x), VOIDmode))
40639 /* Avoid potentially incorrect high cost from rtx_costs
40640 for non-tieable SUBREGs. */
40641 src_cost = 0;
40642 else
40644 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
40646 if (CONSTANT_P (SET_SRC (x)))
40647 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
40648 a small value, possibly zero for cheap constants. */
40649 src_cost += COSTS_N_INSNS (1);
40652 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
40653 return true;
40655 case CONST_INT:
40656 case CONST:
40657 case LABEL_REF:
40658 case SYMBOL_REF:
40659 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
40660 *total = 3;
40661 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
40662 *total = 2;
40663 else if (flag_pic && SYMBOLIC_CONST (x)
40664 && !(TARGET_64BIT
40665 && (GET_CODE (x) == LABEL_REF
40666 || (GET_CODE (x) == SYMBOL_REF
40667 && SYMBOL_REF_LOCAL_P (x))))
40668 /* Use 0 cost for CONST to improve its propagation. */
40669 && (TARGET_64BIT || GET_CODE (x) != CONST))
40670 *total = 1;
40671 else
40672 *total = 0;
40673 return true;
40675 case CONST_DOUBLE:
40676 if (IS_STACK_MODE (mode))
40677 switch (standard_80387_constant_p (x))
40679 case -1:
40680 case 0:
40681 break;
40682 case 1: /* 0.0 */
40683 *total = 1;
40684 return true;
40685 default: /* Other constants */
40686 *total = 2;
40687 return true;
40689 /* FALLTHRU */
40691 case CONST_VECTOR:
40692 switch (standard_sse_constant_p (x, mode))
40694 case 0:
40695 break;
40696 case 1: /* 0: xor eliminates false dependency */
40697 *total = 0;
40698 return true;
40699 default: /* -1: cmp contains false dependency */
40700 *total = 1;
40701 return true;
40703 /* FALLTHRU */
40705 case CONST_WIDE_INT:
40706 /* Fall back to (MEM (SYMBOL_REF)), since that's where
40707 it'll probably end up. Add a penalty for size. */
40708 *total = (COSTS_N_INSNS (1)
40709 + (!TARGET_64BIT && flag_pic)
40710 + (GET_MODE_SIZE (mode) <= 4
40711 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
40712 return true;
40714 case ZERO_EXTEND:
40715 /* Zero extension is often completely free on x86_64, so make
40716 it as cheap as possible. */
40717 if (TARGET_64BIT && mode == DImode
40718 && GET_MODE (XEXP (x, 0)) == SImode)
40719 *total = 1;
40720 else if (TARGET_ZERO_EXTEND_WITH_AND)
40721 *total = cost->add;
40722 else
40723 *total = cost->movzx;
40724 return false;
40726 case SIGN_EXTEND:
40727 *total = cost->movsx;
40728 return false;
40730 case ASHIFT:
40731 if (SCALAR_INT_MODE_P (mode)
40732 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
40733 && CONST_INT_P (XEXP (x, 1)))
40735 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40736 if (value == 1)
40738 *total = cost->add;
40739 return false;
40741 if ((value == 2 || value == 3)
40742 && cost->lea <= cost->shift_const)
40744 *total = cost->lea;
40745 return false;
40748 /* FALLTHRU */
40750 case ROTATE:
40751 case ASHIFTRT:
40752 case LSHIFTRT:
40753 case ROTATERT:
40754 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40756 /* ??? Should be SSE vector operation cost. */
40757 /* At least for published AMD latencies, this really is the same
40758 as the latency for a simple fpu operation like fabs. */
40759 /* V*QImode is emulated with 1-11 insns. */
40760 if (mode == V16QImode || mode == V32QImode)
40762 int count = 11;
40763 if (TARGET_XOP && mode == V16QImode)
40765 /* For XOP we use vpshab, which requires a broadcast of the
40766 value to the variable shift insn. For constants this
40767 means a V16QImode constant in memory; even when we can perform the
40768 shift with one insn, set the cost to prefer paddb. */
40769 if (CONSTANT_P (XEXP (x, 1)))
40771 *total = (cost->fabs
40772 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
40773 + (speed ? 2 : COSTS_N_BYTES (16)));
40774 return true;
40776 count = 3;
40778 else if (TARGET_SSSE3)
40779 count = 7;
40780 *total = cost->fabs * count;
40782 else
40783 *total = cost->fabs;
40785 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40787 if (CONST_INT_P (XEXP (x, 1)))
40789 if (INTVAL (XEXP (x, 1)) > 32)
40790 *total = cost->shift_const + COSTS_N_INSNS (2);
40791 else
40792 *total = cost->shift_const * 2;
40794 else
40796 if (GET_CODE (XEXP (x, 1)) == AND)
40797 *total = cost->shift_var * 2;
40798 else
40799 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
40802 else
40804 if (CONST_INT_P (XEXP (x, 1)))
40805 *total = cost->shift_const;
40806 else if (SUBREG_P (XEXP (x, 1))
40807 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
40809 /* Return the cost after shift-and truncation. */
40810 *total = cost->shift_var;
40811 return true;
40813 else
40814 *total = cost->shift_var;
40816 return false;
40818 case FMA:
40820 rtx sub;
40822 gcc_assert (FLOAT_MODE_P (mode));
40823 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
40825 /* ??? SSE scalar/vector cost should be used here. */
40826 /* ??? Bald assumption that fma has the same cost as fmul. */
40827 *total = cost->fmul;
40828 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
40830 /* A negation in op0 or op2 is free: FMS, FNMA, FNMS. */
40831 sub = XEXP (x, 0);
40832 if (GET_CODE (sub) == NEG)
40833 sub = XEXP (sub, 0);
40834 *total += rtx_cost (sub, mode, FMA, 0, speed);
40836 sub = XEXP (x, 2);
40837 if (GET_CODE (sub) == NEG)
40838 sub = XEXP (sub, 0);
40839 *total += rtx_cost (sub, mode, FMA, 2, speed);
40840 return true;
40843 case MULT:
40844 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40846 /* ??? SSE scalar cost should be used here. */
40847 *total = cost->fmul;
40848 return false;
40850 else if (X87_FLOAT_MODE_P (mode))
40852 *total = cost->fmul;
40853 return false;
40855 else if (FLOAT_MODE_P (mode))
40857 /* ??? SSE vector cost should be used here. */
40858 *total = cost->fmul;
40859 return false;
40861 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40863 /* V*QImode is emulated with 7-13 insns. */
40864 if (mode == V16QImode || mode == V32QImode)
40866 int extra = 11;
40867 if (TARGET_XOP && mode == V16QImode)
40868 extra = 5;
40869 else if (TARGET_SSSE3)
40870 extra = 6;
40871 *total = cost->fmul * 2 + cost->fabs * extra;
40873 /* V*DImode is emulated with 5-8 insns. */
40874 else if (mode == V2DImode || mode == V4DImode)
40876 if (TARGET_XOP && mode == V2DImode)
40877 *total = cost->fmul * 2 + cost->fabs * 3;
40878 else
40879 *total = cost->fmul * 3 + cost->fabs * 5;
40881 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
40882 insns, including two PMULUDQ. */
40883 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
40884 *total = cost->fmul * 2 + cost->fabs * 5;
40885 else
40886 *total = cost->fmul;
40887 return false;
40889 else
40891 rtx op0 = XEXP (x, 0);
40892 rtx op1 = XEXP (x, 1);
40893 int nbits;
40894 if (CONST_INT_P (XEXP (x, 1)))
40896 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40897 for (nbits = 0; value != 0; value &= value - 1)
40898 nbits++;
40900 else
40901 /* This is arbitrary. */
40902 nbits = 7;
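/* For example (illustrative), a multiply by the constant 10
   (binary 1010) gives nbits == 2, so the cost computed below is
   mult_init plus two mult_bit steps plus the operand costs.  */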
40904 /* Compute costs correctly for widening multiplication. */
40905 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
40906 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
40907 == GET_MODE_SIZE (mode))
40909 int is_mulwiden = 0;
40910 machine_mode inner_mode = GET_MODE (op0);
40912 if (GET_CODE (op0) == GET_CODE (op1))
40913 is_mulwiden = 1, op1 = XEXP (op1, 0);
40914 else if (CONST_INT_P (op1))
40916 if (GET_CODE (op0) == SIGN_EXTEND)
40917 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
40918 == INTVAL (op1);
40919 else
40920 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
40923 if (is_mulwiden)
40924 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
40927 *total = (cost->mult_init[MODE_INDEX (mode)]
40928 + nbits * cost->mult_bit
40929 + rtx_cost (op0, mode, outer_code, opno, speed)
40930 + rtx_cost (op1, mode, outer_code, opno, speed));
40932 return true;
40935 case DIV:
40936 case UDIV:
40937 case MOD:
40938 case UMOD:
40939 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40940 /* ??? SSE cost should be used here. */
40941 *total = cost->fdiv;
40942 else if (X87_FLOAT_MODE_P (mode))
40943 *total = cost->fdiv;
40944 else if (FLOAT_MODE_P (mode))
40945 /* ??? SSE vector cost should be used here. */
40946 *total = cost->fdiv;
40947 else
40948 *total = cost->divide[MODE_INDEX (mode)];
40949 return false;
40951 case PLUS:
40952 if (GET_MODE_CLASS (mode) == MODE_INT
40953 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
40955 if (GET_CODE (XEXP (x, 0)) == PLUS
40956 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
40957 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
40958 && CONSTANT_P (XEXP (x, 1)))
40960 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
40961 if (val == 2 || val == 4 || val == 8)
40963 *total = cost->lea;
40964 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40965 outer_code, opno, speed);
40966 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
40967 outer_code, opno, speed);
40968 *total += rtx_cost (XEXP (x, 1), mode,
40969 outer_code, opno, speed);
40970 return true;
40973 else if (GET_CODE (XEXP (x, 0)) == MULT
40974 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
40976 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
40977 if (val == 2 || val == 4 || val == 8)
40979 *total = cost->lea;
40980 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40981 outer_code, opno, speed);
40982 *total += rtx_cost (XEXP (x, 1), mode,
40983 outer_code, opno, speed);
40984 return true;
40987 else if (GET_CODE (XEXP (x, 0)) == PLUS)
40989 *total = cost->lea;
40990 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40991 outer_code, opno, speed);
40992 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40993 outer_code, opno, speed);
40994 *total += rtx_cost (XEXP (x, 1), mode,
40995 outer_code, opno, speed);
40996 return true;
40999 /* FALLTHRU */
41001 case MINUS:
41002 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41004 /* ??? SSE cost should be used here. */
41005 *total = cost->fadd;
41006 return false;
41008 else if (X87_FLOAT_MODE_P (mode))
41010 *total = cost->fadd;
41011 return false;
41013 else if (FLOAT_MODE_P (mode))
41015 /* ??? SSE vector cost should be used here. */
41016 *total = cost->fadd;
41017 return false;
41019 /* FALLTHRU */
41021 case AND:
41022 case IOR:
41023 case XOR:
41024 if (GET_MODE_CLASS (mode) == MODE_INT
41025 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41027 *total = (cost->add * 2
41028 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
41029 << (GET_MODE (XEXP (x, 0)) != DImode))
41030 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
41031 << (GET_MODE (XEXP (x, 1)) != DImode)));
41032 return true;
41034 /* FALLTHRU */
41036 case NEG:
41037 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41039 /* ??? SSE cost should be used here. */
41040 *total = cost->fchs;
41041 return false;
41043 else if (X87_FLOAT_MODE_P (mode))
41045 *total = cost->fchs;
41046 return false;
41048 else if (FLOAT_MODE_P (mode))
41050 /* ??? SSE vector cost should be used here. */
41051 *total = cost->fchs;
41052 return false;
41054 /* FALLTHRU */
41056 case NOT:
41057 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41059 /* ??? Should be SSE vector operation cost. */
41060 /* At least for published AMD latencies, this really is the same
41061 as the latency for a simple fpu operation like fabs. */
41062 *total = cost->fabs;
41064 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41065 *total = cost->add * 2;
41066 else
41067 *total = cost->add;
41068 return false;
41070 case COMPARE:
41071 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41072 && XEXP (XEXP (x, 0), 1) == const1_rtx
41073 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41074 && XEXP (x, 1) == const0_rtx)
41076 /* This kind of construct is implemented using test[bwl].
41077 Treat it as if we had an AND. */
41078 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41079 *total = (cost->add
41080 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41081 opno, speed)
41082 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41083 return true;
41086 /* The embedded comparison operand is completely free. */
41087 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
41088 && XEXP (x, 1) == const0_rtx)
41089 *total = 0;
41091 return false;
41093 case FLOAT_EXTEND:
41094 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41095 *total = 0;
41096 return false;
41098 case ABS:
41099 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41100 /* ??? SSE cost should be used here. */
41101 *total = cost->fabs;
41102 else if (X87_FLOAT_MODE_P (mode))
41103 *total = cost->fabs;
41104 else if (FLOAT_MODE_P (mode))
41105 /* ??? SSE vector cost should be used here. */
41106 *total = cost->fabs;
41107 return false;
41109 case SQRT:
41110 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41111 /* ??? SSE cost should be used here. */
41112 *total = cost->fsqrt;
41113 else if (X87_FLOAT_MODE_P (mode))
41114 *total = cost->fsqrt;
41115 else if (FLOAT_MODE_P (mode))
41116 /* ??? SSE vector cost should be used here. */
41117 *total = cost->fsqrt;
41118 return false;
41120 case UNSPEC:
41121 if (XINT (x, 1) == UNSPEC_TP)
41122 *total = 0;
41123 return false;
41125 case VEC_SELECT:
41126 case VEC_CONCAT:
41127 case VEC_DUPLICATE:
41128 /* ??? Assume all of these vector manipulation patterns are
41129 recognizable, in which case they all pretty much have the
41130 same cost. */
41131 *total = cost->fabs;
41132 return true;
41133 case VEC_MERGE:
41134 mask = XEXP (x, 2);
41135 /* This is a masked instruction; assume the same cost
41136 as the non-masked variant. */
41137 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
41138 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
41139 else
41140 *total = cost->fabs;
41141 return true;
41143 default:
41144 return false;
41148 #if TARGET_MACHO
41150 static int current_machopic_label_num;
41152 /* Given a symbol name and its associated stub, write out the
41153 definition of the stub. */
41155 void
41156 machopic_output_stub (FILE *file, const char *symb, const char *stub)
41158 unsigned int length;
41159 char *binder_name, *symbol_name, lazy_ptr_name[32];
41160 int label = ++current_machopic_label_num;
41162 /* For 64-bit we shouldn't get here. */
41163 gcc_assert (!TARGET_64BIT);
41165 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
41166 symb = targetm.strip_name_encoding (symb);
41168 length = strlen (stub);
41169 binder_name = XALLOCAVEC (char, length + 32);
41170 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
41172 length = strlen (symb);
41173 symbol_name = XALLOCAVEC (char, length + 32);
41174 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
41176 sprintf (lazy_ptr_name, "L%d$lz", label);
41178 if (MACHOPIC_ATT_STUB)
41179 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
41180 else if (MACHOPIC_PURE)
41181 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
41182 else
41183 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
41185 fprintf (file, "%s:\n", stub);
41186 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41188 if (MACHOPIC_ATT_STUB)
41190 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
41192 else if (MACHOPIC_PURE)
41194 /* PIC stub. */
41195 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41196 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
41197 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
41198 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
41199 label, lazy_ptr_name, label);
41200 fprintf (file, "\tjmp\t*%%ecx\n");
41202 else
41203 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
41205 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
41206 it needs no stub-binding-helper. */
41207 if (MACHOPIC_ATT_STUB)
41208 return;
41210 fprintf (file, "%s:\n", binder_name);
41212 if (MACHOPIC_PURE)
41214 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
41215 fprintf (file, "\tpushl\t%%ecx\n");
41217 else
41218 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
41220 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
41222 /* N.B. Keep the correspondence of these
41223 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
41224 old-pic/new-pic/non-pic stubs; altering this will break
41225 compatibility with existing dylibs. */
41226 if (MACHOPIC_PURE)
41228 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41229 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
41231 else
41232 /* 16-byte -mdynamic-no-pic stub. */
41233 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
41235 fprintf (file, "%s:\n", lazy_ptr_name);
41236 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41237 fprintf (file, ASM_LONG "%s\n", binder_name);
41239 #endif /* TARGET_MACHO */
41241 /* Order the registers for the register allocator. */
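/* An illustrative summary of the loops below: call-clobbered general
   registers come first, then call-saved general registers, then the
   x87 registers when they are preferred for FP math, then the SSE,
   extended SSE, mask and MPX bound registers, and finally the
   remaining x87 and MMX registers; unallocated slots are zero-filled.  */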
41243 void
41244 x86_order_regs_for_local_alloc (void)
41246 int pos = 0;
41247 int i;
41249 /* First allocate the local general purpose registers. */
41250 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41251 if (GENERAL_REGNO_P (i) && call_used_regs[i])
41252 reg_alloc_order [pos++] = i;
41254 /* Global general purpose registers. */
41255 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41256 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
41257 reg_alloc_order [pos++] = i;
41259 /* x87 registers come first in case we are doing FP math
41260 using them. */
41261 if (!TARGET_SSE_MATH)
41262 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41263 reg_alloc_order [pos++] = i;
41265 /* SSE registers. */
41266 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
41267 reg_alloc_order [pos++] = i;
41268 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
41269 reg_alloc_order [pos++] = i;
41271 /* Extended REX SSE registers. */
41272 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
41273 reg_alloc_order [pos++] = i;
41275 /* Mask registers. */
41276 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
41277 reg_alloc_order [pos++] = i;
41279 /* MPX bound registers. */
41280 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
41281 reg_alloc_order [pos++] = i;
41283 /* x87 registers. */
41284 if (TARGET_SSE_MATH)
41285 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41286 reg_alloc_order [pos++] = i;
41288 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
41289 reg_alloc_order [pos++] = i;
41291 /* Initialize the rest of the array, as we do not allocate some registers
41292 at all. */
41293 while (pos < FIRST_PSEUDO_REGISTER)
41294 reg_alloc_order [pos++] = 0;
41297 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
41298 in struct attribute_spec.handler. */
41299 static tree
41300 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
41301 tree args,
41302 int,
41303 bool *no_add_attrs)
41305 if (TREE_CODE (*node) != FUNCTION_TYPE
41306 && TREE_CODE (*node) != METHOD_TYPE
41307 && TREE_CODE (*node) != FIELD_DECL
41308 && TREE_CODE (*node) != TYPE_DECL)
41310 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41311 name);
41312 *no_add_attrs = true;
41313 return NULL_TREE;
41315 if (TARGET_64BIT)
41317 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
41318 name);
41319 *no_add_attrs = true;
41320 return NULL_TREE;
41322 if (is_attribute_p ("callee_pop_aggregate_return", name))
41324 tree cst;
41326 cst = TREE_VALUE (args);
41327 if (TREE_CODE (cst) != INTEGER_CST)
41329 warning (OPT_Wattributes,
41330 "%qE attribute requires an integer constant argument",
41331 name);
41332 *no_add_attrs = true;
41334 else if (compare_tree_int (cst, 0) != 0
41335 && compare_tree_int (cst, 1) != 0)
41337 warning (OPT_Wattributes,
41338 "argument to %qE attribute is neither zero, nor one",
41339 name);
41340 *no_add_attrs = true;
41343 return NULL_TREE;
41346 return NULL_TREE;
41349 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
41350 struct attribute_spec.handler. */
41351 static tree
41352 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
41353 bool *no_add_attrs)
41355 if (TREE_CODE (*node) != FUNCTION_TYPE
41356 && TREE_CODE (*node) != METHOD_TYPE
41357 && TREE_CODE (*node) != FIELD_DECL
41358 && TREE_CODE (*node) != TYPE_DECL)
41360 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41361 name);
41362 *no_add_attrs = true;
41363 return NULL_TREE;
41366 /* Can combine regparm with all attributes but fastcall. */
41367 if (is_attribute_p ("ms_abi", name))
41369 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
41371 error ("ms_abi and sysv_abi attributes are not compatible");
41374 return NULL_TREE;
41376 else if (is_attribute_p ("sysv_abi", name))
41378 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
41380 error ("ms_abi and sysv_abi attributes are not compatible");
41383 return NULL_TREE;
41386 return NULL_TREE;
41389 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
41390 struct attribute_spec.handler. */
41391 static tree
41392 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
41393 bool *no_add_attrs)
41395 tree *type = NULL;
41396 if (DECL_P (*node))
41398 if (TREE_CODE (*node) == TYPE_DECL)
41399 type = &TREE_TYPE (*node);
41401 else
41402 type = node;
41404 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
41406 warning (OPT_Wattributes, "%qE attribute ignored",
41407 name);
41408 *no_add_attrs = true;
41411 else if ((is_attribute_p ("ms_struct", name)
41412 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
41413 || ((is_attribute_p ("gcc_struct", name)
41414 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
41416 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
41417 name);
41418 *no_add_attrs = true;
41421 return NULL_TREE;
41424 static tree
41425 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
41426 bool *no_add_attrs)
41428 if (TREE_CODE (*node) != FUNCTION_DECL)
41430 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41431 name);
41432 *no_add_attrs = true;
41434 return NULL_TREE;
41437 static tree
41438 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
41439 int, bool *)
41441 return NULL_TREE;
41444 static tree
41445 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
41447 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
41448 but the function type contains args and return type data. */
41449 tree func_type = *node;
41450 tree return_type = TREE_TYPE (func_type);
41452 int nargs = 0;
41453 tree current_arg_type = TYPE_ARG_TYPES (func_type);
41454 while (current_arg_type
41455 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
41457 if (nargs == 0)
41459 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
41460 error ("interrupt service routine should have a pointer "
41461 "as the first argument");
41463 else if (nargs == 1)
41465 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
41466 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
41467 error ("interrupt service routine should have unsigned %s"
41468 "int as the second argument",
41469 TARGET_64BIT
41470 ? (TARGET_X32 ? "long long " : "long ")
41471 : "");
41473 nargs++;
41474 current_arg_type = TREE_CHAIN (current_arg_type);
41476 if (!nargs || nargs > 2)
41477 error ("interrupt service routine can only have a pointer argument "
41478 "and an optional integer argument");
41479 if (! VOID_TYPE_P (return_type))
41480 error ("interrupt service routine can't have non-void return value");
41482 return NULL_TREE;
41485 static bool
41486 ix86_ms_bitfield_layout_p (const_tree record_type)
41488 return ((TARGET_MS_BITFIELD_LAYOUT
41489 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
41490 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
41493 /* Returns an expression indicating where the this parameter is
41494 located on entry to the FUNCTION. */
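/* In outline (the code below is authoritative): on 64-bit targets `this'
   arrives in the first integer argument register, or in the second one when
   a hidden aggregate-return pointer occupies the first; on 32-bit targets it
   may arrive in a register under regparm/fastcall/thiscall conventions, and
   otherwise it lives on the stack just above the return address.  */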
41496 static rtx
41497 x86_this_parameter (tree function)
41499 tree type = TREE_TYPE (function);
41500 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
41501 int nregs;
41503 if (TARGET_64BIT)
41505 const int *parm_regs;
41507 if (ix86_function_type_abi (type) == MS_ABI)
41508 parm_regs = x86_64_ms_abi_int_parameter_registers;
41509 else
41510 parm_regs = x86_64_int_parameter_registers;
41511 return gen_rtx_REG (Pmode, parm_regs[aggr]);
41514 nregs = ix86_function_regparm (type, function);
41516 if (nregs > 0 && !stdarg_p (type))
41518 int regno;
41519 unsigned int ccvt = ix86_get_callcvt (type);
41521 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41522 regno = aggr ? DX_REG : CX_REG;
41523 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41525 regno = CX_REG;
41526 if (aggr)
41527 return gen_rtx_MEM (SImode,
41528 plus_constant (Pmode, stack_pointer_rtx, 4));
41530 else
41532 regno = AX_REG;
41533 if (aggr)
41535 regno = DX_REG;
41536 if (nregs == 1)
41537 return gen_rtx_MEM (SImode,
41538 plus_constant (Pmode,
41539 stack_pointer_rtx, 4));
41542 return gen_rtx_REG (SImode, regno);
41545 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
41546 aggr ? 8 : 4));
41549 /* Determine whether x86_output_mi_thunk can succeed. */
41551 static bool
41552 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
41553 const_tree function)
41555 /* 64-bit can handle anything. */
41556 if (TARGET_64BIT)
41557 return true;
41559 /* For 32-bit, everything's fine if we have one free register. */
41560 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
41561 return true;
41563 /* Need a free register for vcall_offset. */
41564 if (vcall_offset)
41565 return false;
41567 /* Need a free register for GOT references. */
41568 if (flag_pic && !targetm.binds_local_p (function))
41569 return false;
41571 /* Otherwise ok. */
41572 return true;
41575 /* Output the assembler code for a thunk function. THUNK_DECL is the
41576 declaration for the thunk function itself, FUNCTION is the decl for
41577 the target function. DELTA is an immediate constant offset to be
41578 added to THIS. If VCALL_OFFSET is nonzero, the word at
41579 *(*this + vcall_offset) should be added to THIS. */
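/* A sketch of what the emitted thunk does, in the notation of the comment
   above (illustrative only):

       this += DELTA;
       if (VCALL_OFFSET)
         this += *(*this + VCALL_OFFSET);
       goto FUNCTION;   -- emitted as a frameless sibling call below.  */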
41581 static void
41582 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
41583 HOST_WIDE_INT vcall_offset, tree function)
41585 rtx this_param = x86_this_parameter (function);
41586 rtx this_reg, tmp, fnaddr;
41587 unsigned int tmp_regno;
41588 rtx_insn *insn;
41590 if (TARGET_64BIT)
41591 tmp_regno = R10_REG;
41592 else
41594 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
41595 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41596 tmp_regno = AX_REG;
41597 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41598 tmp_regno = DX_REG;
41599 else
41600 tmp_regno = CX_REG;
41603 emit_note (NOTE_INSN_PROLOGUE_END);
41605 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
41606 pull it in now and let DELTA benefit. */
41607 if (REG_P (this_param))
41608 this_reg = this_param;
41609 else if (vcall_offset)
41611 /* Put the this parameter into %eax. */
41612 this_reg = gen_rtx_REG (Pmode, AX_REG);
41613 emit_move_insn (this_reg, this_param);
41615 else
41616 this_reg = NULL_RTX;
41618 /* Adjust the this parameter by a fixed constant. */
41619 if (delta)
41621 rtx delta_rtx = GEN_INT (delta);
41622 rtx delta_dst = this_reg ? this_reg : this_param;
41624 if (TARGET_64BIT)
41626 if (!x86_64_general_operand (delta_rtx, Pmode))
41628 tmp = gen_rtx_REG (Pmode, tmp_regno);
41629 emit_move_insn (tmp, delta_rtx);
41630 delta_rtx = tmp;
41634 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
41637 /* Adjust the this parameter by a value stored in the vtable. */
41638 if (vcall_offset)
41640 rtx vcall_addr, vcall_mem, this_mem;
41642 tmp = gen_rtx_REG (Pmode, tmp_regno);
41644 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
41645 if (Pmode != ptr_mode)
41646 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
41647 emit_move_insn (tmp, this_mem);
41649 /* Adjust the this parameter. */
41650 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
41651 if (TARGET_64BIT
41652 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
41654 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
41655 emit_move_insn (tmp2, GEN_INT (vcall_offset));
41656 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
41659 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
41660 if (Pmode != ptr_mode)
41661 emit_insn (gen_addsi_1_zext (this_reg,
41662 gen_rtx_REG (ptr_mode,
41663 REGNO (this_reg)),
41664 vcall_mem));
41665 else
41666 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
41669 /* If necessary, drop THIS back to its stack slot. */
41670 if (this_reg && this_reg != this_param)
41671 emit_move_insn (this_param, this_reg);
41673 fnaddr = XEXP (DECL_RTL (function), 0);
41674 if (TARGET_64BIT)
41676 if (!flag_pic || targetm.binds_local_p (function)
41677 || TARGET_PECOFF)
41679 else
41681 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
41682 tmp = gen_rtx_CONST (Pmode, tmp);
41683 fnaddr = gen_const_mem (Pmode, tmp);
41686 else
41688 if (!flag_pic || targetm.binds_local_p (function))
41690 #if TARGET_MACHO
41691 else if (TARGET_MACHO)
41693 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
41694 fnaddr = XEXP (fnaddr, 0);
41696 #endif /* TARGET_MACHO */
41697 else
41699 tmp = gen_rtx_REG (Pmode, CX_REG);
41700 output_set_got (tmp, NULL_RTX);
41702 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
41703 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
41704 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
41705 fnaddr = gen_const_mem (Pmode, fnaddr);
41709 /* Our sibling call patterns do not allow memories, because we have no
41710 predicate that can distinguish between frame and non-frame memory.
41711 For our purposes here, we can get away with (ab)using a jump pattern,
41712 because we're going to do no optimization. */
41713 if (MEM_P (fnaddr))
41715 if (sibcall_insn_operand (fnaddr, word_mode))
41717 fnaddr = XEXP (DECL_RTL (function), 0);
41718 tmp = gen_rtx_MEM (QImode, fnaddr);
41719 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41720 tmp = emit_call_insn (tmp);
41721 SIBLING_CALL_P (tmp) = 1;
41723 else
41724 emit_jump_insn (gen_indirect_jump (fnaddr));
41726 else
41728 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
41730 // CM_LARGE_PIC always uses a pseudo PIC register, which is
41731 // uninitialized. Since FUNCTION is local and calling it
41732 // doesn't go through the PLT, we use scratch register %r11 as
41733 // the PIC register and initialize it here.
41734 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
41735 ix86_init_large_pic_reg (tmp_regno);
41736 fnaddr = legitimize_pic_address (fnaddr,
41737 gen_rtx_REG (Pmode, tmp_regno));
41740 if (!sibcall_insn_operand (fnaddr, word_mode))
41742 tmp = gen_rtx_REG (word_mode, tmp_regno);
41743 if (GET_MODE (fnaddr) != word_mode)
41744 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
41745 emit_move_insn (tmp, fnaddr);
41746 fnaddr = tmp;
41749 tmp = gen_rtx_MEM (QImode, fnaddr);
41750 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41751 tmp = emit_call_insn (tmp);
41752 SIBLING_CALL_P (tmp) = 1;
41754 emit_barrier ();
41756 /* Emit just enough of rest_of_compilation to get the insns emitted.
41757 Note that use_thunk calls assemble_start_function et al. */
41758 insn = get_insns ();
41759 shorten_branches (insn);
41760 final_start_function (insn, file, 1);
41761 final (insn, file, 1);
41762 final_end_function ();
41765 static void
41766 x86_file_start (void)
41768 default_file_start ();
41769 if (TARGET_16BIT)
41770 fputs ("\t.code16gcc\n", asm_out_file);
41771 #if TARGET_MACHO
41772 darwin_file_start ();
41773 #endif
41774 if (X86_FILE_START_VERSION_DIRECTIVE)
41775 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
41776 if (X86_FILE_START_FLTUSED)
41777 fputs ("\t.global\t__fltused\n", asm_out_file);
41778 if (ix86_asm_dialect == ASM_INTEL)
41779 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
41783 x86_field_alignment (tree type, int computed)
41785 machine_mode mode;
41787 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
41788 return computed;
41789 if (TARGET_IAMCU)
41790 return iamcu_alignment (type, computed);
41791 mode = TYPE_MODE (strip_array_types (type));
41792 if (mode == DFmode || mode == DCmode
41793 || GET_MODE_CLASS (mode) == MODE_INT
41794 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
41795 return MIN (32, computed);
41796 return computed;
41799 /* Print call to TARGET to FILE. */
41801 static void
41802 x86_print_call_or_nop (FILE *file, const char *target)
41804 if (flag_nop_mcount)
41805 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
41806 else
41807 fprintf (file, "1:\tcall\t%s\n", target);
41810 /* Output assembler code to FILE to increment profiler label # LABELNO
41811 for profiling a function entry. */
41812 void
41813 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
41815 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
41816 : MCOUNT_NAME);
41817 if (TARGET_64BIT)
41819 #ifndef NO_PROFILE_COUNTERS
41820 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
41821 #endif
41823 if (!TARGET_PECOFF && flag_pic)
41824 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
41825 else
41826 x86_print_call_or_nop (file, mcount_name);
41828 else if (flag_pic)
41830 #ifndef NO_PROFILE_COUNTERS
41831 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
41832 LPREFIX, labelno);
41833 #endif
41834 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
41836 else
41838 #ifndef NO_PROFILE_COUNTERS
41839 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
41840 LPREFIX, labelno);
41841 #endif
41842 x86_print_call_or_nop (file, mcount_name);
41845 if (flag_record_mcount)
41847 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
41848 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
41849 fprintf (file, "\t.previous\n");
41853 /* We don't have exact information about the insn sizes, but we may assume
41854 quite safely that we are informed about all 1-byte insns and memory
41855 address sizes. This is enough to eliminate unnecessary padding in
41856 99% of cases. */
41858 static int
41859 min_insn_size (rtx_insn *insn)
41861 int l = 0, len;
41863 if (!INSN_P (insn) || !active_insn_p (insn))
41864 return 0;
41866 /* Discard alignments we've emitted and jump instructions. */
41867 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
41868 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
41869 return 0;
41871 /* Important case - calls are always 5 bytes.
41872 It is common to have many calls in a row. */
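/* (A direct near call is a one-byte opcode followed by a 32-bit relative
   displacement, hence the 5 bytes.)  */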
41873 if (CALL_P (insn)
41874 && symbolic_reference_mentioned_p (PATTERN (insn))
41875 && !SIBLING_CALL_P (insn))
41876 return 5;
41877 len = get_attr_length (insn);
41878 if (len <= 1)
41879 return 1;
41881 /* For normal instructions we rely on get_attr_length being exact,
41882 with a few exceptions. */
41883 if (!JUMP_P (insn))
41885 enum attr_type type = get_attr_type (insn);
41887 switch (type)
41889 case TYPE_MULTI:
41890 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
41891 || asm_noperands (PATTERN (insn)) >= 0)
41892 return 0;
41893 break;
41894 case TYPE_OTHER:
41895 case TYPE_FCMP:
41896 break;
41897 default:
41898 /* Otherwise trust get_attr_length. */
41899 return len;
41902 l = get_attr_length_address (insn);
41903 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
41904 l = 4;
41906 if (l)
41907 return 1+l;
41908 else
41909 return 2;
41912 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41914 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
41915 16-byte window. */
41917 static void
41918 ix86_avoid_jump_mispredicts (void)
41920 rtx_insn *insn, *start = get_insns ();
41921 int nbytes = 0, njumps = 0;
41922 bool isjump = false;
41924 /* Look for all minimal intervals of instructions containing 4 jumps.
41925 The intervals are bounded by START and INSN. NBYTES is the total
41926 size of the instructions in the interval, including INSN and not
41927 including START. When NBYTES is smaller than 16 bytes, it is possible
41928 that the end of START and INSN end up in the same 16-byte page.
41930 The smallest offset in the page at which INSN can start is the case
41931 where START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
41932 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
41934 Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
41935 have to, since control transfer to the label(s) can be performed through other
41936 means, and we also estimate the minimum length of all asm stmts as 0. */
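/* Worked example with made-up numbers: if the interval already contains
   three jumps, NBYTES is 12 and sizeof (INSN) is 2, the pad emitted in the
   loop below requests maxskip 15 - 12 + 2 = 5, the amount intended to keep
   INSN out of the 16-byte window holding the three earlier jumps.  */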
41937 for (insn = start; insn; insn = NEXT_INSN (insn))
41939 int min_size;
41941 if (LABEL_P (insn))
41943 int align = label_to_alignment (insn);
41944 int max_skip = label_to_max_skip (insn);
41946 if (max_skip > 15)
41947 max_skip = 15;
41948 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
41949 already in the current 16 byte page, because otherwise
41950 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
41951 bytes to reach 16 byte boundary. */
41952 if (align <= 0
41953 || (align <= 3 && max_skip != (1 << align) - 1))
41954 max_skip = 0;
41955 if (dump_file)
41956 fprintf (dump_file, "Label %i with max_skip %i\n",
41957 INSN_UID (insn), max_skip);
41958 if (max_skip)
41960 while (nbytes + max_skip >= 16)
41962 start = NEXT_INSN (start);
41963 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41964 || CALL_P (start))
41965 njumps--, isjump = true;
41966 else
41967 isjump = false;
41968 nbytes -= min_insn_size (start);
41971 continue;
41974 min_size = min_insn_size (insn);
41975 nbytes += min_size;
41976 if (dump_file)
41977 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
41978 INSN_UID (insn), min_size);
41979 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
41980 || CALL_P (insn))
41981 njumps++;
41982 else
41983 continue;
41985 while (njumps > 3)
41987 start = NEXT_INSN (start);
41988 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41989 || CALL_P (start))
41990 njumps--, isjump = true;
41991 else
41992 isjump = false;
41993 nbytes -= min_insn_size (start);
41995 gcc_assert (njumps >= 0);
41996 if (dump_file)
41997 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
41998 INSN_UID (start), INSN_UID (insn), nbytes);
42000 if (njumps == 3 && isjump && nbytes < 16)
42002 int padsize = 15 - nbytes + min_insn_size (insn);
42004 if (dump_file)
42005 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
42006 INSN_UID (insn), padsize);
42007 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
42011 #endif
42013 /* AMD Athlon works faster
42014 when RET is not the destination of a conditional jump or directly preceded
42015 by another jump instruction. We avoid the penalty by inserting a NOP just
42016 before such RET instructions. */
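/* The fix below replaces such a return with the `long' return pattern from
   i386.md (typically assembled as "rep; ret"), which avoids the mispredict
   described above.  */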
42017 static void
42018 ix86_pad_returns (void)
42020 edge e;
42021 edge_iterator ei;
42023 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42025 basic_block bb = e->src;
42026 rtx_insn *ret = BB_END (bb);
42027 rtx_insn *prev;
42028 bool replace = false;
42030 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
42031 || optimize_bb_for_size_p (bb))
42032 continue;
42033 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
42034 if (active_insn_p (prev) || LABEL_P (prev))
42035 break;
42036 if (prev && LABEL_P (prev))
42038 edge e;
42039 edge_iterator ei;
42041 FOR_EACH_EDGE (e, ei, bb->preds)
42042 if (EDGE_FREQUENCY (e) && e->src->index >= 0
42043 && !(e->flags & EDGE_FALLTHRU))
42045 replace = true;
42046 break;
42049 if (!replace)
42051 prev = prev_active_insn (ret);
42052 if (prev
42053 && ((JUMP_P (prev) && any_condjump_p (prev))
42054 || CALL_P (prev)))
42055 replace = true;
42056 /* Empty functions get a branch mispredict even when
42057 the jump destination is not visible to us. */
42058 if (!prev && !optimize_function_for_size_p (cfun))
42059 replace = true;
42061 if (replace)
42063 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42064 delete_insn (ret);
42069 /* Count the minimum number of instructions in BB. Return 4 if the
42070 number of instructions >= 4. */
42072 static int
42073 ix86_count_insn_bb (basic_block bb)
42075 rtx_insn *insn;
42076 int insn_count = 0;
42078 /* Count number of instructions in this block. Return 4 if the number
42079 of instructions >= 4. */
42080 FOR_BB_INSNS (bb, insn)
42082 /* This only happens in exit blocks. */
42083 if (JUMP_P (insn)
42084 && ANY_RETURN_P (PATTERN (insn)))
42085 break;
42087 if (NONDEBUG_INSN_P (insn)
42088 && GET_CODE (PATTERN (insn)) != USE
42089 && GET_CODE (PATTERN (insn)) != CLOBBER)
42091 insn_count++;
42092 if (insn_count >= 4)
42093 return insn_count;
42097 return insn_count;
42101 /* Count the minimum number of instructions in code path in BB.
42102 Return 4 if the number of instructions >= 4. */
42104 static int
42105 ix86_count_insn (basic_block bb)
42107 edge e;
42108 edge_iterator ei;
42109 int min_prev_count;
42111 /* Only bother counting instructions along paths with no
42112 more than 2 basic blocks between entry and exit. Given
42113 that BB has an edge to exit, determine if a predecessor
42114 of BB has an edge from entry. If so, compute the number
42115 of instructions in the predecessor block. If there
42116 happen to be multiple such blocks, compute the minimum. */
42117 min_prev_count = 4;
42118 FOR_EACH_EDGE (e, ei, bb->preds)
42120 edge prev_e;
42121 edge_iterator prev_ei;
42123 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42125 min_prev_count = 0;
42126 break;
42128 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
42130 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42132 int count = ix86_count_insn_bb (e->src);
42133 if (count < min_prev_count)
42134 min_prev_count = count;
42135 break;
42140 if (min_prev_count < 4)
42141 min_prev_count += ix86_count_insn_bb (bb);
42143 return min_prev_count;
42146 /* Pad short functions to 4 instructions. */
42148 static void
42149 ix86_pad_short_function (void)
42151 edge e;
42152 edge_iterator ei;
42154 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42156 rtx_insn *ret = BB_END (e->src);
42157 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
42159 int insn_count = ix86_count_insn (e->src);
42161 /* Pad short function. */
42162 if (insn_count < 4)
42164 rtx_insn *insn = ret;
42166 /* Find epilogue. */
42167 while (insn
42168 && (!NOTE_P (insn)
42169 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
42170 insn = PREV_INSN (insn);
42172 if (!insn)
42173 insn = ret;
42175 /* Two NOPs count as one instruction. */
42176 insn_count = 2 * (4 - insn_count);
42177 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
42183 /* Fix up a Windows system unwinder issue. If an EH region falls through into
42184 the epilogue, the Windows system unwinder will apply epilogue logic and
42185 produce incorrect offsets. This can be avoided by adding a nop between
42186 the last insn that can throw and the first insn of the epilogue. */
42188 static void
42189 ix86_seh_fixup_eh_fallthru (void)
42191 edge e;
42192 edge_iterator ei;
42194 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42196 rtx_insn *insn, *next;
42198 /* Find the beginning of the epilogue. */
42199 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42200 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42201 break;
42202 if (insn == NULL)
42203 continue;
42205 /* We only care about preceding insns that can throw. */
42206 insn = prev_active_insn (insn);
42207 if (insn == NULL || !can_throw_internal (insn))
42208 continue;
42210 /* Do not separate calls from their debug information. */
42211 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42212 if (NOTE_P (next)
42213 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
42214 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
42215 insn = next;
42216 else
42217 break;
42219 emit_insn_after (gen_nops (const1_rtx), insn);
42223 /* Given a register number BASE, the lowest of a group of registers, update
42224 regsets IN and OUT with the registers that should be avoided in input
42225 and output operands respectively when trying to avoid generating a modr/m
42226 byte for -fmitigate-rop. */
42228 static void
42229 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
42231 SET_HARD_REG_BIT (out, base);
42232 SET_HARD_REG_BIT (out, base + 1);
42233 SET_HARD_REG_BIT (in, base + 2);
42234 SET_HARD_REG_BIT (in, base + 3);
42237 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
42238 that certain encodings of modr/m bytes do not occur. */
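/* One illustrative case (not an exhaustive list): with mod = 11, a
   destination whose low three register-number bits are 000 or 001 (e.g.
   %eax, %ecx) combined with a source whose low bits are 010 or 011 (e.g.
   %edx, %ebx) produces the modr/m bytes 0xc2, 0xc3, 0xca or 0xcb, which
   also encode return instructions and therefore make handy ROP gadgets;
   renaming either operand removes the unintended byte.  */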
42239 static void
42240 ix86_mitigate_rop (void)
42242 HARD_REG_SET input_risky;
42243 HARD_REG_SET output_risky;
42244 HARD_REG_SET inout_risky;
42246 CLEAR_HARD_REG_SET (output_risky);
42247 CLEAR_HARD_REG_SET (input_risky);
42248 SET_HARD_REG_BIT (output_risky, AX_REG);
42249 SET_HARD_REG_BIT (output_risky, CX_REG);
42250 SET_HARD_REG_BIT (input_risky, BX_REG);
42251 SET_HARD_REG_BIT (input_risky, DX_REG);
42252 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
42253 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
42254 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
42255 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
42256 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
42257 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
42258 COPY_HARD_REG_SET (inout_risky, input_risky);
42259 IOR_HARD_REG_SET (inout_risky, output_risky);
42261 df_note_add_problem ();
42262 /* Fix up what stack-regs did. */
42263 df_insn_rescan_all ();
42264 df_analyze ();
42266 regrename_init (true);
42267 regrename_analyze (NULL);
42269 auto_vec<du_head_p> cands;
42271 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
42273 if (!NONDEBUG_INSN_P (insn))
42274 continue;
42276 if (GET_CODE (PATTERN (insn)) == USE
42277 || GET_CODE (PATTERN (insn)) == CLOBBER)
42278 continue;
42280 extract_insn (insn);
42282 int opno0, opno1;
42283 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42284 recog_data.n_operands, &opno0,
42285 &opno1);
42287 if (!ix86_rop_should_change_byte_p (modrm))
42288 continue;
42290 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
42292 /* This happens when regrename has to fail a block. */
42293 if (!info->op_info)
42294 continue;
42296 if (info->op_info[opno0].n_chains != 0)
42298 gcc_assert (info->op_info[opno0].n_chains == 1);
42299 du_head_p op0c;
42300 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
42301 if (op0c->target_data_1 + op0c->target_data_2 == 0
42302 && !op0c->cannot_rename)
42303 cands.safe_push (op0c);
42305 op0c->target_data_1++;
42307 if (info->op_info[opno1].n_chains != 0)
42309 gcc_assert (info->op_info[opno1].n_chains == 1);
42310 du_head_p op1c;
42311 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
42312 if (op1c->target_data_1 + op1c->target_data_2 == 0
42313 && !op1c->cannot_rename)
42314 cands.safe_push (op1c);
42316 op1c->target_data_2++;
42320 int i;
42321 du_head_p head;
42322 FOR_EACH_VEC_ELT (cands, i, head)
42324 int old_reg, best_reg;
42325 HARD_REG_SET unavailable;
42327 CLEAR_HARD_REG_SET (unavailable);
42328 if (head->target_data_1)
42329 IOR_HARD_REG_SET (unavailable, output_risky);
42330 if (head->target_data_2)
42331 IOR_HARD_REG_SET (unavailable, input_risky);
42333 int n_uses;
42334 reg_class superclass = regrename_find_superclass (head, &n_uses,
42335 &unavailable);
42336 old_reg = head->regno;
42337 best_reg = find_rename_reg (head, superclass, &unavailable,
42338 old_reg, false);
42339 bool ok = regrename_do_replace (head, best_reg);
42340 gcc_assert (ok);
42341 if (dump_file)
42342 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
42343 reg_names[best_reg], reg_class_names[superclass]);
42347 regrename_finish ();
42349 df_analyze ();
42351 basic_block bb;
42352 regset_head live;
42354 INIT_REG_SET (&live);
42356 FOR_EACH_BB_FN (bb, cfun)
42358 rtx_insn *insn;
42360 COPY_REG_SET (&live, DF_LR_OUT (bb));
42361 df_simulate_initialize_backwards (bb, &live);
42363 FOR_BB_INSNS_REVERSE (bb, insn)
42365 if (!NONDEBUG_INSN_P (insn))
42366 continue;
42368 df_simulate_one_insn_backwards (bb, insn, &live);
42370 if (GET_CODE (PATTERN (insn)) == USE
42371 || GET_CODE (PATTERN (insn)) == CLOBBER)
42372 continue;
42374 extract_insn (insn);
42375 constrain_operands_cached (insn, reload_completed);
42376 int opno0, opno1;
42377 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42378 recog_data.n_operands, &opno0,
42379 &opno1);
42380 if (modrm < 0
42381 || !ix86_rop_should_change_byte_p (modrm)
42382 || opno0 == opno1)
42383 continue;
42385 rtx oldreg = recog_data.operand[opno1];
42386 preprocess_constraints (insn);
42387 const operand_alternative *alt = which_op_alt ();
42389 int i;
42390 for (i = 0; i < recog_data.n_operands; i++)
42391 if (i != opno1
42392 && alt[i].earlyclobber
42393 && reg_overlap_mentioned_p (recog_data.operand[i],
42394 oldreg))
42395 break;
42397 if (i < recog_data.n_operands)
42398 continue;
42400 if (dump_file)
42401 fprintf (dump_file,
42402 "attempting to fix modrm byte in insn %d:"
42403 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
42404 reg_class_names[alt[opno1].cl]);
42406 HARD_REG_SET unavailable;
42407 REG_SET_TO_HARD_REG_SET (unavailable, &live);
42408 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
42409 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
42410 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
42411 IOR_HARD_REG_SET (unavailable, output_risky);
42412 IOR_COMPL_HARD_REG_SET (unavailable,
42413 reg_class_contents[alt[opno1].cl]);
42415 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42416 if (!TEST_HARD_REG_BIT (unavailable, i))
42417 break;
42418 if (i == FIRST_PSEUDO_REGISTER)
42420 if (dump_file)
42421 fprintf (dump_file, ", none available\n");
42422 continue;
42424 if (dump_file)
42425 fprintf (dump_file, " -> %d\n", i);
42426 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
42427 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
42428 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
42433 /* Implement machine-specific optimizations. We implement padding of returns
42434 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
42435 static void
42436 ix86_reorg (void)
42438 /* We are freeing block_for_insn in the toplev to keep compatibility
42439 with old MDEP_REORGS that are not CFG based. Recompute it now. */
42440 compute_bb_for_insn ();
42442 if (flag_mitigate_rop)
42443 ix86_mitigate_rop ();
42445 if (TARGET_SEH && current_function_has_exception_handlers ())
42446 ix86_seh_fixup_eh_fallthru ();
42448 if (optimize && optimize_function_for_speed_p (cfun))
42450 if (TARGET_PAD_SHORT_FUNCTION)
42451 ix86_pad_short_function ();
42452 else if (TARGET_PAD_RETURNS)
42453 ix86_pad_returns ();
42454 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42455 if (TARGET_FOUR_JUMP_LIMIT)
42456 ix86_avoid_jump_mispredicts ();
42457 #endif
42461 /* Return nonzero when a QImode register that must be represented via a REX
42462 prefix is used. */
42463 bool
42464 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
42466 int i;
42467 extract_insn_cached (insn);
42468 for (i = 0; i < recog_data.n_operands; i++)
42469 if (GENERAL_REG_P (recog_data.operand[i])
42470 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
42471 return true;
42472 return false;
42475 /* Return true when INSN mentions a register that must be encoded using a
42476 REX prefix. */
42477 bool
42478 x86_extended_reg_mentioned_p (rtx insn)
42480 subrtx_iterator::array_type array;
42481 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
42483 const_rtx x = *iter;
42484 if (REG_P (x)
42485 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
42486 return true;
42488 return false;
42491 /* If profitable, negate (without causing overflow) integer constant
42492 of mode MODE at location LOC. Return true in this case. */
42493 bool
42494 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
42496 HOST_WIDE_INT val;
42498 if (!CONST_INT_P (*loc))
42499 return false;
42501 switch (mode)
42503 case DImode:
42504 /* DImode x86_64 constants must fit in 32 bits. */
42505 gcc_assert (x86_64_immediate_operand (*loc, mode));
42507 mode = SImode;
42508 break;
42510 case SImode:
42511 case HImode:
42512 case QImode:
42513 break;
42515 default:
42516 gcc_unreachable ();
42519 /* Avoid overflows. */
42520 if (mode_signbit_p (mode, *loc))
42521 return false;
42523 val = INTVAL (*loc);
42525 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
42526 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
42527 if ((val < 0 && val != -128)
42528 || val == 128)
42530 *loc = GEN_INT (-val);
42531 return true;
42534 return false;
42537 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
42538 optabs would emit if we didn't have TFmode patterns. */
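/* The expansion below uses the usual unsigned-to-float trick: if the input
   is non-negative when interpreted as a signed value, a plain signed
   conversion is correct; otherwise shift the value right by one, OR the
   dropped low bit back in so the final rounding stays correct, convert the
   halved value and double the result.  */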
42540 void
42541 x86_emit_floatuns (rtx operands[2])
42543 rtx_code_label *neglab, *donelab;
42544 rtx i0, i1, f0, in, out;
42545 machine_mode mode, inmode;
42547 inmode = GET_MODE (operands[1]);
42548 gcc_assert (inmode == SImode || inmode == DImode);
42550 out = operands[0];
42551 in = force_reg (inmode, operands[1]);
42552 mode = GET_MODE (out);
42553 neglab = gen_label_rtx ();
42554 donelab = gen_label_rtx ();
42555 f0 = gen_reg_rtx (mode);
42557 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
42559 expand_float (out, in, 0);
42561 emit_jump_insn (gen_jump (donelab));
42562 emit_barrier ();
42564 emit_label (neglab);
42566 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
42567 1, OPTAB_DIRECT);
42568 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
42569 1, OPTAB_DIRECT);
42570 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
42572 expand_float (f0, i0, 0);
42574 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
42576 emit_label (donelab);
42579 static bool canonicalize_perm (struct expand_vec_perm_d *d);
42580 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
42581 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
42582 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
42584 /* Get a vector mode of the same size as the original but with elements
42585 twice as wide. This is only guaranteed to apply to integral vectors. */
42587 static inline machine_mode
42588 get_mode_wider_vector (machine_mode o)
42590 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
42591 machine_mode n = GET_MODE_WIDER_MODE (o);
42592 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
42593 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
42594 return n;
42597 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
42598 fill target with val via vec_duplicate. */
42600 static bool
42601 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
42603 bool ok;
42604 rtx_insn *insn;
42605 rtx dup;
42607 /* First attempt to recognize VAL as-is. */
42608 dup = gen_rtx_VEC_DUPLICATE (mode, val);
42609 insn = emit_insn (gen_rtx_SET (target, dup));
42610 if (recog_memoized (insn) < 0)
42612 rtx_insn *seq;
42613 machine_mode innermode = GET_MODE_INNER (mode);
42614 rtx reg;
42616 /* If that fails, force VAL into a register. */
42618 start_sequence ();
42619 reg = force_reg (innermode, val);
42620 if (GET_MODE (reg) != innermode)
42621 reg = gen_lowpart (innermode, reg);
42622 XEXP (dup, 0) = reg;
42623 seq = get_insns ();
42624 end_sequence ();
42625 if (seq)
42626 emit_insn_before (seq, insn);
42628 ok = recog_memoized (insn) >= 0;
42629 gcc_assert (ok);
42631 return true;
42634 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42635 with all elements equal to VAL. Return true if successful. */
42637 static bool
42638 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
42639 rtx target, rtx val)
42641 bool ok;
42643 switch (mode)
42645 case V2SImode:
42646 case V2SFmode:
42647 if (!mmx_ok)
42648 return false;
42649 /* FALLTHRU */
42651 case V4DFmode:
42652 case V4DImode:
42653 case V8SFmode:
42654 case V8SImode:
42655 case V2DFmode:
42656 case V2DImode:
42657 case V4SFmode:
42658 case V4SImode:
42659 case V16SImode:
42660 case V8DImode:
42661 case V16SFmode:
42662 case V8DFmode:
42663 return ix86_vector_duplicate_value (mode, target, val);
42665 case V4HImode:
42666 if (!mmx_ok)
42667 return false;
42668 if (TARGET_SSE || TARGET_3DNOW_A)
42670 rtx x;
42672 val = gen_lowpart (SImode, val);
42673 x = gen_rtx_TRUNCATE (HImode, val);
42674 x = gen_rtx_VEC_DUPLICATE (mode, x);
42675 emit_insn (gen_rtx_SET (target, x));
42676 return true;
42678 goto widen;
42680 case V8QImode:
42681 if (!mmx_ok)
42682 return false;
42683 goto widen;
42685 case V8HImode:
42686 if (TARGET_AVX2)
42687 return ix86_vector_duplicate_value (mode, target, val);
42689 if (TARGET_SSE2)
42691 struct expand_vec_perm_d dperm;
42692 rtx tmp1, tmp2;
42694 permute:
42695 memset (&dperm, 0, sizeof (dperm));
42696 dperm.target = target;
42697 dperm.vmode = mode;
42698 dperm.nelt = GET_MODE_NUNITS (mode);
42699 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
42700 dperm.one_operand_p = true;
42702 /* Extend to SImode using a paradoxical SUBREG. */
42703 tmp1 = gen_reg_rtx (SImode);
42704 emit_move_insn (tmp1, gen_lowpart (SImode, val));
42706 /* Insert the SImode value as low element of a V4SImode vector. */
42707 tmp2 = gen_reg_rtx (V4SImode);
42708 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
42709 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
42711 ok = (expand_vec_perm_1 (&dperm)
42712 || expand_vec_perm_broadcast_1 (&dperm));
42713 gcc_assert (ok);
42714 return ok;
42716 goto widen;
42718 case V16QImode:
42719 if (TARGET_AVX2)
42720 return ix86_vector_duplicate_value (mode, target, val);
42722 if (TARGET_SSE2)
42723 goto permute;
42724 goto widen;
42726 widen:
42727 /* Replicate the value once into the next wider mode and recurse. */
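/* For instance, to splat a QImode value V across V8QImode without native
   support, form (V << 8) | V in HImode and splat that across V4HImode
   instead; the resulting bit pattern is identical.  */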
42729 machine_mode smode, wsmode, wvmode;
42730 rtx x;
42732 smode = GET_MODE_INNER (mode);
42733 wvmode = get_mode_wider_vector (mode);
42734 wsmode = GET_MODE_INNER (wvmode);
42736 val = convert_modes (wsmode, smode, val, true);
42737 x = expand_simple_binop (wsmode, ASHIFT, val,
42738 GEN_INT (GET_MODE_BITSIZE (smode)),
42739 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42740 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
42742 x = gen_reg_rtx (wvmode);
42743 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
42744 gcc_assert (ok);
42745 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
42746 return ok;
42749 case V16HImode:
42750 case V32QImode:
42751 if (TARGET_AVX2)
42752 return ix86_vector_duplicate_value (mode, target, val);
42753 else
42755 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
42756 rtx x = gen_reg_rtx (hvmode);
42758 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42759 gcc_assert (ok);
42761 x = gen_rtx_VEC_CONCAT (mode, x, x);
42762 emit_insn (gen_rtx_SET (target, x));
42764 return true;
42766 case V64QImode:
42767 case V32HImode:
42768 if (TARGET_AVX512BW)
42769 return ix86_vector_duplicate_value (mode, target, val);
42770 else
42772 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
42773 rtx x = gen_reg_rtx (hvmode);
42775 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42776 gcc_assert (ok);
42778 x = gen_rtx_VEC_CONCAT (mode, x, x);
42779 emit_insn (gen_rtx_SET (target, x));
42781 return true;
42783 default:
42784 return false;
42788 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42789 whose ONE_VAR element is VAR, and other elements are zero. Return true
42790 if successful. */
42792 static bool
42793 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
42794 rtx target, rtx var, int one_var)
42796 machine_mode vsimode;
42797 rtx new_target;
42798 rtx x, tmp;
42799 bool use_vector_set = false;
42801 switch (mode)
42803 case V2DImode:
42804 /* For SSE4.1, we normally use vector set. But if the second
42805 element is zero and inter-unit moves are OK, we use movq
42806 instead. */
42807 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
42808 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
42809 && one_var == 0));
42810 break;
42811 case V16QImode:
42812 case V4SImode:
42813 case V4SFmode:
42814 use_vector_set = TARGET_SSE4_1;
42815 break;
42816 case V8HImode:
42817 use_vector_set = TARGET_SSE2;
42818 break;
42819 case V4HImode:
42820 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
42821 break;
42822 case V32QImode:
42823 case V16HImode:
42824 case V8SImode:
42825 case V8SFmode:
42826 case V4DFmode:
42827 use_vector_set = TARGET_AVX;
42828 break;
42829 case V4DImode:
42830 /* Use ix86_expand_vector_set in 64-bit mode only. */
42831 use_vector_set = TARGET_AVX && TARGET_64BIT;
42832 break;
42833 default:
42834 break;
42837 if (use_vector_set)
42839 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
42840 var = force_reg (GET_MODE_INNER (mode), var);
42841 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42842 return true;
42845 switch (mode)
42847 case V2SFmode:
42848 case V2SImode:
42849 if (!mmx_ok)
42850 return false;
42851 /* FALLTHRU */
42853 case V2DFmode:
42854 case V2DImode:
42855 if (one_var != 0)
42856 return false;
42857 var = force_reg (GET_MODE_INNER (mode), var);
42858 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
42859 emit_insn (gen_rtx_SET (target, x));
42860 return true;
42862 case V4SFmode:
42863 case V4SImode:
42864 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
42865 new_target = gen_reg_rtx (mode);
42866 else
42867 new_target = target;
42868 var = force_reg (GET_MODE_INNER (mode), var);
42869 x = gen_rtx_VEC_DUPLICATE (mode, var);
42870 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
42871 emit_insn (gen_rtx_SET (new_target, x));
42872 if (one_var != 0)
42874 /* We need to shuffle the value to the correct position, so
42875 create a new pseudo to store the intermediate result. */
42877 /* With SSE2, we can use the integer shuffle insns. */
42878 if (mode != V4SFmode && TARGET_SSE2)
42880 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
42881 const1_rtx,
42882 GEN_INT (one_var == 1 ? 0 : 1),
42883 GEN_INT (one_var == 2 ? 0 : 1),
42884 GEN_INT (one_var == 3 ? 0 : 1)));
42885 if (target != new_target)
42886 emit_move_insn (target, new_target);
42887 return true;
42890 /* Otherwise convert the intermediate result to V4SFmode and
42891 use the SSE1 shuffle instructions. */
42892 if (mode != V4SFmode)
42894 tmp = gen_reg_rtx (V4SFmode);
42895 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
42897 else
42898 tmp = new_target;
42900 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
42901 const1_rtx,
42902 GEN_INT (one_var == 1 ? 0 : 1),
42903 GEN_INT (one_var == 2 ? 0+4 : 1+4),
42904 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
42906 if (mode != V4SFmode)
42907 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
42908 else if (tmp != target)
42909 emit_move_insn (target, tmp);
42911 else if (target != new_target)
42912 emit_move_insn (target, new_target);
42913 return true;
42915 case V8HImode:
42916 case V16QImode:
42917 vsimode = V4SImode;
42918 goto widen;
42919 case V4HImode:
42920 case V8QImode:
42921 if (!mmx_ok)
42922 return false;
42923 vsimode = V2SImode;
42924 goto widen;
42925 widen:
42926 if (one_var != 0)
42927 return false;
42929 /* Zero extend the variable element to SImode and recurse. */
42930 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
42932 x = gen_reg_rtx (vsimode);
42933 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
42934 var, one_var))
42935 gcc_unreachable ();
42937 emit_move_insn (target, gen_lowpart (mode, x));
42938 return true;
42940 default:
42941 return false;
42945 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42946 consisting of the values in VALS. It is known that all elements
42947 except ONE_VAR are constants. Return true if successful. */
42949 static bool
42950 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
42951 rtx target, rtx vals, int one_var)
42953 rtx var = XVECEXP (vals, 0, one_var);
42954 machine_mode wmode;
42955 rtx const_vec, x;
42957 const_vec = copy_rtx (vals);
42958 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
42959 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
42961 switch (mode)
42963 case V2DFmode:
42964 case V2DImode:
42965 case V2SFmode:
42966 case V2SImode:
42967 /* For the two-element vectors, it's just as easy to use
42968 the general case. */
42969 return false;
42971 case V4DImode:
42972 /* Use ix86_expand_vector_set in 64-bit mode only. */
42973 if (!TARGET_64BIT)
42974 return false;
42975 /* FALLTHRU */
42976 case V4DFmode:
42977 case V8SFmode:
42978 case V8SImode:
42979 case V16HImode:
42980 case V32QImode:
42981 case V4SFmode:
42982 case V4SImode:
42983 case V8HImode:
42984 case V4HImode:
42985 break;
42987 case V16QImode:
42988 if (TARGET_SSE4_1)
42989 break;
42990 wmode = V8HImode;
42991 goto widen;
42992 case V8QImode:
42993 wmode = V4HImode;
42994 goto widen;
42995 widen:
42996 /* There's no way to set one QImode entry easily. Combine
42997 the variable value with its adjacent constant value, and
42998 promote to an HImode set. */
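/* For example, setting element 5 of a V16QImode vector merges the variable
   byte with the constant byte at index 4 (one_var ^ 1) into a single HImode
   value, which is then placed with one V8HImode element set at index 2
   (one_var >> 1).  */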
42999 x = XVECEXP (vals, 0, one_var ^ 1);
43000 if (one_var & 1)
43002 var = convert_modes (HImode, QImode, var, true);
43003 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
43004 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43005 x = GEN_INT (INTVAL (x) & 0xff);
43007 else
43009 var = convert_modes (HImode, QImode, var, true);
43010 x = gen_int_mode (INTVAL (x) << 8, HImode);
43012 if (x != const0_rtx)
43013 var = expand_simple_binop (HImode, IOR, var, x, var,
43014 1, OPTAB_LIB_WIDEN);
43016 x = gen_reg_rtx (wmode);
43017 emit_move_insn (x, gen_lowpart (wmode, const_vec));
43018 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
43020 emit_move_insn (target, gen_lowpart (mode, x));
43021 return true;
43023 default:
43024 return false;
43027 emit_move_insn (target, const_vec);
43028 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43029 return true;
43032 /* A subroutine of ix86_expand_vector_init_general. Use vector
43033 concatenate to handle the most general case: all values variable,
43034 and none identical. */
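/* For example, a V8SFmode vector built from eight scalar operands is
   assembled bottom-up: the scalars are paired into four V2SFmode vectors,
   those into two V4SFmode vectors, and a final VEC_CONCAT produces the
   V8SFmode result; the switch below selects the intermediate modes.  */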
43036 static void
43037 ix86_expand_vector_init_concat (machine_mode mode,
43038 rtx target, rtx *ops, int n)
43040 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
43041 rtx first[16], second[8], third[4];
43042 rtvec v;
43043 int i, j;
43045 switch (n)
43047 case 2:
43048 switch (mode)
43050 case V16SImode:
43051 cmode = V8SImode;
43052 break;
43053 case V16SFmode:
43054 cmode = V8SFmode;
43055 break;
43056 case V8DImode:
43057 cmode = V4DImode;
43058 break;
43059 case V8DFmode:
43060 cmode = V4DFmode;
43061 break;
43062 case V8SImode:
43063 cmode = V4SImode;
43064 break;
43065 case V8SFmode:
43066 cmode = V4SFmode;
43067 break;
43068 case V4DImode:
43069 cmode = V2DImode;
43070 break;
43071 case V4DFmode:
43072 cmode = V2DFmode;
43073 break;
43074 case V4SImode:
43075 cmode = V2SImode;
43076 break;
43077 case V4SFmode:
43078 cmode = V2SFmode;
43079 break;
43080 case V2DImode:
43081 cmode = DImode;
43082 break;
43083 case V2SImode:
43084 cmode = SImode;
43085 break;
43086 case V2DFmode:
43087 cmode = DFmode;
43088 break;
43089 case V2SFmode:
43090 cmode = SFmode;
43091 break;
43092 default:
43093 gcc_unreachable ();
43096 if (!register_operand (ops[1], cmode))
43097 ops[1] = force_reg (cmode, ops[1]);
43098 if (!register_operand (ops[0], cmode))
43099 ops[0] = force_reg (cmode, ops[0]);
43100 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
43101 ops[1])));
43102 break;
43104 case 4:
43105 switch (mode)
43107 case V4DImode:
43108 cmode = V2DImode;
43109 break;
43110 case V4DFmode:
43111 cmode = V2DFmode;
43112 break;
43113 case V4SImode:
43114 cmode = V2SImode;
43115 break;
43116 case V4SFmode:
43117 cmode = V2SFmode;
43118 break;
43119 default:
43120 gcc_unreachable ();
43122 goto half;
43124 case 8:
43125 switch (mode)
43127 case V8DImode:
43128 cmode = V2DImode;
43129 hmode = V4DImode;
43130 break;
43131 case V8DFmode:
43132 cmode = V2DFmode;
43133 hmode = V4DFmode;
43134 break;
43135 case V8SImode:
43136 cmode = V2SImode;
43137 hmode = V4SImode;
43138 break;
43139 case V8SFmode:
43140 cmode = V2SFmode;
43141 hmode = V4SFmode;
43142 break;
43143 default:
43144 gcc_unreachable ();
43146 goto half;
43148 case 16:
43149 switch (mode)
43151 case V16SImode:
43152 cmode = V2SImode;
43153 hmode = V4SImode;
43154 gmode = V8SImode;
43155 break;
43156 case V16SFmode:
43157 cmode = V2SFmode;
43158 hmode = V4SFmode;
43159 gmode = V8SFmode;
43160 break;
43161 default:
43162 gcc_unreachable ();
43164 goto half;
43166 half:
43167 /* FIXME: We process inputs backward to help RA. PR 36222. */
43168 i = n - 1;
43169 j = (n >> 1) - 1;
43170 for (; i > 0; i -= 2, j--)
43172 first[j] = gen_reg_rtx (cmode);
43173 v = gen_rtvec (2, ops[i - 1], ops[i]);
43174 ix86_expand_vector_init (false, first[j],
43175 gen_rtx_PARALLEL (cmode, v));
43178 n >>= 1;
43179 if (n > 4)
43181 gcc_assert (hmode != VOIDmode);
43182 gcc_assert (gmode != VOIDmode);
43183 for (i = j = 0; i < n; i += 2, j++)
43185 second[j] = gen_reg_rtx (hmode);
43186 ix86_expand_vector_init_concat (hmode, second [j],
43187 &first [i], 2);
43189 n >>= 1;
43190 for (i = j = 0; i < n; i += 2, j++)
43192 third[j] = gen_reg_rtx (gmode);
43193 ix86_expand_vector_init_concat (gmode, third[j],
43194 &second[i], 2);
43196 n >>= 1;
43197 ix86_expand_vector_init_concat (mode, target, third, n);
43199 else if (n > 2)
43201 gcc_assert (hmode != VOIDmode);
43202 for (i = j = 0; i < n; i += 2, j++)
43204 second[j] = gen_reg_rtx (hmode);
43205 ix86_expand_vector_init_concat (hmode, second [j],
43206 &first [i], 2);
43208 n >>= 1;
43209 ix86_expand_vector_init_concat (mode, target, second, n);
43211 else
43212 ix86_expand_vector_init_concat (mode, target, first, n);
43213 break;
43215 default:
43216 gcc_unreachable ();
43220 /* A subroutine of ix86_expand_vector_init_general. Use vector
43221 interleave to handle the most general case: all values variable,
43222 and none identical. */
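/* Rough shape of the algorithm (see the code for the exact insns): each
   pair of scalar elements is packed into the low lane of its own vector,
   and successive "interleave low" (punpckl-style) steps then zip those
   partial vectors together until the full-width result is assembled.  */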
43224 static void
43225 ix86_expand_vector_init_interleave (machine_mode mode,
43226 rtx target, rtx *ops, int n)
43228 machine_mode first_imode, second_imode, third_imode, inner_mode;
43229 int i, j;
43230 rtx op0, op1;
43231 rtx (*gen_load_even) (rtx, rtx, rtx);
43232 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
43233 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
43235 switch (mode)
43237 case V8HImode:
43238 gen_load_even = gen_vec_setv8hi;
43239 gen_interleave_first_low = gen_vec_interleave_lowv4si;
43240 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43241 inner_mode = HImode;
43242 first_imode = V4SImode;
43243 second_imode = V2DImode;
43244 third_imode = VOIDmode;
43245 break;
43246 case V16QImode:
43247 gen_load_even = gen_vec_setv16qi;
43248 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
43249 gen_interleave_second_low = gen_vec_interleave_lowv4si;
43250 inner_mode = QImode;
43251 first_imode = V8HImode;
43252 second_imode = V4SImode;
43253 third_imode = V2DImode;
43254 break;
43255 default:
43256 gcc_unreachable ();
43259 for (i = 0; i < n; i++)
43261 /* Extend the odd element to SImode using a paradoxical SUBREG. */
43262 op0 = gen_reg_rtx (SImode);
43263 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
43265 /* Insert the SImode value as low element of V4SImode vector. */
43266 op1 = gen_reg_rtx (V4SImode);
43267 op0 = gen_rtx_VEC_MERGE (V4SImode,
43268 gen_rtx_VEC_DUPLICATE (V4SImode,
43269 op0),
43270 CONST0_RTX (V4SImode),
43271 const1_rtx);
43272 emit_insn (gen_rtx_SET (op1, op0));
43274 /* Cast the V4SImode vector back to a vector in the original mode. */
43275 op0 = gen_reg_rtx (mode);
43276 emit_move_insn (op0, gen_lowpart (mode, op1));
43278 /* Load even elements into the second position. */
43279 emit_insn (gen_load_even (op0,
43280 force_reg (inner_mode,
43281 ops [i + i + 1]),
43282 const1_rtx));
43284 /* Cast vector to FIRST_IMODE vector. */
43285 ops[i] = gen_reg_rtx (first_imode);
43286 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
43289 /* Interleave low FIRST_IMODE vectors. */
43290 for (i = j = 0; i < n; i += 2, j++)
43292 op0 = gen_reg_rtx (first_imode);
43293 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
43295 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
43296 ops[j] = gen_reg_rtx (second_imode);
43297 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
43300 /* Interleave low SECOND_IMODE vectors. */
43301 switch (second_imode)
43303 case V4SImode:
43304 for (i = j = 0; i < n / 2; i += 2, j++)
43306 op0 = gen_reg_rtx (second_imode);
43307 emit_insn (gen_interleave_second_low (op0, ops[i],
43308 ops[i + 1]));
43310 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
43311 vector. */
43312 ops[j] = gen_reg_rtx (third_imode);
43313 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
43315 second_imode = V2DImode;
43316 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43317 /* FALLTHRU */
43319 case V2DImode:
43320 op0 = gen_reg_rtx (second_imode);
43321 emit_insn (gen_interleave_second_low (op0, ops[0],
43322 ops[1]));
43324 /* Cast the SECOND_IMODE vector back to a vector in the original
43325 mode. */
43326 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
43327 break;
43329 default:
43330 gcc_unreachable ();
43334 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
43335 all values variable, and none identical. */
43337 static void
43338 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
43339 rtx target, rtx vals)
43341 rtx ops[64], op0, op1, op2, op3, op4, op5;
43342 machine_mode half_mode = VOIDmode;
43343 machine_mode quarter_mode = VOIDmode;
43344 int n, i;
43346 switch (mode)
43348 case V2SFmode:
43349 case V2SImode:
43350 if (!mmx_ok && !TARGET_SSE)
43351 break;
43352 /* FALLTHRU */
43354 case V16SImode:
43355 case V16SFmode:
43356 case V8DFmode:
43357 case V8DImode:
43358 case V8SFmode:
43359 case V8SImode:
43360 case V4DFmode:
43361 case V4DImode:
43362 case V4SFmode:
43363 case V4SImode:
43364 case V2DFmode:
43365 case V2DImode:
43366 n = GET_MODE_NUNITS (mode);
43367 for (i = 0; i < n; i++)
43368 ops[i] = XVECEXP (vals, 0, i);
43369 ix86_expand_vector_init_concat (mode, target, ops, n);
43370 return;
43372 case V32QImode:
43373 half_mode = V16QImode;
43374 goto half;
43376 case V16HImode:
43377 half_mode = V8HImode;
43378 goto half;
43380 half:
43381 n = GET_MODE_NUNITS (mode);
43382 for (i = 0; i < n; i++)
43383 ops[i] = XVECEXP (vals, 0, i);
43384 op0 = gen_reg_rtx (half_mode);
43385 op1 = gen_reg_rtx (half_mode);
43386 ix86_expand_vector_init_interleave (half_mode, op0, ops,
43387 n >> 2);
43388 ix86_expand_vector_init_interleave (half_mode, op1,
43389 &ops [n >> 1], n >> 2);
43390 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
43391 return;
43393 case V64QImode:
43394 quarter_mode = V16QImode;
43395 half_mode = V32QImode;
43396 goto quarter;
43398 case V32HImode:
43399 quarter_mode = V8HImode;
43400 half_mode = V16HImode;
43401 goto quarter;
43403 quarter:
43404 n = GET_MODE_NUNITS (mode);
43405 for (i = 0; i < n; i++)
43406 ops[i] = XVECEXP (vals, 0, i);
43407 op0 = gen_reg_rtx (quarter_mode);
43408 op1 = gen_reg_rtx (quarter_mode);
43409 op2 = gen_reg_rtx (quarter_mode);
43410 op3 = gen_reg_rtx (quarter_mode);
43411 op4 = gen_reg_rtx (half_mode);
43412 op5 = gen_reg_rtx (half_mode);
43413 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
43414 n >> 3);
43415 ix86_expand_vector_init_interleave (quarter_mode, op1,
43416 &ops [n >> 2], n >> 3);
43417 ix86_expand_vector_init_interleave (quarter_mode, op2,
43418 &ops [n >> 1], n >> 3);
43419 ix86_expand_vector_init_interleave (quarter_mode, op3,
43420 &ops [(n >> 1) | (n >> 2)], n >> 3);
43421 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
43422 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
43423 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
43424 return;
43426 case V16QImode:
43427 if (!TARGET_SSE4_1)
43428 break;
43429 /* FALLTHRU */
43431 case V8HImode:
43432 if (!TARGET_SSE2)
43433 break;
43435 /* Don't use ix86_expand_vector_init_interleave if we can't
43436 move from GPR to SSE register directly. */
43437 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
43438 break;
43440 n = GET_MODE_NUNITS (mode);
43441 for (i = 0; i < n; i++)
43442 ops[i] = XVECEXP (vals, 0, i);
43443 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
43444 return;
43446 case V4HImode:
43447 case V8QImode:
43448 break;
43450 default:
43451 gcc_unreachable ();
43455 int i, j, n_elts, n_words, n_elt_per_word;
43456 machine_mode inner_mode;
43457 rtx words[4], shift;
43459 inner_mode = GET_MODE_INNER (mode);
43460 n_elts = GET_MODE_NUNITS (mode);
43461 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
43462 n_elt_per_word = n_elts / n_words;
43463 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
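/* Build each word by starting from the highest-indexed element belonging
   to it and repeatedly shifting the partial word left by the element width
   and OR-ing in the next lower element, so the lowest-indexed element ends
   up in the least significant bits, matching the vector's element layout.  */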
43465 for (i = 0; i < n_words; ++i)
43467 rtx word = NULL_RTX;
43469 for (j = 0; j < n_elt_per_word; ++j)
43471 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
43472 elt = convert_modes (word_mode, inner_mode, elt, true);
43474 if (j == 0)
43475 word = elt;
43476 else
43478 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
43479 word, 1, OPTAB_LIB_WIDEN);
43480 word = expand_simple_binop (word_mode, IOR, word, elt,
43481 word, 1, OPTAB_LIB_WIDEN);
43485 words[i] = word;
43488 if (n_words == 1)
43489 emit_move_insn (target, gen_lowpart (mode, words[0]));
43490 else if (n_words == 2)
43492 rtx tmp = gen_reg_rtx (mode);
43493 emit_clobber (tmp);
43494 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
43495 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
43496 emit_move_insn (target, tmp);
43498 else if (n_words == 4)
43500 rtx tmp = gen_reg_rtx (V4SImode);
43501 gcc_assert (word_mode == SImode);
43502 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
43503 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
43504 emit_move_insn (target, gen_lowpart (mode, tmp));
43506 else
43507 gcc_unreachable ();
43511 /* Initialize vector TARGET via VALS. Suppress the use of MMX
43512 instructions unless MMX_OK is true. */
43514 void
43515 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
43517 machine_mode mode = GET_MODE (target);
43518 machine_mode inner_mode = GET_MODE_INNER (mode);
43519 int n_elts = GET_MODE_NUNITS (mode);
43520 int n_var = 0, one_var = -1;
43521 bool all_same = true, all_const_zero = true;
43522 int i;
43523 rtx x;
43525 for (i = 0; i < n_elts; ++i)
43527 x = XVECEXP (vals, 0, i);
43528 if (!(CONST_SCALAR_INT_P (x)
43529 || CONST_DOUBLE_P (x)
43530 || CONST_FIXED_P (x)))
43531 n_var++, one_var = i;
43532 else if (x != CONST0_RTX (inner_mode))
43533 all_const_zero = false;
43534 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
43535 all_same = false;
43538 /* Constants are best loaded from the constant pool. */
43539 if (n_var == 0)
43541 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
43542 return;
43545 /* If all values are identical, broadcast the value. */
43546 if (all_same
43547 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
43548 XVECEXP (vals, 0, 0)))
43549 return;
43551 /* Values where only one field is non-constant are best loaded from
43552 the pool and overwritten via move later. */
43553 if (n_var == 1)
43555 if (all_const_zero
43556 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
43557 XVECEXP (vals, 0, one_var),
43558 one_var))
43559 return;
43561 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
43562 return;
43565 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
43568 void
43569 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
43571 machine_mode mode = GET_MODE (target);
43572 machine_mode inner_mode = GET_MODE_INNER (mode);
43573 machine_mode half_mode;
43574 bool use_vec_merge = false;
43575 rtx tmp;
43576 static rtx (*gen_extract[6][2]) (rtx, rtx)
43578 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
43579 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
43580 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
43581 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
43582 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
43583 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
43585 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
43587 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
43588 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
43589 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
43590 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
43591 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
43592 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
43594 int i, j, n;
43595 machine_mode mmode = VOIDmode;
43596 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
43598 switch (mode)
43600 case V2SFmode:
43601 case V2SImode:
43602 if (mmx_ok)
43604 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43605 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
43606 if (elt == 0)
43607 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43608 else
43609 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43610 emit_insn (gen_rtx_SET (target, tmp));
43611 return;
43613 break;
43615 case V2DImode:
43616 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
43617 if (use_vec_merge)
43618 break;
43620 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43621 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
43622 if (elt == 0)
43623 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43624 else
43625 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43626 emit_insn (gen_rtx_SET (target, tmp));
43627 return;
43629 case V2DFmode:
43631 rtx op0, op1;
43633 /* For the two-element vectors, we implement a VEC_CONCAT with
43634 the extraction of the other element. */
43636 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
43637 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
43639 if (elt == 0)
43640 op0 = val, op1 = tmp;
43641 else
43642 op0 = tmp, op1 = val;
43644 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
43645 emit_insn (gen_rtx_SET (target, tmp));
43647 return;
43649 case V4SFmode:
43650 use_vec_merge = TARGET_SSE4_1;
43651 if (use_vec_merge)
43652 break;
43654 switch (elt)
43656 case 0:
43657 use_vec_merge = true;
43658 break;
43660 case 1:
43661 /* tmp = target = A B C D */
43662 tmp = copy_to_reg (target);
43663 /* target = A A B B */
43664 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
43665 /* target = X A B B */
43666 ix86_expand_vector_set (false, target, val, 0);
43667 /* target = A X C D */
43668 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43669 const1_rtx, const0_rtx,
43670 GEN_INT (2+4), GEN_INT (3+4)));
43671 return;
43673 case 2:
43674 /* tmp = target = A B C D */
43675 tmp = copy_to_reg (target);
43676 /* tmp = X B C D */
43677 ix86_expand_vector_set (false, tmp, val, 0);
43678 /* target = A B X D */
43679 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43680 const0_rtx, const1_rtx,
43681 GEN_INT (0+4), GEN_INT (3+4)));
43682 return;
43684 case 3:
43685 /* tmp = target = A B C D */
43686 tmp = copy_to_reg (target);
43687 /* tmp = X B C D */
43688 ix86_expand_vector_set (false, tmp, val, 0);
43689 /* target = A B C X */
43690 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43691 const0_rtx, const1_rtx,
43692 GEN_INT (2+4), GEN_INT (0+4)));
43693 return;
43695 default:
43696 gcc_unreachable ();
43698 break;
43700 case V4SImode:
43701 use_vec_merge = TARGET_SSE4_1;
43702 if (use_vec_merge)
43703 break;
43705 /* Element 0 handled by vec_merge below. */
43706 if (elt == 0)
43708 use_vec_merge = true;
43709 break;
43712 if (TARGET_SSE2)
43714 /* With SSE2, use integer shuffles to swap element 0 and ELT,
43715 store into element 0, then shuffle them back. */
43717 rtx order[4];
43719 order[0] = GEN_INT (elt);
43720 order[1] = const1_rtx;
43721 order[2] = const2_rtx;
43722 order[3] = GEN_INT (3);
43723 order[elt] = const0_rtx;
43725 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43726 order[1], order[2], order[3]));
43728 ix86_expand_vector_set (false, target, val, 0);
43730 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43731 order[1], order[2], order[3]));
43733 else
43735 /* For SSE1, we have to reuse the V4SF code. */
43736 rtx t = gen_reg_rtx (V4SFmode);
43737 emit_move_insn (t, gen_lowpart (V4SFmode, target));
43738 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
43739 emit_move_insn (target, gen_lowpart (mode, t));
43741 return;
43743 case V8HImode:
43744 use_vec_merge = TARGET_SSE2;
43745 break;
43746 case V4HImode:
43747 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43748 break;
43750 case V16QImode:
43751 use_vec_merge = TARGET_SSE4_1;
43752 break;
43754 case V8QImode:
43755 break;
43757 case V32QImode:
43758 half_mode = V16QImode;
43759 j = 0;
43760 n = 16;
43761 goto half;
43763 case V16HImode:
43764 half_mode = V8HImode;
43765 j = 1;
43766 n = 8;
43767 goto half;
43769 case V8SImode:
43770 half_mode = V4SImode;
43771 j = 2;
43772 n = 4;
43773 goto half;
43775 case V4DImode:
43776 half_mode = V2DImode;
43777 j = 3;
43778 n = 2;
43779 goto half;
43781 case V8SFmode:
43782 half_mode = V4SFmode;
43783 j = 4;
43784 n = 4;
43785 goto half;
43787 case V4DFmode:
43788 half_mode = V2DFmode;
43789 j = 5;
43790 n = 2;
43791 goto half;
43793 half:
43794 /* Compute offset. */
43795 i = elt / n;
43796 elt %= n;
43798 gcc_assert (i <= 1);
43800 /* Extract the half. */
43801 tmp = gen_reg_rtx (half_mode);
43802 emit_insn (gen_extract[j][i] (tmp, target));
43804 /* Put val in tmp at elt. */
43805 ix86_expand_vector_set (false, tmp, val, elt);
43807 /* Put it back. */
43808 emit_insn (gen_insert[j][i] (target, target, tmp));
43809 return;
43811 case V8DFmode:
43812 if (TARGET_AVX512F)
43814 mmode = QImode;
43815 gen_blendm = gen_avx512f_blendmv8df;
43817 break;
43819 case V8DImode:
43820 if (TARGET_AVX512F)
43822 mmode = QImode;
43823 gen_blendm = gen_avx512f_blendmv8di;
43825 break;
43827 case V16SFmode:
43828 if (TARGET_AVX512F)
43830 mmode = HImode;
43831 gen_blendm = gen_avx512f_blendmv16sf;
43833 break;
43835 case V16SImode:
43836 if (TARGET_AVX512F)
43838 mmode = HImode;
43839 gen_blendm = gen_avx512f_blendmv16si;
43841 break;
43843 case V32HImode:
43844 if (TARGET_AVX512F && TARGET_AVX512BW)
43846 mmode = SImode;
43847 gen_blendm = gen_avx512bw_blendmv32hi;
43849 break;
43851 case V64QImode:
43852 if (TARGET_AVX512F && TARGET_AVX512BW)
43854 mmode = DImode;
43855 gen_blendm = gen_avx512bw_blendmv64qi;
43857 break;
43859 default:
43860 break;
43863 if (mmode != VOIDmode)
43865 tmp = gen_reg_rtx (mode);
43866 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
43867 /* The avx512*_blendm<mode> expanders have a different operand order
43868 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
43869 elements where the mask is set and the second input operand otherwise;
43870 in {sse,avx}*_*blend* the first input operand is used for elements
43871 where the mask is clear and the second input operand otherwise. */
43872 emit_insn (gen_blendm (target, target, tmp,
43873 force_reg (mmode,
43874 gen_int_mode (1 << elt, mmode))));
43876 else if (use_vec_merge)
43878 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
43879 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
43880 emit_insn (gen_rtx_SET (target, tmp));
43882 else
43884 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43886 emit_move_insn (mem, target);
43888 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43889 emit_move_insn (tmp, val);
43891 emit_move_insn (target, mem);
43895 void
43896 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
43898 machine_mode mode = GET_MODE (vec);
43899 machine_mode inner_mode = GET_MODE_INNER (mode);
43900 bool use_vec_extr = false;
43901 rtx tmp;
43903 switch (mode)
43905 case V2SImode:
43906 case V2SFmode:
43907 if (!mmx_ok)
43908 break;
43909 /* FALLTHRU */
43911 case V2DFmode:
43912 case V2DImode:
43913 use_vec_extr = true;
43914 break;
43916 case V4SFmode:
43917 use_vec_extr = TARGET_SSE4_1;
43918 if (use_vec_extr)
43919 break;
43921 switch (elt)
43923 case 0:
43924 tmp = vec;
43925 break;
43927 case 1:
43928 case 3:
43929 tmp = gen_reg_rtx (mode);
43930 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
43931 GEN_INT (elt), GEN_INT (elt),
43932 GEN_INT (elt+4), GEN_INT (elt+4)));
43933 break;
43935 case 2:
43936 tmp = gen_reg_rtx (mode);
43937 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
43938 break;
43940 default:
43941 gcc_unreachable ();
43943 vec = tmp;
43944 use_vec_extr = true;
43945 elt = 0;
43946 break;
43948 case V4SImode:
43949 use_vec_extr = TARGET_SSE4_1;
43950 if (use_vec_extr)
43951 break;
43953 if (TARGET_SSE2)
43955 switch (elt)
43957 case 0:
43958 tmp = vec;
43959 break;
43961 case 1:
43962 case 3:
43963 tmp = gen_reg_rtx (mode);
43964 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
43965 GEN_INT (elt), GEN_INT (elt),
43966 GEN_INT (elt), GEN_INT (elt)));
43967 break;
43969 case 2:
43970 tmp = gen_reg_rtx (mode);
43971 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
43972 break;
43974 default:
43975 gcc_unreachable ();
43977 vec = tmp;
43978 use_vec_extr = true;
43979 elt = 0;
43981 else
43983 /* For SSE1, we have to reuse the V4SF code. */
43984 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
43985 gen_lowpart (V4SFmode, vec), elt);
43986 return;
43988 break;
43990 case V8HImode:
43991 use_vec_extr = TARGET_SSE2;
43992 break;
43993 case V4HImode:
43994 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43995 break;
43997 case V16QImode:
43998 use_vec_extr = TARGET_SSE4_1;
43999 break;
44001 case V8SFmode:
44002 if (TARGET_AVX)
44004 tmp = gen_reg_rtx (V4SFmode);
44005 if (elt < 4)
44006 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
44007 else
44008 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
44009 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44010 return;
44012 break;
44014 case V4DFmode:
44015 if (TARGET_AVX)
44017 tmp = gen_reg_rtx (V2DFmode);
44018 if (elt < 2)
44019 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44020 else
44021 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44022 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44023 return;
44025 break;
44027 case V32QImode:
44028 if (TARGET_AVX)
44030 tmp = gen_reg_rtx (V16QImode);
44031 if (elt < 16)
44032 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44033 else
44034 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
44035 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44036 return;
44038 break;
44040 case V16HImode:
44041 if (TARGET_AVX)
44043 tmp = gen_reg_rtx (V8HImode);
44044 if (elt < 8)
44045 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
44046 else
44047 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
44048 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44049 return;
44051 break;
44053 case V8SImode:
44054 if (TARGET_AVX)
44056 tmp = gen_reg_rtx (V4SImode);
44057 if (elt < 4)
44058 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
44059 else
44060 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
44061 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44062 return;
44064 break;
44066 case V4DImode:
44067 if (TARGET_AVX)
44069 tmp = gen_reg_rtx (V2DImode);
44070 if (elt < 2)
44071 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
44072 else
44073 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
44074 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44075 return;
44077 break;
44079 case V32HImode:
44080 if (TARGET_AVX512BW)
44082 tmp = gen_reg_rtx (V16HImode);
44083 if (elt < 16)
44084 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
44085 else
44086 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
44087 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44088 return;
44090 break;
44092 case V64QImode:
44093 if (TARGET_AVX512BW)
44095 tmp = gen_reg_rtx (V32QImode);
44096 if (elt < 32)
44097 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
44098 else
44099 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
44100 ix86_expand_vector_extract (false, target, tmp, elt & 31);
44101 return;
44103 break;
44105 case V16SFmode:
44106 tmp = gen_reg_rtx (V8SFmode);
44107 if (elt < 8)
44108 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
44109 else
44110 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
44111 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44112 return;
44114 case V8DFmode:
44115 tmp = gen_reg_rtx (V4DFmode);
44116 if (elt < 4)
44117 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44118 else
44119 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44120 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44121 return;
44123 case V16SImode:
44124 tmp = gen_reg_rtx (V8SImode);
44125 if (elt < 8)
44126 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44127 else
44128 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44129 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44130 return;
44132 case V8DImode:
44133 tmp = gen_reg_rtx (V4DImode);
44134 if (elt < 4)
44135 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44136 else
44137 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44138 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44139 return;
44141 case V8QImode:
44142 /* ??? Could extract the appropriate HImode element and shift. */
44143 default:
44144 break;
44147 if (use_vec_extr)
44149 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44150 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44152 /* Let the rtl optimizers know about the zero extension performed. */
44153 if (inner_mode == QImode || inner_mode == HImode)
44155 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44156 target = gen_lowpart (SImode, target);
44159 emit_insn (gen_rtx_SET (target, tmp));
44161 else
44163 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44165 emit_move_insn (mem, vec);
44167 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44168 emit_move_insn (target, tmp);
44172 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
44173 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
44174 The upper bits of DEST are undefined, though they shouldn't cause
44175 exceptions (some bits from src or all zeros are ok). */
44177 static void
44178 emit_reduc_half (rtx dest, rtx src, int i)
44180 rtx tem, d = dest;
44181 switch (GET_MODE (src))
44183 case V4SFmode:
44184 if (i == 128)
44185 tem = gen_sse_movhlps (dest, src, src);
44186 else
44187 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
44188 GEN_INT (1 + 4), GEN_INT (1 + 4));
44189 break;
44190 case V2DFmode:
44191 tem = gen_vec_interleave_highv2df (dest, src, src);
44192 break;
44193 case V16QImode:
44194 case V8HImode:
44195 case V4SImode:
44196 case V2DImode:
44197 d = gen_reg_rtx (V1TImode);
44198 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
44199 GEN_INT (i / 2));
44200 break;
44201 case V8SFmode:
44202 if (i == 256)
44203 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
44204 else
44205 tem = gen_avx_shufps256 (dest, src, src,
44206 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
44207 break;
44208 case V4DFmode:
44209 if (i == 256)
44210 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
44211 else
44212 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
44213 break;
44214 case V32QImode:
44215 case V16HImode:
44216 case V8SImode:
44217 case V4DImode:
44218 if (i == 256)
44220 if (GET_MODE (dest) != V4DImode)
44221 d = gen_reg_rtx (V4DImode);
44222 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
44223 gen_lowpart (V4DImode, src),
44224 const1_rtx);
44226 else
44228 d = gen_reg_rtx (V2TImode);
44229 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
44230 GEN_INT (i / 2));
44232 break;
44233 case V64QImode:
44234 case V32HImode:
44235 case V16SImode:
44236 case V16SFmode:
44237 case V8DImode:
44238 case V8DFmode:
44239 if (i > 128)
44240 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
44241 gen_lowpart (V16SImode, src),
44242 gen_lowpart (V16SImode, src),
44243 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
44244 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
44245 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
44246 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
44247 GEN_INT (0xC), GEN_INT (0xD),
44248 GEN_INT (0xE), GEN_INT (0xF),
44249 GEN_INT (0x10), GEN_INT (0x11),
44250 GEN_INT (0x12), GEN_INT (0x13),
44251 GEN_INT (0x14), GEN_INT (0x15),
44252 GEN_INT (0x16), GEN_INT (0x17));
44253 else
44254 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
44255 gen_lowpart (V16SImode, src),
44256 GEN_INT (i == 128 ? 0x2 : 0x1),
44257 GEN_INT (0x3),
44258 GEN_INT (0x3),
44259 GEN_INT (0x3),
44260 GEN_INT (i == 128 ? 0x6 : 0x5),
44261 GEN_INT (0x7),
44262 GEN_INT (0x7),
44263 GEN_INT (0x7),
44264 GEN_INT (i == 128 ? 0xA : 0x9),
44265 GEN_INT (0xB),
44266 GEN_INT (0xB),
44267 GEN_INT (0xB),
44268 GEN_INT (i == 128 ? 0xE : 0xD),
44269 GEN_INT (0xF),
44270 GEN_INT (0xF),
44271 GEN_INT (0xF));
44272 break;
44273 default:
44274 gcc_unreachable ();
44276 emit_insn (tem);
44277 if (d != dest)
44278 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
44281 /* Expand a vector reduction. FN is the binary pattern to reduce;
44282 DEST is the destination; IN is the input vector. */
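/* The expansion folds the upper half of the vector onto the lower half
   with emit_reduc_half and combines the two with FN, halving the width
   of the interesting data on each iteration; after log2(nelts) steps the
   fully reduced value is left in the low element of DEST.  */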
44284 void
44285 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
44287 rtx half, dst, vec = in;
44288 machine_mode mode = GET_MODE (in);
44289 int i;
44291 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
44292 if (TARGET_SSE4_1
44293 && mode == V8HImode
44294 && fn == gen_uminv8hi3)
44296 emit_insn (gen_sse4_1_phminposuw (dest, in));
44297 return;
44300 for (i = GET_MODE_BITSIZE (mode);
44301 i > GET_MODE_UNIT_BITSIZE (mode);
44302 i >>= 1)
44304 half = gen_reg_rtx (mode);
44305 emit_reduc_half (half, vec, i);
44306 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
44307 dst = dest;
44308 else
44309 dst = gen_reg_rtx (mode);
44310 emit_insn (fn (dst, half, vec));
44311 vec = dst;
44315 /* Target hook for scalar_mode_supported_p. */
44316 static bool
44317 ix86_scalar_mode_supported_p (machine_mode mode)
44319 if (DECIMAL_FLOAT_MODE_P (mode))
44320 return default_decimal_float_supported_p ();
44321 else if (mode == TFmode)
44322 return true;
44323 else
44324 return default_scalar_mode_supported_p (mode);
44327 /* Implements target hook vector_mode_supported_p. */
44328 static bool
44329 ix86_vector_mode_supported_p (machine_mode mode)
44331 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
44332 return true;
44333 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
44334 return true;
44335 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
44336 return true;
44337 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
44338 return true;
44339 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
44340 return true;
44341 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
44342 return true;
44343 return false;
44346 /* Target hook for c_mode_for_suffix. */
44347 static machine_mode
44348 ix86_c_mode_for_suffix (char suffix)
44350 if (suffix == 'q')
44351 return TFmode;
44352 if (suffix == 'w')
44353 return XFmode;
44355 return VOIDmode;
44358 /* Worker function for TARGET_MD_ASM_ADJUST.
44360 We implement asm flag outputs, and maintain source compatibility
44361 with the old cc0-based compiler. */
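/* An asm flag output uses a constraint of the form "=@cc<cond>", where
   <cond> is one of the condition suffixes handled below (a, ae, b, be, c,
   e, g, ge, l, le, o, p, s, z), optionally prefixed with 'n' to negate it;
   e.g. "=@ccc" yields 1 when the carry flag is set and "=@ccne" when the
   zero flag is clear.  */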
44363 static rtx_insn *
44364 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
44365 vec<const char *> &constraints,
44366 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
44368 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
44369 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
44371 bool saw_asm_flag = false;
44373 start_sequence ();
44374 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
44376 const char *con = constraints[i];
44377 if (strncmp (con, "=@cc", 4) != 0)
44378 continue;
44379 con += 4;
44380 if (strchr (con, ',') != NULL)
44382 error ("alternatives not allowed in asm flag output");
44383 continue;
44386 bool invert = false;
44387 if (con[0] == 'n')
44388 invert = true, con++;
44390 machine_mode mode = CCmode;
44391 rtx_code code = UNKNOWN;
44393 switch (con[0])
44395 case 'a':
44396 if (con[1] == 0)
44397 mode = CCAmode, code = EQ;
44398 else if (con[1] == 'e' && con[2] == 0)
44399 mode = CCCmode, code = NE;
44400 break;
44401 case 'b':
44402 if (con[1] == 0)
44403 mode = CCCmode, code = EQ;
44404 else if (con[1] == 'e' && con[2] == 0)
44405 mode = CCAmode, code = NE;
44406 break;
44407 case 'c':
44408 if (con[1] == 0)
44409 mode = CCCmode, code = EQ;
44410 break;
44411 case 'e':
44412 if (con[1] == 0)
44413 mode = CCZmode, code = EQ;
44414 break;
44415 case 'g':
44416 if (con[1] == 0)
44417 mode = CCGCmode, code = GT;
44418 else if (con[1] == 'e' && con[2] == 0)
44419 mode = CCGCmode, code = GE;
44420 break;
44421 case 'l':
44422 if (con[1] == 0)
44423 mode = CCGCmode, code = LT;
44424 else if (con[1] == 'e' && con[2] == 0)
44425 mode = CCGCmode, code = LE;
44426 break;
44427 case 'o':
44428 if (con[1] == 0)
44429 mode = CCOmode, code = EQ;
44430 break;
44431 case 'p':
44432 if (con[1] == 0)
44433 mode = CCPmode, code = EQ;
44434 break;
44435 case 's':
44436 if (con[1] == 0)
44437 mode = CCSmode, code = EQ;
44438 break;
44439 case 'z':
44440 if (con[1] == 0)
44441 mode = CCZmode, code = EQ;
44442 break;
44444 if (code == UNKNOWN)
44446 error ("unknown asm flag output %qs", constraints[i]);
44447 continue;
44449 if (invert)
44450 code = reverse_condition (code);
44452 rtx dest = outputs[i];
44453 if (!saw_asm_flag)
44455 /* This is the first asm flag output. Here we put the flags
44456 register in as the real output and adjust the condition to
44457 allow it. */
44458 constraints[i] = "=Bf";
44459 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
44460 saw_asm_flag = true;
44462 else
44464 /* We don't need the flags register as output twice. */
44465 constraints[i] = "=X";
44466 outputs[i] = gen_rtx_SCRATCH (SImode);
44469 rtx x = gen_rtx_REG (mode, FLAGS_REG);
44470 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
44472 machine_mode dest_mode = GET_MODE (dest);
44473 if (!SCALAR_INT_MODE_P (dest_mode))
44475 error ("invalid type for asm flag output");
44476 continue;
44479 if (dest_mode == DImode && !TARGET_64BIT)
44480 dest_mode = SImode;
44482 if (dest_mode != QImode)
44484 rtx destqi = gen_reg_rtx (QImode);
44485 emit_insn (gen_rtx_SET (destqi, x));
44487 if (TARGET_ZERO_EXTEND_WITH_AND
44488 && optimize_function_for_speed_p (cfun))
44490 x = force_reg (dest_mode, const0_rtx);
44492 emit_insn (gen_movstrictqi
44493 (gen_lowpart (QImode, x), destqi));
44495 else
44496 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
44499 if (dest_mode != GET_MODE (dest))
44501 rtx tmp = gen_reg_rtx (SImode);
44503 emit_insn (gen_rtx_SET (tmp, x));
44504 emit_insn (gen_zero_extendsidi2 (dest, tmp));
44506 else
44507 emit_insn (gen_rtx_SET (dest, x));
44509 rtx_insn *seq = get_insns ();
44510 end_sequence ();
44512 if (saw_asm_flag)
44513 return seq;
44514 else
44516 /* If we had no asm flag outputs, clobber the flags. */
44517 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
44518 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
44519 return NULL;
44523 /* Implements target vector targetm.asm.encode_section_info. */
44525 static void ATTRIBUTE_UNUSED
44526 ix86_encode_section_info (tree decl, rtx rtl, int first)
44528 default_encode_section_info (decl, rtl, first);
44530 if (ix86_in_large_data_p (decl))
44531 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
44534 /* Worker function for REVERSE_CONDITION. */
44536 enum rtx_code
44537 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
44539 return (mode != CCFPmode && mode != CCFPUmode
44540 ? reverse_condition (code)
44541 : reverse_condition_maybe_unordered (code));
44544 /* Output code to perform an x87 FP register move, from OPERANDS[1]
44545 to OPERANDS[0]. */
44547 const char *
44548 output_387_reg_move (rtx_insn *insn, rtx *operands)
44550 if (REG_P (operands[0]))
44552 if (REG_P (operands[1])
44553 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44555 if (REGNO (operands[0]) == FIRST_STACK_REG)
44556 return output_387_ffreep (operands, 0);
44557 return "fstp\t%y0";
44559 if (STACK_TOP_P (operands[0]))
44560 return "fld%Z1\t%y1";
44561 return "fst\t%y0";
44563 else if (MEM_P (operands[0]))
44565 gcc_assert (REG_P (operands[1]));
44566 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44567 return "fstp%Z0\t%y0";
44568 else
44570 /* There is no non-popping store to memory for XFmode.
44571 So if we need one, follow the store with a load. */
44572 if (GET_MODE (operands[0]) == XFmode)
44573 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
44574 else
44575 return "fst%Z0\t%y0";
44578 else
44579 gcc_unreachable();
44582 /* Output code to perform a conditional jump to LABEL, if C2 flag in
44583 FP status register is set. */
44585 void
44586 ix86_emit_fp_unordered_jump (rtx label)
44588 rtx reg = gen_reg_rtx (HImode);
44589 rtx temp;
44591 emit_insn (gen_x86_fnstsw_1 (reg));
44593 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
44595 emit_insn (gen_x86_sahf_1 (reg));
44597 temp = gen_rtx_REG (CCmode, FLAGS_REG);
44598 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
44600 else
44602 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
44604 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
44605 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
44608 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
44609 gen_rtx_LABEL_REF (VOIDmode, label),
44610 pc_rtx);
44611 temp = gen_rtx_SET (pc_rtx, temp);
44613 emit_jump_insn (temp);
44614 predict_jump (REG_BR_PROB_BASE * 10 / 100);
44617 /* Output code to perform a log1p XFmode calculation. */
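/* The x87 fyl2xp1 instruction computes y * log2(x + 1) but is only defined
   for |x| < 1 - sqrt(2)/2 (about 0.29289), so for larger arguments the code
   below falls back to fyl2x on 1 + x; multiplying by the fldln2 constant
   (ln 2) turns the base-2 logarithm into the natural logarithm.  */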
44619 void ix86_emit_i387_log1p (rtx op0, rtx op1)
44621 rtx_code_label *label1 = gen_label_rtx ();
44622 rtx_code_label *label2 = gen_label_rtx ();
44624 rtx tmp = gen_reg_rtx (XFmode);
44625 rtx tmp2 = gen_reg_rtx (XFmode);
44626 rtx test;
44628 emit_insn (gen_absxf2 (tmp, op1));
44629 test = gen_rtx_GE (VOIDmode, tmp,
44630 const_double_from_real_value (
44631 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
44632 XFmode));
44633 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
44635 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44636 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
44637 emit_jump (label2);
44639 emit_label (label1);
44640 emit_move_insn (tmp, CONST1_RTX (XFmode));
44641 emit_insn (gen_addxf3 (tmp, op1, tmp));
44642 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44643 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
44645 emit_label (label2);
44648 /* Emit code for round calculation. */
44649 void ix86_emit_i387_round (rtx op0, rtx op1)
44651 machine_mode inmode = GET_MODE (op1);
44652 machine_mode outmode = GET_MODE (op0);
44653 rtx e1, e2, res, tmp, tmp1, half;
44654 rtx scratch = gen_reg_rtx (HImode);
44655 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
44656 rtx_code_label *jump_label = gen_label_rtx ();
44657 rtx insn;
44658 rtx (*gen_abs) (rtx, rtx);
44659 rtx (*gen_neg) (rtx, rtx);
44661 switch (inmode)
44663 case SFmode:
44664 gen_abs = gen_abssf2;
44665 break;
44666 case DFmode:
44667 gen_abs = gen_absdf2;
44668 break;
44669 case XFmode:
44670 gen_abs = gen_absxf2;
44671 break;
44672 default:
44673 gcc_unreachable ();
44676 switch (outmode)
44678 case SFmode:
44679 gen_neg = gen_negsf2;
44680 break;
44681 case DFmode:
44682 gen_neg = gen_negdf2;
44683 break;
44684 case XFmode:
44685 gen_neg = gen_negxf2;
44686 break;
44687 case HImode:
44688 gen_neg = gen_neghi2;
44689 break;
44690 case SImode:
44691 gen_neg = gen_negsi2;
44692 break;
44693 case DImode:
44694 gen_neg = gen_negdi2;
44695 break;
44696 default:
44697 gcc_unreachable ();
44700 e1 = gen_reg_rtx (inmode);
44701 e2 = gen_reg_rtx (inmode);
44702 res = gen_reg_rtx (outmode);
44704 half = const_double_from_real_value (dconsthalf, inmode);
44706 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
44708 /* scratch = fxam(op1) */
44709 emit_insn (gen_rtx_SET (scratch,
44710 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
44711 UNSPEC_FXAM)));
44712 /* e1 = fabs(op1) */
44713 emit_insn (gen_abs (e1, op1));
44715 /* e2 = e1 + 0.5 */
44716 half = force_reg (inmode, half);
44717 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
44719 /* res = floor(e2) */
44720 if (inmode != XFmode)
44722 tmp1 = gen_reg_rtx (XFmode);
44724 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
44726 else
44727 tmp1 = e2;
44729 switch (outmode)
44731 case SFmode:
44732 case DFmode:
44734 rtx tmp0 = gen_reg_rtx (XFmode);
44736 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
44738 emit_insn (gen_rtx_SET (res,
44739 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
44740 UNSPEC_TRUNC_NOOP)));
44742 break;
44743 case XFmode:
44744 emit_insn (gen_frndintxf2_floor (res, tmp1));
44745 break;
44746 case HImode:
44747 emit_insn (gen_lfloorxfhi2 (res, tmp1));
44748 break;
44749 case SImode:
44750 emit_insn (gen_lfloorxfsi2 (res, tmp1));
44751 break;
44752 case DImode:
44753 emit_insn (gen_lfloorxfdi2 (res, tmp1));
44754 break;
44755 default:
44756 gcc_unreachable ();
44759 /* flags = signbit(a) */
44760 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
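   /* FXAM copies the sign of the operand into C1, bit 9 of the FP status
      word, which is bit 0x02 of the high status-word byte tested here.  */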
44762 /* if (flags) then res = -res */
44763 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
44764 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
44765 gen_rtx_LABEL_REF (VOIDmode, jump_label),
44766 pc_rtx);
44767 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44768 predict_jump (REG_BR_PROB_BASE * 50 / 100);
44769 JUMP_LABEL (insn) = jump_label;
44771 emit_insn (gen_neg (res, res));
44773 emit_label (jump_label);
44774 LABEL_NUSES (jump_label) = 1;
44776 emit_move_insn (op0, res);
44779 /* Output code to perform a Newton-Raphson approximation of a single precision
44780 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
44782 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
44784 rtx x0, x1, e0, e1;
44786 x0 = gen_reg_rtx (mode);
44787 e0 = gen_reg_rtx (mode);
44788 e1 = gen_reg_rtx (mode);
44789 x1 = gen_reg_rtx (mode);
44791 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
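   /* This is one Newton-Raphson step for 1/b: with x0 ~ rcp(b),
	x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0),
      which roughly doubles the number of correct bits in the estimate;
      the final result is then a * x1.  */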
44793 b = force_reg (mode, b);
44795 /* x0 = rcp(b) estimate */
44796 if (mode == V16SFmode || mode == V8DFmode)
44798 if (TARGET_AVX512ER)
44800 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44801 UNSPEC_RCP28)));
44802 /* res = a * x0 */
44803 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
44804 return;
44806 else
44807 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44808 UNSPEC_RCP14)));
44810 else
44811 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44812 UNSPEC_RCP)));
44814 /* e0 = x0 * b */
44815 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
44817 /* e0 = x0 * e0 */
44818 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
44820 /* e1 = x0 + x0 */
44821 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
44823 /* x1 = e1 - e0 */
44824 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
44826 /* res = a * x1 */
44827 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
44830 /* Output code to perform a Newton-Raphson approximation of a
44831 single precision floating point [reciprocal] square root. */
44833 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
44835 rtx x0, e0, e1, e2, e3, mthree, mhalf;
44836 REAL_VALUE_TYPE r;
44837 int unspec;
44839 x0 = gen_reg_rtx (mode);
44840 e0 = gen_reg_rtx (mode);
44841 e1 = gen_reg_rtx (mode);
44842 e2 = gen_reg_rtx (mode);
44843 e3 = gen_reg_rtx (mode);
44845 if (TARGET_AVX512ER && mode == V16SFmode)
44847 if (recip)
44848 /* res = rsqrt28(a) estimate */
44849 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44850 UNSPEC_RSQRT28)));
44851 else
44853 /* x0 = rsqrt28(a) estimate */
44854 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44855 UNSPEC_RSQRT28)));
44856 /* res = rcp28(x0) estimate */
44857 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
44858 UNSPEC_RCP28)));
44860 return;
44863 real_from_integer (&r, VOIDmode, -3, SIGNED);
44864 mthree = const_double_from_real_value (r, SFmode);
44866 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
44867 mhalf = const_double_from_real_value (r, SFmode);
44868 unspec = UNSPEC_RSQRT;
44870 if (VECTOR_MODE_P (mode))
44872 mthree = ix86_build_const_vector (mode, true, mthree);
44873 mhalf = ix86_build_const_vector (mode, true, mhalf);
44874 /* There is no 512-bit rsqrt. There is, however, rsqrt14. */
44875 if (GET_MODE_SIZE (mode) == 64)
44876 unspec = UNSPEC_RSQRT14;
44879 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
44880 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
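   /* These follow from one Newton-Raphson step for f(x) = 1/x**2 - a:
	x1 = x0 * (3 - a * x0 * x0) / 2 = -0.5 * x0 * (a * x0 * x0 - 3),
      with sqrt(a) = a * rsqrt(a) accounting for the extra factor of a
      in the sqrt form.  */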
44882 a = force_reg (mode, a);
44884 /* x0 = rsqrt(a) estimate */
44885 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44886 unspec)));
44888 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt(0.0). */
44889 if (!recip)
44891 rtx zero = force_reg (mode, CONST0_RTX(mode));
44892 rtx mask;
44894 /* Handle masked compare. */
44895 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
44897 mask = gen_reg_rtx (HImode);
44898 /* Imm value 0x4 corresponds to not-equal comparison. */
44899 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
44900 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
44902 else
44904 mask = gen_reg_rtx (mode);
44905 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
44906 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
44910 /* e0 = x0 * a */
44911 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
44912 /* e1 = e0 * x0 */
44913 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
44915 /* e2 = e1 - 3. */
44916 mthree = force_reg (mode, mthree);
44917 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
44919 mhalf = force_reg (mode, mhalf);
44920 if (recip)
44921 /* e3 = -.5 * x0 */
44922 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
44923 else
44924 /* e3 = -.5 * e0 */
44925 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
44926 /* ret = e2 * e3 */
44927 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
44930 #ifdef TARGET_SOLARIS
44931 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
44933 static void
44934 i386_solaris_elf_named_section (const char *name, unsigned int flags,
44935 tree decl)
44937 /* With Binutils 2.15, the "@unwind" marker must be specified on
44938 every occurrence of the ".eh_frame" section, not just the first
44939 one. */
44940 if (TARGET_64BIT
44941 && strcmp (name, ".eh_frame") == 0)
44943 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
44944 flags & SECTION_WRITE ? "aw" : "a");
44945 return;
44948 #ifndef USE_GAS
44949 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
44951 solaris_elf_asm_comdat_section (name, flags, decl);
44952 return;
44954 #endif
44956 default_elf_asm_named_section (name, flags, decl);
44958 #endif /* TARGET_SOLARIS */
44960 /* Return the mangling of TYPE if it is an extended fundamental type. */
44962 static const char *
44963 ix86_mangle_type (const_tree type)
44965 type = TYPE_MAIN_VARIANT (type);
44967 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
44968 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
44969 return NULL;
44971 switch (TYPE_MODE (type))
44973 case TFmode:
44974 /* __float128 is "g". */
44975 return "g";
44976 case XFmode:
44977 /* "long double" or __float80 is "e". */
44978 return "e";
44979 default:
44980 return NULL;
44984 #ifdef TARGET_THREAD_SSP_OFFSET
44985 /* If using TLS guards, don't waste time creating and expanding
44986 __stack_chk_guard decl and MEM as we are going to ignore it. */
44987 static tree
44988 ix86_stack_protect_guard (void)
44990 if (TARGET_SSP_TLS_GUARD)
44991 return NULL_TREE;
44992 return default_stack_protect_guard ();
44994 #endif
44996 /* For 32-bit code we can save PIC register setup by using
44997 __stack_chk_fail_local hidden function instead of calling
44998 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
44999 register, so it is better to call __stack_chk_fail directly. */
45001 static tree ATTRIBUTE_UNUSED
45002 ix86_stack_protect_fail (void)
45004 return TARGET_64BIT
45005 ? default_external_stack_protect_fail ()
45006 : default_hidden_stack_protect_fail ();
45009 /* Select a format to encode pointers in exception handling data. CODE
45010 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
45011 true if the symbol may be affected by dynamic relocations.
45013 ??? All x86 object file formats are capable of representing this.
45014 After all, the relocation needed is the same as for the call insn.
45015 Whether or not a particular assembler allows us to enter such, I
45016 guess we'll have to see. */
45018 asm_preferred_eh_data_format (int code, int global)
45020 if (flag_pic)
45022 int type = DW_EH_PE_sdata8;
45023 if (!TARGET_64BIT
45024 || ix86_cmodel == CM_SMALL_PIC
45025 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
45026 type = DW_EH_PE_sdata4;
45027 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
45029 if (ix86_cmodel == CM_SMALL
45030 || (ix86_cmodel == CM_MEDIUM && code))
45031 return DW_EH_PE_udata4;
45032 return DW_EH_PE_absptr;
45035 /* Expand copysign from SIGN to the positive value ABS_VALUE
45036 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
45037 the sign-bit. */
45038 static void
45039 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
45041 machine_mode mode = GET_MODE (sign);
45042 rtx sgn = gen_reg_rtx (mode);
45043 if (mask == NULL_RTX)
45045 machine_mode vmode;
45047 if (mode == SFmode)
45048 vmode = V4SFmode;
45049 else if (mode == DFmode)
45050 vmode = V2DFmode;
45051 else
45052 vmode = mode;
45054 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
45055 if (!VECTOR_MODE_P (mode))
45057 /* We need to generate a scalar mode mask in this case. */
45058 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45059 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45060 mask = gen_reg_rtx (mode);
45061 emit_insn (gen_rtx_SET (mask, tmp));
45064 else
45065 mask = gen_rtx_NOT (mode, mask);
45066 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
45067 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
45070 /* Expand fabs (OP0) and return a new rtx that holds the result. The
45071 mask for masking out the sign-bit is stored in *SMASK, if that is
45072 non-null. */
45073 static rtx
45074 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45076 machine_mode vmode, mode = GET_MODE (op0);
45077 rtx xa, mask;
45079 xa = gen_reg_rtx (mode);
45080 if (mode == SFmode)
45081 vmode = V4SFmode;
45082 else if (mode == DFmode)
45083 vmode = V2DFmode;
45084 else
45085 vmode = mode;
45086 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45087 if (!VECTOR_MODE_P (mode))
45089 /* We need to generate a scalar mode mask in this case. */
45090 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45091 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45092 mask = gen_reg_rtx (mode);
45093 emit_insn (gen_rtx_SET (mask, tmp));
45095 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45097 if (smask)
45098 *smask = mask;
45100 return xa;
45103 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45104 swapping the operands if SWAP_OPERANDS is true. The expanded
45105 code is a forward jump to a newly created label in case the
45106 comparison is true. The generated label rtx is returned. */
45107 static rtx_code_label *
45108 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45109 bool swap_operands)
45111 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
45112 rtx_code_label *label;
45113 rtx tmp;
45115 if (swap_operands)
45116 std::swap (op0, op1);
45118 label = gen_label_rtx ();
45119 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
45120 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
45121 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
45122 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45123 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45124 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45125 JUMP_LABEL (tmp) = label;
45127 return label;
45130 /* Expand a mask-generating SSE comparison instruction comparing OP0 with OP1
45131 using comparison code CODE. Operands are swapped for the comparison if
45132 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
45133 static rtx
45134 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45135 bool swap_operands)
45137 rtx (*insn)(rtx, rtx, rtx, rtx);
45138 machine_mode mode = GET_MODE (op0);
45139 rtx mask = gen_reg_rtx (mode);
45141 if (swap_operands)
45142 std::swap (op0, op1);
45144 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45146 emit_insn (insn (mask, op0, op1,
45147 gen_rtx_fmt_ee (code, mode, op0, op1)));
45148 return mask;
45151 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45152 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
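/* Adding and then subtracting this constant rounds a nonnegative value
   smaller than 2**n to an integer in the current rounding mode: the sum
   lands in a binade whose spacing is 1.0, so the addition discards the
   fractional bits and the subtraction recovers the rounded integer.  */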
45153 static rtx
45154 ix86_gen_TWO52 (machine_mode mode)
45156 REAL_VALUE_TYPE TWO52r;
45157 rtx TWO52;
45159 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45160 TWO52 = const_double_from_real_value (TWO52r, mode);
45161 TWO52 = force_reg (mode, TWO52);
45163 return TWO52;
45166 /* Expand SSE sequence for computing lround from OP1 storing
45167 into OP0. */
45168 void
45169 ix86_expand_lround (rtx op0, rtx op1)
45171 /* C code for the stuff we're doing below:
45172 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
45173 return (long)tmp;
45175 machine_mode mode = GET_MODE (op1);
45176 const struct real_format *fmt;
45177 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45178 rtx adj;
45180 /* load nextafter (0.5, 0.0) */
45181 fmt = REAL_MODE_FORMAT (mode);
45182 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45183 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
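   /* pred_half is 0.5 - 2**(-p-1), the largest representable value below
      0.5; adding it instead of 0.5 keeps values whose fractional part is
      just under 0.5 from being rounded up to the next integer by the
      addition itself.  */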
45185 /* adj = copysign (0.5, op1) */
45186 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
45187 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
45189 /* adj = op1 + adj */
45190 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
45192 /* op0 = (imode)adj */
45193 expand_fix (op0, adj, 0);
45196 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
45197 storing into OPERAND0. */
45198 void
45199 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
45201 /* C code for the stuff we're doing below (for do_floor):
45202 xi = (long)op1;
45203 xi -= (double)xi > op1 ? 1 : 0;
45204 return xi;
45206 machine_mode fmode = GET_MODE (op1);
45207 machine_mode imode = GET_MODE (op0);
45208 rtx ireg, freg, tmp;
45209 rtx_code_label *label;
45211 /* reg = (long)op1 */
45212 ireg = gen_reg_rtx (imode);
45213 expand_fix (ireg, op1, 0);
45215 /* freg = (double)reg */
45216 freg = gen_reg_rtx (fmode);
45217 expand_float (freg, ireg, 0);
45219 /* ireg = (freg > op1) ? ireg - 1 : ireg */
45220 label = ix86_expand_sse_compare_and_jump (UNLE,
45221 freg, op1, !do_floor);
45222 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
45223 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
45224 emit_move_insn (ireg, tmp);
45226 emit_label (label);
45227 LABEL_NUSES (label) = 1;
45229 emit_move_insn (op0, ireg);
45232 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
45233 result in OPERAND0. */
45234 void
45235 ix86_expand_rint (rtx operand0, rtx operand1)
45237 /* C code for the stuff we're doing below:
45238 xa = fabs (operand1);
45239 if (!isless (xa, 2**52))
45240 return operand1;
45241 xa = xa + 2**52 - 2**52;
45242 return copysign (xa, operand1);
45244 machine_mode mode = GET_MODE (operand0);
45245 rtx res, xa, TWO52, mask;
45246 rtx_code_label *label;
45248 res = gen_reg_rtx (mode);
45249 emit_move_insn (res, operand1);
45251 /* xa = abs (operand1) */
45252 xa = ix86_expand_sse_fabs (res, &mask);
45254 /* if (!isless (xa, TWO52)) goto label; */
45255 TWO52 = ix86_gen_TWO52 (mode);
45256 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45258 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45259 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45261 ix86_sse_copysign_to_positive (res, xa, res, mask);
45263 emit_label (label);
45264 LABEL_NUSES (label) = 1;
45266 emit_move_insn (operand0, res);
45269 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45270 into OPERAND0. */
45271 void
45272 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
45274 /* C code for the stuff we expand below.
45275 double xa = fabs (x), x2;
45276 if (!isless (xa, TWO52))
45277 return x;
45278 xa = xa + TWO52 - TWO52;
45279 x2 = copysign (xa, x);
45280 Compensate. Floor:
45281 if (x2 > x)
45282 x2 -= 1;
45283 Compensate. Ceil:
45284 if (x2 < x)
45285 x2 -= -1;
45286 return x2;
45288 machine_mode mode = GET_MODE (operand0);
45289 rtx xa, TWO52, tmp, one, res, mask;
45290 rtx_code_label *label;
45292 TWO52 = ix86_gen_TWO52 (mode);
45294 /* Temporary for holding the result, initialized to the input
45295 operand to ease control flow. */
45296 res = gen_reg_rtx (mode);
45297 emit_move_insn (res, operand1);
45299 /* xa = abs (operand1) */
45300 xa = ix86_expand_sse_fabs (res, &mask);
45302 /* if (!isless (xa, TWO52)) goto label; */
45303 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45305 /* xa = xa + TWO52 - TWO52; */
45306 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45307 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45309 /* xa = copysign (xa, operand1) */
45310 ix86_sse_copysign_to_positive (xa, xa, res, mask);
45312 /* generate 1.0 or -1.0 */
45313 one = force_reg (mode,
45314 const_double_from_real_value (do_floor
45315 ? dconst1 : dconstm1, mode));
45317 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45318 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45319 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
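   /* The SSE compare yields an all-ones bit pattern where the condition
      holds and all-zeros otherwise, so ANDing it with 1.0 produces 1.0 or
      0.0 and the adjustment can be applied branch-free.  */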
45320 /* We always need to subtract here to preserve signed zero. */
45321 tmp = expand_simple_binop (mode, MINUS,
45322 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45323 emit_move_insn (res, tmp);
45325 emit_label (label);
45326 LABEL_NUSES (label) = 1;
45328 emit_move_insn (operand0, res);
45331 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45332 into OPERAND0. */
45333 void
45334 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
45336 /* C code for the stuff we expand below.
45337 double xa = fabs (x), x2;
45338 if (!isless (xa, TWO52))
45339 return x;
45340 x2 = (double)(long)x;
45341 Compensate. Floor:
45342 if (x2 > x)
45343 x2 -= 1;
45344 Compensate. Ceil:
45345 if (x2 < x)
45346 x2 += 1;
45347 if (HONOR_SIGNED_ZEROS (mode))
45348 return copysign (x2, x);
45349 return x2;
45351 machine_mode mode = GET_MODE (operand0);
45352 rtx xa, xi, TWO52, tmp, one, res, mask;
45353 rtx_code_label *label;
45355 TWO52 = ix86_gen_TWO52 (mode);
45357 /* Temporary for holding the result, initialized to the input
45358 operand to ease control flow. */
45359 res = gen_reg_rtx (mode);
45360 emit_move_insn (res, operand1);
45362 /* xa = abs (operand1) */
45363 xa = ix86_expand_sse_fabs (res, &mask);
45365 /* if (!isless (xa, TWO52)) goto label; */
45366 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45368 /* xa = (double)(long)x */
45369 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45370 expand_fix (xi, res, 0);
45371 expand_float (xa, xi, 0);
45373 /* generate 1.0 */
45374 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45376 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45377 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45378 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45379 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
45380 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45381 emit_move_insn (res, tmp);
45383 if (HONOR_SIGNED_ZEROS (mode))
45384 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45386 emit_label (label);
45387 LABEL_NUSES (label) = 1;
45389 emit_move_insn (operand0, res);
45392 /* Expand SSE sequence for computing round from OPERAND1 storing
45393 into OPERAND0. The sequence works without relying on DImode truncation
45394 via cvttsd2siq, which is only available on 64-bit targets. */
45395 void
45396 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
45398 /* C code for the stuff we expand below.
45399 double xa = fabs (x), xa2, x2;
45400 if (!isless (xa, TWO52))
45401 return x;
45402 Using the absolute value and copying back the sign makes
45403 -0.0 -> -0.0 correct.
45404 xa2 = xa + TWO52 - TWO52;
45405 Compensate.
45406 dxa = xa2 - xa;
45407 if (dxa <= -0.5)
45408 xa2 += 1;
45409 else if (dxa > 0.5)
45410 xa2 -= 1;
45411 x2 = copysign (xa2, x);
45412 return x2;
45414 machine_mode mode = GET_MODE (operand0);
45415 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
45416 rtx_code_label *label;
45418 TWO52 = ix86_gen_TWO52 (mode);
45420 /* Temporary for holding the result, initialized to the input
45421 operand to ease control flow. */
45422 res = gen_reg_rtx (mode);
45423 emit_move_insn (res, operand1);
45425 /* xa = abs (operand1) */
45426 xa = ix86_expand_sse_fabs (res, &mask);
45428 /* if (!isless (xa, TWO52)) goto label; */
45429 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45431 /* xa2 = xa + TWO52 - TWO52; */
45432 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45433 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
45435 /* dxa = xa2 - xa; */
45436 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
45438 /* generate 0.5, 1.0 and -0.5 */
45439 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
45440 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
45441 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
45442 0, OPTAB_DIRECT);
45444 /* Compensate. */
45445 tmp = gen_reg_rtx (mode);
45446 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
45447 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
45448 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45449 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45450 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
45451 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
45452 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45453 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45455 /* res = copysign (xa2, operand1) */
45456 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
45458 emit_label (label);
45459 LABEL_NUSES (label) = 1;
45461 emit_move_insn (operand0, res);
45464 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45465 into OPERAND0. */
45466 void
45467 ix86_expand_trunc (rtx operand0, rtx operand1)
45469 /* C code for SSE variant we expand below.
45470 double xa = fabs (x), x2;
45471 if (!isless (xa, TWO52))
45472 return x;
45473 x2 = (double)(long)x;
45474 if (HONOR_SIGNED_ZEROS (mode))
45475 return copysign (x2, x);
45476 return x2;
45478 machine_mode mode = GET_MODE (operand0);
45479 rtx xa, xi, TWO52, res, mask;
45480 rtx_code_label *label;
45482 TWO52 = ix86_gen_TWO52 (mode);
45484 /* Temporary for holding the result, initialized to the input
45485 operand to ease control flow. */
45486 res = gen_reg_rtx (mode);
45487 emit_move_insn (res, operand1);
45489 /* xa = abs (operand1) */
45490 xa = ix86_expand_sse_fabs (res, &mask);
45492 /* if (!isless (xa, TWO52)) goto label; */
45493 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45495 /* x = (double)(long)x */
45496 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45497 expand_fix (xi, res, 0);
45498 expand_float (res, xi, 0);
45500 if (HONOR_SIGNED_ZEROS (mode))
45501 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45503 emit_label (label);
45504 LABEL_NUSES (label) = 1;
45506 emit_move_insn (operand0, res);
45509 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45510 into OPERAND0. */
45511 void
45512 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
45514 machine_mode mode = GET_MODE (operand0);
45515 rtx xa, mask, TWO52, one, res, smask, tmp;
45516 rtx_code_label *label;
45518 /* C code for SSE variant we expand below.
45519 double xa = fabs (x), x2;
45520 if (!isless (xa, TWO52))
45521 return x;
45522 xa2 = xa + TWO52 - TWO52;
45523 Compensate:
45524 if (xa2 > xa)
45525 xa2 -= 1.0;
45526 x2 = copysign (xa2, x);
45527 return x2;
45530 TWO52 = ix86_gen_TWO52 (mode);
45532 /* Temporary for holding the result, initialized to the input
45533 operand to ease control flow. */
45534 res = gen_reg_rtx (mode);
45535 emit_move_insn (res, operand1);
45537 /* xa = abs (operand1) */
45538 xa = ix86_expand_sse_fabs (res, &smask);
45540 /* if (!isless (xa, TWO52)) goto label; */
45541 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45543 /* res = xa + TWO52 - TWO52; */
45544 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45545 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
45546 emit_move_insn (res, tmp);
45548 /* generate 1.0 */
45549 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45551 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
45552 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
45553 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
45554 tmp = expand_simple_binop (mode, MINUS,
45555 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
45556 emit_move_insn (res, tmp);
45558 /* res = copysign (res, operand1) */
45559 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
45561 emit_label (label);
45562 LABEL_NUSES (label) = 1;
45564 emit_move_insn (operand0, res);
45567 /* Expand SSE sequence for computing round from OPERAND1 storing
45568 into OPERAND0. */
45569 void
45570 ix86_expand_round (rtx operand0, rtx operand1)
45572 /* C code for the stuff we're doing below:
45573 double xa = fabs (x);
45574 if (!isless (xa, TWO52))
45575 return x;
45576 xa = (double)(long)(xa + nextafter (0.5, 0.0));
45577 return copysign (xa, x);
45579 machine_mode mode = GET_MODE (operand0);
45580 rtx res, TWO52, xa, xi, half, mask;
45581 rtx_code_label *label;
45582 const struct real_format *fmt;
45583 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45585 /* Temporary for holding the result, initialized to the input
45586 operand to ease control flow. */
45587 res = gen_reg_rtx (mode);
45588 emit_move_insn (res, operand1);
45590 TWO52 = ix86_gen_TWO52 (mode);
45591 xa = ix86_expand_sse_fabs (res, &mask);
45592 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45594 /* load nextafter (0.5, 0.0) */
45595 fmt = REAL_MODE_FORMAT (mode);
45596 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45597 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
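/* pred_half is the largest representable value below 0.5; adding exactly
0.5 could round a value just under 0.5 up to 1.0 before the truncation
and give the wrong result. */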
45599 /* xa = xa + 0.5 */
45600 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
45601 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
45603 /* xa = (double)(int64_t)xa */
45604 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45605 expand_fix (xi, xa, 0);
45606 expand_float (xa, xi, 0);
45608 /* res = copysign (xa, operand1) */
45609 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
45611 emit_label (label);
45612 LABEL_NUSES (label) = 1;
45614 emit_move_insn (operand0, res);
45617 /* Expand SSE sequence for computing round
45618 from OP1 storing into OP0 using sse4 round insn. */
45619 void
45620 ix86_expand_round_sse4 (rtx op0, rtx op1)
45622 machine_mode mode = GET_MODE (op0);
45623 rtx e1, e2, res, half;
45624 const struct real_format *fmt;
45625 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45626 rtx (*gen_copysign) (rtx, rtx, rtx);
45627 rtx (*gen_round) (rtx, rtx, rtx);
45629 switch (mode)
45631 case SFmode:
45632 gen_copysign = gen_copysignsf3;
45633 gen_round = gen_sse4_1_roundsf2;
45634 break;
45635 case DFmode:
45636 gen_copysign = gen_copysigndf3;
45637 gen_round = gen_sse4_1_rounddf2;
45638 break;
45639 default:
45640 gcc_unreachable ();
45643 /* round (a) = trunc (a + copysign (0.5, a)) */
45645 /* load nextafter (0.5, 0.0) */
45646 fmt = REAL_MODE_FORMAT (mode);
45647 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45648 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45649 half = const_double_from_real_value (pred_half, mode);
45651 /* e1 = copysign (0.5, op1) */
45652 e1 = gen_reg_rtx (mode);
45653 emit_insn (gen_copysign (e1, half, op1));
45655 /* e2 = op1 + e1 */
45656 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
45658 /* res = trunc (e2) */
45659 res = gen_reg_rtx (mode);
45660 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
45662 emit_move_insn (op0, res);
45666 /* Table of valid machine attributes. */
45667 static const struct attribute_spec ix86_attribute_table[] =
45669 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
45670 affects_type_identity } */
45671 /* Stdcall attribute says callee is responsible for popping arguments
45672 if they are not variable. */
45673 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45674 true },
45675 /* Fastcall attribute says callee is responsible for popping arguments
45676 if they are not variable. */
45677 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45678 true },
45679 /* Thiscall attribute says callee is responsible for popping arguments
45680 if they are not variable. */
45681 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45682 true },
45683 /* Cdecl attribute says the callee is a normal C declaration */
45684 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45685 true },
45686 /* Regparm attribute specifies how many integer arguments are to be
45687 passed in registers. */
45688 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
45689 true },
45690 /* Sseregparm attribute says we are using x86_64 calling conventions
45691 for FP arguments. */
45692 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45693 true },
45694 /* The transactional memory builtins are implicitly regparm or fastcall
45695 depending on the ABI. Override the generic do-nothing attribute that
45696 these builtins were declared with. */
45697 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
45698 true },
45699 /* force_align_arg_pointer says this function realigns the stack at entry. */
45700 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
45701 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
45702 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
45703 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
45704 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
45705 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
45706 false },
45707 #endif
45708 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
45709 false },
45710 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
45711 false },
45712 #ifdef SUBTARGET_ATTRIBUTE_TABLE
45713 SUBTARGET_ATTRIBUTE_TABLE,
45714 #endif
45715 /* ms_abi and sysv_abi calling convention function attributes. */
45716 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
45717 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
45718 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
45719 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
45720 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
45721 false },
45722 { "callee_pop_aggregate_return", 1, 1, false, true, true,
45723 ix86_handle_callee_pop_aggregate_return, true },
45724 { "interrupt", 0, 0, false, true, true,
45725 ix86_handle_interrupt_attribute, false },
45726 { "no_caller_saved_registers", 0, 0, false, true, true,
45727 ix86_handle_no_caller_saved_registers_attribute, false },
45729 /* End element. */
45730 { NULL, 0, 0, false, false, false, NULL, false }
45733 /* Implement targetm.vectorize.builtin_vectorization_cost. */
45734 static int
45735 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
45736 tree vectype, int)
45738 switch (type_of_cost)
45740 case scalar_stmt:
45741 return ix86_cost->scalar_stmt_cost;
45743 case scalar_load:
45744 return ix86_cost->scalar_load_cost;
45746 case scalar_store:
45747 return ix86_cost->scalar_store_cost;
45749 case vector_stmt:
45750 return ix86_cost->vec_stmt_cost;
45752 case vector_load:
45753 return ix86_cost->vec_align_load_cost;
45755 case vector_store:
45756 return ix86_cost->vec_store_cost;
45758 case vec_to_scalar:
45759 return ix86_cost->vec_to_scalar_cost;
45761 case scalar_to_vec:
45762 return ix86_cost->scalar_to_vec_cost;
45764 case unaligned_load:
45765 case unaligned_store:
45766 return ix86_cost->vec_unalign_load_cost;
45768 case cond_branch_taken:
45769 return ix86_cost->cond_taken_branch_cost;
45771 case cond_branch_not_taken:
45772 return ix86_cost->cond_not_taken_branch_cost;
45774 case vec_perm:
45775 case vec_promote_demote:
45776 return ix86_cost->vec_stmt_cost;
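/* An N element constructor is costed as N - 1 vector statements,
roughly one insert per additional element. */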
45778 case vec_construct:
45779 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
45781 default:
45782 gcc_unreachable ();
45786 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
45787 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
45788 insn every time. */
45790 static GTY(()) rtx_insn *vselect_insn;
45792 /* Initialize vselect_insn. */
45794 static void
45795 init_vselect_insn (void)
45797 unsigned i;
45798 rtx x;
45800 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
45801 for (i = 0; i < MAX_VECT_LEN; ++i)
45802 XVECEXP (x, 0, i) = const0_rtx;
45803 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
45804 const0_rtx), x);
45805 x = gen_rtx_SET (const0_rtx, x);
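/* Emit the placeholder insn inside a scratch sequence so it never lands
in the real insn stream; expand_vselect rewrites its operands in place. */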
45806 start_sequence ();
45807 vselect_insn = emit_insn (x);
45808 end_sequence ();
45811 /* Construct (set target (vec_select op0 (parallel perm))) and
45812 return true if that's a valid instruction in the active ISA. */
45814 static bool
45815 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
45816 unsigned nelt, bool testing_p)
45818 unsigned int i;
45819 rtx x, save_vconcat;
45820 int icode;
45822 if (vselect_insn == NULL_RTX)
45823 init_vselect_insn ();
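/* Fill the cached insn with the requested selector, operand and mode,
then ask recog whether the active ISA has a matching pattern. */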
45825 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
45826 PUT_NUM_ELEM (XVEC (x, 0), nelt);
45827 for (i = 0; i < nelt; ++i)
45828 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
45829 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45830 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
45831 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
45832 SET_DEST (PATTERN (vselect_insn)) = target;
45833 icode = recog_memoized (vselect_insn);
45835 if (icode >= 0 && !testing_p)
45836 emit_insn (copy_rtx (PATTERN (vselect_insn)));
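/* Put the cached insn back into its neutral state for the next caller. */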
45838 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
45839 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
45840 INSN_CODE (vselect_insn) = -1;
45842 return icode >= 0;
45845 /* Similar, but generate a vec_concat from op0 and op1 as well. */
45847 static bool
45848 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
45849 const unsigned char *perm, unsigned nelt,
45850 bool testing_p)
45852 machine_mode v2mode;
45853 rtx x;
45854 bool ok;
45856 if (vselect_insn == NULL_RTX)
45857 init_vselect_insn ();
45859 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
45860 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45861 PUT_MODE (x, v2mode);
45862 XEXP (x, 0) = op0;
45863 XEXP (x, 1) = op1;
45864 ok = expand_vselect (target, x, perm, nelt, testing_p);
45865 XEXP (x, 0) = const0_rtx;
45866 XEXP (x, 1) = const0_rtx;
45867 return ok;
45870 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45871 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
45873 static bool
45874 expand_vec_perm_blend (struct expand_vec_perm_d *d)
45876 machine_mode mmode, vmode = d->vmode;
45877 unsigned i, mask, nelt = d->nelt;
45878 rtx target, op0, op1, maskop, x;
45879 rtx rperm[32], vperm;
45881 if (d->one_operand_p)
45882 return false;
45883 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
45884 && (TARGET_AVX512BW
45885 || GET_MODE_UNIT_SIZE (vmode) >= 4))
45887 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45889 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45891 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45893 else
45894 return false;
45896 /* This is a blend, not a permute. Elements must stay in their
45897 respective lanes. */
45898 for (i = 0; i < nelt; ++i)
45900 unsigned e = d->perm[i];
45901 if (!(e == i || e == i + nelt))
45902 return false;
45905 if (d->testing_p)
45906 return true;
45908 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
45909 decision should be extracted elsewhere, so that we only try that
45910 sequence once all budget==3 options have been tried. */
45911 target = d->target;
45912 op0 = d->op0;
45913 op1 = d->op1;
45914 mask = 0;
45916 switch (vmode)
45918 case V8DFmode:
45919 case V16SFmode:
45920 case V4DFmode:
45921 case V8SFmode:
45922 case V2DFmode:
45923 case V4SFmode:
45924 case V8HImode:
45925 case V8SImode:
45926 case V32HImode:
45927 case V64QImode:
45928 case V16SImode:
45929 case V8DImode:
45930 for (i = 0; i < nelt; ++i)
45931 mask |= (d->perm[i] >= nelt) << i;
45932 break;
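/* Blend V2DImode and V4SImode through V8HImode: widen each selector bit
so it covers all the 16-bit pieces of the original element. */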
45934 case V2DImode:
45935 for (i = 0; i < 2; ++i)
45936 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
45937 vmode = V8HImode;
45938 goto do_subreg;
45940 case V4SImode:
45941 for (i = 0; i < 4; ++i)
45942 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45943 vmode = V8HImode;
45944 goto do_subreg;
45946 case V16QImode:
45947 /* See if bytes move in pairs so we can use pblendw with
45948 an immediate argument, rather than pblendvb with a vector
45949 argument. */
45950 for (i = 0; i < 16; i += 2)
45951 if (d->perm[i] + 1 != d->perm[i + 1])
45953 use_pblendvb:
45954 for (i = 0; i < nelt; ++i)
45955 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
45957 finish_pblendvb:
45958 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45959 vperm = force_reg (vmode, vperm);
45961 if (GET_MODE_SIZE (vmode) == 16)
45962 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45963 else
45964 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45965 if (target != d->target)
45966 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45967 return true;
45970 for (i = 0; i < 8; ++i)
45971 mask |= (d->perm[i * 2] >= 16) << i;
45972 vmode = V8HImode;
45973 /* FALLTHRU */
45975 do_subreg:
45976 target = gen_reg_rtx (vmode);
45977 op0 = gen_lowpart (vmode, op0);
45978 op1 = gen_lowpart (vmode, op1);
45979 break;
45981 case V32QImode:
45982 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45983 for (i = 0; i < 32; i += 2)
45984 if (d->perm[i] + 1 != d->perm[i + 1])
45985 goto use_pblendvb;
45986 /* See if bytes move in quadruplets. If yes, vpblendd
45987 with immediate can be used. */
45988 for (i = 0; i < 32; i += 4)
45989 if (d->perm[i] + 2 != d->perm[i + 2])
45990 break;
45991 if (i < 32)
45993 /* See if bytes move the same in both lanes. If yes,
45994 vpblendw with immediate can be used. */
45995 for (i = 0; i < 16; i += 2)
45996 if (d->perm[i] + 16 != d->perm[i + 16])
45997 goto use_pblendvb;
45999 /* Use vpblendw. */
46000 for (i = 0; i < 16; ++i)
46001 mask |= (d->perm[i * 2] >= 32) << i;
46002 vmode = V16HImode;
46003 goto do_subreg;
46006 /* Use vpblendd. */
46007 for (i = 0; i < 8; ++i)
46008 mask |= (d->perm[i * 4] >= 32) << i;
46009 vmode = V8SImode;
46010 goto do_subreg;
46012 case V16HImode:
46013 /* See if words move in pairs. If yes, vpblendd can be used. */
46014 for (i = 0; i < 16; i += 2)
46015 if (d->perm[i] + 1 != d->perm[i + 1])
46016 break;
46017 if (i < 16)
46019 /* See if words move the same in both lanes. If not,
46020 vpblendvb must be used. */
46021 for (i = 0; i < 8; i++)
46022 if (d->perm[i] + 8 != d->perm[i + 8])
46024 /* Use vpblendvb. */
46025 for (i = 0; i < 32; ++i)
46026 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
46028 vmode = V32QImode;
46029 nelt = 32;
46030 target = gen_reg_rtx (vmode);
46031 op0 = gen_lowpart (vmode, op0);
46032 op1 = gen_lowpart (vmode, op1);
46033 goto finish_pblendvb;
46036 /* Use vpblendw. */
46037 for (i = 0; i < 16; ++i)
46038 mask |= (d->perm[i] >= 16) << i;
46039 break;
46042 /* Use vpblendd. */
46043 for (i = 0; i < 8; ++i)
46044 mask |= (d->perm[i * 2] >= 16) << i;
46045 vmode = V8SImode;
46046 goto do_subreg;
46048 case V4DImode:
46049 /* Use vpblendd. */
46050 for (i = 0; i < 4; ++i)
46051 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46052 vmode = V8SImode;
46053 goto do_subreg;
46055 default:
46056 gcc_unreachable ();
46059 switch (vmode)
46061 case V8DFmode:
46062 case V8DImode:
46063 mmode = QImode;
46064 break;
46065 case V16SFmode:
46066 case V16SImode:
46067 mmode = HImode;
46068 break;
46069 case V32HImode:
46070 mmode = SImode;
46071 break;
46072 case V64QImode:
46073 mmode = DImode;
46074 break;
46075 default:
46076 mmode = VOIDmode;
46079 if (mmode != VOIDmode)
46080 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46081 else
46082 maskop = GEN_INT (mask);
46084 /* This matches five different patterns with the different modes. */
46085 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46086 x = gen_rtx_SET (target, x);
46087 emit_insn (x);
46088 if (target != d->target)
46089 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46091 return true;
46094 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46095 in terms of the variable form of vpermilps.
46097 Note that we will have already failed the immediate input vpermilps,
46098 which requires that the high and low part shuffle be identical; the
46099 variable form doesn't require that. */
46101 static bool
46102 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46104 rtx rperm[8], vperm;
46105 unsigned i;
46107 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46108 return false;
46110 /* We can only permute within the 128-bit lane. */
46111 for (i = 0; i < 8; ++i)
46113 unsigned e = d->perm[i];
46114 if (i < 4 ? e >= 4 : e < 4)
46115 return false;
46118 if (d->testing_p)
46119 return true;
46121 for (i = 0; i < 8; ++i)
46123 unsigned e = d->perm[i];
46125 /* Within each 128-bit lane, the elements of op0 are numbered
46126 from 0 and the elements of op1 are numbered from 4. */
46127 if (e >= 8 + 4)
46128 e -= 8;
46129 else if (e >= 4)
46130 e -= 4;
46132 rperm[i] = GEN_INT (e);
46135 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46136 vperm = force_reg (V8SImode, vperm);
46137 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46139 return true;
46142 /* Return true if permutation D can be performed as VMODE permutation
46143 instead. */
46145 static bool
46146 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46148 unsigned int i, j, chunk;
46150 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46151 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46152 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46153 return false;
46155 if (GET_MODE_NUNITS (vmode) >= d->nelt)
46156 return true;
46158 chunk = d->nelt / GET_MODE_NUNITS (vmode);
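/* Each group of CHUNK consecutive destination elements must come from a
CHUNK-aligned source position and stay consecutive, so that it maps onto
a single wider element. */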
46159 for (i = 0; i < d->nelt; i += chunk)
46160 if (d->perm[i] & (chunk - 1))
46161 return false;
46162 else
46163 for (j = 1; j < chunk; ++j)
46164 if (d->perm[i] + j != d->perm[i + j])
46165 return false;
46167 return true;
46170 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46171 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
46173 static bool
46174 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
46176 unsigned i, nelt, eltsz, mask;
46177 unsigned char perm[64];
46178 machine_mode vmode = V16QImode;
46179 rtx rperm[64], vperm, target, op0, op1;
46181 nelt = d->nelt;
46183 if (!d->one_operand_p)
46185 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
46187 if (TARGET_AVX2
46188 && valid_perm_using_mode_p (V2TImode, d))
46190 if (d->testing_p)
46191 return true;
46193 /* Use vperm2i128 insn. The pattern uses
46194 V4DImode instead of V2TImode. */
46195 target = d->target;
46196 if (d->vmode != V4DImode)
46197 target = gen_reg_rtx (V4DImode);
46198 op0 = gen_lowpart (V4DImode, d->op0);
46199 op1 = gen_lowpart (V4DImode, d->op1);
46200 rperm[0]
46201 = GEN_INT ((d->perm[0] / (nelt / 2))
46202 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
46203 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
46204 if (target != d->target)
46205 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46206 return true;
46208 return false;
46211 else
46213 if (GET_MODE_SIZE (d->vmode) == 16)
46215 if (!TARGET_SSSE3)
46216 return false;
46218 else if (GET_MODE_SIZE (d->vmode) == 32)
46220 if (!TARGET_AVX2)
46221 return false;
46223 /* V4DImode should already be handled through
46224 expand_vselect by the vpermq instruction. */
46225 gcc_assert (d->vmode != V4DImode);
46227 vmode = V32QImode;
46228 if (d->vmode == V8SImode
46229 || d->vmode == V16HImode
46230 || d->vmode == V32QImode)
46232 /* First see if vpermq can be used for
46233 V8SImode/V16HImode/V32QImode. */
46234 if (valid_perm_using_mode_p (V4DImode, d))
46236 for (i = 0; i < 4; i++)
46237 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
46238 if (d->testing_p)
46239 return true;
46240 target = gen_reg_rtx (V4DImode);
46241 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
46242 perm, 4, false))
46244 emit_move_insn (d->target,
46245 gen_lowpart (d->vmode, target));
46246 return true;
46248 return false;
46251 /* Next see if vpermd can be used. */
46252 if (valid_perm_using_mode_p (V8SImode, d))
46253 vmode = V8SImode;
46255 /* Or if vpermps can be used. */
46256 else if (d->vmode == V8SFmode)
46257 vmode = V8SImode;
46259 if (vmode == V32QImode)
46261 /* vpshufb only works intra lanes; it is not
46262 possible to shuffle bytes between the lanes. */
46263 for (i = 0; i < nelt; ++i)
46264 if ((d->perm[i] ^ i) & (nelt / 2))
46265 return false;
46268 else if (GET_MODE_SIZE (d->vmode) == 64)
46270 if (!TARGET_AVX512BW)
46271 return false;
46273 /* If vpermq didn't work, vpshufb won't work either. */
46274 if (d->vmode == V8DFmode || d->vmode == V8DImode)
46275 return false;
46277 vmode = V64QImode;
46278 if (d->vmode == V16SImode
46279 || d->vmode == V32HImode
46280 || d->vmode == V64QImode)
46282 /* First see if vpermq can be used for
46283 V16SImode/V32HImode/V64QImode. */
46284 if (valid_perm_using_mode_p (V8DImode, d))
46286 for (i = 0; i < 8; i++)
46287 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
46288 if (d->testing_p)
46289 return true;
46290 target = gen_reg_rtx (V8DImode);
46291 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
46292 perm, 8, false))
46294 emit_move_insn (d->target,
46295 gen_lowpart (d->vmode, target));
46296 return true;
46298 return false;
46301 /* Next see if vpermd can be used. */
46302 if (valid_perm_using_mode_p (V16SImode, d))
46303 vmode = V16SImode;
46305 /* Or if vpermps can be used. */
46306 else if (d->vmode == V16SFmode)
46307 vmode = V16SImode;
46308 if (vmode == V64QImode)
46310 /* vpshufb only works intra lanes; it is not
46311 possible to shuffle bytes between the lanes. */
46312 for (i = 0; i < nelt; ++i)
46313 if ((d->perm[i] ^ i) & (nelt / 4))
46314 return false;
46317 else
46318 return false;
46321 if (d->testing_p)
46322 return true;
46324 if (vmode == V8SImode)
46325 for (i = 0; i < 8; ++i)
46326 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
46327 else if (vmode == V16SImode)
46328 for (i = 0; i < 16; ++i)
46329 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
46330 else
46332 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46333 if (!d->one_operand_p)
46334 mask = 2 * nelt - 1;
46335 else if (vmode == V16QImode)
46336 mask = nelt - 1;
46337 else if (vmode == V64QImode)
46338 mask = nelt / 4 - 1;
46339 else
46340 mask = nelt / 2 - 1;
46342 for (i = 0; i < nelt; ++i)
46344 unsigned j, e = d->perm[i] & mask;
46345 for (j = 0; j < eltsz; ++j)
46346 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
46350 vperm = gen_rtx_CONST_VECTOR (vmode,
46351 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
46352 vperm = force_reg (vmode, vperm);
46354 target = d->target;
46355 if (d->vmode != vmode)
46356 target = gen_reg_rtx (vmode);
46357 op0 = gen_lowpart (vmode, d->op0);
46358 if (d->one_operand_p)
46360 if (vmode == V16QImode)
46361 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
46362 else if (vmode == V32QImode)
46363 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
46364 else if (vmode == V64QImode)
46365 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
46366 else if (vmode == V8SFmode)
46367 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
46368 else if (vmode == V8SImode)
46369 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
46370 else if (vmode == V16SFmode)
46371 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
46372 else if (vmode == V16SImode)
46373 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
46374 else
46375 gcc_unreachable ();
46377 else
46379 op1 = gen_lowpart (vmode, d->op1);
46380 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
46382 if (target != d->target)
46383 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46385 return true;
46388 /* For V*[QHS]Imode permutations, check if the same permutation
46389 can be performed in a 2x, 4x or 8x wider inner mode. */
46391 static bool
46392 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
46393 struct expand_vec_perm_d *nd)
46395 int i;
46396 enum machine_mode mode = VOIDmode;
46398 switch (d->vmode)
46400 case V16QImode: mode = V8HImode; break;
46401 case V32QImode: mode = V16HImode; break;
46402 case V64QImode: mode = V32HImode; break;
46403 case V8HImode: mode = V4SImode; break;
46404 case V16HImode: mode = V8SImode; break;
46405 case V32HImode: mode = V16SImode; break;
46406 case V4SImode: mode = V2DImode; break;
46407 case V8SImode: mode = V4DImode; break;
46408 case V16SImode: mode = V8DImode; break;
46409 default: return false;
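/* Each pair of elements must start at an even source index and move
together; otherwise the permutation cannot be expressed on double-wide
elements. */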
46411 for (i = 0; i < d->nelt; i += 2)
46412 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
46413 return false;
46414 nd->vmode = mode;
46415 nd->nelt = d->nelt / 2;
46416 for (i = 0; i < nd->nelt; i++)
46417 nd->perm[i] = d->perm[2 * i] / 2;
46418 if (GET_MODE_INNER (mode) != DImode)
46419 canonicalize_vector_int_perm (nd, nd);
46420 if (nd != d)
46422 nd->one_operand_p = d->one_operand_p;
46423 nd->testing_p = d->testing_p;
46424 if (d->op0 == d->op1)
46425 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
46426 else
46428 nd->op0 = gen_lowpart (nd->vmode, d->op0);
46429 nd->op1 = gen_lowpart (nd->vmode, d->op1);
46431 if (d->testing_p)
46432 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
46433 else
46434 nd->target = gen_reg_rtx (nd->vmode);
46436 return true;
46439 /* Try to expand one-operand permutation with constant mask. */
46441 static bool
46442 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
46444 machine_mode mode = GET_MODE (d->op0);
46445 machine_mode maskmode = mode;
46446 rtx (*gen) (rtx, rtx, rtx) = NULL;
46447 rtx target, op0, mask;
46448 rtx vec[64];
46450 if (!rtx_equal_p (d->op0, d->op1))
46451 return false;
46453 if (!TARGET_AVX512F)
46454 return false;
46456 switch (mode)
46458 case V16SImode:
46459 gen = gen_avx512f_permvarv16si;
46460 break;
46461 case V16SFmode:
46462 gen = gen_avx512f_permvarv16sf;
46463 maskmode = V16SImode;
46464 break;
46465 case V8DImode:
46466 gen = gen_avx512f_permvarv8di;
46467 break;
46468 case V8DFmode:
46469 gen = gen_avx512f_permvarv8df;
46470 maskmode = V8DImode;
46471 break;
46472 default:
46473 return false;
46476 target = d->target;
46477 op0 = d->op0;
46478 for (int i = 0; i < d->nelt; ++i)
46479 vec[i] = GEN_INT (d->perm[i]);
46480 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
46481 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
46482 return true;
46485 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
46486 in a single instruction. */
46488 static bool
46489 expand_vec_perm_1 (struct expand_vec_perm_d *d)
46491 unsigned i, nelt = d->nelt;
46492 struct expand_vec_perm_d nd;
46494 /* Check plain VEC_SELECT first, because AVX has instructions that could
46495 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
46496 input where SEL+CONCAT may not. */
46497 if (d->one_operand_p)
46499 int mask = nelt - 1;
46500 bool identity_perm = true;
46501 bool broadcast_perm = true;
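/* Classify the selector: the identity needs no shuffle at all, and an
all-zero selector is a broadcast of element 0. */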
46503 for (i = 0; i < nelt; i++)
46505 nd.perm[i] = d->perm[i] & mask;
46506 if (nd.perm[i] != i)
46507 identity_perm = false;
46508 if (nd.perm[i])
46509 broadcast_perm = false;
46512 if (identity_perm)
46514 if (!d->testing_p)
46515 emit_move_insn (d->target, d->op0);
46516 return true;
46518 else if (broadcast_perm && TARGET_AVX2)
46520 /* Use vpbroadcast{b,w,d}. */
46521 rtx (*gen) (rtx, rtx) = NULL;
46522 switch (d->vmode)
46524 case V64QImode:
46525 if (TARGET_AVX512BW)
46526 gen = gen_avx512bw_vec_dupv64qi_1;
46527 break;
46528 case V32QImode:
46529 gen = gen_avx2_pbroadcastv32qi_1;
46530 break;
46531 case V32HImode:
46532 if (TARGET_AVX512BW)
46533 gen = gen_avx512bw_vec_dupv32hi_1;
46534 break;
46535 case V16HImode:
46536 gen = gen_avx2_pbroadcastv16hi_1;
46537 break;
46538 case V16SImode:
46539 if (TARGET_AVX512F)
46540 gen = gen_avx512f_vec_dupv16si_1;
46541 break;
46542 case V8SImode:
46543 gen = gen_avx2_pbroadcastv8si_1;
46544 break;
46545 case V16QImode:
46546 gen = gen_avx2_pbroadcastv16qi;
46547 break;
46548 case V8HImode:
46549 gen = gen_avx2_pbroadcastv8hi;
46550 break;
46551 case V16SFmode:
46552 if (TARGET_AVX512F)
46553 gen = gen_avx512f_vec_dupv16sf_1;
46554 break;
46555 case V8SFmode:
46556 gen = gen_avx2_vec_dupv8sf_1;
46557 break;
46558 case V8DFmode:
46559 if (TARGET_AVX512F)
46560 gen = gen_avx512f_vec_dupv8df_1;
46561 break;
46562 case V8DImode:
46563 if (TARGET_AVX512F)
46564 gen = gen_avx512f_vec_dupv8di_1;
46565 break;
46566 /* For other modes prefer other shuffles this function creates. */
46567 default: break;
46569 if (gen != NULL)
46571 if (!d->testing_p)
46572 emit_insn (gen (d->target, d->op0));
46573 return true;
46577 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
46578 return true;
46580 /* There are plenty of patterns in sse.md that are written for
46581 SEL+CONCAT and are not replicated for a single op. Perhaps
46582 that should be changed, to avoid the nastiness here. */
46584 /* Recognize interleave style patterns, which means incrementing
46585 every other permutation operand. */
46586 for (i = 0; i < nelt; i += 2)
46588 nd.perm[i] = d->perm[i] & mask;
46589 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
46591 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46592 d->testing_p))
46593 return true;
46595 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
46596 if (nelt >= 4)
46598 for (i = 0; i < nelt; i += 4)
46600 nd.perm[i + 0] = d->perm[i + 0] & mask;
46601 nd.perm[i + 1] = d->perm[i + 1] & mask;
46602 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
46603 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
46606 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46607 d->testing_p))
46608 return true;
46612 /* Finally, try the fully general two operand permute. */
46613 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
46614 d->testing_p))
46615 return true;
46617 /* Recognize interleave style patterns with reversed operands. */
46618 if (!d->one_operand_p)
46620 for (i = 0; i < nelt; ++i)
46622 unsigned e = d->perm[i];
46623 if (e >= nelt)
46624 e -= nelt;
46625 else
46626 e += nelt;
46627 nd.perm[i] = e;
46630 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
46631 d->testing_p))
46632 return true;
46635 /* Try the SSE4.1 blend variable merge instructions. */
46636 if (expand_vec_perm_blend (d))
46637 return true;
46639 /* Try one of the AVX vpermil variable permutations. */
46640 if (expand_vec_perm_vpermil (d))
46641 return true;
46643 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
46644 vpshufb, vpermd, vpermps or vpermq variable permutation. */
46645 if (expand_vec_perm_pshufb (d))
46646 return true;
46648 /* Try the AVX2 vpalignr instruction. */
46649 if (expand_vec_perm_palignr (d, true))
46650 return true;
46652 /* Try the AVX512F vperm{s,d} instructions. */
46653 if (ix86_expand_vec_one_operand_perm_avx512 (d))
46654 return true;
46656 /* Try the AVX512F vpermi2 instructions. */
46657 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
46658 return true;
46660 /* See if we can get the same permutation in a different vector integer
46661 mode. */
46662 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
46664 if (!d->testing_p)
46665 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
46666 return true;
46668 return false;
46671 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46672 in terms of a pair of pshuflw + pshufhw instructions. */
46674 static bool
46675 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
46677 unsigned char perm2[MAX_VECT_LEN];
46678 unsigned i;
46679 bool ok;
46681 if (d->vmode != V8HImode || !d->one_operand_p)
46682 return false;
46684 /* The two permutations only operate in 64-bit lanes. */
46685 for (i = 0; i < 4; ++i)
46686 if (d->perm[i] >= 4)
46687 return false;
46688 for (i = 4; i < 8; ++i)
46689 if (d->perm[i] < 4)
46690 return false;
46692 if (d->testing_p)
46693 return true;
46695 /* Emit the pshuflw. */
46696 memcpy (perm2, d->perm, 4);
46697 for (i = 4; i < 8; ++i)
46698 perm2[i] = i;
46699 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
46700 gcc_assert (ok);
46702 /* Emit the pshufhw. */
46703 memcpy (perm2 + 4, d->perm + 4, 4);
46704 for (i = 0; i < 4; ++i)
46705 perm2[i] = i;
46706 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
46707 gcc_assert (ok);
46709 return true;
46712 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46713 the permutation using the SSSE3 palignr instruction. This succeeds
46714 when all of the elements in PERM fit within one vector and we merely
46715 need to shift them down so that a single vector permutation has a
46716 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
46717 the vpalignr instruction itself can perform the requested permutation. */
46719 static bool
46720 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
46722 unsigned i, nelt = d->nelt;
46723 unsigned min, max, minswap, maxswap;
46724 bool in_order, ok, swap = false;
46725 rtx shift, target;
46726 struct expand_vec_perm_d dcopy;
46728 /* Even with AVX, palignr only operates on 128-bit vectors;
46729 in AVX2 palignr operates on both 128-bit lanes. */
46730 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46731 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
46732 return false;
46734 min = 2 * nelt;
46735 max = 0;
46736 minswap = 2 * nelt;
46737 maxswap = 0;
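/* Find the span of referenced source elements, both as given and with the
two operands swapped; palignr can only extract one contiguous window. */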
46738 for (i = 0; i < nelt; ++i)
46740 unsigned e = d->perm[i];
46741 unsigned eswap = d->perm[i] ^ nelt;
46742 if (GET_MODE_SIZE (d->vmode) == 32)
46744 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
46745 eswap = e ^ (nelt / 2);
46747 if (e < min)
46748 min = e;
46749 if (e > max)
46750 max = e;
46751 if (eswap < minswap)
46752 minswap = eswap;
46753 if (eswap > maxswap)
46754 maxswap = eswap;
46756 if (min == 0
46757 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
46759 if (d->one_operand_p
46760 || minswap == 0
46761 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
46762 ? nelt / 2 : nelt))
46763 return false;
46764 swap = true;
46765 min = minswap;
46766 max = maxswap;
46769 /* Given that we have SSSE3, we know we'll be able to implement the
46770 single operand permutation after the palignr with pshufb for
46771 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
46772 first. */
46773 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
46774 return true;
46776 dcopy = *d;
46777 if (swap)
46779 dcopy.op0 = d->op1;
46780 dcopy.op1 = d->op0;
46781 for (i = 0; i < nelt; ++i)
46782 dcopy.perm[i] ^= nelt;
46785 in_order = true;
46786 for (i = 0; i < nelt; ++i)
46788 unsigned e = dcopy.perm[i];
46789 if (GET_MODE_SIZE (d->vmode) == 32
46790 && e >= nelt
46791 && (e & (nelt / 2 - 1)) < min)
46792 e = e - min - (nelt / 2);
46793 else
46794 e = e - min;
46795 if (e != i)
46796 in_order = false;
46797 dcopy.perm[i] = e;
46799 dcopy.one_operand_p = true;
46801 if (single_insn_only_p && !in_order)
46802 return false;
46804 /* For AVX2, test whether we can permute the result in one instruction. */
46805 if (d->testing_p)
46807 if (in_order)
46808 return true;
46809 dcopy.op1 = dcopy.op0;
46810 return expand_vec_perm_1 (&dcopy);
46813 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
46814 if (GET_MODE_SIZE (d->vmode) == 16)
46816 target = gen_reg_rtx (TImode);
46817 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
46818 gen_lowpart (TImode, dcopy.op0), shift));
46820 else
46822 target = gen_reg_rtx (V2TImode);
46823 emit_insn (gen_avx2_palignrv2ti (target,
46824 gen_lowpart (V2TImode, dcopy.op1),
46825 gen_lowpart (V2TImode, dcopy.op0),
46826 shift));
46829 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
46831 /* Test for the degenerate case where the alignment by itself
46832 produces the desired permutation. */
46833 if (in_order)
46835 emit_move_insn (d->target, dcopy.op0);
46836 return true;
46839 ok = expand_vec_perm_1 (&dcopy);
46840 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
46842 return ok;
46845 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46846 the permutation using the SSE4_1 pblendv instruction. Potentially
46847 reduces the permutation from 2 pshufbs and an ior to 1 pshufb and a pblendv. */
46849 static bool
46850 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
46852 unsigned i, which, nelt = d->nelt;
46853 struct expand_vec_perm_d dcopy, dcopy1;
46854 machine_mode vmode = d->vmode;
46855 bool ok;
46857 /* Use the same checks as in expand_vec_perm_blend. */
46858 if (d->one_operand_p)
46859 return false;
46860 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46862 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46864 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46866 else
46867 return false;
46869 /* Figure out which permutation elements do not stay in their
46870 respective lanes. */
46871 for (i = 0, which = 0; i < nelt; ++i)
46873 unsigned e = d->perm[i];
46874 if (e != i)
46875 which |= (e < nelt ? 1 : 2);
46877 /* We can pblend the part where elements do not stay in their
46878 respective lanes only when these elements are all in one
46879 half of a permutation.
46880 {0 1 8 3 4 5 9 7} is ok as 8, 9 are not at their respective
46881 lanes, but both 8 and 9 >= 8
46882 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
46883 respective lanes and 8 >= 8, but 2 is not. */
46884 if (which != 1 && which != 2)
46885 return false;
46886 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
46887 return true;
46889 /* First we apply a one operand permutation to the part where
46890 elements do not stay in their respective lanes. */
46891 dcopy = *d;
46892 if (which == 2)
46893 dcopy.op0 = dcopy.op1 = d->op1;
46894 else
46895 dcopy.op0 = dcopy.op1 = d->op0;
46896 if (!d->testing_p)
46897 dcopy.target = gen_reg_rtx (vmode);
46898 dcopy.one_operand_p = true;
46900 for (i = 0; i < nelt; ++i)
46901 dcopy.perm[i] = d->perm[i] & (nelt - 1);
46903 ok = expand_vec_perm_1 (&dcopy);
46904 if (GET_MODE_SIZE (vmode) != 16 && !ok)
46905 return false;
46906 else
46907 gcc_assert (ok);
46908 if (d->testing_p)
46909 return true;
46911 /* Next we put permuted elements into their positions. */
46912 dcopy1 = *d;
46913 if (which == 2)
46914 dcopy1.op1 = dcopy.target;
46915 else
46916 dcopy1.op0 = dcopy.target;
46918 for (i = 0; i < nelt; ++i)
46919 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
46921 ok = expand_vec_perm_blend (&dcopy1);
46922 gcc_assert (ok);
46924 return true;
46927 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
46929 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46930 a two vector permutation into a single vector permutation by using
46931 an interleave operation to merge the vectors. */
46933 static bool
46934 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
46936 struct expand_vec_perm_d dremap, dfinal;
46937 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
46938 unsigned HOST_WIDE_INT contents;
46939 unsigned char remap[2 * MAX_VECT_LEN];
46940 rtx_insn *seq;
46941 bool ok, same_halves = false;
46943 if (GET_MODE_SIZE (d->vmode) == 16)
46945 if (d->one_operand_p)
46946 return false;
46948 else if (GET_MODE_SIZE (d->vmode) == 32)
46950 if (!TARGET_AVX)
46951 return false;
46952 /* For 32-byte modes allow even d->one_operand_p.
46953 The lack of cross-lane shuffling in some instructions
46954 might prevent a single insn shuffle. */
46955 dfinal = *d;
46956 dfinal.testing_p = true;
46957 /* If expand_vec_perm_interleave3 can expand this into
46958 a 3 insn sequence, give up and let it be expanded as
46959 a 3 insn sequence. While that is one insn longer,
46960 it doesn't need a memory operand, and in the common
46961 case where the interleave low and interleave high permutations
46962 with the same operands are adjacent it needs 4 insns
46963 for both after CSE. */
46964 if (expand_vec_perm_interleave3 (&dfinal))
46965 return false;
46967 else
46968 return false;
46970 /* Examine from whence the elements come. */
46971 contents = 0;
46972 for (i = 0; i < nelt; ++i)
46973 contents |= HOST_WIDE_INT_1U << d->perm[i];
46975 memset (remap, 0xff, sizeof (remap));
46976 dremap = *d;
46978 if (GET_MODE_SIZE (d->vmode) == 16)
46980 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46982 /* Split the two input vectors into 4 halves. */
46983 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46984 h2 = h1 << nelt2;
46985 h3 = h2 << nelt2;
46986 h4 = h3 << nelt2;
46988 /* If the elements all come from the low halves, use interleave low;
46989 similarly for interleave high. If the elements are from mis-matched halves, we
46990 can use shufps for V4SF/V4SI or do a DImode shuffle. */
46991 if ((contents & (h1 | h3)) == contents)
46993 /* punpckl* */
46994 for (i = 0; i < nelt2; ++i)
46996 remap[i] = i * 2;
46997 remap[i + nelt] = i * 2 + 1;
46998 dremap.perm[i * 2] = i;
46999 dremap.perm[i * 2 + 1] = i + nelt;
47001 if (!TARGET_SSE2 && d->vmode == V4SImode)
47002 dremap.vmode = V4SFmode;
47004 else if ((contents & (h2 | h4)) == contents)
47006 /* punpckh* */
47007 for (i = 0; i < nelt2; ++i)
47009 remap[i + nelt2] = i * 2;
47010 remap[i + nelt + nelt2] = i * 2 + 1;
47011 dremap.perm[i * 2] = i + nelt2;
47012 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
47014 if (!TARGET_SSE2 && d->vmode == V4SImode)
47015 dremap.vmode = V4SFmode;
47017 else if ((contents & (h1 | h4)) == contents)
47019 /* shufps */
47020 for (i = 0; i < nelt2; ++i)
47022 remap[i] = i;
47023 remap[i + nelt + nelt2] = i + nelt2;
47024 dremap.perm[i] = i;
47025 dremap.perm[i + nelt2] = i + nelt + nelt2;
47027 if (nelt != 4)
47029 /* shufpd */
47030 dremap.vmode = V2DImode;
47031 dremap.nelt = 2;
47032 dremap.perm[0] = 0;
47033 dremap.perm[1] = 3;
47036 else if ((contents & (h2 | h3)) == contents)
47038 /* shufps */
47039 for (i = 0; i < nelt2; ++i)
47041 remap[i + nelt2] = i;
47042 remap[i + nelt] = i + nelt2;
47043 dremap.perm[i] = i + nelt2;
47044 dremap.perm[i + nelt2] = i + nelt;
47046 if (nelt != 4)
47048 /* shufpd */
47049 dremap.vmode = V2DImode;
47050 dremap.nelt = 2;
47051 dremap.perm[0] = 1;
47052 dremap.perm[1] = 2;
47055 else
47056 return false;
47058 else
47060 unsigned int nelt4 = nelt / 4, nzcnt = 0;
47061 unsigned HOST_WIDE_INT q[8];
47062 unsigned int nonzero_halves[4];
47064 /* Split the two input vectors into 8 quarters. */
47065 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47066 for (i = 1; i < 8; ++i)
47067 q[i] = q[0] << (nelt4 * i);
47068 for (i = 0; i < 4; ++i)
47069 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47071 nonzero_halves[nzcnt] = i;
47072 ++nzcnt;
47075 if (nzcnt == 1)
47077 gcc_assert (d->one_operand_p);
47078 nonzero_halves[1] = nonzero_halves[0];
47079 same_halves = true;
47081 else if (d->one_operand_p)
47083 gcc_assert (nonzero_halves[0] == 0);
47084 gcc_assert (nonzero_halves[1] == 1);
47087 if (nzcnt <= 2)
47089 if (d->perm[0] / nelt2 == nonzero_halves[1])
47091 /* Attempt to increase the likelihood that dfinal
47092 shuffle will be intra-lane. */
47093 std::swap (nonzero_halves[0], nonzero_halves[1]);
47096 /* vperm2f128 or vperm2i128. */
47097 for (i = 0; i < nelt2; ++i)
47099 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47100 remap[i + nonzero_halves[0] * nelt2] = i;
47101 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47102 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47105 if (d->vmode != V8SFmode
47106 && d->vmode != V4DFmode
47107 && d->vmode != V8SImode)
47109 dremap.vmode = V8SImode;
47110 dremap.nelt = 8;
47111 for (i = 0; i < 4; ++i)
47113 dremap.perm[i] = i + nonzero_halves[0] * 4;
47114 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47118 else if (d->one_operand_p)
47119 return false;
47120 else if (TARGET_AVX2
47121 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47123 /* vpunpckl* */
47124 for (i = 0; i < nelt4; ++i)
47126 remap[i] = i * 2;
47127 remap[i + nelt] = i * 2 + 1;
47128 remap[i + nelt2] = i * 2 + nelt2;
47129 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47130 dremap.perm[i * 2] = i;
47131 dremap.perm[i * 2 + 1] = i + nelt;
47132 dremap.perm[i * 2 + nelt2] = i + nelt2;
47133 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47136 else if (TARGET_AVX2
47137 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47139 /* vpunpckh* */
47140 for (i = 0; i < nelt4; ++i)
47142 remap[i + nelt4] = i * 2;
47143 remap[i + nelt + nelt4] = i * 2 + 1;
47144 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47145 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47146 dremap.perm[i * 2] = i + nelt4;
47147 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47148 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47149 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47152 else
47153 return false;
47156 /* Use the remapping array set up above to move the elements from their
47157 swizzled locations into their final destinations. */
47158 dfinal = *d;
47159 for (i = 0; i < nelt; ++i)
47161 unsigned e = remap[d->perm[i]];
47162 gcc_assert (e < nelt);
47163 /* If same_halves is true, both halves of the remapped vector are the
47164 same. Avoid cross-lane accesses if possible. */
47165 if (same_halves && i >= nelt2)
47167 gcc_assert (e < nelt2);
47168 dfinal.perm[i] = e + nelt2;
47170 else
47171 dfinal.perm[i] = e;
47173 if (!d->testing_p)
47175 dremap.target = gen_reg_rtx (dremap.vmode);
47176 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47178 dfinal.op1 = dfinal.op0;
47179 dfinal.one_operand_p = true;
47181 /* Test if the final remap can be done with a single insn. For V4SFmode or
47182 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
47183 start_sequence ();
47184 ok = expand_vec_perm_1 (&dfinal);
47185 seq = get_insns ();
47186 end_sequence ();
47188 if (!ok)
47189 return false;
47191 if (d->testing_p)
47192 return true;
47194 if (dremap.vmode != dfinal.vmode)
47196 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
47197 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
47200 ok = expand_vec_perm_1 (&dremap);
47201 gcc_assert (ok);
47203 emit_insn (seq);
47204 return true;
47207 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47208 a single vector cross-lane permutation into vpermq followed
47209 by any of the single insn permutations. */
47211 static bool
47212 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
47214 struct expand_vec_perm_d dremap, dfinal;
47215 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
47216 unsigned contents[2];
47217 bool ok;
47219 if (!(TARGET_AVX2
47220 && (d->vmode == V32QImode || d->vmode == V16HImode)
47221 && d->one_operand_p))
47222 return false;
47224 contents[0] = 0;
47225 contents[1] = 0;
47226 for (i = 0; i < nelt2; ++i)
47228 contents[0] |= 1u << (d->perm[i] / nelt4);
47229 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
47232 for (i = 0; i < 2; ++i)
47234 unsigned int cnt = 0;
47235 for (j = 0; j < 4; ++j)
47236 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
47237 return false;
47240 if (d->testing_p)
47241 return true;
47243 dremap = *d;
47244 dremap.vmode = V4DImode;
47245 dremap.nelt = 4;
47246 dremap.target = gen_reg_rtx (V4DImode);
47247 dremap.op0 = gen_lowpart (V4DImode, d->op0);
47248 dremap.op1 = dremap.op0;
47249 dremap.one_operand_p = true;
47250 for (i = 0; i < 2; ++i)
47252 unsigned int cnt = 0;
47253 for (j = 0; j < 4; ++j)
47254 if ((contents[i] & (1u << j)) != 0)
47255 dremap.perm[2 * i + cnt++] = j;
47256 for (; cnt < 2; ++cnt)
47257 dremap.perm[2 * i + cnt] = 0;
47260 dfinal = *d;
47261 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47262 dfinal.op1 = dfinal.op0;
47263 dfinal.one_operand_p = true;
47264 for (i = 0, j = 0; i < nelt; ++i)
47266 if (i == nelt2)
47267 j = 2;
47268 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
47269 if ((d->perm[i] / nelt4) == dremap.perm[j])
47271 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
47272 dfinal.perm[i] |= nelt4;
47273 else
47274 gcc_unreachable ();
47277 ok = expand_vec_perm_1 (&dremap);
47278 gcc_assert (ok);
47280 ok = expand_vec_perm_1 (&dfinal);
47281 gcc_assert (ok);
47283 return true;
47286 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
47287 a vector permutation using two instructions, vperm2f128 or
47288 vperm2i128 followed by any single in-lane permutation. */
47290 static bool
47291 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
47293 struct expand_vec_perm_d dfirst, dsecond;
47294 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
47295 bool ok;
47297 if (!TARGET_AVX
47298 || GET_MODE_SIZE (d->vmode) != 32
47299 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
47300 return false;
47302 dsecond = *d;
47303 dsecond.one_operand_p = false;
47304 dsecond.testing_p = true;
47306 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
47307 immediate. For perm < 16 the second permutation uses
47308 d->op0 as the first operand; for perm >= 16 it uses d->op1
47309 as first operand. The second operand is the result of
47310 vperm2[fi]128. */
47311 for (perm = 0; perm < 32; perm++)
47313 /* Ignore permutations which do not move anything cross-lane. */
47314 if (perm < 16)
47316 /* The second shuffle for e.g. V4DFmode has
47317 0123 and ABCD operands.
47318 Ignore AB23, as 23 is already in the second lane
47319 of the first operand. */
47320 if ((perm & 0xc) == (1 << 2)) continue;
47321 /* And 01CD, as 01 is in the first lane of the first
47322 operand. */
47323 if ((perm & 3) == 0) continue;
47324 /* And 4567, as then the vperm2[fi]128 doesn't change
47325 anything on the original 4567 second operand. */
47326 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
47328 else
47330 /* The second shuffle for e.g. V4DFmode has
47331 4567 and ABCD operands.
47332 Ignore AB67, as 67 is already in the second lane
47333 of the first operand. */
47334 if ((perm & 0xc) == (3 << 2)) continue;
47335 /* And 45CD, as 45 is in the first lane of the first
47336 operand. */
47337 if ((perm & 3) == 2) continue;
47338 /* And 0123, as then the vperm2[fi]128 doesn't change
47339 anything on the original 0123 first operand. */
47340 if ((perm & 0xf) == (1 << 2)) continue;
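/* Build the in-lane shuffle that would have to follow this vperm2[fi]128
immediate; bail out of this immediate if some element cannot be produced. */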
47343 for (i = 0; i < nelt; i++)
47345 j = d->perm[i] / nelt2;
47346 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
47347 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
47348 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
47349 dsecond.perm[i] = d->perm[i] & (nelt - 1);
47350 else
47351 break;
47354 if (i == nelt)
47356 start_sequence ();
47357 ok = expand_vec_perm_1 (&dsecond);
47358 end_sequence ();
47360 else
47361 ok = false;
47363 if (ok)
47365 if (d->testing_p)
47366 return true;
47368 /* Found a usable second shuffle. dfirst will be
47369 vperm2f128 on d->op0 and d->op1. */
47370 dsecond.testing_p = false;
47371 dfirst = *d;
47372 dfirst.target = gen_reg_rtx (d->vmode);
47373 for (i = 0; i < nelt; i++)
47374 dfirst.perm[i] = (i & (nelt2 - 1))
47375 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
47377 canonicalize_perm (&dfirst);
47378 ok = expand_vec_perm_1 (&dfirst);
47379 gcc_assert (ok);
47381 /* And dsecond is some single insn shuffle, taking
47382 d->op0 and result of vperm2f128 (if perm < 16) or
47383 d->op1 and result of vperm2f128 (otherwise). */
47384 if (perm >= 16)
47385 dsecond.op0 = dsecond.op1;
47386 dsecond.op1 = dfirst.target;
47388 ok = expand_vec_perm_1 (&dsecond);
47389 gcc_assert (ok);
47391 return true;
47394 /* For one operand, the only useful vperm2f128 permutation is 0x01
47395 aka lanes swap. */
47396 if (d->one_operand_p)
47397 return false;
47400 return false;
47403 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47404 a two vector permutation using 2 intra-lane interleave insns
47405 and cross-lane shuffle for 32-byte vectors. */
47407 static bool
47408 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
47410 unsigned i, nelt;
47411 rtx (*gen) (rtx, rtx, rtx);
47413 if (d->one_operand_p)
47414 return false;
47415 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
47417 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
47419 else
47420 return false;
47422 nelt = d->nelt;
47423 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
47424 return false;
47425 for (i = 0; i < nelt; i += 2)
47426 if (d->perm[i] != d->perm[0] + i / 2
47427 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
47428 return false;
47430 if (d->testing_p)
47431 return true;
47433 switch (d->vmode)
47435 case V32QImode:
47436 if (d->perm[0])
47437 gen = gen_vec_interleave_highv32qi;
47438 else
47439 gen = gen_vec_interleave_lowv32qi;
47440 break;
47441 case V16HImode:
47442 if (d->perm[0])
47443 gen = gen_vec_interleave_highv16hi;
47444 else
47445 gen = gen_vec_interleave_lowv16hi;
47446 break;
47447 case V8SImode:
47448 if (d->perm[0])
47449 gen = gen_vec_interleave_highv8si;
47450 else
47451 gen = gen_vec_interleave_lowv8si;
47452 break;
47453 case V4DImode:
47454 if (d->perm[0])
47455 gen = gen_vec_interleave_highv4di;
47456 else
47457 gen = gen_vec_interleave_lowv4di;
47458 break;
47459 case V8SFmode:
47460 if (d->perm[0])
47461 gen = gen_vec_interleave_highv8sf;
47462 else
47463 gen = gen_vec_interleave_lowv8sf;
47464 break;
47465 case V4DFmode:
47466 if (d->perm[0])
47467 gen = gen_vec_interleave_highv4df;
47468 else
47469 gen = gen_vec_interleave_lowv4df;
47470 break;
47471 default:
47472 gcc_unreachable ();
47475 emit_insn (gen (d->target, d->op0, d->op1));
47476 return true;
47479 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
47480 a single vector permutation using a single intra-lane vector
47481 permutation, vperm2f128 swapping the lanes and vblend* insn blending
47482 the non-swapped and swapped vectors together. */
47484 static bool
47485 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
47487 struct expand_vec_perm_d dfirst, dsecond;
47488 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
47489 rtx_insn *seq;
47490 bool ok;
47491 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
47493 if (!TARGET_AVX
47494 || TARGET_AVX2
47495 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
47496 || !d->one_operand_p)
47497 return false;
47499 dfirst = *d;
47500 for (i = 0; i < nelt; i++)
47501 dfirst.perm[i] = 0xff;
47502 for (i = 0, msk = 0; i < nelt; i++)
47504 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
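/* J is I with its lane bit set to the lane holding d->perm[i], so
   DFIRST remains an intra-lane shuffle. */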
47505 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
47506 return false;
47507 dfirst.perm[j] = d->perm[i];
47508 if (j != i)
47509 msk |= (1 << i);
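/* MSK collects, one bit per element, the positions that the final
   vblend* must take from the lane-swapped copy (DSECOND below). */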
47511 for (i = 0; i < nelt; i++)
47512 if (dfirst.perm[i] == 0xff)
47513 dfirst.perm[i] = i;
47515 if (!d->testing_p)
47516 dfirst.target = gen_reg_rtx (dfirst.vmode);
47518 start_sequence ();
47519 ok = expand_vec_perm_1 (&dfirst);
47520 seq = get_insns ();
47521 end_sequence ();
47523 if (!ok)
47524 return false;
47526 if (d->testing_p)
47527 return true;
47529 emit_insn (seq);
47531 dsecond = *d;
47532 dsecond.op0 = dfirst.target;
47533 dsecond.op1 = dfirst.target;
47534 dsecond.one_operand_p = true;
47535 dsecond.target = gen_reg_rtx (dsecond.vmode);
47536 for (i = 0; i < nelt; i++)
47537 dsecond.perm[i] = i ^ nelt2;
47539 ok = expand_vec_perm_1 (&dsecond);
47540 gcc_assert (ok);
47542 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
47543 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
47544 return true;
47547 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
47548 permutation using two vperm2f128, followed by a vshufpd insn blending
47549 the two vectors together. */
47551 static bool
47552 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
47554 struct expand_vec_perm_d dfirst, dsecond, dthird;
47555 bool ok;
47557 if (!TARGET_AVX || (d->vmode != V4DFmode))
47558 return false;
47560 if (d->testing_p)
47561 return true;
47563 dfirst = *d;
47564 dsecond = *d;
47565 dthird = *d;
47567 dfirst.perm[0] = (d->perm[0] & ~1);
47568 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
47569 dfirst.perm[2] = (d->perm[2] & ~1);
47570 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
47571 dsecond.perm[0] = (d->perm[1] & ~1);
47572 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
47573 dsecond.perm[2] = (d->perm[3] & ~1);
47574 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
47575 dthird.perm[0] = (d->perm[0] % 2);
47576 dthird.perm[1] = (d->perm[1] % 2) + 4;
47577 dthird.perm[2] = (d->perm[2] % 2) + 2;
47578 dthird.perm[3] = (d->perm[3] % 2) + 6;
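/* DTHIRD is the final vshufpd-style blend: even result elements come
   from DFIRST, odd ones from DSECOND, selecting within each pair via
   d->perm[i] % 2. */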
47580 dfirst.target = gen_reg_rtx (dfirst.vmode);
47581 dsecond.target = gen_reg_rtx (dsecond.vmode);
47582 dthird.op0 = dfirst.target;
47583 dthird.op1 = dsecond.target;
47584 dthird.one_operand_p = false;
47586 canonicalize_perm (&dfirst);
47587 canonicalize_perm (&dsecond);
47589 ok = expand_vec_perm_1 (&dfirst)
47590 && expand_vec_perm_1 (&dsecond)
47591 && expand_vec_perm_1 (&dthird);
47593 gcc_assert (ok);
47595 return true;
47598 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
47599 permutation with two pshufb insns and an ior. We should have already
47600 failed all two instruction sequences. */
47602 static bool
47603 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
47605 rtx rperm[2][16], vperm, l, h, op, m128;
47606 unsigned int i, nelt, eltsz;
47608 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47609 return false;
47610 gcc_assert (!d->one_operand_p);
47612 if (d->testing_p)
47613 return true;
47615 nelt = d->nelt;
47616 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47618 /* Generate two permutation masks. If the required element is within
47619 the given vector it is shuffled into the proper lane. If the required
47620 element is in the other vector, force a zero into the lane by setting
47621 bit 7 in the permutation mask. */
47622 m128 = GEN_INT (-128);
47623 for (i = 0; i < nelt; ++i)
47625 unsigned j, e = d->perm[i];
47626 unsigned which = (e >= nelt);
47627 if (e >= nelt)
47628 e -= nelt;
47630 for (j = 0; j < eltsz; ++j)
47632 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
47633 rperm[1-which][i*eltsz + j] = m128;
47637 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
47638 vperm = force_reg (V16QImode, vperm);
47640 l = gen_reg_rtx (V16QImode);
47641 op = gen_lowpart (V16QImode, d->op0);
47642 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
47644 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
47645 vperm = force_reg (V16QImode, vperm);
47647 h = gen_reg_rtx (V16QImode);
47648 op = gen_lowpart (V16QImode, d->op1);
47649 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
47651 op = d->target;
47652 if (d->vmode != V16QImode)
47653 op = gen_reg_rtx (V16QImode);
47654 emit_insn (gen_iorv16qi3 (op, l, h));
47655 if (op != d->target)
47656 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47658 return true;
47661 /* Implement arbitrary permutation of one V32QImode and V16QImode operand
47662 with two vpshufb insns, vpermq and vpor. We should have already failed
47663 all two or three instruction sequences. */
47665 static bool
47666 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
47668 rtx rperm[2][32], vperm, l, h, hp, op, m128;
47669 unsigned int i, nelt, eltsz;
47671 if (!TARGET_AVX2
47672 || !d->one_operand_p
47673 || (d->vmode != V32QImode && d->vmode != V16HImode))
47674 return false;
47676 if (d->testing_p)
47677 return true;
47679 nelt = d->nelt;
47680 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47682 /* Generate two permutation masks. If the required element is within
47683 the same lane, it is shuffled in. If the required element is from the
47684 other lane, force a zero by setting bit 7 in the permutation mask.
47685 The other mask has non-negative elements where the element
47686 is requested from the other lane, but also moved to the other lane,
47687 so that the result of vpshufb can have the two V2TImode halves
47688 swapped. */
47689 m128 = GEN_INT (-128);
47690 for (i = 0; i < nelt; ++i)
47692 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47693 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47695 for (j = 0; j < eltsz; ++j)
47697 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
47698 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
47702 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
47703 vperm = force_reg (V32QImode, vperm);
47705 h = gen_reg_rtx (V32QImode);
47706 op = gen_lowpart (V32QImode, d->op0);
47707 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
47709 /* Swap the 128-bit lanes of h into hp. */
47710 hp = gen_reg_rtx (V4DImode);
47711 op = gen_lowpart (V4DImode, h);
47712 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
47713 const1_rtx));
47715 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
47716 vperm = force_reg (V32QImode, vperm);
47718 l = gen_reg_rtx (V32QImode);
47719 op = gen_lowpart (V32QImode, d->op0);
47720 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
47722 op = d->target;
47723 if (d->vmode != V32QImode)
47724 op = gen_reg_rtx (V32QImode);
47725 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
47726 if (op != d->target)
47727 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47729 return true;
47732 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47733 and extract-odd permutations of two V32QImode and V16QImode operands
47734 with two vpshufb insns, vpor and vpermq. We should have already
47735 failed all two or three instruction sequences. */
47737 static bool
47738 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
47740 rtx rperm[2][32], vperm, l, h, ior, op, m128;
47741 unsigned int i, nelt, eltsz;
47743 if (!TARGET_AVX2
47744 || d->one_operand_p
47745 || (d->vmode != V32QImode && d->vmode != V16HImode))
47746 return false;
47748 for (i = 0; i < d->nelt; ++i)
47749 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
47750 return false;
47752 if (d->testing_p)
47753 return true;
47755 nelt = d->nelt;
47756 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47758 /* Generate two permutation masks. In the first permutation mask
47759 the first quarter will contain indexes for the first half
47760 of the op0, the second quarter will contain bit 7 set, third quarter
47761 will contain indexes for the second half of the op0 and the
47762 last quarter bit 7 set. In the second permutation mask
47763 the first quarter will contain bit 7 set, the second quarter
47764 indexes for the first half of the op1, the third quarter bit 7 set
47765 and last quarter indexes for the second half of the op1.
47766 I.e. the first mask e.g. for V32QImode extract even will be:
47767 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
47768 (all values masked with 0xf except for -128) and second mask
47769 for extract even will be
47770 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
47771 m128 = GEN_INT (-128);
47772 for (i = 0; i < nelt; ++i)
47774 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47775 unsigned which = d->perm[i] >= nelt;
47776 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
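/* XOR-ing the byte index with 24 swaps the second and third quarters
   of the 32-byte mask, giving the layout described in the comment
   above. */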
47778 for (j = 0; j < eltsz; ++j)
47780 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
47781 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
47785 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
47786 vperm = force_reg (V32QImode, vperm);
47788 l = gen_reg_rtx (V32QImode);
47789 op = gen_lowpart (V32QImode, d->op0);
47790 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
47792 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
47793 vperm = force_reg (V32QImode, vperm);
47795 h = gen_reg_rtx (V32QImode);
47796 op = gen_lowpart (V32QImode, d->op1);
47797 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
47799 ior = gen_reg_rtx (V32QImode);
47800 emit_insn (gen_iorv32qi3 (ior, l, h));
47802 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
47803 op = gen_reg_rtx (V4DImode);
47804 ior = gen_lowpart (V4DImode, ior);
47805 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
47806 const1_rtx, GEN_INT (3)));
47807 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47809 return true;
47812 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47813 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
47814 with two "and" and "pack" or two "shift" and "pack" insns. We should
47815 have already failed all two instruction sequences. */
47817 static bool
47818 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
47820 rtx op, dop0, dop1, t, rperm[16];
47821 unsigned i, odd, c, s, nelt = d->nelt;
47822 bool end_perm = false;
47823 machine_mode half_mode;
47824 rtx (*gen_and) (rtx, rtx, rtx);
47825 rtx (*gen_pack) (rtx, rtx, rtx);
47826 rtx (*gen_shift) (rtx, rtx, rtx);
47828 if (d->one_operand_p)
47829 return false;
47831 switch (d->vmode)
47833 case V8HImode:
47834 /* Required for "pack". */
47835 if (!TARGET_SSE4_1)
47836 return false;
47837 c = 0xffff;
47838 s = 16;
47839 half_mode = V4SImode;
47840 gen_and = gen_andv4si3;
47841 gen_pack = gen_sse4_1_packusdw;
47842 gen_shift = gen_lshrv4si3;
47843 break;
47844 case V16QImode:
47845 /* No check as all instructions are SSE2. */
47846 c = 0xff;
47847 s = 8;
47848 half_mode = V8HImode;
47849 gen_and = gen_andv8hi3;
47850 gen_pack = gen_sse2_packuswb;
47851 gen_shift = gen_lshrv8hi3;
47852 break;
47853 case V16HImode:
47854 if (!TARGET_AVX2)
47855 return false;
47856 c = 0xffff;
47857 s = 16;
47858 half_mode = V8SImode;
47859 gen_and = gen_andv8si3;
47860 gen_pack = gen_avx2_packusdw;
47861 gen_shift = gen_lshrv8si3;
47862 end_perm = true;
47863 break;
47864 case V32QImode:
47865 if (!TARGET_AVX2)
47866 return false;
47867 c = 0xff;
47868 s = 8;
47869 half_mode = V16HImode;
47870 gen_and = gen_andv16hi3;
47871 gen_pack = gen_avx2_packuswb;
47872 gen_shift = gen_lshrv16hi3;
47873 end_perm = true;
47874 break;
47875 default:
47876 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
47877 general shuffles. */
47878 return false;
47881 /* Check that permutation is even or odd. */
47882 odd = d->perm[0];
47883 if (odd > 1)
47884 return false;
47886 for (i = 1; i < nelt; ++i)
47887 if (d->perm[i] != 2 * i + odd)
47888 return false;
47890 if (d->testing_p)
47891 return true;
47893 dop0 = gen_reg_rtx (half_mode);
47894 dop1 = gen_reg_rtx (half_mode);
47895 if (odd == 0)
47897 for (i = 0; i < nelt / 2; i++)
47898 rperm[i] = GEN_INT (c);
47899 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
47900 t = force_reg (half_mode, t);
47901 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
47902 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
47904 else
47906 emit_insn (gen_shift (dop0,
47907 gen_lowpart (half_mode, d->op0),
47908 GEN_INT (s)));
47909 emit_insn (gen_shift (dop1,
47910 gen_lowpart (half_mode, d->op1),
47911 GEN_INT (s)));
47913 /* For the AVX2 256-bit case we need to permute the pack result. */
47914 if (TARGET_AVX2 && end_perm)
47916 op = gen_reg_rtx (d->vmode);
47917 t = gen_reg_rtx (V4DImode);
47918 emit_insn (gen_pack (op, dop0, dop1));
47919 emit_insn (gen_avx2_permv4di_1 (t,
47920 gen_lowpart (V4DImode, op),
47921 const0_rtx,
47922 const2_rtx,
47923 const1_rtx,
47924 GEN_INT (3)));
47925 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
47927 else
47928 emit_insn (gen_pack (d->target, dop0, dop1));
47930 return true;
47933 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47934 and extract-odd permutations of two V64QI operands
47935 with two "shifts", two "truncs" and one "concat" insns for "odd"
47936 and two "truncs" and one "concat" insn for "even".
47937 We should have already failed all two instruction sequences. */
47939 static bool
47940 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
47942 rtx t1, t2, t3, t4;
47943 unsigned i, odd, nelt = d->nelt;
47945 if (!TARGET_AVX512BW
47946 || d->one_operand_p
47947 || d->vmode != V64QImode)
47948 return false;
47950 /* Check that permutation is even or odd. */
47951 odd = d->perm[0];
47952 if (odd > 1)
47953 return false;
47955 for (i = 1; i < nelt; ++i)
47956 if (d->perm[i] != 2 * i + odd)
47957 return false;
47959 if (d->testing_p)
47960 return true;
47963 if (odd)
47965 t1 = gen_reg_rtx (V32HImode);
47966 t2 = gen_reg_rtx (V32HImode);
47967 emit_insn (gen_lshrv32hi3 (t1,
47968 gen_lowpart (V32HImode, d->op0),
47969 GEN_INT (8)));
47970 emit_insn (gen_lshrv32hi3 (t2,
47971 gen_lowpart (V32HImode, d->op1),
47972 GEN_INT (8)));
47974 else
47976 t1 = gen_lowpart (V32HImode, d->op0);
47977 t2 = gen_lowpart (V32HImode, d->op1);
47980 t3 = gen_reg_rtx (V32QImode);
47981 t4 = gen_reg_rtx (V32QImode);
47982 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47983 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47984 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47986 return true;
47989 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
47990 and extract-odd permutations. */
47992 static bool
47993 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47995 rtx t1, t2, t3, t4, t5;
47997 switch (d->vmode)
47999 case V4DFmode:
48000 if (d->testing_p)
48001 break;
48002 t1 = gen_reg_rtx (V4DFmode);
48003 t2 = gen_reg_rtx (V4DFmode);
48005 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48006 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
48007 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
48009 /* Now an unpck[lh]pd will produce the result required. */
48010 if (odd)
48011 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
48012 else
48013 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
48014 emit_insn (t3);
48015 break;
48017 case V8SFmode:
48019 int mask = odd ? 0xdd : 0x88;
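/* shufps selector 0x88 picks { 0, 2, 0, 2 } and 0xdd picks
   { 1, 3, 1, 3 } per 128-bit lane, i.e. the even resp. odd elements
   of the two sources. */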
48021 if (d->testing_p)
48022 break;
48023 t1 = gen_reg_rtx (V8SFmode);
48024 t2 = gen_reg_rtx (V8SFmode);
48025 t3 = gen_reg_rtx (V8SFmode);
48027 /* Shuffle within the 128-bit lanes to produce:
48028 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
48029 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
48030 GEN_INT (mask)));
48032 /* Shuffle the lanes around to produce:
48033 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
48034 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
48035 GEN_INT (0x3)));
48037 /* Shuffle within the 128-bit lanes to produce:
48038 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
48039 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
48041 /* Shuffle within the 128-bit lanes to produce:
48042 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
48043 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
48045 /* Shuffle the lanes around to produce:
48046 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
48047 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
48048 GEN_INT (0x20)));
48050 break;
48052 case V2DFmode:
48053 case V4SFmode:
48054 case V2DImode:
48055 case V4SImode:
48056 /* These are always directly implementable by expand_vec_perm_1. */
48057 gcc_unreachable ();
48059 case V8HImode:
48060 if (TARGET_SSE4_1)
48061 return expand_vec_perm_even_odd_pack (d);
48062 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48063 return expand_vec_perm_pshufb2 (d);
48064 else
48066 if (d->testing_p)
48067 break;
48068 /* We need 2*log2(N)-1 operations to achieve odd/even
48069 with interleave. */
48070 t1 = gen_reg_rtx (V8HImode);
48071 t2 = gen_reg_rtx (V8HImode);
48072 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48073 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48074 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48075 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
48076 if (odd)
48077 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48078 else
48079 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48080 emit_insn (t3);
48082 break;
48084 case V16QImode:
48085 return expand_vec_perm_even_odd_pack (d);
48087 case V16HImode:
48088 case V32QImode:
48089 return expand_vec_perm_even_odd_pack (d);
48091 case V64QImode:
48092 return expand_vec_perm_even_odd_trunc (d);
48094 case V4DImode:
48095 if (!TARGET_AVX2)
48097 struct expand_vec_perm_d d_copy = *d;
48098 d_copy.vmode = V4DFmode;
48099 if (d->testing_p)
48100 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48101 else
48102 d_copy.target = gen_reg_rtx (V4DFmode);
48103 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48104 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48105 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48107 if (!d->testing_p)
48108 emit_move_insn (d->target,
48109 gen_lowpart (V4DImode, d_copy.target));
48110 return true;
48112 return false;
48115 if (d->testing_p)
48116 break;
48118 t1 = gen_reg_rtx (V4DImode);
48119 t2 = gen_reg_rtx (V4DImode);
48121 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48122 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48123 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48125 /* Now a vpunpck[lh]qdq will produce the result required. */
48126 if (odd)
48127 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48128 else
48129 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48130 emit_insn (t3);
48131 break;
48133 case V8SImode:
48134 if (!TARGET_AVX2)
48136 struct expand_vec_perm_d d_copy = *d;
48137 d_copy.vmode = V8SFmode;
48138 if (d->testing_p)
48139 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48140 else
48141 d_copy.target = gen_reg_rtx (V8SFmode);
48142 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48143 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48144 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48146 if (!d->testing_p)
48147 emit_move_insn (d->target,
48148 gen_lowpart (V8SImode, d_copy.target));
48149 return true;
48151 return false;
48154 if (d->testing_p)
48155 break;
48157 t1 = gen_reg_rtx (V8SImode);
48158 t2 = gen_reg_rtx (V8SImode);
48159 t3 = gen_reg_rtx (V4DImode);
48160 t4 = gen_reg_rtx (V4DImode);
48161 t5 = gen_reg_rtx (V4DImode);
48163 /* Shuffle the lanes around into
48164 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
48165 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
48166 gen_lowpart (V4DImode, d->op1),
48167 GEN_INT (0x20)));
48168 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
48169 gen_lowpart (V4DImode, d->op1),
48170 GEN_INT (0x31)));
48172 /* Swap the 2nd and 3rd position in each lane into
48173 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
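/* 2*4 + 1*16 + 3*64 == 0xd8 is the pshufd selector { 0, 2, 1, 3 },
   encoded two bits per element from the least significant bits up. */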
48174 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
48175 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48176 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
48177 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48179 /* Now a vpunpck[lh]qdq will produce
48180 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
48181 if (odd)
48182 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
48183 gen_lowpart (V4DImode, t2));
48184 else
48185 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
48186 gen_lowpart (V4DImode, t2));
48187 emit_insn (t3);
48188 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
48189 break;
48191 default:
48192 gcc_unreachable ();
48195 return true;
48198 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48199 extract-even and extract-odd permutations. */
48201 static bool
48202 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
48204 unsigned i, odd, nelt = d->nelt;
48206 odd = d->perm[0];
48207 if (odd != 0 && odd != 1)
48208 return false;
48210 for (i = 1; i < nelt; ++i)
48211 if (d->perm[i] != 2 * i + odd)
48212 return false;
48214 return expand_vec_perm_even_odd_1 (d, odd);
48217 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
48218 permutations. We assume that expand_vec_perm_1 has already failed. */
48220 static bool
48221 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
48223 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
48224 machine_mode vmode = d->vmode;
48225 unsigned char perm2[4];
48226 rtx op0 = d->op0, dest;
48227 bool ok;
48229 switch (vmode)
48231 case V4DFmode:
48232 case V8SFmode:
48233 /* These are special-cased in sse.md so that we can optionally
48234 use the vbroadcast instruction. They expand to two insns
48235 if the input happens to be in a register. */
48236 gcc_unreachable ();
48238 case V2DFmode:
48239 case V2DImode:
48240 case V4SFmode:
48241 case V4SImode:
48242 /* These are always implementable using standard shuffle patterns. */
48243 gcc_unreachable ();
48245 case V8HImode:
48246 case V16QImode:
48247 /* These can be implemented via interleave. We save one insn by
48248 stopping once we have promoted to V4SImode and then use pshufd. */
48249 if (d->testing_p)
48250 return true;
48253 rtx dest;
48254 rtx (*gen) (rtx, rtx, rtx)
48255 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
48256 : gen_vec_interleave_lowv8hi;
48258 if (elt >= nelt2)
48260 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
48261 : gen_vec_interleave_highv8hi;
48262 elt -= nelt2;
48264 nelt2 /= 2;
48266 dest = gen_reg_rtx (vmode);
48267 emit_insn (gen (dest, op0, op0));
48268 vmode = get_mode_wider_vector (vmode);
48269 op0 = gen_lowpart (vmode, dest);
48271 while (vmode != V4SImode);
48273 memset (perm2, elt, 4);
48274 dest = gen_reg_rtx (V4SImode);
48275 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
48276 gcc_assert (ok);
48277 if (!d->testing_p)
48278 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
48279 return true;
48281 case V64QImode:
48282 case V32QImode:
48283 case V16HImode:
48284 case V8SImode:
48285 case V4DImode:
48286 /* For AVX2 broadcasts of the first element vpbroadcast* or
48287 vpermq should be used by expand_vec_perm_1. */
48288 gcc_assert (!TARGET_AVX2 || d->perm[0]);
48289 return false;
48291 default:
48292 gcc_unreachable ();
48296 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48297 broadcast permutations. */
48299 static bool
48300 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
48302 unsigned i, elt, nelt = d->nelt;
48304 if (!d->one_operand_p)
48305 return false;
48307 elt = d->perm[0];
48308 for (i = 1; i < nelt; ++i)
48309 if (d->perm[i] != elt)
48310 return false;
48312 return expand_vec_perm_broadcast_1 (d);
48315 /* Implement arbitrary permutations of two V64QImode operands
48316 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
48317 static bool
48318 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
48320 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
48321 return false;
48323 if (d->testing_p)
48324 return true;
48326 struct expand_vec_perm_d ds[2];
48327 rtx rperm[128], vperm, target0, target1;
48328 unsigned int i, nelt;
48329 machine_mode vmode;
48331 nelt = d->nelt;
48332 vmode = V64QImode;
48334 for (i = 0; i < 2; i++)
48336 ds[i] = *d;
48337 ds[i].vmode = V32HImode;
48338 ds[i].nelt = 32;
48339 ds[i].target = gen_reg_rtx (V32HImode);
48340 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
48341 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
48344 /* Prepare permutations such that the first one takes care of
48345 putting the even bytes into the right positions or one position
48346 higher (ds[0]) and the second one takes care of
48347 putting the odd bytes into the right positions or one position
48348 below (ds[1]). */
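/* The rperm masks below then pick, for each destination byte, the low
   or high byte (d->perm[i] & 1) of the corresponding 16-bit result;
   -1 entries are zeroed by vpshufb and filled in from the other half
   by the final vpor. */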
48350 for (i = 0; i < nelt; i++)
48352 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
48353 if (i & 1)
48355 rperm[i] = constm1_rtx;
48356 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48358 else
48360 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48361 rperm[i + 64] = constm1_rtx;
48365 bool ok = expand_vec_perm_1 (&ds[0]);
48366 gcc_assert (ok);
48367 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
48369 ok = expand_vec_perm_1 (&ds[1]);
48370 gcc_assert (ok);
48371 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
48373 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
48374 vperm = force_reg (vmode, vperm);
48375 target0 = gen_reg_rtx (V64QImode);
48376 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
48378 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
48379 vperm = force_reg (vmode, vperm);
48380 target1 = gen_reg_rtx (V64QImode);
48381 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
48383 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
48384 return true;
48387 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
48388 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
48389 all the shorter instruction sequences. */
48391 static bool
48392 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
48394 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
48395 unsigned int i, nelt, eltsz;
48396 bool used[4];
48398 if (!TARGET_AVX2
48399 || d->one_operand_p
48400 || (d->vmode != V32QImode && d->vmode != V16HImode))
48401 return false;
48403 if (d->testing_p)
48404 return true;
48406 nelt = d->nelt;
48407 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48409 /* Generate 4 permutation masks. If the required element is within
48410 the same lane, it is shuffled in. If the required element is from the
48411 other lane, force a zero by setting bit 7 in the permutation mask.
48412 The other mask has non-negative elements where the element
48413 is requested from the other lane, but also moved to the other lane,
48414 so that the result of vpshufb can have the two V2TImode halves
48415 swapped. */
48416 m128 = GEN_INT (-128);
48417 for (i = 0; i < 32; ++i)
48419 rperm[0][i] = m128;
48420 rperm[1][i] = m128;
48421 rperm[2][i] = m128;
48422 rperm[3][i] = m128;
48424 used[0] = false;
48425 used[1] = false;
48426 used[2] = false;
48427 used[3] = false;
48428 for (i = 0; i < nelt; ++i)
48430 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48431 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48432 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
48434 for (j = 0; j < eltsz; ++j)
48435 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
48436 used[which] = true;
48439 for (i = 0; i < 2; ++i)
48441 if (!used[2 * i + 1])
48443 h[i] = NULL_RTX;
48444 continue;
48446 vperm = gen_rtx_CONST_VECTOR (V32QImode,
48447 gen_rtvec_v (32, rperm[2 * i + 1]));
48448 vperm = force_reg (V32QImode, vperm);
48449 h[i] = gen_reg_rtx (V32QImode);
48450 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48451 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
48454 /* Swap the 128-bit lanes of h[X]. */
48455 for (i = 0; i < 2; ++i)
48457 if (h[i] == NULL_RTX)
48458 continue;
48459 op = gen_reg_rtx (V4DImode);
48460 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
48461 const2_rtx, GEN_INT (3), const0_rtx,
48462 const1_rtx));
48463 h[i] = gen_lowpart (V32QImode, op);
48466 for (i = 0; i < 2; ++i)
48468 if (!used[2 * i])
48470 l[i] = NULL_RTX;
48471 continue;
48473 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
48474 vperm = force_reg (V32QImode, vperm);
48475 l[i] = gen_reg_rtx (V32QImode);
48476 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48477 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
48480 for (i = 0; i < 2; ++i)
48482 if (h[i] && l[i])
48484 op = gen_reg_rtx (V32QImode);
48485 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
48486 l[i] = op;
48488 else if (h[i])
48489 l[i] = h[i];
48492 gcc_assert (l[0] && l[1]);
48493 op = d->target;
48494 if (d->vmode != V32QImode)
48495 op = gen_reg_rtx (V32QImode);
48496 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
48497 if (op != d->target)
48498 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48499 return true;
48502 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
48503 With all of the interface bits taken care of, perform the expansion
48504 in D and return true on success. */
48506 static bool
48507 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
48509 /* Try a single instruction expansion. */
48510 if (expand_vec_perm_1 (d))
48511 return true;
48513 /* Try sequences of two instructions. */
48515 if (expand_vec_perm_pshuflw_pshufhw (d))
48516 return true;
48518 if (expand_vec_perm_palignr (d, false))
48519 return true;
48521 if (expand_vec_perm_interleave2 (d))
48522 return true;
48524 if (expand_vec_perm_broadcast (d))
48525 return true;
48527 if (expand_vec_perm_vpermq_perm_1 (d))
48528 return true;
48530 if (expand_vec_perm_vperm2f128 (d))
48531 return true;
48533 if (expand_vec_perm_pblendv (d))
48534 return true;
48536 /* Try sequences of three instructions. */
48538 if (expand_vec_perm_even_odd_pack (d))
48539 return true;
48541 if (expand_vec_perm_2vperm2f128_vshuf (d))
48542 return true;
48544 if (expand_vec_perm_pshufb2 (d))
48545 return true;
48547 if (expand_vec_perm_interleave3 (d))
48548 return true;
48550 if (expand_vec_perm_vperm2f128_vblend (d))
48551 return true;
48553 /* Try sequences of four instructions. */
48555 if (expand_vec_perm_even_odd_trunc (d))
48556 return true;
48557 if (expand_vec_perm_vpshufb2_vpermq (d))
48558 return true;
48560 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
48561 return true;
48563 if (expand_vec_perm_vpermi2_vpshub2 (d))
48564 return true;
48566 /* ??? Look for narrow permutations whose element orderings would
48567 allow the promotion to a wider mode. */
48569 /* ??? Look for sequences of interleave or a wider permute that place
48570 the data into the correct lanes for a half-vector shuffle like
48571 pshuf[lh]w or vpermilps. */
48573 /* ??? Look for sequences of interleave that produce the desired results.
48574 The combinatorics of punpck[lh] get pretty ugly... */
48576 if (expand_vec_perm_even_odd (d))
48577 return true;
48579 /* Even longer sequences. */
48580 if (expand_vec_perm_vpshufb4_vpermq2 (d))
48581 return true;
48583 /* See if we can get the same permutation in a different vector integer
48584 mode. */
48585 struct expand_vec_perm_d nd;
48586 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
48588 if (!d->testing_p)
48589 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
48590 return true;
48593 return false;
48596 /* If a permutation only uses one operand, make it clear. Returns true
48597 if the permutation references both operands. */
48599 static bool
48600 canonicalize_perm (struct expand_vec_perm_d *d)
48602 int i, which, nelt = d->nelt;
48604 for (i = which = 0; i < nelt; ++i)
48605 which |= (d->perm[i] < nelt ? 1 : 2);
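/* WHICH now has bit 0 set if any element came from the first operand
   and bit 1 set if any came from the second. */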
48607 d->one_operand_p = true;
48608 switch (which)
48610 default:
48611 gcc_unreachable();
48613 case 3:
48614 if (!rtx_equal_p (d->op0, d->op1))
48616 d->one_operand_p = false;
48617 break;
48619 /* The elements of PERM do not suggest that only the first operand
48620 is used, but both operands are identical. Allow easier matching
48621 of the permutation by folding the permutation into the single
48622 input vector. */
48623 /* FALLTHRU */
48625 case 2:
48626 for (i = 0; i < nelt; ++i)
48627 d->perm[i] &= nelt - 1;
48628 d->op0 = d->op1;
48629 break;
48631 case 1:
48632 d->op1 = d->op0;
48633 break;
48636 return (which == 3);
48639 bool
48640 ix86_expand_vec_perm_const (rtx operands[4])
48642 struct expand_vec_perm_d d;
48643 unsigned char perm[MAX_VECT_LEN];
48644 int i, nelt;
48645 bool two_args;
48646 rtx sel;
48648 d.target = operands[0];
48649 d.op0 = operands[1];
48650 d.op1 = operands[2];
48651 sel = operands[3];
48653 d.vmode = GET_MODE (d.target);
48654 gcc_assert (VECTOR_MODE_P (d.vmode));
48655 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48656 d.testing_p = false;
48658 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
48659 gcc_assert (XVECLEN (sel, 0) == nelt);
48660 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
48662 for (i = 0; i < nelt; ++i)
48664 rtx e = XVECEXP (sel, 0, i);
48665 int ei = INTVAL (e) & (2 * nelt - 1);
48666 d.perm[i] = ei;
48667 perm[i] = ei;
48670 two_args = canonicalize_perm (&d);
48672 if (ix86_expand_vec_perm_const_1 (&d))
48673 return true;
48675 /* If the selector says both arguments are needed, but the operands are the
48676 same, the above tried to expand with one_operand_p and a flattened selector.
48677 If that didn't work, retry without one_operand_p; we succeeded with that
48678 during testing. */
48679 if (two_args && d.one_operand_p)
48681 d.one_operand_p = false;
48682 memcpy (d.perm, perm, sizeof (perm));
48683 return ix86_expand_vec_perm_const_1 (&d);
48686 return false;
48689 /* Implement targetm.vectorize.vec_perm_const_ok. */
48691 static bool
48692 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
48693 const unsigned char *sel)
48695 struct expand_vec_perm_d d;
48696 unsigned int i, nelt, which;
48697 bool ret;
48699 d.vmode = vmode;
48700 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48701 d.testing_p = true;
48703 /* Given sufficient ISA support we can just return true here
48704 for selected vector modes. */
48705 switch (d.vmode)
48707 case V16SFmode:
48708 case V16SImode:
48709 case V8DImode:
48710 case V8DFmode:
48711 if (TARGET_AVX512F)
48712 /* All implementable with a single vpermi2 insn. */
48713 return true;
48714 break;
48715 case V32HImode:
48716 if (TARGET_AVX512BW)
48717 /* All implementable with a single vpermi2 insn. */
48718 return true;
48719 break;
48720 case V64QImode:
48721 if (TARGET_AVX512BW)
48722 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
48723 return true;
48724 break;
48725 case V8SImode:
48726 case V8SFmode:
48727 case V4DFmode:
48728 case V4DImode:
48729 if (TARGET_AVX512VL)
48730 /* All implementable with a single vpermi2 insn. */
48731 return true;
48732 break;
48733 case V16HImode:
48734 if (TARGET_AVX2)
48735 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48736 return true;
48737 break;
48738 case V32QImode:
48739 if (TARGET_AVX2)
48740 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48741 return true;
48742 break;
48743 case V4SImode:
48744 case V4SFmode:
48745 case V8HImode:
48746 case V16QImode:
48747 /* All implementable with a single vpperm insn. */
48748 if (TARGET_XOP)
48749 return true;
48750 /* All implementable with 2 pshufb + 1 ior. */
48751 if (TARGET_SSSE3)
48752 return true;
48753 break;
48754 case V2DImode:
48755 case V2DFmode:
48756 /* All implementable with shufpd or unpck[lh]pd. */
48757 return true;
48758 default:
48759 return false;
48762 /* Extract the values from the vector CST into the permutation
48763 array in D. */
48764 memcpy (d.perm, sel, nelt);
48765 for (i = which = 0; i < nelt; ++i)
48767 unsigned char e = d.perm[i];
48768 gcc_assert (e < 2 * nelt);
48769 which |= (e < nelt ? 1 : 2);
48772 /* For all elements from the second vector, fold the elements to the first. */
48773 if (which == 2)
48774 for (i = 0; i < nelt; ++i)
48775 d.perm[i] -= nelt;
48777 /* Check whether the mask can be applied to the vector type. */
48778 d.one_operand_p = (which != 3);
48780 /* Implementable with shufps or pshufd. */
48781 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
48782 return true;
48784 /* Otherwise we have to go through the motions and see if we can
48785 figure out how to generate the requested permutation. */
48786 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
48787 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
48788 if (!d.one_operand_p)
48789 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
48791 start_sequence ();
48792 ret = ix86_expand_vec_perm_const_1 (&d);
48793 end_sequence ();
48795 return ret;
48798 void
48799 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
48801 struct expand_vec_perm_d d;
48802 unsigned i, nelt;
48804 d.target = targ;
48805 d.op0 = op0;
48806 d.op1 = op1;
48807 d.vmode = GET_MODE (targ);
48808 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48809 d.one_operand_p = false;
48810 d.testing_p = false;
48812 for (i = 0; i < nelt; ++i)
48813 d.perm[i] = i * 2 + odd;
48815 /* We'll either be able to implement the permutation directly... */
48816 if (expand_vec_perm_1 (&d))
48817 return;
48819 /* ... or we use the special-case patterns. */
48820 expand_vec_perm_even_odd_1 (&d, odd);
48823 static void
48824 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
48826 struct expand_vec_perm_d d;
48827 unsigned i, nelt, base;
48828 bool ok;
48830 d.target = targ;
48831 d.op0 = op0;
48832 d.op1 = op1;
48833 d.vmode = GET_MODE (targ);
48834 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48835 d.one_operand_p = false;
48836 d.testing_p = false;
48838 base = high_p ? nelt / 2 : 0;
48839 for (i = 0; i < nelt / 2; ++i)
48841 d.perm[i * 2] = i + base;
48842 d.perm[i * 2 + 1] = i + base + nelt;
48845 /* Note that for AVX this isn't one instruction. */
48846 ok = ix86_expand_vec_perm_const_1 (&d);
48847 gcc_assert (ok);
48851 /* Expand a vector operation CODE for a V*QImode in terms of the
48852 same operation on V*HImode. */
48854 void
48855 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
48857 machine_mode qimode = GET_MODE (dest);
48858 machine_mode himode;
48859 rtx (*gen_il) (rtx, rtx, rtx);
48860 rtx (*gen_ih) (rtx, rtx, rtx);
48861 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
48862 struct expand_vec_perm_d d;
48863 bool ok, full_interleave;
48864 bool uns_p = false;
48865 int i;
48867 switch (qimode)
48869 case V16QImode:
48870 himode = V8HImode;
48871 gen_il = gen_vec_interleave_lowv16qi;
48872 gen_ih = gen_vec_interleave_highv16qi;
48873 break;
48874 case V32QImode:
48875 himode = V16HImode;
48876 gen_il = gen_avx2_interleave_lowv32qi;
48877 gen_ih = gen_avx2_interleave_highv32qi;
48878 break;
48879 case V64QImode:
48880 himode = V32HImode;
48881 gen_il = gen_avx512bw_interleave_lowv64qi;
48882 gen_ih = gen_avx512bw_interleave_highv64qi;
48883 break;
48884 default:
48885 gcc_unreachable ();
48888 op2_l = op2_h = op2;
48889 switch (code)
48891 case MULT:
48892 /* Unpack data such that we've got a source byte in each low byte of
48893 each word. We don't care what goes into the high byte of each word.
48894 Rather than trying to get zero in there, it is most convenient to let
48895 it be a copy of the low byte. */
48896 op2_l = gen_reg_rtx (qimode);
48897 op2_h = gen_reg_rtx (qimode);
48898 emit_insn (gen_il (op2_l, op2, op2));
48899 emit_insn (gen_ih (op2_h, op2, op2));
48900 /* FALLTHRU */
48902 op1_l = gen_reg_rtx (qimode);
48903 op1_h = gen_reg_rtx (qimode);
48904 emit_insn (gen_il (op1_l, op1, op1));
48905 emit_insn (gen_ih (op1_h, op1, op1));
48906 full_interleave = qimode == V16QImode;
48907 break;
48909 case ASHIFT:
48910 case LSHIFTRT:
48911 uns_p = true;
48912 /* FALLTHRU */
48913 case ASHIFTRT:
48914 op1_l = gen_reg_rtx (himode);
48915 op1_h = gen_reg_rtx (himode);
48916 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
48917 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
48918 full_interleave = true;
48919 break;
48920 default:
48921 gcc_unreachable ();
48924 /* Perform the operation. */
48925 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
48926 1, OPTAB_DIRECT);
48927 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
48928 1, OPTAB_DIRECT);
48929 gcc_assert (res_l && res_h);
48931 /* Merge the data back into the right place. */
48932 d.target = dest;
48933 d.op0 = gen_lowpart (qimode, res_l);
48934 d.op1 = gen_lowpart (qimode, res_h);
48935 d.vmode = qimode;
48936 d.nelt = GET_MODE_NUNITS (qimode);
48937 d.one_operand_p = false;
48938 d.testing_p = false;
48940 if (full_interleave)
48942 /* For SSE2, we used a full interleave, so the desired
48943 results are in the even elements. */
48944 for (i = 0; i < d.nelt; ++i)
48945 d.perm[i] = i * 2;
48947 else
48949 /* For AVX, the interleave used above was not cross-lane. So the
48950 extraction is evens but with the second and third quarter swapped.
48951 Happily, that is even one insn shorter than even extraction.
48952 For AVX512BW we have 4 lanes. We extract evens from within a lane,
48953 always first from the first and then from the second source operand;
48954 the index bits above the low 4 bits remain the same.
48955 Thus, for d.nelt == 32 we want permutation
48956 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
48957 and for d.nelt == 64 we want permutation
48958 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48959 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
48960 for (i = 0; i < d.nelt; ++i)
48961 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
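/* For example, with d.nelt == 32 and i == 8 this gives
   (16 & 14) + 32 + 0 == 32, matching the permutation listed above. */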
48964 ok = ix86_expand_vec_perm_const_1 (&d);
48965 gcc_assert (ok);
48967 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48968 gen_rtx_fmt_ee (code, qimode, op1, op2));
48971 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48972 if op is a CONST_VECTOR with all odd elements equal to their
48973 preceding element. */
48975 static bool
48976 const_vector_equal_evenodd_p (rtx op)
48978 machine_mode mode = GET_MODE (op);
48979 int i, nunits = GET_MODE_NUNITS (mode);
48980 if (GET_CODE (op) != CONST_VECTOR
48981 || nunits != CONST_VECTOR_NUNITS (op))
48982 return false;
48983 for (i = 0; i < nunits; i += 2)
48984 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48985 return false;
48986 return true;
48989 void
48990 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48991 bool uns_p, bool odd_p)
48993 machine_mode mode = GET_MODE (op1);
48994 machine_mode wmode = GET_MODE (dest);
48995 rtx x;
48996 rtx orig_op1 = op1, orig_op2 = op2;
48998 if (!nonimmediate_operand (op1, mode))
48999 op1 = force_reg (mode, op1);
49000 if (!nonimmediate_operand (op2, mode))
49001 op2 = force_reg (mode, op2);
49003 /* We only play even/odd games with vectors of SImode. */
49004 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
49006 /* If we're looking for the odd results, shift those members down to
49007 the even slots. For some CPUs this is faster than a PSHUFD. */
49008 if (odd_p)
49010 /* For XOP use vpmacsdqh, but only for smult, as it is only
49011 signed. */
49012 if (TARGET_XOP && mode == V4SImode && !uns_p)
49014 x = force_reg (wmode, CONST0_RTX (wmode));
49015 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
49016 return;
49019 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
49020 if (!const_vector_equal_evenodd_p (orig_op1))
49021 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
49022 x, NULL, 1, OPTAB_DIRECT);
49023 if (!const_vector_equal_evenodd_p (orig_op2))
49024 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
49025 x, NULL, 1, OPTAB_DIRECT);
49026 op1 = gen_lowpart (mode, op1);
49027 op2 = gen_lowpart (mode, op2);
49030 if (mode == V16SImode)
49032 if (uns_p)
49033 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
49034 else
49035 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
49037 else if (mode == V8SImode)
49039 if (uns_p)
49040 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
49041 else
49042 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
49044 else if (uns_p)
49045 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
49046 else if (TARGET_SSE4_1)
49047 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
49048 else
49050 rtx s1, s2, t0, t1, t2;
49052 /* The easiest way to implement this without PMULDQ is to go through
49053 the motions as if we are performing a full 64-bit multiply, with
49054 the exception that we need to do less shuffling of the elements. */
49056 /* Compute the sign-extension, aka highparts, of the two operands. */
49057 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49058 op1, pc_rtx, pc_rtx);
49059 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49060 op2, pc_rtx, pc_rtx);
49062 /* Multiply LO(A) * HI(B), and vice-versa. */
49063 t1 = gen_reg_rtx (wmode);
49064 t2 = gen_reg_rtx (wmode);
49065 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49066 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49068 /* Multiply LO(A) * LO(B). */
49069 t0 = gen_reg_rtx (wmode);
49070 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49072 /* Combine and shift the highparts into place. */
49073 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49074 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49075 1, OPTAB_DIRECT);
49077 /* Combine high and low parts. */
49078 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49079 return;
49081 emit_insn (x);
49084 void
49085 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49086 bool uns_p, bool high_p)
49088 machine_mode wmode = GET_MODE (dest);
49089 machine_mode mode = GET_MODE (op1);
49090 rtx t1, t2, t3, t4, mask;
49092 switch (mode)
49094 case V4SImode:
49095 t1 = gen_reg_rtx (mode);
49096 t2 = gen_reg_rtx (mode);
49097 if (TARGET_XOP && !uns_p)
49099 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
49100 shuffle the elements once so that all elements are in the right
49101 place for immediate use: { A C B D }. */
49102 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49103 const1_rtx, GEN_INT (3)));
49104 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49105 const1_rtx, GEN_INT (3)));
49107 else
49109 /* Put the elements into place for the multiply. */
49110 ix86_expand_vec_interleave (t1, op1, op1, high_p);
49111 ix86_expand_vec_interleave (t2, op2, op2, high_p);
49112 high_p = false;
49114 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49115 break;
49117 case V8SImode:
49118 /* Shuffle the elements between the lanes. After this we
49119 have { A B E F | C D G H } for each operand. */
49120 t1 = gen_reg_rtx (V4DImode);
49121 t2 = gen_reg_rtx (V4DImode);
49122 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49123 const0_rtx, const2_rtx,
49124 const1_rtx, GEN_INT (3)));
49125 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49126 const0_rtx, const2_rtx,
49127 const1_rtx, GEN_INT (3)));
49129 /* Shuffle the elements within the lanes. After this we
49130 have { A A B B | C C D D } or { E E F F | G G H H }. */
49131 t3 = gen_reg_rtx (V8SImode);
49132 t4 = gen_reg_rtx (V8SImode);
49133 mask = GEN_INT (high_p
49134 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49135 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
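/* I.e. the pshufd selector { 2, 2, 3, 3 } (0xfa) for the high halves
   or { 0, 0, 1, 1 } (0x50) for the low halves, duplicating each
   selected element. */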
49136 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49137 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49139 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49140 break;
49142 case V8HImode:
49143 case V16HImode:
49144 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49145 uns_p, OPTAB_DIRECT);
49146 t2 = expand_binop (mode,
49147 uns_p ? umul_highpart_optab : smul_highpart_optab,
49148 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49149 gcc_assert (t1 && t2);
49151 t3 = gen_reg_rtx (mode);
49152 ix86_expand_vec_interleave (t3, t1, t2, high_p);
49153 emit_move_insn (dest, gen_lowpart (wmode, t3));
49154 break;
49156 case V16QImode:
49157 case V32QImode:
49158 case V32HImode:
49159 case V16SImode:
49160 case V64QImode:
49161 t1 = gen_reg_rtx (wmode);
49162 t2 = gen_reg_rtx (wmode);
49163 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
49164 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
49166 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
49167 break;
49169 default:
49170 gcc_unreachable ();
49174 void
49175 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
49177 rtx res_1, res_2, res_3, res_4;
49179 res_1 = gen_reg_rtx (V4SImode);
49180 res_2 = gen_reg_rtx (V4SImode);
49181 res_3 = gen_reg_rtx (V2DImode);
49182 res_4 = gen_reg_rtx (V2DImode);
49183 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
49184 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
49186 /* Move the results in element 2 down to element 1; we don't care
49187 what goes in elements 2 and 3. Then we can merge the parts
49188 back together with an interleave.
49190 Note that two other sequences were tried:
49191 (1) Use interleaves at the start instead of psrldq, which allows
49192 us to use a single shufps to merge things back at the end.
49193 (2) Use shufps here to combine the two vectors, then pshufd to
49194 put the elements in the correct order.
49195 In both cases the cost of the reformatting stall was too high
49196 and the overall sequence was slower. */
49198 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
49199 const0_rtx, const2_rtx,
49200 const0_rtx, const0_rtx));
49201 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
49202 const0_rtx, const2_rtx,
49203 const0_rtx, const0_rtx));
49204 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
49206 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
49209 void
49210 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
49212 machine_mode mode = GET_MODE (op0);
49213 rtx t1, t2, t3, t4, t5, t6;
49215 if (TARGET_AVX512DQ && mode == V8DImode)
49216 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
49217 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
49218 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
49219 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
49220 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
49221 else if (TARGET_XOP && mode == V2DImode)
49223 /* op1: A,B,C,D, op2: E,F,G,H */
49224 op1 = gen_lowpart (V4SImode, op1);
49225 op2 = gen_lowpart (V4SImode, op2);
49227 t1 = gen_reg_rtx (V4SImode);
49228 t2 = gen_reg_rtx (V4SImode);
49229 t3 = gen_reg_rtx (V2DImode);
49230 t4 = gen_reg_rtx (V2DImode);
49232 /* t1: B,A,D,C */
49233 emit_insn (gen_sse2_pshufd_1 (t1, op1,
49234 GEN_INT (1),
49235 GEN_INT (0),
49236 GEN_INT (3),
49237 GEN_INT (2)));
49239 /* t2: (B*E),(A*F),(D*G),(C*H) */
49240 emit_insn (gen_mulv4si3 (t2, t1, op2));
49242 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
49243 emit_insn (gen_xop_phadddq (t3, t2));
49245 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
49246 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
49248 /* Multiply lower parts and add all. */
49249 t5 = gen_reg_rtx (V2DImode);
49250 emit_insn (gen_vec_widen_umult_even_v4si (t5,
49251 gen_lowpart (V4SImode, op1),
49252 gen_lowpart (V4SImode, op2)));
49253 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
49256 else
49258 machine_mode nmode;
49259 rtx (*umul) (rtx, rtx, rtx);
49261 if (mode == V2DImode)
49263 umul = gen_vec_widen_umult_even_v4si;
49264 nmode = V4SImode;
49266 else if (mode == V4DImode)
49268 umul = gen_vec_widen_umult_even_v8si;
49269 nmode = V8SImode;
49271 else if (mode == V8DImode)
49273 umul = gen_vec_widen_umult_even_v16si;
49274 nmode = V16SImode;
49276 else
49277 gcc_unreachable ();
49280 /* Multiply low parts. */
49281 t1 = gen_reg_rtx (mode);
49282 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
49284 /* Shift input vectors right 32 bits so we can multiply high parts. */
49285 t6 = GEN_INT (32);
49286 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
49287 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
49289 /* Multiply high parts by low parts. */
49290 t4 = gen_reg_rtx (mode);
49291 t5 = gen_reg_rtx (mode);
49292 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
49293 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
49295 /* Combine and shift the highparts back. */
49296 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
49297 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
49299 /* Combine high and low parts. */
49300 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
49303 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49304 gen_rtx_MULT (mode, op1, op2));
49307 /* Return 1 if control transfer instruction INSN
49308 should be encoded with the bnd prefix.
49309 If insn is NULL then return 1 when control
49310 transfer instructions should be prefixed with
49311 bnd by default for the current function. */
49313 bool
49314 ix86_bnd_prefixed_insn_p (rtx insn)
49316 /* For call insns check special flag. */
49317 if (insn && CALL_P (insn))
49319 rtx call = get_call_rtx_from (insn);
49320 if (call)
49321 return CALL_EXPR_WITH_BOUNDS_P (call);
49324 /* All other insns are prefixed only if function is instrumented. */
49325 return chkp_function_instrumented_p (current_function_decl);
49328 /* Calculate integer abs() using only SSE2 instructions. */
49330 void
49331 ix86_expand_sse2_abs (rtx target, rtx input)
49333 machine_mode mode = GET_MODE (target);
49334 rtx tmp0, tmp1, x;
49336 switch (mode)
49338 /* For 32-bit signed integer X, the best way to calculate the absolute
49339 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
49340 case V4SImode:
49341 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
49342 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
49343 NULL, 0, OPTAB_DIRECT);
49344 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
49345 NULL, 0, OPTAB_DIRECT);
49346 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
49347 target, 0, OPTAB_DIRECT);
49348 break;
49350 /* For 16-bit signed integer X, the best way to calculate the absolute
49351 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
49352 case V8HImode:
49353 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49355 x = expand_simple_binop (mode, SMAX, tmp0, input,
49356 target, 0, OPTAB_DIRECT);
49357 break;
49359 /* For 8-bit signed integer X, the best way to calculate the absolute
49360 value of X is min ((unsigned char) X, (unsigned char) (-X)),
49361 as SSE2 provides the PMINUB insn. */
49362 case V16QImode:
49363 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49365 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
49366 target, 0, OPTAB_DIRECT);
49367 break;
49369 default:
49370 gcc_unreachable ();
49373 if (x != target)
49374 emit_move_insn (target, x);
49377 /* Expand an extract from a vector register through pextr insn.
49378 Return true if successful. */
49380 bool
49381 ix86_expand_pextr (rtx *operands)
49383 rtx dst = operands[0];
49384 rtx src = operands[1];
49386 unsigned int size = INTVAL (operands[2]);
49387 unsigned int pos = INTVAL (operands[3]);
49389 if (SUBREG_P (dst))
49391 /* Reject non-lowpart subregs. */
49392 if (SUBREG_BYTE (dst) > 0)
49393 return false;
49394 dst = SUBREG_REG (dst);
49397 if (SUBREG_P (src))
49399 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
49400 src = SUBREG_REG (src);
49403 switch (GET_MODE (src))
49405 case V16QImode:
49406 case V8HImode:
49407 case V4SImode:
49408 case V2DImode:
49409 case V1TImode:
49410 case TImode:
49412 machine_mode srcmode, dstmode;
49413 rtx d, pat;
49415 dstmode = mode_for_size (size, MODE_INT, 0);
49417 switch (dstmode)
49419 case QImode:
49420 if (!TARGET_SSE4_1)
49421 return false;
49422 srcmode = V16QImode;
49423 break;
49425 case HImode:
49426 if (!TARGET_SSE2)
49427 return false;
49428 srcmode = V8HImode;
49429 break;
49431 case SImode:
49432 if (!TARGET_SSE4_1)
49433 return false;
49434 srcmode = V4SImode;
49435 break;
49437 case DImode:
49438 gcc_assert (TARGET_64BIT);
49439 if (!TARGET_SSE4_1)
49440 return false;
49441 srcmode = V2DImode;
49442 break;
49444 default:
49445 return false;
49448 /* Reject extractions from misaligned positions. */
49449 if (pos & (size - 1))
49450 return false;
49452 if (GET_MODE (dst) == dstmode)
49453 d = dst;
49454 else
49455 d = gen_reg_rtx (dstmode);
49457 /* Construct insn pattern. */
49458 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
49459 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
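/* A vec_select with a single-element parallel extracts element POS / SIZE
   of SRC viewed in SRCMODE, which is the form the pextr patterns match.  */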
49461 /* Let the rtl optimizers know about the zero extension performed. */
49462 if (dstmode == QImode || dstmode == HImode)
49464 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
49465 d = gen_lowpart (SImode, d);
49468 emit_insn (gen_rtx_SET (d, pat));
49470 if (d != dst)
49471 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49472 return true;
49475 default:
49476 return false;
49480 /* Expand an insert into a vector register through pinsr insn.
49481 Return true if successful. */
49483 bool
49484 ix86_expand_pinsr (rtx *operands)
49486 rtx dst = operands[0];
49487 rtx src = operands[3];
49489 unsigned int size = INTVAL (operands[1]);
49490 unsigned int pos = INTVAL (operands[2]);
49492 if (SUBREG_P (dst))
49494 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
49495 dst = SUBREG_REG (dst);
49498 switch (GET_MODE (dst))
49500 case V16QImode:
49501 case V8HImode:
49502 case V4SImode:
49503 case V2DImode:
49504 case V1TImode:
49505 case TImode:
49507 machine_mode srcmode, dstmode;
49508 rtx (*pinsr)(rtx, rtx, rtx, rtx);
49509 rtx d;
49511 srcmode = mode_for_size (size, MODE_INT, 0);
49513 switch (srcmode)
49515 case QImode:
49516 if (!TARGET_SSE4_1)
49517 return false;
49518 dstmode = V16QImode;
49519 pinsr = gen_sse4_1_pinsrb;
49520 break;
49522 case HImode:
49523 if (!TARGET_SSE2)
49524 return false;
49525 dstmode = V8HImode;
49526 pinsr = gen_sse2_pinsrw;
49527 break;
49529 case SImode:
49530 if (!TARGET_SSE4_1)
49531 return false;
49532 dstmode = V4SImode;
49533 pinsr = gen_sse4_1_pinsrd;
49534 break;
49536 case DImode:
49537 gcc_assert (TARGET_64BIT);
49538 if (!TARGET_SSE4_1)
49539 return false;
49540 dstmode = V2DImode;
49541 pinsr = gen_sse4_1_pinsrq;
49542 break;
49544 default:
49545 return false;
49548 /* Reject insertions to misaligned positions. */
49549 if (pos & (size - 1))
49550 return false;
49552 if (SUBREG_P (src))
49554 unsigned int srcpos = SUBREG_BYTE (src);
49556 if (srcpos > 0)
49558 rtx extr_ops[4];
49560 extr_ops[0] = gen_reg_rtx (srcmode);
49561 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
49562 extr_ops[2] = GEN_INT (size);
49563 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
49565 if (!ix86_expand_pextr (extr_ops))
49566 return false;
49568 src = extr_ops[0];
49570 else
49571 src = gen_lowpart (srcmode, SUBREG_REG (src));
49574 if (GET_MODE (dst) == dstmode)
49575 d = dst;
49576 else
49577 d = gen_reg_rtx (dstmode);
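/* The pinsr expanders encode the insertion position as a selector with a
   single bit set, hence the 1 << (pos / size) immediate below.  */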
49579 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
49580 gen_lowpart (srcmode, src),
49581 GEN_INT (1 << (pos / size))));
49582 if (d != dst)
49583 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49584 return true;
49587 default:
49588 return false;
49592 /* This function returns the calling abi specific va_list type node.
49593 It returns the FNDECL specific va_list type. */
49595 static tree
49596 ix86_fn_abi_va_list (tree fndecl)
49598 if (!TARGET_64BIT)
49599 return va_list_type_node;
49600 gcc_assert (fndecl != NULL_TREE);
49602 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
49603 return ms_va_list_type_node;
49604 else
49605 return sysv_va_list_type_node;
49608 /* Returns the canonical va_list type specified by TYPE. If there
49609 is no valid TYPE provided, it returns NULL_TREE. */
49611 static tree
49612 ix86_canonical_va_list_type (tree type)
49614 if (TARGET_64BIT)
49616 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
49617 return ms_va_list_type_node;
49619 if ((TREE_CODE (type) == ARRAY_TYPE
49620 && integer_zerop (array_type_nelts (type)))
49621 || POINTER_TYPE_P (type))
49623 tree elem_type = TREE_TYPE (type);
49624 if (TREE_CODE (elem_type) == RECORD_TYPE
49625 && lookup_attribute ("sysv_abi va_list",
49626 TYPE_ATTRIBUTES (elem_type)))
49627 return sysv_va_list_type_node;
49630 return NULL_TREE;
49633 return std_canonical_va_list_type (type);
49636 /* Iterate through the target-specific builtin types for va_list.
49637 IDX denotes the iterator, *PTREE is set to the result type of
49638 the va_list builtin, and *PNAME to its internal type.
49639 Returns zero if there is no element for this index, otherwise
49640 IDX should be increased upon the next call.
49641 Note, do not iterate a base builtin's name like __builtin_va_list.
49642 Used from c_common_nodes_and_builtins. */
49644 static int
49645 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
49647 if (TARGET_64BIT)
49649 switch (idx)
49651 default:
49652 break;
49654 case 0:
49655 *ptree = ms_va_list_type_node;
49656 *pname = "__builtin_ms_va_list";
49657 return 1;
49659 case 1:
49660 *ptree = sysv_va_list_type_node;
49661 *pname = "__builtin_sysv_va_list";
49662 return 1;
49666 return 0;
49669 #undef TARGET_SCHED_DISPATCH
49670 #define TARGET_SCHED_DISPATCH has_dispatch
49671 #undef TARGET_SCHED_DISPATCH_DO
49672 #define TARGET_SCHED_DISPATCH_DO do_dispatch
49673 #undef TARGET_SCHED_REASSOCIATION_WIDTH
49674 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
49675 #undef TARGET_SCHED_REORDER
49676 #define TARGET_SCHED_REORDER ix86_sched_reorder
49677 #undef TARGET_SCHED_ADJUST_PRIORITY
49678 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
49679 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
49680 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
49681 ix86_dependencies_evaluation_hook
49683 /* The size of the dispatch window is the total number of bytes of
49684 object code allowed in a window. */
49685 #define DISPATCH_WINDOW_SIZE 16
49687 /* Number of dispatch windows considered for scheduling. */
49688 #define MAX_DISPATCH_WINDOWS 3
49690 /* Maximum number of instructions in a window. */
49691 #define MAX_INSN 4
49693 /* Maximum number of immediate operands in a window. */
49694 #define MAX_IMM 4
49696 /* Maximum number of immediate bits allowed in a window. */
49697 #define MAX_IMM_SIZE 128
49699 /* Maximum number of 32 bit immediates allowed in a window. */
49700 #define MAX_IMM_32 4
49702 /* Maximum number of 64 bit immediates allowed in a window. */
49703 #define MAX_IMM_64 2
49705 /* Maximum total of loads or prefetches allowed in a window. */
49706 #define MAX_LOAD 2
49708 /* Maximum total of stores allowed in a window. */
49709 #define MAX_STORE 1
49711 #undef BIG
49712 #define BIG 100
49715 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
49716 enum dispatch_group {
49717 disp_no_group = 0,
49718 disp_load,
49719 disp_store,
49720 disp_load_store,
49721 disp_prefetch,
49722 disp_imm,
49723 disp_imm_32,
49724 disp_imm_64,
49725 disp_branch,
49726 disp_cmp,
49727 disp_jcc,
49728 disp_last
49731 /* Number of allowable groups in a dispatch window. It is an array
49732 indexed by dispatch_group enum. 100 is used as a big number,
49733 because the number of these kinds of operations does not have any
49734 effect on the dispatch window, but we need them for other reasons in
49735 the table. */
49736 static unsigned int num_allowable_groups[disp_last] = {
49737 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
49740 char group_name[disp_last + 1][16] = {
49741 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
49742 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
49743 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
49746 /* Instruction path. */
49747 enum insn_path {
49748 no_path = 0,
49749 path_single, /* Single micro op. */
49750 path_double, /* Double micro op. */
49751 path_multi, /* Instructions with more than 2 micro ops. */
49752 last_path
49755 /* sched_insn_info defines a window to the instructions scheduled in
49756 the basic block. It contains a pointer to the insn_info table and
49757 the instruction scheduled.
49759 Windows are allocated for each basic block and are linked
49760 together. */
49761 typedef struct sched_insn_info_s {
49762 rtx insn;
49763 enum dispatch_group group;
49764 enum insn_path path;
49765 int byte_len;
49766 int imm_bytes;
49767 } sched_insn_info;
49769 /* Linked list of dispatch windows. This is a two way list of
49770 dispatch windows of a basic block. It contains information about
49771 the number of uops in the window and the total number of
49772 instructions and of bytes in the object code for this dispatch
49773 window. */
49774 typedef struct dispatch_windows_s {
49775 int num_insn; /* Number of insn in the window. */
49776 int num_uops; /* Number of uops in the window. */
49777 int window_size; /* Number of bytes in the window. */
49778 int window_num; /* Window number, either 0 or 1. */
49779 int num_imm; /* Number of immediates in an insn. */
49780 int num_imm_32; /* Number of 32 bit immediates in an insn. */
49781 int num_imm_64; /* Number of 64 bit immediates in an insn. */
49782 int imm_size; /* Total size of immediates in the window. */
49783 int num_loads; /* Total memory loads in the window. */
49784 int num_stores; /* Total memory stores in the window. */
49785 int violation; /* Violation exists in window. */
49786 sched_insn_info *window; /* Pointer to the window. */
49787 struct dispatch_windows_s *next;
49788 struct dispatch_windows_s *prev;
49789 } dispatch_windows;
49791 /* Immediate values used in an insn. */
49792 typedef struct imm_info_s
49794 int imm;
49795 int imm32;
49796 int imm64;
49797 } imm_info;
49799 static dispatch_windows *dispatch_window_list;
49800 static dispatch_windows *dispatch_window_list1;
49802 /* Get dispatch group of insn. */
49804 static enum dispatch_group
49805 get_mem_group (rtx_insn *insn)
49807 enum attr_memory memory;
49809 if (INSN_CODE (insn) < 0)
49810 return disp_no_group;
49811 memory = get_attr_memory (insn);
49812 if (memory == MEMORY_STORE)
49813 return disp_store;
49815 if (memory == MEMORY_LOAD)
49816 return disp_load;
49818 if (memory == MEMORY_BOTH)
49819 return disp_load_store;
49821 return disp_no_group;
49824 /* Return true if insn is a compare instruction. */
49826 static bool
49827 is_cmp (rtx_insn *insn)
49829 enum attr_type type;
49831 type = get_attr_type (insn);
49832 return (type == TYPE_TEST
49833 || type == TYPE_ICMP
49834 || type == TYPE_FCMP
49835 || GET_CODE (PATTERN (insn)) == COMPARE);
49838 /* Return true if a dispatch violation was encountered. */
49840 static bool
49841 dispatch_violation (void)
49843 if (dispatch_window_list->next)
49844 return dispatch_window_list->next->violation;
49845 return dispatch_window_list->violation;
49848 /* Return true if insn is a branch instruction. */
49850 static bool
49851 is_branch (rtx_insn *insn)
49853 return (CALL_P (insn) || JUMP_P (insn));
49856 /* Return true if insn is a prefetch instruction. */
49858 static bool
49859 is_prefetch (rtx_insn *insn)
49861 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
49864 /* This function initializes a dispatch window and the list container holding a
49865 pointer to the window. */
49867 static void
49868 init_window (int window_num)
49870 int i;
49871 dispatch_windows *new_list;
49873 if (window_num == 0)
49874 new_list = dispatch_window_list;
49875 else
49876 new_list = dispatch_window_list1;
49878 new_list->num_insn = 0;
49879 new_list->num_uops = 0;
49880 new_list->window_size = 0;
49881 new_list->next = NULL;
49882 new_list->prev = NULL;
49883 new_list->window_num = window_num;
49884 new_list->num_imm = 0;
49885 new_list->num_imm_32 = 0;
49886 new_list->num_imm_64 = 0;
49887 new_list->imm_size = 0;
49888 new_list->num_loads = 0;
49889 new_list->num_stores = 0;
49890 new_list->violation = false;
49892 for (i = 0; i < MAX_INSN; i++)
49894 new_list->window[i].insn = NULL;
49895 new_list->window[i].group = disp_no_group;
49896 new_list->window[i].path = no_path;
49897 new_list->window[i].byte_len = 0;
49898 new_list->window[i].imm_bytes = 0;
49900 return;
49903 /* This function allocates and initializes a dispatch window and the
49904 list container holding a pointer to the window. */
49906 static dispatch_windows *
49907 allocate_window (void)
49909 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
49910 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
49912 return new_list;
49915 /* This routine initializes the dispatch scheduling information. It
49916 initiates building dispatch scheduler tables and constructs the
49917 first dispatch window. */
49919 static void
49920 init_dispatch_sched (void)
49922 /* Allocate a dispatch list and a window. */
49923 dispatch_window_list = allocate_window ();
49924 dispatch_window_list1 = allocate_window ();
49925 init_window (0);
49926 init_window (1);
49929 /* This function returns true if a branch is detected. End of a basic block
49930 does not have to be a branch, but here we assume only branches end a
49931 window. */
49933 static bool
49934 is_end_basic_block (enum dispatch_group group)
49936 return group == disp_branch;
49939 /* This function is called when the end of window processing is reached. */
49941 static void
49942 process_end_window (void)
49944 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
49945 if (dispatch_window_list->next)
49947 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
49948 gcc_assert (dispatch_window_list->window_size
49949 + dispatch_window_list1->window_size <= 48);
49950 init_window (1);
49952 init_window (0);
49955 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
49956 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
49957 for 48 bytes of instructions. Note that these windows are not dispatch
49958 windows whose sizes are DISPATCH_WINDOW_SIZE. */
49960 static dispatch_windows *
49961 allocate_next_window (int window_num)
49963 if (window_num == 0)
49965 if (dispatch_window_list->next)
49966 init_window (1);
49967 init_window (0);
49968 return dispatch_window_list;
49971 dispatch_window_list->next = dispatch_window_list1;
49972 dispatch_window_list1->prev = dispatch_window_list;
49974 return dispatch_window_list1;
49977 /* Compute number of immediate operands of an instruction. */
49979 static void
49980 find_constant (rtx in_rtx, imm_info *imm_values)
49982 if (INSN_P (in_rtx))
49983 in_rtx = PATTERN (in_rtx);
49984 subrtx_iterator::array_type array;
49985 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
49986 if (const_rtx x = *iter)
49987 switch (GET_CODE (x))
49989 case CONST:
49990 case SYMBOL_REF:
49991 case CONST_INT:
49992 (imm_values->imm)++;
49993 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
49994 (imm_values->imm32)++;
49995 else
49996 (imm_values->imm64)++;
49997 break;
49999 case CONST_DOUBLE:
50000 case CONST_WIDE_INT:
50001 (imm_values->imm)++;
50002 (imm_values->imm64)++;
50003 break;
50005 case CODE_LABEL:
50006 if (LABEL_KIND (x) == LABEL_NORMAL)
50008 (imm_values->imm)++;
50009 (imm_values->imm32)++;
50011 break;
50013 default:
50014 break;
50018 /* Return total size of immediate operands of an instruction along with number
50019 of corresponding immediate-operands. It initializes its parameters to zero
50020 before calling FIND_CONSTANT.
50021 INSN is the input instruction. IMM is the total of immediates.
50022 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
50023 bit immediates. */
50025 static int
50026 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
50028 imm_info imm_values = {0, 0, 0};
50030 find_constant (insn, &imm_values);
50031 *imm = imm_values.imm;
50032 *imm32 = imm_values.imm32;
50033 *imm64 = imm_values.imm64;
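/* Total immediate size in bytes: 4 per 32-bit immediate plus 8 per
   64-bit immediate.  */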
50034 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
50037 /* This function indicates if an operand of an instruction is an
50038 immediate. */
50040 static bool
50041 has_immediate (rtx_insn *insn)
50043 int num_imm_operand;
50044 int num_imm32_operand;
50045 int num_imm64_operand;
50047 if (insn)
50048 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50049 &num_imm64_operand);
50050 return false;
50053 /* Return single or double path for instructions. */
50055 static enum insn_path
50056 get_insn_path (rtx_insn *insn)
50058 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
50060 if ((int)path == 0)
50061 return path_single;
50063 if ((int)path == 1)
50064 return path_double;
50066 return path_multi;
50069 /* Return insn dispatch group. */
50071 static enum dispatch_group
50072 get_insn_group (rtx_insn *insn)
50074 enum dispatch_group group = get_mem_group (insn);
50075 if (group)
50076 return group;
50078 if (is_branch (insn))
50079 return disp_branch;
50081 if (is_cmp (insn))
50082 return disp_cmp;
50084 if (has_immediate (insn))
50085 return disp_imm;
50087 if (is_prefetch (insn))
50088 return disp_prefetch;
50090 return disp_no_group;
50093 /* Count number of GROUP restricted instructions in a dispatch
50094 window WINDOW_LIST. */
50096 static int
50097 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
50099 enum dispatch_group group = get_insn_group (insn);
50100 int imm_size;
50101 int num_imm_operand;
50102 int num_imm32_operand;
50103 int num_imm64_operand;
50105 if (group == disp_no_group)
50106 return 0;
50108 if (group == disp_imm)
50110 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50111 &num_imm64_operand);
50112 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
50113 || num_imm_operand + window_list->num_imm > MAX_IMM
50114 || (num_imm32_operand > 0
50115 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
50116 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
50117 || (num_imm64_operand > 0
50118 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
50119 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
50120 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
50121 && num_imm64_operand > 0
50122 && ((window_list->num_imm_64 > 0
50123 && window_list->num_insn >= 2)
50124 || window_list->num_insn >= 3)))
50125 return BIG;
50127 return 1;
50130 if ((group == disp_load_store
50131 && (window_list->num_loads >= MAX_LOAD
50132 || window_list->num_stores >= MAX_STORE))
50133 || ((group == disp_load
50134 || group == disp_prefetch)
50135 && window_list->num_loads >= MAX_LOAD)
50136 || (group == disp_store
50137 && window_list->num_stores >= MAX_STORE))
50138 return BIG;
50140 return 1;
50143 /* This function returns true if insn satisfies dispatch rules on the
50144 last window scheduled. */
50146 static bool
50147 fits_dispatch_window (rtx_insn *insn)
50149 dispatch_windows *window_list = dispatch_window_list;
50150 dispatch_windows *window_list_next = dispatch_window_list->next;
50151 unsigned int num_restrict;
50152 enum dispatch_group group = get_insn_group (insn);
50153 enum insn_path path = get_insn_path (insn);
50154 int sum;
50156 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
50157 instructions should be given the lowest priority in the
50158 scheduling process in Haifa scheduler to make sure they will be
50159 scheduled in the same dispatch window as the reference to them. */
50160 if (group == disp_jcc || group == disp_cmp)
50161 return false;
50163 /* Check nonrestricted. */
50164 if (group == disp_no_group || group == disp_branch)
50165 return true;
50167 /* Get last dispatch window. */
50168 if (window_list_next)
50169 window_list = window_list_next;
50171 if (window_list->window_num == 1)
50173 sum = window_list->prev->window_size + window_list->window_size;
50175 if (sum == 32
50176 || (min_insn_size (insn) + sum) >= 48)
50177 /* Window 1 is full. Go for next window. */
50178 return true;
50181 num_restrict = count_num_restricted (insn, window_list);
50183 if (num_restrict > num_allowable_groups[group])
50184 return false;
50186 /* See if it fits in the first window. */
50187 if (window_list->window_num == 0)
50189 /* The first window should have only single and double path
50190 uops. */
50191 if (path == path_double
50192 && (window_list->num_uops + 2) > MAX_INSN)
50193 return false;
50194 else if (path != path_single)
50195 return false;
50197 return true;
50200 /* Add an instruction INSN with NUM_UOPS micro-operations to the
50201 dispatch window WINDOW_LIST. */
50203 static void
50204 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
50206 int byte_len = min_insn_size (insn);
50207 int num_insn = window_list->num_insn;
50208 int imm_size;
50209 sched_insn_info *window = window_list->window;
50210 enum dispatch_group group = get_insn_group (insn);
50211 enum insn_path path = get_insn_path (insn);
50212 int num_imm_operand;
50213 int num_imm32_operand;
50214 int num_imm64_operand;
50216 if (!window_list->violation && group != disp_cmp
50217 && !fits_dispatch_window (insn))
50218 window_list->violation = true;
50220 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50221 &num_imm64_operand);
50223 /* Initialize window with new instruction. */
50224 window[num_insn].insn = insn;
50225 window[num_insn].byte_len = byte_len;
50226 window[num_insn].group = group;
50227 window[num_insn].path = path;
50228 window[num_insn].imm_bytes = imm_size;
50230 window_list->window_size += byte_len;
50231 window_list->num_insn = num_insn + 1;
50232 window_list->num_uops = window_list->num_uops + num_uops;
50233 window_list->imm_size += imm_size;
50234 window_list->num_imm += num_imm_operand;
50235 window_list->num_imm_32 += num_imm32_operand;
50236 window_list->num_imm_64 += num_imm64_operand;
50238 if (group == disp_store)
50239 window_list->num_stores += 1;
50240 else if (group == disp_load
50241 || group == disp_prefetch)
50242 window_list->num_loads += 1;
50243 else if (group == disp_load_store)
50245 window_list->num_stores += 1;
50246 window_list->num_loads += 1;
50250 /* Adds a scheduled instruction, INSN, to the current dispatch window.
50251 If the total bytes of instructions or the number of instructions in
50252 the window exceed the allowable limit, it allocates a new window. */
50254 static void
50255 add_to_dispatch_window (rtx_insn *insn)
50257 int byte_len;
50258 dispatch_windows *window_list;
50259 dispatch_windows *next_list;
50260 dispatch_windows *window0_list;
50261 enum insn_path path;
50262 enum dispatch_group insn_group;
50263 bool insn_fits;
50264 int num_insn;
50265 int num_uops;
50266 int window_num;
50267 int insn_num_uops;
50268 int sum;
50270 if (INSN_CODE (insn) < 0)
50271 return;
50273 byte_len = min_insn_size (insn);
50274 window_list = dispatch_window_list;
50275 next_list = window_list->next;
50276 path = get_insn_path (insn);
50277 insn_group = get_insn_group (insn);
50279 /* Get the last dispatch window. */
50280 if (next_list)
50281 window_list = dispatch_window_list->next;
50283 if (path == path_single)
50284 insn_num_uops = 1;
50285 else if (path == path_double)
50286 insn_num_uops = 2;
50287 else
50288 insn_num_uops = (int) path;
50290 /* If current window is full, get a new window.
50291 Window number zero is full if MAX_INSN uops are scheduled in it.
50292 Window number one is full if window zero's bytes plus window
50293 one's bytes is 32, or if the bytes of the new instruction added
50294 to the total make it greater than 48, or if it already has MAX_INSN
50295 instructions in it.
50296 num_insn = window_list->num_insn;
50297 num_uops = window_list->num_uops;
50298 window_num = window_list->window_num;
50299 insn_fits = fits_dispatch_window (insn);
50301 if (num_insn >= MAX_INSN
50302 || num_uops + insn_num_uops > MAX_INSN
50303 || !(insn_fits))
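/* The current window cannot take this insn; flip to the other
   window (0 <-> 1) and start filling it.  */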
50305 window_num = ~window_num & 1;
50306 window_list = allocate_next_window (window_num);
50309 if (window_num == 0)
50311 add_insn_window (insn, window_list, insn_num_uops);
50312 if (window_list->num_insn >= MAX_INSN
50313 && insn_group == disp_branch)
50315 process_end_window ();
50316 return;
50319 else if (window_num == 1)
50321 window0_list = window_list->prev;
50322 sum = window0_list->window_size + window_list->window_size;
50323 if (sum == 32
50324 || (byte_len + sum) >= 48)
50326 process_end_window ();
50327 window_list = dispatch_window_list;
50330 add_insn_window (insn, window_list, insn_num_uops);
50332 else
50333 gcc_unreachable ();
50335 if (is_end_basic_block (insn_group))
50337 /* End of basic block is reached; do end-basic-block processing. */
50338 process_end_window ();
50339 return;
50343 /* Print the dispatch window, WINDOW_NUM, to FILE. */
50345 DEBUG_FUNCTION static void
50346 debug_dispatch_window_file (FILE *file, int window_num)
50348 dispatch_windows *list;
50349 int i;
50351 if (window_num == 0)
50352 list = dispatch_window_list;
50353 else
50354 list = dispatch_window_list1;
50356 fprintf (file, "Window #%d:\n", list->window_num);
50357 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
50358 list->num_insn, list->num_uops, list->window_size);
50359 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
50360 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
50362 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
50363 list->num_stores);
50364 fprintf (file, " insn info:\n");
50366 for (i = 0; i < MAX_INSN; i++)
50368 if (!list->window[i].insn)
50369 break;
50370 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
50371 i, group_name[list->window[i].group],
50372 i, (void *)list->window[i].insn,
50373 i, list->window[i].path,
50374 i, list->window[i].byte_len,
50375 i, list->window[i].imm_bytes);
50379 /* Print to stdout a dispatch window. */
50381 DEBUG_FUNCTION void
50382 debug_dispatch_window (int window_num)
50384 debug_dispatch_window_file (stdout, window_num);
50387 /* Print INSN dispatch information to FILE. */
50389 DEBUG_FUNCTION static void
50390 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
50392 int byte_len;
50393 enum insn_path path;
50394 enum dispatch_group group;
50395 int imm_size;
50396 int num_imm_operand;
50397 int num_imm32_operand;
50398 int num_imm64_operand;
50400 if (INSN_CODE (insn) < 0)
50401 return;
50403 byte_len = min_insn_size (insn);
50404 path = get_insn_path (insn);
50405 group = get_insn_group (insn);
50406 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50407 &num_imm64_operand);
50409 fprintf (file, " insn info:\n");
50410 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
50411 group_name[group], path, byte_len);
50412 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
50413 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
50416 /* Print to STDOUT the status of the ready list with respect to
50417 dispatch windows. */
50419 DEBUG_FUNCTION void
50420 debug_ready_dispatch (void)
50422 int i;
50423 int no_ready = number_in_ready ();
50425 fprintf (stdout, "Number of ready: %d\n", no_ready);
50427 for (i = 0; i < no_ready; i++)
50428 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
50431 /* This routine is the driver of the dispatch scheduler. */
50433 static void
50434 do_dispatch (rtx_insn *insn, int mode)
50436 if (mode == DISPATCH_INIT)
50437 init_dispatch_sched ();
50438 else if (mode == ADD_TO_DISPATCH_WINDOW)
50439 add_to_dispatch_window (insn);
50442 /* Return TRUE if Dispatch Scheduling is supported. */
50444 static bool
50445 has_dispatch (rtx_insn *insn, int action)
50447 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
50448 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
50449 switch (action)
50451 default:
50452 return false;
50454 case IS_DISPATCH_ON:
50455 return true;
50457 case IS_CMP:
50458 return is_cmp (insn);
50460 case DISPATCH_VIOLATION:
50461 return dispatch_violation ();
50463 case FITS_DISPATCH_WINDOW:
50464 return fits_dispatch_window (insn);
50467 return false;
50470 /* Implementation of reassociation_width target hook used by
50471 reassoc phase to identify parallelism level in reassociated
50472 tree. Statements tree_code is passed in OPC. Arguments type
50473 is passed in MODE.
50475 Currently parallel reassociation is enabled for Atom
50476 processors only and we set reassociation width to be 2
50477 because Atom may issue up to 2 instructions per cycle.
50479 Return value should be fixed if parallel reassociation is
50480 enabled for other processors. */
50482 static int
50483 ix86_reassociation_width (unsigned int, machine_mode mode)
50485 /* Vector part. */
50486 if (VECTOR_MODE_P (mode))
50488 if (TARGET_VECTOR_PARALLEL_EXECUTION)
50489 return 2;
50490 else
50491 return 1;
50494 /* Scalar part. */
50495 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
50496 return 2;
50497 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
50498 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
50499 else
50500 return 1;
50503 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
50504 place emms and femms instructions. */
50506 static machine_mode
50507 ix86_preferred_simd_mode (machine_mode mode)
50509 if (!TARGET_SSE)
50510 return word_mode;
50512 switch (mode)
50514 case QImode:
50515 return TARGET_AVX512BW ? V64QImode :
50516 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
50517 case HImode:
50518 return TARGET_AVX512BW ? V32HImode :
50519 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
50520 case SImode:
50521 return TARGET_AVX512F ? V16SImode :
50522 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
50523 case DImode:
50524 return TARGET_AVX512F ? V8DImode :
50525 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
50527 case SFmode:
50528 if (TARGET_AVX512F)
50529 return V16SFmode;
50530 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50531 return V8SFmode;
50532 else
50533 return V4SFmode;
50535 case DFmode:
50536 if (TARGET_AVX512F)
50537 return V8DFmode;
50538 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50539 return V4DFmode;
50540 else if (TARGET_SSE2)
50541 return V2DFmode;
50542 /* FALLTHRU */
50544 default:
50545 return word_mode;
50549 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
50550 vectors. If AVX512F is enabled then try vectorizing with 512bit,
50551 256bit and 128bit vectors. */
50553 static unsigned int
50554 ix86_autovectorize_vector_sizes (void)
50556 return TARGET_AVX512F ? 64 | 32 | 16 :
50557 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
50560 /* Implementation of targetm.vectorize.get_mask_mode. */
50562 static machine_mode
50563 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
50565 unsigned elem_size = vector_size / nunits;
50567 /* Scalar mask case. */
50568 if ((TARGET_AVX512F && vector_size == 64)
50569 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
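/* AVX-512 style masks use one bit per element, so the mask lives in an
   integer mode with at least NUNITS bits (a k register).  */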
50571 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
50572 return smallest_mode_for_size (nunits, MODE_INT);
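/* Otherwise use a vector mask with the same element layout as the
   data vector.  */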
50575 machine_mode elem_mode
50576 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
50578 gcc_assert (elem_size * nunits == vector_size);
50580 return mode_for_vector (elem_mode, nunits);
50585 /* Return class of registers which could be used for pseudo of MODE
50586 and of class RCLASS for spilling instead of memory. Return NO_REGS
50587 if it is not possible or not profitable. */
50589 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50591 static reg_class_t
50592 ix86_spill_class (reg_class_t rclass, machine_mode mode)
50594 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
50595 && TARGET_SSE2
50596 && TARGET_INTER_UNIT_MOVES_TO_VEC
50597 && TARGET_INTER_UNIT_MOVES_FROM_VEC
50598 && (mode == SImode || (TARGET_64BIT && mode == DImode))
50599 && INTEGER_CLASS_P (rclass))
50600 return ALL_SSE_REGS;
50601 return NO_REGS;
50604 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
50605 but returns a lower bound. */
50607 static unsigned int
50608 ix86_max_noce_ifcvt_seq_cost (edge e)
50610 bool predictable_p = predictable_edge_p (e);
50612 enum compiler_param param
50613 = (predictable_p
50614 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
50615 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
50617 /* If we have a parameter set, use that, otherwise take a guess using
50618 BRANCH_COST. */
50619 if (global_options_set.x_param_values[param])
50620 return PARAM_VALUE (param);
50621 else
50622 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
50626 /* Implement targetm.vectorize.init_cost. */
50628 static void *
50629 ix86_init_cost (struct loop *)
50631 unsigned *cost = XNEWVEC (unsigned, 3);
50632 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
50633 return cost;
50636 /* Implement targetm.vectorize.add_stmt_cost. */
50638 static unsigned
50639 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
50640 struct _stmt_vec_info *stmt_info, int misalign,
50641 enum vect_cost_model_location where)
50643 unsigned *cost = (unsigned *) data;
50644 unsigned retval = 0;
50646 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
50647 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50649 /* Penalize DFmode vector operations for Bonnell. */
50650 if (TARGET_BONNELL && kind == vector_stmt
50651 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
50652 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
50654 /* Statements in an inner loop relative to the loop being
50655 vectorized are weighted more heavily. The value here is
50656 arbitrary and could potentially be improved with analysis. */
50657 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
50658 count *= 50; /* FIXME. */
50660 retval = (unsigned) (count * stmt_cost);
50662 /* We need to multiply all vector stmt cost by 1.7 (estimated cost)
50663 for Silvermont as it has an out-of-order integer pipeline and can execute
50664 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
50665 if ((TARGET_SILVERMONT || TARGET_INTEL)
50666 && stmt_info && stmt_info->stmt)
50668 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
50669 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
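/* Scale by 17/10, an integer approximation of the 1.7 factor
   mentioned above.  */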
50670 retval = (retval * 17) / 10;
50673 cost[where] += retval;
50675 return retval;
50678 /* Implement targetm.vectorize.finish_cost. */
50680 static void
50681 ix86_finish_cost (void *data, unsigned *prologue_cost,
50682 unsigned *body_cost, unsigned *epilogue_cost)
50684 unsigned *cost = (unsigned *) data;
50685 *prologue_cost = cost[vect_prologue];
50686 *body_cost = cost[vect_body];
50687 *epilogue_cost = cost[vect_epilogue];
50690 /* Implement targetm.vectorize.destroy_cost_data. */
50692 static void
50693 ix86_destroy_cost_data (void *data)
50695 free (data);
50698 /* Validate target specific memory model bits in VAL. */
50700 static unsigned HOST_WIDE_INT
50701 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
50703 enum memmodel model = memmodel_from_int (val);
50704 bool strong;
50706 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
50707 |MEMMODEL_MASK)
50708 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
50710 warning (OPT_Winvalid_memory_model,
50711 "Unknown architecture specific memory model");
50712 return MEMMODEL_SEQ_CST;
50714 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
50715 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
50717 warning (OPT_Winvalid_memory_model,
50718 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
50719 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
50721 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
50723 warning (OPT_Winvalid_memory_model,
50724 "HLE_RELEASE not used with RELEASE or stronger memory model");
50725 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
50727 return val;
50730 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
50731 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
50732 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
50733 or number of vecsize_mangle variants that should be emitted. */
50735 static int
50736 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
50737 struct cgraph_simd_clone *clonei,
50738 tree base_type, int num)
50740 int ret = 1;
50742 if (clonei->simdlen
50743 && (clonei->simdlen < 2
50744 || clonei->simdlen > 1024
50745 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
50747 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50748 "unsupported simdlen %d", clonei->simdlen);
50749 return 0;
50752 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
50753 if (TREE_CODE (ret_type) != VOID_TYPE)
50754 switch (TYPE_MODE (ret_type))
50756 case QImode:
50757 case HImode:
50758 case SImode:
50759 case DImode:
50760 case SFmode:
50761 case DFmode:
50762 /* case SCmode: */
50763 /* case DCmode: */
50764 break;
50765 default:
50766 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50767 "unsupported return type %qT for simd\n", ret_type);
50768 return 0;
50771 tree t;
50772 int i;
50774 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
50775 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
50776 switch (TYPE_MODE (TREE_TYPE (t)))
50778 case QImode:
50779 case HImode:
50780 case SImode:
50781 case DImode:
50782 case SFmode:
50783 case DFmode:
50784 /* case SCmode: */
50785 /* case DCmode: */
50786 break;
50787 default:
50788 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50789 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
50790 return 0;
50793 if (clonei->cilk_elemental)
50795 /* Parse the processor clause here. If not present, default to 'b'. */
50796 clonei->vecsize_mangle = 'b';
50798 else if (!TREE_PUBLIC (node->decl))
50800 /* If the function isn't exported, we can pick up just one ISA
50801 for the clones. */
50802 if (TARGET_AVX512F)
50803 clonei->vecsize_mangle = 'e';
50804 else if (TARGET_AVX2)
50805 clonei->vecsize_mangle = 'd';
50806 else if (TARGET_AVX)
50807 clonei->vecsize_mangle = 'c';
50808 else
50809 clonei->vecsize_mangle = 'b';
50810 ret = 1;
50812 else
50814 clonei->vecsize_mangle = "bcde"[num];
50815 ret = 4;
50817 clonei->mask_mode = VOIDmode;
50818 switch (clonei->vecsize_mangle)
50820 case 'b':
50821 clonei->vecsize_int = 128;
50822 clonei->vecsize_float = 128;
50823 break;
50824 case 'c':
50825 clonei->vecsize_int = 128;
50826 clonei->vecsize_float = 256;
50827 break;
50828 case 'd':
50829 clonei->vecsize_int = 256;
50830 clonei->vecsize_float = 256;
50831 break;
50832 case 'e':
50833 clonei->vecsize_int = 512;
50834 clonei->vecsize_float = 512;
50835 if (TYPE_MODE (base_type) == QImode)
50836 clonei->mask_mode = DImode;
50837 else
50838 clonei->mask_mode = SImode;
50839 break;
50841 if (clonei->simdlen == 0)
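/* No simdlen was specified: derive it from the selected vector size
   divided by the width of the characteristic type.  */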
50843 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
50844 clonei->simdlen = clonei->vecsize_int;
50845 else
50846 clonei->simdlen = clonei->vecsize_float;
50847 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
50849 else if (clonei->simdlen > 16)
50851 /* For compatibility with ICC, use the same upper bounds
50852 for simdlen. In particular, for CTYPE below, use the return type,
50853 unless the function returns void, in which case use the characteristic
50854 type. If it is possible for given SIMDLEN to pass CTYPE value
50855 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
50856 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
50857 emit corresponding clone. */
50858 tree ctype = ret_type;
50859 if (TREE_CODE (ret_type) == VOID_TYPE)
50860 ctype = base_type;
50861 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
50862 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
50863 cnt /= clonei->vecsize_int;
50864 else
50865 cnt /= clonei->vecsize_float;
50866 if (cnt > (TARGET_64BIT ? 16 : 8))
50868 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50869 "unsupported simdlen %d", clonei->simdlen);
50870 return 0;
50873 return ret;
50876 /* Add target attribute to SIMD clone NODE if needed. */
50878 static void
50879 ix86_simd_clone_adjust (struct cgraph_node *node)
50881 const char *str = NULL;
50882 gcc_assert (node->decl == cfun->decl);
50883 switch (node->simdclone->vecsize_mangle)
50885 case 'b':
50886 if (!TARGET_SSE2)
50887 str = "sse2";
50888 break;
50889 case 'c':
50890 if (!TARGET_AVX)
50891 str = "avx";
50892 break;
50893 case 'd':
50894 if (!TARGET_AVX2)
50895 str = "avx2";
50896 break;
50897 case 'e':
50898 if (!TARGET_AVX512F)
50899 str = "avx512f";
50900 break;
50901 default:
50902 gcc_unreachable ();
50904 if (str == NULL)
50905 return;
50906 push_cfun (NULL);
50907 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
50908 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
50909 gcc_assert (ok);
50910 pop_cfun ();
50911 ix86_reset_previous_fndecl ();
50912 ix86_set_current_function (node->decl);
50915 /* If SIMD clone NODE can't be used in a vectorized loop
50916 in current function, return -1, otherwise return a badness of using it
50917 (0 if it is most desirable from vecsize_mangle point of view, 1
50918 slightly less desirable, etc.). */
50920 static int
50921 ix86_simd_clone_usable (struct cgraph_node *node)
50923 switch (node->simdclone->vecsize_mangle)
50925 case 'b':
50926 if (!TARGET_SSE2)
50927 return -1;
50928 if (!TARGET_AVX)
50929 return 0;
50930 return TARGET_AVX2 ? 2 : 1;
50931 case 'c':
50932 if (!TARGET_AVX)
50933 return -1;
50934 return TARGET_AVX2 ? 1 : 0;
50935 case 'd':
50936 if (!TARGET_AVX2)
50937 return -1;
50938 return 0;
50939 case 'e':
50940 if (!TARGET_AVX512F)
50941 return -1;
50942 return 0;
50943 default:
50944 gcc_unreachable ();
50948 /* This function adjusts the unroll factor based on
50949 the hardware capabilities. For example, bdver3 has
50950 a loop buffer which makes unrolling of smaller
50951 loops less important. This function decides the
50952 unroll factor using number of memory references
50953 (value 32 is used) as a heuristic. */
50955 static unsigned
50956 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
50958 basic_block *bbs;
50959 rtx_insn *insn;
50960 unsigned i;
50961 unsigned mem_count = 0;
50963 if (!TARGET_ADJUST_UNROLL)
50964 return nunroll;
50966 /* Count the number of memory references within the loop body.
50967 This value determines the unrolling factor for bdver3 and bdver4
50968 architectures. */
50969 subrtx_iterator::array_type array;
50970 bbs = get_loop_body (loop);
50971 for (i = 0; i < loop->num_nodes; i++)
50972 FOR_BB_INSNS (bbs[i], insn)
50973 if (NONDEBUG_INSN_P (insn))
50974 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
50975 if (const_rtx x = *iter)
50976 if (MEM_P (x))
50978 machine_mode mode = GET_MODE (x);
50979 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50980 if (n_words > 4)
50981 mem_count += 2;
50982 else
50983 mem_count += 1;
50985 free (bbs);
50987 if (mem_count && mem_count <= 32)
50988 return 32 / mem_count;
50990 return nunroll;
50994 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50996 static bool
50997 ix86_float_exceptions_rounding_supported_p (void)
50999 /* For x87 floating point with standard excess precision handling,
51000 there is no adddf3 pattern (since x87 floating point only has
51001 XFmode operations) so the default hook implementation gets this
51002 wrong. */
51003 return TARGET_80387 || TARGET_SSE_MATH;
51006 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
51008 static void
51009 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
51011 if (!TARGET_80387 && !TARGET_SSE_MATH)
51012 return;
51013 tree exceptions_var = create_tmp_var_raw (integer_type_node);
51014 if (TARGET_80387)
51016 tree fenv_index_type = build_index_type (size_int (6));
51017 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
51018 tree fenv_var = create_tmp_var_raw (fenv_type);
51019 TREE_ADDRESSABLE (fenv_var) = 1;
51020 tree fenv_ptr = build_pointer_type (fenv_type);
51021 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
51022 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
51023 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
51024 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
51025 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
51026 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
51027 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
51028 tree hold_fnclex = build_call_expr (fnclex, 0);
51029 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
51030 NULL_TREE, NULL_TREE);
51031 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
51032 hold_fnclex);
51033 *clear = build_call_expr (fnclex, 0);
51034 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
51035 tree fnstsw_call = build_call_expr (fnstsw, 0);
51036 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
51037 sw_var, fnstsw_call);
51038 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
51039 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
51040 exceptions_var, exceptions_x87);
51041 *update = build2 (COMPOUND_EXPR, integer_type_node,
51042 sw_mod, update_mod);
51043 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
51044 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
51046 if (TARGET_SSE_MATH)
51048 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
51049 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
51050 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
51051 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
51052 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
51053 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
51054 mxcsr_orig_var, stmxcsr_hold_call);
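/* Build the MXCSR value used while the construct is active: OR in
   0x1f80 to mask all exceptions and AND with 0xffffffc0 to clear any
   pending exception flags.  */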
51055 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
51056 mxcsr_orig_var,
51057 build_int_cst (unsigned_type_node, 0x1f80));
51058 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
51059 build_int_cst (unsigned_type_node, 0xffffffc0));
51060 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
51061 mxcsr_mod_var, hold_mod_val);
51062 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
51063 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
51064 hold_assign_orig, hold_assign_mod);
51065 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
51066 ldmxcsr_hold_call);
51067 if (*hold)
51068 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
51069 else
51070 *hold = hold_all;
51071 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
51072 if (*clear)
51073 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
51074 ldmxcsr_clear_call);
51075 else
51076 *clear = ldmxcsr_clear_call;
51077 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
51078 tree exceptions_sse = fold_convert (integer_type_node,
51079 stxmcsr_update_call);
51080 if (*update)
51082 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
51083 exceptions_var, exceptions_sse);
51084 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
51085 exceptions_var, exceptions_mod);
51086 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
51087 exceptions_assign);
51089 else
51090 *update = build2 (MODIFY_EXPR, integer_type_node,
51091 exceptions_var, exceptions_sse);
51092 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
51093 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51094 ldmxcsr_update_call);
51096 tree atomic_feraiseexcept
51097 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
51098 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
51099 1, exceptions_var);
51100 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51101 atomic_feraiseexcept_call);
51104 /* Return mode to be used for bounds or VOIDmode
51105 if bounds are not supported. */
51107 static enum machine_mode
51108 ix86_mpx_bound_mode ()
51110 /* Do not support pointer checker if MPX
51111 is not enabled. */
51112 if (!TARGET_MPX)
51114 if (flag_check_pointer_bounds)
51115 warning (0, "Pointer Checker requires MPX support on this target."
51116 " Use -mmpx options to enable MPX.");
51117 return VOIDmode;
51120 return BNDmode;
51123 /* Return constant used to statically initialize constant bounds.
51125 This function is used to create special bound values. For now
51126 only INIT bounds and NONE bounds are expected. More special
51127 values may be added later. */
51129 static tree
51130 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
51132 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
51133 : build_zero_cst (pointer_sized_int_node);
51134 tree high = ub ? build_zero_cst (pointer_sized_int_node)
51135 : build_minus_one_cst (pointer_sized_int_node);
51137 /* This function is supposed to be used to create INIT and
51138 NONE bounds only. */
51139 gcc_assert ((lb == 0 && ub == -1)
51140 || (lb == -1 && ub == 0));
51142 return build_complex (NULL, low, high);
51145 /* Generate a list of statements STMTS to initialize pointer bounds
51146 variable VAR with bounds LB and UB. Return the number of generated
51147 statements. */
51149 static int
51150 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
51152 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
51153 tree lhs, modify, var_p;
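/* The upper bound is kept in one's complement form, hence the
   BIT_NOT below.  */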
51155 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
51156 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
51158 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
51159 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
51160 append_to_statement_list (modify, stmts);
51162 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
51163 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
51164 TYPE_SIZE_UNIT (pointer_sized_int_node)));
51165 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
51166 append_to_statement_list (modify, stmts);
51168 return 2;
51171 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
51172 /* For i386, common symbol is local only for non-PIE binaries. For
51173 x86-64, common symbol is local only for non-PIE binaries or if the linker
51174 supports copy reloc in PIE binaries. */
51176 static bool
51177 ix86_binds_local_p (const_tree exp)
51179 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
51180 (!flag_pic
51181 || (TARGET_64BIT
51182 && HAVE_LD_PIE_COPYRELOC != 0)));
51184 #endif
51186 /* If MEM is in the form of [base+offset], extract the two parts
51187 of the address and store them in BASE and OFFSET, otherwise return false. */
51189 static bool
51190 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
51192 rtx addr;
51194 gcc_assert (MEM_P (mem));
51196 addr = XEXP (mem, 0);
51198 if (GET_CODE (addr) == CONST)
51199 addr = XEXP (addr, 0);
51201 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
51203 *base = addr;
51204 *offset = const0_rtx;
51205 return true;
51208 if (GET_CODE (addr) == PLUS
51209 && (REG_P (XEXP (addr, 0))
51210 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
51211 && CONST_INT_P (XEXP (addr, 1)))
51213 *base = XEXP (addr, 0);
51214 *offset = XEXP (addr, 1);
51215 return true;
51218 return false;
51221 /* Given OPERANDS of consecutive load/store, check if we can merge
51222 them into move multiple. LOAD is true if they are load instructions.
51223 MODE is the mode of memory operands. */
51225 bool
51226 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
51227 enum machine_mode mode)
51229 HOST_WIDE_INT offval_1, offval_2, msize;
51230 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
51232 if (load)
51234 mem_1 = operands[1];
51235 mem_2 = operands[3];
51236 reg_1 = operands[0];
51237 reg_2 = operands[2];
51239 else
51241 mem_1 = operands[0];
51242 mem_2 = operands[2];
51243 reg_1 = operands[1];
51244 reg_2 = operands[3];
51247 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
51249 if (REGNO (reg_1) != REGNO (reg_2))
51250 return false;
51252 /* Check if the addresses are in the form of [base+offset]. */
51253 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
51254 return false;
51255 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
51256 return false;
51258 /* Check if the bases are the same. */
51259 if (!rtx_equal_p (base_1, base_2))
51260 return false;
51262 offval_1 = INTVAL (offset_1);
51263 offval_2 = INTVAL (offset_2);
51264 msize = GET_MODE_SIZE (mode);
51265 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
51266 if (offval_1 + msize != offval_2)
51267 return false;
51269 return true;
51272 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
51274 static bool
51275 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
51276 optimization_type opt_type)
51278 switch (op)
51280 case asin_optab:
51281 case acos_optab:
51282 case log1p_optab:
51283 case exp_optab:
51284 case exp10_optab:
51285 case exp2_optab:
51286 case expm1_optab:
51287 case ldexp_optab:
51288 case scalb_optab:
51289 case round_optab:
51290 return opt_type == OPTIMIZE_FOR_SPEED;
51292 case rint_optab:
51293 if (SSE_FLOAT_MODE_P (mode1)
51294 && TARGET_SSE_MATH
51295 && !flag_trapping_math
51296 && !TARGET_ROUND)
51297 return opt_type == OPTIMIZE_FOR_SPEED;
51298 return true;
51300 case floor_optab:
51301 case ceil_optab:
51302 case btrunc_optab:
51303 if (SSE_FLOAT_MODE_P (mode1)
51304 && TARGET_SSE_MATH
51305 && !flag_trapping_math
51306 && TARGET_ROUND)
51307 return true;
51308 return opt_type == OPTIMIZE_FOR_SPEED;
51310 case rsqrt_optab:
51311 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
51313 default:
51314 return true;
51318 /* Address space support.
51320 This is not "far pointers" in the 16-bit sense, but an easy way
51321 to use %fs and %gs segment prefixes. Therefore:
51323 (a) All address spaces have the same modes,
51324 (b) All address spaces have the same address forms,
51325 (c) While %fs and %gs are technically subsets of the generic
51326 address space, they are probably not subsets of each other.
51327 (d) Since we have no access to the segment base register values
51328 without resorting to a system call, we cannot convert a
51329 non-default address space to a default address space.
51330 Therefore we do not claim %fs or %gs are subsets of generic.
51332 Therefore we can (mostly) use the default hooks. */
51334 /* All use of segmentation is assumed to make address 0 valid. */
51336 static bool
51337 ix86_addr_space_zero_address_valid (addr_space_t as)
51339 return as != ADDR_SPACE_GENERIC;
51342 static void
51343 ix86_init_libfuncs (void)
51345 if (TARGET_64BIT)
51347 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
51348 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
51350 else
51352 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
51353 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
51356 #if TARGET_MACHO
51357 darwin_rename_builtins ();
51358 #endif
51361 /* Generate call to __divmoddi4. */
51363 static void
51364 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
51365 rtx op0, rtx op1,
51366 rtx *quot_p, rtx *rem_p)
51368 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
51370 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
51371 mode, 3,
51372 op0, GET_MODE (op0),
51373 op1, GET_MODE (op1),
51374 XEXP (rem, 0), Pmode);
51375 *quot_p = quot;
51376 *rem_p = rem;
51379 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
51380 FPU, assume that the fpcw is set to extended precision; when using
51381 only SSE, rounding is correct; when using both SSE and the FPU,
51382 the rounding precision is indeterminate, since either may be chosen
51383 apparently at random. */
51385 static enum flt_eval_method
51386 ix86_excess_precision (enum excess_precision_type type)
51388 switch (type)
51390 case EXCESS_PRECISION_TYPE_FAST:
51391 /* The fastest type to promote to will always be the native type,
51392 whether that occurs with implicit excess precision or
51393 otherwise. */
51394 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51395 case EXCESS_PRECISION_TYPE_STANDARD:
51396 case EXCESS_PRECISION_TYPE_IMPLICIT:
51397 /* Otherwise, the excess precision we want when we are
51398 in a standards compliant mode, and the implicit precision we
51399 provide would be identical were it not for the unpredictable
51400 cases. */
51401 if (!TARGET_80387)
51402 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51403 else if (!TARGET_MIX_SSE_I387)
51405 if (!TARGET_SSE_MATH)
51406 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
51407 else if (TARGET_SSE2)
51408 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51411 /* If we are in standards-compliant mode, but we know we will
51412 calculate in unpredictable precision, return
51413 FLT_EVAL_METHOD_PROMOTE_TO_FLOAT. There is no reason to introduce explicit
51414 excess precision if the target can't guarantee it will honor
51415 it. */
51416 return (type == EXCESS_PRECISION_TYPE_STANDARD
51417 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
51418 : FLT_EVAL_METHOD_UNPREDICTABLE);
51419 default:
51420 gcc_unreachable ();
51423 return FLT_EVAL_METHOD_UNPREDICTABLE;
51426 /* Target-specific selftests. */
51428 #if CHECKING_P
51430 namespace selftest {
51432 /* Verify that hard regs are dumped as expected (in compact mode). */
51434 static void
51435 ix86_test_dumping_hard_regs ()
51437 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
51438 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
51441 /* Test dumping an insn with repeated references to the same SCRATCH,
51442 to verify the rtx_reuse code. */
51444 static void
51445 ix86_test_dumping_memory_blockage ()
51447 set_new_first_and_last_insn (NULL, NULL);
51449 rtx pat = gen_memory_blockage ();
51450 rtx_reuse_manager r;
51451 r.preprocess (pat);
51453 /* Verify that the repeated references to the SCRATCH show the use
51454 of reuse IDs. The first should be prefixed with a reuse ID,
51455 and the second should be dumped as a "reuse_rtx" of that ID.
51456 The expected string assumes Pmode == DImode. */
51457 if (Pmode == DImode)
51458 ASSERT_RTL_DUMP_EQ_WITH_REUSE
51459 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
51460 " (unspec:BLK [\n"
51461 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
51462 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
51465 /* Verify loading an RTL dump; specifically a dump of copying
51466 a param on x86_64 from a hard reg into the frame.
51467 This test is target-specific since the dump contains target-specific
51468 hard reg names. */
51470 static void
51471 ix86_test_loading_dump_fragment_1 ()
51473 rtl_dump_test t (SELFTEST_LOCATION,
51474 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
51476 rtx_insn *insn = get_insn_by_uid (1);
51478 /* The block structure and indentation here is purely for
51479 readability; it mirrors the structure of the rtx. */
51480 tree mem_expr;
51482 rtx pat = PATTERN (insn);
51483 ASSERT_EQ (SET, GET_CODE (pat));
51485 rtx dest = SET_DEST (pat);
51486 ASSERT_EQ (MEM, GET_CODE (dest));
51487 /* Verify the "/c" was parsed. */
51488 ASSERT_TRUE (RTX_FLAG (dest, call));
51489 ASSERT_EQ (SImode, GET_MODE (dest));
51491 rtx addr = XEXP (dest, 0);
51492 ASSERT_EQ (PLUS, GET_CODE (addr));
51493 ASSERT_EQ (DImode, GET_MODE (addr));
51495 rtx lhs = XEXP (addr, 0);
51496 /* Verify that the "frame" REG was consolidated. */
51497 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
51500 rtx rhs = XEXP (addr, 1);
51501 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
51502 ASSERT_EQ (-4, INTVAL (rhs));
51505 /* Verify the "[1 i+0 S4 A32]" was parsed. */
51506 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
51507 /* "i" should have been handled by synthesizing a global int
51508 variable named "i". */
51509 mem_expr = MEM_EXPR (dest);
51510 ASSERT_NE (mem_expr, NULL);
51511 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
51512 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
51513 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
51514 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
51515 /* "+0". */
51516 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
51517 ASSERT_EQ (0, MEM_OFFSET (dest));
51518 /* "S4". */
51519 ASSERT_EQ (4, MEM_SIZE (dest));
51520 /* "A32. */
51521 ASSERT_EQ (32, MEM_ALIGN (dest));
51524 rtx src = SET_SRC (pat);
51525 ASSERT_EQ (REG, GET_CODE (src));
51526 ASSERT_EQ (SImode, GET_MODE (src));
51527 ASSERT_EQ (5, REGNO (src));
51528 tree reg_expr = REG_EXPR (src);
51529 /* "i" here should point to the same var as for the MEM_EXPR. */
51530 ASSERT_EQ (reg_expr, mem_expr);
51535 /* Verify that the RTL loader copes with a call_insn dump.
51536 This test is target-specific since the dump contains a target-specific
51537 hard reg name. */
51539 static void
51540 ix86_test_loading_call_insn ()
51542 /* The test dump includes register "xmm0", which requires TARGET_SSE
51543 to exist. */
51544 if (!TARGET_SSE)
51545 return;
51547 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
51549 rtx_insn *insn = get_insns ();
51550 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
51552 /* "/j". */
51553 ASSERT_TRUE (RTX_FLAG (insn, jump));
51555 rtx pat = PATTERN (insn);
51556 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
51558 /* Verify REG_NOTES. */
51560 /* "(expr_list:REG_CALL_DECL". */
51561 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
51562 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
51563 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
51565 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
51566 rtx_expr_list *note1 = note0->next ();
51567 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
51569 ASSERT_EQ (NULL, note1->next ());
51572 /* Verify CALL_INSN_FUNCTION_USAGE. */
51574 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
51575 rtx_expr_list *usage
51576 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
51577 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
51578 ASSERT_EQ (DFmode, GET_MODE (usage));
51579 ASSERT_EQ (USE, GET_CODE (usage->element ()));
51580 ASSERT_EQ (NULL, usage->next ());
51584 /* Verify that the RTL loader copes with a dump from print_rtx_function.
51585 This test is target-specific since the dump contains target-specific
51586 hard reg names. */
51588 static void
51589 ix86_test_loading_full_dump ()
51591 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
51593 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51595 rtx_insn *insn_1 = get_insn_by_uid (1);
51596 ASSERT_EQ (NOTE, GET_CODE (insn_1));
51598 rtx_insn *insn_7 = get_insn_by_uid (7);
51599 ASSERT_EQ (INSN, GET_CODE (insn_7));
51600 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
51602 rtx_insn *insn_15 = get_insn_by_uid (15);
51603 ASSERT_EQ (INSN, GET_CODE (insn_15));
51604 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
51606 /* Verify crtl->return_rtx. */
51607 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
51608 ASSERT_EQ (0, REGNO (crtl->return_rtx));
51609 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
51612 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
51613 In particular, verify that it correctly loads the 2nd operand.
51614 This test is target-specific since these are machine-specific
51615 operands (and enums). */
51617 static void
51618 ix86_test_loading_unspec ()
51620 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
51622 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51624 ASSERT_TRUE (cfun);
51626 /* Test of an UNSPEC. */
51627 rtx_insn *insn = get_insns ();
51628 ASSERT_EQ (INSN, GET_CODE (insn));
51629 rtx set = single_set (insn);
51630 ASSERT_NE (NULL, set);
51631 rtx dst = SET_DEST (set);
51632 ASSERT_EQ (MEM, GET_CODE (dst));
51633 rtx src = SET_SRC (set);
51634 ASSERT_EQ (UNSPEC, GET_CODE (src));
51635 ASSERT_EQ (BLKmode, GET_MODE (src));
51636 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
51638 rtx v0 = XVECEXP (src, 0, 0);
51640 /* Verify that the two uses of the first SCRATCH have pointer
51641 equality. */
51642 rtx scratch_a = XEXP (dst, 0);
51643 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
51645 rtx scratch_b = XEXP (v0, 0);
51646 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
51648 ASSERT_EQ (scratch_a, scratch_b);
51650 /* Verify that the two mems are thus treated as equal. */
51651 ASSERT_TRUE (rtx_equal_p (dst, v0));
51653 /* Verify that the insn is recognized. */
51654 ASSERT_NE (-1, recog_memoized (insn));
51656 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
51657 insn = NEXT_INSN (insn);
51658 ASSERT_EQ (INSN, GET_CODE (insn));
51660 set = single_set (insn);
51661 ASSERT_NE (NULL, set);
51663 src = SET_SRC (set);
51664 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
51665 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
51668 /* Run all target-specific selftests. */
51670 static void
51671 ix86_run_selftests (void)
51673 ix86_test_dumping_hard_regs ();
51674 ix86_test_dumping_memory_blockage ();
51676 /* Various tests of loading RTL dumps, here because they contain
51677 ix86-isms (e.g. names of hard regs). */
51678 ix86_test_loading_dump_fragment_1 ();
51679 ix86_test_loading_call_insn ();
51680 ix86_test_loading_full_dump ();
51681 ix86_test_loading_unspec ();
51684 } // namespace selftest
51686 #endif /* CHECKING_P */
51688 /* Initialize the GCC target structure. */
51689 #undef TARGET_RETURN_IN_MEMORY
51690 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
51692 #undef TARGET_LEGITIMIZE_ADDRESS
51693 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
51695 #undef TARGET_ATTRIBUTE_TABLE
51696 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
51697 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
51698 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
51699 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51700 # undef TARGET_MERGE_DECL_ATTRIBUTES
51701 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
51702 #endif
51704 #undef TARGET_COMP_TYPE_ATTRIBUTES
51705 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
51707 #undef TARGET_INIT_BUILTINS
51708 #define TARGET_INIT_BUILTINS ix86_init_builtins
51709 #undef TARGET_BUILTIN_DECL
51710 #define TARGET_BUILTIN_DECL ix86_builtin_decl
51711 #undef TARGET_EXPAND_BUILTIN
51712 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
51714 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
51715 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
51716 ix86_builtin_vectorized_function
51718 #undef TARGET_VECTORIZE_BUILTIN_GATHER
51719 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
51721 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
51722 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
51724 #undef TARGET_BUILTIN_RECIPROCAL
51725 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
51727 #undef TARGET_ASM_FUNCTION_EPILOGUE
51728 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
51730 #undef TARGET_ENCODE_SECTION_INFO
51731 #ifndef SUBTARGET_ENCODE_SECTION_INFO
51732 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
51733 #else
51734 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
51735 #endif
51737 #undef TARGET_ASM_OPEN_PAREN
51738 #define TARGET_ASM_OPEN_PAREN ""
51739 #undef TARGET_ASM_CLOSE_PAREN
51740 #define TARGET_ASM_CLOSE_PAREN ""
51742 #undef TARGET_ASM_BYTE_OP
51743 #define TARGET_ASM_BYTE_OP ASM_BYTE
51745 #undef TARGET_ASM_ALIGNED_HI_OP
51746 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
51747 #undef TARGET_ASM_ALIGNED_SI_OP
51748 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
51749 #ifdef ASM_QUAD
51750 #undef TARGET_ASM_ALIGNED_DI_OP
51751 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
51752 #endif
51754 #undef TARGET_PROFILE_BEFORE_PROLOGUE
51755 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
51757 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
51758 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
51760 #undef TARGET_ASM_UNALIGNED_HI_OP
51761 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
51762 #undef TARGET_ASM_UNALIGNED_SI_OP
51763 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
51764 #undef TARGET_ASM_UNALIGNED_DI_OP
51765 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
51767 #undef TARGET_PRINT_OPERAND
51768 #define TARGET_PRINT_OPERAND ix86_print_operand
51769 #undef TARGET_PRINT_OPERAND_ADDRESS
51770 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
51771 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
51772 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
51773 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
51774 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
51776 #undef TARGET_SCHED_INIT_GLOBAL
51777 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
51778 #undef TARGET_SCHED_ADJUST_COST
51779 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
51780 #undef TARGET_SCHED_ISSUE_RATE
51781 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
51782 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
51783 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
51784 ia32_multipass_dfa_lookahead
51785 #undef TARGET_SCHED_MACRO_FUSION_P
51786 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
51787 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
51788 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
51790 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
51791 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
51793 #undef TARGET_MEMMODEL_CHECK
51794 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
51796 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
51797 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
51799 #ifdef HAVE_AS_TLS
51800 #undef TARGET_HAVE_TLS
51801 #define TARGET_HAVE_TLS true
51802 #endif
51803 #undef TARGET_CANNOT_FORCE_CONST_MEM
51804 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
51805 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
51806 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
51808 #undef TARGET_DELEGITIMIZE_ADDRESS
51809 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
51811 #undef TARGET_MS_BITFIELD_LAYOUT_P
51812 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
51814 #if TARGET_MACHO
51815 #undef TARGET_BINDS_LOCAL_P
51816 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
51817 #else
51818 #undef TARGET_BINDS_LOCAL_P
51819 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
51820 #endif
51821 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51822 #undef TARGET_BINDS_LOCAL_P
51823 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
51824 #endif
51826 #undef TARGET_ASM_OUTPUT_MI_THUNK
51827 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
51828 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
51829 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
51831 #undef TARGET_ASM_FILE_START
51832 #define TARGET_ASM_FILE_START x86_file_start
51834 #undef TARGET_OPTION_OVERRIDE
51835 #define TARGET_OPTION_OVERRIDE ix86_option_override
51837 #undef TARGET_REGISTER_MOVE_COST
51838 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
51839 #undef TARGET_MEMORY_MOVE_COST
51840 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
51841 #undef TARGET_RTX_COSTS
51842 #define TARGET_RTX_COSTS ix86_rtx_costs
51843 #undef TARGET_ADDRESS_COST
51844 #define TARGET_ADDRESS_COST ix86_address_cost
51846 #undef TARGET_FIXED_CONDITION_CODE_REGS
51847 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
51848 #undef TARGET_CC_MODES_COMPATIBLE
51849 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
51851 #undef TARGET_MACHINE_DEPENDENT_REORG
51852 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
51854 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
51855 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
51857 #undef TARGET_BUILD_BUILTIN_VA_LIST
51858 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
51860 #undef TARGET_FOLD_BUILTIN
51861 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
51863 #undef TARGET_GIMPLE_FOLD_BUILTIN
51864 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
51866 #undef TARGET_COMPARE_VERSION_PRIORITY
51867 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
51869 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
51870 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
51871 ix86_generate_version_dispatcher_body
51873 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
51874 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
51875 ix86_get_function_versions_dispatcher
51877 #undef TARGET_ENUM_VA_LIST_P
51878 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
51880 #undef TARGET_FN_ABI_VA_LIST
51881 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
51883 #undef TARGET_CANONICAL_VA_LIST_TYPE
51884 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
51886 #undef TARGET_EXPAND_BUILTIN_VA_START
51887 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
51889 #undef TARGET_MD_ASM_ADJUST
51890 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
51892 #undef TARGET_C_EXCESS_PRECISION
51893 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
51894 #undef TARGET_PROMOTE_PROTOTYPES
51895 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
51896 #undef TARGET_SETUP_INCOMING_VARARGS
51897 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
51898 #undef TARGET_MUST_PASS_IN_STACK
51899 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
51900 #undef TARGET_FUNCTION_ARG_ADVANCE
51901 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
51902 #undef TARGET_FUNCTION_ARG
51903 #define TARGET_FUNCTION_ARG ix86_function_arg
51904 #undef TARGET_INIT_PIC_REG
51905 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
51906 #undef TARGET_USE_PSEUDO_PIC_REG
51907 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
51908 #undef TARGET_FUNCTION_ARG_BOUNDARY
51909 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
51910 #undef TARGET_PASS_BY_REFERENCE
51911 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
51912 #undef TARGET_INTERNAL_ARG_POINTER
51913 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
51914 #undef TARGET_UPDATE_STACK_BOUNDARY
51915 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
51916 #undef TARGET_GET_DRAP_RTX
51917 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
51918 #undef TARGET_STRICT_ARGUMENT_NAMING
51919 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
51920 #undef TARGET_STATIC_CHAIN
51921 #define TARGET_STATIC_CHAIN ix86_static_chain
51922 #undef TARGET_TRAMPOLINE_INIT
51923 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
51924 #undef TARGET_RETURN_POPS_ARGS
51925 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
51927 #undef TARGET_LEGITIMATE_COMBINED_INSN
51928 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
51930 #undef TARGET_ASAN_SHADOW_OFFSET
51931 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
51933 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
51934 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
51936 #undef TARGET_SCALAR_MODE_SUPPORTED_P
51937 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
51939 #undef TARGET_VECTOR_MODE_SUPPORTED_P
51940 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
51942 #undef TARGET_C_MODE_FOR_SUFFIX
51943 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
51945 #ifdef HAVE_AS_TLS
51946 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
51947 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
51948 #endif
51950 #ifdef SUBTARGET_INSERT_ATTRIBUTES
51951 #undef TARGET_INSERT_ATTRIBUTES
51952 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
51953 #endif
51955 #undef TARGET_MANGLE_TYPE
51956 #define TARGET_MANGLE_TYPE ix86_mangle_type
51958 #ifdef TARGET_THREAD_SSP_OFFSET
51959 #undef TARGET_STACK_PROTECT_GUARD
51960 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
51961 #endif
51963 #if !TARGET_MACHO
51964 #undef TARGET_STACK_PROTECT_FAIL
51965 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
51966 #endif
51968 #undef TARGET_FUNCTION_VALUE
51969 #define TARGET_FUNCTION_VALUE ix86_function_value
51971 #undef TARGET_FUNCTION_VALUE_REGNO_P
51972 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
51974 #undef TARGET_PROMOTE_FUNCTION_MODE
51975 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
51977 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
51978 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
51980 #undef TARGET_MEMBER_TYPE_FORCES_BLK
51981 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
51983 #undef TARGET_INSTANTIATE_DECLS
51984 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
51986 #undef TARGET_SECONDARY_RELOAD
51987 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
51989 #undef TARGET_CLASS_MAX_NREGS
51990 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
51992 #undef TARGET_PREFERRED_RELOAD_CLASS
51993 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
51994 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
51995 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
51996 #undef TARGET_CLASS_LIKELY_SPILLED_P
51997 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
51999 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
52000 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
52001 ix86_builtin_vectorization_cost
52002 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
52003 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
52004 ix86_vectorize_vec_perm_const_ok
52005 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
52006 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
52007 ix86_preferred_simd_mode
52008 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
52009 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
52010 ix86_autovectorize_vector_sizes
52011 #undef TARGET_VECTORIZE_GET_MASK_MODE
52012 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
52013 #undef TARGET_VECTORIZE_INIT_COST
52014 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
52015 #undef TARGET_VECTORIZE_ADD_STMT_COST
52016 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
52017 #undef TARGET_VECTORIZE_FINISH_COST
52018 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
52019 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
52020 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
52022 #undef TARGET_SET_CURRENT_FUNCTION
52023 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
52025 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
52026 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
52028 #undef TARGET_OPTION_SAVE
52029 #define TARGET_OPTION_SAVE ix86_function_specific_save
52031 #undef TARGET_OPTION_RESTORE
52032 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
52034 #undef TARGET_OPTION_POST_STREAM_IN
52035 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
52037 #undef TARGET_OPTION_PRINT
52038 #define TARGET_OPTION_PRINT ix86_function_specific_print
52040 #undef TARGET_OPTION_FUNCTION_VERSIONS
52041 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
52043 #undef TARGET_CAN_INLINE_P
52044 #define TARGET_CAN_INLINE_P ix86_can_inline_p
52046 #undef TARGET_LEGITIMATE_ADDRESS_P
52047 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
52049 #undef TARGET_REGISTER_PRIORITY
52050 #define TARGET_REGISTER_PRIORITY ix86_register_priority
52052 #undef TARGET_REGISTER_USAGE_LEVELING_P
52053 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
52055 #undef TARGET_LEGITIMATE_CONSTANT_P
52056 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
52058 #undef TARGET_FRAME_POINTER_REQUIRED
52059 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
52061 #undef TARGET_CAN_ELIMINATE
52062 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
52064 #undef TARGET_EXTRA_LIVE_ON_ENTRY
52065 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
52067 #undef TARGET_ASM_CODE_END
52068 #define TARGET_ASM_CODE_END ix86_code_end
52070 #undef TARGET_CONDITIONAL_REGISTER_USAGE
52071 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
52073 #undef TARGET_LOOP_UNROLL_ADJUST
52074 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
52076 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
52077 #undef TARGET_SPILL_CLASS
52078 #define TARGET_SPILL_CLASS ix86_spill_class
52080 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
52081 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
52082 ix86_simd_clone_compute_vecsize_and_simdlen
52084 #undef TARGET_SIMD_CLONE_ADJUST
52085 #define TARGET_SIMD_CLONE_ADJUST \
52086 ix86_simd_clone_adjust
52088 #undef TARGET_SIMD_CLONE_USABLE
52089 #define TARGET_SIMD_CLONE_USABLE \
52090 ix86_simd_clone_usable
52092 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
52093 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
52094 ix86_float_exceptions_rounding_supported_p
52096 #undef TARGET_MODE_EMIT
52097 #define TARGET_MODE_EMIT ix86_emit_mode_set
52099 #undef TARGET_MODE_NEEDED
52100 #define TARGET_MODE_NEEDED ix86_mode_needed
52102 #undef TARGET_MODE_AFTER
52103 #define TARGET_MODE_AFTER ix86_mode_after
52105 #undef TARGET_MODE_ENTRY
52106 #define TARGET_MODE_ENTRY ix86_mode_entry
52108 #undef TARGET_MODE_EXIT
52109 #define TARGET_MODE_EXIT ix86_mode_exit
52111 #undef TARGET_MODE_PRIORITY
52112 #define TARGET_MODE_PRIORITY ix86_mode_priority
52114 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
52115 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
52117 #undef TARGET_LOAD_BOUNDS_FOR_ARG
52118 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
52120 #undef TARGET_STORE_BOUNDS_FOR_ARG
52121 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
52123 #undef TARGET_LOAD_RETURNED_BOUNDS
52124 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
52126 #undef TARGET_STORE_RETURNED_BOUNDS
52127 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
52129 #undef TARGET_CHKP_BOUND_MODE
52130 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
52132 #undef TARGET_BUILTIN_CHKP_FUNCTION
52133 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
52135 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
52136 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
52138 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
52139 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
52141 #undef TARGET_CHKP_INITIALIZE_BOUNDS
52142 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
52144 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
52145 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
52147 #undef TARGET_OFFLOAD_OPTIONS
52148 #define TARGET_OFFLOAD_OPTIONS \
52149 ix86_offload_options
52151 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
52152 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
52154 #undef TARGET_OPTAB_SUPPORTED_P
52155 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
52157 #undef TARGET_HARD_REGNO_SCRATCH_OK
52158 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
52160 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
52161 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
52163 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
52164 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
52166 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
52167 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
52169 #undef TARGET_INIT_LIBFUNCS
52170 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
52172 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
52173 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
52175 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
52176 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
52177 #if CHECKING_P
52178 #undef TARGET_RUN_TARGET_SELFTESTS
52179 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
52180 #endif /* #if CHECKING_P */
52182 struct gcc_target targetm = TARGET_INITIALIZER;
52184 #include "gt-i386.h"