gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
87 /* This file should be included last. */
88 #include "target-def.h"
90 static rtx legitimize_dllimport_symbol (rtx, bool);
91 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
92 static rtx legitimize_pe_coff_symbol (rtx, bool);
93 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
95 #ifndef CHECK_STACK_LIMIT
96 #define CHECK_STACK_LIMIT (-1)
97 #endif
99 /* Return index of given mode in mult and division cost tables. */
100 #define MODE_INDEX(mode) \
101 ((mode) == QImode ? 0 \
102 : (mode) == HImode ? 1 \
103 : (mode) == SImode ? 2 \
104 : (mode) == DImode ? 3 \
105 : 4)
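/* Note: index 4 is the catch-all "other" slot; the mult_init and divide
   arrays in the cost tables below are laid out as {QI, HI, SI, DI, other}
   and are indexed with MODE_INDEX.  */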
107 /* Processor costs (relative to an add) */
108 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
109 #define COSTS_N_BYTES(N) ((N) * 2)
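/* Note: ix86_size_cost below is expressed with COSTS_N_BYTES, i.e. its
   entries are approximate instruction sizes in bytes rather than latencies,
   which is what makes the table suitable for tuning for size.  */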
111 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
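/* Note: each *_memcpy / *_memset table below has two stringop_algs entries;
   the first is used when generating 32-bit code and the second when
   generating 64-bit code.  Each entry names a fallback algorithm for unknown
   block sizes plus {max_size, algorithm, noalign} triples terminated by
   max_size == -1.  DUMMY_STRINGOP_ALGS fills the 64-bit slot for CPUs where
   64-bit tuning is irrelevant.  */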
113 static stringop_algs ix86_size_memcpy[2] = {
114 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
115 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
116 static stringop_algs ix86_size_memset[2] = {
117 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
118 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
120 const
121 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
122 COSTS_N_BYTES (2), /* cost of an add instruction */
123 COSTS_N_BYTES (3), /* cost of a lea instruction */
124 COSTS_N_BYTES (2), /* variable shift costs */
125 COSTS_N_BYTES (3), /* constant shift costs */
126 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
127 COSTS_N_BYTES (3), /* HI */
128 COSTS_N_BYTES (3), /* SI */
129 COSTS_N_BYTES (3), /* DI */
130 COSTS_N_BYTES (5)}, /* other */
131 0, /* cost of multiply per each bit set */
132 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
133 COSTS_N_BYTES (3), /* HI */
134 COSTS_N_BYTES (3), /* SI */
135 COSTS_N_BYTES (3), /* DI */
136 COSTS_N_BYTES (5)}, /* other */
137 COSTS_N_BYTES (3), /* cost of movsx */
138 COSTS_N_BYTES (3), /* cost of movzx */
139 0, /* "large" insn */
140 2, /* MOVE_RATIO */
141 2, /* cost for loading QImode using movzbl */
142 {2, 2, 2}, /* cost of loading integer registers
143 in QImode, HImode and SImode.
144 Relative to reg-reg move (2). */
145 {2, 2, 2}, /* cost of storing integer registers */
146 2, /* cost of reg,reg fld/fst */
147 {2, 2, 2}, /* cost of loading fp registers
148 in SFmode, DFmode and XFmode */
149 {2, 2, 2}, /* cost of storing fp registers
150 in SFmode, DFmode and XFmode */
151 3, /* cost of moving MMX register */
152 {3, 3}, /* cost of loading MMX registers
153 in SImode and DImode */
154 {3, 3}, /* cost of storing MMX registers
155 in SImode and DImode */
156 3, /* cost of moving SSE register */
157 {3, 3, 3}, /* cost of loading SSE registers
158 in SImode, DImode and TImode */
159 {3, 3, 3}, /* cost of storing SSE registers
160 in SImode, DImode and TImode */
161 3, /* MMX or SSE register to integer */
162 0, /* size of l1 cache */
163 0, /* size of l2 cache */
164 0, /* size of prefetch block */
165 0, /* number of parallel prefetches */
166 2, /* Branch cost */
167 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
168 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
169 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
170 COSTS_N_BYTES (2), /* cost of FABS instruction. */
171 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
172 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
173 ix86_size_memcpy,
174 ix86_size_memset,
175 1, /* scalar_stmt_cost. */
176 1, /* scalar load_cost. */
177 1, /* scalar_store_cost. */
178 1, /* vec_stmt_cost. */
179 1, /* vec_to_scalar_cost. */
180 1, /* scalar_to_vec_cost. */
181 1, /* vec_align_load_cost. */
182 1, /* vec_unalign_load_cost. */
183 1, /* vec_store_cost. */
184 1, /* cond_taken_branch_cost. */
 185 1, /* cond_not_taken_branch_cost. */
 186 };
188 /* Processor costs (relative to an add) */
189 static stringop_algs i386_memcpy[2] = {
190 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
191 DUMMY_STRINGOP_ALGS};
192 static stringop_algs i386_memset[2] = {
193 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
194 DUMMY_STRINGOP_ALGS};
196 static const
197 struct processor_costs i386_cost = { /* 386 specific costs */
198 COSTS_N_INSNS (1), /* cost of an add instruction */
199 COSTS_N_INSNS (1), /* cost of a lea instruction */
200 COSTS_N_INSNS (3), /* variable shift costs */
201 COSTS_N_INSNS (2), /* constant shift costs */
202 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
203 COSTS_N_INSNS (6), /* HI */
204 COSTS_N_INSNS (6), /* SI */
205 COSTS_N_INSNS (6), /* DI */
206 COSTS_N_INSNS (6)}, /* other */
207 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
208 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
209 COSTS_N_INSNS (23), /* HI */
210 COSTS_N_INSNS (23), /* SI */
211 COSTS_N_INSNS (23), /* DI */
212 COSTS_N_INSNS (23)}, /* other */
213 COSTS_N_INSNS (3), /* cost of movsx */
214 COSTS_N_INSNS (2), /* cost of movzx */
215 15, /* "large" insn */
216 3, /* MOVE_RATIO */
217 4, /* cost for loading QImode using movzbl */
218 {2, 4, 2}, /* cost of loading integer registers
219 in QImode, HImode and SImode.
220 Relative to reg-reg move (2). */
221 {2, 4, 2}, /* cost of storing integer registers */
222 2, /* cost of reg,reg fld/fst */
223 {8, 8, 8}, /* cost of loading fp registers
224 in SFmode, DFmode and XFmode */
225 {8, 8, 8}, /* cost of storing fp registers
226 in SFmode, DFmode and XFmode */
227 2, /* cost of moving MMX register */
228 {4, 8}, /* cost of loading MMX registers
229 in SImode and DImode */
230 {4, 8}, /* cost of storing MMX registers
231 in SImode and DImode */
232 2, /* cost of moving SSE register */
233 {4, 8, 16}, /* cost of loading SSE registers
234 in SImode, DImode and TImode */
235 {4, 8, 16}, /* cost of storing SSE registers
236 in SImode, DImode and TImode */
237 3, /* MMX or SSE register to integer */
238 0, /* size of l1 cache */
239 0, /* size of l2 cache */
240 0, /* size of prefetch block */
241 0, /* number of parallel prefetches */
242 1, /* Branch cost */
243 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
244 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
245 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
246 COSTS_N_INSNS (22), /* cost of FABS instruction. */
247 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
248 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
249 i386_memcpy,
250 i386_memset,
251 1, /* scalar_stmt_cost. */
252 1, /* scalar load_cost. */
253 1, /* scalar_store_cost. */
254 1, /* vec_stmt_cost. */
255 1, /* vec_to_scalar_cost. */
256 1, /* scalar_to_vec_cost. */
257 1, /* vec_align_load_cost. */
258 2, /* vec_unalign_load_cost. */
259 1, /* vec_store_cost. */
260 3, /* cond_taken_branch_cost. */
 261 1, /* cond_not_taken_branch_cost. */
 262 };
264 static stringop_algs i486_memcpy[2] = {
265 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
266 DUMMY_STRINGOP_ALGS};
267 static stringop_algs i486_memset[2] = {
268 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
269 DUMMY_STRINGOP_ALGS};
271 static const
272 struct processor_costs i486_cost = { /* 486 specific costs */
273 COSTS_N_INSNS (1), /* cost of an add instruction */
274 COSTS_N_INSNS (1), /* cost of a lea instruction */
275 COSTS_N_INSNS (3), /* variable shift costs */
276 COSTS_N_INSNS (2), /* constant shift costs */
277 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
278 COSTS_N_INSNS (12), /* HI */
279 COSTS_N_INSNS (12), /* SI */
280 COSTS_N_INSNS (12), /* DI */
281 COSTS_N_INSNS (12)}, /* other */
282 1, /* cost of multiply per each bit set */
283 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
284 COSTS_N_INSNS (40), /* HI */
285 COSTS_N_INSNS (40), /* SI */
286 COSTS_N_INSNS (40), /* DI */
287 COSTS_N_INSNS (40)}, /* other */
288 COSTS_N_INSNS (3), /* cost of movsx */
289 COSTS_N_INSNS (2), /* cost of movzx */
290 15, /* "large" insn */
291 3, /* MOVE_RATIO */
292 4, /* cost for loading QImode using movzbl */
293 {2, 4, 2}, /* cost of loading integer registers
294 in QImode, HImode and SImode.
295 Relative to reg-reg move (2). */
296 {2, 4, 2}, /* cost of storing integer registers */
297 2, /* cost of reg,reg fld/fst */
298 {8, 8, 8}, /* cost of loading fp registers
299 in SFmode, DFmode and XFmode */
300 {8, 8, 8}, /* cost of storing fp registers
301 in SFmode, DFmode and XFmode */
302 2, /* cost of moving MMX register */
303 {4, 8}, /* cost of loading MMX registers
304 in SImode and DImode */
305 {4, 8}, /* cost of storing MMX registers
306 in SImode and DImode */
307 2, /* cost of moving SSE register */
308 {4, 8, 16}, /* cost of loading SSE registers
309 in SImode, DImode and TImode */
310 {4, 8, 16}, /* cost of storing SSE registers
311 in SImode, DImode and TImode */
312 3, /* MMX or SSE register to integer */
313 4, /* size of l1 cache. 486 has 8kB cache
314 shared for code and data, so 4kB is
315 not really precise. */
316 4, /* size of l2 cache */
317 0, /* size of prefetch block */
318 0, /* number of parallel prefetches */
319 1, /* Branch cost */
320 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
321 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
322 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
323 COSTS_N_INSNS (3), /* cost of FABS instruction. */
324 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
325 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
326 i486_memcpy,
327 i486_memset,
328 1, /* scalar_stmt_cost. */
329 1, /* scalar load_cost. */
330 1, /* scalar_store_cost. */
331 1, /* vec_stmt_cost. */
332 1, /* vec_to_scalar_cost. */
333 1, /* scalar_to_vec_cost. */
334 1, /* vec_align_load_cost. */
335 2, /* vec_unalign_load_cost. */
336 1, /* vec_store_cost. */
337 3, /* cond_taken_branch_cost. */
 338 1, /* cond_not_taken_branch_cost. */
 339 };
341 static stringop_algs pentium_memcpy[2] = {
342 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
343 DUMMY_STRINGOP_ALGS};
344 static stringop_algs pentium_memset[2] = {
345 {libcall, {{-1, rep_prefix_4_byte, false}}},
346 DUMMY_STRINGOP_ALGS};
348 static const
349 struct processor_costs pentium_cost = {
350 COSTS_N_INSNS (1), /* cost of an add instruction */
351 COSTS_N_INSNS (1), /* cost of a lea instruction */
352 COSTS_N_INSNS (4), /* variable shift costs */
353 COSTS_N_INSNS (1), /* constant shift costs */
354 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
355 COSTS_N_INSNS (11), /* HI */
356 COSTS_N_INSNS (11), /* SI */
357 COSTS_N_INSNS (11), /* DI */
358 COSTS_N_INSNS (11)}, /* other */
359 0, /* cost of multiply per each bit set */
360 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
361 COSTS_N_INSNS (25), /* HI */
362 COSTS_N_INSNS (25), /* SI */
363 COSTS_N_INSNS (25), /* DI */
364 COSTS_N_INSNS (25)}, /* other */
365 COSTS_N_INSNS (3), /* cost of movsx */
366 COSTS_N_INSNS (2), /* cost of movzx */
367 8, /* "large" insn */
368 6, /* MOVE_RATIO */
369 6, /* cost for loading QImode using movzbl */
370 {2, 4, 2}, /* cost of loading integer registers
371 in QImode, HImode and SImode.
372 Relative to reg-reg move (2). */
373 {2, 4, 2}, /* cost of storing integer registers */
374 2, /* cost of reg,reg fld/fst */
375 {2, 2, 6}, /* cost of loading fp registers
376 in SFmode, DFmode and XFmode */
377 {4, 4, 6}, /* cost of storing fp registers
378 in SFmode, DFmode and XFmode */
379 8, /* cost of moving MMX register */
380 {8, 8}, /* cost of loading MMX registers
381 in SImode and DImode */
382 {8, 8}, /* cost of storing MMX registers
383 in SImode and DImode */
384 2, /* cost of moving SSE register */
385 {4, 8, 16}, /* cost of loading SSE registers
386 in SImode, DImode and TImode */
387 {4, 8, 16}, /* cost of storing SSE registers
388 in SImode, DImode and TImode */
389 3, /* MMX or SSE register to integer */
390 8, /* size of l1 cache. */
391 8, /* size of l2 cache */
392 0, /* size of prefetch block */
393 0, /* number of parallel prefetches */
394 2, /* Branch cost */
395 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
396 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
397 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
398 COSTS_N_INSNS (1), /* cost of FABS instruction. */
399 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
400 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
401 pentium_memcpy,
402 pentium_memset,
403 1, /* scalar_stmt_cost. */
404 1, /* scalar load_cost. */
405 1, /* scalar_store_cost. */
406 1, /* vec_stmt_cost. */
407 1, /* vec_to_scalar_cost. */
408 1, /* scalar_to_vec_cost. */
409 1, /* vec_align_load_cost. */
410 2, /* vec_unalign_load_cost. */
411 1, /* vec_store_cost. */
412 3, /* cond_taken_branch_cost. */
 413 1, /* cond_not_taken_branch_cost. */
 414 };
416 static const
417 struct processor_costs lakemont_cost = {
418 COSTS_N_INSNS (1), /* cost of an add instruction */
419 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
420 COSTS_N_INSNS (1), /* variable shift costs */
421 COSTS_N_INSNS (1), /* constant shift costs */
422 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
423 COSTS_N_INSNS (11), /* HI */
424 COSTS_N_INSNS (11), /* SI */
425 COSTS_N_INSNS (11), /* DI */
426 COSTS_N_INSNS (11)}, /* other */
427 0, /* cost of multiply per each bit set */
428 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
429 COSTS_N_INSNS (25), /* HI */
430 COSTS_N_INSNS (25), /* SI */
431 COSTS_N_INSNS (25), /* DI */
432 COSTS_N_INSNS (25)}, /* other */
433 COSTS_N_INSNS (3), /* cost of movsx */
434 COSTS_N_INSNS (2), /* cost of movzx */
435 8, /* "large" insn */
436 17, /* MOVE_RATIO */
437 6, /* cost for loading QImode using movzbl */
438 {2, 4, 2}, /* cost of loading integer registers
439 in QImode, HImode and SImode.
440 Relative to reg-reg move (2). */
441 {2, 4, 2}, /* cost of storing integer registers */
442 2, /* cost of reg,reg fld/fst */
443 {2, 2, 6}, /* cost of loading fp registers
444 in SFmode, DFmode and XFmode */
445 {4, 4, 6}, /* cost of storing fp registers
446 in SFmode, DFmode and XFmode */
447 8, /* cost of moving MMX register */
448 {8, 8}, /* cost of loading MMX registers
449 in SImode and DImode */
450 {8, 8}, /* cost of storing MMX registers
451 in SImode and DImode */
452 2, /* cost of moving SSE register */
453 {4, 8, 16}, /* cost of loading SSE registers
454 in SImode, DImode and TImode */
455 {4, 8, 16}, /* cost of storing SSE registers
456 in SImode, DImode and TImode */
457 3, /* MMX or SSE register to integer */
458 8, /* size of l1 cache. */
459 8, /* size of l2 cache */
460 0, /* size of prefetch block */
461 0, /* number of parallel prefetches */
462 2, /* Branch cost */
463 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
464 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
465 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
466 COSTS_N_INSNS (1), /* cost of FABS instruction. */
467 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
468 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
469 pentium_memcpy,
470 pentium_memset,
471 1, /* scalar_stmt_cost. */
472 1, /* scalar load_cost. */
473 1, /* scalar_store_cost. */
474 1, /* vec_stmt_cost. */
475 1, /* vec_to_scalar_cost. */
476 1, /* scalar_to_vec_cost. */
477 1, /* vec_align_load_cost. */
478 2, /* vec_unalign_load_cost. */
479 1, /* vec_store_cost. */
480 3, /* cond_taken_branch_cost. */
 481 1, /* cond_not_taken_branch_cost. */
 482 };
 484 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 485 (we ensure the alignment). For small blocks an inline loop is still a
 486 noticeable win; for bigger blocks either rep movsl or rep movsb is the
 487 way to go. Rep movsb apparently has a more expensive startup time in the CPU,
 488 but after 4K the difference is down in the noise. */
489 static stringop_algs pentiumpro_memcpy[2] = {
490 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
491 {8192, rep_prefix_4_byte, false},
492 {-1, rep_prefix_1_byte, false}}},
493 DUMMY_STRINGOP_ALGS};
494 static stringop_algs pentiumpro_memset[2] = {
495 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
496 {8192, rep_prefix_4_byte, false},
497 {-1, libcall, false}}},
498 DUMMY_STRINGOP_ALGS};
499 static const
500 struct processor_costs pentiumpro_cost = {
501 COSTS_N_INSNS (1), /* cost of an add instruction */
502 COSTS_N_INSNS (1), /* cost of a lea instruction */
503 COSTS_N_INSNS (1), /* variable shift costs */
504 COSTS_N_INSNS (1), /* constant shift costs */
505 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
506 COSTS_N_INSNS (4), /* HI */
507 COSTS_N_INSNS (4), /* SI */
508 COSTS_N_INSNS (4), /* DI */
509 COSTS_N_INSNS (4)}, /* other */
510 0, /* cost of multiply per each bit set */
511 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
512 COSTS_N_INSNS (17), /* HI */
513 COSTS_N_INSNS (17), /* SI */
514 COSTS_N_INSNS (17), /* DI */
515 COSTS_N_INSNS (17)}, /* other */
516 COSTS_N_INSNS (1), /* cost of movsx */
517 COSTS_N_INSNS (1), /* cost of movzx */
518 8, /* "large" insn */
519 6, /* MOVE_RATIO */
520 2, /* cost for loading QImode using movzbl */
521 {4, 4, 4}, /* cost of loading integer registers
522 in QImode, HImode and SImode.
523 Relative to reg-reg move (2). */
524 {2, 2, 2}, /* cost of storing integer registers */
525 2, /* cost of reg,reg fld/fst */
526 {2, 2, 6}, /* cost of loading fp registers
527 in SFmode, DFmode and XFmode */
528 {4, 4, 6}, /* cost of storing fp registers
529 in SFmode, DFmode and XFmode */
530 2, /* cost of moving MMX register */
531 {2, 2}, /* cost of loading MMX registers
532 in SImode and DImode */
533 {2, 2}, /* cost of storing MMX registers
534 in SImode and DImode */
535 2, /* cost of moving SSE register */
536 {2, 2, 8}, /* cost of loading SSE registers
537 in SImode, DImode and TImode */
538 {2, 2, 8}, /* cost of storing SSE registers
539 in SImode, DImode and TImode */
540 3, /* MMX or SSE register to integer */
541 8, /* size of l1 cache. */
542 256, /* size of l2 cache */
543 32, /* size of prefetch block */
544 6, /* number of parallel prefetches */
545 2, /* Branch cost */
546 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
547 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
548 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
549 COSTS_N_INSNS (2), /* cost of FABS instruction. */
550 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
551 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
552 pentiumpro_memcpy,
553 pentiumpro_memset,
554 1, /* scalar_stmt_cost. */
555 1, /* scalar load_cost. */
556 1, /* scalar_store_cost. */
557 1, /* vec_stmt_cost. */
558 1, /* vec_to_scalar_cost. */
559 1, /* scalar_to_vec_cost. */
560 1, /* vec_align_load_cost. */
561 2, /* vec_unalign_load_cost. */
562 1, /* vec_store_cost. */
563 3, /* cond_taken_branch_cost. */
 564 1, /* cond_not_taken_branch_cost. */
 565 };
567 static stringop_algs geode_memcpy[2] = {
568 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
569 DUMMY_STRINGOP_ALGS};
570 static stringop_algs geode_memset[2] = {
571 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
572 DUMMY_STRINGOP_ALGS};
573 static const
574 struct processor_costs geode_cost = {
575 COSTS_N_INSNS (1), /* cost of an add instruction */
576 COSTS_N_INSNS (1), /* cost of a lea instruction */
577 COSTS_N_INSNS (2), /* variable shift costs */
578 COSTS_N_INSNS (1), /* constant shift costs */
579 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
580 COSTS_N_INSNS (4), /* HI */
581 COSTS_N_INSNS (7), /* SI */
582 COSTS_N_INSNS (7), /* DI */
583 COSTS_N_INSNS (7)}, /* other */
584 0, /* cost of multiply per each bit set */
585 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
586 COSTS_N_INSNS (23), /* HI */
587 COSTS_N_INSNS (39), /* SI */
588 COSTS_N_INSNS (39), /* DI */
589 COSTS_N_INSNS (39)}, /* other */
590 COSTS_N_INSNS (1), /* cost of movsx */
591 COSTS_N_INSNS (1), /* cost of movzx */
592 8, /* "large" insn */
593 4, /* MOVE_RATIO */
594 1, /* cost for loading QImode using movzbl */
595 {1, 1, 1}, /* cost of loading integer registers
596 in QImode, HImode and SImode.
597 Relative to reg-reg move (2). */
598 {1, 1, 1}, /* cost of storing integer registers */
599 1, /* cost of reg,reg fld/fst */
600 {1, 1, 1}, /* cost of loading fp registers
601 in SFmode, DFmode and XFmode */
602 {4, 6, 6}, /* cost of storing fp registers
603 in SFmode, DFmode and XFmode */
605 2, /* cost of moving MMX register */
606 {2, 2}, /* cost of loading MMX registers
607 in SImode and DImode */
608 {2, 2}, /* cost of storing MMX registers
609 in SImode and DImode */
610 2, /* cost of moving SSE register */
611 {2, 2, 8}, /* cost of loading SSE registers
612 in SImode, DImode and TImode */
613 {2, 2, 8}, /* cost of storing SSE registers
614 in SImode, DImode and TImode */
615 3, /* MMX or SSE register to integer */
616 64, /* size of l1 cache. */
617 128, /* size of l2 cache. */
618 32, /* size of prefetch block */
619 1, /* number of parallel prefetches */
620 1, /* Branch cost */
621 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
622 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
623 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
624 COSTS_N_INSNS (1), /* cost of FABS instruction. */
625 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
626 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
627 geode_memcpy,
628 geode_memset,
629 1, /* scalar_stmt_cost. */
630 1, /* scalar load_cost. */
631 1, /* scalar_store_cost. */
632 1, /* vec_stmt_cost. */
633 1, /* vec_to_scalar_cost. */
634 1, /* scalar_to_vec_cost. */
635 1, /* vec_align_load_cost. */
636 2, /* vec_unalign_load_cost. */
637 1, /* vec_store_cost. */
638 3, /* cond_taken_branch_cost. */
 639 1, /* cond_not_taken_branch_cost. */
 640 };
642 static stringop_algs k6_memcpy[2] = {
643 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
644 DUMMY_STRINGOP_ALGS};
645 static stringop_algs k6_memset[2] = {
646 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
647 DUMMY_STRINGOP_ALGS};
648 static const
649 struct processor_costs k6_cost = {
650 COSTS_N_INSNS (1), /* cost of an add instruction */
651 COSTS_N_INSNS (2), /* cost of a lea instruction */
652 COSTS_N_INSNS (1), /* variable shift costs */
653 COSTS_N_INSNS (1), /* constant shift costs */
654 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
655 COSTS_N_INSNS (3), /* HI */
656 COSTS_N_INSNS (3), /* SI */
657 COSTS_N_INSNS (3), /* DI */
658 COSTS_N_INSNS (3)}, /* other */
659 0, /* cost of multiply per each bit set */
660 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
661 COSTS_N_INSNS (18), /* HI */
662 COSTS_N_INSNS (18), /* SI */
663 COSTS_N_INSNS (18), /* DI */
664 COSTS_N_INSNS (18)}, /* other */
665 COSTS_N_INSNS (2), /* cost of movsx */
666 COSTS_N_INSNS (2), /* cost of movzx */
667 8, /* "large" insn */
668 4, /* MOVE_RATIO */
669 3, /* cost for loading QImode using movzbl */
670 {4, 5, 4}, /* cost of loading integer registers
671 in QImode, HImode and SImode.
672 Relative to reg-reg move (2). */
673 {2, 3, 2}, /* cost of storing integer registers */
674 4, /* cost of reg,reg fld/fst */
675 {6, 6, 6}, /* cost of loading fp registers
676 in SFmode, DFmode and XFmode */
677 {4, 4, 4}, /* cost of storing fp registers
678 in SFmode, DFmode and XFmode */
679 2, /* cost of moving MMX register */
680 {2, 2}, /* cost of loading MMX registers
681 in SImode and DImode */
682 {2, 2}, /* cost of storing MMX registers
683 in SImode and DImode */
684 2, /* cost of moving SSE register */
685 {2, 2, 8}, /* cost of loading SSE registers
686 in SImode, DImode and TImode */
687 {2, 2, 8}, /* cost of storing SSE registers
688 in SImode, DImode and TImode */
689 6, /* MMX or SSE register to integer */
690 32, /* size of l1 cache. */
691 32, /* size of l2 cache. Some models
692 have integrated l2 cache, but
693 optimizing for k6 is not important
694 enough to worry about that. */
695 32, /* size of prefetch block */
696 1, /* number of parallel prefetches */
697 1, /* Branch cost */
698 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
699 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
700 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
701 COSTS_N_INSNS (2), /* cost of FABS instruction. */
702 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
703 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
704 k6_memcpy,
705 k6_memset,
706 1, /* scalar_stmt_cost. */
707 1, /* scalar load_cost. */
708 1, /* scalar_store_cost. */
709 1, /* vec_stmt_cost. */
710 1, /* vec_to_scalar_cost. */
711 1, /* scalar_to_vec_cost. */
712 1, /* vec_align_load_cost. */
713 2, /* vec_unalign_load_cost. */
714 1, /* vec_store_cost. */
715 3, /* cond_taken_branch_cost. */
 716 1, /* cond_not_taken_branch_cost. */
 717 };
 719 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
 720 than K8 does. Alignment becomes important after 8 bytes for memcpy and
 721 after 128 bytes for memset. */
722 static stringop_algs athlon_memcpy[2] = {
723 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
724 DUMMY_STRINGOP_ALGS};
725 static stringop_algs athlon_memset[2] = {
726 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
727 DUMMY_STRINGOP_ALGS};
728 static const
729 struct processor_costs athlon_cost = {
730 COSTS_N_INSNS (1), /* cost of an add instruction */
731 COSTS_N_INSNS (2), /* cost of a lea instruction */
732 COSTS_N_INSNS (1), /* variable shift costs */
733 COSTS_N_INSNS (1), /* constant shift costs */
734 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
735 COSTS_N_INSNS (5), /* HI */
736 COSTS_N_INSNS (5), /* SI */
737 COSTS_N_INSNS (5), /* DI */
738 COSTS_N_INSNS (5)}, /* other */
739 0, /* cost of multiply per each bit set */
740 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
741 COSTS_N_INSNS (26), /* HI */
742 COSTS_N_INSNS (42), /* SI */
743 COSTS_N_INSNS (74), /* DI */
744 COSTS_N_INSNS (74)}, /* other */
745 COSTS_N_INSNS (1), /* cost of movsx */
746 COSTS_N_INSNS (1), /* cost of movzx */
747 8, /* "large" insn */
748 9, /* MOVE_RATIO */
749 4, /* cost for loading QImode using movzbl */
750 {3, 4, 3}, /* cost of loading integer registers
751 in QImode, HImode and SImode.
752 Relative to reg-reg move (2). */
753 {3, 4, 3}, /* cost of storing integer registers */
754 4, /* cost of reg,reg fld/fst */
755 {4, 4, 12}, /* cost of loading fp registers
756 in SFmode, DFmode and XFmode */
757 {6, 6, 8}, /* cost of storing fp registers
758 in SFmode, DFmode and XFmode */
759 2, /* cost of moving MMX register */
760 {4, 4}, /* cost of loading MMX registers
761 in SImode and DImode */
762 {4, 4}, /* cost of storing MMX registers
763 in SImode and DImode */
764 2, /* cost of moving SSE register */
765 {4, 4, 6}, /* cost of loading SSE registers
766 in SImode, DImode and TImode */
767 {4, 4, 5}, /* cost of storing SSE registers
768 in SImode, DImode and TImode */
769 5, /* MMX or SSE register to integer */
770 64, /* size of l1 cache. */
771 256, /* size of l2 cache. */
772 64, /* size of prefetch block */
773 6, /* number of parallel prefetches */
774 5, /* Branch cost */
775 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
776 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
777 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
778 COSTS_N_INSNS (2), /* cost of FABS instruction. */
779 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
780 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
781 athlon_memcpy,
782 athlon_memset,
783 1, /* scalar_stmt_cost. */
784 1, /* scalar load_cost. */
785 1, /* scalar_store_cost. */
786 1, /* vec_stmt_cost. */
787 1, /* vec_to_scalar_cost. */
788 1, /* scalar_to_vec_cost. */
789 1, /* vec_align_load_cost. */
790 2, /* vec_unalign_load_cost. */
791 1, /* vec_store_cost. */
792 3, /* cond_taken_branch_cost. */
 793 1, /* cond_not_taken_branch_cost. */
 794 };
 796 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
 797 small blocks it is better to use a loop. For large blocks, a libcall can
 798 do non-temporal accesses and beat inlining considerably. */
799 static stringop_algs k8_memcpy[2] = {
800 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
801 {-1, rep_prefix_4_byte, false}}},
802 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
803 {-1, libcall, false}}}};
804 static stringop_algs k8_memset[2] = {
805 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
806 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
807 {libcall, {{48, unrolled_loop, false},
808 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
809 static const
810 struct processor_costs k8_cost = {
811 COSTS_N_INSNS (1), /* cost of an add instruction */
812 COSTS_N_INSNS (2), /* cost of a lea instruction */
813 COSTS_N_INSNS (1), /* variable shift costs */
814 COSTS_N_INSNS (1), /* constant shift costs */
815 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
816 COSTS_N_INSNS (4), /* HI */
817 COSTS_N_INSNS (3), /* SI */
818 COSTS_N_INSNS (4), /* DI */
819 COSTS_N_INSNS (5)}, /* other */
820 0, /* cost of multiply per each bit set */
821 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
822 COSTS_N_INSNS (26), /* HI */
823 COSTS_N_INSNS (42), /* SI */
824 COSTS_N_INSNS (74), /* DI */
825 COSTS_N_INSNS (74)}, /* other */
826 COSTS_N_INSNS (1), /* cost of movsx */
827 COSTS_N_INSNS (1), /* cost of movzx */
828 8, /* "large" insn */
829 9, /* MOVE_RATIO */
830 4, /* cost for loading QImode using movzbl */
831 {3, 4, 3}, /* cost of loading integer registers
832 in QImode, HImode and SImode.
833 Relative to reg-reg move (2). */
834 {3, 4, 3}, /* cost of storing integer registers */
835 4, /* cost of reg,reg fld/fst */
836 {4, 4, 12}, /* cost of loading fp registers
837 in SFmode, DFmode and XFmode */
838 {6, 6, 8}, /* cost of storing fp registers
839 in SFmode, DFmode and XFmode */
840 2, /* cost of moving MMX register */
841 {3, 3}, /* cost of loading MMX registers
842 in SImode and DImode */
843 {4, 4}, /* cost of storing MMX registers
844 in SImode and DImode */
845 2, /* cost of moving SSE register */
846 {4, 3, 6}, /* cost of loading SSE registers
847 in SImode, DImode and TImode */
848 {4, 4, 5}, /* cost of storing SSE registers
849 in SImode, DImode and TImode */
850 5, /* MMX or SSE register to integer */
851 64, /* size of l1 cache. */
852 512, /* size of l2 cache. */
853 64, /* size of prefetch block */
854 /* New AMD processors never drop prefetches; if they cannot be performed
855 immediately, they are queued. We set number of simultaneous prefetches
856 to a large constant to reflect this (it probably is not a good idea not
857 to limit number of prefetches at all, as their execution also takes some
858 time). */
859 100, /* number of parallel prefetches */
860 3, /* Branch cost */
861 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
862 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
863 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
864 COSTS_N_INSNS (2), /* cost of FABS instruction. */
865 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
866 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
868 k8_memcpy,
869 k8_memset,
870 4, /* scalar_stmt_cost. */
871 2, /* scalar load_cost. */
872 2, /* scalar_store_cost. */
873 5, /* vec_stmt_cost. */
874 0, /* vec_to_scalar_cost. */
875 2, /* scalar_to_vec_cost. */
876 2, /* vec_align_load_cost. */
877 3, /* vec_unalign_load_cost. */
878 3, /* vec_store_cost. */
879 3, /* cond_taken_branch_cost. */
 880 2, /* cond_not_taken_branch_cost. */
 881 };
 883 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
 884 very small blocks it is better to use a loop. For large blocks, a libcall can
 885 do non-temporal accesses and beat inlining considerably. */
886 static stringop_algs amdfam10_memcpy[2] = {
887 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
888 {-1, rep_prefix_4_byte, false}}},
889 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
890 {-1, libcall, false}}}};
891 static stringop_algs amdfam10_memset[2] = {
892 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
893 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
894 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
895 {-1, libcall, false}}}};
896 struct processor_costs amdfam10_cost = {
897 COSTS_N_INSNS (1), /* cost of an add instruction */
898 COSTS_N_INSNS (2), /* cost of a lea instruction */
899 COSTS_N_INSNS (1), /* variable shift costs */
900 COSTS_N_INSNS (1), /* constant shift costs */
901 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
902 COSTS_N_INSNS (4), /* HI */
903 COSTS_N_INSNS (3), /* SI */
904 COSTS_N_INSNS (4), /* DI */
905 COSTS_N_INSNS (5)}, /* other */
906 0, /* cost of multiply per each bit set */
907 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
908 COSTS_N_INSNS (35), /* HI */
909 COSTS_N_INSNS (51), /* SI */
910 COSTS_N_INSNS (83), /* DI */
911 COSTS_N_INSNS (83)}, /* other */
912 COSTS_N_INSNS (1), /* cost of movsx */
913 COSTS_N_INSNS (1), /* cost of movzx */
914 8, /* "large" insn */
915 9, /* MOVE_RATIO */
916 4, /* cost for loading QImode using movzbl */
917 {3, 4, 3}, /* cost of loading integer registers
918 in QImode, HImode and SImode.
919 Relative to reg-reg move (2). */
920 {3, 4, 3}, /* cost of storing integer registers */
921 4, /* cost of reg,reg fld/fst */
922 {4, 4, 12}, /* cost of loading fp registers
923 in SFmode, DFmode and XFmode */
924 {6, 6, 8}, /* cost of storing fp registers
925 in SFmode, DFmode and XFmode */
926 2, /* cost of moving MMX register */
927 {3, 3}, /* cost of loading MMX registers
928 in SImode and DImode */
929 {4, 4}, /* cost of storing MMX registers
930 in SImode and DImode */
931 2, /* cost of moving SSE register */
932 {4, 4, 3}, /* cost of loading SSE registers
933 in SImode, DImode and TImode */
934 {4, 4, 5}, /* cost of storing SSE registers
935 in SImode, DImode and TImode */
936 3, /* MMX or SSE register to integer */
937 /* On K8:
938 MOVD reg64, xmmreg Double FSTORE 4
939 MOVD reg32, xmmreg Double FSTORE 4
940 On AMDFAM10:
941 MOVD reg64, xmmreg Double FADD 3
942 1/1 1/1
943 MOVD reg32, xmmreg Double FADD 3
944 1/1 1/1 */
945 64, /* size of l1 cache. */
946 512, /* size of l2 cache. */
947 64, /* size of prefetch block */
948 /* New AMD processors never drop prefetches; if they cannot be performed
949 immediately, they are queued. We set number of simultaneous prefetches
950 to a large constant to reflect this (it probably is not a good idea not
951 to limit number of prefetches at all, as their execution also takes some
952 time). */
953 100, /* number of parallel prefetches */
954 2, /* Branch cost */
955 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
956 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
957 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
958 COSTS_N_INSNS (2), /* cost of FABS instruction. */
959 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
960 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
962 amdfam10_memcpy,
963 amdfam10_memset,
964 4, /* scalar_stmt_cost. */
965 2, /* scalar load_cost. */
966 2, /* scalar_store_cost. */
967 6, /* vec_stmt_cost. */
968 0, /* vec_to_scalar_cost. */
969 2, /* scalar_to_vec_cost. */
970 2, /* vec_align_load_cost. */
971 2, /* vec_unalign_load_cost. */
972 2, /* vec_store_cost. */
973 2, /* cond_taken_branch_cost. */
 974 1, /* cond_not_taken_branch_cost. */
 975 };
 977 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
 978 very small blocks it is better to use a loop. For large blocks, a libcall
 979 can do non-temporal accesses and beat inlining considerably. */
980 static stringop_algs bdver1_memcpy[2] = {
981 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
982 {-1, rep_prefix_4_byte, false}}},
983 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
984 {-1, libcall, false}}}};
985 static stringop_algs bdver1_memset[2] = {
986 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
987 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
988 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
989 {-1, libcall, false}}}};
991 const struct processor_costs bdver1_cost = {
992 COSTS_N_INSNS (1), /* cost of an add instruction */
993 COSTS_N_INSNS (1), /* cost of a lea instruction */
994 COSTS_N_INSNS (1), /* variable shift costs */
995 COSTS_N_INSNS (1), /* constant shift costs */
996 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
997 COSTS_N_INSNS (4), /* HI */
998 COSTS_N_INSNS (4), /* SI */
999 COSTS_N_INSNS (6), /* DI */
1000 COSTS_N_INSNS (6)}, /* other */
1001 0, /* cost of multiply per each bit set */
1002 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1003 COSTS_N_INSNS (35), /* HI */
1004 COSTS_N_INSNS (51), /* SI */
1005 COSTS_N_INSNS (83), /* DI */
1006 COSTS_N_INSNS (83)}, /* other */
1007 COSTS_N_INSNS (1), /* cost of movsx */
1008 COSTS_N_INSNS (1), /* cost of movzx */
1009 8, /* "large" insn */
1010 9, /* MOVE_RATIO */
1011 4, /* cost for loading QImode using movzbl */
1012 {5, 5, 4}, /* cost of loading integer registers
1013 in QImode, HImode and SImode.
1014 Relative to reg-reg move (2). */
1015 {4, 4, 4}, /* cost of storing integer registers */
1016 2, /* cost of reg,reg fld/fst */
1017 {5, 5, 12}, /* cost of loading fp registers
1018 in SFmode, DFmode and XFmode */
1019 {4, 4, 8}, /* cost of storing fp registers
1020 in SFmode, DFmode and XFmode */
1021 2, /* cost of moving MMX register */
1022 {4, 4}, /* cost of loading MMX registers
1023 in SImode and DImode */
1024 {4, 4}, /* cost of storing MMX registers
1025 in SImode and DImode */
1026 2, /* cost of moving SSE register */
1027 {4, 4, 4}, /* cost of loading SSE registers
1028 in SImode, DImode and TImode */
1029 {4, 4, 4}, /* cost of storing SSE registers
1030 in SImode, DImode and TImode */
1031 2, /* MMX or SSE register to integer */
1032 /* On K8:
1033 MOVD reg64, xmmreg Double FSTORE 4
1034 MOVD reg32, xmmreg Double FSTORE 4
1035 On AMDFAM10:
1036 MOVD reg64, xmmreg Double FADD 3
1037 1/1 1/1
1038 MOVD reg32, xmmreg Double FADD 3
1039 1/1 1/1 */
1040 16, /* size of l1 cache. */
1041 2048, /* size of l2 cache. */
1042 64, /* size of prefetch block */
1043 /* New AMD processors never drop prefetches; if they cannot be performed
1044 immediately, they are queued. We set number of simultaneous prefetches
1045 to a large constant to reflect this (it probably is not a good idea not
1046 to limit number of prefetches at all, as their execution also takes some
1047 time). */
1048 100, /* number of parallel prefetches */
1049 2, /* Branch cost */
1050 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1051 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1052 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1053 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1054 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1055 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1057 bdver1_memcpy,
1058 bdver1_memset,
1059 6, /* scalar_stmt_cost. */
1060 4, /* scalar load_cost. */
1061 4, /* scalar_store_cost. */
1062 6, /* vec_stmt_cost. */
1063 0, /* vec_to_scalar_cost. */
1064 2, /* scalar_to_vec_cost. */
1065 4, /* vec_align_load_cost. */
1066 4, /* vec_unalign_load_cost. */
1067 4, /* vec_store_cost. */
1068 4, /* cond_taken_branch_cost. */
 1069 2, /* cond_not_taken_branch_cost. */
 1070 };
 1072 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
 1073 very small blocks it is better to use a loop. For large blocks, a libcall
 1074 can do non-temporal accesses and beat inlining considerably. */
1076 static stringop_algs bdver2_memcpy[2] = {
1077 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1078 {-1, rep_prefix_4_byte, false}}},
1079 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1080 {-1, libcall, false}}}};
1081 static stringop_algs bdver2_memset[2] = {
1082 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1083 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1084 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}};
1087 const struct processor_costs bdver2_cost = {
1088 COSTS_N_INSNS (1), /* cost of an add instruction */
1089 COSTS_N_INSNS (1), /* cost of a lea instruction */
1090 COSTS_N_INSNS (1), /* variable shift costs */
1091 COSTS_N_INSNS (1), /* constant shift costs */
1092 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1093 COSTS_N_INSNS (4), /* HI */
1094 COSTS_N_INSNS (4), /* SI */
1095 COSTS_N_INSNS (6), /* DI */
1096 COSTS_N_INSNS (6)}, /* other */
1097 0, /* cost of multiply per each bit set */
1098 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1099 COSTS_N_INSNS (35), /* HI */
1100 COSTS_N_INSNS (51), /* SI */
1101 COSTS_N_INSNS (83), /* DI */
1102 COSTS_N_INSNS (83)}, /* other */
1103 COSTS_N_INSNS (1), /* cost of movsx */
1104 COSTS_N_INSNS (1), /* cost of movzx */
1105 8, /* "large" insn */
1106 9, /* MOVE_RATIO */
1107 4, /* cost for loading QImode using movzbl */
1108 {5, 5, 4}, /* cost of loading integer registers
1109 in QImode, HImode and SImode.
1110 Relative to reg-reg move (2). */
1111 {4, 4, 4}, /* cost of storing integer registers */
1112 2, /* cost of reg,reg fld/fst */
1113 {5, 5, 12}, /* cost of loading fp registers
1114 in SFmode, DFmode and XFmode */
1115 {4, 4, 8}, /* cost of storing fp registers
1116 in SFmode, DFmode and XFmode */
1117 2, /* cost of moving MMX register */
1118 {4, 4}, /* cost of loading MMX registers
1119 in SImode and DImode */
1120 {4, 4}, /* cost of storing MMX registers
1121 in SImode and DImode */
1122 2, /* cost of moving SSE register */
1123 {4, 4, 4}, /* cost of loading SSE registers
1124 in SImode, DImode and TImode */
1125 {4, 4, 4}, /* cost of storing SSE registers
1126 in SImode, DImode and TImode */
1127 2, /* MMX or SSE register to integer */
1128 /* On K8:
1129 MOVD reg64, xmmreg Double FSTORE 4
1130 MOVD reg32, xmmreg Double FSTORE 4
1131 On AMDFAM10:
1132 MOVD reg64, xmmreg Double FADD 3
1133 1/1 1/1
1134 MOVD reg32, xmmreg Double FADD 3
1135 1/1 1/1 */
1136 16, /* size of l1 cache. */
1137 2048, /* size of l2 cache. */
1138 64, /* size of prefetch block */
1139 /* New AMD processors never drop prefetches; if they cannot be performed
1140 immediately, they are queued. We set number of simultaneous prefetches
1141 to a large constant to reflect this (it probably is not a good idea not
1142 to limit number of prefetches at all, as their execution also takes some
1143 time). */
1144 100, /* number of parallel prefetches */
1145 2, /* Branch cost */
1146 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1147 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1148 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1149 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1150 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1151 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1153 bdver2_memcpy,
1154 bdver2_memset,
1155 6, /* scalar_stmt_cost. */
1156 4, /* scalar load_cost. */
1157 4, /* scalar_store_cost. */
1158 6, /* vec_stmt_cost. */
1159 0, /* vec_to_scalar_cost. */
1160 2, /* scalar_to_vec_cost. */
1161 4, /* vec_align_load_cost. */
1162 4, /* vec_unalign_load_cost. */
1163 4, /* vec_store_cost. */
1164 4, /* cond_taken_branch_cost. */
 1165 2, /* cond_not_taken_branch_cost. */
 1166 };
 1169 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
 1170 very small blocks it is better to use a loop. For large blocks, a libcall
 1171 can do non-temporal accesses and beat inlining considerably. */
1172 static stringop_algs bdver3_memcpy[2] = {
1173 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1174 {-1, rep_prefix_4_byte, false}}},
1175 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1176 {-1, libcall, false}}}};
1177 static stringop_algs bdver3_memset[2] = {
1178 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1179 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1180 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1181 {-1, libcall, false}}}};
1182 struct processor_costs bdver3_cost = {
1183 COSTS_N_INSNS (1), /* cost of an add instruction */
1184 COSTS_N_INSNS (1), /* cost of a lea instruction */
1185 COSTS_N_INSNS (1), /* variable shift costs */
1186 COSTS_N_INSNS (1), /* constant shift costs */
1187 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1188 COSTS_N_INSNS (4), /* HI */
1189 COSTS_N_INSNS (4), /* SI */
1190 COSTS_N_INSNS (6), /* DI */
1191 COSTS_N_INSNS (6)}, /* other */
1192 0, /* cost of multiply per each bit set */
1193 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1194 COSTS_N_INSNS (35), /* HI */
1195 COSTS_N_INSNS (51), /* SI */
1196 COSTS_N_INSNS (83), /* DI */
1197 COSTS_N_INSNS (83)}, /* other */
1198 COSTS_N_INSNS (1), /* cost of movsx */
1199 COSTS_N_INSNS (1), /* cost of movzx */
1200 8, /* "large" insn */
1201 9, /* MOVE_RATIO */
1202 4, /* cost for loading QImode using movzbl */
1203 {5, 5, 4}, /* cost of loading integer registers
1204 in QImode, HImode and SImode.
1205 Relative to reg-reg move (2). */
1206 {4, 4, 4}, /* cost of storing integer registers */
1207 2, /* cost of reg,reg fld/fst */
1208 {5, 5, 12}, /* cost of loading fp registers
1209 in SFmode, DFmode and XFmode */
1210 {4, 4, 8}, /* cost of storing fp registers
1211 in SFmode, DFmode and XFmode */
1212 2, /* cost of moving MMX register */
1213 {4, 4}, /* cost of loading MMX registers
1214 in SImode and DImode */
1215 {4, 4}, /* cost of storing MMX registers
1216 in SImode and DImode */
1217 2, /* cost of moving SSE register */
1218 {4, 4, 4}, /* cost of loading SSE registers
1219 in SImode, DImode and TImode */
1220 {4, 4, 4}, /* cost of storing SSE registers
1221 in SImode, DImode and TImode */
1222 2, /* MMX or SSE register to integer */
1223 16, /* size of l1 cache. */
1224 2048, /* size of l2 cache. */
1225 64, /* size of prefetch block */
1226 /* New AMD processors never drop prefetches; if they cannot be performed
1227 immediately, they are queued. We set number of simultaneous prefetches
1228 to a large constant to reflect this (it probably is not a good idea not
1229 to limit number of prefetches at all, as their execution also takes some
1230 time). */
1231 100, /* number of parallel prefetches */
1232 2, /* Branch cost */
1233 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1234 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1235 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1236 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1237 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1238 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1240 bdver3_memcpy,
1241 bdver3_memset,
1242 6, /* scalar_stmt_cost. */
1243 4, /* scalar load_cost. */
1244 4, /* scalar_store_cost. */
1245 6, /* vec_stmt_cost. */
1246 0, /* vec_to_scalar_cost. */
1247 2, /* scalar_to_vec_cost. */
1248 4, /* vec_align_load_cost. */
1249 4, /* vec_unalign_load_cost. */
1250 4, /* vec_store_cost. */
1251 4, /* cond_taken_branch_cost. */
 1252 2, /* cond_not_taken_branch_cost. */
 1253 };
 1255 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
 1256 very small blocks it is better to use a loop. For large blocks, a libcall
 1257 can do non-temporal accesses and beat inlining considerably. */
1258 static stringop_algs bdver4_memcpy[2] = {
1259 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1260 {-1, rep_prefix_4_byte, false}}},
1261 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1262 {-1, libcall, false}}}};
1263 static stringop_algs bdver4_memset[2] = {
1264 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1265 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1266 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1267 {-1, libcall, false}}}};
1268 struct processor_costs bdver4_cost = {
1269 COSTS_N_INSNS (1), /* cost of an add instruction */
1270 COSTS_N_INSNS (1), /* cost of a lea instruction */
1271 COSTS_N_INSNS (1), /* variable shift costs */
1272 COSTS_N_INSNS (1), /* constant shift costs */
1273 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1274 COSTS_N_INSNS (4), /* HI */
1275 COSTS_N_INSNS (4), /* SI */
1276 COSTS_N_INSNS (6), /* DI */
1277 COSTS_N_INSNS (6)}, /* other */
1278 0, /* cost of multiply per each bit set */
1279 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1280 COSTS_N_INSNS (35), /* HI */
1281 COSTS_N_INSNS (51), /* SI */
1282 COSTS_N_INSNS (83), /* DI */
1283 COSTS_N_INSNS (83)}, /* other */
1284 COSTS_N_INSNS (1), /* cost of movsx */
1285 COSTS_N_INSNS (1), /* cost of movzx */
1286 8, /* "large" insn */
1287 9, /* MOVE_RATIO */
1288 4, /* cost for loading QImode using movzbl */
1289 {5, 5, 4}, /* cost of loading integer registers
1290 in QImode, HImode and SImode.
1291 Relative to reg-reg move (2). */
1292 {4, 4, 4}, /* cost of storing integer registers */
1293 2, /* cost of reg,reg fld/fst */
1294 {5, 5, 12}, /* cost of loading fp registers
1295 in SFmode, DFmode and XFmode */
1296 {4, 4, 8}, /* cost of storing fp registers
1297 in SFmode, DFmode and XFmode */
1298 2, /* cost of moving MMX register */
1299 {4, 4}, /* cost of loading MMX registers
1300 in SImode and DImode */
1301 {4, 4}, /* cost of storing MMX registers
1302 in SImode and DImode */
1303 2, /* cost of moving SSE register */
1304 {4, 4, 4}, /* cost of loading SSE registers
1305 in SImode, DImode and TImode */
1306 {4, 4, 4}, /* cost of storing SSE registers
1307 in SImode, DImode and TImode */
1308 2, /* MMX or SSE register to integer */
1309 16, /* size of l1 cache. */
1310 2048, /* size of l2 cache. */
1311 64, /* size of prefetch block */
1312 /* New AMD processors never drop prefetches; if they cannot be performed
1313 immediately, they are queued. We set number of simultaneous prefetches
1314 to a large constant to reflect this (it probably is not a good idea not
1315 to limit number of prefetches at all, as their execution also takes some
1316 time). */
1317 100, /* number of parallel prefetches */
1318 2, /* Branch cost */
1319 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1320 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1321 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1322 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1323 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1324 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1326 bdver4_memcpy,
1327 bdver4_memset,
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 4, /* cond_taken_branch_cost. */
 1338 2, /* cond_not_taken_branch_cost. */
 1339 };
 1342 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
 1343 very small blocks it is better to use a loop. For large blocks, a libcall
 1344 can do non-temporal accesses and beat inlining considerably. */
1345 static stringop_algs znver1_memcpy[2] = {
1346 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1347 {-1, rep_prefix_4_byte, false}}},
1348 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1349 {-1, libcall, false}}}};
1350 static stringop_algs znver1_memset[2] = {
1351 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1352 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1353 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1354 {-1, libcall, false}}}};
1355 struct processor_costs znver1_cost = {
1356 COSTS_N_INSNS (1), /* cost of an add instruction. */
1357 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1358 COSTS_N_INSNS (1), /* variable shift costs. */
1359 COSTS_N_INSNS (1), /* constant shift costs. */
1360 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1361 COSTS_N_INSNS (3), /* HI. */
1362 COSTS_N_INSNS (3), /* SI. */
1363 COSTS_N_INSNS (4), /* DI. */
1364 COSTS_N_INSNS (4)}, /* other. */
1365 0, /* cost of multiply per each bit
1366 set. */
1367 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1368 COSTS_N_INSNS (35), /* HI. */
1369 COSTS_N_INSNS (51), /* SI. */
1370 COSTS_N_INSNS (83), /* DI. */
1371 COSTS_N_INSNS (83)}, /* other. */
1372 COSTS_N_INSNS (1), /* cost of movsx. */
1373 COSTS_N_INSNS (1), /* cost of movzx. */
1374 8, /* "large" insn. */
1375 9, /* MOVE_RATIO. */
1376 4, /* cost for loading QImode using
1377 movzbl. */
1378 {5, 5, 4}, /* cost of loading integer registers
1379 in QImode, HImode and SImode.
1380 Relative to reg-reg move (2). */
1381 {4, 4, 4}, /* cost of storing integer
1382 registers. */
1383 2, /* cost of reg,reg fld/fst. */
1384 {5, 5, 12}, /* cost of loading fp registers
1385 in SFmode, DFmode and XFmode. */
1386 {4, 4, 8}, /* cost of storing fp registers
1387 in SFmode, DFmode and XFmode. */
1388 2, /* cost of moving MMX register. */
1389 {4, 4}, /* cost of loading MMX registers
1390 in SImode and DImode. */
1391 {4, 4}, /* cost of storing MMX registers
1392 in SImode and DImode. */
1393 2, /* cost of moving SSE register. */
1394 {4, 4, 4}, /* cost of loading SSE registers
1395 in SImode, DImode and TImode. */
1396 {4, 4, 4}, /* cost of storing SSE registers
1397 in SImode, DImode and TImode. */
1398 2, /* MMX or SSE register to integer. */
1399 32, /* size of l1 cache. */
1400 512, /* size of l2 cache. */
1401 64, /* size of prefetch block. */
1402 /* New AMD processors never drop prefetches; if they cannot be performed
1403 immediately, they are queued. We set number of simultaneous prefetches
1404 to a large constant to reflect this (it probably is not a good idea not
1405 to limit number of prefetches at all, as their execution also takes some
1406 time). */
1407 100, /* number of parallel prefetches. */
1408 2, /* Branch cost. */
1409 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1410 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1411 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1412 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1413 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1414 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1416 znver1_memcpy,
1417 znver1_memset,
1418 6, /* scalar_stmt_cost. */
1419 4, /* scalar load_cost. */
1420 4, /* scalar_store_cost. */
1421 6, /* vec_stmt_cost. */
1422 0, /* vec_to_scalar_cost. */
1423 2, /* scalar_to_vec_cost. */
1424 4, /* vec_align_load_cost. */
1425 4, /* vec_unalign_load_cost. */
1426 4, /* vec_store_cost. */
1427 4, /* cond_taken_branch_cost. */
1428 2, /* cond_not_taken_branch_cost. */
1431 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1432 very small blocks it is better to use a loop. For large blocks, libcall can
1433 do nontemporal accesses and beat inline considerably. */
1434 static stringop_algs btver1_memcpy[2] = {
1435 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1436 {-1, rep_prefix_4_byte, false}}},
1437 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1438 {-1, libcall, false}}}};
1439 static stringop_algs btver1_memset[2] = {
1440 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1441 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1442 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1443 {-1, libcall, false}}}};
1444 const struct processor_costs btver1_cost = {
1445 COSTS_N_INSNS (1), /* cost of an add instruction */
1446 COSTS_N_INSNS (2), /* cost of a lea instruction */
1447 COSTS_N_INSNS (1), /* variable shift costs */
1448 COSTS_N_INSNS (1), /* constant shift costs */
1449 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1450 COSTS_N_INSNS (4), /* HI */
1451 COSTS_N_INSNS (3), /* SI */
1452 COSTS_N_INSNS (4), /* DI */
1453 COSTS_N_INSNS (5)}, /* other */
1454 0, /* cost of multiply per each bit set */
1455 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1456 COSTS_N_INSNS (35), /* HI */
1457 COSTS_N_INSNS (51), /* SI */
1458 COSTS_N_INSNS (83), /* DI */
1459 COSTS_N_INSNS (83)}, /* other */
1460 COSTS_N_INSNS (1), /* cost of movsx */
1461 COSTS_N_INSNS (1), /* cost of movzx */
1462 8, /* "large" insn */
1463 9, /* MOVE_RATIO */
1464 4, /* cost for loading QImode using movzbl */
1465 {3, 4, 3}, /* cost of loading integer registers
1466 in QImode, HImode and SImode.
1467 Relative to reg-reg move (2). */
1468 {3, 4, 3}, /* cost of storing integer registers */
1469 4, /* cost of reg,reg fld/fst */
1470 {4, 4, 12}, /* cost of loading fp registers
1471 in SFmode, DFmode and XFmode */
1472 {6, 6, 8}, /* cost of storing fp registers
1473 in SFmode, DFmode and XFmode */
1474 2, /* cost of moving MMX register */
1475 {3, 3}, /* cost of loading MMX registers
1476 in SImode and DImode */
1477 {4, 4}, /* cost of storing MMX registers
1478 in SImode and DImode */
1479 2, /* cost of moving SSE register */
1480 {4, 4, 3}, /* cost of loading SSE registers
1481 in SImode, DImode and TImode */
1482 {4, 4, 5}, /* cost of storing SSE registers
1483 in SImode, DImode and TImode */
1484 3, /* MMX or SSE register to integer */
1485 /* On K8:
1486 MOVD reg64, xmmreg Double FSTORE 4
1487 MOVD reg32, xmmreg Double FSTORE 4
1488 On AMDFAM10:
1489 MOVD reg64, xmmreg Double FADD 3
1490 1/1 1/1
1491 MOVD reg32, xmmreg Double FADD 3
1492 1/1 1/1 */
1493 32, /* size of l1 cache. */
1494 512, /* size of l2 cache. */
1495 64, /* size of prefetch block */
1496 100, /* number of parallel prefetches */
1497 2, /* Branch cost */
1498 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1499 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1500 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1501 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1502 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1503 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1505 btver1_memcpy,
1506 btver1_memset,
1507 4, /* scalar_stmt_cost. */
1508 2, /* scalar load_cost. */
1509 2, /* scalar_store_cost. */
1510 6, /* vec_stmt_cost. */
1511 0, /* vec_to_scalar_cost. */
1512 2, /* scalar_to_vec_cost. */
1513 2, /* vec_align_load_cost. */
1514 2, /* vec_unalign_load_cost. */
1515 2, /* vec_store_cost. */
1516 2, /* cond_taken_branch_cost. */
1517 1, /* cond_not_taken_branch_cost. */
1520 static stringop_algs btver2_memcpy[2] = {
1521 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1522 {-1, rep_prefix_4_byte, false}}},
1523 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1524 {-1, libcall, false}}}};
1525 static stringop_algs btver2_memset[2] = {
1526 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1527 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1528 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1529 {-1, libcall, false}}}};
1530 const struct processor_costs btver2_cost = {
1531 COSTS_N_INSNS (1), /* cost of an add instruction */
1532 COSTS_N_INSNS (2), /* cost of a lea instruction */
1533 COSTS_N_INSNS (1), /* variable shift costs */
1534 COSTS_N_INSNS (1), /* constant shift costs */
1535 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1536 COSTS_N_INSNS (4), /* HI */
1537 COSTS_N_INSNS (3), /* SI */
1538 COSTS_N_INSNS (4), /* DI */
1539 COSTS_N_INSNS (5)}, /* other */
1540 0, /* cost of multiply per each bit set */
1541 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1542 COSTS_N_INSNS (35), /* HI */
1543 COSTS_N_INSNS (51), /* SI */
1544 COSTS_N_INSNS (83), /* DI */
1545 COSTS_N_INSNS (83)}, /* other */
1546 COSTS_N_INSNS (1), /* cost of movsx */
1547 COSTS_N_INSNS (1), /* cost of movzx */
1548 8, /* "large" insn */
1549 9, /* MOVE_RATIO */
1550 4, /* cost for loading QImode using movzbl */
1551 {3, 4, 3}, /* cost of loading integer registers
1552 in QImode, HImode and SImode.
1553 Relative to reg-reg move (2). */
1554 {3, 4, 3}, /* cost of storing integer registers */
1555 4, /* cost of reg,reg fld/fst */
1556 {4, 4, 12}, /* cost of loading fp registers
1557 in SFmode, DFmode and XFmode */
1558 {6, 6, 8}, /* cost of storing fp registers
1559 in SFmode, DFmode and XFmode */
1560 2, /* cost of moving MMX register */
1561 {3, 3}, /* cost of loading MMX registers
1562 in SImode and DImode */
1563 {4, 4}, /* cost of storing MMX registers
1564 in SImode and DImode */
1565 2, /* cost of moving SSE register */
1566 {4, 4, 3}, /* cost of loading SSE registers
1567 in SImode, DImode and TImode */
1568 {4, 4, 5}, /* cost of storing SSE registers
1569 in SImode, DImode and TImode */
1570 3, /* MMX or SSE register to integer */
1571 /* On K8:
1572 MOVD reg64, xmmreg Double FSTORE 4
1573 MOVD reg32, xmmreg Double FSTORE 4
1574 On AMDFAM10:
1575 MOVD reg64, xmmreg Double FADD 3
1576 1/1 1/1
1577 MOVD reg32, xmmreg Double FADD 3
1578 1/1 1/1 */
1579 32, /* size of l1 cache. */
1580 2048, /* size of l2 cache. */
1581 64, /* size of prefetch block */
1582 100, /* number of parallel prefetches */
1583 2, /* Branch cost */
1584 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1585 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1586 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1587 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1588 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1589 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1590 btver2_memcpy,
1591 btver2_memset,
1592 4, /* scalar_stmt_cost. */
1593 2, /* scalar load_cost. */
1594 2, /* scalar_store_cost. */
1595 6, /* vec_stmt_cost. */
1596 0, /* vec_to_scalar_cost. */
1597 2, /* scalar_to_vec_cost. */
1598 2, /* vec_align_load_cost. */
1599 2, /* vec_unalign_load_cost. */
1600 2, /* vec_store_cost. */
1601 2, /* cond_taken_branch_cost. */
1602 1, /* cond_not_taken_branch_cost. */
1605 static stringop_algs pentium4_memcpy[2] = {
1606 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1607 DUMMY_STRINGOP_ALGS};
1608 static stringop_algs pentium4_memset[2] = {
1609 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1610 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1611 DUMMY_STRINGOP_ALGS};
1613 static const
1614 struct processor_costs pentium4_cost = {
1615 COSTS_N_INSNS (1), /* cost of an add instruction */
1616 COSTS_N_INSNS (3), /* cost of a lea instruction */
1617 COSTS_N_INSNS (4), /* variable shift costs */
1618 COSTS_N_INSNS (4), /* constant shift costs */
1619 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1620 COSTS_N_INSNS (15), /* HI */
1621 COSTS_N_INSNS (15), /* SI */
1622 COSTS_N_INSNS (15), /* DI */
1623 COSTS_N_INSNS (15)}, /* other */
1624 0, /* cost of multiply per each bit set */
1625 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1626 COSTS_N_INSNS (56), /* HI */
1627 COSTS_N_INSNS (56), /* SI */
1628 COSTS_N_INSNS (56), /* DI */
1629 COSTS_N_INSNS (56)}, /* other */
1630 COSTS_N_INSNS (1), /* cost of movsx */
1631 COSTS_N_INSNS (1), /* cost of movzx */
1632 16, /* "large" insn */
1633 6, /* MOVE_RATIO */
1634 2, /* cost for loading QImode using movzbl */
1635 {4, 5, 4}, /* cost of loading integer registers
1636 in QImode, HImode and SImode.
1637 Relative to reg-reg move (2). */
1638 {2, 3, 2}, /* cost of storing integer registers */
1639 2, /* cost of reg,reg fld/fst */
1640 {2, 2, 6}, /* cost of loading fp registers
1641 in SFmode, DFmode and XFmode */
1642 {4, 4, 6}, /* cost of storing fp registers
1643 in SFmode, DFmode and XFmode */
1644 2, /* cost of moving MMX register */
1645 {2, 2}, /* cost of loading MMX registers
1646 in SImode and DImode */
1647 {2, 2}, /* cost of storing MMX registers
1648 in SImode and DImode */
1649 12, /* cost of moving SSE register */
1650 {12, 12, 12}, /* cost of loading SSE registers
1651 in SImode, DImode and TImode */
1652 {2, 2, 8}, /* cost of storing SSE registers
1653 in SImode, DImode and TImode */
1654 10, /* MMX or SSE register to integer */
1655 8, /* size of l1 cache. */
1656 256, /* size of l2 cache. */
1657 64, /* size of prefetch block */
1658 6, /* number of parallel prefetches */
1659 2, /* Branch cost */
1660 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1661 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1662 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1663 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1664 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1665 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1666 pentium4_memcpy,
1667 pentium4_memset,
1668 1, /* scalar_stmt_cost. */
1669 1, /* scalar load_cost. */
1670 1, /* scalar_store_cost. */
1671 1, /* vec_stmt_cost. */
1672 1, /* vec_to_scalar_cost. */
1673 1, /* scalar_to_vec_cost. */
1674 1, /* vec_align_load_cost. */
1675 2, /* vec_unalign_load_cost. */
1676 1, /* vec_store_cost. */
1677 3, /* cond_taken_branch_cost. */
1678 1, /* cond_not_taken_branch_cost. */
1681 static stringop_algs nocona_memcpy[2] = {
1682 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1683 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1684 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1686 static stringop_algs nocona_memset[2] = {
1687 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1688 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1689 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1690 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1692 static const
1693 struct processor_costs nocona_cost = {
1694 COSTS_N_INSNS (1), /* cost of an add instruction */
1695 COSTS_N_INSNS (1), /* cost of a lea instruction */
1696 COSTS_N_INSNS (1), /* variable shift costs */
1697 COSTS_N_INSNS (1), /* constant shift costs */
1698 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1699 COSTS_N_INSNS (10), /* HI */
1700 COSTS_N_INSNS (10), /* SI */
1701 COSTS_N_INSNS (10), /* DI */
1702 COSTS_N_INSNS (10)}, /* other */
1703 0, /* cost of multiply per each bit set */
1704 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1705 COSTS_N_INSNS (66), /* HI */
1706 COSTS_N_INSNS (66), /* SI */
1707 COSTS_N_INSNS (66), /* DI */
1708 COSTS_N_INSNS (66)}, /* other */
1709 COSTS_N_INSNS (1), /* cost of movsx */
1710 COSTS_N_INSNS (1), /* cost of movzx */
1711 16, /* "large" insn */
1712 17, /* MOVE_RATIO */
1713 4, /* cost for loading QImode using movzbl */
1714 {4, 4, 4}, /* cost of loading integer registers
1715 in QImode, HImode and SImode.
1716 Relative to reg-reg move (2). */
1717 {4, 4, 4}, /* cost of storing integer registers */
1718 3, /* cost of reg,reg fld/fst */
1719 {12, 12, 12}, /* cost of loading fp registers
1720 in SFmode, DFmode and XFmode */
1721 {4, 4, 4}, /* cost of storing fp registers
1722 in SFmode, DFmode and XFmode */
1723 6, /* cost of moving MMX register */
1724 {12, 12}, /* cost of loading MMX registers
1725 in SImode and DImode */
1726 {12, 12}, /* cost of storing MMX registers
1727 in SImode and DImode */
1728 6, /* cost of moving SSE register */
1729 {12, 12, 12}, /* cost of loading SSE registers
1730 in SImode, DImode and TImode */
1731 {12, 12, 12}, /* cost of storing SSE registers
1732 in SImode, DImode and TImode */
1733 8, /* MMX or SSE register to integer */
1734 8, /* size of l1 cache. */
1735 1024, /* size of l2 cache. */
1736 64, /* size of prefetch block */
1737 8, /* number of parallel prefetches */
1738 1, /* Branch cost */
1739 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1740 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1741 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1742 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1743 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1744 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1745 nocona_memcpy,
1746 nocona_memset,
1747 1, /* scalar_stmt_cost. */
1748 1, /* scalar load_cost. */
1749 1, /* scalar_store_cost. */
1750 1, /* vec_stmt_cost. */
1751 1, /* vec_to_scalar_cost. */
1752 1, /* scalar_to_vec_cost. */
1753 1, /* vec_align_load_cost. */
1754 2, /* vec_unalign_load_cost. */
1755 1, /* vec_store_cost. */
1756 3, /* cond_taken_branch_cost. */
1757 1, /* cond_not_taken_branch_cost. */
1760 static stringop_algs atom_memcpy[2] = {
1761 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1762 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1763 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1764 static stringop_algs atom_memset[2] = {
1765 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1766 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1767 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1768 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1769 static const
1770 struct processor_costs atom_cost = {
1771 COSTS_N_INSNS (1), /* cost of an add instruction */
1772 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1773 COSTS_N_INSNS (1), /* variable shift costs */
1774 COSTS_N_INSNS (1), /* constant shift costs */
1775 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1776 COSTS_N_INSNS (4), /* HI */
1777 COSTS_N_INSNS (3), /* SI */
1778 COSTS_N_INSNS (4), /* DI */
1779 COSTS_N_INSNS (2)}, /* other */
1780 0, /* cost of multiply per each bit set */
1781 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1782 COSTS_N_INSNS (26), /* HI */
1783 COSTS_N_INSNS (42), /* SI */
1784 COSTS_N_INSNS (74), /* DI */
1785 COSTS_N_INSNS (74)}, /* other */
1786 COSTS_N_INSNS (1), /* cost of movsx */
1787 COSTS_N_INSNS (1), /* cost of movzx */
1788 8, /* "large" insn */
1789 17, /* MOVE_RATIO */
1790 4, /* cost for loading QImode using movzbl */
1791 {4, 4, 4}, /* cost of loading integer registers
1792 in QImode, HImode and SImode.
1793 Relative to reg-reg move (2). */
1794 {4, 4, 4}, /* cost of storing integer registers */
1795 4, /* cost of reg,reg fld/fst */
1796 {12, 12, 12}, /* cost of loading fp registers
1797 in SFmode, DFmode and XFmode */
1798 {6, 6, 8}, /* cost of storing fp registers
1799 in SFmode, DFmode and XFmode */
1800 2, /* cost of moving MMX register */
1801 {8, 8}, /* cost of loading MMX registers
1802 in SImode and DImode */
1803 {8, 8}, /* cost of storing MMX registers
1804 in SImode and DImode */
1805 2, /* cost of moving SSE register */
1806 {8, 8, 8}, /* cost of loading SSE registers
1807 in SImode, DImode and TImode */
1808 {8, 8, 8}, /* cost of storing SSE registers
1809 in SImode, DImode and TImode */
1810 5, /* MMX or SSE register to integer */
1811 32, /* size of l1 cache. */
1812 256, /* size of l2 cache. */
1813 64, /* size of prefetch block */
1814 6, /* number of parallel prefetches */
1815 3, /* Branch cost */
1816 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1817 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1818 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1819 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1820 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1821 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1822 atom_memcpy,
1823 atom_memset,
1824 1, /* scalar_stmt_cost. */
1825 1, /* scalar load_cost. */
1826 1, /* scalar_store_cost. */
1827 1, /* vec_stmt_cost. */
1828 1, /* vec_to_scalar_cost. */
1829 1, /* scalar_to_vec_cost. */
1830 1, /* vec_align_load_cost. */
1831 2, /* vec_unalign_load_cost. */
1832 1, /* vec_store_cost. */
1833 3, /* cond_taken_branch_cost. */
1834 1, /* cond_not_taken_branch_cost. */
1837 static stringop_algs slm_memcpy[2] = {
1838 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1839 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1840 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1841 static stringop_algs slm_memset[2] = {
1842 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1843 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1844 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1845 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1846 static const
1847 struct processor_costs slm_cost = {
1848 COSTS_N_INSNS (1), /* cost of an add instruction */
1849 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1850 COSTS_N_INSNS (1), /* variable shift costs */
1851 COSTS_N_INSNS (1), /* constant shift costs */
1852 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1853 COSTS_N_INSNS (3), /* HI */
1854 COSTS_N_INSNS (3), /* SI */
1855 COSTS_N_INSNS (4), /* DI */
1856 COSTS_N_INSNS (2)}, /* other */
1857 0, /* cost of multiply per each bit set */
1858 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1859 COSTS_N_INSNS (26), /* HI */
1860 COSTS_N_INSNS (42), /* SI */
1861 COSTS_N_INSNS (74), /* DI */
1862 COSTS_N_INSNS (74)}, /* other */
1863 COSTS_N_INSNS (1), /* cost of movsx */
1864 COSTS_N_INSNS (1), /* cost of movzx */
1865 8, /* "large" insn */
1866 17, /* MOVE_RATIO */
1867 4, /* cost for loading QImode using movzbl */
1868 {4, 4, 4}, /* cost of loading integer registers
1869 in QImode, HImode and SImode.
1870 Relative to reg-reg move (2). */
1871 {4, 4, 4}, /* cost of storing integer registers */
1872 4, /* cost of reg,reg fld/fst */
1873 {12, 12, 12}, /* cost of loading fp registers
1874 in SFmode, DFmode and XFmode */
1875 {6, 6, 8}, /* cost of storing fp registers
1876 in SFmode, DFmode and XFmode */
1877 2, /* cost of moving MMX register */
1878 {8, 8}, /* cost of loading MMX registers
1879 in SImode and DImode */
1880 {8, 8}, /* cost of storing MMX registers
1881 in SImode and DImode */
1882 2, /* cost of moving SSE register */
1883 {8, 8, 8}, /* cost of loading SSE registers
1884 in SImode, DImode and TImode */
1885 {8, 8, 8}, /* cost of storing SSE registers
1886 in SImode, DImode and TImode */
1887 5, /* MMX or SSE register to integer */
1888 32, /* size of l1 cache. */
1889 256, /* size of l2 cache. */
1890 64, /* size of prefetch block */
1891 6, /* number of parallel prefetches */
1892 3, /* Branch cost */
1893 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1894 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1895 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1896 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1897 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1898 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1899 slm_memcpy,
1900 slm_memset,
1901 1, /* scalar_stmt_cost. */
1902 1, /* scalar load_cost. */
1903 1, /* scalar_store_cost. */
1904 1, /* vec_stmt_cost. */
1905 4, /* vec_to_scalar_cost. */
1906 1, /* scalar_to_vec_cost. */
1907 1, /* vec_align_load_cost. */
1908 2, /* vec_unalign_load_cost. */
1909 1, /* vec_store_cost. */
1910 3, /* cond_taken_branch_cost. */
1911 1, /* cond_not_taken_branch_cost. */
1914 static stringop_algs intel_memcpy[2] = {
1915 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1916 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1917 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1918 static stringop_algs intel_memset[2] = {
1919 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1920 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1921 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1922 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1923 static const
1924 struct processor_costs intel_cost = {
1925 COSTS_N_INSNS (1), /* cost of an add instruction */
1926 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1927 COSTS_N_INSNS (1), /* variable shift costs */
1928 COSTS_N_INSNS (1), /* constant shift costs */
1929 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1930 COSTS_N_INSNS (3), /* HI */
1931 COSTS_N_INSNS (3), /* SI */
1932 COSTS_N_INSNS (4), /* DI */
1933 COSTS_N_INSNS (2)}, /* other */
1934 0, /* cost of multiply per each bit set */
1935 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1936 COSTS_N_INSNS (26), /* HI */
1937 COSTS_N_INSNS (42), /* SI */
1938 COSTS_N_INSNS (74), /* DI */
1939 COSTS_N_INSNS (74)}, /* other */
1940 COSTS_N_INSNS (1), /* cost of movsx */
1941 COSTS_N_INSNS (1), /* cost of movzx */
1942 8, /* "large" insn */
1943 17, /* MOVE_RATIO */
1944 4, /* cost for loading QImode using movzbl */
1945 {4, 4, 4}, /* cost of loading integer registers
1946 in QImode, HImode and SImode.
1947 Relative to reg-reg move (2). */
1948 {4, 4, 4}, /* cost of storing integer registers */
1949 4, /* cost of reg,reg fld/fst */
1950 {12, 12, 12}, /* cost of loading fp registers
1951 in SFmode, DFmode and XFmode */
1952 {6, 6, 8}, /* cost of storing fp registers
1953 in SFmode, DFmode and XFmode */
1954 2, /* cost of moving MMX register */
1955 {8, 8}, /* cost of loading MMX registers
1956 in SImode and DImode */
1957 {8, 8}, /* cost of storing MMX registers
1958 in SImode and DImode */
1959 2, /* cost of moving SSE register */
1960 {8, 8, 8}, /* cost of loading SSE registers
1961 in SImode, DImode and TImode */
1962 {8, 8, 8}, /* cost of storing SSE registers
1963 in SImode, DImode and TImode */
1964 5, /* MMX or SSE register to integer */
1965 32, /* size of l1 cache. */
1966 256, /* size of l2 cache. */
1967 64, /* size of prefetch block */
1968 6, /* number of parallel prefetches */
1969 3, /* Branch cost */
1970 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1971 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1972 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1973 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1974 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1975 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1976 intel_memcpy,
1977 intel_memset,
1978 1, /* scalar_stmt_cost. */
1979 1, /* scalar load_cost. */
1980 1, /* scalar_store_cost. */
1981 1, /* vec_stmt_cost. */
1982 4, /* vec_to_scalar_cost. */
1983 1, /* scalar_to_vec_cost. */
1984 1, /* vec_align_load_cost. */
1985 2, /* vec_unalign_load_cost. */
1986 1, /* vec_store_cost. */
1987 3, /* cond_taken_branch_cost. */
1988 1, /* cond_not_taken_branch_cost. */
1991 /* Generic should produce code tuned for Core-i7 (and newer chips)
1992 and btver1 (and newer chips). */
1994 static stringop_algs generic_memcpy[2] = {
1995 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1996 {-1, libcall, false}}},
1997 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1998 {-1, libcall, false}}}};
1999 static stringop_algs generic_memset[2] = {
2000 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2001 {-1, libcall, false}}},
2002 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2003 {-1, libcall, false}}}};
2004 static const
2005 struct processor_costs generic_cost = {
2006 COSTS_N_INSNS (1), /* cost of an add instruction */
2007 /* On all chips taken into consideration, lea is 2 cycles or more. With
2008 this cost, however, our current implementation of synth_mult results in
2009 the use of unnecessary temporary registers, causing regressions on several
2010 SPECfp benchmarks. */
2011 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2012 COSTS_N_INSNS (1), /* variable shift costs */
2013 COSTS_N_INSNS (1), /* constant shift costs */
2014 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2015 COSTS_N_INSNS (4), /* HI */
2016 COSTS_N_INSNS (3), /* SI */
2017 COSTS_N_INSNS (4), /* DI */
2018 COSTS_N_INSNS (2)}, /* other */
2019 0, /* cost of multiply per each bit set */
2020 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2021 COSTS_N_INSNS (26), /* HI */
2022 COSTS_N_INSNS (42), /* SI */
2023 COSTS_N_INSNS (74), /* DI */
2024 COSTS_N_INSNS (74)}, /* other */
2025 COSTS_N_INSNS (1), /* cost of movsx */
2026 COSTS_N_INSNS (1), /* cost of movzx */
2027 8, /* "large" insn */
2028 17, /* MOVE_RATIO */
2029 4, /* cost for loading QImode using movzbl */
2030 {4, 4, 4}, /* cost of loading integer registers
2031 in QImode, HImode and SImode.
2032 Relative to reg-reg move (2). */
2033 {4, 4, 4}, /* cost of storing integer registers */
2034 4, /* cost of reg,reg fld/fst */
2035 {12, 12, 12}, /* cost of loading fp registers
2036 in SFmode, DFmode and XFmode */
2037 {6, 6, 8}, /* cost of storing fp registers
2038 in SFmode, DFmode and XFmode */
2039 2, /* cost of moving MMX register */
2040 {8, 8}, /* cost of loading MMX registers
2041 in SImode and DImode */
2042 {8, 8}, /* cost of storing MMX registers
2043 in SImode and DImode */
2044 2, /* cost of moving SSE register */
2045 {8, 8, 8}, /* cost of loading SSE registers
2046 in SImode, DImode and TImode */
2047 {8, 8, 8}, /* cost of storing SSE registers
2048 in SImode, DImode and TImode */
2049 5, /* MMX or SSE register to integer */
2050 32, /* size of l1 cache. */
2051 512, /* size of l2 cache. */
2052 64, /* size of prefetch block */
2053 6, /* number of parallel prefetches */
2054 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2055 value is increased to the perhaps more appropriate value of 5. */
2056 3, /* Branch cost */
2057 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2058 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2059 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2060 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2061 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2062 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2063 generic_memcpy,
2064 generic_memset,
2065 1, /* scalar_stmt_cost. */
2066 1, /* scalar load_cost. */
2067 1, /* scalar_store_cost. */
2068 1, /* vec_stmt_cost. */
2069 1, /* vec_to_scalar_cost. */
2070 1, /* scalar_to_vec_cost. */
2071 1, /* vec_align_load_cost. */
2072 2, /* vec_unalign_load_cost. */
2073 1, /* vec_store_cost. */
2074 3, /* cond_taken_branch_cost. */
2075 1, /* cond_not_taken_branch_cost. */
2078 /* core_cost should produce code tuned for the Core family of CPUs. */
2079 static stringop_algs core_memcpy[2] = {
2080 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2081 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2082 {-1, libcall, false}}}};
2083 static stringop_algs core_memset[2] = {
2084 {libcall, {{6, loop_1_byte, true},
2085 {24, loop, true},
2086 {8192, rep_prefix_4_byte, true},
2087 {-1, libcall, false}}},
2088 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2089 {-1, libcall, false}}}};
2091 static const
2092 struct processor_costs core_cost = {
2093 COSTS_N_INSNS (1), /* cost of an add instruction */
2094 /* On all chips taken into consideration, lea is 2 cycles or more. With
2095 this cost, however, our current implementation of synth_mult results in
2096 the use of unnecessary temporary registers, causing regressions on several
2097 SPECfp benchmarks. */
2098 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2099 COSTS_N_INSNS (1), /* variable shift costs */
2100 COSTS_N_INSNS (1), /* constant shift costs */
2101 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2102 COSTS_N_INSNS (4), /* HI */
2103 COSTS_N_INSNS (3), /* SI */
2104 COSTS_N_INSNS (4), /* DI */
2105 COSTS_N_INSNS (2)}, /* other */
2106 0, /* cost of multiply per each bit set */
2107 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2108 COSTS_N_INSNS (26), /* HI */
2109 COSTS_N_INSNS (42), /* SI */
2110 COSTS_N_INSNS (74), /* DI */
2111 COSTS_N_INSNS (74)}, /* other */
2112 COSTS_N_INSNS (1), /* cost of movsx */
2113 COSTS_N_INSNS (1), /* cost of movzx */
2114 8, /* "large" insn */
2115 17, /* MOVE_RATIO */
2116 4, /* cost for loading QImode using movzbl */
2117 {4, 4, 4}, /* cost of loading integer registers
2118 in QImode, HImode and SImode.
2119 Relative to reg-reg move (2). */
2120 {4, 4, 4}, /* cost of storing integer registers */
2121 4, /* cost of reg,reg fld/fst */
2122 {12, 12, 12}, /* cost of loading fp registers
2123 in SFmode, DFmode and XFmode */
2124 {6, 6, 8}, /* cost of storing fp registers
2125 in SFmode, DFmode and XFmode */
2126 2, /* cost of moving MMX register */
2127 {8, 8}, /* cost of loading MMX registers
2128 in SImode and DImode */
2129 {8, 8}, /* cost of storing MMX registers
2130 in SImode and DImode */
2131 2, /* cost of moving SSE register */
2132 {8, 8, 8}, /* cost of loading SSE registers
2133 in SImode, DImode and TImode */
2134 {8, 8, 8}, /* cost of storing SSE registers
2135 in SImode, DImode and TImode */
2136 5, /* MMX or SSE register to integer */
2137 64, /* size of l1 cache. */
2138 512, /* size of l2 cache. */
2139 64, /* size of prefetch block */
2140 6, /* number of parallel prefetches */
2141 /* FIXME: perhaps a more appropriate value is 5. */
2142 3, /* Branch cost */
2143 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2144 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2145 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2146 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2147 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2148 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2149 core_memcpy,
2150 core_memset,
2151 1, /* scalar_stmt_cost. */
2152 1, /* scalar load_cost. */
2153 1, /* scalar_store_cost. */
2154 1, /* vec_stmt_cost. */
2155 1, /* vec_to_scalar_cost. */
2156 1, /* scalar_to_vec_cost. */
2157 1, /* vec_align_load_cost. */
2158 2, /* vec_unalign_load_cost. */
2159 1, /* vec_store_cost. */
2160 3, /* cond_taken_branch_cost. */
2161 1, /* cond_not_taken_branch_cost. */
2165 /* Set by -mtune. */
2166 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2168 /* Set by -mtune or -Os. */
2169 const struct processor_costs *ix86_cost = &pentium_cost;
2171 /* Processor feature/optimization bitmasks. */
2172 #define m_386 (1U<<PROCESSOR_I386)
2173 #define m_486 (1U<<PROCESSOR_I486)
2174 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2175 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2176 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2177 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2178 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2179 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2180 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2181 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2182 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2183 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2184 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2185 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2186 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2187 #define m_KNL (1U<<PROCESSOR_KNL)
2188 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2189 #define m_INTEL (1U<<PROCESSOR_INTEL)
2191 #define m_GEODE (1U<<PROCESSOR_GEODE)
2192 #define m_K6 (1U<<PROCESSOR_K6)
2193 #define m_K6_GEODE (m_K6 | m_GEODE)
2194 #define m_K8 (1U<<PROCESSOR_K8)
2195 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2196 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2197 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2198 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2199 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2200 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2201 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2202 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2203 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2204 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2205 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2206 #define m_BTVER (m_BTVER1 | m_BTVER2)
2207 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2208 | m_ZNVER1)
2210 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
2212 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2213 #undef DEF_TUNE
2214 #define DEF_TUNE(tune, name, selector) name,
2215 #include "x86-tune.def"
2216 #undef DEF_TUNE
2219 /* Feature tests against the various tunings. */
2220 unsigned char ix86_tune_features[X86_TUNE_LAST];
2222 /* Feature tests against the various tunings used to create ix86_tune_features
2223 based on the processor mask. */
2224 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2225 #undef DEF_TUNE
2226 #define DEF_TUNE(tune, name, selector) selector,
2227 #include "x86-tune.def"
2228 #undef DEF_TUNE
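/* Editor's note: illustrative sketch of the X-macro pattern used by the two
   x86-tune.def inclusions above; not part of GCC.  The per-entry macro
   DEF_TUNE is redefined before each include, so a single .def list expands
   once into the name table and once into the selector-mask table, keeping
   the two arrays in sync by construction.  A stand-in list macro is used
   here instead of a separate .def file; all mini_* names are hypothetical.  */

#define MINI_TUNE_LIST(DEF) \
  DEF (MINI_TUNE_FOO, "foo", m_GENERIC) \
  DEF (MINI_TUNE_BAR, "bar", m_AMD_MULTIPLE)

#define MINI_NAME(tune, name, selector) name,
static const char *const mini_tune_names[] = { MINI_TUNE_LIST (MINI_NAME) };
#undef MINI_NAME

#define MINI_SELECTOR(tune, name, selector) selector,
static const unsigned int mini_tune_selectors[] = { MINI_TUNE_LIST (MINI_SELECTOR) };
#undef MINI_SELECTOR

/* A tuning feature is then enabled for the processor selected by -mtune
   roughly as (selector & (1U << ix86_tune)) != 0, which appears to be how
   ix86_tune_features is filled from initial_ix86_tune_features during
   option processing.  */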
2231 /* Feature tests against the various architecture variations. */
2232 unsigned char ix86_arch_features[X86_ARCH_LAST];
2234 /* Feature tests against the various architecture variations, used to create
2235 ix86_arch_features based on the processor mask. */
2236 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2237 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2238 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2240 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2241 ~m_386,
2243 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2244 ~(m_386 | m_486),
2246 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2247 ~m_386,
2249 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2250 ~m_386,
2253 /* In case the average insn count for a single function invocation is
2254 lower than this constant, emit fast (but longer) prologue and
2255 epilogue code. */
2256 #define FAST_PROLOGUE_INSN_COUNT 20
2258 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2259 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2260 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2261 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2263 /* Array of the smallest class containing reg number REGNO, indexed by
2264 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2266 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2268 /* ax, dx, cx, bx */
2269 AREG, DREG, CREG, BREG,
2270 /* si, di, bp, sp */
2271 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2272 /* FP registers */
2273 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2274 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2275 /* arg pointer */
2276 NON_Q_REGS,
2277 /* flags, fpsr, fpcr, frame */
2278 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2279 /* SSE registers */
2280 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2281 SSE_REGS, SSE_REGS,
2282 /* MMX registers */
2283 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2284 MMX_REGS, MMX_REGS,
2285 /* REX registers */
2286 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2287 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2288 /* SSE REX registers */
2289 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2290 SSE_REGS, SSE_REGS,
2291 /* AVX-512 SSE registers */
2292 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2293 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2294 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2295 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2296 /* Mask registers. */
2297 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2298 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2299 /* MPX bound registers */
2300 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2303 /* The "default" register map used in 32bit mode. */
2305 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2307 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2308 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2309 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2310 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2311 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2312 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2313 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2314 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2315 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2316 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2317 101, 102, 103, 104, /* bound registers */
2320 /* The "default" register map used in 64bit mode. */
2322 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2324 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2325 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2326 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2327 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2328 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2329 8,9,10,11,12,13,14,15, /* extended integer registers */
2330 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2331 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2332 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2333 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2334 126, 127, 128, 129, /* bound registers */
2337 /* Define the register numbers to be used in Dwarf debugging information.
2338 The SVR4 reference port C compiler uses the following register numbers
2339 in its Dwarf output code:
2340 0 for %eax (gcc regno = 0)
2341 1 for %ecx (gcc regno = 2)
2342 2 for %edx (gcc regno = 1)
2343 3 for %ebx (gcc regno = 3)
2344 4 for %esp (gcc regno = 7)
2345 5 for %ebp (gcc regno = 6)
2346 6 for %esi (gcc regno = 4)
2347 7 for %edi (gcc regno = 5)
2348 The following three DWARF register numbers are never generated by
2349 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2350 believes these numbers have these meanings.
2351 8 for %eip (no gcc equivalent)
2352 9 for %eflags (gcc regno = 17)
2353 10 for %trapno (no gcc equivalent)
2354 It is not at all clear how we should number the FP stack registers
2355 for the x86 architecture. If the version of SDB on x86/svr4 were
2356 a bit less brain dead with respect to floating-point then we would
2357 have a precedent to follow with respect to DWARF register numbers
2358 for x86 FP registers, but the SDB on x86/svr4 is so completely
2359 broken with respect to FP registers that it is hardly worth thinking
2360 of it as something to strive for compatibility with.
2361 The version of x86/svr4 SDB I have at the moment does (partially)
2362 seem to believe that DWARF register number 11 is associated with
2363 the x86 register %st(0), but that's about all. Higher DWARF
2364 register numbers don't seem to be associated with anything in
2365 particular, and even for DWARF regno 11, SDB only seems to under-
2366 stand that it should say that a variable lives in %st(0) (when
2367 asked via an `=' command) if we said it was in DWARF regno 11,
2368 but SDB still prints garbage when asked for the value of the
2369 variable in question (via a `/' command).
2370 (Also note that the labels SDB prints for various FP stack regs
2371 when doing an `x' command are all wrong.)
2372 Note that these problems generally don't affect the native SVR4
2373 C compiler because it doesn't allow the use of -O with -g and
2374 because when it is *not* optimizing, it allocates a memory
2375 location for each floating-point variable, and the memory
2376 location is what gets described in the DWARF AT_location
2377 attribute for the variable in question.
2378 Regardless of the severe mental illness of the x86/svr4 SDB, we
2379 do something sensible here and we use the following DWARF
2380 register numbers. Note that these are all stack-top-relative
2381 numbers.
2382 11 for %st(0) (gcc regno = 8)
2383 12 for %st(1) (gcc regno = 9)
2384 13 for %st(2) (gcc regno = 10)
2385 14 for %st(3) (gcc regno = 11)
2386 15 for %st(4) (gcc regno = 12)
2387 16 for %st(5) (gcc regno = 13)
2388 17 for %st(6) (gcc regno = 14)
2389 18 for %st(7) (gcc regno = 15)
2391 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2393 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2394 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2395 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2396 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2397 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2398 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2399 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2400 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2401 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2402 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2403 101, 102, 103, 104, /* bound registers */
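/* Editor's note: illustrative sketch, not part of GCC.  The maps above are
   indexed by gcc's internal register number and yield the debugger/DWARF
   register number, with -1 meaning "no assigned number".  For instance, gcc
   regno 4 is %esi and svr4_dbx_register_map[4] is 6, matching the "6 for
   %esi (gcc regno = 4)" line in the comment above.  In the backend the
   lookup normally goes through the DBX_REGISTER_NUMBER target macro; a
   hypothetical direct helper for the SVR4 map would be:  */

static inline int
mini_svr4_dwarf_regno (unsigned int gcc_regno)
{
  return svr4_dbx_register_map[gcc_regno];
}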
2406 /* Define parameter passing and return registers. */
2408 static int const x86_64_int_parameter_registers[6] =
2410 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2413 static int const x86_64_ms_abi_int_parameter_registers[4] =
2415 CX_REG, DX_REG, R8_REG, R9_REG
2418 static int const x86_64_int_return_registers[4] =
2420 AX_REG, DX_REG, DI_REG, SI_REG
2423 /* Additional registers that are clobbered by SYSV calls. */
2425 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2427 SI_REG, DI_REG,
2428 XMM6_REG, XMM7_REG,
2429 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2430 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2433 /* Define the structure for the machine field in struct function. */
2435 struct GTY(()) stack_local_entry {
2436 unsigned short mode;
2437 unsigned short n;
2438 rtx rtl;
2439 struct stack_local_entry *next;
2442 /* Structure describing stack frame layout.
2443 Stack grows downward:
2445 [arguments]
2446 <- ARG_POINTER
2447 saved pc
2449 saved static chain if ix86_static_chain_on_stack
2451 saved frame pointer if frame_pointer_needed
2452 <- HARD_FRAME_POINTER
2453 [saved regs]
2454 <- regs_save_offset
2455 [padding0]
2457 [saved SSE regs]
2458 <- sse_regs_save_offset
2459 [padding1] |
2460 | <- FRAME_POINTER
2461 [va_arg registers] |
2463 [frame] |
2465 [padding2] | = to_allocate
2466 <- STACK_POINTER
2468 struct ix86_frame
2470 int nsseregs;
2471 int nregs;
2472 int va_arg_size;
2473 int red_zone_size;
2474 int outgoing_arguments_size;
2476 /* The offsets relative to ARG_POINTER. */
2477 HOST_WIDE_INT frame_pointer_offset;
2478 HOST_WIDE_INT hard_frame_pointer_offset;
2479 HOST_WIDE_INT stack_pointer_offset;
2480 HOST_WIDE_INT hfp_save_offset;
2481 HOST_WIDE_INT reg_save_offset;
2482 HOST_WIDE_INT sse_reg_save_offset;
2484 /* When save_regs_using_mov is set, emit prologue using
2485 move instead of push instructions. */
2486 bool save_regs_using_mov;
2489 /* Which CPU we are scheduling for. */
2490 enum attr_cpu ix86_schedule;
2492 /* Which CPU we are optimizing for. */
2493 enum processor_type ix86_tune;
2495 /* Which instruction set architecture to use. */
2496 enum processor_type ix86_arch;
2498 /* True if processor has SSE prefetch instruction. */
2499 unsigned char x86_prefetch_sse;
2501 /* -mstackrealign option */
2502 static const char ix86_force_align_arg_pointer_string[]
2503 = "force_align_arg_pointer";
2505 static rtx (*ix86_gen_leave) (void);
2506 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2507 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2508 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2509 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2510 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2511 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2512 static rtx (*ix86_gen_clzero) (rtx);
2513 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2514 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2515 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2516 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2517 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2518 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2520 /* Preferred alignment for stack boundary in bits. */
2521 unsigned int ix86_preferred_stack_boundary;
2523 /* Alignment for incoming stack boundary in bits specified on the
2524 command line. */
2525 static unsigned int ix86_user_incoming_stack_boundary;
2527 /* Default alignment for incoming stack boundary in bits. */
2528 static unsigned int ix86_default_incoming_stack_boundary;
2530 /* Alignment for incoming stack boundary in bits. */
2531 unsigned int ix86_incoming_stack_boundary;
2533 /* Calling abi specific va_list type nodes. */
2534 static GTY(()) tree sysv_va_list_type_node;
2535 static GTY(()) tree ms_va_list_type_node;
2537 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2538 char internal_label_prefix[16];
2539 int internal_label_prefix_len;
2541 /* Fence to use after loop using movnt. */
2542 tree x86_mfence;
2544 /* Register class used for passing a given 64-bit part of the argument.
2545 These represent classes as documented by the psABI, with the exception
2546 of the SSESF and SSEDF classes, which are basically the SSE class, except
2547 that gcc will use SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2549 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2550 whenever possible (the upper half does contain padding). */
2551 enum x86_64_reg_class
2553 X86_64_NO_CLASS,
2554 X86_64_INTEGER_CLASS,
2555 X86_64_INTEGERSI_CLASS,
2556 X86_64_SSE_CLASS,
2557 X86_64_SSESF_CLASS,
2558 X86_64_SSEDF_CLASS,
2559 X86_64_SSEUP_CLASS,
2560 X86_64_X87_CLASS,
2561 X86_64_X87UP_CLASS,
2562 X86_64_COMPLEX_X87_CLASS,
2563 X86_64_MEMORY_CLASS
2566 #define MAX_CLASSES 8
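/* Editor's note: illustrative example, not part of GCC.  The classes above
   drive argument classification: each 64-bit chunk ("eightbyte") of an
   argument is assigned a class, and the class decides whether that chunk is
   passed in a general-purpose register, an SSE register, on the x87 stack,
   or in memory.  Following the psABI (editor's reading, simplified):

     struct s { double d; long l; };   (16 bytes, two eightbytes)

   classifies roughly as { SSEDF, INTEGER }: the double travels in an XMM
   register, using a DFmode move (the point of the SSEDF refinement), and
   the long in a general-purpose register.  A MEMORY class instead forces
   the whole argument onto the stack.  */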
2568 /* Table of constants used by fldpi, fldln2, etc.... */
2569 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2570 static bool ext_80387_constants_init = 0;
2573 static struct machine_function * ix86_init_machine_status (void);
2574 static rtx ix86_function_value (const_tree, const_tree, bool);
2575 static bool ix86_function_value_regno_p (const unsigned int);
2576 static unsigned int ix86_function_arg_boundary (machine_mode,
2577 const_tree);
2578 static rtx ix86_static_chain (const_tree, bool);
2579 static int ix86_function_regparm (const_tree, const_tree);
2580 static void ix86_compute_frame_layout (struct ix86_frame *);
2581 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2582 rtx, rtx, int);
2583 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
2584 static tree ix86_canonical_va_list_type (tree);
2585 static void predict_jump (int);
2586 static unsigned int split_stack_prologue_scratch_regno (void);
2587 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2589 enum ix86_function_specific_strings
2591 IX86_FUNCTION_SPECIFIC_ARCH,
2592 IX86_FUNCTION_SPECIFIC_TUNE,
2593 IX86_FUNCTION_SPECIFIC_MAX
2596 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
2597 const char *, const char *, enum fpmath_unit,
2598 bool);
2599 static void ix86_function_specific_save (struct cl_target_option *,
2600 struct gcc_options *opts);
2601 static void ix86_function_specific_restore (struct gcc_options *opts,
2602 struct cl_target_option *);
2603 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2604 static void ix86_function_specific_print (FILE *, int,
2605 struct cl_target_option *);
2606 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2607 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2608 struct gcc_options *,
2609 struct gcc_options *,
2610 struct gcc_options *);
2611 static bool ix86_can_inline_p (tree, tree);
2612 static void ix86_set_current_function (tree);
2613 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2615 static enum calling_abi ix86_function_abi (const_tree);
2618 #ifndef SUBTARGET32_DEFAULT_CPU
2619 #define SUBTARGET32_DEFAULT_CPU "i386"
2620 #endif
2622 /* Whether -mtune= or -march= were specified. */
2623 static int ix86_tune_defaulted;
2624 static int ix86_arch_specified;
2626 /* Vectorization library interface and handlers. */
2627 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2629 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2630 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2632 /* Processor target table, indexed by processor number. */
2633 struct ptt
2635 const char *const name; /* processor name */
2636 const struct processor_costs *cost; /* Processor costs */
2637 const int align_loop; /* Default alignments. */
2638 const int align_loop_max_skip;
2639 const int align_jump;
2640 const int align_jump_max_skip;
2641 const int align_func;
2644 /* This table must be in sync with enum processor_type in i386.h. */
2645 static const struct ptt processor_target_table[PROCESSOR_max] =
2647 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2648 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2649 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2650 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2651 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2652 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2653 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2654 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2655 {"core2", &core_cost, 16, 10, 16, 10, 16},
2656 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2657 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2658 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2659 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2660 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2661 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2662 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2663 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2664 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2665 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2666 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2667 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2668 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2669 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2670 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2671 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2672 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2673 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2674 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2675 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
2678 static unsigned int
2679 rest_of_handle_insert_vzeroupper (void)
2681 int i;
2683 /* vzeroupper instructions are inserted immediately after reload to
2684 account for possible spills from 256-bit registers. The pass
2685 reuses mode switching infrastructure by re-running mode insertion
2686 pass, so disable entities that have already been processed. */
2687 for (i = 0; i < MAX_386_ENTITIES; i++)
2688 ix86_optimize_mode_switching[i] = 0;
2690 ix86_optimize_mode_switching[AVX_U128] = 1;
2692 /* Call optimize_mode_switching. */
2693 g->get_passes ()->execute_pass_mode_switching ();
2694 return 0;
2697 /* Return 1 if INSN uses or defines a hard register.
2698 Hard register uses in a memory address are ignored.
2699 Clobbers and flags definitions are ignored. */
2701 static bool
2702 has_non_address_hard_reg (rtx_insn *insn)
2704 df_ref ref;
2705 FOR_EACH_INSN_DEF (ref, insn)
2706 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2707 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2708 && DF_REF_REGNO (ref) != FLAGS_REG)
2709 return true;
2711 FOR_EACH_INSN_USE (ref, insn)
2712 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2713 return true;
2715 return false;
2718 /* Check if comparison INSN may be transformed
2719 into a vector comparison. Currently we transform
2720 only zero checks, which look like:
2722 (set (reg:CCZ 17 flags)
2723 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2724 (subreg:SI (reg:DI x) 0))
2725 (const_int 0 [0]))) */
2727 static bool
2728 convertible_comparison_p (rtx_insn *insn)
2730 if (!TARGET_SSE4_1)
2731 return false;
2733 rtx def_set = single_set (insn);
2735 gcc_assert (def_set);
2737 rtx src = SET_SRC (def_set);
2738 rtx dst = SET_DEST (def_set);
2740 gcc_assert (GET_CODE (src) == COMPARE);
2742 if (GET_CODE (dst) != REG
2743 || REGNO (dst) != FLAGS_REG
2744 || GET_MODE (dst) != CCZmode)
2745 return false;
2747 rtx op1 = XEXP (src, 0);
2748 rtx op2 = XEXP (src, 1);
2750 if (op2 != CONST0_RTX (GET_MODE (op2)))
2751 return false;
2753 if (GET_CODE (op1) != IOR)
2754 return false;
2756 op2 = XEXP (op1, 1);
2757 op1 = XEXP (op1, 0);
2759 if (!SUBREG_P (op1)
2760 || !SUBREG_P (op2)
2761 || GET_MODE (op1) != SImode
2762 || GET_MODE (op2) != SImode
2763 || ((SUBREG_BYTE (op1) != 0
2764 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
2765 && (SUBREG_BYTE (op2) != 0
2766 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
2767 return false;
2769 op1 = SUBREG_REG (op1);
2770 op2 = SUBREG_REG (op2);
2772 if (op1 != op2
2773 || !REG_P (op1)
2774 || GET_MODE (op1) != DImode)
2775 return false;
2777 return true;
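/* Editor's note: illustrative example, not part of GCC.  On a 32-bit target
   a DImode equality test against zero, e.g.

     long long x;
     ...
     if (x == 0)
       ...

   is expanded as an IOR of the two SImode halves compared against zero,
   which is exactly the shape matched above.  Recognizing it lets the STV
   pass keep the 64-bit value in an SSE register and test it there with
   SSE4.1 instructions, hence the TARGET_SSE4_1 guard.  */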
2780 /* The DImode version of scalar_to_vector_candidate_p. */
2782 static bool
2783 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
2785 rtx def_set = single_set (insn);
2787 if (!def_set)
2788 return false;
2790 if (has_non_address_hard_reg (insn))
2791 return false;
2793 rtx src = SET_SRC (def_set);
2794 rtx dst = SET_DEST (def_set);
2796 if (GET_CODE (src) == COMPARE)
2797 return convertible_comparison_p (insn);
2799 /* We are interested in DImode promotion only. */
2800 if ((GET_MODE (src) != DImode
2801 && !CONST_INT_P (src))
2802 || GET_MODE (dst) != DImode)
2803 return false;
2805 if (!REG_P (dst) && !MEM_P (dst))
2806 return false;
2808 switch (GET_CODE (src))
2810 case ASHIFT:
2811 case LSHIFTRT:
2812 /* FIXME: consider also variable shifts. */
2813 if (!CONST_INT_P (XEXP (src, 1))
2814 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63))
2815 return false;
2816 break;
2818 case PLUS:
2819 case MINUS:
2820 case IOR:
2821 case XOR:
2822 case AND:
2823 if (!REG_P (XEXP (src, 1))
2824 && !MEM_P (XEXP (src, 1))
2825 && !CONST_INT_P (XEXP (src, 1)))
2826 return false;
2827 break;
2829 case NEG:
2830 case NOT:
2831 break;
2833 case REG:
2834 return true;
2836 case MEM:
2837 case CONST_INT:
2838 return REG_P (dst);
2840 default:
2841 return false;
2844 if (!REG_P (XEXP (src, 0))
2845 && !MEM_P (XEXP (src, 0))
2846 && !CONST_INT_P (XEXP (src, 0))
2847 /* Check for andnot case. */
2848 && (GET_CODE (src) != AND
2849 || GET_CODE (XEXP (src, 0)) != NOT
2850 || !REG_P (XEXP (XEXP (src, 0), 0))))
2851 return false;
2853 if ((GET_MODE (XEXP (src, 0)) != DImode
2854 && !CONST_INT_P (XEXP (src, 0)))
2855 || (GET_CODE (src) != NEG
2856 && GET_CODE (src) != NOT
2857 && GET_MODE (XEXP (src, 1)) != DImode
2858 && !CONST_INT_P (XEXP (src, 1))))
2859 return false;
2861 return true;
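/* Editor's note: illustrative example, not part of GCC.  A typical DImode
   candidate on a 32-bit target is a simple logical or arithmetic operation
   on 64-bit integers, e.g.

     unsigned long long a, b;
     ...
     a &= b;

   which would otherwise be split into a pair of 32-bit instructions.  When
   all defs and uses of the registers involved are convertible, the STV pass
   can perform the operation as a single 64-bit SSE operation instead.  */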
2864 /* The TImode version of scalar_to_vector_candidate_p. */
2866 static bool
2867 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
2869 rtx def_set = single_set (insn);
2871 if (!def_set)
2872 return false;
2874 if (has_non_address_hard_reg (insn))
2875 return false;
2877 rtx src = SET_SRC (def_set);
2878 rtx dst = SET_DEST (def_set);
2880 /* Only TImode loads and stores are allowed. */
2881 if (GET_MODE (dst) != TImode)
2882 return false;
2884 if (MEM_P (dst))
2886 /* Check for a store. Memory must be aligned, or the unaligned store
2887 must be optimal. Only support stores from a register, a standard SSE
2888 constant, or a CONST_WIDE_INT generated from a piecewise store.
2890 ??? Verify performance impact before enabling CONST_INT for
2891 __int128 stores. */
2892 if (misaligned_operand (dst, TImode)
2893 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
2894 return false;
2896 switch (GET_CODE (src))
2898 default:
2899 return false;
2901 case REG:
2902 case CONST_WIDE_INT:
2903 return true;
2905 case CONST_INT:
2906 return standard_sse_constant_p (src, TImode);
2909 else if (MEM_P (src))
2911 /* Check for a load. Memory must be aligned, or the unaligned load
2912 must be optimal. */
2913 return (REG_P (dst)
2914 && (!misaligned_operand (src, TImode)
2915 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
2918 return false;
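/* Editor's note: illustrative example, not part of GCC.  On 64-bit targets
   the TImode variant looks only for plain __int128 loads and stores, e.g.

     __int128 *p, *q;
     ...
     *p = *q;

   which go through a TImode pseudo.  Converting such moves lets them be
   done as one 16-byte SSE load/store (an aligned move, or an unaligned one
   where that is known to be cheap) instead of two 8-byte integer moves.  */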
2921 /* Return true if INSN may be converted into a vector
2922 instruction. */
2924 static bool
2925 scalar_to_vector_candidate_p (rtx_insn *insn)
2927 if (TARGET_64BIT)
2928 return timode_scalar_to_vector_candidate_p (insn);
2929 else
2930 return dimode_scalar_to_vector_candidate_p (insn);
2933 /* The DImode version of remove_non_convertible_regs. */
2935 static void
2936 dimode_remove_non_convertible_regs (bitmap candidates)
2938 bitmap_iterator bi;
2939 unsigned id;
2940 bitmap regs = BITMAP_ALLOC (NULL);
2942 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
2944 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
2945 rtx reg = SET_DEST (def_set);
2947 if (!REG_P (reg)
2948 || bitmap_bit_p (regs, REGNO (reg))
2949 || HARD_REGISTER_P (reg))
2950 continue;
2952 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
2953 def;
2954 def = DF_REF_NEXT_REG (def))
2956 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2958 if (dump_file)
2959 fprintf (dump_file,
2960 "r%d has non convertible definition in insn %d\n",
2961 REGNO (reg), DF_REF_INSN_UID (def));
2963 bitmap_set_bit (regs, REGNO (reg));
2964 break;
2969 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
2971 for (df_ref def = DF_REG_DEF_CHAIN (id);
2972 def;
2973 def = DF_REF_NEXT_REG (def))
2974 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2976 if (dump_file)
2977 fprintf (dump_file, "Removing insn %d from candidates list\n",
2978 DF_REF_INSN_UID (def));
2980 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
2984 BITMAP_FREE (regs);
2987 /* For a register REGNO, scan instructions for its defs and uses.
2988 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
2990 static void
2991 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2992 unsigned int regno)
2994 for (df_ref def = DF_REG_DEF_CHAIN (regno);
2995 def;
2996 def = DF_REF_NEXT_REG (def))
2998 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3000 if (dump_file)
3001 fprintf (dump_file,
3002 "r%d has non convertible def in insn %d\n",
3003 regno, DF_REF_INSN_UID (def));
3005 bitmap_set_bit (regs, regno);
3006 break;
3010 for (df_ref ref = DF_REG_USE_CHAIN (regno);
3011 ref;
3012 ref = DF_REF_NEXT_REG (ref))
3014 /* Debug instructions are skipped. */
3015 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
3016 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3018 if (dump_file)
3019 fprintf (dump_file,
3020 "r%d has non convertible use in insn %d\n",
3021 regno, DF_REF_INSN_UID (ref));
3023 bitmap_set_bit (regs, regno);
3024 break;
3029 /* The TImode version of remove_non_convertible_regs. */
3031 static void
3032 timode_remove_non_convertible_regs (bitmap candidates)
3034 bitmap_iterator bi;
3035 unsigned id;
3036 bitmap regs = BITMAP_ALLOC (NULL);
3038 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3040 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3041 rtx dest = SET_DEST (def_set);
3042 rtx src = SET_SRC (def_set);
3044 if ((!REG_P (dest)
3045 || bitmap_bit_p (regs, REGNO (dest))
3046 || HARD_REGISTER_P (dest))
3047 && (!REG_P (src)
3048 || bitmap_bit_p (regs, REGNO (src))
3049 || HARD_REGISTER_P (src)))
3050 continue;
3052 if (REG_P (dest))
3053 timode_check_non_convertible_regs (candidates, regs,
3054 REGNO (dest));
3056 if (REG_P (src))
3057 timode_check_non_convertible_regs (candidates, regs,
3058 REGNO (src));
3061 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3063 for (df_ref def = DF_REG_DEF_CHAIN (id);
3064 def;
3065 def = DF_REF_NEXT_REG (def))
3066 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3068 if (dump_file)
3069 fprintf (dump_file, "Removing insn %d from candidates list\n",
3070 DF_REF_INSN_UID (def));
3072 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3075 for (df_ref ref = DF_REG_USE_CHAIN (id);
3076 ref;
3077 ref = DF_REF_NEXT_REG (ref))
3078 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3080 if (dump_file)
3081 fprintf (dump_file, "Removing insn %d from candidates list\n",
3082 DF_REF_INSN_UID (ref));
3084 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3088 BITMAP_FREE (regs);
3091 /* For a given bitmap of insn UIDs, scan all instructions and
3092 remove an insn from CANDIDATES if it has both convertible
3093 and non-convertible definitions.
3095 All insns in a bitmap are conversion candidates according to
3096 scalar_to_vector_candidate_p. Currently it implies all insns
3097 are single_set. */
3099 static void
3100 remove_non_convertible_regs (bitmap candidates)
3102 if (TARGET_64BIT)
3103 timode_remove_non_convertible_regs (candidates);
3104 else
3105 dimode_remove_non_convertible_regs (candidates);
3108 class scalar_chain
3110 public:
3111 scalar_chain ();
3112 virtual ~scalar_chain ();
3114 static unsigned max_id;
3116 /* ID of a chain. */
3117 unsigned int chain_id;
3118 /* A queue of instructions to be included into a chain. */
3119 bitmap queue;
3120 /* Instructions included into a chain. */
3121 bitmap insns;
3122 /* All registers defined by a chain. */
3123 bitmap defs;
3124 /* Registers used in both vector and scalar modes. */
3125 bitmap defs_conv;
3127 void build (bitmap candidates, unsigned insn_uid);
3128 virtual int compute_convert_gain () = 0;
3129 int convert ();
3131 protected:
3132 void add_to_queue (unsigned insn_uid);
3133 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3135 private:
3136 void add_insn (bitmap candidates, unsigned insn_uid);
3137 void analyze_register_chain (bitmap candidates, df_ref ref);
3138 virtual void mark_dual_mode_def (df_ref def) = 0;
3139 virtual void convert_insn (rtx_insn *insn) = 0;
3140 virtual void convert_registers () = 0;
3143 class dimode_scalar_chain : public scalar_chain
3145 public:
3146 int compute_convert_gain ();
3147 private:
3148 void mark_dual_mode_def (df_ref def);
3149 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3150 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3151 void convert_insn (rtx_insn *insn);
3152 void convert_op (rtx *op, rtx_insn *insn);
3153 void convert_reg (unsigned regno);
3154 void make_vector_copies (unsigned regno);
3155 void convert_registers ();
3156 int vector_const_cost (rtx exp);
3159 class timode_scalar_chain : public scalar_chain
3161 public:
3162 /* Conversion from TImode to V1TImode is always faster. */
3163 int compute_convert_gain () { return 1; }
3165 private:
3166 void mark_dual_mode_def (df_ref def);
3167 void fix_debug_reg_uses (rtx reg);
3168 void convert_insn (rtx_insn *insn);
3169 /* We don't convert registers to a different size. */
3170 void convert_registers () {}
3173 unsigned scalar_chain::max_id = 0;
3175 /* Initialize new chain. */
3177 scalar_chain::scalar_chain ()
3179 chain_id = ++max_id;
3181 if (dump_file)
3182 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3184 bitmap_obstack_initialize (NULL);
3185 insns = BITMAP_ALLOC (NULL);
3186 defs = BITMAP_ALLOC (NULL);
3187 defs_conv = BITMAP_ALLOC (NULL);
3188 queue = NULL;
3191 /* Free chain's data. */
3193 scalar_chain::~scalar_chain ()
3195 BITMAP_FREE (insns);
3196 BITMAP_FREE (defs);
3197 BITMAP_FREE (defs_conv);
3198 bitmap_obstack_release (NULL);
3201 /* Add an instruction to the chain's queue. */
3203 void
3204 scalar_chain::add_to_queue (unsigned insn_uid)
3206 if (bitmap_bit_p (insns, insn_uid)
3207 || bitmap_bit_p (queue, insn_uid))
3208 return;
3210 if (dump_file)
3211 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3212 insn_uid, chain_id);
3213 bitmap_set_bit (queue, insn_uid);
3216 /* For DImode conversion, mark register defined by DEF as requiring
3217 conversion. */
3219 void
3220 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3222 gcc_assert (DF_REF_REG_DEF_P (def));
3224 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3225 return;
3227 if (dump_file)
3228 fprintf (dump_file,
3229 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3230 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3232 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3235 /* For TImode conversion, it is unused. */
3237 void
3238 timode_scalar_chain::mark_dual_mode_def (df_ref)
3240 gcc_unreachable ();
3243 /* Check REF's chain to add new insns into a queue
3244 and find registers requiring conversion. */
3246 void
3247 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3249 df_link *chain;
3251 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3252 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3253 add_to_queue (DF_REF_INSN_UID (ref));
3255 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3257 unsigned uid = DF_REF_INSN_UID (chain->ref);
3259 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3260 continue;
3262 if (!DF_REF_REG_MEM_P (chain->ref))
3264 if (bitmap_bit_p (insns, uid))
3265 continue;
3267 if (bitmap_bit_p (candidates, uid))
3269 add_to_queue (uid);
3270 continue;
3274 if (DF_REF_REG_DEF_P (chain->ref))
3276 if (dump_file)
3277 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3278 DF_REF_REGNO (chain->ref), uid);
3279 mark_dual_mode_def (chain->ref);
3281 else
3283 if (dump_file)
3284 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3285 DF_REF_REGNO (chain->ref), uid);
3286 mark_dual_mode_def (ref);
3291 /* Add instruction into a chain. */
3293 void
3294 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3296 if (bitmap_bit_p (insns, insn_uid))
3297 return;
3299 if (dump_file)
3300 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3302 bitmap_set_bit (insns, insn_uid);
3304 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3305 rtx def_set = single_set (insn);
3306 if (def_set && REG_P (SET_DEST (def_set))
3307 && !HARD_REGISTER_P (SET_DEST (def_set)))
3308 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3310 df_ref ref;
3311 df_ref def;
3312 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3313 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3314 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3315 def;
3316 def = DF_REF_NEXT_REG (def))
3317 analyze_register_chain (candidates, def);
3318 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3319 if (!DF_REF_REG_MEM_P (ref))
3320 analyze_register_chain (candidates, ref);
3323 /* Build new chain starting from insn INSN_UID recursively
3324 adding all dependent uses and definitions. */
3326 void
3327 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3329 queue = BITMAP_ALLOC (NULL);
3330 bitmap_set_bit (queue, insn_uid);
3332 if (dump_file)
3333 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3335 while (!bitmap_empty_p (queue))
3337 insn_uid = bitmap_first_set_bit (queue);
3338 bitmap_clear_bit (queue, insn_uid);
3339 bitmap_clear_bit (candidates, insn_uid);
3340 add_insn (candidates, insn_uid);
3343 if (dump_file)
3345 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3346 fprintf (dump_file, " insns: ");
3347 dump_bitmap (dump_file, insns);
3348 if (!bitmap_empty_p (defs_conv))
3350 bitmap_iterator bi;
3351 unsigned id;
3352 const char *comma = "";
3353 fprintf (dump_file, " defs to convert: ");
3354 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3356 fprintf (dump_file, "%sr%d", comma, id);
3357 comma = ", ";
3359 fprintf (dump_file, "\n");
3363 BITMAP_FREE (queue);
3366 /* Return the cost of building a vector constant
3367 instead of using a scalar one. */
3370 dimode_scalar_chain::vector_const_cost (rtx exp)
3372 gcc_assert (CONST_INT_P (exp));
3374 if (standard_sse_constant_p (exp, V2DImode))
3375 return COSTS_N_INSNS (1);
3376 return ix86_cost->sse_load[1];
3379 /* Compute a gain for chain conversion. */
3382 dimode_scalar_chain::compute_convert_gain ()
3384 bitmap_iterator bi;
3385 unsigned insn_uid;
3386 int gain = 0;
3387 int cost = 0;
3389 if (dump_file)
3390 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3392 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3394 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3395 rtx def_set = single_set (insn);
3396 rtx src = SET_SRC (def_set);
3397 rtx dst = SET_DEST (def_set);
3399 if (REG_P (src) && REG_P (dst))
3400 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3401 else if (REG_P (src) && MEM_P (dst))
3402 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3403 else if (MEM_P (src) && REG_P (dst))
3404 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3405 else if (GET_CODE (src) == ASHIFT
3406 || GET_CODE (src) == LSHIFTRT)
3408 gain += ix86_cost->add;
3409 if (CONST_INT_P (XEXP (src, 0)))
3410 gain -= vector_const_cost (XEXP (src, 0));
3411 if (CONST_INT_P (XEXP (src, 1))
3412 && INTVAL (XEXP (src, 1)) >= 32)
3413 gain -= COSTS_N_INSNS (1);
3415 else if (GET_CODE (src) == PLUS
3416 || GET_CODE (src) == MINUS
3417 || GET_CODE (src) == IOR
3418 || GET_CODE (src) == XOR
3419 || GET_CODE (src) == AND)
3421 gain += ix86_cost->add;
3422 /* Additional gain for andnot for targets without BMI. */
3423 if (GET_CODE (XEXP (src, 0)) == NOT
3424 && !TARGET_BMI)
3425 gain += 2 * ix86_cost->add;
3427 if (CONST_INT_P (XEXP (src, 0)))
3428 gain -= vector_const_cost (XEXP (src, 0));
3429 if (CONST_INT_P (XEXP (src, 1)))
3430 gain -= vector_const_cost (XEXP (src, 1));
3432 else if (GET_CODE (src) == NEG
3433 || GET_CODE (src) == NOT)
3434 gain += ix86_cost->add - COSTS_N_INSNS (1);
3435 else if (GET_CODE (src) == COMPARE)
3437 /* Assume comparison cost is the same. */
3439 else if (CONST_INT_P (src))
3441 if (REG_P (dst))
3442 gain += COSTS_N_INSNS (2);
3443 else if (MEM_P (dst))
3444 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3445 gain -= vector_const_cost (src);
3447 else
3448 gcc_unreachable ();
3451 if (dump_file)
3452 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3454 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3455 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3457 if (dump_file)
3458 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3460 gain -= cost;
3462 if (dump_file)
3463 fprintf (dump_file, " Total gain: %d\n", gain);
3465 return gain;
3468 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3471 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3473 if (x == reg)
3474 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3476 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3477 int i, j;
3478 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3480 if (fmt[i] == 'e')
3481 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3482 else if (fmt[i] == 'E')
3483 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3484 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3485 reg, new_reg);
3488 return x;
3491 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3493 void
3494 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3495 rtx reg, rtx new_reg)
3497 replace_with_subreg (single_set (insn), reg, new_reg);
3500 /* Insert generated conversion instruction sequence INSNS
3501 after instruction AFTER. A new BB may be required if the
3502 instruction has an EH region attached. */
3504 void
3505 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3507 if (!control_flow_insn_p (after))
3509 emit_insn_after (insns, after);
3510 return;
3513 basic_block bb = BLOCK_FOR_INSN (after);
3514 edge e = find_fallthru_edge (bb->succs);
3515 gcc_assert (e);
3517 basic_block new_bb = split_edge (e);
3518 emit_insn_after (insns, BB_HEAD (new_bb));
3521 /* Make vector copies for all register REGNO definitions
3522 and replace its uses in the chain. */
3524 void
3525 dimode_scalar_chain::make_vector_copies (unsigned regno)
3527 rtx reg = regno_reg_rtx[regno];
3528 rtx vreg = gen_reg_rtx (DImode);
3529 df_ref ref;
3531 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3532 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3534 rtx_insn *insn = DF_REF_INSN (ref);
3536 start_sequence ();
3537 if (TARGET_SSE4_1)
3539 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3540 CONST0_RTX (V4SImode),
3541 gen_rtx_SUBREG (SImode, reg, 0)));
3542 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3543 gen_rtx_SUBREG (V4SImode, vreg, 0),
3544 gen_rtx_SUBREG (SImode, reg, 4),
3545 GEN_INT (2)));
3547 else if (TARGET_INTER_UNIT_MOVES_TO_VEC)
3549 rtx tmp = gen_reg_rtx (DImode);
3550 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3551 CONST0_RTX (V4SImode),
3552 gen_rtx_SUBREG (SImode, reg, 0)));
3553 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3554 CONST0_RTX (V4SImode),
3555 gen_rtx_SUBREG (SImode, reg, 4)));
3556 emit_insn (gen_vec_interleave_lowv4si
3557 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3558 gen_rtx_SUBREG (V4SImode, vreg, 0),
3559 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3561 else
3563 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3564 emit_move_insn (adjust_address (tmp, SImode, 0),
3565 gen_rtx_SUBREG (SImode, reg, 0));
3566 emit_move_insn (adjust_address (tmp, SImode, 4),
3567 gen_rtx_SUBREG (SImode, reg, 4));
3568 emit_move_insn (vreg, tmp);
3570 rtx_insn *seq = get_insns ();
3571 end_sequence ();
3572 emit_conversion_insns (seq, insn);
3574 if (dump_file)
3575 fprintf (dump_file,
3576 " Copied r%d to a vector register r%d for insn %d\n",
3577 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3580 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3581 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3583 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, vreg);
3585 if (dump_file)
3586 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3587 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3591 /* Convert all definitions of register REGNO
3592 and fix its uses. Scalar copies may be created
3593 if the register is used in a non-convertible insn. */
3595 void
3596 dimode_scalar_chain::convert_reg (unsigned regno)
3598 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3599 rtx reg = regno_reg_rtx[regno];
3600 rtx scopy = NULL_RTX;
3601 df_ref ref;
3602 bitmap conv;
3604 conv = BITMAP_ALLOC (NULL);
3605 bitmap_copy (conv, insns);
3607 if (scalar_copy)
3608 scopy = gen_reg_rtx (DImode);
3610 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3612 rtx_insn *insn = DF_REF_INSN (ref);
3613 rtx def_set = single_set (insn);
3614 rtx src = SET_SRC (def_set);
3615 rtx reg = DF_REF_REG (ref);
3617 if (!MEM_P (src))
3619 replace_with_subreg_in_insn (insn, reg, reg);
3620 bitmap_clear_bit (conv, INSN_UID (insn));
3623 if (scalar_copy)
3625 start_sequence ();
3626 if (TARGET_SSE4_1)
3628 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
3629 emit_insn
3630 (gen_rtx_SET
3631 (gen_rtx_SUBREG (SImode, scopy, 0),
3632 gen_rtx_VEC_SELECT (SImode,
3633 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3635 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
3636 emit_insn
3637 (gen_rtx_SET
3638 (gen_rtx_SUBREG (SImode, scopy, 4),
3639 gen_rtx_VEC_SELECT (SImode,
3640 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3642 else if (TARGET_INTER_UNIT_MOVES_FROM_VEC)
3644 rtx vcopy = gen_reg_rtx (V2DImode);
3645 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3646 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3647 gen_rtx_SUBREG (SImode, vcopy, 0));
3648 emit_move_insn (vcopy,
3649 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3650 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3651 gen_rtx_SUBREG (SImode, vcopy, 0));
3653 else
3655 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3656 emit_move_insn (tmp, reg);
3657 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3658 adjust_address (tmp, SImode, 0));
3659 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3660 adjust_address (tmp, SImode, 4));
3662 rtx_insn *seq = get_insns ();
3663 end_sequence ();
3664 emit_conversion_insns (seq, insn);
3666 if (dump_file)
3667 fprintf (dump_file,
3668 " Copied r%d to a scalar register r%d for insn %d\n",
3669 regno, REGNO (scopy), INSN_UID (insn));
3673 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3674 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3676 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3678 rtx def_set = single_set (DF_REF_INSN (ref));
3679 if (!MEM_P (SET_DEST (def_set))
3680 || !REG_P (SET_SRC (def_set)))
3681 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, reg);
3682 bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
3685 /* Skip debug insns and uninitialized uses. */
3686 else if (DF_REF_CHAIN (ref)
3687 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
3689 gcc_assert (scopy);
3690 replace_rtx (DF_REF_INSN (ref), reg, scopy);
3691 df_insn_rescan (DF_REF_INSN (ref));
3694 BITMAP_FREE (conv);
3697 /* Convert operand OP in INSN. We should handle
3698 memory operands and uninitialized registers.
3699 All other register uses are converted during
3700 register conversion. */
3702 void
3703 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
3705 *op = copy_rtx_if_shared (*op);
3707 if (GET_CODE (*op) == NOT)
3709 convert_op (&XEXP (*op, 0), insn);
3710 PUT_MODE (*op, V2DImode);
3712 else if (MEM_P (*op))
3714 rtx tmp = gen_reg_rtx (DImode);
3716 emit_insn_before (gen_move_insn (tmp, *op), insn);
3717 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
3719 if (dump_file)
3720 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
3721 INSN_UID (insn), REGNO (tmp));
3723 else if (REG_P (*op))
3725 /* We may not have converted the register use if the
3726 register has no definition. Otherwise it should
3727 have been converted in convert_reg. */
3728 df_ref ref;
3729 FOR_EACH_INSN_USE (ref, insn)
3730 if (DF_REF_REGNO (ref) == REGNO (*op))
3732 gcc_assert (!DF_REF_CHAIN (ref));
3733 break;
3735 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
3737 else if (CONST_INT_P (*op))
3739 rtx vec_cst;
3740 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
3742 /* Prefer all ones vector in case of -1. */
3743 if (constm1_operand (*op, GET_MODE (*op)))
3744 vec_cst = CONSTM1_RTX (V2DImode);
3745 else
3746 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
3747 gen_rtvec (2, *op, const0_rtx));
3749 if (!standard_sse_constant_p (vec_cst, V2DImode))
3751 start_sequence ();
3752 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
3753 rtx_insn *seq = get_insns ();
3754 end_sequence ();
3755 emit_insn_before (seq, insn);
3758 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
3759 *op = tmp;
3761 else
3763 gcc_assert (SUBREG_P (*op));
3764 gcc_assert (GET_MODE (*op) == V2DImode);
3768 /* Convert INSN to vector mode. */
3770 void
3771 dimode_scalar_chain::convert_insn (rtx_insn *insn)
3773 rtx def_set = single_set (insn);
3774 rtx src = SET_SRC (def_set);
3775 rtx dst = SET_DEST (def_set);
3776 rtx subreg;
3778 if (MEM_P (dst) && !REG_P (src))
3780 /* Vector instructions cannot store a non-register source directly
3781 to memory, so compute the value into a temporary register first. */
3782 rtx tmp = gen_reg_rtx (DImode);
3783 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
3784 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
3787 switch (GET_CODE (src))
3789 case ASHIFT:
3790 case LSHIFTRT:
3791 convert_op (&XEXP (src, 0), insn);
3792 PUT_MODE (src, V2DImode);
3793 break;
3795 case PLUS:
3796 case MINUS:
3797 case IOR:
3798 case XOR:
3799 case AND:
3800 convert_op (&XEXP (src, 0), insn);
3801 convert_op (&XEXP (src, 1), insn);
3802 PUT_MODE (src, V2DImode);
3803 break;
3805 case NEG:
3806 src = XEXP (src, 0);
3807 convert_op (&src, insn);
3808 subreg = gen_reg_rtx (V2DImode);
3809 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
3810 src = gen_rtx_MINUS (V2DImode, subreg, src);
3811 break;
3813 case NOT:
3814 src = XEXP (src, 0);
3815 convert_op (&src, insn);
3816 subreg = gen_reg_rtx (V2DImode);
3817 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
3818 src = gen_rtx_XOR (V2DImode, src, subreg);
3819 break;
3821 case MEM:
3822 if (!REG_P (dst))
3823 convert_op (&src, insn);
3824 break;
3826 case REG:
3827 if (!MEM_P (dst))
3828 convert_op (&src, insn);
3829 break;
3831 case SUBREG:
3832 gcc_assert (GET_MODE (src) == V2DImode);
3833 break;
3835 case COMPARE:
3836 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
3838 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
3839 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
3841 if (REG_P (src))
3842 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
3843 else
3844 subreg = copy_rtx_if_shared (src);
3845 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
3846 copy_rtx_if_shared (subreg),
3847 copy_rtx_if_shared (subreg)),
3848 insn);
3849 dst = gen_rtx_REG (CCmode, FLAGS_REG);
3850 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
3851 copy_rtx_if_shared (src)),
3852 UNSPEC_PTEST);
3853 break;
3855 case CONST_INT:
3856 convert_op (&src, insn);
3857 break;
3859 default:
3860 gcc_unreachable ();
3863 SET_SRC (def_set) = src;
3864 SET_DEST (def_set) = dst;
3866 /* Drop possible dead definitions. */
3867 PATTERN (insn) = def_set;
3869 INSN_CODE (insn) = -1;
3870 recog_memoized (insn);
3871 df_insn_rescan (insn);
3874 /* Fix uses of converted REG in debug insns. */
3876 void
3877 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
3879 if (!flag_var_tracking)
3880 return;
3882 df_ref ref, next;
3883 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
3885 rtx_insn *insn = DF_REF_INSN (ref);
3886 /* Make sure the next ref is for a different instruction,
3887 so that we're not affected by the rescan. */
3888 next = DF_REF_NEXT_REG (ref);
3889 while (next && DF_REF_INSN (next) == insn)
3890 next = DF_REF_NEXT_REG (next);
3892 if (DEBUG_INSN_P (insn))
3894 /* It may be a debug insn with a TImode variable in
3895 register. */
3896 bool changed = false;
3897 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
3899 rtx *loc = DF_REF_LOC (ref);
3900 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
3902 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
3903 changed = true;
3906 if (changed)
3907 df_insn_rescan (insn);
3912 /* Convert INSN from TImode to V1TImode. */
3914 void
3915 timode_scalar_chain::convert_insn (rtx_insn *insn)
3917 rtx def_set = single_set (insn);
3918 rtx src = SET_SRC (def_set);
3919 rtx dst = SET_DEST (def_set);
3921 switch (GET_CODE (dst))
3923 case REG:
3925 rtx tmp = find_reg_equal_equiv_note (insn);
3926 if (tmp)
3927 PUT_MODE (XEXP (tmp, 0), V1TImode);
3928 PUT_MODE (dst, V1TImode);
3929 fix_debug_reg_uses (dst);
3931 break;
3932 case MEM:
3933 PUT_MODE (dst, V1TImode);
3934 break;
3936 default:
3937 gcc_unreachable ();
3940 switch (GET_CODE (src))
3942 case REG:
3943 PUT_MODE (src, V1TImode);
3944 /* Call fix_debug_reg_uses only if SRC is never defined. */
3945 if (!DF_REG_DEF_CHAIN (REGNO (src)))
3946 fix_debug_reg_uses (src);
3947 break;
3949 case MEM:
3950 PUT_MODE (src, V1TImode);
3951 break;
3953 case CONST_WIDE_INT:
3954 if (NONDEBUG_INSN_P (insn))
3956 /* Since there is no instruction to store a 128-bit constant,
3957 a temporary register is required. */
3958 rtx tmp = gen_reg_rtx (V1TImode);
3959 start_sequence ();
3960 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
3961 src = validize_mem (force_const_mem (V1TImode, src));
3962 rtx_insn *seq = get_insns ();
3963 end_sequence ();
3964 if (seq)
3965 emit_insn_before (seq, insn);
3966 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3967 dst = tmp;
3969 break;
3971 case CONST_INT:
3972 switch (standard_sse_constant_p (src, TImode))
3974 case 1:
3975 src = CONST0_RTX (GET_MODE (dst));
3976 break;
3977 case 2:
3978 src = CONSTM1_RTX (GET_MODE (dst));
3979 break;
3980 default:
3981 gcc_unreachable ();
3983 if (NONDEBUG_INSN_P (insn))
3985 rtx tmp = gen_reg_rtx (V1TImode);
3986 /* Since there is no instruction to store a standard SSE
3987 constant, a temporary register is required. */
3988 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3989 dst = tmp;
3991 break;
3993 default:
3994 gcc_unreachable ();
3997 SET_SRC (def_set) = src;
3998 SET_DEST (def_set) = dst;
4000 /* Drop possible dead definitions. */
4001 PATTERN (insn) = def_set;
4003 INSN_CODE (insn) = -1;
4004 recog_memoized (insn);
4005 df_insn_rescan (insn);
4008 void
4009 dimode_scalar_chain::convert_registers ()
4011 bitmap_iterator bi;
4012 unsigned id;
4014 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
4015 convert_reg (id);
4017 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
4018 make_vector_copies (id);
4021 /* Convert the whole chain, creating the required register
4022 conversions and copies. */
4025 scalar_chain::convert ()
4027 bitmap_iterator bi;
4028 unsigned id;
4029 int converted_insns = 0;
4031 if (!dbg_cnt (stv_conversion))
4032 return 0;
4034 if (dump_file)
4035 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
4037 convert_registers ();
4039 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
4041 convert_insn (DF_INSN_UID_GET (id)->insn);
4042 converted_insns++;
4045 return converted_insns;
4048 /* Main STV pass function. Find and convert scalar
4049 instructions into vector mode when profitable. */
4051 static unsigned int
4052 convert_scalars_to_vector ()
4054 basic_block bb;
4055 bitmap candidates;
4056 int converted_insns = 0;
4058 bitmap_obstack_initialize (NULL);
4059 candidates = BITMAP_ALLOC (NULL);
4061 calculate_dominance_info (CDI_DOMINATORS);
4062 df_set_flags (DF_DEFER_INSN_RESCAN);
4063 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
4064 df_md_add_problem ();
4065 df_analyze ();
4067 /* Find all instructions we want to convert into vector mode. */
4068 if (dump_file)
4069 fprintf (dump_file, "Searching for mode conversion candidates...\n");
4071 FOR_EACH_BB_FN (bb, cfun)
4073 rtx_insn *insn;
4074 FOR_BB_INSNS (bb, insn)
4075 if (scalar_to_vector_candidate_p (insn))
4077 if (dump_file)
4078 fprintf (dump_file, " insn %d is marked as a candidate\n",
4079 INSN_UID (insn));
4081 bitmap_set_bit (candidates, INSN_UID (insn));
4085 remove_non_convertible_regs (candidates);
4087 if (bitmap_empty_p (candidates))
4088 if (dump_file)
4089 fprintf (dump_file, "There are no candidates for optimization.\n");
4091 while (!bitmap_empty_p (candidates))
4093 unsigned uid = bitmap_first_set_bit (candidates);
4094 scalar_chain *chain;
4096 if (TARGET_64BIT)
4097 chain = new timode_scalar_chain;
4098 else
4099 chain = new dimode_scalar_chain;
4101 /* Find instructions chain we want to convert to vector mode.
4102 Check all uses and definitions to estimate all required
4103 conversions. */
4104 chain->build (candidates, uid);
4106 if (chain->compute_convert_gain () > 0)
4107 converted_insns += chain->convert ();
4108 else
4109 if (dump_file)
4110 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4111 chain->chain_id);
4113 delete chain;
4116 if (dump_file)
4117 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4119 BITMAP_FREE (candidates);
4120 bitmap_obstack_release (NULL);
4121 df_process_deferred_rescans ();
4123 /* Conversion means we may have 128-bit register spills/fills,
4124 which require an aligned stack. */
4125 if (converted_insns)
4127 if (crtl->stack_alignment_needed < 128)
4128 crtl->stack_alignment_needed = 128;
4129 if (crtl->stack_alignment_estimated < 128)
4130 crtl->stack_alignment_estimated = 128;
4131 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
4132 if (TARGET_64BIT)
4133 for (tree parm = DECL_ARGUMENTS (current_function_decl);
4134 parm; parm = DECL_CHAIN (parm))
4136 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
4137 continue;
4138 if (DECL_RTL_SET_P (parm)
4139 && GET_MODE (DECL_RTL (parm)) == V1TImode)
4141 rtx r = DECL_RTL (parm);
4142 if (REG_P (r))
4143 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
4145 if (DECL_INCOMING_RTL (parm)
4146 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
4148 rtx r = DECL_INCOMING_RTL (parm);
4149 if (REG_P (r))
4150 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
4155 return 0;
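/* Usage sketch, not from the original sources: the work of this pass can
   be inspected with the standard RTL dump machinery, e.g. by compiling
   with -O2 -msse2 -mstv -fdump-rtl-stv (add -m32 for the DImode variant).
   The resulting .stv dump then contains the messages emitted above:
   "Searching for mode conversion candidates...", the per-chain build and
   gain reports, and the final "Total insns converted: N" line.  */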
4158 namespace {
4160 const pass_data pass_data_insert_vzeroupper =
4162 RTL_PASS, /* type */
4163 "vzeroupper", /* name */
4164 OPTGROUP_NONE, /* optinfo_flags */
4165 TV_MACH_DEP, /* tv_id */
4166 0, /* properties_required */
4167 0, /* properties_provided */
4168 0, /* properties_destroyed */
4169 0, /* todo_flags_start */
4170 TODO_df_finish, /* todo_flags_finish */
4173 class pass_insert_vzeroupper : public rtl_opt_pass
4175 public:
4176 pass_insert_vzeroupper(gcc::context *ctxt)
4177 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4180 /* opt_pass methods: */
4181 virtual bool gate (function *)
4183 return TARGET_AVX && !TARGET_AVX512F
4184 && TARGET_VZEROUPPER && flag_expensive_optimizations
4185 && !optimize_size;
4188 virtual unsigned int execute (function *)
4190 return rest_of_handle_insert_vzeroupper ();
4193 }; // class pass_insert_vzeroupper
4195 const pass_data pass_data_stv =
4197 RTL_PASS, /* type */
4198 "stv", /* name */
4199 OPTGROUP_NONE, /* optinfo_flags */
4200 TV_MACH_DEP, /* tv_id */
4201 0, /* properties_required */
4202 0, /* properties_provided */
4203 0, /* properties_destroyed */
4204 0, /* todo_flags_start */
4205 TODO_df_finish, /* todo_flags_finish */
4208 class pass_stv : public rtl_opt_pass
4210 public:
4211 pass_stv (gcc::context *ctxt)
4212 : rtl_opt_pass (pass_data_stv, ctxt),
4213 timode_p (false)
4216 /* opt_pass methods: */
4217 virtual bool gate (function *)
4219 return (timode_p == !!TARGET_64BIT
4220 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4223 virtual unsigned int execute (function *)
4225 return convert_scalars_to_vector ();
4228 opt_pass *clone ()
4230 return new pass_stv (m_ctxt);
4233 void set_pass_param (unsigned int n, bool param)
4235 gcc_assert (n == 0);
4236 timode_p = param;
4239 private:
4240 bool timode_p;
4241 }; // class pass_stv
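/* A note on the class above (inferred from the code, not original
   commentary): pass_stv is meant to be instantiated more than once;
   clone () and set_pass_param () let the pass manager create a TImode
   flavour (timode_p == true, run for 64-bit targets) and a DImode flavour
   (timode_p == false, run for 32-bit targets), while gate () only enables
   the instance whose flavour matches TARGET_64BIT and requires -mstv,
   SSE2 and optimize > 1.  */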
4243 } // anon namespace
4245 rtl_opt_pass *
4246 make_pass_insert_vzeroupper (gcc::context *ctxt)
4248 return new pass_insert_vzeroupper (ctxt);
4251 rtl_opt_pass *
4252 make_pass_stv (gcc::context *ctxt)
4254 return new pass_stv (ctxt);
4257 /* Return true if a red-zone is in use. */
4259 bool
4260 ix86_using_red_zone (void)
4262 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4265 /* Return a string that documents the current -m options. The caller is
4266 responsible for freeing the string. */
4268 static char *
4269 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
4270 int flags, int flags2,
4271 const char *arch, const char *tune,
4272 enum fpmath_unit fpmath, bool add_nl_p)
4274 struct ix86_target_opts
4276 const char *option; /* option string */
4277 HOST_WIDE_INT mask; /* isa mask options */
4280 /* This table is ordered so that options like -msse4.2 that imply other
4281 ISAs come first. Target string will be displayed in the same order. */
4282 static struct ix86_target_opts isa2_opts[] =
4284 { "-mrdpid", OPTION_MASK_ISA_RDPID },
4285 { "-msgx", OPTION_MASK_ISA_SGX },
4286 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
4287 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
4288 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }
4290 static struct ix86_target_opts isa_opts[] =
4292 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4293 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4294 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4295 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4296 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4297 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4298 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4299 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4300 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4301 { "-mavx2", OPTION_MASK_ISA_AVX2 },
4302 { "-mfma", OPTION_MASK_ISA_FMA },
4303 { "-mxop", OPTION_MASK_ISA_XOP },
4304 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4305 { "-mf16c", OPTION_MASK_ISA_F16C },
4306 { "-mavx", OPTION_MASK_ISA_AVX },
4307 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
4308 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4309 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4310 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4311 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4312 { "-msse3", OPTION_MASK_ISA_SSE3 },
4313 { "-maes", OPTION_MASK_ISA_AES },
4314 { "-msha", OPTION_MASK_ISA_SHA },
4315 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4316 { "-msse2", OPTION_MASK_ISA_SSE2 },
4317 { "-msse", OPTION_MASK_ISA_SSE },
4318 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4319 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4320 { "-mmmx", OPTION_MASK_ISA_MMX },
4321 { "-mrtm", OPTION_MASK_ISA_RTM },
4322 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4323 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4324 { "-madx", OPTION_MASK_ISA_ADX },
4325 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4326 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4327 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4328 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4329 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4330 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4331 { "-mabm", OPTION_MASK_ISA_ABM },
4332 { "-mbmi", OPTION_MASK_ISA_BMI },
4333 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4334 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4335 { "-mtbm", OPTION_MASK_ISA_TBM },
4336 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4337 { "-mcx16", OPTION_MASK_ISA_CX16 },
4338 { "-msahf", OPTION_MASK_ISA_SAHF },
4339 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4340 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4341 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4342 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4343 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4344 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4345 { "-mpku", OPTION_MASK_ISA_PKU },
4346 { "-mlwp", OPTION_MASK_ISA_LWP },
4347 { "-mhle", OPTION_MASK_ISA_HLE },
4348 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4349 { "-mmpx", OPTION_MASK_ISA_MPX },
4350 { "-mclwb", OPTION_MASK_ISA_CLWB }
4353 /* Flag options. */
4354 static struct ix86_target_opts flag_opts[] =
4356 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4357 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4358 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4359 { "-m80387", MASK_80387 },
4360 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4361 { "-malign-double", MASK_ALIGN_DOUBLE },
4362 { "-mcld", MASK_CLD },
4363 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4364 { "-mieee-fp", MASK_IEEE_FP },
4365 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4366 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4367 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4368 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4369 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4370 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4371 { "-mno-red-zone", MASK_NO_RED_ZONE },
4372 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4373 { "-mrecip", MASK_RECIP },
4374 { "-mrtd", MASK_RTD },
4375 { "-msseregparm", MASK_SSEREGPARM },
4376 { "-mstack-arg-probe", MASK_STACK_PROBE },
4377 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4378 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4379 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4380 { "-mvzeroupper", MASK_VZEROUPPER },
4381 { "-mstv", MASK_STV },
4382 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
4383 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
4384 { "-mprefer-avx128", MASK_PREFER_AVX128 }
4387 /* Additional flag options. */
4388 static struct ix86_target_opts flag2_opts[] =
4390 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4393 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
4394 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
4396 char isa_other[40];
4397 char isa2_other[40];
4398 char flags_other[40];
4399 char flags2_other[40];
4400 unsigned num = 0;
4401 unsigned i, j;
4402 char *ret;
4403 char *ptr;
4404 size_t len;
4405 size_t line_len;
4406 size_t sep_len;
4407 const char *abi;
4409 memset (opts, '\0', sizeof (opts));
4411 /* Add -march= option. */
4412 if (arch)
4414 opts[num][0] = "-march=";
4415 opts[num++][1] = arch;
4418 /* Add -mtune= option. */
4419 if (tune)
4421 opts[num][0] = "-mtune=";
4422 opts[num++][1] = tune;
4425 /* Add -m32/-m64/-mx32. */
4426 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4428 if ((isa & OPTION_MASK_ABI_64) != 0)
4429 abi = "-m64";
4430 else
4431 abi = "-mx32";
4432 isa &= ~ (OPTION_MASK_ISA_64BIT
4433 | OPTION_MASK_ABI_64
4434 | OPTION_MASK_ABI_X32);
4436 else
4437 abi = "-m32";
4438 opts[num++][0] = abi;
4440 /* Pick out the options in isa2 options. */
4441 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
4443 if ((isa2 & isa2_opts[i].mask) != 0)
4445 opts[num++][0] = isa2_opts[i].option;
4446 isa2 &= ~ isa2_opts[i].mask;
4450 if (isa2 && add_nl_p)
4452 opts[num++][0] = isa2_other;
4453 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
4456 /* Pick out the options in isa options. */
4457 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4459 if ((isa & isa_opts[i].mask) != 0)
4461 opts[num++][0] = isa_opts[i].option;
4462 isa &= ~ isa_opts[i].mask;
4466 if (isa && add_nl_p)
4468 opts[num++][0] = isa_other;
4469 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
4472 /* Add flag options. */
4473 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4475 if ((flags & flag_opts[i].mask) != 0)
4477 opts[num++][0] = flag_opts[i].option;
4478 flags &= ~ flag_opts[i].mask;
4482 if (flags && add_nl_p)
4484 opts[num++][0] = flags_other;
4485 sprintf (flags_other, "(other flags: %#x)", flags);
4488 /* Add additional flag options. */
4489 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
4491 if ((flags2 & flag2_opts[i].mask) != 0)
4493 opts[num++][0] = flag2_opts[i].option;
4494 flags2 &= ~ flag2_opts[i].mask;
4498 if (flags2 && add_nl_p)
4500 opts[num++][0] = flags2_other;
4501 sprintf (flags2_other, "(other flags2: %#x)", flags2);
4504 /* Add -fpmath= option. */
4505 if (fpmath)
4507 opts[num][0] = "-mfpmath=";
4508 switch ((int) fpmath)
4510 case FPMATH_387:
4511 opts[num++][1] = "387";
4512 break;
4514 case FPMATH_SSE:
4515 opts[num++][1] = "sse";
4516 break;
4518 case FPMATH_387 | FPMATH_SSE:
4519 opts[num++][1] = "sse+387";
4520 break;
4522 default:
4523 gcc_unreachable ();
4527 /* Any options? */
4528 if (num == 0)
4529 return NULL;
4531 gcc_assert (num < ARRAY_SIZE (opts));
4533 /* Size the string. */
4534 len = 0;
4535 sep_len = (add_nl_p) ? 3 : 1;
4536 for (i = 0; i < num; i++)
4538 len += sep_len;
4539 for (j = 0; j < 2; j++)
4540 if (opts[i][j])
4541 len += strlen (opts[i][j]);
4544 /* Build the string. */
4545 ret = ptr = (char *) xmalloc (len);
4546 line_len = 0;
4548 for (i = 0; i < num; i++)
4550 size_t len2[2];
4552 for (j = 0; j < 2; j++)
4553 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4555 if (i != 0)
4557 *ptr++ = ' ';
4558 line_len++;
4560 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4562 *ptr++ = '\\';
4563 *ptr++ = '\n';
4564 line_len = 0;
4568 for (j = 0; j < 2; j++)
4569 if (opts[i][j])
4571 memcpy (ptr, opts[i][j], len2[j]);
4572 ptr += len2[j];
4573 line_len += len2[j];
4577 *ptr = '\0';
4578 gcc_assert (ret + len >= ptr);
4580 return ret;
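/* A sketch of the kind of string produced (an illustration only; the
   exact contents depend on the masks passed in): something of the form

     -march=<arch> -mtune=<tune> -m64 -mavx2 ... -msse2 -msse -mmmx -mfpmath=sse

   i.e. -march=/-mtune= first, then the ABI switch, then the isa2 and isa
   options in table order, then the flag options, with -mfpmath= last and
   a backslash-newline break inserted when ADD_NL_P and a line would
   exceed 70 columns.  */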
4583 /* Return true if profiling code should be emitted before the
4584 prologue, and false otherwise.
4585 Note: for x86 with "hotfix" it is sorried. */
4586 static bool
4587 ix86_profile_before_prologue (void)
4589 return flag_fentry != 0;
4592 /* Function that is callable from the debugger to print the current
4593 options. */
4594 void ATTRIBUTE_UNUSED
4595 ix86_debug_options (void)
4597 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
4598 target_flags, ix86_target_flags,
4599 ix86_arch_string,ix86_tune_string,
4600 ix86_fpmath, true);
4602 if (opts)
4604 fprintf (stderr, "%s\n\n", opts);
4605 free (opts);
4607 else
4608 fputs ("<no options>\n\n", stderr);
4610 return;
4613 /* Return true if T is one of the bytes we should avoid with -fmitigate-rop:
4614 the near/far return opcodes 0xc2, 0xc3, 0xca and 0xcb. */
4616 static bool
4617 ix86_rop_should_change_byte_p (int t)
4619 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
4622 static const char *stringop_alg_names[] = {
4623 #define DEF_ENUM
4624 #define DEF_ALG(alg, name) #name,
4625 #include "stringop.def"
4626 #undef DEF_ENUM
4627 #undef DEF_ALG
4630 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4631 The string is of the following form (or comma separated list of it):
4633 strategy_alg:max_size:[align|noalign]
4635 where the full size range for the strategy is either [0, max_size] or
4636 [min_size, max_size], in which min_size is the max_size + 1 of the
4637 preceding range. The last size range must have max_size == -1.
4639 Examples:
4642 -mmemcpy-strategy=libcall:-1:noalign
4644 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
4648 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
4650 This is to tell the compiler to use the following strategy for memset
4651 1) when the expected size is between [1, 16], use rep_8byte strategy;
4652 2) when the size is between [17, 2048], use vector_loop;
4653 3) when the size is > 2048, use libcall. */
4655 struct stringop_size_range
4657 int max;
4658 stringop_alg alg;
4659 bool noalign;
4662 static void
4663 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
4665 const struct stringop_algs *default_algs;
4666 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
4667 char *curr_range_str, *next_range_str;
4668 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
4669 int i = 0, n = 0;
4671 if (is_memset)
4672 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
4673 else
4674 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
4676 curr_range_str = strategy_str;
4680 int maxs;
4681 char alg_name[128];
4682 char align[16];
4683 next_range_str = strchr (curr_range_str, ',');
4684 if (next_range_str)
4685 *next_range_str++ = '\0';
4687 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
4688 alg_name, &maxs, align))
4690 error ("wrong argument %qs to option %qs", curr_range_str, opt);
4691 return;
4694 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
4696 error ("size ranges of option %qs should be increasing", opt);
4697 return;
4700 for (i = 0; i < last_alg; i++)
4701 if (!strcmp (alg_name, stringop_alg_names[i]))
4702 break;
4704 if (i == last_alg)
4706 error ("wrong strategy name %qs specified for option %qs",
4707 alg_name, opt);
4709 auto_vec <const char *> candidates;
4710 for (i = 0; i < last_alg; i++)
4711 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
4712 candidates.safe_push (stringop_alg_names[i]);
4714 char *s;
4715 const char *hint
4716 = candidates_list_and_hint (alg_name, s, candidates);
4717 if (hint)
4718 inform (input_location,
4719 "valid arguments to %qs are: %s; did you mean %qs?",
4720 opt, s, hint);
4721 else
4722 inform (input_location, "valid arguments to %qs are: %s",
4723 opt, s);
4724 XDELETEVEC (s);
4725 return;
4728 if ((stringop_alg) i == rep_prefix_8_byte
4729 && !TARGET_64BIT)
4731 /* rep; movq isn't available in 32-bit code. */
4732 error ("strategy name %qs specified for option %qs "
4733 "not supported for 32-bit code", alg_name, opt);
4734 return;
4737 input_ranges[n].max = maxs;
4738 input_ranges[n].alg = (stringop_alg) i;
4739 if (!strcmp (align, "align"))
4740 input_ranges[n].noalign = false;
4741 else if (!strcmp (align, "noalign"))
4742 input_ranges[n].noalign = true;
4743 else
4745 error ("unknown alignment %qs specified for option %qs", align, opt);
4746 return;
4748 n++;
4749 curr_range_str = next_range_str;
4751 while (curr_range_str);
4753 if (input_ranges[n - 1].max != -1)
4755 error ("the max value for the last size range should be -1"
4756 " for option %qs", opt);
4757 return;
4760 if (n > MAX_STRINGOP_ALGS)
4762 error ("too many size ranges specified in option %qs", opt);
4763 return;
4766 /* Now override the default algs array. */
4767 for (i = 0; i < n; i++)
4769 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
4770 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
4771 = input_ranges[i].alg;
4772 *const_cast<int *>(&default_algs->size[i].noalign)
4773 = input_ranges[i].noalign;
4778 /* Parse the -mtune-ctrl= option. When DUMP is true,
4779 print the features that are explicitly set. */
4781 static void
4782 parse_mtune_ctrl_str (bool dump)
4784 if (!ix86_tune_ctrl_string)
4785 return;
4787 char *next_feature_string = NULL;
4788 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
4789 char *orig = curr_feature_string;
4790 int i;
4793 bool clear = false;
4795 next_feature_string = strchr (curr_feature_string, ',');
4796 if (next_feature_string)
4797 *next_feature_string++ = '\0';
4798 if (*curr_feature_string == '^')
4800 curr_feature_string++;
4801 clear = true;
4803 for (i = 0; i < X86_TUNE_LAST; i++)
4805 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
4807 ix86_tune_features[i] = !clear;
4808 if (dump)
4809 fprintf (stderr, "Explicitly %s feature %s\n",
4810 clear ? "clear" : "set", ix86_tune_feature_names[i]);
4811 break;
4814 if (i == X86_TUNE_LAST)
4815 error ("Unknown parameter to option -mtune-ctrl: %s",
4816 clear ? curr_feature_string - 1 : curr_feature_string);
4817 curr_feature_string = next_feature_string;
4819 while (curr_feature_string);
4820 free (orig);
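/* Illustrative sketch (the feature names below are placeholders, not
   checked against ix86_tune_feature_names): the option takes a
   comma-separated list of tuning feature names, each optionally prefixed
   with '^' to clear the feature instead of setting it, e.g.

     -mtune-ctrl=<feature-to-set>,^<feature-to-clear>

   Unknown names are reported through the error () call above.  */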
4823 /* Helper function to set ix86_tune_features. IX86_TUNE is the
4824 processor type. */
4826 static void
4827 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
4829 unsigned int ix86_tune_mask = 1u << ix86_tune;
4830 int i;
4832 for (i = 0; i < X86_TUNE_LAST; ++i)
4834 if (ix86_tune_no_default)
4835 ix86_tune_features[i] = 0;
4836 else
4837 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4840 if (dump)
4842 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
4843 for (i = 0; i < X86_TUNE_LAST; i++)
4844 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
4845 ix86_tune_features[i] ? "on" : "off");
4848 parse_mtune_ctrl_str (dump);
4852 /* Default align_* from the processor table. */
4854 static void
4855 ix86_default_align (struct gcc_options *opts)
4857 if (opts->x_align_loops == 0)
4859 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
4860 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
4862 if (opts->x_align_jumps == 0)
4864 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
4865 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
4867 if (opts->x_align_functions == 0)
4869 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
4873 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
4875 static void
4876 ix86_override_options_after_change (void)
4878 ix86_default_align (&global_options);
4881 /* Override various settings based on options. If MAIN_ARGS_P, the
4882 options are from the command line, otherwise they are from
4883 attributes. Return true if there's an error related to march
4884 option. */
4886 static bool
4887 ix86_option_override_internal (bool main_args_p,
4888 struct gcc_options *opts,
4889 struct gcc_options *opts_set)
4891 int i;
4892 unsigned int ix86_arch_mask;
4893 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
4895 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
4896 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
4897 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
4898 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
4899 #define PTA_AES (HOST_WIDE_INT_1 << 4)
4900 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
4901 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
4902 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
4903 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
4904 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
4905 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
4906 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
4907 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
4908 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
4909 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
4910 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
4911 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
4912 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
4913 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
4914 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
4915 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
4916 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
4917 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
4918 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
4919 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
4920 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
4921 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
4922 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
4923 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
4924 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
4925 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
4926 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
4927 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
4928 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
4929 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
4930 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
4931 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
4932 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
4933 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
4934 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
4935 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
4936 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
4937 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
4938 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
4939 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
4940 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
4941 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
4942 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
4943 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
4944 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
4945 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
4946 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
4947 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
4948 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
4949 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
4950 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
4951 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
4952 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
4953 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
4954 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
4955 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
4956 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
4957 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
4958 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
4960 #define PTA_CORE2 \
4961 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
4962 | PTA_CX16 | PTA_FXSR)
4963 #define PTA_NEHALEM \
4964 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
4965 #define PTA_WESTMERE \
4966 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
4967 #define PTA_SANDYBRIDGE \
4968 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
4969 #define PTA_IVYBRIDGE \
4970 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
4971 #define PTA_HASWELL \
4972 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
4973 | PTA_FMA | PTA_MOVBE | PTA_HLE)
4974 #define PTA_BROADWELL \
4975 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
4976 #define PTA_SKYLAKE \
4977 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
4978 #define PTA_SKYLAKE_AVX512 \
4979 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
4980 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
4981 #define PTA_KNL \
4982 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
4983 #define PTA_BONNELL \
4984 (PTA_CORE2 | PTA_MOVBE)
4985 #define PTA_SILVERMONT \
4986 (PTA_WESTMERE | PTA_MOVBE)
4988 /* If this reaches 64, we need to widen the struct pta flags field below. */
4990 static struct pta
4992 const char *const name; /* processor name or nickname. */
4993 const enum processor_type processor;
4994 const enum attr_cpu schedule;
4995 const unsigned HOST_WIDE_INT flags;
4997 const processor_alias_table[] =
4999 {"i386", PROCESSOR_I386, CPU_NONE, 0},
5000 {"i486", PROCESSOR_I486, CPU_NONE, 0},
5001 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5002 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5003 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
5004 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
5005 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
5006 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5007 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5008 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5009 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5010 PTA_MMX | PTA_SSE | PTA_FXSR},
5011 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5012 PTA_MMX | PTA_SSE | PTA_FXSR},
5013 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5014 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5015 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5016 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5017 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5018 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5019 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
5020 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5021 PTA_MMX | PTA_SSE | PTA_FXSR},
5022 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5023 PTA_MMX | PTA_SSE | PTA_FXSR},
5024 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5025 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5026 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
5027 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
5028 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
5029 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5030 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
5031 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5032 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
5033 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5034 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
5035 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
5036 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5037 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5038 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
5039 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5040 PTA_SANDYBRIDGE},
5041 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5042 PTA_SANDYBRIDGE},
5043 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5044 PTA_IVYBRIDGE},
5045 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5046 PTA_IVYBRIDGE},
5047 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5048 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5049 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
5050 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
5051 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
5052 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5053 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5054 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5055 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5056 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
5057 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
5058 {"geode", PROCESSOR_GEODE, CPU_GEODE,
5059 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5060 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
5061 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5062 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5063 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
5064 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5065 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
5066 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5067 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
5068 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5069 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
5070 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5071 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
5072 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5073 {"x86-64", PROCESSOR_K8, CPU_K8,
5074 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5075 {"eden-x2", PROCESSOR_K8, CPU_K8,
5076 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5077 {"nano", PROCESSOR_K8, CPU_K8,
5078 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5079 | PTA_SSSE3 | PTA_FXSR},
5080 {"nano-1000", PROCESSOR_K8, CPU_K8,
5081 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5082 | PTA_SSSE3 | PTA_FXSR},
5083 {"nano-2000", PROCESSOR_K8, CPU_K8,
5084 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5085 | PTA_SSSE3 | PTA_FXSR},
5086 {"nano-3000", PROCESSOR_K8, CPU_K8,
5087 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5088 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5089 {"nano-x2", PROCESSOR_K8, CPU_K8,
5090 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5091 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5092 {"eden-x4", PROCESSOR_K8, CPU_K8,
5093 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5094 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5095 {"nano-x4", PROCESSOR_K8, CPU_K8,
5096 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5097 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5098 {"k8", PROCESSOR_K8, CPU_K8,
5099 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5100 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5101 {"k8-sse3", PROCESSOR_K8, CPU_K8,
5102 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5103 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5104 {"opteron", PROCESSOR_K8, CPU_K8,
5105 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5106 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5107 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
5108 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5109 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5110 {"athlon64", PROCESSOR_K8, CPU_K8,
5111 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5112 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5113 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
5114 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5115 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5116 {"athlon-fx", PROCESSOR_K8, CPU_K8,
5117 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5118 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5119 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5120 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5121 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5122 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5123 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5124 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5125 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
5126 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5127 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5128 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5129 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5130 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
5131 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5132 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5133 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5134 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5135 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5136 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
5137 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5138 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5139 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5140 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5141 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5142 | PTA_XSAVEOPT | PTA_FSGSBASE},
5143 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5144 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5145 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5146 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5147 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5148 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5149 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5150 | PTA_MOVBE | PTA_MWAITX},
5151 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5152 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5153 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5154 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5155 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5156 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5157 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5158 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5159 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5160 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5161 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5162 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
5163 | PTA_FXSR | PTA_XSAVE},
5164 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5165 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5166 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
5167 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5168 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5169 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5171 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5172 PTA_64BIT
5173 | PTA_HLE /* Flags are only used for the -march switch.  */ },
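/* Each processor_alias_table entry binds an -march=/-mtune= name to its
   processor enum, its scheduling model, and the PTA_* feature flags that
   are translated into OPTION_MASK_ISA_* bits further below; pta_size is
   the number of entries in this table.  */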
5176 /* -mrecip options. */
5177 static struct
5179 const char *string; /* option name */
5180 unsigned int mask; /* mask bits to set */
5182 const recip_options[] =
5184 { "all", RECIP_MASK_ALL },
5185 { "none", RECIP_MASK_NONE },
5186 { "div", RECIP_MASK_DIV },
5187 { "sqrt", RECIP_MASK_SQRT },
5188 { "vec-div", RECIP_MASK_VEC_DIV },
5189 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
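/* Sub-option names accepted by -mrecip=; each maps to the RECIP_MASK_*
   bits toggled by the parsing loop near the end of this function, where
   a leading '!' negates an entry and "default" stands for RECIP_MASK_ALL.  */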
5192 int const pta_size = ARRAY_SIZE (processor_alias_table);
5194 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5195 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5196 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5197 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5198 #ifdef TARGET_BI_ARCH
5199 else
5201 #if TARGET_BI_ARCH == 1
5202 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5203 is on and OPTION_MASK_ABI_X32 is off. We turn off
5204 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5205 -mx32. */
5206 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5207 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5208 #else
5209 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5210 on and OPTION_MASK_ABI_64 is off. We turn off
5211 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5212 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5213 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5214 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5215 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5216 #endif
5217 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5218 && TARGET_IAMCU_P (opts->x_target_flags))
5219 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5220 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5222 #endif
5224 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5226 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5227 OPTION_MASK_ABI_64 for TARGET_X32. */
5228 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5229 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5231 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5232 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5233 | OPTION_MASK_ABI_X32
5234 | OPTION_MASK_ABI_64);
5235 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5237 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5238 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5239 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5240 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
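/* After the adjustments above the ISA/ABI bits are consistent: -mx32
   leaves OPTION_MASK_ISA_64BIT set with OPTION_MASK_ABI_64 cleared, -m64
   (LP64) leaves OPTION_MASK_ISA_64BIT set with OPTION_MASK_ABI_X32
   cleared, and -m16 clears all three bits.  */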
5243 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5244 SUBTARGET_OVERRIDE_OPTIONS;
5245 #endif
5247 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5248 SUBSUBTARGET_OVERRIDE_OPTIONS;
5249 #endif
5251 /* -fPIC is the default for x86_64. */
5252 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5253 opts->x_flag_pic = 2;
5255 /* Need to check -mtune=generic first. */
5256 if (opts->x_ix86_tune_string)
5258 /* As special support for cross compilers we read -mtune=native
5259 as -mtune=generic. With native compilers we won't see the
5260 -mtune=native, as it was changed by the driver. */
5261 if (!strcmp (opts->x_ix86_tune_string, "native"))
5263 opts->x_ix86_tune_string = "generic";
5265 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5266 warning (OPT_Wdeprecated,
5267 main_args_p
5268 ? "%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5269 "or %<-mtune=generic%> instead as appropriate"
5270 : "%<target(\"tune=x86-64\")%> is deprecated; use "
5271 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%> "
5272 "instead as appropriate");
5274 else
5276 if (opts->x_ix86_arch_string)
5277 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5278 if (!opts->x_ix86_tune_string)
5280 opts->x_ix86_tune_string
5281 = processor_target_table[TARGET_CPU_DEFAULT].name;
5282 ix86_tune_defaulted = 1;
5285 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5286 or defaulted. We need to use a sensible tune option. */
5287 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5289 opts->x_ix86_tune_string = "generic";
5293 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5294 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5296 /* rep; movq isn't available in 32-bit code. */
5297 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5298 opts->x_ix86_stringop_alg = no_stringop;
5301 if (!opts->x_ix86_arch_string)
5302 opts->x_ix86_arch_string
5303 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5304 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5305 else
5306 ix86_arch_specified = 1;
5308 if (opts_set->x_ix86_pmode)
5310 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5311 && opts->x_ix86_pmode == PMODE_SI)
5312 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5313 && opts->x_ix86_pmode == PMODE_DI))
5314 error ("address mode %qs not supported in the %s bit mode",
5315 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5316 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5318 else
5319 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5320 ? PMODE_DI : PMODE_SI;
5322 if (!opts_set->x_ix86_abi)
5323 opts->x_ix86_abi = DEFAULT_ABI;
5325 /* For targets using the MS ABI, enable MS extensions if they were
5326 not explicitly turned off.  For non-MS ABI targets we turn this
5327 option off.  */
5328 if (!opts_set->x_flag_ms_extensions)
5329 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
5331 if (opts_set->x_ix86_cmodel)
5333 switch (opts->x_ix86_cmodel)
5335 case CM_SMALL:
5336 case CM_SMALL_PIC:
5337 if (opts->x_flag_pic)
5338 opts->x_ix86_cmodel = CM_SMALL_PIC;
5339 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5340 error ("code model %qs not supported in the %s bit mode",
5341 "small", "32");
5342 break;
5344 case CM_MEDIUM:
5345 case CM_MEDIUM_PIC:
5346 if (opts->x_flag_pic)
5347 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5348 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5349 error ("code model %qs not supported in the %s bit mode",
5350 "medium", "32");
5351 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5352 error ("code model %qs not supported in x32 mode",
5353 "medium");
5354 break;
5356 case CM_LARGE:
5357 case CM_LARGE_PIC:
5358 if (opts->x_flag_pic)
5359 opts->x_ix86_cmodel = CM_LARGE_PIC;
5360 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5361 error ("code model %qs not supported in the %s bit mode",
5362 "large", "32");
5363 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5364 error ("code model %qs not supported in x32 mode",
5365 "large");
5366 break;
5368 case CM_32:
5369 if (opts->x_flag_pic)
5370 error ("code model %s does not support PIC mode", "32");
5371 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5372 error ("code model %qs not supported in the %s bit mode",
5373 "32", "64");
5374 break;
5376 case CM_KERNEL:
5377 if (opts->x_flag_pic)
5379 error ("code model %s does not support PIC mode", "kernel");
5380 opts->x_ix86_cmodel = CM_32;
5382 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5383 error ("code model %qs not supported in the %s bit mode",
5384 "kernel", "32");
5385 break;
5387 default:
5388 gcc_unreachable ();
5391 else
5393 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5394 use of rip-relative addressing. This eliminates fixups that
5395 would otherwise be needed if this object is to be placed in a
5396 DLL, and is essentially just as efficient as direct addressing. */
5397 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5398 && (TARGET_RDOS || TARGET_PECOFF))
5399 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5400 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5401 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5402 else
5403 opts->x_ix86_cmodel = CM_32;
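/* Summary of the code model defaults chosen above when -mcmodel= is not
   given: 64-bit PE-COFF/RDOS targets get CM_MEDIUM_PIC with PIC forced
   on, other 64-bit targets get CM_SMALL (or CM_SMALL_PIC under -fPIC),
   and 32-bit targets get CM_32.  */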
5405 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5407 error ("-masm=intel not supported in this configuration");
5408 opts->x_ix86_asm_dialect = ASM_ATT;
5410 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5411 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5412 sorry ("%i-bit mode not compiled in",
5413 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5415 for (i = 0; i < pta_size; i++)
5416 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5418 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5420 error (main_args_p
5421 ? "%<generic%> CPU can be used only for %<-mtune=%> switch"
5422 : "%<generic%> CPU can be used only for "
5423 "%<target(\"tune=\")%> attribute");
5424 return false;
5426 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5428 error (main_args_p
5429 ? "%<intel%> CPU can be used only for %<-mtune=%> switch"
5430 : "%<intel%> CPU can be used only for "
5431 "%<target(\"tune=\")%> attribute");
5432 return false;
5435 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5436 && !(processor_alias_table[i].flags & PTA_64BIT))
5438 error ("CPU you selected does not support x86-64 "
5439 "instruction set");
5440 return false;
5443 ix86_schedule = processor_alias_table[i].schedule;
5444 ix86_arch = processor_alias_table[i].processor;
5445 /* Default cpu tuning to the architecture. */
5446 ix86_tune = ix86_arch;
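/* The cascade below turns each PTA_* capability implied by the selected
   -march CPU into the corresponding OPTION_MASK_ISA_* bit, but only if
   the user did not already set that ISA bit explicitly (tracked in
   x_ix86_isa_flags_explicit and x_ix86_isa_flags2_explicit).  */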
5448 if (processor_alias_table[i].flags & PTA_MMX
5449 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5450 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5451 if (processor_alias_table[i].flags & PTA_3DNOW
5452 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5453 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5454 if (processor_alias_table[i].flags & PTA_3DNOW_A
5455 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5456 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5457 if (processor_alias_table[i].flags & PTA_SSE
5458 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5459 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5460 if (processor_alias_table[i].flags & PTA_SSE2
5461 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5462 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5463 if (processor_alias_table[i].flags & PTA_SSE3
5464 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5465 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5466 if (processor_alias_table[i].flags & PTA_SSSE3
5467 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5468 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5469 if (processor_alias_table[i].flags & PTA_SSE4_1
5470 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5471 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5472 if (processor_alias_table[i].flags & PTA_SSE4_2
5473 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5474 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5475 if (processor_alias_table[i].flags & PTA_AVX
5476 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5477 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5478 if (processor_alias_table[i].flags & PTA_AVX2
5479 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5480 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5481 if (processor_alias_table[i].flags & PTA_FMA
5482 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5483 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5484 if (processor_alias_table[i].flags & PTA_SSE4A
5485 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5486 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5487 if (processor_alias_table[i].flags & PTA_FMA4
5488 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5489 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5490 if (processor_alias_table[i].flags & PTA_XOP
5491 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5492 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5493 if (processor_alias_table[i].flags & PTA_LWP
5494 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5495 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5496 if (processor_alias_table[i].flags & PTA_ABM
5497 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5498 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5499 if (processor_alias_table[i].flags & PTA_BMI
5500 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5501 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5502 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5503 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5504 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5505 if (processor_alias_table[i].flags & PTA_TBM
5506 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5507 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5508 if (processor_alias_table[i].flags & PTA_BMI2
5509 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5510 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5511 if (processor_alias_table[i].flags & PTA_CX16
5512 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5513 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5514 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5515 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5516 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5517 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5518 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5519 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5520 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5521 if (processor_alias_table[i].flags & PTA_MOVBE
5522 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5523 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5524 if (processor_alias_table[i].flags & PTA_AES
5525 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5526 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
5527 if (processor_alias_table[i].flags & PTA_SHA
5528 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5529 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5530 if (processor_alias_table[i].flags & PTA_PCLMUL
5531 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5532 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5533 if (processor_alias_table[i].flags & PTA_FSGSBASE
5534 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5535 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5536 if (processor_alias_table[i].flags & PTA_RDRND
5537 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5538 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5539 if (processor_alias_table[i].flags & PTA_F16C
5540 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5541 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5542 if (processor_alias_table[i].flags & PTA_RTM
5543 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5544 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5545 if (processor_alias_table[i].flags & PTA_HLE
5546 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5547 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5548 if (processor_alias_table[i].flags & PTA_PRFCHW
5549 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5550 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5551 if (processor_alias_table[i].flags & PTA_RDSEED
5552 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5553 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5554 if (processor_alias_table[i].flags & PTA_ADX
5555 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5556 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5557 if (processor_alias_table[i].flags & PTA_FXSR
5558 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5559 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5560 if (processor_alias_table[i].flags & PTA_XSAVE
5561 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5562 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5563 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5564 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5565 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5566 if (processor_alias_table[i].flags & PTA_AVX512F
5567 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5568 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5569 if (processor_alias_table[i].flags & PTA_AVX512ER
5570 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5571 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5572 if (processor_alias_table[i].flags & PTA_AVX512PF
5573 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5574 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5575 if (processor_alias_table[i].flags & PTA_AVX512CD
5576 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5577 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5578 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5579 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5580 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5581 if (processor_alias_table[i].flags & PTA_CLWB
5582 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5583 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5584 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5585 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5586 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5587 if (processor_alias_table[i].flags & PTA_CLZERO
5588 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5589 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5590 if (processor_alias_table[i].flags & PTA_XSAVEC
5591 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5592 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5593 if (processor_alias_table[i].flags & PTA_XSAVES
5594 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5595 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5596 if (processor_alias_table[i].flags & PTA_AVX512DQ
5597 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5598 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5599 if (processor_alias_table[i].flags & PTA_AVX512BW
5600 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5601 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5602 if (processor_alias_table[i].flags & PTA_AVX512VL
5603 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5604 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5605 if (processor_alias_table[i].flags & PTA_MPX
5606 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5607 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5608 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5609 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5610 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5611 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5612 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5613 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5615 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
5616 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
5617 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
5618 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
5619 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
5620 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
5621 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
5622 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
5623 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
5624 if (processor_alias_table[i].flags & PTA_SGX
5625 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
5626 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
5628 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
5629 x86_prefetch_sse = true;
5630 if (processor_alias_table[i].flags & PTA_MWAITX
5631 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
5632 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
5633 if (processor_alias_table[i].flags & PTA_PKU
5634 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
5635 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
5637 /* Don't enable x87 instructions if only
5638 general registers are allowed. */
5639 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
5640 && !(opts_set->x_target_flags & MASK_80387))
5642 if (processor_alias_table[i].flags & PTA_NO_80387)
5643 opts->x_target_flags &= ~MASK_80387;
5644 else
5645 opts->x_target_flags |= MASK_80387;
5647 break;
5650 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
5651 error ("Intel MPX does not support x32");
5656 if (i == pta_size)
5658 error (main_args_p
5659 ? "bad value (%qs) for %<-march=%> switch"
5660 : "bad value (%qs) for %<target(\"arch=\")%> attribute",
5661 opts->x_ix86_arch_string);
5663 auto_vec <const char *> candidates;
5664 for (i = 0; i < pta_size; i++)
5665 if (strcmp (processor_alias_table[i].name, "generic")
5666 && strcmp (processor_alias_table[i].name, "intel")
5667 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5668 || (processor_alias_table[i].flags & PTA_64BIT)))
5669 candidates.safe_push (processor_alias_table[i].name);
5671 char *s;
5672 const char *hint
5673 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
5674 if (hint)
5675 inform (input_location,
5676 main_args_p
5677 ? "valid arguments to %<-march=%> switch are: "
5678 "%s; did you mean %qs?"
5679 : "valid arguments to %<target(\"arch=\")%> attribute are: "
5680 "%s; did you mean %qs?", s, hint);
5681 else
5682 inform (input_location,
5683 main_args_p
5684 ? "valid arguments to %<-march=%> switch are: %s"
5685 : "valid arguments to %<target(\"arch=\")%> attribute are: %s",
5686 s);
5687 XDELETEVEC (s);
5690 ix86_arch_mask = 1u << ix86_arch;
5691 for (i = 0; i < X86_ARCH_LAST; ++i)
5692 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5694 for (i = 0; i < pta_size; i++)
5695 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
5697 ix86_schedule = processor_alias_table[i].schedule;
5698 ix86_tune = processor_alias_table[i].processor;
5699 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5701 if (!(processor_alias_table[i].flags & PTA_64BIT))
5703 if (ix86_tune_defaulted)
5705 opts->x_ix86_tune_string = "x86-64";
5706 for (i = 0; i < pta_size; i++)
5707 if (! strcmp (opts->x_ix86_tune_string,
5708 processor_alias_table[i].name))
5709 break;
5710 ix86_schedule = processor_alias_table[i].schedule;
5711 ix86_tune = processor_alias_table[i].processor;
5713 else
5714 error ("CPU you selected does not support x86-64 "
5715 "instruction set");
5718 /* Intel CPUs have always interpreted SSE prefetch instructions as
5719 NOPs; so, we can enable SSE prefetch instructions even when
5720 -mtune (rather than -march) points us to a processor that has them.
5721 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
5722 higher processors. */
5723 if (TARGET_CMOV
5724 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
5725 x86_prefetch_sse = true;
5726 break;
5729 if (ix86_tune_specified && i == pta_size)
5731 error (main_args_p
5732 ? "bad value (%qs) for %<-mtune=%> switch"
5733 : "bad value (%qs) for %<target(\"tune=\")%> attribute",
5734 opts->x_ix86_tune_string);
5736 auto_vec <const char *> candidates;
5737 for (i = 0; i < pta_size; i++)
5738 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5739 || (processor_alias_table[i].flags & PTA_64BIT))
5740 candidates.safe_push (processor_alias_table[i].name);
5742 char *s;
5743 const char *hint
5744 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
5745 if (hint)
5746 inform (input_location,
5747 main_args_p
5748 ? "valid arguments to %<-mtune=%> switch are: "
5749 "%s; did you mean %qs?"
5750 : "valid arguments to %<target(\"tune=\")%> attribute are: "
5751 "%s; did you mean %qs?", s, hint);
5752 else
5753 inform (input_location,
5754 main_args_p
5755 ? "valid arguments to %<-mtune=%> switch are: %s"
5756 : "valid arguments to %<target(\"tune=\")%> attribute are: %s",
5757 s);
5758 XDELETEVEC (s);
5761 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
5763 #ifndef USE_IX86_FRAME_POINTER
5764 #define USE_IX86_FRAME_POINTER 0
5765 #endif
5767 #ifndef USE_X86_64_FRAME_POINTER
5768 #define USE_X86_64_FRAME_POINTER 0
5769 #endif
5771 /* Set the default values for switches whose default depends on TARGET_64BIT
5772 in case they weren't overwritten by command line options. */
5773 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5775 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5776 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
5777 if (opts->x_flag_asynchronous_unwind_tables
5778 && !opts_set->x_flag_unwind_tables
5779 && TARGET_64BIT_MS_ABI)
5780 opts->x_flag_unwind_tables = 1;
5781 if (opts->x_flag_asynchronous_unwind_tables == 2)
5782 opts->x_flag_unwind_tables
5783 = opts->x_flag_asynchronous_unwind_tables = 1;
5784 if (opts->x_flag_pcc_struct_return == 2)
5785 opts->x_flag_pcc_struct_return = 0;
5787 else
5789 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5790 opts->x_flag_omit_frame_pointer
5791 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
5792 if (opts->x_flag_asynchronous_unwind_tables == 2)
5793 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
5794 if (opts->x_flag_pcc_struct_return == 2)
5796 /* Intel MCU psABI specifies that -freg-struct-return should
5797 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
5798 we check -miamcu so that -freg-struct-return is always
5799 turned on if -miamcu is used. */
5800 if (TARGET_IAMCU_P (opts->x_target_flags))
5801 opts->x_flag_pcc_struct_return = 0;
5802 else
5803 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
5807 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5808 /* TODO: ix86_cost should be chosen at instruction or function granularity
5809 so that for cold code we use size_cost even in !optimize_size compilation.  */
5810 if (opts->x_optimize_size)
5811 ix86_cost = &ix86_size_cost;
5812 else
5813 ix86_cost = ix86_tune_cost;
5815 /* Arrange to set up i386_stack_locals for all functions. */
5816 init_machine_status = ix86_init_machine_status;
5818 /* Validate -mregparm= value. */
5819 if (opts_set->x_ix86_regparm)
5821 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5822 warning (0, "-mregparm is ignored in 64-bit mode");
5823 else if (TARGET_IAMCU_P (opts->x_target_flags))
5824 warning (0, "-mregparm is ignored for Intel MCU psABI");
5825 if (opts->x_ix86_regparm > REGPARM_MAX)
5827 error ("-mregparm=%d is not between 0 and %d",
5828 opts->x_ix86_regparm, REGPARM_MAX);
5829 opts->x_ix86_regparm = 0;
5832 if (TARGET_IAMCU_P (opts->x_target_flags)
5833 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
5834 opts->x_ix86_regparm = REGPARM_MAX;
5836 /* Default align_* from the processor table. */
5837 ix86_default_align (opts);
5839 /* Provide default for -mbranch-cost= value. */
5840 if (!opts_set->x_ix86_branch_cost)
5841 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
5843 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5845 opts->x_target_flags
5846 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
5848 /* Enable by default the SSE and MMX builtins. Do allow the user to
5849 explicitly disable any of these. In particular, disabling SSE and
5850 MMX for kernel code is extremely useful. */
5851 if (!ix86_arch_specified)
5852 opts->x_ix86_isa_flags
5853 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
5854 | TARGET_SUBTARGET64_ISA_DEFAULT)
5855 & ~opts->x_ix86_isa_flags_explicit);
5857 if (TARGET_RTD_P (opts->x_target_flags))
5858 warning (0,
5859 main_args_p ? "%<-mrtd%> is ignored in 64bit mode"
5860 : "%<target(\"rtd\")%> is ignored in 64bit mode");
5862 else
5864 opts->x_target_flags
5865 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
5867 if (!ix86_arch_specified)
5868 opts->x_ix86_isa_flags
5869 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
5871 /* The i386 ABI does not specify a red zone.  It still makes sense to use
5872 one when the programmer takes care to keep the stack from being destroyed.  */
5873 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
5874 opts->x_target_flags |= MASK_NO_RED_ZONE;
5877 /* Keep nonleaf frame pointers. */
5878 if (opts->x_flag_omit_frame_pointer)
5879 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
5880 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
5881 opts->x_flag_omit_frame_pointer = 1;
5883 /* If we're doing fast math, we don't care about comparison order
5884 wrt NaNs. This lets us use a shorter comparison sequence. */
5885 if (opts->x_flag_finite_math_only)
5886 opts->x_target_flags &= ~MASK_IEEE_FP;
5888 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
5889 since the insns won't need emulation. */
5890 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
5891 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
5893 /* Likewise, if the target doesn't have a 387, or we've specified
5894 software floating point, don't use 387 inline intrinsics. */
5895 if (!TARGET_80387_P (opts->x_target_flags))
5896 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
5898 /* Turn on MMX builtins for -msse. */
5899 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
5900 opts->x_ix86_isa_flags
5901 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
5903 /* Enable SSE prefetch. */
5904 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
5905 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
5906 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
5907 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
5908 x86_prefetch_sse = true;
5910 /* Enable popcnt instruction for -msse4.2 or -mabm. */
5911 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
5912 || TARGET_ABM_P (opts->x_ix86_isa_flags))
5913 opts->x_ix86_isa_flags
5914 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
5916 /* Enable lzcnt instruction for -mabm. */
5917 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
5918 opts->x_ix86_isa_flags
5919 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
5921 /* Validate -mpreferred-stack-boundary= value or default it to
5922 PREFERRED_STACK_BOUNDARY_DEFAULT. */
5923 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
5924 if (opts_set->x_ix86_preferred_stack_boundary_arg)
5926 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5927 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
5928 int max = (TARGET_SEH ? 4 : 12);
5930 if (opts->x_ix86_preferred_stack_boundary_arg < min
5931 || opts->x_ix86_preferred_stack_boundary_arg > max)
5933 if (min == max)
5934 error ("-mpreferred-stack-boundary is not supported "
5935 "for this target");
5936 else
5937 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
5938 opts->x_ix86_preferred_stack_boundary_arg, min, max);
5940 else
5941 ix86_preferred_stack_boundary
5942 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
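/* The argument is the log2 of the boundary in bytes, so for example
   -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT
   = 128 bits, i.e. a 16-byte stack alignment.  */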
5945 /* Set the default value for -mstackrealign. */
5946 if (opts->x_ix86_force_align_arg_pointer == -1)
5947 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
5949 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
5951 /* Validate -mincoming-stack-boundary= value or default it to
5952 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
5953 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
5954 if (opts_set->x_ix86_incoming_stack_boundary_arg)
5956 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
5958 if (opts->x_ix86_incoming_stack_boundary_arg < min
5959 || opts->x_ix86_incoming_stack_boundary_arg > 12)
5960 error ("-mincoming-stack-boundary=%d is not between %d and 12",
5961 opts->x_ix86_incoming_stack_boundary_arg, min);
5962 else
5964 ix86_user_incoming_stack_boundary
5965 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
5966 ix86_incoming_stack_boundary
5967 = ix86_user_incoming_stack_boundary;
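/* Likewise for -mincoming-stack-boundary=: the accepted range is
   min..12 (min is 3, i.e. 8 bytes, in 64-bit mode and 2, i.e. 4 bytes,
   otherwise), and the value is again the log2 of the boundary in bytes.  */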
5971 #ifndef NO_PROFILE_COUNTERS
5972 if (flag_nop_mcount)
5973 error ("-mnop-mcount is not compatible with this target");
5974 #endif
5975 if (flag_nop_mcount && flag_pic)
5976 error ("-mnop-mcount is not implemented for -fPIC");
5978 /* Accept -msseregparm only if at least SSE support is enabled. */
5979 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
5980 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
5981 error (main_args_p
5982 ? "%<-msseregparm%> used without SSE enabled"
5983 : "%<target(\"sseregparm\")%> used without SSE enabled");
5985 if (opts_set->x_ix86_fpmath)
5987 if (opts->x_ix86_fpmath & FPMATH_SSE)
5989 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
5991 if (TARGET_80387_P (opts->x_target_flags))
5993 warning (0, "SSE instruction set disabled, using 387 arithmetics");
5994 opts->x_ix86_fpmath = FPMATH_387;
5997 else if ((opts->x_ix86_fpmath & FPMATH_387)
5998 && !TARGET_80387_P (opts->x_target_flags))
6000 warning (0, "387 instruction set disabled, using SSE arithmetics");
6001 opts->x_ix86_fpmath = FPMATH_SSE;
6005 /* For all chips supporting SSE2, -mfpmath=sse performs better than
6006 fpmath=387.  The latter is nevertheless the default on many targets,
6007 since the extra 80-bit precision of temporaries is considered part of
6008 the ABI.  Override the default at least for -ffast-math.
6009 TODO: -mfpmath=both seems to produce similarly performing code with
6010 slightly smaller binaries.  It is however not clear whether register
6011 allocation is ready for this setting.
6012 Also, -mfpmath=387 codegen is overall considerably more compact
6013 (about 4-5%) than SSE codegen.  We may switch to 387 with -ffast-math
6014 for size-optimized functions.  */
6015 else if (fast_math_flags_set_p (&global_options)
6016 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
6017 opts->x_ix86_fpmath = FPMATH_SSE;
6018 else
6019 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
6021 /* Use external vectorized library in vectorizing intrinsics. */
6022 if (opts_set->x_ix86_veclibabi_type)
6023 switch (opts->x_ix86_veclibabi_type)
6025 case ix86_veclibabi_type_svml:
6026 ix86_veclib_handler = ix86_veclibabi_svml;
6027 break;
6029 case ix86_veclibabi_type_acml:
6030 ix86_veclib_handler = ix86_veclibabi_acml;
6031 break;
6033 default:
6034 gcc_unreachable ();
6037 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
6038 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6039 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6041 /* If stack probes are required, the space used for large function
6042 arguments on the stack must also be probed, so enable
6043 -maccumulate-outgoing-args so this happens in the prologue. */
6044 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
6045 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6047 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6048 warning (0,
6049 main_args_p
6050 ? "stack probing requires %<-maccumulate-outgoing-args%> "
6051 "for correctness"
6052 : "stack probing requires "
6053 "%<target(\"accumulate-outgoing-args\")%> for correctness");
6054 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6057 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
6058 so enable -maccumulate-outgoing-args when %ebp is fixed. */
6059 if (fixed_regs[BP_REG]
6060 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6062 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6063 warning (0,
6064 main_args_p
6065 ? "fixed ebp register requires %<-maccumulate-outgoing-args%>"
6066 : "fixed ebp register requires "
6067 "%<target(\"accumulate-outgoing-args\")%>");
6068 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6071 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
6073 char *p;
6074 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
6075 p = strchr (internal_label_prefix, 'X');
6076 internal_label_prefix_len = p - internal_label_prefix;
6077 *p = '\0';
6080 /* When the scheduling description is not available, disable the scheduler
6081 pass so it won't slow down compilation and make x87 code slower.  */
6082 if (!TARGET_SCHEDULE)
6083 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
6085 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
6086 ix86_tune_cost->simultaneous_prefetches,
6087 opts->x_param_values,
6088 opts_set->x_param_values);
6089 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
6090 ix86_tune_cost->prefetch_block,
6091 opts->x_param_values,
6092 opts_set->x_param_values);
6093 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
6094 ix86_tune_cost->l1_cache_size,
6095 opts->x_param_values,
6096 opts_set->x_param_values);
6097 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
6098 ix86_tune_cost->l2_cache_size,
6099 opts->x_param_values,
6100 opts_set->x_param_values);
6102 /* Restrict number of if-converted SET insns to 1. */
6103 if (TARGET_ONE_IF_CONV_INSN)
6104 maybe_set_param_value (PARAM_MAX_RTL_IF_CONVERSION_INSNS,
6105 1,
6106 opts->x_param_values,
6107 opts_set->x_param_values);
6109 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
6110 if (opts->x_flag_prefetch_loop_arrays < 0
6111 && HAVE_prefetch
6112 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
6113 && !opts->x_optimize_size
6114 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
6115 opts->x_flag_prefetch_loop_arrays = 1;
6117 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
6118 can be optimized to ap = __builtin_next_arg (0).  */
6119 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
6120 targetm.expand_builtin_va_start = NULL;
6122 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6124 ix86_gen_leave = gen_leave_rex64;
6125 if (Pmode == DImode)
6127 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
6128 ix86_gen_tls_local_dynamic_base_64
6129 = gen_tls_local_dynamic_base_64_di;
6131 else
6133 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
6134 ix86_gen_tls_local_dynamic_base_64
6135 = gen_tls_local_dynamic_base_64_si;
6138 else
6139 ix86_gen_leave = gen_leave;
6141 if (Pmode == DImode)
6143 ix86_gen_add3 = gen_adddi3;
6144 ix86_gen_sub3 = gen_subdi3;
6145 ix86_gen_sub3_carry = gen_subdi3_carry;
6146 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
6147 ix86_gen_andsp = gen_anddi3;
6148 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
6149 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
6150 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
6151 ix86_gen_monitor = gen_sse3_monitor_di;
6152 ix86_gen_monitorx = gen_monitorx_di;
6153 ix86_gen_clzero = gen_clzero_di;
6155 else
6157 ix86_gen_add3 = gen_addsi3;
6158 ix86_gen_sub3 = gen_subsi3;
6159 ix86_gen_sub3_carry = gen_subsi3_carry;
6160 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6161 ix86_gen_andsp = gen_andsi3;
6162 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6163 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6164 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6165 ix86_gen_monitor = gen_sse3_monitor_si;
6166 ix86_gen_monitorx = gen_monitorx_si;
6167 ix86_gen_clzero = gen_clzero_si;
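/* The ix86_gen_* function pointers are bound once here to the DImode or
   SImode insn generators, depending on Pmode, so later code can emit the
   appropriate pattern without re-checking the word size.  */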
6170 #ifdef USE_IX86_CLD
6171 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6172 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6173 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6174 #endif
6176 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
6178 if (opts->x_flag_fentry > 0)
6179 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6180 "with -fpic");
6181 opts->x_flag_fentry = 0;
6183 else if (TARGET_SEH)
6185 if (opts->x_flag_fentry == 0)
6186 sorry ("-mno-fentry isn%'t compatible with SEH");
6187 opts->x_flag_fentry = 1;
6189 else if (opts->x_flag_fentry < 0)
6191 #if defined(PROFILE_BEFORE_PROLOGUE)
6192 opts->x_flag_fentry = 1;
6193 #else
6194 opts->x_flag_fentry = 0;
6195 #endif
6198 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6199 opts->x_target_flags |= MASK_VZEROUPPER;
6200 if (!(opts_set->x_target_flags & MASK_STV))
6201 opts->x_target_flags |= MASK_STV;
6202 /* Disable STV if -mpreferred-stack-boundary={2,3},
6203 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
6204 stack realignment is an extra cost the pass doesn't take into
6205 account, and the pass can't realign the stack.  */
6206 if (ix86_preferred_stack_boundary < 128
6207 || ix86_incoming_stack_boundary < 128
6208 || opts->x_ix86_force_align_arg_pointer)
6209 opts->x_target_flags &= ~MASK_STV;
6210 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6211 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6212 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6213 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6214 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6215 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6216 /* Enable 128-bit AVX instruction generation
6217 for the auto-vectorizer. */
6218 if (TARGET_AVX128_OPTIMAL
6219 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6220 opts->x_target_flags |= MASK_PREFER_AVX128;
6222 if (opts->x_ix86_recip_name)
6224 char *p = ASTRDUP (opts->x_ix86_recip_name);
6225 char *q;
6226 unsigned int mask, i;
6227 bool invert;
6229 while ((q = strtok (p, ",")) != NULL)
6231 p = NULL;
6232 if (*q == '!')
6234 invert = true;
6235 q++;
6237 else
6238 invert = false;
6240 if (!strcmp (q, "default"))
6241 mask = RECIP_MASK_ALL;
6242 else
6244 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6245 if (!strcmp (q, recip_options[i].string))
6247 mask = recip_options[i].mask;
6248 break;
6251 if (i == ARRAY_SIZE (recip_options))
6253 error ("unknown option for -mrecip=%s", q);
6254 invert = false;
6255 mask = RECIP_MASK_NONE;
6259 opts->x_recip_mask_explicit |= mask;
6260 if (invert)
6261 opts->x_recip_mask &= ~mask;
6262 else
6263 opts->x_recip_mask |= mask;
6267 if (TARGET_RECIP_P (opts->x_target_flags))
6268 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6269 else if (opts_set->x_target_flags & MASK_RECIP)
6270 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
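/* For example, -mrecip=vec-div,!sqrt sets RECIP_MASK_VEC_DIV and clears
   RECIP_MASK_SQRT in x_recip_mask, recording both bits in
   x_recip_mask_explicit; a plain -mrecip then only enables the remaining,
   not explicitly mentioned, RECIP_MASK_ALL bits.  */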
6272 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6273 for 64-bit Bionic. Also default long double to 64-bit for Intel
6274 MCU psABI. */
6275 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6276 && !(opts_set->x_target_flags
6277 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6278 opts->x_target_flags |= (TARGET_64BIT
6279 ? MASK_LONG_DOUBLE_128
6280 : MASK_LONG_DOUBLE_64);
6282 /* Only one of them can be active. */
6283 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6284 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6286 /* Save the initial options in case the user does function specific
6287 options. */
6288 if (main_args_p)
6289 target_option_default_node = target_option_current_node
6290 = build_target_option_node (opts);
6292 /* Handle stack protector */
6293 if (!opts_set->x_ix86_stack_protector_guard)
6294 opts->x_ix86_stack_protector_guard
6295 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6297 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
6298 if (opts->x_ix86_tune_memcpy_strategy)
6300 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6301 ix86_parse_stringop_strategy_string (str, false);
6302 free (str);
6305 if (opts->x_ix86_tune_memset_strategy)
6307 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6308 ix86_parse_stringop_strategy_string (str, true);
6309 free (str);
6312 return true;
6315 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6317 static void
6318 ix86_option_override (void)
6320 ix86_option_override_internal (true, &global_options, &global_options_set);
6323 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6324 static char *
6325 ix86_offload_options (void)
6327 if (TARGET_LP64)
6328 return xstrdup ("-foffload-abi=lp64");
6329 return xstrdup ("-foffload-abi=ilp32");
6332 /* Update register usage after having seen the compiler flags. */
6334 static void
6335 ix86_conditional_register_usage (void)
6337 int i, c_mask;
6339 /* If there are no caller-saved registers, preserve all registers
6340 except fixed_regs and the registers used for the function return
6341 value, since aggregate_value_p checks call_used_regs[regno] on the
6342 return value.  */
6343 if (cfun && cfun->machine->no_caller_saved_registers)
6344 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6345 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6346 call_used_regs[i] = 0;
6348 /* For 32-bit targets, squash the REX registers. */
6349 if (! TARGET_64BIT)
6351 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6352 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6353 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6354 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6355 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6356 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6359 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6360 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6362 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6364 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6366 /* Set/reset conditionally defined registers from
6367 CALL_USED_REGISTERS initializer. */
6368 if (call_used_regs[i] > 1)
6369 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6371 /* Calculate registers of CLOBBERED_REGS register set
6372 as call used registers from GENERAL_REGS register set. */
6373 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6374 && call_used_regs[i])
6375 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6378 /* If MMX is disabled, squash the registers. */
6379 if (! TARGET_MMX)
6380 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6381 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6382 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6384 /* If SSE is disabled, squash the registers. */
6385 if (! TARGET_SSE)
6386 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6387 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6388 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6390 /* If the FPU is disabled, squash the registers. */
6391 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6392 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6393 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6394 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6396 /* If AVX512F is disabled, squash the registers. */
6397 if (! TARGET_AVX512F)
6399 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6400 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6402 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6403 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6406 /* If MPX is disabled, squash the registers. */
6407 if (! TARGET_MPX)
6408 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6409 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6413 /* Save the current options */
6415 static void
6416 ix86_function_specific_save (struct cl_target_option *ptr,
6417 struct gcc_options *opts)
6419 ptr->arch = ix86_arch;
6420 ptr->schedule = ix86_schedule;
6421 ptr->prefetch_sse = x86_prefetch_sse;
6422 ptr->tune = ix86_tune;
6423 ptr->branch_cost = ix86_branch_cost;
6424 ptr->tune_defaulted = ix86_tune_defaulted;
6425 ptr->arch_specified = ix86_arch_specified;
6426 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6427 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
6428 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6429 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6430 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6431 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6432 ptr->x_ix86_abi = opts->x_ix86_abi;
6433 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6434 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6435 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6436 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6437 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6438 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6439 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6440 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6441 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6442 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6443 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6444 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6445 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6446 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6447 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6448 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6449 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6450 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6451 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6452 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6454 /* The fields are char but the variables are not; make sure the
6455 values fit in the fields. */
6456 gcc_assert (ptr->arch == ix86_arch);
6457 gcc_assert (ptr->schedule == ix86_schedule);
6458 gcc_assert (ptr->tune == ix86_tune);
6459 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6462 /* Restore the current options */
6464 static void
6465 ix86_function_specific_restore (struct gcc_options *opts,
6466 struct cl_target_option *ptr)
6468 enum processor_type old_tune = ix86_tune;
6469 enum processor_type old_arch = ix86_arch;
6470 unsigned int ix86_arch_mask;
6471 int i;
6473 /* We don't change -fPIC. */
6474 opts->x_flag_pic = flag_pic;
6476 ix86_arch = (enum processor_type) ptr->arch;
6477 ix86_schedule = (enum attr_cpu) ptr->schedule;
6478 ix86_tune = (enum processor_type) ptr->tune;
6479 x86_prefetch_sse = ptr->prefetch_sse;
6480 opts->x_ix86_branch_cost = ptr->branch_cost;
6481 ix86_tune_defaulted = ptr->tune_defaulted;
6482 ix86_arch_specified = ptr->arch_specified;
6483 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6484 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
6485 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6486 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6487 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6488 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6489 opts->x_ix86_abi = ptr->x_ix86_abi;
6490 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6491 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6492 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6493 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6494 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6495 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6496 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6497 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6498 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6499 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6500 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6501 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6502 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6503 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6504 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6505 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6506 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6507 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6508 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6509 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6510 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6511 /* TODO: ix86_cost should be chosen at instruction or function granularity,
6512 so that for cold code we use size_cost even in !optimize_size compilations. */
6513 if (opts->x_optimize_size)
6514 ix86_cost = &ix86_size_cost;
6515 else
6516 ix86_cost = ix86_tune_cost;
6518 /* Recreate the arch feature tests if the arch changed */
6519 if (old_arch != ix86_arch)
6521 ix86_arch_mask = 1u << ix86_arch;
6522 for (i = 0; i < X86_ARCH_LAST; ++i)
6523 ix86_arch_features[i]
6524 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6527 /* Recreate the tune optimization tests */
6528 if (old_tune != ix86_tune)
6529 set_ix86_tune_features (ix86_tune, false);
6532 /* Adjust target options after streaming them in. This is mainly about
6533 reconciling them with global options. */
6535 static void
6536 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6538 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
6539 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6540 for PIC, or error out. */
6541 if (flag_pic)
6542 switch (ptr->x_ix86_cmodel)
6544 case CM_SMALL:
6545 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6546 break;
6548 case CM_MEDIUM:
6549 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6550 break;
6552 case CM_LARGE:
6553 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6554 break;
6556 case CM_KERNEL:
6557 error ("code model %s does not support PIC mode", "kernel");
6558 break;
6560 default:
6561 break;
6563 else
6564 switch (ptr->x_ix86_cmodel)
6566 case CM_SMALL_PIC:
6567 ptr->x_ix86_cmodel = CM_SMALL;
6568 break;
6570 case CM_MEDIUM_PIC:
6571 ptr->x_ix86_cmodel = CM_MEDIUM;
6572 break;
6574 case CM_LARGE_PIC:
6575 ptr->x_ix86_cmodel = CM_LARGE;
6576 break;
6578 default:
6579 break;
6583 /* Print the current options */
6585 static void
6586 ix86_function_specific_print (FILE *file, int indent,
6587 struct cl_target_option *ptr)
6589 char *target_string
6590 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
6591 ptr->x_target_flags, ptr->x_ix86_target_flags,
6592 NULL, NULL, ptr->x_ix86_fpmath, false);
6594 gcc_assert (ptr->arch < PROCESSOR_max);
6595 fprintf (file, "%*sarch = %d (%s)\n",
6596 indent, "",
6597 ptr->arch, processor_target_table[ptr->arch].name);
6599 gcc_assert (ptr->tune < PROCESSOR_max);
6600 fprintf (file, "%*stune = %d (%s)\n",
6601 indent, "",
6602 ptr->tune, processor_target_table[ptr->tune].name);
6604 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
6606 if (target_string)
6608 fprintf (file, "%*s%s\n", indent, "", target_string);
6609 free (target_string);
6614 /* Inner function to process the attribute((target(...))), take an argument and
6615 set the current options from the argument. If we have a list, recursively go
6616 over the list. */
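/* For example, __attribute__((target ("no-avx,arch=core2,fpmath=sse")))
   arrives here as the string "no-avx,arch=core2,fpmath=sse"; each
   comma-separated token is looked up in the attrs[] table below, with a
   leading "no-" inverting the option.  */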
6618 static bool
6619 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
6620 struct gcc_options *opts,
6621 struct gcc_options *opts_set,
6622 struct gcc_options *enum_opts_set)
6624 char *next_optstr;
6625 bool ret = true;
6627 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
6628 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
6629 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
6630 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
6631 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
6633 enum ix86_opt_type
6635 ix86_opt_unknown,
6636 ix86_opt_yes,
6637 ix86_opt_no,
6638 ix86_opt_str,
6639 ix86_opt_enum,
6640 ix86_opt_isa
6643 static const struct
6645 const char *string;
6646 size_t len;
6647 enum ix86_opt_type type;
6648 int opt;
6649 int mask;
6650 } attrs[] = {
6651 /* isa options */
6652 IX86_ATTR_ISA ("sgx", OPT_msgx),
6653 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
6654 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
6655 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
6657 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
6658 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
6659 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
6660 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
6661 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
6662 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
6663 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
6664 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
6665 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
6666 IX86_ATTR_ISA ("avx2", OPT_mavx2),
6667 IX86_ATTR_ISA ("fma", OPT_mfma),
6668 IX86_ATTR_ISA ("xop", OPT_mxop),
6669 IX86_ATTR_ISA ("fma4", OPT_mfma4),
6670 IX86_ATTR_ISA ("f16c", OPT_mf16c),
6671 IX86_ATTR_ISA ("avx", OPT_mavx),
6672 IX86_ATTR_ISA ("sse4", OPT_msse4),
6673 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
6674 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
6675 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
6676 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
6677 IX86_ATTR_ISA ("sse3", OPT_msse3),
6678 IX86_ATTR_ISA ("aes", OPT_maes),
6679 IX86_ATTR_ISA ("sha", OPT_msha),
6680 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
6681 IX86_ATTR_ISA ("sse2", OPT_msse2),
6682 IX86_ATTR_ISA ("sse", OPT_msse),
6683 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
6684 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
6685 IX86_ATTR_ISA ("mmx", OPT_mmmx),
6686 IX86_ATTR_ISA ("rtm", OPT_mrtm),
6687 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
6688 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
6689 IX86_ATTR_ISA ("adx", OPT_madx),
6690 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
6691 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
6692 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
6693 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
6694 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
6695 IX86_ATTR_ISA ("xsave", OPT_mxsave),
6696 IX86_ATTR_ISA ("abm", OPT_mabm),
6697 IX86_ATTR_ISA ("bmi", OPT_mbmi),
6698 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
6699 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
6700 IX86_ATTR_ISA ("tbm", OPT_mtbm),
6701 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
6702 IX86_ATTR_ISA ("cx16", OPT_mcx16),
6703 IX86_ATTR_ISA ("sahf", OPT_msahf),
6704 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
6705 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
6706 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
6707 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
6708 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
6709 IX86_ATTR_ISA ("clzero", OPT_mclzero),
6710 IX86_ATTR_ISA ("pku", OPT_mpku),
6711 IX86_ATTR_ISA ("lwp", OPT_mlwp),
6712 IX86_ATTR_ISA ("hle", OPT_mhle),
6713 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
6714 IX86_ATTR_ISA ("mpx", OPT_mmpx),
6715 IX86_ATTR_ISA ("clwb", OPT_mclwb),
6716 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
6718 /* enum options */
6719 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
6721 /* string options */
6722 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
6723 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
6725 /* flag options */
6726 IX86_ATTR_YES ("cld",
6727 OPT_mcld,
6728 MASK_CLD),
6730 IX86_ATTR_NO ("fancy-math-387",
6731 OPT_mfancy_math_387,
6732 MASK_NO_FANCY_MATH_387),
6734 IX86_ATTR_YES ("ieee-fp",
6735 OPT_mieee_fp,
6736 MASK_IEEE_FP),
6738 IX86_ATTR_YES ("inline-all-stringops",
6739 OPT_minline_all_stringops,
6740 MASK_INLINE_ALL_STRINGOPS),
6742 IX86_ATTR_YES ("inline-stringops-dynamically",
6743 OPT_minline_stringops_dynamically,
6744 MASK_INLINE_STRINGOPS_DYNAMICALLY),
6746 IX86_ATTR_NO ("align-stringops",
6747 OPT_mno_align_stringops,
6748 MASK_NO_ALIGN_STRINGOPS),
6750 IX86_ATTR_YES ("recip",
6751 OPT_mrecip,
6752 MASK_RECIP),
6756 /* If this is a list, recurse to get the options. */
6757 if (TREE_CODE (args) == TREE_LIST)
6759 bool ret = true;
6761 for (; args; args = TREE_CHAIN (args))
6762 if (TREE_VALUE (args)
6763 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
6764 p_strings, opts, opts_set,
6765 enum_opts_set))
6766 ret = false;
6768 return ret;
6771 else if (TREE_CODE (args) != STRING_CST)
6773 error ("attribute %<target%> argument not a string");
6774 return false;
6777 /* Handle multiple arguments separated by commas. */
6778 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
6780 while (next_optstr && *next_optstr != '\0')
6782 char *p = next_optstr;
6783 char *orig_p = p;
6784 char *comma = strchr (next_optstr, ',');
6785 const char *opt_string;
6786 size_t len, opt_len;
6787 int opt;
6788 bool opt_set_p;
6789 char ch;
6790 unsigned i;
6791 enum ix86_opt_type type = ix86_opt_unknown;
6792 int mask = 0;
6794 if (comma)
6796 *comma = '\0';
6797 len = comma - next_optstr;
6798 next_optstr = comma + 1;
6800 else
6802 len = strlen (p);
6803 next_optstr = NULL;
6806 /* Recognize no-xxx. */
6807 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
6809 opt_set_p = false;
6810 p += 3;
6811 len -= 3;
6813 else
6814 opt_set_p = true;
6816 /* Find the option. */
6817 ch = *p;
6818 opt = N_OPTS;
6819 for (i = 0; i < ARRAY_SIZE (attrs); i++)
6821 type = attrs[i].type;
6822 opt_len = attrs[i].len;
6823 if (ch == attrs[i].string[0]
6824 && ((type != ix86_opt_str && type != ix86_opt_enum)
6825 ? len == opt_len
6826 : len > opt_len)
6827 && memcmp (p, attrs[i].string, opt_len) == 0)
6829 opt = attrs[i].opt;
6830 mask = attrs[i].mask;
6831 opt_string = attrs[i].string;
6832 break;
6836 /* Process the option. */
6837 if (opt == N_OPTS)
6839 error ("attribute(target(\"%s\")) is unknown", orig_p);
6840 ret = false;
6843 else if (type == ix86_opt_isa)
6845 struct cl_decoded_option decoded;
6847 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
6848 ix86_handle_option (opts, opts_set,
6849 &decoded, input_location);
6852 else if (type == ix86_opt_yes || type == ix86_opt_no)
6854 if (type == ix86_opt_no)
6855 opt_set_p = !opt_set_p;
6857 if (opt_set_p)
6858 opts->x_target_flags |= mask;
6859 else
6860 opts->x_target_flags &= ~mask;
6863 else if (type == ix86_opt_str)
6865 if (p_strings[opt])
6867 error ("option(\"%s\") was already specified", opt_string);
6868 ret = false;
6870 else
6871 p_strings[opt] = xstrdup (p + opt_len);
6874 else if (type == ix86_opt_enum)
6876 bool arg_ok;
6877 int value;
6879 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
6880 if (arg_ok)
6881 set_option (opts, enum_opts_set, opt, value,
6882 p + opt_len, DK_UNSPECIFIED, input_location,
6883 global_dc);
6884 else
6886 error ("attribute(target(\"%s\")) is unknown", orig_p);
6887 ret = false;
6891 else
6892 gcc_unreachable ();
6895 return ret;
6898 /* Release allocated strings. */
6899 static void
6900 release_options_strings (char **option_strings)
6902 /* Free up memory allocated to hold the strings */
6903 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
6904 free (option_strings[i]);
6907 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
6909 tree
6910 ix86_valid_target_attribute_tree (tree args,
6911 struct gcc_options *opts,
6912 struct gcc_options *opts_set)
6914 const char *orig_arch_string = opts->x_ix86_arch_string;
6915 const char *orig_tune_string = opts->x_ix86_tune_string;
6916 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
6917 int orig_tune_defaulted = ix86_tune_defaulted;
6918 int orig_arch_specified = ix86_arch_specified;
6919 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
6920 tree t = NULL_TREE;
6921 struct cl_target_option *def
6922 = TREE_TARGET_OPTION (target_option_default_node);
6923 struct gcc_options enum_opts_set;
6925 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
6927 /* Process each of the options on the chain. */
6928 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
6929 opts_set, &enum_opts_set))
6930 return error_mark_node;
6932 /* If the changed options are different from the default, rerun
6933 ix86_option_override_internal, and then save the options away.
6934 The string options are attribute options, and will be undone
6935 when we copy the save structure. */
6936 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
6937 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
6938 || opts->x_target_flags != def->x_target_flags
6939 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
6940 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
6941 || enum_opts_set.x_ix86_fpmath)
6943 /* If we are using the default tune= or arch=, undo the string assigned,
6944 and use the default. */
6945 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
6947 opts->x_ix86_arch_string
6948 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
6950 /* If arch= is set, clear all bits in x_ix86_isa_flags,
6951 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
6952 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
6953 | OPTION_MASK_ABI_64
6954 | OPTION_MASK_ABI_X32
6955 | OPTION_MASK_CODE16);
6956 opts->x_ix86_isa_flags2 = 0;
6958 else if (!orig_arch_specified)
6959 opts->x_ix86_arch_string = NULL;
6961 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
6962 opts->x_ix86_tune_string
6963 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
6964 else if (orig_tune_defaulted)
6965 opts->x_ix86_tune_string = NULL;
6967 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
6968 if (enum_opts_set.x_ix86_fpmath)
6969 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6970 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6971 && TARGET_SSE_P (opts->x_ix86_isa_flags))
6973 if (TARGET_80387_P (opts->x_target_flags))
6974 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE
6975 | FPMATH_387);
6976 else
6977 opts->x_ix86_fpmath = (enum fpmath_unit) FPMATH_SSE;
6978 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6981 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
6982 bool r = ix86_option_override_internal (false, opts, opts_set);
6983 if (!r)
6985 release_options_strings (option_strings);
6986 return error_mark_node;
6989 /* Add any builtin functions with the new isa if any. */
6990 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
6992 /* Save the current options unless we are validating options for
6993 #pragma. */
6994 t = build_target_option_node (opts);
6996 opts->x_ix86_arch_string = orig_arch_string;
6997 opts->x_ix86_tune_string = orig_tune_string;
6998 opts_set->x_ix86_fpmath = orig_fpmath_set;
7000 release_options_strings (option_strings);
7003 return t;
7006 /* Hook to validate attribute((target("string"))). */
7008 static bool
7009 ix86_valid_target_attribute_p (tree fndecl,
7010 tree ARG_UNUSED (name),
7011 tree args,
7012 int ARG_UNUSED (flags))
7014 struct gcc_options func_options;
7015 tree new_target, new_optimize;
7016 bool ret = true;
7018 /* attribute((target("default"))) does nothing, beyond
7019 affecting multi-versioning. */
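/* For instance, in a multi-versioned pair such as
     __attribute__((target ("default"))) int foo (void);
     __attribute__((target ("avx2")))    int foo (void);
   the "default" version simply keeps the command-line options; selecting
   between the versions is handled elsewhere by the multi-versioning
   machinery.  */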
7020 if (TREE_VALUE (args)
7021 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
7022 && TREE_CHAIN (args) == NULL_TREE
7023 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
7024 return true;
7026 tree old_optimize = build_optimization_node (&global_options);
7028 /* Get the optimization options of the current function. */
7029 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
7031 if (!func_optimize)
7032 func_optimize = old_optimize;
7034 /* Init func_options. */
7035 memset (&func_options, 0, sizeof (func_options));
7036 init_options_struct (&func_options, NULL);
7037 lang_hooks.init_options_struct (&func_options);
7039 cl_optimization_restore (&func_options,
7040 TREE_OPTIMIZATION (func_optimize));
7042 /* Initialize func_options to the default before its target options can
7043 be set. */
7044 cl_target_option_restore (&func_options,
7045 TREE_TARGET_OPTION (target_option_default_node));
7047 new_target = ix86_valid_target_attribute_tree (args, &func_options,
7048 &global_options_set);
7050 new_optimize = build_optimization_node (&func_options);
7052 if (new_target == error_mark_node)
7053 ret = false;
7055 else if (fndecl && new_target)
7057 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
7059 if (old_optimize != new_optimize)
7060 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
7063 finalize_options_struct (&func_options);
7065 return ret;
7069 /* Hook to determine if one function can safely inline another. */
7071 static bool
7072 ix86_can_inline_p (tree caller, tree callee)
7074 bool ret = false;
7075 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
7076 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
7078 /* If callee has no option attributes, then it is ok to inline. */
7079 if (!callee_tree)
7080 ret = true;
7082 /* If caller has no option attributes, but callee does then it is not ok to
7083 inline. */
7084 else if (!caller_tree)
7085 ret = false;
7087 else
7089 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
7090 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
7092 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
7093 function can inline an SSE2 function but an SSE2 function can't inline
7094 an SSE4 function. */
7095 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
7096 != callee_opts->x_ix86_isa_flags)
7097 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
7098 != callee_opts->x_ix86_isa_flags2))
7099 ret = false;
7101 /* See if we have the same non-isa options. */
7102 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
7103 ret = false;
7105 /* See if arch, tune, etc. are the same. */
7106 else if (caller_opts->arch != callee_opts->arch)
7107 ret = false;
7109 else if (caller_opts->tune != callee_opts->tune)
7110 ret = false;
7112 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
7113 ret = false;
7115 else if (caller_opts->branch_cost != callee_opts->branch_cost)
7116 ret = false;
7118 else
7119 ret = true;
7122 return ret;
7126 /* Remember the last target of ix86_set_current_function. */
7127 static GTY(()) tree ix86_previous_fndecl;
7129 /* Set target globals to the default (or current #pragma GCC target
7130 if active). Invalidate the ix86_previous_fndecl cache. */
7132 void
7133 ix86_reset_previous_fndecl (void)
7135 tree new_tree = target_option_current_node;
7136 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7137 if (TREE_TARGET_GLOBALS (new_tree))
7138 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7139 else if (new_tree == target_option_default_node)
7140 restore_target_globals (&default_target_globals);
7141 else
7142 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7143 ix86_previous_fndecl = NULL_TREE;
7146 /* Set the func_type field from the function FNDECL. */
7148 static void
7149 ix86_set_func_type (tree fndecl)
7151 if (cfun->machine->func_type == TYPE_UNKNOWN)
7153 if (lookup_attribute ("interrupt",
7154 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7156 int nargs = 0;
7157 for (tree arg = DECL_ARGUMENTS (fndecl);
7158 arg;
7159 arg = TREE_CHAIN (arg))
7160 nargs++;
7161 cfun->machine->no_caller_saved_registers = true;
7162 cfun->machine->func_type
7163 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
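/* A single pointer argument means a plain interrupt handler, e.g.
     void handler (struct interrupt_frame *frame);
   two arguments mean an exception handler that also receives an error
   code, e.g.
     void handler (struct interrupt_frame *frame, uword_t error);  */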
7165 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7167 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7168 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7169 sorry ("Only DWARF debug format is supported for interrupt "
7170 "service routine.");
7172 else
7174 cfun->machine->func_type = TYPE_NORMAL;
7175 if (lookup_attribute ("no_caller_saved_registers",
7176 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7177 cfun->machine->no_caller_saved_registers = true;
7182 /* Establish appropriate back-end context for processing the function
7183 FNDECL. The argument might be NULL to indicate processing at top
7184 level, outside of any function scope. */
7185 static void
7186 ix86_set_current_function (tree fndecl)
7188 /* Only change the context if the function changes. This hook is called
7189 several times in the course of compiling a function, and we don't want to
7190 slow things down too much or call target_reinit when it isn't safe. */
7191 if (fndecl == ix86_previous_fndecl)
7193 /* There may be 2 function bodies for the same function FNDECL,
7194 one is extern inline and one isn't. Call ix86_set_func_type
7195 to set the func_type field. */
7196 if (fndecl != NULL_TREE)
7197 ix86_set_func_type (fndecl);
7198 return;
7201 tree old_tree;
7202 if (ix86_previous_fndecl == NULL_TREE)
7203 old_tree = target_option_current_node;
7204 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7205 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7206 else
7207 old_tree = target_option_default_node;
7209 if (fndecl == NULL_TREE)
7211 if (old_tree != target_option_current_node)
7212 ix86_reset_previous_fndecl ();
7213 return;
7216 ix86_set_func_type (fndecl);
7218 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7219 if (new_tree == NULL_TREE)
7220 new_tree = target_option_default_node;
7222 if (old_tree != new_tree)
7224 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7225 if (TREE_TARGET_GLOBALS (new_tree))
7226 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7227 else if (new_tree == target_option_default_node)
7228 restore_target_globals (&default_target_globals);
7229 else
7230 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7232 ix86_previous_fndecl = fndecl;
7234 static bool prev_no_caller_saved_registers;
7236 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7237 Avoid expensive re-initialization of init_regs each time we switch
7238 function context. */
7239 if (TARGET_64BIT
7240 && (call_used_regs[SI_REG]
7241 == (cfun->machine->call_abi == MS_ABI)))
7242 reinit_regs ();
7243 /* Need to re-initialize init_regs if caller-saved registers are
7244 changed. */
7245 else if (prev_no_caller_saved_registers
7246 != cfun->machine->no_caller_saved_registers)
7247 reinit_regs ();
7249 if (cfun->machine->func_type != TYPE_NORMAL
7250 || cfun->machine->no_caller_saved_registers)
7252 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
7253 may change processor state. */
7254 const char *isa;
7255 if (TARGET_MPX)
7256 isa = "MPX";
7257 else if (TARGET_SSE)
7258 isa = "SSE";
7259 else if (TARGET_MMX)
7260 isa = "MMX/3Dnow";
7261 else if (TARGET_80387)
7262 isa = "80387";
7263 else
7264 isa = NULL;
7265 if (isa != NULL)
7267 if (cfun->machine->func_type != TYPE_NORMAL)
7268 sorry ("%s instructions aren't allowed in %s service routine",
7269 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7270 ? "exception" : "interrupt"));
7271 else
7272 sorry ("%s instructions aren't allowed in function with "
7273 "no_caller_saved_registers attribute", isa);
7274 /* Don't issue the same error twice. */
7275 cfun->machine->func_type = TYPE_NORMAL;
7276 cfun->machine->no_caller_saved_registers = false;
7280 prev_no_caller_saved_registers
7281 = cfun->machine->no_caller_saved_registers;
7285 /* Return true if this goes in large data/bss. */
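/* For example, with -mcmodel=medium a global object larger than the
   -mlarge-data-threshold= value (ix86_section_threshold, 65536 by default)
   is placed in .ldata/.lbss instead of .data/.bss.  */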
7287 static bool
7288 ix86_in_large_data_p (tree exp)
7290 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7291 return false;
7293 if (exp == NULL_TREE)
7294 return false;
7296 /* Functions are never large data. */
7297 if (TREE_CODE (exp) == FUNCTION_DECL)
7298 return false;
7300 /* Automatic variables are never large data. */
7301 if (VAR_P (exp) && !is_global_var (exp))
7302 return false;
7304 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
7306 const char *section = DECL_SECTION_NAME (exp);
7307 if (strcmp (section, ".ldata") == 0
7308 || strcmp (section, ".lbss") == 0)
7309 return true;
7310 return false;
7312 else
7314 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7316 /* If this is an incomplete type with size 0, then we can't put it
7317 in data because it might be too big when completed. Also,
7318 int_size_in_bytes returns -1 if the size can vary or is larger than
7319 an integer, in which case it is also safer to assume that it goes in
7320 large data. */
7321 if (size <= 0 || size > ix86_section_threshold)
7322 return true;
7325 return false;
7328 /* i386-specific section flag to mark large sections. */
7329 #define SECTION_LARGE SECTION_MACH_DEP
7331 /* Switch to the appropriate section for output of DECL.
7332 DECL is either a `VAR_DECL' node or a constant of some sort.
7333 RELOC indicates whether forming the initial value of DECL requires
7334 link-time relocations. */
7336 ATTRIBUTE_UNUSED static section *
7337 x86_64_elf_select_section (tree decl, int reloc,
7338 unsigned HOST_WIDE_INT align)
7340 if (ix86_in_large_data_p (decl))
7342 const char *sname = NULL;
7343 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7344 switch (categorize_decl_for_section (decl, reloc))
7346 case SECCAT_DATA:
7347 sname = ".ldata";
7348 break;
7349 case SECCAT_DATA_REL:
7350 sname = ".ldata.rel";
7351 break;
7352 case SECCAT_DATA_REL_LOCAL:
7353 sname = ".ldata.rel.local";
7354 break;
7355 case SECCAT_DATA_REL_RO:
7356 sname = ".ldata.rel.ro";
7357 break;
7358 case SECCAT_DATA_REL_RO_LOCAL:
7359 sname = ".ldata.rel.ro.local";
7360 break;
7361 case SECCAT_BSS:
7362 sname = ".lbss";
7363 flags |= SECTION_BSS;
7364 break;
7365 case SECCAT_RODATA:
7366 case SECCAT_RODATA_MERGE_STR:
7367 case SECCAT_RODATA_MERGE_STR_INIT:
7368 case SECCAT_RODATA_MERGE_CONST:
7369 sname = ".lrodata";
7370 flags &= ~SECTION_WRITE;
7371 break;
7372 case SECCAT_SRODATA:
7373 case SECCAT_SDATA:
7374 case SECCAT_SBSS:
7375 gcc_unreachable ();
7376 case SECCAT_TEXT:
7377 case SECCAT_TDATA:
7378 case SECCAT_TBSS:
7379 /* We don't split these for the medium model. Place them into
7380 default sections and hope for the best. */
7381 break;
7383 if (sname)
7385 /* We might get called with string constants, but get_named_section
7386 doesn't like them as they are not DECLs. Also, we need to set
7387 flags in that case. */
7388 if (!DECL_P (decl))
7389 return get_section (sname, flags, NULL);
7390 return get_named_section (decl, sname, reloc);
7393 return default_elf_select_section (decl, reloc, align);
7396 /* Select a set of attributes for section NAME based on the properties
7397 of DECL and whether or not RELOC indicates that DECL's initializer
7398 might contain runtime relocations. */
7400 static unsigned int ATTRIBUTE_UNUSED
7401 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7403 unsigned int flags = default_section_type_flags (decl, name, reloc);
7405 if (ix86_in_large_data_p (decl))
7406 flags |= SECTION_LARGE;
7408 if (decl == NULL_TREE
7409 && (strcmp (name, ".ldata.rel.ro") == 0
7410 || strcmp (name, ".ldata.rel.ro.local") == 0))
7411 flags |= SECTION_RELRO;
7413 if (strcmp (name, ".lbss") == 0
7414 || strncmp (name, ".lbss.", 6) == 0
7415 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
7416 flags |= SECTION_BSS;
7418 return flags;
7421 /* Build up a unique section name, expressed as a
7422 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7423 RELOC indicates whether the initial value of DECL requires
7424 link-time relocations. */
7426 static void ATTRIBUTE_UNUSED
7427 x86_64_elf_unique_section (tree decl, int reloc)
7429 if (ix86_in_large_data_p (decl))
7431 const char *prefix = NULL;
7432 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7433 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7435 switch (categorize_decl_for_section (decl, reloc))
7437 case SECCAT_DATA:
7438 case SECCAT_DATA_REL:
7439 case SECCAT_DATA_REL_LOCAL:
7440 case SECCAT_DATA_REL_RO:
7441 case SECCAT_DATA_REL_RO_LOCAL:
7442 prefix = one_only ? ".ld" : ".ldata";
7443 break;
7444 case SECCAT_BSS:
7445 prefix = one_only ? ".lb" : ".lbss";
7446 break;
7447 case SECCAT_RODATA:
7448 case SECCAT_RODATA_MERGE_STR:
7449 case SECCAT_RODATA_MERGE_STR_INIT:
7450 case SECCAT_RODATA_MERGE_CONST:
7451 prefix = one_only ? ".lr" : ".lrodata";
7452 break;
7453 case SECCAT_SRODATA:
7454 case SECCAT_SDATA:
7455 case SECCAT_SBSS:
7456 gcc_unreachable ();
7457 case SECCAT_TEXT:
7458 case SECCAT_TDATA:
7459 case SECCAT_TBSS:
7460 /* We don't split these for the medium model. Place them into
7461 default sections and hope for the best. */
7462 break;
7464 if (prefix)
7466 const char *name, *linkonce;
7467 char *string;
7469 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7470 name = targetm.strip_name_encoding (name);
7472 /* If we're using one_only, then there needs to be a .gnu.linkonce
7473 prefix to the section name. */
7474 linkonce = one_only ? ".gnu.linkonce" : "";
7476 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7478 set_decl_section_name (decl, string);
7479 return;
7482 default_unique_section (decl, reloc);
7485 #ifdef COMMON_ASM_OP
7487 #ifndef LARGECOMM_SECTION_ASM_OP
7488 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7489 #endif
7491 /* This says how to output assembler code to declare an
7492 uninitialized external linkage data object.
7494 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive for
7495 large objects. */
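/* For instance, a 1 MiB uninitialized object under -mcmodel=medium is
   emitted roughly as
	.largecomm	buf,1048576,32
   whereas small objects keep using the ordinary .comm directive.  */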
7496 void
7497 x86_elf_aligned_decl_common (FILE *file, tree decl,
7498 const char *name, unsigned HOST_WIDE_INT size,
7499 int align)
7501 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7502 && size > (unsigned int)ix86_section_threshold)
7504 switch_to_section (get_named_section (decl, ".lbss", 0));
7505 fputs (LARGECOMM_SECTION_ASM_OP, file);
7507 else
7508 fputs (COMMON_ASM_OP, file);
7509 assemble_name (file, name);
7510 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7511 size, align / BITS_PER_UNIT);
7513 #endif
7515 /* Utility function for targets to use in implementing
7516 ASM_OUTPUT_ALIGNED_BSS. */
7518 void
7519 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7520 unsigned HOST_WIDE_INT size, int align)
7522 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7523 && size > (unsigned int)ix86_section_threshold)
7524 switch_to_section (get_named_section (decl, ".lbss", 0));
7525 else
7526 switch_to_section (bss_section);
7527 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7528 #ifdef ASM_DECLARE_OBJECT_NAME
7529 last_assemble_variable_decl = decl;
7530 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7531 #else
7532 /* Standard thing is just output label for the object. */
7533 ASM_OUTPUT_LABEL (file, name);
7534 #endif /* ASM_DECLARE_OBJECT_NAME */
7535 ASM_OUTPUT_SKIP (file, size ? size : 1);
7538 /* Decide whether we must probe the stack before any space allocation
7539 on this target. It's essentially TARGET_STACK_PROBE except when
7540 -fstack-check causes the stack to be already probed differently. */
7542 bool
7543 ix86_target_stack_probe (void)
7545 /* Do not probe the stack twice if static stack checking is enabled. */
7546 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7547 return false;
7549 return TARGET_STACK_PROBE;
7552 /* Decide whether we can make a sibling call to a function. DECL is the
7553 declaration of the function being targeted by the call and EXP is the
7554 CALL_EXPR representing the call. */
7556 static bool
7557 ix86_function_ok_for_sibcall (tree decl, tree exp)
7559 tree type, decl_or_type;
7560 rtx a, b;
7561 bool bind_global = decl && !targetm.binds_local_p (decl);
7563 /* Sibling call isn't OK if there are no caller-saved registers
7564 since all registers must be preserved before return. */
7565 if (cfun->machine->no_caller_saved_registers)
7566 return false;
7568 /* If we are generating position-independent code, we cannot sibcall
7569 optimize direct calls to global functions, as the PLT requires
7570 %ebx be live. (Darwin does not have a PLT.) */
7571 if (!TARGET_MACHO
7572 && !TARGET_64BIT
7573 && flag_pic
7574 && flag_plt
7575 && bind_global)
7576 return false;
7578 /* If we need to align the outgoing stack, then sibcalling would
7579 unalign the stack, which may break the called function. */
7580 if (ix86_minimum_incoming_stack_boundary (true)
7581 < PREFERRED_STACK_BOUNDARY)
7582 return false;
7584 if (decl)
7586 decl_or_type = decl;
7587 type = TREE_TYPE (decl);
7589 else
7591 /* We're looking at the CALL_EXPR, we need the type of the function. */
7592 type = CALL_EXPR_FN (exp); /* pointer expression */
7593 type = TREE_TYPE (type); /* pointer type */
7594 type = TREE_TYPE (type); /* function type */
7595 decl_or_type = type;
7598 /* Check that the return value locations are the same. Like
7599 if we are returning floats on the 80387 register stack, we cannot
7600 make a sibcall from a function that doesn't return a float to a
7601 function that does or, conversely, from a function that does return
7602 a float to a function that doesn't; the necessary stack adjustment
7603 would not be executed. This is also the place we notice
7604 differences in the return value ABI. Note that it is ok for one
7605 of the functions to have void return type as long as the return
7606 value of the other is passed in a register. */
7607 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
7608 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
7609 cfun->decl, false);
7610 if (STACK_REG_P (a) || STACK_REG_P (b))
7612 if (!rtx_equal_p (a, b))
7613 return false;
7615 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
7617 else if (!rtx_equal_p (a, b))
7618 return false;
7620 if (TARGET_64BIT)
7622 /* The SYSV ABI has more call-clobbered registers;
7623 disallow sibcalls from MS to SYSV. */
7624 if (cfun->machine->call_abi == MS_ABI
7625 && ix86_function_type_abi (type) == SYSV_ABI)
7626 return false;
7628 else
7630 /* If this call is indirect, we'll need to be able to use a
7631 call-clobbered register for the address of the target function.
7632 Make sure that all such registers are not used for passing
7633 parameters. Note that DLLIMPORT functions and calls to global
7634 functions via the GOT slot are indirect. */
7635 if (!decl
7636 || (bind_global && flag_pic && !flag_plt)
7637 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
7639 /* Check if regparm >= 3 since arg_reg_available is set to
7640 false if regparm == 0. If regparm is 1 or 2, there is
7641 always a call-clobbered register available.
7643 ??? The symbol indirect call doesn't need a call-clobbered
7644 register. But we don't know if this is a symbol indirect
7645 call or not here. */
7646 if (ix86_function_regparm (type, NULL) >= 3
7647 && !cfun->machine->arg_reg_available)
7648 return false;
7652 /* Otherwise okay. That also includes certain types of indirect calls. */
7653 return true;
7656 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
7657 and "sseregparm" calling convention attributes;
7658 arguments as in struct attribute_spec.handler. */
7660 static tree
7661 ix86_handle_cconv_attribute (tree *node, tree name,
7662 tree args,
7663 int,
7664 bool *no_add_attrs)
7666 if (TREE_CODE (*node) != FUNCTION_TYPE
7667 && TREE_CODE (*node) != METHOD_TYPE
7668 && TREE_CODE (*node) != FIELD_DECL
7669 && TREE_CODE (*node) != TYPE_DECL)
7671 warning (OPT_Wattributes, "%qE attribute only applies to functions",
7672 name);
7673 *no_add_attrs = true;
7674 return NULL_TREE;
7677 /* Can combine regparm with all attributes but fastcall, and thiscall. */
7678 if (is_attribute_p ("regparm", name))
7680 tree cst;
7682 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7684 error ("fastcall and regparm attributes are not compatible");
7687 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7689 error ("regparam and thiscall attributes are not compatible");
7692 cst = TREE_VALUE (args);
7693 if (TREE_CODE (cst) != INTEGER_CST)
7695 warning (OPT_Wattributes,
7696 "%qE attribute requires an integer constant argument",
7697 name);
7698 *no_add_attrs = true;
7700 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
7702 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
7703 name, REGPARM_MAX);
7704 *no_add_attrs = true;
7707 return NULL_TREE;
7710 if (TARGET_64BIT)
7712 /* Do not warn when emulating the MS ABI. */
7713 if ((TREE_CODE (*node) != FUNCTION_TYPE
7714 && TREE_CODE (*node) != METHOD_TYPE)
7715 || ix86_function_type_abi (*node) != MS_ABI)
7716 warning (OPT_Wattributes, "%qE attribute ignored",
7717 name);
7718 *no_add_attrs = true;
7719 return NULL_TREE;
7722 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
7723 if (is_attribute_p ("fastcall", name))
7725 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7727 error ("fastcall and cdecl attributes are not compatible");
7729 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7731 error ("fastcall and stdcall attributes are not compatible");
7733 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
7735 error ("fastcall and regparm attributes are not compatible");
7737 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7739 error ("fastcall and thiscall attributes are not compatible");
7743 /* Can combine stdcall with fastcall (redundant), regparm and
7744 sseregparm. */
7745 else if (is_attribute_p ("stdcall", name))
7747 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7749 error ("stdcall and cdecl attributes are not compatible");
7751 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7753 error ("stdcall and fastcall attributes are not compatible");
7755 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7757 error ("stdcall and thiscall attributes are not compatible");
7761 /* Can combine cdecl with regparm and sseregparm. */
7762 else if (is_attribute_p ("cdecl", name))
7764 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7766 error ("stdcall and cdecl attributes are not compatible");
7768 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7770 error ("fastcall and cdecl attributes are not compatible");
7772 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7774 error ("cdecl and thiscall attributes are not compatible");
7777 else if (is_attribute_p ("thiscall", name))
7779 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
7780 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
7781 name);
7782 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7784 error ("stdcall and thiscall attributes are not compatible");
7786 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7788 error ("fastcall and thiscall attributes are not compatible");
7790 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7792 error ("cdecl and thiscall attributes are not compatible");
7796 /* Can combine sseregparm with all attributes. */
7798 return NULL_TREE;
7801 /* The transactional memory builtins are implicitly regparm or fastcall
7802 depending on the ABI. Override the generic do-nothing attribute that
7803 these builtins were declared with, and replace it with one of the two
7804 attributes that we expect elsewhere. */
7806 static tree
7807 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
7808 int flags, bool *no_add_attrs)
7810 tree alt;
7812 /* In no case do we want to add the placeholder attribute. */
7813 *no_add_attrs = true;
7815 /* The 64-bit ABI is unchanged for transactional memory. */
7816 if (TARGET_64BIT)
7817 return NULL_TREE;
7819 /* ??? Is there a better way to validate 32-bit windows? We have
7820 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
7821 if (CHECK_STACK_LIMIT > 0)
7822 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
7823 else
7825 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
7826 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
7828 decl_attributes (node, alt, flags);
7830 return NULL_TREE;
7833 /* This function determines the calling convention from TYPE. */
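/* For example, on 32-bit a declaration such as
     int __attribute__((fastcall)) f (int, int);
   yields IX86_CALLCVT_FASTCALL, while under -mrtd a non-variadic function
   with no convention attribute is treated as IX86_CALLCVT_STDCALL.  */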
7835 unsigned int
7836 ix86_get_callcvt (const_tree type)
7838 unsigned int ret = 0;
7839 bool is_stdarg;
7840 tree attrs;
7842 if (TARGET_64BIT)
7843 return IX86_CALLCVT_CDECL;
7845 attrs = TYPE_ATTRIBUTES (type);
7846 if (attrs != NULL_TREE)
7848 if (lookup_attribute ("cdecl", attrs))
7849 ret |= IX86_CALLCVT_CDECL;
7850 else if (lookup_attribute ("stdcall", attrs))
7851 ret |= IX86_CALLCVT_STDCALL;
7852 else if (lookup_attribute ("fastcall", attrs))
7853 ret |= IX86_CALLCVT_FASTCALL;
7854 else if (lookup_attribute ("thiscall", attrs))
7855 ret |= IX86_CALLCVT_THISCALL;
7857 /* Regparm isn't allowed for thiscall and fastcall. */
7858 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
7860 if (lookup_attribute ("regparm", attrs))
7861 ret |= IX86_CALLCVT_REGPARM;
7862 if (lookup_attribute ("sseregparm", attrs))
7863 ret |= IX86_CALLCVT_SSEREGPARM;
7866 if (IX86_BASE_CALLCVT(ret) != 0)
7867 return ret;
7870 is_stdarg = stdarg_p (type);
7871 if (TARGET_RTD && !is_stdarg)
7872 return IX86_CALLCVT_STDCALL | ret;
7874 if (ret != 0
7875 || is_stdarg
7876 || TREE_CODE (type) != METHOD_TYPE
7877 || ix86_function_type_abi (type) != MS_ABI)
7878 return IX86_CALLCVT_CDECL | ret;
7880 return IX86_CALLCVT_THISCALL;
7883 /* Return 0 if the attributes for two types are incompatible, 1 if they
7884 are compatible, and 2 if they are nearly compatible (which causes a
7885 warning to be generated). */
7887 static int
7888 ix86_comp_type_attributes (const_tree type1, const_tree type2)
7890 unsigned int ccvt1, ccvt2;
7892 if (TREE_CODE (type1) != FUNCTION_TYPE
7893 && TREE_CODE (type1) != METHOD_TYPE)
7894 return 1;
7896 ccvt1 = ix86_get_callcvt (type1);
7897 ccvt2 = ix86_get_callcvt (type2);
7898 if (ccvt1 != ccvt2)
7899 return 0;
7900 if (ix86_function_regparm (type1, NULL)
7901 != ix86_function_regparm (type2, NULL))
7902 return 0;
7904 return 1;
7907 /* Return the regparm value for a function with the indicated TYPE and DECL.
7908 DECL may be NULL when calling function indirectly
7909 or considering a libcall. */
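/* For example, __attribute__((regparm (3))) makes the first three integer
   arguments arrive in %eax, %edx and %ecx rather than on the stack.  */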
7911 static int
7912 ix86_function_regparm (const_tree type, const_tree decl)
7914 tree attr;
7915 int regparm;
7916 unsigned int ccvt;
7918 if (TARGET_64BIT)
7919 return (ix86_function_type_abi (type) == SYSV_ABI
7920 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
7921 ccvt = ix86_get_callcvt (type);
7922 regparm = ix86_regparm;
7924 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
7926 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
7927 if (attr)
7929 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
7930 return regparm;
7933 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7934 return 2;
7935 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7936 return 1;
7938 /* Use register calling convention for local functions when possible. */
7939 if (decl
7940 && TREE_CODE (decl) == FUNCTION_DECL)
7942 cgraph_node *target = cgraph_node::get (decl);
7943 if (target)
7944 target = target->function_symbol ();
7946 /* Caller and callee must agree on the calling convention, so
7947 checking just `optimize' here would mean that with
7948 __attribute__((optimize (...))) the caller could use the regparm
7949 convention while the callee does not, or vice versa. Instead look at
7950 whether the callee is optimized or not. */
7951 if (target && opt_for_fn (target->decl, optimize)
7952 && !(profile_flag && !flag_fentry))
7954 cgraph_local_info *i = &target->local;
7955 if (i && i->local && i->can_change_signature)
7957 int local_regparm, globals = 0, regno;
7959 /* Make sure no regparm register is taken by a
7960 fixed register variable. */
7961 for (local_regparm = 0; local_regparm < REGPARM_MAX;
7962 local_regparm++)
7963 if (fixed_regs[local_regparm])
7964 break;
7966 /* We don't want to use regparm(3) for nested functions as
7967 these use a static chain pointer in the third argument. */
7968 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
7969 local_regparm = 2;
7971 /* Save a register for the split stack. */
7972 if (local_regparm == 3 && flag_split_stack)
7973 local_regparm = 2;
7975 /* Each fixed register usage increases register pressure,
7976 so fewer registers should be used for argument passing.
7977 This functionality can be overridden by an explicit
7978 regparm value. */
7979 for (regno = AX_REG; regno <= DI_REG; regno++)
7980 if (fixed_regs[regno])
7981 globals++;
7983 local_regparm
7984 = globals < local_regparm ? local_regparm - globals : 0;
7986 if (local_regparm > regparm)
7987 regparm = local_regparm;
7992 return regparm;
7995 /* Return 1 or 2 if we can pass up to SSE_REGPARM_MAX SFmode (1) and
7996 DFmode (2) arguments in SSE registers for a function with the
7997 indicated TYPE and DECL. DECL may be NULL when calling the function
7998 indirectly or considering a libcall. Return -1 if any FP parameter
7999 should be rejected with an error; this is used in situations where we
8000 imply the SSE calling convention but the function is called from another
8001 function with SSE disabled. Otherwise return 0. */
8003 static int
8004 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
8006 gcc_assert (!TARGET_64BIT);
8008 /* Use SSE registers to pass SFmode and DFmode arguments if requested
8009 by the sseregparm attribute. */
8010 if (TARGET_SSEREGPARM
8011 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
8013 if (!TARGET_SSE)
8015 if (warn)
8017 if (decl)
8018 error ("calling %qD with attribute sseregparm without "
8019 "SSE/SSE2 enabled", decl);
8020 else
8021 error ("calling %qT with attribute sseregparm without "
8022 "SSE/SSE2 enabled", type);
8024 return 0;
8027 return 2;
8030 if (!decl)
8031 return 0;
8033 cgraph_node *target = cgraph_node::get (decl);
8034 if (target)
8035 target = target->function_symbol ();
8037 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
8038 (and DFmode for SSE2) arguments in SSE registers. */
8039 if (target
8040 /* TARGET_SSE_MATH */
8041 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
8042 && opt_for_fn (target->decl, optimize)
8043 && !(profile_flag && !flag_fentry))
8045 cgraph_local_info *i = &target->local;
8046 if (i && i->local && i->can_change_signature)
8048 /* Refuse to produce wrong code when a local function with SSE enabled
8049 is called from an SSE-disabled function.
8050 FIXME: We need a way to detect these cases across ltrans partitions
8051 and avoid using SSE calling conventions on local functions called
8052 from functions with SSE disabled. For now at least delay the
8053 warning until we know we are going to produce wrong code.
8054 See PR66047. */
8055 if (!TARGET_SSE && warn)
8056 return -1;
8057 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
8058 ->x_ix86_isa_flags) ? 2 : 1;
8062 return 0;
8065 /* Return true if EAX is live at the start of the function. Used by
8066 ix86_expand_prologue to determine if we need special help before
8067 calling allocate_stack_worker. */
8069 static bool
8070 ix86_eax_live_at_start_p (void)
8072 /* Cheat. Don't bother working forward from ix86_function_regparm
8073 to the function type to whether an actual argument is located in
8074 eax. Instead just look at cfg info, which is still close enough
8075 to correct at this point. This gives false positives for broken
8076 functions that might use uninitialized data that happens to be
8077 allocated in eax, but who cares? */
8078 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
8081 static bool
8082 ix86_keep_aggregate_return_pointer (tree fntype)
8084 tree attr;
8086 if (!TARGET_64BIT)
8088 attr = lookup_attribute ("callee_pop_aggregate_return",
8089 TYPE_ATTRIBUTES (fntype));
8090 if (attr)
8091 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
8093 /* For 32-bit MS-ABI the default is to keep aggregate
8094 return pointer. */
8095 if (ix86_function_type_abi (fntype) == MS_ABI)
8096 return true;
8098 return KEEP_AGGREGATE_RETURN_POINTER != 0;
8101 /* Value is the number of bytes of arguments automatically
8102 popped when returning from a subroutine call.
8103 FUNDECL is the declaration node of the function (as a tree),
8104 FUNTYPE is the data type of the function (as a tree),
8105 or for a library call it is an identifier node for the subroutine name.
8106 SIZE is the number of bytes of arguments passed on the stack.
8108 On the 80386, the RTD insn may be used to pop them if the number
8109 of args is fixed, but if the number is variable then the caller
8110 must pop them all. RTD can't be used for library calls now
8111 because the library is compiled with the Unix compiler.
8112 Use of RTD is a selectable option, since it is incompatible with
8113 standard Unix calling sequences. If the option is not selected,
8114 the caller must always pop the args.
8116 The attribute stdcall is equivalent to RTD on a per module basis. */
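/* For example, a 32-bit stdcall function taking two int arguments makes
   this hook return 8, so the callee pops its own arguments with "ret $8".  */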
8118 static int
8119 ix86_return_pops_args (tree fundecl, tree funtype, int size)
8121 unsigned int ccvt;
8123 /* None of the 64-bit ABIs pop arguments. */
8124 if (TARGET_64BIT)
8125 return 0;
8127 ccvt = ix86_get_callcvt (funtype);
8129 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
8130 | IX86_CALLCVT_THISCALL)) != 0
8131 && ! stdarg_p (funtype))
8132 return size;
8134 /* Lose any fake structure return argument if it is passed on the stack. */
8135 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
8136 && !ix86_keep_aggregate_return_pointer (funtype))
8138 int nregs = ix86_function_regparm (funtype, fundecl);
8139 if (nregs == 0)
8140 return GET_MODE_SIZE (Pmode);
8143 return 0;
8146 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
8148 static bool
8149 ix86_legitimate_combined_insn (rtx_insn *insn)
8151 int i;
8153 /* Check operand constraints in case hard registers were propagated
8154 into insn pattern. This check prevents combine pass from
8155 generating insn patterns with invalid hard register operands.
8156 These invalid insns can eventually confuse reload to error out
8157 with a spill failure. See also PRs 46829 and 46843. */
8159 gcc_assert (INSN_CODE (insn) >= 0);
8161 extract_insn (insn);
8162 preprocess_constraints (insn);
8164 int n_operands = recog_data.n_operands;
8165 int n_alternatives = recog_data.n_alternatives;
8166 for (i = 0; i < n_operands; i++)
8168 rtx op = recog_data.operand[i];
8169 machine_mode mode = GET_MODE (op);
8170 const operand_alternative *op_alt;
8171 int offset = 0;
8172 bool win;
8173 int j;
8175 /* A unary operator may be accepted by the predicate, but it
8176 is irrelevant for matching constraints. */
8177 if (UNARY_P (op))
8178 op = XEXP (op, 0);
8180 if (SUBREG_P (op))
8182 if (REG_P (SUBREG_REG (op))
8183 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8184 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8185 GET_MODE (SUBREG_REG (op)),
8186 SUBREG_BYTE (op),
8187 GET_MODE (op));
8188 op = SUBREG_REG (op);
8191 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8192 continue;
8194 op_alt = recog_op_alt;
8196 /* Operand has no constraints, anything is OK. */
8197 win = !n_alternatives;
8199 alternative_mask preferred = get_preferred_alternatives (insn);
8200 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8202 if (!TEST_BIT (preferred, j))
8203 continue;
8204 if (op_alt[i].anything_ok
8205 || (op_alt[i].matches != -1
8206 && operands_match_p
8207 (recog_data.operand[i],
8208 recog_data.operand[op_alt[i].matches]))
8209 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8211 win = true;
8212 break;
8216 if (!win)
8217 return false;
8220 return true;
8223 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
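/* AddressSanitizer maps an application address A to the shadow byte at
   (A >> 3) + offset; the offset returned below therefore differs between
   ILP32, LP64 and Mach-O LP64 targets.  */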
8225 static unsigned HOST_WIDE_INT
8226 ix86_asan_shadow_offset (void)
8228 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8229 : HOST_WIDE_INT_C (0x7fff8000))
8230 : (HOST_WIDE_INT_1 << 29);
8233 /* Argument support functions. */
8235 /* Return true when register may be used to pass function parameters. */
8236 bool
8237 ix86_function_arg_regno_p (int regno)
8239 int i;
8240 enum calling_abi call_abi;
8241 const int *parm_regs;
8243 if (TARGET_MPX && BND_REGNO_P (regno))
8244 return true;
8246 if (!TARGET_64BIT)
8248 if (TARGET_MACHO)
8249 return (regno < REGPARM_MAX
8250 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8251 else
8252 return (regno < REGPARM_MAX
8253 || (TARGET_MMX && MMX_REGNO_P (regno)
8254 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8255 || (TARGET_SSE && SSE_REGNO_P (regno)
8256 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8259 if (TARGET_SSE && SSE_REGNO_P (regno)
8260 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8261 return true;
8263 /* TODO: The function should depend on current function ABI but
8264 builtins.c would need updating then. Therefore we use the
8265 default ABI. */
8266 call_abi = ix86_cfun_abi ();
8268 /* RAX is used as hidden argument to va_arg functions. */
8269 if (call_abi == SYSV_ABI && regno == AX_REG)
8270 return true;
8272 if (call_abi == MS_ABI)
8273 parm_regs = x86_64_ms_abi_int_parameter_registers;
8274 else
8275 parm_regs = x86_64_int_parameter_registers;
8277 for (i = 0; i < (call_abi == MS_ABI
8278 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8279 if (regno == parm_regs[i])
8280 return true;
8281 return false;
8284 /* Return true if we do not know how to pass TYPE solely in registers. */
8286 static bool
8287 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8289 if (must_pass_in_stack_var_size_or_pad (mode, type))
8290 return true;
8292 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8293 The layout_type routine is crafty and tries to trick us into passing
8294 currently unsupported vector types on the stack by using TImode. */
8295 return (!TARGET_64BIT && mode == TImode
8296 && type && TREE_CODE (type) != VECTOR_TYPE);
8299 /* Return the size, in bytes, of the area reserved for arguments passed
8300 in registers for the function represented by FNDECL, depending on the
8301 ABI format used. */
8302 int
8303 ix86_reg_parm_stack_space (const_tree fndecl)
8305 enum calling_abi call_abi = SYSV_ABI;
8306 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8307 call_abi = ix86_function_abi (fndecl);
8308 else
8309 call_abi = ix86_function_type_abi (fndecl);
8310 if (TARGET_64BIT && call_abi == MS_ABI)
8311 return 32;
8312 return 0;
8315 /* We add this as a workaround in order to use libc_has_function
8316 hook in i386.md. */
8317 bool
8318 ix86_libc_has_function (enum function_class fn_class)
8320 return targetm.libc_has_function (fn_class);
8323 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE,
8324 specifying the calling ABI used. */
8325 enum calling_abi
8326 ix86_function_type_abi (const_tree fntype)
8328 enum calling_abi abi = ix86_abi;
8330 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8331 return abi;
8333 if (abi == SYSV_ABI
8334 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8336 if (TARGET_X32)
8337 error ("X32 does not support ms_abi attribute");
8339 abi = MS_ABI;
8341 else if (abi == MS_ABI
8342 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8343 abi = SYSV_ABI;
8345 return abi;
8348 static enum calling_abi
8349 ix86_function_abi (const_tree fndecl)
8351 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8354 /* Return SYSV_ABI or MS_ABI, depending on cfun,
8355 specifying the calling ABI used. */
8356 enum calling_abi
8357 ix86_cfun_abi (void)
8359 return cfun ? cfun->machine->call_abi : ix86_abi;
8362 static bool
8363 ix86_function_ms_hook_prologue (const_tree fn)
8365 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8367 if (decl_function_context (fn) != NULL_TREE)
8368 error_at (DECL_SOURCE_LOCATION (fn),
8369 "ms_hook_prologue is not compatible with nested function");
8370 else
8371 return true;
8373 return false;
8376 /* Write the extra assembler code needed to declare a function properly. */
8378 void
8379 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8380 tree decl)
8382 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8384 if (is_ms_hook)
8386 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8387 unsigned int filler_cc = 0xcccccccc;
8389 for (i = 0; i < filler_count; i += 4)
8390 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8393 #ifdef SUBTARGET_ASM_UNWIND_INIT
8394 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8395 #endif
8397 ASM_OUTPUT_LABEL (asm_out_file, fname);
8399 /* Output magic byte marker, if hot-patch attribute is set. */
8400 if (is_ms_hook)
8402 if (TARGET_64BIT)
8404 /* leaq [%rsp + 0], %rsp */
8405 asm_fprintf (asm_out_file, ASM_BYTE
8406 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
8408 else
8410 /* movl.s %edi, %edi
8411 push %ebp
8412 movl.s %esp, %ebp */
8413 asm_fprintf (asm_out_file, ASM_BYTE
8414 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
8419 /* regclass.c */
8420 extern void init_regs (void);
8422 /* Implementation of the call ABI switching target hook. The call
8423 register sets specific to FNDECL are set up. See also
8424 ix86_conditional_register_usage for more details. */
8425 void
8426 ix86_call_abi_override (const_tree fndecl)
8428 cfun->machine->call_abi = ix86_function_abi (fndecl);
8431 /* Return true if a pseudo register should be created and used to hold
8432 the GOT address for PIC code. */
8433 bool
8434 ix86_use_pseudo_pic_reg (void)
8436 if ((TARGET_64BIT
8437 && (ix86_cmodel == CM_SMALL_PIC
8438 || TARGET_PECOFF))
8439 || !flag_pic)
8440 return false;
8441 return true;
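/* Roughly: with -fpic on 32-bit targets, or with the 64-bit medium and
   large PIC models, the GOT address is kept in a pseudo so the register
   allocator can place it freely; for the 64-bit small PIC model (and
   PE-COFF) RIP-relative addressing makes a GOT base register
   unnecessary, hence the false result above.  */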
8444 /* Initialize large model PIC register. */
8446 static void
8447 ix86_init_large_pic_reg (unsigned int tmp_regno)
8449 rtx_code_label *label;
8450 rtx tmp_reg;
8452 gcc_assert (Pmode == DImode);
8453 label = gen_label_rtx ();
8454 emit_label (label);
8455 LABEL_PRESERVE_P (label) = 1;
8456 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8457 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8458 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8459 label));
8460 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8461 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8462 pic_offset_table_rtx, tmp_reg));
8465 /* Create and initialize PIC register if required. */
8466 static void
8467 ix86_init_pic_reg (void)
8469 edge entry_edge;
8470 rtx_insn *seq;
8472 if (!ix86_use_pseudo_pic_reg ())
8473 return;
8475 start_sequence ();
8477 if (TARGET_64BIT)
8479 if (ix86_cmodel == CM_LARGE_PIC)
8480 ix86_init_large_pic_reg (R11_REG);
8481 else
8482 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8484 else
8486 /* If there is a future mcount call in the function, it is more profitable
8487 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8488 rtx reg = crtl->profile
8489 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8490 : pic_offset_table_rtx;
8491 rtx_insn *insn = emit_insn (gen_set_got (reg));
8492 RTX_FRAME_RELATED_P (insn) = 1;
8493 if (crtl->profile)
8494 emit_move_insn (pic_offset_table_rtx, reg);
8495 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8498 seq = get_insns ();
8499 end_sequence ();
8501 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8502 insert_insn_on_edge (seq, entry_edge);
8503 commit_one_edge_insertion (entry_edge);
8506 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8507 for a call to a function whose data type is FNTYPE.
8508 For a library call, FNTYPE is 0. */
8510 void
8511 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8512 tree fntype, /* tree ptr for function decl */
8513 rtx libname, /* SYMBOL_REF of library name or 0 */
8514 tree fndecl,
8515 int caller)
8517 struct cgraph_local_info *i = NULL;
8518 struct cgraph_node *target = NULL;
8520 memset (cum, 0, sizeof (*cum));
8522 if (fndecl)
8524 target = cgraph_node::get (fndecl);
8525 if (target)
8527 target = target->function_symbol ();
8528 i = cgraph_node::local_info (target->decl);
8529 cum->call_abi = ix86_function_abi (target->decl);
8531 else
8532 cum->call_abi = ix86_function_abi (fndecl);
8534 else
8535 cum->call_abi = ix86_function_type_abi (fntype);
8537 cum->caller = caller;
8539 /* Set up the number of registers to use for passing arguments. */
8540 cum->nregs = ix86_regparm;
8541 if (TARGET_64BIT)
8543 cum->nregs = (cum->call_abi == SYSV_ABI
8544 ? X86_64_REGPARM_MAX
8545 : X86_64_MS_REGPARM_MAX);
8547 if (TARGET_SSE)
8549 cum->sse_nregs = SSE_REGPARM_MAX;
8550 if (TARGET_64BIT)
8552 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8553 ? X86_64_SSE_REGPARM_MAX
8554 : X86_64_MS_SSE_REGPARM_MAX);
8557 if (TARGET_MMX)
8558 cum->mmx_nregs = MMX_REGPARM_MAX;
8559 cum->warn_avx512f = true;
8560 cum->warn_avx = true;
8561 cum->warn_sse = true;
8562 cum->warn_mmx = true;
8564 /* Because the types might mismatch between the caller and the callee, we
8565 need to use the actual type of the function for local calls.
8566 FIXME: cgraph_analyze can be told to actually record whether a function
8567 uses va_start, so for local functions maybe_vaarg can be made more
8568 aggressive, helping K&R code.
8569 FIXME: once the type system is fixed, we won't need this code anymore. */
8570 if (i && i->local && i->can_change_signature)
8571 fntype = TREE_TYPE (target->decl);
8572 cum->stdarg = stdarg_p (fntype);
8573 cum->maybe_vaarg = (fntype
8574 ? (!prototype_p (fntype) || stdarg_p (fntype))
8575 : !libname);
8577 cum->bnd_regno = FIRST_BND_REG;
8578 cum->bnds_in_bt = 0;
8579 cum->force_bnd_pass = 0;
8580 cum->decl = fndecl;
8582 if (!TARGET_64BIT)
8584 /* If there are variable arguments, then we won't pass anything
8585 in registers in 32-bit mode. */
8586 if (stdarg_p (fntype))
8588 cum->nregs = 0;
8589 /* Since in 32-bit mode variable arguments are always passed on
8590 the stack, there is a scratch register available for an indirect
8591 sibcall. */
8592 cfun->machine->arg_reg_available = true;
8593 cum->sse_nregs = 0;
8594 cum->mmx_nregs = 0;
8595 cum->warn_avx512f = false;
8596 cum->warn_avx = false;
8597 cum->warn_sse = false;
8598 cum->warn_mmx = false;
8599 return;
8602 /* Use the ecx and edx registers if the function has the fastcall attribute,
8603 else look for regparm information. */
8604 if (fntype)
8606 unsigned int ccvt = ix86_get_callcvt (fntype);
8607 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8609 cum->nregs = 1;
8610 cum->fastcall = 1; /* Same first register as in fastcall. */
8612 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8614 cum->nregs = 2;
8615 cum->fastcall = 1;
8617 else
8618 cum->nregs = ix86_function_regparm (fntype, fndecl);
8621 /* Set up the number of SSE registers used for passing SFmode
8622 and DFmode arguments. Warn for mismatching ABI. */
8623 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
8626 cfun->machine->arg_reg_available = (cum->nregs > 0);
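/* For illustration: after this setup a 64-bit SysV call has
   cum->nregs == 6 (RDI, RSI, RDX, RCX, R8, R9) and cum->sse_nregs == 8
   (XMM0-XMM7), an MS-ABI call has 4 of each, a 32-bit fastcall function
   gets cum->nregs == 2 (ECX, EDX) and a thiscall function gets
   cum->nregs == 1.  */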
8629 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
8630 But in the case of vector types, it is some vector mode.
8632 When we have only some of our vector isa extensions enabled, then there
8633 are some modes for which vector_mode_supported_p is false. For these
8634 modes, the generic vector support in gcc will choose some non-vector mode
8635 in order to implement the type. By computing the natural mode, we'll
8636 select the proper ABI location for the operand and not depend on whatever
8637 the middle-end decides to do with these vector types.
8639 The middle-end can't deal with vector types > 16 bytes. In this
8640 case, we return the original mode and warn about the ABI change if
8641 CUM isn't NULL.
8643 If IN_RETURN is true, warn about the ABI change if the vector mode
8644 isn't available for the function return value. */
8646 static machine_mode
8647 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
8648 bool in_return)
8650 machine_mode mode = TYPE_MODE (type);
8652 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
8654 HOST_WIDE_INT size = int_size_in_bytes (type);
8655 if ((size == 8 || size == 16 || size == 32 || size == 64)
8656 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
8657 && TYPE_VECTOR_SUBPARTS (type) > 1)
8659 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
8661 /* There are no XFmode vector modes. */
8662 if (innermode == XFmode)
8663 return mode;
8665 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
8666 mode = MIN_MODE_VECTOR_FLOAT;
8667 else
8668 mode = MIN_MODE_VECTOR_INT;
8670 /* Get the mode which has this inner mode and number of units. */
8671 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
8672 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
8673 && GET_MODE_INNER (mode) == innermode)
8675 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
8677 static bool warnedavx512f;
8678 static bool warnedavx512f_ret;
8680 if (cum && cum->warn_avx512f && !warnedavx512f)
8682 if (warning (OPT_Wpsabi, "AVX512F vector argument "
8683 "without AVX512F enabled changes the ABI"))
8684 warnedavx512f = true;
8686 else if (in_return && !warnedavx512f_ret)
8688 if (warning (OPT_Wpsabi, "AVX512F vector return "
8689 "without AVX512F enabled changes the ABI"))
8690 warnedavx512f_ret = true;
8693 return TYPE_MODE (type);
8695 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
8697 static bool warnedavx;
8698 static bool warnedavx_ret;
8700 if (cum && cum->warn_avx && !warnedavx)
8702 if (warning (OPT_Wpsabi, "AVX vector argument "
8703 "without AVX enabled changes the ABI"))
8704 warnedavx = true;
8706 else if (in_return && !warnedavx_ret)
8708 if (warning (OPT_Wpsabi, "AVX vector return "
8709 "without AVX enabled changes the ABI"))
8710 warnedavx_ret = true;
8713 return TYPE_MODE (type);
8715 else if (((size == 8 && TARGET_64BIT) || size == 16)
8716 && !TARGET_SSE
8717 && !TARGET_IAMCU)
8719 static bool warnedsse;
8720 static bool warnedsse_ret;
8722 if (cum && cum->warn_sse && !warnedsse)
8724 if (warning (OPT_Wpsabi, "SSE vector argument "
8725 "without SSE enabled changes the ABI"))
8726 warnedsse = true;
8728 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
8730 if (warning (OPT_Wpsabi, "SSE vector return "
8731 "without SSE enabled changes the ABI"))
8732 warnedsse_ret = true;
8735 else if ((size == 8 && !TARGET_64BIT)
8736 && (!cfun
8737 || cfun->machine->func_type == TYPE_NORMAL)
8738 && !TARGET_MMX
8739 && !TARGET_IAMCU)
8741 static bool warnedmmx;
8742 static bool warnedmmx_ret;
8744 if (cum && cum->warn_mmx && !warnedmmx)
8746 if (warning (OPT_Wpsabi, "MMX vector argument "
8747 "without MMX enabled changes the ABI"))
8748 warnedmmx = true;
8750 else if (in_return && !warnedmmx_ret)
8752 if (warning (OPT_Wpsabi, "MMX vector return "
8753 "without MMX enabled changes the ABI"))
8754 warnedmmx_ret = true;
8757 return mode;
8760 gcc_unreachable ();
8764 return mode;
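/* For illustration: for a GNU vector type such as
       typedef int v8si __attribute__ ((vector_size (32)));
   this returns V8SImode when AVX is enabled; without AVX the original
   TYPE_MODE is kept and a one-time -Wpsabi warning notes that the
   argument (or return value) location differs between the two
   configurations.  */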
8767 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
8768 this may not agree with the mode that the type system has chosen for the
8769 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
8770 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
8772 static rtx
8773 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
8774 unsigned int regno)
8776 rtx tmp;
8778 if (orig_mode != BLKmode)
8779 tmp = gen_rtx_REG (orig_mode, regno);
8780 else
8782 tmp = gen_rtx_REG (mode, regno);
8783 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
8784 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
8787 return tmp;
8790 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
8791 of this code is to classify each 8bytes of incoming argument by the register
8792 class and assign registers accordingly. */
8794 /* Return the union class of CLASS1 and CLASS2.
8795 See the x86-64 PS ABI for details. */
8797 static enum x86_64_reg_class
8798 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
8800 /* Rule #1: If both classes are equal, this is the resulting class. */
8801 if (class1 == class2)
8802 return class1;
8804 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
8805 the other class. */
8806 if (class1 == X86_64_NO_CLASS)
8807 return class2;
8808 if (class2 == X86_64_NO_CLASS)
8809 return class1;
8811 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
8812 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
8813 return X86_64_MEMORY_CLASS;
8815 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
8816 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
8817 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
8818 return X86_64_INTEGERSI_CLASS;
8819 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
8820 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
8821 return X86_64_INTEGER_CLASS;
8823 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
8824 MEMORY is used. */
8825 if (class1 == X86_64_X87_CLASS
8826 || class1 == X86_64_X87UP_CLASS
8827 || class1 == X86_64_COMPLEX_X87_CLASS
8828 || class2 == X86_64_X87_CLASS
8829 || class2 == X86_64_X87UP_CLASS
8830 || class2 == X86_64_COMPLEX_X87_CLASS)
8831 return X86_64_MEMORY_CLASS;
8833 /* Rule #6: Otherwise class SSE is used. */
8834 return X86_64_SSE_CLASS;
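/* Worked example: merging X86_64_INTEGERSI_CLASS with
   X86_64_SSESF_CLASS yields X86_64_INTEGERSI_CLASS (rule #4), while
   merging X86_64_SSEDF_CLASS with X86_64_SSESF_CLASS matches none of
   rules #1-#5 and therefore yields X86_64_SSE_CLASS (rule #6).  */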
8837 /* Classify the argument of type TYPE and mode MODE.
8838 CLASSES will be filled by the register class used to pass each word
8839 of the operand. The number of words is returned. In case the parameter
8840 should be passed in memory, 0 is returned. As a special case for zero
8841 sized containers, classes[0] will be NO_CLASS and 1 is returned.
8843 BIT_OFFSET is used internally for handling records and specifies the offset
8844 of the argument in bits modulo 512 to avoid overflow cases.
8846 See the x86-64 PS ABI for details.
8849 static int
8850 classify_argument (machine_mode mode, const_tree type,
8851 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
8853 HOST_WIDE_INT bytes =
8854 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8855 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
8857 /* Variable sized entities are always passed/returned in memory. */
8858 if (bytes < 0)
8859 return 0;
8861 if (mode != VOIDmode
8862 && targetm.calls.must_pass_in_stack (mode, type))
8863 return 0;
8865 if (type && AGGREGATE_TYPE_P (type))
8867 int i;
8868 tree field;
8869 enum x86_64_reg_class subclasses[MAX_CLASSES];
8871 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
8872 if (bytes > 64)
8873 return 0;
8875 for (i = 0; i < words; i++)
8876 classes[i] = X86_64_NO_CLASS;
8878 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
8879 signal the memory class, so handle this as a special case. */
8880 if (!words)
8882 classes[0] = X86_64_NO_CLASS;
8883 return 1;
8886 /* Classify each field of record and merge classes. */
8887 switch (TREE_CODE (type))
8889 case RECORD_TYPE:
8890 /* And now merge the fields of structure. */
8891 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8893 if (TREE_CODE (field) == FIELD_DECL)
8895 int num;
8897 if (TREE_TYPE (field) == error_mark_node)
8898 continue;
8900 /* Bitfields are always classified as integer. Handle them
8901 early, since later code would consider them to be
8902 misaligned integers. */
8903 if (DECL_BIT_FIELD (field))
8905 for (i = (int_bit_position (field)
8906 + (bit_offset % 64)) / 8 / 8;
8907 i < ((int_bit_position (field) + (bit_offset % 64))
8908 + tree_to_shwi (DECL_SIZE (field))
8909 + 63) / 8 / 8; i++)
8910 classes[i] =
8911 merge_classes (X86_64_INTEGER_CLASS,
8912 classes[i]);
8914 else
8916 int pos;
8918 type = TREE_TYPE (field);
8920 /* Flexible array member is ignored. */
8921 if (TYPE_MODE (type) == BLKmode
8922 && TREE_CODE (type) == ARRAY_TYPE
8923 && TYPE_SIZE (type) == NULL_TREE
8924 && TYPE_DOMAIN (type) != NULL_TREE
8925 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
8926 == NULL_TREE))
8928 static bool warned;
8930 if (!warned && warn_psabi)
8932 warned = true;
8933 inform (input_location,
8934 "the ABI of passing struct with"
8935 " a flexible array member has"
8936 " changed in GCC 4.4");
8938 continue;
8940 num = classify_argument (TYPE_MODE (type), type,
8941 subclasses,
8942 (int_bit_position (field)
8943 + bit_offset) % 512);
8944 if (!num)
8945 return 0;
8946 pos = (int_bit_position (field)
8947 + (bit_offset % 64)) / 8 / 8;
8948 for (i = 0; i < num && (i + pos) < words; i++)
8949 classes[i + pos] =
8950 merge_classes (subclasses[i], classes[i + pos]);
8954 break;
8956 case ARRAY_TYPE:
8957 /* Arrays are handled as small records. */
8959 int num;
8960 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
8961 TREE_TYPE (type), subclasses, bit_offset);
8962 if (!num)
8963 return 0;
8965 /* The partial classes are now full classes. */
8966 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
8967 subclasses[0] = X86_64_SSE_CLASS;
8968 if (subclasses[0] == X86_64_INTEGERSI_CLASS
8969 && !((bit_offset % 64) == 0 && bytes == 4))
8970 subclasses[0] = X86_64_INTEGER_CLASS;
8972 for (i = 0; i < words; i++)
8973 classes[i] = subclasses[i % num];
8975 break;
8977 case UNION_TYPE:
8978 case QUAL_UNION_TYPE:
8979 /* Unions are similar to RECORD_TYPE but offset is always 0.
8981 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8983 if (TREE_CODE (field) == FIELD_DECL)
8985 int num;
8987 if (TREE_TYPE (field) == error_mark_node)
8988 continue;
8990 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
8991 TREE_TYPE (field), subclasses,
8992 bit_offset);
8993 if (!num)
8994 return 0;
8995 for (i = 0; i < num && i < words; i++)
8996 classes[i] = merge_classes (subclasses[i], classes[i]);
8999 break;
9001 default:
9002 gcc_unreachable ();
9005 if (words > 2)
9007 /* When size > 16 bytes, if the first class isn't
9008 X86_64_SSE_CLASS or any of the others isn't
9009 X86_64_SSEUP_CLASS, everything should be passed in
9010 memory. */
9011 if (classes[0] != X86_64_SSE_CLASS)
9012 return 0;
9014 for (i = 1; i < words; i++)
9015 if (classes[i] != X86_64_SSEUP_CLASS)
9016 return 0;
9019 /* Final merger cleanup. */
9020 for (i = 0; i < words; i++)
9022 /* If one class is MEMORY, everything should be passed in
9023 memory. */
9024 if (classes[i] == X86_64_MEMORY_CLASS)
9025 return 0;
9027 /* X86_64_SSEUP_CLASS should always be preceded by
9028 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
9029 if (classes[i] == X86_64_SSEUP_CLASS
9030 && classes[i - 1] != X86_64_SSE_CLASS
9031 && classes[i - 1] != X86_64_SSEUP_CLASS)
9033 /* The first one should never be X86_64_SSEUP_CLASS. */
9034 gcc_assert (i != 0);
9035 classes[i] = X86_64_SSE_CLASS;
9038 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
9039 everything should be passed in memory. */
9040 if (classes[i] == X86_64_X87UP_CLASS
9041 && (classes[i - 1] != X86_64_X87_CLASS))
9043 static bool warned;
9045 /* The first one should never be X86_64_X87UP_CLASS. */
9046 gcc_assert (i != 0);
9047 if (!warned && warn_psabi)
9049 warned = true;
9050 inform (input_location,
9051 "the ABI of passing union with long double"
9052 " has changed in GCC 4.4");
9054 return 0;
9057 return words;
9060 /* Compute the alignment needed. We align all types to their natural
9061 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
9062 if (mode != VOIDmode && mode != BLKmode)
9064 int mode_alignment = GET_MODE_BITSIZE (mode);
9066 if (mode == XFmode)
9067 mode_alignment = 128;
9068 else if (mode == XCmode)
9069 mode_alignment = 256;
9070 if (COMPLEX_MODE_P (mode))
9071 mode_alignment /= 2;
9072 /* Misaligned fields are always returned in memory. */
9073 if (bit_offset % mode_alignment)
9074 return 0;
9077 /* For V1xx modes, just use the base mode. */
9078 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
9079 && GET_MODE_UNIT_SIZE (mode) == bytes)
9080 mode = GET_MODE_INNER (mode);
9082 /* Classification of atomic types. */
9083 switch (mode)
9085 case SDmode:
9086 case DDmode:
9087 classes[0] = X86_64_SSE_CLASS;
9088 return 1;
9089 case TDmode:
9090 classes[0] = X86_64_SSE_CLASS;
9091 classes[1] = X86_64_SSEUP_CLASS;
9092 return 2;
9093 case DImode:
9094 case SImode:
9095 case HImode:
9096 case QImode:
9097 case CSImode:
9098 case CHImode:
9099 case CQImode:
9101 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
9103 /* Analyze last 128 bits only. */
9104 size = (size - 1) & 0x7f;
9106 if (size < 32)
9108 classes[0] = X86_64_INTEGERSI_CLASS;
9109 return 1;
9111 else if (size < 64)
9113 classes[0] = X86_64_INTEGER_CLASS;
9114 return 1;
9116 else if (size < 64+32)
9118 classes[0] = X86_64_INTEGER_CLASS;
9119 classes[1] = X86_64_INTEGERSI_CLASS;
9120 return 2;
9122 else if (size < 64+64)
9124 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9125 return 2;
9127 else
9128 gcc_unreachable ();
9130 case CDImode:
9131 case TImode:
9132 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9133 return 2;
9134 case COImode:
9135 case OImode:
9136 /* OImode shouldn't be used directly. */
9137 gcc_unreachable ();
9138 case CTImode:
9139 return 0;
9140 case SFmode:
9141 if (!(bit_offset % 64))
9142 classes[0] = X86_64_SSESF_CLASS;
9143 else
9144 classes[0] = X86_64_SSE_CLASS;
9145 return 1;
9146 case DFmode:
9147 classes[0] = X86_64_SSEDF_CLASS;
9148 return 1;
9149 case XFmode:
9150 classes[0] = X86_64_X87_CLASS;
9151 classes[1] = X86_64_X87UP_CLASS;
9152 return 2;
9153 case TFmode:
9154 classes[0] = X86_64_SSE_CLASS;
9155 classes[1] = X86_64_SSEUP_CLASS;
9156 return 2;
9157 case SCmode:
9158 classes[0] = X86_64_SSE_CLASS;
9159 if (!(bit_offset % 64))
9160 return 1;
9161 else
9163 static bool warned;
9165 if (!warned && warn_psabi)
9167 warned = true;
9168 inform (input_location,
9169 "the ABI of passing structure with complex float"
9170 " member has changed in GCC 4.4");
9172 classes[1] = X86_64_SSESF_CLASS;
9173 return 2;
9175 case DCmode:
9176 classes[0] = X86_64_SSEDF_CLASS;
9177 classes[1] = X86_64_SSEDF_CLASS;
9178 return 2;
9179 case XCmode:
9180 classes[0] = X86_64_COMPLEX_X87_CLASS;
9181 return 1;
9182 case TCmode:
9183 /* This mode is larger than 16 bytes. */
9184 return 0;
9185 case V8SFmode:
9186 case V8SImode:
9187 case V32QImode:
9188 case V16HImode:
9189 case V4DFmode:
9190 case V4DImode:
9191 classes[0] = X86_64_SSE_CLASS;
9192 classes[1] = X86_64_SSEUP_CLASS;
9193 classes[2] = X86_64_SSEUP_CLASS;
9194 classes[3] = X86_64_SSEUP_CLASS;
9195 return 4;
9196 case V8DFmode:
9197 case V16SFmode:
9198 case V8DImode:
9199 case V16SImode:
9200 case V32HImode:
9201 case V64QImode:
9202 classes[0] = X86_64_SSE_CLASS;
9203 classes[1] = X86_64_SSEUP_CLASS;
9204 classes[2] = X86_64_SSEUP_CLASS;
9205 classes[3] = X86_64_SSEUP_CLASS;
9206 classes[4] = X86_64_SSEUP_CLASS;
9207 classes[5] = X86_64_SSEUP_CLASS;
9208 classes[6] = X86_64_SSEUP_CLASS;
9209 classes[7] = X86_64_SSEUP_CLASS;
9210 return 8;
9211 case V4SFmode:
9212 case V4SImode:
9213 case V16QImode:
9214 case V8HImode:
9215 case V2DFmode:
9216 case V2DImode:
9217 classes[0] = X86_64_SSE_CLASS;
9218 classes[1] = X86_64_SSEUP_CLASS;
9219 return 2;
9220 case V1TImode:
9221 case V1DImode:
9222 case V2SFmode:
9223 case V2SImode:
9224 case V4HImode:
9225 case V8QImode:
9226 classes[0] = X86_64_SSE_CLASS;
9227 return 1;
9228 case BLKmode:
9229 case VOIDmode:
9230 return 0;
9231 default:
9232 gcc_assert (VECTOR_MODE_P (mode));
9234 if (bytes > 16)
9235 return 0;
9237 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9239 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9240 classes[0] = X86_64_INTEGERSI_CLASS;
9241 else
9242 classes[0] = X86_64_INTEGER_CLASS;
9243 classes[1] = X86_64_INTEGER_CLASS;
9244 return 1 + (bytes > 8);
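/* Worked example: on x86-64 the type
       struct s { double d; int i; };
   occupies two eightbytes; the first (the double) is classified
   X86_64_SSEDF_CLASS and the second (the int plus padding)
   X86_64_INTEGER_CLASS, so 2 is returned and the struct travels in one
   SSE register and one integer register.  */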
9248 /* Examine the argument and set the number of registers required in each
9249 class. Return true iff the parameter should be passed in memory. */
9251 static bool
9252 examine_argument (machine_mode mode, const_tree type, int in_return,
9253 int *int_nregs, int *sse_nregs)
9255 enum x86_64_reg_class regclass[MAX_CLASSES];
9256 int n = classify_argument (mode, type, regclass, 0);
9258 *int_nregs = 0;
9259 *sse_nregs = 0;
9261 if (!n)
9262 return true;
9263 for (n--; n >= 0; n--)
9264 switch (regclass[n])
9266 case X86_64_INTEGER_CLASS:
9267 case X86_64_INTEGERSI_CLASS:
9268 (*int_nregs)++;
9269 break;
9270 case X86_64_SSE_CLASS:
9271 case X86_64_SSESF_CLASS:
9272 case X86_64_SSEDF_CLASS:
9273 (*sse_nregs)++;
9274 break;
9275 case X86_64_NO_CLASS:
9276 case X86_64_SSEUP_CLASS:
9277 break;
9278 case X86_64_X87_CLASS:
9279 case X86_64_X87UP_CLASS:
9280 case X86_64_COMPLEX_X87_CLASS:
9281 if (!in_return)
9282 return true;
9283 break;
9284 case X86_64_MEMORY_CLASS:
9285 gcc_unreachable ();
9288 return false;
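/* For illustration: for the struct { double d; int i; } example above,
   examine_argument sets *int_nregs = 1 and *sse_nregs = 1 and returns
   false; for a type classify_argument rejects (e.g. a 40-byte struct of
   ints) it returns true and the argument goes on the stack.  */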
9291 /* Construct container for the argument used by GCC interface. See
9292 FUNCTION_ARG for the detailed description. */
9294 static rtx
9295 construct_container (machine_mode mode, machine_mode orig_mode,
9296 const_tree type, int in_return, int nintregs, int nsseregs,
9297 const int *intreg, int sse_regno)
9299 /* The following variables hold the static issued_error state. */
9300 static bool issued_sse_arg_error;
9301 static bool issued_sse_ret_error;
9302 static bool issued_x87_ret_error;
9304 machine_mode tmpmode;
9305 int bytes =
9306 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9307 enum x86_64_reg_class regclass[MAX_CLASSES];
9308 int n;
9309 int i;
9310 int nexps = 0;
9311 int needed_sseregs, needed_intregs;
9312 rtx exp[MAX_CLASSES];
9313 rtx ret;
9315 n = classify_argument (mode, type, regclass, 0);
9316 if (!n)
9317 return NULL;
9318 if (examine_argument (mode, type, in_return, &needed_intregs,
9319 &needed_sseregs))
9320 return NULL;
9321 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9322 return NULL;
9324 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9325 some less clueful developer tries to use floating-point anyway. */
9326 if (needed_sseregs && !TARGET_SSE)
9328 if (in_return)
9330 if (!issued_sse_ret_error)
9332 error ("SSE register return with SSE disabled");
9333 issued_sse_ret_error = true;
9336 else if (!issued_sse_arg_error)
9338 error ("SSE register argument with SSE disabled");
9339 issued_sse_arg_error = true;
9341 return NULL;
9344 /* Likewise, error if the ABI requires us to return values in the
9345 x87 registers and the user specified -mno-80387. */
9346 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9347 for (i = 0; i < n; i++)
9348 if (regclass[i] == X86_64_X87_CLASS
9349 || regclass[i] == X86_64_X87UP_CLASS
9350 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9352 if (!issued_x87_ret_error)
9354 error ("x87 register return with x87 disabled");
9355 issued_x87_ret_error = true;
9357 return NULL;
9360 /* First construct the simple cases. Avoid SCmode, since we want to use
9361 a single register to pass this type. */
9362 if (n == 1 && mode != SCmode)
9363 switch (regclass[0])
9365 case X86_64_INTEGER_CLASS:
9366 case X86_64_INTEGERSI_CLASS:
9367 return gen_rtx_REG (mode, intreg[0]);
9368 case X86_64_SSE_CLASS:
9369 case X86_64_SSESF_CLASS:
9370 case X86_64_SSEDF_CLASS:
9371 if (mode != BLKmode)
9372 return gen_reg_or_parallel (mode, orig_mode,
9373 SSE_REGNO (sse_regno));
9374 break;
9375 case X86_64_X87_CLASS:
9376 case X86_64_COMPLEX_X87_CLASS:
9377 return gen_rtx_REG (mode, FIRST_STACK_REG);
9378 case X86_64_NO_CLASS:
9379 /* Zero sized array, struct or class. */
9380 return NULL;
9381 default:
9382 gcc_unreachable ();
9384 if (n == 2
9385 && regclass[0] == X86_64_SSE_CLASS
9386 && regclass[1] == X86_64_SSEUP_CLASS
9387 && mode != BLKmode)
9388 return gen_reg_or_parallel (mode, orig_mode,
9389 SSE_REGNO (sse_regno));
9390 if (n == 4
9391 && regclass[0] == X86_64_SSE_CLASS
9392 && regclass[1] == X86_64_SSEUP_CLASS
9393 && regclass[2] == X86_64_SSEUP_CLASS
9394 && regclass[3] == X86_64_SSEUP_CLASS
9395 && mode != BLKmode)
9396 return gen_reg_or_parallel (mode, orig_mode,
9397 SSE_REGNO (sse_regno));
9398 if (n == 8
9399 && regclass[0] == X86_64_SSE_CLASS
9400 && regclass[1] == X86_64_SSEUP_CLASS
9401 && regclass[2] == X86_64_SSEUP_CLASS
9402 && regclass[3] == X86_64_SSEUP_CLASS
9403 && regclass[4] == X86_64_SSEUP_CLASS
9404 && regclass[5] == X86_64_SSEUP_CLASS
9405 && regclass[6] == X86_64_SSEUP_CLASS
9406 && regclass[7] == X86_64_SSEUP_CLASS
9407 && mode != BLKmode)
9408 return gen_reg_or_parallel (mode, orig_mode,
9409 SSE_REGNO (sse_regno));
9410 if (n == 2
9411 && regclass[0] == X86_64_X87_CLASS
9412 && regclass[1] == X86_64_X87UP_CLASS)
9413 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9415 if (n == 2
9416 && regclass[0] == X86_64_INTEGER_CLASS
9417 && regclass[1] == X86_64_INTEGER_CLASS
9418 && (mode == CDImode || mode == TImode)
9419 && intreg[0] + 1 == intreg[1])
9420 return gen_rtx_REG (mode, intreg[0]);
9422 /* Otherwise figure out the entries of the PARALLEL. */
9423 for (i = 0; i < n; i++)
9425 int pos;
9427 switch (regclass[i])
9429 case X86_64_NO_CLASS:
9430 break;
9431 case X86_64_INTEGER_CLASS:
9432 case X86_64_INTEGERSI_CLASS:
9433 /* Merge TImodes on aligned occasions here too. */
9434 if (i * 8 + 8 > bytes)
9435 tmpmode
9436 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9437 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9438 tmpmode = SImode;
9439 else
9440 tmpmode = DImode;
9441 /* We've requested 24 bytes that we
9442 don't have a mode for. Use DImode. */
9443 if (tmpmode == BLKmode)
9444 tmpmode = DImode;
9445 exp [nexps++]
9446 = gen_rtx_EXPR_LIST (VOIDmode,
9447 gen_rtx_REG (tmpmode, *intreg),
9448 GEN_INT (i*8));
9449 intreg++;
9450 break;
9451 case X86_64_SSESF_CLASS:
9452 exp [nexps++]
9453 = gen_rtx_EXPR_LIST (VOIDmode,
9454 gen_rtx_REG (SFmode,
9455 SSE_REGNO (sse_regno)),
9456 GEN_INT (i*8));
9457 sse_regno++;
9458 break;
9459 case X86_64_SSEDF_CLASS:
9460 exp [nexps++]
9461 = gen_rtx_EXPR_LIST (VOIDmode,
9462 gen_rtx_REG (DFmode,
9463 SSE_REGNO (sse_regno)),
9464 GEN_INT (i*8));
9465 sse_regno++;
9466 break;
9467 case X86_64_SSE_CLASS:
9468 pos = i;
9469 switch (n)
9471 case 1:
9472 tmpmode = DImode;
9473 break;
9474 case 2:
9475 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9477 tmpmode = TImode;
9478 i++;
9480 else
9481 tmpmode = DImode;
9482 break;
9483 case 4:
9484 gcc_assert (i == 0
9485 && regclass[1] == X86_64_SSEUP_CLASS
9486 && regclass[2] == X86_64_SSEUP_CLASS
9487 && regclass[3] == X86_64_SSEUP_CLASS);
9488 tmpmode = OImode;
9489 i += 3;
9490 break;
9491 case 8:
9492 gcc_assert (i == 0
9493 && regclass[1] == X86_64_SSEUP_CLASS
9494 && regclass[2] == X86_64_SSEUP_CLASS
9495 && regclass[3] == X86_64_SSEUP_CLASS
9496 && regclass[4] == X86_64_SSEUP_CLASS
9497 && regclass[5] == X86_64_SSEUP_CLASS
9498 && regclass[6] == X86_64_SSEUP_CLASS
9499 && regclass[7] == X86_64_SSEUP_CLASS);
9500 tmpmode = XImode;
9501 i += 7;
9502 break;
9503 default:
9504 gcc_unreachable ();
9506 exp [nexps++]
9507 = gen_rtx_EXPR_LIST (VOIDmode,
9508 gen_rtx_REG (tmpmode,
9509 SSE_REGNO (sse_regno)),
9510 GEN_INT (pos*8));
9511 sse_regno++;
9512 break;
9513 default:
9514 gcc_unreachable ();
9518 /* Empty aligned struct, union or class. */
9519 if (nexps == 0)
9520 return NULL;
9522 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9523 for (i = 0; i < nexps; i++)
9524 XVECEXP (ret, 0, i) = exp [i];
9525 return ret;
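/* For illustration: for struct { double d; int i; } passed as the first
   argument, the PARALLEL built here is roughly
       (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                  (expr_list (reg:DI di) (const_int 8))])
   i.e. the first eightbyte travels in an SSE register and the second in
   the next free integer register.  */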
9528 /* Update the data in CUM to advance over an argument of mode MODE
9529 and data type TYPE. (TYPE is null for libcalls where that information
9530 may not be available.)
9532 Return the number of integer registers advanced over. */
9534 static int
9535 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9536 const_tree type, HOST_WIDE_INT bytes,
9537 HOST_WIDE_INT words)
9539 int res = 0;
9540 bool error_p = false;
9542 if (TARGET_IAMCU)
9544 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9545 bytes in registers. */
9546 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9547 goto pass_in_reg;
9548 return res;
9551 switch (mode)
9553 default:
9554 break;
9556 case BLKmode:
9557 if (bytes < 0)
9558 break;
9559 /* FALLTHRU */
9561 case DImode:
9562 case SImode:
9563 case HImode:
9564 case QImode:
9565 pass_in_reg:
9566 cum->words += words;
9567 cum->nregs -= words;
9568 cum->regno += words;
9569 if (cum->nregs >= 0)
9570 res = words;
9571 if (cum->nregs <= 0)
9573 cum->nregs = 0;
9574 cfun->machine->arg_reg_available = false;
9575 cum->regno = 0;
9577 break;
9579 case OImode:
9580 /* OImode shouldn't be used directly. */
9581 gcc_unreachable ();
9583 case DFmode:
9584 if (cum->float_in_sse == -1)
9585 error_p = 1;
9586 if (cum->float_in_sse < 2)
9587 break;
9588 /* FALLTHRU */
9589 case SFmode:
9590 if (cum->float_in_sse == -1)
9591 error_p = 1;
9592 if (cum->float_in_sse < 1)
9593 break;
9594 /* FALLTHRU */
9596 case V8SFmode:
9597 case V8SImode:
9598 case V64QImode:
9599 case V32HImode:
9600 case V16SImode:
9601 case V8DImode:
9602 case V16SFmode:
9603 case V8DFmode:
9604 case V32QImode:
9605 case V16HImode:
9606 case V4DFmode:
9607 case V4DImode:
9608 case TImode:
9609 case V16QImode:
9610 case V8HImode:
9611 case V4SImode:
9612 case V2DImode:
9613 case V4SFmode:
9614 case V2DFmode:
9615 if (!type || !AGGREGATE_TYPE_P (type))
9617 cum->sse_words += words;
9618 cum->sse_nregs -= 1;
9619 cum->sse_regno += 1;
9620 if (cum->sse_nregs <= 0)
9622 cum->sse_nregs = 0;
9623 cum->sse_regno = 0;
9626 break;
9628 case V8QImode:
9629 case V4HImode:
9630 case V2SImode:
9631 case V2SFmode:
9632 case V1TImode:
9633 case V1DImode:
9634 if (!type || !AGGREGATE_TYPE_P (type))
9636 cum->mmx_words += words;
9637 cum->mmx_nregs -= 1;
9638 cum->mmx_regno += 1;
9639 if (cum->mmx_nregs <= 0)
9641 cum->mmx_nregs = 0;
9642 cum->mmx_regno = 0;
9645 break;
9647 if (error_p)
9649 cum->float_in_sse = 0;
9650 error ("calling %qD with SSE calling convention without "
9651 "SSE/SSE2 enabled", cum->decl);
9652 sorry ("this is a GCC bug that can be worked around by adding "
9653 "attribute used to function called");
9656 return res;
9659 static int
9660 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
9661 const_tree type, HOST_WIDE_INT words, bool named)
9663 int int_nregs, sse_nregs;
9665 /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
9666 if (!named && (VALID_AVX512F_REG_MODE (mode)
9667 || VALID_AVX256_REG_MODE (mode)))
9668 return 0;
9670 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
9671 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
9673 cum->nregs -= int_nregs;
9674 cum->sse_nregs -= sse_nregs;
9675 cum->regno += int_nregs;
9676 cum->sse_regno += sse_nregs;
9677 return int_nregs;
9679 else
9681 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
9682 cum->words = ROUND_UP (cum->words, align);
9683 cum->words += words;
9684 return 0;
9688 static int
9689 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
9690 HOST_WIDE_INT words)
9692 /* Otherwise, this should be passed indirect. */
9693 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
9695 cum->words += words;
9696 if (cum->nregs > 0)
9698 cum->nregs -= 1;
9699 cum->regno += 1;
9700 return 1;
9702 return 0;
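/* The MS ABI assigns parameter registers purely by position, so each
   named argument consumes one of the four slots (RCX/RDX/R8/R9 or
   XMM0-XMM3) no matter whether it is integer or floating point; that is
   why this routine never advances by more than a single register.  */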
9705 /* Update the data in CUM to advance over an argument of mode MODE and
9706 data type TYPE. (TYPE is null for libcalls where that information
9707 may not be available.) */
9709 static void
9710 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
9711 const_tree type, bool named)
9713 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9714 HOST_WIDE_INT bytes, words;
9715 int nregs;
9717 /* The argument of an interrupt handler is a special case and is
9718 handled in ix86_function_arg. */
9719 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
9720 return;
9722 if (mode == BLKmode)
9723 bytes = int_size_in_bytes (type);
9724 else
9725 bytes = GET_MODE_SIZE (mode);
9726 words = CEIL (bytes, UNITS_PER_WORD);
9728 if (type)
9729 mode = type_natural_mode (type, NULL, false);
9731 if ((type && POINTER_BOUNDS_TYPE_P (type))
9732 || POINTER_BOUNDS_MODE_P (mode))
9734 /* If we pass bounds in the BT then just update the remaining bounds count. */
9735 if (cum->bnds_in_bt)
9737 cum->bnds_in_bt--;
9738 return;
9741 /* Update the remaining number of bounds to force. */
9742 if (cum->force_bnd_pass)
9743 cum->force_bnd_pass--;
9745 cum->bnd_regno++;
9747 return;
9750 /* The first arg not going to Bounds Tables resets this counter. */
9751 cum->bnds_in_bt = 0;
9752 /* For unnamed args we always pass bounds to avoid a bounds mismatch when
9753 the passed and received types do not match. If bounds do not follow an
9754 unnamed arg, still pretend the required number of bounds were passed. */
9755 if (cum->force_bnd_pass)
9757 cum->bnd_regno += cum->force_bnd_pass;
9758 cum->force_bnd_pass = 0;
9761 if (TARGET_64BIT)
9763 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9765 if (call_abi == MS_ABI)
9766 nregs = function_arg_advance_ms_64 (cum, bytes, words);
9767 else
9768 nregs = function_arg_advance_64 (cum, mode, type, words, named);
9770 else
9771 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
9773 /* For stdarg we expect bounds to be passed for each value passed
9774 in a register. */
9775 if (cum->stdarg)
9776 cum->force_bnd_pass = nregs;
9777 /* For pointers passed in memory we expect bounds to be passed in the Bounds
9778 Table. */
9779 if (!nregs)
9780 cum->bnds_in_bt = chkp_type_bounds_count (type);
9783 /* Define where to put the arguments to a function.
9784 Value is zero to push the argument on the stack,
9785 or a hard register in which to store the argument.
9787 MODE is the argument's machine mode.
9788 TYPE is the data type of the argument (as a tree).
9789 This is null for libcalls where that information may
9790 not be available.
9791 CUM is a variable of type CUMULATIVE_ARGS which gives info about
9792 the preceding args and about the function being called.
9793 NAMED is nonzero if this argument is a named parameter
9794 (otherwise it is an extra parameter matching an ellipsis). */
9796 static rtx
9797 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9798 machine_mode orig_mode, const_tree type,
9799 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
9801 bool error_p = false;
9802 /* Avoid the AL settings for the Unix64 ABI. */
9803 if (mode == VOIDmode)
9804 return constm1_rtx;
9806 if (TARGET_IAMCU)
9808 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9809 bytes in registers. */
9810 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9811 goto pass_in_reg;
9812 return NULL_RTX;
9815 switch (mode)
9817 default:
9818 break;
9820 case BLKmode:
9821 if (bytes < 0)
9822 break;
9823 /* FALLTHRU */
9824 case DImode:
9825 case SImode:
9826 case HImode:
9827 case QImode:
9828 pass_in_reg:
9829 if (words <= cum->nregs)
9831 int regno = cum->regno;
9833 /* Fastcall allocates the first two DWORD (SImode) or
9834 smaller arguments to ECX and EDX if the argument isn't an
9835 aggregate type. */
9836 if (cum->fastcall)
9838 if (mode == BLKmode
9839 || mode == DImode
9840 || (type && AGGREGATE_TYPE_P (type)))
9841 break;
9843 /* ECX not EAX is the first allocated register. */
9844 if (regno == AX_REG)
9845 regno = CX_REG;
9847 return gen_rtx_REG (mode, regno);
9849 break;
9851 case DFmode:
9852 if (cum->float_in_sse == -1)
9853 error_p = 1;
9854 if (cum->float_in_sse < 2)
9855 break;
9856 /* FALLTHRU */
9857 case SFmode:
9858 if (cum->float_in_sse == -1)
9859 error_p = 1;
9860 if (cum->float_in_sse < 1)
9861 break;
9862 /* FALLTHRU */
9863 case TImode:
9864 /* In 32bit, we pass TImode in xmm registers. */
9865 case V16QImode:
9866 case V8HImode:
9867 case V4SImode:
9868 case V2DImode:
9869 case V4SFmode:
9870 case V2DFmode:
9871 if (!type || !AGGREGATE_TYPE_P (type))
9873 if (cum->sse_nregs)
9874 return gen_reg_or_parallel (mode, orig_mode,
9875 cum->sse_regno + FIRST_SSE_REG);
9877 break;
9879 case OImode:
9880 case XImode:
9881 /* OImode and XImode shouldn't be used directly. */
9882 gcc_unreachable ();
9884 case V64QImode:
9885 case V32HImode:
9886 case V16SImode:
9887 case V8DImode:
9888 case V16SFmode:
9889 case V8DFmode:
9890 case V8SFmode:
9891 case V8SImode:
9892 case V32QImode:
9893 case V16HImode:
9894 case V4DFmode:
9895 case V4DImode:
9896 if (!type || !AGGREGATE_TYPE_P (type))
9898 if (cum->sse_nregs)
9899 return gen_reg_or_parallel (mode, orig_mode,
9900 cum->sse_regno + FIRST_SSE_REG);
9902 break;
9904 case V8QImode:
9905 case V4HImode:
9906 case V2SImode:
9907 case V2SFmode:
9908 case V1TImode:
9909 case V1DImode:
9910 if (!type || !AGGREGATE_TYPE_P (type))
9912 if (cum->mmx_nregs)
9913 return gen_reg_or_parallel (mode, orig_mode,
9914 cum->mmx_regno + FIRST_MMX_REG);
9916 break;
9918 if (error_p)
9920 cum->float_in_sse = 0;
9921 error ("calling %qD with SSE calling convention without "
9922 "SSE/SSE2 enabled", cum->decl);
9923 sorry ("this is a GCC bug that can be worked around by adding "
9924 "attribute used to function called");
9927 return NULL_RTX;
9930 static rtx
9931 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9932 machine_mode orig_mode, const_tree type, bool named)
9934 /* Handle a hidden AL argument containing number of registers
9935 for varargs x86-64 functions. */
9936 if (mode == VOIDmode)
9937 return GEN_INT (cum->maybe_vaarg
9938 ? (cum->sse_nregs < 0
9939 ? X86_64_SSE_REGPARM_MAX
9940 : cum->sse_regno)
9941 : -1);
9943 switch (mode)
9945 default:
9946 break;
9948 case V8SFmode:
9949 case V8SImode:
9950 case V32QImode:
9951 case V16HImode:
9952 case V4DFmode:
9953 case V4DImode:
9954 case V16SFmode:
9955 case V16SImode:
9956 case V64QImode:
9957 case V32HImode:
9958 case V8DFmode:
9959 case V8DImode:
9960 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
9961 if (!named)
9962 return NULL;
9963 break;
9966 return construct_container (mode, orig_mode, type, 0, cum->nregs,
9967 cum->sse_nregs,
9968 &x86_64_int_parameter_registers [cum->regno],
9969 cum->sse_regno);
9972 static rtx
9973 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9974 machine_mode orig_mode, bool named,
9975 HOST_WIDE_INT bytes)
9977 unsigned int regno;
9979 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
9980 We use the value -2 to specify that the current function call is MS ABI. */
9981 if (mode == VOIDmode)
9982 return GEN_INT (-2);
9984 /* If we've run out of registers, it goes on the stack. */
9985 if (cum->nregs == 0)
9986 return NULL_RTX;
9988 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
9990 /* Only floating point modes are passed in anything but integer regs. */
9991 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
9993 if (named)
9994 regno = cum->regno + FIRST_SSE_REG;
9995 else
9997 rtx t1, t2;
9999 /* Unnamed floating parameters are passed in both the
10000 SSE and integer registers. */
10001 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
10002 t2 = gen_rtx_REG (mode, regno);
10003 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
10004 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
10005 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
10008 /* Handle aggregate types passed in registers. */
10009 if (orig_mode == BLKmode)
10011 if (bytes > 0 && bytes <= 8)
10012 mode = (bytes > 4 ? DImode : SImode);
10013 if (mode == BLKmode)
10014 mode = DImode;
10017 return gen_reg_or_parallel (mode, orig_mode, regno);
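/* For illustration: under the MS ABI an aggregate of 1, 2, 4 or 8 bytes
   is passed by value in a general register (SImode or DImode is
   substituted for BLKmode above); larger aggregates have already been
   turned into pointers by ix86_pass_by_reference.  */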
10020 /* Return where to put the arguments to a function.
10021 Return zero to push the argument on the stack, or a hard register in which to store the argument.
10023 MODE is the argument's machine mode. TYPE is the data type of the
10024 argument. It is null for libcalls where that information may not be
10025 available. CUM gives information about the preceding args and about
10026 the function being called. NAMED is nonzero if this argument is a
10027 named parameter (otherwise it is an extra parameter matching an
10028 ellipsis). */
10030 static rtx
10031 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
10032 const_tree type, bool named)
10034 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10035 machine_mode mode = omode;
10036 HOST_WIDE_INT bytes, words;
10037 rtx arg;
10039 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10041 gcc_assert (type != NULL_TREE);
10042 if (POINTER_TYPE_P (type))
10044 /* This is the pointer argument. */
10045 gcc_assert (TYPE_MODE (type) == Pmode);
10046 if (cfun->machine->func_type == TYPE_INTERRUPT)
10047 /* -WORD(AP) in the current frame in interrupt handler. */
10048 arg = plus_constant (Pmode, arg_pointer_rtx,
10049 -UNITS_PER_WORD);
10050 else
10051 /* (AP) in the current frame in exception handler. */
10052 arg = arg_pointer_rtx;
10054 else
10056 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
10057 && TREE_CODE (type) == INTEGER_TYPE
10058 && TYPE_MODE (type) == word_mode);
10059 /* The integer argument is the error code at -WORD(AP) in
10060 the current frame in exception handler. */
10061 arg = gen_rtx_MEM (word_mode,
10062 plus_constant (Pmode,
10063 arg_pointer_rtx,
10064 -UNITS_PER_WORD));
10066 return arg;
10069 /* All pointer bounds arguments are handled separately here. */
10070 if ((type && POINTER_BOUNDS_TYPE_P (type))
10071 || POINTER_BOUNDS_MODE_P (mode))
10073 /* Return NULL if bounds are forced to go in Bounds Table. */
10074 if (cum->bnds_in_bt)
10075 arg = NULL;
10076 /* Return the next available bound reg if any. */
10077 else if (cum->bnd_regno <= LAST_BND_REG)
10078 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
10079 /* Return the next special slot number otherwise. */
10080 else
10081 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
10083 return arg;
10086 if (mode == BLKmode)
10087 bytes = int_size_in_bytes (type);
10088 else
10089 bytes = GET_MODE_SIZE (mode);
10090 words = CEIL (bytes, UNITS_PER_WORD);
10092 /* To simplify the code below, represent vector types with a vector mode
10093 even if MMX/SSE are not active. */
10094 if (type && TREE_CODE (type) == VECTOR_TYPE)
10095 mode = type_natural_mode (type, cum, false);
10097 if (TARGET_64BIT)
10099 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10101 if (call_abi == MS_ABI)
10102 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
10103 else
10104 arg = function_arg_64 (cum, mode, omode, type, named);
10106 else
10107 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
10109 return arg;
10112 /* A C expression that indicates when an argument must be passed by
10113 reference. If nonzero for an argument, a copy of that argument is
10114 made in memory and a pointer to the argument is passed instead of
10115 the argument itself. The pointer is passed in whatever way is
10116 appropriate for passing a pointer to that type. */
10118 static bool
10119 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
10120 const_tree type, bool)
10122 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10124 /* Bounds are never passed by reference. */
10125 if ((type && POINTER_BOUNDS_TYPE_P (type))
10126 || POINTER_BOUNDS_MODE_P (mode))
10127 return false;
10129 if (TARGET_64BIT)
10131 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10133 /* See Windows x64 Software Convention. */
10134 if (call_abi == MS_ABI)
10136 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
10138 if (type)
10140 /* Arrays are passed by reference. */
10141 if (TREE_CODE (type) == ARRAY_TYPE)
10142 return true;
10144 if (RECORD_OR_UNION_TYPE_P (type))
10146 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
10147 are passed by reference. */
10148 msize = int_size_in_bytes (type);
10152 /* __m128 is passed by reference. */
10153 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
10155 else if (type && int_size_in_bytes (type) == -1)
10156 return true;
10159 return false;
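/* For illustration: under the 64-bit MS ABI the rules above make arrays,
   __m128 values and structs of any size other than 1, 2, 4 or 8 bytes
   (e.g. a 12-byte struct) go by reference, while 8-byte structs and all
   scalars up to 8 bytes go by value.  */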
10162 /* Return true when TYPE should be 128bit aligned for 32bit argument
10163 passing ABI. XXX: This function is obsolete and is only used for
10164 checking psABI compatibility with previous versions of GCC. */
10166 static bool
10167 ix86_compat_aligned_value_p (const_tree type)
10169 machine_mode mode = TYPE_MODE (type);
10170 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10171 || mode == TDmode
10172 || mode == TFmode
10173 || mode == TCmode)
10174 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10175 return true;
10176 if (TYPE_ALIGN (type) < 128)
10177 return false;
10179 if (AGGREGATE_TYPE_P (type))
10181 /* Walk the aggregates recursively. */
10182 switch (TREE_CODE (type))
10184 case RECORD_TYPE:
10185 case UNION_TYPE:
10186 case QUAL_UNION_TYPE:
10188 tree field;
10190 /* Walk all the structure fields. */
10191 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10193 if (TREE_CODE (field) == FIELD_DECL
10194 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10195 return true;
10197 break;
10200 case ARRAY_TYPE:
10201 /* Just for use if some languages pass arrays by value. */
10202 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10203 return true;
10204 break;
10206 default:
10207 gcc_unreachable ();
10210 return false;
10213 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10214 XXX: This function is obsolete and is only used for checking psABI
10215 compatibility with previous versions of GCC. */
10217 static unsigned int
10218 ix86_compat_function_arg_boundary (machine_mode mode,
10219 const_tree type, unsigned int align)
10221 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10222 natural boundaries. */
10223 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10225 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10226 make an exception for SSE modes since these require 128bit
10227 alignment.
10229 The handling here differs from field_alignment. ICC aligns MMX
10230 arguments to 4 byte boundaries, while structure fields are aligned
10231 to 8 byte boundaries. */
10232 if (!type)
10234 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10235 align = PARM_BOUNDARY;
10237 else
10239 if (!ix86_compat_aligned_value_p (type))
10240 align = PARM_BOUNDARY;
10243 if (align > BIGGEST_ALIGNMENT)
10244 align = BIGGEST_ALIGNMENT;
10245 return align;
10248 /* Return true when TYPE should be 128bit aligned for 32bit argument
10249 passing ABI. */
10251 static bool
10252 ix86_contains_aligned_value_p (const_tree type)
10254 machine_mode mode = TYPE_MODE (type);
10256 if (mode == XFmode || mode == XCmode)
10257 return false;
10259 if (TYPE_ALIGN (type) < 128)
10260 return false;
10262 if (AGGREGATE_TYPE_P (type))
10264 /* Walk the aggregates recursively. */
10265 switch (TREE_CODE (type))
10267 case RECORD_TYPE:
10268 case UNION_TYPE:
10269 case QUAL_UNION_TYPE:
10271 tree field;
10273 /* Walk all the structure fields. */
10274 for (field = TYPE_FIELDS (type);
10275 field;
10276 field = DECL_CHAIN (field))
10278 if (TREE_CODE (field) == FIELD_DECL
10279 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10280 return true;
10282 break;
10285 case ARRAY_TYPE:
10286 /* Just for use if some languages pass arrays by value. */
10287 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10288 return true;
10289 break;
10291 default:
10292 gcc_unreachable ();
10295 else
10296 return TYPE_ALIGN (type) >= 128;
10298 return false;
10301 /* Gives the alignment boundary, in bits, of an argument with the
10302 specified mode and type. */
10304 static unsigned int
10305 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10307 unsigned int align;
10308 if (type)
10310 /* Since the main variant type is used for the call, convert TYPE to
10311 its main variant. */
10312 type = TYPE_MAIN_VARIANT (type);
10313 align = TYPE_ALIGN (type);
10315 else
10316 align = GET_MODE_ALIGNMENT (mode);
10317 if (align < PARM_BOUNDARY)
10318 align = PARM_BOUNDARY;
10319 else
10321 static bool warned;
10322 unsigned int saved_align = align;
10324 if (!TARGET_64BIT)
10326 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10327 if (!type)
10329 if (mode == XFmode || mode == XCmode)
10330 align = PARM_BOUNDARY;
10332 else if (!ix86_contains_aligned_value_p (type))
10333 align = PARM_BOUNDARY;
10335 if (align < 128)
10336 align = PARM_BOUNDARY;
10339 if (warn_psabi
10340 && !warned
10341 && align != ix86_compat_function_arg_boundary (mode, type,
10342 saved_align))
10344 warned = true;
10345 inform (input_location,
10346 "The ABI for passing parameters with %d-byte"
10347 " alignment has changed in GCC 4.6",
10348 align / BITS_PER_UNIT);
10352 return align;
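/* For illustration: on 32-bit targets a plain double argument ends up
   aligned to PARM_BOUNDARY (32 bits) by the logic above, whereas an
   __m128 argument keeps its 128-bit alignment; when the result differs
   from the pre-GCC 4.6 behaviour, a one-time -Wpsabi note about the ABI
   change is emitted.  */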
10355 /* Return true if N is a possible register number of function value. */
10357 static bool
10358 ix86_function_value_regno_p (const unsigned int regno)
10360 switch (regno)
10362 case AX_REG:
10363 return true;
10364 case DX_REG:
10365 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10366 case DI_REG:
10367 case SI_REG:
10368 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10370 case BND0_REG:
10371 case BND1_REG:
10372 return chkp_function_instrumented_p (current_function_decl);
10374 /* Complex values are returned in %st(0)/%st(1) pair. */
10375 case ST0_REG:
10376 case ST1_REG:
10377 /* TODO: The function should depend on current function ABI but
10378 builtins.c would need updating then. Therefore we use the
10379 default ABI. */
10380 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10381 return false;
10382 return TARGET_FLOAT_RETURNS_IN_80387;
10384 /* Complex values are returned in %xmm0/%xmm1 pair. */
10385 case XMM0_REG:
10386 case XMM1_REG:
10387 return TARGET_SSE;
10389 case MM0_REG:
10390 if (TARGET_MACHO || TARGET_64BIT)
10391 return false;
10392 return TARGET_MMX;
10395 return false;
10398 /* Define how to find the value returned by a function.
10399 VALTYPE is the data type of the value (as a tree).
10400 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10401 otherwise, FUNC is 0. */
10403 static rtx
10404 function_value_32 (machine_mode orig_mode, machine_mode mode,
10405 const_tree fntype, const_tree fn)
10407 unsigned int regno;
10409 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10410 we normally prevent this case when mmx is not available. However
10411 some ABIs may require the result to be returned like DImode. */
10412 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10413 regno = FIRST_MMX_REG;
10415 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10416 we prevent this case when sse is not available. However some ABIs
10417 may require the result to be returned like integer TImode. */
10418 else if (mode == TImode
10419 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10420 regno = FIRST_SSE_REG;
10422 /* 32-byte vector modes in %ymm0. */
10423 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10424 regno = FIRST_SSE_REG;
10426 /* 64-byte vector modes in %zmm0. */
10427 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10428 regno = FIRST_SSE_REG;
10430 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10431 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10432 regno = FIRST_FLOAT_REG;
10433 else
10434 /* Most things go in %eax. */
10435 regno = AX_REG;
10437 /* Override FP return register with %xmm0 for local functions when
10438 SSE math is enabled or for functions with sseregparm attribute. */
10439 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10441 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10442 if (sse_level == -1)
10444 error ("calling %qD with SSE calling convention without "
10445 "SSE/SSE2 enabled", fn);
10446 sorry ("this is a GCC bug that can be worked around by adding "
10447 "attribute used to function called");
10449 else if ((sse_level >= 1 && mode == SFmode)
10450 || (sse_level == 2 && mode == DFmode))
10451 regno = FIRST_SSE_REG;
10454 /* OImode shouldn't be used directly. */
10455 gcc_assert (mode != OImode);
10457 return gen_rtx_REG (orig_mode, regno);
10460 static rtx
10461 function_value_64 (machine_mode orig_mode, machine_mode mode,
10462 const_tree valtype)
10464 rtx ret;
10466 /* Handle libcalls, which don't provide a type node. */
10467 if (valtype == NULL)
10469 unsigned int regno;
10471 switch (mode)
10473 case SFmode:
10474 case SCmode:
10475 case DFmode:
10476 case DCmode:
10477 case TFmode:
10478 case SDmode:
10479 case DDmode:
10480 case TDmode:
10481 regno = FIRST_SSE_REG;
10482 break;
10483 case XFmode:
10484 case XCmode:
10485 regno = FIRST_FLOAT_REG;
10486 break;
10487 case TCmode:
10488 return NULL;
10489 default:
10490 regno = AX_REG;
10493 return gen_rtx_REG (mode, regno);
10495 else if (POINTER_TYPE_P (valtype))
10497 /* Pointers are always returned in word_mode. */
10498 mode = word_mode;
10501 ret = construct_container (mode, orig_mode, valtype, 1,
10502 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10503 x86_64_int_return_registers, 0);
10505 /* For zero sized structures, construct_container returns NULL, but we
10506 need to keep the rest of the compiler happy by returning a meaningful value. */
10507 if (!ret)
10508 ret = gen_rtx_REG (orig_mode, AX_REG);
10510 return ret;
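/* Likewise for the 64-bit MS ABI: values are normally returned in %rax,
   but scalar floats (SFmode/DFmode) and 16-byte integer or vector values
   use %xmm0 when SSE is enabled.  */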
10513 static rtx
10514 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10515 const_tree valtype)
10517 unsigned int regno = AX_REG;
10519 if (TARGET_SSE)
10521 switch (GET_MODE_SIZE (mode))
10523 case 16:
10524 if (valtype != NULL_TREE
10525 && !VECTOR_INTEGER_TYPE_P (valtype)
10527 && !INTEGRAL_TYPE_P (valtype)
10528 && !VECTOR_FLOAT_TYPE_P (valtype))
10529 break;
10530 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10531 && !COMPLEX_MODE_P (mode))
10532 regno = FIRST_SSE_REG;
10533 break;
10534 case 8:
10535 case 4:
10536 if (mode == SFmode || mode == DFmode)
10537 regno = FIRST_SSE_REG;
10538 break;
10539 default:
10540 break;
10543 return gen_rtx_REG (orig_mode, regno);
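/* Common worker for the return-value routines: dispatch to the 32-bit,
   64-bit SysV or 64-bit MS variant above, after routing pointer-bounds
   (MPX) values to the bound registers.  */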
10546 static rtx
10547 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10548 machine_mode orig_mode, machine_mode mode)
10550 const_tree fn, fntype;
10552 fn = NULL_TREE;
10553 if (fntype_or_decl && DECL_P (fntype_or_decl))
10554 fn = fntype_or_decl;
10555 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
10557 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
10558 || POINTER_BOUNDS_MODE_P (mode))
10559 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
10560 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
10561 return function_value_ms_64 (orig_mode, mode, valtype);
10562 else if (TARGET_64BIT)
10563 return function_value_64 (orig_mode, mode, valtype);
10564 else
10565 return function_value_32 (orig_mode, mode, fntype, fn);
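/* Compute the natural mode of VALTYPE and return the RTX describing
   where a value of that type is returned.  */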
10568 static rtx
10569 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
10571 machine_mode mode, orig_mode;
10573 orig_mode = TYPE_MODE (valtype);
10574 mode = type_natural_mode (valtype, NULL, true);
10575 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
10578 /* Return an RTX representing a place where a function returns
10579 or receives pointer bounds or NULL if no bounds are returned.
10581 VALTYPE is a data type of a value returned by the function.
10583 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
10584 or FUNCTION_TYPE of the function.
10586 If OUTGOING is false, return a place in which the caller will
10587 see the return value. Otherwise, return a place where a
10588 function returns a value. */
10590 static rtx
10591 ix86_function_value_bounds (const_tree valtype,
10592 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
10593 bool outgoing ATTRIBUTE_UNUSED)
10595 rtx res = NULL_RTX;
10597 if (BOUNDED_TYPE_P (valtype))
10598 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
10599 else if (chkp_type_has_pointer (valtype))
10601 bitmap slots;
10602 rtx bounds[2];
10603 bitmap_iterator bi;
10604 unsigned i, bnd_no = 0;
10606 bitmap_obstack_initialize (NULL);
10607 slots = BITMAP_ALLOC (NULL);
10608 chkp_find_bound_slots (valtype, slots);
10610 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
10612 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
10613 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
10614 gcc_assert (bnd_no < 2);
10615 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
10618 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
10620 BITMAP_FREE (slots);
10621 bitmap_obstack_release (NULL);
10623 else
10624 res = NULL_RTX;
10626 return res;
10629 /* Pointer function arguments and return values are promoted to
10630 word_mode for normal functions. */
10632 static machine_mode
10633 ix86_promote_function_mode (const_tree type, machine_mode mode,
10634 int *punsignedp, const_tree fntype,
10635 int for_return)
10637 if (cfun->machine->func_type == TYPE_NORMAL
10638 && type != NULL_TREE
10639 && POINTER_TYPE_P (type))
10641 *punsignedp = POINTERS_EXTEND_UNSIGNED;
10642 return word_mode;
10644 return default_promote_function_mode (type, mode, punsignedp, fntype,
10645 for_return);
10648 /* Return true if a structure, union or array with MODE containing FIELD
10649 should be accessed using BLKmode. */
10651 static bool
10652 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
10654 /* Union with XFmode must be in BLKmode. */
10655 return (mode == XFmode
10656 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
10657 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
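/* Return the location of a scalar libcall value of MODE; libcalls carry
   no type information, so the type is passed as NULL.  */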
10660 rtx
10661 ix86_libcall_value (machine_mode mode)
10663 return ix86_function_value_1 (NULL, NULL, mode, mode);
10666 /* Return true iff type is returned in memory. */
10668 static bool
10669 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
10671 #ifdef SUBTARGET_RETURN_IN_MEMORY
10672 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
10673 #else
10674 const machine_mode mode = type_natural_mode (type, NULL, true);
10675 HOST_WIDE_INT size;
10677 if (POINTER_BOUNDS_TYPE_P (type))
10678 return false;
10680 if (TARGET_64BIT)
10682 if (ix86_function_type_abi (fntype) == MS_ABI)
10684 size = int_size_in_bytes (type);
10686 /* __m128 is returned in xmm0. */
10687 if ((!type || VECTOR_INTEGER_TYPE_P (type)
10688 || INTEGRAL_TYPE_P (type)
10689 || VECTOR_FLOAT_TYPE_P (type))
10690 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10691 && !COMPLEX_MODE_P (mode)
10692 && (GET_MODE_SIZE (mode) == 16 || size == 16))
10693 return false;
10695 /* Otherwise, the size must be exactly in [1248]. */
10696 return size != 1 && size != 2 && size != 4 && size != 8;
10698 else
10700 int needed_intregs, needed_sseregs;
10702 return examine_argument (mode, type, 1,
10703 &needed_intregs, &needed_sseregs);
10706 else
10708 size = int_size_in_bytes (type);
10710 /* Intel MCU psABI returns scalars and aggregates no larger than 8
10711 bytes in registers. */
10712 if (TARGET_IAMCU)
10713 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
10715 if (mode == BLKmode)
10716 return true;
10718 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
10719 return false;
10721 if (VECTOR_MODE_P (mode) || mode == TImode)
10723 /* User-created vectors small enough to fit in EAX. */
10724 if (size < 8)
10725 return false;
10727 /* Unless the ABI prescribes otherwise,
10728 MMX/3dNow values are returned in MM0 if available. */
10730 if (size == 8)
10731 return TARGET_VECT8_RETURNS || !TARGET_MMX;
10733 /* SSE values are returned in XMM0 if available. */
10734 if (size == 16)
10735 return !TARGET_SSE;
10737 /* AVX values are returned in YMM0 if available. */
10738 if (size == 32)
10739 return !TARGET_AVX;
10741 /* AVX512F values are returned in ZMM0 if available. */
10742 if (size == 64)
10743 return !TARGET_AVX512F;
10746 if (mode == XFmode)
10747 return false;
10749 if (size > 12)
10750 return true;
10752 /* OImode shouldn't be used directly. */
10753 gcc_assert (mode != OImode);
10755 return false;
10757 #endif
10761 /* Create the va_list data type. */
10763 static tree
10764 ix86_build_builtin_va_list_64 (void)
10766 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
10768 record = lang_hooks.types.make_type (RECORD_TYPE);
10769 type_decl = build_decl (BUILTINS_LOCATION,
10770 TYPE_DECL, get_identifier ("__va_list_tag"), record);
10772 f_gpr = build_decl (BUILTINS_LOCATION,
10773 FIELD_DECL, get_identifier ("gp_offset"),
10774 unsigned_type_node);
10775 f_fpr = build_decl (BUILTINS_LOCATION,
10776 FIELD_DECL, get_identifier ("fp_offset"),
10777 unsigned_type_node);
10778 f_ovf = build_decl (BUILTINS_LOCATION,
10779 FIELD_DECL, get_identifier ("overflow_arg_area"),
10780 ptr_type_node);
10781 f_sav = build_decl (BUILTINS_LOCATION,
10782 FIELD_DECL, get_identifier ("reg_save_area"),
10783 ptr_type_node);
10785 va_list_gpr_counter_field = f_gpr;
10786 va_list_fpr_counter_field = f_fpr;
10788 DECL_FIELD_CONTEXT (f_gpr) = record;
10789 DECL_FIELD_CONTEXT (f_fpr) = record;
10790 DECL_FIELD_CONTEXT (f_ovf) = record;
10791 DECL_FIELD_CONTEXT (f_sav) = record;
10793 TYPE_STUB_DECL (record) = type_decl;
10794 TYPE_NAME (record) = type_decl;
10795 TYPE_FIELDS (record) = f_gpr;
10796 DECL_CHAIN (f_gpr) = f_fpr;
10797 DECL_CHAIN (f_fpr) = f_ovf;
10798 DECL_CHAIN (f_ovf) = f_sav;
10800 layout_type (record);
10802 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
10803 NULL_TREE, TYPE_ATTRIBUTES (record));
10805 /* The correct type is an array type of one element. */
10806 return build_array_type (record, build_index_type (size_zero_node));
10809 /* Setup the builtin va_list data type and for 64-bit the additional
10810 calling convention specific va_list data types. */
10812 static tree
10813 ix86_build_builtin_va_list (void)
10815 if (TARGET_64BIT)
10817 /* Initialize ABI specific va_list builtin types.
10819 In lto1, we can encounter two va_list types:
10820 - one as a result of the type-merge across TUs, and
10821 - the one constructed here.
10822 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
10823 a type identity check in canonical_va_list_type based on
10824 TYPE_MAIN_VARIANT (which we used to have) will not work.
10825 Instead, we tag each va_list_type_node with its unique attribute, and
10826 look for the attribute in the type identity check in
10827 canonical_va_list_type.
10829 Tagging sysv_va_list_type_node directly with the attribute is
10830 problematic since it's an array of one record, which will degrade into a
10831 pointer to a record when used as a parameter (see build_va_arg comments for
10832 an example), dropping the attribute in the process. So we tag the
10833 record instead. */
10835 /* For SYSV_ABI we use an array of one record. */
10836 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
10838 /* For MS_ABI we use plain pointer to argument area. */
10839 tree char_ptr_type = build_pointer_type (char_type_node);
10840 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
10841 TYPE_ATTRIBUTES (char_ptr_type));
10842 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
10844 return ((ix86_abi == MS_ABI)
10845 ? ms_va_list_type_node
10846 : sysv_va_list_type_node);
10848 else
10850 /* For i386 we use plain pointer to argument area. */
10851 return build_pointer_type (char_type_node);
10855 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
10857 static void
10858 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
10860 rtx save_area, mem;
10861 alias_set_type set;
10862 int i, max;
10864 /* GPR size of varargs save area. */
10865 if (cfun->va_list_gpr_size)
10866 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
10867 else
10868 ix86_varargs_gpr_size = 0;
10870 /* FPR size of varargs save area. We don't need it if we don't pass
10871 anything in SSE registers. */
10872 if (TARGET_SSE && cfun->va_list_fpr_size)
10873 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
10874 else
10875 ix86_varargs_fpr_size = 0;
10877 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
10878 return;
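/* The register save area is addressed off the frame pointer: GPR slots
   of UNITS_PER_WORD bytes come first, followed by 16-byte SSE slots
   starting at offset ix86_varargs_gpr_size.  */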
10880 save_area = frame_pointer_rtx;
10881 set = get_varargs_alias_set ();
10883 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
10884 if (max > X86_64_REGPARM_MAX)
10885 max = X86_64_REGPARM_MAX;
10887 for (i = cum->regno; i < max; i++)
10889 mem = gen_rtx_MEM (word_mode,
10890 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
10891 MEM_NOTRAP_P (mem) = 1;
10892 set_mem_alias_set (mem, set);
10893 emit_move_insn (mem,
10894 gen_rtx_REG (word_mode,
10895 x86_64_int_parameter_registers[i]));
10898 if (ix86_varargs_fpr_size)
10900 machine_mode smode;
10901 rtx_code_label *label;
10902 rtx test;
10904 /* Now emit code to save SSE registers. The AX parameter contains the number
10905 of SSE parameter registers used to call this function, though all we
10906 actually check here is the zero/non-zero status. */
10908 label = gen_label_rtx ();
10909 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
10910 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
10911 label));
10913 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
10914 we used movdqa (i.e. TImode) instead? Perhaps even better would
10915 be if we could determine the real mode of the data, via a hook
10916 into pass_stdarg. Ignore all that for now. */
10917 smode = V4SFmode;
10918 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
10919 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
10921 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
10922 if (max > X86_64_SSE_REGPARM_MAX)
10923 max = X86_64_SSE_REGPARM_MAX;
10925 for (i = cum->sse_regno; i < max; ++i)
10927 mem = plus_constant (Pmode, save_area,
10928 i * 16 + ix86_varargs_gpr_size);
10929 mem = gen_rtx_MEM (smode, mem);
10930 MEM_NOTRAP_P (mem) = 1;
10931 set_mem_alias_set (mem, set);
10932 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
10934 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
10937 emit_label (label);
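/* Likewise for the 64-bit MS ABI: spill the remaining integer parameter
   registers to their home slots in the incoming argument area so that
   va_arg can read all variable arguments from the stack.  */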
10941 static void
10942 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
10944 alias_set_type set = get_varargs_alias_set ();
10945 int i;
10947 /* Reset to zero, as a SysV va_arg may have been used
10948 before. */
10949 ix86_varargs_gpr_size = 0;
10950 ix86_varargs_fpr_size = 0;
10952 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
10954 rtx reg, mem;
10956 mem = gen_rtx_MEM (Pmode,
10957 plus_constant (Pmode, virtual_incoming_args_rtx,
10958 i * UNITS_PER_WORD));
10959 MEM_NOTRAP_P (mem) = 1;
10960 set_mem_alias_set (mem, set);
10962 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
10963 emit_move_insn (mem, reg);
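/* Implement TARGET_SETUP_INCOMING_VARARGS by dispatching to the MS or
   SysV worker above; nothing to do for 32-bit targets.  */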
10967 static void
10968 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10969 tree type, int *, int no_rtl)
10971 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10972 CUMULATIVE_ARGS next_cum;
10973 tree fntype;
10975 /* This argument doesn't appear to be used anymore. Which is good,
10976 because the old code here didn't suppress rtl generation. */
10977 gcc_assert (!no_rtl);
10979 if (!TARGET_64BIT)
10980 return;
10982 fntype = TREE_TYPE (current_function_decl);
10984 /* For varargs, we do not want to skip the dummy va_dcl argument.
10985 For stdargs, we do want to skip the last named argument. */
10986 next_cum = *cum;
10987 if (stdarg_p (fntype))
10988 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
10989 true);
10991 if (cum->call_abi == MS_ABI)
10992 setup_incoming_varargs_ms_64 (&next_cum);
10993 else
10994 setup_incoming_varargs_64 (&next_cum);
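/* Likewise for MPX bounds: for a chkp-instrumented function, emit BNDSTX
   instructions recording the bounds of the incoming pointer registers that
   setup_incoming_varargs_64 spills into the register save area.  */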
10997 static void
10998 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
10999 enum machine_mode mode,
11000 tree type,
11001 int *pretend_size ATTRIBUTE_UNUSED,
11002 int no_rtl)
11004 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11005 CUMULATIVE_ARGS next_cum;
11006 tree fntype;
11007 rtx save_area;
11008 int bnd_reg, i, max;
11010 gcc_assert (!no_rtl);
11012 /* Do nothing if we use plain pointer to argument area. */
11013 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
11014 return;
11016 fntype = TREE_TYPE (current_function_decl);
11018 /* For varargs, we do not want to skip the dummy va_dcl argument.
11019 For stdargs, we do want to skip the last named argument. */
11020 next_cum = *cum;
11021 if (stdarg_p (fntype))
11022 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11023 true);
11024 save_area = frame_pointer_rtx;
11026 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11027 if (max > X86_64_REGPARM_MAX)
11028 max = X86_64_REGPARM_MAX;
11030 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
11031 if (chkp_function_instrumented_p (current_function_decl))
11032 for (i = cum->regno; i < max; i++)
11034 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
11035 rtx ptr = gen_rtx_REG (Pmode,
11036 x86_64_int_parameter_registers[i]);
11037 rtx bounds;
11039 if (bnd_reg <= LAST_BND_REG)
11040 bounds = gen_rtx_REG (BNDmode, bnd_reg);
11041 else
11043 rtx ldx_addr =
11044 plus_constant (Pmode, arg_pointer_rtx,
11045 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
11046 bounds = gen_reg_rtx (BNDmode);
11047 emit_insn (BNDmode == BND64mode
11048 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
11049 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
11052 emit_insn (BNDmode == BND64mode
11053 ? gen_bnd64_stx (addr, ptr, bounds)
11054 : gen_bnd32_stx (addr, ptr, bounds));
11056 bnd_reg++;
11061 /* Return true if TYPE is the plain char * flavour of va_list. */
11063 static bool
11064 is_va_list_char_pointer (tree type)
11066 tree canonic;
11068 /* For 32-bit it is always true. */
11069 if (!TARGET_64BIT)
11070 return true;
11071 canonic = ix86_canonical_va_list_type (type);
11072 return (canonic == ms_va_list_type_node
11073 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
11076 /* Implement va_start. */
11078 static void
11079 ix86_va_start (tree valist, rtx nextarg)
11081 HOST_WIDE_INT words, n_gpr, n_fpr;
11082 tree f_gpr, f_fpr, f_ovf, f_sav;
11083 tree gpr, fpr, ovf, sav, t;
11084 tree type;
11085 rtx ovf_rtx;
11087 if (flag_split_stack
11088 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11090 unsigned int scratch_regno;
11092 /* When we are splitting the stack, we can't refer to the stack
11093 arguments using internal_arg_pointer, because they may be on
11094 the old stack. The split stack prologue will arrange to
11095 leave a pointer to the old stack arguments in a scratch
11096 register, which we here copy to a pseudo-register. The split
11097 stack prologue can't set the pseudo-register directly because
11098 it (the prologue) runs before any registers have been saved. */
11100 scratch_regno = split_stack_prologue_scratch_regno ();
11101 if (scratch_regno != INVALID_REGNUM)
11103 rtx reg;
11104 rtx_insn *seq;
11106 reg = gen_reg_rtx (Pmode);
11107 cfun->machine->split_stack_varargs_pointer = reg;
11109 start_sequence ();
11110 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
11111 seq = get_insns ();
11112 end_sequence ();
11114 push_topmost_sequence ();
11115 emit_insn_after (seq, entry_of_function ());
11116 pop_topmost_sequence ();
11120 /* Only 64bit target needs something special. */
11121 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11123 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11124 std_expand_builtin_va_start (valist, nextarg);
11125 else
11127 rtx va_r, next;
11129 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
11130 next = expand_binop (ptr_mode, add_optab,
11131 cfun->machine->split_stack_varargs_pointer,
11132 crtl->args.arg_offset_rtx,
11133 NULL_RTX, 0, OPTAB_LIB_WIDEN);
11134 convert_move (va_r, next, 0);
11136 /* Store zero bounds for va_list. */
11137 if (chkp_function_instrumented_p (current_function_decl))
11138 chkp_expand_bounds_reset_for_mem (valist,
11139 make_tree (TREE_TYPE (valist),
11140 next));
11143 return;
11146 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11147 f_fpr = DECL_CHAIN (f_gpr);
11148 f_ovf = DECL_CHAIN (f_fpr);
11149 f_sav = DECL_CHAIN (f_ovf);
11151 valist = build_simple_mem_ref (valist);
11152 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
11153 /* The following should be folded into the MEM_REF offset. */
11154 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11155 f_gpr, NULL_TREE);
11156 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11157 f_fpr, NULL_TREE);
11158 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11159 f_ovf, NULL_TREE);
11160 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11161 f_sav, NULL_TREE);
11163 /* Count number of gp and fp argument registers used. */
11164 words = crtl->args.info.words;
11165 n_gpr = crtl->args.info.regno;
11166 n_fpr = crtl->args.info.sse_regno;
11168 if (cfun->va_list_gpr_size)
11170 type = TREE_TYPE (gpr);
11171 t = build2 (MODIFY_EXPR, type,
11172 gpr, build_int_cst (type, n_gpr * 8));
11173 TREE_SIDE_EFFECTS (t) = 1;
11174 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11177 if (TARGET_SSE && cfun->va_list_fpr_size)
11179 type = TREE_TYPE (fpr);
11180 t = build2 (MODIFY_EXPR, type, fpr,
11181 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11182 TREE_SIDE_EFFECTS (t) = 1;
11183 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11186 /* Find the overflow area. */
11187 type = TREE_TYPE (ovf);
11188 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11189 ovf_rtx = crtl->args.internal_arg_pointer;
11190 else
11191 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11192 t = make_tree (type, ovf_rtx);
11193 if (words != 0)
11194 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11196 /* Store zero bounds for overflow area pointer. */
11197 if (chkp_function_instrumented_p (current_function_decl))
11198 chkp_expand_bounds_reset_for_mem (ovf, t);
11200 t = build2 (MODIFY_EXPR, type, ovf, t);
11201 TREE_SIDE_EFFECTS (t) = 1;
11202 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11204 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11206 /* Find the register save area.
11207 The function prologue saves it right above the stack frame. */
11208 type = TREE_TYPE (sav);
11209 t = make_tree (type, frame_pointer_rtx);
11210 if (!ix86_varargs_gpr_size)
11211 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11213 /* Store zero bounds for save area pointer. */
11214 if (chkp_function_instrumented_p (current_function_decl))
11215 chkp_expand_bounds_reset_for_mem (sav, t);
11217 t = build2 (MODIFY_EXPR, type, sav, t);
11218 TREE_SIDE_EFFECTS (t) = 1;
11219 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11223 /* Implement va_arg. */
11225 static tree
11226 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11227 gimple_seq *post_p)
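/* Dummy register numbers 0..5: construct_container uses these as slot
   indices relative to the current gp_offset in the register save area
   (index I maps to byte offset I * 8), not as hard register numbers.  */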
11229 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11230 tree f_gpr, f_fpr, f_ovf, f_sav;
11231 tree gpr, fpr, ovf, sav, t;
11232 int size, rsize;
11233 tree lab_false, lab_over = NULL_TREE;
11234 tree addr, t2;
11235 rtx container;
11236 int indirect_p = 0;
11237 tree ptrtype;
11238 machine_mode nat_mode;
11239 unsigned int arg_boundary;
11241 /* Only 64bit target needs something special. */
11242 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11243 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11245 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11246 f_fpr = DECL_CHAIN (f_gpr);
11247 f_ovf = DECL_CHAIN (f_fpr);
11248 f_sav = DECL_CHAIN (f_ovf);
11250 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11251 valist, f_gpr, NULL_TREE);
11253 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11254 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11255 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11257 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11258 if (indirect_p)
11259 type = build_pointer_type (type);
11260 size = int_size_in_bytes (type);
11261 rsize = CEIL (size, UNITS_PER_WORD);
11263 nat_mode = type_natural_mode (type, NULL, false);
11264 switch (nat_mode)
11266 case V8SFmode:
11267 case V8SImode:
11268 case V32QImode:
11269 case V16HImode:
11270 case V4DFmode:
11271 case V4DImode:
11272 case V16SFmode:
11273 case V16SImode:
11274 case V64QImode:
11275 case V32HImode:
11276 case V8DFmode:
11277 case V8DImode:
11278 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
11279 if (!TARGET_64BIT_MS_ABI)
11281 container = NULL;
11282 break;
11284 /* FALLTHRU */
11286 default:
11287 container = construct_container (nat_mode, TYPE_MODE (type),
11288 type, 0, X86_64_REGPARM_MAX,
11289 X86_64_SSE_REGPARM_MAX, intreg,
11290 0);
11291 break;
11294 /* Pull the value out of the saved registers. */
11296 addr = create_tmp_var (ptr_type_node, "addr");
11298 if (container)
11300 int needed_intregs, needed_sseregs;
11301 bool need_temp;
11302 tree int_addr, sse_addr;
11304 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11305 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11307 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11309 need_temp = (!REG_P (container)
11310 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11311 || TYPE_ALIGN (type) > 128));
11313 /* If we are passing a structure, verify that it occupies a consecutive block
11314 of the register save area. If not, we need to do moves. */
11315 if (!need_temp && !REG_P (container))
11317 /* Verify that all registers are strictly consecutive */
11318 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11320 int i;
11322 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11324 rtx slot = XVECEXP (container, 0, i);
11325 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11326 || INTVAL (XEXP (slot, 1)) != i * 16)
11327 need_temp = true;
11330 else
11332 int i;
11334 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11336 rtx slot = XVECEXP (container, 0, i);
11337 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11338 || INTVAL (XEXP (slot, 1)) != i * 8)
11339 need_temp = true;
11343 if (!need_temp)
11345 int_addr = addr;
11346 sse_addr = addr;
11348 else
11350 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11351 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11354 /* First ensure that we fit completely in registers. */
11355 if (needed_intregs)
11357 t = build_int_cst (TREE_TYPE (gpr),
11358 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11359 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11360 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11361 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11362 gimplify_and_add (t, pre_p);
11364 if (needed_sseregs)
11366 t = build_int_cst (TREE_TYPE (fpr),
11367 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11368 + X86_64_REGPARM_MAX * 8);
11369 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11370 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11371 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11372 gimplify_and_add (t, pre_p);
11375 /* Compute index to start of area used for integer regs. */
11376 if (needed_intregs)
11378 /* int_addr = gpr + sav; */
11379 t = fold_build_pointer_plus (sav, gpr);
11380 gimplify_assign (int_addr, t, pre_p);
11382 if (needed_sseregs)
11384 /* sse_addr = fpr + sav; */
11385 t = fold_build_pointer_plus (sav, fpr);
11386 gimplify_assign (sse_addr, t, pre_p);
11388 if (need_temp)
11390 int i, prev_size = 0;
11391 tree temp = create_tmp_var (type, "va_arg_tmp");
11393 /* addr = &temp; */
11394 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11395 gimplify_assign (addr, t, pre_p);
11397 for (i = 0; i < XVECLEN (container, 0); i++)
11399 rtx slot = XVECEXP (container, 0, i);
11400 rtx reg = XEXP (slot, 0);
11401 machine_mode mode = GET_MODE (reg);
11402 tree piece_type;
11403 tree addr_type;
11404 tree daddr_type;
11405 tree src_addr, src;
11406 int src_offset;
11407 tree dest_addr, dest;
11408 int cur_size = GET_MODE_SIZE (mode);
11410 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11411 prev_size = INTVAL (XEXP (slot, 1));
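/* The final piece of the value may be shorter than a full register;
   shrink the copy to the bytes that remain, falling back to the memcpy
   path below when no integer mode of that exact size exists.  */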
11412 if (prev_size + cur_size > size)
11414 cur_size = size - prev_size;
11415 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11416 if (mode == BLKmode)
11417 mode = QImode;
11419 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11420 if (mode == GET_MODE (reg))
11421 addr_type = build_pointer_type (piece_type);
11422 else
11423 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11424 true);
11425 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11426 true);
11428 if (SSE_REGNO_P (REGNO (reg)))
11430 src_addr = sse_addr;
11431 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11433 else
11435 src_addr = int_addr;
11436 src_offset = REGNO (reg) * 8;
11438 src_addr = fold_convert (addr_type, src_addr);
11439 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11441 dest_addr = fold_convert (daddr_type, addr);
11442 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11443 if (cur_size == GET_MODE_SIZE (mode))
11445 src = build_va_arg_indirect_ref (src_addr);
11446 dest = build_va_arg_indirect_ref (dest_addr);
11448 gimplify_assign (dest, src, pre_p);
11450 else
11452 tree copy
11453 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11454 3, dest_addr, src_addr,
11455 size_int (cur_size));
11456 gimplify_and_add (copy, pre_p);
11458 prev_size += cur_size;
11462 if (needed_intregs)
11464 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11465 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11466 gimplify_assign (gpr, t, pre_p);
11469 if (needed_sseregs)
11471 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11472 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11473 gimplify_assign (unshare_expr (fpr), t, pre_p);
11476 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11478 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11481 /* ... otherwise out of the overflow area. */
11483 /* When the caller aligns a parameter on the stack, an alignment beyond
11484 MAX_SUPPORTED_STACK_ALIGNMENT is clamped to MAX_SUPPORTED_STACK_ALIGNMENT.
11485 Match the caller's behaviour here in the callee. */
11487 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11488 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11489 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11491 /* Care for on-stack alignment if needed. */
11492 if (arg_boundary <= 64 || size == 0)
11493 t = ovf;
11494 else
11496 HOST_WIDE_INT align = arg_boundary / 8;
11497 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11498 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11499 build_int_cst (TREE_TYPE (t), -align));
11502 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11503 gimplify_assign (addr, t, pre_p);
11505 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11506 gimplify_assign (unshare_expr (ovf), t, pre_p);
11508 if (container)
11509 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11511 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11512 addr = fold_convert (ptrtype, addr);
11514 if (indirect_p)
11515 addr = build_va_arg_indirect_ref (addr);
11516 return build_va_arg_indirect_ref (addr);
11519 /* Return true if OPNUM's MEM should be matched
11520 in movabs* patterns. */
11522 bool
11523 ix86_check_movabs (rtx insn, int opnum)
11525 rtx set, mem;
11527 set = PATTERN (insn);
11528 if (GET_CODE (set) == PARALLEL)
11529 set = XVECEXP (set, 0, 0);
11530 gcc_assert (GET_CODE (set) == SET);
11531 mem = XEXP (set, opnum);
11532 while (SUBREG_P (mem))
11533 mem = SUBREG_REG (mem);
11534 gcc_assert (MEM_P (mem));
11535 return volatile_ok || !MEM_VOLATILE_P (mem);
11538 /* Return false if INSN contains a MEM with a non-default address space. */
11539 bool
11540 ix86_check_no_addr_space (rtx insn)
11542 subrtx_var_iterator::array_type array;
11543 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11545 rtx x = *iter;
11546 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11547 return false;
11549 return true;
11552 /* Initialize the table of extra 80387 mathematical constants. */
11554 static void
11555 init_ext_80387_constants (void)
11557 static const char * cst[5] =
11559 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
11560 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
11561 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
11562 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
11563 "3.1415926535897932385128089594061862044", /* 4: fldpi */
11565 int i;
11567 for (i = 0; i < 5; i++)
11569 real_from_string (&ext_80387_constants_table[i], cst[i]);
11570 /* Ensure each constant is rounded to XFmode precision. */
11571 real_convert (&ext_80387_constants_table[i],
11572 XFmode, &ext_80387_constants_table[i]);
11575 ext_80387_constants_init = 1;
11578 /* Return non-zero if the constant is something that
11579 can be loaded with a special instruction. */
11581 int
11582 standard_80387_constant_p (rtx x)
11584 machine_mode mode = GET_MODE (x);
11586 const REAL_VALUE_TYPE *r;
11588 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
11589 return -1;
11591 if (x == CONST0_RTX (mode))
11592 return 1;
11593 if (x == CONST1_RTX (mode))
11594 return 2;
11596 r = CONST_DOUBLE_REAL_VALUE (x);
11598 /* For XFmode constants, try to find a special 80387 instruction when
11599 optimizing for size or on those CPUs that benefit from them. */
11600 if (mode == XFmode
11601 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
11603 int i;
11605 if (! ext_80387_constants_init)
11606 init_ext_80387_constants ();
11608 for (i = 0; i < 5; i++)
11609 if (real_identical (r, &ext_80387_constants_table[i]))
11610 return i + 3;
11613 /* Load of the constant -0.0 or -1.0 will be split as
11614 fldz;fchs or fld1;fchs sequence. */
11615 if (real_isnegzero (r))
11616 return 8;
11617 if (real_identical (r, &dconstm1))
11618 return 9;
11620 return 0;
11623 /* Return the opcode of the special instruction to be used to load
11624 the constant X. */
11626 const char *
11627 standard_80387_constant_opcode (rtx x)
11629 switch (standard_80387_constant_p (x))
11631 case 1:
11632 return "fldz";
11633 case 2:
11634 return "fld1";
11635 case 3:
11636 return "fldlg2";
11637 case 4:
11638 return "fldln2";
11639 case 5:
11640 return "fldl2e";
11641 case 6:
11642 return "fldl2t";
11643 case 7:
11644 return "fldpi";
11645 case 8:
11646 case 9:
11647 return "#";
11648 default:
11649 gcc_unreachable ();
11653 /* Return the CONST_DOUBLE representing the 80387 constant that is
11654 loaded by the specified special instruction. The argument IDX
11655 matches the return value from standard_80387_constant_p. */
11657 rtx
11658 standard_80387_constant_rtx (int idx)
11660 int i;
11662 if (! ext_80387_constants_init)
11663 init_ext_80387_constants ();
11665 switch (idx)
11667 case 3:
11668 case 4:
11669 case 5:
11670 case 6:
11671 case 7:
11672 i = idx - 3;
11673 break;
11675 default:
11676 gcc_unreachable ();
11679 return const_double_from_real_value (ext_80387_constants_table[i],
11680 XFmode);
11683 /* Return 1 if X is all zero bits and 2 if X is all one bits,
11684 in a supported SSE/AVX vector mode. */
11686 int
11687 standard_sse_constant_p (rtx x, machine_mode pred_mode)
11689 machine_mode mode;
11691 if (!TARGET_SSE)
11692 return 0;
11694 mode = GET_MODE (x);
11696 if (x == const0_rtx || const0_operand (x, mode))
11697 return 1;
11699 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11701 /* VOIDmode integer constant, get mode from the predicate. */
11702 if (mode == VOIDmode)
11703 mode = pred_mode;
11705 switch (GET_MODE_SIZE (mode))
11707 case 64:
11708 if (TARGET_AVX512F)
11709 return 2;
11710 break;
11711 case 32:
11712 if (TARGET_AVX2)
11713 return 2;
11714 break;
11715 case 16:
11716 if (TARGET_SSE2)
11717 return 2;
11718 break;
11719 case 0:
11720 /* VOIDmode */
11721 gcc_unreachable ();
11722 default:
11723 break;
11727 return 0;
11730 /* Return the opcode of the special instruction to be used to load
11731 the constant X. */
11733 const char *
11734 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
11736 machine_mode mode;
11738 gcc_assert (TARGET_SSE);
11740 mode = GET_MODE (x);
11742 if (x == const0_rtx || const0_operand (x, mode))
11744 switch (get_attr_mode (insn))
11746 case MODE_XI:
11747 return "vpxord\t%g0, %g0, %g0";
11748 case MODE_OI:
11749 return (TARGET_AVX512VL
11750 ? "vpxord\t%x0, %x0, %x0"
11751 : "vpxor\t%x0, %x0, %x0");
11752 case MODE_TI:
11753 return (TARGET_AVX512VL
11754 ? "vpxord\t%t0, %t0, %t0"
11755 : "%vpxor\t%0, %d0");
11757 case MODE_V8DF:
11758 return (TARGET_AVX512DQ
11759 ? "vxorpd\t%g0, %g0, %g0"
11760 : "vpxorq\t%g0, %g0, %g0");
11761 case MODE_V4DF:
11762 return "vxorpd\t%x0, %x0, %x0";
11763 case MODE_V2DF:
11764 return "%vxorpd\t%0, %d0";
11766 case MODE_V16SF:
11767 return (TARGET_AVX512DQ
11768 ? "vxorps\t%g0, %g0, %g0"
11769 : "vpxord\t%g0, %g0, %g0");
11770 case MODE_V8SF:
11771 return "vxorps\t%x0, %x0, %x0";
11772 case MODE_V4SF:
11773 return "%vxorps\t%0, %d0";
11775 default:
11776 gcc_unreachable ();
11779 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11781 enum attr_mode insn_mode = get_attr_mode (insn);
11783 switch (insn_mode)
11785 case MODE_XI:
11786 case MODE_V8DF:
11787 case MODE_V16SF:
11788 gcc_assert (TARGET_AVX512F);
11789 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
11791 case MODE_OI:
11792 case MODE_V4DF:
11793 case MODE_V8SF:
11794 gcc_assert (TARGET_AVX2);
11795 /* FALLTHRU */
11796 case MODE_TI:
11797 case MODE_V2DF:
11798 case MODE_V4SF:
11799 gcc_assert (TARGET_SSE2);
11800 return (TARGET_AVX
11801 ? "vpcmpeqd\t%0, %0, %0"
11802 : "pcmpeqd\t%0, %0");
11804 default:
11805 gcc_unreachable ();
11809 gcc_unreachable ();
11812 /* Returns true if INSN can be transformed from a memory load
11813 to a supported FP constant load. */
11815 bool
11816 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
11818 rtx src = find_constant_src (insn);
11820 gcc_assert (REG_P (dst));
11822 if (src == NULL
11823 || (SSE_REGNO_P (REGNO (dst))
11824 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
11825 || (STACK_REGNO_P (REGNO (dst))
11826 && standard_80387_constant_p (src) < 1))
11827 return false;
11829 return true;
11832 /* Returns true if OP contains a symbol reference */
11834 bool
11835 symbolic_reference_mentioned_p (rtx op)
11837 const char *fmt;
11838 int i;
11840 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
11841 return true;
11843 fmt = GET_RTX_FORMAT (GET_CODE (op));
11844 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
11846 if (fmt[i] == 'E')
11848 int j;
11850 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
11851 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
11852 return true;
11855 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
11856 return true;
11859 return false;
11862 /* Return true if it is appropriate to emit `ret' instructions in the
11863 body of a function. Do this only if the epilogue is simple, needing a
11864 couple of insns. Prior to reloading, we can't tell how many registers
11865 must be saved, so return false then. Return false if there is no frame
11866 marker to de-allocate. */
11868 bool
11869 ix86_can_use_return_insn_p (void)
11871 struct ix86_frame frame;
11873 /* Don't use `ret' instruction in interrupt handler. */
11874 if (! reload_completed
11875 || frame_pointer_needed
11876 || cfun->machine->func_type != TYPE_NORMAL)
11877 return 0;
11879 /* Don't allow more than 32k pop, since that's all we can do
11880 with one instruction. */
11881 if (crtl->args.pops_args && crtl->args.size >= 32768)
11882 return 0;
11884 ix86_compute_frame_layout (&frame);
11885 return (frame.stack_pointer_offset == UNITS_PER_WORD
11886 && (frame.nregs + frame.nsseregs) == 0);
11889 /* Value should be nonzero if functions must have frame pointers.
11890 Zero means the frame pointer need not be set up (and parms may
11891 be accessed via the stack pointer) in functions that seem suitable. */
11893 static bool
11894 ix86_frame_pointer_required (void)
11896 /* If we accessed previous frames, then the generated code expects
11897 to be able to access the saved ebp value in our frame. */
11898 if (cfun->machine->accesses_prev_frame)
11899 return true;
11901 /* Several x86 OSes need a frame pointer for other reasons,
11902 usually pertaining to setjmp. */
11903 if (SUBTARGET_FRAME_POINTER_REQUIRED)
11904 return true;
11906 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
11907 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
11908 return true;
11910 /* With Win64 SEH, very large frames need a frame pointer, as the maximum
11911 stack allocation is 4GB. */
11912 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
11913 return true;
11915 /* SSE saves require frame-pointer when stack is misaligned. */
11916 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
11917 return true;
11919 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
11920 turns off the frame pointer by default. Turn it back on now if
11921 we've not got a leaf function. */
11922 if (TARGET_OMIT_LEAF_FRAME_POINTER
11923 && (!crtl->is_leaf
11924 || ix86_current_function_calls_tls_descriptor))
11925 return true;
11927 if (crtl->profile && !flag_fentry)
11928 return true;
11930 return false;
11933 /* Record that the current function accesses previous call frames. */
11935 void
11936 ix86_setup_frame_addresses (void)
11938 cfun->machine->accesses_prev_frame = 1;
11941 #ifndef USE_HIDDEN_LINKONCE
11942 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
11943 # define USE_HIDDEN_LINKONCE 1
11944 # else
11945 # define USE_HIDDEN_LINKONCE 0
11946 # endif
11947 #endif
11949 static int pic_labels_used;
11951 /* Fills in the label name that should be used for a pc thunk for
11952 the given register. */
11954 static void
11955 get_pc_thunk_name (char name[32], unsigned int regno)
11957 gcc_assert (!TARGET_64BIT);
11959 if (USE_HIDDEN_LINKONCE)
11960 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11961 else
11962 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
11966 /* This function generates code for -fpic that loads %ebx with
11967 the return address of the caller and then returns. */
11969 static void
11970 ix86_code_end (void)
11972 rtx xops[2];
11973 int regno;
11975 for (regno = AX_REG; regno <= SP_REG; regno++)
11977 char name[32];
11978 tree decl;
11980 if (!(pic_labels_used & (1 << regno)))
11981 continue;
11983 get_pc_thunk_name (name, regno);
11985 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11986 get_identifier (name),
11987 build_function_type_list (void_type_node, NULL_TREE));
11988 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11989 NULL_TREE, void_type_node);
11990 TREE_PUBLIC (decl) = 1;
11991 TREE_STATIC (decl) = 1;
11992 DECL_IGNORED_P (decl) = 1;
11994 #if TARGET_MACHO
11995 if (TARGET_MACHO)
11997 switch_to_section (darwin_sections[picbase_thunk_section]);
11998 fputs ("\t.weak_definition\t", asm_out_file);
11999 assemble_name (asm_out_file, name);
12000 fputs ("\n\t.private_extern\t", asm_out_file);
12001 assemble_name (asm_out_file, name);
12002 putc ('\n', asm_out_file);
12003 ASM_OUTPUT_LABEL (asm_out_file, name);
12004 DECL_WEAK (decl) = 1;
12006 else
12007 #endif
12008 if (USE_HIDDEN_LINKONCE)
12010 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
12012 targetm.asm_out.unique_section (decl, 0);
12013 switch_to_section (get_named_section (decl, NULL, 0));
12015 targetm.asm_out.globalize_label (asm_out_file, name);
12016 fputs ("\t.hidden\t", asm_out_file);
12017 assemble_name (asm_out_file, name);
12018 putc ('\n', asm_out_file);
12019 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
12021 else
12023 switch_to_section (text_section);
12024 ASM_OUTPUT_LABEL (asm_out_file, name);
12027 DECL_INITIAL (decl) = make_node (BLOCK);
12028 current_function_decl = decl;
12029 allocate_struct_function (decl, false);
12030 init_function_start (decl);
12031 /* We're about to hide the function body from callees of final_* by
12032 emitting it directly; tell them we're a thunk, if they care. */
12033 cfun->is_thunk = true;
12034 first_function_block_is_cold = false;
12035 /* Make sure unwind info is emitted for the thunk if needed. */
12036 final_start_function (emit_barrier (), asm_out_file, 1);
12038 /* Pad stack IP move with 4 instructions (two NOPs count
12039 as one instruction). */
12040 if (TARGET_PAD_SHORT_FUNCTION)
12042 int i = 8;
12044 while (i--)
12045 fputs ("\tnop\n", asm_out_file);
12048 xops[0] = gen_rtx_REG (Pmode, regno);
12049 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
12050 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
12051 output_asm_insn ("%!ret", NULL);
12052 final_end_function ();
12053 init_insn_lengths ();
12054 free_after_compilation (cfun);
12055 set_cfun (NULL);
12056 current_function_decl = NULL;
12059 if (flag_split_stack)
12060 file_end_indicate_split_stack ();
12063 /* Emit code for the SET_GOT patterns. */
12065 const char *
12066 output_set_got (rtx dest, rtx label)
12068 rtx xops[3];
12070 xops[0] = dest;
12072 if (TARGET_VXWORKS_RTP && flag_pic)
12074 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
12075 xops[2] = gen_rtx_MEM (Pmode,
12076 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
12077 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
12079 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
12080 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
12081 an unadorned address. */
12082 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
12083 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
12084 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
12085 return "";
12088 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
12090 if (flag_pic)
12092 char name[32];
12093 get_pc_thunk_name (name, REGNO (dest));
12094 pic_labels_used |= 1 << REGNO (dest);
12096 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
12097 xops[2] = gen_rtx_MEM (QImode, xops[2]);
12098 output_asm_insn ("%!call\t%X2", xops);
12100 #if TARGET_MACHO
12101 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
12102 This is what will be referenced by the Mach-O PIC subsystem. */
12103 if (machopic_should_output_picbase_label () || !label)
12104 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
12106 /* When we are restoring the pic base at the site of a nonlocal label,
12107 and we decided to emit the pic base above, we will still output a
12108 local label used for calculating the correction offset (even though
12109 the offset will be 0 in that case). */
12110 if (label)
12111 targetm.asm_out.internal_label (asm_out_file, "L",
12112 CODE_LABEL_NUMBER (label));
12113 #endif
12115 else
12117 if (TARGET_MACHO)
12118 /* We don't need a pic base, we're not producing pic. */
12119 gcc_unreachable ();
12121 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
12122 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
12123 targetm.asm_out.internal_label (asm_out_file, "L",
12124 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
12127 if (!TARGET_MACHO)
12128 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
12130 return "";
12133 /* Generate an "push" pattern for input ARG. */
12135 static rtx
12136 gen_push (rtx arg)
12138 struct machine_function *m = cfun->machine;
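/* Keep the CFA and stack pointer tracking in cfun->machine->fs in sync
   with the push emitted below.  */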
12140 if (m->fs.cfa_reg == stack_pointer_rtx)
12141 m->fs.cfa_offset += UNITS_PER_WORD;
12142 m->fs.sp_offset += UNITS_PER_WORD;
12144 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12145 arg = gen_rtx_REG (word_mode, REGNO (arg));
12147 return gen_rtx_SET (gen_rtx_MEM (word_mode,
12148 gen_rtx_PRE_DEC (Pmode,
12149 stack_pointer_rtx)),
12150 arg);
12153 /* Generate an "pop" pattern for input ARG. */
12155 static rtx
12156 gen_pop (rtx arg)
12158 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12159 arg = gen_rtx_REG (word_mode, REGNO (arg));
12161 return gen_rtx_SET (arg,
12162 gen_rtx_MEM (word_mode,
12163 gen_rtx_POST_INC (Pmode,
12164 stack_pointer_rtx)));
12167 /* Return >= 0 if there is an unused call-clobbered register available
12168 for the entire function. */
12170 static unsigned int
12171 ix86_select_alt_pic_regnum (void)
12173 if (ix86_use_pseudo_pic_reg ())
12174 return INVALID_REGNUM;
12176 if (crtl->is_leaf
12177 && !crtl->profile
12178 && !ix86_current_function_calls_tls_descriptor)
12180 int i, drap;
12181 /* Can't use the same register for both PIC and DRAP. */
12182 if (crtl->drap_reg)
12183 drap = REGNO (crtl->drap_reg);
12184 else
12185 drap = -1;
12186 for (i = 2; i >= 0; --i)
12187 if (i != drap && !df_regs_ever_live_p (i))
12188 return i;
12191 return INVALID_REGNUM;
12194 /* Return true if REGNO is used by the epilogue. */
12196 bool
12197 ix86_epilogue_uses (int regno)
12199 /* If there are no caller-saved registers, we preserve all registers,
12200 except for MMX and x87 registers which aren't supported when saving
12201 and restoring registers. Don't explicitly save SP register since
12202 it is always preserved. */
12203 return (epilogue_completed
12204 && cfun->machine->no_caller_saved_registers
12205 && !fixed_regs[regno]
12206 && !STACK_REGNO_P (regno)
12207 && !MMX_REGNO_P (regno));
12210 /* Return nonzero if register REGNO can be used as a scratch register
12211 in peephole2. */
12213 static bool
12214 ix86_hard_regno_scratch_ok (unsigned int regno)
12216 /* If there are no caller-saved registers, we can't use any register
12217 as a scratch register after the epilogue, and we use REGNO as a scratch
12218 register only if it has already been used, to avoid saving and
12219 restoring it. */
12220 return (!cfun->machine->no_caller_saved_registers
12221 || (!epilogue_completed
12222 && df_regs_ever_live_p (regno)));
12225 /* Return true if register class CL should be an additional allocno
12226 class. */
12228 static bool
12229 ix86_additional_allocno_class_p (reg_class_t cl)
12231 return cl == MOD4_SSE_REGS;
12234 /* Return TRUE if we need to save REGNO. */
12236 static bool
12237 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
12239 /* If there are no caller-saved registers, we preserve all registers,
12240 except for MMX and x87 registers which aren't supported when saving
12241 and restoring registers. Don't explicitly save SP register since
12242 it is always preserved. */
12243 if (cfun->machine->no_caller_saved_registers)
12245 /* Don't preserve registers used for function return value. */
12246 rtx reg = crtl->return_rtx;
12247 if (reg)
12249 unsigned int i = REGNO (reg);
12250 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12251 while (nregs-- > 0)
12252 if ((i + nregs) == regno)
12253 return false;
12255 reg = crtl->return_bnd;
12256 if (reg)
12258 i = REGNO (reg);
12259 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12260 while (nregs-- > 0)
12261 if ((i + nregs) == regno)
12262 return false;
12266 return (df_regs_ever_live_p (regno)
12267 && !fixed_regs[regno]
12268 && !STACK_REGNO_P (regno)
12269 && !MMX_REGNO_P (regno)
12270 && (regno != HARD_FRAME_POINTER_REGNUM
12271 || !frame_pointer_needed));
12274 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12275 && pic_offset_table_rtx)
12277 if (ix86_use_pseudo_pic_reg ())
12279 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12280 _mcount in prologue. */
12281 if (!TARGET_64BIT && flag_pic && crtl->profile)
12282 return true;
12284 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12285 || crtl->profile
12286 || crtl->calls_eh_return
12287 || crtl->uses_const_pool
12288 || cfun->has_nonlocal_label)
12289 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12292 if (crtl->calls_eh_return && maybe_eh_return)
12294 unsigned i;
12295 for (i = 0; ; i++)
12297 unsigned test = EH_RETURN_DATA_REGNO (i);
12298 if (test == INVALID_REGNUM)
12299 break;
12300 if (test == regno)
12301 return true;
12305 if (crtl->drap_reg
12306 && regno == REGNO (crtl->drap_reg)
12307 && !cfun->machine->no_drap_save_restore)
12308 return true;
12310 return (df_regs_ever_live_p (regno)
12311 && !call_used_regs[regno]
12312 && !fixed_regs[regno]
12313 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12316 /* Return number of saved general purpose registers. */
12318 static int
12319 ix86_nsaved_regs (void)
12321 int nregs = 0;
12322 int regno;
12324 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12325 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12326 nregs ++;
12327 return nregs;
12330 /* Return number of saved SSE registers. */
12332 static int
12333 ix86_nsaved_sseregs (void)
12335 int nregs = 0;
12336 int regno;
12338 if (!TARGET_64BIT_MS_ABI)
12339 return 0;
12340 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12341 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12342 nregs ++;
12343 return nregs;
12346 /* Given FROM and TO register numbers, say whether this elimination is
12347 allowed. If stack alignment is needed, we can only replace argument
12348 pointer with hard frame pointer, or replace frame pointer with stack
12349 pointer. Otherwise, frame pointer elimination is automatically
12350 handled and all other eliminations are valid. */
12352 static bool
12353 ix86_can_eliminate (const int from, const int to)
12355 if (stack_realign_fp)
12356 return ((from == ARG_POINTER_REGNUM
12357 && to == HARD_FRAME_POINTER_REGNUM)
12358 || (from == FRAME_POINTER_REGNUM
12359 && to == STACK_POINTER_REGNUM));
12360 else
12361 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12364 /* Return the offset between two registers, one to be eliminated, and the other
12365 its replacement, at the start of a routine. */
12367 HOST_WIDE_INT
12368 ix86_initial_elimination_offset (int from, int to)
12370 struct ix86_frame frame;
12371 ix86_compute_frame_layout (&frame);
12373 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12374 return frame.hard_frame_pointer_offset;
12375 else if (from == FRAME_POINTER_REGNUM
12376 && to == HARD_FRAME_POINTER_REGNUM)
12377 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12378 else
12380 gcc_assert (to == STACK_POINTER_REGNUM);
12382 if (from == ARG_POINTER_REGNUM)
12383 return frame.stack_pointer_offset;
12385 gcc_assert (from == FRAME_POINTER_REGNUM);
12386 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12390 /* In a dynamically-aligned function, we can't know the offset from
12391 stack pointer to frame pointer, so we must ensure that setjmp
12392 eliminates fp against the hard fp (%ebp) rather than trying to
12393 index from %esp up to the top of the frame across a gap that is
12394 of unknown (at compile-time) size. */
12395 static rtx
12396 ix86_builtin_setjmp_frame_value (void)
12398 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12401 /* When using -fsplit-stack, the allocation routines set a field in
12402 the TCB to the bottom of the stack plus this much space, measured
12403 in bytes. */
12405 #define SPLIT_STACK_AVAILABLE 256
12407 /* Fill the structure ix86_frame describing the frame of the currently compiled function. */
12409 static void
12410 ix86_compute_frame_layout (struct ix86_frame *frame)
12412 unsigned HOST_WIDE_INT stack_alignment_needed;
12413 HOST_WIDE_INT offset;
12414 unsigned HOST_WIDE_INT preferred_alignment;
12415 HOST_WIDE_INT size = get_frame_size ();
12416 HOST_WIDE_INT to_allocate;
12418 frame->nregs = ix86_nsaved_regs ();
12419 frame->nsseregs = ix86_nsaved_sseregs ();
12421 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
12422 except for function prologues, leaf functions and when the default
12423 incoming stack boundary is overridden at the command line or via the
12424 force_align_arg_pointer attribute. */
12425 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12426 && (!crtl->is_leaf || cfun->calls_alloca != 0
12427 || ix86_current_function_calls_tls_descriptor
12428 || ix86_incoming_stack_boundary < 128))
12430 crtl->preferred_stack_boundary = 128;
12431 crtl->stack_alignment_needed = 128;
12434 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12435 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12437 gcc_assert (!size || stack_alignment_needed);
12438 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12439 gcc_assert (preferred_alignment <= stack_alignment_needed);
12441 /* For SEH we have to limit the amount of code movement into the prologue.
12442 At present we do this via a BLOCKAGE, at which point there's very little
12443 scheduling that can be done, which means that there's very little point
12444 in doing anything except PUSHs. */
12445 if (TARGET_SEH)
12446 cfun->machine->use_fast_prologue_epilogue = false;
12448 /* During reload iteration the number of registers saved can change.
12449 Recompute the value as needed. Do not recompute when the number of registers
12450 didn't change, as reload does multiple calls to the function and does not
12451 expect the decision to change within a single iteration. */
12452 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))
12453 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
12455 int count = frame->nregs;
12456 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12458 cfun->machine->use_fast_prologue_epilogue_nregs = count;
12460 /* The fast prologue uses move instead of push to save registers. This
12461 is significantly longer, but also executes faster as modern hardware
12462 can execute the moves in parallel, but can't do that for push/pop.
12464 Be careful about choosing which prologue to emit: when the function takes
12465 many instructions to execute, we may use the slow version, as well as when
12466 the function is known to be outside a hot spot (this is known only with
12467 profile feedback). Weight the size of the function by the number of registers
12468 to save, as it is cheap to use one or two push instructions but very
12469 slow to use many of them. */
12470 if (count)
12471 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12472 if (node->frequency < NODE_FREQUENCY_NORMAL
12473 || (flag_branch_probabilities
12474 && node->frequency < NODE_FREQUENCY_HOT))
12475 cfun->machine->use_fast_prologue_epilogue = false;
12476 else
12477 cfun->machine->use_fast_prologue_epilogue
12478 = !expensive_function_p (count);
12481 frame->save_regs_using_mov
12482 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
12483 /* If static stack checking is enabled and done with probes,
12484 the registers need to be saved before allocating the frame. */
12485 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
12487 /* Skip return address. */
12488 offset = UNITS_PER_WORD;
12490 /* Skip pushed static chain. */
12491 if (ix86_static_chain_on_stack)
12492 offset += UNITS_PER_WORD;
12494 /* Skip saved base pointer. */
12495 if (frame_pointer_needed)
12496 offset += UNITS_PER_WORD;
12497 frame->hfp_save_offset = offset;
12499 /* The traditional frame pointer location is at the top of the frame. */
12500 frame->hard_frame_pointer_offset = offset;
12502 /* Register save area */
12503 offset += frame->nregs * UNITS_PER_WORD;
12504 frame->reg_save_offset = offset;
12506 /* On SEH target, registers are pushed just before the frame pointer
12507 location. */
12508 if (TARGET_SEH)
12509 frame->hard_frame_pointer_offset = offset;
12511 /* Align and set SSE register save area. */
12512 if (frame->nsseregs)
12514 /* The only ABI that has saved SSE registers (Win64) also has a
12515 16-byte aligned default stack, and thus we don't need to be
12516 within the re-aligned local stack frame to save them. In case the
12517 incoming stack boundary is aligned to less than 16 bytes, an
12518 unaligned move of the SSE register will be emitted, so there is
12519 no point in rounding up the SSE register save area outside the
12520 re-aligned local stack frame to 16 bytes. */
12521 if (ix86_incoming_stack_boundary >= 128)
12522 offset = ROUND_UP (offset, 16);
12523 offset += frame->nsseregs * 16;
12525 frame->sse_reg_save_offset = offset;
12527 /* The re-aligned stack starts here. Values before this point are not
12528 directly comparable with values below this point. In order to make
12529 sure that no value happens to be the same before and after, force
12530 the alignment computation below to add a non-zero value. */
12531 if (stack_realign_fp)
12532 offset = ROUND_UP (offset, stack_alignment_needed);
12534 /* Va-arg area */
12535 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
12536 offset += frame->va_arg_size;
12538 /* Align start of frame for local function. */
12539 if (stack_realign_fp
12540 || offset != frame->sse_reg_save_offset
12541 || size != 0
12542 || !crtl->is_leaf
12543 || cfun->calls_alloca
12544 || ix86_current_function_calls_tls_descriptor)
12545 offset = ROUND_UP (offset, stack_alignment_needed);
12547 /* Frame pointer points here. */
12548 frame->frame_pointer_offset = offset;
12550 offset += size;
12552 /* Add outgoing arguments area. Can be skipped if we eliminated
12553 all the function calls as dead code.
12554 Skipping is however impossible when the function calls alloca, as the
12555 alloca expander assumes that the last crtl->outgoing_args_size bytes
12556 of the stack frame are unused. */
12557 if (ACCUMULATE_OUTGOING_ARGS
12558 && (!crtl->is_leaf || cfun->calls_alloca
12559 || ix86_current_function_calls_tls_descriptor))
12561 offset += crtl->outgoing_args_size;
12562 frame->outgoing_arguments_size = crtl->outgoing_args_size;
12564 else
12565 frame->outgoing_arguments_size = 0;
12567 /* Align stack boundary. Only needed if we're calling another function
12568 or using alloca. */
12569 if (!crtl->is_leaf || cfun->calls_alloca
12570 || ix86_current_function_calls_tls_descriptor)
12571 offset = ROUND_UP (offset, preferred_alignment);
12573 /* We've reached end of stack frame. */
12574 frame->stack_pointer_offset = offset;
12576 /* Size prologue needs to allocate. */
12577 to_allocate = offset - frame->sse_reg_save_offset;
12579 if ((!to_allocate && frame->nregs <= 1)
12580 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
12581 frame->save_regs_using_mov = false;
12583 if (ix86_using_red_zone ()
12584 && crtl->sp_is_unchanging
12585 && crtl->is_leaf
12586 && !ix86_pc_thunk_call_expanded
12587 && !ix86_current_function_calls_tls_descriptor)
12589 frame->red_zone_size = to_allocate;
12590 if (frame->save_regs_using_mov)
12591 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
12592 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
12593 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
12595 else
12596 frame->red_zone_size = 0;
12597 frame->stack_pointer_offset -= frame->red_zone_size;
12599 /* The SEH frame pointer location is near the bottom of the frame.
12600 This is enforced by the fact that the difference between the
12601 stack pointer and the frame pointer is limited to 240 bytes in
12602 the unwind data structure. */
12603 if (TARGET_SEH)
12605 HOST_WIDE_INT diff;
12607 /* If we can leave the frame pointer where it is, do so. This also returns
12608 the establisher frame for __builtin_frame_address (0). */
12609 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
12610 if (diff <= SEH_MAX_FRAME_SIZE
12611 && (diff > 240 || (diff & 15) != 0)
12612 && !crtl->accesses_prior_frames)
12614 /* Ideally we'd determine what portion of the local stack frame
12615 (within the constraint of the lowest 240) is most heavily used.
12616 But without that complication, simply bias the frame pointer
12617 by 128 bytes so as to maximize the amount of the local stack
12618 frame that is addressable with 8-bit offsets. */
12619 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
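/* Illustrative sketch (not part of the original source): a worked example of
   the layout computed above, assuming x86-64 with -mno-red-zone, a frame
   pointer, two saved general registers, 40 bytes of locals, no SSE saves,
   no varargs, a leaf function and a 16-byte stack_alignment_needed.
   Offsets grow downwards from the CFA:

       offset =  8   return address (UNITS_PER_WORD)
       offset = 16   saved %rbp -> hfp_save_offset = hard_frame_pointer_offset
       offset = 32   2 * 8 bytes of register saves -> reg_save_offset
       offset = 32   no SSE saves -> sse_reg_save_offset
       offset = 32   rounded up to 16 -> frame_pointer_offset
       offset = 72   + 40 bytes of locals
       offset = 72   leaf, so no outgoing args and no final rounding
                     -> stack_pointer_offset

   to_allocate = stack_pointer_offset - sse_reg_save_offset = 40 bytes, the
   amount the prologue subtracts from %rsp after the pushes.  */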
12624 /* This is semi-inlined memory_address_length, but simplified
12625 since we know that we're always dealing with reg+offset, and
12626 to avoid having to create and discard all that rtl. */
12628 static inline int
12629 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
12631 int len = 4;
12633 if (offset == 0)
12635 /* EBP and R13 cannot be encoded without an offset. */
12636 len = (regno == BP_REG || regno == R13_REG);
12638 else if (IN_RANGE (offset, -128, 127))
12639 len = 1;
12641 /* ESP and R12 must be encoded with a SIB byte. */
12642 if (regno == SP_REG || regno == R12_REG)
12643 len++;
12645 return len;
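/* For illustration only (examples, not part of the original source): under
   the rules above, 0(%rax) needs no displacement byte (len 0), 0(%rbp) and
   0(%r13) need a one-byte displacement (len 1), 0(%rsp) needs only a SIB
   byte (len 1), -8(%rsp) needs disp8 plus SIB (len 2), and any offset
   outside [-128, 127] costs a four-byte displacement (len 4, plus one more
   for %rsp/%r12).  */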
12648 /* Return an RTX that points to CFA_OFFSET within the stack frame.
12649 The valid base registers are taken from CFUN->MACHINE->FS. */
12651 static rtx
12652 choose_baseaddr (HOST_WIDE_INT cfa_offset)
12654 const struct machine_function *m = cfun->machine;
12655 rtx base_reg = NULL;
12656 HOST_WIDE_INT base_offset = 0;
12658 if (m->use_fast_prologue_epilogue)
12660 /* Choose the base register most likely to allow the most scheduling
12661 opportunities. Generally FP is valid throughout the function,
12662 while DRAP must be reloaded within the epilogue. But choose either
12663 over the SP due to increased encoding size. */
12665 if (m->fs.fp_valid)
12667 base_reg = hard_frame_pointer_rtx;
12668 base_offset = m->fs.fp_offset - cfa_offset;
12670 else if (m->fs.drap_valid)
12672 base_reg = crtl->drap_reg;
12673 base_offset = 0 - cfa_offset;
12675 else if (m->fs.sp_valid)
12677 base_reg = stack_pointer_rtx;
12678 base_offset = m->fs.sp_offset - cfa_offset;
12681 else
12683 HOST_WIDE_INT toffset;
12684 int len = 16, tlen;
12686 /* Choose the base register with the smallest address encoding.
12687 With a tie, choose FP > DRAP > SP. */
12688 if (m->fs.sp_valid)
12690 base_reg = stack_pointer_rtx;
12691 base_offset = m->fs.sp_offset - cfa_offset;
12692 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
12694 if (m->fs.drap_valid)
12696 toffset = 0 - cfa_offset;
12697 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
12698 if (tlen <= len)
12700 base_reg = crtl->drap_reg;
12701 base_offset = toffset;
12702 len = tlen;
12705 if (m->fs.fp_valid)
12707 toffset = m->fs.fp_offset - cfa_offset;
12708 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
12709 if (tlen <= len)
12711 base_reg = hard_frame_pointer_rtx;
12712 base_offset = toffset;
12713 len = tlen;
12717 gcc_assert (base_reg != NULL);
12719 return plus_constant (Pmode, base_reg, base_offset);
12722 /* Emit code to save registers in the prologue. */
12724 static void
12725 ix86_emit_save_regs (void)
12727 unsigned int regno;
12728 rtx_insn *insn;
12730 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12731 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12733 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12734 RTX_FRAME_RELATED_P (insn) = 1;
12738 /* Emit a single register save at CFA - CFA_OFFSET. */
12740 static void
12741 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12742 HOST_WIDE_INT cfa_offset)
12744 struct machine_function *m = cfun->machine;
12745 rtx reg = gen_rtx_REG (mode, regno);
12746 rtx mem, addr, base, insn;
12747 unsigned int align;
12749 addr = choose_baseaddr (cfa_offset);
12750 mem = gen_frame_mem (mode, addr);
12752 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
12753 align = MIN (GET_MODE_ALIGNMENT (mode), INCOMING_STACK_BOUNDARY);
12754 set_mem_align (mem, align);
12756 insn = emit_insn (gen_rtx_SET (mem, reg));
12757 RTX_FRAME_RELATED_P (insn) = 1;
12759 base = addr;
12760 if (GET_CODE (base) == PLUS)
12761 base = XEXP (base, 0);
12762 gcc_checking_assert (REG_P (base));
12764 /* When saving registers into a re-aligned local stack frame, avoid
12765 any tricky guessing by dwarf2out. */
12766 if (m->fs.realigned)
12768 gcc_checking_assert (stack_realign_drap);
12770 if (regno == REGNO (crtl->drap_reg))
12772 /* A bit of a hack. We force the DRAP register to be saved in
12773 the re-aligned stack frame, which provides us with a copy
12774 of the CFA that will last past the prologue. Install it. */
12775 gcc_checking_assert (cfun->machine->fs.fp_valid);
12776 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12777 cfun->machine->fs.fp_offset - cfa_offset);
12778 mem = gen_rtx_MEM (mode, addr);
12779 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12781 else
12783 /* The frame pointer is a stable reference within the
12784 aligned frame. Use it. */
12785 gcc_checking_assert (cfun->machine->fs.fp_valid);
12786 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12787 cfun->machine->fs.fp_offset - cfa_offset);
12788 mem = gen_rtx_MEM (mode, addr);
12789 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12793 /* The memory may not be relative to the current CFA register,
12794 which means that we may need to generate a new pattern for
12795 use by the unwind info. */
12796 else if (base != m->fs.cfa_reg)
12798 addr = plus_constant (Pmode, m->fs.cfa_reg,
12799 m->fs.cfa_offset - cfa_offset);
12800 mem = gen_rtx_MEM (mode, addr);
12801 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12805 /* Emit code to save registers using MOV insns.
12806 First register is stored at CFA - CFA_OFFSET. */
12807 static void
12808 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12810 unsigned int regno;
12812 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12813 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12815 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12816 cfa_offset -= UNITS_PER_WORD;
12820 /* Emit code to save SSE registers using MOV insns.
12821 First register is stored at CFA - CFA_OFFSET. */
12822 static void
12823 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12825 unsigned int regno;
12827 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12828 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12830 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12831 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12835 static GTY(()) rtx queued_cfa_restores;
12837 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
12838 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
12839 Don't add the note if the previously saved value will be left untouched
12840 within the stack red zone until return, as unwinders can find the same value
12841 in the register and on the stack. */
12843 static void
12844 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12846 if (!crtl->shrink_wrapped
12847 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12848 return;
12850 if (insn)
12852 add_reg_note (insn, REG_CFA_RESTORE, reg);
12853 RTX_FRAME_RELATED_P (insn) = 1;
12855 else
12856 queued_cfa_restores
12857 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12860 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12862 static void
12863 ix86_add_queued_cfa_restore_notes (rtx insn)
12865 rtx last;
12866 if (!queued_cfa_restores)
12867 return;
12868 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12870 XEXP (last, 1) = REG_NOTES (insn);
12871 REG_NOTES (insn) = queued_cfa_restores;
12872 queued_cfa_restores = NULL_RTX;
12873 RTX_FRAME_RELATED_P (insn) = 1;
12876 /* Expand prologue or epilogue stack adjustment.
12877 The pattern exists to put a dependency on all ebp-based memory accesses.
12878 STYLE should be negative if instructions should be marked as frame related,
12879 zero if %r11 register is live and cannot be freely used and positive
12880 otherwise. */
12882 static void
12883 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12884 int style, bool set_cfa)
12886 struct machine_function *m = cfun->machine;
12887 rtx insn;
12888 bool add_frame_related_expr = false;
12890 if (Pmode == SImode)
12891 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12892 else if (x86_64_immediate_operand (offset, DImode))
12893 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12894 else
12896 rtx tmp;
12897 /* r11 is used by indirect sibcall return as well, set before the
12898 epilogue and used after the epilogue. */
12899 if (style)
12900 tmp = gen_rtx_REG (DImode, R11_REG);
12901 else
12903 gcc_assert (src != hard_frame_pointer_rtx
12904 && dest != hard_frame_pointer_rtx);
12905 tmp = hard_frame_pointer_rtx;
12907 insn = emit_insn (gen_rtx_SET (tmp, offset));
12908 if (style < 0)
12909 add_frame_related_expr = true;
12911 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12914 insn = emit_insn (insn);
12915 if (style >= 0)
12916 ix86_add_queued_cfa_restore_notes (insn);
12918 if (set_cfa)
12920 rtx r;
12922 gcc_assert (m->fs.cfa_reg == src);
12923 m->fs.cfa_offset += INTVAL (offset);
12924 m->fs.cfa_reg = dest;
12926 r = gen_rtx_PLUS (Pmode, src, offset);
12927 r = gen_rtx_SET (dest, r);
12928 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12929 RTX_FRAME_RELATED_P (insn) = 1;
12931 else if (style < 0)
12933 RTX_FRAME_RELATED_P (insn) = 1;
12934 if (add_frame_related_expr)
12936 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12937 r = gen_rtx_SET (dest, r);
12938 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12942 if (dest == stack_pointer_rtx)
12944 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12945 bool valid = m->fs.sp_valid;
12947 if (src == hard_frame_pointer_rtx)
12949 valid = m->fs.fp_valid;
12950 ooffset = m->fs.fp_offset;
12952 else if (src == crtl->drap_reg)
12954 valid = m->fs.drap_valid;
12955 ooffset = 0;
12957 else
12959 /* Else there are two possibilities: SP itself, which we set
12960 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
12961 taken care of by hand along the eh_return path. */
12962 gcc_checking_assert (src == stack_pointer_rtx
12963 || offset == const0_rtx);
12966 m->fs.sp_offset = ooffset - INTVAL (offset);
12967 m->fs.sp_valid = valid;
12971 /* Find an available register to be used as the dynamic realign argument
12972 pointer register. Such a register will be written in the prologue and
12973 used at the beginning of the body, so it must not be
12974 1. a parameter passing register.
12975 2. the GOT pointer.
12976 We reuse the static-chain register if it is available. Otherwise, we
12977 use DI for i386 and R13 for x86-64. We chose R13 since it has a
12978 shorter encoding.
12980 Return: the regno of the chosen register. */
12982 static unsigned int
12983 find_drap_reg (void)
12985 tree decl = cfun->decl;
12987 /* Always use callee-saved register if there are no caller-saved
12988 registers. */
12989 if (TARGET_64BIT)
12991 /* Use R13 for a nested function or a function that needs a static chain.
12992 Since a function with a tail call may use any caller-saved
12993 register in the epilogue, DRAP must not use a caller-saved
12994 register in such a case. */
12995 if (DECL_STATIC_CHAIN (decl)
12996 || cfun->machine->no_caller_saved_registers
12997 || crtl->tail_call_emit)
12998 return R13_REG;
13000 return R10_REG;
13002 else
13004 /* Use DI for a nested function or a function that needs a static chain.
13005 Since a function with a tail call may use any caller-saved
13006 register in the epilogue, DRAP must not use a caller-saved
13007 register in such a case. */
13008 if (DECL_STATIC_CHAIN (decl)
13009 || cfun->machine->no_caller_saved_registers
13010 || crtl->tail_call_emit)
13011 return DI_REG;
13013 /* Reuse static chain register if it isn't used for parameter
13014 passing. */
13015 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
13017 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
13018 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
13019 return CX_REG;
13021 return DI_REG;
13025 /* Handle a "force_align_arg_pointer" attribute. */
13027 static tree
13028 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
13029 tree, int, bool *no_add_attrs)
13031 if (TREE_CODE (*node) != FUNCTION_TYPE
13032 && TREE_CODE (*node) != METHOD_TYPE
13033 && TREE_CODE (*node) != FIELD_DECL
13034 && TREE_CODE (*node) != TYPE_DECL)
13036 warning (OPT_Wattributes, "%qE attribute only applies to functions",
13037 name);
13038 *no_add_attrs = true;
13041 return NULL_TREE;
13044 /* Return minimum incoming stack alignment. */
13046 static unsigned int
13047 ix86_minimum_incoming_stack_boundary (bool sibcall)
13049 unsigned int incoming_stack_boundary;
13051 /* The stack of an interrupt handler is always aligned to MIN_STACK_BOUNDARY. */
13053 if (cfun->machine->func_type != TYPE_NORMAL)
13054 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13055 /* Prefer the one specified at command line. */
13056 else if (ix86_user_incoming_stack_boundary)
13057 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
13058 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
13059 if -mstackrealign is used, this isn't a sibcall check, and the
13060 estimated stack alignment is 128 bits. */
13061 else if (!sibcall
13062 && ix86_force_align_arg_pointer
13063 && crtl->stack_alignment_estimated == 128)
13064 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13065 else
13066 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
13068 /* Incoming stack alignment can be changed on individual functions
13069 via force_align_arg_pointer attribute. We use the smallest
13070 incoming stack boundary. */
13071 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
13072 && lookup_attribute (ix86_force_align_arg_pointer_string,
13073 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
13074 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13076 /* The incoming stack frame has to be aligned at least at
13077 parm_stack_boundary. */
13078 if (incoming_stack_boundary < crtl->parm_stack_boundary)
13079 incoming_stack_boundary = crtl->parm_stack_boundary;
13081 /* The stack at the entrance of main is aligned by the runtime. We use the
13082 smallest incoming stack boundary. */
13083 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
13084 && DECL_NAME (current_function_decl)
13085 && MAIN_NAME_P (DECL_NAME (current_function_decl))
13086 && DECL_FILE_SCOPE_P (current_function_decl))
13087 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
13089 return incoming_stack_boundary;
13092 /* Update incoming stack boundary and estimated stack alignment. */
13094 static void
13095 ix86_update_stack_boundary (void)
13097 ix86_incoming_stack_boundary
13098 = ix86_minimum_incoming_stack_boundary (false);
13100 /* x86_64 varargs need 16-byte stack alignment for the register save
13101 area. */
13102 if (TARGET_64BIT
13103 && cfun->stdarg
13104 && crtl->stack_alignment_estimated < 128)
13105 crtl->stack_alignment_estimated = 128;
13107 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
13108 if (ix86_tls_descriptor_calls_expanded_in_cfun
13109 && crtl->preferred_stack_boundary < 128)
13110 crtl->preferred_stack_boundary = 128;
13113 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
13114 needed or an rtx for DRAP otherwise. */
13116 static rtx
13117 ix86_get_drap_rtx (void)
13119 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
13120 crtl->need_drap = true;
13122 if (stack_realign_drap)
13124 /* Assign DRAP to vDRAP and return vDRAP. */
13125 unsigned int regno = find_drap_reg ();
13126 rtx drap_vreg;
13127 rtx arg_ptr;
13128 rtx_insn *seq, *insn;
13130 arg_ptr = gen_rtx_REG (Pmode, regno);
13131 crtl->drap_reg = arg_ptr;
13133 start_sequence ();
13134 drap_vreg = copy_to_reg (arg_ptr);
13135 seq = get_insns ();
13136 end_sequence ();
13138 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
13139 if (!optimize)
13141 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
13142 RTX_FRAME_RELATED_P (insn) = 1;
13144 return drap_vreg;
13146 else
13147 return NULL;
13150 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
13152 static rtx
13153 ix86_internal_arg_pointer (void)
13155 return virtual_incoming_args_rtx;
13158 struct scratch_reg {
13159 rtx reg;
13160 bool saved;
13163 /* Return a short-lived scratch register for use on function entry.
13164 In 32-bit mode, it is valid only after the registers are saved
13165 in the prologue. This register must be released by means of
13166 release_scratch_register_on_entry once it is dead. */
13168 static void
13169 get_scratch_register_on_entry (struct scratch_reg *sr)
13171 int regno;
13173 sr->saved = false;
13175 if (TARGET_64BIT)
13177 /* We always use R11 in 64-bit mode. */
13178 regno = R11_REG;
13180 else
13182 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13183 bool fastcall_p
13184 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13185 bool thiscall_p
13186 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13187 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13188 int regparm = ix86_function_regparm (fntype, decl);
13189 int drap_regno
13190 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13192 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13193 for the static chain register. */
13194 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13195 && drap_regno != AX_REG)
13196 regno = AX_REG;
13197 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13198 for the static chain register. */
13199 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13200 regno = AX_REG;
13201 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13202 regno = DX_REG;
13203 /* ecx is the static chain register. */
13204 else if (regparm < 3 && !fastcall_p && !thiscall_p
13205 && !static_chain_p
13206 && drap_regno != CX_REG)
13207 regno = CX_REG;
13208 else if (ix86_save_reg (BX_REG, true))
13209 regno = BX_REG;
13210 /* esi is the static chain register. */
13211 else if (!(regparm == 3 && static_chain_p)
13212 && ix86_save_reg (SI_REG, true))
13213 regno = SI_REG;
13214 else if (ix86_save_reg (DI_REG, true))
13215 regno = DI_REG;
13216 else
13218 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13219 sr->saved = true;
13223 sr->reg = gen_rtx_REG (Pmode, regno);
13224 if (sr->saved)
13226 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13227 RTX_FRAME_RELATED_P (insn) = 1;
13231 /* Release a scratch register obtained from the preceding function. */
13233 static void
13234 release_scratch_register_on_entry (struct scratch_reg *sr)
13236 if (sr->saved)
13238 struct machine_function *m = cfun->machine;
13239 rtx x, insn = emit_insn (gen_pop (sr->reg));
13241 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13242 RTX_FRAME_RELATED_P (insn) = 1;
13243 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13244 x = gen_rtx_SET (stack_pointer_rtx, x);
13245 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13246 m->fs.sp_offset -= UNITS_PER_WORD;
13250 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
13252 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13254 static void
13255 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13257 /* We skip the probe for the first interval + a small dope of 4 words and
13258 probe that many bytes past the specified size to maintain a protection
13259 area at the bottom of the stack. */
13260 const int dope = 4 * UNITS_PER_WORD;
13261 rtx size_rtx = GEN_INT (size), last;
13263 /* See if we have a constant small number of probes to generate. If so,
13264 that's the easy case. The run-time loop is made up of 9 insns in the
13265 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13266 for n # of intervals. */
13267 if (size <= 4 * PROBE_INTERVAL)
13269 HOST_WIDE_INT i, adjust;
13270 bool first_probe = true;
13272 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13273 values of N from 1 until it exceeds SIZE. If only one probe is
13274 needed, this will not generate any code. Then adjust and probe
13275 to PROBE_INTERVAL + SIZE. */
13276 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13278 if (first_probe)
13280 adjust = 2 * PROBE_INTERVAL + dope;
13281 first_probe = false;
13283 else
13284 adjust = PROBE_INTERVAL;
13286 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13287 plus_constant (Pmode, stack_pointer_rtx,
13288 -adjust)));
13289 emit_stack_probe (stack_pointer_rtx);
13292 if (first_probe)
13293 adjust = size + PROBE_INTERVAL + dope;
13294 else
13295 adjust = size + PROBE_INTERVAL - i;
13297 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13298 plus_constant (Pmode, stack_pointer_rtx,
13299 -adjust)));
13300 emit_stack_probe (stack_pointer_rtx);
13302 /* Adjust back to account for the additional first interval. */
13303 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13304 plus_constant (Pmode, stack_pointer_rtx,
13305 PROBE_INTERVAL + dope)));
13308 /* Otherwise, do the same as above, but in a loop. Note that we must be
13309 extra careful with variables wrapping around because we might be at
13310 the very top (or the very bottom) of the address space and we have
13311 to be able to handle this case properly; in particular, we use an
13312 equality test for the loop condition. */
13313 else
13315 HOST_WIDE_INT rounded_size;
13316 struct scratch_reg sr;
13318 get_scratch_register_on_entry (&sr);
13321 /* Step 1: round SIZE to the previous multiple of the interval. */
13323 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13326 /* Step 2: compute initial and final value of the loop counter. */
13328 /* SP = SP_0 + PROBE_INTERVAL. */
13329 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13330 plus_constant (Pmode, stack_pointer_rtx,
13331 - (PROBE_INTERVAL + dope))));
13333 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13334 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13335 emit_insn (gen_rtx_SET (sr.reg,
13336 plus_constant (Pmode, stack_pointer_rtx,
13337 -rounded_size)));
13338 else
13340 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13341 emit_insn (gen_rtx_SET (sr.reg,
13342 gen_rtx_PLUS (Pmode, sr.reg,
13343 stack_pointer_rtx)));
13347 /* Step 3: the loop
13351 SP = SP + PROBE_INTERVAL
13352 probe at SP
13354 while (SP != LAST_ADDR)
13356 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13357 values of N from 1 until it is equal to ROUNDED_SIZE. */
13359 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13362 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13363 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13365 if (size != rounded_size)
13367 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13368 plus_constant (Pmode, stack_pointer_rtx,
13369 rounded_size - size)));
13370 emit_stack_probe (stack_pointer_rtx);
13373 /* Adjust back to account for the additional first interval. */
13374 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13375 plus_constant (Pmode, stack_pointer_rtx,
13376 PROBE_INTERVAL + dope)));
13378 release_scratch_register_on_entry (&sr);
13381 /* Even if the stack pointer isn't the CFA register, we need to correctly
13382 describe the adjustments made to it, in particular differentiate the
13383 frame-related ones from the frame-unrelated ones. */
13384 if (size > 0)
13386 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13387 XVECEXP (expr, 0, 0)
13388 = gen_rtx_SET (stack_pointer_rtx,
13389 plus_constant (Pmode, stack_pointer_rtx, -size));
13390 XVECEXP (expr, 0, 1)
13391 = gen_rtx_SET (stack_pointer_rtx,
13392 plus_constant (Pmode, stack_pointer_rtx,
13393 PROBE_INTERVAL + dope + size));
13394 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13395 RTX_FRAME_RELATED_P (last) = 1;
13397 cfun->machine->fs.sp_offset += size;
13400 /* Make sure nothing is scheduled before we are done. */
13401 emit_insn (gen_blockage ());
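/* Illustrative sketch (assumptions: STACK_CHECK_PROBE_INTERVAL_EXP has its
   default value of 12, so PROBE_INTERVAL is 4096; 64-bit, so dope = 32):
   for size = 10000 the unrolled path above emits a sequence roughly
   equivalent to

       sub  $8224, %rsp     ; 2*4096 + 32, first interval + dope
       or   $0, (%rsp)      ; probe
       sub  $4096, %rsp
       or   $0, (%rsp)
       sub  $1808, %rsp     ; 10000 + 4096 - 12288, remainder + last interval
       or   $0, (%rsp)
       add  $4128, %rsp     ; give back PROBE_INTERVAL + dope

   so the net adjustment is exactly -10000 bytes while every interval of the
   new frame has been touched.  */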
13404 /* Adjust the stack pointer up to REG while probing it. */
13406 const char *
13407 output_adjust_stack_and_probe (rtx reg)
13409 static int labelno = 0;
13410 char loop_lab[32];
13411 rtx xops[2];
13413 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13415 /* Loop. */
13416 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13418 /* SP = SP + PROBE_INTERVAL. */
13419 xops[0] = stack_pointer_rtx;
13420 xops[1] = GEN_INT (PROBE_INTERVAL);
13421 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13423 /* Probe at SP. */
13424 xops[1] = const0_rtx;
13425 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13427 /* Test if SP == LAST_ADDR. */
13428 xops[0] = stack_pointer_rtx;
13429 xops[1] = reg;
13430 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13432 /* Branch. */
13433 fputs ("\tjne\t", asm_out_file);
13434 assemble_name_raw (asm_out_file, loop_lab);
13435 fputc ('\n', asm_out_file);
13437 return "";
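/* For illustration only (an assumption about the final assembly, not text
   from the original source): in 64-bit mode, with a 4096-byte PROBE_INTERVAL
   and %r11 as the scratch register holding LAST_ADDR, the template above
   prints a loop along the lines of

       .LPSRL0:
           subq    $4096, %rsp
           orq     $0, (%rsp)
           cmpq    %r11, %rsp
           jne     .LPSRL0
   */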
13440 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13441 inclusive. These are offsets from the current stack pointer. */
13443 static void
13444 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
13446 /* See if we have a constant small number of probes to generate. If so,
13447 that's the easy case. The run-time loop is made up of 6 insns in the
13448 generic case while the compile-time loop is made up of n insns for n #
13449 of intervals. */
13450 if (size <= 6 * PROBE_INTERVAL)
13452 HOST_WIDE_INT i;
13454 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13455 it exceeds SIZE. If only one probe is needed, this will not
13456 generate any code. Then probe at FIRST + SIZE. */
13457 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13458 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13459 -(first + i)));
13461 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13462 -(first + size)));
13465 /* Otherwise, do the same as above, but in a loop. Note that we must be
13466 extra careful with variables wrapping around because we might be at
13467 the very top (or the very bottom) of the address space and we have
13468 to be able to handle this case properly; in particular, we use an
13469 equality test for the loop condition. */
13470 else
13472 HOST_WIDE_INT rounded_size, last;
13473 struct scratch_reg sr;
13475 get_scratch_register_on_entry (&sr);
13478 /* Step 1: round SIZE to the previous multiple of the interval. */
13480 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13483 /* Step 2: compute initial and final value of the loop counter. */
13485 /* TEST_OFFSET = FIRST. */
13486 emit_move_insn (sr.reg, GEN_INT (-first));
13488 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13489 last = first + rounded_size;
13492 /* Step 3: the loop
13496 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13497 probe at TEST_ADDR
13499 while (TEST_ADDR != LAST_ADDR)
13501 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13502 until it is equal to ROUNDED_SIZE. */
13504 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13507 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13508 that SIZE is equal to ROUNDED_SIZE. */
13510 if (size != rounded_size)
13511 emit_stack_probe (plus_constant (Pmode,
13512 gen_rtx_PLUS (Pmode,
13513 stack_pointer_rtx,
13514 sr.reg),
13515 rounded_size - size));
13517 release_scratch_register_on_entry (&sr);
13520 /* Make sure nothing is scheduled before we are done. */
13521 emit_insn (gen_blockage ());
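/* Illustrative sketch (assuming PROBE_INTERVAL is 4096): a call such as
   ix86_emit_probe_stack_range (8192, 10000) takes the unrolled path above
   and probes the words at -12288(%rsp), -16384(%rsp) and finally
   -18192(%rsp), i.e. at FIRST + N * PROBE_INTERVAL and then at
   FIRST + SIZE, without moving the stack pointer.  */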
13524 /* Probe a range of stack addresses from REG to END, inclusive. These are
13525 offsets from the current stack pointer. */
13527 const char *
13528 output_probe_stack_range (rtx reg, rtx end)
13530 static int labelno = 0;
13531 char loop_lab[32];
13532 rtx xops[3];
13534 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13536 /* Loop. */
13537 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13539 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13540 xops[0] = reg;
13541 xops[1] = GEN_INT (PROBE_INTERVAL);
13542 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13544 /* Probe at TEST_ADDR. */
13545 xops[0] = stack_pointer_rtx;
13546 xops[1] = reg;
13547 xops[2] = const0_rtx;
13548 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13550 /* Test if TEST_ADDR == LAST_ADDR. */
13551 xops[0] = reg;
13552 xops[1] = end;
13553 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13555 /* Branch. */
13556 fputs ("\tjne\t", asm_out_file);
13557 assemble_name_raw (asm_out_file, loop_lab);
13558 fputc ('\n', asm_out_file);
13560 return "";
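/* For illustration only (again an assumption, with a 4096-byte
   PROBE_INTERVAL and %r11 as the test register): the loop printed above
   looks roughly like

       .LPSRL1:
           subq    $4096, %r11
           orq     $0, (%rsp,%r11)
           cmpq    $-28672, %r11   ; -(FIRST + ROUNDED_SIZE) in this example
           jne     .LPSRL1

   where the test register walks downwards while %rsp itself is left
   unchanged.  */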
13563 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
13564 to be generated in the correct form. */
13565 static void
13566 ix86_finalize_stack_realign_flags (void)
13568 /* Check if stack realignment is really needed after reload, and
13569 store the result in cfun. */
13570 unsigned int incoming_stack_boundary
13571 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13572 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13573 unsigned int stack_realign
13574 = (incoming_stack_boundary
13575 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13576 ? crtl->max_used_stack_slot_alignment
13577 : crtl->stack_alignment_needed));
13579 if (crtl->stack_realign_finalized)
13581 /* After stack_realign_needed is finalized, we can no longer
13582 change it. */
13583 gcc_assert (crtl->stack_realign_needed == stack_realign);
13584 return;
13587 /* If the only reason for frame_pointer_needed is that we conservatively
13588 assumed stack realignment might be needed, but in the end nothing that
13589 needed the stack alignment had been spilled, clear frame_pointer_needed
13590 and say we don't need stack realignment. */
13591 if (stack_realign
13592 && frame_pointer_needed
13593 && crtl->is_leaf
13594 && flag_omit_frame_pointer
13595 && crtl->sp_is_unchanging
13596 && !ix86_current_function_calls_tls_descriptor
13597 && !crtl->accesses_prior_frames
13598 && !cfun->calls_alloca
13599 && !crtl->calls_eh_return
13600 /* See ira_setup_eliminable_regset for the rationale. */
13601 && !(STACK_CHECK_MOVING_SP
13602 && flag_stack_check
13603 && flag_exceptions
13604 && cfun->can_throw_non_call_exceptions)
13605 && !ix86_frame_pointer_required ()
13606 && get_frame_size () == 0
13607 && ix86_nsaved_sseregs () == 0
13608 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13610 HARD_REG_SET set_up_by_prologue, prologue_used;
13611 basic_block bb;
13613 CLEAR_HARD_REG_SET (prologue_used);
13614 CLEAR_HARD_REG_SET (set_up_by_prologue);
13615 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13616 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13617 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13618 HARD_FRAME_POINTER_REGNUM);
13619 FOR_EACH_BB_FN (bb, cfun)
13621 rtx_insn *insn;
13622 FOR_BB_INSNS (bb, insn)
13623 if (NONDEBUG_INSN_P (insn)
13624 && requires_stack_frame_p (insn, prologue_used,
13625 set_up_by_prologue))
13627 crtl->stack_realign_needed = stack_realign;
13628 crtl->stack_realign_finalized = true;
13629 return;
13633 /* If drap has been set, but it actually isn't live at the start
13634 of the function, there is no reason to set it up. */
13635 if (crtl->drap_reg)
13637 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13638 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
13640 crtl->drap_reg = NULL_RTX;
13641 crtl->need_drap = false;
13644 else
13645 cfun->machine->no_drap_save_restore = true;
13647 frame_pointer_needed = false;
13648 stack_realign = false;
13649 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13650 crtl->stack_alignment_needed = incoming_stack_boundary;
13651 crtl->stack_alignment_estimated = incoming_stack_boundary;
13652 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13653 crtl->preferred_stack_boundary = incoming_stack_boundary;
13654 df_finish_pass (true);
13655 df_scan_alloc (NULL);
13656 df_scan_blocks ();
13657 df_compute_regs_ever_live (true);
13658 df_analyze ();
13661 crtl->stack_realign_needed = stack_realign;
13662 crtl->stack_realign_finalized = true;
13665 /* Delete SET_GOT right after entry block if it is allocated to reg. */
13667 static void
13668 ix86_elim_entry_set_got (rtx reg)
13670 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13671 rtx_insn *c_insn = BB_HEAD (bb);
13672 if (!NONDEBUG_INSN_P (c_insn))
13673 c_insn = next_nonnote_nondebug_insn (c_insn);
13674 if (c_insn && NONJUMP_INSN_P (c_insn))
13676 rtx pat = PATTERN (c_insn);
13677 if (GET_CODE (pat) == PARALLEL)
13679 rtx vec = XVECEXP (pat, 0, 0);
13680 if (GET_CODE (vec) == SET
13681 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13682 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13683 delete_insn (c_insn);
13688 /* Expand the prologue into a bunch of separate insns. */
13690 void
13691 ix86_expand_prologue (void)
13693 struct machine_function *m = cfun->machine;
13694 rtx insn, t;
13695 struct ix86_frame frame;
13696 HOST_WIDE_INT allocate;
13697 bool int_registers_saved;
13698 bool sse_registers_saved;
13699 rtx static_chain = NULL_RTX;
13701 ix86_finalize_stack_realign_flags ();
13703 /* DRAP should not coexist with stack_realign_fp */
13704 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13706 memset (&m->fs, 0, sizeof (m->fs));
13708 /* Initialize CFA state for before the prologue. */
13709 m->fs.cfa_reg = stack_pointer_rtx;
13710 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13712 /* Track SP offset to the CFA. We continue tracking this after we've
13713 swapped the CFA register away from SP. In the case of re-alignment
13714 this is fudged; we're interested in offsets within the local frame. */
13715 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13716 m->fs.sp_valid = true;
13718 ix86_compute_frame_layout (&frame);
13720 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13722 /* We should have already generated an error for any use of
13723 ms_hook on a nested function. */
13724 gcc_checking_assert (!ix86_static_chain_on_stack);
13726 /* Check if profiling is active and we shall use the profiling-before-
13727 prologue variant. If so, sorry. */
13728 if (crtl->profile && flag_fentry != 0)
13729 sorry ("ms_hook_prologue attribute isn%'t compatible "
13730 "with -mfentry for 32-bit");
13732 /* In ix86_asm_output_function_label we emitted:
13733 8b ff movl.s %edi,%edi
13734 55 push %ebp
13735 8b ec movl.s %esp,%ebp
13737 This matches the hookable function prologue in Win32 API
13738 functions in Microsoft Windows XP Service Pack 2 and newer.
13739 Wine uses this to enable Windows apps to hook the Win32 API
13740 functions provided by Wine.
13742 What that means is that we've already set up the frame pointer. */
13744 if (frame_pointer_needed
13745 && !(crtl->drap_reg && crtl->stack_realign_needed))
13747 rtx push, mov;
13749 /* We've decided to use the frame pointer already set up.
13750 Describe this to the unwinder by pretending that both
13751 push and mov insns happen right here.
13753 Putting the unwind info here at the end of the ms_hook
13754 is done so that we can make absolutely certain we get
13755 the required byte sequence at the start of the function,
13756 rather than relying on an assembler that can produce
13757 the exact encoding required.
13759 However it does mean (in the unpatched case) that we have
13760 a 1 insn window where the asynchronous unwind info is
13761 incorrect. However, if we placed the unwind info at
13762 its correct location we would have incorrect unwind info
13763 in the patched case. Which is probably all moot since
13764 I don't expect Wine generates dwarf2 unwind info for the
13765 system libraries that use this feature. */
13767 insn = emit_insn (gen_blockage ());
13769 push = gen_push (hard_frame_pointer_rtx);
13770 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13771 stack_pointer_rtx);
13772 RTX_FRAME_RELATED_P (push) = 1;
13773 RTX_FRAME_RELATED_P (mov) = 1;
13775 RTX_FRAME_RELATED_P (insn) = 1;
13776 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13777 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13779 /* Note that gen_push incremented m->fs.cfa_offset, even
13780 though we didn't emit the push insn here. */
13781 m->fs.cfa_reg = hard_frame_pointer_rtx;
13782 m->fs.fp_offset = m->fs.cfa_offset;
13783 m->fs.fp_valid = true;
13785 else
13787 /* The frame pointer is not needed so pop %ebp again.
13788 This leaves us with a pristine state. */
13789 emit_insn (gen_pop (hard_frame_pointer_rtx));
13793 /* The first insn of a function that accepts its static chain on the
13794 stack is to push the register that would be filled in by a direct
13795 call. This insn will be skipped by the trampoline. */
13796 else if (ix86_static_chain_on_stack)
13798 static_chain = ix86_static_chain (cfun->decl, false);
13799 insn = emit_insn (gen_push (static_chain));
13800 emit_insn (gen_blockage ());
13802 /* We don't want to interpret this push insn as a register save,
13803 only as a stack adjustment. The real copy of the register as
13804 a save will be done later, if needed. */
13805 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13806 t = gen_rtx_SET (stack_pointer_rtx, t);
13807 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13808 RTX_FRAME_RELATED_P (insn) = 1;
13811 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13812 DRAP is needed and stack realignment is really needed after reload. */
13813 if (stack_realign_drap)
13815 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13817 /* Can't use DRAP in interrupt function. */
13818 if (cfun->machine->func_type != TYPE_NORMAL)
13819 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13820 "in interrupt service routine. This may be worked "
13821 "around by avoiding functions with aggregate return.");
13823 /* Only need to push parameter pointer reg if it is caller saved. */
13824 if (!call_used_regs[REGNO (crtl->drap_reg)])
13826 /* Push arg pointer reg */
13827 insn = emit_insn (gen_push (crtl->drap_reg));
13828 RTX_FRAME_RELATED_P (insn) = 1;
13831 /* Grab the argument pointer. */
13832 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13833 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13834 RTX_FRAME_RELATED_P (insn) = 1;
13835 m->fs.cfa_reg = crtl->drap_reg;
13836 m->fs.cfa_offset = 0;
13838 /* Align the stack. */
13839 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13840 stack_pointer_rtx,
13841 GEN_INT (-align_bytes)));
13842 RTX_FRAME_RELATED_P (insn) = 1;
13844 /* Replicate the return address on the stack so that return
13845 address can be reached via (argp - 1) slot. This is needed
13846 to implement macro RETURN_ADDR_RTX and intrinsic function
13847 expand_builtin_return_addr etc. */
13848 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13849 t = gen_frame_mem (word_mode, t);
13850 insn = emit_insn (gen_push (t));
13851 RTX_FRAME_RELATED_P (insn) = 1;
13853 /* For the purposes of frame and register save area addressing,
13854 we've started over with a new frame. */
13855 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13856 m->fs.realigned = true;
13858 if (static_chain)
13860 /* Replicate static chain on the stack so that static chain
13861 can be reached via (argp - 2) slot. This is needed for
13862 nested function with stack realignment. */
13863 insn = emit_insn (gen_push (static_chain));
13864 RTX_FRAME_RELATED_P (insn) = 1;
13868 int_registers_saved = (frame.nregs == 0);
13869 sse_registers_saved = (frame.nsseregs == 0);
13871 if (frame_pointer_needed && !m->fs.fp_valid)
13873 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13874 slower on all targets. Also sdb doesn't like it. */
13875 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13876 RTX_FRAME_RELATED_P (insn) = 1;
13878 /* Push registers now, before setting the frame pointer
13879 on SEH target. */
13880 if (!int_registers_saved
13881 && TARGET_SEH
13882 && !frame.save_regs_using_mov)
13884 ix86_emit_save_regs ();
13885 int_registers_saved = true;
13886 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13889 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13891 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13892 RTX_FRAME_RELATED_P (insn) = 1;
13894 if (m->fs.cfa_reg == stack_pointer_rtx)
13895 m->fs.cfa_reg = hard_frame_pointer_rtx;
13896 m->fs.fp_offset = m->fs.sp_offset;
13897 m->fs.fp_valid = true;
13901 if (!int_registers_saved)
13903 /* If saving registers via PUSH, do so now. */
13904 if (!frame.save_regs_using_mov)
13906 ix86_emit_save_regs ();
13907 int_registers_saved = true;
13908 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13911 /* When using red zone we may start register saving before allocating
13912 the stack frame saving one cycle of the prologue. However, avoid
13913 doing this if we have to probe the stack; at least on x86_64 the
13914 stack probe can turn into a call that clobbers a red zone location. */
13915 else if (ix86_using_red_zone ()
13916 && (! TARGET_STACK_PROBE
13917 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13919 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13920 int_registers_saved = true;
13924 if (stack_realign_fp)
13926 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13927 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13929 /* The computation of the size of the re-aligned stack frame means
13930 that we must allocate the size of the register save area before
13931 performing the actual alignment. Otherwise we cannot guarantee
13932 that there's enough storage above the realignment point. */
13933 if (m->fs.sp_offset != frame.sse_reg_save_offset)
13934 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13935 GEN_INT (m->fs.sp_offset
13936 - frame.sse_reg_save_offset),
13937 -1, false);
13939 /* Align the stack. */
13940 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13941 stack_pointer_rtx,
13942 GEN_INT (-align_bytes)));
13944 /* For the purposes of register save area addressing, the stack
13945 pointer is no longer valid. As for the value of sp_offset,
13946 see ix86_compute_frame_layout, which we need to match in order
13947 to pass verification of stack_pointer_offset at the end. */
13948 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13949 m->fs.sp_valid = false;
13952 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13954 if (flag_stack_usage_info)
13956 /* We start to count from ARG_POINTER. */
13957 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13959 /* If it was realigned, take into account the fake frame. */
13960 if (stack_realign_drap)
13962 if (ix86_static_chain_on_stack)
13963 stack_size += UNITS_PER_WORD;
13965 if (!call_used_regs[REGNO (crtl->drap_reg)])
13966 stack_size += UNITS_PER_WORD;
13968 /* This over-estimates by 1 minimal-stack-alignment-unit but
13969 mitigates that by counting in the new return address slot. */
13970 current_function_dynamic_stack_size
13971 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13974 current_function_static_stack_size = stack_size;
13977 /* On SEH target with very large frame size, allocate an area to save
13978 SSE registers (as the very large allocation won't be described). */
13979 if (TARGET_SEH
13980 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13981 && !sse_registers_saved)
13983 HOST_WIDE_INT sse_size =
13984 frame.sse_reg_save_offset - frame.reg_save_offset;
13986 gcc_assert (int_registers_saved);
13988 /* No need to do stack checking as the area will be immediately
13989 written. */
13990 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13991 GEN_INT (-sse_size), -1,
13992 m->fs.cfa_reg == stack_pointer_rtx);
13993 allocate -= sse_size;
13994 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13995 sse_registers_saved = true;
13998 /* The stack has already been decremented by the instruction calling us
13999 so probe if the size is non-negative to preserve the protection area. */
14000 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
14002 /* We expect the registers to be saved when probes are used. */
14003 gcc_assert (int_registers_saved);
14005 if (STACK_CHECK_MOVING_SP)
14007 if (!(crtl->is_leaf && !cfun->calls_alloca
14008 && allocate <= PROBE_INTERVAL))
14010 ix86_adjust_stack_and_probe (allocate);
14011 allocate = 0;
14014 else
14016 HOST_WIDE_INT size = allocate;
14018 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
14019 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
14021 if (TARGET_STACK_PROBE)
14023 if (crtl->is_leaf && !cfun->calls_alloca)
14025 if (size > PROBE_INTERVAL)
14026 ix86_emit_probe_stack_range (0, size);
14028 else
14029 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
14031 else
14033 if (crtl->is_leaf && !cfun->calls_alloca)
14035 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
14036 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
14037 size - STACK_CHECK_PROTECT);
14039 else
14040 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
14045 if (allocate == 0)
14047 else if (!ix86_target_stack_probe ()
14048 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
14050 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14051 GEN_INT (-allocate), -1,
14052 m->fs.cfa_reg == stack_pointer_rtx);
14054 else
14056 rtx eax = gen_rtx_REG (Pmode, AX_REG);
14057 rtx r10 = NULL;
14058 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
14059 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
14060 bool eax_live = ix86_eax_live_at_start_p ();
14061 bool r10_live = false;
14063 if (TARGET_64BIT)
14064 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
14066 if (eax_live)
14068 insn = emit_insn (gen_push (eax));
14069 allocate -= UNITS_PER_WORD;
14070 /* Note that SEH directives need to continue tracking the stack
14071 pointer even after the frame pointer has been set up. */
14072 if (sp_is_cfa_reg || TARGET_SEH)
14074 if (sp_is_cfa_reg)
14075 m->fs.cfa_offset += UNITS_PER_WORD;
14076 RTX_FRAME_RELATED_P (insn) = 1;
14077 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14078 gen_rtx_SET (stack_pointer_rtx,
14079 plus_constant (Pmode, stack_pointer_rtx,
14080 -UNITS_PER_WORD)));
14084 if (r10_live)
14086 r10 = gen_rtx_REG (Pmode, R10_REG);
14087 insn = emit_insn (gen_push (r10));
14088 allocate -= UNITS_PER_WORD;
14089 if (sp_is_cfa_reg || TARGET_SEH)
14091 if (sp_is_cfa_reg)
14092 m->fs.cfa_offset += UNITS_PER_WORD;
14093 RTX_FRAME_RELATED_P (insn) = 1;
14094 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14095 gen_rtx_SET (stack_pointer_rtx,
14096 plus_constant (Pmode, stack_pointer_rtx,
14097 -UNITS_PER_WORD)));
14101 emit_move_insn (eax, GEN_INT (allocate));
14102 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
14104 /* Use the fact that AX still contains ALLOCATE. */
14105 adjust_stack_insn = (Pmode == DImode
14106 ? gen_pro_epilogue_adjust_stack_di_sub
14107 : gen_pro_epilogue_adjust_stack_si_sub);
14109 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14110 stack_pointer_rtx, eax));
14112 if (sp_is_cfa_reg || TARGET_SEH)
14114 if (sp_is_cfa_reg)
14115 m->fs.cfa_offset += allocate;
14116 RTX_FRAME_RELATED_P (insn) = 1;
14117 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14118 gen_rtx_SET (stack_pointer_rtx,
14119 plus_constant (Pmode, stack_pointer_rtx,
14120 -allocate)));
14122 m->fs.sp_offset += allocate;
14124 /* Use stack_pointer_rtx for relative addressing so that code
14125 works for realigned stack, too. */
14126 if (r10_live && eax_live)
14128 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14129 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14130 gen_frame_mem (word_mode, t));
14131 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14132 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14133 gen_frame_mem (word_mode, t));
14135 else if (eax_live || r10_live)
14137 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14138 emit_move_insn (gen_rtx_REG (word_mode,
14139 (eax_live ? AX_REG : R10_REG)),
14140 gen_frame_mem (word_mode, t));
14143 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
14145 /* If we haven't already set up the frame pointer, do so now. */
14146 if (frame_pointer_needed && !m->fs.fp_valid)
14148 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14149 GEN_INT (frame.stack_pointer_offset
14150 - frame.hard_frame_pointer_offset));
14151 insn = emit_insn (insn);
14152 RTX_FRAME_RELATED_P (insn) = 1;
14153 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14155 if (m->fs.cfa_reg == stack_pointer_rtx)
14156 m->fs.cfa_reg = hard_frame_pointer_rtx;
14157 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14158 m->fs.fp_valid = true;
14161 if (!int_registers_saved)
14162 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14163 if (!sse_registers_saved)
14164 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14166 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
14167 in PROLOGUE. */
14168 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14170 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14171 insn = emit_insn (gen_set_got (pic));
14172 RTX_FRAME_RELATED_P (insn) = 1;
14173 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14174 emit_insn (gen_prologue_use (pic));
14175 /* Delete an already emitted SET_GOT if it exists and is allocated to
14176 REAL_PIC_OFFSET_TABLE_REGNUM. */
14177 ix86_elim_entry_set_got (pic);
14180 if (crtl->drap_reg && !crtl->stack_realign_needed)
14182 /* vDRAP is set up, but after reload it turns out stack realignment
14183 isn't necessary; here we emit the prologue to set up DRAP
14184 without the stack realignment adjustment. */
14185 t = choose_baseaddr (0);
14186 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14189 /* Prevent instructions from being scheduled into register save push
14190 sequence when access to the redzone area is done through frame pointer.
14191 The offset between the frame pointer and the stack pointer is calculated
14192 relative to the value of the stack pointer at the end of the function
14193 prologue, and moving instructions that access redzone area via frame
14194 pointer inside push sequence violates this assumption. */
14195 if (frame_pointer_needed && frame.red_zone_size)
14196 emit_insn (gen_memory_blockage ());
14198 /* SEH requires that the prologue end within 256 bytes of the start of
14199 the function. Prevent instruction schedules that would extend that.
14200 Further, prevent alloca modifications to the stack pointer from being
14201 combined with prologue modifications. */
14202 if (TARGET_SEH)
14203 emit_insn (gen_prologue_use (stack_pointer_rtx));
14206 /* Emit code to restore REG using a POP insn. */
14208 static void
14209 ix86_emit_restore_reg_using_pop (rtx reg)
14211 struct machine_function *m = cfun->machine;
14212 rtx_insn *insn = emit_insn (gen_pop (reg));
14214 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14215 m->fs.sp_offset -= UNITS_PER_WORD;
14217 if (m->fs.cfa_reg == crtl->drap_reg
14218 && REGNO (reg) == REGNO (crtl->drap_reg))
14220 /* Previously we'd represented the CFA as an expression
14221 like *(%ebp - 8). We've just popped that value from
14222 the stack, which means we need to reset the CFA to
14223 the drap register. This will remain until we restore
14224 the stack pointer. */
14225 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14226 RTX_FRAME_RELATED_P (insn) = 1;
14228 /* This means that the DRAP register is valid for addressing too. */
14229 m->fs.drap_valid = true;
14230 return;
14233 if (m->fs.cfa_reg == stack_pointer_rtx)
14235 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14236 x = gen_rtx_SET (stack_pointer_rtx, x);
14237 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14238 RTX_FRAME_RELATED_P (insn) = 1;
14240 m->fs.cfa_offset -= UNITS_PER_WORD;
14243 /* When the frame pointer is the CFA, and we pop it, we are
14244 swapping back to the stack pointer as the CFA. This happens
14245 for stack frames that don't allocate other data, so we assume
14246 the stack pointer is now pointing at the return address, i.e.
14247 the function entry state, which makes the offset be 1 word. */
14248 if (reg == hard_frame_pointer_rtx)
14250 m->fs.fp_valid = false;
14251 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14253 m->fs.cfa_reg = stack_pointer_rtx;
14254 m->fs.cfa_offset -= UNITS_PER_WORD;
14256 add_reg_note (insn, REG_CFA_DEF_CFA,
14257 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14258 GEN_INT (m->fs.cfa_offset)));
14259 RTX_FRAME_RELATED_P (insn) = 1;
14264 /* Emit code to restore saved registers using POP insns. */
14266 static void
14267 ix86_emit_restore_regs_using_pop (void)
14269 unsigned int regno;
14271 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14272 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false))
14273 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14276 /* Emit code and notes for the LEAVE instruction. */
14278 static void
14279 ix86_emit_leave (void)
14281 struct machine_function *m = cfun->machine;
14282 rtx_insn *insn = emit_insn (ix86_gen_leave ());
14284 ix86_add_queued_cfa_restore_notes (insn);
14286 gcc_assert (m->fs.fp_valid);
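  /* The leave insn is equivalent to "mov %hard_fp, %sp; pop %hard_fp", so
     afterwards the stack pointer is valid again, one word below where the
     frame pointer used to point, and the frame pointer itself no longer is.  */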
14287 m->fs.sp_valid = true;
14288 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14289 m->fs.fp_valid = false;
14291 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14293 m->fs.cfa_reg = stack_pointer_rtx;
14294 m->fs.cfa_offset = m->fs.sp_offset;
14296 add_reg_note (insn, REG_CFA_DEF_CFA,
14297 plus_constant (Pmode, stack_pointer_rtx,
14298 m->fs.sp_offset));
14299 RTX_FRAME_RELATED_P (insn) = 1;
14301 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14302 m->fs.fp_offset);
14305 /* Emit code to restore saved registers using MOV insns.
14306 First register is restored from CFA - CFA_OFFSET. */
14307 static void
14308 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14309 bool maybe_eh_return)
14311 struct machine_function *m = cfun->machine;
14312 unsigned int regno;
14314 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14315 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14317 rtx reg = gen_rtx_REG (word_mode, regno);
14318 rtx mem;
14319 rtx_insn *insn;
14321 mem = choose_baseaddr (cfa_offset);
14322 mem = gen_frame_mem (word_mode, mem);
14323 insn = emit_move_insn (reg, mem);
14325 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14327 /* Previously we'd represented the CFA as an expression
14328 like *(%ebp - 8). We've just loaded that value from
14329 the stack, which means we need to reset the CFA to
14330 the drap register. This will remain until we restore
14331 the stack pointer. */
14332 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14333 RTX_FRAME_RELATED_P (insn) = 1;
14335 /* This means that the DRAP register is valid for addressing. */
14336 m->fs.drap_valid = true;
14338 else
14339 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14341 cfa_offset -= UNITS_PER_WORD;
14345 /* Emit code to restore saved SSE registers using MOV insns.
14346 The first register is restored from CFA - CFA_OFFSET. */
14347 static void
14348 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14349 bool maybe_eh_return)
14351 unsigned int regno;
14353 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14354 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14356 rtx reg = gen_rtx_REG (V4SFmode, regno);
14357 rtx mem;
14358 unsigned int align;
14360 mem = choose_baseaddr (cfa_offset);
14361 mem = gen_rtx_MEM (V4SFmode, mem);
14363 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
14364 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), INCOMING_STACK_BOUNDARY);
14365 set_mem_align (mem, align);
14366 emit_insn (gen_rtx_SET (reg, mem));
14368 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14370 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14374 /* Restore function stack, frame, and registers. */
14376 void
14377 ix86_expand_epilogue (int style)
14379 struct machine_function *m = cfun->machine;
14380 struct machine_frame_state frame_state_save = m->fs;
14381 struct ix86_frame frame;
14382 bool restore_regs_via_mov;
14383 bool using_drap;
14385 ix86_finalize_stack_realign_flags ();
14386 ix86_compute_frame_layout (&frame);
14388 m->fs.sp_valid = (!frame_pointer_needed
14389 || (crtl->sp_is_unchanging
14390 && !stack_realign_fp));
14391 gcc_assert (!m->fs.sp_valid
14392 || m->fs.sp_offset == frame.stack_pointer_offset);
14394 /* The FP must be valid if the frame pointer is present. */
14395 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14396 gcc_assert (!m->fs.fp_valid
14397 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14399 /* We must have *some* valid pointer to the stack frame. */
14400 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14402 /* The DRAP is never valid at this point. */
14403 gcc_assert (!m->fs.drap_valid);
14405 /* See the comment about red zone and frame
14406 pointer usage in ix86_expand_prologue. */
14407 if (frame_pointer_needed && frame.red_zone_size)
14408 emit_insn (gen_memory_blockage ());
14410 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14411 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14413 /* Determine the CFA offset of the end of the red-zone. */
14414 m->fs.red_zone_offset = 0;
14415 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14417 /* The red-zone begins below the return address. */
14418 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
14420 /* When the register save area is in the aligned portion of
14421 the stack, determine the maximum runtime displacement that
14422 matches up with the aligned frame. */
14423 if (stack_realign_drap)
14424 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14425 + UNITS_PER_WORD);
14428 /* Special care must be taken for the normal return case of a function
14429 using eh_return: the eax and edx registers are marked as saved, but
14430 not restored along this path. Adjust the save location to match. */
14431 if (crtl->calls_eh_return && style != 2)
14432 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
14434 /* EH_RETURN requires the use of moves to function properly. */
14435 if (crtl->calls_eh_return)
14436 restore_regs_via_mov = true;
14437 /* SEH requires the use of pops to identify the epilogue. */
14438 else if (TARGET_SEH)
14439 restore_regs_via_mov = false;
14440 /* If we're only restoring one register and sp is not valid, then
14441 use a move instruction to restore the register, since it's
14442 less work than reloading sp and popping the register. */
14443 else if (!m->fs.sp_valid && frame.nregs <= 1)
14444 restore_regs_via_mov = true;
14445 else if (TARGET_EPILOGUE_USING_MOVE
14446 && cfun->machine->use_fast_prologue_epilogue
14447 && (frame.nregs > 1
14448 || m->fs.sp_offset != frame.reg_save_offset))
14449 restore_regs_via_mov = true;
14450 else if (frame_pointer_needed
14451 && !frame.nregs
14452 && m->fs.sp_offset != frame.reg_save_offset)
14453 restore_regs_via_mov = true;
14454 else if (frame_pointer_needed
14455 && TARGET_USE_LEAVE
14456 && cfun->machine->use_fast_prologue_epilogue
14457 && frame.nregs == 1)
14458 restore_regs_via_mov = true;
14459 else
14460 restore_regs_via_mov = false;
14462 if (restore_regs_via_mov || frame.nsseregs)
14464 /* Ensure that the entire register save area is addressable via
14465 the stack pointer, if we will restore via sp. */
14466 if (TARGET_64BIT
14467 && m->fs.sp_offset > 0x7fffffff
14468 && !(m->fs.fp_valid || m->fs.drap_valid)
14469 && (frame.nsseregs + frame.nregs) != 0)
14471 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14472 GEN_INT (m->fs.sp_offset
14473 - frame.sse_reg_save_offset),
14474 style,
14475 m->fs.cfa_reg == stack_pointer_rtx);
14479 /* If there are any SSE registers to restore, then we have to do it
14480 via moves, since there's obviously no pop for SSE regs. */
14481 if (frame.nsseregs)
14482 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14483 style == 2);
14485 if (restore_regs_via_mov)
14487 rtx t;
14489 if (frame.nregs)
14490 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
14492 /* eh_return epilogues need %ecx added to the stack pointer. */
14493 if (style == 2)
14495 rtx sa = EH_RETURN_STACKADJ_RTX;
14496 rtx_insn *insn;
14498 /* %ecx can't be used for both DRAP register and eh_return. */
14499 if (crtl->drap_reg)
14500 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14502 /* regparm nested functions don't work with eh_return. */
14503 gcc_assert (!ix86_static_chain_on_stack);
14505 if (frame_pointer_needed)
14507 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14508 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14509 emit_insn (gen_rtx_SET (sa, t));
14511 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14512 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14514 /* Note that we use SA as a temporary CFA, as the return
14515 address is at the proper place relative to it. We
14516 pretend this happens at the FP restore insn because
14517 prior to this insn the FP would be stored at the wrong
14518 offset relative to SA, and after this insn we have no
14519 other reasonable register to use for the CFA. We don't
14520 bother resetting the CFA to the SP for the duration of
14521 the return insn. */
14522 add_reg_note (insn, REG_CFA_DEF_CFA,
14523 plus_constant (Pmode, sa, UNITS_PER_WORD));
14524 ix86_add_queued_cfa_restore_notes (insn);
14525 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14526 RTX_FRAME_RELATED_P (insn) = 1;
14528 m->fs.cfa_reg = sa;
14529 m->fs.cfa_offset = UNITS_PER_WORD;
14530 m->fs.fp_valid = false;
14532 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14533 const0_rtx, style, false);
14535 else
14537 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14538 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14539 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14540 ix86_add_queued_cfa_restore_notes (insn);
14542 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14543 if (m->fs.cfa_offset != UNITS_PER_WORD)
14545 m->fs.cfa_offset = UNITS_PER_WORD;
14546 add_reg_note (insn, REG_CFA_DEF_CFA,
14547 plus_constant (Pmode, stack_pointer_rtx,
14548 UNITS_PER_WORD));
14549 RTX_FRAME_RELATED_P (insn) = 1;
14552 m->fs.sp_offset = UNITS_PER_WORD;
14553 m->fs.sp_valid = true;
14556 else
14558 /* SEH requires that the function end with (1) a stack adjustment
14559 if necessary, (2) a sequence of pops, and (3) a return or
14560 jump instruction. Prevent insns from the function body from
14561 being scheduled into this sequence. */
14562 if (TARGET_SEH)
14564 /* Prevent a catch region from being adjacent to the standard
14565 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda
14566 nor several other flags that would be interesting to test are
14567 set up yet. */
14568 if (flag_non_call_exceptions)
14569 emit_insn (gen_nops (const1_rtx));
14570 else
14571 emit_insn (gen_blockage ());
14574 /* The first step is to deallocate the stack frame so that we can
14575 pop the registers. Also do this on SEH targets for very large
14576 frames, as the emitted instructions aren't allowed by the ABI
14577 in epilogues. */
14578 if (!m->fs.sp_valid
14579 || (TARGET_SEH
14580 && (m->fs.sp_offset - frame.reg_save_offset
14581 >= SEH_MAX_FRAME_SIZE)))
14583 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14584 GEN_INT (m->fs.fp_offset
14585 - frame.reg_save_offset),
14586 style, false);
14588 else if (m->fs.sp_offset != frame.reg_save_offset)
14590 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14591 GEN_INT (m->fs.sp_offset
14592 - frame.reg_save_offset),
14593 style,
14594 m->fs.cfa_reg == stack_pointer_rtx);
14597 ix86_emit_restore_regs_using_pop ();
14600 /* If we used a frame pointer and haven't already got rid of it,
14601 then do so now. */
14602 if (m->fs.fp_valid)
14604 /* If the stack pointer is valid and pointing at the frame
14605 pointer store address, then we only need a pop. */
14606 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
14607 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14608 /* Leave results in shorter dependency chains on CPUs that are
14609 able to grok it fast. */
14610 else if (TARGET_USE_LEAVE
14611 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14612 || !cfun->machine->use_fast_prologue_epilogue)
14613 ix86_emit_leave ();
14614 else
14616 pro_epilogue_adjust_stack (stack_pointer_rtx,
14617 hard_frame_pointer_rtx,
14618 const0_rtx, style, !using_drap);
14619 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14623 if (using_drap)
14625 int param_ptr_offset = UNITS_PER_WORD;
14626 rtx_insn *insn;
14628 gcc_assert (stack_realign_drap);
14630 if (ix86_static_chain_on_stack)
14631 param_ptr_offset += UNITS_PER_WORD;
14632 if (!call_used_regs[REGNO (crtl->drap_reg)])
14633 param_ptr_offset += UNITS_PER_WORD;
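      /* Recover the stack pointer from the DRAP register.  PARAM_PTR_OFFSET
	 accounts for the return address plus, where applicable, the pushed
	 static chain and the saved DRAP register.  */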
14635 insn = emit_insn (gen_rtx_SET
14636 (stack_pointer_rtx,
14637 gen_rtx_PLUS (Pmode,
14638 crtl->drap_reg,
14639 GEN_INT (-param_ptr_offset))));
14640 m->fs.cfa_reg = stack_pointer_rtx;
14641 m->fs.cfa_offset = param_ptr_offset;
14642 m->fs.sp_offset = param_ptr_offset;
14643 m->fs.realigned = false;
14645 add_reg_note (insn, REG_CFA_DEF_CFA,
14646 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14647 GEN_INT (param_ptr_offset)));
14648 RTX_FRAME_RELATED_P (insn) = 1;
14650 if (!call_used_regs[REGNO (crtl->drap_reg)])
14651 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14654 /* At this point the stack pointer must be valid, and we must have
14655 restored all of the registers. We may not have deallocated the
14656 entire stack frame. We've delayed this until now because it may
14657 be possible to merge the local stack deallocation with the
14658 deallocation forced by ix86_static_chain_on_stack. */
14659 gcc_assert (m->fs.sp_valid);
14660 gcc_assert (!m->fs.fp_valid);
14661 gcc_assert (!m->fs.realigned);
14662 if (m->fs.sp_offset != UNITS_PER_WORD)
14664 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14665 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14666 style, true);
14668 else
14669 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14671 /* Sibcall epilogues don't want a return instruction. */
14672 if (style == 0)
14674 m->fs = frame_state_save;
14675 return;
14678 if (cfun->machine->func_type != TYPE_NORMAL)
14680 /* Return from an interrupt handler with the "IRET" instruction.
14681 In an exception handler, pop the 'ERROR_CODE' off the stack
14682 before the 'IRET' instruction. */
14683 if (cfun->machine->func_type == TYPE_EXCEPTION)
14685 rtx r = plus_constant (Pmode, stack_pointer_rtx,
14686 UNITS_PER_WORD);
14687 emit_insn (gen_rtx_SET (stack_pointer_rtx, r));
14689 emit_jump_insn (gen_interrupt_return ());
14691 else if (crtl->args.pops_args && crtl->args.size)
14693 rtx popc = GEN_INT (crtl->args.pops_args);
14695 /* i386 can only pop 64K bytes. If asked to pop more, pop return
14696 address, do explicit add, and jump indirectly to the caller. */
14698 if (crtl->args.pops_args >= 65536)
14700 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14701 rtx_insn *insn;
14703 /* There is no "pascal" calling convention in any 64bit ABI. */
14704 gcc_assert (!TARGET_64BIT);
14706 insn = emit_insn (gen_pop (ecx));
14707 m->fs.cfa_offset -= UNITS_PER_WORD;
14708 m->fs.sp_offset -= UNITS_PER_WORD;
14710 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14711 x = gen_rtx_SET (stack_pointer_rtx, x);
14712 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14713 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14714 RTX_FRAME_RELATED_P (insn) = 1;
14716 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14717 popc, -1, true);
14718 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14720 else
14721 emit_jump_insn (gen_simple_return_pop_internal (popc));
14723 else
14724 emit_jump_insn (gen_simple_return_internal ());
14726 /* Restore the state back to the state from the prologue,
14727 so that it's correct for the next epilogue. */
14728 m->fs = frame_state_save;
14731 /* Reset from the function's potential modifications. */
14733 static void
14734 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, HOST_WIDE_INT)
14736 if (pic_offset_table_rtx
14737 && !ix86_use_pseudo_pic_reg ())
14738 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14740 if (TARGET_MACHO)
14742 rtx_insn *insn = get_last_insn ();
14743 rtx_insn *deleted_debug_label = NULL;
14745 /* Mach-O doesn't support labels at the end of objects, so if
14746 it looks like we might want one, take special action.
14747 First, collect any sequence of deleted debug labels. */
14748 while (insn
14749 && NOTE_P (insn)
14750 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14752 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14753 notes only, instead set their CODE_LABEL_NUMBER to -1,
14754 otherwise there would be code generation differences
14755 between -g and -g0. */
14756 if (NOTE_P (insn) && NOTE_KIND (insn)
14757 == NOTE_INSN_DELETED_DEBUG_LABEL)
14758 deleted_debug_label = insn;
14759 insn = PREV_INSN (insn);
14762 /* If we have:
14763 label:
14764 barrier
14765 then this needs to be detected, so skip past the barrier. */
14767 if (insn && BARRIER_P (insn))
14768 insn = PREV_INSN (insn);
14770 /* Up to now we've only seen notes or barriers. */
14771 if (insn)
14773 if (LABEL_P (insn)
14774 || (NOTE_P (insn)
14775 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14776 /* Trailing label. */
14777 fputs ("\tnop\n", file);
14778 else if (cfun && ! cfun->is_thunk)
14780 /* See if we have a completely empty function body, skipping
14781 the special case of the picbase thunk emitted as asm. */
14782 while (insn && ! INSN_P (insn))
14783 insn = PREV_INSN (insn);
14784 /* If we don't find any insns, we've got an empty function body;
14785 i.e. completely empty - without a return or branch. This is
14786 taken as the case where a function body has been removed
14787 because it contains an inline __builtin_unreachable(). GCC
14788 declares that reaching __builtin_unreachable() means UB so
14789 we're not obliged to do anything special; however, we want
14790 non-zero-sized function bodies. To meet this, and help the
14791 user out, let's trap the case. */
14792 if (insn == NULL)
14793 fputs ("\tud2\n", file);
14796 else if (deleted_debug_label)
14797 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14798 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14799 CODE_LABEL_NUMBER (insn) = -1;
14803 /* Return a scratch register to use in the split stack prologue. The
14804 split stack prologue is used for -fsplit-stack. It is the first
14805 instructions in the function, even before the regular prologue.
14806 The scratch register can be any caller-saved register which is not
14807 used for parameters or for the static chain. */
14809 static unsigned int
14810 split_stack_prologue_scratch_regno (void)
14812 if (TARGET_64BIT)
14813 return R11_REG;
14814 else
14816 bool is_fastcall, is_thiscall;
14817 int regparm;
14819 is_fastcall = (lookup_attribute ("fastcall",
14820 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14821 != NULL);
14822 is_thiscall = (lookup_attribute ("thiscall",
14823 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14824 != NULL);
14825 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
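      /* Pick a caller-saved register that is not already used for argument
	 passing or for the static chain: fastcall passes arguments in
	 %ecx/%edx, thiscall passes 'this' in %ecx, and a 32-bit static
	 chain normally lives in %ecx, hence the fallbacks below.  */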
14827 if (is_fastcall)
14829 if (DECL_STATIC_CHAIN (cfun->decl))
14831 sorry ("-fsplit-stack does not support fastcall with "
14832 "nested function");
14833 return INVALID_REGNUM;
14835 return AX_REG;
14837 else if (is_thiscall)
14839 if (!DECL_STATIC_CHAIN (cfun->decl))
14840 return DX_REG;
14841 return AX_REG;
14843 else if (regparm < 3)
14845 if (!DECL_STATIC_CHAIN (cfun->decl))
14846 return CX_REG;
14847 else
14849 if (regparm >= 2)
14851 sorry ("-fsplit-stack does not support 2 register "
14852 "parameters for a nested function");
14853 return INVALID_REGNUM;
14855 return DX_REG;
14858 else
14860 /* FIXME: We could make this work by pushing a register
14861 around the addition and comparison. */
14862 sorry ("-fsplit-stack does not support 3 register parameters");
14863 return INVALID_REGNUM;
14868 /* A SYMBOL_REF for the function which allocates new stackspace for
14869 -fsplit-stack. */
14871 static GTY(()) rtx split_stack_fn;
14873 /* A SYMBOL_REF for the variant of __morestack used with the large
14874 model. */
14876 static GTY(()) rtx split_stack_fn_large;
14878 /* Handle -fsplit-stack. These are the first instructions in the
14879 function, even before the regular prologue. */
14881 void
14882 ix86_expand_split_stack_prologue (void)
14884 struct ix86_frame frame;
14885 HOST_WIDE_INT allocate;
14886 unsigned HOST_WIDE_INT args_size;
14887 rtx_code_label *label;
14888 rtx limit, current, allocate_rtx, call_insn, call_fusage;
14889 rtx scratch_reg = NULL_RTX;
14890 rtx_code_label *varargs_label = NULL;
14891 rtx fn;
14893 gcc_assert (flag_split_stack && reload_completed);
14895 ix86_finalize_stack_realign_flags ();
14896 ix86_compute_frame_layout (&frame);
14897 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14899 /* This is the label we will branch to if we have enough stack
14900 space. We expect the basic block reordering pass to reverse this
14901 branch if optimizing, so that we branch in the unlikely case. */
14902 label = gen_label_rtx ();
14904 /* We need to compare the stack pointer minus the frame size with
14905 the stack boundary in the TCB. The stack boundary always gives
14906 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14907 can compare directly. Otherwise we need to do an addition. */
14909 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
14910 UNSPEC_STACK_CHECK);
14911 limit = gen_rtx_CONST (Pmode, limit);
14912 limit = gen_rtx_MEM (Pmode, limit);
14913 if (allocate < SPLIT_STACK_AVAILABLE)
14914 current = stack_pointer_rtx;
14915 else
14917 unsigned int scratch_regno;
14918 rtx offset;
14920 /* We need a scratch register to hold the stack pointer minus
14921 the required frame size. Since this is the very start of the
14922 function, the scratch register can be any caller-saved
14923 register which is not used for parameters. */
14924 offset = GEN_INT (- allocate);
14925 scratch_regno = split_stack_prologue_scratch_regno ();
14926 if (scratch_regno == INVALID_REGNUM)
14927 return;
14928 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14929 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14931 /* We don't use ix86_gen_add3 in this case because it will
14932 want to split to lea, but when not optimizing the insn
14933 will not be split after this point. */
14934 emit_insn (gen_rtx_SET (scratch_reg,
14935 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14936 offset)));
14938 else
14940 emit_move_insn (scratch_reg, offset);
14941 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14942 stack_pointer_rtx));
14944 current = scratch_reg;
14947 ix86_expand_branch (GEU, current, limit, label);
14948 rtx_insn *jump_insn = get_last_insn ();
14949 JUMP_LABEL (jump_insn) = label;
14951 /* Mark the jump as very likely to be taken. */
14952 add_int_reg_note (jump_insn, REG_BR_PROB,
14953 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
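  /* (That is, a taken probability of 99%.)  */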
14955 if (split_stack_fn == NULL_RTX)
14957 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14958 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14960 fn = split_stack_fn;
14962 /* Get more stack space. We pass in the desired stack space and the
14963 size of the arguments to copy to the new stack. In 32-bit mode
14964 we push the parameters; __morestack will return on a new stack
14965 anyhow. In 64-bit mode we pass the parameters in r10 and
14966 r11. */
14967 allocate_rtx = GEN_INT (allocate);
14968 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
14969 call_fusage = NULL_RTX;
14970 rtx pop = NULL_RTX;
14971 if (TARGET_64BIT)
14973 rtx reg10, reg11;
14975 reg10 = gen_rtx_REG (Pmode, R10_REG);
14976 reg11 = gen_rtx_REG (Pmode, R11_REG);
14978 /* If this function uses a static chain, it will be in %r10.
14979 Preserve it across the call to __morestack. */
14980 if (DECL_STATIC_CHAIN (cfun->decl))
14982 rtx rax;
14984 rax = gen_rtx_REG (word_mode, AX_REG);
14985 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14986 use_reg (&call_fusage, rax);
14989 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14990 && !TARGET_PECOFF)
14992 HOST_WIDE_INT argval;
14994 gcc_assert (Pmode == DImode);
14995 /* When using the large model we need to load the address
14996 into a register, and we've run out of registers. So we
14997 switch to a different calling convention, and we call a
14998 different function: __morestack_large_model. We pass the
14999 argument size in the upper 32 bits of r10 and pass the
15000 frame size in the lower 32 bits. */
15001 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15002 gcc_assert ((args_size & 0xffffffff) == args_size);
15004 if (split_stack_fn_large == NULL_RTX)
15006 split_stack_fn_large =
15007 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15008 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15010 if (ix86_cmodel == CM_LARGE_PIC)
15012 rtx_code_label *label;
15013 rtx x;
15015 label = gen_label_rtx ();
15016 emit_label (label);
15017 LABEL_PRESERVE_P (label) = 1;
15018 emit_insn (gen_set_rip_rex64 (reg10, label));
15019 emit_insn (gen_set_got_offset_rex64 (reg11, label));
15020 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15021 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15022 UNSPEC_GOT);
15023 x = gen_rtx_CONST (Pmode, x);
15024 emit_move_insn (reg11, x);
15025 x = gen_rtx_PLUS (Pmode, reg10, reg11);
15026 x = gen_const_mem (Pmode, x);
15027 emit_move_insn (reg11, x);
15029 else
15030 emit_move_insn (reg11, split_stack_fn_large);
15032 fn = reg11;
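	  /* Pack both 32-bit values into one register.  The double shift by
	     16 presumably avoids an undefined shift count of 32 on hosts
	     where HOST_WIDE_INT is only 32 bits wide.  */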
15034 argval = ((args_size << 16) << 16) + allocate;
15035 emit_move_insn (reg10, GEN_INT (argval));
15037 else
15039 emit_move_insn (reg10, allocate_rtx);
15040 emit_move_insn (reg11, GEN_INT (args_size));
15041 use_reg (&call_fusage, reg11);
15044 use_reg (&call_fusage, reg10);
15046 else
15048 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15049 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15050 insn = emit_insn (gen_push (allocate_rtx));
15051 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15052 pop = GEN_INT (2 * UNITS_PER_WORD);
15054 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15055 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15056 pop, false);
15057 add_function_usage_to (call_insn, call_fusage);
15058 if (!TARGET_64BIT)
15059 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15061 /* In order to make call/return prediction work right, we now need
15062 to execute a return instruction. See
15063 libgcc/config/i386/morestack.S for the details on how this works.
15065 For flow purposes gcc must not see this as a return
15066 instruction--we need control flow to continue at the subsequent
15067 label. Therefore, we use an unspec. */
15068 gcc_assert (crtl->args.pops_args < 65536);
15069 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15071 /* If we are in 64-bit mode and this function uses a static chain,
15072 we saved %r10 in %rax before calling __morestack. */
15073 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15074 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15075 gen_rtx_REG (word_mode, AX_REG));
15077 /* If this function calls va_start, we need to store a pointer to
15078 the arguments on the old stack, because they may not have been
15079 all copied to the new stack. At this point the old stack can be
15080 found at the frame pointer value used by __morestack, because
15081 __morestack has set that up before calling back to us. Here we
15082 store that pointer in a scratch register, and in
15083 ix86_expand_prologue we store the scratch register in a stack
15084 slot. */
15085 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15087 unsigned int scratch_regno;
15088 rtx frame_reg;
15089 int words;
15091 scratch_regno = split_stack_prologue_scratch_regno ();
15092 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15093 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15095 /* 64-bit:
15096 fp -> old fp value
15097 return address within this function
15098 return address of caller of this function
15099 stack arguments
15100 So we add three words to get to the stack arguments.
15102 32-bit:
15103 fp -> old fp value
15104 return address within this function
15105 first argument to __morestack
15106 second argument to __morestack
15107 return address of caller of this function
15108 stack arguments
15109 So we add five words to get to the stack arguments.
15111 words = TARGET_64BIT ? 3 : 5;
15112 emit_insn (gen_rtx_SET (scratch_reg,
15113 gen_rtx_PLUS (Pmode, frame_reg,
15114 GEN_INT (words * UNITS_PER_WORD))));
15116 varargs_label = gen_label_rtx ();
15117 emit_jump_insn (gen_jump (varargs_label));
15118 JUMP_LABEL (get_last_insn ()) = varargs_label;
15120 emit_barrier ();
15123 emit_label (label);
15124 LABEL_NUSES (label) = 1;
15126 /* If this function calls va_start, we now have to set the scratch
15127 register for the case where we do not call __morestack. In this
15128 case we need to set it based on the stack pointer. */
15129 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15131 emit_insn (gen_rtx_SET (scratch_reg,
15132 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15133 GEN_INT (UNITS_PER_WORD))));
15135 emit_label (varargs_label);
15136 LABEL_NUSES (varargs_label) = 1;
15140 /* We may have to tell the dataflow pass that the split stack prologue
15141 is initializing a scratch register. */
15143 static void
15144 ix86_live_on_entry (bitmap regs)
15146 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15148 gcc_assert (flag_split_stack);
15149 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
15153 /* Extract the parts of an RTL expression that is a valid memory address
15154 for an instruction. Return 0 if the structure of the address is
15155 grossly off. Return -1 if the address contains ASHIFT, so it is not
15156 strictly valid, but is still used for computing the length of an lea instruction. */
15159 ix86_decompose_address (rtx addr, struct ix86_address *out)
15161 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
15162 rtx base_reg, index_reg;
15163 HOST_WIDE_INT scale = 1;
15164 rtx scale_rtx = NULL_RTX;
15165 rtx tmp;
15166 int retval = 1;
15167 addr_space_t seg = ADDR_SPACE_GENERIC;
15169 /* Allow zero-extended SImode addresses,
15170 they will be emitted with addr32 prefix. */
15171 if (TARGET_64BIT && GET_MODE (addr) == DImode)
15173 if (GET_CODE (addr) == ZERO_EXTEND
15174 && GET_MODE (XEXP (addr, 0)) == SImode)
15176 addr = XEXP (addr, 0);
15177 if (CONST_INT_P (addr))
15178 return 0;
15180 else if (GET_CODE (addr) == AND
15181 && const_32bit_mask (XEXP (addr, 1), DImode))
15183 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
15184 if (addr == NULL_RTX)
15185 return 0;
15187 if (CONST_INT_P (addr))
15188 return 0;
15192 /* Allow SImode subregs of DImode addresses,
15193 they will be emitted with addr32 prefix. */
15194 if (TARGET_64BIT && GET_MODE (addr) == SImode)
15196 if (SUBREG_P (addr)
15197 && GET_MODE (SUBREG_REG (addr)) == DImode)
15199 addr = SUBREG_REG (addr);
15200 if (CONST_INT_P (addr))
15201 return 0;
15205 if (REG_P (addr))
15206 base = addr;
15207 else if (SUBREG_P (addr))
15209 if (REG_P (SUBREG_REG (addr)))
15210 base = addr;
15211 else
15212 return 0;
15214 else if (GET_CODE (addr) == PLUS)
15216 rtx addends[4], op;
15217 int n = 0, i;
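      /* Walk the chain of nested PLUS expressions from the outside in,
	 collecting at most four addends.  */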
15219 op = addr;
15222 if (n >= 4)
15223 return 0;
15224 addends[n++] = XEXP (op, 1);
15225 op = XEXP (op, 0);
15227 while (GET_CODE (op) == PLUS);
15228 if (n >= 4)
15229 return 0;
15230 addends[n] = op;
15232 for (i = n; i >= 0; --i)
15234 op = addends[i];
15235 switch (GET_CODE (op))
15237 case MULT:
15238 if (index)
15239 return 0;
15240 index = XEXP (op, 0);
15241 scale_rtx = XEXP (op, 1);
15242 break;
15244 case ASHIFT:
15245 if (index)
15246 return 0;
15247 index = XEXP (op, 0);
15248 tmp = XEXP (op, 1);
15249 if (!CONST_INT_P (tmp))
15250 return 0;
15251 scale = INTVAL (tmp);
15252 if ((unsigned HOST_WIDE_INT) scale > 3)
15253 return 0;
15254 scale = 1 << scale;
15255 break;
15257 case ZERO_EXTEND:
15258 op = XEXP (op, 0);
15259 if (GET_CODE (op) != UNSPEC)
15260 return 0;
15261 /* FALLTHRU */
15263 case UNSPEC:
15264 if (XINT (op, 1) == UNSPEC_TP
15265 && TARGET_TLS_DIRECT_SEG_REFS
15266 && seg == ADDR_SPACE_GENERIC)
15267 seg = DEFAULT_TLS_SEG_REG;
15268 else
15269 return 0;
15270 break;
15272 case SUBREG:
15273 if (!REG_P (SUBREG_REG (op)))
15274 return 0;
15275 /* FALLTHRU */
15277 case REG:
15278 if (!base)
15279 base = op;
15280 else if (!index)
15281 index = op;
15282 else
15283 return 0;
15284 break;
15286 case CONST:
15287 case CONST_INT:
15288 case SYMBOL_REF:
15289 case LABEL_REF:
15290 if (disp)
15291 return 0;
15292 disp = op;
15293 break;
15295 default:
15296 return 0;
15300 else if (GET_CODE (addr) == MULT)
15302 index = XEXP (addr, 0); /* index*scale */
15303 scale_rtx = XEXP (addr, 1);
15305 else if (GET_CODE (addr) == ASHIFT)
15307 /* We're called for lea too, which implements ashift on occasion. */
15308 index = XEXP (addr, 0);
15309 tmp = XEXP (addr, 1);
15310 if (!CONST_INT_P (tmp))
15311 return 0;
15312 scale = INTVAL (tmp);
15313 if ((unsigned HOST_WIDE_INT) scale > 3)
15314 return 0;
15315 scale = 1 << scale;
15316 retval = -1;
15318 else
15319 disp = addr; /* displacement */
15321 if (index)
15323 if (REG_P (index))
15325 else if (SUBREG_P (index)
15326 && REG_P (SUBREG_REG (index)))
15328 else
15329 return 0;
15332 /* Extract the integral value of scale. */
15333 if (scale_rtx)
15335 if (!CONST_INT_P (scale_rtx))
15336 return 0;
15337 scale = INTVAL (scale_rtx);
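  /* Strip any SUBREGs so the special-case checks below can look at the
     underlying registers.  */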
15340 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15341 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15343 /* Avoid useless 0 displacement. */
15344 if (disp == const0_rtx && (base || index))
15345 disp = NULL_RTX;
15347 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
15348 if (base_reg && index_reg && scale == 1
15349 && (index_reg == arg_pointer_rtx
15350 || index_reg == frame_pointer_rtx
15351 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
15353 std::swap (base, index);
15354 std::swap (base_reg, index_reg);
15357 /* Special case: %ebp cannot be encoded as a base without a displacement.
15358 Similarly %r13. */
15359 if (!disp
15360 && base_reg
15361 && (base_reg == hard_frame_pointer_rtx
15362 || base_reg == frame_pointer_rtx
15363 || base_reg == arg_pointer_rtx
15364 || (REG_P (base_reg)
15365 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
15366 || REGNO (base_reg) == R13_REG))))
15367 disp = const0_rtx;
15369 /* Special case: on K6, [%esi] makes the instruction vector-decoded.
15370 Avoid this by transforming to [%esi+0].
15371 Reload calls address legitimization without cfun defined, so we need
15372 to test cfun for being non-NULL. */
15373 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15374 && base_reg && !index_reg && !disp
15375 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
15376 disp = const0_rtx;
15378 /* Special case: encode reg+reg instead of reg*2. */
15379 if (!base && index && scale == 2)
15380 base = index, base_reg = index_reg, scale = 1;
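  /* (A scaled index without a base would still need a 32-bit displacement,
     so base+index gives the shorter encoding.)  */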
15382 /* Special case: scaling cannot be encoded without base or displacement. */
15383 if (!base && !disp && index && scale != 1)
15384 disp = const0_rtx;
15386 out->base = base;
15387 out->index = index;
15388 out->disp = disp;
15389 out->scale = scale;
15390 out->seg = seg;
15392 return retval;
15395 /* Return the cost of the memory address X.
15396 For i386, it is better to use a complex address than let gcc copy
15397 the address into a reg and make a new pseudo. But not if the address
15398 requires two regs - that would mean more pseudos with longer
15399 lifetimes. */
15400 static int
15401 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15403 struct ix86_address parts;
15404 int cost = 1;
15405 int ok = ix86_decompose_address (x, &parts);
15407 gcc_assert (ok);
15409 if (parts.base && SUBREG_P (parts.base))
15410 parts.base = SUBREG_REG (parts.base);
15411 if (parts.index && SUBREG_P (parts.index))
15412 parts.index = SUBREG_REG (parts.index);
15414 /* Attempt to minimize the number of registers in the address by increasing
15415 the address cost for each used register. We don't increase the address
15416 cost for "pic_offset_table_rtx". When a memory operand with "pic_offset_table_rtx"
15417 is not invariant itself, it most likely means that the base or index is not
15418 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15419 which is not profitable for x86. */
15420 if (parts.base
15421 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15422 && (current_pass->type == GIMPLE_PASS
15423 || !pic_offset_table_rtx
15424 || !REG_P (parts.base)
15425 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15426 cost++;
15428 if (parts.index
15429 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15430 && (current_pass->type == GIMPLE_PASS
15431 || !pic_offset_table_rtx
15432 || !REG_P (parts.index)
15433 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15434 cost++;
15436 /* The AMD K6 doesn't like addresses with the ModR/M byte set to 00_xxx_100b,
15437 since its predecode logic can't detect the length of such instructions
15438 and they degenerate to vector decoded. Increase the cost of such
15439 addresses here. The penalty is at least 2 cycles. It may be worthwhile
15440 to split such addresses or even refuse them altogether.
15442 The following addressing modes are affected:
15443 [base+scale*index]
15444 [scale*index+disp]
15445 [base+index]
15447 The first and last case may be avoidable by explicitly coding the zero into
15448 the memory address, but I don't have an AMD-K6 machine handy to check this
15449 theory. */
15451 if (TARGET_K6
15452 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15453 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15454 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15455 cost += 10;
15457 return cost;
15460 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
15461 this is used to form addresses to local data when -fPIC is in
15462 use. */
15464 static bool
15465 darwin_local_data_pic (rtx disp)
15467 return (GET_CODE (disp) == UNSPEC
15468 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15471 /* True if operand X should be loaded from GOT. */
15473 bool
15474 ix86_force_load_from_GOT_p (rtx x)
15476 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15477 && !TARGET_PECOFF && !TARGET_MACHO
15478 && !flag_plt && !flag_pic
15479 && ix86_cmodel != CM_LARGE
15480 && GET_CODE (x) == SYMBOL_REF
15481 && SYMBOL_REF_FUNCTION_P (x)
15482 && !SYMBOL_REF_LOCAL_P (x));
15485 /* Determine if a given RTX is a valid constant. We already know this
15486 satisfies CONSTANT_P. */
15488 static bool
15489 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15491 /* Pointer bounds constants are not valid. */
15492 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15493 return false;
15495 switch (GET_CODE (x))
15497 case CONST:
15498 x = XEXP (x, 0);
15500 if (GET_CODE (x) == PLUS)
15502 if (!CONST_INT_P (XEXP (x, 1)))
15503 return false;
15504 x = XEXP (x, 0);
15507 if (TARGET_MACHO && darwin_local_data_pic (x))
15508 return true;
15510 /* Only some unspecs are valid as "constants". */
15511 if (GET_CODE (x) == UNSPEC)
15512 switch (XINT (x, 1))
15514 case UNSPEC_GOT:
15515 case UNSPEC_GOTOFF:
15516 case UNSPEC_PLTOFF:
15517 return TARGET_64BIT;
15518 case UNSPEC_TPOFF:
15519 case UNSPEC_NTPOFF:
15520 x = XVECEXP (x, 0, 0);
15521 return (GET_CODE (x) == SYMBOL_REF
15522 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15523 case UNSPEC_DTPOFF:
15524 x = XVECEXP (x, 0, 0);
15525 return (GET_CODE (x) == SYMBOL_REF
15526 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15527 default:
15528 return false;
15531 /* We must have drilled down to a symbol. */
15532 if (GET_CODE (x) == LABEL_REF)
15533 return true;
15534 if (GET_CODE (x) != SYMBOL_REF)
15535 return false;
15536 /* FALLTHRU */
15538 case SYMBOL_REF:
15539 /* TLS symbols are never valid. */
15540 if (SYMBOL_REF_TLS_MODEL (x))
15541 return false;
15543 /* DLLIMPORT symbols are never valid. */
15544 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15545 && SYMBOL_REF_DLLIMPORT_P (x))
15546 return false;
15548 #if TARGET_MACHO
15549 /* mdynamic-no-pic */
15550 if (MACHO_DYNAMIC_NO_PIC_P)
15551 return machopic_symbol_defined_p (x);
15552 #endif
15554 /* External function address should be loaded
15555 via the GOT slot to avoid PLT. */
15556 if (ix86_force_load_from_GOT_p (x))
15557 return false;
15559 break;
15561 CASE_CONST_SCALAR_INT:
15562 switch (mode)
15564 case TImode:
15565 if (TARGET_64BIT)
15566 return true;
15567 /* FALLTHRU */
15568 case OImode:
15569 case XImode:
15570 if (!standard_sse_constant_p (x, mode))
15571 return false;
15572 default:
15573 break;
15575 break;
15577 case CONST_VECTOR:
15578 if (!standard_sse_constant_p (x, mode))
15579 return false;
15581 default:
15582 break;
15585 /* Otherwise we handle everything else in the move patterns. */
15586 return true;
15589 /* Determine if it's legal to put X into the constant pool. This
15590 is not possible for the address of thread-local symbols, which
15591 is checked above. */
15593 static bool
15594 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15596 /* We can put any immediate constant in memory. */
15597 switch (GET_CODE (x))
15599 CASE_CONST_ANY:
15600 return false;
15602 default:
15603 break;
15606 return !ix86_legitimate_constant_p (mode, x);
15609 /* Return true if the symbol is marked as dllimport or as a stub variable,
15610 otherwise false. */
15612 static bool
15613 is_imported_p (rtx x)
15615 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15616 || GET_CODE (x) != SYMBOL_REF)
15617 return false;
15619 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15623 /* Nonzero if the constant value X is a legitimate general operand
15624 when generating PIC code. It is given that flag_pic is on and
15625 that X satisfies CONSTANT_P. */
15627 bool
15628 legitimate_pic_operand_p (rtx x)
15630 rtx inner;
15632 switch (GET_CODE (x))
15634 case CONST:
15635 inner = XEXP (x, 0);
15636 if (GET_CODE (inner) == PLUS
15637 && CONST_INT_P (XEXP (inner, 1)))
15638 inner = XEXP (inner, 0);
15640 /* Only some unspecs are valid as "constants". */
15641 if (GET_CODE (inner) == UNSPEC)
15642 switch (XINT (inner, 1))
15644 case UNSPEC_GOT:
15645 case UNSPEC_GOTOFF:
15646 case UNSPEC_PLTOFF:
15647 return TARGET_64BIT;
15648 case UNSPEC_TPOFF:
15649 x = XVECEXP (inner, 0, 0);
15650 return (GET_CODE (x) == SYMBOL_REF
15651 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15652 case UNSPEC_MACHOPIC_OFFSET:
15653 return legitimate_pic_address_disp_p (x);
15654 default:
15655 return false;
15657 /* FALLTHRU */
15659 case SYMBOL_REF:
15660 case LABEL_REF:
15661 return legitimate_pic_address_disp_p (x);
15663 default:
15664 return true;
15668 /* Determine if a given CONST RTX is a valid memory displacement
15669 in PIC mode. */
15671 bool
15672 legitimate_pic_address_disp_p (rtx disp)
15674 bool saw_plus;
15676 /* In 64bit mode we can allow direct addresses of symbols and labels
15677 when they are not dynamic symbols. */
15678 if (TARGET_64BIT)
15680 rtx op0 = disp, op1;
15682 switch (GET_CODE (disp))
15684 case LABEL_REF:
15685 return true;
15687 case CONST:
15688 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15689 break;
15690 op0 = XEXP (XEXP (disp, 0), 0);
15691 op1 = XEXP (XEXP (disp, 0), 1);
15692 if (!CONST_INT_P (op1)
15693 || INTVAL (op1) >= 16*1024*1024
15694 || INTVAL (op1) < -16*1024*1024)
15695 break;
15696 if (GET_CODE (op0) == LABEL_REF)
15697 return true;
15698 if (GET_CODE (op0) == CONST
15699 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15700 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15701 return true;
15702 if (GET_CODE (op0) == UNSPEC
15703 && XINT (op0, 1) == UNSPEC_PCREL)
15704 return true;
15705 if (GET_CODE (op0) != SYMBOL_REF)
15706 break;
15707 /* FALLTHRU */
15709 case SYMBOL_REF:
15710 /* TLS references should always be enclosed in UNSPEC.
15711 A dllimported symbol always needs to be resolved. */
15712 if (SYMBOL_REF_TLS_MODEL (op0)
15713 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15714 return false;
15716 if (TARGET_PECOFF)
15718 if (is_imported_p (op0))
15719 return true;
15721 if (SYMBOL_REF_FAR_ADDR_P (op0)
15722 || !SYMBOL_REF_LOCAL_P (op0))
15723 break;
15725 /* Function symbols need to be resolved only for
15726 the large model.
15727 For the small model we don't need to resolve anything
15728 here. */
15729 if ((ix86_cmodel != CM_LARGE_PIC
15730 && SYMBOL_REF_FUNCTION_P (op0))
15731 || ix86_cmodel == CM_SMALL_PIC)
15732 return true;
15733 /* Non-external symbols don't need to be resolved for
15734 the large and medium models. */
15735 if ((ix86_cmodel == CM_LARGE_PIC
15736 || ix86_cmodel == CM_MEDIUM_PIC)
15737 && !SYMBOL_REF_EXTERNAL_P (op0))
15738 return true;
15740 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15741 && (SYMBOL_REF_LOCAL_P (op0)
15742 || (HAVE_LD_PIE_COPYRELOC
15743 && flag_pie
15744 && !SYMBOL_REF_WEAK (op0)
15745 && !SYMBOL_REF_FUNCTION_P (op0)))
15746 && ix86_cmodel != CM_LARGE_PIC)
15747 return true;
15748 break;
15750 default:
15751 break;
15754 if (GET_CODE (disp) != CONST)
15755 return false;
15756 disp = XEXP (disp, 0);
15758 if (TARGET_64BIT)
15760 /* It is unsafe to allow PLUS expressions; this limits the allowed
15761 distance of GOT tables. We should not need these anyway. */
15762 if (GET_CODE (disp) != UNSPEC
15763 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15764 && XINT (disp, 1) != UNSPEC_GOTOFF
15765 && XINT (disp, 1) != UNSPEC_PCREL
15766 && XINT (disp, 1) != UNSPEC_PLTOFF))
15767 return false;
15769 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15770 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15771 return false;
15772 return true;
15775 saw_plus = false;
15776 if (GET_CODE (disp) == PLUS)
15778 if (!CONST_INT_P (XEXP (disp, 1)))
15779 return false;
15780 disp = XEXP (disp, 0);
15781 saw_plus = true;
15784 if (TARGET_MACHO && darwin_local_data_pic (disp))
15785 return true;
15787 if (GET_CODE (disp) != UNSPEC)
15788 return false;
15790 switch (XINT (disp, 1))
15792 case UNSPEC_GOT:
15793 if (saw_plus)
15794 return false;
15795 /* We need to check for both symbols and labels because VxWorks loads
15796 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15797 details. */
15798 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15799 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15800 case UNSPEC_GOTOFF:
15801 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15802 While the ABI also specifies a 32bit relocation, we don't produce
15803 it in the small PIC model at all. */
15804 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15805 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15806 && !TARGET_64BIT)
15807 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15808 return false;
15809 case UNSPEC_GOTTPOFF:
15810 case UNSPEC_GOTNTPOFF:
15811 case UNSPEC_INDNTPOFF:
15812 if (saw_plus)
15813 return false;
15814 disp = XVECEXP (disp, 0, 0);
15815 return (GET_CODE (disp) == SYMBOL_REF
15816 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15817 case UNSPEC_NTPOFF:
15818 disp = XVECEXP (disp, 0, 0);
15819 return (GET_CODE (disp) == SYMBOL_REF
15820 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15821 case UNSPEC_DTPOFF:
15822 disp = XVECEXP (disp, 0, 0);
15823 return (GET_CODE (disp) == SYMBOL_REF
15824 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15827 return false;
15830 /* Determine if OP is a suitable RTX for an address register.
15831 Return the naked register if a register or a register subreg is
15832 found, otherwise return NULL_RTX. */
15834 static rtx
15835 ix86_validate_address_register (rtx op)
15837 machine_mode mode = GET_MODE (op);
15839 /* Only SImode or DImode registers can form the address. */
15840 if (mode != SImode && mode != DImode)
15841 return NULL_RTX;
15843 if (REG_P (op))
15844 return op;
15845 else if (SUBREG_P (op))
15847 rtx reg = SUBREG_REG (op);
15849 if (!REG_P (reg))
15850 return NULL_RTX;
15852 mode = GET_MODE (reg);
15854 /* Don't allow SUBREGs that span more than a word. It can
15855 lead to spill failures when the register is one word out
15856 of a two word structure. */
15857 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15858 return NULL_RTX;
15860 /* Allow only SUBREGs of non-eliminable hard registers. */
15861 if (register_no_elim_operand (reg, mode))
15862 return reg;
15865 /* Op is not a register. */
15866 return NULL_RTX;
15869 /* Recognizes RTL expressions that are valid memory addresses for an
15870 instruction. The MODE argument is the machine mode for the MEM
15871 expression that wants to use this address.
15873 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15874 convert common non-canonical forms to canonical form so that they will
15875 be recognized. */
15877 static bool
15878 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15880 struct ix86_address parts;
15881 rtx base, index, disp;
15882 HOST_WIDE_INT scale;
15883 addr_space_t seg;
15885 if (ix86_decompose_address (addr, &parts) <= 0)
15886 /* Decomposition failed. */
15887 return false;
15889 base = parts.base;
15890 index = parts.index;
15891 disp = parts.disp;
15892 scale = parts.scale;
15893 seg = parts.seg;
15895 /* Validate base register. */
15896 if (base)
15898 rtx reg = ix86_validate_address_register (base);
15900 if (reg == NULL_RTX)
15901 return false;
15903 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15904 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15905 /* Base is not valid. */
15906 return false;
15909 /* Validate index register. */
15910 if (index)
15912 rtx reg = ix86_validate_address_register (index);
15914 if (reg == NULL_RTX)
15915 return false;
15917 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15918 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15919 /* Index is not valid. */
15920 return false;
15923 /* Index and base should have the same mode. */
15924 if (base && index
15925 && GET_MODE (base) != GET_MODE (index))
15926 return false;
15928 /* Address override works only on the (%reg) part of %fs:(%reg). */
15929 if (seg != ADDR_SPACE_GENERIC
15930 && ((base && GET_MODE (base) != word_mode)
15931 || (index && GET_MODE (index) != word_mode)))
15932 return false;
15934 /* Validate scale factor. */
15935 if (scale != 1)
15937 if (!index)
15938 /* Scale without index. */
15939 return false;
15941 if (scale != 2 && scale != 4 && scale != 8)
15942 /* Scale is not a valid multiplier. */
15943 return false;
15946 /* Validate displacement. */
15947 if (disp)
15949 if (GET_CODE (disp) == CONST
15950 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15951 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15952 switch (XINT (XEXP (disp, 0), 1))
15954 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15955 when used. While the ABI also specifies 32bit relocations, we
15956 don't produce them at all and use IP relative addressing instead.
15957 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15958 should be loaded via the GOT. */
15959 case UNSPEC_GOT:
15960 if (!TARGET_64BIT
15961 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15962 goto is_legitimate_pic;
15963 /* FALLTHRU */
15964 case UNSPEC_GOTOFF:
15965 gcc_assert (flag_pic);
15966 if (!TARGET_64BIT)
15967 goto is_legitimate_pic;
15969 /* 64bit address unspec. */
15970 return false;
15972 case UNSPEC_GOTPCREL:
15973 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15974 goto is_legitimate_pic;
15975 /* FALLTHRU */
15976 case UNSPEC_PCREL:
15977 gcc_assert (flag_pic);
15978 goto is_legitimate_pic;
15980 case UNSPEC_GOTTPOFF:
15981 case UNSPEC_GOTNTPOFF:
15982 case UNSPEC_INDNTPOFF:
15983 case UNSPEC_NTPOFF:
15984 case UNSPEC_DTPOFF:
15985 break;
15987 case UNSPEC_STACK_CHECK:
15988 gcc_assert (flag_split_stack);
15989 break;
15991 default:
15992 /* Invalid address unspec. */
15993 return false;
15996 else if (SYMBOLIC_CONST (disp)
15997 && (flag_pic
15998 || (TARGET_MACHO
15999 #if TARGET_MACHO
16000 && MACHOPIC_INDIRECT
16001 && !machopic_operand_p (disp)
16002 #endif
16006 is_legitimate_pic:
16007 if (TARGET_64BIT && (index || base))
16009 /* foo@dtpoff(%rX) is ok. */
16010 if (GET_CODE (disp) != CONST
16011 || GET_CODE (XEXP (disp, 0)) != PLUS
16012 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16013 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16014 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16015 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16016 /* Non-constant pic memory reference. */
16017 return false;
16019 else if ((!TARGET_MACHO || flag_pic)
16020 && ! legitimate_pic_address_disp_p (disp))
16021 /* Displacement is an invalid pic construct. */
16022 return false;
16023 #if TARGET_MACHO
16024 else if (MACHO_DYNAMIC_NO_PIC_P
16025 && !ix86_legitimate_constant_p (Pmode, disp))
16026 /* The displacement must be referenced via a non_lazy_pointer. */
16027 return false;
16028 #endif
16030 /* This code used to verify that a symbolic pic displacement
16031 includes the pic_offset_table_rtx register.
16033 While this is a good idea, unfortunately these constructs may
16034 be created by "adds using lea" optimization for incorrect
16035 code like:
16037 int a;
16038 int foo(int i)
16040 return *(&a+i);
16043 This code is nonsensical, but results in addressing the
16044 GOT table with a pic_offset_table_rtx base. We can't
16045 just refuse it easily, since it gets matched by the
16046 "addsi3" pattern, which later gets split to lea in case
16047 the output register differs from the input. While this
16048 could be handled by a separate addsi pattern for this case
16049 that never results in lea, disabling this test seems to be
16050 the easier and correct fix for the crash. */
16052 else if (GET_CODE (disp) != LABEL_REF
16053 && !CONST_INT_P (disp)
16054 && (GET_CODE (disp) != CONST
16055 || !ix86_legitimate_constant_p (Pmode, disp))
16056 && (GET_CODE (disp) != SYMBOL_REF
16057 || !ix86_legitimate_constant_p (Pmode, disp)))
16058 /* Displacement is not constant. */
16059 return false;
16060 else if (TARGET_64BIT
16061 && !x86_64_immediate_operand (disp, VOIDmode))
16062 /* Displacement is out of range. */
16063 return false;
16064 /* In x32 mode, constant addresses are sign extended to 64bit, so
16065 we have to reject addresses in the range 0x80000000 to 0xffffffff. */
16066 else if (TARGET_X32 && !(index || base)
16067 && CONST_INT_P (disp)
16068 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16069 return false;
16072 /* Everything looks valid. */
16073 return true;
16076 /* Determine if a given RTX is a valid constant address. */
16078 bool
16079 constant_address_p (rtx x)
16081 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16084 /* Return a unique alias set for the GOT. */
16086 static alias_set_type
16087 ix86_GOT_alias_set (void)
16089 static alias_set_type set = -1;
16090 if (set == -1)
16091 set = new_alias_set ();
16092 return set;
16095 /* Return a legitimate reference for ORIG (an address) using the
16096 register REG. If REG is 0, a new pseudo is generated.
16098 There are two types of references that must be handled:
16100 1. Global data references must load the address from the GOT, via
16101 the PIC reg. An insn is emitted to do this load, and the reg is
16102 returned.
16104 2. Static data references, constant pool addresses, and code labels
16105 compute the address as an offset from the GOT, whose base is in
16106 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16107 differentiate them from global data objects. The returned
16108 address is the PIC reg + an unspec constant.
16110 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16111 reg also appears in the address. */
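/* Illustrative note (an editorial sketch, assuming a typical ELF target;
   the exact forms are produced by the code below):

   With -fpic on 32-bit targets, a global symbol FOO is typically turned
   into a load from the GOT,
     (mem (plus pic_offset_table_rtx (const (unspec [FOO] UNSPEC_GOT))))
   i.e. FOO@GOT(%ebx), while a local symbol becomes an offset from the
   PIC base,
     (plus pic_offset_table_rtx (const (unspec [FOO] UNSPEC_GOTOFF)))
   i.e. the address FOO@GOTOFF(%ebx).  On x86-64 with the small PIC model
   the global case instead uses a RIP-relative GOT slot,
   FOO@GOTPCREL(%rip).  */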
16113 static rtx
16114 legitimize_pic_address (rtx orig, rtx reg)
16116 rtx addr = orig;
16117 rtx new_rtx = orig;
16119 #if TARGET_MACHO
16120 if (TARGET_MACHO && !TARGET_64BIT)
16122 if (reg == 0)
16123 reg = gen_reg_rtx (Pmode);
16124 /* Use the generic Mach-O PIC machinery. */
16125 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16127 #endif
16129 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16131 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16132 if (tmp)
16133 return tmp;
16136 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16137 new_rtx = addr;
16138 else if ((!TARGET_64BIT
16139 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16140 && !TARGET_PECOFF
16141 && gotoff_operand (addr, Pmode))
16143 /* This symbol may be referenced via a displacement
16144 from the PIC base address (@GOTOFF). */
16145 if (GET_CODE (addr) == CONST)
16146 addr = XEXP (addr, 0);
16148 if (GET_CODE (addr) == PLUS)
16150 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16151 UNSPEC_GOTOFF);
16152 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16154 else
16155 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
16157 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16159 if (TARGET_64BIT)
16160 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16162 if (reg != 0)
16164 gcc_assert (REG_P (reg));
16165 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
16166 new_rtx, reg, 1, OPTAB_DIRECT);
16168 else
16169 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16171 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
16172 /* We can't use @GOTOFF for text labels
16173 on VxWorks, see gotoff_operand. */
16174 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
16176 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16177 if (tmp)
16178 return tmp;
16180 /* For x64 PE-COFF there is no GOT table,
16181 so we use the address directly. */
16182 if (TARGET_64BIT && TARGET_PECOFF)
16184 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
16185 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16187 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
16189 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
16190 UNSPEC_GOTPCREL);
16191 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16192 new_rtx = gen_const_mem (Pmode, new_rtx);
16193 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16195 else
16197 /* This symbol must be referenced via a load
16198 from the Global Offset Table (@GOT). */
16199 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
16200 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16201 if (TARGET_64BIT)
16202 new_rtx = force_reg (Pmode, new_rtx);
16203 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16204 new_rtx = gen_const_mem (Pmode, new_rtx);
16205 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16208 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16210 else
16212 if (CONST_INT_P (addr)
16213 && !x86_64_immediate_operand (addr, VOIDmode))
16214 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16215 else if (GET_CODE (addr) == CONST)
16217 addr = XEXP (addr, 0);
16219 /* We must match stuff we generate before. Assume the only
16220 unspecs that can get here are ours. Not that we could do
16221 anything with them anyway.... */
16222 if (GET_CODE (addr) == UNSPEC
16223 || (GET_CODE (addr) == PLUS
16224 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16225 return orig;
16226 gcc_assert (GET_CODE (addr) == PLUS);
16229 if (GET_CODE (addr) == PLUS)
16231 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16233 /* Check first to see if this is a constant
16234 offset from a @GOTOFF symbol reference. */
16235 if (!TARGET_PECOFF
16236 && gotoff_operand (op0, Pmode)
16237 && CONST_INT_P (op1))
16239 if (!TARGET_64BIT)
16241 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16242 UNSPEC_GOTOFF);
16243 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16244 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16246 if (reg != 0)
16248 gcc_assert (REG_P (reg));
16249 new_rtx = expand_simple_binop (Pmode, PLUS,
16250 pic_offset_table_rtx,
16251 new_rtx, reg, 1,
16252 OPTAB_DIRECT);
16254 else
16255 new_rtx
16256 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16258 else
16260 if (INTVAL (op1) < -16*1024*1024
16261 || INTVAL (op1) >= 16*1024*1024)
16263 if (!x86_64_immediate_operand (op1, Pmode))
16264 op1 = force_reg (Pmode, op1);
16266 new_rtx
16267 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16271 else
16273 rtx base = legitimize_pic_address (op0, reg);
16274 machine_mode mode = GET_MODE (base);
16275 new_rtx
16276 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16278 if (CONST_INT_P (new_rtx))
16280 if (INTVAL (new_rtx) < -16*1024*1024
16281 || INTVAL (new_rtx) >= 16*1024*1024)
16283 if (!x86_64_immediate_operand (new_rtx, mode))
16284 new_rtx = force_reg (mode, new_rtx);
16286 new_rtx
16287 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16289 else
16290 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16292 else
16294 /* For %rip addressing, we have to use
16295 just disp32, not base nor index. */
16296 if (TARGET_64BIT
16297 && (GET_CODE (base) == SYMBOL_REF
16298 || GET_CODE (base) == LABEL_REF))
16299 base = force_reg (mode, base);
16300 if (GET_CODE (new_rtx) == PLUS
16301 && CONSTANT_P (XEXP (new_rtx, 1)))
16303 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16304 new_rtx = XEXP (new_rtx, 1);
16306 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16311 return new_rtx;
16314 /* Load the thread pointer. If TO_REG is true, force it into a register. */
16316 static rtx
16317 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16319 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16321 if (GET_MODE (tp) != tp_mode)
16323 gcc_assert (GET_MODE (tp) == SImode);
16324 gcc_assert (tp_mode == DImode);
16326 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16329 if (to_reg)
16330 tp = copy_to_mode_reg (tp_mode, tp);
16332 return tp;
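/* Illustrative note (editorial, assuming a typical GNU/Linux configuration):
   the UNSPEC_TP reference built above is later expanded to a read of the
   thread pointer through the TLS segment register, i.e. %gs on 32-bit
   targets and %fs on 64-bit targets.  */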
16335 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16337 static GTY(()) rtx ix86_tls_symbol;
16339 static rtx
16340 ix86_tls_get_addr (void)
16342 if (!ix86_tls_symbol)
16344 const char *sym
16345 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16346 ? "___tls_get_addr" : "__tls_get_addr");
16348 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16351 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16353 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16354 UNSPEC_PLTOFF);
16355 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16356 gen_rtx_CONST (Pmode, unspec));
16359 return ix86_tls_symbol;
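/* Illustrative note (editorial, assuming GNU libc conventions): the
   triple-underscore ___tls_get_addr entry point selected above for
   TARGET_ANY_GNU_TLS && !TARGET_64BIT is the 32-bit variant whose
   argument is passed in a register rather than on the stack; all other
   configurations call the standard __tls_get_addr.  */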
16362 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16364 static GTY(()) rtx ix86_tls_module_base_symbol;
16367 ix86_tls_module_base (void)
16369 if (!ix86_tls_module_base_symbol)
16371 ix86_tls_module_base_symbol
16372 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16374 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16375 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16378 return ix86_tls_module_base_symbol;
16381 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16382 false if we expect this to be used for a memory address and true if
16383 we expect to load the address into a register. */
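/* Illustrative note (an editorial sketch, assuming x86-64 with GNU TLS;
   the forms come from the cases handled below):

   local-exec:    the address is simply tp + x@tpoff,
                    (plus tp (const (unspec [x] UNSPEC_NTPOFF)))
                  which can be emitted as %fs:x@tpoff.
   initial-exec:  the offset is first loaded from the GOT,
                    (mem (const (unspec [x] UNSPEC_GOTNTPOFF)))
                  i.e. x@gottpoff(%rip), and then added to tp.
   global-dynamic and local-dynamic call __tls_get_addr (or use the GNU2
   descriptor scheme when TARGET_GNU2_TLS).  */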
16385 static rtx
16386 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16388 rtx dest, base, off;
16389 rtx pic = NULL_RTX, tp = NULL_RTX;
16390 machine_mode tp_mode = Pmode;
16391 int type;
16393 /* Fall back to the global-dynamic model if the toolchain cannot support
16394 local dynamic. */
16395 if (TARGET_SUN_TLS && !TARGET_64BIT
16396 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16397 && model == TLS_MODEL_LOCAL_DYNAMIC)
16398 model = TLS_MODEL_GLOBAL_DYNAMIC;
16400 switch (model)
16402 case TLS_MODEL_GLOBAL_DYNAMIC:
16403 dest = gen_reg_rtx (Pmode);
16405 if (!TARGET_64BIT)
16407 if (flag_pic && !TARGET_PECOFF)
16408 pic = pic_offset_table_rtx;
16409 else
16411 pic = gen_reg_rtx (Pmode);
16412 emit_insn (gen_set_got (pic));
16416 if (TARGET_GNU2_TLS)
16418 if (TARGET_64BIT)
16419 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16420 else
16421 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16423 tp = get_thread_pointer (Pmode, true);
16424 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16426 if (GET_MODE (x) != Pmode)
16427 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16429 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16431 else
16433 rtx caddr = ix86_tls_get_addr ();
16435 if (TARGET_64BIT)
16437 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16438 rtx_insn *insns;
16440 start_sequence ();
16441 emit_call_insn
16442 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16443 insns = get_insns ();
16444 end_sequence ();
16446 if (GET_MODE (x) != Pmode)
16447 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16449 RTL_CONST_CALL_P (insns) = 1;
16450 emit_libcall_block (insns, dest, rax, x);
16452 else
16453 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16455 break;
16457 case TLS_MODEL_LOCAL_DYNAMIC:
16458 base = gen_reg_rtx (Pmode);
16460 if (!TARGET_64BIT)
16462 if (flag_pic)
16463 pic = pic_offset_table_rtx;
16464 else
16466 pic = gen_reg_rtx (Pmode);
16467 emit_insn (gen_set_got (pic));
16471 if (TARGET_GNU2_TLS)
16473 rtx tmp = ix86_tls_module_base ();
16475 if (TARGET_64BIT)
16476 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16477 else
16478 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16480 tp = get_thread_pointer (Pmode, true);
16481 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16482 gen_rtx_MINUS (Pmode, tmp, tp));
16484 else
16486 rtx caddr = ix86_tls_get_addr ();
16488 if (TARGET_64BIT)
16490 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16491 rtx_insn *insns;
16492 rtx eqv;
16494 start_sequence ();
16495 emit_call_insn
16496 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16497 insns = get_insns ();
16498 end_sequence ();
16500 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16501 share the LD_BASE result with other LD model accesses. */
16502 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16503 UNSPEC_TLS_LD_BASE);
16505 RTL_CONST_CALL_P (insns) = 1;
16506 emit_libcall_block (insns, base, rax, eqv);
16508 else
16509 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16512 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16513 off = gen_rtx_CONST (Pmode, off);
16515 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16517 if (TARGET_GNU2_TLS)
16519 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16521 if (GET_MODE (x) != Pmode)
16522 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16524 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16526 break;
16528 case TLS_MODEL_INITIAL_EXEC:
16529 if (TARGET_64BIT)
16531 if (TARGET_SUN_TLS && !TARGET_X32)
16533 /* The Sun linker took the AMD64 TLS spec literally
16534 and can only handle %rax as the destination of the
16535 initial-exec code sequence. */
16537 dest = gen_reg_rtx (DImode);
16538 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16539 return dest;
16542 /* Generate DImode references to avoid %fs:(%reg32)
16543 problems and the linker IE->LE relaxation bug. */
16544 tp_mode = DImode;
16545 pic = NULL;
16546 type = UNSPEC_GOTNTPOFF;
16548 else if (flag_pic)
16550 pic = pic_offset_table_rtx;
16551 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16553 else if (!TARGET_ANY_GNU_TLS)
16555 pic = gen_reg_rtx (Pmode);
16556 emit_insn (gen_set_got (pic));
16557 type = UNSPEC_GOTTPOFF;
16559 else
16561 pic = NULL;
16562 type = UNSPEC_INDNTPOFF;
16565 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16566 off = gen_rtx_CONST (tp_mode, off);
16567 if (pic)
16568 off = gen_rtx_PLUS (tp_mode, pic, off);
16569 off = gen_const_mem (tp_mode, off);
16570 set_mem_alias_set (off, ix86_GOT_alias_set ());
16572 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16574 base = get_thread_pointer (tp_mode,
16575 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16576 off = force_reg (tp_mode, off);
16577 dest = gen_rtx_PLUS (tp_mode, base, off);
16578 if (tp_mode != Pmode)
16579 dest = convert_to_mode (Pmode, dest, 1);
16581 else
16583 base = get_thread_pointer (Pmode, true);
16584 dest = gen_reg_rtx (Pmode);
16585 emit_insn (ix86_gen_sub3 (dest, base, off));
16587 break;
16589 case TLS_MODEL_LOCAL_EXEC:
16590 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16591 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16592 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16593 off = gen_rtx_CONST (Pmode, off);
16595 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16597 base = get_thread_pointer (Pmode,
16598 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16599 return gen_rtx_PLUS (Pmode, base, off);
16601 else
16603 base = get_thread_pointer (Pmode, true);
16604 dest = gen_reg_rtx (Pmode);
16605 emit_insn (ix86_gen_sub3 (dest, base, off));
16607 break;
16609 default:
16610 gcc_unreachable ();
16613 return dest;
16616 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16617 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16618 unique refptr-DECL symbol corresponding to symbol DECL. */
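/* Illustrative note (editorial): on PE-COFF targets a dllimport'ed symbol
   FOO is not referenced directly; the import library provides a pointer
   named __imp_FOO (or __imp__FOO, depending on the label prefix, built
   below with the "*__imp_" / "*__imp__" prefixes), and the access becomes
   a load through that pointer.  The refptr-FOO variant plays the analogous
   role for external data in the medium and large PIC code models.  */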
16620 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16622 static inline hashval_t hash (tree_map *m) { return m->hash; }
16623 static inline bool
16624 equal (tree_map *a, tree_map *b)
16626 return a->base.from == b->base.from;
16629 static int
16630 keep_cache_entry (tree_map *&m)
16632 return ggc_marked_p (m->base.from);
16636 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16638 static tree
16639 get_dllimport_decl (tree decl, bool beimport)
16641 struct tree_map *h, in;
16642 const char *name;
16643 const char *prefix;
16644 size_t namelen, prefixlen;
16645 char *imp_name;
16646 tree to;
16647 rtx rtl;
16649 if (!dllimport_map)
16650 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16652 in.hash = htab_hash_pointer (decl);
16653 in.base.from = decl;
16654 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16655 h = *loc;
16656 if (h)
16657 return h->to;
16659 *loc = h = ggc_alloc<tree_map> ();
16660 h->hash = in.hash;
16661 h->base.from = decl;
16662 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16663 VAR_DECL, NULL, ptr_type_node);
16664 DECL_ARTIFICIAL (to) = 1;
16665 DECL_IGNORED_P (to) = 1;
16666 DECL_EXTERNAL (to) = 1;
16667 TREE_READONLY (to) = 1;
16669 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16670 name = targetm.strip_name_encoding (name);
16671 if (beimport)
16672 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16673 ? "*__imp_" : "*__imp__";
16674 else
16675 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16676 namelen = strlen (name);
16677 prefixlen = strlen (prefix);
16678 imp_name = (char *) alloca (namelen + prefixlen + 1);
16679 memcpy (imp_name, prefix, prefixlen);
16680 memcpy (imp_name + prefixlen, name, namelen + 1);
16682 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16683 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16684 SET_SYMBOL_REF_DECL (rtl, to);
16685 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16686 if (!beimport)
16688 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16689 #ifdef SUB_TARGET_RECORD_STUB
16690 SUB_TARGET_RECORD_STUB (name);
16691 #endif
16694 rtl = gen_const_mem (Pmode, rtl);
16695 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16697 SET_DECL_RTL (to, rtl);
16698 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16700 return to;
16703 /* Expand SYMBOL into its corresponding far-address symbol.
16704 WANT_REG is true if we require the result be a register. */
16706 static rtx
16707 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16709 tree imp_decl;
16710 rtx x;
16712 gcc_assert (SYMBOL_REF_DECL (symbol));
16713 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16715 x = DECL_RTL (imp_decl);
16716 if (want_reg)
16717 x = force_reg (Pmode, x);
16718 return x;
16721 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16722 true if we require the result be a register. */
16724 static rtx
16725 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16727 tree imp_decl;
16728 rtx x;
16730 gcc_assert (SYMBOL_REF_DECL (symbol));
16731 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16733 x = DECL_RTL (imp_decl);
16734 if (want_reg)
16735 x = force_reg (Pmode, x);
16736 return x;
16739 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
16740 is true if we require the result be a register. */
16742 static rtx
16743 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16745 if (!TARGET_PECOFF)
16746 return NULL_RTX;
16748 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16750 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16751 return legitimize_dllimport_symbol (addr, inreg);
16752 if (GET_CODE (addr) == CONST
16753 && GET_CODE (XEXP (addr, 0)) == PLUS
16754 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16755 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16757 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16758 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16762 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16763 return NULL_RTX;
16764 if (GET_CODE (addr) == SYMBOL_REF
16765 && !is_imported_p (addr)
16766 && SYMBOL_REF_EXTERNAL_P (addr)
16767 && SYMBOL_REF_DECL (addr))
16768 return legitimize_pe_coff_extern_decl (addr, inreg);
16770 if (GET_CODE (addr) == CONST
16771 && GET_CODE (XEXP (addr, 0)) == PLUS
16772 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16773 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16774 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16775 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16777 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16778 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16780 return NULL_RTX;
16783 /* Try machine-dependent ways of modifying an illegitimate address
16784 to be legitimate. If we find one, return the new, valid address.
16785 This macro is used in only one place: `memory_address' in explow.c.
16787 OLDX is the address as it was before break_out_memory_refs was called.
16788 In some cases it is useful to look at this to decide what needs to be done.
16790 It is always safe for this macro to do nothing. It exists to recognize
16791 opportunities to optimize the output.
16793 For the 80386, we handle X+REG by loading X into a register R and
16794 using R+REG. R will go in a general reg and indexing will be used.
16795 However, if REG is a broken-out memory address or multiplication,
16796 nothing needs to be done because REG can certainly go in a general reg.
16798 When -fpic is used, special handling is needed for symbolic references.
16799 See comments by legitimize_pic_address in i386.c for details. */
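/* Illustrative note (an editorial sketch of the transformations below):
   shifts by 0..3 are rewritten as multiplications so that they can match
   the SIB scale factors 1, 2, 4 and 8, e.g.

     (plus (ashift reg1 2) reg2)  ->  (plus (mult reg1 4) reg2)

   and sums of the form (mult + (reg + const)) are reassociated into
   ((mult + reg) + const), which is the canonical base + index*scale + disp
   shape of an x86 effective address.  */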
16801 static rtx
16802 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16804 bool changed = false;
16805 unsigned log;
16807 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16808 if (log)
16809 return legitimize_tls_address (x, (enum tls_model) log, false);
16810 if (GET_CODE (x) == CONST
16811 && GET_CODE (XEXP (x, 0)) == PLUS
16812 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16813 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16815 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16816 (enum tls_model) log, false);
16817 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16820 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16822 rtx tmp = legitimize_pe_coff_symbol (x, true);
16823 if (tmp)
16824 return tmp;
16827 if (flag_pic && SYMBOLIC_CONST (x))
16828 return legitimize_pic_address (x, 0);
16830 #if TARGET_MACHO
16831 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16832 return machopic_indirect_data_reference (x, 0);
16833 #endif
16835 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16836 if (GET_CODE (x) == ASHIFT
16837 && CONST_INT_P (XEXP (x, 1))
16838 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16840 changed = true;
16841 log = INTVAL (XEXP (x, 1));
16842 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16843 GEN_INT (1 << log));
16846 if (GET_CODE (x) == PLUS)
16848 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16850 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16851 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16852 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16854 changed = true;
16855 log = INTVAL (XEXP (XEXP (x, 0), 1));
16856 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16857 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16858 GEN_INT (1 << log));
16861 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16862 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16863 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16865 changed = true;
16866 log = INTVAL (XEXP (XEXP (x, 1), 1));
16867 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16868 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16869 GEN_INT (1 << log));
16872 /* Put multiply first if it isn't already. */
16873 if (GET_CODE (XEXP (x, 1)) == MULT)
16875 std::swap (XEXP (x, 0), XEXP (x, 1));
16876 changed = true;
16879 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16880 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16881 created by virtual register instantiation, register elimination, and
16882 similar optimizations. */
16883 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16885 changed = true;
16886 x = gen_rtx_PLUS (Pmode,
16887 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16888 XEXP (XEXP (x, 1), 0)),
16889 XEXP (XEXP (x, 1), 1));
16892 /* Canonicalize
16893 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16894 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16895 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16896 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16897 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16898 && CONSTANT_P (XEXP (x, 1)))
16900 rtx constant;
16901 rtx other = NULL_RTX;
16903 if (CONST_INT_P (XEXP (x, 1)))
16905 constant = XEXP (x, 1);
16906 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16908 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16910 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16911 other = XEXP (x, 1);
16913 else
16914 constant = 0;
16916 if (constant)
16918 changed = true;
16919 x = gen_rtx_PLUS (Pmode,
16920 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16921 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16922 plus_constant (Pmode, other,
16923 INTVAL (constant)));
16927 if (changed && ix86_legitimate_address_p (mode, x, false))
16928 return x;
16930 if (GET_CODE (XEXP (x, 0)) == MULT)
16932 changed = true;
16933 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16936 if (GET_CODE (XEXP (x, 1)) == MULT)
16938 changed = true;
16939 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16942 if (changed
16943 && REG_P (XEXP (x, 1))
16944 && REG_P (XEXP (x, 0)))
16945 return x;
16947 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16949 changed = true;
16950 x = legitimize_pic_address (x, 0);
16953 if (changed && ix86_legitimate_address_p (mode, x, false))
16954 return x;
16956 if (REG_P (XEXP (x, 0)))
16958 rtx temp = gen_reg_rtx (Pmode);
16959 rtx val = force_operand (XEXP (x, 1), temp);
16960 if (val != temp)
16962 val = convert_to_mode (Pmode, val, 1);
16963 emit_move_insn (temp, val);
16966 XEXP (x, 1) = temp;
16967 return x;
16970 else if (REG_P (XEXP (x, 1)))
16972 rtx temp = gen_reg_rtx (Pmode);
16973 rtx val = force_operand (XEXP (x, 0), temp);
16974 if (val != temp)
16976 val = convert_to_mode (Pmode, val, 1);
16977 emit_move_insn (temp, val);
16980 XEXP (x, 0) = temp;
16981 return x;
16985 return x;
16988 /* Print an integer constant expression in assembler syntax. Addition
16989 and subtraction are the only arithmetic that may appear in these
16990 expressions. FILE is the stdio stream to write to, X is the rtx, and
16991 CODE is the operand print code from the output string. */
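/* Illustrative note (editorial; the spellings come from the switch below):
   for example, (const (unspec [foo] UNSPEC_GOTOFF)) prints as "foo@GOTOFF",
   UNSPEC_GOTPCREL prints as "foo@GOTPCREL(%rip)" in AT&T syntax, and a
   non-local SYMBOL_REF printed with the 'P' operand code gets an "@PLT"
   suffix.  */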
16993 static void
16994 output_pic_addr_const (FILE *file, rtx x, int code)
16996 char buf[256];
16998 switch (GET_CODE (x))
17000 case PC:
17001 gcc_assert (flag_pic);
17002 putc ('.', file);
17003 break;
17005 case SYMBOL_REF:
17006 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17007 output_addr_const (file, x);
17008 else
17010 const char *name = XSTR (x, 0);
17012 /* Mark the decl as referenced so that cgraph will
17013 output the function. */
17014 if (SYMBOL_REF_DECL (x))
17015 mark_decl_referenced (SYMBOL_REF_DECL (x));
17017 #if TARGET_MACHO
17018 if (MACHOPIC_INDIRECT
17019 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17020 name = machopic_indirection_name (x, /*stub_p=*/true);
17021 #endif
17022 assemble_name (file, name);
17024 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17025 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17026 fputs ("@PLT", file);
17027 break;
17029 case LABEL_REF:
17030 x = XEXP (x, 0);
17031 /* FALLTHRU */
17032 case CODE_LABEL:
17033 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17034 assemble_name (asm_out_file, buf);
17035 break;
17037 case CONST_INT:
17038 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17039 break;
17041 case CONST:
17042 /* This used to output parentheses around the expression,
17043 but that does not work on the 386 (either ATT or BSD assembler). */
17044 output_pic_addr_const (file, XEXP (x, 0), code);
17045 break;
17047 case CONST_DOUBLE:
17048 /* We can't handle floating point constants;
17049 TARGET_PRINT_OPERAND must handle them. */
17050 output_operand_lossage ("floating constant misused");
17051 break;
17053 case PLUS:
17054 /* Some assemblers need integer constants to appear first. */
17055 if (CONST_INT_P (XEXP (x, 0)))
17057 output_pic_addr_const (file, XEXP (x, 0), code);
17058 putc ('+', file);
17059 output_pic_addr_const (file, XEXP (x, 1), code);
17061 else
17063 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17064 output_pic_addr_const (file, XEXP (x, 1), code);
17065 putc ('+', file);
17066 output_pic_addr_const (file, XEXP (x, 0), code);
17068 break;
17070 case MINUS:
17071 if (!TARGET_MACHO)
17072 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17073 output_pic_addr_const (file, XEXP (x, 0), code);
17074 putc ('-', file);
17075 output_pic_addr_const (file, XEXP (x, 1), code);
17076 if (!TARGET_MACHO)
17077 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17078 break;
17080 case UNSPEC:
17081 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
17083 bool f = i386_asm_output_addr_const_extra (file, x);
17084 gcc_assert (f);
17085 break;
17088 gcc_assert (XVECLEN (x, 0) == 1);
17089 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17090 switch (XINT (x, 1))
17092 case UNSPEC_GOT:
17093 fputs ("@GOT", file);
17094 break;
17095 case UNSPEC_GOTOFF:
17096 fputs ("@GOTOFF", file);
17097 break;
17098 case UNSPEC_PLTOFF:
17099 fputs ("@PLTOFF", file);
17100 break;
17101 case UNSPEC_PCREL:
17102 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17103 "(%rip)" : "[rip]", file);
17104 break;
17105 case UNSPEC_GOTPCREL:
17106 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17107 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17108 break;
17109 case UNSPEC_GOTTPOFF:
17110 /* FIXME: This might be @TPOFF in Sun ld too. */
17111 fputs ("@gottpoff", file);
17112 break;
17113 case UNSPEC_TPOFF:
17114 fputs ("@tpoff", file);
17115 break;
17116 case UNSPEC_NTPOFF:
17117 if (TARGET_64BIT)
17118 fputs ("@tpoff", file);
17119 else
17120 fputs ("@ntpoff", file);
17121 break;
17122 case UNSPEC_DTPOFF:
17123 fputs ("@dtpoff", file);
17124 break;
17125 case UNSPEC_GOTNTPOFF:
17126 if (TARGET_64BIT)
17127 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17128 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17129 else
17130 fputs ("@gotntpoff", file);
17131 break;
17132 case UNSPEC_INDNTPOFF:
17133 fputs ("@indntpoff", file);
17134 break;
17135 #if TARGET_MACHO
17136 case UNSPEC_MACHOPIC_OFFSET:
17137 putc ('-', file);
17138 machopic_output_function_base_name (file);
17139 break;
17140 #endif
17141 default:
17142 output_operand_lossage ("invalid UNSPEC as operand");
17143 break;
17145 break;
17147 default:
17148 output_operand_lossage ("invalid expression as operand");
17152 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17153 We need to emit DTP-relative relocations. */
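/* Illustrative note (editorial, assuming ASM_LONG expands to ".long"):
   for a TLS variable foo this emits ".long foo@dtpoff" when SIZE is 4 and
   ".long foo@dtpoff, 0" when SIZE is 8, producing the DTP-relative
   relocation the DWARF consumer expects.  */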
17155 static void ATTRIBUTE_UNUSED
17156 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
17158 fputs (ASM_LONG, file);
17159 output_addr_const (file, x);
17160 fputs ("@dtpoff", file);
17161 switch (size)
17163 case 4:
17164 break;
17165 case 8:
17166 fputs (", 0", file);
17167 break;
17168 default:
17169 gcc_unreachable ();
17173 /* Return true if X is a representation of the PIC register. This copes
17174 with calls from ix86_find_base_term, where the register might have
17175 been replaced by a cselib value. */
17177 static bool
17178 ix86_pic_register_p (rtx x)
17180 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
17181 return (pic_offset_table_rtx
17182 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
17183 else if (!REG_P (x))
17184 return false;
17185 else if (pic_offset_table_rtx)
17187 if (REGNO (x) == REGNO (pic_offset_table_rtx))
17188 return true;
17189 if (HARD_REGISTER_P (x)
17190 && !HARD_REGISTER_P (pic_offset_table_rtx)
17191 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
17192 return true;
17193 return false;
17195 else
17196 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
17199 /* Helper function for ix86_delegitimize_address.
17200 Attempt to delegitimize TLS local-exec accesses. */
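/* Illustrative note (editorial): given a local-exec access whose address
   decomposes to the TLS segment register with a displacement of
   (const (unspec [foo] UNSPEC_NTPOFF)), e.g. %fs:foo@tpoff(%rax) on
   x86-64, this returns foo again (re-applying any base, index and constant
   offset), so that debug output and base-term analysis see the original
   symbol.  */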
17202 static rtx
17203 ix86_delegitimize_tls_address (rtx orig_x)
17205 rtx x = orig_x, unspec;
17206 struct ix86_address addr;
17208 if (!TARGET_TLS_DIRECT_SEG_REFS)
17209 return orig_x;
17210 if (MEM_P (x))
17211 x = XEXP (x, 0);
17212 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17213 return orig_x;
17214 if (ix86_decompose_address (x, &addr) == 0
17215 || addr.seg != DEFAULT_TLS_SEG_REG
17216 || addr.disp == NULL_RTX
17217 || GET_CODE (addr.disp) != CONST)
17218 return orig_x;
17219 unspec = XEXP (addr.disp, 0);
17220 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17221 unspec = XEXP (unspec, 0);
17222 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17223 return orig_x;
17224 x = XVECEXP (unspec, 0, 0);
17225 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17226 if (unspec != XEXP (addr.disp, 0))
17227 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17228 if (addr.index)
17230 rtx idx = addr.index;
17231 if (addr.scale != 1)
17232 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17233 x = gen_rtx_PLUS (Pmode, idx, x);
17235 if (addr.base)
17236 x = gen_rtx_PLUS (Pmode, addr.base, x);
17237 if (MEM_P (orig_x))
17238 x = replace_equiv_address_nv (orig_x, x);
17239 return x;
17242 /* In the name of slightly smaller debug output, and to cater to
17243 general assembler lossage, recognize PIC+GOTOFF and turn it back
17244 into a direct symbol reference.
17246 On Darwin, this is necessary to avoid a crash, because Darwin
17247 has a different PIC label for each routine but the DWARF debugging
17248 information is not associated with any particular routine, so it's
17249 necessary to remove references to the PIC label from RTL stored by
17250 the DWARF output code. */
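/* Illustrative note (an editorial sketch of the common cases below):
   (plus pic_offset_table_rtx (const (unspec [foo] UNSPEC_GOTOFF)))
   is turned back into plain foo, and on x86-64 a load from
   (const (unspec [foo] UNSPEC_GOTPCREL)) is likewise reduced to foo.  */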
17252 static rtx
17253 ix86_delegitimize_address (rtx x)
17255 rtx orig_x = delegitimize_mem_from_attrs (x);
17256 /* addend is NULL or some rtx if x is something+GOTOFF where
17257 something doesn't include the PIC register. */
17258 rtx addend = NULL_RTX;
17259 /* reg_addend is NULL or a multiple of some register. */
17260 rtx reg_addend = NULL_RTX;
17261 /* const_addend is NULL or a const_int. */
17262 rtx const_addend = NULL_RTX;
17263 /* This is the result, or NULL. */
17264 rtx result = NULL_RTX;
17266 x = orig_x;
17268 if (MEM_P (x))
17269 x = XEXP (x, 0);
17271 if (TARGET_64BIT)
17273 if (GET_CODE (x) == CONST
17274 && GET_CODE (XEXP (x, 0)) == PLUS
17275 && GET_MODE (XEXP (x, 0)) == Pmode
17276 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17277 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17278 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17280 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17281 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17282 if (MEM_P (orig_x))
17283 x = replace_equiv_address_nv (orig_x, x);
17284 return x;
17287 if (GET_CODE (x) == CONST
17288 && GET_CODE (XEXP (x, 0)) == UNSPEC
17289 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17290 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17291 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17293 x = XVECEXP (XEXP (x, 0), 0, 0);
17294 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17296 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17297 if (x == NULL_RTX)
17298 return orig_x;
17300 return x;
17303 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17304 return ix86_delegitimize_tls_address (orig_x);
17306 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17307 and -mcmodel=medium -fpic. */
17310 if (GET_CODE (x) != PLUS
17311 || GET_CODE (XEXP (x, 1)) != CONST)
17312 return ix86_delegitimize_tls_address (orig_x);
17314 if (ix86_pic_register_p (XEXP (x, 0)))
17315 /* %ebx + GOT/GOTOFF */
17317 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17319 /* %ebx + %reg * scale + GOT/GOTOFF */
17320 reg_addend = XEXP (x, 0);
17321 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17322 reg_addend = XEXP (reg_addend, 1);
17323 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17324 reg_addend = XEXP (reg_addend, 0);
17325 else
17327 reg_addend = NULL_RTX;
17328 addend = XEXP (x, 0);
17331 else
17332 addend = XEXP (x, 0);
17334 x = XEXP (XEXP (x, 1), 0);
17335 if (GET_CODE (x) == PLUS
17336 && CONST_INT_P (XEXP (x, 1)))
17338 const_addend = XEXP (x, 1);
17339 x = XEXP (x, 0);
17342 if (GET_CODE (x) == UNSPEC
17343 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17344 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17345 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17346 && !MEM_P (orig_x) && !addend)))
17347 result = XVECEXP (x, 0, 0);
17349 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17350 && !MEM_P (orig_x))
17351 result = XVECEXP (x, 0, 0);
17353 if (! result)
17354 return ix86_delegitimize_tls_address (orig_x);
17356 if (const_addend)
17357 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17358 if (reg_addend)
17359 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17360 if (addend)
17362 /* If the rest of the original X doesn't involve the PIC register, add
17363 addend and subtract pic_offset_table_rtx. This can happen e.g.
17364 for code like:
17365 leal (%ebx, %ecx, 4), %ecx
17367 movl foo@GOTOFF(%ecx), %edx
17368 in which case we return (%ecx - %ebx) + foo
17369 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17370 and reload has completed. */
17371 if (pic_offset_table_rtx
17372 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17373 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17374 pic_offset_table_rtx),
17375 result);
17376 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
17378 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17379 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17380 result = gen_rtx_PLUS (Pmode, tmp, result);
17382 else
17383 return orig_x;
17385 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17387 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17388 if (result == NULL_RTX)
17389 return orig_x;
17391 return result;
17394 /* If X is a machine specific address (i.e. a symbol or label being
17395 referenced as a displacement from the GOT implemented using an
17396 UNSPEC), then return the base term. Otherwise return X. */
17399 ix86_find_base_term (rtx x)
17401 rtx term;
17403 if (TARGET_64BIT)
17405 if (GET_CODE (x) != CONST)
17406 return x;
17407 term = XEXP (x, 0);
17408 if (GET_CODE (term) == PLUS
17409 && CONST_INT_P (XEXP (term, 1)))
17410 term = XEXP (term, 0);
17411 if (GET_CODE (term) != UNSPEC
17412 || (XINT (term, 1) != UNSPEC_GOTPCREL
17413 && XINT (term, 1) != UNSPEC_PCREL))
17414 return x;
17416 return XVECEXP (term, 0, 0);
17419 return ix86_delegitimize_address (x);
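/* Summary (added for clarity): print to FILE the condition-code suffix for
   comparison CODE in MODE ("e", "ne", "g", "b", ...).  REVERSE prints the
   reversed condition; FP selects the alternate spellings needed for
   fcmov/SSE operands (e.g. "nbe" rather than "a" for GTU).  For instance,
   an EQ comparison printed via the %C operand code yields "e", so a setcc
   template emits "sete".  */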
17422 static void
17423 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17424 bool fp, FILE *file)
17426 const char *suffix;
17428 if (mode == CCFPmode || mode == CCFPUmode)
17430 code = ix86_fp_compare_code_to_integer (code);
17431 mode = CCmode;
17433 if (reverse)
17434 code = reverse_condition (code);
17436 switch (code)
17438 case EQ:
17439 switch (mode)
17441 case CCAmode:
17442 suffix = "a";
17443 break;
17444 case CCCmode:
17445 suffix = "c";
17446 break;
17447 case CCOmode:
17448 suffix = "o";
17449 break;
17450 case CCPmode:
17451 suffix = "p";
17452 break;
17453 case CCSmode:
17454 suffix = "s";
17455 break;
17456 default:
17457 suffix = "e";
17458 break;
17460 break;
17461 case NE:
17462 switch (mode)
17464 case CCAmode:
17465 suffix = "na";
17466 break;
17467 case CCCmode:
17468 suffix = "nc";
17469 break;
17470 case CCOmode:
17471 suffix = "no";
17472 break;
17473 case CCPmode:
17474 suffix = "np";
17475 break;
17476 case CCSmode:
17477 suffix = "ns";
17478 break;
17479 default:
17480 suffix = "ne";
17481 break;
17483 break;
17484 case GT:
17485 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17486 suffix = "g";
17487 break;
17488 case GTU:
17489 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17490 Those same assemblers have the same but opposite lossage on cmov. */
17491 if (mode == CCmode)
17492 suffix = fp ? "nbe" : "a";
17493 else
17494 gcc_unreachable ();
17495 break;
17496 case LT:
17497 switch (mode)
17499 case CCNOmode:
17500 case CCGOCmode:
17501 suffix = "s";
17502 break;
17504 case CCmode:
17505 case CCGCmode:
17506 suffix = "l";
17507 break;
17509 default:
17510 gcc_unreachable ();
17512 break;
17513 case LTU:
17514 if (mode == CCmode)
17515 suffix = "b";
17516 else if (mode == CCCmode)
17517 suffix = fp ? "b" : "c";
17518 else
17519 gcc_unreachable ();
17520 break;
17521 case GE:
17522 switch (mode)
17524 case CCNOmode:
17525 case CCGOCmode:
17526 suffix = "ns";
17527 break;
17529 case CCmode:
17530 case CCGCmode:
17531 suffix = "ge";
17532 break;
17534 default:
17535 gcc_unreachable ();
17537 break;
17538 case GEU:
17539 if (mode == CCmode)
17540 suffix = "nb";
17541 else if (mode == CCCmode)
17542 suffix = fp ? "nb" : "nc";
17543 else
17544 gcc_unreachable ();
17545 break;
17546 case LE:
17547 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17548 suffix = "le";
17549 break;
17550 case LEU:
17551 if (mode == CCmode)
17552 suffix = "be";
17553 else
17554 gcc_unreachable ();
17555 break;
17556 case UNORDERED:
17557 suffix = fp ? "u" : "p";
17558 break;
17559 case ORDERED:
17560 suffix = fp ? "nu" : "np";
17561 break;
17562 default:
17563 gcc_unreachable ();
17565 fputs (suffix, file);
17568 /* Print the name of register X to FILE based on its machine mode and number.
17569 If CODE is 'w', pretend the mode is HImode.
17570 If CODE is 'b', pretend the mode is QImode.
17571 If CODE is 'k', pretend the mode is SImode.
17572 If CODE is 'q', pretend the mode is DImode.
17573 If CODE is 'x', pretend the mode is V4SFmode.
17574 If CODE is 't', pretend the mode is V8SFmode.
17575 If CODE is 'g', pretend the mode is V16SFmode.
17576 If CODE is 'h', pretend the reg is the 'high' byte register.
17577 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
17578 If CODE is 'd', duplicate the operand for AVX instruction.
17581 void
17582 print_reg (rtx x, int code, FILE *file)
17584 const char *reg;
17585 int msize;
17586 unsigned int regno;
17587 bool duplicated;
17589 if (ASSEMBLER_DIALECT == ASM_ATT)
17590 putc ('%', file);
17592 if (x == pc_rtx)
17594 gcc_assert (TARGET_64BIT);
17595 fputs ("rip", file);
17596 return;
17599 if (code == 'y' && STACK_TOP_P (x))
17601 fputs ("st(0)", file);
17602 return;
17605 if (code == 'w')
17606 msize = 2;
17607 else if (code == 'b')
17608 msize = 1;
17609 else if (code == 'k')
17610 msize = 4;
17611 else if (code == 'q')
17612 msize = 8;
17613 else if (code == 'h')
17614 msize = 0;
17615 else if (code == 'x')
17616 msize = 16;
17617 else if (code == 't')
17618 msize = 32;
17619 else if (code == 'g')
17620 msize = 64;
17621 else
17622 msize = GET_MODE_SIZE (GET_MODE (x));
17624 regno = REGNO (x);
17626 gcc_assert (regno != ARG_POINTER_REGNUM
17627 && regno != FRAME_POINTER_REGNUM
17628 && regno != FPSR_REG
17629 && regno != FPCR_REG);
17631 if (regno == FLAGS_REG)
17633 output_operand_lossage ("invalid use of asm flag output");
17634 return;
17637 duplicated = code == 'd' && TARGET_AVX;
17639 switch (msize)
17641 case 8:
17642 case 4:
17643 if (LEGACY_INT_REGNO_P (regno))
17644 putc (msize == 8 && TARGET_64BIT ? 'r' : 'e', file);
17645 /* FALLTHRU */
17646 case 16:
17647 case 12:
17648 case 2:
17649 normal:
17650 reg = hi_reg_name[regno];
17651 break;
17652 case 1:
17653 if (regno >= ARRAY_SIZE (qi_reg_name))
17654 goto normal;
17655 reg = qi_reg_name[regno];
17656 break;
17657 case 0:
17658 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17659 goto normal;
17660 reg = qi_high_reg_name[regno];
17661 break;
17662 case 32:
17663 case 64:
17664 if (SSE_REGNO_P (regno))
17666 gcc_assert (!duplicated);
17667 putc (msize == 32 ? 'y' : 'z', file);
17668 reg = hi_reg_name[regno] + 1;
17669 break;
17671 goto normal;
17672 default:
17673 gcc_unreachable ();
17676 fputs (reg, file);
17678 /* Irritatingly, AMD extended registers use a
17679 different naming convention: "r%d[bwd]". */
17680 if (REX_INT_REGNO_P (regno))
17682 gcc_assert (TARGET_64BIT);
17683 switch (msize)
17685 case 0:
17686 error ("extended registers have no high halves");
17687 break;
17688 case 1:
17689 putc ('b', file);
17690 break;
17691 case 2:
17692 putc ('w', file);
17693 break;
17694 case 4:
17695 putc ('d', file);
17696 break;
17697 case 8:
17698 /* no suffix */
17699 break;
17700 default:
17701 error ("unsupported operand size for extended register");
17702 break;
17704 return;
17707 if (duplicated)
17709 if (ASSEMBLER_DIALECT == ASM_ATT)
17710 fprintf (file, ", %%%s", reg);
17711 else
17712 fprintf (file, ", %s", reg);
17716 /* Meaning of CODE:
17717 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17718 C -- print opcode suffix for set/cmov insn.
17719 c -- like C, but print reversed condition
17720 F,f -- likewise, but for floating-point.
17721 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17722 otherwise nothing
17723 R -- print embedded rounding and sae.
17724 r -- print only sae.
17725 z -- print the opcode suffix for the size of the current operand.
17726 Z -- likewise, with special suffixes for x87 instructions.
17727 * -- print a star (in certain assembler syntax)
17728 A -- print an absolute memory reference.
17729 E -- print address with DImode register names if TARGET_64BIT.
17730 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17731 s -- print a shift double count, followed by the assembler's argument
17732 delimiter.
17733 b -- print the QImode name of the register for the indicated operand.
17734 %b0 would print %al if operands[0] is reg 0.
17735 w -- likewise, print the HImode name of the register.
17736 k -- likewise, print the SImode name of the register.
17737 q -- likewise, print the DImode name of the register.
17738 x -- likewise, print the V4SFmode name of the register.
17739 t -- likewise, print the V8SFmode name of the register.
17740 g -- likewise, print the V16SFmode name of the register.
17741 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17742 y -- print "st(0)" instead of "st" as a register.
17743 d -- print duplicated register operand for AVX instruction.
17744 D -- print condition for SSE cmp instruction.
17745 P -- if PIC, print an @PLT suffix.
17746 p -- print raw symbol name.
17747 X -- don't print any sort of PIC '@' suffix for a symbol.
17748 & -- print some in-use local-dynamic symbol name.
17749 H -- print a memory address offset by 8; used for sse high-parts
17750 Y -- print condition for XOP pcom* instruction.
17751 + -- print a branch hint as 'cs' or 'ds' prefix
17752 ; -- print a semicolon (after prefixes due to bug in older gas).
17753 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17754 @ -- print a segment register of thread base pointer load
17755 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17756 ! -- print MPX prefix for jxx/call/ret instructions if required.
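/* Illustrative note (an editorial sketch of the codes listed above):
   with operands[0] a 32-bit integer register, "%z0" prints the size
   suffix "l", while "%b0", "%w0", "%k0" and "%q0" print the QImode,
   HImode, SImode and DImode names of the register (e.g. "%al", "%ax",
   "%eax", "%rax" for the AX register in AT&T syntax).  */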
17759 void
17760 ix86_print_operand (FILE *file, rtx x, int code)
17762 if (code)
17764 switch (code)
17766 case 'A':
17767 switch (ASSEMBLER_DIALECT)
17769 case ASM_ATT:
17770 putc ('*', file);
17771 break;
17773 case ASM_INTEL:
17774 /* Intel syntax. For absolute addresses, registers should not
17775 be surrounded by braces. */
17776 if (!REG_P (x))
17778 putc ('[', file);
17779 ix86_print_operand (file, x, 0);
17780 putc (']', file);
17781 return;
17783 break;
17785 default:
17786 gcc_unreachable ();
17789 ix86_print_operand (file, x, 0);
17790 return;
17792 case 'E':
17793 /* Wrap address in an UNSPEC to declare special handling. */
17794 if (TARGET_64BIT)
17795 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17797 output_address (VOIDmode, x);
17798 return;
17800 case 'L':
17801 if (ASSEMBLER_DIALECT == ASM_ATT)
17802 putc ('l', file);
17803 return;
17805 case 'W':
17806 if (ASSEMBLER_DIALECT == ASM_ATT)
17807 putc ('w', file);
17808 return;
17810 case 'B':
17811 if (ASSEMBLER_DIALECT == ASM_ATT)
17812 putc ('b', file);
17813 return;
17815 case 'Q':
17816 if (ASSEMBLER_DIALECT == ASM_ATT)
17817 putc ('l', file);
17818 return;
17820 case 'S':
17821 if (ASSEMBLER_DIALECT == ASM_ATT)
17822 putc ('s', file);
17823 return;
17825 case 'T':
17826 if (ASSEMBLER_DIALECT == ASM_ATT)
17827 putc ('t', file);
17828 return;
17830 case 'O':
17831 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17832 if (ASSEMBLER_DIALECT != ASM_ATT)
17833 return;
17835 switch (GET_MODE_SIZE (GET_MODE (x)))
17837 case 2:
17838 putc ('w', file);
17839 break;
17841 case 4:
17842 putc ('l', file);
17843 break;
17845 case 8:
17846 putc ('q', file);
17847 break;
17849 default:
17850 output_operand_lossage ("invalid operand size for operand "
17851 "code 'O'");
17852 return;
17855 putc ('.', file);
17856 #endif
17857 return;
17859 case 'z':
17860 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17862 /* Opcodes don't get size suffixes if using Intel opcodes. */
17863 if (ASSEMBLER_DIALECT == ASM_INTEL)
17864 return;
17866 switch (GET_MODE_SIZE (GET_MODE (x)))
17868 case 1:
17869 putc ('b', file);
17870 return;
17872 case 2:
17873 putc ('w', file);
17874 return;
17876 case 4:
17877 putc ('l', file);
17878 return;
17880 case 8:
17881 putc ('q', file);
17882 return;
17884 default:
17885 output_operand_lossage ("invalid operand size for operand "
17886 "code 'z'");
17887 return;
17891 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17892 warning (0, "non-integer operand used with operand code 'z'");
17893 /* FALLTHRU */
17895 case 'Z':
17896 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
17897 if (ASSEMBLER_DIALECT == ASM_INTEL)
17898 return;
17900 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17902 switch (GET_MODE_SIZE (GET_MODE (x)))
17904 case 2:
17905 #ifdef HAVE_AS_IX86_FILDS
17906 putc ('s', file);
17907 #endif
17908 return;
17910 case 4:
17911 putc ('l', file);
17912 return;
17914 case 8:
17915 #ifdef HAVE_AS_IX86_FILDQ
17916 putc ('q', file);
17917 #else
17918 fputs ("ll", file);
17919 #endif
17920 return;
17922 default:
17923 break;
17926 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17928 /* 387 opcodes don't get size suffixes
17929 if the operands are registers. */
17930 if (STACK_REG_P (x))
17931 return;
17933 switch (GET_MODE_SIZE (GET_MODE (x)))
17935 case 4:
17936 putc ('s', file);
17937 return;
17939 case 8:
17940 putc ('l', file);
17941 return;
17943 case 12:
17944 case 16:
17945 putc ('t', file);
17946 return;
17948 default:
17949 break;
17952 else
17954 output_operand_lossage ("invalid operand type used with "
17955 "operand code 'Z'");
17956 return;
17959 output_operand_lossage ("invalid operand size for operand code 'Z'");
17960 return;
17962 case 'd':
17963 case 'b':
17964 case 'w':
17965 case 'k':
17966 case 'q':
17967 case 'h':
17968 case 't':
17969 case 'g':
17970 case 'y':
17971 case 'x':
17972 case 'X':
17973 case 'P':
17974 case 'p':
17975 break;
17977 case 's':
17978 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17980 ix86_print_operand (file, x, 0);
17981 fputs (", ", file);
17983 return;
17985 case 'Y':
17986 switch (GET_CODE (x))
17988 case NE:
17989 fputs ("neq", file);
17990 break;
17991 case EQ:
17992 fputs ("eq", file);
17993 break;
17994 case GE:
17995 case GEU:
17996 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17997 break;
17998 case GT:
17999 case GTU:
18000 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18001 break;
18002 case LE:
18003 case LEU:
18004 fputs ("le", file);
18005 break;
18006 case LT:
18007 case LTU:
18008 fputs ("lt", file);
18009 break;
18010 case UNORDERED:
18011 fputs ("unord", file);
18012 break;
18013 case ORDERED:
18014 fputs ("ord", file);
18015 break;
18016 case UNEQ:
18017 fputs ("ueq", file);
18018 break;
18019 case UNGE:
18020 fputs ("nlt", file);
18021 break;
18022 case UNGT:
18023 fputs ("nle", file);
18024 break;
18025 case UNLE:
18026 fputs ("ule", file);
18027 break;
18028 case UNLT:
18029 fputs ("ult", file);
18030 break;
18031 case LTGT:
18032 fputs ("une", file);
18033 break;
18034 default:
18035 output_operand_lossage ("operand is not a condition code, "
18036 "invalid operand code 'Y'");
18037 return;
18039 return;
18041 case 'D':
18042 /* Little bit of braindamage here. The SSE compare instructions
18043 use completely different names for the comparisons than the
18044 fp conditional moves do. */
18045 switch (GET_CODE (x))
18047 case UNEQ:
18048 if (TARGET_AVX)
18050 fputs ("eq_us", file);
18051 break;
18053 /* FALLTHRU */
18054 case EQ:
18055 fputs ("eq", file);
18056 break;
18057 case UNLT:
18058 if (TARGET_AVX)
18060 fputs ("nge", file);
18061 break;
18063 /* FALLTHRU */
18064 case LT:
18065 fputs ("lt", file);
18066 break;
18067 case UNLE:
18068 if (TARGET_AVX)
18070 fputs ("ngt", file);
18071 break;
18073 /* FALLTHRU */
18074 case LE:
18075 fputs ("le", file);
18076 break;
18077 case UNORDERED:
18078 fputs ("unord", file);
18079 break;
18080 case LTGT:
18081 if (TARGET_AVX)
18083 fputs ("neq_oq", file);
18084 break;
18086 /* FALLTHRU */
18087 case NE:
18088 fputs ("neq", file);
18089 break;
18090 case GE:
18091 if (TARGET_AVX)
18093 fputs ("ge", file);
18094 break;
18096 /* FALLTHRU */
18097 case UNGE:
18098 fputs ("nlt", file);
18099 break;
18100 case GT:
18101 if (TARGET_AVX)
18103 fputs ("gt", file);
18104 break;
18106 /* FALLTHRU */
18107 case UNGT:
18108 fputs ("nle", file);
18109 break;
18110 case ORDERED:
18111 fputs ("ord", file);
18112 break;
18113 default:
18114 output_operand_lossage ("operand is not a condition code, "
18115 "invalid operand code 'D'");
18116 return;
18118 return;
18120 case 'F':
18121 case 'f':
18122 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18123 if (ASSEMBLER_DIALECT == ASM_ATT)
18124 putc ('.', file);
18125 gcc_fallthrough ();
18126 #endif
18128 case 'C':
18129 case 'c':
18130 if (!COMPARISON_P (x))
18132 output_operand_lossage ("operand is not a condition code, "
18133 "invalid operand code '%c'", code);
18134 return;
18136 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
18137 code == 'c' || code == 'f',
18138 code == 'F' || code == 'f',
18139 file);
18140 return;
18142 case 'H':
18143 if (!offsettable_memref_p (x))
18145 output_operand_lossage ("operand is not an offsettable memory "
18146 "reference, invalid operand code 'H'");
18147 return;
18149 /* It doesn't actually matter what mode we use here, as we're
18150 only going to use this for printing. */
18151 x = adjust_address_nv (x, DImode, 8);
18152 /* Output 'qword ptr' for intel assembler dialect. */
18153 if (ASSEMBLER_DIALECT == ASM_INTEL)
18154 code = 'q';
18155 break;
18157 case 'K':
18158 if (!CONST_INT_P (x))
18160 output_operand_lossage ("operand is not an integer, invalid "
18161 "operand code 'K'");
18162 return;
18165 if (INTVAL (x) & IX86_HLE_ACQUIRE)
18166 #ifdef HAVE_AS_IX86_HLE
18167 fputs ("xacquire ", file);
18168 #else
18169 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
18170 #endif
18171 else if (INTVAL (x) & IX86_HLE_RELEASE)
18172 #ifdef HAVE_AS_IX86_HLE
18173 fputs ("xrelease ", file);
18174 #else
18175 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
18176 #endif
18177 /* We do not want to print the value of the operand. */
18178 return;
18180 case 'N':
18181 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
18182 fputs ("{z}", file);
18183 return;
18185 case 'r':
18186 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
18188 output_operand_lossage ("operand is not a specific integer, "
18189 "invalid operand code 'r'");
18190 return;
18193 if (ASSEMBLER_DIALECT == ASM_INTEL)
18194 fputs (", ", file);
18196 fputs ("{sae}", file);
18198 if (ASSEMBLER_DIALECT == ASM_ATT)
18199 fputs (", ", file);
18201 return;
18203 case 'R':
18204 if (!CONST_INT_P (x))
18206 output_operand_lossage ("operand is not an integer, invalid "
18207 "operand code 'R'");
18208 return;
18211 if (ASSEMBLER_DIALECT == ASM_INTEL)
18212 fputs (", ", file);
18214 switch (INTVAL (x))
18216 case ROUND_NEAREST_INT | ROUND_SAE:
18217 fputs ("{rn-sae}", file);
18218 break;
18219 case ROUND_NEG_INF | ROUND_SAE:
18220 fputs ("{rd-sae}", file);
18221 break;
18222 case ROUND_POS_INF | ROUND_SAE:
18223 fputs ("{ru-sae}", file);
18224 break;
18225 case ROUND_ZERO | ROUND_SAE:
18226 fputs ("{rz-sae}", file);
18227 break;
18228 default:
18229 gcc_unreachable ();
18232 if (ASSEMBLER_DIALECT == ASM_ATT)
18233 fputs (", ", file);
18235 return;
18237 case '*':
18238 if (ASSEMBLER_DIALECT == ASM_ATT)
18239 putc ('*', file);
18240 return;
18242 case '&':
18244 const char *name = get_some_local_dynamic_name ();
18245 if (name == NULL)
18246 output_operand_lossage ("'%%&' used without any "
18247 "local dynamic TLS references");
18248 else
18249 assemble_name (file, name);
18250 return;
18253 case '+':
18255 rtx x;
18257 if (!optimize
18258 || optimize_function_for_size_p (cfun)
18259 || !TARGET_BRANCH_PREDICTION_HINTS)
18260 return;
18262 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18263 if (x)
18265 int pred_val = XINT (x, 0);
18267 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18268 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18270 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18271 bool cputaken
18272 = final_forward_branch_p (current_output_insn) == 0;
18274 /* Emit hints only in the case default branch prediction
18275 heuristics would fail. */
18276 if (taken != cputaken)
18278 /* We use 3e (DS) prefix for taken branches and
18279 2e (CS) prefix for not taken branches. */
18280 if (taken)
18281 fputs ("ds ; ", file);
18282 else
18283 fputs ("cs ; ", file);
18287 return;
18290 case ';':
18291 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18292 putc (';', file);
18293 #endif
18294 return;
18296 case '@':
18297 if (ASSEMBLER_DIALECT == ASM_ATT)
18298 putc ('%', file);
18300 /* The kernel uses a different segment register for performance
18301 reasons; a system call would not have to trash the userspace
18302 segment register, which would be expensive. */
18303 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
18304 fputs ("fs", file);
18305 else
18306 fputs ("gs", file);
18307 return;
18309 case '~':
18310 putc (TARGET_AVX2 ? 'i' : 'f', file);
18311 return;
18313 case '^':
18314 if (TARGET_64BIT && Pmode != word_mode)
18315 fputs ("addr32 ", file);
18316 return;
18318 case '!':
18319 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18320 fputs ("bnd ", file);
18321 return;
18323 default:
18324 output_operand_lossage ("invalid operand code '%c'", code);
18328 if (REG_P (x))
18329 print_reg (x, code, file);
18331 else if (MEM_P (x))
18333 rtx addr = XEXP (x, 0);
18335 /* No `byte ptr' prefix for call instructions ... */
18336 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18338 machine_mode mode = GET_MODE (x);
18339 const char *size;
18341 /* Check for explicit size override codes. */
18342 if (code == 'b')
18343 size = "BYTE";
18344 else if (code == 'w')
18345 size = "WORD";
18346 else if (code == 'k')
18347 size = "DWORD";
18348 else if (code == 'q')
18349 size = "QWORD";
18350 else if (code == 'x')
18351 size = "XMMWORD";
18352 else if (code == 't')
18353 size = "YMMWORD";
18354 else if (code == 'g')
18355 size = "ZMMWORD";
18356 else if (mode == BLKmode)
18357 /* ... or BLKmode operands, when not overridden. */
18358 size = NULL;
18359 else
18360 switch (GET_MODE_SIZE (mode))
18362 case 1: size = "BYTE"; break;
18363 case 2: size = "WORD"; break;
18364 case 4: size = "DWORD"; break;
18365 case 8: size = "QWORD"; break;
18366 case 12: size = "TBYTE"; break;
18367 case 16:
18368 if (mode == XFmode)
18369 size = "TBYTE";
18370 else
18371 size = "XMMWORD";
18372 break;
18373 case 32: size = "YMMWORD"; break;
18374 case 64: size = "ZMMWORD"; break;
18375 default:
18376 gcc_unreachable ();
18378 if (size)
18380 fputs (size, file);
18381 fputs (" PTR ", file);
18385 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18386 output_operand_lossage ("invalid constraints for operand");
18387 else
18388 ix86_print_operand_address_as
18389 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18392 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18394 long l;
18396 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18398 if (ASSEMBLER_DIALECT == ASM_ATT)
18399 putc ('$', file);
18400 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18401 if (code == 'q')
18402 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18403 (unsigned long long) (int) l);
18404 else
18405 fprintf (file, "0x%08x", (unsigned int) l);
18408 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18410 long l[2];
18412 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18414 if (ASSEMBLER_DIALECT == ASM_ATT)
18415 putc ('$', file);
18416 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18419 /* These float cases don't actually occur as immediate operands. */
18420 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18422 char dstr[30];
18424 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18425 fputs (dstr, file);
18428 else
18430 /* We have patterns that allow zero sets of memory, for instance.
18431 In 64-bit mode, we should probably support all 8-byte vectors,
18432 since we can in fact encode that into an immediate. */
18433 if (GET_CODE (x) == CONST_VECTOR)
18435 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18436 x = const0_rtx;
18439 if (code != 'P' && code != 'p')
18441 if (CONST_INT_P (x))
18443 if (ASSEMBLER_DIALECT == ASM_ATT)
18444 putc ('$', file);
18446 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18447 || GET_CODE (x) == LABEL_REF)
18449 if (ASSEMBLER_DIALECT == ASM_ATT)
18450 putc ('$', file);
18451 else
18452 fputs ("OFFSET FLAT:", file);
18455 if (CONST_INT_P (x))
18456 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18457 else if (flag_pic || MACHOPIC_INDIRECT)
18458 output_pic_addr_const (file, x, code);
18459 else
18460 output_addr_const (file, x);
18464 static bool
18465 ix86_print_operand_punct_valid_p (unsigned char code)
18467 return (code == '@' || code == '*' || code == '+' || code == '&'
18468 || code == ';' || code == '~' || code == '^' || code == '!');
18471 /* Print a memory operand whose address is ADDR. */
18473 static void
18474 ix86_print_operand_address_as (FILE *file, rtx addr,
18475 addr_space_t as, bool no_rip)
18477 struct ix86_address parts;
18478 rtx base, index, disp;
18479 int scale;
18480 int ok;
18481 bool vsib = false;
18482 int code = 0;
18484 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18486 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18487 gcc_assert (parts.index == NULL_RTX);
18488 parts.index = XVECEXP (addr, 0, 1);
18489 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18490 addr = XVECEXP (addr, 0, 0);
18491 vsib = true;
18493 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18495 gcc_assert (TARGET_64BIT);
18496 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18497 code = 'q';
18499 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18501 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18502 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18503 if (parts.base != NULL_RTX)
18505 parts.index = parts.base;
18506 parts.scale = 1;
18508 parts.base = XVECEXP (addr, 0, 0);
18509 addr = XVECEXP (addr, 0, 0);
18511 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18513 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18514 gcc_assert (parts.index == NULL_RTX);
18515 parts.index = XVECEXP (addr, 0, 1);
18516 addr = XVECEXP (addr, 0, 0);
18518 else
18519 ok = ix86_decompose_address (addr, &parts);
18521 gcc_assert (ok);
18523 base = parts.base;
18524 index = parts.index;
18525 disp = parts.disp;
18526 scale = parts.scale;
18528 if (ADDR_SPACE_GENERIC_P (as))
18529 as = parts.seg;
18530 else
18531 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18533 if (!ADDR_SPACE_GENERIC_P (as))
18535 const char *string;
18537 if (as == ADDR_SPACE_SEG_FS)
18538 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18539 else if (as == ADDR_SPACE_SEG_GS)
18540 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18541 else
18542 gcc_unreachable ();
18543 fputs (string, file);
18546 /* Use one byte shorter RIP relative addressing for 64bit mode. */
18547 if (TARGET_64BIT && !base && !index && !no_rip)
18549 rtx symbol = disp;
18551 if (GET_CODE (disp) == CONST
18552 && GET_CODE (XEXP (disp, 0)) == PLUS
18553 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18554 symbol = XEXP (XEXP (disp, 0), 0);
18556 if (GET_CODE (symbol) == LABEL_REF
18557 || (GET_CODE (symbol) == SYMBOL_REF
18558 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18559 base = pc_rtx;
18562 if (!base && !index)
18564 /* Displacement only requires special attention. */
18565 if (CONST_INT_P (disp))
18567 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == ADDR_SPACE_GENERIC)
18568 fputs ("ds:", file);
18569 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18571 /* Load the external function address via the GOT slot to avoid PLT. */
18572 else if (GET_CODE (disp) == CONST
18573 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18574 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18575 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18576 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18577 output_pic_addr_const (file, disp, 0);
18578 else if (flag_pic)
18579 output_pic_addr_const (file, disp, 0);
18580 else
18581 output_addr_const (file, disp);
18583 else
18585 /* Print SImode register names to force addr32 prefix. */
18586 if (SImode_address_operand (addr, VOIDmode))
18588 if (flag_checking)
18590 gcc_assert (TARGET_64BIT);
18591 switch (GET_CODE (addr))
18593 case SUBREG:
18594 gcc_assert (GET_MODE (addr) == SImode);
18595 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18596 break;
18597 case ZERO_EXTEND:
18598 case AND:
18599 gcc_assert (GET_MODE (addr) == DImode);
18600 break;
18601 default:
18602 gcc_unreachable ();
18605 gcc_assert (!code);
18606 code = 'k';
18608 else if (code == 0
18609 && TARGET_X32
18610 && disp
18611 && CONST_INT_P (disp)
18612 && INTVAL (disp) < -16*1024*1024)
18614 /* X32 runs in 64-bit mode, where displacement, DISP, in
18615 address DISP(%r64), is encoded as 32-bit immediate sign-
18616 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18617 address is %r64 + 0xffffffffbffffd00. When %r64 <
18618 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18619 which is invalid for x32. The correct address is %r64
18620 - 0x40000300 == 0xf7ffdd64. To properly encode
18621 -0x40000300(%r64) for x32, we zero-extend negative
18622 displacement by forcing addr32 prefix which truncates
18623 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18624 zero-extend all negative displacements, including -1(%rsp).
18625 However, for small negative displacements, sign-extension
18626 won't cause overflow. We only zero-extend negative
18627 displacements if they are < -16*1024*1024, which is also the bound
18628 used to check legitimate address displacements for PIC. */
18629 code = 'k';
18632 if (ASSEMBLER_DIALECT == ASM_ATT)
18634 if (disp)
18636 if (flag_pic)
18637 output_pic_addr_const (file, disp, 0);
18638 else if (GET_CODE (disp) == LABEL_REF)
18639 output_asm_label (disp);
18640 else
18641 output_addr_const (file, disp);
18644 putc ('(', file);
18645 if (base)
18646 print_reg (base, code, file);
18647 if (index)
18649 putc (',', file);
18650 print_reg (index, vsib ? 0 : code, file);
18651 if (scale != 1 || vsib)
18652 fprintf (file, ",%d", scale);
18654 putc (')', file);
18656 else
18658 rtx offset = NULL_RTX;
18660 if (disp)
18662 /* Pull out the offset of a symbol; print any symbol itself. */
18663 if (GET_CODE (disp) == CONST
18664 && GET_CODE (XEXP (disp, 0)) == PLUS
18665 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18667 offset = XEXP (XEXP (disp, 0), 1);
18668 disp = gen_rtx_CONST (VOIDmode,
18669 XEXP (XEXP (disp, 0), 0));
18672 if (flag_pic)
18673 output_pic_addr_const (file, disp, 0);
18674 else if (GET_CODE (disp) == LABEL_REF)
18675 output_asm_label (disp);
18676 else if (CONST_INT_P (disp))
18677 offset = disp;
18678 else
18679 output_addr_const (file, disp);
18682 putc ('[', file);
18683 if (base)
18685 print_reg (base, code, file);
18686 if (offset)
18688 if (INTVAL (offset) >= 0)
18689 putc ('+', file);
18690 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18693 else if (offset)
18694 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18695 else
18696 putc ('0', file);
18698 if (index)
18700 putc ('+', file);
18701 print_reg (index, vsib ? 0 : code, file);
18702 if (scale != 1 || vsib)
18703 fprintf (file, "*%d", scale);
18705 putc (']', file);
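/* To illustrate the two dialects handled above: the address
   base + index*scale + disp is printed as "disp(%base,%index,scale)"
   in AT&T syntax, e.g. "-8(%rbp,%rax,4)", and as
   "[base+index*scale+disp]" in Intel syntax, e.g. "[rbp+rax*4-8]"
   (register and displacement values illustrative).  */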
18710 static void
18711 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18713 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18716 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18718 static bool
18719 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18721 rtx op;
18723 if (GET_CODE (x) != UNSPEC)
18724 return false;
18726 op = XVECEXP (x, 0, 0);
18727 switch (XINT (x, 1))
18729 case UNSPEC_GOTTPOFF:
18730 output_addr_const (file, op);
18731 /* FIXME: This might be @TPOFF in Sun ld. */
18732 fputs ("@gottpoff", file);
18733 break;
18734 case UNSPEC_TPOFF:
18735 output_addr_const (file, op);
18736 fputs ("@tpoff", file);
18737 break;
18738 case UNSPEC_NTPOFF:
18739 output_addr_const (file, op);
18740 if (TARGET_64BIT)
18741 fputs ("@tpoff", file);
18742 else
18743 fputs ("@ntpoff", file);
18744 break;
18745 case UNSPEC_DTPOFF:
18746 output_addr_const (file, op);
18747 fputs ("@dtpoff", file);
18748 break;
18749 case UNSPEC_GOTNTPOFF:
18750 output_addr_const (file, op);
18751 if (TARGET_64BIT)
18752 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18753 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18754 else
18755 fputs ("@gotntpoff", file);
18756 break;
18757 case UNSPEC_INDNTPOFF:
18758 output_addr_const (file, op);
18759 fputs ("@indntpoff", file);
18760 break;
18761 #if TARGET_MACHO
18762 case UNSPEC_MACHOPIC_OFFSET:
18763 output_addr_const (file, op);
18764 putc ('-', file);
18765 machopic_output_function_base_name (file);
18766 break;
18767 #endif
18769 case UNSPEC_STACK_CHECK:
18771 int offset;
18773 gcc_assert (flag_split_stack);
18775 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
18776 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
18777 #else
18778 gcc_unreachable ();
18779 #endif
18781 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
18783 break;
18785 default:
18786 return false;
18789 return true;
18792 /* Split one or more double-mode RTL references into pairs of half-mode
18793 references. The RTL can be REG, offsettable MEM, integer constant, or
18794 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18795 split and "num" is its length. lo_half and hi_half are output arrays
18796 that parallel "operands". */
18798 void
18799 split_double_mode (machine_mode mode, rtx operands[],
18800 int num, rtx lo_half[], rtx hi_half[])
18802 machine_mode half_mode;
18803 unsigned int byte;
18805 switch (mode)
18807 case TImode:
18808 half_mode = DImode;
18809 break;
18810 case DImode:
18811 half_mode = SImode;
18812 break;
18813 default:
18814 gcc_unreachable ();
18817 byte = GET_MODE_SIZE (half_mode);
18819 while (num--)
18821 rtx op = operands[num];
18823 /* simplify_subreg refuses to split volatile memory addresses,
18824 but we still have to handle them. */
18825 if (MEM_P (op))
18827 lo_half[num] = adjust_address (op, half_mode, 0);
18828 hi_half[num] = adjust_address (op, half_mode, byte);
18830 else
18832 lo_half[num] = simplify_gen_subreg (half_mode, op,
18833 GET_MODE (op) == VOIDmode
18834 ? mode : GET_MODE (op), 0);
18835 hi_half[num] = simplify_gen_subreg (half_mode, op,
18836 GET_MODE (op) == VOIDmode
18837 ? mode : GET_MODE (op), byte);
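/* As an illustrative use of split_double_mode: on a 32-bit target a
   DImode operand pair is split into SImode lo/hi halves, so a DImode
   addition can later be emitted as an "addl" of the low halves
   followed by an "adcl" of the high halves.  */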
18842 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18843 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18844 is the expression of the binary operation. The output may either be
18845 emitted here, or returned to the caller, like all output_* functions.
18847 There is no guarantee that the operands are the same mode, as they
18848 might be within FLOAT or FLOAT_EXTEND expressions. */
18850 #ifndef SYSV386_COMPAT
18851 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18852 wants to fix the assemblers because that causes incompatibility
18853 with gcc. No-one wants to fix gcc because that causes
18854 incompatibility with assemblers... You can use the option of
18855 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18856 #define SYSV386_COMPAT 1
18857 #endif
18859 const char *
18860 output_387_binary_op (rtx_insn *insn, rtx *operands)
18862 static char buf[40];
18863 const char *p;
18864 const char *ssep;
18865 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
18867 /* Even if we do not want to check the inputs, this documents the input
18868 constraints, which helps in understanding the following code. */
18869 if (flag_checking)
18871 if (STACK_REG_P (operands[0])
18872 && ((REG_P (operands[1])
18873 && REGNO (operands[0]) == REGNO (operands[1])
18874 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18875 || (REG_P (operands[2])
18876 && REGNO (operands[0]) == REGNO (operands[2])
18877 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18878 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18879 ; /* ok */
18880 else
18881 gcc_assert (is_sse);
18884 switch (GET_CODE (operands[3]))
18886 case PLUS:
18887 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18888 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18889 p = "fiadd";
18890 else
18891 p = "fadd";
18892 ssep = "vadd";
18893 break;
18895 case MINUS:
18896 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18897 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18898 p = "fisub";
18899 else
18900 p = "fsub";
18901 ssep = "vsub";
18902 break;
18904 case MULT:
18905 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18906 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18907 p = "fimul";
18908 else
18909 p = "fmul";
18910 ssep = "vmul";
18911 break;
18913 case DIV:
18914 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18915 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18916 p = "fidiv";
18917 else
18918 p = "fdiv";
18919 ssep = "vdiv";
18920 break;
18922 default:
18923 gcc_unreachable ();
18926 if (is_sse)
18928 if (TARGET_AVX)
18930 strcpy (buf, ssep);
18931 if (GET_MODE (operands[0]) == SFmode)
18932 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
18933 else
18934 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
18936 else
18938 strcpy (buf, ssep + 1);
18939 if (GET_MODE (operands[0]) == SFmode)
18940 strcat (buf, "ss\t{%2, %0|%0, %2}");
18941 else
18942 strcat (buf, "sd\t{%2, %0|%0, %2}");
18944 return buf;
18946 strcpy (buf, p);
18948 switch (GET_CODE (operands[3]))
18950 case MULT:
18951 case PLUS:
18952 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18953 std::swap (operands[1], operands[2]);
18955 /* Now we know operands[0] == operands[1]. */
18957 if (MEM_P (operands[2]))
18959 p = "%Z2\t%2";
18960 break;
18963 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18965 if (STACK_TOP_P (operands[0]))
18966 /* How is it that we are storing to a dead operand[2]?
18967 Well, presumably operands[1] is dead too. We can't
18968 store the result to st(0) as st(0) gets popped on this
18969 instruction. Instead store to operands[2] (which I
18970 think has to be st(1)). st(1) will be popped later.
18971 gcc <= 2.8.1 didn't have this check and generated
18972 assembly code that the Unixware assembler rejected. */
18973 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18974 else
18975 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18976 break;
18979 if (STACK_TOP_P (operands[0]))
18980 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18981 else
18982 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18983 break;
18985 case MINUS:
18986 case DIV:
18987 if (MEM_P (operands[1]))
18989 p = "r%Z1\t%1";
18990 break;
18993 if (MEM_P (operands[2]))
18995 p = "%Z2\t%2";
18996 break;
18999 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19001 #if SYSV386_COMPAT
19002 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19003 derived assemblers, confusingly reverse the direction of
19004 the operation for fsub{r} and fdiv{r} when the
19005 destination register is not st(0). The Intel assembler
19006 doesn't have this brain damage. Read !SYSV386_COMPAT to
19007 figure out what the hardware really does. */
19008 if (STACK_TOP_P (operands[0]))
19009 p = "{p\t%0, %2|rp\t%2, %0}";
19010 else
19011 p = "{rp\t%2, %0|p\t%0, %2}";
19012 #else
19013 if (STACK_TOP_P (operands[0]))
19014 /* As above for fmul/fadd, we can't store to st(0). */
19015 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19016 else
19017 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19018 #endif
19019 break;
19022 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19024 #if SYSV386_COMPAT
19025 if (STACK_TOP_P (operands[0]))
19026 p = "{rp\t%0, %1|p\t%1, %0}";
19027 else
19028 p = "{p\t%1, %0|rp\t%0, %1}";
19029 #else
19030 if (STACK_TOP_P (operands[0]))
19031 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
19032 else
19033 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
19034 #endif
19035 break;
19038 if (STACK_TOP_P (operands[0]))
19040 if (STACK_TOP_P (operands[1]))
19041 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19042 else
19043 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
19044 break;
19046 else if (STACK_TOP_P (operands[1]))
19048 #if SYSV386_COMPAT
19049 p = "{\t%1, %0|r\t%0, %1}";
19050 #else
19051 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
19052 #endif
19054 else
19056 #if SYSV386_COMPAT
19057 p = "{r\t%2, %0|\t%0, %2}";
19058 #else
19059 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19060 #endif
19062 break;
19064 default:
19065 gcc_unreachable ();
19068 strcat (buf, p);
19069 return buf;
19072 /* Return needed mode for entity in optimize_mode_switching pass. */
19074 static int
19075 ix86_dirflag_mode_needed (rtx_insn *insn)
19077 if (CALL_P (insn))
19079 if (cfun->machine->func_type == TYPE_NORMAL)
19080 return X86_DIRFLAG_ANY;
19081 else
19082 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
19083 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19086 if (recog_memoized (insn) < 0)
19087 return X86_DIRFLAG_ANY;
19089 if (get_attr_type (insn) == TYPE_STR)
19091 /* Emit cld instruction if stringops are used in the function. */
19092 if (cfun->machine->func_type == TYPE_NORMAL)
19093 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19094 else
19095 return X86_DIRFLAG_RESET;
19098 return X86_DIRFLAG_ANY;
19101 /* Check if a 256bit AVX register is referenced inside of EXP. */
19103 static bool
19104 ix86_check_avx256_register (const_rtx exp)
19106 if (SUBREG_P (exp))
19107 exp = SUBREG_REG (exp);
19109 return (REG_P (exp)
19110 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
19113 /* Return needed mode for entity in optimize_mode_switching pass. */
19115 static int
19116 ix86_avx_u128_mode_needed (rtx_insn *insn)
19118 if (CALL_P (insn))
19120 rtx link;
19122 /* Needed mode is set to AVX_U128_CLEAN if there are
19123 no 256bit modes used in function arguments. */
19124 for (link = CALL_INSN_FUNCTION_USAGE (insn);
19125 link;
19126 link = XEXP (link, 1))
19128 if (GET_CODE (XEXP (link, 0)) == USE)
19130 rtx arg = XEXP (XEXP (link, 0), 0);
19132 if (ix86_check_avx256_register (arg))
19133 return AVX_U128_DIRTY;
19137 return AVX_U128_CLEAN;
19140 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
19141 changes state only when a 256bit register is written to, but we need
19142 to prevent the compiler from moving the optimal insertion point above
19143 an eventual read from a 256bit register. */
19144 subrtx_iterator::array_type array;
19145 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
19146 if (ix86_check_avx256_register (*iter))
19147 return AVX_U128_DIRTY;
19149 return AVX_U128_ANY;
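/* Background for the AVX_U128 entity: the mode-switching pass uses the
   CLEAN/DIRTY states computed above to decide where a vzeroupper should
   be inserted (see ix86_emit_mode_set below), so that the upper halves
   of the YMM registers are known to be clean around calls and returns
   and the AVX/SSE transition penalty is avoided.  */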
19152 /* Return mode that i387 must be switched into
19153 prior to the execution of insn. */
19155 static int
19156 ix86_i387_mode_needed (int entity, rtx_insn *insn)
19158 enum attr_i387_cw mode;
19160 /* The mode UNINITIALIZED is used to store the control word after a
19161 function call or ASM pattern. The mode ANY specifies that the function
19162 has no requirements on the control word and makes no changes to the
19163 bits we are interested in. */
19165 if (CALL_P (insn)
19166 || (NONJUMP_INSN_P (insn)
19167 && (asm_noperands (PATTERN (insn)) >= 0
19168 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
19169 return I387_CW_UNINITIALIZED;
19171 if (recog_memoized (insn) < 0)
19172 return I387_CW_ANY;
19174 mode = get_attr_i387_cw (insn);
19176 switch (entity)
19178 case I387_TRUNC:
19179 if (mode == I387_CW_TRUNC)
19180 return mode;
19181 break;
19183 case I387_FLOOR:
19184 if (mode == I387_CW_FLOOR)
19185 return mode;
19186 break;
19188 case I387_CEIL:
19189 if (mode == I387_CW_CEIL)
19190 return mode;
19191 break;
19193 case I387_MASK_PM:
19194 if (mode == I387_CW_MASK_PM)
19195 return mode;
19196 break;
19198 default:
19199 gcc_unreachable ();
19202 return I387_CW_ANY;
19205 /* Return mode that entity must be switched into
19206 prior to the execution of insn. */
19208 static int
19209 ix86_mode_needed (int entity, rtx_insn *insn)
19211 switch (entity)
19213 case X86_DIRFLAG:
19214 return ix86_dirflag_mode_needed (insn);
19215 case AVX_U128:
19216 return ix86_avx_u128_mode_needed (insn);
19217 case I387_TRUNC:
19218 case I387_FLOOR:
19219 case I387_CEIL:
19220 case I387_MASK_PM:
19221 return ix86_i387_mode_needed (entity, insn);
19222 default:
19223 gcc_unreachable ();
19225 return 0;
19228 /* Check if a 256bit AVX register is referenced in stores. */
19230 static void
19231 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
19233 if (ix86_check_avx256_register (dest))
19235 bool *used = (bool *) data;
19236 *used = true;
19240 /* Calculate mode of upper 128bit AVX registers after the insn. */
19242 static int
19243 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19245 rtx pat = PATTERN (insn);
19247 if (vzeroupper_operation (pat, VOIDmode)
19248 || vzeroall_operation (pat, VOIDmode))
19249 return AVX_U128_CLEAN;
19251 /* We know that state is clean after CALL insn if there are no
19252 256bit registers used in the function return register. */
19253 if (CALL_P (insn))
19255 bool avx_reg256_found = false;
19256 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
19258 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19261 /* Otherwise, return current mode. Remember that if insn
19262 references AVX 256bit registers, the mode was already changed
19263 to DIRTY from MODE_NEEDED. */
19264 return mode;
19267 /* Return the mode that an insn results in. */
19269 static int
19270 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19272 switch (entity)
19274 case X86_DIRFLAG:
19275 return mode;
19276 case AVX_U128:
19277 return ix86_avx_u128_mode_after (mode, insn);
19278 case I387_TRUNC:
19279 case I387_FLOOR:
19280 case I387_CEIL:
19281 case I387_MASK_PM:
19282 return mode;
19283 default:
19284 gcc_unreachable ();
19288 static int
19289 ix86_dirflag_mode_entry (void)
19291 /* With TARGET_CLD, or in an interrupt handler, we can't assume the
19292 direction flag state at function entry. */
19293 if (TARGET_CLD
19294 || cfun->machine->func_type != TYPE_NORMAL)
19295 return X86_DIRFLAG_ANY;
19297 return X86_DIRFLAG_RESET;
19300 static int
19301 ix86_avx_u128_mode_entry (void)
19303 tree arg;
19305 /* Entry mode is set to AVX_U128_DIRTY if there are
19306 256bit modes used in function arguments. */
19307 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19308 arg = TREE_CHAIN (arg))
19310 rtx incoming = DECL_INCOMING_RTL (arg);
19312 if (incoming && ix86_check_avx256_register (incoming))
19313 return AVX_U128_DIRTY;
19316 return AVX_U128_CLEAN;
19319 /* Return a mode that ENTITY is assumed to be
19320 switched to at function entry. */
19322 static int
19323 ix86_mode_entry (int entity)
19325 switch (entity)
19327 case X86_DIRFLAG:
19328 return ix86_dirflag_mode_entry ();
19329 case AVX_U128:
19330 return ix86_avx_u128_mode_entry ();
19331 case I387_TRUNC:
19332 case I387_FLOOR:
19333 case I387_CEIL:
19334 case I387_MASK_PM:
19335 return I387_CW_ANY;
19336 default:
19337 gcc_unreachable ();
19341 static int
19342 ix86_avx_u128_mode_exit (void)
19344 rtx reg = crtl->return_rtx;
19346 /* Exit mode is set to AVX_U128_DIRTY if there are
19347 256bit modes used in the function return register. */
19348 if (reg && ix86_check_avx256_register (reg))
19349 return AVX_U128_DIRTY;
19351 return AVX_U128_CLEAN;
19354 /* Return a mode that ENTITY is assumed to be
19355 switched to at function exit. */
19357 static int
19358 ix86_mode_exit (int entity)
19360 switch (entity)
19362 case X86_DIRFLAG:
19363 return X86_DIRFLAG_ANY;
19364 case AVX_U128:
19365 return ix86_avx_u128_mode_exit ();
19366 case I387_TRUNC:
19367 case I387_FLOOR:
19368 case I387_CEIL:
19369 case I387_MASK_PM:
19370 return I387_CW_ANY;
19371 default:
19372 gcc_unreachable ();
19376 static int
19377 ix86_mode_priority (int, int n)
19379 return n;
19382 /* Output code to initialize control word copies used by trunc?f?i and
19383 rounding patterns. STORED_MODE holds the current control word, while
19384 NEW_MODE holds the new control word selected by MODE. */
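/* For reference: the x87 rounding control (RC) field occupies bits 11:10
   of the control word: 00 = round to nearest, 01 = round down, 10 =
   round up, 11 = truncate; hence the 0x0400, 0x0800 and 0x0c00 masks
   used below.  Bit 5 (0x0020) is the precision exception mask used for
   nearbyint.  */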
19386 static void
19387 emit_i387_cw_initialization (int mode)
19389 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19390 rtx new_mode;
19392 enum ix86_stack_slot slot;
19394 rtx reg = gen_reg_rtx (HImode);
19396 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19397 emit_move_insn (reg, copy_rtx (stored_mode));
19399 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19400 || optimize_insn_for_size_p ())
19402 switch (mode)
19404 case I387_CW_TRUNC:
19405 /* round toward zero (truncate) */
19406 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19407 slot = SLOT_CW_TRUNC;
19408 break;
19410 case I387_CW_FLOOR:
19411 /* round down toward -oo */
19412 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19413 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19414 slot = SLOT_CW_FLOOR;
19415 break;
19417 case I387_CW_CEIL:
19418 /* round up toward +oo */
19419 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19420 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19421 slot = SLOT_CW_CEIL;
19422 break;
19424 case I387_CW_MASK_PM:
19425 /* mask precision exception for nearbyint() */
19426 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19427 slot = SLOT_CW_MASK_PM;
19428 break;
19430 default:
19431 gcc_unreachable ();
19434 else
19436 switch (mode)
19438 case I387_CW_TRUNC:
19439 /* round toward zero (truncate) */
19440 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19441 slot = SLOT_CW_TRUNC;
19442 break;
19444 case I387_CW_FLOOR:
19445 /* round down toward -oo */
19446 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19447 slot = SLOT_CW_FLOOR;
19448 break;
19450 case I387_CW_CEIL:
19451 /* round up toward +oo */
19452 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19453 slot = SLOT_CW_CEIL;
19454 break;
19456 case I387_CW_MASK_PM:
19457 /* mask precision exception for nearbyint() */
19458 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19459 slot = SLOT_CW_MASK_PM;
19460 break;
19462 default:
19463 gcc_unreachable ();
19467 gcc_assert (slot < MAX_386_STACK_LOCALS);
19469 new_mode = assign_386_stack_local (HImode, slot);
19470 emit_move_insn (new_mode, reg);
19473 /* Emit vzeroupper. */
19475 void
19476 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19478 int i;
19480 /* Cancel automatic vzeroupper insertion if there are
19481 live call-saved SSE registers at the insertion point. */
19483 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19484 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19485 return;
19487 if (TARGET_64BIT)
19488 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19489 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19490 return;
19492 emit_insn (gen_avx_vzeroupper ());
19497 /* Generate one or more insns to set ENTITY to MODE. REGS_LIVE
19498 is the set of hard registers live at the point where the insn(s)
19499 are to be inserted. */
19501 static void
19502 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19503 HARD_REG_SET regs_live)
19505 switch (entity)
19507 case X86_DIRFLAG:
19508 if (mode == X86_DIRFLAG_RESET)
19509 emit_insn (gen_cld ());
19510 break;
19511 case AVX_U128:
19512 if (mode == AVX_U128_CLEAN)
19513 ix86_avx_emit_vzeroupper (regs_live);
19514 break;
19515 case I387_TRUNC:
19516 case I387_FLOOR:
19517 case I387_CEIL:
19518 case I387_MASK_PM:
19519 if (mode != I387_CW_ANY
19520 && mode != I387_CW_UNINITIALIZED)
19521 emit_i387_cw_initialization (mode);
19522 break;
19523 default:
19524 gcc_unreachable ();
19528 /* Output code for INSN to convert a float to a signed int. OPERANDS
19529 are the insn operands. The output may be [HSD]Imode and the input
19530 operand may be [SDX]Fmode. */
19532 const char *
19533 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19535 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19536 int dimode_p = GET_MODE (operands[0]) == DImode;
19537 int round_mode = get_attr_i387_cw (insn);
19539 /* Jump through a hoop or two for DImode, since the hardware has no
19540 non-popping instruction. We used to do this a different way, but
19541 that was somewhat fragile and broke with post-reload splitters. */
19542 if ((dimode_p || fisttp) && !stack_top_dies)
19543 output_asm_insn ("fld\t%y1", operands);
19545 gcc_assert (STACK_TOP_P (operands[1]));
19546 gcc_assert (MEM_P (operands[0]));
19547 gcc_assert (GET_MODE (operands[1]) != TFmode);
19549 if (fisttp)
19550 output_asm_insn ("fisttp%Z0\t%0", operands);
19551 else
19553 if (round_mode != I387_CW_ANY)
19554 output_asm_insn ("fldcw\t%3", operands);
19555 if (stack_top_dies || dimode_p)
19556 output_asm_insn ("fistp%Z0\t%0", operands);
19557 else
19558 output_asm_insn ("fist%Z0\t%0", operands);
19559 if (round_mode != I387_CW_ANY)
19560 output_asm_insn ("fldcw\t%2", operands);
19563 return "";
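/* For example, an SImode truncation that must change the rounding mode
   is emitted roughly as

       fldcw  %3       (load the truncating control word)
       fistpl %0       (convert, store and pop)
       fldcw  %2       (restore the original control word)

   whereas the SSE3 fisttp form avoids the control word switch
   entirely.  */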
19566 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19567 have the values zero or one, indicates the ffreep insn's operand
19568 from the OPERANDS array. */
19570 static const char *
19571 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19573 if (TARGET_USE_FFREEP)
19574 #ifdef HAVE_AS_IX86_FFREEP
19575 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19576 #else
19578 static char retval[32];
19579 int regno = REGNO (operands[opno]);
19581 gcc_assert (STACK_REGNO_P (regno));
19583 regno -= FIRST_STACK_REG;
19585 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19586 return retval;
19588 #endif
19590 return opno ? "fstp\t%y1" : "fstp\t%y0";
19594 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19595 should be used. UNORDERED_P is true when fucom should be used. */
19597 const char *
19598 output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p)
19600 int stack_top_dies;
19601 rtx cmp_op0, cmp_op1;
19602 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
19604 if (eflags_p)
19606 cmp_op0 = operands[0];
19607 cmp_op1 = operands[1];
19609 else
19611 cmp_op0 = operands[1];
19612 cmp_op1 = operands[2];
19615 if (is_sse)
19617 if (GET_MODE (operands[0]) == SFmode)
19618 if (unordered_p)
19619 return "%vucomiss\t{%1, %0|%0, %1}";
19620 else
19621 return "%vcomiss\t{%1, %0|%0, %1}";
19622 else
19623 if (unordered_p)
19624 return "%vucomisd\t{%1, %0|%0, %1}";
19625 else
19626 return "%vcomisd\t{%1, %0|%0, %1}";
19629 gcc_assert (STACK_TOP_P (cmp_op0));
19631 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19633 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
19635 if (stack_top_dies)
19637 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
19638 return output_387_ffreep (operands, 1);
19640 else
19641 return "ftst\n\tfnstsw\t%0";
19644 if (STACK_REG_P (cmp_op1)
19645 && stack_top_dies
19646 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
19647 && REGNO (cmp_op1) != FIRST_STACK_REG)
19649 /* If the top of the 387 stack dies, and the other operand
19650 is also a stack register that dies, then this must be an
19651 `fcompp' float compare. */
19653 if (eflags_p)
19655 /* There is no double popping fcomi variant. Fortunately,
19656 eflags is immune to the fstp's cc clobbering. */
19657 if (unordered_p)
19658 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
19659 else
19660 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
19661 return output_387_ffreep (operands, 0);
19663 else
19665 if (unordered_p)
19666 return "fucompp\n\tfnstsw\t%0";
19667 else
19668 return "fcompp\n\tfnstsw\t%0";
19671 else
19673 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
19675 static const char * const alt[16] =
19677 "fcom%Z2\t%y2\n\tfnstsw\t%0",
19678 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
19679 "fucom%Z2\t%y2\n\tfnstsw\t%0",
19680 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
19682 "ficom%Z2\t%y2\n\tfnstsw\t%0",
19683 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
19684 NULL,
19685 NULL,
19687 "fcomi\t{%y1, %0|%0, %y1}",
19688 "fcomip\t{%y1, %0|%0, %y1}",
19689 "fucomi\t{%y1, %0|%0, %y1}",
19690 "fucomip\t{%y1, %0|%0, %y1}",
19692 NULL,
19693 NULL,
19694 NULL,
19695 NULL
19698 int mask;
19699 const char *ret;
19701 mask = eflags_p << 3;
19702 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
19703 mask |= unordered_p << 1;
19704 mask |= stack_top_dies;
19706 gcc_assert (mask < 16);
19707 ret = alt[mask];
19708 gcc_assert (ret);
19710 return ret;
19714 void
19715 ix86_output_addr_vec_elt (FILE *file, int value)
19717 const char *directive = ASM_LONG;
19719 #ifdef ASM_QUAD
19720 if (TARGET_LP64)
19721 directive = ASM_QUAD;
19722 #else
19723 gcc_assert (!TARGET_64BIT);
19724 #endif
19726 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19729 void
19730 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19732 const char *directive = ASM_LONG;
19734 #ifdef ASM_QUAD
19735 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19736 directive = ASM_QUAD;
19737 #else
19738 gcc_assert (!TARGET_64BIT);
19739 #endif
19740 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19741 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19742 fprintf (file, "%s%s%d-%s%d\n",
19743 directive, LPREFIX, value, LPREFIX, rel);
19744 else if (HAVE_AS_GOTOFF_IN_DATA)
19745 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19746 #if TARGET_MACHO
19747 else if (TARGET_MACHO)
19749 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19750 machopic_output_function_base_name (file);
19751 putc ('\n', file);
19753 #endif
19754 else
19755 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19756 GOT_SYMBOL_NAME, LPREFIX, value);
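/* For the first branch above, LPREFIX is typically ".L" on ELF targets,
   so a 32-bit jump table entry prints e.g. ".long .L5-.L2" (label
   numbers illustrative); the @GOTOFF and Mach-O forms are the PIC
   variants of the same label difference.  */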
19759 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19760 for the target. */
19762 void
19763 ix86_expand_clear (rtx dest)
19765 rtx tmp;
19767 /* We play register width games, which are only valid after reload. */
19768 gcc_assert (reload_completed);
19770 /* Avoid HImode and its attendant prefix byte. */
19771 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19772 dest = gen_rtx_REG (SImode, REGNO (dest));
19773 tmp = gen_rtx_SET (dest, const0_rtx);
19775 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19777 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19778 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19781 emit_insn (tmp);
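/* The xor form generated here (e.g. "xorl %eax, %eax") is shorter than
   "movl $0, %eax" but clobbers the flags, which is why it is wrapped in
   a PARALLEL with a clobber of FLAGS_REG; the plain mov SET is kept only
   when TARGET_USE_MOV0 asks for it and we are not optimizing for
   size.  */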
19784 /* X is an unchanging MEM. If it is a constant pool reference, return
19785 the constant pool rtx, else NULL. */
19788 maybe_get_pool_constant (rtx x)
19790 x = ix86_delegitimize_address (XEXP (x, 0));
19792 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
19793 return get_pool_constant (x);
19795 return NULL_RTX;
19798 void
19799 ix86_expand_move (machine_mode mode, rtx operands[])
19801 rtx op0, op1;
19802 rtx tmp, addend = NULL_RTX;
19803 enum tls_model model;
19805 op0 = operands[0];
19806 op1 = operands[1];
19808 switch (GET_CODE (op1))
19810 case CONST:
19811 tmp = XEXP (op1, 0);
19813 if (GET_CODE (tmp) != PLUS
19814 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19815 break;
19817 op1 = XEXP (tmp, 0);
19818 addend = XEXP (tmp, 1);
19819 /* FALLTHRU */
19821 case SYMBOL_REF:
19822 model = SYMBOL_REF_TLS_MODEL (op1);
19824 if (model)
19825 op1 = legitimize_tls_address (op1, model, true);
19826 else if (ix86_force_load_from_GOT_p (op1))
19828 /* Load the external function address via GOT slot to avoid PLT. */
19829 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19830 (TARGET_64BIT
19831 ? UNSPEC_GOTPCREL
19832 : UNSPEC_GOT));
19833 op1 = gen_rtx_CONST (Pmode, op1);
19834 op1 = gen_const_mem (Pmode, op1);
19835 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19837 else
19839 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19840 if (tmp)
19842 op1 = tmp;
19843 if (!addend)
19844 break;
19846 else
19848 op1 = operands[1];
19849 break;
19853 if (addend)
19855 op1 = force_operand (op1, NULL_RTX);
19856 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19857 op0, 1, OPTAB_DIRECT);
19859 else
19860 op1 = force_operand (op1, op0);
19862 if (op1 == op0)
19863 return;
19865 op1 = convert_to_mode (mode, op1, 1);
19867 default:
19868 break;
19871 if ((flag_pic || MACHOPIC_INDIRECT)
19872 && symbolic_operand (op1, mode))
19874 if (TARGET_MACHO && !TARGET_64BIT)
19876 #if TARGET_MACHO
19877 /* dynamic-no-pic */
19878 if (MACHOPIC_INDIRECT)
19880 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19881 ? op0 : gen_reg_rtx (Pmode);
19882 op1 = machopic_indirect_data_reference (op1, temp);
19883 if (MACHOPIC_PURE)
19884 op1 = machopic_legitimize_pic_address (op1, mode,
19885 temp == op1 ? 0 : temp);
19887 if (op0 != op1 && GET_CODE (op0) != MEM)
19889 rtx insn = gen_rtx_SET (op0, op1);
19890 emit_insn (insn);
19891 return;
19893 if (GET_CODE (op0) == MEM)
19894 op1 = force_reg (Pmode, op1);
19895 else
19897 rtx temp = op0;
19898 if (GET_CODE (temp) != REG)
19899 temp = gen_reg_rtx (Pmode);
19900 temp = legitimize_pic_address (op1, temp);
19901 if (temp == op0)
19902 return;
19903 op1 = temp;
19905 /* dynamic-no-pic */
19906 #endif
19908 else
19910 if (MEM_P (op0))
19911 op1 = force_reg (mode, op1);
19912 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19914 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19915 op1 = legitimize_pic_address (op1, reg);
19916 if (op0 == op1)
19917 return;
19918 op1 = convert_to_mode (mode, op1, 1);
19922 else
19924 if (MEM_P (op0)
19925 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19926 || !push_operand (op0, mode))
19927 && MEM_P (op1))
19928 op1 = force_reg (mode, op1);
19930 if (push_operand (op0, mode)
19931 && ! general_no_elim_operand (op1, mode))
19932 op1 = copy_to_mode_reg (mode, op1);
19934 /* Force large constants in 64bit compilation into a register
19935 so they can be CSEed. */
19936 if (can_create_pseudo_p ()
19937 && (mode == DImode) && TARGET_64BIT
19938 && immediate_operand (op1, mode)
19939 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19940 && !register_operand (op0, mode)
19941 && optimize)
19942 op1 = copy_to_mode_reg (mode, op1);
19944 if (can_create_pseudo_p ()
19945 && CONST_DOUBLE_P (op1))
19947 /* If we are loading a floating point constant to a register,
19948 force the value to memory now, since we'll get better code
19949 out of the back end. */
19951 op1 = validize_mem (force_const_mem (mode, op1));
19952 if (!register_operand (op0, mode))
19954 rtx temp = gen_reg_rtx (mode);
19955 emit_insn (gen_rtx_SET (temp, op1));
19956 emit_move_insn (op0, temp);
19957 return;
19962 emit_insn (gen_rtx_SET (op0, op1));
19965 void
19966 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19968 rtx op0 = operands[0], op1 = operands[1];
19969 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
19970 psABI, since the biggest alignment under that psABI is 4 bytes. */
19971 unsigned int align = (TARGET_IAMCU
19972 ? GET_MODE_BITSIZE (mode)
19973 : GET_MODE_ALIGNMENT (mode));
19975 if (push_operand (op0, VOIDmode))
19976 op0 = emit_move_resolve_push (mode, op0);
19978 /* Force constants other than zero into memory. We do not know how
19979 the instructions used to build constants modify the upper 64 bits
19980 of the register; once we have that information we may be able
19981 to handle some of them more efficiently. */
19982 if (can_create_pseudo_p ()
19983 && (CONSTANT_P (op1)
19984 || (SUBREG_P (op1)
19985 && CONSTANT_P (SUBREG_REG (op1))))
19986 && ((register_operand (op0, mode)
19987 && !standard_sse_constant_p (op1, mode))
19988 /* ix86_expand_vector_move_misalign() does not like constants. */
19989 || (SSE_REG_MODE_P (mode)
19990 && MEM_P (op0)
19991 && MEM_ALIGN (op0) < align)))
19993 if (SUBREG_P (op1))
19995 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19996 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19997 if (r)
19998 r = validize_mem (r);
19999 else
20000 r = force_reg (imode, SUBREG_REG (op1));
20001 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20003 else
20004 op1 = validize_mem (force_const_mem (mode, op1));
20007 /* We need to check memory alignment for SSE modes since an attribute
20008 can make operands unaligned. */
20009 if (can_create_pseudo_p ()
20010 && SSE_REG_MODE_P (mode)
20011 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20012 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20014 rtx tmp[2];
20016 /* ix86_expand_vector_move_misalign() does not like both
20017 arguments in memory. */
20018 if (!register_operand (op0, mode)
20019 && !register_operand (op1, mode))
20020 op1 = force_reg (mode, op1);
20022 tmp[0] = op0; tmp[1] = op1;
20023 ix86_expand_vector_move_misalign (mode, tmp);
20024 return;
20027 /* If neither operand is already a register, force operand1 into one. */
20028 if (can_create_pseudo_p ()
20029 && !register_operand (op0, mode)
20030 && !register_operand (op1, mode))
20032 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20033 return;
20036 emit_insn (gen_rtx_SET (op0, op1));
20039 /* Split 32-byte AVX unaligned load and store if needed. */
20041 static void
20042 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20044 rtx m;
20045 rtx (*extract) (rtx, rtx, rtx);
20046 machine_mode mode;
20048 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20049 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20051 emit_insn (gen_rtx_SET (op0, op1));
20052 return;
20055 rtx orig_op0 = NULL_RTX;
20056 mode = GET_MODE (op0);
20057 switch (GET_MODE_CLASS (mode))
20059 case MODE_VECTOR_INT:
20060 case MODE_INT:
20061 if (mode != V32QImode)
20063 if (!MEM_P (op0))
20065 orig_op0 = op0;
20066 op0 = gen_reg_rtx (V32QImode);
20068 else
20069 op0 = gen_lowpart (V32QImode, op0);
20070 op1 = gen_lowpart (V32QImode, op1);
20071 mode = V32QImode;
20073 break;
20074 case MODE_VECTOR_FLOAT:
20075 break;
20076 default:
20077 gcc_unreachable ();
20080 switch (mode)
20082 default:
20083 gcc_unreachable ();
20084 case V32QImode:
20085 extract = gen_avx_vextractf128v32qi;
20086 mode = V16QImode;
20087 break;
20088 case V8SFmode:
20089 extract = gen_avx_vextractf128v8sf;
20090 mode = V4SFmode;
20091 break;
20092 case V4DFmode:
20093 extract = gen_avx_vextractf128v4df;
20094 mode = V2DFmode;
20095 break;
20098 if (MEM_P (op1))
20100 rtx r = gen_reg_rtx (mode);
20101 m = adjust_address (op1, mode, 0);
20102 emit_move_insn (r, m);
20103 m = adjust_address (op1, mode, 16);
20104 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20105 emit_move_insn (op0, r);
20107 else if (MEM_P (op0))
20109 m = adjust_address (op0, mode, 0);
20110 emit_insn (extract (m, op1, const0_rtx));
20111 m = adjust_address (op0, mode, 16);
20112 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20114 else
20115 gcc_unreachable ();
20117 if (orig_op0)
20118 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
20121 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
20122 straight to ix86_expand_vector_move. */
20123 /* Code generation for scalar reg-reg moves of single and double precision data:
20124 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
20125 movaps reg, reg
20126 else
20127 movss reg, reg
20128 if (x86_sse_partial_reg_dependency == true)
20129 movapd reg, reg
20130 else
20131 movsd reg, reg
20133 Code generation for scalar loads of double precision data:
20134 if (x86_sse_split_regs == true)
20135 movlpd mem, reg (gas syntax)
20136 else
20137 movsd mem, reg
20139 Code generation for unaligned packed loads of single precision data
20140 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
20141 if (x86_sse_unaligned_move_optimal)
20142 movups mem, reg
20144 if (x86_sse_partial_reg_dependency == true)
20146 xorps reg, reg
20147 movlps mem, reg
20148 movhps mem+8, reg
20150 else
20152 movlps mem, reg
20153 movhps mem+8, reg
20156 Code generation for unaligned packed loads of double precision data
20157 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
20158 if (x86_sse_unaligned_move_optimal)
20159 movupd mem, reg
20161 if (x86_sse_split_regs == true)
20163 movlpd mem, reg
20164 movhpd mem+8, reg
20166 else
20168 movsd mem, reg
20169 movhpd mem+8, reg
20173 void
20174 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
20176 rtx op0, op1, m;
20178 op0 = operands[0];
20179 op1 = operands[1];
20181 /* Use unaligned load/store for AVX512 or when optimizing for size. */
20182 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
20184 emit_insn (gen_rtx_SET (op0, op1));
20185 return;
20188 if (TARGET_AVX)
20190 if (GET_MODE_SIZE (mode) == 32)
20191 ix86_avx256_split_vector_move_misalign (op0, op1);
20192 else
20193 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
20194 emit_insn (gen_rtx_SET (op0, op1));
20195 return;
20198 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
20199 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
20201 emit_insn (gen_rtx_SET (op0, op1));
20202 return;
20205 /* ??? If we have typed data, then it would appear that using
20206 movdqu is the only way to get unaligned data loaded with
20207 integer type. */
20208 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
20210 emit_insn (gen_rtx_SET (op0, op1));
20211 return;
20214 if (MEM_P (op1))
20216 if (TARGET_SSE2 && mode == V2DFmode)
20218 rtx zero;
20220 /* When SSE registers are split into halves, we can avoid
20221 writing to the top half twice. */
20222 if (TARGET_SSE_SPLIT_REGS)
20224 emit_clobber (op0);
20225 zero = op0;
20227 else
20229 /* ??? Not sure about the best option for the Intel chips.
20230 The following would seem to satisfy; the register is
20231 entirely cleared, breaking the dependency chain. We
20232 then store to the upper half, with a dependency depth
20233 of one. A rumor has it that Intel recommends two movsd
20234 followed by an unpacklpd, but this is unconfirmed. And
20235 given that the dependency depth of the unpacklpd would
20236 still be one, I'm not sure why this would be better. */
20237 zero = CONST0_RTX (V2DFmode);
20240 m = adjust_address (op1, DFmode, 0);
20241 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20242 m = adjust_address (op1, DFmode, 8);
20243 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20245 else
20247 rtx t;
20249 if (mode != V4SFmode)
20250 t = gen_reg_rtx (V4SFmode);
20251 else
20252 t = op0;
20254 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20255 emit_move_insn (t, CONST0_RTX (V4SFmode));
20256 else
20257 emit_clobber (t);
20259 m = adjust_address (op1, V2SFmode, 0);
20260 emit_insn (gen_sse_loadlps (t, t, m));
20261 m = adjust_address (op1, V2SFmode, 8);
20262 emit_insn (gen_sse_loadhps (t, t, m));
20263 if (mode != V4SFmode)
20264 emit_move_insn (op0, gen_lowpart (mode, t));
20267 else if (MEM_P (op0))
20269 if (TARGET_SSE2 && mode == V2DFmode)
20271 m = adjust_address (op0, DFmode, 0);
20272 emit_insn (gen_sse2_storelpd (m, op1));
20273 m = adjust_address (op0, DFmode, 8);
20274 emit_insn (gen_sse2_storehpd (m, op1));
20276 else
20278 if (mode != V4SFmode)
20279 op1 = gen_lowpart (V4SFmode, op1);
20281 m = adjust_address (op0, V2SFmode, 0);
20282 emit_insn (gen_sse_storelps (m, op1));
20283 m = adjust_address (op0, V2SFmode, 8);
20284 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20287 else
20288 gcc_unreachable ();
20291 /* Helper function of ix86_fixup_binary_operands to canonicalize
20292 operand order. Returns true if the operands should be swapped. */
20294 static bool
20295 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20296 rtx operands[])
20298 rtx dst = operands[0];
20299 rtx src1 = operands[1];
20300 rtx src2 = operands[2];
20302 /* If the operation is not commutative, we can't do anything. */
20303 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
20304 return false;
20306 /* Highest priority is that src1 should match dst. */
20307 if (rtx_equal_p (dst, src1))
20308 return false;
20309 if (rtx_equal_p (dst, src2))
20310 return true;
20312 /* Next highest priority is that immediate constants come second. */
20313 if (immediate_operand (src2, mode))
20314 return false;
20315 if (immediate_operand (src1, mode))
20316 return true;
20318 /* Lowest priority is that memory references should come second. */
20319 if (MEM_P (src2))
20320 return false;
20321 if (MEM_P (src1))
20322 return true;
20324 return false;
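/* For instance, given the commutative operation "dst = src2 + dst",
   the operands are swapped so that src1 matches dst, which lets the
   two-operand machine form "addl %src, %dst" be used without an extra
   move.  */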
20328 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20329 destination to use for the operation. If different from the true
20330 destination in operands[0], a copy operation will be required. */
20333 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20334 rtx operands[])
20336 rtx dst = operands[0];
20337 rtx src1 = operands[1];
20338 rtx src2 = operands[2];
20340 /* Canonicalize operand order. */
20341 if (ix86_swap_binary_operands_p (code, mode, operands))
20343 /* It is invalid to swap operands of different modes. */
20344 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20346 std::swap (src1, src2);
20349 /* Both source operands cannot be in memory. */
20350 if (MEM_P (src1) && MEM_P (src2))
20352 /* Optimization: Only read from memory once. */
20353 if (rtx_equal_p (src1, src2))
20355 src2 = force_reg (mode, src2);
20356 src1 = src2;
20358 else if (rtx_equal_p (dst, src1))
20359 src2 = force_reg (mode, src2);
20360 else
20361 src1 = force_reg (mode, src1);
20364 /* If the destination is memory, and we do not have matching source
20365 operands, do things in registers. */
20366 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20367 dst = gen_reg_rtx (mode);
20369 /* Source 1 cannot be a constant. */
20370 if (CONSTANT_P (src1))
20371 src1 = force_reg (mode, src1);
20373 /* Source 1 cannot be a non-matching memory. */
20374 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20375 src1 = force_reg (mode, src1);
20377 /* Improve address combine. */
20378 if (code == PLUS
20379 && GET_MODE_CLASS (mode) == MODE_INT
20380 && MEM_P (src2))
20381 src2 = force_reg (mode, src2);
20383 operands[1] = src1;
20384 operands[2] = src2;
20385 return dst;
20388 /* Similarly, but assume that the destination has already been
20389 set up properly. */
20391 void
20392 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20393 machine_mode mode, rtx operands[])
20395 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20396 gcc_assert (dst == operands[0]);
20399 /* Attempt to expand a binary operator. Make the expansion closer to the
20400 actual machine than just general_operand, which would allow 3 separate
20401 memory references (one output, two input) in a single insn. */
20403 void
20404 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20405 rtx operands[])
20407 rtx src1, src2, dst, op, clob;
20409 dst = ix86_fixup_binary_operands (code, mode, operands);
20410 src1 = operands[1];
20411 src2 = operands[2];
20413 /* Emit the instruction. */
20415 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20417 if (reload_completed
20418 && code == PLUS
20419 && !rtx_equal_p (dst, src1))
20421 /* This is going to be an LEA; avoid splitting it later. */
20422 emit_insn (op);
20424 else
20426 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20427 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20430 /* Fix up the destination if needed. */
20431 if (dst != operands[0])
20432 emit_move_insn (operands[0], dst);
20435 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20436 the given OPERANDS. */
20438 void
20439 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20440 rtx operands[])
20442 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20443 if (SUBREG_P (operands[1]))
20445 op1 = operands[1];
20446 op2 = operands[2];
20448 else if (SUBREG_P (operands[2]))
20450 op1 = operands[2];
20451 op2 = operands[1];
20453 /* Optimize (__m128i) d | (__m128i) e and similar code
20454 when d and e are float vectors into a float vector logical
20455 insn. In C/C++, without using intrinsics, there is no other way
20456 to express a vector logical operation on float vectors than
20457 to cast them temporarily to integer vectors. */
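/* A minimal source-level sketch of the pattern handled here (variable
   names illustrative):

       __m128 a, b;
       __m128i r = (__m128i) a | (__m128i) b;

   Without this transformation the bitwise OR would be emitted as an
   integer-domain instruction; with it, the operation stays in the
   float domain (e.g. orps instead of por).  */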
20458 if (op1
20459 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20460 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20461 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20462 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20463 && SUBREG_BYTE (op1) == 0
20464 && (GET_CODE (op2) == CONST_VECTOR
20465 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20466 && SUBREG_BYTE (op2) == 0))
20467 && can_create_pseudo_p ())
20469 rtx dst;
20470 switch (GET_MODE (SUBREG_REG (op1)))
20472 case V4SFmode:
20473 case V8SFmode:
20474 case V16SFmode:
20475 case V2DFmode:
20476 case V4DFmode:
20477 case V8DFmode:
20478 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20479 if (GET_CODE (op2) == CONST_VECTOR)
20481 op2 = gen_lowpart (GET_MODE (dst), op2);
20482 op2 = force_reg (GET_MODE (dst), op2);
20484 else
20486 op1 = operands[1];
20487 op2 = SUBREG_REG (operands[2]);
20488 if (!vector_operand (op2, GET_MODE (dst)))
20489 op2 = force_reg (GET_MODE (dst), op2);
20491 op1 = SUBREG_REG (op1);
20492 if (!vector_operand (op1, GET_MODE (dst)))
20493 op1 = force_reg (GET_MODE (dst), op1);
20494 emit_insn (gen_rtx_SET (dst,
20495 gen_rtx_fmt_ee (code, GET_MODE (dst),
20496 op1, op2)));
20497 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20498 return;
20499 default:
20500 break;
20503 if (!vector_operand (operands[1], mode))
20504 operands[1] = force_reg (mode, operands[1]);
20505 if (!vector_operand (operands[2], mode))
20506 operands[2] = force_reg (mode, operands[2]);
20507 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20508 emit_insn (gen_rtx_SET (operands[0],
20509 gen_rtx_fmt_ee (code, mode, operands[1],
20510 operands[2])));
20513 /* Return TRUE or FALSE depending on whether the binary operator meets the
20514 appropriate constraints. */
20516 bool
20517 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20518 rtx operands[3])
20520 rtx dst = operands[0];
20521 rtx src1 = operands[1];
20522 rtx src2 = operands[2];
20524 /* Both source operands cannot be in memory. */
20525 if (MEM_P (src1) && MEM_P (src2))
20526 return false;
20528 /* Canonicalize operand order for commutative operators. */
20529 if (ix86_swap_binary_operands_p (code, mode, operands))
20530 std::swap (src1, src2);
20532 /* If the destination is memory, we must have a matching source operand. */
20533 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20534 return false;
20536 /* Source 1 cannot be a constant. */
20537 if (CONSTANT_P (src1))
20538 return false;
20540 /* Source 1 cannot be a non-matching memory. */
20541 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20542 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20543 return (code == AND
20544 && (mode == HImode
20545 || mode == SImode
20546 || (TARGET_64BIT && mode == DImode))
20547 && satisfies_constraint_L (src2));
20549 return true;
20552 /* Attempt to expand a unary operator. Make the expansion closer to the
20553 actual machine, than just general_operand, which will allow 2 separate
20554 memory references (one output, one input) in a single insn. */
20556 void
20557 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20558 rtx operands[])
20560 bool matching_memory = false;
20561 rtx src, dst, op, clob;
20563 dst = operands[0];
20564 src = operands[1];
20566 /* If the destination is memory, and we do not have matching source
20567 operands, do things in registers. */
20568 if (MEM_P (dst))
20570 if (rtx_equal_p (dst, src))
20571 matching_memory = true;
20572 else
20573 dst = gen_reg_rtx (mode);
20576 /* When the source operand is memory, the destination must match. */
20577 if (MEM_P (src) && !matching_memory)
20578 src = force_reg (mode, src);
20580 /* Emit the instruction. */
20582 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20584 if (code == NOT)
20585 emit_insn (op);
20586 else
20588 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20589 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20592 /* Fix up the destination if needed. */
20593 if (dst != operands[0])
20594 emit_move_insn (operands[0], dst);
20597 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20598 divisor are within the range [0-255]. */
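/* An illustrative sketch of what the expansion below is logically
   equivalent to (a sketch of the shape only, not the exact RTL):

     if (((dividend | divisor) & ~0xff) == 0)
       {
         tmp = divide (dividend, divisor);   -- one short 16-by-8 bit divide;
                                                quotient lands in the low
                                                byte, remainder in the high
         quotient  = zero_extend (low byte of tmp);
         remainder = zero_extend (high byte of tmp);
       }
     else
       quotient, remainder = the original full-width signed/unsigned divmod;

   The dispatching branch below is predicted 50/50.  */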
20600 void
20601 ix86_split_idivmod (machine_mode mode, rtx operands[],
20602 bool signed_p)
20604 rtx_code_label *end_label, *qimode_label;
20605 rtx div, mod;
20606 rtx_insn *insn;
20607 rtx scratch, tmp0, tmp1, tmp2;
20608 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20609 rtx (*gen_zero_extend) (rtx, rtx);
20610 rtx (*gen_test_ccno_1) (rtx, rtx);
20612 switch (mode)
20614 case SImode:
20615 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20616 gen_test_ccno_1 = gen_testsi_ccno_1;
20617 gen_zero_extend = gen_zero_extendqisi2;
20618 break;
20619 case DImode:
20620 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20621 gen_test_ccno_1 = gen_testdi_ccno_1;
20622 gen_zero_extend = gen_zero_extendqidi2;
20623 break;
20624 default:
20625 gcc_unreachable ();
20628 end_label = gen_label_rtx ();
20629 qimode_label = gen_label_rtx ();
20631 scratch = gen_reg_rtx (mode);
20633 /* Use 8bit unsigned divmod if dividend and divisor are within
20634 the range [0-255]. */
20635 emit_move_insn (scratch, operands[2]);
20636 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20637 scratch, 1, OPTAB_DIRECT);
20638 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20639 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20640 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20641 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20642 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20643 pc_rtx);
20644 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20645 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20646 JUMP_LABEL (insn) = qimode_label;
20648 /* Generate original signed/unsigned divmod. */
20649 div = gen_divmod4_1 (operands[0], operands[1],
20650 operands[2], operands[3]);
20651 emit_insn (div);
20653 /* Branch to the end. */
20654 emit_jump_insn (gen_jump (end_label));
20655 emit_barrier ();
20657 /* Generate 8bit unsigned divide. */
20658 emit_label (qimode_label);
20659 /* Don't use operands[0] for result of 8bit divide since not all
20660 registers support QImode ZERO_EXTRACT. */
20661 tmp0 = lowpart_subreg (HImode, scratch, mode);
20662 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20663 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20664 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20666 if (signed_p)
20668 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
20669 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
20671 else
20673 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
20674 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
20677 /* Extract remainder from AH. */
20678 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
20679 if (REG_P (operands[1]))
20680 insn = emit_move_insn (operands[1], tmp1);
20681 else
20683 /* Need a new scratch register since the old one has result
20684 of 8bit divide. */
20685 scratch = gen_reg_rtx (mode);
20686 emit_move_insn (scratch, tmp1);
20687 insn = emit_move_insn (operands[1], scratch);
20689 set_unique_reg_note (insn, REG_EQUAL, mod);
20691 /* Zero extend quotient from AL. */
20692 tmp1 = gen_lowpart (QImode, tmp0);
20693 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20694 set_unique_reg_note (insn, REG_EQUAL, div);
20696 emit_label (end_label);
20699 #define LEA_MAX_STALL (3)
20700 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20702 /* Increase given DISTANCE in half-cycles according to
20703 dependencies between PREV and NEXT instructions.
20704 Add 1 half-cycle if there is no dependency and
20705 go to the next cycle if there is some dependency. */
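/* In other words (a descriptive note on the code below): distances are kept
   in half-cycles.  An independent neighbouring insn adds one half-cycle,
   while a dependent pair first rounds the distance up to a whole cycle and
   then charges one more full cycle, i.e. distance + (distance & 1) + 2.  */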
20707 static unsigned int
20708 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20710 df_ref def, use;
20712 if (!prev || !next)
20713 return distance + (distance & 1) + 2;
20715 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20716 return distance + 1;
20718 FOR_EACH_INSN_USE (use, next)
20719 FOR_EACH_INSN_DEF (def, prev)
20720 if (!DF_REF_IS_ARTIFICIAL (def)
20721 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20722 return distance + (distance & 1) + 2;
20724 return distance + 1;
20727 /* Return true if instruction INSN defines register number
20728 REGNO1 or REGNO2. */
20730 static bool
20731 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20732 rtx_insn *insn)
20734 df_ref def;
20736 FOR_EACH_INSN_DEF (def, insn)
20737 if (DF_REF_REG_DEF_P (def)
20738 && !DF_REF_IS_ARTIFICIAL (def)
20739 && (regno1 == DF_REF_REGNO (def)
20740 || regno2 == DF_REF_REGNO (def)))
20741 return true;
20743 return false;
20746 /* Return true if instruction INSN uses register number
20747 REGNO as part of an address expression. */
20749 static bool
20750 insn_uses_reg_mem (unsigned int regno, rtx insn)
20752 df_ref use;
20754 FOR_EACH_INSN_USE (use, insn)
20755 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20756 return true;
20758 return false;
20761 /* Search backward for non-agu definition of register number REGNO1
20762 or register number REGNO2 in basic block starting from instruction
20763 START up to head of basic block or instruction INSN.
20765 Set *FOUND to true if a definition was found and to false
20766 otherwise.
20768 The distance in half-cycles between START and the found instruction
20769 or the head of the BB is added to DISTANCE and returned. */
20771 static int
20772 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20773 rtx_insn *insn, int distance,
20774 rtx_insn *start, bool *found)
20776 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20777 rtx_insn *prev = start;
20778 rtx_insn *next = NULL;
20780 *found = false;
20782 while (prev
20783 && prev != insn
20784 && distance < LEA_SEARCH_THRESHOLD)
20786 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20788 distance = increase_distance (prev, next, distance);
20789 if (insn_defines_reg (regno1, regno2, prev))
20791 if (recog_memoized (prev) < 0
20792 || get_attr_type (prev) != TYPE_LEA)
20794 *found = true;
20795 return distance;
20799 next = prev;
20801 if (prev == BB_HEAD (bb))
20802 break;
20804 prev = PREV_INSN (prev);
20807 return distance;
20810 /* Search backward for non-agu definition of register number REGNO1
20811 or register number REGNO2 in INSN's basic block until
20812 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20813 2. Reach neighbor BBs boundary, or
20814 3. Reach agu definition.
20815 Returns the distance between the non-agu definition point and INSN.
20816 If no definition point, returns -1. */
20818 static int
20819 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20820 rtx_insn *insn)
20822 basic_block bb = BLOCK_FOR_INSN (insn);
20823 int distance = 0;
20824 bool found = false;
20826 if (insn != BB_HEAD (bb))
20827 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20828 distance, PREV_INSN (insn),
20829 &found);
20831 if (!found && distance < LEA_SEARCH_THRESHOLD)
20833 edge e;
20834 edge_iterator ei;
20835 bool simple_loop = false;
20837 FOR_EACH_EDGE (e, ei, bb->preds)
20838 if (e->src == bb)
20840 simple_loop = true;
20841 break;
20844 if (simple_loop)
20845 distance = distance_non_agu_define_in_bb (regno1, regno2,
20846 insn, distance,
20847 BB_END (bb), &found);
20848 else
20850 int shortest_dist = -1;
20851 bool found_in_bb = false;
20853 FOR_EACH_EDGE (e, ei, bb->preds)
20855 int bb_dist
20856 = distance_non_agu_define_in_bb (regno1, regno2,
20857 insn, distance,
20858 BB_END (e->src),
20859 &found_in_bb);
20860 if (found_in_bb)
20862 if (shortest_dist < 0)
20863 shortest_dist = bb_dist;
20864 else if (bb_dist > 0)
20865 shortest_dist = MIN (bb_dist, shortest_dist);
20867 found = true;
20871 distance = shortest_dist;
20875 /* get_attr_type may modify recog data. We want to make sure
20876 that recog data is valid for instruction INSN, on which
20877 distance_non_agu_define is called. INSN is unchanged here. */
20878 extract_insn_cached (insn);
20880 if (!found)
20881 return -1;
20883 return distance >> 1;
20886 /* Return the distance in half-cycles between INSN and the next
20887 insn that uses register number REGNO in memory address added
20888 to DISTANCE. Return -1 if REGNO is set.
20890 Set *FOUND to true if a use of the register was found and to
20891 false otherwise.
20892 Set *REDEFINED to true if a redefinition of the register was
20893 found and to false otherwise. */
20895 static int
20896 distance_agu_use_in_bb (unsigned int regno,
20897 rtx_insn *insn, int distance, rtx_insn *start,
20898 bool *found, bool *redefined)
20900 basic_block bb = NULL;
20901 rtx_insn *next = start;
20902 rtx_insn *prev = NULL;
20904 *found = false;
20905 *redefined = false;
20907 if (start != NULL_RTX)
20909 bb = BLOCK_FOR_INSN (start);
20910 if (start != BB_HEAD (bb))
20911 /* If insn and start belong to the same bb, set prev to insn,
20912 so the call to increase_distance will increase the distance
20913 between insns by 1. */
20914 prev = insn;
20917 while (next
20918 && next != insn
20919 && distance < LEA_SEARCH_THRESHOLD)
20921 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20923 distance = increase_distance (prev, next, distance);
20924 if (insn_uses_reg_mem (regno, next))
20926 /* Return DISTANCE if OP0 is used in memory
20927 address in NEXT. */
20928 *found = true;
20929 return distance;
20932 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20934 /* Return -1 if OP0 is set in NEXT. */
20935 *redefined = true;
20936 return -1;
20939 prev = next;
20942 if (next == BB_END (bb))
20943 break;
20945 next = NEXT_INSN (next);
20948 return distance;
20951 /* Return the distance between INSN and the next insn that uses
20952 register number REGNO0 in a memory address. Return -1 if no such
20953 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
20955 static int
20956 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20958 basic_block bb = BLOCK_FOR_INSN (insn);
20959 int distance = 0;
20960 bool found = false;
20961 bool redefined = false;
20963 if (insn != BB_END (bb))
20964 distance = distance_agu_use_in_bb (regno0, insn, distance,
20965 NEXT_INSN (insn),
20966 &found, &redefined);
20968 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20970 edge e;
20971 edge_iterator ei;
20972 bool simple_loop = false;
20974 FOR_EACH_EDGE (e, ei, bb->succs)
20975 if (e->dest == bb)
20977 simple_loop = true;
20978 break;
20981 if (simple_loop)
20982 distance = distance_agu_use_in_bb (regno0, insn,
20983 distance, BB_HEAD (bb),
20984 &found, &redefined);
20985 else
20987 int shortest_dist = -1;
20988 bool found_in_bb = false;
20989 bool redefined_in_bb = false;
20991 FOR_EACH_EDGE (e, ei, bb->succs)
20993 int bb_dist
20994 = distance_agu_use_in_bb (regno0, insn,
20995 distance, BB_HEAD (e->dest),
20996 &found_in_bb, &redefined_in_bb);
20997 if (found_in_bb)
20999 if (shortest_dist < 0)
21000 shortest_dist = bb_dist;
21001 else if (bb_dist > 0)
21002 shortest_dist = MIN (bb_dist, shortest_dist);
21004 found = true;
21008 distance = shortest_dist;
21012 if (!found || redefined)
21013 return -1;
21015 return distance >> 1;
21018 /* Define this macro to tune LEA priority vs ADD; it takes effect when
21019 there is a dilemma of choosing LEA or ADD.
21020 Negative value: ADD is preferred over LEA
21021 Zero: Neutral
21022 Positive value: LEA is preferred over ADD */
21023 #define IX86_LEA_PRIORITY 0
21025 /* Return true if using the lea INSN has a performance advantage
21026 over a sequence of instructions. That instruction sequence has
21027 SPLIT_COST cycles higher latency than the lea itself. */
21029 static bool
21030 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21031 unsigned int regno2, int split_cost, bool has_scale)
21033 int dist_define, dist_use;
21035 /* For Silvermont, if a 2-source or 3-source LEA is used for
21036 a non-destructive destination, or because we want the
21037 ability to use SCALE, the use of LEA is justified. */
21038 if (TARGET_SILVERMONT || TARGET_INTEL)
21040 if (has_scale)
21041 return true;
21042 if (split_cost < 1)
21043 return false;
21044 if (regno0 == regno1 || regno0 == regno2)
21045 return false;
21046 return true;
21049 dist_define = distance_non_agu_define (regno1, regno2, insn);
21050 dist_use = distance_agu_use (regno0, insn);
21052 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21054 /* If there is no non-AGU operand definition, no AGU
21055 operand usage and the split cost is 0, then both the lea
21056 and non-lea variants have the same priority. Currently
21057 we prefer lea for 64-bit code and non-lea for 32-bit
21058 code. */
21059 if (dist_use < 0 && split_cost == 0)
21060 return TARGET_64BIT || IX86_LEA_PRIORITY;
21061 else
21062 return true;
21065 /* The longer the distance to the definition, the more preferable lea is.
21066 Adjust it here to take the splitting cost and
21067 lea priority into account. */
21068 dist_define += split_cost + IX86_LEA_PRIORITY;
21070 /* If there is no use in a memory address then we just check
21071 that the split cost exceeds the AGU stall. */
21072 if (dist_use < 0)
21073 return dist_define > LEA_MAX_STALL;
21075 /* If this insn has both a backward non-agu dependence and a forward
21076 agu dependence, the one with the shorter distance takes effect. */
21077 return dist_define >= dist_use;
21080 /* Return true if it is legal to clobber flags by INSN and
21081 false otherwise. */
21083 static bool
21084 ix86_ok_to_clobber_flags (rtx_insn *insn)
21086 basic_block bb = BLOCK_FOR_INSN (insn);
21087 df_ref use;
21088 bitmap live;
21090 while (insn)
21092 if (NONDEBUG_INSN_P (insn))
21094 FOR_EACH_INSN_USE (use, insn)
21095 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21096 return false;
21098 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21099 return true;
21102 if (insn == BB_END (bb))
21103 break;
21105 insn = NEXT_INSN (insn);
21108 live = df_get_live_out (bb);
21109 return !REGNO_REG_SET_P (live, FLAGS_REG);
21112 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21113 move and add to avoid AGU stalls. */
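/* For example (a sketch of the intent, not the exact emitted split): on a
   target with TARGET_OPT_AGU, a three-operand add such as

       lea    (%rdi,%rsi), %rax

   may instead be emitted as

       mov    %rdi, %rax
       add    %rsi, %rax

   when the source registers were produced by ALU instructions too recently
   for the AGU to consume them without stalling.  */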
21115 bool
21116 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21118 unsigned int regno0, regno1, regno2;
21120 /* Check if we need to optimize. */
21121 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21122 return false;
21124 /* Check it is correct to split here. */
21125 if (!ix86_ok_to_clobber_flags (insn))
21126 return false;
21128 regno0 = true_regnum (operands[0]);
21129 regno1 = true_regnum (operands[1]);
21130 regno2 = true_regnum (operands[2]);
21132 /* We need to split only adds with a non-destructive
21133 destination operand. */
21134 if (regno0 == regno1 || regno0 == regno2)
21135 return false;
21136 else
21137 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
21140 /* Return true if we should emit lea instruction instead of mov
21141 instruction. */
21143 bool
21144 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
21146 unsigned int regno0, regno1;
21148 /* Check if we need to optimize. */
21149 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21150 return false;
21152 /* Use lea for reg to reg moves only. */
21153 if (!REG_P (operands[0]) || !REG_P (operands[1]))
21154 return false;
21156 regno0 = true_regnum (operands[0]);
21157 regno1 = true_regnum (operands[1]);
21159 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
21162 /* Return true if we need to split lea into a sequence of
21163 instructions to avoid AGU stalls. */
21165 bool
21166 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
21168 unsigned int regno0, regno1, regno2;
21169 int split_cost;
21170 struct ix86_address parts;
21171 int ok;
21173 /* Check if we need to optimize. */
21174 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
21175 return false;
21177 /* The "at least two components" test below might not catch simple
21178 move or zero extension insns if parts.base is non-NULL and parts.disp
21179 is const0_rtx as the only components in the address, e.g. if the
21180 register is %rbp or %r13. As this test is much cheaper and moves or
21181 zero extensions are the common case, do this check first. */
21182 if (REG_P (operands[1])
21183 || (SImode_address_operand (operands[1], VOIDmode)
21184 && REG_P (XEXP (operands[1], 0))))
21185 return false;
21187 /* Check if it is OK to split here. */
21188 if (!ix86_ok_to_clobber_flags (insn))
21189 return false;
21191 ok = ix86_decompose_address (operands[1], &parts);
21192 gcc_assert (ok);
21194 /* There should be at least two components in the address. */
21195 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
21196 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
21197 return false;
21199 /* We should not split into add if a non-legitimate PIC
21200 operand is used as the displacement. */
21201 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
21202 return false;
21204 regno0 = true_regnum (operands[0]);
21205 regno1 = INVALID_REGNUM;
21206 regno2 = INVALID_REGNUM;
21208 if (parts.base)
21209 regno1 = true_regnum (parts.base);
21210 if (parts.index)
21211 regno2 = true_regnum (parts.index);
21213 split_cost = 0;
21215 /* Compute how many cycles we will add to the execution time
21216 if we split the lea into a sequence of instructions. */
21217 if (parts.base || parts.index)
21219 /* Have to use a mov instruction if the non-destructive
21220 destination form is used. */
21221 if (regno1 != regno0 && regno2 != regno0)
21222 split_cost += 1;
21224 /* Have to add index to base if both exist. */
21225 if (parts.base && parts.index)
21226 split_cost += 1;
21228 /* Have to use shift and adds if scale is 2 or greater. */
21229 if (parts.scale > 1)
21231 if (regno0 != regno1)
21232 split_cost += 1;
21233 else if (regno2 == regno0)
21234 split_cost += 4;
21235 else
21236 split_cost += parts.scale;
21239 /* Have to use an add instruction with an immediate if
21240 disp is nonzero. */
21241 if (parts.disp && parts.disp != const0_rtx)
21242 split_cost += 1;
21244 /* Subtract the price of lea. */
21245 split_cost -= 1;
21248 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21249 parts.scale > 1);
21252 /* Emit x86 binary operator CODE in mode MODE, where the first operand
21253 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
21255 static void
21256 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21257 rtx dst, rtx src)
21259 rtx op, clob;
21261 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21262 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21264 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21267 /* Return true if the definition of REGNO1 is nearest to INSN. */
21269 static bool
21270 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21272 rtx_insn *prev = insn;
21273 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21275 if (insn == start)
21276 return false;
21277 while (prev && prev != start)
21279 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21281 prev = PREV_INSN (prev);
21282 continue;
21284 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21285 return true;
21286 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21287 return false;
21288 prev = PREV_INSN (prev);
21291 /* None of the regs is defined in the bb. */
21292 return false;
21295 /* Split lea instructions into a sequence of instructions
21296 which are executed on the ALU to avoid AGU stalls.
21297 It is assumed that it is allowed to clobber the flags register
21298 at the lea position. */
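/* For example (an illustrative sketch; the exact sequence depends on which
   registers coincide), with mode == SImode

       lea    0x4(%ebx,%ecx,8), %eax

   could be split into

       mov    %ecx, %eax
       shl    $3, %eax
       add    %ebx, %eax
       add    $0x4, %eax

   assuming %eax is distinct from both %ebx and %ecx.  */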
21300 void
21301 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21303 unsigned int regno0, regno1, regno2;
21304 struct ix86_address parts;
21305 rtx target, tmp;
21306 int ok, adds;
21308 ok = ix86_decompose_address (operands[1], &parts);
21309 gcc_assert (ok);
21311 target = gen_lowpart (mode, operands[0]);
21313 regno0 = true_regnum (target);
21314 regno1 = INVALID_REGNUM;
21315 regno2 = INVALID_REGNUM;
21317 if (parts.base)
21319 parts.base = gen_lowpart (mode, parts.base);
21320 regno1 = true_regnum (parts.base);
21323 if (parts.index)
21325 parts.index = gen_lowpart (mode, parts.index);
21326 regno2 = true_regnum (parts.index);
21329 if (parts.disp)
21330 parts.disp = gen_lowpart (mode, parts.disp);
21332 if (parts.scale > 1)
21334 /* Case r1 = r1 + ... */
21335 if (regno1 == regno0)
21337 /* If we have a case r1 = r1 + C * r2 then we
21338 would have to use multiplication, which is very
21339 expensive. Assume the cost model is wrong if we
21340 have such a case here. */
21341 gcc_assert (regno2 != regno0);
21343 for (adds = parts.scale; adds > 0; adds--)
21344 ix86_emit_binop (PLUS, mode, target, parts.index);
21346 else
21348 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21349 if (regno0 != regno2)
21350 emit_insn (gen_rtx_SET (target, parts.index));
21352 /* Use shift for scaling. */
21353 ix86_emit_binop (ASHIFT, mode, target,
21354 GEN_INT (exact_log2 (parts.scale)));
21356 if (parts.base)
21357 ix86_emit_binop (PLUS, mode, target, parts.base);
21359 if (parts.disp && parts.disp != const0_rtx)
21360 ix86_emit_binop (PLUS, mode, target, parts.disp);
21363 else if (!parts.base && !parts.index)
21365 gcc_assert (parts.disp);
21366 emit_insn (gen_rtx_SET (target, parts.disp));
21368 else
21370 if (!parts.base)
21372 if (regno0 != regno2)
21373 emit_insn (gen_rtx_SET (target, parts.index));
21375 else if (!parts.index)
21377 if (regno0 != regno1)
21378 emit_insn (gen_rtx_SET (target, parts.base));
21380 else
21382 if (regno0 == regno1)
21383 tmp = parts.index;
21384 else if (regno0 == regno2)
21385 tmp = parts.base;
21386 else
21388 rtx tmp1;
21390 /* Find better operand for SET instruction, depending
21391 on which definition is farther from the insn. */
21392 if (find_nearest_reg_def (insn, regno1, regno2))
21393 tmp = parts.index, tmp1 = parts.base;
21394 else
21395 tmp = parts.base, tmp1 = parts.index;
21397 emit_insn (gen_rtx_SET (target, tmp));
21399 if (parts.disp && parts.disp != const0_rtx)
21400 ix86_emit_binop (PLUS, mode, target, parts.disp);
21402 ix86_emit_binop (PLUS, mode, target, tmp1);
21403 return;
21406 ix86_emit_binop (PLUS, mode, target, tmp);
21409 if (parts.disp && parts.disp != const0_rtx)
21410 ix86_emit_binop (PLUS, mode, target, parts.disp);
21414 /* Return true if it is ok to optimize an ADD operation to a LEA
21415 operation to avoid flag register consumption. For most processors,
21416 ADD is faster than LEA. For processors like BONNELL, if the
21417 destination register of the LEA holds an actual address which will be
21418 used soon, LEA is better; otherwise ADD is better. */
21420 bool
21421 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21423 unsigned int regno0 = true_regnum (operands[0]);
21424 unsigned int regno1 = true_regnum (operands[1]);
21425 unsigned int regno2 = true_regnum (operands[2]);
21427 /* If a = b + c, (a!=b && a!=c), we must use the lea form. */
21428 if (regno0 != regno1 && regno0 != regno2)
21429 return true;
21431 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21432 return false;
21434 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21437 /* Return true if destination reg of SET_BODY is shift count of
21438 USE_BODY. */
21440 static bool
21441 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21443 rtx set_dest;
21444 rtx shift_rtx;
21445 int i;
21447 /* Retrieve destination of SET_BODY. */
21448 switch (GET_CODE (set_body))
21450 case SET:
21451 set_dest = SET_DEST (set_body);
21452 if (!set_dest || !REG_P (set_dest))
21453 return false;
21454 break;
21455 case PARALLEL:
21456 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21457 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21458 use_body))
21459 return true;
21460 /* FALLTHROUGH */
21461 default:
21462 return false;
21465 /* Retrieve shift count of USE_BODY. */
21466 switch (GET_CODE (use_body))
21468 case SET:
21469 shift_rtx = XEXP (use_body, 1);
21470 break;
21471 case PARALLEL:
21472 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21473 if (ix86_dep_by_shift_count_body (set_body,
21474 XVECEXP (use_body, 0, i)))
21475 return true;
21476 /* FALLTHROUGH */
21477 default:
21478 return false;
21481 if (shift_rtx
21482 && (GET_CODE (shift_rtx) == ASHIFT
21483 || GET_CODE (shift_rtx) == LSHIFTRT
21484 || GET_CODE (shift_rtx) == ASHIFTRT
21485 || GET_CODE (shift_rtx) == ROTATE
21486 || GET_CODE (shift_rtx) == ROTATERT))
21488 rtx shift_count = XEXP (shift_rtx, 1);
21490 /* Return true if shift count is dest of SET_BODY. */
21491 if (REG_P (shift_count))
21493 /* Add this check since this can be invoked before register
21494 allocation by the pre-reload scheduler. */
21495 if (reload_completed
21496 && true_regnum (set_dest) == true_regnum (shift_count))
21497 return true;
21498 else if (REGNO (set_dest) == REGNO (shift_count))
21499 return true;
21503 return false;
21506 /* Return true if destination reg of SET_INSN is shift count of
21507 USE_INSN. */
21509 bool
21510 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21512 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21513 PATTERN (use_insn));
21516 /* Return TRUE or FALSE depending on whether the unary operator meets the
21517 appropriate constraints. */
21519 bool
21520 ix86_unary_operator_ok (enum rtx_code,
21521 machine_mode,
21522 rtx operands[2])
21524 /* If one of operands is memory, source and destination must match. */
21525 if ((MEM_P (operands[0])
21526 || MEM_P (operands[1]))
21527 && ! rtx_equal_p (operands[0], operands[1]))
21528 return false;
21529 return true;
21532 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21533 are ok, keeping in mind the possible movddup alternative. */
21535 bool
21536 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21538 if (MEM_P (operands[0]))
21539 return rtx_equal_p (operands[0], operands[1 + high]);
21540 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21541 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21542 return true;
21545 /* Post-reload splitter for converting an SF or DFmode value in an
21546 SSE register into an unsigned SImode. */
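/* The idea, roughly (a sketch of the math, not the exact insns): with x
   the scalar input and TWO31 = 0x1p31,

     big    = (TWO31 <= x) ? all-ones : 0;       -- compare mask
     x     -= big ? TWO31 : 0.0;                 -- now x < 0x1p31
     result = (int) x  ^  (big ? 0x80000000 : 0);

   i.e. values >= 2^31 are biased down before the signed conversion and the
   sign bit of the integer result is flipped back afterwards.  */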
21548 void
21549 ix86_split_convert_uns_si_sse (rtx operands[])
21551 machine_mode vecmode;
21552 rtx value, large, zero_or_two31, input, two31, x;
21554 large = operands[1];
21555 zero_or_two31 = operands[2];
21556 input = operands[3];
21557 two31 = operands[4];
21558 vecmode = GET_MODE (large);
21559 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21561 /* Load up the value into the low element. We must ensure that the other
21562 elements are valid floats -- zero is the easiest such value. */
21563 if (MEM_P (input))
21565 if (vecmode == V4SFmode)
21566 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21567 else
21568 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21570 else
21572 input = gen_rtx_REG (vecmode, REGNO (input));
21573 emit_move_insn (value, CONST0_RTX (vecmode));
21574 if (vecmode == V4SFmode)
21575 emit_insn (gen_sse_movss (value, value, input));
21576 else
21577 emit_insn (gen_sse2_movsd (value, value, input));
21580 emit_move_insn (large, two31);
21581 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21583 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21584 emit_insn (gen_rtx_SET (large, x));
21586 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21587 emit_insn (gen_rtx_SET (zero_or_two31, x));
21589 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21590 emit_insn (gen_rtx_SET (value, x));
21592 large = gen_rtx_REG (V4SImode, REGNO (large));
21593 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21595 x = gen_rtx_REG (V4SImode, REGNO (value));
21596 if (vecmode == V4SFmode)
21597 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21598 else
21599 emit_insn (gen_sse2_cvttpd2dq (x, value));
21600 value = x;
21602 emit_insn (gen_xorv4si3 (value, value, large));
21605 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21606 Expects the 64-bit DImode to be supplied in a pair of integral
21607 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21608 -mfpmath=sse, !optimize_size only. */
21610 void
21611 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21613 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21614 rtx int_xmm, fp_xmm;
21615 rtx biases, exponents;
21616 rtx x;
21618 int_xmm = gen_reg_rtx (V4SImode);
21619 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21620 emit_insn (gen_movdi_to_sse (int_xmm, input));
21621 else if (TARGET_SSE_SPLIT_REGS)
21623 emit_clobber (int_xmm);
21624 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21626 else
21628 x = gen_reg_rtx (V2DImode);
21629 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21630 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21633 x = gen_rtx_CONST_VECTOR (V4SImode,
21634 gen_rtvec (4, GEN_INT (0x43300000UL),
21635 GEN_INT (0x45300000UL),
21636 const0_rtx, const0_rtx));
21637 exponents = validize_mem (force_const_mem (V4SImode, x));
21639 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21640 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21642 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21643 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21644 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21645 (0x1.0p84 + double(fp_value_hi_xmm)).
21646 Note these exponents differ by 32. */
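/* A worked example (illustrative only): for the input 0x0000000100000002
   the low half yields the double 0x1.0p52 + 2.0 and the high half yields
   0x1.0p84 + 0x1.0p32; after the bias subtraction below the two halves are
   exactly 2.0 and 0x1.0p32, and their sum 4294967298.0 is the expected
   conversion result.  */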
21648 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21650 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21651 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21652 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21653 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21654 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21655 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21656 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21657 biases = validize_mem (force_const_mem (V2DFmode, biases));
21658 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21660 /* Add the upper and lower DFmode values together. */
21661 if (TARGET_SSE3)
21662 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21663 else
21665 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21666 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21667 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21670 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21673 /* Not used, but eases macroization of patterns. */
21674 void
21675 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21677 gcc_unreachable ();
21680 /* Convert an unsigned SImode value into a DFmode. Only currently used
21681 for SSE, but applicable anywhere. */
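/* A sketch of the trick: adding -2^31 to the unsigned input U and
   reinterpreting the result as a signed int gives U - 2^31, which the
   signed floatsidf conversion handles exactly; adding 0x1p31 back as a
   double then restores the unsigned value.  E.g. U = 0xffffffff becomes
   0x7fffffff = 2147483647, converts to 2147483647.0, and the final add
   gives 4294967295.0.  */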
21683 void
21684 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21686 REAL_VALUE_TYPE TWO31r;
21687 rtx x, fp;
21689 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21690 NULL, 1, OPTAB_DIRECT);
21692 fp = gen_reg_rtx (DFmode);
21693 emit_insn (gen_floatsidf2 (fp, x));
21695 real_ldexp (&TWO31r, &dconst1, 31);
21696 x = const_double_from_real_value (TWO31r, DFmode);
21698 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21699 if (x != target)
21700 emit_move_insn (target, x);
21703 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21704 32-bit mode; otherwise we have a direct convert instruction. */
21706 void
21707 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21709 REAL_VALUE_TYPE TWO32r;
21710 rtx fp_lo, fp_hi, x;
21712 fp_lo = gen_reg_rtx (DFmode);
21713 fp_hi = gen_reg_rtx (DFmode);
21715 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21717 real_ldexp (&TWO32r, &dconst1, 32);
21718 x = const_double_from_real_value (TWO32r, DFmode);
21719 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21721 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21723 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21724 0, OPTAB_DIRECT);
21725 if (x != target)
21726 emit_move_insn (target, x);
21729 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21730 For x86_32, -mfpmath=sse, !optimize_size only. */
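/* A sketch of the approach: split the input as u = hi * 0x10000 + lo, where
   both 16-bit halves are exactly representable in SFmode, convert each half
   with the signed cvtsi2ss, and recombine as (float) hi * 0x1.0p16
   + (float) lo, avoiding the signedness problem of converting u directly.  */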
21731 void
21732 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21734 REAL_VALUE_TYPE ONE16r;
21735 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21737 real_ldexp (&ONE16r, &dconst1, 16);
21738 x = const_double_from_real_value (ONE16r, SFmode);
21739 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
21740 NULL, 0, OPTAB_DIRECT);
21741 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
21742 NULL, 0, OPTAB_DIRECT);
21743 fp_hi = gen_reg_rtx (SFmode);
21744 fp_lo = gen_reg_rtx (SFmode);
21745 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21746 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21747 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21748 0, OPTAB_DIRECT);
21749 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21750 0, OPTAB_DIRECT);
21751 if (!rtx_equal_p (target, fp_hi))
21752 emit_move_insn (target, fp_hi);
21755 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21756 a vector of unsigned ints VAL to vector of floats TARGET. */
21758 void
21759 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21761 rtx tmp[8];
21762 REAL_VALUE_TYPE TWO16r;
21763 machine_mode intmode = GET_MODE (val);
21764 machine_mode fltmode = GET_MODE (target);
21765 rtx (*cvt) (rtx, rtx);
21767 if (intmode == V4SImode)
21768 cvt = gen_floatv4siv4sf2;
21769 else
21770 cvt = gen_floatv8siv8sf2;
21771 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21772 tmp[0] = force_reg (intmode, tmp[0]);
21773 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21774 OPTAB_DIRECT);
21775 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21776 NULL_RTX, 1, OPTAB_DIRECT);
21777 tmp[3] = gen_reg_rtx (fltmode);
21778 emit_insn (cvt (tmp[3], tmp[1]));
21779 tmp[4] = gen_reg_rtx (fltmode);
21780 emit_insn (cvt (tmp[4], tmp[2]));
21781 real_ldexp (&TWO16r, &dconst1, 16);
21782 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21783 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21784 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21785 OPTAB_DIRECT);
21786 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21787 OPTAB_DIRECT);
21788 if (tmp[7] != target)
21789 emit_move_insn (target, tmp[7]);
21792 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21793 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21794 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21795 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
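/* Per element, the effect is (an illustrative sketch): if x >= 0x1p31 the
   returned value is x - 0x1p31 and the matching *XORP lane is 0x80000000,
   so that (int) (x - 0x1p31) ^ 0x80000000 reproduces (unsigned) x; lanes
   below 0x1p31 are returned unchanged and get a zero XOR lane.  */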
21798 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21800 REAL_VALUE_TYPE TWO31r;
21801 rtx two31r, tmp[4];
21802 machine_mode mode = GET_MODE (val);
21803 machine_mode scalarmode = GET_MODE_INNER (mode);
21804 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21805 rtx (*cmp) (rtx, rtx, rtx, rtx);
21806 int i;
21808 for (i = 0; i < 3; i++)
21809 tmp[i] = gen_reg_rtx (mode);
21810 real_ldexp (&TWO31r, &dconst1, 31);
21811 two31r = const_double_from_real_value (TWO31r, scalarmode);
21812 two31r = ix86_build_const_vector (mode, 1, two31r);
21813 two31r = force_reg (mode, two31r);
21814 switch (mode)
21816 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21817 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21818 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21819 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21820 default: gcc_unreachable ();
21822 tmp[3] = gen_rtx_LE (mode, two31r, val);
21823 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21824 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21825 0, OPTAB_DIRECT);
21826 if (intmode == V4SImode || TARGET_AVX2)
21827 *xorp = expand_simple_binop (intmode, ASHIFT,
21828 gen_lowpart (intmode, tmp[0]),
21829 GEN_INT (31), NULL_RTX, 0,
21830 OPTAB_DIRECT);
21831 else
21833 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21834 two31 = ix86_build_const_vector (intmode, 1, two31);
21835 *xorp = expand_simple_binop (intmode, AND,
21836 gen_lowpart (intmode, tmp[0]),
21837 two31, NULL_RTX, 0,
21838 OPTAB_DIRECT);
21840 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21841 0, OPTAB_DIRECT);
21844 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21845 then replicate the value for all elements of the vector
21846 register. */
21849 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21851 int i, n_elt;
21852 rtvec v;
21853 machine_mode scalar_mode;
21855 switch (mode)
21857 case V64QImode:
21858 case V32QImode:
21859 case V16QImode:
21860 case V32HImode:
21861 case V16HImode:
21862 case V8HImode:
21863 case V16SImode:
21864 case V8SImode:
21865 case V4SImode:
21866 case V8DImode:
21867 case V4DImode:
21868 case V2DImode:
21869 gcc_assert (vect);
21870 /* FALLTHRU */
21871 case V16SFmode:
21872 case V8SFmode:
21873 case V4SFmode:
21874 case V8DFmode:
21875 case V4DFmode:
21876 case V2DFmode:
21877 n_elt = GET_MODE_NUNITS (mode);
21878 v = rtvec_alloc (n_elt);
21879 scalar_mode = GET_MODE_INNER (mode);
21881 RTVEC_ELT (v, 0) = value;
21883 for (i = 1; i < n_elt; ++i)
21884 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21886 return gen_rtx_CONST_VECTOR (mode, v);
21888 default:
21889 gcc_unreachable ();
21893 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21894 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21895 for an SSE register. If VECT is true, then replicate the mask for
21896 all elements of the vector register. If INVERT is true, then create
21897 a mask excluding the sign bit. */
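/* For example (assuming V4SFmode with VECT set), the constant produced is
   { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } viewed as floats, or
   its complement when INVERT is set, suitable for use with a single
   andps/andnps/xorps/orps.  */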
21900 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21902 machine_mode vec_mode, imode;
21903 wide_int w;
21904 rtx mask, v;
21906 switch (mode)
21908 case V16SImode:
21909 case V16SFmode:
21910 case V8SImode:
21911 case V4SImode:
21912 case V8SFmode:
21913 case V4SFmode:
21914 vec_mode = mode;
21915 imode = SImode;
21916 break;
21918 case V8DImode:
21919 case V4DImode:
21920 case V2DImode:
21921 case V8DFmode:
21922 case V4DFmode:
21923 case V2DFmode:
21924 vec_mode = mode;
21925 imode = DImode;
21926 break;
21928 case TImode:
21929 case TFmode:
21930 vec_mode = VOIDmode;
21931 imode = TImode;
21932 break;
21934 default:
21935 gcc_unreachable ();
21938 machine_mode inner_mode = GET_MODE_INNER (mode);
21939 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21940 GET_MODE_BITSIZE (inner_mode));
21941 if (invert)
21942 w = wi::bit_not (w);
21944 /* Force this value into the low part of a fp vector constant. */
21945 mask = immed_wide_int_const (w, imode);
21946 mask = gen_lowpart (inner_mode, mask);
21948 if (vec_mode == VOIDmode)
21949 return force_reg (inner_mode, mask);
21951 v = ix86_build_const_vector (vec_mode, vect, mask);
21952 return force_reg (vec_mode, v);
21955 /* Generate code for floating point ABS or NEG. */
21957 void
21958 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21959 rtx operands[])
21961 rtx mask, set, dst, src;
21962 bool use_sse = false;
21963 bool vector_mode = VECTOR_MODE_P (mode);
21964 machine_mode vmode = mode;
21966 if (vector_mode)
21967 use_sse = true;
21968 else if (mode == TFmode)
21969 use_sse = true;
21970 else if (TARGET_SSE_MATH)
21972 use_sse = SSE_FLOAT_MODE_P (mode);
21973 if (mode == SFmode)
21974 vmode = V4SFmode;
21975 else if (mode == DFmode)
21976 vmode = V2DFmode;
21979 /* NEG and ABS performed with SSE use bitwise mask operations.
21980 Create the appropriate mask now. */
21981 if (use_sse)
21982 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21983 else
21984 mask = NULL_RTX;
21986 dst = operands[0];
21987 src = operands[1];
21989 set = gen_rtx_fmt_e (code, mode, src);
21990 set = gen_rtx_SET (dst, set);
21992 if (mask)
21994 rtx use, clob;
21995 rtvec par;
21997 use = gen_rtx_USE (VOIDmode, mask);
21998 if (vector_mode)
21999 par = gen_rtvec (2, set, use);
22000 else
22002 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22003 par = gen_rtvec (3, set, use, clob);
22005 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22007 else
22008 emit_insn (set);
22011 /* Expand a copysign operation. Special case operand 0 being a constant. */
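/* The underlying identity (a sketch): copysign (x, y) is computed as
   (x & ~signbit-mask) | (y & signbit-mask), so only the sign bit of Y
   survives into the result.  */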
22013 void
22014 ix86_expand_copysign (rtx operands[])
22016 machine_mode mode, vmode;
22017 rtx dest, op0, op1, mask, nmask;
22019 dest = operands[0];
22020 op0 = operands[1];
22021 op1 = operands[2];
22023 mode = GET_MODE (dest);
22025 if (mode == SFmode)
22026 vmode = V4SFmode;
22027 else if (mode == DFmode)
22028 vmode = V2DFmode;
22029 else
22030 vmode = mode;
22032 if (CONST_DOUBLE_P (op0))
22034 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22036 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22037 op0 = simplify_unary_operation (ABS, mode, op0, mode);
22039 if (mode == SFmode || mode == DFmode)
22041 if (op0 == CONST0_RTX (mode))
22042 op0 = CONST0_RTX (vmode);
22043 else
22045 rtx v = ix86_build_const_vector (vmode, false, op0);
22047 op0 = force_reg (vmode, v);
22050 else if (op0 != CONST0_RTX (mode))
22051 op0 = force_reg (mode, op0);
22053 mask = ix86_build_signbit_mask (vmode, 0, 0);
22055 if (mode == SFmode)
22056 copysign_insn = gen_copysignsf3_const;
22057 else if (mode == DFmode)
22058 copysign_insn = gen_copysigndf3_const;
22059 else
22060 copysign_insn = gen_copysigntf3_const;
22062 emit_insn (copysign_insn (dest, op0, op1, mask));
22064 else
22066 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22068 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22069 mask = ix86_build_signbit_mask (vmode, 0, 0);
22071 if (mode == SFmode)
22072 copysign_insn = gen_copysignsf3_var;
22073 else if (mode == DFmode)
22074 copysign_insn = gen_copysigndf3_var;
22075 else
22076 copysign_insn = gen_copysigntf3_var;
22078 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22082 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22083 be a constant, and so has already been expanded into a vector constant. */
22085 void
22086 ix86_split_copysign_const (rtx operands[])
22088 machine_mode mode, vmode;
22089 rtx dest, op0, mask, x;
22091 dest = operands[0];
22092 op0 = operands[1];
22093 mask = operands[3];
22095 mode = GET_MODE (dest);
22096 vmode = GET_MODE (mask);
22098 dest = lowpart_subreg (vmode, dest, mode);
22099 x = gen_rtx_AND (vmode, dest, mask);
22100 emit_insn (gen_rtx_SET (dest, x));
22102 if (op0 != CONST0_RTX (vmode))
22104 x = gen_rtx_IOR (vmode, dest, op0);
22105 emit_insn (gen_rtx_SET (dest, x));
22109 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22110 so we have to do two masks. */
22112 void
22113 ix86_split_copysign_var (rtx operands[])
22115 machine_mode mode, vmode;
22116 rtx dest, scratch, op0, op1, mask, nmask, x;
22118 dest = operands[0];
22119 scratch = operands[1];
22120 op0 = operands[2];
22121 op1 = operands[3];
22122 nmask = operands[4];
22123 mask = operands[5];
22125 mode = GET_MODE (dest);
22126 vmode = GET_MODE (mask);
22128 if (rtx_equal_p (op0, op1))
22130 /* Shouldn't happen often (it's useless, obviously), but when it does
22131 we'd generate incorrect code if we continue below. */
22132 emit_move_insn (dest, op0);
22133 return;
22136 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
22138 gcc_assert (REGNO (op1) == REGNO (scratch));
22140 x = gen_rtx_AND (vmode, scratch, mask);
22141 emit_insn (gen_rtx_SET (scratch, x));
22143 dest = mask;
22144 op0 = lowpart_subreg (vmode, op0, mode);
22145 x = gen_rtx_NOT (vmode, dest);
22146 x = gen_rtx_AND (vmode, x, op0);
22147 emit_insn (gen_rtx_SET (dest, x));
22149 else
22151 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
22153 x = gen_rtx_AND (vmode, scratch, mask);
22155 else /* alternative 2,4 */
22157 gcc_assert (REGNO (mask) == REGNO (scratch));
22158 op1 = lowpart_subreg (vmode, op1, mode);
22159 x = gen_rtx_AND (vmode, scratch, op1);
22161 emit_insn (gen_rtx_SET (scratch, x));
22163 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
22165 dest = lowpart_subreg (vmode, op0, mode);
22166 x = gen_rtx_AND (vmode, dest, nmask);
22168 else /* alternative 3,4 */
22170 gcc_assert (REGNO (nmask) == REGNO (dest));
22171 dest = nmask;
22172 op0 = lowpart_subreg (vmode, op0, mode);
22173 x = gen_rtx_AND (vmode, dest, op0);
22175 emit_insn (gen_rtx_SET (dest, x));
22178 x = gen_rtx_IOR (vmode, dest, scratch);
22179 emit_insn (gen_rtx_SET (dest, x));
22182 /* Return TRUE or FALSE depending on whether the first SET in INSN
22183 has source and destination with matching CC modes and whether the
22184 CC mode is at least as constrained as REQ_MODE. */
22186 bool
22187 ix86_match_ccmode (rtx insn, machine_mode req_mode)
22189 rtx set;
22190 machine_mode set_mode;
22192 set = PATTERN (insn);
22193 if (GET_CODE (set) == PARALLEL)
22194 set = XVECEXP (set, 0, 0);
22195 gcc_assert (GET_CODE (set) == SET);
22196 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
22198 set_mode = GET_MODE (SET_DEST (set));
22199 switch (set_mode)
22201 case CCNOmode:
22202 if (req_mode != CCNOmode
22203 && (req_mode != CCmode
22204 || XEXP (SET_SRC (set), 1) != const0_rtx))
22205 return false;
22206 break;
22207 case CCmode:
22208 if (req_mode == CCGCmode)
22209 return false;
22210 /* FALLTHRU */
22211 case CCGCmode:
22212 if (req_mode == CCGOCmode || req_mode == CCNOmode)
22213 return false;
22214 /* FALLTHRU */
22215 case CCGOCmode:
22216 if (req_mode == CCZmode)
22217 return false;
22218 /* FALLTHRU */
22219 case CCZmode:
22220 break;
22222 case CCAmode:
22223 case CCCmode:
22224 case CCOmode:
22225 case CCPmode:
22226 case CCSmode:
22227 if (set_mode != req_mode)
22228 return false;
22229 break;
22231 default:
22232 gcc_unreachable ();
22235 return GET_MODE (SET_SRC (set)) == set_mode;
22238 /* Generate insn patterns to do an integer compare of OPERANDS. */
22240 static rtx
22241 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22243 machine_mode cmpmode;
22244 rtx tmp, flags;
22246 cmpmode = SELECT_CC_MODE (code, op0, op1);
22247 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22249 /* This is very simple, but making the interface the same as in the
22250 FP case makes the rest of the code easier. */
22251 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22252 emit_insn (gen_rtx_SET (flags, tmp));
22254 /* Return the test that should be put into the flags user, i.e.
22255 the bcc, scc, or cmov instruction. */
22256 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22259 /* Figure out whether to use ordered or unordered fp comparisons.
22260 Return the appropriate mode to use. */
22262 machine_mode
22263 ix86_fp_compare_mode (enum rtx_code)
22265 /* ??? In order to make all comparisons reversible, we do all comparisons
22266 non-trapping when compiling for IEEE. Once gcc is able to distinguish
22267 all forms of trapping and nontrapping comparisons, we can make inequality
22268 comparisons trapping again, since it results in better code when using
22269 FCOM based compares. */
22270 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
22273 machine_mode
22274 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22276 machine_mode mode = GET_MODE (op0);
22278 if (SCALAR_FLOAT_MODE_P (mode))
22280 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22281 return ix86_fp_compare_mode (code);
22284 switch (code)
22286 /* Only zero flag is needed. */
22287 case EQ: /* ZF=0 */
22288 case NE: /* ZF!=0 */
22289 return CCZmode;
22290 /* Codes needing carry flag. */
22291 case GEU: /* CF=0 */
22292 case LTU: /* CF=1 */
22293 /* Detect overflow checks. They need just the carry flag. */
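/* E.g. the C idiom "if (a + b < a)" for unsigned overflow detection compares
   a PLUS against one of its own operands and only needs CF.  */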
22294 if (GET_CODE (op0) == PLUS
22295 && (rtx_equal_p (op1, XEXP (op0, 0))
22296 || rtx_equal_p (op1, XEXP (op0, 1))))
22297 return CCCmode;
22298 else
22299 return CCmode;
22300 case GTU: /* CF=0 & ZF=0 */
22301 case LEU: /* CF=1 | ZF=1 */
22302 return CCmode;
22303 /* Codes possibly doable only with sign flag when
22304 comparing against zero. */
22305 case GE: /* SF=OF or SF=0 */
22306 case LT: /* SF<>OF or SF=1 */
22307 if (op1 == const0_rtx)
22308 return CCGOCmode;
22309 else
22310 /* For other cases the Carry flag is not required. */
22311 return CCGCmode;
22312 /* Codes doable only with the sign flag when comparing
22313 against zero, but for which we lack a jump instruction,
22314 so we need to use relational tests against overflow,
22315 which thus needs to be zero. */
22316 case GT: /* ZF=0 & SF=OF */
22317 case LE: /* ZF=1 | SF<>OF */
22318 if (op1 == const0_rtx)
22319 return CCNOmode;
22320 else
22321 return CCGCmode;
22322 /* The strcmp pattern does (use flags), and combine may ask us for a proper
22323 mode. */
22324 case USE:
22325 return CCmode;
22326 default:
22327 gcc_unreachable ();
22331 /* Return the fixed registers used for condition codes. */
22333 static bool
22334 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22336 *p1 = FLAGS_REG;
22337 *p2 = FPSR_REG;
22338 return true;
22341 /* If two condition code modes are compatible, return a condition code
22342 mode which is compatible with both. Otherwise, return
22343 VOIDmode. */
22345 static machine_mode
22346 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22348 if (m1 == m2)
22349 return m1;
22351 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22352 return VOIDmode;
22354 if ((m1 == CCGCmode && m2 == CCGOCmode)
22355 || (m1 == CCGOCmode && m2 == CCGCmode))
22356 return CCGCmode;
22358 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
22359 return m2;
22360 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
22361 return m1;
22363 switch (m1)
22365 default:
22366 gcc_unreachable ();
22368 case CCmode:
22369 case CCGCmode:
22370 case CCGOCmode:
22371 case CCNOmode:
22372 case CCAmode:
22373 case CCCmode:
22374 case CCOmode:
22375 case CCPmode:
22376 case CCSmode:
22377 case CCZmode:
22378 switch (m2)
22380 default:
22381 return VOIDmode;
22383 case CCmode:
22384 case CCGCmode:
22385 case CCGOCmode:
22386 case CCNOmode:
22387 case CCAmode:
22388 case CCCmode:
22389 case CCOmode:
22390 case CCPmode:
22391 case CCSmode:
22392 case CCZmode:
22393 return CCmode;
22396 case CCFPmode:
22397 case CCFPUmode:
22398 /* These are only compatible with themselves, which we already
22399 checked above. */
22400 return VOIDmode;
22405 /* Return a comparison we can do that is equivalent to
22406 swap_condition (code), apart possibly from orderedness.
22407 But never change orderedness if TARGET_IEEE_FP, returning
22408 UNKNOWN in that case if necessary. */
22410 static enum rtx_code
22411 ix86_fp_swap_condition (enum rtx_code code)
22413 switch (code)
22415 case GT: /* GTU - CF=0 & ZF=0 */
22416 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22417 case GE: /* GEU - CF=0 */
22418 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22419 case UNLT: /* LTU - CF=1 */
22420 return TARGET_IEEE_FP ? UNKNOWN : GT;
22421 case UNLE: /* LEU - CF=1 | ZF=1 */
22422 return TARGET_IEEE_FP ? UNKNOWN : GE;
22423 default:
22424 return swap_condition (code);
22428 /* Return cost of comparison CODE using the best strategy for performance.
22429 All following functions use the number of instructions as the cost metric.
22430 In future this should be tweaked to compute bytes for optimize_size and
22431 take into account performance of various instructions on various CPUs. */
22433 static int
22434 ix86_fp_comparison_cost (enum rtx_code code)
22436 int arith_cost;
22438 /* The cost of code using bit-twiddling on %ah. */
22439 switch (code)
22441 case UNLE:
22442 case UNLT:
22443 case LTGT:
22444 case GT:
22445 case GE:
22446 case UNORDERED:
22447 case ORDERED:
22448 case UNEQ:
22449 arith_cost = 4;
22450 break;
22451 case LT:
22452 case NE:
22453 case EQ:
22454 case UNGE:
22455 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22456 break;
22457 case LE:
22458 case UNGT:
22459 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22460 break;
22461 default:
22462 gcc_unreachable ();
22465 switch (ix86_fp_comparison_strategy (code))
22467 case IX86_FPCMP_COMI:
22468 return arith_cost > 4 ? 3 : 2;
22469 case IX86_FPCMP_SAHF:
22470 return arith_cost > 4 ? 4 : 3;
22471 default:
22472 return arith_cost;
22476 /* Return strategy to use for floating-point. We assume that fcomi is always
22477 preferable where available, since that is also true when looking at size
22478 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22480 enum ix86_fpcmp_strategy
22481 ix86_fp_comparison_strategy (enum rtx_code)
22483 /* Do fcomi/sahf based test when profitable. */
22485 if (TARGET_CMOVE)
22486 return IX86_FPCMP_COMI;
22488 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22489 return IX86_FPCMP_SAHF;
22491 return IX86_FPCMP_ARITH;
22494 /* Swap, force into registers, or otherwise massage the two operands
22495 to a fp comparison. The operands are updated in place; the new
22496 comparison code is returned. */
22498 static enum rtx_code
22499 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22501 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
22502 rtx op0 = *pop0, op1 = *pop1;
22503 machine_mode op_mode = GET_MODE (op0);
22504 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22506 /* All of the unordered compare instructions only work on registers.
22507 The same is true of the fcomi compare instructions. The XFmode
22508 compare instructions require registers except when comparing
22509 against zero or when converting operand 1 from fixed point to
22510 floating point. */
22512 if (!is_sse
22513 && (fpcmp_mode == CCFPUmode
22514 || (op_mode == XFmode
22515 && ! (standard_80387_constant_p (op0) == 1
22516 || standard_80387_constant_p (op1) == 1)
22517 && GET_CODE (op1) != FLOAT)
22518 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22520 op0 = force_reg (op_mode, op0);
22521 op1 = force_reg (op_mode, op1);
22523 else
22525 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22526 things around if they appear profitable, otherwise force op0
22527 into a register. */
22529 if (standard_80387_constant_p (op0) == 0
22530 || (MEM_P (op0)
22531 && ! (standard_80387_constant_p (op1) == 0
22532 || MEM_P (op1))))
22534 enum rtx_code new_code = ix86_fp_swap_condition (code);
22535 if (new_code != UNKNOWN)
22537 std::swap (op0, op1);
22538 code = new_code;
22542 if (!REG_P (op0))
22543 op0 = force_reg (op_mode, op0);
22545 if (CONSTANT_P (op1))
22547 int tmp = standard_80387_constant_p (op1);
22548 if (tmp == 0)
22549 op1 = validize_mem (force_const_mem (op_mode, op1));
22550 else if (tmp == 1)
22552 if (TARGET_CMOVE)
22553 op1 = force_reg (op_mode, op1);
22555 else
22556 op1 = force_reg (op_mode, op1);
22560 /* Try to rearrange the comparison to make it cheaper. */
22561 if (ix86_fp_comparison_cost (code)
22562 > ix86_fp_comparison_cost (swap_condition (code))
22563 && (REG_P (op1) || can_create_pseudo_p ()))
22565 std::swap (op0, op1);
22566 code = swap_condition (code);
22567 if (!REG_P (op0))
22568 op0 = force_reg (op_mode, op0);
22571 *pop0 = op0;
22572 *pop1 = op1;
22573 return code;
22576 /* Convert the comparison codes we use to represent FP comparisons to the
22577 integer code that will result in a proper branch. Return UNKNOWN if no
22578 such code is available. */
22580 enum rtx_code
22581 ix86_fp_compare_code_to_integer (enum rtx_code code)
22583 switch (code)
22585 case GT:
22586 return GTU;
22587 case GE:
22588 return GEU;
22589 case ORDERED:
22590 case UNORDERED:
22591 return code;
22592 case UNEQ:
22593 return EQ;
22594 case UNLT:
22595 return LTU;
22596 case UNLE:
22597 return LEU;
22598 case LTGT:
22599 return NE;
22600 default:
22601 return UNKNOWN;
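/* Editor's note (illustrative): after an fcomi-style compare the FPU
   condition codes land in ZF, PF and CF, with "unordered" setting all
   three, so the flags look exactly like those of an unsigned integer
   compare.  That is why the mapping above turns GT into GTU ("above",
   CF=0 && ZF=0), UNLT into LTU ("below", CF=1), UNEQ into EQ (ZF=1),
   and keeps ORDERED/UNORDERED, which only look at PF.  */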
22605 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22607 static rtx
22608 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22610 machine_mode fpcmp_mode, intcmp_mode;
22611 rtx tmp, tmp2;
22613 fpcmp_mode = ix86_fp_compare_mode (code);
22614 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22616 /* Do fcomi/sahf based test when profitable. */
22617 switch (ix86_fp_comparison_strategy (code))
22619 case IX86_FPCMP_COMI:
22620 intcmp_mode = fpcmp_mode;
22621 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22622 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22623 emit_insn (tmp);
22624 break;
22626 case IX86_FPCMP_SAHF:
22627 intcmp_mode = fpcmp_mode;
22628 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22629 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22631 if (!scratch)
22632 scratch = gen_reg_rtx (HImode);
22633 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22634 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22635 break;
22637 case IX86_FPCMP_ARITH:
22638 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22639 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22640 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22641 if (!scratch)
22642 scratch = gen_reg_rtx (HImode);
22643 emit_insn (gen_rtx_SET (scratch, tmp2));
22645 /* In the unordered case, we have to check C2 for NaN's, which
22646 doesn't happen to work out to anything nice combination-wise.
22647 So do some bit twiddling on the value we've got in AH to come
22648 up with an appropriate set of condition codes. */
22650 intcmp_mode = CCNOmode;
22651 switch (code)
22653 case GT:
22654 case UNGT:
22655 if (code == GT || !TARGET_IEEE_FP)
22657 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22658 code = EQ;
22660 else
22662 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22663 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22664 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22665 intcmp_mode = CCmode;
22666 code = GEU;
22668 break;
22669 case LT:
22670 case UNLT:
22671 if (code == LT && TARGET_IEEE_FP)
22673 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22674 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22675 intcmp_mode = CCmode;
22676 code = EQ;
22678 else
22680 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22681 code = NE;
22683 break;
22684 case GE:
22685 case UNGE:
22686 if (code == GE || !TARGET_IEEE_FP)
22688 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22689 code = EQ;
22691 else
22693 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22694 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22695 code = NE;
22697 break;
22698 case LE:
22699 case UNLE:
22700 if (code == LE && TARGET_IEEE_FP)
22702 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22703 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22704 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22705 intcmp_mode = CCmode;
22706 code = LTU;
22708 else
22710 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22711 code = NE;
22713 break;
22714 case EQ:
22715 case UNEQ:
22716 if (code == EQ && TARGET_IEEE_FP)
22718 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22719 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22720 intcmp_mode = CCmode;
22721 code = EQ;
22723 else
22725 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22726 code = NE;
22728 break;
22729 case NE:
22730 case LTGT:
22731 if (code == NE && TARGET_IEEE_FP)
22733 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22734 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22735 GEN_INT (0x40)));
22736 code = NE;
22738 else
22740 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22741 code = EQ;
22743 break;
22745 case UNORDERED:
22746 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22747 code = NE;
22748 break;
22749 case ORDERED:
22750 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22751 code = EQ;
22752 break;
22754 default:
22755 gcc_unreachable ();
22757 break;
22759 default:
22760 gcc_unreachable();
22763 /* Return the test that should be put into the flags user, i.e.
22764 the bcc, scc, or cmov instruction. */
22765 return gen_rtx_fmt_ee (code, VOIDmode,
22766 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22767 const0_rtx);
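/* Editor's note (illustrative): after "fnstsw %ax" the FPU condition
   codes appear in %ah as C0 = 0x01, C2 = 0x04 and C3 = 0x40; a compare
   leaves them all clear for "greater", sets C0 for "less", C3 for
   "equal" and all three for "unordered".  That explains the magic masks
   above: e.g. for GT without IEEE checking, "testb $0x45, %ah" is zero
   exactly when C3=C2=C0=0, i.e. when the first operand was greater,
   which is why the branch condition becomes EQ on the resulting flags.  */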
22770 static rtx
22771 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22773 rtx ret;
22775 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22776 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22778 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22780 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22781 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22783 else
22784 ret = ix86_expand_int_compare (code, op0, op1);
22786 return ret;
22789 void
22790 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22792 machine_mode mode = GET_MODE (op0);
22793 rtx tmp;
22795 /* Handle the special case of a vector comparison with a boolean result;
22796 transform it using the ptest instruction. */
22797 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22799 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22800 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22802 gcc_assert (code == EQ || code == NE);
22803 /* Generate XOR since we can't check that one operand is zero vector. */
22804 tmp = gen_reg_rtx (mode);
22805 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22806 tmp = gen_lowpart (p_mode, tmp);
22807 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22808 gen_rtx_UNSPEC (CCmode,
22809 gen_rtvec (2, tmp, tmp),
22810 UNSPEC_PTEST)));
22811 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22812 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22813 gen_rtx_LABEL_REF (VOIDmode, label),
22814 pc_rtx);
22815 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22816 return;
22819 switch (mode)
22821 case SFmode:
22822 case DFmode:
22823 case XFmode:
22824 case QImode:
22825 case HImode:
22826 case SImode:
22827 simple:
22828 tmp = ix86_expand_compare (code, op0, op1);
22829 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22830 gen_rtx_LABEL_REF (VOIDmode, label),
22831 pc_rtx);
22832 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22833 return;
22835 case DImode:
22836 if (TARGET_64BIT)
22837 goto simple;
22838 /* For a 32-bit target, DI comparison may be performed on
22839 SSE registers. To allow this we should avoid splitting
22840 to SI mode, which is achieved by doing the xor in DI mode
22841 and then comparing with zero (which is recognized by the
22842 STV pass). We don't compare using xor when optimizing
22843 for size. */
22844 if (!optimize_insn_for_size_p ()
22845 && TARGET_STV
22846 && (code == EQ || code == NE))
22848 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22849 op1 = const0_rtx;
22851 /* FALLTHRU */
22852 case TImode:
22853 /* Expand DImode branch into multiple compare+branch. */
22855 rtx lo[2], hi[2];
22856 rtx_code_label *label2;
22857 enum rtx_code code1, code2, code3;
22858 machine_mode submode;
22860 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22862 std::swap (op0, op1);
22863 code = swap_condition (code);
22866 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22867 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22869 submode = mode == DImode ? SImode : DImode;
22871 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22872 avoid two branches. This costs one extra insn, so disable when
22873 optimizing for size. */
22875 if ((code == EQ || code == NE)
22876 && (!optimize_insn_for_size_p ()
22877 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22879 rtx xor0, xor1;
22881 xor1 = hi[0];
22882 if (hi[1] != const0_rtx)
22883 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22884 NULL_RTX, 0, OPTAB_WIDEN);
22886 xor0 = lo[0];
22887 if (lo[1] != const0_rtx)
22888 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22889 NULL_RTX, 0, OPTAB_WIDEN);
22891 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22892 NULL_RTX, 0, OPTAB_WIDEN);
22894 ix86_expand_branch (code, tmp, const0_rtx, label);
22895 return;
22898 /* Otherwise, if we are doing less-than or greater-or-equal-than,
22899 op1 is a constant and the low word is zero, then we can just
22900 examine the high word. Similarly for low word -1 and
22901 less-or-equal-than or greater-than. */
22903 if (CONST_INT_P (hi[1]))
22904 switch (code)
22906 case LT: case LTU: case GE: case GEU:
22907 if (lo[1] == const0_rtx)
22909 ix86_expand_branch (code, hi[0], hi[1], label);
22910 return;
22912 break;
22913 case LE: case LEU: case GT: case GTU:
22914 if (lo[1] == constm1_rtx)
22916 ix86_expand_branch (code, hi[0], hi[1], label);
22917 return;
22919 break;
22920 default:
22921 break;
22924 /* Otherwise, we need two or three jumps. */
22926 label2 = gen_label_rtx ();
22928 code1 = code;
22929 code2 = swap_condition (code);
22930 code3 = unsigned_condition (code);
22932 switch (code)
22934 case LT: case GT: case LTU: case GTU:
22935 break;
22937 case LE: code1 = LT; code2 = GT; break;
22938 case GE: code1 = GT; code2 = LT; break;
22939 case LEU: code1 = LTU; code2 = GTU; break;
22940 case GEU: code1 = GTU; code2 = LTU; break;
22942 case EQ: code1 = UNKNOWN; code2 = NE; break;
22943 case NE: code2 = UNKNOWN; break;
22945 default:
22946 gcc_unreachable ();
22950 * a < b =>
22951 * if (hi(a) < hi(b)) goto true;
22952 * if (hi(a) > hi(b)) goto false;
22953 * if (lo(a) < lo(b)) goto true;
22954 * false:
22957 if (code1 != UNKNOWN)
22958 ix86_expand_branch (code1, hi[0], hi[1], label);
22959 if (code2 != UNKNOWN)
22960 ix86_expand_branch (code2, hi[0], hi[1], label2);
22962 ix86_expand_branch (code3, lo[0], lo[1], label);
22964 if (code2 != UNKNOWN)
22965 emit_label (label2);
22966 return;
22969 default:
22970 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22971 goto simple;
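/* Editor's sketch (plain C, not part of GCC; the helper names are the
   editor's own): the double-word tricks ix86_expand_branch uses above,
   shown on 32-bit halves.  Equality folds both halves into one test;
   a signed less-than tries the high halves first and falls back to an
   unsigned compare of the low halves, as in the pseudo-code comment.  */

static inline int
dword_eq (unsigned lo0, int hi0, unsigned lo1, int hi1)
{
  /* (hi0 ^ hi1) | (lo0 ^ lo1) is zero iff both halves match.  */
  return (((unsigned) hi0 ^ (unsigned) hi1) | (lo0 ^ lo1)) == 0;
}

static inline int
dword_lt (unsigned lo0, int hi0, unsigned lo1, int hi1)
{
  if (hi0 != hi1)
    return hi0 < hi1;		/* signed compare on the high halves */
  return lo0 < lo1;		/* unsigned compare on the low halves */
}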
22975 /* Split branch based on floating point condition. */
22976 void
22977 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
22978 rtx target1, rtx target2, rtx tmp)
22980 rtx condition;
22981 rtx_insn *i;
22983 if (target2 != pc_rtx)
22985 std::swap (target1, target2);
22986 code = reverse_condition_maybe_unordered (code);
22989 condition = ix86_expand_fp_compare (code, op1, op2,
22990 tmp);
22992 i = emit_jump_insn (gen_rtx_SET
22993 (pc_rtx,
22994 gen_rtx_IF_THEN_ELSE (VOIDmode,
22995 condition, target1, target2)));
22996 if (split_branch_probability >= 0)
22997 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
23000 void
23001 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23003 rtx ret;
23005 gcc_assert (GET_MODE (dest) == QImode);
23007 ret = ix86_expand_compare (code, op0, op1);
23008 PUT_MODE (ret, QImode);
23009 emit_insn (gen_rtx_SET (dest, ret));
23012 /* Expand a comparison setting or clearing the carry flag. Return true when
23013 successful and set *POP for the operation. */
23014 static bool
23015 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23017 machine_mode mode =
23018 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23020 /* Do not handle double-mode compares that go through the special path. */
23021 if (mode == (TARGET_64BIT ? TImode : DImode))
23022 return false;
23024 if (SCALAR_FLOAT_MODE_P (mode))
23026 rtx compare_op;
23027 rtx_insn *compare_seq;
23029 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23031 /* Shortcut: the following common codes never translate
23032 into carry-flag compares. */
23033 if (code == EQ || code == NE || code == UNEQ || code == LTGT
23034 || code == ORDERED || code == UNORDERED)
23035 return false;
23037 /* These comparisons require the zero flag; swap the operands so they won't. */
23038 if ((code == GT || code == UNLE || code == LE || code == UNGT)
23039 && !TARGET_IEEE_FP)
23041 std::swap (op0, op1);
23042 code = swap_condition (code);
23045 /* Try to expand the comparison and verify that we end up with a
23046 carry-flag-based comparison. This fails to be true only when
23047 we decide to expand the comparison using arithmetic, which is not
23048 a common scenario. */
23049 start_sequence ();
23050 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23051 compare_seq = get_insns ();
23052 end_sequence ();
23054 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
23055 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
23056 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23057 else
23058 code = GET_CODE (compare_op);
23060 if (code != LTU && code != GEU)
23061 return false;
23063 emit_insn (compare_seq);
23064 *pop = compare_op;
23065 return true;
23068 if (!INTEGRAL_MODE_P (mode))
23069 return false;
23071 switch (code)
23073 case LTU:
23074 case GEU:
23075 break;
23077 /* Convert a==0 into (unsigned)a<1. */
23078 case EQ:
23079 case NE:
23080 if (op1 != const0_rtx)
23081 return false;
23082 op1 = const1_rtx;
23083 code = (code == EQ ? LTU : GEU);
23084 break;
23086 /* Convert a>b into b<a or a>=b-1. */
23087 case GTU:
23088 case LEU:
23089 if (CONST_INT_P (op1))
23091 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23092 /* Bail out on overflow. We could still swap the operands, but that
23093 would force loading of the constant into a register. */
23094 if (op1 == const0_rtx
23095 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23096 return false;
23097 code = (code == GTU ? GEU : LTU);
23099 else
23101 std::swap (op0, op1);
23102 code = (code == GTU ? LTU : GEU);
23104 break;
23106 /* Convert a>=0 into (unsigned)a<0x80000000. */
23107 case LT:
23108 case GE:
23109 if (mode == DImode || op1 != const0_rtx)
23110 return false;
23111 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23112 code = (code == LT ? GEU : LTU);
23113 break;
23114 case LE:
23115 case GT:
23116 if (mode == DImode || op1 != constm1_rtx)
23117 return false;
23118 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23119 code = (code == LE ? GEU : LTU);
23120 break;
23122 default:
23123 return false;
23125 /* Swapping operands may cause a constant to appear as the first operand. */
23126 if (!nonimmediate_operand (op0, VOIDmode))
23128 if (!can_create_pseudo_p ())
23129 return false;
23130 op0 = force_reg (mode, op0);
23132 *pop = ix86_expand_compare (code, op0, op1);
23133 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
23134 return true;
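/* Editor's sketch (plain C, not part of GCC; names are illustrative):
   the scalar rewrites used above so that only LTU/GEU -- i.e. the carry
   flag -- ever needs to be tested.  */

static inline int eq0_as_ltu (unsigned a)	/* a == 0 */
{ return a < 1u; }

static inline int gtu_as_geu (unsigned a, unsigned b)	/* a > b; valid only when b + 1 does not wrap */
{ return a >= b + 1u; }

static inline int lt0_as_geu (int a)		/* a < 0; 32-bit int assumed */
{ return (unsigned) a >= 0x80000000u; }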
23137 bool
23138 ix86_expand_int_movcc (rtx operands[])
23140 enum rtx_code code = GET_CODE (operands[1]), compare_code;
23141 rtx_insn *compare_seq;
23142 rtx compare_op;
23143 machine_mode mode = GET_MODE (operands[0]);
23144 bool sign_bit_compare_p = false;
23145 rtx op0 = XEXP (operands[1], 0);
23146 rtx op1 = XEXP (operands[1], 1);
23148 if (GET_MODE (op0) == TImode
23149 || (GET_MODE (op0) == DImode
23150 && !TARGET_64BIT))
23151 return false;
23153 start_sequence ();
23154 compare_op = ix86_expand_compare (code, op0, op1);
23155 compare_seq = get_insns ();
23156 end_sequence ();
23158 compare_code = GET_CODE (compare_op);
23160 if ((op1 == const0_rtx && (code == GE || code == LT))
23161 || (op1 == constm1_rtx && (code == GT || code == LE)))
23162 sign_bit_compare_p = true;
23164 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
23165 HImode insns, we'd be swallowed in word prefix ops. */
23167 if ((mode != HImode || TARGET_FAST_PREFIX)
23168 && (mode != (TARGET_64BIT ? TImode : DImode))
23169 && CONST_INT_P (operands[2])
23170 && CONST_INT_P (operands[3]))
23172 rtx out = operands[0];
23173 HOST_WIDE_INT ct = INTVAL (operands[2]);
23174 HOST_WIDE_INT cf = INTVAL (operands[3]);
23175 HOST_WIDE_INT diff;
23177 diff = ct - cf;
23178 /* Sign-bit compares are better done using shifts than by using
23179 sbb. */
23180 if (sign_bit_compare_p
23181 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
23183 /* Detect overlap between destination and compare sources. */
23184 rtx tmp = out;
23186 if (!sign_bit_compare_p)
23188 rtx flags;
23189 bool fpcmp = false;
23191 compare_code = GET_CODE (compare_op);
23193 flags = XEXP (compare_op, 0);
23195 if (GET_MODE (flags) == CCFPmode
23196 || GET_MODE (flags) == CCFPUmode)
23198 fpcmp = true;
23199 compare_code
23200 = ix86_fp_compare_code_to_integer (compare_code);
23203 /* To simplify the rest of the code, restrict to the GEU case. */
23204 if (compare_code == LTU)
23206 std::swap (ct, cf);
23207 compare_code = reverse_condition (compare_code);
23208 code = reverse_condition (code);
23210 else
23212 if (fpcmp)
23213 PUT_CODE (compare_op,
23214 reverse_condition_maybe_unordered
23215 (GET_CODE (compare_op)));
23216 else
23217 PUT_CODE (compare_op,
23218 reverse_condition (GET_CODE (compare_op)));
23220 diff = ct - cf;
23222 if (reg_overlap_mentioned_p (out, op0)
23223 || reg_overlap_mentioned_p (out, op1))
23224 tmp = gen_reg_rtx (mode);
23226 if (mode == DImode)
23227 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23228 else
23229 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
23230 flags, compare_op));
23232 else
23234 if (code == GT || code == GE)
23235 code = reverse_condition (code);
23236 else
23238 std::swap (ct, cf);
23239 diff = ct - cf;
23241 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23244 if (diff == 1)
23247 * cmpl op0,op1
23248 * sbbl dest,dest
23249 * [addl dest, ct]
23251 * Size 5 - 8.
23253 if (ct)
23254 tmp = expand_simple_binop (mode, PLUS,
23255 tmp, GEN_INT (ct),
23256 copy_rtx (tmp), 1, OPTAB_DIRECT);
23258 else if (cf == -1)
23261 * cmpl op0,op1
23262 * sbbl dest,dest
23263 * orl $ct, dest
23265 * Size 8.
23267 tmp = expand_simple_binop (mode, IOR,
23268 tmp, GEN_INT (ct),
23269 copy_rtx (tmp), 1, OPTAB_DIRECT);
23271 else if (diff == -1 && ct)
23274 * cmpl op0,op1
23275 * sbbl dest,dest
23276 * notl dest
23277 * [addl dest, cf]
23279 * Size 8 - 11.
23281 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23282 if (cf)
23283 tmp = expand_simple_binop (mode, PLUS,
23284 copy_rtx (tmp), GEN_INT (cf),
23285 copy_rtx (tmp), 1, OPTAB_DIRECT);
23287 else
23290 * cmpl op0,op1
23291 * sbbl dest,dest
23292 * [notl dest]
23293 * andl cf - ct, dest
23294 * [addl dest, ct]
23296 * Size 8 - 11.
23299 if (cf == 0)
23301 cf = ct;
23302 ct = 0;
23303 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23306 tmp = expand_simple_binop (mode, AND,
23307 copy_rtx (tmp),
23308 gen_int_mode (cf - ct, mode),
23309 copy_rtx (tmp), 1, OPTAB_DIRECT);
23310 if (ct)
23311 tmp = expand_simple_binop (mode, PLUS,
23312 copy_rtx (tmp), GEN_INT (ct),
23313 copy_rtx (tmp), 1, OPTAB_DIRECT);
23316 if (!rtx_equal_p (tmp, out))
23317 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23319 return true;
23322 if (diff < 0)
23324 machine_mode cmp_mode = GET_MODE (op0);
23325 enum rtx_code new_code;
23327 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23329 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23331 /* We may be reversing an unordered compare to a normal compare, which
23332 is not valid in general (we may convert a non-trapping condition
23333 to a trapping one); however, on i386 we currently emit all
23334 comparisons unordered. */
23335 new_code = reverse_condition_maybe_unordered (code);
23337 else
23338 new_code = ix86_reverse_condition (code, cmp_mode);
23339 if (new_code != UNKNOWN)
23341 std::swap (ct, cf);
23342 diff = -diff;
23343 code = new_code;
23347 compare_code = UNKNOWN;
23348 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23349 && CONST_INT_P (op1))
23351 if (op1 == const0_rtx
23352 && (code == LT || code == GE))
23353 compare_code = code;
23354 else if (op1 == constm1_rtx)
23356 if (code == LE)
23357 compare_code = LT;
23358 else if (code == GT)
23359 compare_code = GE;
23363 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23364 if (compare_code != UNKNOWN
23365 && GET_MODE (op0) == GET_MODE (out)
23366 && (cf == -1 || ct == -1))
23368 /* If the lea code below could be used, only optimize
23369 if it results in a 2-insn sequence. */
23371 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23372 || diff == 3 || diff == 5 || diff == 9)
23373 || (compare_code == LT && ct == -1)
23374 || (compare_code == GE && cf == -1))
23377 * notl op1 (if necessary)
23378 * sarl $31, op1
23379 * orl cf, op1
23381 if (ct != -1)
23383 cf = ct;
23384 ct = -1;
23385 code = reverse_condition (code);
23388 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23390 out = expand_simple_binop (mode, IOR,
23391 out, GEN_INT (cf),
23392 out, 1, OPTAB_DIRECT);
23393 if (out != operands[0])
23394 emit_move_insn (operands[0], out);
23396 return true;
23401 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23402 || diff == 3 || diff == 5 || diff == 9)
23403 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23404 && (mode != DImode
23405 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23408 * xorl dest,dest
23409 * cmpl op1,op2
23410 * setcc dest
23411 * lea cf(dest*(ct-cf)),dest
23413 * Size 14.
23415 * This also catches the degenerate setcc-only case.
23418 rtx tmp;
23419 int nops;
23421 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23423 nops = 0;
23424 /* On x86_64 the lea instruction operates on Pmode, so we need
23425 to get the arithmetic done in the proper mode to match. */
23426 if (diff == 1)
23427 tmp = copy_rtx (out);
23428 else
23430 rtx out1;
23431 out1 = copy_rtx (out);
23432 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23433 nops++;
23434 if (diff & 1)
23436 tmp = gen_rtx_PLUS (mode, tmp, out1);
23437 nops++;
23440 if (cf != 0)
23442 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23443 nops++;
23445 if (!rtx_equal_p (tmp, out))
23447 if (nops == 1)
23448 out = force_operand (tmp, copy_rtx (out));
23449 else
23450 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23452 if (!rtx_equal_p (out, operands[0]))
23453 emit_move_insn (operands[0], copy_rtx (out));
23455 return true;
23459 * General case: Jumpful:
23460 * xorl dest,dest cmpl op1, op2
23461 * cmpl op1, op2 movl ct, dest
23462 * setcc dest jcc 1f
23463 * decl dest movl cf, dest
23464 * andl (cf-ct),dest 1:
23465 * addl ct,dest
23467 * Size 20. Size 14.
23469 * This is reasonably steep, but branch mispredict costs are
23470 * high on modern CPUs, so consider failing only if optimizing
23471 * for space.
23474 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23475 && BRANCH_COST (optimize_insn_for_speed_p (),
23476 false) >= 2)
23478 if (cf == 0)
23480 machine_mode cmp_mode = GET_MODE (op0);
23481 enum rtx_code new_code;
23483 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23485 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23487 /* We may be reversing an unordered compare to a normal compare,
23488 which is not valid in general (we may convert a non-trapping
23489 condition to a trapping one); however, on i386 we currently
23490 emit all comparisons unordered. */
23491 new_code = reverse_condition_maybe_unordered (code);
23493 else
23495 new_code = ix86_reverse_condition (code, cmp_mode);
23496 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23497 compare_code = reverse_condition (compare_code);
23500 if (new_code != UNKNOWN)
23502 cf = ct;
23503 ct = 0;
23504 code = new_code;
23508 if (compare_code != UNKNOWN)
23510 /* notl op1 (if needed)
23511 sarl $31, op1
23512 andl (cf-ct), op1
23513 addl ct, op1
23515 For x < 0 (resp. x <= -1) there will be no notl,
23516 so if possible swap the constants to get rid of the
23517 complement.
23518 True/false will be -1/0 while code below (store flag
23519 followed by decrement) is 0/-1, so the constants need
23520 to be exchanged once more. */
23522 if (compare_code == GE || !cf)
23524 code = reverse_condition (code);
23525 compare_code = LT;
23527 else
23528 std::swap (ct, cf);
23530 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23532 else
23534 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23536 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23537 constm1_rtx,
23538 copy_rtx (out), 1, OPTAB_DIRECT);
23541 out = expand_simple_binop (mode, AND, copy_rtx (out),
23542 gen_int_mode (cf - ct, mode),
23543 copy_rtx (out), 1, OPTAB_DIRECT);
23544 if (ct)
23545 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23546 copy_rtx (out), 1, OPTAB_DIRECT);
23547 if (!rtx_equal_p (out, operands[0]))
23548 emit_move_insn (operands[0], copy_rtx (out));
23550 return true;
23554 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23556 /* Try a few things more with specific constants and a variable. */
23558 optab op;
23559 rtx var, orig_out, out, tmp;
23561 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23562 return false;
23564 /* If one of the two operands is an interesting constant, load a
23565 constant with the above and mask it in with a logical operation. */
23567 if (CONST_INT_P (operands[2]))
23569 var = operands[3];
23570 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23571 operands[3] = constm1_rtx, op = and_optab;
23572 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23573 operands[3] = const0_rtx, op = ior_optab;
23574 else
23575 return false;
23577 else if (CONST_INT_P (operands[3]))
23579 var = operands[2];
23580 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23581 operands[2] = constm1_rtx, op = and_optab;
23582 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
23583 operands[2] = const0_rtx, op = ior_optab;
23584 else
23585 return false;
23587 else
23588 return false;
23590 orig_out = operands[0];
23591 tmp = gen_reg_rtx (mode);
23592 operands[0] = tmp;
23594 /* Recurse to get the constant loaded. */
23595 if (!ix86_expand_int_movcc (operands))
23596 return false;
23598 /* Mask in the interesting variable. */
23599 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23600 OPTAB_WIDEN);
23601 if (!rtx_equal_p (out, orig_out))
23602 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23604 return true;
23608 * For comparison with above,
23610 * movl cf,dest
23611 * movl ct,tmp
23612 * cmpl op1,op2
23613 * cmovcc tmp,dest
23615 * Size 15.
23618 if (! nonimmediate_operand (operands[2], mode))
23619 operands[2] = force_reg (mode, operands[2]);
23620 if (! nonimmediate_operand (operands[3], mode))
23621 operands[3] = force_reg (mode, operands[3]);
23623 if (! register_operand (operands[2], VOIDmode)
23624 && (mode == QImode
23625 || ! register_operand (operands[3], VOIDmode)))
23626 operands[2] = force_reg (mode, operands[2]);
23628 if (mode == QImode
23629 && ! register_operand (operands[3], VOIDmode))
23630 operands[3] = force_reg (mode, operands[3]);
23632 emit_insn (compare_seq);
23633 emit_insn (gen_rtx_SET (operands[0],
23634 gen_rtx_IF_THEN_ELSE (mode,
23635 compare_op, operands[2],
23636 operands[3])));
23637 return true;
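/* Editor's sketch (plain C, not part of GCC): the branch-free select
   computed by the "general case" sequence above (setcc / decl / andl /
   addl); SEL is 0 when the condition holds and all-ones otherwise.  */

static inline int
select_ct_cf (int cond, int ct, int cf)
{
  unsigned sel = cond ? 0u : ~0u;			/* setcc ; decl */
  return (int) ((sel & ((unsigned) cf - (unsigned) ct))	/* andl (cf-ct) */
		+ (unsigned) ct);			/* addl ct      */
}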
23640 /* Swap, force into registers, or otherwise massage the two operands
23641 to an SSE comparison with a mask result. Thus we differ a bit from
23642 ix86_prepare_fp_compare_args, which expects to produce a flags result.
23644 The DEST operand exists to help determine whether to commute commutative
23645 operators. The POP0/POP1 operands are updated in place. The new
23646 comparison code is returned, or UNKNOWN if not implementable. */
23648 static enum rtx_code
23649 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23650 rtx *pop0, rtx *pop1)
23652 switch (code)
23654 case LTGT:
23655 case UNEQ:
23656 /* AVX supports all the needed comparisons. */
23657 if (TARGET_AVX)
23658 break;
23659 /* We have no LTGT as an operator. We could implement it with
23660 NE & ORDERED, but this requires an extra temporary. It's
23661 not clear that it's worth it. */
23662 return UNKNOWN;
23664 case LT:
23665 case LE:
23666 case UNGT:
23667 case UNGE:
23668 /* These are supported directly. */
23669 break;
23671 case EQ:
23672 case NE:
23673 case UNORDERED:
23674 case ORDERED:
23675 /* AVX has 3 operand comparisons, no need to swap anything. */
23676 if (TARGET_AVX)
23677 break;
23678 /* For commutative operators, try to canonicalize the destination
23679 operand to be first in the comparison - this helps reload to
23680 avoid extra moves. */
23681 if (!dest || !rtx_equal_p (dest, *pop1))
23682 break;
23683 /* FALLTHRU */
23685 case GE:
23686 case GT:
23687 case UNLE:
23688 case UNLT:
23689 /* These are not supported directly before AVX, and furthermore
23690 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23691 comparison operands to transform into something that is
23692 supported. */
23693 std::swap (*pop0, *pop1);
23694 code = swap_condition (code);
23695 break;
23697 default:
23698 gcc_unreachable ();
23701 return code;
23704 /* Detect conditional moves that exactly match min/max operational
23705 semantics. Note that this is IEEE safe, as long as we don't
23706 interchange the operands.
23708 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23709 and TRUE if the operation is successful and instructions are emitted. */
23711 static bool
23712 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23713 rtx cmp_op1, rtx if_true, rtx if_false)
23715 machine_mode mode;
23716 bool is_min;
23717 rtx tmp;
23719 if (code == LT)
23721 else if (code == UNGE)
23722 std::swap (if_true, if_false);
23723 else
23724 return false;
23726 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23727 is_min = true;
23728 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23729 is_min = false;
23730 else
23731 return false;
23733 mode = GET_MODE (dest);
23735 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23736 but MODE may be a vector mode and thus not appropriate. */
23737 if (!flag_finite_math_only || flag_signed_zeros)
23739 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23740 rtvec v;
23742 if_true = force_reg (mode, if_true);
23743 v = gen_rtvec (2, if_true, if_false);
23744 tmp = gen_rtx_UNSPEC (mode, v, u);
23746 else
23748 code = is_min ? SMIN : SMAX;
23749 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23752 emit_insn (gen_rtx_SET (dest, tmp));
23753 return true;
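/* Editor's note (illustrative): minss/minps compute
   dst = src1 < src2 ? src1 : src2, so "a < b ? a : b" maps directly
   onto a min and the swapped pattern onto a max.  Because the result
   for NaN and +/-0 operands depends on the operand order, the UNSPEC
   path above is taken whenever NaNs or signed zeros must be honoured,
   so that later passes cannot commute the operands.  */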
23756 /* Expand an SSE vector comparison. Return the register with the result. */
23758 static rtx
23759 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23760 rtx op_true, rtx op_false)
23762 machine_mode mode = GET_MODE (dest);
23763 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23765 /* In the general case the result of the comparison can differ from the operands' type. */
23766 machine_mode cmp_mode;
23768 /* In AVX512F the result of comparison is an integer mask. */
23769 bool maskcmp = false;
23770 rtx x;
23772 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23774 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
23775 gcc_assert (cmp_mode != BLKmode);
23777 maskcmp = true;
23779 else
23780 cmp_mode = cmp_ops_mode;
23783 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23784 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23785 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23787 if (optimize
23788 || (maskcmp && cmp_mode != mode)
23789 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23790 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23791 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23793 /* Compare patterns for int modes are unspec in AVX512F only. */
23794 if (maskcmp && (code == GT || code == EQ))
23796 rtx (*gen)(rtx, rtx, rtx);
23798 switch (cmp_ops_mode)
23800 case V64QImode:
23801 gcc_assert (TARGET_AVX512BW);
23802 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23803 break;
23804 case V32HImode:
23805 gcc_assert (TARGET_AVX512BW);
23806 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23807 break;
23808 case V16SImode:
23809 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23810 break;
23811 case V8DImode:
23812 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23813 break;
23814 default:
23815 gen = NULL;
23818 if (gen)
23820 emit_insn (gen (dest, cmp_op0, cmp_op1));
23821 return dest;
23824 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23826 if (cmp_mode != mode && !maskcmp)
23828 x = force_reg (cmp_ops_mode, x);
23829 convert_move (dest, x, false);
23831 else
23832 emit_insn (gen_rtx_SET (dest, x));
23834 return dest;
23837 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23838 operations. This is used for both scalar and vector conditional moves. */
23840 void
23841 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23843 machine_mode mode = GET_MODE (dest);
23844 machine_mode cmpmode = GET_MODE (cmp);
23846 /* In AVX512F the result of comparison is an integer mask. */
23847 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23849 rtx t2, t3, x;
23851 /* If we have an integer mask and an FP value then we need
23852 to cast the mask to the FP mode. */
23853 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23855 cmp = force_reg (cmpmode, cmp);
23856 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23859 if (vector_all_ones_operand (op_true, mode)
23860 && rtx_equal_p (op_false, CONST0_RTX (mode))
23861 && !maskcmp)
23863 emit_insn (gen_rtx_SET (dest, cmp));
23865 else if (op_false == CONST0_RTX (mode)
23866 && !maskcmp)
23868 op_true = force_reg (mode, op_true);
23869 x = gen_rtx_AND (mode, cmp, op_true);
23870 emit_insn (gen_rtx_SET (dest, x));
23872 else if (op_true == CONST0_RTX (mode)
23873 && !maskcmp)
23875 op_false = force_reg (mode, op_false);
23876 x = gen_rtx_NOT (mode, cmp);
23877 x = gen_rtx_AND (mode, x, op_false);
23878 emit_insn (gen_rtx_SET (dest, x));
23880 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23881 && !maskcmp)
23883 op_false = force_reg (mode, op_false);
23884 x = gen_rtx_IOR (mode, cmp, op_false);
23885 emit_insn (gen_rtx_SET (dest, x));
23887 else if (TARGET_XOP
23888 && !maskcmp)
23890 op_true = force_reg (mode, op_true);
23892 if (!nonimmediate_operand (op_false, mode))
23893 op_false = force_reg (mode, op_false);
23895 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23896 op_true,
23897 op_false)));
23899 else
23901 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23902 rtx d = dest;
23904 if (!nonimmediate_operand (op_true, mode))
23905 op_true = force_reg (mode, op_true);
23907 op_false = force_reg (mode, op_false);
23909 switch (mode)
23911 case V4SFmode:
23912 if (TARGET_SSE4_1)
23913 gen = gen_sse4_1_blendvps;
23914 break;
23915 case V2DFmode:
23916 if (TARGET_SSE4_1)
23917 gen = gen_sse4_1_blendvpd;
23918 break;
23919 case V16QImode:
23920 case V8HImode:
23921 case V4SImode:
23922 case V2DImode:
23923 if (TARGET_SSE4_1)
23925 gen = gen_sse4_1_pblendvb;
23926 if (mode != V16QImode)
23927 d = gen_reg_rtx (V16QImode);
23928 op_false = gen_lowpart (V16QImode, op_false);
23929 op_true = gen_lowpart (V16QImode, op_true);
23930 cmp = gen_lowpart (V16QImode, cmp);
23932 break;
23933 case V8SFmode:
23934 if (TARGET_AVX)
23935 gen = gen_avx_blendvps256;
23936 break;
23937 case V4DFmode:
23938 if (TARGET_AVX)
23939 gen = gen_avx_blendvpd256;
23940 break;
23941 case V32QImode:
23942 case V16HImode:
23943 case V8SImode:
23944 case V4DImode:
23945 if (TARGET_AVX2)
23947 gen = gen_avx2_pblendvb;
23948 if (mode != V32QImode)
23949 d = gen_reg_rtx (V32QImode);
23950 op_false = gen_lowpart (V32QImode, op_false);
23951 op_true = gen_lowpart (V32QImode, op_true);
23952 cmp = gen_lowpart (V32QImode, cmp);
23954 break;
23956 case V64QImode:
23957 gen = gen_avx512bw_blendmv64qi;
23958 break;
23959 case V32HImode:
23960 gen = gen_avx512bw_blendmv32hi;
23961 break;
23962 case V16SImode:
23963 gen = gen_avx512f_blendmv16si;
23964 break;
23965 case V8DImode:
23966 gen = gen_avx512f_blendmv8di;
23967 break;
23968 case V8DFmode:
23969 gen = gen_avx512f_blendmv8df;
23970 break;
23971 case V16SFmode:
23972 gen = gen_avx512f_blendmv16sf;
23973 break;
23975 default:
23976 break;
23979 if (gen != NULL)
23981 emit_insn (gen (d, op_false, op_true, cmp));
23982 if (d != dest)
23983 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23985 else
23987 op_true = force_reg (mode, op_true);
23989 t2 = gen_reg_rtx (mode);
23990 if (optimize)
23991 t3 = gen_reg_rtx (mode);
23992 else
23993 t3 = dest;
23995 x = gen_rtx_AND (mode, op_true, cmp);
23996 emit_insn (gen_rtx_SET (t2, x));
23998 x = gen_rtx_NOT (mode, cmp);
23999 x = gen_rtx_AND (mode, x, op_false);
24000 emit_insn (gen_rtx_SET (t3, x));
24002 x = gen_rtx_IOR (mode, t3, t2);
24003 emit_insn (gen_rtx_SET (dest, x));
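/* Editor's sketch (plain C, not part of GCC): the AND/ANDNOT/IOR
   fallback emitted above when no blend instruction is available; CMP is
   assumed to be an all-ones / all-zeros mask per element.  */

static inline unsigned
blend_fallback (unsigned cmp, unsigned op_true, unsigned op_false)
{
  return (cmp & op_true) | (~cmp & op_false);
}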
24008 /* Expand a floating-point conditional move. Return true if successful. */
24010 bool
24011 ix86_expand_fp_movcc (rtx operands[])
24013 machine_mode mode = GET_MODE (operands[0]);
24014 enum rtx_code code = GET_CODE (operands[1]);
24015 rtx tmp, compare_op;
24016 rtx op0 = XEXP (operands[1], 0);
24017 rtx op1 = XEXP (operands[1], 1);
24019 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24021 machine_mode cmode;
24023 /* Since we have no cmove for SSE registers, don't force bad register
24024 allocation just to gain access to it. Deny the movcc when the
24025 comparison mode doesn't match the move mode. */
24026 cmode = GET_MODE (op0);
24027 if (cmode == VOIDmode)
24028 cmode = GET_MODE (op1);
24029 if (cmode != mode)
24030 return false;
24032 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24033 if (code == UNKNOWN)
24034 return false;
24036 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24037 operands[2], operands[3]))
24038 return true;
24040 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24041 operands[2], operands[3]);
24042 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24043 return true;
24046 if (GET_MODE (op0) == TImode
24047 || (GET_MODE (op0) == DImode
24048 && !TARGET_64BIT))
24049 return false;
24051 /* The floating point conditional move instructions don't directly
24052 support conditions resulting from a signed integer comparison. */
24054 compare_op = ix86_expand_compare (code, op0, op1);
24055 if (!fcmov_comparison_operator (compare_op, VOIDmode))
24057 tmp = gen_reg_rtx (QImode);
24058 ix86_expand_setcc (tmp, code, op0, op1);
24060 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24063 emit_insn (gen_rtx_SET (operands[0],
24064 gen_rtx_IF_THEN_ELSE (mode, compare_op,
24065 operands[2], operands[3])));
24067 return true;
24070 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24072 static int
24073 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24075 switch (code)
24077 case EQ:
24078 return 0;
24079 case LT:
24080 case LTU:
24081 return 1;
24082 case LE:
24083 case LEU:
24084 return 2;
24085 case NE:
24086 return 4;
24087 case GE:
24088 case GEU:
24089 return 5;
24090 case GT:
24091 case GTU:
24092 return 6;
24093 default:
24094 gcc_unreachable ();
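/* Editor's note (assumption: these values follow the AVX-512 vpcmp
   predicate encoding): 0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT (>=),
   6 = NLE (>), which matches the mapping returned above; the FP helper
   below returns the corresponding vcmpps/vcmppd predicate immediates.  */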
24098 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
24100 static int
24101 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24103 switch (code)
24105 case EQ:
24106 return 0x00;
24107 case NE:
24108 return 0x04;
24109 case GT:
24110 return 0x0e;
24111 case LE:
24112 return 0x02;
24113 case GE:
24114 return 0x0d;
24115 case LT:
24116 return 0x01;
24117 case UNLE:
24118 return 0x0a;
24119 case UNLT:
24120 return 0x09;
24121 case UNGE:
24122 return 0x05;
24123 case UNGT:
24124 return 0x06;
24125 case UNEQ:
24126 return 0x18;
24127 case LTGT:
24128 return 0x0c;
24129 case ORDERED:
24130 return 0x07;
24131 case UNORDERED:
24132 return 0x03;
24133 default:
24134 gcc_unreachable ();
24138 /* Return the immediate value to be used in UNSPEC_PCMP
24139 for comparison CODE in MODE. */
24141 static int
24142 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
24144 if (FLOAT_MODE_P (mode))
24145 return ix86_fp_cmp_code_to_pcmp_immediate (code);
24146 return ix86_int_cmp_code_to_pcmp_immediate (code);
24149 /* Expand AVX-512 vector comparison. */
24151 bool
24152 ix86_expand_mask_vec_cmp (rtx operands[])
24154 machine_mode mask_mode = GET_MODE (operands[0]);
24155 machine_mode cmp_mode = GET_MODE (operands[2]);
24156 enum rtx_code code = GET_CODE (operands[1]);
24157 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
24158 int unspec_code;
24159 rtx unspec;
24161 switch (code)
24163 case LEU:
24164 case GTU:
24165 case GEU:
24166 case LTU:
24167 unspec_code = UNSPEC_UNSIGNED_PCMP;
24168 break;
24170 default:
24171 unspec_code = UNSPEC_PCMP;
24174 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
24175 operands[3], imm),
24176 unspec_code);
24177 emit_insn (gen_rtx_SET (operands[0], unspec));
24179 return true;
24182 /* Expand an FP vector comparison. */
24184 bool
24185 ix86_expand_fp_vec_cmp (rtx operands[])
24187 enum rtx_code code = GET_CODE (operands[1]);
24188 rtx cmp;
24190 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24191 &operands[2], &operands[3]);
24192 if (code == UNKNOWN)
24194 rtx temp;
24195 switch (GET_CODE (operands[1]))
24197 case LTGT:
24198 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
24199 operands[3], NULL, NULL);
24200 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
24201 operands[3], NULL, NULL);
24202 code = AND;
24203 break;
24204 case UNEQ:
24205 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
24206 operands[3], NULL, NULL);
24207 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
24208 operands[3], NULL, NULL);
24209 code = IOR;
24210 break;
24211 default:
24212 gcc_unreachable ();
24214 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24215 OPTAB_DIRECT);
24217 else
24218 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
24219 operands[1], operands[2]);
24221 if (operands[0] != cmp)
24222 emit_move_insn (operands[0], cmp);
24224 return true;
24227 static rtx
24228 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24229 rtx op_true, rtx op_false, bool *negate)
24231 machine_mode data_mode = GET_MODE (dest);
24232 machine_mode mode = GET_MODE (cop0);
24233 rtx x;
24235 *negate = false;
24237 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24238 if (TARGET_XOP
24239 && (mode == V16QImode || mode == V8HImode
24240 || mode == V4SImode || mode == V2DImode))
24242 else
24244 /* Canonicalize the comparison to EQ, GT, GTU. */
24245 switch (code)
24247 case EQ:
24248 case GT:
24249 case GTU:
24250 break;
24252 case NE:
24253 case LE:
24254 case LEU:
24255 code = reverse_condition (code);
24256 *negate = true;
24257 break;
24259 case GE:
24260 case GEU:
24261 code = reverse_condition (code);
24262 *negate = true;
24263 /* FALLTHRU */
24265 case LT:
24266 case LTU:
24267 std::swap (cop0, cop1);
24268 code = swap_condition (code);
24269 break;
24271 default:
24272 gcc_unreachable ();
24275 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24276 if (mode == V2DImode)
24278 switch (code)
24280 case EQ:
24281 /* SSE4.1 supports EQ. */
24282 if (!TARGET_SSE4_1)
24283 return NULL;
24284 break;
24286 case GT:
24287 case GTU:
24288 /* SSE4.2 supports GT/GTU. */
24289 if (!TARGET_SSE4_2)
24290 return NULL;
24291 break;
24293 default:
24294 gcc_unreachable ();
24298 /* Unsigned parallel compare is not supported by the hardware.
24299 Play some tricks to turn this into a signed comparison
24300 against 0. */
24301 if (code == GTU)
24303 cop0 = force_reg (mode, cop0);
24305 switch (mode)
24307 case V16SImode:
24308 case V8DImode:
24309 case V8SImode:
24310 case V4DImode:
24311 case V4SImode:
24312 case V2DImode:
24314 rtx t1, t2, mask;
24315 rtx (*gen_sub3) (rtx, rtx, rtx);
24317 switch (mode)
24319 case V16SImode: gen_sub3 = gen_subv16si3; break;
24320 case V8DImode: gen_sub3 = gen_subv8di3; break;
24321 case V8SImode: gen_sub3 = gen_subv8si3; break;
24322 case V4DImode: gen_sub3 = gen_subv4di3; break;
24323 case V4SImode: gen_sub3 = gen_subv4si3; break;
24324 case V2DImode: gen_sub3 = gen_subv2di3; break;
24325 default:
24326 gcc_unreachable ();
24328 /* Subtract (-(INT MAX) - 1) from both operands to make
24329 them signed. */
24330 mask = ix86_build_signbit_mask (mode, true, false);
24331 t1 = gen_reg_rtx (mode);
24332 emit_insn (gen_sub3 (t1, cop0, mask));
24334 t2 = gen_reg_rtx (mode);
24335 emit_insn (gen_sub3 (t2, cop1, mask));
24337 cop0 = t1;
24338 cop1 = t2;
24339 code = GT;
24341 break;
24343 case V64QImode:
24344 case V32HImode:
24345 case V32QImode:
24346 case V16HImode:
24347 case V16QImode:
24348 case V8HImode:
24349 /* Perform a parallel unsigned saturating subtraction. */
24350 x = gen_reg_rtx (mode);
24351 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24352 cop1)));
24354 cop0 = x;
24355 cop1 = CONST0_RTX (mode);
24356 code = EQ;
24357 *negate = !*negate;
24358 break;
24360 default:
24361 gcc_unreachable ();
24366 if (*negate)
24367 std::swap (op_true, op_false);
24369 /* Allow the comparison to be done in one mode, but the movcc to
24370 happen in another mode. */
24371 if (data_mode == mode)
24373 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24374 op_true, op_false);
24376 else
24378 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24379 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24380 op_true, op_false);
24381 if (GET_MODE (x) == mode)
24382 x = gen_lowpart (data_mode, x);
24385 return x;
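/* Editor's sketch (plain C, not part of GCC; two's complement assumed):
   scalar versions of the two GTU workarounds used above.  Flipping the
   sign bit (which the subtraction of the sign-bit mask achieves modulo
   2^32) turns an unsigned compare into the signed compare the hardware
   provides; for the narrow element modes, an unsigned saturating
   subtract is zero exactly when a <= b, so GTU becomes an EQ-with-zero
   test that is then negated.  */

static inline int
gtu_via_signbit_flip (unsigned a, unsigned b)
{
  return (int) (a ^ 0x80000000u) > (int) (b ^ 0x80000000u);
}

static inline int
leu_via_saturating_sub (unsigned char a, unsigned char b)
{
  unsigned char d = a > b ? a - b : 0;	/* saturating a - b */
  return d == 0;			/* true iff a <= b  */
}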
24388 /* Expand integer vector comparison. */
24390 bool
24391 ix86_expand_int_vec_cmp (rtx operands[])
24393 rtx_code code = GET_CODE (operands[1]);
24394 bool negate = false;
24395 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24396 operands[3], NULL, NULL, &negate);
24398 if (!cmp)
24399 return false;
24401 if (negate)
24402 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24403 CONST0_RTX (GET_MODE (cmp)),
24404 NULL, NULL, &negate);
24406 gcc_assert (!negate);
24408 if (operands[0] != cmp)
24409 emit_move_insn (operands[0], cmp);
24411 return true;
24414 /* Expand a floating-point vector conditional move; a vcond operation
24415 rather than a movcc operation. */
24417 bool
24418 ix86_expand_fp_vcond (rtx operands[])
24420 enum rtx_code code = GET_CODE (operands[3]);
24421 rtx cmp;
24423 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24424 &operands[4], &operands[5]);
24425 if (code == UNKNOWN)
24427 rtx temp;
24428 switch (GET_CODE (operands[3]))
24430 case LTGT:
24431 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24432 operands[5], operands[0], operands[0]);
24433 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24434 operands[5], operands[1], operands[2]);
24435 code = AND;
24436 break;
24437 case UNEQ:
24438 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24439 operands[5], operands[0], operands[0]);
24440 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24441 operands[5], operands[1], operands[2]);
24442 code = IOR;
24443 break;
24444 default:
24445 gcc_unreachable ();
24447 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24448 OPTAB_DIRECT);
24449 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24450 return true;
24453 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24454 operands[5], operands[1], operands[2]))
24455 return true;
24457 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24458 operands[1], operands[2]);
24459 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24460 return true;
24463 /* Expand a signed/unsigned integral vector conditional move. */
24465 bool
24466 ix86_expand_int_vcond (rtx operands[])
24468 machine_mode data_mode = GET_MODE (operands[0]);
24469 machine_mode mode = GET_MODE (operands[4]);
24470 enum rtx_code code = GET_CODE (operands[3]);
24471 bool negate = false;
24472 rtx x, cop0, cop1;
24474 cop0 = operands[4];
24475 cop1 = operands[5];
24477 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24478 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
24479 if ((code == LT || code == GE)
24480 && data_mode == mode
24481 && cop1 == CONST0_RTX (mode)
24482 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24483 && GET_MODE_UNIT_SIZE (data_mode) > 1
24484 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24485 && (GET_MODE_SIZE (data_mode) == 16
24486 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24488 rtx negop = operands[2 - (code == LT)];
24489 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24490 if (negop == CONST1_RTX (data_mode))
24492 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24493 operands[0], 1, OPTAB_DIRECT);
24494 if (res != operands[0])
24495 emit_move_insn (operands[0], res);
24496 return true;
24498 else if (GET_MODE_INNER (data_mode) != DImode
24499 && vector_all_ones_operand (negop, data_mode))
24501 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24502 operands[0], 0, OPTAB_DIRECT);
24503 if (res != operands[0])
24504 emit_move_insn (operands[0], res);
24505 return true;
24509 if (!nonimmediate_operand (cop1, mode))
24510 cop1 = force_reg (mode, cop1);
24511 if (!general_operand (operands[1], data_mode))
24512 operands[1] = force_reg (data_mode, operands[1]);
24513 if (!general_operand (operands[2], data_mode))
24514 operands[2] = force_reg (data_mode, operands[2]);
24516 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24517 operands[1], operands[2], &negate);
24519 if (!x)
24520 return false;
24522 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24523 operands[2-negate]);
24524 return true;
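/* Editor's sketch (plain C, not part of GCC): the per-element shift
   special cases recognized at the top of ix86_expand_int_vcond, shown
   for 32-bit elements; the right shift of a negative value is assumed
   to sign-extend, as it does on x86.  */

static inline int lt0_all_ones (int x)		/* x < 0 ? -1 : 0 */
{ return x >> 31; }				/* arithmetic shift */

static inline unsigned lt0_one (unsigned x)	/* (int) x < 0 ? 1 : 0 */
{ return x >> 31; }				/* logical shift */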
24527 /* AVX512F does support 64-byte integer vector operations,
24528 thus the longest vector we are faced with is V64QImode. */
24529 #define MAX_VECT_LEN 64
24531 struct expand_vec_perm_d
24533 rtx target, op0, op1;
24534 unsigned char perm[MAX_VECT_LEN];
24535 machine_mode vmode;
24536 unsigned char nelt;
24537 bool one_operand_p;
24538 bool testing_p;
24541 static bool
24542 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
24543 struct expand_vec_perm_d *d)
24545 /* ix86_expand_vec_perm_vpermi2 is called from both the const and non-const
24546 expanders, so the args are either in d, or in op0, op1, etc. */
24547 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24548 machine_mode maskmode = mode;
24549 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24551 switch (mode)
24553 case V8HImode:
24554 if (TARGET_AVX512VL && TARGET_AVX512BW)
24555 gen = gen_avx512vl_vpermi2varv8hi3;
24556 break;
24557 case V16HImode:
24558 if (TARGET_AVX512VL && TARGET_AVX512BW)
24559 gen = gen_avx512vl_vpermi2varv16hi3;
24560 break;
24561 case V64QImode:
24562 if (TARGET_AVX512VBMI)
24563 gen = gen_avx512bw_vpermi2varv64qi3;
24564 break;
24565 case V32HImode:
24566 if (TARGET_AVX512BW)
24567 gen = gen_avx512bw_vpermi2varv32hi3;
24568 break;
24569 case V4SImode:
24570 if (TARGET_AVX512VL)
24571 gen = gen_avx512vl_vpermi2varv4si3;
24572 break;
24573 case V8SImode:
24574 if (TARGET_AVX512VL)
24575 gen = gen_avx512vl_vpermi2varv8si3;
24576 break;
24577 case V16SImode:
24578 if (TARGET_AVX512F)
24579 gen = gen_avx512f_vpermi2varv16si3;
24580 break;
24581 case V4SFmode:
24582 if (TARGET_AVX512VL)
24584 gen = gen_avx512vl_vpermi2varv4sf3;
24585 maskmode = V4SImode;
24587 break;
24588 case V8SFmode:
24589 if (TARGET_AVX512VL)
24591 gen = gen_avx512vl_vpermi2varv8sf3;
24592 maskmode = V8SImode;
24594 break;
24595 case V16SFmode:
24596 if (TARGET_AVX512F)
24598 gen = gen_avx512f_vpermi2varv16sf3;
24599 maskmode = V16SImode;
24601 break;
24602 case V2DImode:
24603 if (TARGET_AVX512VL)
24604 gen = gen_avx512vl_vpermi2varv2di3;
24605 break;
24606 case V4DImode:
24607 if (TARGET_AVX512VL)
24608 gen = gen_avx512vl_vpermi2varv4di3;
24609 break;
24610 case V8DImode:
24611 if (TARGET_AVX512F)
24612 gen = gen_avx512f_vpermi2varv8di3;
24613 break;
24614 case V2DFmode:
24615 if (TARGET_AVX512VL)
24617 gen = gen_avx512vl_vpermi2varv2df3;
24618 maskmode = V2DImode;
24620 break;
24621 case V4DFmode:
24622 if (TARGET_AVX512VL)
24624 gen = gen_avx512vl_vpermi2varv4df3;
24625 maskmode = V4DImode;
24627 break;
24628 case V8DFmode:
24629 if (TARGET_AVX512F)
24631 gen = gen_avx512f_vpermi2varv8df3;
24632 maskmode = V8DImode;
24634 break;
24635 default:
24636 break;
24639 if (gen == NULL)
24640 return false;
24642 /* ix86_expand_vec_perm_vpermi2 is called from both the const and non-const
24643 expanders, so the args are either in d, or in op0, op1, etc. */
24644 if (d)
24646 rtx vec[64];
24647 target = d->target;
24648 op0 = d->op0;
24649 op1 = d->op1;
24650 for (int i = 0; i < d->nelt; ++i)
24651 vec[i] = GEN_INT (d->perm[i]);
24652 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24655 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
24656 return true;
24659 /* Expand a variable vector permutation. */
24661 void
24662 ix86_expand_vec_perm (rtx operands[])
24664 rtx target = operands[0];
24665 rtx op0 = operands[1];
24666 rtx op1 = operands[2];
24667 rtx mask = operands[3];
24668 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24669 machine_mode mode = GET_MODE (op0);
24670 machine_mode maskmode = GET_MODE (mask);
24671 int w, e, i;
24672 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24674 /* Number of elements in the vector. */
24675 w = GET_MODE_NUNITS (mode);
24676 e = GET_MODE_UNIT_SIZE (mode);
24677 gcc_assert (w <= 64);
24679 if (TARGET_AVX512F && one_operand_shuffle)
24681 rtx (*gen) (rtx, rtx, rtx) = NULL;
24682 switch (mode)
24684 case V16SImode:
24685 gen = gen_avx512f_permvarv16si;
24686 break;
24687 case V16SFmode:
24688 gen = gen_avx512f_permvarv16sf;
24689 break;
24690 case V8DImode:
24691 gen = gen_avx512f_permvarv8di;
24692 break;
24693 case V8DFmode:
24694 gen = gen_avx512f_permvarv8df;
24695 break;
24696 default:
24697 break;
24699 if (gen != NULL)
24701 emit_insn (gen (target, op0, mask));
24702 return;
24706 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
24707 return;
24709 if (TARGET_AVX2)
24711 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24713 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24714 a constant shuffle operand. With a tiny bit of effort we can
24715 use VPERMD instead. A re-interpretation stall for V4DFmode is
24716 unfortunate but there's no avoiding it.
24717 Similarly for V16HImode we don't have instructions for variable
24718 shuffling, while for V32QImode we can use vpshufb; vpshufb; vpermq;
24719 vpor after preparing suitable masks. */
24721 if (mode == V16HImode)
24723 maskmode = mode = V32QImode;
24724 w = 32;
24725 e = 1;
24727 else
24729 maskmode = mode = V8SImode;
24730 w = 8;
24731 e = 4;
24733 t1 = gen_reg_rtx (maskmode);
24735 /* Replicate the low bits of the V4DImode mask into V8SImode:
24736 mask = { A B C D }
24737 t1 = { A A B B C C D D }. */
24738 for (i = 0; i < w / 2; ++i)
24739 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24740 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24741 vt = force_reg (maskmode, vt);
24742 mask = gen_lowpart (maskmode, mask);
24743 if (maskmode == V8SImode)
24744 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24745 else
24746 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24748 /* Multiply the shuffle indices by two. */
24749 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24750 OPTAB_DIRECT);
24752 /* Add one to the odd shuffle indices:
24753 t1 = { A*2, A*2+1, B*2, B*2+1, ... } (a worked example follows this block). */
24754 for (i = 0; i < w / 2; ++i)
24756 vec[i * 2] = const0_rtx;
24757 vec[i * 2 + 1] = const1_rtx;
24759 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24760 vt = validize_mem (force_const_mem (maskmode, vt));
24761 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24762 OPTAB_DIRECT);
24764 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24765 operands[3] = mask = t1;
24766 target = gen_reg_rtx (mode);
24767 op0 = gen_lowpart (mode, op0);
24768 op1 = gen_lowpart (mode, op1);
24771 switch (mode)
24773 case V8SImode:
24774 /* The VPERMD and VPERMPS instructions already properly ignore
24775 the high bits of the shuffle elements. No need for us to
24776 perform an AND ourselves. */
24777 if (one_operand_shuffle)
24779 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24780 if (target != operands[0])
24781 emit_move_insn (operands[0],
24782 gen_lowpart (GET_MODE (operands[0]), target));
24784 else
24786 t1 = gen_reg_rtx (V8SImode);
24787 t2 = gen_reg_rtx (V8SImode);
24788 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24789 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24790 goto merge_two;
24792 return;
24794 case V8SFmode:
24795 mask = gen_lowpart (V8SImode, mask);
24796 if (one_operand_shuffle)
24797 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24798 else
24800 t1 = gen_reg_rtx (V8SFmode);
24801 t2 = gen_reg_rtx (V8SFmode);
24802 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24803 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24804 goto merge_two;
24806 return;
24808 case V4SImode:
24809 /* By combining the two 128-bit input vectors into one 256-bit
24810 input vector, we can use VPERMD and VPERMPS for the full
24811 two-operand shuffle. */
24812 t1 = gen_reg_rtx (V8SImode);
24813 t2 = gen_reg_rtx (V8SImode);
24814 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24815 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24816 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24817 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24818 return;
24820 case V4SFmode:
24821 t1 = gen_reg_rtx (V8SFmode);
24822 t2 = gen_reg_rtx (V8SImode);
24823 mask = gen_lowpart (V4SImode, mask);
24824 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24825 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24826 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24827 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24828 return;
24830 case V32QImode:
24831 t1 = gen_reg_rtx (V32QImode);
24832 t2 = gen_reg_rtx (V32QImode);
24833 t3 = gen_reg_rtx (V32QImode);
24834 vt2 = GEN_INT (-128);
24835 for (i = 0; i < 32; i++)
24836 vec[i] = vt2;
24837 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24838 vt = force_reg (V32QImode, vt);
24839 for (i = 0; i < 32; i++)
24840 vec[i] = i < 16 ? vt2 : const0_rtx;
24841 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24842 vt2 = force_reg (V32QImode, vt2);
24843 /* From mask create two adjusted masks, which contain the same
24844 bits as mask in the low 7 bits of each vector element.
24845 The first mask will have the most significant bit clear
24846 if it requests an element from the same 128-bit lane
24847 and the MSB set if it requests an element from the other 128-bit lane.
24848 The second mask will have the opposite values of the MSB,
24849 and additionally will have its 128-bit lanes swapped.
24850 E.g. a { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24851 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24852 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24853 stands for the other 12 bytes. */
24854 /* The bit that tells whether an element is from the same lane or the
24855 other lane is bit 4, so shift it up by 3 to the MSB position. */
24856 t5 = gen_reg_rtx (V4DImode);
24857 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24858 GEN_INT (3)));
24859 /* Clear MSB bits from the mask just in case it had them set. */
24860 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24861 /* After this t1 will have MSB set for elements from other lane. */
24862 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24863 /* Clear bits other than MSB. */
24864 emit_insn (gen_andv32qi3 (t1, t1, vt));
24865 /* Or in the lower bits from mask into t3. */
24866 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24867 /* And invert MSB bits in t1, so MSB is set for elements from the same
24868 lane. */
24869 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24870 /* Swap 128-bit lanes in t3. */
24871 t6 = gen_reg_rtx (V4DImode);
24872 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24873 const2_rtx, GEN_INT (3),
24874 const0_rtx, const1_rtx));
24875 /* And or in the lower bits from mask into t1. */
24876 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24877 if (one_operand_shuffle)
24879 /* Each of these shuffles will put 0s in places where an
24880 element from the other 128-bit lane is needed; otherwise
24881 it will shuffle in the requested value. */
24882 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24883 gen_lowpart (V32QImode, t6)));
24884 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24885 /* For t3 the 128-bit lanes are swapped again. */
24886 t7 = gen_reg_rtx (V4DImode);
24887 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24888 const2_rtx, GEN_INT (3),
24889 const0_rtx, const1_rtx));
24890 /* And ORing both together yields the result. */
24891 emit_insn (gen_iorv32qi3 (target, t1,
24892 gen_lowpart (V32QImode, t7)));
24893 if (target != operands[0])
24894 emit_move_insn (operands[0],
24895 gen_lowpart (GET_MODE (operands[0]), target));
24896 return;
24899 t4 = gen_reg_rtx (V32QImode);
24900 /* Similar to the one_operand_shuffle code above,
24901 just repeated twice, once for each operand.  The merge_two:
24902 code will merge the two results together. */
24903 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24904 gen_lowpart (V32QImode, t6)));
24905 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24906 gen_lowpart (V32QImode, t6)));
24907 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24908 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24909 t7 = gen_reg_rtx (V4DImode);
24910 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24911 const2_rtx, GEN_INT (3),
24912 const0_rtx, const1_rtx));
24913 t8 = gen_reg_rtx (V4DImode);
24914 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24915 const2_rtx, GEN_INT (3),
24916 const0_rtx, const1_rtx));
24917 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24918 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24919 t1 = t4;
24920 t2 = t3;
24921 goto merge_two;
24923 default:
24924 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24925 break;
24929 if (TARGET_XOP)
24931 /* The XOP VPPERM insn supports three inputs. By ignoring the
24932 one_operand_shuffle special case, we avoid creating another
24933 set of constant vectors in memory. */
24934 one_operand_shuffle = false;
24936 /* mask = mask & {2*w-1, ...} */
24937 vt = GEN_INT (2*w - 1);
24939 else
24941 /* mask = mask & {w-1, ...} */
24942 vt = GEN_INT (w - 1);
24945 for (i = 0; i < w; i++)
24946 vec[i] = vt;
24947 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24948 mask = expand_simple_binop (maskmode, AND, mask, vt,
24949 NULL_RTX, 0, OPTAB_DIRECT);
24951 /* For non-QImode operations, convert the word permutation control
24952 into a byte permutation control. */
24953 if (mode != V16QImode)
24955 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24956 GEN_INT (exact_log2 (e)),
24957 NULL_RTX, 0, OPTAB_DIRECT);
24959 /* Convert mask to vector of chars. */
24960 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24962 /* Replicate each of the input bytes into byte positions:
24963 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24964 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24965 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24966 for (i = 0; i < 16; ++i)
24967 vec[i] = GEN_INT (i/e * e);
24968 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24969 vt = validize_mem (force_const_mem (V16QImode, vt));
24970 if (TARGET_XOP)
24971 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24972 else
24973 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24975 /* Convert it into the byte positions by doing
24976 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
24977 for (i = 0; i < 16; ++i)
24978 vec[i] = GEN_INT (i % e);
24979 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24980 vt = validize_mem (force_const_mem (V16QImode, vt));
24981 emit_insn (gen_addv16qi3 (mask, mask, vt));
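/* Added illustrative example: for V4SImode, E == 4, so a word index of 2
   becomes 8 after the shift, is replicated to { 8 8 8 8 } by the pshufb
   above, and the final addition of { 0 1 2 3 } turns it into the byte
   control { 8 9 10 11 } - exactly the four bytes of element 2.  */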
24984 /* The actual shuffle operations all operate on V16QImode. */
24985 op0 = gen_lowpart (V16QImode, op0);
24986 op1 = gen_lowpart (V16QImode, op1);
24988 if (TARGET_XOP)
24990 if (GET_MODE (target) != V16QImode)
24991 target = gen_reg_rtx (V16QImode);
24992 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24993 if (target != operands[0])
24994 emit_move_insn (operands[0],
24995 gen_lowpart (GET_MODE (operands[0]), target));
24997 else if (one_operand_shuffle)
24999 if (GET_MODE (target) != V16QImode)
25000 target = gen_reg_rtx (V16QImode);
25001 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25002 if (target != operands[0])
25003 emit_move_insn (operands[0],
25004 gen_lowpart (GET_MODE (operands[0]), target));
25006 else
25008 rtx xops[6];
25009 bool ok;
25011 /* Shuffle the two input vectors independently. */
25012 t1 = gen_reg_rtx (V16QImode);
25013 t2 = gen_reg_rtx (V16QImode);
25014 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25015 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25017 merge_two:
25018 /* Then merge them together. The key is whether any given control
25019 element contained a bit set that indicates the second word. */
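/* Added note for illustration: with a V8HImode shuffle, W == 8, so the test
   below reduces to (mask & 8) == 8 per element; elements whose original
   index selected the second operand take their value from T2 (the shuffle
   of OP1), the rest from T1, via the blend built by ix86_expand_int_vcond.  */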
25020 mask = operands[3];
25021 vt = GEN_INT (w);
25022 if (maskmode == V2DImode && !TARGET_SSE4_1)
25024 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
25025 more shuffle to convert the V2DI input mask into a V4SI
25026 input mask. At which point the masking that expand_int_vcond
25027 will work as desired. */
25028 rtx t3 = gen_reg_rtx (V4SImode);
25029 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25030 const0_rtx, const0_rtx,
25031 const2_rtx, const2_rtx));
25032 mask = t3;
25033 maskmode = V4SImode;
25034 e = w = 4;
25037 for (i = 0; i < w; i++)
25038 vec[i] = vt;
25039 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25040 vt = force_reg (maskmode, vt);
25041 mask = expand_simple_binop (maskmode, AND, mask, vt,
25042 NULL_RTX, 0, OPTAB_DIRECT);
25044 if (GET_MODE (target) != mode)
25045 target = gen_reg_rtx (mode);
25046 xops[0] = target;
25047 xops[1] = gen_lowpart (mode, t2);
25048 xops[2] = gen_lowpart (mode, t1);
25049 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25050 xops[4] = mask;
25051 xops[5] = vt;
25052 ok = ix86_expand_int_vcond (xops);
25053 gcc_assert (ok);
25054 if (target != operands[0])
25055 emit_move_insn (operands[0],
25056 gen_lowpart (GET_MODE (operands[0]), target));
25060 /* Unpack SRC into the next wider integer vector type in DEST.  UNSIGNED_P
25061 is true if we should do zero extension, else sign extension.  HIGH_P is
25062 true if we want the N/2 high elements, else the low elements. */
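/* Added illustrative note: for V16QImode with SSE4.1 the low half is widened
   directly with pmovzxbw/pmovsxbw; for the high half the upper 8 bytes are
   first shifted down (the V1TImode shift by 64 below) and then widened the
   same way.  Without SSE4.1 the source is instead interleaved with either
   zero or its own sign mask, which doubles each element's width.  */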
25064 void
25065 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25067 machine_mode imode = GET_MODE (src);
25068 rtx tmp;
25070 if (TARGET_SSE4_1)
25072 rtx (*unpack)(rtx, rtx);
25073 rtx (*extract)(rtx, rtx) = NULL;
25074 machine_mode halfmode = BLKmode;
25076 switch (imode)
25078 case V64QImode:
25079 if (unsigned_p)
25080 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25081 else
25082 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25083 halfmode = V32QImode;
25084 extract
25085 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25086 break;
25087 case V32QImode:
25088 if (unsigned_p)
25089 unpack = gen_avx2_zero_extendv16qiv16hi2;
25090 else
25091 unpack = gen_avx2_sign_extendv16qiv16hi2;
25092 halfmode = V16QImode;
25093 extract
25094 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25095 break;
25096 case V32HImode:
25097 if (unsigned_p)
25098 unpack = gen_avx512f_zero_extendv16hiv16si2;
25099 else
25100 unpack = gen_avx512f_sign_extendv16hiv16si2;
25101 halfmode = V16HImode;
25102 extract
25103 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25104 break;
25105 case V16HImode:
25106 if (unsigned_p)
25107 unpack = gen_avx2_zero_extendv8hiv8si2;
25108 else
25109 unpack = gen_avx2_sign_extendv8hiv8si2;
25110 halfmode = V8HImode;
25111 extract
25112 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25113 break;
25114 case V16SImode:
25115 if (unsigned_p)
25116 unpack = gen_avx512f_zero_extendv8siv8di2;
25117 else
25118 unpack = gen_avx512f_sign_extendv8siv8di2;
25119 halfmode = V8SImode;
25120 extract
25121 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
25122 break;
25123 case V8SImode:
25124 if (unsigned_p)
25125 unpack = gen_avx2_zero_extendv4siv4di2;
25126 else
25127 unpack = gen_avx2_sign_extendv4siv4di2;
25128 halfmode = V4SImode;
25129 extract
25130 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
25131 break;
25132 case V16QImode:
25133 if (unsigned_p)
25134 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
25135 else
25136 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
25137 break;
25138 case V8HImode:
25139 if (unsigned_p)
25140 unpack = gen_sse4_1_zero_extendv4hiv4si2;
25141 else
25142 unpack = gen_sse4_1_sign_extendv4hiv4si2;
25143 break;
25144 case V4SImode:
25145 if (unsigned_p)
25146 unpack = gen_sse4_1_zero_extendv2siv2di2;
25147 else
25148 unpack = gen_sse4_1_sign_extendv2siv2di2;
25149 break;
25150 default:
25151 gcc_unreachable ();
25154 if (GET_MODE_SIZE (imode) >= 32)
25156 tmp = gen_reg_rtx (halfmode);
25157 emit_insn (extract (tmp, src));
25159 else if (high_p)
25161 /* Shift higher 8 bytes to lower 8 bytes. */
25162 tmp = gen_reg_rtx (V1TImode);
25163 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
25164 GEN_INT (64)));
25165 tmp = gen_lowpart (imode, tmp);
25167 else
25168 tmp = src;
25170 emit_insn (unpack (dest, tmp));
25172 else
25174 rtx (*unpack)(rtx, rtx, rtx);
25176 switch (imode)
25178 case V16QImode:
25179 if (high_p)
25180 unpack = gen_vec_interleave_highv16qi;
25181 else
25182 unpack = gen_vec_interleave_lowv16qi;
25183 break;
25184 case V8HImode:
25185 if (high_p)
25186 unpack = gen_vec_interleave_highv8hi;
25187 else
25188 unpack = gen_vec_interleave_lowv8hi;
25189 break;
25190 case V4SImode:
25191 if (high_p)
25192 unpack = gen_vec_interleave_highv4si;
25193 else
25194 unpack = gen_vec_interleave_lowv4si;
25195 break;
25196 default:
25197 gcc_unreachable ();
25200 if (unsigned_p)
25201 tmp = force_reg (imode, CONST0_RTX (imode));
25202 else
25203 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
25204 src, pc_rtx, pc_rtx);
25206 rtx tmp2 = gen_reg_rtx (imode);
25207 emit_insn (unpack (tmp2, src, tmp));
25208 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
25212 /* Expand conditional increment or decrement using adc/sbb instructions.
25213 The default case using setcc followed by the conditional move can be
25214 done by generic code. */
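/* Added illustrative example: for a statement such as "if (a < b) x++;"
   with an unsigned comparison, this expander emits a compare that leaves
   the carry flag set when a < b, followed by "adc x, 0", i.e. x += carry;
   the conditional decrement case uses "sbb x, 0" instead.  */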
25215 bool
25216 ix86_expand_int_addcc (rtx operands[])
25218 enum rtx_code code = GET_CODE (operands[1]);
25219 rtx flags;
25220 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
25221 rtx compare_op;
25222 rtx val = const0_rtx;
25223 bool fpcmp = false;
25224 machine_mode mode;
25225 rtx op0 = XEXP (operands[1], 0);
25226 rtx op1 = XEXP (operands[1], 1);
25228 if (operands[3] != const1_rtx
25229 && operands[3] != constm1_rtx)
25230 return false;
25231 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25232 return false;
25233 code = GET_CODE (compare_op);
25235 flags = XEXP (compare_op, 0);
25237 if (GET_MODE (flags) == CCFPmode
25238 || GET_MODE (flags) == CCFPUmode)
25240 fpcmp = true;
25241 code = ix86_fp_compare_code_to_integer (code);
25244 if (code != LTU)
25246 val = constm1_rtx;
25247 if (fpcmp)
25248 PUT_CODE (compare_op,
25249 reverse_condition_maybe_unordered
25250 (GET_CODE (compare_op)));
25251 else
25252 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25255 mode = GET_MODE (operands[0]);
25257 /* Construct either adc or sbb insn. */
25258 if ((code == LTU) == (operands[3] == constm1_rtx))
25260 switch (mode)
25262 case QImode:
25263 insn = gen_subqi3_carry;
25264 break;
25265 case HImode:
25266 insn = gen_subhi3_carry;
25267 break;
25268 case SImode:
25269 insn = gen_subsi3_carry;
25270 break;
25271 case DImode:
25272 insn = gen_subdi3_carry;
25273 break;
25274 default:
25275 gcc_unreachable ();
25278 else
25280 switch (mode)
25282 case QImode:
25283 insn = gen_addqi3_carry;
25284 break;
25285 case HImode:
25286 insn = gen_addhi3_carry;
25287 break;
25288 case SImode:
25289 insn = gen_addsi3_carry;
25290 break;
25291 case DImode:
25292 insn = gen_adddi3_carry;
25293 break;
25294 default:
25295 gcc_unreachable ();
25298 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25300 return true;
25304 /* Split OPERAND into half-mode parts stored in PARTS.  Similar to
25305 split_double_mode, but works for floating point parameters and
25306 non-offsettable memories.  For pushes, it returns just stack offsets;
25307 the values will be saved in the right order.  At most four parts are generated. */
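/* An added example of the splitting: on ia32 a DFmode constant becomes two
   SImode immediates holding its low and high 32-bit words, an XFmode value
   yields three SImode parts, and a DImode hard register is split into the
   two consecutive SImode registers it occupies.  */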
25309 static int
25310 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25312 int size;
25314 if (!TARGET_64BIT)
25315 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25316 else
25317 size = (GET_MODE_SIZE (mode) + 4) / 8;
25319 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25320 gcc_assert (size >= 2 && size <= 4);
25322 /* Optimize constant pool references into immediates.  This is used by FP
25323 moves, which force all constants to memory to allow combining. */
25324 if (MEM_P (operand) && MEM_READONLY_P (operand))
25326 rtx tmp = maybe_get_pool_constant (operand);
25327 if (tmp)
25328 operand = tmp;
25331 if (MEM_P (operand) && !offsettable_memref_p (operand))
25333 /* The only non-offsettable memories we handle are pushes. */
25334 int ok = push_operand (operand, VOIDmode);
25336 gcc_assert (ok);
25338 operand = copy_rtx (operand);
25339 PUT_MODE (operand, word_mode);
25340 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25341 return size;
25344 if (GET_CODE (operand) == CONST_VECTOR)
25346 machine_mode imode = int_mode_for_mode (mode);
25347 /* Caution: if we looked through a constant pool memory above,
25348 the operand may actually have a different mode now. That's
25349 ok, since we want to pun this all the way back to an integer. */
25350 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25351 gcc_assert (operand != NULL);
25352 mode = imode;
25355 if (!TARGET_64BIT)
25357 if (mode == DImode)
25358 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25359 else
25361 int i;
25363 if (REG_P (operand))
25365 gcc_assert (reload_completed);
25366 for (i = 0; i < size; i++)
25367 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25369 else if (offsettable_memref_p (operand))
25371 operand = adjust_address (operand, SImode, 0);
25372 parts[0] = operand;
25373 for (i = 1; i < size; i++)
25374 parts[i] = adjust_address (operand, SImode, 4 * i);
25376 else if (CONST_DOUBLE_P (operand))
25378 const REAL_VALUE_TYPE *r;
25379 long l[4];
25381 r = CONST_DOUBLE_REAL_VALUE (operand);
25382 switch (mode)
25384 case TFmode:
25385 real_to_target (l, r, mode);
25386 parts[3] = gen_int_mode (l[3], SImode);
25387 parts[2] = gen_int_mode (l[2], SImode);
25388 break;
25389 case XFmode:
25390 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25391 long double may not be 80-bit. */
25392 real_to_target (l, r, mode);
25393 parts[2] = gen_int_mode (l[2], SImode);
25394 break;
25395 case DFmode:
25396 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25397 break;
25398 default:
25399 gcc_unreachable ();
25401 parts[1] = gen_int_mode (l[1], SImode);
25402 parts[0] = gen_int_mode (l[0], SImode);
25404 else
25405 gcc_unreachable ();
25408 else
25410 if (mode == TImode)
25411 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25412 if (mode == XFmode || mode == TFmode)
25414 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25415 if (REG_P (operand))
25417 gcc_assert (reload_completed);
25418 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25419 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25421 else if (offsettable_memref_p (operand))
25423 operand = adjust_address (operand, DImode, 0);
25424 parts[0] = operand;
25425 parts[1] = adjust_address (operand, upper_mode, 8);
25427 else if (CONST_DOUBLE_P (operand))
25429 long l[4];
25431 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25433 /* real_to_target puts 32-bit pieces in each long. */
25434 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25435 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25436 << 32), DImode);
25438 if (upper_mode == SImode)
25439 parts[1] = gen_int_mode (l[2], SImode);
25440 else
25441 parts[1]
25442 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25443 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25444 << 32), DImode);
25446 else
25447 gcc_unreachable ();
25451 return size;
25454 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25455 All required insns are emitted before returning.  Operands 2-5
25456 receive the destination parts in the correct order; operands 6-9
25457 receive the corresponding source parts. */
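/* Added illustrative note: on ia32, splitting a move such as
   (set (reg:DI ax) (mem:DI (reg:SI ax))) must not clobber the address
   register before the second load; the collision handling below detects
   the overlap and reorders the resulting SImode moves (or falls back to
   an lea) so the colliding destination part is written last.  */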
25459 void
25460 ix86_split_long_move (rtx operands[])
25462 rtx part[2][4];
25463 int nparts, i, j;
25464 int push = 0;
25465 int collisions = 0;
25466 machine_mode mode = GET_MODE (operands[0]);
25467 bool collisionparts[4];
25469 /* The DFmode expanders may ask us to move a double.
25470 For a 64-bit target this is a single move.  By hiding that fact
25471 here we simplify the i386.md splitters. */
25472 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25474 /* Optimize constant pool references into immediates.  This is used by
25475 FP moves, which force all constants to memory to allow combining. */
25477 if (MEM_P (operands[1])
25478 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25479 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25480 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25481 if (push_operand (operands[0], VOIDmode))
25483 operands[0] = copy_rtx (operands[0]);
25484 PUT_MODE (operands[0], word_mode);
25486 else
25487 operands[0] = gen_lowpart (DImode, operands[0]);
25488 operands[1] = gen_lowpart (DImode, operands[1]);
25489 emit_move_insn (operands[0], operands[1]);
25490 return;
25493 /* The only non-offsettable memory we handle is push. */
25494 if (push_operand (operands[0], VOIDmode))
25495 push = 1;
25496 else
25497 gcc_assert (!MEM_P (operands[0])
25498 || offsettable_memref_p (operands[0]));
25500 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25501 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25503 /* When emitting a push, take care of source operands that live on the stack. */
25504 if (push && MEM_P (operands[1])
25505 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25507 rtx src_base = XEXP (part[1][nparts - 1], 0);
25509 /* Compensate for the stack decrement by 4. */
25510 if (!TARGET_64BIT && nparts == 3
25511 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25512 src_base = plus_constant (Pmode, src_base, 4);
25514 /* src_base refers to the stack pointer and is
25515 automatically decreased by the emitted pushes. */
25516 for (i = 0; i < nparts; i++)
25517 part[1][i] = change_address (part[1][i],
25518 GET_MODE (part[1][i]), src_base);
25521 /* We need to do copy in the right order in case an address register
25522 of the source overlaps the destination. */
25523 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25525 rtx tmp;
25527 for (i = 0; i < nparts; i++)
25529 collisionparts[i]
25530 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25531 if (collisionparts[i])
25532 collisions++;
25535 /* Collision in the middle part can be handled by reordering. */
25536 if (collisions == 1 && nparts == 3 && collisionparts [1])
25538 std::swap (part[0][1], part[0][2]);
25539 std::swap (part[1][1], part[1][2]);
25541 else if (collisions == 1
25542 && nparts == 4
25543 && (collisionparts [1] || collisionparts [2]))
25545 if (collisionparts [1])
25547 std::swap (part[0][1], part[0][2]);
25548 std::swap (part[1][1], part[1][2]);
25550 else
25552 std::swap (part[0][2], part[0][3]);
25553 std::swap (part[1][2], part[1][3]);
25557 /* If there are more collisions, we can't handle it by reordering.
25558 Do an lea to the last part and use only one colliding move. */
25559 else if (collisions > 1)
25561 rtx base, addr, tls_base = NULL_RTX;
25563 collisions = 1;
25565 base = part[0][nparts - 1];
25567 /* Handle the case when the last part isn't valid for lea.
25568 Happens in 64-bit mode storing the 12-byte XFmode. */
25569 if (GET_MODE (base) != Pmode)
25570 base = gen_rtx_REG (Pmode, REGNO (base));
25572 addr = XEXP (part[1][0], 0);
25573 if (TARGET_TLS_DIRECT_SEG_REFS)
25575 struct ix86_address parts;
25576 int ok = ix86_decompose_address (addr, &parts);
25577 gcc_assert (ok);
25578 if (parts.seg == DEFAULT_TLS_SEG_REG)
25580 /* It is not valid to use %gs: or %fs: in
25581 lea though, so we need to remove it from the
25582 address used for lea and add it to each individual
25583 memory load instead. */
25584 addr = copy_rtx (addr);
25585 rtx *x = &addr;
25586 while (GET_CODE (*x) == PLUS)
25588 for (i = 0; i < 2; i++)
25590 rtx u = XEXP (*x, i);
25591 if (GET_CODE (u) == ZERO_EXTEND)
25592 u = XEXP (u, 0);
25593 if (GET_CODE (u) == UNSPEC
25594 && XINT (u, 1) == UNSPEC_TP)
25596 tls_base = XEXP (*x, i);
25597 *x = XEXP (*x, 1 - i);
25598 break;
25601 if (tls_base)
25602 break;
25603 x = &XEXP (*x, 0);
25605 gcc_assert (tls_base);
25608 emit_insn (gen_rtx_SET (base, addr));
25609 if (tls_base)
25610 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
25611 part[1][0] = replace_equiv_address (part[1][0], base);
25612 for (i = 1; i < nparts; i++)
25614 if (tls_base)
25615 base = copy_rtx (base);
25616 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25617 part[1][i] = replace_equiv_address (part[1][i], tmp);
25622 if (push)
25624 if (!TARGET_64BIT)
25626 if (nparts == 3)
25628 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25629 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25630 stack_pointer_rtx, GEN_INT (-4)));
25631 emit_move_insn (part[0][2], part[1][2]);
25633 else if (nparts == 4)
25635 emit_move_insn (part[0][3], part[1][3]);
25636 emit_move_insn (part[0][2], part[1][2]);
25639 else
25641 /* In 64-bit mode we don't have a 32-bit push available.  If this is a
25642 register, that is OK - we will just use the larger counterpart.  We also
25643 retype memory - these come from an attempt to avoid a REX prefix on
25644 moves of the second half of a TFmode value. */
25645 if (GET_MODE (part[1][1]) == SImode)
25647 switch (GET_CODE (part[1][1]))
25649 case MEM:
25650 part[1][1] = adjust_address (part[1][1], DImode, 0);
25651 break;
25653 case REG:
25654 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25655 break;
25657 default:
25658 gcc_unreachable ();
25661 if (GET_MODE (part[1][0]) == SImode)
25662 part[1][0] = part[1][1];
25665 emit_move_insn (part[0][1], part[1][1]);
25666 emit_move_insn (part[0][0], part[1][0]);
25667 return;
25670 /* Choose correct order to not overwrite the source before it is copied. */
25671 if ((REG_P (part[0][0])
25672 && REG_P (part[1][1])
25673 && (REGNO (part[0][0]) == REGNO (part[1][1])
25674 || (nparts == 3
25675 && REGNO (part[0][0]) == REGNO (part[1][2]))
25676 || (nparts == 4
25677 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25678 || (collisions > 0
25679 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25681 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25683 operands[2 + i] = part[0][j];
25684 operands[6 + i] = part[1][j];
25687 else
25689 for (i = 0; i < nparts; i++)
25691 operands[2 + i] = part[0][i];
25692 operands[6 + i] = part[1][i];
25696 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25697 if (optimize_insn_for_size_p ())
25699 for (j = 0; j < nparts - 1; j++)
25700 if (CONST_INT_P (operands[6 + j])
25701 && operands[6 + j] != const0_rtx
25702 && REG_P (operands[2 + j]))
25703 for (i = j; i < nparts - 1; i++)
25704 if (CONST_INT_P (operands[7 + i])
25705 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25706 operands[7 + i] = operands[2 + j];
25709 for (i = 0; i < nparts; i++)
25710 emit_move_insn (operands[2 + i], operands[6 + i]);
25712 return;
25715 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25716 left shift by a constant, either using a single shift or
25717 a sequence of add instructions. */
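/* Added example: with COUNT == 2, a cheap add and not optimizing for size,
   the shift is emitted as two self-additions ("add reg, reg" twice);
   otherwise a single "shl reg, 2" is used.  */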
25719 static void
25720 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25722 rtx (*insn)(rtx, rtx, rtx);
25724 if (count == 1
25725 || (count * ix86_cost->add <= ix86_cost->shift_const
25726 && !optimize_insn_for_size_p ()))
25728 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25729 while (count-- > 0)
25730 emit_insn (insn (operand, operand, operand));
25732 else
25734 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25735 emit_insn (insn (operand, operand, GEN_INT (count)));
25739 void
25740 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25742 rtx (*gen_ashl3)(rtx, rtx, rtx);
25743 rtx (*gen_shld)(rtx, rtx, rtx);
25744 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25746 rtx low[2], high[2];
25747 int count;
25749 if (CONST_INT_P (operands[2]))
25751 split_double_mode (mode, operands, 2, low, high);
25752 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25754 if (count >= half_width)
25756 emit_move_insn (high[0], low[1]);
25757 emit_move_insn (low[0], const0_rtx);
25759 if (count > half_width)
25760 ix86_expand_ashl_const (high[0], count - half_width, mode);
25762 else
25764 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25766 if (!rtx_equal_p (operands[0], operands[1]))
25767 emit_move_insn (operands[0], operands[1]);
25769 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25770 ix86_expand_ashl_const (low[0], count, mode);
25772 return;
25775 split_double_mode (mode, operands, 1, low, high);
25777 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25779 if (operands[1] == const1_rtx)
25781 /* Assuming we've chosen QImode-capable registers, 1 << N
25782 can be done with two 32/64-bit shifts, no branches, no cmoves. */
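/* Added worked example: for DImode 1 << 37 on ia32, both halves are
   cleared, the test of bit 5 of the count clears ZF, so sete/setne store
   0 into the low half and 1 into the high half; the final shifts by
   37 & 31 == 5 leave low == 0 and high == 1 << 5, i.e. the value 1 << 37.  */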
25783 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25785 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25787 ix86_expand_clear (low[0]);
25788 ix86_expand_clear (high[0]);
25789 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25791 d = gen_lowpart (QImode, low[0]);
25792 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25793 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25794 emit_insn (gen_rtx_SET (d, s));
25796 d = gen_lowpart (QImode, high[0]);
25797 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25798 s = gen_rtx_NE (QImode, flags, const0_rtx);
25799 emit_insn (gen_rtx_SET (d, s));
25802 /* Otherwise, we can get the same results by manually performing
25803 a bit extract operation on bit 5/6, and then performing the two
25804 shifts. The two methods of getting 0/1 into low/high are exactly
25805 the same size. Avoiding the shift in the bit extract case helps
25806 pentium4 a bit; no one else seems to care much either way. */
25807 else
25809 machine_mode half_mode;
25810 rtx (*gen_lshr3)(rtx, rtx, rtx);
25811 rtx (*gen_and3)(rtx, rtx, rtx);
25812 rtx (*gen_xor3)(rtx, rtx, rtx);
25813 HOST_WIDE_INT bits;
25814 rtx x;
25816 if (mode == DImode)
25818 half_mode = SImode;
25819 gen_lshr3 = gen_lshrsi3;
25820 gen_and3 = gen_andsi3;
25821 gen_xor3 = gen_xorsi3;
25822 bits = 5;
25824 else
25826 half_mode = DImode;
25827 gen_lshr3 = gen_lshrdi3;
25828 gen_and3 = gen_anddi3;
25829 gen_xor3 = gen_xordi3;
25830 bits = 6;
25833 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25834 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25835 else
25836 x = gen_lowpart (half_mode, operands[2]);
25837 emit_insn (gen_rtx_SET (high[0], x));
25839 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25840 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25841 emit_move_insn (low[0], high[0]);
25842 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25845 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25846 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25847 return;
25850 if (operands[1] == constm1_rtx)
25852 /* For -1 << N, we can avoid the shld instruction, because we
25853 know that we're shifting 0...31/63 ones into a -1. */
25854 emit_move_insn (low[0], constm1_rtx);
25855 if (optimize_insn_for_size_p ())
25856 emit_move_insn (high[0], low[0]);
25857 else
25858 emit_move_insn (high[0], constm1_rtx);
25860 else
25862 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25864 if (!rtx_equal_p (operands[0], operands[1]))
25865 emit_move_insn (operands[0], operands[1]);
25867 split_double_mode (mode, operands, 1, low, high);
25868 emit_insn (gen_shld (high[0], low[0], operands[2]));
25871 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25873 if (TARGET_CMOVE && scratch)
25875 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25876 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25878 ix86_expand_clear (scratch);
25879 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25881 else
25883 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25884 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25886 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
25890 void
25891 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25893 rtx (*gen_ashr3)(rtx, rtx, rtx)
25894 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25895 rtx (*gen_shrd)(rtx, rtx, rtx);
25896 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25898 rtx low[2], high[2];
25899 int count;
25901 if (CONST_INT_P (operands[2]))
25903 split_double_mode (mode, operands, 2, low, high);
25904 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25906 if (count == GET_MODE_BITSIZE (mode) - 1)
25908 emit_move_insn (high[0], high[1]);
25909 emit_insn (gen_ashr3 (high[0], high[0],
25910 GEN_INT (half_width - 1)));
25911 emit_move_insn (low[0], high[0]);
25914 else if (count >= half_width)
25916 emit_move_insn (low[0], high[1]);
25917 emit_move_insn (high[0], low[0]);
25918 emit_insn (gen_ashr3 (high[0], high[0],
25919 GEN_INT (half_width - 1)));
25921 if (count > half_width)
25922 emit_insn (gen_ashr3 (low[0], low[0],
25923 GEN_INT (count - half_width)));
25925 else
25927 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25929 if (!rtx_equal_p (operands[0], operands[1]))
25930 emit_move_insn (operands[0], operands[1]);
25932 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25933 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25936 else
25938 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25940 if (!rtx_equal_p (operands[0], operands[1]))
25941 emit_move_insn (operands[0], operands[1]);
25943 split_double_mode (mode, operands, 1, low, high);
25945 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25946 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25948 if (TARGET_CMOVE && scratch)
25950 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25951 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25953 emit_move_insn (scratch, high[0]);
25954 emit_insn (gen_ashr3 (scratch, scratch,
25955 GEN_INT (half_width - 1)));
25956 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25957 scratch));
25959 else
25961 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25962 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25964 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
25969 void
25970 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25972 rtx (*gen_lshr3)(rtx, rtx, rtx)
25973 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25974 rtx (*gen_shrd)(rtx, rtx, rtx);
25975 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25977 rtx low[2], high[2];
25978 int count;
25980 if (CONST_INT_P (operands[2]))
25982 split_double_mode (mode, operands, 2, low, high);
25983 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25985 if (count >= half_width)
25987 emit_move_insn (low[0], high[1]);
25988 ix86_expand_clear (high[0]);
25990 if (count > half_width)
25991 emit_insn (gen_lshr3 (low[0], low[0],
25992 GEN_INT (count - half_width)));
25994 else
25996 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25998 if (!rtx_equal_p (operands[0], operands[1]))
25999 emit_move_insn (operands[0], operands[1]);
26001 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26002 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26005 else
26007 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26009 if (!rtx_equal_p (operands[0], operands[1]))
26010 emit_move_insn (operands[0], operands[1]);
26012 split_double_mode (mode, operands, 1, low, high);
26014 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26015 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26017 if (TARGET_CMOVE && scratch)
26019 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26020 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26022 ix86_expand_clear (scratch);
26023 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26024 scratch));
26026 else
26028 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26029 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26031 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26036 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
26037 static void
26038 predict_jump (int prob)
26040 rtx_insn *insn = get_last_insn ();
26041 gcc_assert (JUMP_P (insn));
26042 add_int_reg_note (insn, REG_BR_PROB, prob);
26045 /* Helper function for the string operations below.  Test whether VARIABLE
26046 is aligned to VALUE bytes.  If true, jump to the label. */
26047 static rtx_code_label *
26048 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26050 rtx_code_label *label = gen_label_rtx ();
26051 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26052 if (GET_MODE (variable) == DImode)
26053 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26054 else
26055 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26056 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26057 1, label);
26058 if (epilogue)
26059 predict_jump (REG_BR_PROB_BASE * 50 / 100);
26060 else
26061 predict_jump (REG_BR_PROB_BASE * 90 / 100);
26062 return label;
26065 /* Decrease COUNTREG by VALUE. */
26066 static void
26067 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26069 rtx (*gen_add)(rtx, rtx, rtx)
26070 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26072 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26075 /* Zero-extend EXP (possibly SImode) to a Pmode register. */
26077 rtx ix86_zero_extend_to_Pmode (rtx exp)
26079 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26082 /* Divide COUNTREG by SCALE. */
26083 static rtx
26084 scale_counter (rtx countreg, int scale)
26086 rtx sc;
26088 if (scale == 1)
26089 return countreg;
26090 if (CONST_INT_P (countreg))
26091 return GEN_INT (INTVAL (countreg) / scale);
26092 gcc_assert (REG_P (countreg));
26094 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26095 GEN_INT (exact_log2 (scale)),
26096 NULL, 1, OPTAB_DIRECT);
26097 return sc;
26100 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26101 DImode for constant loop counts. */
26103 static machine_mode
26104 counter_mode (rtx count_exp)
26106 if (GET_MODE (count_exp) != VOIDmode)
26107 return GET_MODE (count_exp);
26108 if (!CONST_INT_P (count_exp))
26109 return Pmode;
26110 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26111 return DImode;
26112 return SImode;
26115 /* Copy the address to a Pmode register. This is used for x32 to
26116 truncate DImode TLS address to a SImode register. */
26118 static rtx
26119 ix86_copy_addr_to_reg (rtx addr)
26121 rtx reg;
26122 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
26124 reg = copy_addr_to_reg (addr);
26125 REG_POINTER (reg) = 1;
26126 return reg;
26128 else
26130 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
26131 reg = copy_to_mode_reg (DImode, addr);
26132 REG_POINTER (reg) = 1;
26133 return gen_rtx_SUBREG (SImode, reg, 0);
26137 /* When ISSETMEM is FALSE, output a simple loop to copy memory pointed to
26138 by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall
26139 size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output the
26140 equivalent loop to set memory to VALUE (assumed to be in MODE).
26142 The size is rounded down to a whole number of chunks moved at once.
26143 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
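/* An added sketch of the emitted loop, for the copy case with
   MODE == SImode and UNROLL == 4 (illustrative only):

       size = count & ~(size_t) 15;
       iter = 0;
       do
         {
           copy 16 bytes from src + iter to dest + iter;   (four SImode moves)
           iter += 16;
         }
       while (iter < size);
       dest += iter;
       src += iter;

   The initial "size == 0" check is only emitted when the chunk size is a
   single byte.  */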
26146 static void
26147 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
26148 rtx destptr, rtx srcptr, rtx value,
26149 rtx count, machine_mode mode, int unroll,
26150 int expected_size, bool issetmem)
26152 rtx_code_label *out_label, *top_label;
26153 rtx iter, tmp;
26154 machine_mode iter_mode = counter_mode (count);
26155 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
26156 rtx piece_size = GEN_INT (piece_size_n);
26157 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
26158 rtx size;
26159 int i;
26161 top_label = gen_label_rtx ();
26162 out_label = gen_label_rtx ();
26163 iter = gen_reg_rtx (iter_mode);
26165 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
26166 NULL, 1, OPTAB_DIRECT);
26167 /* Those two should combine. */
26168 if (piece_size == const1_rtx)
26170 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
26171 true, out_label);
26172 predict_jump (REG_BR_PROB_BASE * 10 / 100);
26174 emit_move_insn (iter, const0_rtx);
26176 emit_label (top_label);
26178 tmp = convert_modes (Pmode, iter_mode, iter, true);
26180 /* This assert could be relaxed - in that case we'll need to compute
26181 the smallest power of two containing PIECE_SIZE_N and pass it to
26182 offset_address. */
26183 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
26184 destmem = offset_address (destmem, tmp, piece_size_n);
26185 destmem = adjust_address (destmem, mode, 0);
26187 if (!issetmem)
26189 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
26190 srcmem = adjust_address (srcmem, mode, 0);
26192 /* When unrolling for chips that reorder memory reads and writes,
26193 we can save registers by using a single temporary.
26194 Also, using 4 temporaries is overkill in 32-bit mode. */
26195 if (!TARGET_64BIT && 0)
26197 for (i = 0; i < unroll; i++)
26199 if (i)
26201 destmem =
26202 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26203 srcmem =
26204 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26206 emit_move_insn (destmem, srcmem);
26209 else
26211 rtx tmpreg[4];
26212 gcc_assert (unroll <= 4);
26213 for (i = 0; i < unroll; i++)
26215 tmpreg[i] = gen_reg_rtx (mode);
26216 if (i)
26218 srcmem =
26219 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26221 emit_move_insn (tmpreg[i], srcmem);
26223 for (i = 0; i < unroll; i++)
26225 if (i)
26227 destmem =
26228 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26230 emit_move_insn (destmem, tmpreg[i]);
26234 else
26235 for (i = 0; i < unroll; i++)
26237 if (i)
26238 destmem =
26239 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26240 emit_move_insn (destmem, value);
26243 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26244 true, OPTAB_LIB_WIDEN);
26245 if (tmp != iter)
26246 emit_move_insn (iter, tmp);
26248 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26249 true, top_label);
26250 if (expected_size != -1)
26252 expected_size /= GET_MODE_SIZE (mode) * unroll;
26253 if (expected_size == 0)
26254 predict_jump (0);
26255 else if (expected_size > REG_BR_PROB_BASE)
26256 predict_jump (REG_BR_PROB_BASE - 1);
26257 else
26258 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26260 else
26261 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26262 iter = ix86_zero_extend_to_Pmode (iter);
26263 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26264 true, OPTAB_LIB_WIDEN);
26265 if (tmp != destptr)
26266 emit_move_insn (destptr, tmp);
26267 if (!issetmem)
26269 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26270 true, OPTAB_LIB_WIDEN);
26271 if (tmp != srcptr)
26272 emit_move_insn (srcptr, tmp);
26274 emit_label (out_label);
26277 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM argument.
26278 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26279 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26280 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26281 ORIG_VALUE is the original value passed to memset to fill the memory with.
26282 Other arguments have the same meaning as for the previous function. */
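/* Added illustrative note: clearing N bytes with ORIG_VALUE == 0 and N a
   multiple of 4 promotes MODE to SImode, so the count register is loaded
   with N / 4 and a single "rep stos" of SImode chunks is emitted; DESTEXP
   records that the destination pointer advances by 4 * count.  */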
26284 static void
26285 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26286 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26287 rtx count,
26288 machine_mode mode, bool issetmem)
26290 rtx destexp;
26291 rtx srcexp;
26292 rtx countreg;
26293 HOST_WIDE_INT rounded_count;
26295 /* If possible, it is shorter to use rep movs.
26296 TODO: Maybe it is better to move this logic to decide_alg. */
26297 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26298 && (!issetmem || orig_value == const0_rtx))
26299 mode = SImode;
26301 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26302 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26304 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26305 GET_MODE_SIZE (mode)));
26306 if (mode != QImode)
26308 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26309 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26310 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26312 else
26313 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26314 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26316 rounded_count
26317 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26318 destmem = shallow_copy_rtx (destmem);
26319 set_mem_size (destmem, rounded_count);
26321 else if (MEM_SIZE_KNOWN_P (destmem))
26322 clear_mem_size (destmem);
26324 if (issetmem)
26326 value = force_reg (mode, gen_lowpart (mode, value));
26327 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26329 else
26331 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26332 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26333 if (mode != QImode)
26335 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26336 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26337 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26339 else
26340 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26341 if (CONST_INT_P (count))
26343 rounded_count
26344 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26345 srcmem = shallow_copy_rtx (srcmem);
26346 set_mem_size (srcmem, rounded_count);
26348 else
26350 if (MEM_SIZE_KNOWN_P (srcmem))
26351 clear_mem_size (srcmem);
26353 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26354 destexp, srcexp));
26358 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26359 DESTMEM.
26360 SRCMEM is passed by pointer so it can be updated on return.
26361 The return value is the updated DESTMEM. */
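/* Added example: a SIZE_TO_MOVE of 8 on a 64-bit target selects DImode, so
   one load into a fresh temporary and one store from it are emitted and
   both pointers are advanced by 8; the assertion below requires SIZE_TO_MOVE
   to be a multiple of the chosen piece size.  */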
26362 static rtx
26363 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26364 HOST_WIDE_INT size_to_move)
26366 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26367 enum insn_code code;
26368 machine_mode move_mode;
26369 int piece_size, i;
26371 /* Find the widest mode in which we could perform moves.
26372 Start with the biggest power of 2 no larger than SIZE_TO_MOVE and halve
26373 it until a move of that size is supported. */
26374 piece_size = 1 << floor_log2 (size_to_move);
26375 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26376 code = optab_handler (mov_optab, move_mode);
26377 while (code == CODE_FOR_nothing && piece_size > 1)
26379 piece_size >>= 1;
26380 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26381 code = optab_handler (mov_optab, move_mode);
26384 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26385 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26386 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26388 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26389 move_mode = mode_for_vector (word_mode, nunits);
26390 code = optab_handler (mov_optab, move_mode);
26391 if (code == CODE_FOR_nothing)
26393 move_mode = word_mode;
26394 piece_size = GET_MODE_SIZE (move_mode);
26395 code = optab_handler (mov_optab, move_mode);
26398 gcc_assert (code != CODE_FOR_nothing);
26400 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26401 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26403 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26404 gcc_assert (size_to_move % piece_size == 0);
26405 adjust = GEN_INT (piece_size);
26406 for (i = 0; i < size_to_move; i += piece_size)
26408 /* We move from memory to memory, so we'll need to do it via
26409 a temporary register. */
26410 tempreg = gen_reg_rtx (move_mode);
26411 emit_insn (GEN_FCN (code) (tempreg, src));
26412 emit_insn (GEN_FCN (code) (dst, tempreg));
26414 emit_move_insn (destptr,
26415 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26416 emit_move_insn (srcptr,
26417 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26419 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26420 piece_size);
26421 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26422 piece_size);
26425 /* Update DST and SRC rtx. */
26426 *srcmem = src;
26427 return dst;
26430 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
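/* Added illustrative note: with MAX_SIZE == 16 and a constant COUNT of 10,
   the epilogue size is 10, so the code below emits an 8-byte move followed
   by a 2-byte move.  */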
26431 static void
26432 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26433 rtx destptr, rtx srcptr, rtx count, int max_size)
26435 rtx src, dest;
26436 if (CONST_INT_P (count))
26438 HOST_WIDE_INT countval = INTVAL (count);
26439 HOST_WIDE_INT epilogue_size = countval % max_size;
26440 int i;
26442 /* For now MAX_SIZE should be a power of 2. This assert could be
26443 relaxed, but it'll require a bit more complicated epilogue
26444 expanding. */
26445 gcc_assert ((max_size & (max_size - 1)) == 0);
26446 for (i = max_size; i >= 1; i >>= 1)
26448 if (epilogue_size & i)
26449 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26451 return;
26453 if (max_size > 8)
26455 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26456 count, 1, OPTAB_DIRECT);
26457 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26458 count, QImode, 1, 4, false);
26459 return;
26462 /* When there are stringops, we can cheaply increase dest and src pointers.
26463 Otherwise we save code size by maintaining offset (zero is readily
26464 available from the preceding rep operation) and using x86 addressing modes. */
26466 if (TARGET_SINGLE_STRINGOP)
26468 if (max_size > 4)
26470 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26471 src = change_address (srcmem, SImode, srcptr);
26472 dest = change_address (destmem, SImode, destptr);
26473 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26474 emit_label (label);
26475 LABEL_NUSES (label) = 1;
26477 if (max_size > 2)
26479 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26480 src = change_address (srcmem, HImode, srcptr);
26481 dest = change_address (destmem, HImode, destptr);
26482 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26483 emit_label (label);
26484 LABEL_NUSES (label) = 1;
26486 if (max_size > 1)
26488 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26489 src = change_address (srcmem, QImode, srcptr);
26490 dest = change_address (destmem, QImode, destptr);
26491 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26492 emit_label (label);
26493 LABEL_NUSES (label) = 1;
26496 else
26498 rtx offset = force_reg (Pmode, const0_rtx);
26499 rtx tmp;
26501 if (max_size > 4)
26503 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26504 src = change_address (srcmem, SImode, srcptr);
26505 dest = change_address (destmem, SImode, destptr);
26506 emit_move_insn (dest, src);
26507 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26508 true, OPTAB_LIB_WIDEN);
26509 if (tmp != offset)
26510 emit_move_insn (offset, tmp);
26511 emit_label (label);
26512 LABEL_NUSES (label) = 1;
26514 if (max_size > 2)
26516 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26517 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26518 src = change_address (srcmem, HImode, tmp);
26519 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26520 dest = change_address (destmem, HImode, tmp);
26521 emit_move_insn (dest, src);
26522 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26523 true, OPTAB_LIB_WIDEN);
26524 if (tmp != offset)
26525 emit_move_insn (offset, tmp);
26526 emit_label (label);
26527 LABEL_NUSES (label) = 1;
26529 if (max_size > 1)
26531 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26532 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26533 src = change_address (srcmem, QImode, tmp);
26534 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26535 dest = change_address (destmem, QImode, tmp);
26536 emit_move_insn (dest, src);
26537 emit_label (label);
26538 LABEL_NUSES (label) = 1;
26543 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26544 with value PROMOTED_VAL.
26546 The return value is the updated DESTMEM. */
26547 static rtx
26548 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26549 HOST_WIDE_INT size_to_move)
26551 rtx dst = destmem, adjust;
26552 enum insn_code code;
26553 machine_mode move_mode;
26554 int piece_size, i;
26556 /* Find the widest mode in which we could perform moves.
26557 Start with the mode of PROMOTED_VAL and shrink it if SIZE_TO_MOVE
26558 is smaller than the size of that mode. */
26559 move_mode = GET_MODE (promoted_val);
26560 if (move_mode == VOIDmode)
26561 move_mode = QImode;
26562 if (size_to_move < GET_MODE_SIZE (move_mode))
26564 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
26565 promoted_val = gen_lowpart (move_mode, promoted_val);
26567 piece_size = GET_MODE_SIZE (move_mode);
26568 code = optab_handler (mov_optab, move_mode);
26569 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26571 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26573 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26574 gcc_assert (size_to_move % piece_size == 0);
26575 adjust = GEN_INT (piece_size);
26576 for (i = 0; i < size_to_move; i += piece_size)
26578 if (piece_size <= GET_MODE_SIZE (word_mode))
26580 emit_insn (gen_strset (destptr, dst, promoted_val));
26581 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26582 piece_size);
26583 continue;
26586 emit_insn (GEN_FCN (code) (dst, promoted_val));
26588 emit_move_insn (destptr,
26589 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26591 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26592 piece_size);
26595 /* Update DST rtx. */
26596 return dst;
26598 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26599 static void
26600 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26601 rtx count, int max_size)
26603 count =
26604 expand_simple_binop (counter_mode (count), AND, count,
26605 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26606 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26607 gen_lowpart (QImode, value), count, QImode,
26608 1, max_size / 2, true);
26611 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26612 static void
26613 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26614 rtx count, int max_size)
26616 rtx dest;
26618 if (CONST_INT_P (count))
26620 HOST_WIDE_INT countval = INTVAL (count);
26621 HOST_WIDE_INT epilogue_size = countval % max_size;
26622 int i;
26624 /* For now MAX_SIZE should be a power of 2. This assert could be
26625 relaxed, but it'll require a bit more complicated epilogue
26626 expanding. */
26627 gcc_assert ((max_size & (max_size - 1)) == 0);
26628 for (i = max_size; i >= 1; i >>= 1)
26630 if (epilogue_size & i)
26632 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26633 destmem = emit_memset (destmem, destptr, vec_value, i);
26634 else
26635 destmem = emit_memset (destmem, destptr, value, i);
26638 return;
26640 if (max_size > 32)
26642 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26643 return;
26645 if (max_size > 16)
26647 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26648 if (TARGET_64BIT)
26650 dest = change_address (destmem, DImode, destptr);
26651 emit_insn (gen_strset (destptr, dest, value));
26652 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26653 emit_insn (gen_strset (destptr, dest, value));
26655 else
26657 dest = change_address (destmem, SImode, destptr);
26658 emit_insn (gen_strset (destptr, dest, value));
26659 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26660 emit_insn (gen_strset (destptr, dest, value));
26661 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26662 emit_insn (gen_strset (destptr, dest, value));
26663 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26664 emit_insn (gen_strset (destptr, dest, value));
26666 emit_label (label);
26667 LABEL_NUSES (label) = 1;
26669 if (max_size > 8)
26671 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26672 if (TARGET_64BIT)
26674 dest = change_address (destmem, DImode, destptr);
26675 emit_insn (gen_strset (destptr, dest, value));
26677 else
26679 dest = change_address (destmem, SImode, destptr);
26680 emit_insn (gen_strset (destptr, dest, value));
26681 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26682 emit_insn (gen_strset (destptr, dest, value));
26684 emit_label (label);
26685 LABEL_NUSES (label) = 1;
26687 if (max_size > 4)
26689 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26690 dest = change_address (destmem, SImode, destptr);
26691 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26692 emit_label (label);
26693 LABEL_NUSES (label) = 1;
26695 if (max_size > 2)
26697 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26698 dest = change_address (destmem, HImode, destptr);
26699 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26700 emit_label (label);
26701 LABEL_NUSES (label) = 1;
26703 if (max_size > 1)
26705 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26706 dest = change_address (destmem, QImode, destptr);
26707 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26708 emit_label (label);
26709 LABEL_NUSES (label) = 1;
26713 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
26714 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26715 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26716 ignored.
26717 Return value is updated DESTMEM. */
26718 static rtx
26719 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26720 rtx destptr, rtx srcptr, rtx value,
26721 rtx vec_value, rtx count, int align,
26722 int desired_alignment, bool issetmem)
26724 int i;
26725 for (i = 1; i < desired_alignment; i <<= 1)
26727 if (align <= i)
26729 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26730 if (issetmem)
26732 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26733 destmem = emit_memset (destmem, destptr, vec_value, i);
26734 else
26735 destmem = emit_memset (destmem, destptr, value, i);
26737 else
26738 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26739 ix86_adjust_counter (count, i);
26740 emit_label (label);
26741 LABEL_NUSES (label) = 1;
26742 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26745 return destmem;
26748 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
26749 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26750 and jump to DONE_LABEL. */
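/* Illustrative note (not part of the original source): the trick used below
   is that any block of N bytes with SIZE <= N <= 2*SIZE-1 can be handled by
   two possibly overlapping SIZE-byte moves, roughly

     memcpy (dst, src, SIZE);
     memcpy (dst + n - SIZE, src + n - SIZE, SIZE);

   e.g. with SIZE == 4 and n == 6 the two moves cover bytes 0..3 and 2..5.  */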
26751 static void
26752 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26753 rtx destptr, rtx srcptr,
26754 rtx value, rtx vec_value,
26755 rtx count, int size,
26756 rtx done_label, bool issetmem)
26758 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26759 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
26760 rtx modesize;
26761 int n;
26763 /* If we do not have a vector value to copy, we must reduce the size. */
26764 if (issetmem)
26766 if (!vec_value)
26768 if (GET_MODE (value) == VOIDmode && size > 8)
26769 mode = Pmode;
26770 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26771 mode = GET_MODE (value);
26773 else
26774 mode = GET_MODE (vec_value), value = vec_value;
26776 else
26778 /* Choose appropriate vector mode. */
26779 if (size >= 32)
26780 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26781 else if (size >= 16)
26782 mode = TARGET_SSE ? V16QImode : DImode;
26783 srcmem = change_address (srcmem, mode, srcptr);
26785 destmem = change_address (destmem, mode, destptr);
26786 modesize = GEN_INT (GET_MODE_SIZE (mode));
26787 gcc_assert (GET_MODE_SIZE (mode) <= size);
26788 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26790 if (issetmem)
26791 emit_move_insn (destmem, gen_lowpart (mode, value));
26792 else
26794 emit_move_insn (destmem, srcmem);
26795 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26797 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26800 destmem = offset_address (destmem, count, 1);
26801 destmem = offset_address (destmem, GEN_INT (-2 * size),
26802 GET_MODE_SIZE (mode));
26803 if (!issetmem)
26805 srcmem = offset_address (srcmem, count, 1);
26806 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26807 GET_MODE_SIZE (mode));
26809 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26811 if (issetmem)
26812 emit_move_insn (destmem, gen_lowpart (mode, value));
26813 else
26815 emit_move_insn (destmem, srcmem);
26816 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26818 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26820 emit_jump_insn (gen_jump (done_label));
26821 emit_barrier ();
26823 emit_label (label);
26824 LABEL_NUSES (label) = 1;
26827 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
26828 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26829 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that lets us
26830 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26831 DONE_LABEL is a label after the whole copying sequence. The label is created
26832 on demand if *DONE_LABEL is NULL.
26833 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
26834 bounds after the initial copies.
26836 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26837 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26838 we will dispatch to a library call for large blocks.
26840 In pseudocode we do:
26842 if (COUNT < SIZE)
26844 Assume that SIZE is 4. Bigger sizes are handled analogously
26845 if (COUNT & 4)
26847 copy 4 bytes from SRCPTR to DESTPTR
26848 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26849 goto done_label
26851 if (!COUNT)
26852 goto done_label;
26853 copy 1 byte from SRCPTR to DESTPTR
26854 if (COUNT & 2)
26856 copy 2 bytes from SRCPTR to DESTPTR
26857 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26860 else
26862 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26863 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26865 OLD_DESTPTR = DESTPTR;
26866 Align DESTPTR up to DESIRED_ALIGN
26867 SRCPTR += DESTPTR - OLD_DESTPTR
26868 COUNT -= DESTPTR - OLD_DESTPTR
26869 if (DYNAMIC_CHECK)
26870 Round COUNT down to multiple of SIZE
26871 << optional caller supplied zero size guard is here >>
26872 << optional caller supplied dynamic check is here >>
26873 << caller supplied main copy loop is here >>
26875 done_label:
26877 static void
26878 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26879 rtx *destptr, rtx *srcptr,
26880 machine_mode mode,
26881 rtx value, rtx vec_value,
26882 rtx *count,
26883 rtx_code_label **done_label,
26884 int size,
26885 int desired_align,
26886 int align,
26887 unsigned HOST_WIDE_INT *min_size,
26888 bool dynamic_check,
26889 bool issetmem)
26891 rtx_code_label *loop_label = NULL, *label;
26892 int n;
26893 rtx modesize;
26894 int prolog_size = 0;
26895 rtx mode_value;
26897 /* Choose the proper value to copy. */
26898 if (issetmem && VECTOR_MODE_P (mode))
26899 mode_value = vec_value;
26900 else
26901 mode_value = value;
26902 gcc_assert (GET_MODE_SIZE (mode) <= size);
26904 /* See if block is big or small, handle small blocks. */
26905 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26907 int size2 = size;
26908 loop_label = gen_label_rtx ();
26910 if (!*done_label)
26911 *done_label = gen_label_rtx ();
26913 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26914 1, loop_label);
26915 size2 >>= 1;
26917 /* Handle sizes > 3. */
26918 for (;size2 > 2; size2 >>= 1)
26919 expand_small_movmem_or_setmem (destmem, srcmem,
26920 *destptr, *srcptr,
26921 value, vec_value,
26922 *count,
26923 size2, *done_label, issetmem);
26924 /* Nothing to copy? Jump to DONE_LABEL if so */
26925 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26926 1, *done_label);
26928 /* Do a byte copy. */
26929 destmem = change_address (destmem, QImode, *destptr);
26930 if (issetmem)
26931 emit_move_insn (destmem, gen_lowpart (QImode, value));
26932 else
26934 srcmem = change_address (srcmem, QImode, *srcptr);
26935 emit_move_insn (destmem, srcmem);
26938 /* Handle sizes 2 and 3. */
26939 label = ix86_expand_aligntest (*count, 2, false);
26940 destmem = change_address (destmem, HImode, *destptr);
26941 destmem = offset_address (destmem, *count, 1);
26942 destmem = offset_address (destmem, GEN_INT (-2), 2);
26943 if (issetmem)
26944 emit_move_insn (destmem, gen_lowpart (HImode, value));
26945 else
26947 srcmem = change_address (srcmem, HImode, *srcptr);
26948 srcmem = offset_address (srcmem, *count, 1);
26949 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26950 emit_move_insn (destmem, srcmem);
26953 emit_label (label);
26954 LABEL_NUSES (label) = 1;
26955 emit_jump_insn (gen_jump (*done_label));
26956 emit_barrier ();
26958 else
26959 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26960 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26962 /* Start memcpy for COUNT >= SIZE. */
26963 if (loop_label)
26965 emit_label (loop_label);
26966 LABEL_NUSES (loop_label) = 1;
26969 /* Copy first desired_align bytes. */
26970 if (!issetmem)
26971 srcmem = change_address (srcmem, mode, *srcptr);
26972 destmem = change_address (destmem, mode, *destptr);
26973 modesize = GEN_INT (GET_MODE_SIZE (mode));
26974 for (n = 0; prolog_size < desired_align - align; n++)
26976 if (issetmem)
26977 emit_move_insn (destmem, mode_value);
26978 else
26980 emit_move_insn (destmem, srcmem);
26981 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26983 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26984 prolog_size += GET_MODE_SIZE (mode);
26988 /* Copy last SIZE bytes. */
26989 destmem = offset_address (destmem, *count, 1);
26990 destmem = offset_address (destmem,
26991 GEN_INT (-size - prolog_size),
26993 if (issetmem)
26994 emit_move_insn (destmem, mode_value);
26995 else
26997 srcmem = offset_address (srcmem, *count, 1);
26998 srcmem = offset_address (srcmem,
26999 GEN_INT (-size - prolog_size),
27001 emit_move_insn (destmem, srcmem);
27003 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27005 destmem = offset_address (destmem, modesize, 1);
27006 if (issetmem)
27007 emit_move_insn (destmem, mode_value);
27008 else
27010 srcmem = offset_address (srcmem, modesize, 1);
27011 emit_move_insn (destmem, srcmem);
27015 /* Align destination. */
27016 if (desired_align > 1 && desired_align > align)
27018 rtx saveddest = *destptr;
27020 gcc_assert (desired_align <= size);
27021 /* Align DESTPTR up, placing it in a new register. */
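/* Worked example (illustrative only): if *DESTPTR is 0x1003, PROLOG_SIZE is 8
   and DESIRED_ALIGN is 8, then 0x1003 + 8 == 0x100b, and ANDing with -8 gives
   0x1008; 5 bytes were therefore skipped, and SRCPTR and COUNT are adjusted
   by those 5 bytes below.  */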
27022 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27023 GEN_INT (prolog_size),
27024 NULL_RTX, 1, OPTAB_DIRECT);
27025 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27026 REG_POINTER (*destptr) = 1;
27027 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27028 GEN_INT (-desired_align),
27029 *destptr, 1, OPTAB_DIRECT);
27030 /* See how many bytes we skipped. */
27031 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27032 *destptr,
27033 saveddest, 1, OPTAB_DIRECT);
27034 /* Adjust srcptr and count. */
27035 if (!issetmem)
27036 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27037 saveddest, *srcptr, 1, OPTAB_DIRECT);
27038 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27039 saveddest, *count, 1, OPTAB_DIRECT);
27040 /* We copied at most size + prolog_size. */
27041 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27042 *min_size
27043 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27044 else
27045 *min_size = 0;
27047 /* Our loops always round down the block size, but for dispatch to the
27048 library we need the precise value. */
27049 if (dynamic_check)
27050 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
27051 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27053 else
27055 gcc_assert (prolog_size == 0);
27056 /* Decrease count, so we won't end up copying last word twice. */
27057 if (!CONST_INT_P (*count))
27058 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27059 constm1_rtx, *count, 1, OPTAB_DIRECT);
27060 else
27061 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27062 (unsigned HOST_WIDE_INT)size));
27063 if (*min_size)
27064 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
27069 /* This function is like the previous one, except here we know how many bytes
27070 need to be copied. That allows us to update alignment not only of DST, which
27071 is returned, but also of SRC, which is passed as a pointer for that
27072 reason. */
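/* Worked example (illustrative only): with ALIGN_BYTES == 7 and
   DESIRED_ALIGN == 8, the loop below emits a 1-byte, a 2-byte and a 4-byte
   piece (7 == 1 + 2 + 4), after which DST is 8-byte aligned.  */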
27073 static rtx
27074 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27075 rtx srcreg, rtx value, rtx vec_value,
27076 int desired_align, int align_bytes,
27077 bool issetmem)
27079 rtx src = NULL;
27080 rtx orig_dst = dst;
27081 rtx orig_src = NULL;
27082 int piece_size = 1;
27083 int copied_bytes = 0;
27085 if (!issetmem)
27087 gcc_assert (srcp != NULL);
27088 src = *srcp;
27089 orig_src = src;
27092 for (piece_size = 1;
27093 piece_size <= desired_align && copied_bytes < align_bytes;
27094 piece_size <<= 1)
27096 if (align_bytes & piece_size)
27098 if (issetmem)
27100 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27101 dst = emit_memset (dst, destreg, vec_value, piece_size);
27102 else
27103 dst = emit_memset (dst, destreg, value, piece_size);
27105 else
27106 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27107 copied_bytes += piece_size;
27110 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27111 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27112 if (MEM_SIZE_KNOWN_P (orig_dst))
27113 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27115 if (!issetmem)
27117 int src_align_bytes = get_mem_align_offset (src, desired_align
27118 * BITS_PER_UNIT);
27119 if (src_align_bytes >= 0)
27120 src_align_bytes = desired_align - src_align_bytes;
27121 if (src_align_bytes >= 0)
27123 unsigned int src_align;
27124 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
27126 if ((src_align_bytes & (src_align - 1))
27127 == (align_bytes & (src_align - 1)))
27128 break;
27130 if (src_align > (unsigned int) desired_align)
27131 src_align = desired_align;
27132 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
27133 set_mem_align (src, src_align * BITS_PER_UNIT);
27135 if (MEM_SIZE_KNOWN_P (orig_src))
27136 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
27137 *srcp = src;
27140 return dst;
27143 /* Return true if ALG can be used in current context.
27144 Assume we expand memset if MEMSET is true. */
27145 static bool
27146 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
27148 if (alg == no_stringop)
27149 return false;
27150 if (alg == vector_loop)
27151 return TARGET_SSE || TARGET_AVX;
27152 /* Algorithms using the rep prefix want at least edi and ecx;
27153 additionally, memset wants eax and memcpy wants esi. Don't
27154 consider such algorithms if the user has appropriated those
27155 registers for their own purposes, or if we have a non-default
27156 address space, since some string insns cannot override the segment. */
27157 if (alg == rep_prefix_1_byte
27158 || alg == rep_prefix_4_byte
27159 || alg == rep_prefix_8_byte)
27161 if (have_as)
27162 return false;
27163 if (fixed_regs[CX_REG]
27164 || fixed_regs[DI_REG]
27165 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
27166 return false;
27168 return true;
27171 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
27172 static enum stringop_alg
27173 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
27174 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
27175 bool memset, bool zero_memset, bool have_as,
27176 int *dynamic_check, bool *noalign, bool recur)
27178 const struct stringop_algs *algs;
27179 bool optimize_for_speed;
27180 int max = 0;
27181 const struct processor_costs *cost;
27182 int i;
27183 bool any_alg_usable_p = false;
27185 *noalign = false;
27186 *dynamic_check = -1;
27188 /* Even if the string operation call is cold, we still might spend a lot
27189 of time processing large blocks. */
27190 if (optimize_function_for_size_p (cfun)
27191 || (optimize_insn_for_size_p ()
27192 && (max_size < 256
27193 || (expected_size != -1 && expected_size < 256))))
27194 optimize_for_speed = false;
27195 else
27196 optimize_for_speed = true;
27198 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
27199 if (memset)
27200 algs = &cost->memset[TARGET_64BIT != 0];
27201 else
27202 algs = &cost->memcpy[TARGET_64BIT != 0];
27204 /* Find the maximal size for the user-defined algorithm. */
27205 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27207 enum stringop_alg candidate = algs->size[i].alg;
27208 bool usable = alg_usable_p (candidate, memset, have_as);
27209 any_alg_usable_p |= usable;
27211 if (candidate != libcall && candidate && usable)
27212 max = algs->size[i].max;
27215 /* If the expected size is not known but the max size is small enough
27216 that the inline version is a win, set the expected size into
27217 the range. */
27218 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
27219 && expected_size == -1)
27220 expected_size = min_size / 2 + max_size / 2;
27222 /* If user specified the algorithm, honor it if possible. */
27223 if (ix86_stringop_alg != no_stringop
27224 && alg_usable_p (ix86_stringop_alg, memset, have_as))
27225 return ix86_stringop_alg;
27226 /* rep; movq or rep; movl is the smallest variant. */
27227 else if (!optimize_for_speed)
27229 *noalign = true;
27230 if (!count || (count & 3) || (memset && !zero_memset))
27231 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27232 ? rep_prefix_1_byte : loop_1_byte;
27233 else
27234 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27235 ? rep_prefix_4_byte : loop;
27237 /* Very tiny blocks are best handled via the loop, since REP is expensive to
27238 set up. */
27239 else if (expected_size != -1 && expected_size < 4)
27240 return loop_1_byte;
27241 else if (expected_size != -1)
27243 enum stringop_alg alg = libcall;
27244 bool alg_noalign = false;
27245 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27247 /* We get here if the algorithms that were not libcall-based
27248 were rep-prefix based and we are unable to use rep prefixes
27249 based on global register usage. Break out of the loop and
27250 use the heuristic below. */
27251 if (algs->size[i].max == 0)
27252 break;
27253 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27255 enum stringop_alg candidate = algs->size[i].alg;
27257 if (candidate != libcall
27258 && alg_usable_p (candidate, memset, have_as))
27260 alg = candidate;
27261 alg_noalign = algs->size[i].noalign;
27263 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
27264 last non-libcall inline algorithm. */
27265 if (TARGET_INLINE_ALL_STRINGOPS)
27267 /* When the current size is best copied by a libcall, but we
27268 are still forced to inline, run the heuristic below, which
27269 will pick code for medium-sized blocks. */
27270 if (alg != libcall)
27272 *noalign = alg_noalign;
27273 return alg;
27275 else if (!any_alg_usable_p)
27276 break;
27278 else if (alg_usable_p (candidate, memset, have_as))
27280 *noalign = algs->size[i].noalign;
27281 return candidate;
27286 /* When asked to inline the call anyway, try to pick a meaningful choice.
27287 We look for the maximal size of a block that is faster to copy by hand and
27288 take blocks of at most that size, guessing that the average size will
27289 be roughly half of that maximum.
27291 If this turns out to be bad, we might simply specify the preferred
27292 choice in ix86_costs. */
27293 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27294 && (algs->unknown_size == libcall
27295 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27297 enum stringop_alg alg;
27298 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27300 /* If there aren't any usable algorithms or if recursing already,
27301 then recursing on smaller sizes or same size isn't going to
27302 find anything. Just return the simple byte-at-a-time copy loop. */
27303 if (!any_alg_usable_p || recur)
27305 /* Pick something reasonable. */
27306 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27307 *dynamic_check = 128;
27308 return loop_1_byte;
27310 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27311 zero_memset, have_as, dynamic_check, noalign, true);
27312 gcc_assert (*dynamic_check == -1);
27313 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27314 *dynamic_check = max;
27315 else
27316 gcc_assert (alg != libcall);
27317 return alg;
27319 return (alg_usable_p (algs->unknown_size, memset, have_as)
27320 ? algs->unknown_size : libcall);
27323 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27324 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
27325 static int
27326 decide_alignment (int align,
27327 enum stringop_alg alg,
27328 int expected_size,
27329 machine_mode move_mode)
27331 int desired_align = 0;
27333 gcc_assert (alg != no_stringop);
27335 if (alg == libcall)
27336 return 0;
27337 if (move_mode == VOIDmode)
27338 return 0;
27340 desired_align = GET_MODE_SIZE (move_mode);
27341 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
27342 copying a whole cache line at once. */
27343 if (TARGET_PENTIUMPRO
27344 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27345 desired_align = 8;
27347 if (optimize_size)
27348 desired_align = 1;
27349 if (desired_align < align)
27350 desired_align = align;
27351 if (expected_size != -1 && expected_size < 4)
27352 desired_align = align;
27354 return desired_align;
27358 /* Helper function for memset. For the QImode value 0xXY produce
27359 0xXYXYXYXY of the width specified by MODE. This is essentially
27360 a multiplication by 0x01010101 (or the wider equivalent), but we can do
27361 slightly better than synth_mult by unwinding the sequence by hand on CPUs
27362 with slow multiply. */
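/* Illustrative example (not part of the original source): promoting the
   QImode constant 0xAB to SImode yields 0xABABABAB.  In the CONST_INT_P
   path below this is computed directly as

     v |= v << 8;
     v |= v << 16;

   with an extra v |= (v << 16) << 16 for DImode.  */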
27363 static rtx
27364 promote_duplicated_reg (machine_mode mode, rtx val)
27366 machine_mode valmode = GET_MODE (val);
27367 rtx tmp;
27368 int nops = mode == DImode ? 3 : 2;
27370 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27371 if (val == const0_rtx)
27372 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27373 if (CONST_INT_P (val))
27375 HOST_WIDE_INT v = INTVAL (val) & 255;
27377 v |= v << 8;
27378 v |= v << 16;
27379 if (mode == DImode)
27380 v |= (v << 16) << 16;
27381 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27384 if (valmode == VOIDmode)
27385 valmode = QImode;
27386 if (valmode != QImode)
27387 val = gen_lowpart (QImode, val);
27388 if (mode == QImode)
27389 return val;
27390 if (!TARGET_PARTIAL_REG_STALL)
27391 nops--;
27392 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27393 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27394 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27395 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27397 rtx reg = convert_modes (mode, QImode, val, true);
27398 tmp = promote_duplicated_reg (mode, const1_rtx);
27399 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27400 OPTAB_DIRECT);
27402 else
27404 rtx reg = convert_modes (mode, QImode, val, true);
27406 if (!TARGET_PARTIAL_REG_STALL)
27407 if (mode == SImode)
27408 emit_insn (gen_insvsi_1 (reg, reg));
27409 else
27410 emit_insn (gen_insvdi_1 (reg, reg));
27411 else
27413 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27414 NULL, 1, OPTAB_DIRECT);
27415 reg =
27416 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27418 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27419 NULL, 1, OPTAB_DIRECT);
27420 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27421 if (mode == SImode)
27422 return reg;
27423 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27424 NULL, 1, OPTAB_DIRECT);
27425 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27426 return reg;
27430 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
27431 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
27432 getting alignment from ALIGN to DESIRED_ALIGN. */
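/* Illustrative example (not part of the original source): on a 64-bit target
   with SIZE_NEEDED == 16 the value is promoted to DImode here; with
   SIZE_NEEDED == 2 and ALIGN already matching DESIRED_ALIGN, an HImode
   promotion suffices.  */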
27433 static rtx
27434 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27435 int align)
27437 rtx promoted_val;
27439 if (TARGET_64BIT
27440 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27441 promoted_val = promote_duplicated_reg (DImode, val);
27442 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27443 promoted_val = promote_duplicated_reg (SImode, val);
27444 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27445 promoted_val = promote_duplicated_reg (HImode, val);
27446 else
27447 promoted_val = val;
27449 return promoted_val;
27452 /* Expand a string move (memcpy) or store (memset) operation. Use i386 string
27453 operations when profitable. The code depends upon architecture, block size
27454 and alignment, but always has one of the following overall structures:
27456 Aligned move sequence:
27458 1) Prologue guard: Conditional that jumps up to the epilogue for small
27459 blocks that can be handled by the epilogue alone. This is faster
27460 but also needed for correctness, since the prologue assumes the block
27461 is larger than the desired alignment.
27463 Optional dynamic check for size and libcall for large
27464 blocks is emitted here too, with -minline-stringops-dynamically.
27466 2) Prologue: copy the first few bytes in order to get the destination
27467 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27468 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27469 copied. We emit either a jump tree on power-of-two-sized
27470 blocks, or a byte loop.
27472 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27473 with the specified algorithm.
27475 4) Epilogue: code copying the tail of the block that is too small to be
27476 handled by the main body (or up to the size guarded by the prologue guard).
27478 Misaligned move sequence:
27480 1) Misaligned move prologue/epilogue containing:
27481 a) Prologue handling small memory blocks and jumping to done_label
27482 (skipped if blocks are known to be large enough)
27483 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes, done as one
27484 possibly misaligned move
27485 (skipped if alignment is not needed)
27486 c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves
27488 2) Zero size guard dispatching to done_label, if needed
27490 3) Dispatch to a library call, if needed
27492 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27493 with the specified algorithm. */
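/* Rough C-level shape of the aligned memset sequence described above
   (an illustrative sketch only; variable names are informal stand-ins for
   the RTL operands, and the real output is RTL):

     if (count < epilogue_size_needed)
       goto epilogue;                                  1) prologue guard
     while (((uintptr_t) dst & (desired_align - 1)) != 0)
       { *dst++ = val; count--; }                      2) alignment prologue
     for (; count >= size_needed; count -= size_needed)
       { memset (dst, val, size_needed); dst += size_needed; }  3) main body
   epilogue:
     memset (dst, val, count & (epilogue_size_needed - 1));     4) epilogue
   */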
27494 bool
27495 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27496 rtx align_exp, rtx expected_align_exp,
27497 rtx expected_size_exp, rtx min_size_exp,
27498 rtx max_size_exp, rtx probable_max_size_exp,
27499 bool issetmem)
27501 rtx destreg;
27502 rtx srcreg = NULL;
27503 rtx_code_label *label = NULL;
27504 rtx tmp;
27505 rtx_code_label *jump_around_label = NULL;
27506 HOST_WIDE_INT align = 1;
27507 unsigned HOST_WIDE_INT count = 0;
27508 HOST_WIDE_INT expected_size = -1;
27509 int size_needed = 0, epilogue_size_needed;
27510 int desired_align = 0, align_bytes = 0;
27511 enum stringop_alg alg;
27512 rtx promoted_val = NULL;
27513 rtx vec_promoted_val = NULL;
27514 bool force_loopy_epilogue = false;
27515 int dynamic_check;
27516 bool need_zero_guard = false;
27517 bool noalign;
27518 machine_mode move_mode = VOIDmode;
27519 int unroll_factor = 1;
27520 /* TODO: Once value ranges are available, fill in proper data. */
27521 unsigned HOST_WIDE_INT min_size = 0;
27522 unsigned HOST_WIDE_INT max_size = -1;
27523 unsigned HOST_WIDE_INT probable_max_size = -1;
27524 bool misaligned_prologue_used = false;
27525 bool have_as;
27527 if (CONST_INT_P (align_exp))
27528 align = INTVAL (align_exp);
27529 /* i386 can do misaligned accesses at a reasonably small extra cost. */
27530 if (CONST_INT_P (expected_align_exp)
27531 && INTVAL (expected_align_exp) > align)
27532 align = INTVAL (expected_align_exp);
27533 /* ALIGN is the minimum of destination and source alignment, but we care here
27534 just about destination alignment. */
27535 else if (!issetmem
27536 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27537 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27539 if (CONST_INT_P (count_exp))
27541 min_size = max_size = probable_max_size = count = expected_size
27542 = INTVAL (count_exp);
27543 /* When COUNT is 0, there is nothing to do. */
27544 if (!count)
27545 return true;
27547 else
27549 if (min_size_exp)
27550 min_size = INTVAL (min_size_exp);
27551 if (max_size_exp)
27552 max_size = INTVAL (max_size_exp);
27553 if (probable_max_size_exp)
27554 probable_max_size = INTVAL (probable_max_size_exp);
27555 if (CONST_INT_P (expected_size_exp))
27556 expected_size = INTVAL (expected_size_exp);
27559 /* Make sure we don't need to care about overflow later on. */
27560 if (count > (HOST_WIDE_INT_1U << 30))
27561 return false;
27563 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27564 if (!issetmem)
27565 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27567 /* Step 0: Decide on preferred algorithm, desired alignment and
27568 size of chunks to be copied by main loop. */
27569 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27570 issetmem,
27571 issetmem && val_exp == const0_rtx, have_as,
27572 &dynamic_check, &noalign, false);
27573 if (alg == libcall)
27574 return false;
27575 gcc_assert (alg != no_stringop);
27577 /* For now the vector version of memset is generated only for memory zeroing, as
27578 creating the promoted vector value is very cheap in this case. */
27579 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27580 alg = unrolled_loop;
27582 if (!count)
27583 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27584 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27585 if (!issetmem)
27586 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27588 unroll_factor = 1;
27589 move_mode = word_mode;
27590 switch (alg)
27592 case libcall:
27593 case no_stringop:
27594 case last_alg:
27595 gcc_unreachable ();
27596 case loop_1_byte:
27597 need_zero_guard = true;
27598 move_mode = QImode;
27599 break;
27600 case loop:
27601 need_zero_guard = true;
27602 break;
27603 case unrolled_loop:
27604 need_zero_guard = true;
27605 unroll_factor = (TARGET_64BIT ? 4 : 2);
27606 break;
27607 case vector_loop:
27608 need_zero_guard = true;
27609 unroll_factor = 4;
27610 /* Find the widest supported mode. */
27611 move_mode = word_mode;
27612 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
27613 != CODE_FOR_nothing)
27614 move_mode = GET_MODE_WIDER_MODE (move_mode);
27616 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27617 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27618 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27620 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27621 move_mode = mode_for_vector (word_mode, nunits);
27622 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27623 move_mode = word_mode;
27625 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27626 break;
27627 case rep_prefix_8_byte:
27628 move_mode = DImode;
27629 break;
27630 case rep_prefix_4_byte:
27631 move_mode = SImode;
27632 break;
27633 case rep_prefix_1_byte:
27634 move_mode = QImode;
27635 break;
27637 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27638 epilogue_size_needed = size_needed;
27640 /* If we are going to make any library calls conditionally, make sure any
27641 pending stack adjustments happen before the first conditional branch,
27642 otherwise they will be emitted on the library call path only and won't
27643 happen on the other branches. */
27644 if (dynamic_check != -1)
27645 do_pending_stack_adjust ();
27647 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27648 if (!TARGET_ALIGN_STRINGOPS || noalign)
27649 align = desired_align;
27651 /* Step 1: Prologue guard. */
27653 /* Alignment code needs count to be in register. */
27654 if (CONST_INT_P (count_exp) && desired_align > align)
27656 if (INTVAL (count_exp) > desired_align
27657 && INTVAL (count_exp) > size_needed)
27659 align_bytes
27660 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27661 if (align_bytes <= 0)
27662 align_bytes = 0;
27663 else
27664 align_bytes = desired_align - align_bytes;
27666 if (align_bytes == 0)
27667 count_exp = force_reg (counter_mode (count_exp), count_exp);
27669 gcc_assert (desired_align >= 1 && align >= 1);
27671 /* Misaligned move sequences handle both prologue and epilogue at once.
27672 Default code generation results in smaller code for large alignments
27673 and also avoids redundant work when sizes are known precisely. */
27674 misaligned_prologue_used
27675 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27676 && MAX (desired_align, epilogue_size_needed) <= 32
27677 && desired_align <= epilogue_size_needed
27678 && ((desired_align > align && !align_bytes)
27679 || (!count && epilogue_size_needed > 1)));
27681 /* Do the cheap promotion to allow better CSE across the
27682 main loop and epilogue (i.e. one load of the big constant in
27683 front of all the code).
27684 For now the misaligned move sequences do not have a fast path
27685 without broadcasting. */
27686 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27688 if (alg == vector_loop)
27690 gcc_assert (val_exp == const0_rtx);
27691 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27692 promoted_val = promote_duplicated_reg_to_size (val_exp,
27693 GET_MODE_SIZE (word_mode),
27694 desired_align, align);
27696 else
27698 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27699 desired_align, align);
27702 /* Misaligned move sequences handle both prologues and epilogues at once.
27703 Default code generation results in smaller code for large alignments and
27704 also avoids redundant work when sizes are known precisely. */
27705 if (misaligned_prologue_used)
27707 /* Misaligned move prologue handled small blocks by itself. */
27708 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27709 (dst, src, &destreg, &srcreg,
27710 move_mode, promoted_val, vec_promoted_val,
27711 &count_exp,
27712 &jump_around_label,
27713 desired_align < align
27714 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27715 desired_align, align, &min_size, dynamic_check, issetmem);
27716 if (!issetmem)
27717 src = change_address (src, BLKmode, srcreg);
27718 dst = change_address (dst, BLKmode, destreg);
27719 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27720 epilogue_size_needed = 0;
27721 if (need_zero_guard
27722 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27724 /* It is possible that we copied enough so the main loop will not
27725 execute. */
27726 gcc_assert (size_needed > 1);
27727 if (jump_around_label == NULL_RTX)
27728 jump_around_label = gen_label_rtx ();
27729 emit_cmp_and_jump_insns (count_exp,
27730 GEN_INT (size_needed),
27731 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27732 if (expected_size == -1
27733 || expected_size < (desired_align - align) / 2 + size_needed)
27734 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27735 else
27736 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27739 /* Ensure that alignment prologue won't copy past end of block. */
27740 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27742 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27743 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27744 Make sure it is power of 2. */
27745 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27747 /* To improve performance on small blocks, we jump around the VAL
27748 promoting code. This means that if the promoted VAL is not constant,
27749 we might not use it in the epilogue and have to use the byte
27750 loop variant. */
27751 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27752 force_loopy_epilogue = true;
27753 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27754 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27756 /* If main algorithm works on QImode, no epilogue is needed.
27757 For small sizes just don't align anything. */
27758 if (size_needed == 1)
27759 desired_align = align;
27760 else
27761 goto epilogue;
27763 else if (!count
27764 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27766 label = gen_label_rtx ();
27767 emit_cmp_and_jump_insns (count_exp,
27768 GEN_INT (epilogue_size_needed),
27769 LTU, 0, counter_mode (count_exp), 1, label);
27770 if (expected_size == -1 || expected_size < epilogue_size_needed)
27771 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27772 else
27773 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27777 /* Emit code to decide at runtime whether a library call or inline code
27778 should be used. */
27779 if (dynamic_check != -1)
27781 if (!issetmem && CONST_INT_P (count_exp))
27783 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27785 emit_block_copy_via_libcall (dst, src, count_exp);
27786 count_exp = const0_rtx;
27787 goto epilogue;
27790 else
27792 rtx_code_label *hot_label = gen_label_rtx ();
27793 if (jump_around_label == NULL_RTX)
27794 jump_around_label = gen_label_rtx ();
27795 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27796 LEU, 0, counter_mode (count_exp),
27797 1, hot_label);
27798 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27799 if (issetmem)
27800 set_storage_via_libcall (dst, count_exp, val_exp);
27801 else
27802 emit_block_copy_via_libcall (dst, src, count_exp);
27803 emit_jump (jump_around_label);
27804 emit_label (hot_label);
27808 /* Step 2: Alignment prologue. */
27809 /* Do the expensive promotion once we branched off the small blocks. */
27810 if (issetmem && !promoted_val)
27811 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27812 desired_align, align);
27814 if (desired_align > align && !misaligned_prologue_used)
27816 if (align_bytes == 0)
27818 /* Except for the first move in the prologue, we no longer know
27819 the constant offset in the aliasing info. It doesn't seem worth
27820 the pain to maintain it for the first move, so throw away
27821 the info early. */
27822 dst = change_address (dst, BLKmode, destreg);
27823 if (!issetmem)
27824 src = change_address (src, BLKmode, srcreg);
27825 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27826 promoted_val, vec_promoted_val,
27827 count_exp, align, desired_align,
27828 issetmem);
27829 /* At most desired_align - align bytes are copied. */
27830 if (min_size < (unsigned)(desired_align - align))
27831 min_size = 0;
27832 else
27833 min_size -= desired_align - align;
27835 else
27837 /* If we know how many bytes need to be stored before dst is
27838 sufficiently aligned, maintain aliasing info accurately. */
27839 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27840 srcreg,
27841 promoted_val,
27842 vec_promoted_val,
27843 desired_align,
27844 align_bytes,
27845 issetmem);
27847 count_exp = plus_constant (counter_mode (count_exp),
27848 count_exp, -align_bytes);
27849 count -= align_bytes;
27850 min_size -= align_bytes;
27851 max_size -= align_bytes;
27853 if (need_zero_guard
27854 && min_size < (unsigned HOST_WIDE_INT) size_needed
27855 && (count < (unsigned HOST_WIDE_INT) size_needed
27856 || (align_bytes == 0
27857 && count < ((unsigned HOST_WIDE_INT) size_needed
27858 + desired_align - align))))
27860 /* It is possible that we copied enough so the main loop will not
27861 execute. */
27862 gcc_assert (size_needed > 1);
27863 if (label == NULL_RTX)
27864 label = gen_label_rtx ();
27865 emit_cmp_and_jump_insns (count_exp,
27866 GEN_INT (size_needed),
27867 LTU, 0, counter_mode (count_exp), 1, label);
27868 if (expected_size == -1
27869 || expected_size < (desired_align - align) / 2 + size_needed)
27870 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27871 else
27872 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27875 if (label && size_needed == 1)
27877 emit_label (label);
27878 LABEL_NUSES (label) = 1;
27879 label = NULL;
27880 epilogue_size_needed = 1;
27881 if (issetmem)
27882 promoted_val = val_exp;
27884 else if (label == NULL_RTX && !misaligned_prologue_used)
27885 epilogue_size_needed = size_needed;
27887 /* Step 3: Main loop. */
27889 switch (alg)
27891 case libcall:
27892 case no_stringop:
27893 case last_alg:
27894 gcc_unreachable ();
27895 case loop_1_byte:
27896 case loop:
27897 case unrolled_loop:
27898 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27899 count_exp, move_mode, unroll_factor,
27900 expected_size, issetmem);
27901 break;
27902 case vector_loop:
27903 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27904 vec_promoted_val, count_exp, move_mode,
27905 unroll_factor, expected_size, issetmem);
27906 break;
27907 case rep_prefix_8_byte:
27908 case rep_prefix_4_byte:
27909 case rep_prefix_1_byte:
27910 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27911 val_exp, count_exp, move_mode, issetmem);
27912 break;
27914 /* Adjust properly the offset of src and dest memory for aliasing. */
27915 if (CONST_INT_P (count_exp))
27917 if (!issetmem)
27918 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27919 (count / size_needed) * size_needed);
27920 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27921 (count / size_needed) * size_needed);
27923 else
27925 if (!issetmem)
27926 src = change_address (src, BLKmode, srcreg);
27927 dst = change_address (dst, BLKmode, destreg);
27930 /* Step 4: Epilogue to copy the remaining bytes. */
27931 epilogue:
27932 if (label)
27934 /* When the main loop is done, COUNT_EXP might hold original count,
27935 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27936 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27937 bytes. Compensate if needed. */
27939 if (size_needed < epilogue_size_needed)
27941 tmp =
27942 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27943 GEN_INT (size_needed - 1), count_exp, 1,
27944 OPTAB_DIRECT);
27945 if (tmp != count_exp)
27946 emit_move_insn (count_exp, tmp);
27948 emit_label (label);
27949 LABEL_NUSES (label) = 1;
27952 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27954 if (force_loopy_epilogue)
27955 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27956 epilogue_size_needed);
27957 else
27959 if (issetmem)
27960 expand_setmem_epilogue (dst, destreg, promoted_val,
27961 vec_promoted_val, count_exp,
27962 epilogue_size_needed);
27963 else
27964 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27965 epilogue_size_needed);
27968 if (jump_around_label)
27969 emit_label (jump_around_label);
27970 return true;
27974 /* Expand the appropriate insns for doing strlen if not just doing
27975 repnz; scasb
27977 out = result, initialized with the start address
27978 align_rtx = alignment of the address.
27979 scratch = scratch register, initialized with the start address when
27980 not aligned, otherwise undefined
27982 This is just the body. It needs the initializations mentioned above and
27983 some address computation at the end. These things are done in i386.md. */
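/* A rough C-level sketch of the expansion below (illustrative only; the
   names OUT and W are informal stand-ins for the RTL operands):

     while (((uintptr_t) out & 3) != 0)        check up to 3 unaligned bytes
       {
         if (*out == 0) goto done;
         out++;
       }
     for (;;)                                  then scan 4 bytes at a time
       {
         unsigned int w = *(unsigned int *) out;
         out += 4;
         if (((w - 0x01010101U) & ~w & 0x80808080U) != 0)
           break;
       }
     back up OUT to the first zero byte inside W (done branch-free below);
   done: ;
   */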
27985 static void
27986 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27988 int align;
27989 rtx tmp;
27990 rtx_code_label *align_2_label = NULL;
27991 rtx_code_label *align_3_label = NULL;
27992 rtx_code_label *align_4_label = gen_label_rtx ();
27993 rtx_code_label *end_0_label = gen_label_rtx ();
27994 rtx mem;
27995 rtx tmpreg = gen_reg_rtx (SImode);
27996 rtx scratch = gen_reg_rtx (SImode);
27997 rtx cmp;
27999 align = 0;
28000 if (CONST_INT_P (align_rtx))
28001 align = INTVAL (align_rtx);
28003 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
28005 /* Is there a known alignment and is it less than 4? */
28006 if (align < 4)
28008 rtx scratch1 = gen_reg_rtx (Pmode);
28009 emit_move_insn (scratch1, out);
28010 /* Is there a known alignment and is it not 2? */
28011 if (align != 2)
28013 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28014 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28016 /* Leave just the 3 lower bits. */
28017 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28018 NULL_RTX, 0, OPTAB_WIDEN);
28020 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28021 Pmode, 1, align_4_label);
28022 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28023 Pmode, 1, align_2_label);
28024 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28025 Pmode, 1, align_3_label);
28027 else
28029 /* Since the alignment is 2, we have to check 2 or 0 bytes;
28030 check if it is aligned to 4 bytes. */
28032 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28033 NULL_RTX, 0, OPTAB_WIDEN);
28035 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28036 Pmode, 1, align_4_label);
28039 mem = change_address (src, QImode, out);
28041 /* Now compare the bytes. */
28043 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
28044 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28045 QImode, 1, end_0_label);
28047 /* Increment the address. */
28048 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28050 /* Not needed with an alignment of 2 */
28051 if (align != 2)
28053 emit_label (align_2_label);
28055 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28056 end_0_label);
28058 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28060 emit_label (align_3_label);
28063 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28064 end_0_label);
28066 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28069 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
28070 align this loop; that only makes the program bigger and does not help
28071 to speed it up. */
28072 emit_label (align_4_label);
28074 mem = change_address (src, SImode, out);
28075 emit_move_insn (scratch, mem);
28076 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28078 /* This formula yields a nonzero result iff one of the bytes is zero.
28079 This saves three branches inside the loop and many cycles. */
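/* In plain C the test below is (illustrative only)

     has_zero_byte = ((w - 0x01010101U) & ~w & 0x80808080U) != 0;

   i.e. subtract 1 from every byte, mask with the complement and keep only
   the high bit of each byte; the addsi3/one_cmpl/and insns emitted below
   compute exactly this, using -0x01010101 as the addend.  */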
28081 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28082 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28083 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28084 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28085 gen_int_mode (0x80808080, SImode)));
28086 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28087 align_4_label);
28089 if (TARGET_CMOVE)
28091 rtx reg = gen_reg_rtx (SImode);
28092 rtx reg2 = gen_reg_rtx (Pmode);
28093 emit_move_insn (reg, tmpreg);
28094 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28096 /* If zero is not in the first two bytes, move two bytes forward. */
28097 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28098 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28099 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28100 emit_insn (gen_rtx_SET (tmpreg,
28101 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28102 reg,
28103 tmpreg)));
28104 /* Emit lea manually to avoid clobbering of flags. */
28105 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28107 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28108 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28109 emit_insn (gen_rtx_SET (out,
28110 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28111 reg2,
28112 out)));
28114 else
28116 rtx_code_label *end_2_label = gen_label_rtx ();
28117 /* Is zero in the first two bytes? */
28119 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28120 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28121 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
28122 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28123 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
28124 pc_rtx);
28125 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
28126 JUMP_LABEL (tmp) = end_2_label;
28128 /* Not in the first two. Move two bytes forward. */
28129 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
28130 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
28132 emit_label (end_2_label);
28136 /* Avoid branch in fixing the byte. */
28137 tmpreg = gen_lowpart (QImode, tmpreg);
28138 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
28139 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
28140 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
28141 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
28143 emit_label (end_0_label);
28146 /* Expand strlen. */
28148 bool
28149 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
28151 rtx addr, scratch1, scratch2, scratch3, scratch4;
28153 /* The generic case of the strlen expander is long. Avoid expanding it
28154 unless TARGET_INLINE_ALL_STRINGOPS. */
28156 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28157 && !TARGET_INLINE_ALL_STRINGOPS
28158 && !optimize_insn_for_size_p ()
28159 && (!CONST_INT_P (align) || INTVAL (align) < 4))
28160 return false;
28162 addr = force_reg (Pmode, XEXP (src, 0));
28163 scratch1 = gen_reg_rtx (Pmode);
28165 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28166 && !optimize_insn_for_size_p ())
28168 /* Well it seems that some optimizer does not combine a call like
28169 foo(strlen(bar), strlen(bar));
28170 when the move and the subtraction are done here. It does calculate
28171 the length just once when these instructions are done inside of
28172 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
28173 often used and I use one fewer register for the lifetime of
28174 output_strlen_unroll() this is better. */
28176 emit_move_insn (out, addr);
28178 ix86_expand_strlensi_unroll_1 (out, src, align);
28180 /* strlensi_unroll_1 returns the address of the zero at the end of
28181 the string, like memchr(), so compute the length by subtracting
28182 the start address. */
28183 emit_insn (ix86_gen_sub3 (out, out, addr));
28185 else
28187 rtx unspec;
28189 /* Can't use this if the user has appropriated eax, ecx, or edi. */
28190 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
28191 return false;
28192 /* Can't use this for non-default address spaces. */
28193 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
28194 return false;
28196 scratch2 = gen_reg_rtx (Pmode);
28197 scratch3 = gen_reg_rtx (Pmode);
28198 scratch4 = force_reg (Pmode, constm1_rtx);
28200 emit_move_insn (scratch3, addr);
28201 eoschar = force_reg (QImode, eoschar);
28203 src = replace_equiv_address_nv (src, scratch3);
28205 /* If .md starts supporting :P, this can be done in .md. */
28206 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
28207 scratch4), UNSPEC_SCAS);
28208 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
28209 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
28210 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
28212 return true;
28215 /* For a given symbol (function), construct code to compute the address of
28216 its PLT entry in the large x86-64 PIC model. */
28217 static rtx
28218 construct_plt_address (rtx symbol)
28220 rtx tmp, unspec;
28222 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
28223 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
28224 gcc_assert (Pmode == DImode);
28226 tmp = gen_reg_rtx (Pmode);
28227 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28229 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28230 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28231 return tmp;
28234 rtx_insn *
28235 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28236 rtx callarg2,
28237 rtx pop, bool sibcall)
28239 rtx vec[3];
28240 rtx use = NULL, call;
28241 unsigned int vec_len = 0;
28242 tree fndecl;
28244 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28246 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28247 if (fndecl
28248 && (lookup_attribute ("interrupt",
28249 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28250 error ("interrupt service routine can't be called directly");
28252 else
28253 fndecl = NULL_TREE;
28255 if (pop == const0_rtx)
28256 pop = NULL;
28257 gcc_assert (!TARGET_64BIT || !pop);
28259 if (TARGET_MACHO && !TARGET_64BIT)
28261 #if TARGET_MACHO
28262 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28263 fnaddr = machopic_indirect_call_target (fnaddr);
28264 #endif
28266 else
28268 /* Static functions and indirect calls don't need the PIC register. Also,
28269 check if the PLT was explicitly avoided via -fno-plt or the "noplt" attribute,
28270 making it an indirect call. */
28271 rtx addr = XEXP (fnaddr, 0);
28272 if (flag_pic
28273 && GET_CODE (addr) == SYMBOL_REF
28274 && !SYMBOL_REF_LOCAL_P (addr))
28276 if (flag_plt
28277 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28278 || !lookup_attribute ("noplt",
28279 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28281 if (!TARGET_64BIT
28282 || (ix86_cmodel == CM_LARGE_PIC
28283 && DEFAULT_ABI != MS_ABI))
28285 use_reg (&use, gen_rtx_REG (Pmode,
28286 REAL_PIC_OFFSET_TABLE_REGNUM));
28287 if (ix86_use_pseudo_pic_reg ())
28288 emit_move_insn (gen_rtx_REG (Pmode,
28289 REAL_PIC_OFFSET_TABLE_REGNUM),
28290 pic_offset_table_rtx);
28293 else if (!TARGET_PECOFF && !TARGET_MACHO)
28295 if (TARGET_64BIT)
28297 fnaddr = gen_rtx_UNSPEC (Pmode,
28298 gen_rtvec (1, addr),
28299 UNSPEC_GOTPCREL);
28300 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28302 else
28304 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28305 UNSPEC_GOT);
28306 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28307 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28308 fnaddr);
28310 fnaddr = gen_const_mem (Pmode, fnaddr);
28311 /* Pmode may not be the same as word_mode for x32, which
28312 doesn't support indirect branch via 32-bit memory slot.
28313 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28314 indirect branch via x32 GOT slot is OK. */
28315 if (GET_MODE (fnaddr) != word_mode)
28316 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28317 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28322 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28323 parameters passed in vector registers. */
28324 if (TARGET_64BIT
28325 && (INTVAL (callarg2) > 0
28326 || (INTVAL (callarg2) == 0
28327 && (TARGET_SSE || !flag_skip_rax_setup))))
28329 rtx al = gen_rtx_REG (QImode, AX_REG);
28330 emit_move_insn (al, callarg2);
28331 use_reg (&use, al);
28334 if (ix86_cmodel == CM_LARGE_PIC
28335 && !TARGET_PECOFF
28336 && MEM_P (fnaddr)
28337 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28338 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28339 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28340 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28341 branch via x32 GOT slot is OK. */
28342 else if (!(TARGET_X32
28343 && MEM_P (fnaddr)
28344 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28345 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28346 && (sibcall
28347 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28348 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28350 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28351 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28354 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28356 if (retval)
28358 /* Add the bound registers as destinations in case a
28359 pointer with bounds may be returned. */
28360 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28362 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28363 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28364 if (GET_CODE (retval) == PARALLEL)
28366 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28367 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28368 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28369 retval = chkp_join_splitted_slot (retval, par);
28371 else
28373 retval = gen_rtx_PARALLEL (VOIDmode,
28374 gen_rtvec (3, retval, b0, b1));
28375 chkp_put_regs_to_expr_list (retval);
28379 call = gen_rtx_SET (retval, call);
28381 vec[vec_len++] = call;
28383 if (pop)
28385 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28386 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28387 vec[vec_len++] = pop;
28390 if (cfun->machine->no_caller_saved_registers
28391 && (!fndecl
28392 || (!TREE_THIS_VOLATILE (fndecl)
28393 && !lookup_attribute ("no_caller_saved_registers",
28394 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28396 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28397 bool is_64bit_ms_abi = (TARGET_64BIT
28398 && ix86_function_abi (fndecl) == MS_ABI);
28399 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28401 /* If there are no caller-saved registers, add all registers
28402 that are clobbered by the call which returns. */
28403 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28404 if (!fixed_regs[i]
28405 && (ix86_call_used_regs[i] == 1
28406 || (ix86_call_used_regs[i] & c_mask))
28407 && !STACK_REGNO_P (i)
28408 && !MMX_REGNO_P (i))
28409 clobber_reg (&use,
28410 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28412 else if (TARGET_64BIT_MS_ABI
28413 && (!callarg2 || INTVAL (callarg2) != -2))
28415 int const cregs_size
28416 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
28417 int i;
28419 for (i = 0; i < cregs_size; i++)
28421 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28422 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28424 clobber_reg (&use, gen_rtx_REG (mode, regno));
28428 if (vec_len > 1)
28429 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28430 call = emit_call_insn (call);
28431 if (use)
28432 CALL_INSN_FUNCTION_USAGE (call) = use;
28434 return call;
28437 /* Return true if the function being called was marked with attribute
28438 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28439 to handle the non-PIC case in the backend because there is no easy
28440 interface for the front-end to force non-PLT calls to use the GOT.
28441 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28442 to call the function marked "noplt" indirectly. */
28444 static bool
28445 ix86_nopic_noplt_attribute_p (rtx call_op)
28447 if (flag_pic || ix86_cmodel == CM_LARGE
28448 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28449 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28450 || SYMBOL_REF_LOCAL_P (call_op))
28451 return false;
28453 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28455 if (!flag_plt
28456 || (symbol_decl != NULL_TREE
28457 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28458 return true;
28460 return false;
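/* Illustrative example (a sketch; ext_fn is a hypothetical symbol): when
   compiling without PIC but with -fno-plt, or with

     extern void ext_fn (void) __attribute__ ((noplt));

   this predicate returns true for calls to ext_fn, so the call is emitted
   through the GOT (e.g. "call *ext_fn@GOTPCREL(%rip)" on x86-64) instead of
   going through the PLT.  */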
28463 /* Output the assembly for a call instruction. */
28465 const char *
28466 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28468 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28469 bool seh_nop_p = false;
28470 const char *xasm;
28472 if (SIBLING_CALL_P (insn))
28474 if (direct_p)
28476 if (ix86_nopic_noplt_attribute_p (call_op))
28478 if (TARGET_64BIT)
28479 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28480 else
28481 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28483 else
28484 xasm = "%!jmp\t%P0";
28486 /* SEH epilogue detection requires the indirect branch case
28487 to include REX.W. */
28488 else if (TARGET_SEH)
28489 xasm = "%!rex.W jmp\t%A0";
28490 else
28491 xasm = "%!jmp\t%A0";
28493 output_asm_insn (xasm, &call_op);
28494 return "";
28497 /* SEH unwinding can require an extra nop to be emitted in several
28498 circumstances. Determine if we have one of those. */
28499 if (TARGET_SEH)
28501 rtx_insn *i;
28503 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28505 /* If we get to another real insn, we don't need the nop. */
28506 if (INSN_P (i))
28507 break;
28509 /* If we get to the epilogue note, prevent a catch region from
28510 being adjacent to the standard epilogue sequence. If non-
28511 call-exceptions, we'll have done this during epilogue emission. */
28512 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28513 && !flag_non_call_exceptions
28514 && !can_throw_internal (insn))
28516 seh_nop_p = true;
28517 break;
28521 /* If we didn't find a real insn following the call, prevent the
28522 unwinder from looking into the next function. */
28523 if (i == NULL)
28524 seh_nop_p = true;
28527 if (direct_p)
28529 if (ix86_nopic_noplt_attribute_p (call_op))
28531 if (TARGET_64BIT)
28532 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28533 else
28534 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28536 else
28537 xasm = "%!call\t%P0";
28539 else
28540 xasm = "%!call\t%A0";
28542 output_asm_insn (xasm, &call_op);
28544 if (seh_nop_p)
28545 return "nop";
28547 return "";
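/* For reference (illustrative, under the templates above): a plain direct
   call prints as "call foo"; a non-PIC "noplt"/-fno-plt call prints as
   "call *foo@GOTPCREL(%rip)" on x86-64; an indirect call prints as
   "call *%rax".  The "%!" prefix expands to the "bnd " prefix when MPX
   branch protection applies, and the {att|intel} braces select the output
   dialect.  */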
28550 /* Clear stack slot assignments remembered from previous functions.
28551 This is called from INIT_EXPANDERS once before RTL is emitted for each
28552 function. */
28554 static struct machine_function *
28555 ix86_init_machine_status (void)
28557 struct machine_function *f;
28559 f = ggc_cleared_alloc<machine_function> ();
28560 f->use_fast_prologue_epilogue_nregs = -1;
28561 f->call_abi = ix86_abi;
28563 return f;
28566 /* Return a MEM corresponding to a stack slot with mode MODE.
28567 Allocate a new slot if necessary.
28569 The RTL for a function can have several slots available: N is
28570 which slot to use. */
28572 rtx
28573 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28575 struct stack_local_entry *s;
28577 gcc_assert (n < MAX_386_STACK_LOCALS);
28579 for (s = ix86_stack_locals; s; s = s->next)
28580 if (s->mode == mode && s->n == n)
28581 return validize_mem (copy_rtx (s->rtl));
28583 s = ggc_alloc<stack_local_entry> ();
28584 s->n = n;
28585 s->mode = mode;
28586 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28588 s->next = ix86_stack_locals;
28589 ix86_stack_locals = s;
28590 return validize_mem (copy_rtx (s->rtl));
28593 static void
28594 ix86_instantiate_decls (void)
28596 struct stack_local_entry *s;
28598 for (s = ix86_stack_locals; s; s = s->next)
28599 if (s->rtl != NULL_RTX)
28600 instantiate_decl_rtl (s->rtl);
28603 /* Return the number used for encoding REG, in the range 0..7. */
28605 static int
28606 reg_encoded_number (rtx reg)
28608 unsigned regno = REGNO (reg);
28609 switch (regno)
28611 case AX_REG:
28612 return 0;
28613 case CX_REG:
28614 return 1;
28615 case DX_REG:
28616 return 2;
28617 case BX_REG:
28618 return 3;
28619 case SP_REG:
28620 return 4;
28621 case BP_REG:
28622 return 5;
28623 case SI_REG:
28624 return 6;
28625 case DI_REG:
28626 return 7;
28627 default:
28628 break;
28630 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28631 return regno - FIRST_STACK_REG;
28632 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28633 return regno - FIRST_SSE_REG;
28634 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28635 return regno - FIRST_MMX_REG;
28636 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28637 return regno - FIRST_REX_SSE_REG;
28638 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28639 return regno - FIRST_REX_INT_REG;
28640 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28641 return regno - FIRST_MASK_REG;
28642 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28643 return regno - FIRST_BND_REG;
28644 return -1;
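/* A few worked examples of the encoding numbers returned above (hypothetical
   operands, shown for illustration): %eax -> 0, %ecx -> 1, %esp -> 4,
   %xmm3 -> 3 (relative to FIRST_SSE_REG), %r10 -> 2 (relative to
   FIRST_REX_INT_REG), %xmm12 -> 4 (relative to FIRST_REX_SSE_REG).  */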
28647 /* Given an insn INSN with NOPERANDS operands in OPERANDS, return the modr/m byte used
28648 in its encoding if it could be relevant for ROP mitigation, otherwise
28649 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28650 used for calculating it into them. */
28652 static int
28653 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28654 int *popno0 = 0, int *popno1 = 0)
28656 if (asm_noperands (PATTERN (insn)) >= 0)
28657 return -1;
28658 int has_modrm = get_attr_modrm (insn);
28659 if (!has_modrm)
28660 return -1;
28661 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28662 rtx op0, op1;
28663 switch (cls)
28665 case MODRM_CLASS_OP02:
28666 gcc_assert (noperands >= 3);
28667 if (popno0)
28669 *popno0 = 0;
28670 *popno1 = 2;
28672 op0 = operands[0];
28673 op1 = operands[2];
28674 break;
28675 case MODRM_CLASS_OP01:
28676 gcc_assert (noperands >= 2);
28677 if (popno0)
28679 *popno0 = 0;
28680 *popno1 = 1;
28682 op0 = operands[0];
28683 op1 = operands[1];
28684 break;
28685 default:
28686 return -1;
28688 if (REG_P (op0) && REG_P (op1))
28690 int enc0 = reg_encoded_number (op0);
28691 int enc1 = reg_encoded_number (op1);
28692 return 0xc0 + (enc1 << 3) + enc0;
28694 return -1;
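/* Worked example (illustrative): for a register-register ALU insn such as
   "add %ecx, %eax", with op0 = %eax (encoding 0) and op1 = %ecx (encoding 1),
   the value computed above is 0xc0 + (1 << 3) + 0 = 0xc8, i.e. the modr/m
   byte of the reg,reg form of the instruction.  */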
28697 /* Check whether x86 address PARTS is a pc-relative address. */
28699 static bool
28700 rip_relative_addr_p (struct ix86_address *parts)
28702 rtx base, index, disp;
28704 base = parts->base;
28705 index = parts->index;
28706 disp = parts->disp;
28708 if (disp && !base && !index)
28710 if (TARGET_64BIT)
28712 rtx symbol = disp;
28714 if (GET_CODE (disp) == CONST)
28715 symbol = XEXP (disp, 0);
28716 if (GET_CODE (symbol) == PLUS
28717 && CONST_INT_P (XEXP (symbol, 1)))
28718 symbol = XEXP (symbol, 0);
28720 if (GET_CODE (symbol) == LABEL_REF
28721 || (GET_CODE (symbol) == SYMBOL_REF
28722 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28723 || (GET_CODE (symbol) == UNSPEC
28724 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28725 || XINT (symbol, 1) == UNSPEC_PCREL
28726 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28727 return true;
28730 return false;
28733 /* Calculate the length of the memory address in the instruction encoding.
28734 Includes addr32 prefix, does not include the one-byte modrm, opcode,
28735 or other prefixes. We never generate addr32 prefix for LEA insn. */
28737 int
28738 memory_address_length (rtx addr, bool lea)
28740 struct ix86_address parts;
28741 rtx base, index, disp;
28742 int len;
28743 int ok;
28745 if (GET_CODE (addr) == PRE_DEC
28746 || GET_CODE (addr) == POST_INC
28747 || GET_CODE (addr) == PRE_MODIFY
28748 || GET_CODE (addr) == POST_MODIFY)
28749 return 0;
28751 ok = ix86_decompose_address (addr, &parts);
28752 gcc_assert (ok);
28754 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28756 /* If this is not LEA instruction, add the length of addr32 prefix. */
28757 if (TARGET_64BIT && !lea
28758 && (SImode_address_operand (addr, VOIDmode)
28759 || (parts.base && GET_MODE (parts.base) == SImode)
28760 || (parts.index && GET_MODE (parts.index) == SImode)))
28761 len++;
28763 base = parts.base;
28764 index = parts.index;
28765 disp = parts.disp;
28767 if (base && SUBREG_P (base))
28768 base = SUBREG_REG (base);
28769 if (index && SUBREG_P (index))
28770 index = SUBREG_REG (index);
28772 gcc_assert (base == NULL_RTX || REG_P (base));
28773 gcc_assert (index == NULL_RTX || REG_P (index));
28775 /* Rule of thumb:
28776 - esp as the base always wants an index,
28777 - ebp as the base always wants a displacement,
28778 - r12 as the base always wants an index,
28779 - r13 as the base always wants a displacement. */
28781 /* Register Indirect. */
28782 if (base && !index && !disp)
28784 /* esp (for its index) and ebp (for its displacement) need
28785 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28786 code. */
28787 if (base == arg_pointer_rtx
28788 || base == frame_pointer_rtx
28789 || REGNO (base) == SP_REG
28790 || REGNO (base) == BP_REG
28791 || REGNO (base) == R12_REG
28792 || REGNO (base) == R13_REG)
28793 len++;
28796 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28797 is not disp32, but disp32(%rip), so for disp32
28798 SIB byte is needed, unless print_operand_address
28799 optimizes it into disp32(%rip) or (%rip) is implied
28800 by UNSPEC. */
28801 else if (disp && !base && !index)
28803 len += 4;
28804 if (!rip_relative_addr_p (&parts))
28805 len++;
28807 else
28809 /* Find the length of the displacement constant. */
28810 if (disp)
28812 if (base && satisfies_constraint_K (disp))
28813 len += 1;
28814 else
28815 len += 4;
28817 /* ebp always wants a displacement. Similarly r13. */
28818 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28819 len++;
28821 /* An index requires the two-byte modrm form.... */
28822 if (index
28823 /* ...like esp (or r12), which always wants an index. */
28824 || base == arg_pointer_rtx
28825 || base == frame_pointer_rtx
28826 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28827 len++;
28830 return len;
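/* Worked examples of the value returned above (illustrative; lengths exclude
   the opcode and modrm bytes):
     (%rax)              -> 0
     (%rsp)              -> 1  (SIB byte)
     8(%rbp)             -> 1  (disp8)
     foo(%rip)           -> 4  (disp32, RIP-relative, no SIB)
     absolute disp32 in 64-bit mode -> 5  (disp32 + SIB)
   A 32-bit base or index under TARGET_64BIT costs one extra byte for the
   addr32 prefix unless the insn is an LEA.  */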
28833 /* Compute default value for "length_immediate" attribute. When SHORTFORM
28834 is set, expect that the insn has an 8-bit immediate alternative. */
28835 int
28836 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28838 int len = 0;
28839 int i;
28840 extract_insn_cached (insn);
28841 for (i = recog_data.n_operands - 1; i >= 0; --i)
28842 if (CONSTANT_P (recog_data.operand[i]))
28844 enum attr_mode mode = get_attr_mode (insn);
28846 gcc_assert (!len);
28847 if (shortform && CONST_INT_P (recog_data.operand[i]))
28849 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28850 switch (mode)
28852 case MODE_QI:
28853 len = 1;
28854 continue;
28855 case MODE_HI:
28856 ival = trunc_int_for_mode (ival, HImode);
28857 break;
28858 case MODE_SI:
28859 ival = trunc_int_for_mode (ival, SImode);
28860 break;
28861 default:
28862 break;
28864 if (IN_RANGE (ival, -128, 127))
28866 len = 1;
28867 continue;
28870 switch (mode)
28872 case MODE_QI:
28873 len = 1;
28874 break;
28875 case MODE_HI:
28876 len = 2;
28877 break;
28878 case MODE_SI:
28879 len = 4;
28880 break;
28881 /* Immediates for DImode instructions are encoded
28882 as 32-bit sign-extended values. */
28883 case MODE_DI:
28884 len = 4;
28885 break;
28886 default:
28887 fatal_insn ("unknown insn mode", insn);
28890 return len;
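/* Worked examples (illustrative): for "add $100, %eax" the imm8 alternative
   applies, so the immediate length reported is 1; for "add $100000, %eax" it
   is 4; DImode immediates are also reported as 4, since they are encoded as
   sign-extended 32-bit values.  */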
28893 /* Compute default value for "length_address" attribute. */
28894 int
28895 ix86_attr_length_address_default (rtx_insn *insn)
28897 int i;
28899 if (get_attr_type (insn) == TYPE_LEA)
28901 rtx set = PATTERN (insn), addr;
28903 if (GET_CODE (set) == PARALLEL)
28904 set = XVECEXP (set, 0, 0);
28906 gcc_assert (GET_CODE (set) == SET);
28908 addr = SET_SRC (set);
28910 return memory_address_length (addr, true);
28913 extract_insn_cached (insn);
28914 for (i = recog_data.n_operands - 1; i >= 0; --i)
28916 rtx op = recog_data.operand[i];
28917 if (MEM_P (op))
28919 constrain_operands_cached (insn, reload_completed);
28920 if (which_alternative != -1)
28922 const char *constraints = recog_data.constraints[i];
28923 int alt = which_alternative;
28925 while (*constraints == '=' || *constraints == '+')
28926 constraints++;
28927 while (alt-- > 0)
28928 while (*constraints++ != ',')
28930 /* Skip ignored operands. */
28931 if (*constraints == 'X')
28932 continue;
28935 int len = memory_address_length (XEXP (op, 0), false);
28937 /* Account for segment prefix for non-default addr spaces. */
28938 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28939 len++;
28941 return len;
28944 return 0;
28947 /* Compute default value for "length_vex" attribute. It includes
28948 2 or 3 byte VEX prefix and 1 opcode byte. */
28950 int
28951 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28952 bool has_vex_w)
28954 int i;
28956 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit requires
28957 the 3-byte VEX prefix. */
28958 if (!has_0f_opcode || has_vex_w)
28959 return 3 + 1;
28961 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
28962 if (!TARGET_64BIT)
28963 return 2 + 1;
28965 extract_insn_cached (insn);
28967 for (i = recog_data.n_operands - 1; i >= 0; --i)
28968 if (REG_P (recog_data.operand[i]))
28970 /* REX.W bit uses 3 byte VEX prefix. */
28971 if (GET_MODE (recog_data.operand[i]) == DImode
28972 && GENERAL_REG_P (recog_data.operand[i]))
28973 return 3 + 1;
28975 else
28977 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28978 if (MEM_P (recog_data.operand[i])
28979 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28980 return 3 + 1;
28983 return 2 + 1;
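/* Worked examples (illustrative): "vaddps %xmm1, %xmm2, %xmm3" needs only the
   2-byte VEX prefix, so the value is 2 + 1 = 3; a memory operand mentioning an
   extended register, e.g. "vaddps (%r8), %xmm2, %xmm3", needs REX.B and thus
   the 3-byte VEX prefix, giving 3 + 1 = 4.  */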
28986 /* Return the maximum number of instructions a cpu can issue. */
28988 static int
28989 ix86_issue_rate (void)
28991 switch (ix86_tune)
28993 case PROCESSOR_PENTIUM:
28994 case PROCESSOR_LAKEMONT:
28995 case PROCESSOR_BONNELL:
28996 case PROCESSOR_SILVERMONT:
28997 case PROCESSOR_KNL:
28998 case PROCESSOR_INTEL:
28999 case PROCESSOR_K6:
29000 case PROCESSOR_BTVER2:
29001 case PROCESSOR_PENTIUM4:
29002 case PROCESSOR_NOCONA:
29003 return 2;
29005 case PROCESSOR_PENTIUMPRO:
29006 case PROCESSOR_ATHLON:
29007 case PROCESSOR_K8:
29008 case PROCESSOR_AMDFAM10:
29009 case PROCESSOR_GENERIC:
29010 case PROCESSOR_BTVER1:
29011 return 3;
29013 case PROCESSOR_BDVER1:
29014 case PROCESSOR_BDVER2:
29015 case PROCESSOR_BDVER3:
29016 case PROCESSOR_BDVER4:
29017 case PROCESSOR_ZNVER1:
29018 case PROCESSOR_CORE2:
29019 case PROCESSOR_NEHALEM:
29020 case PROCESSOR_SANDYBRIDGE:
29021 case PROCESSOR_HASWELL:
29022 return 4;
29024 default:
29025 return 1;
29029 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
29030 by DEP_INSN and nothing else set by DEP_INSN. */
29032 static bool
29033 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
29035 rtx set, set2;
29037 /* Simplify the test for uninteresting insns. */
29038 if (insn_type != TYPE_SETCC
29039 && insn_type != TYPE_ICMOV
29040 && insn_type != TYPE_FCMOV
29041 && insn_type != TYPE_IBR)
29042 return false;
29044 if ((set = single_set (dep_insn)) != 0)
29046 set = SET_DEST (set);
29047 set2 = NULL_RTX;
29049 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
29050 && XVECLEN (PATTERN (dep_insn), 0) == 2
29051 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
29052 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
29054 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
29055 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
29057 else
29058 return false;
29060 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
29061 return false;
29063 /* This test is true if the dependent insn reads the flags but
29064 not any other potentially set register. */
29065 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
29066 return false;
29068 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
29069 return false;
29071 return true;
29074 /* Return true iff USE_INSN has a memory address with operands set by
29075 SET_INSN. */
29077 bool
29078 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
29080 int i;
29081 extract_insn_cached (use_insn);
29082 for (i = recog_data.n_operands - 1; i >= 0; --i)
29083 if (MEM_P (recog_data.operand[i]))
29085 rtx addr = XEXP (recog_data.operand[i], 0);
29086 return modified_in_p (addr, set_insn) != 0;
29088 return false;
29091 /* Helper function for exact_store_load_dependency.
29092 Return true if addr is found in insn. */
29093 static bool
29094 exact_dependency_1 (rtx addr, rtx insn)
29096 enum rtx_code code;
29097 const char *format_ptr;
29098 int i, j;
29100 code = GET_CODE (insn);
29101 switch (code)
29103 case MEM:
29104 if (rtx_equal_p (addr, insn))
29105 return true;
29106 break;
29107 case REG:
29108 CASE_CONST_ANY:
29109 case SYMBOL_REF:
29110 case CODE_LABEL:
29111 case PC:
29112 case CC0:
29113 case EXPR_LIST:
29114 return false;
29115 default:
29116 break;
29119 format_ptr = GET_RTX_FORMAT (code);
29120 for (i = 0; i < GET_RTX_LENGTH (code); i++)
29122 switch (*format_ptr++)
29124 case 'e':
29125 if (exact_dependency_1 (addr, XEXP (insn, i)))
29126 return true;
29127 break;
29128 case 'E':
29129 for (j = 0; j < XVECLEN (insn, i); j++)
29130 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
29131 return true;
29132 break;
29135 return false;
29138 /* Return true if there exists exact dependency for store & load, i.e.
29139 the same memory address is used in them. */
29140 static bool
29141 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
29143 rtx set1, set2;
29145 set1 = single_set (store);
29146 if (!set1)
29147 return false;
29148 if (!MEM_P (SET_DEST (set1)))
29149 return false;
29150 set2 = single_set (load);
29151 if (!set2)
29152 return false;
29153 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
29154 return true;
29155 return false;
29158 static int
29159 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
29160 unsigned int)
29162 enum attr_type insn_type, dep_insn_type;
29163 enum attr_memory memory;
29164 rtx set, set2;
29165 int dep_insn_code_number;
29167 /* Anti and output dependencies have zero cost on all CPUs. */
29168 if (dep_type != 0)
29169 return 0;
29171 dep_insn_code_number = recog_memoized (dep_insn);
29173 /* If we can't recognize the insns, we can't really do anything. */
29174 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
29175 return cost;
29177 insn_type = get_attr_type (insn);
29178 dep_insn_type = get_attr_type (dep_insn);
29180 switch (ix86_tune)
29182 case PROCESSOR_PENTIUM:
29183 case PROCESSOR_LAKEMONT:
29184 /* Address Generation Interlock adds a cycle of latency. */
29185 if (insn_type == TYPE_LEA)
29187 rtx addr = PATTERN (insn);
29189 if (GET_CODE (addr) == PARALLEL)
29190 addr = XVECEXP (addr, 0, 0);
29192 gcc_assert (GET_CODE (addr) == SET);
29194 addr = SET_SRC (addr);
29195 if (modified_in_p (addr, dep_insn))
29196 cost += 1;
29198 else if (ix86_agi_dependent (dep_insn, insn))
29199 cost += 1;
29201 /* ??? Compares pair with jump/setcc. */
29202 if (ix86_flags_dependent (insn, dep_insn, insn_type))
29203 cost = 0;
29205 /* Floating point stores require value to be ready one cycle earlier. */
29206 if (insn_type == TYPE_FMOV
29207 && get_attr_memory (insn) == MEMORY_STORE
29208 && !ix86_agi_dependent (dep_insn, insn))
29209 cost += 1;
29210 break;
29212 case PROCESSOR_PENTIUMPRO:
29213 /* INT->FP conversion is expensive. */
29214 if (get_attr_fp_int_src (dep_insn))
29215 cost += 5;
29217 /* There is one cycle extra latency between an FP op and a store. */
29218 if (insn_type == TYPE_FMOV
29219 && (set = single_set (dep_insn)) != NULL_RTX
29220 && (set2 = single_set (insn)) != NULL_RTX
29221 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
29222 && MEM_P (SET_DEST (set2)))
29223 cost += 1;
29225 memory = get_attr_memory (insn);
29227 /* Show ability of reorder buffer to hide latency of load by executing
29228 in parallel with previous instruction in case
29229 previous instruction is not needed to compute the address. */
29230 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29231 && !ix86_agi_dependent (dep_insn, insn))
29233 /* Claim moves to take one cycle, as core can issue one load
29234 at a time and the next load can start a cycle later. */
29235 if (dep_insn_type == TYPE_IMOV
29236 || dep_insn_type == TYPE_FMOV)
29237 cost = 1;
29238 else if (cost > 1)
29239 cost--;
29241 break;
29243 case PROCESSOR_K6:
29244 /* The esp dependency is resolved before
29245 the instruction is really finished. */
29246 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29247 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29248 return 1;
29250 /* INT->FP conversion is expensive. */
29251 if (get_attr_fp_int_src (dep_insn))
29252 cost += 5;
29254 memory = get_attr_memory (insn);
29256 /* Show ability of reorder buffer to hide latency of load by executing
29257 in parallel with previous instruction in case
29258 previous instruction is not needed to compute the address. */
29259 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29260 && !ix86_agi_dependent (dep_insn, insn))
29262 /* Claim moves to take one cycle, as core can issue one load
29263 at a time and the next load can start a cycle later. */
29264 if (dep_insn_type == TYPE_IMOV
29265 || dep_insn_type == TYPE_FMOV)
29266 cost = 1;
29267 else if (cost > 2)
29268 cost -= 2;
29269 else
29270 cost = 1;
29272 break;
29274 case PROCESSOR_AMDFAM10:
29275 case PROCESSOR_BDVER1:
29276 case PROCESSOR_BDVER2:
29277 case PROCESSOR_BDVER3:
29278 case PROCESSOR_BDVER4:
29279 case PROCESSOR_ZNVER1:
29280 case PROCESSOR_BTVER1:
29281 case PROCESSOR_BTVER2:
29282 case PROCESSOR_GENERIC:
29283 /* The stack engine allows push and pop instructions to execute in parallel. */
29284 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29285 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29286 return 0;
29287 /* FALLTHRU */
29289 case PROCESSOR_ATHLON:
29290 case PROCESSOR_K8:
29291 memory = get_attr_memory (insn);
29293 /* Show ability of reorder buffer to hide latency of load by executing
29294 in parallel with previous instruction in case
29295 previous instruction is not needed to compute the address. */
29296 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29297 && !ix86_agi_dependent (dep_insn, insn))
29299 enum attr_unit unit = get_attr_unit (insn);
29300 int loadcost = 3;
29302 /* Because of the difference between the length of integer and
29303 floating unit pipeline preparation stages, the memory operands
29304 for floating point are cheaper.
29306 ??? For Athlon the difference is most probably 2. */
29307 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
29308 loadcost = 3;
29309 else
29310 loadcost = TARGET_ATHLON ? 2 : 0;
29312 if (cost >= loadcost)
29313 cost -= loadcost;
29314 else
29315 cost = 0;
29317 break;
29319 case PROCESSOR_CORE2:
29320 case PROCESSOR_NEHALEM:
29321 case PROCESSOR_SANDYBRIDGE:
29322 case PROCESSOR_HASWELL:
29323 /* The stack engine allows push and pop instructions to execute in parallel. */
29324 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29325 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29326 return 0;
29328 memory = get_attr_memory (insn);
29330 /* Show ability of reorder buffer to hide latency of load by executing
29331 in parallel with previous instruction in case
29332 previous instruction is not needed to compute the address. */
29333 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29334 && !ix86_agi_dependent (dep_insn, insn))
29336 if (cost >= 4)
29337 cost -= 4;
29338 else
29339 cost = 0;
29341 break;
29343 case PROCESSOR_SILVERMONT:
29344 case PROCESSOR_KNL:
29345 case PROCESSOR_INTEL:
29346 if (!reload_completed)
29347 return cost;
29349 /* Increase cost of integer loads. */
29350 memory = get_attr_memory (dep_insn);
29351 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29353 enum attr_unit unit = get_attr_unit (dep_insn);
29354 if (unit == UNIT_INTEGER && cost == 1)
29356 if (memory == MEMORY_LOAD)
29357 cost = 3;
29358 else
29360 /* Increase cost of ld/st for short int types only
29361 because of store forwarding issue. */
29362 rtx set = single_set (dep_insn);
29363 if (set && (GET_MODE (SET_DEST (set)) == QImode
29364 || GET_MODE (SET_DEST (set)) == HImode))
29366 /* Increase cost of store/load insn if exact
29367 dependence exists and it is load insn. */
29368 enum attr_memory insn_memory = get_attr_memory (insn);
29369 if (insn_memory == MEMORY_LOAD
29370 && exact_store_load_dependency (dep_insn, insn))
29371 cost = 3;
29377 default:
29378 break;
29381 return cost;
29384 /* How many alternative schedules to try. This should be as wide as the
29385 scheduling freedom in the DFA, but no wider. Making this value too
29386 large results in extra work for the scheduler. */
29388 static int
29389 ia32_multipass_dfa_lookahead (void)
29391 switch (ix86_tune)
29393 case PROCESSOR_PENTIUM:
29394 case PROCESSOR_LAKEMONT:
29395 return 2;
29397 case PROCESSOR_PENTIUMPRO:
29398 case PROCESSOR_K6:
29399 return 1;
29401 case PROCESSOR_BDVER1:
29402 case PROCESSOR_BDVER2:
29403 case PROCESSOR_BDVER3:
29404 case PROCESSOR_BDVER4:
29405 /* We use lookahead value 4 for BD both before and after reload
29406 schedules. Plan is to have value 8 included for O3. */
29407 return 4;
29409 case PROCESSOR_CORE2:
29410 case PROCESSOR_NEHALEM:
29411 case PROCESSOR_SANDYBRIDGE:
29412 case PROCESSOR_HASWELL:
29413 case PROCESSOR_BONNELL:
29414 case PROCESSOR_SILVERMONT:
29415 case PROCESSOR_KNL:
29416 case PROCESSOR_INTEL:
29417 /* Generally, we want haifa-sched:max_issue() to look ahead as far
29418 as many instructions can be executed on a cycle, i.e.,
29419 issue_rate. I wonder why tuning for many CPUs does not do this. */
29420 if (reload_completed)
29421 return ix86_issue_rate ();
29422 /* Don't use lookahead for pre-reload schedule to save compile time. */
29423 return 0;
29425 default:
29426 return 0;
29430 /* Return true if target platform supports macro-fusion. */
29432 static bool
29433 ix86_macro_fusion_p ()
29435 return TARGET_FUSE_CMP_AND_BRANCH;
29438 /* Check whether the current microarchitecture supports macro fusion
29439 for insn pair "CONDGEN + CONDJMP". Refer to
29440 "Intel Architectures Optimization Reference Manual". */
29442 static bool
29443 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
29445 rtx src, dest;
29446 enum rtx_code ccode;
29447 rtx compare_set = NULL_RTX, test_if, cond;
29448 rtx alu_set = NULL_RTX, addr = NULL_RTX;
29450 if (!any_condjump_p (condjmp))
29451 return false;
29453 if (get_attr_type (condgen) != TYPE_TEST
29454 && get_attr_type (condgen) != TYPE_ICMP
29455 && get_attr_type (condgen) != TYPE_INCDEC
29456 && get_attr_type (condgen) != TYPE_ALU)
29457 return false;
29459 compare_set = single_set (condgen);
29460 if (compare_set == NULL_RTX
29461 && !TARGET_FUSE_ALU_AND_BRANCH)
29462 return false;
29464 if (compare_set == NULL_RTX)
29466 int i;
29467 rtx pat = PATTERN (condgen);
29468 for (i = 0; i < XVECLEN (pat, 0); i++)
29469 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
29471 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
29472 if (GET_CODE (set_src) == COMPARE)
29473 compare_set = XVECEXP (pat, 0, i);
29474 else
29475 alu_set = XVECEXP (pat, 0, i);
29478 if (compare_set == NULL_RTX)
29479 return false;
29480 src = SET_SRC (compare_set);
29481 if (GET_CODE (src) != COMPARE)
29482 return false;
29484 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
29485 supported. */
29486 if ((MEM_P (XEXP (src, 0))
29487 && CONST_INT_P (XEXP (src, 1)))
29488 || (MEM_P (XEXP (src, 1))
29489 && CONST_INT_P (XEXP (src, 0))))
29490 return false;
29492 /* No fusion for RIP-relative address. */
29493 if (MEM_P (XEXP (src, 0)))
29494 addr = XEXP (XEXP (src, 0), 0);
29495 else if (MEM_P (XEXP (src, 1)))
29496 addr = XEXP (XEXP (src, 1), 0);
29498 if (addr) {
29499 ix86_address parts;
29500 int ok = ix86_decompose_address (addr, &parts);
29501 gcc_assert (ok);
29503 if (rip_relative_addr_p (&parts))
29504 return false;
29507 test_if = SET_SRC (pc_set (condjmp));
29508 cond = XEXP (test_if, 0);
29509 ccode = GET_CODE (cond);
29510 /* Check whether the conditional jump uses the Sign or Overflow flags. */
29511 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
29512 && (ccode == GE
29513 || ccode == GT
29514 || ccode == LE
29515 || ccode == LT))
29516 return false;
29518 /* Return true for TYPE_TEST and TYPE_ICMP. */
29519 if (get_attr_type (condgen) == TYPE_TEST
29520 || get_attr_type (condgen) == TYPE_ICMP)
29521 return true;
29523 /* The following handles the case of macro-fusion for alu + jmp. */
29524 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
29525 return false;
29527 /* No fusion for alu op with memory destination operand. */
29528 dest = SET_DEST (alu_set);
29529 if (MEM_P (dest))
29530 return false;
29532 /* Macro-fusion for inc/dec + unsigned conditional jump is not
29533 supported. */
29534 if (get_attr_type (condgen) == TYPE_INCDEC
29535 && (ccode == GEU
29536 || ccode == GTU
29537 || ccode == LEU
29538 || ccode == LTU))
29539 return false;
29541 return true;
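/* Illustrative pairs (a sketch, on targets where TARGET_FUSE_CMP_AND_BRANCH
   holds, not an exhaustive list): "cmp %rsi, %rdi" followed by "jne .L1" can
   macro-fuse; "cmpl $1, (%rsp)" + "je .L1" cannot (MEM-IMM compare); and
   "decl %eax" + "jb .L1" cannot (inc/dec paired with an unsigned
   condition).  */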
29544 /* Try to reorder ready list to take advantage of Atom pipelined IMUL
29545 execution. It is applied if
29546 (1) an IMUL instruction is at the top of the list;
29547 (2) there exists exactly one producer of an independent IMUL instruction in
29548 the ready list.
29549 Return index of IMUL producer if it was found and -1 otherwise. */
29550 static int
29551 do_reorder_for_imul (rtx_insn **ready, int n_ready)
29553 rtx_insn *insn;
29554 rtx set, insn1, insn2;
29555 sd_iterator_def sd_it;
29556 dep_t dep;
29557 int index = -1;
29558 int i;
29560 if (!TARGET_BONNELL)
29561 return index;
29563 /* Check that IMUL instruction is on the top of ready list. */
29564 insn = ready[n_ready - 1];
29565 set = single_set (insn);
29566 if (!set)
29567 return index;
29568 if (!(GET_CODE (SET_SRC (set)) == MULT
29569 && GET_MODE (SET_SRC (set)) == SImode))
29570 return index;
29572 /* Search for producer of independent IMUL instruction. */
29573 for (i = n_ready - 2; i >= 0; i--)
29575 insn = ready[i];
29576 if (!NONDEBUG_INSN_P (insn))
29577 continue;
29578 /* Skip IMUL instruction. */
29579 insn2 = PATTERN (insn);
29580 if (GET_CODE (insn2) == PARALLEL)
29581 insn2 = XVECEXP (insn2, 0, 0);
29582 if (GET_CODE (insn2) == SET
29583 && GET_CODE (SET_SRC (insn2)) == MULT
29584 && GET_MODE (SET_SRC (insn2)) == SImode)
29585 continue;
29587 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
29589 rtx con;
29590 con = DEP_CON (dep);
29591 if (!NONDEBUG_INSN_P (con))
29592 continue;
29593 insn1 = PATTERN (con);
29594 if (GET_CODE (insn1) == PARALLEL)
29595 insn1 = XVECEXP (insn1, 0, 0);
29597 if (GET_CODE (insn1) == SET
29598 && GET_CODE (SET_SRC (insn1)) == MULT
29599 && GET_MODE (SET_SRC (insn1)) == SImode)
29601 sd_iterator_def sd_it1;
29602 dep_t dep1;
29603 /* Check if there is no other dependee for IMUL. */
29604 index = i;
29605 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
29607 rtx pro;
29608 pro = DEP_PRO (dep1);
29609 if (!NONDEBUG_INSN_P (pro))
29610 continue;
29611 if (pro != insn)
29612 index = -1;
29614 if (index >= 0)
29615 break;
29618 if (index >= 0)
29619 break;
29621 return index;
29624 /* Try to find the best candidate on the top of ready list if two insns
29625 have the same priority - candidate is best if its dependees were
29626 scheduled earlier. Applied for Silvermont only.
29627 Return true if top 2 insns must be interchanged. */
29628 static bool
29629 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
29631 rtx_insn *top = ready[n_ready - 1];
29632 rtx_insn *next = ready[n_ready - 2];
29633 rtx set;
29634 sd_iterator_def sd_it;
29635 dep_t dep;
29636 int clock1 = -1;
29637 int clock2 = -1;
29638 #define INSN_TICK(INSN) (HID (INSN)->tick)
29640 if (!TARGET_SILVERMONT && !TARGET_INTEL)
29641 return false;
29643 if (!NONDEBUG_INSN_P (top))
29644 return false;
29645 if (!NONJUMP_INSN_P (top))
29646 return false;
29647 if (!NONDEBUG_INSN_P (next))
29648 return false;
29649 if (!NONJUMP_INSN_P (next))
29650 return false;
29651 set = single_set (top);
29652 if (!set)
29653 return false;
29654 set = single_set (next);
29655 if (!set)
29656 return false;
29658 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
29660 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
29661 return false;
29662 /* Determine the winner more precisely. */
29663 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
29665 rtx pro;
29666 pro = DEP_PRO (dep);
29667 if (!NONDEBUG_INSN_P (pro))
29668 continue;
29669 if (INSN_TICK (pro) > clock1)
29670 clock1 = INSN_TICK (pro);
29672 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
29674 rtx pro;
29675 pro = DEP_PRO (dep);
29676 if (!NONDEBUG_INSN_P (pro))
29677 continue;
29678 if (INSN_TICK (pro) > clock2)
29679 clock2 = INSN_TICK (pro);
29682 if (clock1 == clock2)
29684 /* Determine winner - load must win. */
29685 enum attr_memory memory1, memory2;
29686 memory1 = get_attr_memory (top);
29687 memory2 = get_attr_memory (next);
29688 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
29689 return true;
29691 return (bool) (clock2 < clock1);
29693 return false;
29694 #undef INSN_TICK
29697 /* Perform possible reordering of the ready list for Atom/Silvermont only.
29698 Return issue rate. */
29699 static int
29700 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
29701 int *pn_ready, int clock_var)
29703 int issue_rate = -1;
29704 int n_ready = *pn_ready;
29705 int i;
29706 rtx_insn *insn;
29707 int index = -1;
29709 /* Set up issue rate. */
29710 issue_rate = ix86_issue_rate ();
29712 /* Do reordering for BONNELL/SILVERMONT only. */
29713 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
29714 return issue_rate;
29716 /* Nothing to do if ready list contains only 1 instruction. */
29717 if (n_ready <= 1)
29718 return issue_rate;
29720 /* Do reordering for the post-reload scheduler only. */
29721 if (!reload_completed)
29722 return issue_rate;
29724 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
29726 if (sched_verbose > 1)
29727 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
29728 INSN_UID (ready[index]));
29730 /* Put IMUL producer (ready[index]) at the top of ready list. */
29731 insn = ready[index];
29732 for (i = index; i < n_ready - 1; i++)
29733 ready[i] = ready[i + 1];
29734 ready[n_ready - 1] = insn;
29735 return issue_rate;
29738 /* Skip selective scheduling since HID is not populated in it. */
29739 if (clock_var != 0
29740 && !sel_sched_p ()
29741 && swap_top_of_ready_list (ready, n_ready))
29743 if (sched_verbose > 1)
29744 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
29745 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
29746 /* Swap 2 top elements of ready list. */
29747 insn = ready[n_ready - 1];
29748 ready[n_ready - 1] = ready[n_ready - 2];
29749 ready[n_ready - 2] = insn;
29751 return issue_rate;
29754 static bool
29755 ix86_class_likely_spilled_p (reg_class_t);
29757 /* Return true if the lhs of INSN is a HW function argument register; set
29758 *IS_SPILLED to true if it is a likely-spilled HW register. */
29759 static bool
29760 insn_is_function_arg (rtx insn, bool* is_spilled)
29762 rtx dst;
29764 if (!NONDEBUG_INSN_P (insn))
29765 return false;
29766 /* Call instructions are not movable; ignore them. */
29767 if (CALL_P (insn))
29768 return false;
29769 insn = PATTERN (insn);
29770 if (GET_CODE (insn) == PARALLEL)
29771 insn = XVECEXP (insn, 0, 0);
29772 if (GET_CODE (insn) != SET)
29773 return false;
29774 dst = SET_DEST (insn);
29775 if (REG_P (dst) && HARD_REGISTER_P (dst)
29776 && ix86_function_arg_regno_p (REGNO (dst)))
29778 /* Is it likely spilled HW register? */
29779 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29780 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29781 *is_spilled = true;
29782 return true;
29784 return false;
29787 /* Add output dependencies for a chain of adjacent function-argument moves,
29788 but only if there is a move to a likely-spilled HW register. Return the first
29789 argument if at least one dependence was added, or NULL otherwise. */
29790 static rtx_insn *
29791 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29793 rtx_insn *insn;
29794 rtx_insn *last = call;
29795 rtx_insn *first_arg = NULL;
29796 bool is_spilled = false;
29798 head = PREV_INSN (head);
29800 /* Find the argument-passing instruction nearest to the call. */
29801 while (true)
29803 last = PREV_INSN (last);
29804 if (last == head)
29805 return NULL;
29806 if (!NONDEBUG_INSN_P (last))
29807 continue;
29808 if (insn_is_function_arg (last, &is_spilled))
29809 break;
29810 return NULL;
29813 first_arg = last;
29814 while (true)
29816 insn = PREV_INSN (last);
29817 if (!INSN_P (insn))
29818 break;
29819 if (insn == head)
29820 break;
29821 if (!NONDEBUG_INSN_P (insn))
29823 last = insn;
29824 continue;
29826 if (insn_is_function_arg (insn, &is_spilled))
29828 /* Add an output dependence between two function arguments if the chain
29829 of output arguments contains likely-spilled HW registers. */
29830 if (is_spilled)
29831 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29832 first_arg = last = insn;
29834 else
29835 break;
29837 if (!is_spilled)
29838 return NULL;
29839 return first_arg;
29842 /* Add output or anti dependency from insn to first_arg to restrict its code
29843 motion. */
29844 static void
29845 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29847 rtx set;
29848 rtx tmp;
29850 /* Add anti dependencies for bounds stores. */
29851 if (INSN_P (insn)
29852 && GET_CODE (PATTERN (insn)) == PARALLEL
29853 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29854 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29856 add_dependence (first_arg, insn, REG_DEP_ANTI);
29857 return;
29860 set = single_set (insn);
29861 if (!set)
29862 return;
29863 tmp = SET_DEST (set);
29864 if (REG_P (tmp))
29866 /* Add output dependency to the first function argument. */
29867 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29868 return;
29870 /* Add anti dependency. */
29871 add_dependence (first_arg, insn, REG_DEP_ANTI);
29874 /* Avoid cross-block motion of a function argument by adding a dependency
29875 from the first non-jump instruction in bb. */
29876 static void
29877 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29879 rtx_insn *insn = BB_END (bb);
29881 while (insn)
29883 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29885 rtx set = single_set (insn);
29886 if (set)
29888 avoid_func_arg_motion (arg, insn);
29889 return;
29892 if (insn == BB_HEAD (bb))
29893 return;
29894 insn = PREV_INSN (insn);
29898 /* Hook for pre-reload schedule - avoid motion of function arguments
29899 passed in likely spilled HW registers. */
29900 static void
29901 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29903 rtx_insn *insn;
29904 rtx_insn *first_arg = NULL;
29905 if (reload_completed)
29906 return;
29907 while (head != tail && DEBUG_INSN_P (head))
29908 head = NEXT_INSN (head);
29909 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29910 if (INSN_P (insn) && CALL_P (insn))
29912 first_arg = add_parameter_dependencies (insn, head);
29913 if (first_arg)
29915 /* Add a dependee for the first argument to predecessors, but only if the
29916 region contains more than one block. */
29917 basic_block bb = BLOCK_FOR_INSN (insn);
29918 int rgn = CONTAINING_RGN (bb->index);
29919 int nr_blks = RGN_NR_BLOCKS (rgn);
29920 /* Skip trivial regions and region head blocks that can have
29921 predecessors outside of region. */
29922 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29924 edge e;
29925 edge_iterator ei;
29927 /* Regions are SCCs with the exception of selective
29928 scheduling with pipelining of outer blocks enabled.
29929 So also check that immediate predecessors of a non-head
29930 block are in the same region. */
29931 FOR_EACH_EDGE (e, ei, bb->preds)
29933 /* Avoid creating loop-carried dependencies by using the
29934 topological ordering of the region. */
29935 if (rgn == CONTAINING_RGN (e->src->index)
29936 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29937 add_dependee_for_func_arg (first_arg, e->src);
29940 insn = first_arg;
29941 if (insn == head)
29942 break;
29945 else if (first_arg)
29946 avoid_func_arg_motion (first_arg, insn);
29949 /* Hook for pre-reload schedule - set priority of moves from likely spilled
29950 HW registers to the maximum, to schedule them as soon as possible. These are
29951 moves from function argument registers at the top of the function entry
29952 and moves from function return value registers after call. */
29953 static int
29954 ix86_adjust_priority (rtx_insn *insn, int priority)
29956 rtx set;
29958 if (reload_completed)
29959 return priority;
29961 if (!NONDEBUG_INSN_P (insn))
29962 return priority;
29964 set = single_set (insn);
29965 if (set)
29967 rtx tmp = SET_SRC (set);
29968 if (REG_P (tmp)
29969 && HARD_REGISTER_P (tmp)
29970 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29971 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29972 return current_sched_info->sched_max_insns_priority;
29975 return priority;
29978 /* Model decoder of Core 2/i7.
29979 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
29980 track the instruction fetch block boundaries and make sure that long
29981 (9+ bytes) instructions are assigned to D0. */
29983 /* Maximum length of an insn that can be handled by
29984 a secondary decoder unit. '8' for Core 2/i7. */
29985 static int core2i7_secondary_decoder_max_insn_size;
29987 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
29988 '16' for Core 2/i7. */
29989 static int core2i7_ifetch_block_size;
29991 /* Maximum number of instructions decoder can handle per cycle.
29992 '6' for Core 2/i7. */
29993 static int core2i7_ifetch_block_max_insns;
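/* Worked example (illustrative): with a 16-byte ifetch block and at most 6
   insns decoded per cycle, after insns of 5, 6 and 4 bytes have been issued
   (15 bytes total), a following 2-byte insn no longer fits in the block and
   is filtered out until the next cycle; an insn longer than 8 bytes can only
   be handled by the first (D0) decoder.  */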
29995 typedef struct ix86_first_cycle_multipass_data_ *
29996 ix86_first_cycle_multipass_data_t;
29997 typedef const struct ix86_first_cycle_multipass_data_ *
29998 const_ix86_first_cycle_multipass_data_t;
30000 /* A variable to store target state across calls to max_issue within
30001 one cycle. */
30002 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
30003 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
30005 /* Initialize DATA. */
30006 static void
30007 core2i7_first_cycle_multipass_init (void *_data)
30009 ix86_first_cycle_multipass_data_t data
30010 = (ix86_first_cycle_multipass_data_t) _data;
30012 data->ifetch_block_len = 0;
30013 data->ifetch_block_n_insns = 0;
30014 data->ready_try_change = NULL;
30015 data->ready_try_change_size = 0;
30018 /* Advancing the cycle; reset ifetch block counts. */
30019 static void
30020 core2i7_dfa_post_advance_cycle (void)
30022 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
30024 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
30026 data->ifetch_block_len = 0;
30027 data->ifetch_block_n_insns = 0;
30030 static int min_insn_size (rtx_insn *);
30032 /* Filter out insns from ready_try that the core will not be able to issue
30033 on current cycle due to decoder. */
30034 static void
30035 core2i7_first_cycle_multipass_filter_ready_try
30036 (const_ix86_first_cycle_multipass_data_t data,
30037 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
30039 while (n_ready--)
30041 rtx_insn *insn;
30042 int insn_size;
30044 if (ready_try[n_ready])
30045 continue;
30047 insn = get_ready_element (n_ready);
30048 insn_size = min_insn_size (insn);
30050 if (/* If this insn is too long for a secondary decoder ... */
30051 (!first_cycle_insn_p
30052 && insn_size > core2i7_secondary_decoder_max_insn_size)
30053 /* ... or it would not fit into the ifetch block ... */
30054 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
30055 /* ... or the decoder is full already ... */
30056 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
30057 /* ... mask the insn out. */
30059 ready_try[n_ready] = 1;
30061 if (data->ready_try_change)
30062 bitmap_set_bit (data->ready_try_change, n_ready);
30067 /* Prepare for a new round of multipass lookahead scheduling. */
30068 static void
30069 core2i7_first_cycle_multipass_begin (void *_data,
30070 signed char *ready_try, int n_ready,
30071 bool first_cycle_insn_p)
30073 ix86_first_cycle_multipass_data_t data
30074 = (ix86_first_cycle_multipass_data_t) _data;
30075 const_ix86_first_cycle_multipass_data_t prev_data
30076 = ix86_first_cycle_multipass_data;
30078 /* Restore the state from the end of the previous round. */
30079 data->ifetch_block_len = prev_data->ifetch_block_len;
30080 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
30082 /* Filter instructions that cannot be issued on current cycle due to
30083 decoder restrictions. */
30084 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
30085 first_cycle_insn_p);
30088 /* INSN is being issued in current solution. Account for its impact on
30089 the decoder model. */
30090 static void
30091 core2i7_first_cycle_multipass_issue (void *_data,
30092 signed char *ready_try, int n_ready,
30093 rtx_insn *insn, const void *_prev_data)
30095 ix86_first_cycle_multipass_data_t data
30096 = (ix86_first_cycle_multipass_data_t) _data;
30097 const_ix86_first_cycle_multipass_data_t prev_data
30098 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
30100 int insn_size = min_insn_size (insn);
30102 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
30103 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
30104 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
30105 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
30107 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
30108 if (!data->ready_try_change)
30110 data->ready_try_change = sbitmap_alloc (n_ready);
30111 data->ready_try_change_size = n_ready;
30113 else if (data->ready_try_change_size < n_ready)
30115 data->ready_try_change = sbitmap_resize (data->ready_try_change,
30116 n_ready, 0);
30117 data->ready_try_change_size = n_ready;
30119 bitmap_clear (data->ready_try_change);
30121 /* Filter out insns from ready_try that the core will not be able to issue
30122 on current cycle due to decoder. */
30123 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
30124 false);
30127 /* Revert the effect on ready_try. */
30128 static void
30129 core2i7_first_cycle_multipass_backtrack (const void *_data,
30130 signed char *ready_try,
30131 int n_ready ATTRIBUTE_UNUSED)
30133 const_ix86_first_cycle_multipass_data_t data
30134 = (const_ix86_first_cycle_multipass_data_t) _data;
30135 unsigned int i = 0;
30136 sbitmap_iterator sbi;
30138 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
30139 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
30141 ready_try[i] = 0;
30145 /* Save the result of multipass lookahead scheduling for the next round. */
30146 static void
30147 core2i7_first_cycle_multipass_end (const void *_data)
30149 const_ix86_first_cycle_multipass_data_t data
30150 = (const_ix86_first_cycle_multipass_data_t) _data;
30151 ix86_first_cycle_multipass_data_t next_data
30152 = ix86_first_cycle_multipass_data;
30154 if (data != NULL)
30156 next_data->ifetch_block_len = data->ifetch_block_len;
30157 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
30161 /* Deallocate target data. */
30162 static void
30163 core2i7_first_cycle_multipass_fini (void *_data)
30165 ix86_first_cycle_multipass_data_t data
30166 = (ix86_first_cycle_multipass_data_t) _data;
30168 if (data->ready_try_change)
30170 sbitmap_free (data->ready_try_change);
30171 data->ready_try_change = NULL;
30172 data->ready_try_change_size = 0;
30176 /* Prepare for scheduling pass. */
30177 static void
30178 ix86_sched_init_global (FILE *, int, int)
30180 /* Install scheduling hooks for current CPU. Some of these hooks are used
30181 in time-critical parts of the scheduler, so we only set them up when
30182 they are actually used. */
30183 switch (ix86_tune)
30185 case PROCESSOR_CORE2:
30186 case PROCESSOR_NEHALEM:
30187 case PROCESSOR_SANDYBRIDGE:
30188 case PROCESSOR_HASWELL:
30189 /* Do not perform multipass scheduling for pre-reload schedule
30190 to save compile time. */
30191 if (reload_completed)
30193 targetm.sched.dfa_post_advance_cycle
30194 = core2i7_dfa_post_advance_cycle;
30195 targetm.sched.first_cycle_multipass_init
30196 = core2i7_first_cycle_multipass_init;
30197 targetm.sched.first_cycle_multipass_begin
30198 = core2i7_first_cycle_multipass_begin;
30199 targetm.sched.first_cycle_multipass_issue
30200 = core2i7_first_cycle_multipass_issue;
30201 targetm.sched.first_cycle_multipass_backtrack
30202 = core2i7_first_cycle_multipass_backtrack;
30203 targetm.sched.first_cycle_multipass_end
30204 = core2i7_first_cycle_multipass_end;
30205 targetm.sched.first_cycle_multipass_fini
30206 = core2i7_first_cycle_multipass_fini;
30208 /* Set decoder parameters. */
30209 core2i7_secondary_decoder_max_insn_size = 8;
30210 core2i7_ifetch_block_size = 16;
30211 core2i7_ifetch_block_max_insns = 6;
30212 break;
30214 /* Fall through. */
30215 default:
30216 targetm.sched.dfa_post_advance_cycle = NULL;
30217 targetm.sched.first_cycle_multipass_init = NULL;
30218 targetm.sched.first_cycle_multipass_begin = NULL;
30219 targetm.sched.first_cycle_multipass_issue = NULL;
30220 targetm.sched.first_cycle_multipass_backtrack = NULL;
30221 targetm.sched.first_cycle_multipass_end = NULL;
30222 targetm.sched.first_cycle_multipass_fini = NULL;
30223 break;
30228 /* Compute the alignment given to a constant that is being placed in memory.
30229 EXP is the constant and ALIGN is the alignment that the object would
30230 ordinarily have.
30231 The value of this function is used instead of that alignment to align
30232 the object. */
30234 int
30235 ix86_constant_alignment (tree exp, int align)
30237 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
30238 || TREE_CODE (exp) == INTEGER_CST)
30240 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
30241 return 64;
30242 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
30243 return 128;
30245 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
30246 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
30247 return BITS_PER_WORD;
30249 return align;
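/* Worked examples (illustrative): a DFmode constant such as 1.0 placed in
   memory gets at least 64-bit alignment; a 128-bit vector constant gets
   128-bit alignment; a string literal of 31 or more characters is aligned to
   BITS_PER_WORD unless optimizing for size.  */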
30252 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
30253 the data type, and ALIGN is the alignment that the object would
30254 ordinarily have. */
30256 static int
30257 iamcu_alignment (tree type, int align)
30259 enum machine_mode mode;
30261 if (align < 32 || TYPE_USER_ALIGN (type))
30262 return align;
30264 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
30265 bytes. */
30266 mode = TYPE_MODE (strip_array_types (type));
30267 switch (GET_MODE_CLASS (mode))
30269 case MODE_INT:
30270 case MODE_COMPLEX_INT:
30271 case MODE_COMPLEX_FLOAT:
30272 case MODE_FLOAT:
30273 case MODE_DECIMAL_FLOAT:
30274 return 32;
30275 default:
30276 return align;
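/* Worked example (illustrative): under the IAMCU psABI a scalar that would
   ordinarily be 8-byte aligned, e.g. a double or long long, is capped at
   32-bit (4-byte) alignment unless the user requested a larger alignment
   explicitly.  */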
30280 /* Compute the alignment for a static variable.
30281 TYPE is the data type, and ALIGN is the alignment that
30282 the object would ordinarily have. The value of this function is used
30283 instead of that alignment to align the object. */
30285 int
30286 ix86_data_alignment (tree type, int align, bool opt)
30288 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
30289 for symbols from other compilation units or symbols that don't need
30290 to bind locally. In order to preserve some ABI compatibility with
30291 those compilers, ensure we don't decrease alignment from what we
30292 used to assume. */
30294 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
30296 /* A data structure equal to or greater than the size of a cache line
30297 (64 bytes in the Pentium 4 and other recent Intel processors, including
30298 processors based on Intel Core microarchitecture) should be aligned
30299 so that its base address is a multiple of a cache line size. */
30301 int max_align
30302 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
30304 if (max_align < BITS_PER_WORD)
30305 max_align = BITS_PER_WORD;
30307 switch (ix86_align_data_type)
30309 case ix86_align_data_type_abi: opt = false; break;
30310 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
30311 case ix86_align_data_type_cacheline: break;
30314 if (TARGET_IAMCU)
30315 align = iamcu_alignment (type, align);
30317 if (opt
30318 && AGGREGATE_TYPE_P (type)
30319 && TYPE_SIZE (type)
30320 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
30322 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
30323 && align < max_align_compat)
30324 align = max_align_compat;
30325 if (wi::geu_p (TYPE_SIZE (type), max_align)
30326 && align < max_align)
30327 align = max_align;
30330 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
30331 to a 16-byte boundary. */
30332 if (TARGET_64BIT)
30334 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
30335 && TYPE_SIZE (type)
30336 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30337 && wi::geu_p (TYPE_SIZE (type), 128)
30338 && align < 128)
30339 return 128;
30342 if (!opt)
30343 return align;
30345 if (TREE_CODE (type) == ARRAY_TYPE)
30347 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30348 return 64;
30349 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30350 return 128;
30352 else if (TREE_CODE (type) == COMPLEX_TYPE)
30355 if (TYPE_MODE (type) == DCmode && align < 64)
30356 return 64;
30357 if ((TYPE_MODE (type) == XCmode
30358 || TYPE_MODE (type) == TCmode) && align < 128)
30359 return 128;
30361 else if ((TREE_CODE (type) == RECORD_TYPE
30362 || TREE_CODE (type) == UNION_TYPE
30363 || TREE_CODE (type) == QUAL_UNION_TYPE)
30364 && TYPE_FIELDS (type))
30366 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30367 return 64;
30368 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30369 return 128;
30371 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30372 || TREE_CODE (type) == INTEGER_TYPE)
30374 if (TYPE_MODE (type) == DFmode && align < 64)
30375 return 64;
30376 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30377 return 128;
30380 return align;
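/* Illustrative sketch of the routine above (hypothetical calls; real
   callers come from the target's DATA_ALIGNMENT/DATA_ABI_ALIGNMENT macros,
   and the results assume a 64-bit target with default tuning):

     tree t3  = build_array_type_nelts (double_type_node, 3);    24 bytes
     tree t40 = build_array_type_nelts (double_type_node, 40);   320 bytes

     ix86_data_alignment (t3, 64, true);    128 - the x86-64 rule for
                                            arrays of at least 16 bytes
     ix86_data_alignment (t40, 64, true);   256 - the GCC 4.8 compatibility
                                            bump for large aggregates
     ix86_data_alignment (t40, 64, false);  128 - with OPT false only the
                                            ABI-mandated bump is applied

   All alignments are in bits.  */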
30383 /* Compute the alignment for a local variable or a stack slot. EXP is
30384 the data type or decl itself, MODE is the widest mode available and
30385 ALIGN is the alignment that the object would ordinarily have. The
30386 value of this function is used instead of that alignment to align the
30387 object. */
30389 unsigned int
30390 ix86_local_alignment (tree exp, machine_mode mode,
30391 unsigned int align)
30393 tree type, decl;
30395 if (exp && DECL_P (exp))
30397 type = TREE_TYPE (exp);
30398 decl = exp;
30400 else
30402 type = exp;
30403 decl = NULL;
30406 /* Don't do dynamic stack realignment for long long objects with
30407 -mpreferred-stack-boundary=2. */
30408 if (!TARGET_64BIT
30409 && align == 64
30410 && ix86_preferred_stack_boundary < 64
30411 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
30412 && (!type || !TYPE_USER_ALIGN (type))
30413 && (!decl || !DECL_USER_ALIGN (decl)))
30414 align = 32;
30416 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
30417 register in MODE. We will return the larger of the XF and DF
30418 alignments. */
30419 if (!type)
30421 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
30422 align = GET_MODE_ALIGNMENT (DFmode);
30423 return align;
30426 /* Don't increase alignment for Intel MCU psABI. */
30427 if (TARGET_IAMCU)
30428 return align;
30430 /* The x86-64 ABI requires arrays larger than 16 bytes to be aligned
30431 to a 16-byte boundary. The exact wording is:
30433 An array uses the same alignment as its elements, except that a local or
30434 global array variable of length at least 16 bytes or
30435 a C99 variable-length array variable always has alignment of at least 16 bytes.
30437 This was added to allow use of aligned SSE instructions on arrays. The
30438 rule is meant for static storage (where the compiler cannot do the analysis
30439 by itself). We follow it for automatic variables only when convenient.
30440 We fully control everything in the function being compiled, and functions
30441 from other units cannot rely on the alignment.
30443 Exclude the va_list type. It is the common case of a local array where
30444 we cannot benefit from the alignment.
30446 TODO: Probably one should optimize for size only when the variable does not escape. */
30447 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30448 && TARGET_SSE)
30450 if (AGGREGATE_TYPE_P (type)
30451 && (va_list_type_node == NULL_TREE
30452 || (TYPE_MAIN_VARIANT (type)
30453 != TYPE_MAIN_VARIANT (va_list_type_node)))
30454 && TYPE_SIZE (type)
30455 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30456 && wi::geu_p (TYPE_SIZE (type), 16)
30457 && align < 128)
30458 return 128;
30460 if (TREE_CODE (type) == ARRAY_TYPE)
30462 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30463 return 64;
30464 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30465 return 128;
30467 else if (TREE_CODE (type) == COMPLEX_TYPE)
30469 if (TYPE_MODE (type) == DCmode && align < 64)
30470 return 64;
30471 if ((TYPE_MODE (type) == XCmode
30472 || TYPE_MODE (type) == TCmode) && align < 128)
30473 return 128;
30475 else if ((TREE_CODE (type) == RECORD_TYPE
30476 || TREE_CODE (type) == UNION_TYPE
30477 || TREE_CODE (type) == QUAL_UNION_TYPE)
30478 && TYPE_FIELDS (type))
30480 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30481 return 64;
30482 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30483 return 128;
30485 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30486 || TREE_CODE (type) == INTEGER_TYPE)
30489 if (TYPE_MODE (type) == DFmode && align < 64)
30490 return 64;
30491 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30492 return 128;
30494 return align;
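/* Illustrative sketch of the function above (hypothetical cases; the real
   callers are the target's LOCAL_ALIGNMENT/STACK_SLOT_ALIGNMENT macros):

     - A local "double buf[4]" in a function optimized for speed on an
       SSE-enabled 64-bit target hits the aggregate branch and gets 128-bit
       alignment, so aligned SSE accesses to it are safe.

     - A local __builtin_va_list (an array type on x86-64) is explicitly
       excluded and keeps its ordinary alignment.

     - A caller-save spill slot (TYPE == NULL) in XFmode is given at least
       GET_MODE_ALIGNMENT (DFmode), as handled near the top.  */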
30497 /* Compute the minimum required alignment for dynamic stack realignment
30498 purposes for a local variable, parameter or a stack slot. EXP is
30499 the data type or decl itself, MODE is its mode and ALIGN is the
30500 alignment that the object would ordinarily have. */
30502 unsigned int
30503 ix86_minimum_alignment (tree exp, machine_mode mode,
30504 unsigned int align)
30506 tree type, decl;
30508 if (exp && DECL_P (exp))
30510 type = TREE_TYPE (exp);
30511 decl = exp;
30513 else
30515 type = exp;
30516 decl = NULL;
30519 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30520 return align;
30522 /* Don't do dynamic stack realignment for long long objects with
30523 -mpreferred-stack-boundary=2. */
30524 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30525 && (!type || !TYPE_USER_ALIGN (type))
30526 && (!decl || !DECL_USER_ALIGN (decl)))
30528 gcc_checking_assert (!TARGET_STV);
30529 return 32;
30532 return align;
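/* Illustrative sketch: with -m32 -mpreferred-stack-boundary=2, a local

     long long x;

   would ordinarily request 64-bit alignment; the function above reports a
   minimum of 32 bits for it, so dynamic stack realignment is not forced
   just for this object. On 64-bit targets, or when the user writes
   __attribute__ ((aligned (8))) on the declaration, the full request is
   kept.  */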
30535 /* Find a location for the static chain incoming to a nested function.
30536 This is a register, unless all free registers are used by arguments. */
30538 static rtx
30539 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30541 unsigned regno;
30543 /* While this function won't be called by the middle-end when a static
30544 chain isn't needed, it's also used throughout the backend so it's
30545 easiest to keep this check centralized. */
30546 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
30547 return NULL;
30549 if (TARGET_64BIT)
30551 /* We always use R10 in 64-bit mode. */
30552 regno = R10_REG;
30554 else
30556 const_tree fntype, fndecl;
30557 unsigned int ccvt;
30559 /* By default in 32-bit mode we use ECX to pass the static chain. */
30560 regno = CX_REG;
30562 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30564 fntype = TREE_TYPE (fndecl_or_type);
30565 fndecl = fndecl_or_type;
30567 else
30569 fntype = fndecl_or_type;
30570 fndecl = NULL;
30573 ccvt = ix86_get_callcvt (fntype);
30574 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30576 /* Fastcall functions use ecx/edx for arguments, which leaves
30577 us with EAX for the static chain.
30578 Thiscall functions use ecx for arguments, which also
30579 leaves us with EAX for the static chain. */
30580 regno = AX_REG;
30582 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30584 /* Thiscall functions use ecx for arguments, which leaves
30585 us with EAX and EDX for the static chain.
30586 For ABI compatibility we use EAX. */
30587 regno = AX_REG;
30589 else if (ix86_function_regparm (fntype, fndecl) == 3)
30591 /* For regparm 3, we have no free call-clobbered registers in
30592 which to store the static chain. In order to implement this,
30593 we have the trampoline push the static chain to the stack.
30594 However, we can't push a value below the return address when
30595 we call the nested function directly, so we have to use an
30596 alternate entry point. For this we use ESI, and have the
30597 alternate entry point push ESI, so that things appear the
30598 same once we're executing the nested function. */
30599 if (incoming_p)
30601 if (fndecl == current_function_decl)
30602 ix86_static_chain_on_stack = true;
30603 return gen_frame_mem (SImode,
30604 plus_constant (Pmode,
30605 arg_pointer_rtx, -8));
30607 regno = SI_REG;
30611 return gen_rtx_REG (Pmode, regno);
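/* Summary of the register choices made above (a reader's aid only):

     64-bit                       -> %r10
     32-bit, default conventions  -> %ecx
     32-bit, fastcall / thiscall  -> %eax
     32-bit, regparm (3)          -> %esi for direct calls; a stack slot
                                     just below the return address when
                                     entered through the trampoline  */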
30614 /* Emit RTL insns to initialize the variable parts of a trampoline.
30615 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30616 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30617 to be passed to the target function. */
30619 static void
30620 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30622 rtx mem, fnaddr;
30623 int opcode;
30624 int offset = 0;
30626 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30628 if (TARGET_64BIT)
30630 int size;
30632 /* Load the function address into r11. Try to load the address using
30633 the shorter movl instead of movabs. We may want to support
30634 movq for kernel mode, but the kernel does not use trampolines at
30635 the moment. FNADDR is a 32-bit address and may not be in
30636 DImode when ptr_mode == SImode. Always use movl in this
30637 case. */
30638 if (ptr_mode == SImode
30639 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30641 fnaddr = copy_addr_to_reg (fnaddr);
30643 mem = adjust_address (m_tramp, HImode, offset);
30644 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30646 mem = adjust_address (m_tramp, SImode, offset + 2);
30647 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30648 offset += 6;
30650 else
30652 mem = adjust_address (m_tramp, HImode, offset);
30653 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30655 mem = adjust_address (m_tramp, DImode, offset + 2);
30656 emit_move_insn (mem, fnaddr);
30657 offset += 10;
30660 /* Load the static chain into r10 using movabs. Use the shorter movl
30661 instead of movabs when ptr_mode == SImode. */
30662 if (ptr_mode == SImode)
30664 opcode = 0xba41;
30665 size = 6;
30667 else
30669 opcode = 0xba49;
30670 size = 10;
30673 mem = adjust_address (m_tramp, HImode, offset);
30674 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30676 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30677 emit_move_insn (mem, chain_value);
30678 offset += size;
30680 /* Jump to r11; the last (unused) byte is a nop, only there to
30681 pad the write out to a single 32-bit store. */
30682 mem = adjust_address (m_tramp, SImode, offset);
30683 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30684 offset += 4;
30686 else
30688 rtx disp, chain;
30690 /* Depending on the static chain location, either load a register
30691 with a constant, or push the constant to the stack. All of the
30692 instructions are the same size. */
30693 chain = ix86_static_chain (fndecl, true);
30694 if (REG_P (chain))
30696 switch (REGNO (chain))
30698 case AX_REG:
30699 opcode = 0xb8; break;
30700 case CX_REG:
30701 opcode = 0xb9; break;
30702 default:
30703 gcc_unreachable ();
30706 else
30707 opcode = 0x68;
30709 mem = adjust_address (m_tramp, QImode, offset);
30710 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30712 mem = adjust_address (m_tramp, SImode, offset + 1);
30713 emit_move_insn (mem, chain_value);
30714 offset += 5;
30716 mem = adjust_address (m_tramp, QImode, offset);
30717 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30719 mem = adjust_address (m_tramp, SImode, offset + 1);
30721 /* Compute offset from the end of the jmp to the target function.
30722 In the case in which the trampoline stores the static chain on
30723 the stack, we need to skip the first insn which pushes the
30724 (call-saved) register static chain; this push is 1 byte. */
30725 offset += 5;
30726 disp = expand_binop (SImode, sub_optab, fnaddr,
30727 plus_constant (Pmode, XEXP (m_tramp, 0),
30728 offset - (MEM_P (chain) ? 1 : 0)),
30729 NULL_RTX, 1, OPTAB_DIRECT);
30730 emit_move_insn (mem, disp);
30733 gcc_assert (offset <= TRAMPOLINE_SIZE);
30735 #ifdef HAVE_ENABLE_EXECUTE_STACK
30736 #ifdef CHECK_EXECUTE_STACK_ENABLED
30737 if (CHECK_EXECUTE_STACK_ENABLED)
30738 #endif
30739 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30740 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
30741 #endif
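/* Byte layout emitted above, as a reader's aid (only the shapes are shown;
   the exact offsets depend on whether the movl or movabs forms were used):

     64-bit:
       41 bb imm32   movl   $FNADDR, %r11d    (or 49 bb imm64, movabs)
       41 ba imm32   movl   $CHAIN,  %r10d    (or 49 ba imm64, movabs)
       49 ff e3      jmp    *%r11
       90            nop                      (pads the jmp to one SImode store)

     32-bit:
       b8/b9 imm32   movl   $CHAIN, %eax/%ecx (or 68 imm32, pushl $CHAIN)
       e9 rel32      jmp    FNADDR            (FNADDR+1 when the chain was
                                               pushed, skipping the push at
                                               the alternate entry point)  */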
30744 /* The following file contains several enumerations and data structures
30745 built from the definitions in i386-builtin-types.def. */
30747 #include "i386-builtin-types.inc"
30749 /* Table for the ix86 builtin non-function types. */
30750 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30752 /* Retrieve an element from the above table, building some of
30753 the types lazily. */
30755 static tree
30756 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30758 unsigned int index;
30759 tree type, itype;
30761 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30763 type = ix86_builtin_type_tab[(int) tcode];
30764 if (type != NULL)
30765 return type;
30767 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30768 if (tcode <= IX86_BT_LAST_VECT)
30770 machine_mode mode;
30772 index = tcode - IX86_BT_LAST_PRIM - 1;
30773 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30774 mode = ix86_builtin_type_vect_mode[index];
30776 type = build_vector_type_for_mode (itype, mode);
30778 else
30780 int quals;
30782 index = tcode - IX86_BT_LAST_VECT - 1;
30783 if (tcode <= IX86_BT_LAST_PTR)
30784 quals = TYPE_UNQUALIFIED;
30785 else
30786 quals = TYPE_QUAL_CONST;
30788 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30789 if (quals != TYPE_UNQUALIFIED)
30790 itype = build_qualified_type (itype, quals);
30792 type = build_pointer_type (itype);
30795 ix86_builtin_type_tab[(int) tcode] = type;
30796 return type;
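/* Sketch of the lazy construction above (the enumerator names come from
   the generated i386-builtin-types.inc and are shown only as examples):

     ix86_get_builtin_type (IX86_BT_V4SF)
       first call builds build_vector_type_for_mode (float_type_node,
       V4SFmode) and caches it in ix86_builtin_type_tab; later calls
       return the cached tree.

     ix86_get_builtin_type (IX86_BT_PCFLOAT)
       builds "const float *" via build_qualified_type followed by
       build_pointer_type, again caching the result.  */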
30799 /* Table for the ix86 builtin function types. */
30800 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30802 /* Retrieve an element from the above table, building some of
30803 the types lazily. */
30805 static tree
30806 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30808 tree type;
30810 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30812 type = ix86_builtin_func_type_tab[(int) tcode];
30813 if (type != NULL)
30814 return type;
30816 if (tcode <= IX86_BT_LAST_FUNC)
30818 unsigned start = ix86_builtin_func_start[(int) tcode];
30819 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30820 tree rtype, atype, args = void_list_node;
30821 unsigned i;
30823 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30824 for (i = after - 1; i > start; --i)
30826 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30827 args = tree_cons (NULL, atype, args);
30830 type = build_function_type (rtype, args);
30832 else
30834 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30835 enum ix86_builtin_func_type icode;
30837 icode = ix86_builtin_func_alias_base[index];
30838 type = ix86_get_builtin_func_type (icode);
30841 ix86_builtin_func_type_tab[(int) tcode] = type;
30842 return type;
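/* Sketch of how a function type code is assembled above (the spelling
   V2DF_FTYPE_V2DF_V2DF is one of the codes generated into
   i386-builtin-types.inc, used here only as an example):

     ix86_get_builtin_func_type (V2DF_FTYPE_V2DF_V2DF)
       START/AFTER bracket the entry { V2DF, V2DF, V2DF } in
       ix86_builtin_func_args; the first element becomes the return type
       and the remaining ones are consed back to front, giving roughly
       build_function_type (v2df, tree_cons (NULL, v2df,
                                  tree_cons (NULL, v2df, void_list_node))).

   Alias codes past IX86_BT_LAST_FUNC simply reuse the type of the
   function type they alias.  */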
30846 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30847 bdesc_* arrays below should come first, then builtins for each bdesc_*
30848 array in ascending order, so that we can use direct array accesses. */
30849 enum ix86_builtins
30851 IX86_BUILTIN_MASKMOVQ,
30852 IX86_BUILTIN_LDMXCSR,
30853 IX86_BUILTIN_STMXCSR,
30854 IX86_BUILTIN_MASKMOVDQU,
30855 IX86_BUILTIN_PSLLDQ128,
30856 IX86_BUILTIN_CLFLUSH,
30857 IX86_BUILTIN_MONITOR,
30858 IX86_BUILTIN_MWAIT,
30859 IX86_BUILTIN_CLZERO,
30860 IX86_BUILTIN_VEC_INIT_V2SI,
30861 IX86_BUILTIN_VEC_INIT_V4HI,
30862 IX86_BUILTIN_VEC_INIT_V8QI,
30863 IX86_BUILTIN_VEC_EXT_V2DF,
30864 IX86_BUILTIN_VEC_EXT_V2DI,
30865 IX86_BUILTIN_VEC_EXT_V4SF,
30866 IX86_BUILTIN_VEC_EXT_V4SI,
30867 IX86_BUILTIN_VEC_EXT_V8HI,
30868 IX86_BUILTIN_VEC_EXT_V2SI,
30869 IX86_BUILTIN_VEC_EXT_V4HI,
30870 IX86_BUILTIN_VEC_EXT_V16QI,
30871 IX86_BUILTIN_VEC_SET_V2DI,
30872 IX86_BUILTIN_VEC_SET_V4SF,
30873 IX86_BUILTIN_VEC_SET_V4SI,
30874 IX86_BUILTIN_VEC_SET_V8HI,
30875 IX86_BUILTIN_VEC_SET_V4HI,
30876 IX86_BUILTIN_VEC_SET_V16QI,
30877 IX86_BUILTIN_GATHERSIV2DF,
30878 IX86_BUILTIN_GATHERSIV4DF,
30879 IX86_BUILTIN_GATHERDIV2DF,
30880 IX86_BUILTIN_GATHERDIV4DF,
30881 IX86_BUILTIN_GATHERSIV4SF,
30882 IX86_BUILTIN_GATHERSIV8SF,
30883 IX86_BUILTIN_GATHERDIV4SF,
30884 IX86_BUILTIN_GATHERDIV8SF,
30885 IX86_BUILTIN_GATHERSIV2DI,
30886 IX86_BUILTIN_GATHERSIV4DI,
30887 IX86_BUILTIN_GATHERDIV2DI,
30888 IX86_BUILTIN_GATHERDIV4DI,
30889 IX86_BUILTIN_GATHERSIV4SI,
30890 IX86_BUILTIN_GATHERSIV8SI,
30891 IX86_BUILTIN_GATHERDIV4SI,
30892 IX86_BUILTIN_GATHERDIV8SI,
30893 IX86_BUILTIN_VFMSUBSD3_MASK3,
30894 IX86_BUILTIN_VFMSUBSS3_MASK3,
30895 IX86_BUILTIN_GATHER3SIV8SF,
30896 IX86_BUILTIN_GATHER3SIV4SF,
30897 IX86_BUILTIN_GATHER3SIV4DF,
30898 IX86_BUILTIN_GATHER3SIV2DF,
30899 IX86_BUILTIN_GATHER3DIV8SF,
30900 IX86_BUILTIN_GATHER3DIV4SF,
30901 IX86_BUILTIN_GATHER3DIV4DF,
30902 IX86_BUILTIN_GATHER3DIV2DF,
30903 IX86_BUILTIN_GATHER3SIV8SI,
30904 IX86_BUILTIN_GATHER3SIV4SI,
30905 IX86_BUILTIN_GATHER3SIV4DI,
30906 IX86_BUILTIN_GATHER3SIV2DI,
30907 IX86_BUILTIN_GATHER3DIV8SI,
30908 IX86_BUILTIN_GATHER3DIV4SI,
30909 IX86_BUILTIN_GATHER3DIV4DI,
30910 IX86_BUILTIN_GATHER3DIV2DI,
30911 IX86_BUILTIN_SCATTERSIV8SF,
30912 IX86_BUILTIN_SCATTERSIV4SF,
30913 IX86_BUILTIN_SCATTERSIV4DF,
30914 IX86_BUILTIN_SCATTERSIV2DF,
30915 IX86_BUILTIN_SCATTERDIV8SF,
30916 IX86_BUILTIN_SCATTERDIV4SF,
30917 IX86_BUILTIN_SCATTERDIV4DF,
30918 IX86_BUILTIN_SCATTERDIV2DF,
30919 IX86_BUILTIN_SCATTERSIV8SI,
30920 IX86_BUILTIN_SCATTERSIV4SI,
30921 IX86_BUILTIN_SCATTERSIV4DI,
30922 IX86_BUILTIN_SCATTERSIV2DI,
30923 IX86_BUILTIN_SCATTERDIV8SI,
30924 IX86_BUILTIN_SCATTERDIV4SI,
30925 IX86_BUILTIN_SCATTERDIV4DI,
30926 IX86_BUILTIN_SCATTERDIV2DI,
30927 /* Alternate 4- and 8-element gather/scatter for the vectorizer,
30928 where all operands are 32 or 64 bytes wide respectively. */
30929 IX86_BUILTIN_GATHERALTSIV4DF,
30930 IX86_BUILTIN_GATHERALTDIV8SF,
30931 IX86_BUILTIN_GATHERALTSIV4DI,
30932 IX86_BUILTIN_GATHERALTDIV8SI,
30933 IX86_BUILTIN_GATHER3ALTDIV16SF,
30934 IX86_BUILTIN_GATHER3ALTDIV16SI,
30935 IX86_BUILTIN_GATHER3ALTSIV4DF,
30936 IX86_BUILTIN_GATHER3ALTDIV8SF,
30937 IX86_BUILTIN_GATHER3ALTSIV4DI,
30938 IX86_BUILTIN_GATHER3ALTDIV8SI,
30939 IX86_BUILTIN_GATHER3ALTSIV8DF,
30940 IX86_BUILTIN_GATHER3ALTSIV8DI,
30941 IX86_BUILTIN_GATHER3DIV16SF,
30942 IX86_BUILTIN_GATHER3DIV16SI,
30943 IX86_BUILTIN_GATHER3DIV8DF,
30944 IX86_BUILTIN_GATHER3DIV8DI,
30945 IX86_BUILTIN_GATHER3SIV16SF,
30946 IX86_BUILTIN_GATHER3SIV16SI,
30947 IX86_BUILTIN_GATHER3SIV8DF,
30948 IX86_BUILTIN_GATHER3SIV8DI,
30949 IX86_BUILTIN_SCATTERALTSIV8DF,
30950 IX86_BUILTIN_SCATTERALTDIV16SF,
30951 IX86_BUILTIN_SCATTERALTSIV8DI,
30952 IX86_BUILTIN_SCATTERALTDIV16SI,
30953 IX86_BUILTIN_SCATTERDIV16SF,
30954 IX86_BUILTIN_SCATTERDIV16SI,
30955 IX86_BUILTIN_SCATTERDIV8DF,
30956 IX86_BUILTIN_SCATTERDIV8DI,
30957 IX86_BUILTIN_SCATTERSIV16SF,
30958 IX86_BUILTIN_SCATTERSIV16SI,
30959 IX86_BUILTIN_SCATTERSIV8DF,
30960 IX86_BUILTIN_SCATTERSIV8DI,
30961 IX86_BUILTIN_GATHERPFQPD,
30962 IX86_BUILTIN_GATHERPFDPS,
30963 IX86_BUILTIN_GATHERPFDPD,
30964 IX86_BUILTIN_GATHERPFQPS,
30965 IX86_BUILTIN_SCATTERPFDPD,
30966 IX86_BUILTIN_SCATTERPFDPS,
30967 IX86_BUILTIN_SCATTERPFQPD,
30968 IX86_BUILTIN_SCATTERPFQPS,
30969 IX86_BUILTIN_CLWB,
30970 IX86_BUILTIN_CLFLUSHOPT,
30971 IX86_BUILTIN_INFQ,
30972 IX86_BUILTIN_HUGE_VALQ,
30973 IX86_BUILTIN_NANQ,
30974 IX86_BUILTIN_NANSQ,
30975 IX86_BUILTIN_XABORT,
30976 IX86_BUILTIN_ADDCARRYX32,
30977 IX86_BUILTIN_ADDCARRYX64,
30978 IX86_BUILTIN_SBB32,
30979 IX86_BUILTIN_SBB64,
30980 IX86_BUILTIN_RDRAND16_STEP,
30981 IX86_BUILTIN_RDRAND32_STEP,
30982 IX86_BUILTIN_RDRAND64_STEP,
30983 IX86_BUILTIN_RDSEED16_STEP,
30984 IX86_BUILTIN_RDSEED32_STEP,
30985 IX86_BUILTIN_RDSEED64_STEP,
30986 IX86_BUILTIN_MONITORX,
30987 IX86_BUILTIN_MWAITX,
30988 IX86_BUILTIN_CFSTRING,
30989 IX86_BUILTIN_CPU_INIT,
30990 IX86_BUILTIN_CPU_IS,
30991 IX86_BUILTIN_CPU_SUPPORTS,
30992 IX86_BUILTIN_READ_FLAGS,
30993 IX86_BUILTIN_WRITE_FLAGS,
30995 /* All the remaining builtins are tracked in bdesc_* arrays in
30996 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
30997 this point. */
30998 #define BDESC(mask, icode, name, code, comparison, flag) \
30999 code,
31000 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31001 code, \
31002 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
31003 #define BDESC_END(kind, next_kind)
31005 #include "i386-builtin.def"
31007 #undef BDESC
31008 #undef BDESC_FIRST
31009 #undef BDESC_END
31011 IX86_BUILTIN_MAX,
31013 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
31015 /* Now just the aliases for bdesc_* start/end. */
31016 #define BDESC(mask, icode, name, code, comparison, flag)
31017 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
31018 #define BDESC_END(kind, next_kind) \
31019 IX86_BUILTIN__BDESC_##kind##_LAST \
31020 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
31022 #include "i386-builtin.def"
31024 #undef BDESC
31025 #undef BDESC_FIRST
31026 #undef BDESC_END
31028 /* Just to make sure there is no comma after the last enumerator. */
31029 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
31032 /* Table for the ix86 builtin decls. */
31033 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
31035 /* Table of all of the builtin functions that are possible with different ISAs
31036 but are waiting to be built until a function is declared to use that
31037 ISA. */
31038 struct builtin_isa {
31039 const char *name; /* function name */
31040 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
31041 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
31042 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
31043 bool const_p; /* true if the declaration is constant */
31044 bool leaf_p; /* true if the declaration has leaf attribute */
31045 bool nothrow_p; /* true if the declaration has nothrow attribute */
31046 bool set_and_not_built_p; /* true if the builtin has been deferred and not yet built */
31049 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
31051 /* Bits that can still enable inclusion of a builtin. */
31052 static HOST_WIDE_INT deferred_isa_values = 0;
31053 static HOST_WIDE_INT deferred_isa_values2 = 0;
31055 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
31056 of which isa_flags to use in the ix86_builtins_isa array. Store the
31057 function decl in the ix86_builtins array. Return the function decl,
31058 or NULL_TREE if the builtin was not added.
31060 If the front end has a special hook for builtin functions, delay adding
31061 builtin functions that aren't in the current ISA until the ISA is changed
31062 with function-specific optimization. Doing so can save about 300K for the
31063 default compiler. When the builtin is expanded, check at that time whether
31064 it is valid.
31066 If the front end doesn't have a special hook, record all builtins, even
31067 those that aren't in the current ISA, in case the user uses function-specific
31068 options for a different ISA, so that we don't get scope errors if a
31069 builtin is added in the middle of a function scope. */
31071 static inline tree
31072 def_builtin (HOST_WIDE_INT mask, const char *name,
31073 enum ix86_builtin_func_type tcode,
31074 enum ix86_builtins code)
31076 tree decl = NULL_TREE;
31078 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
31080 ix86_builtins_isa[(int) code].isa = mask;
31082 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
31083 where any set bit means that the built-in is enabled, this bit must be *and-ed*
31084 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
31085 means that *both* cpuid bits must be set for the built-in to be available.
31086 Handle this here. */
31087 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
31088 mask &= ~OPTION_MASK_ISA_AVX512VL;
31090 mask &= ~OPTION_MASK_ISA_64BIT;
31091 if (mask == 0
31092 || (mask & ix86_isa_flags) != 0
31093 || (lang_hooks.builtin_function
31094 == lang_hooks.builtin_function_ext_scope))
31097 tree type = ix86_get_builtin_func_type (tcode);
31098 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
31099 NULL, NULL_TREE);
31100 ix86_builtins[(int) code] = decl;
31101 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
31103 else
31105 /* Only a MASK whose builtin still has set_and_not_built_p == true can
31106 later cause that builtin to be created. */
31107 deferred_isa_values |= mask;
31108 ix86_builtins[(int) code] = NULL_TREE;
31109 ix86_builtins_isa[(int) code].tcode = tcode;
31110 ix86_builtins_isa[(int) code].name = name;
31111 ix86_builtins_isa[(int) code].leaf_p = false;
31112 ix86_builtins_isa[(int) code].nothrow_p = false;
31113 ix86_builtins_isa[(int) code].const_p = false;
31114 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
31118 return decl;
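/* Sketch of the AVX512VL combination handled above (the builtin name and
   code are placeholders, not real entries):

     def_builtin (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL,
                  "__builtin_ia32_example", VOID_FTYPE_UNSIGNED,
                  IX86_BUILTIN_EXAMPLE);

   When -mavx512vl is enabled, the VL bit is stripped first, so whether the
   decl is created right away is decided by the remaining AVX512DQ bit
   alone; whether the builtin is actually usable is rechecked when it is
   expanded, as the comment before def_builtin explains.  */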
31121 /* Like def_builtin, but also marks the function decl "const". */
31123 static inline tree
31124 def_builtin_const (HOST_WIDE_INT mask, const char *name,
31125 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
31127 tree decl = def_builtin (mask, name, tcode, code);
31128 if (decl)
31129 TREE_READONLY (decl) = 1;
31130 else
31131 ix86_builtins_isa[(int) code].const_p = true;
31133 return decl;
31136 /* Like def_builtin, but for additional isa2 flags. */
31138 static inline tree
31139 def_builtin2 (HOST_WIDE_INT mask, const char *name,
31140 enum ix86_builtin_func_type tcode,
31141 enum ix86_builtins code)
31143 tree decl = NULL_TREE;
31145 ix86_builtins_isa[(int) code].isa2 = mask;
31147 if (mask == 0
31148 || (mask & ix86_isa_flags2) != 0
31149 || (lang_hooks.builtin_function
31150 == lang_hooks.builtin_function_ext_scope))
31153 tree type = ix86_get_builtin_func_type (tcode);
31154 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
31155 NULL, NULL_TREE);
31156 ix86_builtins[(int) code] = decl;
31157 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
31159 else
31161 /* Only a MASK whose builtin still has set_and_not_built_p == true can
31162 later cause that builtin to be created. */
31163 deferred_isa_values2 |= mask;
31164 ix86_builtins[(int) code] = NULL_TREE;
31165 ix86_builtins_isa[(int) code].tcode = tcode;
31166 ix86_builtins_isa[(int) code].name = name;
31167 ix86_builtins_isa[(int) code].leaf_p = false;
31168 ix86_builtins_isa[(int) code].nothrow_p = false;
31169 ix86_builtins_isa[(int) code].const_p = false;
31170 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
31173 return decl;
31176 /* Like def_builtin, but also marks the function decl "const". */
31178 static inline tree
31179 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
31180 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
31182 tree decl = def_builtin2 (mask, name, tcode, code);
31183 if (decl)
31184 TREE_READONLY (decl) = 1;
31185 else
31186 ix86_builtins_isa[(int) code].const_p = true;
31188 return decl;
31191 /* Add any new builtin functions for a given ISA that may not have been
31192 declared. This saves a bit of space compared to adding all of the
31193 declarations to the tree, even if we didn't use them. */
31195 static void
31196 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
31198 if ((isa & deferred_isa_values) == 0
31199 && (isa2 & deferred_isa_values2) == 0)
31200 return;
31202 /* Bits in ISA and ISA2 can now be removed from the deferred isa values. */
31203 deferred_isa_values &= ~isa;
31204 deferred_isa_values2 &= ~isa2;
31206 int i;
31207 tree saved_current_target_pragma = current_target_pragma;
31208 current_target_pragma = NULL_TREE;
31210 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
31212 if (((ix86_builtins_isa[i].isa & isa) != 0
31213 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
31214 && ix86_builtins_isa[i].set_and_not_built_p)
31216 tree decl, type;
31218 /* Don't define the builtin again. */
31219 ix86_builtins_isa[i].set_and_not_built_p = false;
31221 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
31222 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
31223 type, i, BUILT_IN_MD, NULL,
31224 NULL_TREE);
31226 ix86_builtins[i] = decl;
31227 if (ix86_builtins_isa[i].const_p)
31228 TREE_READONLY (decl) = 1;
31229 if (ix86_builtins_isa[i].leaf_p)
31230 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31231 NULL_TREE);
31232 if (ix86_builtins_isa[i].nothrow_p)
31233 TREE_NOTHROW (decl) = 1;
31237 current_target_pragma = saved_current_target_pragma;
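/* Illustrative user-level effect of the deferral machinery (a sketch; the
   function below is an example, not part of GCC):

     // TU compiled without -mavx2; the AVX2 builtins start out deferred.
     #include <immintrin.h>

     __attribute__ ((target ("avx2")))
     __m256i add8 (__m256i a, __m256i b)
     {
       return _mm256_add_epi32 (a, b);
     }

   Turning on the extra ISA bits - here through the target attribute, or
   through the #pragma GCC target used inside the intrinsic headers -
   eventually reaches this function, which materialises the previously
   deferred builtin decls so the intrinsic can expand.  */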
31240 /* Bits for builtin_description.flag. */
31242 /* Set when we don't support the comparison natively, and should
31243 swap the comparison in order to support it. */
31244 #define BUILTIN_DESC_SWAP_OPERANDS 1
31246 struct builtin_description
31248 const HOST_WIDE_INT mask;
31249 const enum insn_code icode;
31250 const char *const name;
31251 const enum ix86_builtins code;
31252 const enum rtx_code comparison;
31253 const int flag;
31256 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
31257 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
31258 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
31259 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
31260 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
31261 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
31262 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
31263 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
31264 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
31265 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
31266 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
31267 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
31268 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
31269 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
31270 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
31271 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
31272 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
31273 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
31274 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
31275 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
31276 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
31277 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
31278 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
31279 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
31280 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
31281 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
31282 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
31283 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
31284 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
31285 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
31286 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
31287 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
31288 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
31289 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
31290 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
31291 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
31292 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
31293 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
31294 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
31295 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
31296 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
31297 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
31298 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
31299 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
31300 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
31301 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
31302 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
31303 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
31304 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
31305 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
31306 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
31307 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
31309 #define BDESC(mask, icode, name, code, comparison, flag) \
31310 { mask, icode, name, code, comparison, flag },
31311 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31312 static const struct builtin_description bdesc_##kind[] = \
31314 BDESC (mask, icode, name, code, comparison, flag)
31315 #define BDESC_END(kind, next_kind) \
31318 #include "i386-builtin.def"
31320 #undef BDESC
31321 #undef BDESC_FIRST
31322 #undef BDESC_END
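/* Sketch of what the include above expands to (the entry shown is made up;
   the real ones live in i386-builtin.def):

     BDESC_FIRST (args, ARGS,
                  OPTION_MASK_ISA_SSE2, CODE_FOR_example, "__builtin_ia32_example",
                  IX86_BUILTIN_EXAMPLE, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)

   becomes

     static const struct builtin_description bdesc_args[] =
     {
       { OPTION_MASK_ISA_SSE2, CODE_FOR_example, "__builtin_ia32_example",
         IX86_BUILTIN_EXAMPLE, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
       ... further BDESC entries ...

   and BDESC_END closes the array. The earlier definitions of the same
   macros turned the very same .def entries into IX86_BUILTIN_* enumerators,
   which keeps the enum and the arrays in sync by construction.  */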
31324 /* TM vector builtins. */
31326 /* Reuse the existing x86-specific `struct builtin_description' because
31327 we're lazy. Add casts to make them fit. */
31328 static const struct builtin_description bdesc_tm[] =
31330 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31331 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31332 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31333 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31334 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31335 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31336 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31338 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31339 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31340 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31341 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31342 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31343 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31344 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31346 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31347 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31348 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31349 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31350 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31351 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31352 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31354 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
31355 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31356 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31359 /* Initialize the transactional memory vector load/store builtins. */
31361 static void
31362 ix86_init_tm_builtins (void)
31364 enum ix86_builtin_func_type ftype;
31365 const struct builtin_description *d;
31366 size_t i;
31367 tree decl;
31368 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31369 tree attrs_log, attrs_type_log;
31371 if (!flag_tm)
31372 return;
31374 /* If there are no builtins defined, we must be compiling in a
31375 language without trans-mem support. */
31376 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31377 return;
31379 /* Use whatever attributes a normal TM load has. */
31380 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31381 attrs_load = DECL_ATTRIBUTES (decl);
31382 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31383 /* Use whatever attributes a normal TM store has. */
31384 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31385 attrs_store = DECL_ATTRIBUTES (decl);
31386 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31387 /* Use whatever attributes a normal TM log has. */
31388 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31389 attrs_log = DECL_ATTRIBUTES (decl);
31390 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31392 for (i = 0, d = bdesc_tm;
31393 i < ARRAY_SIZE (bdesc_tm);
31394 i++, d++)
31396 if ((d->mask & ix86_isa_flags) != 0
31397 || (lang_hooks.builtin_function
31398 == lang_hooks.builtin_function_ext_scope))
31400 tree type, attrs, attrs_type;
31401 enum built_in_function code = (enum built_in_function) d->code;
31403 ftype = (enum ix86_builtin_func_type) d->flag;
31404 type = ix86_get_builtin_func_type (ftype);
31406 if (BUILTIN_TM_LOAD_P (code))
31408 attrs = attrs_load;
31409 attrs_type = attrs_type_load;
31411 else if (BUILTIN_TM_STORE_P (code))
31413 attrs = attrs_store;
31414 attrs_type = attrs_type_store;
31416 else
31418 attrs = attrs_log;
31419 attrs_type = attrs_type_log;
31421 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31422 /* The builtin without the prefix for
31423 calling it directly. */
31424 d->name + strlen ("__builtin_"),
31425 attrs);
31426 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31427 set the TYPE_ATTRIBUTES. */
31428 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31430 set_builtin_decl (code, decl, false);
31435 /* Macros for verification of enum ix86_builtins order. */
31436 #define BDESC_VERIFY(x, y, z) \
31437 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31438 #define BDESC_VERIFYS(x, y, z) \
31439 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31441 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31442 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31443 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31444 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31445 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31446 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31447 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31448 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31449 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31450 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31451 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
31452 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31453 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31454 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
31455 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31456 IX86_BUILTIN__BDESC_MPX_LAST, 1);
31457 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31458 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31459 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31460 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31462 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
31463 in the current target ISA, so that the user can compile particular modules
31464 with target-specific options that differ from the command-line
31465 options. */
31466 static void
31467 ix86_init_mmx_sse_builtins (void)
31469 const struct builtin_description * d;
31470 enum ix86_builtin_func_type ftype;
31471 size_t i;
31473 /* Add all special builtins with variable number of operands. */
31474 for (i = 0, d = bdesc_special_args;
31475 i < ARRAY_SIZE (bdesc_special_args);
31476 i++, d++)
31478 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31479 if (d->name == 0)
31480 continue;
31482 ftype = (enum ix86_builtin_func_type) d->flag;
31483 def_builtin (d->mask, d->name, ftype, d->code);
31485 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31486 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31487 ARRAY_SIZE (bdesc_special_args) - 1);
31489 /* Add all builtins with variable number of operands. */
31490 for (i = 0, d = bdesc_args;
31491 i < ARRAY_SIZE (bdesc_args);
31492 i++, d++)
31494 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31495 if (d->name == 0)
31496 continue;
31498 ftype = (enum ix86_builtin_func_type) d->flag;
31499 def_builtin_const (d->mask, d->name, ftype, d->code);
31501 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31502 IX86_BUILTIN__BDESC_ARGS_FIRST,
31503 ARRAY_SIZE (bdesc_args) - 1);
31505 /* Add all builtins with variable number of operands. */
31506 for (i = 0, d = bdesc_args2;
31507 i < ARRAY_SIZE (bdesc_args2);
31508 i++, d++)
31510 if (d->name == 0)
31511 continue;
31513 ftype = (enum ix86_builtin_func_type) d->flag;
31514 def_builtin_const2 (d->mask, d->name, ftype, d->code);
31517 /* Add all builtins with rounding. */
31518 for (i = 0, d = bdesc_round_args;
31519 i < ARRAY_SIZE (bdesc_round_args);
31520 i++, d++)
31522 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31523 if (d->name == 0)
31524 continue;
31526 ftype = (enum ix86_builtin_func_type) d->flag;
31527 def_builtin_const (d->mask, d->name, ftype, d->code);
31529 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31530 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31531 ARRAY_SIZE (bdesc_round_args) - 1);
31533 /* pcmpestr[im] insns. */
31534 for (i = 0, d = bdesc_pcmpestr;
31535 i < ARRAY_SIZE (bdesc_pcmpestr);
31536 i++, d++)
31538 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31539 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31540 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31541 else
31542 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31543 def_builtin_const (d->mask, d->name, ftype, d->code);
31545 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31546 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31547 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31549 /* pcmpistr[im] insns. */
31550 for (i = 0, d = bdesc_pcmpistr;
31551 i < ARRAY_SIZE (bdesc_pcmpistr);
31552 i++, d++)
31554 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31555 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31556 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31557 else
31558 ftype = INT_FTYPE_V16QI_V16QI_INT;
31559 def_builtin_const (d->mask, d->name, ftype, d->code);
31561 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31562 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31563 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31565 /* comi/ucomi insns. */
31566 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31568 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31569 if (d->mask == OPTION_MASK_ISA_SSE2)
31570 ftype = INT_FTYPE_V2DF_V2DF;
31571 else
31572 ftype = INT_FTYPE_V4SF_V4SF;
31573 def_builtin_const (d->mask, d->name, ftype, d->code);
31575 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31576 IX86_BUILTIN__BDESC_COMI_FIRST,
31577 ARRAY_SIZE (bdesc_comi) - 1);
31579 /* SSE */
31580 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31581 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31582 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31583 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31585 /* SSE or 3DNow!A */
31586 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31587 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31588 IX86_BUILTIN_MASKMOVQ);
31590 /* SSE2 */
31591 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31592 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31594 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31595 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31596 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31597 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31599 /* SSE3. */
31600 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31601 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31602 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31603 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31605 /* AES */
31606 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
31607 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31608 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
31609 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31610 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
31611 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31612 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
31613 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31614 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
31615 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31616 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
31617 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31619 /* PCLMUL */
31620 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
31621 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31623 /* RDRND */
31624 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31625 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31626 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31627 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31628 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31629 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31630 IX86_BUILTIN_RDRAND64_STEP);
31632 /* AVX2 */
31633 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31634 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31635 IX86_BUILTIN_GATHERSIV2DF);
31637 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31638 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31639 IX86_BUILTIN_GATHERSIV4DF);
31641 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31642 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31643 IX86_BUILTIN_GATHERDIV2DF);
31645 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31646 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31647 IX86_BUILTIN_GATHERDIV4DF);
31649 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31650 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31651 IX86_BUILTIN_GATHERSIV4SF);
31653 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31654 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31655 IX86_BUILTIN_GATHERSIV8SF);
31657 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31658 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31659 IX86_BUILTIN_GATHERDIV4SF);
31661 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31662 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31663 IX86_BUILTIN_GATHERDIV8SF);
31665 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31666 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31667 IX86_BUILTIN_GATHERSIV2DI);
31669 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31670 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31671 IX86_BUILTIN_GATHERSIV4DI);
31673 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31674 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31675 IX86_BUILTIN_GATHERDIV2DI);
31677 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31678 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31679 IX86_BUILTIN_GATHERDIV4DI);
31681 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31682 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31683 IX86_BUILTIN_GATHERSIV4SI);
31685 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31686 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31687 IX86_BUILTIN_GATHERSIV8SI);
31689 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31690 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31691 IX86_BUILTIN_GATHERDIV4SI);
31693 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31694 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31695 IX86_BUILTIN_GATHERDIV8SI);
31697 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31698 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31699 IX86_BUILTIN_GATHERALTSIV4DF);
31701 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31702 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31703 IX86_BUILTIN_GATHERALTDIV8SF);
31705 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31706 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31707 IX86_BUILTIN_GATHERALTSIV4DI);
31709 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31710 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31711 IX86_BUILTIN_GATHERALTDIV8SI);
31713 /* AVX512F */
31714 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31715 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
31716 IX86_BUILTIN_GATHER3SIV16SF);
31718 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31719 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
31720 IX86_BUILTIN_GATHER3SIV8DF);
31722 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31723 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
31724 IX86_BUILTIN_GATHER3DIV16SF);
31726 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31727 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
31728 IX86_BUILTIN_GATHER3DIV8DF);
31730 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31731 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
31732 IX86_BUILTIN_GATHER3SIV16SI);
31734 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31735 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
31736 IX86_BUILTIN_GATHER3SIV8DI);
31738 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31739 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
31740 IX86_BUILTIN_GATHER3DIV16SI);
31742 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31743 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
31744 IX86_BUILTIN_GATHER3DIV8DI);
31746 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31747 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31748 IX86_BUILTIN_GATHER3ALTSIV8DF);
31750 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31751 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31752 IX86_BUILTIN_GATHER3ALTDIV16SF);
31754 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31755 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31756 IX86_BUILTIN_GATHER3ALTSIV8DI);
31758 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31759 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31760 IX86_BUILTIN_GATHER3ALTDIV16SI);
31762 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31763 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
31764 IX86_BUILTIN_SCATTERSIV16SF);
31766 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31767 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
31768 IX86_BUILTIN_SCATTERSIV8DF);
31770 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31771 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
31772 IX86_BUILTIN_SCATTERDIV16SF);
31774 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31775 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
31776 IX86_BUILTIN_SCATTERDIV8DF);
31778 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31779 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
31780 IX86_BUILTIN_SCATTERSIV16SI);
31782 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31783 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
31784 IX86_BUILTIN_SCATTERSIV8DI);
31786 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31787 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
31788 IX86_BUILTIN_SCATTERDIV16SI);
31790 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31791 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
31792 IX86_BUILTIN_SCATTERDIV8DI);
31794 /* AVX512VL */
31795 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31796 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
31797 IX86_BUILTIN_GATHER3SIV2DF);
31799 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31800 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
31801 IX86_BUILTIN_GATHER3SIV4DF);
31803 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31804 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
31805 IX86_BUILTIN_GATHER3DIV2DF);
31807 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31808 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
31809 IX86_BUILTIN_GATHER3DIV4DF);
31811 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31812 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
31813 IX86_BUILTIN_GATHER3SIV4SF);
31815 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31816 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
31817 IX86_BUILTIN_GATHER3SIV8SF);
31819 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31820 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
31821 IX86_BUILTIN_GATHER3DIV4SF);
31823 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31824 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
31825 IX86_BUILTIN_GATHER3DIV8SF);
31827 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31828 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
31829 IX86_BUILTIN_GATHER3SIV2DI);
31831 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31832 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
31833 IX86_BUILTIN_GATHER3SIV4DI);
31835 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31836 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
31837 IX86_BUILTIN_GATHER3DIV2DI);
31839 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31840 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
31841 IX86_BUILTIN_GATHER3DIV4DI);
31843 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31844 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
31845 IX86_BUILTIN_GATHER3SIV4SI);
31847 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31848 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
31849 IX86_BUILTIN_GATHER3SIV8SI);
31851 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31852 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
31853 IX86_BUILTIN_GATHER3DIV4SI);
31855 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31856 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
31857 IX86_BUILTIN_GATHER3DIV8SI);
31859 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31860 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31861 IX86_BUILTIN_GATHER3ALTSIV4DF);
31863 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31864 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31865 IX86_BUILTIN_GATHER3ALTDIV8SF);
31867 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31868 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31869 IX86_BUILTIN_GATHER3ALTSIV4DI);
31871 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31872 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31873 IX86_BUILTIN_GATHER3ALTDIV8SI);
31875 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31876 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
31877 IX86_BUILTIN_SCATTERSIV8SF);
31879 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31880 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
31881 IX86_BUILTIN_SCATTERSIV4SF);
31883 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31884 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
31885 IX86_BUILTIN_SCATTERSIV4DF);
31887 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31888 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
31889 IX86_BUILTIN_SCATTERSIV2DF);
31891 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31892 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
31893 IX86_BUILTIN_SCATTERDIV8SF);
31895 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31896 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
31897 IX86_BUILTIN_SCATTERDIV4SF);
31899 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31900 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
31901 IX86_BUILTIN_SCATTERDIV4DF);
31903 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31904 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
31905 IX86_BUILTIN_SCATTERDIV2DF);
31907 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31908 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
31909 IX86_BUILTIN_SCATTERSIV8SI);
31911 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31912 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
31913 IX86_BUILTIN_SCATTERSIV4SI);
31915 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31916 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
31917 IX86_BUILTIN_SCATTERSIV4DI);
31919 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31920 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
31921 IX86_BUILTIN_SCATTERSIV2DI);
31923 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31924 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
31925 IX86_BUILTIN_SCATTERDIV8SI);
31927 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31928 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
31929 IX86_BUILTIN_SCATTERDIV4SI);
31931 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31932 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
31933 IX86_BUILTIN_SCATTERDIV4DI);
31935 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31936 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
31937 IX86_BUILTIN_SCATTERDIV2DI);
31938 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31939 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31940 IX86_BUILTIN_SCATTERALTSIV8DF);
31942 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31943 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31944 IX86_BUILTIN_SCATTERALTDIV16SF);
31946 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31947 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31948 IX86_BUILTIN_SCATTERALTSIV8DI);
31950 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31951 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31952 IX86_BUILTIN_SCATTERALTDIV16SI);
31954 /* AVX512PF */
31955 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31956 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31957 IX86_BUILTIN_GATHERPFDPD);
31958 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31959 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31960 IX86_BUILTIN_GATHERPFDPS);
31961 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31962 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31963 IX86_BUILTIN_GATHERPFQPD);
31964 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31965 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31966 IX86_BUILTIN_GATHERPFQPS);
31967 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31968 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31969 IX86_BUILTIN_SCATTERPFDPD);
31970 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31971 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31972 IX86_BUILTIN_SCATTERPFDPS);
31973 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31974 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31975 IX86_BUILTIN_SCATTERPFQPD);
31976 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31977 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31978 IX86_BUILTIN_SCATTERPFQPS);
31980 /* SHA */
31981 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31982 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31983 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31984 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31985 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31986 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31987 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31988 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31989 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31990 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31991 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31992 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31993 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31994 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31996 /* RTM. */
31997 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31998 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
32000 /* MMX access to the vec_init patterns. */
32001 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
32002 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
32004 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
32005 V4HI_FTYPE_HI_HI_HI_HI,
32006 IX86_BUILTIN_VEC_INIT_V4HI);
32008 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
32009 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
32010 IX86_BUILTIN_VEC_INIT_V8QI);
32012 /* Access to the vec_extract patterns. */
32013 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
32014 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
32015 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
32016 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
32017 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
32018 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
32019 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
32020 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
32021 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
32022 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
32024 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32025 "__builtin_ia32_vec_ext_v4hi",
32026 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
32028 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
32029 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
32031 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
32032 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
32034 /* Access to the vec_set patterns. */
32035 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
32036 "__builtin_ia32_vec_set_v2di",
32037 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
32039 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
32040 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
32042 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
32043 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
32045 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
32046 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
32048 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32049 "__builtin_ia32_vec_set_v4hi",
32050 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
32052 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
32053 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
32055 /* RDSEED */
32056 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
32057 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
32058 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
32059 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
32060 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
32061 "__builtin_ia32_rdseed_di_step",
32062 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
32064 /* ADCX */
32065 def_builtin (0, "__builtin_ia32_addcarryx_u32",
32066 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
32067 def_builtin (OPTION_MASK_ISA_64BIT,
32068 "__builtin_ia32_addcarryx_u64",
32069 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
32070 IX86_BUILTIN_ADDCARRYX64);
32072 /* SBB */
32073 def_builtin (0, "__builtin_ia32_sbb_u32",
32074 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
32075 def_builtin (OPTION_MASK_ISA_64BIT,
32076 "__builtin_ia32_sbb_u64",
32077 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
32078 IX86_BUILTIN_SBB64);
32080 /* Read/write FLAGS. */
32081 def_builtin (0, "__builtin_ia32_readeflags_u32",
32082 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
32083 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
32084 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
32085 def_builtin (0, "__builtin_ia32_writeeflags_u32",
32086 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
32087 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
32088 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
32090 /* CLFLUSHOPT. */
32091 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
32092 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
32094 /* CLWB. */
32095 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
32096 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
32098 /* MONITORX and MWAITX. */
32099 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
32100 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
32101 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
32102 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
32104 /* CLZERO. */
32105 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
32106 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
32108 /* Add FMA4 multi-arg instructions. */
32109 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32111 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
32112 if (d->name == 0)
32113 continue;
32115 ftype = (enum ix86_builtin_func_type) d->flag;
32116 def_builtin_const (d->mask, d->name, ftype, d->code);
32118 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
32119 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32120 ARRAY_SIZE (bdesc_multi_arg) - 1);
32123 static void
32124 ix86_init_mpx_builtins ()
32126 const struct builtin_description * d;
32127 enum ix86_builtin_func_type ftype;
32128 tree decl;
32129 size_t i;
32131 for (i = 0, d = bdesc_mpx;
32132 i < ARRAY_SIZE (bdesc_mpx);
32133 i++, d++)
32135 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
32136 if (d->name == 0)
32137 continue;
32139 ftype = (enum ix86_builtin_func_type) d->flag;
32140 decl = def_builtin (d->mask, d->name, ftype, d->code);
32142 /* Without the leaf and nothrow flags on MPX builtins,
32143 abnormal edges may follow their calls when setjmp
32144 is present in the function. Since we may have a lot
32145 of MPX builtin calls, this causes lots of useless
32146 edges and enormous PHI nodes. To avoid this we mark
32147 MPX builtins as leaf and nothrow. */
32148 if (decl)
32150 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32151 NULL_TREE);
32152 TREE_NOTHROW (decl) = 1;
32154 else
32156 ix86_builtins_isa[(int)d->code].leaf_p = true;
32157 ix86_builtins_isa[(int)d->code].nothrow_p = true;
32160 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
32161 IX86_BUILTIN__BDESC_MPX_FIRST,
32162 ARRAY_SIZE (bdesc_mpx) - 1);
32164 for (i = 0, d = bdesc_mpx_const;
32165 i < ARRAY_SIZE (bdesc_mpx_const);
32166 i++, d++)
32168 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
32169 if (d->name == 0)
32170 continue;
32172 ftype = (enum ix86_builtin_func_type) d->flag;
32173 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
32175 if (decl)
32177 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32178 NULL_TREE);
32179 TREE_NOTHROW (decl) = 1;
32181 else
32183 ix86_builtins_isa[(int)d->code].leaf_p = true;
32184 ix86_builtins_isa[(int)d->code].nothrow_p = true;
32187 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
32188 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32189 ARRAY_SIZE (bdesc_mpx_const) - 1);
32191 #undef BDESC_VERIFY
32192 #undef BDESC_VERIFYS
32194 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
32195 to return a pointer to VERSION_DECL if the outcome of the expression
32196 formed by PREDICATE_CHAIN is true. This function will be called during
32197 version dispatch to decide which function version to execute. It returns
32198 the basic block at the end, to which more conditions can be added. */
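/* A minimal sketch of the block this builds (illustrative, not a literal
   GIMPLE dump): for a predicate chain made of __builtin_cpu_is ("arch")
   and __builtin_cpu_supports ("isa") calls, the result is roughly

     c1 = __builtin_cpu_is ("arch");
     c2 = __builtin_cpu_supports ("isa");
     c = MIN_EXPR <c2, c1>;
     if (c > 0)
       return (void *) &version_decl;
     else
       <fall through to the returned block, where more conditions go>

   matching the MIN_EXPR combination and GT_EXPR test emitted below.  */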
32200 static basic_block
32201 add_condition_to_bb (tree function_decl, tree version_decl,
32202 tree predicate_chain, basic_block new_bb)
32204 gimple *return_stmt;
32205 tree convert_expr, result_var;
32206 gimple *convert_stmt;
32207 gimple *call_cond_stmt;
32208 gimple *if_else_stmt;
32210 basic_block bb1, bb2, bb3;
32211 edge e12, e23;
32213 tree cond_var, and_expr_var = NULL_TREE;
32214 gimple_seq gseq;
32216 tree predicate_decl, predicate_arg;
32218 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
32220 gcc_assert (new_bb != NULL);
32221 gseq = bb_seq (new_bb);
32224 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
32225 build_fold_addr_expr (version_decl));
32226 result_var = create_tmp_var (ptr_type_node);
32227 convert_stmt = gimple_build_assign (result_var, convert_expr);
32228 return_stmt = gimple_build_return (result_var);
32230 if (predicate_chain == NULL_TREE)
32232 gimple_seq_add_stmt (&gseq, convert_stmt);
32233 gimple_seq_add_stmt (&gseq, return_stmt);
32234 set_bb_seq (new_bb, gseq);
32235 gimple_set_bb (convert_stmt, new_bb);
32236 gimple_set_bb (return_stmt, new_bb);
32237 pop_cfun ();
32238 return new_bb;
32241 while (predicate_chain != NULL)
32243 cond_var = create_tmp_var (integer_type_node);
32244 predicate_decl = TREE_PURPOSE (predicate_chain);
32245 predicate_arg = TREE_VALUE (predicate_chain);
32246 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
32247 gimple_call_set_lhs (call_cond_stmt, cond_var);
32249 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
32250 gimple_set_bb (call_cond_stmt, new_bb);
32251 gimple_seq_add_stmt (&gseq, call_cond_stmt);
32253 predicate_chain = TREE_CHAIN (predicate_chain);
32255 if (and_expr_var == NULL)
32256 and_expr_var = cond_var;
32257 else
32259 gimple *assign_stmt;
32260 /* Use MIN_EXPR to check whether any of the conditions is zero:
32261 and_expr_var = min_expr <cond_var, and_expr_var>. */
32262 assign_stmt = gimple_build_assign (and_expr_var,
32263 build2 (MIN_EXPR, integer_type_node,
32264 cond_var, and_expr_var));
32266 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
32267 gimple_set_bb (assign_stmt, new_bb);
32268 gimple_seq_add_stmt (&gseq, assign_stmt);
32272 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
32273 integer_zero_node,
32274 NULL_TREE, NULL_TREE);
32275 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
32276 gimple_set_bb (if_else_stmt, new_bb);
32277 gimple_seq_add_stmt (&gseq, if_else_stmt);
32279 gimple_seq_add_stmt (&gseq, convert_stmt);
32280 gimple_seq_add_stmt (&gseq, return_stmt);
32281 set_bb_seq (new_bb, gseq);
32283 bb1 = new_bb;
32284 e12 = split_block (bb1, if_else_stmt);
32285 bb2 = e12->dest;
32286 e12->flags &= ~EDGE_FALLTHRU;
32287 e12->flags |= EDGE_TRUE_VALUE;
32289 e23 = split_block (bb2, return_stmt);
32291 gimple_set_bb (convert_stmt, bb2);
32292 gimple_set_bb (return_stmt, bb2);
32294 bb3 = e23->dest;
32295 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
32297 remove_edge (e23);
32298 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
32300 pop_cfun ();
32302 return bb3;
32305 /* This parses the attribute arguments to target in DECL and determines
32306 the right builtin to use to match the platform specification.
32307 It returns the priority value for this version decl. If PREDICATE_LIST
32308 is not NULL, it stores the list of cpu features that need to be checked
32309 before dispatching this function. */
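/* Illustrative example (the names are hypothetical): for a version declared
     __attribute__ ((target ("arch=core2,avx"))) int foo (void);
   the returned priority is that of the highest feature involved (P_AVX here,
   which outranks P_PROC_SSSE3 for core2), and if PREDICATE_LIST is non-NULL
   it receives predicates equivalent to
     __builtin_cpu_is ("core2") && __builtin_cpu_supports ("avx").  */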
32311 static unsigned int
32312 get_builtin_code_for_version (tree decl, tree *predicate_list)
32314 tree attrs;
32315 struct cl_target_option cur_target;
32316 tree target_node;
32317 struct cl_target_option *new_target;
32318 const char *arg_str = NULL;
32319 const char *attrs_str = NULL;
32320 char *tok_str = NULL;
32321 char *token;
32323 /* Priority of i386 features; a greater value means higher priority. This is
32324 used to decide the order in which function dispatch must happen. For
32325 instance, a version specialized for SSE4.2 should be checked for dispatch
32326 before a version for SSE3, as SSE4.2 implies SSE3. */
32327 enum feature_priority
32329 P_ZERO = 0,
32330 P_MMX,
32331 P_SSE,
32332 P_SSE2,
32333 P_SSE3,
32334 P_SSSE3,
32335 P_PROC_SSSE3,
32336 P_SSE4_A,
32337 P_PROC_SSE4_A,
32338 P_SSE4_1,
32339 P_SSE4_2,
32340 P_PROC_SSE4_2,
32341 P_POPCNT,
32342 P_AES,
32343 P_PCLMUL,
32344 P_AVX,
32345 P_PROC_AVX,
32346 P_BMI,
32347 P_PROC_BMI,
32348 P_FMA4,
32349 P_XOP,
32350 P_PROC_XOP,
32351 P_FMA,
32352 P_PROC_FMA,
32353 P_BMI2,
32354 P_AVX2,
32355 P_PROC_AVX2,
32356 P_AVX512F,
32357 P_PROC_AVX512F
32360 enum feature_priority priority = P_ZERO;
32362 /* These are the target attribute strings for which a dispatcher is
32363 available, from fold_builtin_cpu. */
32365 static struct _feature_list
32367 const char *const name;
32368 const enum feature_priority priority;
32370 const feature_list[] =
32372 {"mmx", P_MMX},
32373 {"sse", P_SSE},
32374 {"sse2", P_SSE2},
32375 {"sse3", P_SSE3},
32376 {"sse4a", P_SSE4_A},
32377 {"ssse3", P_SSSE3},
32378 {"sse4.1", P_SSE4_1},
32379 {"sse4.2", P_SSE4_2},
32380 {"popcnt", P_POPCNT},
32381 {"aes", P_AES},
32382 {"pclmul", P_PCLMUL},
32383 {"avx", P_AVX},
32384 {"bmi", P_BMI},
32385 {"fma4", P_FMA4},
32386 {"xop", P_XOP},
32387 {"fma", P_FMA},
32388 {"bmi2", P_BMI2},
32389 {"avx2", P_AVX2},
32390 {"avx512f", P_AVX512F}
32394 static unsigned int NUM_FEATURES
32395 = sizeof (feature_list) / sizeof (struct _feature_list);
32397 unsigned int i;
32399 tree predicate_chain = NULL_TREE;
32400 tree predicate_decl, predicate_arg;
32402 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32403 gcc_assert (attrs != NULL);
32405 attrs = TREE_VALUE (TREE_VALUE (attrs));
32407 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32408 attrs_str = TREE_STRING_POINTER (attrs);
32410 /* Return priority zero for default function. */
32411 if (strcmp (attrs_str, "default") == 0)
32412 return 0;
32414 /* Handle arch= if specified. For priority, set it to be 1 more than
32415 the best instruction set the processor can handle. For instance, if
32416 there is a version for atom and a version for ssse3 (the highest ISA
32417 priority for atom), the atom version must be checked for dispatch
32418 before the ssse3 version. */
32419 if (strstr (attrs_str, "arch=") != NULL)
32421 cl_target_option_save (&cur_target, &global_options);
32422 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32423 &global_options_set);
32425 gcc_assert (target_node);
32426 new_target = TREE_TARGET_OPTION (target_node);
32427 gcc_assert (new_target);
32429 if (new_target->arch_specified && new_target->arch > 0)
32431 switch (new_target->arch)
32433 case PROCESSOR_CORE2:
32434 arg_str = "core2";
32435 priority = P_PROC_SSSE3;
32436 break;
32437 case PROCESSOR_NEHALEM:
32438 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32439 arg_str = "westmere";
32440 else
32441 /* We translate "arch=corei7" and "arch=nehalem" to
32442 "corei7" so that it will be mapped to M_INTEL_COREI7
32443 as cpu type to cover all M_INTEL_COREI7_XXXs. */
32444 arg_str = "corei7";
32445 priority = P_PROC_SSE4_2;
32446 break;
32447 case PROCESSOR_SANDYBRIDGE:
32448 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32449 arg_str = "ivybridge";
32450 else
32451 arg_str = "sandybridge";
32452 priority = P_PROC_AVX;
32453 break;
32454 case PROCESSOR_HASWELL:
32455 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32456 arg_str = "skylake-avx512";
32457 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
32458 arg_str = "skylake";
32459 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32460 arg_str = "broadwell";
32461 else
32462 arg_str = "haswell";
32463 priority = P_PROC_AVX2;
32464 break;
32465 case PROCESSOR_BONNELL:
32466 arg_str = "bonnell";
32467 priority = P_PROC_SSSE3;
32468 break;
32469 case PROCESSOR_KNL:
32470 arg_str = "knl";
32471 priority = P_PROC_AVX512F;
32472 break;
32473 case PROCESSOR_SILVERMONT:
32474 arg_str = "silvermont";
32475 priority = P_PROC_SSE4_2;
32476 break;
32477 case PROCESSOR_AMDFAM10:
32478 arg_str = "amdfam10h";
32479 priority = P_PROC_SSE4_A;
32480 break;
32481 case PROCESSOR_BTVER1:
32482 arg_str = "btver1";
32483 priority = P_PROC_SSE4_A;
32484 break;
32485 case PROCESSOR_BTVER2:
32486 arg_str = "btver2";
32487 priority = P_PROC_BMI;
32488 break;
32489 case PROCESSOR_BDVER1:
32490 arg_str = "bdver1";
32491 priority = P_PROC_XOP;
32492 break;
32493 case PROCESSOR_BDVER2:
32494 arg_str = "bdver2";
32495 priority = P_PROC_FMA;
32496 break;
32497 case PROCESSOR_BDVER3:
32498 arg_str = "bdver3";
32499 priority = P_PROC_FMA;
32500 break;
32501 case PROCESSOR_BDVER4:
32502 arg_str = "bdver4";
32503 priority = P_PROC_AVX2;
32504 break;
32505 case PROCESSOR_ZNVER1:
32506 arg_str = "znver1";
32507 priority = P_PROC_AVX2;
32508 break;
32512 cl_target_option_restore (&global_options, &cur_target);
32514 if (predicate_list && arg_str == NULL)
32516 error_at (DECL_SOURCE_LOCATION (decl),
32517 "No dispatcher found for the versioning attributes");
32518 return 0;
32521 if (predicate_list)
32523 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32524 /* For a C string literal the length includes the trailing NULL. */
32525 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32526 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32527 predicate_chain);
32531 /* Process feature name. */
32532 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32533 strcpy (tok_str, attrs_str);
32534 token = strtok (tok_str, ",");
32535 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32537 while (token != NULL)
32539 /* Do not process "arch=" */
32540 if (strncmp (token, "arch=", 5) == 0)
32542 token = strtok (NULL, ",");
32543 continue;
32545 for (i = 0; i < NUM_FEATURES; ++i)
32547 if (strcmp (token, feature_list[i].name) == 0)
32549 if (predicate_list)
32551 predicate_arg = build_string_literal (
32552 strlen (feature_list[i].name) + 1,
32553 feature_list[i].name);
32554 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32555 predicate_chain);
32557 /* Find the maximum priority feature. */
32558 if (feature_list[i].priority > priority)
32559 priority = feature_list[i].priority;
32561 break;
32564 if (predicate_list && i == NUM_FEATURES)
32566 error_at (DECL_SOURCE_LOCATION (decl),
32567 "No dispatcher found for %s", token);
32568 return 0;
32570 token = strtok (NULL, ",");
32572 free (tok_str);
32574 if (predicate_list && predicate_chain == NULL_TREE)
32576 error_at (DECL_SOURCE_LOCATION (decl),
32577 "No dispatcher found for the versioning attributes : %s",
32578 attrs_str);
32579 return 0;
32581 else if (predicate_list)
32583 predicate_chain = nreverse (predicate_chain);
32584 *predicate_list = predicate_chain;
32587 return priority;
32590 /* This compares the priority of target features in function DECL1
32591 and DECL2. It returns positive value if DECL1 is higher priority,
32592 negative value if DECL2 is higher priority and 0 if they are the
32593 same. */
32595 static int
32596 ix86_compare_version_priority (tree decl1, tree decl2)
32598 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32599 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32601 return (int)priority1 - (int)priority2;
32604 /* V1 and V2 point to function versions with different priorities
32605 based on the target ISA. This function compares their priorities. */
32607 static int
32608 feature_compare (const void *v1, const void *v2)
32610 typedef struct _function_version_info
32612 tree version_decl;
32613 tree predicate_chain;
32614 unsigned int dispatch_priority;
32615 } function_version_info;
32617 const function_version_info c1 = *(const function_version_info *)v1;
32618 const function_version_info c2 = *(const function_version_info *)v2;
32619 return (c2.dispatch_priority - c1.dispatch_priority);
32622 /* This function generates the dispatch function for
32623 multi-versioned functions. DISPATCH_DECL is the function which will
32624 contain the dispatch logic. FNDECLS are the function choices for
32625 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
32626 in DISPATCH_DECL in which the dispatch code is generated. */
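/* Rough shape of the resolver body built here (illustrative):

     __builtin_cpu_init ();
     if (<predicate of highest-priority version>)
       return <that version>;
     if (<predicate of next version>)
       return <that version>;
     ...
     return <default version>;

   Versions are sorted by descending dispatch priority before the
   conditions are added.  */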
32628 static int
32629 dispatch_function_versions (tree dispatch_decl,
32630 void *fndecls_p,
32631 basic_block *empty_bb)
32633 tree default_decl;
32634 gimple *ifunc_cpu_init_stmt;
32635 gimple_seq gseq;
32636 int ix;
32637 tree ele;
32638 vec<tree> *fndecls;
32639 unsigned int num_versions = 0;
32640 unsigned int actual_versions = 0;
32641 unsigned int i;
32643 struct _function_version_info
32645 tree version_decl;
32646 tree predicate_chain;
32647 unsigned int dispatch_priority;
32648 }*function_version_info;
32650 gcc_assert (dispatch_decl != NULL
32651 && fndecls_p != NULL
32652 && empty_bb != NULL);
32654 /* fndecls_p is actually a vector. */
32655 fndecls = static_cast<vec<tree> *> (fndecls_p);
32657 /* At least one more version other than the default. */
32658 num_versions = fndecls->length ();
32659 gcc_assert (num_versions >= 2);
32661 function_version_info = (struct _function_version_info *)
32662 XNEWVEC (struct _function_version_info, (num_versions - 1));
32664 /* The first version in the vector is the default decl. */
32665 default_decl = (*fndecls)[0];
32667 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32669 gseq = bb_seq (*empty_bb);
32670 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32671 constructors, so explicitly call __builtin_cpu_init here. */
32672 ifunc_cpu_init_stmt = gimple_build_call_vec (
32673 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32674 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32675 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32676 set_bb_seq (*empty_bb, gseq);
32678 pop_cfun ();
32681 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32683 tree version_decl = ele;
32684 tree predicate_chain = NULL_TREE;
32685 unsigned int priority;
32686 /* Get attribute string, parse it and find the right predicate decl.
32687 The predicate function could be a lengthy combination of many
32688 features, like arch-type and various isa-variants. */
32689 priority = get_builtin_code_for_version (version_decl,
32690 &predicate_chain);
32692 if (predicate_chain == NULL_TREE)
32693 continue;
32695 function_version_info [actual_versions].version_decl = version_decl;
32696 function_version_info [actual_versions].predicate_chain
32697 = predicate_chain;
32698 function_version_info [actual_versions].dispatch_priority = priority;
32699 actual_versions++;
32702 /* Sort the versions according to descending order of dispatch priority. The
32703 priority is based on the ISA. This is not a perfect solution. There
32704 could still be ambiguity. If more than one function version is suitable
32705 to execute, which one should be dispatched? In future, allow the user
32706 to specify a dispatch priority next to the version. */
32707 qsort (function_version_info, actual_versions,
32708 sizeof (struct _function_version_info), feature_compare);
32710 for (i = 0; i < actual_versions; ++i)
32711 *empty_bb = add_condition_to_bb (dispatch_decl,
32712 function_version_info[i].version_decl,
32713 function_version_info[i].predicate_chain,
32714 *empty_bb);
32716 /* dispatch default version at the end. */
32717 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32718 NULL, *empty_bb);
32720 free (function_version_info);
32721 return 0;
32724 /* Comparator function to be used in qsort routine to sort attribute
32725 specification strings to "target". */
32727 static int
32728 attr_strcmp (const void *v1, const void *v2)
32730 const char *c1 = *(char *const*)v1;
32731 const char *c2 = *(char *const*)v2;
32732 return strcmp (c1, c2);
32735 /* ARGLIST is the argument to target attribute. This function tokenizes
32736 the comma separated arguments, sorts them and returns a string which
32737 is a unique identifier for the comma separated arguments. It also
32738 replaces non-identifier characters "=,-" with "_". */
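/* Example (illustrative): for target ("avx,arch=core2") the tokens become
   "avx" and "arch_core2"; after sorting, the result is "arch_core2_avx".  */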
32740 static char *
32741 sorted_attr_string (tree arglist)
32743 tree arg;
32744 size_t str_len_sum = 0;
32745 char **args = NULL;
32746 char *attr_str, *ret_str;
32747 char *attr = NULL;
32748 unsigned int argnum = 1;
32749 unsigned int i;
32751 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32753 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32754 size_t len = strlen (str);
32755 str_len_sum += len + 1;
32756 if (arg != arglist)
32757 argnum++;
32758 for (i = 0; i < strlen (str); i++)
32759 if (str[i] == ',')
32760 argnum++;
32763 attr_str = XNEWVEC (char, str_len_sum);
32764 str_len_sum = 0;
32765 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32767 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32768 size_t len = strlen (str);
32769 memcpy (attr_str + str_len_sum, str, len);
32770 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
32771 str_len_sum += len + 1;
32774 /* Replace "=,-" with "_". */
32775 for (i = 0; i < strlen (attr_str); i++)
32776 if (attr_str[i] == '=' || attr_str[i]== '-')
32777 attr_str[i] = '_';
32779 if (argnum == 1)
32780 return attr_str;
32782 args = XNEWVEC (char *, argnum);
32784 i = 0;
32785 attr = strtok (attr_str, ",");
32786 while (attr != NULL)
32788 args[i] = attr;
32789 i++;
32790 attr = strtok (NULL, ",");
32793 qsort (args, argnum, sizeof (char *), attr_strcmp);
32795 ret_str = XNEWVEC (char, str_len_sum);
32796 str_len_sum = 0;
32797 for (i = 0; i < argnum; i++)
32799 size_t len = strlen (args[i]);
32800 memcpy (ret_str + str_len_sum, args[i], len);
32801 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
32802 str_len_sum += len + 1;
32805 XDELETEVEC (args);
32806 XDELETEVEC (attr_str);
32807 return ret_str;
32810 /* This function changes the assembler name for functions that are
32811 versions. If DECL is a function version and has a "target"
32812 attribute, it appends the attribute string to its assembler name. */
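/* For example (illustrative), a version of foo declared with
   __attribute__ ((target ("arch=core2,avx"))) gets the assembler name
   "foo.arch_core2_avx", while the "default" version keeps its name.  */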
32814 static tree
32815 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32817 tree version_attr;
32818 const char *orig_name, *version_string;
32819 char *attr_str, *assembler_name;
32821 if (DECL_DECLARED_INLINE_P (decl)
32822 && lookup_attribute ("gnu_inline",
32823 DECL_ATTRIBUTES (decl)))
32824 error_at (DECL_SOURCE_LOCATION (decl),
32825 "Function versions cannot be marked as gnu_inline,"
32826 " bodies have to be generated");
32828 if (DECL_VIRTUAL_P (decl)
32829 || DECL_VINDEX (decl))
32830 sorry ("Virtual function multiversioning not supported");
32832 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32834 /* target attribute string cannot be NULL. */
32835 gcc_assert (version_attr != NULL_TREE);
32837 orig_name = IDENTIFIER_POINTER (id);
32838 version_string
32839 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32841 if (strcmp (version_string, "default") == 0)
32842 return id;
32844 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32845 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32847 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32849 /* Allow assembler name to be modified if already set. */
32850 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32851 SET_DECL_RTL (decl, NULL);
32853 tree ret = get_identifier (assembler_name);
32854 XDELETEVEC (attr_str);
32855 XDELETEVEC (assembler_name);
32856 return ret;
32859 /* This function returns true if FN1 and FN2 are versions of the same function,
32860 that is, the target strings of the function decls are different. This assumes
32861 that FN1 and FN2 have the same signature. */
32863 static bool
32864 ix86_function_versions (tree fn1, tree fn2)
32866 tree attr1, attr2;
32867 char *target1, *target2;
32868 bool result;
32870 if (TREE_CODE (fn1) != FUNCTION_DECL
32871 || TREE_CODE (fn2) != FUNCTION_DECL)
32872 return false;
32874 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
32875 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
32877 /* At least one function decl should have the target attribute specified. */
32878 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
32879 return false;
32881 /* Diagnose missing target attribute if one of the decls is already
32882 multi-versioned. */
32883 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
32885 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
32887 if (attr2 != NULL_TREE)
32889 std::swap (fn1, fn2);
32890 attr1 = attr2;
32892 error_at (DECL_SOURCE_LOCATION (fn2),
32893 "missing %<target%> attribute for multi-versioned %D",
32894 fn2);
32895 inform (DECL_SOURCE_LOCATION (fn1),
32896 "previous declaration of %D", fn1);
32897 /* Prevent diagnosing of the same error multiple times. */
32898 DECL_ATTRIBUTES (fn2)
32899 = tree_cons (get_identifier ("target"),
32900 copy_node (TREE_VALUE (attr1)),
32901 DECL_ATTRIBUTES (fn2));
32903 return false;
32906 target1 = sorted_attr_string (TREE_VALUE (attr1));
32907 target2 = sorted_attr_string (TREE_VALUE (attr2));
32909 /* The sorted target strings must be different for fn1 and fn2
32910 to be versions. */
32911 if (strcmp (target1, target2) == 0)
32912 result = false;
32913 else
32914 result = true;
32916 XDELETEVEC (target1);
32917 XDELETEVEC (target2);
32919 return result;
32922 static tree
32923 ix86_mangle_decl_assembler_name (tree decl, tree id)
32925 /* For function version, add the target suffix to the assembler name. */
32926 if (TREE_CODE (decl) == FUNCTION_DECL
32927 && DECL_FUNCTION_VERSIONED (decl))
32928 id = ix86_mangle_function_version_assembler_name (decl, id);
32929 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32930 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32931 #endif
32933 return id;
32936 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
32937 is true, append the full path name of the source file. */
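/* E.g. make_name (decl_of_foo, "resolver", false) yields "foo.resolver";
   with MAKE_UNIQUE true, a file-local unique string is inserted in the
   middle, giving "foo.<unique>.resolver" (illustrative).  */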
32939 static char *
32940 make_name (tree decl, const char *suffix, bool make_unique)
32942 char *global_var_name;
32943 int name_len;
32944 const char *name;
32945 const char *unique_name = NULL;
32947 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
32949 /* Get a unique name that can be used globally without any chances
32950 of collision at link time. */
32951 if (make_unique)
32952 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
32954 name_len = strlen (name) + strlen (suffix) + 2;
32956 if (make_unique)
32957 name_len += strlen (unique_name) + 1;
32958 global_var_name = XNEWVEC (char, name_len);
32960 /* Use '.' to concatenate names as it is demangler friendly. */
32961 if (make_unique)
32962 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
32963 suffix);
32964 else
32965 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
32967 return global_var_name;
32970 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32972 /* Make a dispatcher declaration for the multi-versioned function DECL.
32973 Calls to DECL function will be replaced with calls to the dispatcher
32974 by the front-end. Return the decl created. */
32976 static tree
32977 make_dispatcher_decl (const tree decl)
32979 tree func_decl;
32980 char *func_name;
32981 tree fn_type, func_type;
32982 bool is_uniq = false;
32984 if (TREE_PUBLIC (decl) == 0)
32985 is_uniq = true;
32987 func_name = make_name (decl, "ifunc", is_uniq);
32989 fn_type = TREE_TYPE (decl);
32990 func_type = build_function_type (TREE_TYPE (fn_type),
32991 TYPE_ARG_TYPES (fn_type));
32993 func_decl = build_fn_decl (func_name, func_type);
32994 XDELETEVEC (func_name);
32995 TREE_USED (func_decl) = 1;
32996 DECL_CONTEXT (func_decl) = NULL_TREE;
32997 DECL_INITIAL (func_decl) = error_mark_node;
32998 DECL_ARTIFICIAL (func_decl) = 1;
32999 /* Mark this func as external, the resolver will flip it again if
33000 it gets generated. */
33001 DECL_EXTERNAL (func_decl) = 1;
33002 /* IFUNCs have to be externally visible. */
33003 TREE_PUBLIC (func_decl) = 1;
33005 return func_decl;
33008 #endif
33010 /* Returns true if DECL is multi-versioned and is the default function,
33011 that is, it is not tagged with target-specific options. */
33013 static bool
33014 is_function_default_version (const tree decl)
33016 if (TREE_CODE (decl) != FUNCTION_DECL
33017 || !DECL_FUNCTION_VERSIONED (decl))
33018 return false;
33019 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33020 gcc_assert (attr);
33021 attr = TREE_VALUE (TREE_VALUE (attr));
33022 return (TREE_CODE (attr) == STRING_CST
33023 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
33026 /* Make a dispatcher declaration for the multi-versioned function DECL.
33027 Calls to DECL function will be replaced with calls to the dispatcher
33028 by the front-end. Returns the decl of the dispatcher function. */
33030 static tree
33031 ix86_get_function_versions_dispatcher (void *decl)
33033 tree fn = (tree) decl;
33034 struct cgraph_node *node = NULL;
33035 struct cgraph_node *default_node = NULL;
33036 struct cgraph_function_version_info *node_v = NULL;
33037 struct cgraph_function_version_info *first_v = NULL;
33039 tree dispatch_decl = NULL;
33041 struct cgraph_function_version_info *default_version_info = NULL;
33043 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
33045 node = cgraph_node::get (fn);
33046 gcc_assert (node != NULL);
33048 node_v = node->function_version ();
33049 gcc_assert (node_v != NULL);
33051 if (node_v->dispatcher_resolver != NULL)
33052 return node_v->dispatcher_resolver;
33054 /* Find the default version and make it the first node. */
33055 first_v = node_v;
33056 /* Go to the beginning of the chain. */
33057 while (first_v->prev != NULL)
33058 first_v = first_v->prev;
33059 default_version_info = first_v;
33060 while (default_version_info != NULL)
33062 if (is_function_default_version
33063 (default_version_info->this_node->decl))
33064 break;
33065 default_version_info = default_version_info->next;
33068 /* If there is no default node, just return NULL. */
33069 if (default_version_info == NULL)
33070 return NULL;
33072 /* Make default info the first node. */
33073 if (first_v != default_version_info)
33075 default_version_info->prev->next = default_version_info->next;
33076 if (default_version_info->next)
33077 default_version_info->next->prev = default_version_info->prev;
33078 first_v->prev = default_version_info;
33079 default_version_info->next = first_v;
33080 default_version_info->prev = NULL;
33083 default_node = default_version_info->this_node;
33085 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
33086 if (targetm.has_ifunc_p ())
33088 struct cgraph_function_version_info *it_v = NULL;
33089 struct cgraph_node *dispatcher_node = NULL;
33090 struct cgraph_function_version_info *dispatcher_version_info = NULL;
33092 /* Right now, the dispatching is done via ifunc. */
33093 dispatch_decl = make_dispatcher_decl (default_node->decl);
33095 dispatcher_node = cgraph_node::get_create (dispatch_decl);
33096 gcc_assert (dispatcher_node != NULL);
33097 dispatcher_node->dispatcher_function = 1;
33098 dispatcher_version_info
33099 = dispatcher_node->insert_new_function_version ();
33100 dispatcher_version_info->next = default_version_info;
33101 dispatcher_node->definition = 1;
33103 /* Set the dispatcher for all the versions. */
33104 it_v = default_version_info;
33105 while (it_v != NULL)
33107 it_v->dispatcher_resolver = dispatch_decl;
33108 it_v = it_v->next;
33111 else
33112 #endif
33114 error_at (DECL_SOURCE_LOCATION (default_node->decl),
33115 "multiversioning needs ifunc which is not supported "
33116 "on this target");
33119 return dispatch_decl;
33122 /* Make the resolver function decl to dispatch the versions of
33123 a multi-versioned function, DEFAULT_DECL. Create an
33124 empty basic block in the resolver and store the pointer in
33125 EMPTY_BB. Return the decl of the resolver function. */
33127 static tree
33128 make_resolver_func (const tree default_decl,
33129 const tree dispatch_decl,
33130 basic_block *empty_bb)
33132 char *resolver_name;
33133 tree decl, type, decl_name, t;
33134 bool is_uniq = false;
33136 /* IFUNCs have to be globally visible. So, if the default_decl is
33137 not, then the name of the IFUNC should be made unique. */
33138 if (TREE_PUBLIC (default_decl) == 0)
33139 is_uniq = true;
33141 /* Append the filename to the resolver function if the versions are
33142 not externally visible. This is because the resolver function has
33143 to be externally visible for the loader to find it. So, appending
33144 the filename will prevent conflicts with a resolver function from
33145 another module which is based on the same version name. */
33146 resolver_name = make_name (default_decl, "resolver", is_uniq);
33148 /* The resolver function should return a (void *). */
33149 type = build_function_type_list (ptr_type_node, NULL_TREE);
33151 decl = build_fn_decl (resolver_name, type);
33152 decl_name = get_identifier (resolver_name);
33153 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
33155 DECL_NAME (decl) = decl_name;
33156 TREE_USED (decl) = 1;
33157 DECL_ARTIFICIAL (decl) = 1;
33158 DECL_IGNORED_P (decl) = 0;
33159 /* IFUNC resolvers have to be externally visible. */
33160 TREE_PUBLIC (decl) = 1;
33161 DECL_UNINLINABLE (decl) = 1;
33163 /* Resolver is not external, body is generated. */
33164 DECL_EXTERNAL (decl) = 0;
33165 DECL_EXTERNAL (dispatch_decl) = 0;
33167 DECL_CONTEXT (decl) = NULL_TREE;
33168 DECL_INITIAL (decl) = make_node (BLOCK);
33169 DECL_STATIC_CONSTRUCTOR (decl) = 0;
33171 if (DECL_COMDAT_GROUP (default_decl)
33172 || TREE_PUBLIC (default_decl))
33174 /* In this case, each translation unit with a call to this
33175 versioned function will put out a resolver. Ensure it
33176 is comdat to keep just one copy. */
33177 DECL_COMDAT (decl) = 1;
33178 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
33180 /* Build result decl and add to function_decl. */
33181 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
33182 DECL_ARTIFICIAL (t) = 1;
33183 DECL_IGNORED_P (t) = 1;
33184 DECL_RESULT (decl) = t;
33186 gimplify_function_tree (decl);
33187 push_cfun (DECL_STRUCT_FUNCTION (decl));
33188 *empty_bb = init_lowered_empty_function (decl, false, 0);
33190 cgraph_node::add_new_function (decl, true);
33191 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
33193 pop_cfun ();
33195 gcc_assert (dispatch_decl != NULL);
33196 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
33197 DECL_ATTRIBUTES (dispatch_decl)
33198 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
33200 /* Create the alias for dispatch to resolver here. */
33201 /*cgraph_create_function_alias (dispatch_decl, decl);*/
33202 cgraph_node::create_same_body_alias (dispatch_decl, decl);
33203 XDELETEVEC (resolver_name);
33204 return decl;
33207 /* Generate the dispatching code body to dispatch multi-versioned function
33208 DECL. The target hook is called to process the "target" attributes and
33209 provide the code to dispatch the right function at run-time. NODE points
33210 to the dispatcher decl whose body will be created. */
33212 static tree
33213 ix86_generate_version_dispatcher_body (void *node_p)
33215 tree resolver_decl;
33216 basic_block empty_bb;
33217 tree default_ver_decl;
33218 struct cgraph_node *versn;
33219 struct cgraph_node *node;
33221 struct cgraph_function_version_info *node_version_info = NULL;
33222 struct cgraph_function_version_info *versn_info = NULL;
33224 node = (cgraph_node *)node_p;
33226 node_version_info = node->function_version ();
33227 gcc_assert (node->dispatcher_function
33228 && node_version_info != NULL);
33230 if (node_version_info->dispatcher_resolver)
33231 return node_version_info->dispatcher_resolver;
33233 /* The first version in the chain corresponds to the default version. */
33234 default_ver_decl = node_version_info->next->this_node->decl;
33236 /* node is going to be an alias, so remove the finalized bit. */
33237 node->definition = false;
33239 resolver_decl = make_resolver_func (default_ver_decl,
33240 node->decl, &empty_bb);
33242 node_version_info->dispatcher_resolver = resolver_decl;
33244 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
33246 auto_vec<tree, 2> fn_ver_vec;
33248 for (versn_info = node_version_info->next; versn_info;
33249 versn_info = versn_info->next)
33251 versn = versn_info->this_node;
33252 /* Check for virtual functions here again, as by this time it should
33253 have been determined if this function needs a vtable index or
33254 not. This happens for methods in derived classes that override
33255 virtual methods in base classes but are not explicitly marked as
33256 virtual. */
33257 if (DECL_VINDEX (versn->decl))
33258 sorry ("Virtual function multiversioning not supported");
33260 fn_ver_vec.safe_push (versn->decl);
33263 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
33264 cgraph_edge::rebuild_edges ();
33265 pop_cfun ();
33266 return resolver_decl;
33268 /* This builds the processor_model struct type defined in
33269 libgcc/config/i386/cpuinfo.c */
33271 static tree
33272 build_processor_model_struct (void)
33274 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
33275 "__cpu_features"};
33276 tree field = NULL_TREE, field_chain = NULL_TREE;
33277 int i;
33278 tree type = make_node (RECORD_TYPE);
33280 /* The first 3 fields are unsigned int. */
33281 for (i = 0; i < 3; ++i)
33283 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
33284 get_identifier (field_name[i]), unsigned_type_node);
33285 if (field_chain != NULL_TREE)
33286 DECL_CHAIN (field) = field_chain;
33287 field_chain = field;
33290 /* The last field is an array of unsigned integers of size one. */
33291 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
33292 get_identifier (field_name[3]),
33293 build_array_type (unsigned_type_node,
33294 build_index_type (size_one_node)));
33295 if (field_chain != NULL_TREE)
33296 DECL_CHAIN (field) = field_chain;
33297 field_chain = field;
33299 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
33300 return type;
33303 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
33305 static tree
33306 make_var_decl (tree type, const char *name)
33308 tree new_decl;
33310 new_decl = build_decl (UNKNOWN_LOCATION,
33311 VAR_DECL,
33312 get_identifier(name),
33313 type);
33315 DECL_EXTERNAL (new_decl) = 1;
33316 TREE_STATIC (new_decl) = 1;
33317 TREE_PUBLIC (new_decl) = 1;
33318 DECL_INITIAL (new_decl) = 0;
33319 DECL_ARTIFICIAL (new_decl) = 0;
33320 DECL_PRESERVE_P (new_decl) = 1;
33322 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
33323 assemble_variable (new_decl, 0, 0, 0);
33325 return new_decl;
33328 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
33329 into an integer defined in libgcc/config/i386/cpuinfo.c */
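/* Illustrative folds, given the __cpu_model layout built below:
     __builtin_cpu_is ("intel")      -> __cpu_model.__cpu_vendor == M_INTEL
     __builtin_cpu_is ("haswell")    -> __cpu_model.__cpu_subtype
                                        == M_INTEL_COREI7_HASWELL - M_CPU_SUBTYPE_START
     __builtin_cpu_supports ("avx2") -> __cpu_model.__cpu_features[0] & (1 << F_AVX2)  */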
33331 static tree
33332 fold_builtin_cpu (tree fndecl, tree *args)
33334 unsigned int i;
33335 enum ix86_builtins fn_code = (enum ix86_builtins)
33336 DECL_FUNCTION_CODE (fndecl);
33337 tree param_string_cst = NULL;
33339 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
33340 enum processor_features
33342 F_CMOV = 0,
33343 F_MMX,
33344 F_POPCNT,
33345 F_SSE,
33346 F_SSE2,
33347 F_SSE3,
33348 F_SSSE3,
33349 F_SSE4_1,
33350 F_SSE4_2,
33351 F_AVX,
33352 F_AVX2,
33353 F_SSE4_A,
33354 F_FMA4,
33355 F_XOP,
33356 F_FMA,
33357 F_AVX512F,
33358 F_BMI,
33359 F_BMI2,
33360 F_AES,
33361 F_PCLMUL,
33362 F_AVX512VL,
33363 F_AVX512BW,
33364 F_AVX512DQ,
33365 F_AVX512CD,
33366 F_AVX512ER,
33367 F_AVX512PF,
33368 F_AVX512VBMI,
33369 F_AVX512IFMA,
33370 F_AVX5124VNNIW,
33371 F_AVX5124FMAPS,
33372 F_AVX512VPOPCNTDQ,
33373 F_MAX
33376 /* These are the values for vendor types and cpu types and subtypes
33377 in cpuinfo.c. The corresponding start value must be subtracted from
33378 cpu type and subtype values before they are compared. */
33379 enum processor_model
33381 M_INTEL = 1,
33382 M_AMD,
33383 M_CPU_TYPE_START,
33384 M_INTEL_BONNELL,
33385 M_INTEL_CORE2,
33386 M_INTEL_COREI7,
33387 M_AMDFAM10H,
33388 M_AMDFAM15H,
33389 M_INTEL_SILVERMONT,
33390 M_INTEL_KNL,
33391 M_AMD_BTVER1,
33392 M_AMD_BTVER2,
33393 M_CPU_SUBTYPE_START,
33394 M_INTEL_COREI7_NEHALEM,
33395 M_INTEL_COREI7_WESTMERE,
33396 M_INTEL_COREI7_SANDYBRIDGE,
33397 M_AMDFAM10H_BARCELONA,
33398 M_AMDFAM10H_SHANGHAI,
33399 M_AMDFAM10H_ISTANBUL,
33400 M_AMDFAM15H_BDVER1,
33401 M_AMDFAM15H_BDVER2,
33402 M_AMDFAM15H_BDVER3,
33403 M_AMDFAM15H_BDVER4,
33404 M_AMDFAM17H_ZNVER1,
33405 M_INTEL_COREI7_IVYBRIDGE,
33406 M_INTEL_COREI7_HASWELL,
33407 M_INTEL_COREI7_BROADWELL,
33408 M_INTEL_COREI7_SKYLAKE,
33409 M_INTEL_COREI7_SKYLAKE_AVX512
33412 static struct _arch_names_table
33414 const char *const name;
33415 const enum processor_model model;
33417 const arch_names_table[] =
33419 {"amd", M_AMD},
33420 {"intel", M_INTEL},
33421 {"atom", M_INTEL_BONNELL},
33422 {"slm", M_INTEL_SILVERMONT},
33423 {"core2", M_INTEL_CORE2},
33424 {"corei7", M_INTEL_COREI7},
33425 {"nehalem", M_INTEL_COREI7_NEHALEM},
33426 {"westmere", M_INTEL_COREI7_WESTMERE},
33427 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
33428 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
33429 {"haswell", M_INTEL_COREI7_HASWELL},
33430 {"broadwell", M_INTEL_COREI7_BROADWELL},
33431 {"skylake", M_INTEL_COREI7_SKYLAKE},
33432 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
33433 {"bonnell", M_INTEL_BONNELL},
33434 {"silvermont", M_INTEL_SILVERMONT},
33435 {"knl", M_INTEL_KNL},
33436 {"amdfam10h", M_AMDFAM10H},
33437 {"barcelona", M_AMDFAM10H_BARCELONA},
33438 {"shanghai", M_AMDFAM10H_SHANGHAI},
33439 {"istanbul", M_AMDFAM10H_ISTANBUL},
33440 {"btver1", M_AMD_BTVER1},
33441 {"amdfam15h", M_AMDFAM15H},
33442 {"bdver1", M_AMDFAM15H_BDVER1},
33443 {"bdver2", M_AMDFAM15H_BDVER2},
33444 {"bdver3", M_AMDFAM15H_BDVER3},
33445 {"bdver4", M_AMDFAM15H_BDVER4},
33446 {"btver2", M_AMD_BTVER2},
33447 {"znver1", M_AMDFAM17H_ZNVER1},
33450 static struct _isa_names_table
33452 const char *const name;
33453 const enum processor_features feature;
33455 const isa_names_table[] =
33457 {"cmov", F_CMOV},
33458 {"mmx", F_MMX},
33459 {"popcnt", F_POPCNT},
33460 {"sse", F_SSE},
33461 {"sse2", F_SSE2},
33462 {"sse3", F_SSE3},
33463 {"ssse3", F_SSSE3},
33464 {"sse4a", F_SSE4_A},
33465 {"sse4.1", F_SSE4_1},
33466 {"sse4.2", F_SSE4_2},
33467 {"avx", F_AVX},
33468 {"fma4", F_FMA4},
33469 {"xop", F_XOP},
33470 {"fma", F_FMA},
33471 {"avx2", F_AVX2},
33472 {"avx512f", F_AVX512F},
33473 {"bmi", F_BMI},
33474 {"bmi2", F_BMI2},
33475 {"aes", F_AES},
33476 {"pclmul", F_PCLMUL},
33477 {"avx512vl",F_AVX512VL},
33478 {"avx512bw",F_AVX512BW},
33479 {"avx512dq",F_AVX512DQ},
33480 {"avx512cd",F_AVX512CD},
33481 {"avx512er",F_AVX512ER},
33482 {"avx512pf",F_AVX512PF},
33483 {"avx512vbmi",F_AVX512VBMI},
33484 {"avx512ifma",F_AVX512IFMA},
33485 {"avx5124vnniw",F_AVX5124VNNIW},
33486 {"avx5124fmaps",F_AVX5124FMAPS},
33487 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
33490 tree __processor_model_type = build_processor_model_struct ();
33491 tree __cpu_model_var = make_var_decl (__processor_model_type,
33492 "__cpu_model");
33495 varpool_node::add (__cpu_model_var);
33497 gcc_assert ((args != NULL) && (*args != NULL));
33499 param_string_cst = *args;
33500 while (param_string_cst
33501 && TREE_CODE (param_string_cst) != STRING_CST)
33503 /* *args must be an expr that can contain other EXPRs leading to a
33504 STRING_CST. */
33505 if (!EXPR_P (param_string_cst))
33507 error ("Parameter to builtin must be a string constant or literal");
33508 return integer_zero_node;
33510 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
33513 gcc_assert (param_string_cst);
33515 if (fn_code == IX86_BUILTIN_CPU_IS)
33517 tree ref;
33518 tree field;
33519 tree final;
33521 unsigned int field_val = 0;
33522 unsigned int NUM_ARCH_NAMES
33523 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33525 for (i = 0; i < NUM_ARCH_NAMES; i++)
33526 if (strcmp (arch_names_table[i].name,
33527 TREE_STRING_POINTER (param_string_cst)) == 0)
33528 break;
33530 if (i == NUM_ARCH_NAMES)
33532 error ("Parameter to builtin not valid: %s",
33533 TREE_STRING_POINTER (param_string_cst));
33534 return integer_zero_node;
33537 field = TYPE_FIELDS (__processor_model_type);
33538 field_val = arch_names_table[i].model;
33540 /* CPU types are stored in the next field. */
33541 if (field_val > M_CPU_TYPE_START
33542 && field_val < M_CPU_SUBTYPE_START)
33544 field = DECL_CHAIN (field);
33545 field_val -= M_CPU_TYPE_START;
33548 /* CPU subtypes are stored in the next field. */
33549 if (field_val > M_CPU_SUBTYPE_START)
33551 field = DECL_CHAIN (DECL_CHAIN (field));
33552 field_val -= M_CPU_SUBTYPE_START;
33555 /* Get the appropriate field in __cpu_model. */
33556 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33557 field, NULL_TREE);
33559 /* Check the value. */
33560 final = build2 (EQ_EXPR, unsigned_type_node, ref,
33561 build_int_cstu (unsigned_type_node, field_val));
33562 return build1 (CONVERT_EXPR, integer_type_node, final);
33564 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33566 tree ref;
33567 tree array_elt;
33568 tree field;
33569 tree final;
33571 unsigned int field_val = 0;
33572 unsigned int NUM_ISA_NAMES
33573 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33575 for (i = 0; i < NUM_ISA_NAMES; i++)
33576 if (strcmp (isa_names_table[i].name,
33577 TREE_STRING_POINTER (param_string_cst)) == 0)
33578 break;
33580 if (i == NUM_ISA_NAMES)
33582 error ("Parameter to builtin not valid: %s",
33583 TREE_STRING_POINTER (param_string_cst));
33584 return integer_zero_node;
33587 field = TYPE_FIELDS (__processor_model_type);
33588 /* Get the last field, which is __cpu_features. */
33589 while (DECL_CHAIN (field))
33590 field = DECL_CHAIN (field);
33592 /* Get the appropriate field: __cpu_model.__cpu_features */
33593 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33594 field, NULL_TREE);
33596 /* Access the 0th element of __cpu_features array. */
33597 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33598 integer_zero_node, NULL_TREE, NULL_TREE);
33600 field_val = (1 << isa_names_table[i].feature);
33601 /* Return __cpu_model.__cpu_features[0] & field_val */
33602 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33603 build_int_cstu (unsigned_type_node, field_val));
33604 return build1 (CONVERT_EXPR, integer_type_node, final);
33606 gcc_unreachable ();
33609 static tree
33610 ix86_fold_builtin (tree fndecl, int n_args,
33611 tree *args, bool ignore ATTRIBUTE_UNUSED)
33613 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33615 enum ix86_builtins fn_code = (enum ix86_builtins)
33616 DECL_FUNCTION_CODE (fndecl);
33617 switch (fn_code)
33619 case IX86_BUILTIN_CPU_IS:
33620 case IX86_BUILTIN_CPU_SUPPORTS:
33621 gcc_assert (n_args == 1);
33622 return fold_builtin_cpu (fndecl, args);
33624 case IX86_BUILTIN_NANQ:
33625 case IX86_BUILTIN_NANSQ:
33627 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33628 const char *str = c_getstr (*args);
33629 int quiet = fn_code == IX86_BUILTIN_NANQ;
33630 REAL_VALUE_TYPE real;
33632 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33633 return build_real (type, real);
33634 return NULL_TREE;
33637 case IX86_BUILTIN_INFQ:
33638 case IX86_BUILTIN_HUGE_VALQ:
33640 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33641 REAL_VALUE_TYPE inf;
33642 real_inf (&inf);
33643 return build_real (type, inf);
33646 case IX86_BUILTIN_TZCNT16:
33647 case IX86_BUILTIN_CTZS:
33648 case IX86_BUILTIN_TZCNT32:
33649 case IX86_BUILTIN_TZCNT64:
33650 gcc_assert (n_args == 1);
33651 if (TREE_CODE (args[0]) == INTEGER_CST)
33653 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33654 tree arg = args[0];
33655 if (fn_code == IX86_BUILTIN_TZCNT16
33656 || fn_code == IX86_BUILTIN_CTZS)
33657 arg = fold_convert (short_unsigned_type_node, arg);
33658 if (integer_zerop (arg))
33659 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33660 else
33661 return fold_const_call (CFN_CTZ, type, arg);
33663 break;
33665 case IX86_BUILTIN_LZCNT16:
33666 case IX86_BUILTIN_CLZS:
33667 case IX86_BUILTIN_LZCNT32:
33668 case IX86_BUILTIN_LZCNT64:
33669 gcc_assert (n_args == 1);
33670 if (TREE_CODE (args[0]) == INTEGER_CST)
33672 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33673 tree arg = args[0];
33674 if (fn_code == IX86_BUILTIN_LZCNT16
33675 || fn_code == IX86_BUILTIN_CLZS)
33676 arg = fold_convert (short_unsigned_type_node, arg);
33677 if (integer_zerop (arg))
33678 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33679 else
33680 return fold_const_call (CFN_CLZ, type, arg);
33682 break;
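/* Worked example (illustrative) for the BEXTR folding below: a constant
   selector of 0x0805 means start = 5 and len = 8, so a constant x folds
   to (x >> 5) & 0xff.  */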
33684 case IX86_BUILTIN_BEXTR32:
33685 case IX86_BUILTIN_BEXTR64:
33686 case IX86_BUILTIN_BEXTRI32:
33687 case IX86_BUILTIN_BEXTRI64:
33688 gcc_assert (n_args == 2);
33689 if (tree_fits_uhwi_p (args[1]))
33691 unsigned HOST_WIDE_INT res = 0;
33692 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
33693 unsigned int start = tree_to_uhwi (args[1]);
33694 unsigned int len = (start & 0xff00) >> 8;
33695 start &= 0xff;
33696 if (start >= prec || len == 0)
33697 res = 0;
33698 else if (!tree_fits_uhwi_p (args[0]))
33699 break;
33700 else
33701 res = tree_to_uhwi (args[0]) >> start;
33702 if (len > prec)
33703 len = prec;
33704 if (len < HOST_BITS_PER_WIDE_INT)
33705 res &= (HOST_WIDE_INT_1U << len) - 1;
33706 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33708 break;
33710 case IX86_BUILTIN_BZHI32:
33711 case IX86_BUILTIN_BZHI64:
33712 gcc_assert (n_args == 2);
33713 if (tree_fits_uhwi_p (args[1]))
33715 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
33716 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
33717 return args[0];
33718 if (!tree_fits_uhwi_p (args[0]))
33719 break;
33720 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
33721 res &= ~(HOST_WIDE_INT_M1U << idx);
33722 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33724 break;
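/* Worked example (illustrative) for the PDEP folding below: with
   src = 0b101 and mask = 0b11010, the low bits of src are deposited into
   the set bit positions of mask, giving 0b10010.  */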
33726 case IX86_BUILTIN_PDEP32:
33727 case IX86_BUILTIN_PDEP64:
33728 gcc_assert (n_args == 2);
33729 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33731 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33732 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33733 unsigned HOST_WIDE_INT res = 0;
33734 unsigned HOST_WIDE_INT m, k = 1;
33735 for (m = 1; m; m <<= 1)
33736 if ((mask & m) != 0)
33738 if ((src & k) != 0)
33739 res |= m;
33740 k <<= 1;
33742 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33744 break;
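/* Illustrative example: PDEP scatters the low bits of args[0] into the
   bit positions selected by args[1], so src 0b101 with mask 0b11010
   folds to 0b10010.  */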
33746 case IX86_BUILTIN_PEXT32:
33747 case IX86_BUILTIN_PEXT64:
33748 gcc_assert (n_args == 2);
33749 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33751 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33752 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33753 unsigned HOST_WIDE_INT res = 0;
33754 unsigned HOST_WIDE_INT m, k = 1;
33755 for (m = 1; m; m <<= 1)
33756 if ((mask & m) != 0)
33758 if ((src & m) != 0)
33759 res |= k;
33760 k <<= 1;
33762 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33764 break;
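/* Illustrative example: PEXT is the inverse gather, so src 0b10010 with
   mask 0b11010 folds back to 0b101.  */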
33766 default:
33767 break;
33771 #ifdef SUBTARGET_FOLD_BUILTIN
33772 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33773 #endif
33775 return NULL_TREE;
33778 /* Fold an MD builtin (use ix86_fold_builtin for folding into a
33779 constant) in GIMPLE. */
33781 bool
33782 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
33784 gimple *stmt = gsi_stmt (*gsi);
33785 tree fndecl = gimple_call_fndecl (stmt);
33786 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
33787 int n_args = gimple_call_num_args (stmt);
33788 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
33789 tree decl = NULL_TREE;
33790 tree arg0, arg1;
33792 switch (fn_code)
33794 case IX86_BUILTIN_TZCNT32:
33795 decl = builtin_decl_implicit (BUILT_IN_CTZ);
33796 goto fold_tzcnt_lzcnt;
33798 case IX86_BUILTIN_TZCNT64:
33799 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
33800 goto fold_tzcnt_lzcnt;
33802 case IX86_BUILTIN_LZCNT32:
33803 decl = builtin_decl_implicit (BUILT_IN_CLZ);
33804 goto fold_tzcnt_lzcnt;
33806 case IX86_BUILTIN_LZCNT64:
33807 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
33808 goto fold_tzcnt_lzcnt;
33810 fold_tzcnt_lzcnt:
33811 gcc_assert (n_args == 1);
33812 arg0 = gimple_call_arg (stmt, 0);
33813 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
33815 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
33816 /* If arg0 is provably non-zero, optimize into the generic
33817 __builtin_c[tl]z{,ll} functions, which the middle-end handles
33818 better. */
33819 if (!expr_not_equal_to (arg0, wi::zero (prec)))
33820 return false;
33822 location_t loc = gimple_location (stmt);
33823 gimple *g = gimple_build_call (decl, 1, arg0);
33824 gimple_set_location (g, loc);
33825 tree lhs = make_ssa_name (integer_type_node);
33826 gimple_call_set_lhs (g, lhs);
33827 gsi_insert_before (gsi, g, GSI_SAME_STMT);
33828 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
33829 gimple_set_location (g, loc);
33830 gsi_replace (gsi, g, false);
33831 return true;
33833 break;
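/* Illustrative sketch of the replacement built above (builtin and SSA
   names are made up): for u = __builtin_ia32_tzcnt_u32 (x) with x known
   to be non-zero, the call becomes
     _1 = __builtin_ctz (x);
     u_2 = (unsigned int) _1;
   which the middle-end optimizes more readily.  */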
33835 case IX86_BUILTIN_BZHI32:
33836 case IX86_BUILTIN_BZHI64:
33837 gcc_assert (n_args == 2);
33838 arg1 = gimple_call_arg (stmt, 1);
33839 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
33841 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
33842 arg0 = gimple_call_arg (stmt, 0);
33843 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
33844 break;
33845 location_t loc = gimple_location (stmt);
33846 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33847 gimple_set_location (g, loc);
33848 gsi_replace (gsi, g, false);
33849 return true;
33851 break;
33853 case IX86_BUILTIN_PDEP32:
33854 case IX86_BUILTIN_PDEP64:
33855 case IX86_BUILTIN_PEXT32:
33856 case IX86_BUILTIN_PEXT64:
33857 gcc_assert (n_args == 2);
33858 arg1 = gimple_call_arg (stmt, 1);
33859 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
33861 location_t loc = gimple_location (stmt);
33862 arg0 = gimple_call_arg (stmt, 0);
33863 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33864 gimple_set_location (g, loc);
33865 gsi_replace (gsi, g, false);
33866 return true;
33868 break;
33870 default:
33871 break;
33874 return false;
33877 /* Make builtins to detect cpu type and features supported. NAME is
33878 the builtin name, CODE is the builtin code, and FTYPE is the function
33879 type of the builtin. */
33881 static void
33882 make_cpu_type_builtin (const char* name, int code,
33883 enum ix86_builtin_func_type ftype, bool is_const)
33885 tree decl;
33886 tree type;
33888 type = ix86_get_builtin_func_type (ftype);
33889 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33890 NULL, NULL_TREE);
33891 gcc_assert (decl != NULL_TREE);
33892 ix86_builtins[(int) code] = decl;
33893 TREE_READONLY (decl) = is_const;
33896 /* Make builtins to get CPU type and features supported. The created
33897 builtins are:
33899 __builtin_cpu_init (), to detect cpu type and features,
33900 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33901 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>. */
33904 static void
33905 ix86_init_platform_type_builtins (void)
33907 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33908 INT_FTYPE_VOID, false);
33909 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33910 INT_FTYPE_PCCHAR, true);
33911 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33912 INT_FTYPE_PCCHAR, true);
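/* Illustrative use of the builtins created above (the callees are
   hypothetical; feature and vendor strings follow the documented
   __builtin_cpu_supports / __builtin_cpu_is arguments):

     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("sse4.2"))
       use_sse42_path ();
     else if (__builtin_cpu_is ("amd"))
       use_amd_path ();  */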
33915 /* Internal method for ix86_init_builtins. */
33917 static void
33918 ix86_init_builtins_va_builtins_abi (void)
33920 tree ms_va_ref, sysv_va_ref;
33921 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33922 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33923 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33924 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33926 if (!TARGET_64BIT)
33927 return;
33928 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33929 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33930 ms_va_ref = build_reference_type (ms_va_list_type_node);
33931 sysv_va_ref =
33932 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33934 fnvoid_va_end_ms =
33935 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33936 fnvoid_va_start_ms =
33937 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33938 fnvoid_va_end_sysv =
33939 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33940 fnvoid_va_start_sysv =
33941 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33942 NULL_TREE);
33943 fnvoid_va_copy_ms =
33944 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33945 NULL_TREE);
33946 fnvoid_va_copy_sysv =
33947 build_function_type_list (void_type_node, sysv_va_ref,
33948 sysv_va_ref, NULL_TREE);
33950 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33951 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33952 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33953 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33954 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33955 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33956 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33957 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33958 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33959 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33960 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33961 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33964 static void
33965 ix86_init_builtin_types (void)
33967 tree float80_type_node, const_string_type_node;
33969 /* The __float80 type. */
33970 float80_type_node = long_double_type_node;
33971 if (TYPE_MODE (float80_type_node) != XFmode)
33973 if (float64x_type_node != NULL_TREE
33974 && TYPE_MODE (float64x_type_node) == XFmode)
33975 float80_type_node = float64x_type_node;
33976 else
33978 /* The __float80 type. */
33979 float80_type_node = make_node (REAL_TYPE);
33981 TYPE_PRECISION (float80_type_node) = 80;
33982 layout_type (float80_type_node);
33985 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
33987 /* The __float128 type. The node has already been created as
33988 _Float128, so we only need to register the __float128 name for
33989 it. */
33990 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
33992 const_string_type_node
33993 = build_pointer_type (build_qualified_type
33994 (char_type_node, TYPE_QUAL_CONST));
33996 /* This macro is built by i386-builtin-types.awk. */
33997 DEFINE_BUILTIN_PRIMITIVE_TYPES;
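/* With the registrations above, user code can name these types directly,
   e.g. (literal suffixes per the GCC extended floating type docs, assumed
   here):

     __float80  ext  = 1.0w;
     __float128 quad = 1.0q;  */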
34000 static void
34001 ix86_init_builtins (void)
34003 tree ftype, decl;
34005 ix86_init_builtin_types ();
34007 /* Builtins to get CPU type and features. */
34008 ix86_init_platform_type_builtins ();
34010 /* TFmode support builtins. */
34011 def_builtin_const (0, "__builtin_infq",
34012 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
34013 def_builtin_const (0, "__builtin_huge_valq",
34014 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
34016 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
34017 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
34018 BUILT_IN_MD, "nanq", NULL_TREE);
34019 TREE_READONLY (decl) = 1;
34020 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
34022 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
34023 BUILT_IN_MD, "nansq", NULL_TREE);
34024 TREE_READONLY (decl) = 1;
34025 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
34027 /* We will expand them to a normal call if SSE isn't available, since
34028 they are used by libgcc. */
34029 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
34030 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
34031 BUILT_IN_MD, "__fabstf2", NULL_TREE);
34032 TREE_READONLY (decl) = 1;
34033 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
34035 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
34036 decl = add_builtin_function ("__builtin_copysignq", ftype,
34037 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
34038 "__copysigntf3", NULL_TREE);
34039 TREE_READONLY (decl) = 1;
34040 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
34042 ix86_init_tm_builtins ();
34043 ix86_init_mmx_sse_builtins ();
34044 ix86_init_mpx_builtins ();
34046 if (TARGET_LP64)
34047 ix86_init_builtins_va_builtins_abi ();
34049 #ifdef SUBTARGET_INIT_BUILTINS
34050 SUBTARGET_INIT_BUILTINS;
34051 #endif
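/* Illustrative use of the TFmode builtins registered above (x is a
   hypothetical __float128 value):

     __float128 pinf = __builtin_infq ();
     __float128 qnan = __builtin_nanq ("");
     __float128 mag  = __builtin_fabsq (x);

   __builtin_fabsq and __builtin_copysignq expand to __fabstf2 /
   __copysigntf3 library calls when SSE is unavailable, as noted above.  */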
34054 /* Return the ix86 builtin for CODE. */
34056 static tree
34057 ix86_builtin_decl (unsigned code, bool)
34059 if (code >= IX86_BUILTIN_MAX)
34060 return error_mark_node;
34062 return ix86_builtins[code];
34065 /* Errors in the source file can cause expand_expr to return const0_rtx
34066 where we expect a vector. To avoid crashing, use one of the vector
34067 clear instructions. */
34068 static rtx
34069 safe_vector_operand (rtx x, machine_mode mode)
34071 if (x == const0_rtx)
34072 x = CONST0_RTX (mode);
34073 return x;
34076 /* Fixup modeless constants to fit required mode. */
34077 static rtx
34078 fixup_modeless_constant (rtx x, machine_mode mode)
34080 if (GET_MODE (x) == VOIDmode)
34081 x = convert_to_mode (mode, x, 1);
34082 return x;
34085 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
34087 static rtx
34088 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
34090 rtx pat;
34091 tree arg0 = CALL_EXPR_ARG (exp, 0);
34092 tree arg1 = CALL_EXPR_ARG (exp, 1);
34093 rtx op0 = expand_normal (arg0);
34094 rtx op1 = expand_normal (arg1);
34095 machine_mode tmode = insn_data[icode].operand[0].mode;
34096 machine_mode mode0 = insn_data[icode].operand[1].mode;
34097 machine_mode mode1 = insn_data[icode].operand[2].mode;
34099 if (VECTOR_MODE_P (mode0))
34100 op0 = safe_vector_operand (op0, mode0);
34101 if (VECTOR_MODE_P (mode1))
34102 op1 = safe_vector_operand (op1, mode1);
34104 if (optimize || !target
34105 || GET_MODE (target) != tmode
34106 || !insn_data[icode].operand[0].predicate (target, tmode))
34107 target = gen_reg_rtx (tmode);
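/* Descriptive note: when the pattern wants a TImode operand but the
   caller supplied an SImode value, the code below loads it into a V4SI
   register with sse2_loadd (which zeroes the upper elements) and then
   uses the TImode lowpart of that register.  */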
34109 if (GET_MODE (op1) == SImode && mode1 == TImode)
34111 rtx x = gen_reg_rtx (V4SImode);
34112 emit_insn (gen_sse2_loadd (x, op1));
34113 op1 = gen_lowpart (TImode, x);
34116 if (!insn_data[icode].operand[1].predicate (op0, mode0))
34117 op0 = copy_to_mode_reg (mode0, op0);
34118 if (!insn_data[icode].operand[2].predicate (op1, mode1))
34119 op1 = copy_to_mode_reg (mode1, op1);
34121 pat = GEN_FCN (icode) (target, op0, op1);
34122 if (! pat)
34123 return 0;
34125 emit_insn (pat);
34127 return target;
34130 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
34132 static rtx
34133 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
34134 enum ix86_builtin_func_type m_type,
34135 enum rtx_code sub_code)
34137 rtx pat;
34138 int i;
34139 int nargs;
34140 bool comparison_p = false;
34141 bool tf_p = false;
34142 bool last_arg_constant = false;
34143 int num_memory = 0;
34144 struct {
34145 rtx op;
34146 machine_mode mode;
34147 } args[4];
34149 machine_mode tmode = insn_data[icode].operand[0].mode;
34151 switch (m_type)
34153 case MULTI_ARG_4_DF2_DI_I:
34154 case MULTI_ARG_4_DF2_DI_I1:
34155 case MULTI_ARG_4_SF2_SI_I:
34156 case MULTI_ARG_4_SF2_SI_I1:
34157 nargs = 4;
34158 last_arg_constant = true;
34159 break;
34161 case MULTI_ARG_3_SF:
34162 case MULTI_ARG_3_DF:
34163 case MULTI_ARG_3_SF2:
34164 case MULTI_ARG_3_DF2:
34165 case MULTI_ARG_3_DI:
34166 case MULTI_ARG_3_SI:
34167 case MULTI_ARG_3_SI_DI:
34168 case MULTI_ARG_3_HI:
34169 case MULTI_ARG_3_HI_SI:
34170 case MULTI_ARG_3_QI:
34171 case MULTI_ARG_3_DI2:
34172 case MULTI_ARG_3_SI2:
34173 case MULTI_ARG_3_HI2:
34174 case MULTI_ARG_3_QI2:
34175 nargs = 3;
34176 break;
34178 case MULTI_ARG_2_SF:
34179 case MULTI_ARG_2_DF:
34180 case MULTI_ARG_2_DI:
34181 case MULTI_ARG_2_SI:
34182 case MULTI_ARG_2_HI:
34183 case MULTI_ARG_2_QI:
34184 nargs = 2;
34185 break;
34187 case MULTI_ARG_2_DI_IMM:
34188 case MULTI_ARG_2_SI_IMM:
34189 case MULTI_ARG_2_HI_IMM:
34190 case MULTI_ARG_2_QI_IMM:
34191 nargs = 2;
34192 last_arg_constant = true;
34193 break;
34195 case MULTI_ARG_1_SF:
34196 case MULTI_ARG_1_DF:
34197 case MULTI_ARG_1_SF2:
34198 case MULTI_ARG_1_DF2:
34199 case MULTI_ARG_1_DI:
34200 case MULTI_ARG_1_SI:
34201 case MULTI_ARG_1_HI:
34202 case MULTI_ARG_1_QI:
34203 case MULTI_ARG_1_SI_DI:
34204 case MULTI_ARG_1_HI_DI:
34205 case MULTI_ARG_1_HI_SI:
34206 case MULTI_ARG_1_QI_DI:
34207 case MULTI_ARG_1_QI_SI:
34208 case MULTI_ARG_1_QI_HI:
34209 nargs = 1;
34210 break;
34212 case MULTI_ARG_2_DI_CMP:
34213 case MULTI_ARG_2_SI_CMP:
34214 case MULTI_ARG_2_HI_CMP:
34215 case MULTI_ARG_2_QI_CMP:
34216 nargs = 2;
34217 comparison_p = true;
34218 break;
34220 case MULTI_ARG_2_SF_TF:
34221 case MULTI_ARG_2_DF_TF:
34222 case MULTI_ARG_2_DI_TF:
34223 case MULTI_ARG_2_SI_TF:
34224 case MULTI_ARG_2_HI_TF:
34225 case MULTI_ARG_2_QI_TF:
34226 nargs = 2;
34227 tf_p = true;
34228 break;
34230 default:
34231 gcc_unreachable ();
34234 if (optimize || !target
34235 || GET_MODE (target) != tmode
34236 || !insn_data[icode].operand[0].predicate (target, tmode))
34237 target = gen_reg_rtx (tmode);
34239 gcc_assert (nargs <= 4);
34241 for (i = 0; i < nargs; i++)
34243 tree arg = CALL_EXPR_ARG (exp, i);
34244 rtx op = expand_normal (arg);
34245 int adjust = (comparison_p) ? 1 : 0;
34246 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
34248 if (last_arg_constant && i == nargs - 1)
34250 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
34252 enum insn_code new_icode = icode;
34253 switch (icode)
34255 case CODE_FOR_xop_vpermil2v2df3:
34256 case CODE_FOR_xop_vpermil2v4sf3:
34257 case CODE_FOR_xop_vpermil2v4df3:
34258 case CODE_FOR_xop_vpermil2v8sf3:
34259 error ("the last argument must be a 2-bit immediate");
34260 return gen_reg_rtx (tmode);
34261 case CODE_FOR_xop_rotlv2di3:
34262 new_icode = CODE_FOR_rotlv2di3;
34263 goto xop_rotl;
34264 case CODE_FOR_xop_rotlv4si3:
34265 new_icode = CODE_FOR_rotlv4si3;
34266 goto xop_rotl;
34267 case CODE_FOR_xop_rotlv8hi3:
34268 new_icode = CODE_FOR_rotlv8hi3;
34269 goto xop_rotl;
34270 case CODE_FOR_xop_rotlv16qi3:
34271 new_icode = CODE_FOR_rotlv16qi3;
34272 xop_rotl:
34273 if (CONST_INT_P (op))
34275 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
34276 op = GEN_INT (INTVAL (op) & mask);
34277 gcc_checking_assert
34278 (insn_data[icode].operand[i + 1].predicate (op, mode));
34280 else
34282 gcc_checking_assert
34283 (nargs == 2
34284 && insn_data[new_icode].operand[0].mode == tmode
34285 && insn_data[new_icode].operand[1].mode == tmode
34286 && insn_data[new_icode].operand[2].mode == mode
34287 && insn_data[new_icode].operand[0].predicate
34288 == insn_data[icode].operand[0].predicate
34289 && insn_data[new_icode].operand[1].predicate
34290 == insn_data[icode].operand[1].predicate);
34291 icode = new_icode;
34292 goto non_constant;
34294 break;
34295 default:
34296 gcc_unreachable ();
34300 else
34302 non_constant:
34303 if (VECTOR_MODE_P (mode))
34304 op = safe_vector_operand (op, mode);
34306 /* If we aren't optimizing, only allow one memory operand to be
34307 generated. */
34308 if (memory_operand (op, mode))
34309 num_memory++;
34311 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
34313 if (optimize
34314 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
34315 || num_memory > 1)
34316 op = force_reg (mode, op);
34319 args[i].op = op;
34320 args[i].mode = mode;
34323 switch (nargs)
34325 case 1:
34326 pat = GEN_FCN (icode) (target, args[0].op);
34327 break;
34329 case 2:
34330 if (tf_p)
34331 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34332 GEN_INT ((int)sub_code));
34333 else if (! comparison_p)
34334 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34335 else
34337 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
34338 args[0].op,
34339 args[1].op);
34341 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
34343 break;
34345 case 3:
34346 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
34347 break;
34349 case 4:
34350 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
34351 break;
34353 default:
34354 gcc_unreachable ();
34357 if (! pat)
34358 return 0;
34360 emit_insn (pat);
34361 return target;
34364 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
34365 insns with vec_merge. */
34367 static rtx
34368 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
34369 rtx target)
34371 rtx pat;
34372 tree arg0 = CALL_EXPR_ARG (exp, 0);
34373 rtx op1, op0 = expand_normal (arg0);
34374 machine_mode tmode = insn_data[icode].operand[0].mode;
34375 machine_mode mode0 = insn_data[icode].operand[1].mode;
34377 if (optimize || !target
34378 || GET_MODE (target) != tmode
34379 || !insn_data[icode].operand[0].predicate (target, tmode))
34380 target = gen_reg_rtx (tmode);
34382 if (VECTOR_MODE_P (mode0))
34383 op0 = safe_vector_operand (op0, mode0);
34385 if ((optimize && !register_operand (op0, mode0))
34386 || !insn_data[icode].operand[1].predicate (op0, mode0))
34387 op0 = copy_to_mode_reg (mode0, op0);
34389 op1 = op0;
34390 if (!insn_data[icode].operand[2].predicate (op1, mode0))
34391 op1 = copy_to_mode_reg (mode0, op1);
34393 pat = GEN_FCN (icode) (target, op0, op1);
34394 if (! pat)
34395 return 0;
34396 emit_insn (pat);
34397 return target;
34400 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
34402 static rtx
34403 ix86_expand_sse_compare (const struct builtin_description *d,
34404 tree exp, rtx target, bool swap)
34406 rtx pat;
34407 tree arg0 = CALL_EXPR_ARG (exp, 0);
34408 tree arg1 = CALL_EXPR_ARG (exp, 1);
34409 rtx op0 = expand_normal (arg0);
34410 rtx op1 = expand_normal (arg1);
34411 rtx op2;
34412 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34413 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34414 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34415 enum rtx_code comparison = d->comparison;
34417 if (VECTOR_MODE_P (mode0))
34418 op0 = safe_vector_operand (op0, mode0);
34419 if (VECTOR_MODE_P (mode1))
34420 op1 = safe_vector_operand (op1, mode1);
34422 /* Swap operands if we have a comparison that isn't available in
34423 hardware. */
34424 if (swap)
34425 std::swap (op0, op1);
34427 if (optimize || !target
34428 || GET_MODE (target) != tmode
34429 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34430 target = gen_reg_rtx (tmode);
34432 if ((optimize && !register_operand (op0, mode0))
34433 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
34434 op0 = copy_to_mode_reg (mode0, op0);
34435 if ((optimize && !register_operand (op1, mode1))
34436 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
34437 op1 = copy_to_mode_reg (mode1, op1);
34439 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
34440 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34441 if (! pat)
34442 return 0;
34443 emit_insn (pat);
34444 return target;
34447 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
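/* Expansion note: the comi patterns only set FLAGS, so the code below
   emits the comparison insn, stores the d->comparison test of its
   destination against zero into the low byte of a zeroed SImode pseudo
   via STRICT_LOW_PART, and returns that SImode register as the int
   result.  */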
34449 static rtx
34450 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
34451 rtx target)
34453 rtx pat;
34454 tree arg0 = CALL_EXPR_ARG (exp, 0);
34455 tree arg1 = CALL_EXPR_ARG (exp, 1);
34456 rtx op0 = expand_normal (arg0);
34457 rtx op1 = expand_normal (arg1);
34458 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34459 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34460 enum rtx_code comparison = d->comparison;
34462 if (VECTOR_MODE_P (mode0))
34463 op0 = safe_vector_operand (op0, mode0);
34464 if (VECTOR_MODE_P (mode1))
34465 op1 = safe_vector_operand (op1, mode1);
34467 /* Swap operands if we have a comparison that isn't available in
34468 hardware. */
34469 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
34470 std::swap (op0, op1);
34472 target = gen_reg_rtx (SImode);
34473 emit_move_insn (target, const0_rtx);
34474 target = gen_rtx_SUBREG (QImode, target, 0);
34476 if ((optimize && !register_operand (op0, mode0))
34477 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34478 op0 = copy_to_mode_reg (mode0, op0);
34479 if ((optimize && !register_operand (op1, mode1))
34480 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34481 op1 = copy_to_mode_reg (mode1, op1);
34483 pat = GEN_FCN (d->icode) (op0, op1);
34484 if (! pat)
34485 return 0;
34486 emit_insn (pat);
34487 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34488 gen_rtx_fmt_ee (comparison, QImode,
34489 SET_DEST (pat),
34490 const0_rtx)));
34492 return SUBREG_REG (target);
34495 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
34497 static rtx
34498 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
34499 rtx target)
34501 rtx pat;
34502 tree arg0 = CALL_EXPR_ARG (exp, 0);
34503 rtx op1, op0 = expand_normal (arg0);
34504 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34505 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34507 if (optimize || target == 0
34508 || GET_MODE (target) != tmode
34509 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34510 target = gen_reg_rtx (tmode);
34512 if (VECTOR_MODE_P (mode0))
34513 op0 = safe_vector_operand (op0, mode0);
34515 if ((optimize && !register_operand (op0, mode0))
34516 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34517 op0 = copy_to_mode_reg (mode0, op0);
34519 op1 = GEN_INT (d->comparison);
34521 pat = GEN_FCN (d->icode) (target, op0, op1);
34522 if (! pat)
34523 return 0;
34524 emit_insn (pat);
34525 return target;
34528 static rtx
34529 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
34530 tree exp, rtx target)
34532 rtx pat;
34533 tree arg0 = CALL_EXPR_ARG (exp, 0);
34534 tree arg1 = CALL_EXPR_ARG (exp, 1);
34535 rtx op0 = expand_normal (arg0);
34536 rtx op1 = expand_normal (arg1);
34537 rtx op2;
34538 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34539 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34540 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34542 if (optimize || target == 0
34543 || GET_MODE (target) != tmode
34544 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34545 target = gen_reg_rtx (tmode);
34547 op0 = safe_vector_operand (op0, mode0);
34548 op1 = safe_vector_operand (op1, mode1);
34550 if ((optimize && !register_operand (op0, mode0))
34551 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34552 op0 = copy_to_mode_reg (mode0, op0);
34553 if ((optimize && !register_operand (op1, mode1))
34554 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34555 op1 = copy_to_mode_reg (mode1, op1);
34557 op2 = GEN_INT (d->comparison);
34559 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34560 if (! pat)
34561 return 0;
34562 emit_insn (pat);
34563 return target;
34566 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
34568 static rtx
34569 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34570 rtx target)
34572 rtx pat;
34573 tree arg0 = CALL_EXPR_ARG (exp, 0);
34574 tree arg1 = CALL_EXPR_ARG (exp, 1);
34575 rtx op0 = expand_normal (arg0);
34576 rtx op1 = expand_normal (arg1);
34577 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34578 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34579 enum rtx_code comparison = d->comparison;
34581 if (VECTOR_MODE_P (mode0))
34582 op0 = safe_vector_operand (op0, mode0);
34583 if (VECTOR_MODE_P (mode1))
34584 op1 = safe_vector_operand (op1, mode1);
34586 target = gen_reg_rtx (SImode);
34587 emit_move_insn (target, const0_rtx);
34588 target = gen_rtx_SUBREG (QImode, target, 0);
34590 if ((optimize && !register_operand (op0, mode0))
34591 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34592 op0 = copy_to_mode_reg (mode0, op0);
34593 if ((optimize && !register_operand (op1, mode1))
34594 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34595 op1 = copy_to_mode_reg (mode1, op1);
34597 pat = GEN_FCN (d->icode) (op0, op1);
34598 if (! pat)
34599 return 0;
34600 emit_insn (pat);
34601 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34602 gen_rtx_fmt_ee (comparison, QImode,
34603 SET_DEST (pat),
34604 const0_rtx)));
34606 return SUBREG_REG (target);
34609 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
34611 static rtx
34612 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34613 tree exp, rtx target)
34615 rtx pat;
34616 tree arg0 = CALL_EXPR_ARG (exp, 0);
34617 tree arg1 = CALL_EXPR_ARG (exp, 1);
34618 tree arg2 = CALL_EXPR_ARG (exp, 2);
34619 tree arg3 = CALL_EXPR_ARG (exp, 3);
34620 tree arg4 = CALL_EXPR_ARG (exp, 4);
34621 rtx scratch0, scratch1;
34622 rtx op0 = expand_normal (arg0);
34623 rtx op1 = expand_normal (arg1);
34624 rtx op2 = expand_normal (arg2);
34625 rtx op3 = expand_normal (arg3);
34626 rtx op4 = expand_normal (arg4);
34627 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34629 tmode0 = insn_data[d->icode].operand[0].mode;
34630 tmode1 = insn_data[d->icode].operand[1].mode;
34631 modev2 = insn_data[d->icode].operand[2].mode;
34632 modei3 = insn_data[d->icode].operand[3].mode;
34633 modev4 = insn_data[d->icode].operand[4].mode;
34634 modei5 = insn_data[d->icode].operand[5].mode;
34635 modeimm = insn_data[d->icode].operand[6].mode;
34637 if (VECTOR_MODE_P (modev2))
34638 op0 = safe_vector_operand (op0, modev2);
34639 if (VECTOR_MODE_P (modev4))
34640 op2 = safe_vector_operand (op2, modev4);
34642 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34643 op0 = copy_to_mode_reg (modev2, op0);
34644 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34645 op1 = copy_to_mode_reg (modei3, op1);
34646 if ((optimize && !register_operand (op2, modev4))
34647 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34648 op2 = copy_to_mode_reg (modev4, op2);
34649 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34650 op3 = copy_to_mode_reg (modei5, op3);
34652 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34654 error ("the fifth argument must be an 8-bit immediate");
34655 return const0_rtx;
34658 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34660 if (optimize || !target
34661 || GET_MODE (target) != tmode0
34662 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34663 target = gen_reg_rtx (tmode0);
34665 scratch1 = gen_reg_rtx (tmode1);
34667 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34669 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34671 if (optimize || !target
34672 || GET_MODE (target) != tmode1
34673 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34674 target = gen_reg_rtx (tmode1);
34676 scratch0 = gen_reg_rtx (tmode0);
34678 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34680 else
34682 gcc_assert (d->flag);
34684 scratch0 = gen_reg_rtx (tmode0);
34685 scratch1 = gen_reg_rtx (tmode1);
34687 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34690 if (! pat)
34691 return 0;
34693 emit_insn (pat);
34695 if (d->flag)
34697 target = gen_reg_rtx (SImode);
34698 emit_move_insn (target, const0_rtx);
34699 target = gen_rtx_SUBREG (QImode, target, 0);
34701 emit_insn
34702 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34703 gen_rtx_fmt_ee (EQ, QImode,
34704 gen_rtx_REG ((machine_mode) d->flag,
34705 FLAGS_REG),
34706 const0_rtx)));
34707 return SUBREG_REG (target);
34709 else
34710 return target;
34714 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
34716 static rtx
34717 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34718 tree exp, rtx target)
34720 rtx pat;
34721 tree arg0 = CALL_EXPR_ARG (exp, 0);
34722 tree arg1 = CALL_EXPR_ARG (exp, 1);
34723 tree arg2 = CALL_EXPR_ARG (exp, 2);
34724 rtx scratch0, scratch1;
34725 rtx op0 = expand_normal (arg0);
34726 rtx op1 = expand_normal (arg1);
34727 rtx op2 = expand_normal (arg2);
34728 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34730 tmode0 = insn_data[d->icode].operand[0].mode;
34731 tmode1 = insn_data[d->icode].operand[1].mode;
34732 modev2 = insn_data[d->icode].operand[2].mode;
34733 modev3 = insn_data[d->icode].operand[3].mode;
34734 modeimm = insn_data[d->icode].operand[4].mode;
34736 if (VECTOR_MODE_P (modev2))
34737 op0 = safe_vector_operand (op0, modev2);
34738 if (VECTOR_MODE_P (modev3))
34739 op1 = safe_vector_operand (op1, modev3);
34741 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34742 op0 = copy_to_mode_reg (modev2, op0);
34743 if ((optimize && !register_operand (op1, modev3))
34744 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34745 op1 = copy_to_mode_reg (modev3, op1);
34747 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34749 error ("the third argument must be an 8-bit immediate");
34750 return const0_rtx;
34753 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34755 if (optimize || !target
34756 || GET_MODE (target) != tmode0
34757 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34758 target = gen_reg_rtx (tmode0);
34760 scratch1 = gen_reg_rtx (tmode1);
34762 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34764 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34766 if (optimize || !target
34767 || GET_MODE (target) != tmode1
34768 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34769 target = gen_reg_rtx (tmode1);
34771 scratch0 = gen_reg_rtx (tmode0);
34773 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34775 else
34777 gcc_assert (d->flag);
34779 scratch0 = gen_reg_rtx (tmode0);
34780 scratch1 = gen_reg_rtx (tmode1);
34782 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34785 if (! pat)
34786 return 0;
34788 emit_insn (pat);
34790 if (d->flag)
34792 target = gen_reg_rtx (SImode);
34793 emit_move_insn (target, const0_rtx);
34794 target = gen_rtx_SUBREG (QImode, target, 0);
34796 emit_insn
34797 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34798 gen_rtx_fmt_ee (EQ, QImode,
34799 gen_rtx_REG ((machine_mode) d->flag,
34800 FLAGS_REG),
34801 const0_rtx)));
34802 return SUBREG_REG (target);
34804 else
34805 return target;
34808 /* Subroutine of ix86_expand_builtin to take care of insns with
34809 variable number of operands. */
34811 static rtx
34812 ix86_expand_args_builtin (const struct builtin_description *d,
34813 tree exp, rtx target)
34815 rtx pat, real_target;
34816 unsigned int i, nargs;
34817 unsigned int nargs_constant = 0;
34818 unsigned int mask_pos = 0;
34819 int num_memory = 0;
34820 struct
34822 rtx op;
34823 machine_mode mode;
34824 } args[6];
34825 bool last_arg_count = false;
34826 enum insn_code icode = d->icode;
34827 const struct insn_data_d *insn_p = &insn_data[icode];
34828 machine_mode tmode = insn_p->operand[0].mode;
34829 machine_mode rmode = VOIDmode;
34830 bool swap = false;
34831 enum rtx_code comparison = d->comparison;
34833 switch ((enum ix86_builtin_func_type) d->flag)
34835 case V2DF_FTYPE_V2DF_ROUND:
34836 case V4DF_FTYPE_V4DF_ROUND:
34837 case V8DF_FTYPE_V8DF_ROUND:
34838 case V4SF_FTYPE_V4SF_ROUND:
34839 case V8SF_FTYPE_V8SF_ROUND:
34840 case V16SF_FTYPE_V16SF_ROUND:
34841 case V4SI_FTYPE_V4SF_ROUND:
34842 case V8SI_FTYPE_V8SF_ROUND:
34843 case V16SI_FTYPE_V16SF_ROUND:
34844 return ix86_expand_sse_round (d, exp, target);
34845 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34846 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34847 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34848 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34849 case INT_FTYPE_V8SF_V8SF_PTEST:
34850 case INT_FTYPE_V4DI_V4DI_PTEST:
34851 case INT_FTYPE_V4DF_V4DF_PTEST:
34852 case INT_FTYPE_V4SF_V4SF_PTEST:
34853 case INT_FTYPE_V2DI_V2DI_PTEST:
34854 case INT_FTYPE_V2DF_V2DF_PTEST:
34855 return ix86_expand_sse_ptest (d, exp, target);
34856 case FLOAT128_FTYPE_FLOAT128:
34857 case FLOAT_FTYPE_FLOAT:
34858 case INT_FTYPE_INT:
34859 case UINT_FTYPE_UINT:
34860 case UINT16_FTYPE_UINT16:
34861 case UINT64_FTYPE_INT:
34862 case UINT64_FTYPE_UINT64:
34863 case INT64_FTYPE_INT64:
34864 case INT64_FTYPE_V4SF:
34865 case INT64_FTYPE_V2DF:
34866 case INT_FTYPE_V16QI:
34867 case INT_FTYPE_V8QI:
34868 case INT_FTYPE_V8SF:
34869 case INT_FTYPE_V4DF:
34870 case INT_FTYPE_V4SF:
34871 case INT_FTYPE_V2DF:
34872 case INT_FTYPE_V32QI:
34873 case V16QI_FTYPE_V16QI:
34874 case V8SI_FTYPE_V8SF:
34875 case V8SI_FTYPE_V4SI:
34876 case V8HI_FTYPE_V8HI:
34877 case V8HI_FTYPE_V16QI:
34878 case V8QI_FTYPE_V8QI:
34879 case V8SF_FTYPE_V8SF:
34880 case V8SF_FTYPE_V8SI:
34881 case V8SF_FTYPE_V4SF:
34882 case V8SF_FTYPE_V8HI:
34883 case V4SI_FTYPE_V4SI:
34884 case V4SI_FTYPE_V16QI:
34885 case V4SI_FTYPE_V4SF:
34886 case V4SI_FTYPE_V8SI:
34887 case V4SI_FTYPE_V8HI:
34888 case V4SI_FTYPE_V4DF:
34889 case V4SI_FTYPE_V2DF:
34890 case V4HI_FTYPE_V4HI:
34891 case V4DF_FTYPE_V4DF:
34892 case V4DF_FTYPE_V4SI:
34893 case V4DF_FTYPE_V4SF:
34894 case V4DF_FTYPE_V2DF:
34895 case V4SF_FTYPE_V4SF:
34896 case V4SF_FTYPE_V4SI:
34897 case V4SF_FTYPE_V8SF:
34898 case V4SF_FTYPE_V4DF:
34899 case V4SF_FTYPE_V8HI:
34900 case V4SF_FTYPE_V2DF:
34901 case V2DI_FTYPE_V2DI:
34902 case V2DI_FTYPE_V16QI:
34903 case V2DI_FTYPE_V8HI:
34904 case V2DI_FTYPE_V4SI:
34905 case V2DF_FTYPE_V2DF:
34906 case V2DF_FTYPE_V4SI:
34907 case V2DF_FTYPE_V4DF:
34908 case V2DF_FTYPE_V4SF:
34909 case V2DF_FTYPE_V2SI:
34910 case V2SI_FTYPE_V2SI:
34911 case V2SI_FTYPE_V4SF:
34912 case V2SI_FTYPE_V2SF:
34913 case V2SI_FTYPE_V2DF:
34914 case V2SF_FTYPE_V2SF:
34915 case V2SF_FTYPE_V2SI:
34916 case V32QI_FTYPE_V32QI:
34917 case V32QI_FTYPE_V16QI:
34918 case V16HI_FTYPE_V16HI:
34919 case V16HI_FTYPE_V8HI:
34920 case V8SI_FTYPE_V8SI:
34921 case V16HI_FTYPE_V16QI:
34922 case V8SI_FTYPE_V16QI:
34923 case V4DI_FTYPE_V16QI:
34924 case V8SI_FTYPE_V8HI:
34925 case V4DI_FTYPE_V8HI:
34926 case V4DI_FTYPE_V4SI:
34927 case V4DI_FTYPE_V2DI:
34928 case UQI_FTYPE_UQI:
34929 case UHI_FTYPE_UHI:
34930 case USI_FTYPE_USI:
34931 case USI_FTYPE_UQI:
34932 case USI_FTYPE_UHI:
34933 case UDI_FTYPE_UDI:
34934 case UHI_FTYPE_V16QI:
34935 case USI_FTYPE_V32QI:
34936 case UDI_FTYPE_V64QI:
34937 case V16QI_FTYPE_UHI:
34938 case V32QI_FTYPE_USI:
34939 case V64QI_FTYPE_UDI:
34940 case V8HI_FTYPE_UQI:
34941 case V16HI_FTYPE_UHI:
34942 case V32HI_FTYPE_USI:
34943 case V4SI_FTYPE_UQI:
34944 case V8SI_FTYPE_UQI:
34945 case V4SI_FTYPE_UHI:
34946 case V8SI_FTYPE_UHI:
34947 case UQI_FTYPE_V8HI:
34948 case UHI_FTYPE_V16HI:
34949 case USI_FTYPE_V32HI:
34950 case UQI_FTYPE_V4SI:
34951 case UQI_FTYPE_V8SI:
34952 case UHI_FTYPE_V16SI:
34953 case UQI_FTYPE_V2DI:
34954 case UQI_FTYPE_V4DI:
34955 case UQI_FTYPE_V8DI:
34956 case V16SI_FTYPE_UHI:
34957 case V2DI_FTYPE_UQI:
34958 case V4DI_FTYPE_UQI:
34959 case V16SI_FTYPE_INT:
34960 case V16SF_FTYPE_V8SF:
34961 case V16SI_FTYPE_V8SI:
34962 case V16SF_FTYPE_V4SF:
34963 case V16SI_FTYPE_V4SI:
34964 case V16SI_FTYPE_V16SF:
34965 case V16SI_FTYPE_V16SI:
34966 case V16SF_FTYPE_V16SF:
34967 case V8DI_FTYPE_UQI:
34968 case V8DI_FTYPE_V8DI:
34969 case V8DF_FTYPE_V4DF:
34970 case V8DF_FTYPE_V2DF:
34971 case V8DF_FTYPE_V8DF:
34972 nargs = 1;
34973 break;
34974 case V4SF_FTYPE_V4SF_VEC_MERGE:
34975 case V2DF_FTYPE_V2DF_VEC_MERGE:
34976 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
34977 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
34978 case V16QI_FTYPE_V16QI_V16QI:
34979 case V16QI_FTYPE_V8HI_V8HI:
34980 case V16SF_FTYPE_V16SF_V16SF:
34981 case V8QI_FTYPE_V8QI_V8QI:
34982 case V8QI_FTYPE_V4HI_V4HI:
34983 case V8HI_FTYPE_V8HI_V8HI:
34984 case V8HI_FTYPE_V16QI_V16QI:
34985 case V8HI_FTYPE_V4SI_V4SI:
34986 case V8SF_FTYPE_V8SF_V8SF:
34987 case V8SF_FTYPE_V8SF_V8SI:
34988 case V8DF_FTYPE_V8DF_V8DF:
34989 case V4SI_FTYPE_V4SI_V4SI:
34990 case V4SI_FTYPE_V8HI_V8HI:
34991 case V4SI_FTYPE_V2DF_V2DF:
34992 case V4HI_FTYPE_V4HI_V4HI:
34993 case V4HI_FTYPE_V8QI_V8QI:
34994 case V4HI_FTYPE_V2SI_V2SI:
34995 case V4DF_FTYPE_V4DF_V4DF:
34996 case V4DF_FTYPE_V4DF_V4DI:
34997 case V4SF_FTYPE_V4SF_V4SF:
34998 case V4SF_FTYPE_V4SF_V4SI:
34999 case V4SF_FTYPE_V4SF_V2SI:
35000 case V4SF_FTYPE_V4SF_V2DF:
35001 case V4SF_FTYPE_V4SF_UINT:
35002 case V4SF_FTYPE_V4SF_DI:
35003 case V4SF_FTYPE_V4SF_SI:
35004 case V2DI_FTYPE_V2DI_V2DI:
35005 case V2DI_FTYPE_V16QI_V16QI:
35006 case V2DI_FTYPE_V4SI_V4SI:
35007 case V2DI_FTYPE_V2DI_V16QI:
35008 case V2SI_FTYPE_V2SI_V2SI:
35009 case V2SI_FTYPE_V4HI_V4HI:
35010 case V2SI_FTYPE_V2SF_V2SF:
35011 case V2DF_FTYPE_V2DF_V2DF:
35012 case V2DF_FTYPE_V2DF_V4SF:
35013 case V2DF_FTYPE_V2DF_V2DI:
35014 case V2DF_FTYPE_V2DF_DI:
35015 case V2DF_FTYPE_V2DF_SI:
35016 case V2DF_FTYPE_V2DF_UINT:
35017 case V2SF_FTYPE_V2SF_V2SF:
35018 case V1DI_FTYPE_V1DI_V1DI:
35019 case V1DI_FTYPE_V8QI_V8QI:
35020 case V1DI_FTYPE_V2SI_V2SI:
35021 case V32QI_FTYPE_V16HI_V16HI:
35022 case V16HI_FTYPE_V8SI_V8SI:
35023 case V32QI_FTYPE_V32QI_V32QI:
35024 case V16HI_FTYPE_V32QI_V32QI:
35025 case V16HI_FTYPE_V16HI_V16HI:
35026 case V8SI_FTYPE_V4DF_V4DF:
35027 case V8SI_FTYPE_V8SI_V8SI:
35028 case V8SI_FTYPE_V16HI_V16HI:
35029 case V4DI_FTYPE_V4DI_V4DI:
35030 case V4DI_FTYPE_V8SI_V8SI:
35031 case V8DI_FTYPE_V64QI_V64QI:
35032 if (comparison == UNKNOWN)
35033 return ix86_expand_binop_builtin (icode, exp, target);
35034 nargs = 2;
35035 break;
35036 case V4SF_FTYPE_V4SF_V4SF_SWAP:
35037 case V2DF_FTYPE_V2DF_V2DF_SWAP:
35038 gcc_assert (comparison != UNKNOWN);
35039 nargs = 2;
35040 swap = true;
35041 break;
35042 case V16HI_FTYPE_V16HI_V8HI_COUNT:
35043 case V16HI_FTYPE_V16HI_SI_COUNT:
35044 case V8SI_FTYPE_V8SI_V4SI_COUNT:
35045 case V8SI_FTYPE_V8SI_SI_COUNT:
35046 case V4DI_FTYPE_V4DI_V2DI_COUNT:
35047 case V4DI_FTYPE_V4DI_INT_COUNT:
35048 case V8HI_FTYPE_V8HI_V8HI_COUNT:
35049 case V8HI_FTYPE_V8HI_SI_COUNT:
35050 case V4SI_FTYPE_V4SI_V4SI_COUNT:
35051 case V4SI_FTYPE_V4SI_SI_COUNT:
35052 case V4HI_FTYPE_V4HI_V4HI_COUNT:
35053 case V4HI_FTYPE_V4HI_SI_COUNT:
35054 case V2DI_FTYPE_V2DI_V2DI_COUNT:
35055 case V2DI_FTYPE_V2DI_SI_COUNT:
35056 case V2SI_FTYPE_V2SI_V2SI_COUNT:
35057 case V2SI_FTYPE_V2SI_SI_COUNT:
35058 case V1DI_FTYPE_V1DI_V1DI_COUNT:
35059 case V1DI_FTYPE_V1DI_SI_COUNT:
35060 nargs = 2;
35061 last_arg_count = true;
35062 break;
35063 case UINT64_FTYPE_UINT64_UINT64:
35064 case UINT_FTYPE_UINT_UINT:
35065 case UINT_FTYPE_UINT_USHORT:
35066 case UINT_FTYPE_UINT_UCHAR:
35067 case UINT16_FTYPE_UINT16_INT:
35068 case UINT8_FTYPE_UINT8_INT:
35069 case UQI_FTYPE_UQI_UQI:
35070 case UHI_FTYPE_UHI_UHI:
35071 case USI_FTYPE_USI_USI:
35072 case UDI_FTYPE_UDI_UDI:
35073 case V16SI_FTYPE_V8DF_V8DF:
35074 nargs = 2;
35075 break;
35076 case V2DI_FTYPE_V2DI_INT_CONVERT:
35077 nargs = 2;
35078 rmode = V1TImode;
35079 nargs_constant = 1;
35080 break;
35081 case V4DI_FTYPE_V4DI_INT_CONVERT:
35082 nargs = 2;
35083 rmode = V2TImode;
35084 nargs_constant = 1;
35085 break;
35086 case V8DI_FTYPE_V8DI_INT_CONVERT:
35087 nargs = 2;
35088 rmode = V4TImode;
35089 nargs_constant = 1;
35090 break;
35091 case V8HI_FTYPE_V8HI_INT:
35092 case V8HI_FTYPE_V8SF_INT:
35093 case V16HI_FTYPE_V16SF_INT:
35094 case V8HI_FTYPE_V4SF_INT:
35095 case V8SF_FTYPE_V8SF_INT:
35096 case V4SF_FTYPE_V16SF_INT:
35097 case V16SF_FTYPE_V16SF_INT:
35098 case V4SI_FTYPE_V4SI_INT:
35099 case V4SI_FTYPE_V8SI_INT:
35100 case V4HI_FTYPE_V4HI_INT:
35101 case V4DF_FTYPE_V4DF_INT:
35102 case V4DF_FTYPE_V8DF_INT:
35103 case V4SF_FTYPE_V4SF_INT:
35104 case V4SF_FTYPE_V8SF_INT:
35105 case V2DI_FTYPE_V2DI_INT:
35106 case V2DF_FTYPE_V2DF_INT:
35107 case V2DF_FTYPE_V4DF_INT:
35108 case V16HI_FTYPE_V16HI_INT:
35109 case V8SI_FTYPE_V8SI_INT:
35110 case V16SI_FTYPE_V16SI_INT:
35111 case V4SI_FTYPE_V16SI_INT:
35112 case V4DI_FTYPE_V4DI_INT:
35113 case V2DI_FTYPE_V4DI_INT:
35114 case V4DI_FTYPE_V8DI_INT:
35115 case QI_FTYPE_V4SF_INT:
35116 case QI_FTYPE_V2DF_INT:
35117 case UQI_FTYPE_UQI_UQI_CONST:
35118 case UHI_FTYPE_UHI_UQI:
35119 case USI_FTYPE_USI_UQI:
35120 case UDI_FTYPE_UDI_UQI:
35121 nargs = 2;
35122 nargs_constant = 1;
35123 break;
35124 case V16QI_FTYPE_V16QI_V16QI_V16QI:
35125 case V8SF_FTYPE_V8SF_V8SF_V8SF:
35126 case V4DF_FTYPE_V4DF_V4DF_V4DF:
35127 case V4SF_FTYPE_V4SF_V4SF_V4SF:
35128 case V2DF_FTYPE_V2DF_V2DF_V2DF:
35129 case V32QI_FTYPE_V32QI_V32QI_V32QI:
35130 case UHI_FTYPE_V16SI_V16SI_UHI:
35131 case UQI_FTYPE_V8DI_V8DI_UQI:
35132 case V16HI_FTYPE_V16SI_V16HI_UHI:
35133 case V16QI_FTYPE_V16SI_V16QI_UHI:
35134 case V16QI_FTYPE_V8DI_V16QI_UQI:
35135 case V16SF_FTYPE_V16SF_V16SF_UHI:
35136 case V16SF_FTYPE_V4SF_V16SF_UHI:
35137 case V16SI_FTYPE_SI_V16SI_UHI:
35138 case V16SI_FTYPE_V16HI_V16SI_UHI:
35139 case V16SI_FTYPE_V16QI_V16SI_UHI:
35140 case V8SF_FTYPE_V4SF_V8SF_UQI:
35141 case V4DF_FTYPE_V2DF_V4DF_UQI:
35142 case V8SI_FTYPE_V4SI_V8SI_UQI:
35143 case V8SI_FTYPE_SI_V8SI_UQI:
35144 case V4SI_FTYPE_V4SI_V4SI_UQI:
35145 case V4SI_FTYPE_SI_V4SI_UQI:
35146 case V4DI_FTYPE_V2DI_V4DI_UQI:
35147 case V4DI_FTYPE_DI_V4DI_UQI:
35148 case V2DI_FTYPE_V2DI_V2DI_UQI:
35149 case V2DI_FTYPE_DI_V2DI_UQI:
35150 case V64QI_FTYPE_V64QI_V64QI_UDI:
35151 case V64QI_FTYPE_V16QI_V64QI_UDI:
35152 case V64QI_FTYPE_QI_V64QI_UDI:
35153 case V32QI_FTYPE_V32QI_V32QI_USI:
35154 case V32QI_FTYPE_V16QI_V32QI_USI:
35155 case V32QI_FTYPE_QI_V32QI_USI:
35156 case V16QI_FTYPE_V16QI_V16QI_UHI:
35157 case V16QI_FTYPE_QI_V16QI_UHI:
35158 case V32HI_FTYPE_V8HI_V32HI_USI:
35159 case V32HI_FTYPE_HI_V32HI_USI:
35160 case V16HI_FTYPE_V8HI_V16HI_UHI:
35161 case V16HI_FTYPE_HI_V16HI_UHI:
35162 case V8HI_FTYPE_V8HI_V8HI_UQI:
35163 case V8HI_FTYPE_HI_V8HI_UQI:
35164 case V8SF_FTYPE_V8HI_V8SF_UQI:
35165 case V4SF_FTYPE_V8HI_V4SF_UQI:
35166 case V8SI_FTYPE_V8SF_V8SI_UQI:
35167 case V4SI_FTYPE_V4SF_V4SI_UQI:
35168 case V4DI_FTYPE_V4SF_V4DI_UQI:
35169 case V2DI_FTYPE_V4SF_V2DI_UQI:
35170 case V4SF_FTYPE_V4DI_V4SF_UQI:
35171 case V4SF_FTYPE_V2DI_V4SF_UQI:
35172 case V4DF_FTYPE_V4DI_V4DF_UQI:
35173 case V2DF_FTYPE_V2DI_V2DF_UQI:
35174 case V16QI_FTYPE_V8HI_V16QI_UQI:
35175 case V16QI_FTYPE_V16HI_V16QI_UHI:
35176 case V16QI_FTYPE_V4SI_V16QI_UQI:
35177 case V16QI_FTYPE_V8SI_V16QI_UQI:
35178 case V8HI_FTYPE_V4SI_V8HI_UQI:
35179 case V8HI_FTYPE_V8SI_V8HI_UQI:
35180 case V16QI_FTYPE_V2DI_V16QI_UQI:
35181 case V16QI_FTYPE_V4DI_V16QI_UQI:
35182 case V8HI_FTYPE_V2DI_V8HI_UQI:
35183 case V8HI_FTYPE_V4DI_V8HI_UQI:
35184 case V4SI_FTYPE_V2DI_V4SI_UQI:
35185 case V4SI_FTYPE_V4DI_V4SI_UQI:
35186 case V32QI_FTYPE_V32HI_V32QI_USI:
35187 case UHI_FTYPE_V16QI_V16QI_UHI:
35188 case USI_FTYPE_V32QI_V32QI_USI:
35189 case UDI_FTYPE_V64QI_V64QI_UDI:
35190 case UQI_FTYPE_V8HI_V8HI_UQI:
35191 case UHI_FTYPE_V16HI_V16HI_UHI:
35192 case USI_FTYPE_V32HI_V32HI_USI:
35193 case UQI_FTYPE_V4SI_V4SI_UQI:
35194 case UQI_FTYPE_V8SI_V8SI_UQI:
35195 case UQI_FTYPE_V2DI_V2DI_UQI:
35196 case UQI_FTYPE_V4DI_V4DI_UQI:
35197 case V4SF_FTYPE_V2DF_V4SF_UQI:
35198 case V4SF_FTYPE_V4DF_V4SF_UQI:
35199 case V16SI_FTYPE_V16SI_V16SI_UHI:
35200 case V16SI_FTYPE_V4SI_V16SI_UHI:
35201 case V2DI_FTYPE_V4SI_V2DI_UQI:
35202 case V2DI_FTYPE_V8HI_V2DI_UQI:
35203 case V2DI_FTYPE_V16QI_V2DI_UQI:
35204 case V4DI_FTYPE_V4DI_V4DI_UQI:
35205 case V4DI_FTYPE_V4SI_V4DI_UQI:
35206 case V4DI_FTYPE_V8HI_V4DI_UQI:
35207 case V4DI_FTYPE_V16QI_V4DI_UQI:
35208 case V4DI_FTYPE_V4DF_V4DI_UQI:
35209 case V2DI_FTYPE_V2DF_V2DI_UQI:
35210 case V4SI_FTYPE_V4DF_V4SI_UQI:
35211 case V4SI_FTYPE_V2DF_V4SI_UQI:
35212 case V4SI_FTYPE_V8HI_V4SI_UQI:
35213 case V4SI_FTYPE_V16QI_V4SI_UQI:
35214 case V4DI_FTYPE_V4DI_V4DI_V4DI:
35215 case V8DF_FTYPE_V2DF_V8DF_UQI:
35216 case V8DF_FTYPE_V4DF_V8DF_UQI:
35217 case V8DF_FTYPE_V8DF_V8DF_UQI:
35218 case V8SF_FTYPE_V8SF_V8SF_UQI:
35219 case V8SF_FTYPE_V8SI_V8SF_UQI:
35220 case V4DF_FTYPE_V4DF_V4DF_UQI:
35221 case V4SF_FTYPE_V4SF_V4SF_UQI:
35222 case V2DF_FTYPE_V2DF_V2DF_UQI:
35223 case V2DF_FTYPE_V4SF_V2DF_UQI:
35224 case V2DF_FTYPE_V4SI_V2DF_UQI:
35225 case V4SF_FTYPE_V4SI_V4SF_UQI:
35226 case V4DF_FTYPE_V4SF_V4DF_UQI:
35227 case V4DF_FTYPE_V4SI_V4DF_UQI:
35228 case V8SI_FTYPE_V8SI_V8SI_UQI:
35229 case V8SI_FTYPE_V8HI_V8SI_UQI:
35230 case V8SI_FTYPE_V16QI_V8SI_UQI:
35231 case V8DF_FTYPE_V8SI_V8DF_UQI:
35232 case V8DI_FTYPE_DI_V8DI_UQI:
35233 case V16SF_FTYPE_V8SF_V16SF_UHI:
35234 case V16SI_FTYPE_V8SI_V16SI_UHI:
35235 case V16HI_FTYPE_V16HI_V16HI_UHI:
35236 case V8HI_FTYPE_V16QI_V8HI_UQI:
35237 case V16HI_FTYPE_V16QI_V16HI_UHI:
35238 case V32HI_FTYPE_V32HI_V32HI_USI:
35239 case V32HI_FTYPE_V32QI_V32HI_USI:
35240 case V8DI_FTYPE_V16QI_V8DI_UQI:
35241 case V8DI_FTYPE_V2DI_V8DI_UQI:
35242 case V8DI_FTYPE_V4DI_V8DI_UQI:
35243 case V8DI_FTYPE_V8DI_V8DI_UQI:
35244 case V8DI_FTYPE_V8HI_V8DI_UQI:
35245 case V8DI_FTYPE_V8SI_V8DI_UQI:
35246 case V8HI_FTYPE_V8DI_V8HI_UQI:
35247 case V8SI_FTYPE_V8DI_V8SI_UQI:
35248 case V4SI_FTYPE_V4SI_V4SI_V4SI:
35249 nargs = 3;
35250 break;
35251 case V32QI_FTYPE_V32QI_V32QI_INT:
35252 case V16HI_FTYPE_V16HI_V16HI_INT:
35253 case V16QI_FTYPE_V16QI_V16QI_INT:
35254 case V4DI_FTYPE_V4DI_V4DI_INT:
35255 case V8HI_FTYPE_V8HI_V8HI_INT:
35256 case V8SI_FTYPE_V8SI_V8SI_INT:
35257 case V8SI_FTYPE_V8SI_V4SI_INT:
35258 case V8SF_FTYPE_V8SF_V8SF_INT:
35259 case V8SF_FTYPE_V8SF_V4SF_INT:
35260 case V4SI_FTYPE_V4SI_V4SI_INT:
35261 case V4DF_FTYPE_V4DF_V4DF_INT:
35262 case V16SF_FTYPE_V16SF_V16SF_INT:
35263 case V16SF_FTYPE_V16SF_V4SF_INT:
35264 case V16SI_FTYPE_V16SI_V4SI_INT:
35265 case V4DF_FTYPE_V4DF_V2DF_INT:
35266 case V4SF_FTYPE_V4SF_V4SF_INT:
35267 case V2DI_FTYPE_V2DI_V2DI_INT:
35268 case V4DI_FTYPE_V4DI_V2DI_INT:
35269 case V2DF_FTYPE_V2DF_V2DF_INT:
35270 case UQI_FTYPE_V8DI_V8UDI_INT:
35271 case UQI_FTYPE_V8DF_V8DF_INT:
35272 case UQI_FTYPE_V2DF_V2DF_INT:
35273 case UQI_FTYPE_V4SF_V4SF_INT:
35274 case UHI_FTYPE_V16SI_V16SI_INT:
35275 case UHI_FTYPE_V16SF_V16SF_INT:
35276 nargs = 3;
35277 nargs_constant = 1;
35278 break;
35279 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
35280 nargs = 3;
35281 rmode = V4DImode;
35282 nargs_constant = 1;
35283 break;
35284 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
35285 nargs = 3;
35286 rmode = V2DImode;
35287 nargs_constant = 1;
35288 break;
35289 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
35290 nargs = 3;
35291 rmode = DImode;
35292 nargs_constant = 1;
35293 break;
35294 case V2DI_FTYPE_V2DI_UINT_UINT:
35295 nargs = 3;
35296 nargs_constant = 2;
35297 break;
35298 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
35299 nargs = 3;
35300 rmode = V8DImode;
35301 nargs_constant = 1;
35302 break;
35303 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
35304 nargs = 5;
35305 rmode = V8DImode;
35306 mask_pos = 2;
35307 nargs_constant = 1;
35308 break;
35309 case QI_FTYPE_V8DF_INT_UQI:
35310 case QI_FTYPE_V4DF_INT_UQI:
35311 case QI_FTYPE_V2DF_INT_UQI:
35312 case HI_FTYPE_V16SF_INT_UHI:
35313 case QI_FTYPE_V8SF_INT_UQI:
35314 case QI_FTYPE_V4SF_INT_UQI:
35315 nargs = 3;
35316 mask_pos = 1;
35317 nargs_constant = 1;
35318 break;
35319 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
35320 nargs = 5;
35321 rmode = V4DImode;
35322 mask_pos = 2;
35323 nargs_constant = 1;
35324 break;
35325 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
35326 nargs = 5;
35327 rmode = V2DImode;
35328 mask_pos = 2;
35329 nargs_constant = 1;
35330 break;
35331 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
35332 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
35333 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
35334 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
35335 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
35336 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
35337 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
35338 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
35339 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
35340 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
35341 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
35342 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
35343 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
35344 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
35345 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
35346 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
35347 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
35348 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
35349 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
35350 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
35351 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
35352 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
35353 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
35354 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
35355 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
35356 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
35357 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
35358 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
35359 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
35360 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
35361 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
35362 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
35363 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
35364 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
35365 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
35366 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
35367 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
35368 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
35369 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
35370 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
35371 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
35372 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
35373 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
35374 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
35375 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
35376 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
35377 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
35378 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
35379 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
35380 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
35381 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
35382 nargs = 4;
35383 break;
35384 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
35385 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
35386 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
35387 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
35388 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
35389 nargs = 4;
35390 nargs_constant = 1;
35391 break;
35392 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
35393 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
35394 case QI_FTYPE_V4DF_V4DF_INT_UQI:
35395 case QI_FTYPE_V8SF_V8SF_INT_UQI:
35396 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
35397 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
35398 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
35399 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
35400 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
35401 case USI_FTYPE_V32QI_V32QI_INT_USI:
35402 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
35403 case USI_FTYPE_V32HI_V32HI_INT_USI:
35404 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
35405 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
35406 nargs = 4;
35407 mask_pos = 1;
35408 nargs_constant = 1;
35409 break;
35410 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
35411 nargs = 4;
35412 nargs_constant = 2;
35413 break;
35414 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
35415 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
35416 nargs = 4;
35417 break;
35418 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
35419 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
35420 mask_pos = 1;
35421 nargs = 4;
35422 nargs_constant = 1;
35423 break;
35424 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
35425 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
35426 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
35427 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
35428 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
35429 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
35430 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
35431 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
35432 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
35433 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
35434 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
35435 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
35436 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
35437 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
35438 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
35439 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
35440 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
35441 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
35442 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
35443 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
35444 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
35445 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
35446 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
35447 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
35448 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
35449 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
35450 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
35451 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
35452 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
35453 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
35454 nargs = 4;
35455 mask_pos = 2;
35456 nargs_constant = 1;
35457 break;
35458 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
35459 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
35460 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
35461 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
35462 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
35463 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
35464 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
35465 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
35466 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
35467 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
35468 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
35469 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
35470 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
35471 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
35472 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
35473 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
35474 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
35475 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
35476 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
35477 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
35478 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
35479 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
35480 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
35481 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
35482 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
35483 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
35484 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
35485 nargs = 5;
35486 mask_pos = 2;
35487 nargs_constant = 1;
35488 break;
35489 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
35490 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
35491 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
35492 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
35493 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
35494 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
35495 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
35496 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
35497 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
35498 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
35499 nargs = 5;
35500 mask_pos = 1;
35501 nargs_constant = 1;
35502 break;
35504 default:
35505 gcc_unreachable ();
35508 gcc_assert (nargs <= ARRAY_SIZE (args));
35510 if (comparison != UNKNOWN)
35512 gcc_assert (nargs == 2);
35513 return ix86_expand_sse_compare (d, exp, target, swap);
35516 if (rmode == VOIDmode || rmode == tmode)
35518 if (optimize
35519 || target == 0
35520 || GET_MODE (target) != tmode
35521 || !insn_p->operand[0].predicate (target, tmode))
35522 target = gen_reg_rtx (tmode);
35523 real_target = target;
35525 else
35527 real_target = gen_reg_rtx (tmode);
35528 target = lowpart_subreg (rmode, real_target, tmode);
35531 for (i = 0; i < nargs; i++)
35533 tree arg = CALL_EXPR_ARG (exp, i);
35534 rtx op = expand_normal (arg);
35535 machine_mode mode = insn_p->operand[i + 1].mode;
35536 bool match = insn_p->operand[i + 1].predicate (op, mode);
35538 if (last_arg_count && (i + 1) == nargs)
35540 /* SIMD shift insns take either an 8-bit immediate or
35541 register as count. But builtin functions take int as
35542 count. If count doesn't match, we put it in register. */
35543 if (!match)
35545 op = lowpart_subreg (SImode, op, GET_MODE (op));
35546 if (!insn_p->operand[i + 1].predicate (op, mode))
35547 op = copy_to_reg (op);
35550 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35551 (!mask_pos && (nargs - i) <= nargs_constant))
35553 if (!match)
35554 switch (icode)
35556 case CODE_FOR_avx_vinsertf128v4di:
35557 case CODE_FOR_avx_vextractf128v4di:
35558 error ("the last argument must be a 1-bit immediate");
35559 return const0_rtx;
35561 case CODE_FOR_avx512f_cmpv8di3_mask:
35562 case CODE_FOR_avx512f_cmpv16si3_mask:
35563 case CODE_FOR_avx512f_ucmpv8di3_mask:
35564 case CODE_FOR_avx512f_ucmpv16si3_mask:
35565 case CODE_FOR_avx512vl_cmpv4di3_mask:
35566 case CODE_FOR_avx512vl_cmpv8si3_mask:
35567 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35568 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35569 case CODE_FOR_avx512vl_cmpv2di3_mask:
35570 case CODE_FOR_avx512vl_cmpv4si3_mask:
35571 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35572 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35573 error ("the last argument must be a 3-bit immediate");
35574 return const0_rtx;
35576 case CODE_FOR_sse4_1_roundsd:
35577 case CODE_FOR_sse4_1_roundss:
35579 case CODE_FOR_sse4_1_roundpd:
35580 case CODE_FOR_sse4_1_roundps:
35581 case CODE_FOR_avx_roundpd256:
35582 case CODE_FOR_avx_roundps256:
35584 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35585 case CODE_FOR_sse4_1_roundps_sfix:
35586 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35587 case CODE_FOR_avx_roundps_sfix256:
35589 case CODE_FOR_sse4_1_blendps:
35590 case CODE_FOR_avx_blendpd256:
35591 case CODE_FOR_avx_vpermilv4df:
35592 case CODE_FOR_avx_vpermilv4df_mask:
35593 case CODE_FOR_avx512f_getmantv8df_mask:
35594 case CODE_FOR_avx512f_getmantv16sf_mask:
35595 case CODE_FOR_avx512vl_getmantv8sf_mask:
35596 case CODE_FOR_avx512vl_getmantv4df_mask:
35597 case CODE_FOR_avx512vl_getmantv4sf_mask:
35598 case CODE_FOR_avx512vl_getmantv2df_mask:
35599 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35600 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35601 case CODE_FOR_avx512dq_rangepv4df_mask:
35602 case CODE_FOR_avx512dq_rangepv8sf_mask:
35603 case CODE_FOR_avx512dq_rangepv2df_mask:
35604 case CODE_FOR_avx512dq_rangepv4sf_mask:
35605 case CODE_FOR_avx_shufpd256_mask:
35606 error ("the last argument must be a 4-bit immediate");
35607 return const0_rtx;
35609 case CODE_FOR_sha1rnds4:
35610 case CODE_FOR_sse4_1_blendpd:
35611 case CODE_FOR_avx_vpermilv2df:
35612 case CODE_FOR_avx_vpermilv2df_mask:
35613 case CODE_FOR_xop_vpermil2v2df3:
35614 case CODE_FOR_xop_vpermil2v4sf3:
35615 case CODE_FOR_xop_vpermil2v4df3:
35616 case CODE_FOR_xop_vpermil2v8sf3:
35617 case CODE_FOR_avx512f_vinsertf32x4_mask:
35618 case CODE_FOR_avx512f_vinserti32x4_mask:
35619 case CODE_FOR_avx512f_vextractf32x4_mask:
35620 case CODE_FOR_avx512f_vextracti32x4_mask:
35621 case CODE_FOR_sse2_shufpd:
35622 case CODE_FOR_sse2_shufpd_mask:
35623 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35624 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35625 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35626 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35627 error ("the last argument must be a 2-bit immediate");
35628 return const0_rtx;
35630 case CODE_FOR_avx_vextractf128v4df:
35631 case CODE_FOR_avx_vextractf128v8sf:
35632 case CODE_FOR_avx_vextractf128v8si:
35633 case CODE_FOR_avx_vinsertf128v4df:
35634 case CODE_FOR_avx_vinsertf128v8sf:
35635 case CODE_FOR_avx_vinsertf128v8si:
35636 case CODE_FOR_avx512f_vinsertf64x4_mask:
35637 case CODE_FOR_avx512f_vinserti64x4_mask:
35638 case CODE_FOR_avx512f_vextractf64x4_mask:
35639 case CODE_FOR_avx512f_vextracti64x4_mask:
35640 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35641 case CODE_FOR_avx512dq_vinserti32x8_mask:
35642 case CODE_FOR_avx512vl_vinsertv4df:
35643 case CODE_FOR_avx512vl_vinsertv4di:
35644 case CODE_FOR_avx512vl_vinsertv8sf:
35645 case CODE_FOR_avx512vl_vinsertv8si:
35646 error ("the last argument must be a 1-bit immediate");
35647 return const0_rtx;
35649 case CODE_FOR_avx_vmcmpv2df3:
35650 case CODE_FOR_avx_vmcmpv4sf3:
35651 case CODE_FOR_avx_cmpv2df3:
35652 case CODE_FOR_avx_cmpv4sf3:
35653 case CODE_FOR_avx_cmpv4df3:
35654 case CODE_FOR_avx_cmpv8sf3:
35655 case CODE_FOR_avx512f_cmpv8df3_mask:
35656 case CODE_FOR_avx512f_cmpv16sf3_mask:
35657 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35658 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35659 error ("the last argument must be a 5-bit immediate");
35660 return const0_rtx;
35662 default:
35663 switch (nargs_constant)
35665 case 2:
35666 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35667 (!mask_pos && (nargs - i) == nargs_constant))
35669 error ("the next to last argument must be an 8-bit immediate");
35670 break;
35672 /* FALLTHRU */
35673 case 1:
35674 error ("the last argument must be an 8-bit immediate");
35675 break;
35676 default:
35677 gcc_unreachable ();
35679 return const0_rtx;
35682 else
35684 if (VECTOR_MODE_P (mode))
35685 op = safe_vector_operand (op, mode);
35687 /* If we aren't optimizing, only allow one memory operand to
35688 be generated. */
35689 if (memory_operand (op, mode))
35690 num_memory++;
35692 op = fixup_modeless_constant (op, mode);
35694 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35696 if (optimize || !match || num_memory > 1)
35697 op = copy_to_mode_reg (mode, op);
35699 else
35701 op = copy_to_reg (op);
35702 op = lowpart_subreg (mode, op, GET_MODE (op));
35706 args[i].op = op;
35707 args[i].mode = mode;
35710 switch (nargs)
35712 case 1:
35713 pat = GEN_FCN (icode) (real_target, args[0].op);
35714 break;
35715 case 2:
35716 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35717 break;
35718 case 3:
35719 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35720 args[2].op);
35721 break;
35722 case 4:
35723 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35724 args[2].op, args[3].op);
35725 break;
35726 case 5:
35727 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35728 args[2].op, args[3].op, args[4].op);
35729 break;
35730 case 6:
35731 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35732 args[2].op, args[3].op, args[4].op,
35733 args[5].op);
35734 break;
35735 default:
35736 gcc_unreachable ();
35739 if (! pat)
35740 return 0;
35742 emit_insn (pat);
35743 return target;
35746 /* Transform a pattern of the following layout:
35747 (parallel [
35748 set (A B)
35749 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)])
35751 into:
35752 (set (A B))
35755 (parallel [ A B
35757 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)
35760 into:
35761 (parallel [ A B ... ]) */
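/* Illustrative sketch (not taken from the original sources): for the
   two-element case above, a pattern such as

     (parallel [(set (reg:V2DF d) (plus:V2DF (reg:V2DF a) (reg:V2DF b)))
                (unspec [(reg:SI r)] UNSPEC_EMBEDDED_ROUNDING)])

   collapses to the bare SET, while in the longer form only the
   UNSPEC_EMBEDDED_ROUNDING element is dropped and the remaining
   elements are re-wrapped in a new PARALLEL.  */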
35763 static rtx
35764 ix86_erase_embedded_rounding (rtx pat)
35766 if (GET_CODE (pat) == INSN)
35767 pat = PATTERN (pat);
35769 gcc_assert (GET_CODE (pat) == PARALLEL);
35771 if (XVECLEN (pat, 0) == 2)
35773 rtx p0 = XVECEXP (pat, 0, 0);
35774 rtx p1 = XVECEXP (pat, 0, 1);
35776 gcc_assert (GET_CODE (p0) == SET
35777 && GET_CODE (p1) == UNSPEC
35778 && XINT (p1, 1) == UNSPEC_EMBEDDED_ROUNDING);
35780 return p0;
35782 else
35784 rtx *res = XALLOCAVEC (rtx, XVECLEN (pat, 0));
35785 int i = 0;
35786 int j = 0;
35788 for (; i < XVECLEN (pat, 0); ++i)
35790 rtx elem = XVECEXP (pat, 0, i);
35791 if (GET_CODE (elem) != UNSPEC
35792 || XINT (elem, 1) != UNSPEC_EMBEDDED_ROUNDING)
35793 res [j++] = elem;
35796 /* No more than 1 occurrence was removed. */
35797 gcc_assert (j >= XVECLEN (pat, 0) - 1);
35799 return gen_rtx_PARALLEL (GET_MODE (pat), gen_rtvec_v (j, res));
35803 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35804 with rounding. */
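/* Hedged usage sketch: AVX-512 intrinsics along the lines of
   _mm_comi_round_ss (a, b, predicate, sae) funnel through here; the
   third argument indexes comi_comparisons[] / need_ucomi_values[]
   below to pick the RTL comparison code and the quiet (ucomi) versus
   signalling (comi) form.  The intrinsic spelling follows the usual
   avx512fintrin.h conventions and is not defined in this file.  */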
35805 static rtx
35806 ix86_expand_sse_comi_round (const struct builtin_description *d,
35807 tree exp, rtx target)
35809 rtx pat, set_dst;
35810 tree arg0 = CALL_EXPR_ARG (exp, 0);
35811 tree arg1 = CALL_EXPR_ARG (exp, 1);
35812 tree arg2 = CALL_EXPR_ARG (exp, 2);
35813 tree arg3 = CALL_EXPR_ARG (exp, 3);
35814 rtx op0 = expand_normal (arg0);
35815 rtx op1 = expand_normal (arg1);
35816 rtx op2 = expand_normal (arg2);
35817 rtx op3 = expand_normal (arg3);
35818 enum insn_code icode = d->icode;
35819 const struct insn_data_d *insn_p = &insn_data[icode];
35820 machine_mode mode0 = insn_p->operand[0].mode;
35821 machine_mode mode1 = insn_p->operand[1].mode;
35822 enum rtx_code comparison = UNEQ;
35823 bool need_ucomi = false;
35825 /* See avxintrin.h for values. */
35826 enum rtx_code comi_comparisons[32] =
35828 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35829 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35830 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35832 bool need_ucomi_values[32] =
35834 true, false, false, true, true, false, false, true,
35835 true, false, false, true, true, false, false, true,
35836 false, true, true, false, false, true, true, false,
35837 false, true, true, false, false, true, true, false
35840 if (!CONST_INT_P (op2))
35842 error ("the third argument must be a comparison constant");
35843 return const0_rtx;
35845 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35847 error ("incorrect comparison mode");
35848 return const0_rtx;
35851 if (!insn_p->operand[2].predicate (op3, SImode))
35853 error ("incorrect rounding operand");
35854 return const0_rtx;
35857 comparison = comi_comparisons[INTVAL (op2)];
35858 need_ucomi = need_ucomi_values[INTVAL (op2)];
35860 if (VECTOR_MODE_P (mode0))
35861 op0 = safe_vector_operand (op0, mode0);
35862 if (VECTOR_MODE_P (mode1))
35863 op1 = safe_vector_operand (op1, mode1);
35865 target = gen_reg_rtx (SImode);
35866 emit_move_insn (target, const0_rtx);
35867 target = gen_rtx_SUBREG (QImode, target, 0);
35869 if ((optimize && !register_operand (op0, mode0))
35870 || !insn_p->operand[0].predicate (op0, mode0))
35871 op0 = copy_to_mode_reg (mode0, op0);
35872 if ((optimize && !register_operand (op1, mode1))
35873 || !insn_p->operand[1].predicate (op1, mode1))
35874 op1 = copy_to_mode_reg (mode1, op1);
35876 if (need_ucomi)
35877 icode = icode == CODE_FOR_sse_comi_round
35878 ? CODE_FOR_sse_ucomi_round
35879 : CODE_FOR_sse2_ucomi_round;
35881 pat = GEN_FCN (icode) (op0, op1, op3);
35882 if (! pat)
35883 return 0;
35885 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35886 if (INTVAL (op3) == NO_ROUND)
35888 pat = ix86_erase_embedded_rounding (pat);
35889 if (! pat)
35890 return 0;
35892 set_dst = SET_DEST (pat);
35894 else
35896 gcc_assert (GET_CODE (XVECEXP (pat, 0, 0)) == SET);
35897 set_dst = SET_DEST (XVECEXP (pat, 0, 0));
35900 emit_insn (pat);
35901 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35902 gen_rtx_fmt_ee (comparison, QImode,
35903 set_dst,
35904 const0_rtx)));
35906 return SUBREG_REG (target);
35909 static rtx
35910 ix86_expand_round_builtin (const struct builtin_description *d,
35911 tree exp, rtx target)
35913 rtx pat;
35914 unsigned int i, nargs;
35915 struct
35917 rtx op;
35918 machine_mode mode;
35919 } args[6];
35920 enum insn_code icode = d->icode;
35921 const struct insn_data_d *insn_p = &insn_data[icode];
35922 machine_mode tmode = insn_p->operand[0].mode;
35923 unsigned int nargs_constant = 0;
35924 unsigned int redundant_embed_rnd = 0;
35926 switch ((enum ix86_builtin_func_type) d->flag)
35928 case UINT64_FTYPE_V2DF_INT:
35929 case UINT64_FTYPE_V4SF_INT:
35930 case UINT_FTYPE_V2DF_INT:
35931 case UINT_FTYPE_V4SF_INT:
35932 case INT64_FTYPE_V2DF_INT:
35933 case INT64_FTYPE_V4SF_INT:
35934 case INT_FTYPE_V2DF_INT:
35935 case INT_FTYPE_V4SF_INT:
35936 nargs = 2;
35937 break;
35938 case V4SF_FTYPE_V4SF_UINT_INT:
35939 case V4SF_FTYPE_V4SF_UINT64_INT:
35940 case V2DF_FTYPE_V2DF_UINT64_INT:
35941 case V4SF_FTYPE_V4SF_INT_INT:
35942 case V4SF_FTYPE_V4SF_INT64_INT:
35943 case V2DF_FTYPE_V2DF_INT64_INT:
35944 case V4SF_FTYPE_V4SF_V4SF_INT:
35945 case V2DF_FTYPE_V2DF_V2DF_INT:
35946 case V4SF_FTYPE_V4SF_V2DF_INT:
35947 case V2DF_FTYPE_V2DF_V4SF_INT:
35948 nargs = 3;
35949 break;
35950 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35951 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35952 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35953 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35954 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35955 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35956 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
35957 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
35958 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
35959 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
35960 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
35961 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
35962 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
35963 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
35964 nargs = 4;
35965 break;
35966 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
35967 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
35968 nargs_constant = 2;
35969 nargs = 4;
35970 break;
35971 case INT_FTYPE_V4SF_V4SF_INT_INT:
35972 case INT_FTYPE_V2DF_V2DF_INT_INT:
35973 return ix86_expand_sse_comi_round (d, exp, target);
35974 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
35975 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
35976 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
35977 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
35978 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
35979 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
35980 nargs = 5;
35981 break;
35982 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
35983 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
35984 nargs_constant = 4;
35985 nargs = 5;
35986 break;
35987 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
35988 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
35989 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
35990 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
35991 nargs_constant = 3;
35992 nargs = 5;
35993 break;
35994 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
35995 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
35996 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
35997 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
35998 nargs = 6;
35999 nargs_constant = 4;
36000 break;
36001 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
36002 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
36003 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
36004 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
36005 nargs = 6;
36006 nargs_constant = 3;
36007 break;
36008 default:
36009 gcc_unreachable ();
36011 gcc_assert (nargs <= ARRAY_SIZE (args));
36013 if (optimize
36014 || target == 0
36015 || GET_MODE (target) != tmode
36016 || !insn_p->operand[0].predicate (target, tmode))
36017 target = gen_reg_rtx (tmode);
36019 for (i = 0; i < nargs; i++)
36021 tree arg = CALL_EXPR_ARG (exp, i);
36022 rtx op = expand_normal (arg);
36023 machine_mode mode = insn_p->operand[i + 1].mode;
36024 bool match = insn_p->operand[i + 1].predicate (op, mode);
36026 if (i == nargs - nargs_constant)
36028 if (!match)
36030 switch (icode)
36032 case CODE_FOR_avx512f_getmantv8df_mask_round:
36033 case CODE_FOR_avx512f_getmantv16sf_mask_round:
36034 case CODE_FOR_avx512f_vgetmantv2df_round:
36035 case CODE_FOR_avx512f_vgetmantv4sf_round:
36036 error ("the immediate argument must be a 4-bit immediate");
36037 return const0_rtx;
36038 case CODE_FOR_avx512f_cmpv8df3_mask_round:
36039 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
36040 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
36041 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
36042 error ("the immediate argument must be a 5-bit immediate");
36043 return const0_rtx;
36044 default:
36045 error ("the immediate argument must be an 8-bit immediate");
36046 return const0_rtx;
36050 else if (i == nargs - 1)
36052 if (!insn_p->operand[nargs].predicate (op, SImode))
36054 error ("incorrect rounding operand");
36055 return const0_rtx;
36058 /* If there is no rounding, use the normal version of the pattern. */
36059 if (INTVAL (op) == NO_ROUND)
36060 redundant_embed_rnd = 1;
36062 else
36064 if (VECTOR_MODE_P (mode))
36065 op = safe_vector_operand (op, mode);
36067 op = fixup_modeless_constant (op, mode);
36069 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36071 if (optimize || !match)
36072 op = copy_to_mode_reg (mode, op);
36074 else
36076 op = copy_to_reg (op);
36077 op = lowpart_subreg (mode, op, GET_MODE (op));
36081 args[i].op = op;
36082 args[i].mode = mode;
36085 switch (nargs)
36087 case 1:
36088 pat = GEN_FCN (icode) (target, args[0].op);
36089 break;
36090 case 2:
36091 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36092 break;
36093 case 3:
36094 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36095 args[2].op);
36096 break;
36097 case 4:
36098 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36099 args[2].op, args[3].op);
36100 break;
36101 case 5:
36102 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36103 args[2].op, args[3].op, args[4].op);
36104 break;
36105 case 6:
36106 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36107 args[2].op, args[3].op, args[4].op,
36108 args[5].op);
36109 break;
36110 default:
36111 gcc_unreachable ();
36114 if (!pat)
36115 return 0;
36117 if (redundant_embed_rnd)
36118 pat = ix86_erase_embedded_rounding (pat);
36120 emit_insn (pat);
36121 return target;
36124 /* Subroutine of ix86_expand_builtin to take care of special insns
36125 with variable number of operands. */
36127 static rtx
36128 ix86_expand_special_args_builtin (const struct builtin_description *d,
36129 tree exp, rtx target)
36131 tree arg;
36132 rtx pat, op;
36133 unsigned int i, nargs, arg_adjust, memory;
36134 bool aligned_mem = false;
36135 struct
36137 rtx op;
36138 machine_mode mode;
36139 } args[3];
36140 enum insn_code icode = d->icode;
36141 bool last_arg_constant = false;
36142 const struct insn_data_d *insn_p = &insn_data[icode];
36143 machine_mode tmode = insn_p->operand[0].mode;
36144 enum { load, store } klass;
36146 switch ((enum ix86_builtin_func_type) d->flag)
36148 case VOID_FTYPE_VOID:
36149 emit_insn (GEN_FCN (icode) (target));
36150 return 0;
36151 case VOID_FTYPE_UINT64:
36152 case VOID_FTYPE_UNSIGNED:
36153 nargs = 0;
36154 klass = store;
36155 memory = 0;
36156 break;
36158 case INT_FTYPE_VOID:
36159 case USHORT_FTYPE_VOID:
36160 case UINT64_FTYPE_VOID:
36161 case UNSIGNED_FTYPE_VOID:
36162 nargs = 0;
36163 klass = load;
36164 memory = 0;
36165 break;
36166 case UINT64_FTYPE_PUNSIGNED:
36167 case V2DI_FTYPE_PV2DI:
36168 case V4DI_FTYPE_PV4DI:
36169 case V32QI_FTYPE_PCCHAR:
36170 case V16QI_FTYPE_PCCHAR:
36171 case V8SF_FTYPE_PCV4SF:
36172 case V8SF_FTYPE_PCFLOAT:
36173 case V4SF_FTYPE_PCFLOAT:
36174 case V4DF_FTYPE_PCV2DF:
36175 case V4DF_FTYPE_PCDOUBLE:
36176 case V2DF_FTYPE_PCDOUBLE:
36177 case VOID_FTYPE_PVOID:
36178 case V8DI_FTYPE_PV8DI:
36179 nargs = 1;
36180 klass = load;
36181 memory = 0;
36182 switch (icode)
36184 case CODE_FOR_sse4_1_movntdqa:
36185 case CODE_FOR_avx2_movntdqa:
36186 case CODE_FOR_avx512f_movntdqa:
36187 aligned_mem = true;
36188 break;
36189 default:
36190 break;
36192 break;
36193 case VOID_FTYPE_PV2SF_V4SF:
36194 case VOID_FTYPE_PV8DI_V8DI:
36195 case VOID_FTYPE_PV4DI_V4DI:
36196 case VOID_FTYPE_PV2DI_V2DI:
36197 case VOID_FTYPE_PCHAR_V32QI:
36198 case VOID_FTYPE_PCHAR_V16QI:
36199 case VOID_FTYPE_PFLOAT_V16SF:
36200 case VOID_FTYPE_PFLOAT_V8SF:
36201 case VOID_FTYPE_PFLOAT_V4SF:
36202 case VOID_FTYPE_PDOUBLE_V8DF:
36203 case VOID_FTYPE_PDOUBLE_V4DF:
36204 case VOID_FTYPE_PDOUBLE_V2DF:
36205 case VOID_FTYPE_PLONGLONG_LONGLONG:
36206 case VOID_FTYPE_PULONGLONG_ULONGLONG:
36207 case VOID_FTYPE_PINT_INT:
36208 nargs = 1;
36209 klass = store;
36210 /* Reserve memory operand for target. */
36211 memory = ARRAY_SIZE (args);
36212 switch (icode)
36214 /* These builtins and instructions require the memory
36215 to be properly aligned. */
36216 case CODE_FOR_avx_movntv4di:
36217 case CODE_FOR_sse2_movntv2di:
36218 case CODE_FOR_avx_movntv8sf:
36219 case CODE_FOR_sse_movntv4sf:
36220 case CODE_FOR_sse4a_vmmovntv4sf:
36221 case CODE_FOR_avx_movntv4df:
36222 case CODE_FOR_sse2_movntv2df:
36223 case CODE_FOR_sse4a_vmmovntv2df:
36224 case CODE_FOR_sse2_movntidi:
36225 case CODE_FOR_sse_movntq:
36226 case CODE_FOR_sse2_movntisi:
36227 case CODE_FOR_avx512f_movntv16sf:
36228 case CODE_FOR_avx512f_movntv8df:
36229 case CODE_FOR_avx512f_movntv8di:
36230 aligned_mem = true;
36231 break;
36232 default:
36233 break;
36235 break;
36236 case V4SF_FTYPE_V4SF_PCV2SF:
36237 case V2DF_FTYPE_V2DF_PCDOUBLE:
36238 nargs = 2;
36239 klass = load;
36240 memory = 1;
36241 break;
36242 case V8SF_FTYPE_PCV8SF_V8SI:
36243 case V4DF_FTYPE_PCV4DF_V4DI:
36244 case V4SF_FTYPE_PCV4SF_V4SI:
36245 case V2DF_FTYPE_PCV2DF_V2DI:
36246 case V8SI_FTYPE_PCV8SI_V8SI:
36247 case V4DI_FTYPE_PCV4DI_V4DI:
36248 case V4SI_FTYPE_PCV4SI_V4SI:
36249 case V2DI_FTYPE_PCV2DI_V2DI:
36250 nargs = 2;
36251 klass = load;
36252 memory = 0;
36253 break;
36254 case VOID_FTYPE_PV8DF_V8DF_UQI:
36255 case VOID_FTYPE_PV4DF_V4DF_UQI:
36256 case VOID_FTYPE_PV2DF_V2DF_UQI:
36257 case VOID_FTYPE_PV16SF_V16SF_UHI:
36258 case VOID_FTYPE_PV8SF_V8SF_UQI:
36259 case VOID_FTYPE_PV4SF_V4SF_UQI:
36260 case VOID_FTYPE_PV8DI_V8DI_UQI:
36261 case VOID_FTYPE_PV4DI_V4DI_UQI:
36262 case VOID_FTYPE_PV2DI_V2DI_UQI:
36263 case VOID_FTYPE_PV16SI_V16SI_UHI:
36264 case VOID_FTYPE_PV8SI_V8SI_UQI:
36265 case VOID_FTYPE_PV4SI_V4SI_UQI:
36266 switch (icode)
36268 /* These builtins and instructions require the memory
36269 to be properly aligned. */
36270 case CODE_FOR_avx512f_storev16sf_mask:
36271 case CODE_FOR_avx512f_storev16si_mask:
36272 case CODE_FOR_avx512f_storev8df_mask:
36273 case CODE_FOR_avx512f_storev8di_mask:
36274 case CODE_FOR_avx512vl_storev8sf_mask:
36275 case CODE_FOR_avx512vl_storev8si_mask:
36276 case CODE_FOR_avx512vl_storev4df_mask:
36277 case CODE_FOR_avx512vl_storev4di_mask:
36278 case CODE_FOR_avx512vl_storev4sf_mask:
36279 case CODE_FOR_avx512vl_storev4si_mask:
36280 case CODE_FOR_avx512vl_storev2df_mask:
36281 case CODE_FOR_avx512vl_storev2di_mask:
36282 aligned_mem = true;
36283 break;
36284 default:
36285 break;
36287 /* FALLTHRU */
36288 case VOID_FTYPE_PV8SF_V8SI_V8SF:
36289 case VOID_FTYPE_PV4DF_V4DI_V4DF:
36290 case VOID_FTYPE_PV4SF_V4SI_V4SF:
36291 case VOID_FTYPE_PV2DF_V2DI_V2DF:
36292 case VOID_FTYPE_PV8SI_V8SI_V8SI:
36293 case VOID_FTYPE_PV4DI_V4DI_V4DI:
36294 case VOID_FTYPE_PV4SI_V4SI_V4SI:
36295 case VOID_FTYPE_PV2DI_V2DI_V2DI:
36296 case VOID_FTYPE_PV8SI_V8DI_UQI:
36297 case VOID_FTYPE_PV8HI_V8DI_UQI:
36298 case VOID_FTYPE_PV16HI_V16SI_UHI:
36299 case VOID_FTYPE_PV16QI_V8DI_UQI:
36300 case VOID_FTYPE_PV16QI_V16SI_UHI:
36301 case VOID_FTYPE_PV4SI_V4DI_UQI:
36302 case VOID_FTYPE_PV4SI_V2DI_UQI:
36303 case VOID_FTYPE_PV8HI_V4DI_UQI:
36304 case VOID_FTYPE_PV8HI_V2DI_UQI:
36305 case VOID_FTYPE_PV8HI_V8SI_UQI:
36306 case VOID_FTYPE_PV8HI_V4SI_UQI:
36307 case VOID_FTYPE_PV16QI_V4DI_UQI:
36308 case VOID_FTYPE_PV16QI_V2DI_UQI:
36309 case VOID_FTYPE_PV16QI_V8SI_UQI:
36310 case VOID_FTYPE_PV16QI_V4SI_UQI:
36311 case VOID_FTYPE_PCHAR_V64QI_UDI:
36312 case VOID_FTYPE_PCHAR_V32QI_USI:
36313 case VOID_FTYPE_PCHAR_V16QI_UHI:
36314 case VOID_FTYPE_PSHORT_V32HI_USI:
36315 case VOID_FTYPE_PSHORT_V16HI_UHI:
36316 case VOID_FTYPE_PSHORT_V8HI_UQI:
36317 case VOID_FTYPE_PINT_V16SI_UHI:
36318 case VOID_FTYPE_PINT_V8SI_UQI:
36319 case VOID_FTYPE_PINT_V4SI_UQI:
36320 case VOID_FTYPE_PINT64_V8DI_UQI:
36321 case VOID_FTYPE_PINT64_V4DI_UQI:
36322 case VOID_FTYPE_PINT64_V2DI_UQI:
36323 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
36324 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
36325 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
36326 case VOID_FTYPE_PFLOAT_V16SF_UHI:
36327 case VOID_FTYPE_PFLOAT_V8SF_UQI:
36328 case VOID_FTYPE_PFLOAT_V4SF_UQI:
36329 nargs = 2;
36330 klass = store;
36331 /* Reserve memory operand for target. */
36332 memory = ARRAY_SIZE (args);
36333 break;
36334 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
36335 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
36336 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
36337 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
36338 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
36339 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
36340 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
36341 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
36342 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
36343 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
36344 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
36345 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
36346 switch (icode)
36348 /* These builtins and instructions require the memory
36349 to be properly aligned. */
36350 case CODE_FOR_avx512f_loadv16sf_mask:
36351 case CODE_FOR_avx512f_loadv16si_mask:
36352 case CODE_FOR_avx512f_loadv8df_mask:
36353 case CODE_FOR_avx512f_loadv8di_mask:
36354 case CODE_FOR_avx512vl_loadv8sf_mask:
36355 case CODE_FOR_avx512vl_loadv8si_mask:
36356 case CODE_FOR_avx512vl_loadv4df_mask:
36357 case CODE_FOR_avx512vl_loadv4di_mask:
36358 case CODE_FOR_avx512vl_loadv4sf_mask:
36359 case CODE_FOR_avx512vl_loadv4si_mask:
36360 case CODE_FOR_avx512vl_loadv2df_mask:
36361 case CODE_FOR_avx512vl_loadv2di_mask:
36362 case CODE_FOR_avx512bw_loadv64qi_mask:
36363 case CODE_FOR_avx512vl_loadv32qi_mask:
36364 case CODE_FOR_avx512vl_loadv16qi_mask:
36365 case CODE_FOR_avx512bw_loadv32hi_mask:
36366 case CODE_FOR_avx512vl_loadv16hi_mask:
36367 case CODE_FOR_avx512vl_loadv8hi_mask:
36368 aligned_mem = true;
36369 break;
36370 default:
36371 break;
36373 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
36374 case V32QI_FTYPE_PCCHAR_V32QI_USI:
36375 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
36376 case V32HI_FTYPE_PCSHORT_V32HI_USI:
36377 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
36378 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
36379 case V16SI_FTYPE_PCINT_V16SI_UHI:
36380 case V8SI_FTYPE_PCINT_V8SI_UQI:
36381 case V4SI_FTYPE_PCINT_V4SI_UQI:
36382 case V8DI_FTYPE_PCINT64_V8DI_UQI:
36383 case V4DI_FTYPE_PCINT64_V4DI_UQI:
36384 case V2DI_FTYPE_PCINT64_V2DI_UQI:
36385 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
36386 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
36387 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
36388 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
36389 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
36390 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
36391 nargs = 3;
36392 klass = load;
36393 memory = 0;
36394 break;
36395 case VOID_FTYPE_UINT_UINT_UINT:
36396 case VOID_FTYPE_UINT64_UINT_UINT:
36397 case UCHAR_FTYPE_UINT_UINT_UINT:
36398 case UCHAR_FTYPE_UINT64_UINT_UINT:
36399 nargs = 3;
36400 klass = load;
36401 memory = ARRAY_SIZE (args);
36402 last_arg_constant = true;
36403 break;
36404 default:
36405 gcc_unreachable ();
36408 gcc_assert (nargs <= ARRAY_SIZE (args));
36410 if (klass == store)
36412 arg = CALL_EXPR_ARG (exp, 0);
36413 op = expand_normal (arg);
36414 gcc_assert (target == 0);
36415 if (memory)
36417 op = ix86_zero_extend_to_Pmode (op);
36418 target = gen_rtx_MEM (tmode, op);
36419 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
36420 on it. Try to improve it using get_pointer_alignment,
36421 and if the special builtin is one that requires strict
36422 mode alignment, also from its GET_MODE_ALIGNMENT.
36423 Failure to do so could lead to ix86_legitimate_combined_insn
36424 rejecting all changes to such insns. */
36425 unsigned int align = get_pointer_alignment (arg);
36426 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
36427 align = GET_MODE_ALIGNMENT (tmode);
36428 if (MEM_ALIGN (target) < align)
36429 set_mem_align (target, align);
36431 else
36432 target = force_reg (tmode, op);
36433 arg_adjust = 1;
36435 else
36437 arg_adjust = 0;
36438 if (optimize
36439 || target == 0
36440 || !register_operand (target, tmode)
36441 || GET_MODE (target) != tmode)
36442 target = gen_reg_rtx (tmode);
36445 for (i = 0; i < nargs; i++)
36447 machine_mode mode = insn_p->operand[i + 1].mode;
36448 bool match;
36450 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
36451 op = expand_normal (arg);
36452 match = insn_p->operand[i + 1].predicate (op, mode);
36454 if (last_arg_constant && (i + 1) == nargs)
36456 if (!match)
36458 if (icode == CODE_FOR_lwp_lwpvalsi3
36459 || icode == CODE_FOR_lwp_lwpinssi3
36460 || icode == CODE_FOR_lwp_lwpvaldi3
36461 || icode == CODE_FOR_lwp_lwpinsdi3)
36462 error ("the last argument must be a 32-bit immediate");
36463 else
36464 error ("the last argument must be an 8-bit immediate");
36465 return const0_rtx;
36468 else
36470 if (i == memory)
36472 /* This must be the memory operand. */
36473 op = ix86_zero_extend_to_Pmode (op);
36474 op = gen_rtx_MEM (mode, op);
36475 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
36476 on it. Try to improve it using get_pointer_alignment,
36477 and if the special builtin is one that requires strict
36478 mode alignment, also from its GET_MODE_ALIGNMENT.
36479 Failure to do so could lead to ix86_legitimate_combined_insn
36480 rejecting all changes to such insns. */
36481 unsigned int align = get_pointer_alignment (arg);
36482 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
36483 align = GET_MODE_ALIGNMENT (mode);
36484 if (MEM_ALIGN (op) < align)
36485 set_mem_align (op, align);
36487 else
36489 /* This must be a register. */
36490 if (VECTOR_MODE_P (mode))
36491 op = safe_vector_operand (op, mode);
36493 op = fixup_modeless_constant (op, mode);
36495 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36496 op = copy_to_mode_reg (mode, op);
36497 else
36499 op = copy_to_reg (op);
36500 op = lowpart_subreg (mode, op, GET_MODE (op));
36505 args[i].op = op;
36506 args[i].mode = mode;
36509 switch (nargs)
36511 case 0:
36512 pat = GEN_FCN (icode) (target);
36513 break;
36514 case 1:
36515 pat = GEN_FCN (icode) (target, args[0].op);
36516 break;
36517 case 2:
36518 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36519 break;
36520 case 3:
36521 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
36522 break;
36523 default:
36524 gcc_unreachable ();
36527 if (! pat)
36528 return 0;
36529 emit_insn (pat);
36530 return klass == store ? 0 : target;
36533 /* Return the integer constant in ARG. Constrain it to be in the range
36534 of the subparts of VEC_TYPE; issue an error if not. */
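/* For example, for a vector type with 8 subparts (say V8HI) the valid
   selector range is 0..7; anything outside that range trips the error
   below and 0 is returned as a safe fallback.  */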
36536 static int
36537 get_element_number (tree vec_type, tree arg)
36539 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
36541 if (!tree_fits_uhwi_p (arg)
36542 || (elt = tree_to_uhwi (arg), elt > max))
36544 error ("selector must be an integer constant in the range 0..%wi", max);
36545 return 0;
36548 return elt;
36551 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36552 ix86_expand_vector_init. We DO have language-level syntax for this, in
36553 the form of (type){ init-list }. Except that since we can't place emms
36554 instructions from inside the compiler, we can't allow the use of MMX
36555 registers unless the user explicitly asks for it. So we do *not* define
36556 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36557 we have builtins invoked by mmintrin.h that give us license to emit
36558 these sorts of instructions. */
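/* Rough usage sketch (assuming the usual mmintrin.h wrappers): a call
   such as __builtin_ia32_vec_init_v2si (a, b) behaves like the vector
   initializer (__v2si) { a, b }, expanded via ix86_expand_vector_init
   below rather than through generic vec_init patterns.  */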
36560 static rtx
36561 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36563 machine_mode tmode = TYPE_MODE (type);
36564 machine_mode inner_mode = GET_MODE_INNER (tmode);
36565 int i, n_elt = GET_MODE_NUNITS (tmode);
36566 rtvec v = rtvec_alloc (n_elt);
36568 gcc_assert (VECTOR_MODE_P (tmode));
36569 gcc_assert (call_expr_nargs (exp) == n_elt);
36571 for (i = 0; i < n_elt; ++i)
36573 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36574 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36577 if (!target || !register_operand (target, tmode))
36578 target = gen_reg_rtx (tmode);
36580 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36581 return target;
36584 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36585 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36586 had a language-level syntax for referencing vector elements. */
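/* Rough usage sketch: a call along the lines of
   __builtin_ia32_vec_ext_v4sf (v, 2) extracts element 2 of V, i.e.
   roughly what v[2] would mean with language-level vector indexing
   (the builtin spelling is the conventional one, not defined here).  */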
36588 static rtx
36589 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36591 machine_mode tmode, mode0;
36592 tree arg0, arg1;
36593 int elt;
36594 rtx op0;
36596 arg0 = CALL_EXPR_ARG (exp, 0);
36597 arg1 = CALL_EXPR_ARG (exp, 1);
36599 op0 = expand_normal (arg0);
36600 elt = get_element_number (TREE_TYPE (arg0), arg1);
36602 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36603 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36604 gcc_assert (VECTOR_MODE_P (mode0));
36606 op0 = force_reg (mode0, op0);
36608 if (optimize || !target || !register_operand (target, tmode))
36609 target = gen_reg_rtx (tmode);
36611 ix86_expand_vector_extract (true, target, op0, elt);
36613 return target;
36616 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36617 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36618 a language-level syntax for referencing vector elements. */
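/* Rough usage sketch: a call along the lines of
   __builtin_ia32_vec_set_v8hi (v, x, 2) yields a copy of V with
   element 2 replaced by X; the original operand is left untouched, as
   the comment inside the function notes.  */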
36620 static rtx
36621 ix86_expand_vec_set_builtin (tree exp)
36623 machine_mode tmode, mode1;
36624 tree arg0, arg1, arg2;
36625 int elt;
36626 rtx op0, op1, target;
36628 arg0 = CALL_EXPR_ARG (exp, 0);
36629 arg1 = CALL_EXPR_ARG (exp, 1);
36630 arg2 = CALL_EXPR_ARG (exp, 2);
36632 tmode = TYPE_MODE (TREE_TYPE (arg0));
36633 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36634 gcc_assert (VECTOR_MODE_P (tmode));
36636 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36637 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36638 elt = get_element_number (TREE_TYPE (arg0), arg2);
36640 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36641 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36643 op0 = force_reg (tmode, op0);
36644 op1 = force_reg (mode1, op1);
36646 /* OP0 is the source of these builtin functions and shouldn't be
36647 modified. Create a copy, use it and return it as target. */
36648 target = gen_reg_rtx (tmode);
36649 emit_move_insn (target, op0);
36650 ix86_expand_vector_set (true, target, op1, elt);
36652 return target;
36655 /* Emit conditional move of SRC to DST with condition
36656 OP1 CODE OP2. */
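/* In other words: if (OP1 CODE OP2) DST = SRC;  a cmov is used when
   TARGET_CMOVE, otherwise a compare-and-jump around a plain move.  */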
36657 static void
36658 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36660 rtx t;
36662 if (TARGET_CMOVE)
36664 t = ix86_expand_compare (code, op1, op2);
36665 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36666 src, dst)));
36668 else
36670 rtx_code_label *nomove = gen_label_rtx ();
36671 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36672 const0_rtx, GET_MODE (op1), 1, nomove);
36673 emit_move_insn (dst, src);
36674 emit_label (nomove);
36678 /* Choose the unsigned maximum of DST and SRC and store it in DST. */
36679 static void
36680 ix86_emit_move_max (rtx dst, rtx src)
36682 ix86_emit_cmove (dst, src, LTU, dst, src);
36685 /* Expand an expression EXP that calls a built-in function,
36686 with result going to TARGET if that's convenient
36687 (and in mode MODE if that's convenient).
36688 SUBTARGET may be used as the target for computing one of EXP's operands.
36689 IGNORE is nonzero if the value is to be ignored. */
36691 static rtx
36692 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36693 machine_mode mode, int ignore)
36695 size_t i;
36696 enum insn_code icode;
36697 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36698 tree arg0, arg1, arg2, arg3, arg4;
36699 rtx op0, op1, op2, op3, op4, pat, insn;
36700 machine_mode mode0, mode1, mode2, mode3, mode4;
36701 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36703 /* For CPU builtins that can be folded, fold first and expand the fold. */
36704 switch (fcode)
36706 case IX86_BUILTIN_CPU_INIT:
36708 /* Make it call __cpu_indicator_init in libgcc. */
36709 tree call_expr, fndecl, type;
36710 type = build_function_type_list (integer_type_node, NULL_TREE);
36711 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36712 call_expr = build_call_expr (fndecl, 0);
36713 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36715 case IX86_BUILTIN_CPU_IS:
36716 case IX86_BUILTIN_CPU_SUPPORTS:
36718 tree arg0 = CALL_EXPR_ARG (exp, 0);
36719 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36720 gcc_assert (fold_expr != NULL_TREE);
36721 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36725 /* Determine whether the builtin function is available under the current ISA.
36726 Originally the builtin was not created if it wasn't applicable to the
36727 current ISA based on the command line switches. With function specific
36728 options, we need to check in the context of the function making the call
36729 whether it is supported. Treat AVX512VL specially. For other flags,
36730 if isa includes more than one ISA bit, treat those as requiring any
36731 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
36732 ISAs. Similarly for 64BIT, but we shouldn't be building such builtins
36733 at all, -m64 is a whole TU option. */
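/* Illustrative reading of the check below: a builtin whose isa mask
   is, say, OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_AVX512BW needs
   both bits enabled, whereas a mask made only of non-AVX512VL bits is
   satisfied by any one of them being enabled.  */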
36734 if (((ix86_builtins_isa[fcode].isa
36735 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT))
36736 && !(ix86_builtins_isa[fcode].isa
36737 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT)
36738 & ix86_isa_flags))
36739 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
36740 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
36741 || (ix86_builtins_isa[fcode].isa2
36742 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
36744 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
36745 ix86_builtins_isa[fcode].isa2, 0, 0,
36746 NULL, NULL, (enum fpmath_unit) 0,
36747 false);
36748 if (!opts)
36749 error ("%qE needs unknown isa option", fndecl);
36750 else
36752 gcc_assert (opts != NULL);
36753 error ("%qE needs isa option %s", fndecl, opts);
36754 free (opts);
36756 return expand_call (exp, target, ignore);
36759 switch (fcode)
36761 case IX86_BUILTIN_BNDMK:
36762 if (!target
36763 || GET_MODE (target) != BNDmode
36764 || !register_operand (target, BNDmode))
36765 target = gen_reg_rtx (BNDmode);
36767 arg0 = CALL_EXPR_ARG (exp, 0);
36768 arg1 = CALL_EXPR_ARG (exp, 1);
36770 op0 = expand_normal (arg0);
36771 op1 = expand_normal (arg1);
36773 if (!register_operand (op0, Pmode))
36774 op0 = ix86_zero_extend_to_Pmode (op0);
36775 if (!register_operand (op1, Pmode))
36776 op1 = ix86_zero_extend_to_Pmode (op1);
36778 /* Builtin arg1 is the size of the block, but instruction op1 should
36779 be (size - 1). */
36780 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36781 NULL_RTX, 1, OPTAB_DIRECT);
36783 emit_insn (BNDmode == BND64mode
36784 ? gen_bnd64_mk (target, op0, op1)
36785 : gen_bnd32_mk (target, op0, op1));
36786 return target;
36788 case IX86_BUILTIN_BNDSTX:
36789 arg0 = CALL_EXPR_ARG (exp, 0);
36790 arg1 = CALL_EXPR_ARG (exp, 1);
36791 arg2 = CALL_EXPR_ARG (exp, 2);
36793 op0 = expand_normal (arg0);
36794 op1 = expand_normal (arg1);
36795 op2 = expand_normal (arg2);
36797 if (!register_operand (op0, Pmode))
36798 op0 = ix86_zero_extend_to_Pmode (op0);
36799 if (!register_operand (op1, BNDmode))
36800 op1 = copy_to_mode_reg (BNDmode, op1);
36801 if (!register_operand (op2, Pmode))
36802 op2 = ix86_zero_extend_to_Pmode (op2);
36804 emit_insn (BNDmode == BND64mode
36805 ? gen_bnd64_stx (op2, op0, op1)
36806 : gen_bnd32_stx (op2, op0, op1));
36807 return 0;
36809 case IX86_BUILTIN_BNDLDX:
36810 if (!target
36811 || GET_MODE (target) != BNDmode
36812 || !register_operand (target, BNDmode))
36813 target = gen_reg_rtx (BNDmode);
36815 arg0 = CALL_EXPR_ARG (exp, 0);
36816 arg1 = CALL_EXPR_ARG (exp, 1);
36818 op0 = expand_normal (arg0);
36819 op1 = expand_normal (arg1);
36821 if (!register_operand (op0, Pmode))
36822 op0 = ix86_zero_extend_to_Pmode (op0);
36823 if (!register_operand (op1, Pmode))
36824 op1 = ix86_zero_extend_to_Pmode (op1);
36826 emit_insn (BNDmode == BND64mode
36827 ? gen_bnd64_ldx (target, op0, op1)
36828 : gen_bnd32_ldx (target, op0, op1));
36829 return target;
36831 case IX86_BUILTIN_BNDCL:
36832 arg0 = CALL_EXPR_ARG (exp, 0);
36833 arg1 = CALL_EXPR_ARG (exp, 1);
36835 op0 = expand_normal (arg0);
36836 op1 = expand_normal (arg1);
36838 if (!register_operand (op0, Pmode))
36839 op0 = ix86_zero_extend_to_Pmode (op0);
36840 if (!register_operand (op1, BNDmode))
36841 op1 = copy_to_mode_reg (BNDmode, op1);
36843 emit_insn (BNDmode == BND64mode
36844 ? gen_bnd64_cl (op1, op0)
36845 : gen_bnd32_cl (op1, op0));
36846 return 0;
36848 case IX86_BUILTIN_BNDCU:
36849 arg0 = CALL_EXPR_ARG (exp, 0);
36850 arg1 = CALL_EXPR_ARG (exp, 1);
36852 op0 = expand_normal (arg0);
36853 op1 = expand_normal (arg1);
36855 if (!register_operand (op0, Pmode))
36856 op0 = ix86_zero_extend_to_Pmode (op0);
36857 if (!register_operand (op1, BNDmode))
36858 op1 = copy_to_mode_reg (BNDmode, op1);
36860 emit_insn (BNDmode == BND64mode
36861 ? gen_bnd64_cu (op1, op0)
36862 : gen_bnd32_cu (op1, op0));
36863 return 0;
36865 case IX86_BUILTIN_BNDRET:
36866 arg0 = CALL_EXPR_ARG (exp, 0);
36867 gcc_assert (TREE_CODE (arg0) == SSA_NAME);
36868 target = chkp_get_rtl_bounds (arg0);
36870 /* If no bounds were specified for the returned value,
36871 then use INIT bounds. This usually happens when
36872 some built-in function is expanded. */
36873 if (!target)
36875 rtx t1 = gen_reg_rtx (Pmode);
36876 rtx t2 = gen_reg_rtx (Pmode);
36877 target = gen_reg_rtx (BNDmode);
36878 emit_move_insn (t1, const0_rtx);
36879 emit_move_insn (t2, constm1_rtx);
36880 emit_insn (BNDmode == BND64mode
36881 ? gen_bnd64_mk (target, t1, t2)
36882 : gen_bnd32_mk (target, t1, t2));
36885 gcc_assert (target && REG_P (target));
36886 return target;
36888 case IX86_BUILTIN_BNDNARROW:
36890 rtx m1, m1h1, m1h2, lb, ub, t1;
36892 /* Return value and lb. */
36893 arg0 = CALL_EXPR_ARG (exp, 0);
36894 /* Bounds. */
36895 arg1 = CALL_EXPR_ARG (exp, 1);
36896 /* Size. */
36897 arg2 = CALL_EXPR_ARG (exp, 2);
36899 lb = expand_normal (arg0);
36900 op1 = expand_normal (arg1);
36901 op2 = expand_normal (arg2);
36903 /* The size was passed, but we need to use (size - 1), as for bndmk. */
36904 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36905 NULL_RTX, 1, OPTAB_DIRECT);
36907 /* Add LB to the size and invert to get UB. */
36908 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36909 op2, 1, OPTAB_DIRECT);
36910 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
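/* Illustrative arithmetic: with lb == 0x1000 and size == 0x10 this
   computes lb + (size - 1) == 0x100f and stores its one's complement,
   matching the complemented UB representation used by bndmk.  */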
36912 if (!register_operand (lb, Pmode))
36913 lb = ix86_zero_extend_to_Pmode (lb);
36914 if (!register_operand (ub, Pmode))
36915 ub = ix86_zero_extend_to_Pmode (ub);
36917 /* We need to move bounds to memory before any computations. */
36918 if (MEM_P (op1))
36919 m1 = op1;
36920 else
36922 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36923 emit_move_insn (m1, op1);
36926 /* Generate mem expression to be used for access to LB and UB. */
36927 m1h1 = adjust_address (m1, Pmode, 0);
36928 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36930 t1 = gen_reg_rtx (Pmode);
36932 /* Compute LB. */
36933 emit_move_insn (t1, m1h1);
36934 ix86_emit_move_max (t1, lb);
36935 emit_move_insn (m1h1, t1);
36937 /* Compute UB. UB is stored in 1's complement form. Therefore
36938 we also use max here. */
36939 emit_move_insn (t1, m1h2);
36940 ix86_emit_move_max (t1, ub);
36941 emit_move_insn (m1h2, t1);
36943 op2 = gen_reg_rtx (BNDmode);
36944 emit_move_insn (op2, m1);
36946 return chkp_join_splitted_slot (lb, op2);
36949 case IX86_BUILTIN_BNDINT:
36951 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36953 if (!target
36954 || GET_MODE (target) != BNDmode
36955 || !register_operand (target, BNDmode))
36956 target = gen_reg_rtx (BNDmode);
36958 arg0 = CALL_EXPR_ARG (exp, 0);
36959 arg1 = CALL_EXPR_ARG (exp, 1);
36961 op0 = expand_normal (arg0);
36962 op1 = expand_normal (arg1);
36964 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
36965 rh1 = adjust_address (res, Pmode, 0);
36966 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
36968 /* Put first bounds to temporaries. */
36969 lb1 = gen_reg_rtx (Pmode);
36970 ub1 = gen_reg_rtx (Pmode);
36971 if (MEM_P (op0))
36973 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
36974 emit_move_insn (ub1, adjust_address (op0, Pmode,
36975 GET_MODE_SIZE (Pmode)));
36977 else
36979 emit_move_insn (res, op0);
36980 emit_move_insn (lb1, rh1);
36981 emit_move_insn (ub1, rh2);
36984 /* Put second bounds to temporaries. */
36985 lb2 = gen_reg_rtx (Pmode);
36986 ub2 = gen_reg_rtx (Pmode);
36987 if (MEM_P (op1))
36989 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
36990 emit_move_insn (ub2, adjust_address (op1, Pmode,
36991 GET_MODE_SIZE (Pmode)));
36993 else
36995 emit_move_insn (res, op1);
36996 emit_move_insn (lb2, rh1);
36997 emit_move_insn (ub2, rh2);
37000 /* Compute LB. */
37001 ix86_emit_move_max (lb1, lb2);
37002 emit_move_insn (rh1, lb1);
37004 /* Compute UB. UB is stored in 1's complement form. Therefore
37005 we also use max here. */
37006 ix86_emit_move_max (ub1, ub2);
37007 emit_move_insn (rh2, ub1);
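/* Net effect: the intersection keeps the larger lower bound, and since
   the upper bounds are kept complemented, taking the unsigned max of
   the complements selects the smaller upper bound.  */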
37009 emit_move_insn (target, res);
37011 return target;
37014 case IX86_BUILTIN_SIZEOF:
37016 tree name;
37017 rtx symbol;
37019 if (!target
37020 || GET_MODE (target) != Pmode
37021 || !register_operand (target, Pmode))
37022 target = gen_reg_rtx (Pmode);
37024 arg0 = CALL_EXPR_ARG (exp, 0);
37025 gcc_assert (VAR_P (arg0));
37027 name = DECL_ASSEMBLER_NAME (arg0);
37028 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
37030 emit_insn (Pmode == SImode
37031 ? gen_move_size_reloc_si (target, symbol)
37032 : gen_move_size_reloc_di (target, symbol));
37034 return target;
37037 case IX86_BUILTIN_BNDLOWER:
37039 rtx mem, hmem;
37041 if (!target
37042 || GET_MODE (target) != Pmode
37043 || !register_operand (target, Pmode))
37044 target = gen_reg_rtx (Pmode);
37046 arg0 = CALL_EXPR_ARG (exp, 0);
37047 op0 = expand_normal (arg0);
37049 /* We need to move bounds to memory first. */
37050 if (MEM_P (op0))
37051 mem = op0;
37052 else
37054 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37055 emit_move_insn (mem, op0);
37058 /* Generate mem expression to access LB and load it. */
37059 hmem = adjust_address (mem, Pmode, 0);
37060 emit_move_insn (target, hmem);
37062 return target;
37065 case IX86_BUILTIN_BNDUPPER:
37067 rtx mem, hmem, res;
37069 if (!target
37070 || GET_MODE (target) != Pmode
37071 || !register_operand (target, Pmode))
37072 target = gen_reg_rtx (Pmode);
37074 arg0 = CALL_EXPR_ARG (exp, 0);
37075 op0 = expand_normal (arg0);
37077 /* We need to move bounds to memory first. */
37078 if (MEM_P (op0))
37079 mem = op0;
37080 else
37082 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37083 emit_move_insn (mem, op0);
37086 /* Generate mem expression to access UB. */
37087 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
37089 /* We need to invert all bits of UB. */
37090 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
37092 if (res != target)
37093 emit_move_insn (target, res);
37095 return target;
37098 case IX86_BUILTIN_MASKMOVQ:
37099 case IX86_BUILTIN_MASKMOVDQU:
37100 icode = (fcode == IX86_BUILTIN_MASKMOVQ
37101 ? CODE_FOR_mmx_maskmovq
37102 : CODE_FOR_sse2_maskmovdqu);
37103 /* Note the arg order is different from the operand order. */
37104 arg1 = CALL_EXPR_ARG (exp, 0);
37105 arg2 = CALL_EXPR_ARG (exp, 1);
37106 arg0 = CALL_EXPR_ARG (exp, 2);
37107 op0 = expand_normal (arg0);
37108 op1 = expand_normal (arg1);
37109 op2 = expand_normal (arg2);
37110 mode0 = insn_data[icode].operand[0].mode;
37111 mode1 = insn_data[icode].operand[1].mode;
37112 mode2 = insn_data[icode].operand[2].mode;
37114 op0 = ix86_zero_extend_to_Pmode (op0);
37115 op0 = gen_rtx_MEM (mode1, op0);
37117 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37118 op0 = copy_to_mode_reg (mode0, op0);
37119 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37120 op1 = copy_to_mode_reg (mode1, op1);
37121 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37122 op2 = copy_to_mode_reg (mode2, op2);
37123 pat = GEN_FCN (icode) (op0, op1, op2);
37124 if (! pat)
37125 return 0;
37126 emit_insn (pat);
37127 return 0;
37129 case IX86_BUILTIN_LDMXCSR:
37130 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
37131 target = assign_386_stack_local (SImode, SLOT_TEMP);
37132 emit_move_insn (target, op0);
37133 emit_insn (gen_sse_ldmxcsr (target));
37134 return 0;
37136 case IX86_BUILTIN_STMXCSR:
37137 target = assign_386_stack_local (SImode, SLOT_TEMP);
37138 emit_insn (gen_sse_stmxcsr (target));
37139 return copy_to_mode_reg (SImode, target);
37141 case IX86_BUILTIN_CLFLUSH:
37142 arg0 = CALL_EXPR_ARG (exp, 0);
37143 op0 = expand_normal (arg0);
37144 icode = CODE_FOR_sse2_clflush;
37145 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37146 op0 = ix86_zero_extend_to_Pmode (op0);
37148 emit_insn (gen_sse2_clflush (op0));
37149 return 0;
37151 case IX86_BUILTIN_CLWB:
37152 arg0 = CALL_EXPR_ARG (exp, 0);
37153 op0 = expand_normal (arg0);
37154 icode = CODE_FOR_clwb;
37155 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37156 op0 = ix86_zero_extend_to_Pmode (op0);
37158 emit_insn (gen_clwb (op0));
37159 return 0;
37161 case IX86_BUILTIN_CLFLUSHOPT:
37162 arg0 = CALL_EXPR_ARG (exp, 0);
37163 op0 = expand_normal (arg0);
37164 icode = CODE_FOR_clflushopt;
37165 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37166 op0 = ix86_zero_extend_to_Pmode (op0);
37168 emit_insn (gen_clflushopt (op0));
37169 return 0;
37171 case IX86_BUILTIN_MONITOR:
37172 case IX86_BUILTIN_MONITORX:
37173 arg0 = CALL_EXPR_ARG (exp, 0);
37174 arg1 = CALL_EXPR_ARG (exp, 1);
37175 arg2 = CALL_EXPR_ARG (exp, 2);
37176 op0 = expand_normal (arg0);
37177 op1 = expand_normal (arg1);
37178 op2 = expand_normal (arg2);
37179 if (!REG_P (op0))
37180 op0 = ix86_zero_extend_to_Pmode (op0);
37181 if (!REG_P (op1))
37182 op1 = copy_to_mode_reg (SImode, op1);
37183 if (!REG_P (op2))
37184 op2 = copy_to_mode_reg (SImode, op2);
37186 emit_insn (fcode == IX86_BUILTIN_MONITOR
37187 ? ix86_gen_monitor (op0, op1, op2)
37188 : ix86_gen_monitorx (op0, op1, op2));
37189 return 0;
37191 case IX86_BUILTIN_MWAIT:
37192 arg0 = CALL_EXPR_ARG (exp, 0);
37193 arg1 = CALL_EXPR_ARG (exp, 1);
37194 op0 = expand_normal (arg0);
37195 op1 = expand_normal (arg1);
37196 if (!REG_P (op0))
37197 op0 = copy_to_mode_reg (SImode, op0);
37198 if (!REG_P (op1))
37199 op1 = copy_to_mode_reg (SImode, op1);
37200 emit_insn (gen_sse3_mwait (op0, op1));
37201 return 0;
37203 case IX86_BUILTIN_MWAITX:
37204 arg0 = CALL_EXPR_ARG (exp, 0);
37205 arg1 = CALL_EXPR_ARG (exp, 1);
37206 arg2 = CALL_EXPR_ARG (exp, 2);
37207 op0 = expand_normal (arg0);
37208 op1 = expand_normal (arg1);
37209 op2 = expand_normal (arg2);
37210 if (!REG_P (op0))
37211 op0 = copy_to_mode_reg (SImode, op0);
37212 if (!REG_P (op1))
37213 op1 = copy_to_mode_reg (SImode, op1);
37214 if (!REG_P (op2))
37215 op2 = copy_to_mode_reg (SImode, op2);
37216 emit_insn (gen_mwaitx (op0, op1, op2));
37217 return 0;
37219 case IX86_BUILTIN_CLZERO:
37220 arg0 = CALL_EXPR_ARG (exp, 0);
37221 op0 = expand_normal (arg0);
37222 if (!REG_P (op0))
37223 op0 = ix86_zero_extend_to_Pmode (op0);
37224 emit_insn (ix86_gen_clzero (op0));
37225 return 0;
37227 case IX86_BUILTIN_VEC_INIT_V2SI:
37228 case IX86_BUILTIN_VEC_INIT_V4HI:
37229 case IX86_BUILTIN_VEC_INIT_V8QI:
37230 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
37232 case IX86_BUILTIN_VEC_EXT_V2DF:
37233 case IX86_BUILTIN_VEC_EXT_V2DI:
37234 case IX86_BUILTIN_VEC_EXT_V4SF:
37235 case IX86_BUILTIN_VEC_EXT_V4SI:
37236 case IX86_BUILTIN_VEC_EXT_V8HI:
37237 case IX86_BUILTIN_VEC_EXT_V2SI:
37238 case IX86_BUILTIN_VEC_EXT_V4HI:
37239 case IX86_BUILTIN_VEC_EXT_V16QI:
37240 return ix86_expand_vec_ext_builtin (exp, target);
37242 case IX86_BUILTIN_VEC_SET_V2DI:
37243 case IX86_BUILTIN_VEC_SET_V4SF:
37244 case IX86_BUILTIN_VEC_SET_V4SI:
37245 case IX86_BUILTIN_VEC_SET_V8HI:
37246 case IX86_BUILTIN_VEC_SET_V4HI:
37247 case IX86_BUILTIN_VEC_SET_V16QI:
37248 return ix86_expand_vec_set_builtin (exp);
37250 case IX86_BUILTIN_NANQ:
37251 case IX86_BUILTIN_NANSQ:
37252 return expand_call (exp, target, ignore);
37254 case IX86_BUILTIN_RDPMC:
37255 case IX86_BUILTIN_RDTSC:
37256 case IX86_BUILTIN_RDTSCP:
37258 op0 = gen_reg_rtx (DImode);
37259 op1 = gen_reg_rtx (DImode);
37261 if (fcode == IX86_BUILTIN_RDPMC)
37263 arg0 = CALL_EXPR_ARG (exp, 0);
37264 op2 = expand_normal (arg0);
37265 if (!register_operand (op2, SImode))
37266 op2 = copy_to_mode_reg (SImode, op2);
37268 insn = (TARGET_64BIT
37269 ? gen_rdpmc_rex64 (op0, op1, op2)
37270 : gen_rdpmc (op0, op2));
37271 emit_insn (insn);
37273 else if (fcode == IX86_BUILTIN_RDTSC)
37275 insn = (TARGET_64BIT
37276 ? gen_rdtsc_rex64 (op0, op1)
37277 : gen_rdtsc (op0));
37278 emit_insn (insn);
37280 else
37282 op2 = gen_reg_rtx (SImode);
37284 insn = (TARGET_64BIT
37285 ? gen_rdtscp_rex64 (op0, op1, op2)
37286 : gen_rdtscp (op0, op2));
37287 emit_insn (insn);
37289 arg0 = CALL_EXPR_ARG (exp, 0);
37290 op4 = expand_normal (arg0);
37291 if (!address_operand (op4, VOIDmode))
37293 op4 = convert_memory_address (Pmode, op4);
37294 op4 = copy_addr_to_reg (op4);
37296 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
37299 if (target == 0)
37301 /* mode is VOIDmode if __builtin_rd* has been called
37302 without lhs. */
37303 if (mode == VOIDmode)
37304 return target;
37305 target = gen_reg_rtx (mode);
37308 if (TARGET_64BIT)
37310 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
37311 op1, 1, OPTAB_DIRECT);
37312 op0 = expand_simple_binop (DImode, IOR, op0, op1,
37313 op0, 1, OPTAB_DIRECT);
37316 emit_move_insn (target, op0);
37317 return target;
37319 case IX86_BUILTIN_FXSAVE:
37320 case IX86_BUILTIN_FXRSTOR:
37321 case IX86_BUILTIN_FXSAVE64:
37322 case IX86_BUILTIN_FXRSTOR64:
37323 case IX86_BUILTIN_FNSTENV:
37324 case IX86_BUILTIN_FLDENV:
37325 mode0 = BLKmode;
37326 switch (fcode)
37328 case IX86_BUILTIN_FXSAVE:
37329 icode = CODE_FOR_fxsave;
37330 break;
37331 case IX86_BUILTIN_FXRSTOR:
37332 icode = CODE_FOR_fxrstor;
37333 break;
37334 case IX86_BUILTIN_FXSAVE64:
37335 icode = CODE_FOR_fxsave64;
37336 break;
37337 case IX86_BUILTIN_FXRSTOR64:
37338 icode = CODE_FOR_fxrstor64;
37339 break;
37340 case IX86_BUILTIN_FNSTENV:
37341 icode = CODE_FOR_fnstenv;
37342 break;
37343 case IX86_BUILTIN_FLDENV:
37344 icode = CODE_FOR_fldenv;
37345 break;
37346 default:
37347 gcc_unreachable ();
37350 arg0 = CALL_EXPR_ARG (exp, 0);
37351 op0 = expand_normal (arg0);
37353 if (!address_operand (op0, VOIDmode))
37355 op0 = convert_memory_address (Pmode, op0);
37356 op0 = copy_addr_to_reg (op0);
37358 op0 = gen_rtx_MEM (mode0, op0);
37360 pat = GEN_FCN (icode) (op0);
37361 if (pat)
37362 emit_insn (pat);
37363 return 0;
37365 case IX86_BUILTIN_XSAVE:
37366 case IX86_BUILTIN_XRSTOR:
37367 case IX86_BUILTIN_XSAVE64:
37368 case IX86_BUILTIN_XRSTOR64:
37369 case IX86_BUILTIN_XSAVEOPT:
37370 case IX86_BUILTIN_XSAVEOPT64:
37371 case IX86_BUILTIN_XSAVES:
37372 case IX86_BUILTIN_XRSTORS:
37373 case IX86_BUILTIN_XSAVES64:
37374 case IX86_BUILTIN_XRSTORS64:
37375 case IX86_BUILTIN_XSAVEC:
37376 case IX86_BUILTIN_XSAVEC64:
37377 arg0 = CALL_EXPR_ARG (exp, 0);
37378 arg1 = CALL_EXPR_ARG (exp, 1);
37379 op0 = expand_normal (arg0);
37380 op1 = expand_normal (arg1);
37382 if (!address_operand (op0, VOIDmode))
37384 op0 = convert_memory_address (Pmode, op0);
37385 op0 = copy_addr_to_reg (op0);
37387 op0 = gen_rtx_MEM (BLKmode, op0);
37389 op1 = force_reg (DImode, op1);
37391 if (TARGET_64BIT)
37393 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37394 NULL, 1, OPTAB_DIRECT);
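/* Note (assuming the usual EDX:EAX convention for these instructions):
   the xsave/xrstor family consumes the requested-feature bitmap as a
   register pair, so the DImode mask is split into low and high SImode
   halves before being handed to the pattern below.  */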
37395 switch (fcode)
37397 case IX86_BUILTIN_XSAVE:
37398 icode = CODE_FOR_xsave_rex64;
37399 break;
37400 case IX86_BUILTIN_XRSTOR:
37401 icode = CODE_FOR_xrstor_rex64;
37402 break;
37403 case IX86_BUILTIN_XSAVE64:
37404 icode = CODE_FOR_xsave64;
37405 break;
37406 case IX86_BUILTIN_XRSTOR64:
37407 icode = CODE_FOR_xrstor64;
37408 break;
37409 case IX86_BUILTIN_XSAVEOPT:
37410 icode = CODE_FOR_xsaveopt_rex64;
37411 break;
37412 case IX86_BUILTIN_XSAVEOPT64:
37413 icode = CODE_FOR_xsaveopt64;
37414 break;
37415 case IX86_BUILTIN_XSAVES:
37416 icode = CODE_FOR_xsaves_rex64;
37417 break;
37418 case IX86_BUILTIN_XRSTORS:
37419 icode = CODE_FOR_xrstors_rex64;
37420 break;
37421 case IX86_BUILTIN_XSAVES64:
37422 icode = CODE_FOR_xsaves64;
37423 break;
37424 case IX86_BUILTIN_XRSTORS64:
37425 icode = CODE_FOR_xrstors64;
37426 break;
37427 case IX86_BUILTIN_XSAVEC:
37428 icode = CODE_FOR_xsavec_rex64;
37429 break;
37430 case IX86_BUILTIN_XSAVEC64:
37431 icode = CODE_FOR_xsavec64;
37432 break;
37433 default:
37434 gcc_unreachable ();
37437 op2 = gen_lowpart (SImode, op2);
37438 op1 = gen_lowpart (SImode, op1);
37439 pat = GEN_FCN (icode) (op0, op1, op2);
37441 else
37443 switch (fcode)
37445 case IX86_BUILTIN_XSAVE:
37446 icode = CODE_FOR_xsave;
37447 break;
37448 case IX86_BUILTIN_XRSTOR:
37449 icode = CODE_FOR_xrstor;
37450 break;
37451 case IX86_BUILTIN_XSAVEOPT:
37452 icode = CODE_FOR_xsaveopt;
37453 break;
37454 case IX86_BUILTIN_XSAVES:
37455 icode = CODE_FOR_xsaves;
37456 break;
37457 case IX86_BUILTIN_XRSTORS:
37458 icode = CODE_FOR_xrstors;
37459 break;
37460 case IX86_BUILTIN_XSAVEC:
37461 icode = CODE_FOR_xsavec;
37462 break;
37463 default:
37464 gcc_unreachable ();
37466 pat = GEN_FCN (icode) (op0, op1);
37469 if (pat)
37470 emit_insn (pat);
37471 return 0;
37473 case IX86_BUILTIN_LLWPCB:
37474 arg0 = CALL_EXPR_ARG (exp, 0);
37475 op0 = expand_normal (arg0);
37476 icode = CODE_FOR_lwp_llwpcb;
37477 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37478 op0 = ix86_zero_extend_to_Pmode (op0);
37479 emit_insn (gen_lwp_llwpcb (op0));
37480 return 0;
37482 case IX86_BUILTIN_SLWPCB:
37483 icode = CODE_FOR_lwp_slwpcb;
37484 if (!target
37485 || !insn_data[icode].operand[0].predicate (target, Pmode))
37486 target = gen_reg_rtx (Pmode);
37487 emit_insn (gen_lwp_slwpcb (target));
37488 return target;
37490 case IX86_BUILTIN_BEXTRI32:
37491 case IX86_BUILTIN_BEXTRI64:
37492 arg0 = CALL_EXPR_ARG (exp, 0);
37493 arg1 = CALL_EXPR_ARG (exp, 1);
37494 op0 = expand_normal (arg0);
37495 op1 = expand_normal (arg1);
37496 icode = (fcode == IX86_BUILTIN_BEXTRI32
37497 ? CODE_FOR_tbm_bextri_si
37498 : CODE_FOR_tbm_bextri_di);
37499 if (!CONST_INT_P (op1))
37501 error ("last argument must be an immediate");
37502 return const0_rtx;
37504 else
37506 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
37507 unsigned char lsb_index = INTVAL (op1) & 0xFF;
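/* The controlling immediate packs the bit-field length in bits 15:8 and
   the starting bit position in bits 7:0; the tbm_bextri patterns take
   them as two separate operands.  */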
37508 op1 = GEN_INT (length);
37509 op2 = GEN_INT (lsb_index);
37510 pat = GEN_FCN (icode) (target, op0, op1, op2);
37511 if (pat)
37512 emit_insn (pat);
37513 return target;
37516 case IX86_BUILTIN_RDRAND16_STEP:
37517 icode = CODE_FOR_rdrandhi_1;
37518 mode0 = HImode;
37519 goto rdrand_step;
37521 case IX86_BUILTIN_RDRAND32_STEP:
37522 icode = CODE_FOR_rdrandsi_1;
37523 mode0 = SImode;
37524 goto rdrand_step;
37526 case IX86_BUILTIN_RDRAND64_STEP:
37527 icode = CODE_FOR_rdranddi_1;
37528 mode0 = DImode;
37530 rdrand_step:
37531 op0 = gen_reg_rtx (mode0);
37532 emit_insn (GEN_FCN (icode) (op0));
37534 arg0 = CALL_EXPR_ARG (exp, 0);
37535 op1 = expand_normal (arg0);
37536 if (!address_operand (op1, VOIDmode))
37538 op1 = convert_memory_address (Pmode, op1);
37539 op1 = copy_addr_to_reg (op1);
37541 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
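/* The builtin returns nonzero iff a random value was delivered.  CF is
   set on success, so the conditional move emitted below selects the
   constant 1 when CF is set and otherwise falls back to the zero-extended
   result, which is zero when the instruction did not deliver a value.  */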
37543 op1 = gen_reg_rtx (SImode);
37544 emit_move_insn (op1, CONST1_RTX (SImode));
37546 /* Emit SImode conditional move. */
37547 if (mode0 == HImode)
37549 op2 = gen_reg_rtx (SImode);
37550 emit_insn (gen_zero_extendhisi2 (op2, op0));
37552 else if (mode0 == SImode)
37553 op2 = op0;
37554 else
37555 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37557 if (target == 0
37558 || !register_operand (target, SImode))
37559 target = gen_reg_rtx (SImode);
37561 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37562 const0_rtx);
37563 emit_insn (gen_rtx_SET (target,
37564 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37565 return target;
37567 case IX86_BUILTIN_RDSEED16_STEP:
37568 icode = CODE_FOR_rdseedhi_1;
37569 mode0 = HImode;
37570 goto rdseed_step;
37572 case IX86_BUILTIN_RDSEED32_STEP:
37573 icode = CODE_FOR_rdseedsi_1;
37574 mode0 = SImode;
37575 goto rdseed_step;
37577 case IX86_BUILTIN_RDSEED64_STEP:
37578 icode = CODE_FOR_rdseeddi_1;
37579 mode0 = DImode;
37581 rdseed_step:
37582 op0 = gen_reg_rtx (mode0);
37583 emit_insn (GEN_FCN (icode) (op0));
37585 arg0 = CALL_EXPR_ARG (exp, 0);
37586 op1 = expand_normal (arg0);
37587 if (!address_operand (op1, VOIDmode))
37589 op1 = convert_memory_address (Pmode, op1);
37590 op1 = copy_addr_to_reg (op1);
37592 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37594 op2 = gen_reg_rtx (QImode);
37596 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37597 const0_rtx);
37598 emit_insn (gen_rtx_SET (op2, pat));
37600 if (target == 0
37601 || !register_operand (target, SImode))
37602 target = gen_reg_rtx (SImode);
37604 emit_insn (gen_zero_extendqisi2 (target, op2));
37605 return target;
37607 case IX86_BUILTIN_SBB32:
37608 icode = CODE_FOR_subborrowsi;
37609 mode0 = SImode;
37610 goto handlecarry;
37612 case IX86_BUILTIN_SBB64:
37613 icode = CODE_FOR_subborrowdi;
37614 mode0 = DImode;
37615 goto handlecarry;
37617 case IX86_BUILTIN_ADDCARRYX32:
37618 icode = CODE_FOR_addcarrysi;
37619 mode0 = SImode;
37620 goto handlecarry;
37622 case IX86_BUILTIN_ADDCARRYX64:
37623 icode = CODE_FOR_addcarrydi;
37624 mode0 = DImode;
37626 handlecarry:
37627 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37628 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37629 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37630 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
37632 op1 = expand_normal (arg0);
37633 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37635 op2 = expand_normal (arg1);
37636 if (!register_operand (op2, mode0))
37637 op2 = copy_to_mode_reg (mode0, op2);
37639 op3 = expand_normal (arg2);
37640 if (!register_operand (op3, mode0))
37641 op3 = copy_to_mode_reg (mode0, op3);
37643 op4 = expand_normal (arg3);
37644 if (!address_operand (op4, VOIDmode))
37646 op4 = convert_memory_address (Pmode, op4);
37647 op4 = copy_addr_to_reg (op4);
37650 /* Generate CF from input operand. */
37651 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
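/* addqi3_cconly_overflow adds 0xff to the byte in OP1 and only sets the
   flags; the addition carries exactly when the incoming carry argument
   is nonzero, so this reproduces it in CF for the pattern below.  */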
37653 /* Generate instruction that consumes CF. */
37654 op0 = gen_reg_rtx (mode0);
37656 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37657 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
37658 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
37660 /* Return current CF value. */
37661 if (target == 0)
37662 target = gen_reg_rtx (QImode);
37664 PUT_MODE (pat, QImode);
37665 emit_insn (gen_rtx_SET (target, pat));
37667 /* Store the result. */
37668 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37670 return target;
37672 case IX86_BUILTIN_READ_FLAGS:
37673 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37675 if (optimize
37676 || target == NULL_RTX
37677 || !nonimmediate_operand (target, word_mode)
37678 || GET_MODE (target) != word_mode)
37679 target = gen_reg_rtx (word_mode);
37681 emit_insn (gen_pop (target));
37682 return target;
37684 case IX86_BUILTIN_WRITE_FLAGS:
37686 arg0 = CALL_EXPR_ARG (exp, 0);
37687 op0 = expand_normal (arg0);
37688 if (!general_no_elim_operand (op0, word_mode))
37689 op0 = copy_to_mode_reg (word_mode, op0);
37691 emit_insn (gen_push (op0));
37692 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37693 return 0;
37695 case IX86_BUILTIN_KTESTC8:
37696 icode = CODE_FOR_ktestqi;
37697 mode0 = QImode;
37698 mode1 = CCCmode;
37699 goto kortest;
37701 case IX86_BUILTIN_KTESTZ8:
37702 icode = CODE_FOR_ktestqi;
37703 mode0 = QImode;
37704 mode1 = CCZmode;
37705 goto kortest;
37707 case IX86_BUILTIN_KTESTC16:
37708 icode = CODE_FOR_ktesthi;
37709 mode0 = HImode;
37710 mode1 = CCCmode;
37711 goto kortest;
37713 case IX86_BUILTIN_KTESTZ16:
37714 icode = CODE_FOR_ktesthi;
37715 mode0 = HImode;
37716 mode1 = CCZmode;
37717 goto kortest;
37719 case IX86_BUILTIN_KTESTC32:
37720 icode = CODE_FOR_ktestsi;
37721 mode0 = SImode;
37722 mode1 = CCCmode;
37723 goto kortest;
37725 case IX86_BUILTIN_KTESTZ32:
37726 icode = CODE_FOR_ktestsi;
37727 mode0 = SImode;
37728 mode1 = CCZmode;
37729 goto kortest;
37731 case IX86_BUILTIN_KTESTC64:
37732 icode = CODE_FOR_ktestdi;
37733 mode0 = DImode;
37734 mode1 = CCCmode;
37735 goto kortest;
37737 case IX86_BUILTIN_KTESTZ64:
37738 icode = CODE_FOR_ktestdi;
37739 mode0 = DImode;
37740 mode1 = CCZmode;
37741 goto kortest;
37743 case IX86_BUILTIN_KORTESTC8:
37744 icode = CODE_FOR_kortestqi;
37745 mode0 = QImode;
37746 mode1 = CCCmode;
37747 goto kortest;
37749 case IX86_BUILTIN_KORTESTZ8:
37750 icode = CODE_FOR_kortestqi;
37751 mode0 = QImode;
37752 mode1 = CCZmode;
37753 goto kortest;
37755 case IX86_BUILTIN_KORTESTC16:
37756 icode = CODE_FOR_kortesthi;
37757 mode0 = HImode;
37758 mode1 = CCCmode;
37759 goto kortest;
37761 case IX86_BUILTIN_KORTESTZ16:
37762 icode = CODE_FOR_kortesthi;
37763 mode0 = HImode;
37764 mode1 = CCZmode;
37765 goto kortest;
37767 case IX86_BUILTIN_KORTESTC32:
37768 icode = CODE_FOR_kortestsi;
37769 mode0 = SImode;
37770 mode1 = CCCmode;
37771 goto kortest;
37773 case IX86_BUILTIN_KORTESTZ32:
37774 icode = CODE_FOR_kortestsi;
37775 mode0 = SImode;
37776 mode1 = CCZmode;
37777 goto kortest;
37779 case IX86_BUILTIN_KORTESTC64:
37780 icode = CODE_FOR_kortestdi;
37781 mode0 = DImode;
37782 mode1 = CCCmode;
37783 goto kortest;
37785 case IX86_BUILTIN_KORTESTZ64:
37786 icode = CODE_FOR_kortestdi;
37787 mode0 = DImode;
37788 mode1 = CCZmode;
37790 kortest:
37791 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37792 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37793 op0 = expand_normal (arg0);
37794 op1 = expand_normal (arg1);
37796 op0 = copy_to_reg (op0);
37797 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37798 op1 = copy_to_reg (op1);
37799 op1 = lowpart_subreg (mode0, op1, GET_MODE (op1));
37801 target = gen_reg_rtx (QImode);
37802 emit_insn (gen_rtx_SET (target, const0_rtx));
37804 /* Emit kortest. */
37805 emit_insn (GEN_FCN (icode) (op0, op1));
37806 /* And use setcc to return result from flags. */
37807 ix86_expand_setcc (target, EQ,
37808 gen_rtx_REG (mode1, FLAGS_REG), const0_rtx);
37809 return target;
37811 case IX86_BUILTIN_GATHERSIV2DF:
37812 icode = CODE_FOR_avx2_gathersiv2df;
37813 goto gather_gen;
37814 case IX86_BUILTIN_GATHERSIV4DF:
37815 icode = CODE_FOR_avx2_gathersiv4df;
37816 goto gather_gen;
37817 case IX86_BUILTIN_GATHERDIV2DF:
37818 icode = CODE_FOR_avx2_gatherdiv2df;
37819 goto gather_gen;
37820 case IX86_BUILTIN_GATHERDIV4DF:
37821 icode = CODE_FOR_avx2_gatherdiv4df;
37822 goto gather_gen;
37823 case IX86_BUILTIN_GATHERSIV4SF:
37824 icode = CODE_FOR_avx2_gathersiv4sf;
37825 goto gather_gen;
37826 case IX86_BUILTIN_GATHERSIV8SF:
37827 icode = CODE_FOR_avx2_gathersiv8sf;
37828 goto gather_gen;
37829 case IX86_BUILTIN_GATHERDIV4SF:
37830 icode = CODE_FOR_avx2_gatherdiv4sf;
37831 goto gather_gen;
37832 case IX86_BUILTIN_GATHERDIV8SF:
37833 icode = CODE_FOR_avx2_gatherdiv8sf;
37834 goto gather_gen;
37835 case IX86_BUILTIN_GATHERSIV2DI:
37836 icode = CODE_FOR_avx2_gathersiv2di;
37837 goto gather_gen;
37838 case IX86_BUILTIN_GATHERSIV4DI:
37839 icode = CODE_FOR_avx2_gathersiv4di;
37840 goto gather_gen;
37841 case IX86_BUILTIN_GATHERDIV2DI:
37842 icode = CODE_FOR_avx2_gatherdiv2di;
37843 goto gather_gen;
37844 case IX86_BUILTIN_GATHERDIV4DI:
37845 icode = CODE_FOR_avx2_gatherdiv4di;
37846 goto gather_gen;
37847 case IX86_BUILTIN_GATHERSIV4SI:
37848 icode = CODE_FOR_avx2_gathersiv4si;
37849 goto gather_gen;
37850 case IX86_BUILTIN_GATHERSIV8SI:
37851 icode = CODE_FOR_avx2_gathersiv8si;
37852 goto gather_gen;
37853 case IX86_BUILTIN_GATHERDIV4SI:
37854 icode = CODE_FOR_avx2_gatherdiv4si;
37855 goto gather_gen;
37856 case IX86_BUILTIN_GATHERDIV8SI:
37857 icode = CODE_FOR_avx2_gatherdiv8si;
37858 goto gather_gen;
37859 case IX86_BUILTIN_GATHERALTSIV4DF:
37860 icode = CODE_FOR_avx2_gathersiv4df;
37861 goto gather_gen;
37862 case IX86_BUILTIN_GATHERALTDIV8SF:
37863 icode = CODE_FOR_avx2_gatherdiv8sf;
37864 goto gather_gen;
37865 case IX86_BUILTIN_GATHERALTSIV4DI:
37866 icode = CODE_FOR_avx2_gathersiv4di;
37867 goto gather_gen;
37868 case IX86_BUILTIN_GATHERALTDIV8SI:
37869 icode = CODE_FOR_avx2_gatherdiv8si;
37870 goto gather_gen;
37871 case IX86_BUILTIN_GATHER3SIV16SF:
37872 icode = CODE_FOR_avx512f_gathersiv16sf;
37873 goto gather_gen;
37874 case IX86_BUILTIN_GATHER3SIV8DF:
37875 icode = CODE_FOR_avx512f_gathersiv8df;
37876 goto gather_gen;
37877 case IX86_BUILTIN_GATHER3DIV16SF:
37878 icode = CODE_FOR_avx512f_gatherdiv16sf;
37879 goto gather_gen;
37880 case IX86_BUILTIN_GATHER3DIV8DF:
37881 icode = CODE_FOR_avx512f_gatherdiv8df;
37882 goto gather_gen;
37883 case IX86_BUILTIN_GATHER3SIV16SI:
37884 icode = CODE_FOR_avx512f_gathersiv16si;
37885 goto gather_gen;
37886 case IX86_BUILTIN_GATHER3SIV8DI:
37887 icode = CODE_FOR_avx512f_gathersiv8di;
37888 goto gather_gen;
37889 case IX86_BUILTIN_GATHER3DIV16SI:
37890 icode = CODE_FOR_avx512f_gatherdiv16si;
37891 goto gather_gen;
37892 case IX86_BUILTIN_GATHER3DIV8DI:
37893 icode = CODE_FOR_avx512f_gatherdiv8di;
37894 goto gather_gen;
37895 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37896 icode = CODE_FOR_avx512f_gathersiv8df;
37897 goto gather_gen;
37898 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37899 icode = CODE_FOR_avx512f_gatherdiv16sf;
37900 goto gather_gen;
37901 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37902 icode = CODE_FOR_avx512f_gathersiv8di;
37903 goto gather_gen;
37904 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37905 icode = CODE_FOR_avx512f_gatherdiv16si;
37906 goto gather_gen;
37907 case IX86_BUILTIN_GATHER3SIV2DF:
37908 icode = CODE_FOR_avx512vl_gathersiv2df;
37909 goto gather_gen;
37910 case IX86_BUILTIN_GATHER3SIV4DF:
37911 icode = CODE_FOR_avx512vl_gathersiv4df;
37912 goto gather_gen;
37913 case IX86_BUILTIN_GATHER3DIV2DF:
37914 icode = CODE_FOR_avx512vl_gatherdiv2df;
37915 goto gather_gen;
37916 case IX86_BUILTIN_GATHER3DIV4DF:
37917 icode = CODE_FOR_avx512vl_gatherdiv4df;
37918 goto gather_gen;
37919 case IX86_BUILTIN_GATHER3SIV4SF:
37920 icode = CODE_FOR_avx512vl_gathersiv4sf;
37921 goto gather_gen;
37922 case IX86_BUILTIN_GATHER3SIV8SF:
37923 icode = CODE_FOR_avx512vl_gathersiv8sf;
37924 goto gather_gen;
37925 case IX86_BUILTIN_GATHER3DIV4SF:
37926 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37927 goto gather_gen;
37928 case IX86_BUILTIN_GATHER3DIV8SF:
37929 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37930 goto gather_gen;
37931 case IX86_BUILTIN_GATHER3SIV2DI:
37932 icode = CODE_FOR_avx512vl_gathersiv2di;
37933 goto gather_gen;
37934 case IX86_BUILTIN_GATHER3SIV4DI:
37935 icode = CODE_FOR_avx512vl_gathersiv4di;
37936 goto gather_gen;
37937 case IX86_BUILTIN_GATHER3DIV2DI:
37938 icode = CODE_FOR_avx512vl_gatherdiv2di;
37939 goto gather_gen;
37940 case IX86_BUILTIN_GATHER3DIV4DI:
37941 icode = CODE_FOR_avx512vl_gatherdiv4di;
37942 goto gather_gen;
37943 case IX86_BUILTIN_GATHER3SIV4SI:
37944 icode = CODE_FOR_avx512vl_gathersiv4si;
37945 goto gather_gen;
37946 case IX86_BUILTIN_GATHER3SIV8SI:
37947 icode = CODE_FOR_avx512vl_gathersiv8si;
37948 goto gather_gen;
37949 case IX86_BUILTIN_GATHER3DIV4SI:
37950 icode = CODE_FOR_avx512vl_gatherdiv4si;
37951 goto gather_gen;
37952 case IX86_BUILTIN_GATHER3DIV8SI:
37953 icode = CODE_FOR_avx512vl_gatherdiv8si;
37954 goto gather_gen;
37955 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37956 icode = CODE_FOR_avx512vl_gathersiv4df;
37957 goto gather_gen;
37958 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37959 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37960 goto gather_gen;
37961 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37962 icode = CODE_FOR_avx512vl_gathersiv4di;
37963 goto gather_gen;
37964 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37965 icode = CODE_FOR_avx512vl_gatherdiv8si;
37966 goto gather_gen;
37967 case IX86_BUILTIN_SCATTERSIV16SF:
37968 icode = CODE_FOR_avx512f_scattersiv16sf;
37969 goto scatter_gen;
37970 case IX86_BUILTIN_SCATTERSIV8DF:
37971 icode = CODE_FOR_avx512f_scattersiv8df;
37972 goto scatter_gen;
37973 case IX86_BUILTIN_SCATTERDIV16SF:
37974 icode = CODE_FOR_avx512f_scatterdiv16sf;
37975 goto scatter_gen;
37976 case IX86_BUILTIN_SCATTERDIV8DF:
37977 icode = CODE_FOR_avx512f_scatterdiv8df;
37978 goto scatter_gen;
37979 case IX86_BUILTIN_SCATTERSIV16SI:
37980 icode = CODE_FOR_avx512f_scattersiv16si;
37981 goto scatter_gen;
37982 case IX86_BUILTIN_SCATTERSIV8DI:
37983 icode = CODE_FOR_avx512f_scattersiv8di;
37984 goto scatter_gen;
37985 case IX86_BUILTIN_SCATTERDIV16SI:
37986 icode = CODE_FOR_avx512f_scatterdiv16si;
37987 goto scatter_gen;
37988 case IX86_BUILTIN_SCATTERDIV8DI:
37989 icode = CODE_FOR_avx512f_scatterdiv8di;
37990 goto scatter_gen;
37991 case IX86_BUILTIN_SCATTERSIV8SF:
37992 icode = CODE_FOR_avx512vl_scattersiv8sf;
37993 goto scatter_gen;
37994 case IX86_BUILTIN_SCATTERSIV4SF:
37995 icode = CODE_FOR_avx512vl_scattersiv4sf;
37996 goto scatter_gen;
37997 case IX86_BUILTIN_SCATTERSIV4DF:
37998 icode = CODE_FOR_avx512vl_scattersiv4df;
37999 goto scatter_gen;
38000 case IX86_BUILTIN_SCATTERSIV2DF:
38001 icode = CODE_FOR_avx512vl_scattersiv2df;
38002 goto scatter_gen;
38003 case IX86_BUILTIN_SCATTERDIV8SF:
38004 icode = CODE_FOR_avx512vl_scatterdiv8sf;
38005 goto scatter_gen;
38006 case IX86_BUILTIN_SCATTERDIV4SF:
38007 icode = CODE_FOR_avx512vl_scatterdiv4sf;
38008 goto scatter_gen;
38009 case IX86_BUILTIN_SCATTERDIV4DF:
38010 icode = CODE_FOR_avx512vl_scatterdiv4df;
38011 goto scatter_gen;
38012 case IX86_BUILTIN_SCATTERDIV2DF:
38013 icode = CODE_FOR_avx512vl_scatterdiv2df;
38014 goto scatter_gen;
38015 case IX86_BUILTIN_SCATTERSIV8SI:
38016 icode = CODE_FOR_avx512vl_scattersiv8si;
38017 goto scatter_gen;
38018 case IX86_BUILTIN_SCATTERSIV4SI:
38019 icode = CODE_FOR_avx512vl_scattersiv4si;
38020 goto scatter_gen;
38021 case IX86_BUILTIN_SCATTERSIV4DI:
38022 icode = CODE_FOR_avx512vl_scattersiv4di;
38023 goto scatter_gen;
38024 case IX86_BUILTIN_SCATTERSIV2DI:
38025 icode = CODE_FOR_avx512vl_scattersiv2di;
38026 goto scatter_gen;
38027 case IX86_BUILTIN_SCATTERDIV8SI:
38028 icode = CODE_FOR_avx512vl_scatterdiv8si;
38029 goto scatter_gen;
38030 case IX86_BUILTIN_SCATTERDIV4SI:
38031 icode = CODE_FOR_avx512vl_scatterdiv4si;
38032 goto scatter_gen;
38033 case IX86_BUILTIN_SCATTERDIV4DI:
38034 icode = CODE_FOR_avx512vl_scatterdiv4di;
38035 goto scatter_gen;
38036 case IX86_BUILTIN_SCATTERDIV2DI:
38037 icode = CODE_FOR_avx512vl_scatterdiv2di;
38038 goto scatter_gen;
38039 case IX86_BUILTIN_GATHERPFDPD:
38040 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
38041 goto vec_prefetch_gen;
38042 case IX86_BUILTIN_SCATTERALTSIV8DF:
38043 icode = CODE_FOR_avx512f_scattersiv8df;
38044 goto scatter_gen;
38045 case IX86_BUILTIN_SCATTERALTDIV16SF:
38046 icode = CODE_FOR_avx512f_scatterdiv16sf;
38047 goto scatter_gen;
38048 case IX86_BUILTIN_SCATTERALTSIV8DI:
38049 icode = CODE_FOR_avx512f_scattersiv8di;
38050 goto scatter_gen;
38051 case IX86_BUILTIN_SCATTERALTDIV16SI:
38052 icode = CODE_FOR_avx512f_scatterdiv16si;
38053 goto scatter_gen;
38054 case IX86_BUILTIN_GATHERPFDPS:
38055 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
38056 goto vec_prefetch_gen;
38057 case IX86_BUILTIN_GATHERPFQPD:
38058 icode = CODE_FOR_avx512pf_gatherpfv8didf;
38059 goto vec_prefetch_gen;
38060 case IX86_BUILTIN_GATHERPFQPS:
38061 icode = CODE_FOR_avx512pf_gatherpfv8disf;
38062 goto vec_prefetch_gen;
38063 case IX86_BUILTIN_SCATTERPFDPD:
38064 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
38065 goto vec_prefetch_gen;
38066 case IX86_BUILTIN_SCATTERPFDPS:
38067 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
38068 goto vec_prefetch_gen;
38069 case IX86_BUILTIN_SCATTERPFQPD:
38070 icode = CODE_FOR_avx512pf_scatterpfv8didf;
38071 goto vec_prefetch_gen;
38072 case IX86_BUILTIN_SCATTERPFQPS:
38073 icode = CODE_FOR_avx512pf_scatterpfv8disf;
38074 goto vec_prefetch_gen;
38076 gather_gen:
38077 rtx half;
38078 rtx (*gen) (rtx, rtx);
38080 arg0 = CALL_EXPR_ARG (exp, 0);
38081 arg1 = CALL_EXPR_ARG (exp, 1);
38082 arg2 = CALL_EXPR_ARG (exp, 2);
38083 arg3 = CALL_EXPR_ARG (exp, 3);
38084 arg4 = CALL_EXPR_ARG (exp, 4);
38085 op0 = expand_normal (arg0);
38086 op1 = expand_normal (arg1);
38087 op2 = expand_normal (arg2);
38088 op3 = expand_normal (arg3);
38089 op4 = expand_normal (arg4);
38090 /* Note the arg order is different from the operand order. */
38091 mode0 = insn_data[icode].operand[1].mode;
38092 mode2 = insn_data[icode].operand[3].mode;
38093 mode3 = insn_data[icode].operand[4].mode;
38094 mode4 = insn_data[icode].operand[5].mode;
38096 if (target == NULL_RTX
38097 || GET_MODE (target) != insn_data[icode].operand[0].mode
38098 || !insn_data[icode].operand[0].predicate (target,
38099 GET_MODE (target)))
38100 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
38101 else
38102 subtarget = target;
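/* For the *ALT* forms the index and data vectors have different element
   counts, so the wider operand is narrowed to its low half here (see the
   analogous comment in the scatter expansion below).  */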
38104 switch (fcode)
38106 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38107 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38108 half = gen_reg_rtx (V8SImode);
38109 if (!nonimmediate_operand (op2, V16SImode))
38110 op2 = copy_to_mode_reg (V16SImode, op2);
38111 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38112 op2 = half;
38113 break;
38114 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38115 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38116 case IX86_BUILTIN_GATHERALTSIV4DF:
38117 case IX86_BUILTIN_GATHERALTSIV4DI:
38118 half = gen_reg_rtx (V4SImode);
38119 if (!nonimmediate_operand (op2, V8SImode))
38120 op2 = copy_to_mode_reg (V8SImode, op2);
38121 emit_insn (gen_vec_extract_lo_v8si (half, op2));
38122 op2 = half;
38123 break;
38124 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38125 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38126 half = gen_reg_rtx (mode0);
38127 if (mode0 == V8SFmode)
38128 gen = gen_vec_extract_lo_v16sf;
38129 else
38130 gen = gen_vec_extract_lo_v16si;
38131 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38132 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38133 emit_insn (gen (half, op0));
38134 op0 = half;
38135 if (GET_MODE (op3) != VOIDmode)
38137 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38138 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38139 emit_insn (gen (half, op3));
38140 op3 = half;
38142 break;
38143 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38144 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38145 case IX86_BUILTIN_GATHERALTDIV8SF:
38146 case IX86_BUILTIN_GATHERALTDIV8SI:
38147 half = gen_reg_rtx (mode0);
38148 if (mode0 == V4SFmode)
38149 gen = gen_vec_extract_lo_v8sf;
38150 else
38151 gen = gen_vec_extract_lo_v8si;
38152 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38153 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38154 emit_insn (gen (half, op0));
38155 op0 = half;
38156 if (GET_MODE (op3) != VOIDmode)
38158 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38159 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38160 emit_insn (gen (half, op3));
38161 op3 = half;
38163 break;
38164 default:
38165 break;
38168 /* Force the memory operand to use only a base register here.  We
38169 don't want to do this for the memory operands of other builtin
38170 functions.  */
38171 op1 = ix86_zero_extend_to_Pmode (op1);
38173 if (!insn_data[icode].operand[1].predicate (op0, mode0))
38174 op0 = copy_to_mode_reg (mode0, op0);
38175 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
38176 op1 = copy_to_mode_reg (Pmode, op1);
38177 if (!insn_data[icode].operand[3].predicate (op2, mode2))
38178 op2 = copy_to_mode_reg (mode2, op2);
38180 op3 = fixup_modeless_constant (op3, mode3);
38182 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
38184 if (!insn_data[icode].operand[4].predicate (op3, mode3))
38185 op3 = copy_to_mode_reg (mode3, op3);
38187 else
38189 op3 = copy_to_reg (op3);
38190 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
38192 if (!insn_data[icode].operand[5].predicate (op4, mode4))
38194 error ("the last argument must be scale 1, 2, 4, 8");
38195 return const0_rtx;
38198 /* Optimize. If mask is known to have all high bits set,
38199 replace op0 with pc_rtx to signal that the instruction
38200 overwrites the whole destination and doesn't use its
38201 previous contents. */
38202 if (optimize)
38204 if (TREE_CODE (arg3) == INTEGER_CST)
38206 if (integer_all_onesp (arg3))
38207 op0 = pc_rtx;
38209 else if (TREE_CODE (arg3) == VECTOR_CST)
38211 unsigned int negative = 0;
38212 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
38214 tree cst = VECTOR_CST_ELT (arg3, i);
38215 if (TREE_CODE (cst) == INTEGER_CST
38216 && tree_int_cst_sign_bit (cst))
38217 negative++;
38218 else if (TREE_CODE (cst) == REAL_CST
38219 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
38220 negative++;
38222 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
38223 op0 = pc_rtx;
38225 else if (TREE_CODE (arg3) == SSA_NAME
38226 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
38228 /* Recognize also when mask is like:
38229 __v2df src = _mm_setzero_pd ();
38230 __v2df mask = _mm_cmpeq_pd (src, src);
38232 __v8sf src = _mm256_setzero_ps ();
38233 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
38234 as that is a cheaper way to load all ones into
38235 a register than having to load a constant from
38236 memory. */
38237 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
38238 if (is_gimple_call (def_stmt))
38240 tree fndecl = gimple_call_fndecl (def_stmt);
38241 if (fndecl
38242 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
38243 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
38245 case IX86_BUILTIN_CMPPD:
38246 case IX86_BUILTIN_CMPPS:
38247 case IX86_BUILTIN_CMPPD256:
38248 case IX86_BUILTIN_CMPPS256:
38249 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
38250 break;
38251 /* FALLTHRU */
38252 case IX86_BUILTIN_CMPEQPD:
38253 case IX86_BUILTIN_CMPEQPS:
38254 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
38255 && initializer_zerop (gimple_call_arg (def_stmt,
38256 1)))
38257 op0 = pc_rtx;
38258 break;
38259 default:
38260 break;
38266 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
38267 if (! pat)
38268 return const0_rtx;
38269 emit_insn (pat);
38271 switch (fcode)
38273 case IX86_BUILTIN_GATHER3DIV16SF:
38274 if (target == NULL_RTX)
38275 target = gen_reg_rtx (V8SFmode);
38276 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
38277 break;
38278 case IX86_BUILTIN_GATHER3DIV16SI:
38279 if (target == NULL_RTX)
38280 target = gen_reg_rtx (V8SImode);
38281 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
38282 break;
38283 case IX86_BUILTIN_GATHER3DIV8SF:
38284 case IX86_BUILTIN_GATHERDIV8SF:
38285 if (target == NULL_RTX)
38286 target = gen_reg_rtx (V4SFmode);
38287 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
38288 break;
38289 case IX86_BUILTIN_GATHER3DIV8SI:
38290 case IX86_BUILTIN_GATHERDIV8SI:
38291 if (target == NULL_RTX)
38292 target = gen_reg_rtx (V4SImode);
38293 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
38294 break;
38295 default:
38296 target = subtarget;
38297 break;
38299 return target;
38301 scatter_gen:
38302 arg0 = CALL_EXPR_ARG (exp, 0);
38303 arg1 = CALL_EXPR_ARG (exp, 1);
38304 arg2 = CALL_EXPR_ARG (exp, 2);
38305 arg3 = CALL_EXPR_ARG (exp, 3);
38306 arg4 = CALL_EXPR_ARG (exp, 4);
38307 op0 = expand_normal (arg0);
38308 op1 = expand_normal (arg1);
38309 op2 = expand_normal (arg2);
38310 op3 = expand_normal (arg3);
38311 op4 = expand_normal (arg4);
38312 mode1 = insn_data[icode].operand[1].mode;
38313 mode2 = insn_data[icode].operand[2].mode;
38314 mode3 = insn_data[icode].operand[3].mode;
38315 mode4 = insn_data[icode].operand[4].mode;
38317 /* Scatter instruction stores operand op3 to memory with
38318 indices from op2 and scale from op4 under writemask op1.
38319 If index operand op2 has more elements than source operand
38320 op3, only its low half needs to be used.  And vice versa.  */
38321 switch (fcode)
38323 case IX86_BUILTIN_SCATTERALTSIV8DF:
38324 case IX86_BUILTIN_SCATTERALTSIV8DI:
38325 half = gen_reg_rtx (V8SImode);
38326 if (!nonimmediate_operand (op2, V16SImode))
38327 op2 = copy_to_mode_reg (V16SImode, op2);
38328 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38329 op2 = half;
38330 break;
38331 case IX86_BUILTIN_SCATTERALTDIV16SF:
38332 case IX86_BUILTIN_SCATTERALTDIV16SI:
38333 half = gen_reg_rtx (mode3);
38334 if (mode3 == V8SFmode)
38335 gen = gen_vec_extract_lo_v16sf;
38336 else
38337 gen = gen_vec_extract_lo_v16si;
38338 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38339 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38340 emit_insn (gen (half, op3));
38341 op3 = half;
38342 break;
38343 default:
38344 break;
38347 /* Force the memory operand to use only a base register here.  We
38348 don't want to do this for the memory operands of other builtin
38349 functions.  */
38350 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
38352 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38353 op0 = copy_to_mode_reg (Pmode, op0);
38355 op1 = fixup_modeless_constant (op1, mode1);
38357 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
38359 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38360 op1 = copy_to_mode_reg (mode1, op1);
38362 else
38364 op1 = copy_to_reg (op1);
38365 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
38368 if (!insn_data[icode].operand[2].predicate (op2, mode2))
38369 op2 = copy_to_mode_reg (mode2, op2);
38371 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38372 op3 = copy_to_mode_reg (mode3, op3);
38374 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38376 error ("the last argument must be scale 1, 2, 4, 8");
38377 return const0_rtx;
38380 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38381 if (! pat)
38382 return const0_rtx;
38384 emit_insn (pat);
38385 return 0;
38387 vec_prefetch_gen:
38388 arg0 = CALL_EXPR_ARG (exp, 0);
38389 arg1 = CALL_EXPR_ARG (exp, 1);
38390 arg2 = CALL_EXPR_ARG (exp, 2);
38391 arg3 = CALL_EXPR_ARG (exp, 3);
38392 arg4 = CALL_EXPR_ARG (exp, 4);
38393 op0 = expand_normal (arg0);
38394 op1 = expand_normal (arg1);
38395 op2 = expand_normal (arg2);
38396 op3 = expand_normal (arg3);
38397 op4 = expand_normal (arg4);
38398 mode0 = insn_data[icode].operand[0].mode;
38399 mode1 = insn_data[icode].operand[1].mode;
38400 mode3 = insn_data[icode].operand[3].mode;
38401 mode4 = insn_data[icode].operand[4].mode;
38403 op0 = fixup_modeless_constant (op0, mode0);
38405 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
38407 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38408 op0 = copy_to_mode_reg (mode0, op0);
38410 else
38412 op0 = copy_to_reg (op0);
38413 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
38416 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38417 op1 = copy_to_mode_reg (mode1, op1);
38419 /* Force the memory operand to use only a base register here.  We
38420 don't want to do this for the memory operands of other builtin
38421 functions.  */
38422 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
38424 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
38425 op2 = copy_to_mode_reg (Pmode, op2);
38427 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38429 error ("the fourth argument must be scale 1, 2, 4, 8");
38430 return const0_rtx;
38433 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38435 error ("incorrect hint operand");
38436 return const0_rtx;
38439 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38440 if (! pat)
38441 return const0_rtx;
38443 emit_insn (pat);
38445 return 0;
38447 case IX86_BUILTIN_XABORT:
38448 icode = CODE_FOR_xabort;
38449 arg0 = CALL_EXPR_ARG (exp, 0);
38450 op0 = expand_normal (arg0);
38451 mode0 = insn_data[icode].operand[0].mode;
38452 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38454 error ("the xabort's argument must be an 8-bit immediate");
38455 return const0_rtx;
38457 emit_insn (gen_xabort (op0));
38458 return 0;
38460 default:
38461 break;
38464 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
38465 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
38467 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
38468 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
38469 target);
38472 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
38473 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
38475 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
38476 switch (fcode)
38478 case IX86_BUILTIN_FABSQ:
38479 case IX86_BUILTIN_COPYSIGNQ:
38480 if (!TARGET_SSE)
38481 /* Emit a normal call if SSE isn't available. */
38482 return expand_call (exp, target, ignore);
38483 /* FALLTHRU */
38484 default:
38485 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
38489 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
38490 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
38492 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
38493 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
38494 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
38495 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
38496 int masked = 1;
38497 machine_mode mode, wide_mode, nar_mode;
38499 nar_mode = V4SFmode;
38500 mode = V16SFmode;
38501 wide_mode = V64SFmode;
38502 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
38503 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
38505 switch (fcode)
38507 case IX86_BUILTIN_4FMAPS:
38508 fcn = gen_avx5124fmaddps_4fmaddps;
38509 masked = 0;
38510 goto v4fma_expand;
38512 case IX86_BUILTIN_4DPWSSD:
38513 nar_mode = V4SImode;
38514 mode = V16SImode;
38515 wide_mode = V64SImode;
38516 fcn = gen_avx5124vnniw_vp4dpwssd;
38517 masked = 0;
38518 goto v4fma_expand;
38520 case IX86_BUILTIN_4DPWSSDS:
38521 nar_mode = V4SImode;
38522 mode = V16SImode;
38523 wide_mode = V64SImode;
38524 fcn = gen_avx5124vnniw_vp4dpwssds;
38525 masked = 0;
38526 goto v4fma_expand;
38528 case IX86_BUILTIN_4FNMAPS:
38529 fcn = gen_avx5124fmaddps_4fnmaddps;
38530 masked = 0;
38531 goto v4fma_expand;
38533 case IX86_BUILTIN_4FNMAPS_MASK:
38534 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
38535 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
38536 goto v4fma_expand;
38538 case IX86_BUILTIN_4DPWSSD_MASK:
38539 nar_mode = V4SImode;
38540 mode = V16SImode;
38541 wide_mode = V64SImode;
38542 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
38543 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
38544 goto v4fma_expand;
38546 case IX86_BUILTIN_4DPWSSDS_MASK:
38547 nar_mode = V4SImode;
38548 mode = V16SImode;
38549 wide_mode = V64SImode;
38550 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
38551 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
38552 goto v4fma_expand;
38554 case IX86_BUILTIN_4FMAPS_MASK:
38556 tree args[4];
38557 rtx ops[4];
38558 rtx wide_reg;
38559 rtx accum;
38560 rtx addr;
38561 rtx mem;
38563 v4fma_expand:
38564 wide_reg = gen_reg_rtx (wide_mode);
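/* Place the four source vectors at byte offsets 0, 64, 128 and 192 of
   WIDE_REG; the 4FMAPS/4VNNIW patterns consume them as a group of four
   consecutive registers.  */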
38565 for (i = 0; i < 4; i++)
38567 args[i] = CALL_EXPR_ARG (exp, i);
38568 ops[i] = expand_normal (args[i]);
38570 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
38571 ops[i]);
38574 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38575 accum = force_reg (mode, accum);
38577 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38578 addr = force_reg (Pmode, addr);
38580 mem = gen_rtx_MEM (nar_mode, addr);
38582 target = gen_reg_rtx (mode);
38584 emit_move_insn (target, accum);
38586 if (! masked)
38587 emit_insn (fcn (target, accum, wide_reg, mem));
38588 else
38590 rtx merge, mask;
38591 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38593 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38595 if (CONST_INT_P (mask))
38596 mask = fixup_modeless_constant (mask, HImode);
38598 mask = force_reg (HImode, mask);
38600 if (GET_MODE (mask) != HImode)
38601 mask = gen_rtx_SUBREG (HImode, mask, 0);
38603 /* If merge is 0 then we're about to emit z-masked variant. */
38604 if (const0_operand (merge, mode))
38605 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38606 /* If merge is the same as accum then emit merge-masked variant. */
38607 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38609 merge = force_reg (mode, merge);
38610 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38612 /* Merge with something unknown might happen if we z-mask w/ -O0. */
38613 else
38615 target = gen_reg_rtx (mode);
38616 emit_move_insn (target, merge);
38617 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38620 return target;
38623 case IX86_BUILTIN_4FNMASS:
38624 fcn = gen_avx5124fmaddps_4fnmaddss;
38625 masked = 0;
38626 goto s4fma_expand;
38628 case IX86_BUILTIN_4FMASS:
38629 fcn = gen_avx5124fmaddps_4fmaddss;
38630 masked = 0;
38631 goto s4fma_expand;
38633 case IX86_BUILTIN_4FNMASS_MASK:
38634 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
38635 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
38636 goto s4fma_expand;
38638 case IX86_BUILTIN_4FMASS_MASK:
38640 tree args[4];
38641 rtx ops[4];
38642 rtx wide_reg;
38643 rtx accum;
38644 rtx addr;
38645 rtx mem;
38647 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
38648 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
38650 s4fma_expand:
38651 mode = V4SFmode;
38652 wide_reg = gen_reg_rtx (V64SFmode);
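/* For the scalar variants only the low SFmode element of each source is
   used; spread those elements across WIDE_REG at 64-byte offsets as in
   the packed case above.  */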
38653 for (i = 0; i < 4; i++)
38655 rtx tmp;
38656 args[i] = CALL_EXPR_ARG (exp, i);
38657 ops[i] = expand_normal (args[i]);
38659 tmp = gen_reg_rtx (SFmode);
38660 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
38662 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
38663 gen_rtx_SUBREG (V16SFmode, tmp, 0));
38666 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38667 accum = force_reg (V4SFmode, accum);
38669 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38670 addr = force_reg (Pmode, addr);
38672 mem = gen_rtx_MEM (V4SFmode, addr);
38674 target = gen_reg_rtx (V4SFmode);
38676 emit_move_insn (target, accum);
38678 if (! masked)
38679 emit_insn (fcn (target, accum, wide_reg, mem));
38680 else
38682 rtx merge, mask;
38683 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38685 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38687 if (CONST_INT_P (mask))
38688 mask = fixup_modeless_constant (mask, QImode);
38690 mask = force_reg (QImode, mask);
38692 if (GET_MODE (mask) != QImode)
38693 mask = gen_rtx_SUBREG (QImode, mask, 0);
38695 /* If merge is 0 then we're about to emit z-masked variant. */
38696 if (const0_operand (merge, mode))
38697 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38698 /* If merge is the same as accum then emit merge-masked
38699 variant. */
38700 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38702 merge = force_reg (mode, merge);
38703 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38705 /* Merge with something unknown might happen if we z-mask
38706 w/ -O0. */
38707 else
38709 target = gen_reg_rtx (mode);
38710 emit_move_insn (target, merge);
38711 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38714 return target;
38716 case IX86_BUILTIN_RDPID:
38717 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
38718 target);
38719 default:
38720 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
38724 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
38725 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
38727 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
38728 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
38731 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
38732 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
38734 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
38735 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
38738 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
38739 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
38741 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
38742 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
38745 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
38746 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
38748 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
38749 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
38752 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
38753 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
38755 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
38756 const struct builtin_description *d = bdesc_multi_arg + i;
38757 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
38758 (enum ix86_builtin_func_type)
38759 d->flag, d->comparison);
38762 gcc_unreachable ();
38765 /* This returns the target-specific builtin with code CODE if
38766 current_function_decl has visibility on this builtin, which is checked
38767 using isa flags. Returns NULL_TREE otherwise. */
38769 static tree ix86_get_builtin (enum ix86_builtins code)
38771 struct cl_target_option *opts;
38772 tree target_tree = NULL_TREE;
38774 /* Determine the isa flags of current_function_decl. */
38776 if (current_function_decl)
38777 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
38779 if (target_tree == NULL)
38780 target_tree = target_option_default_node;
38782 opts = TREE_TARGET_OPTION (target_tree);
38784 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
38785 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
38786 return ix86_builtin_decl (code, true);
38787 else
38788 return NULL_TREE;
38791 /* Return the function decl for the target-specific builtin
38792 corresponding to the given MPX builtin passed in FCODE.  */
38793 static tree
38794 ix86_builtin_mpx_function (unsigned fcode)
38796 switch (fcode)
38798 case BUILT_IN_CHKP_BNDMK:
38799 return ix86_builtins[IX86_BUILTIN_BNDMK];
38801 case BUILT_IN_CHKP_BNDSTX:
38802 return ix86_builtins[IX86_BUILTIN_BNDSTX];
38804 case BUILT_IN_CHKP_BNDLDX:
38805 return ix86_builtins[IX86_BUILTIN_BNDLDX];
38807 case BUILT_IN_CHKP_BNDCL:
38808 return ix86_builtins[IX86_BUILTIN_BNDCL];
38810 case BUILT_IN_CHKP_BNDCU:
38811 return ix86_builtins[IX86_BUILTIN_BNDCU];
38813 case BUILT_IN_CHKP_BNDRET:
38814 return ix86_builtins[IX86_BUILTIN_BNDRET];
38816 case BUILT_IN_CHKP_INTERSECT:
38817 return ix86_builtins[IX86_BUILTIN_BNDINT];
38819 case BUILT_IN_CHKP_NARROW:
38820 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
38822 case BUILT_IN_CHKP_SIZEOF:
38823 return ix86_builtins[IX86_BUILTIN_SIZEOF];
38825 case BUILT_IN_CHKP_EXTRACT_LOWER:
38826 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
38828 case BUILT_IN_CHKP_EXTRACT_UPPER:
38829 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
38831 default:
38832 return NULL_TREE;
38835 gcc_unreachable ();
38838 /* Helper function for ix86_load_bounds and ix86_store_bounds.
38840 Return an address to be used to load/store bounds for pointer
38841 passed in SLOT.
38843 SLOT_NO is an integer constant holding number of a target
38844 dependent special slot to be used in case SLOT is not a memory.
38846 SPECIAL_BASE is a pointer to be used as a base of fake address
38847 to access special slots in Bounds Table. SPECIAL_BASE[-1],
38848 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
38850 static rtx
38851 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
38853 rtx addr = NULL;
38855 /* A NULL slot means we pass bounds for a pointer not passed to the
38856 function at all.  A register slot means we pass the pointer in a
38857 register.  In both these cases bounds are passed via the Bounds
38858 Table.  Since we do not have an actual pointer stored in memory,
38859 we have to use fake addresses to access the Bounds Table.  We
38860 start with (special_base - sizeof (void *)) and decrease this
38861 address by the pointer size to get addresses for the other slots.  */
38862 if (!slot || REG_P (slot))
38864 gcc_assert (CONST_INT_P (slot_no));
38865 addr = plus_constant (Pmode, special_base,
38866 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
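/* With 64-bit pointers this evaluates to SPECIAL_BASE - 8 for slot 0,
   SPECIAL_BASE - 16 for slot 1, and so on.  */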
38868 /* If pointer is passed in a memory then its address is used to
38869 access Bounds Table. */
38870 else if (MEM_P (slot))
38872 addr = XEXP (slot, 0);
38873 if (!register_operand (addr, Pmode))
38874 addr = copy_addr_to_reg (addr);
38876 else
38877 gcc_unreachable ();
38879 return addr;
38882 /* Expand pass uses this hook to load bounds for function parameter
38883 PTR passed in SLOT in case its bounds are not passed in a register.
38885 If SLOT is a memory, then bounds are loaded as for regular pointer
38886 loaded from memory. PTR may be NULL in case SLOT is a memory.
38887 In such case value of PTR (if required) may be loaded from SLOT.
38889 If SLOT is NULL or a register then SLOT_NO is an integer constant
38890 holding number of the target dependent special slot which should be
38891 used to obtain bounds.
38893 Return loaded bounds. */
38895 static rtx
38896 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38898 rtx reg = gen_reg_rtx (BNDmode);
38899 rtx addr;
38901 /* Get address to be used to access Bounds Table. Special slots start
38902 at the location of return address of the current function. */
38903 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38905 /* Load pointer value from a memory if we don't have it. */
38906 if (!ptr)
38908 gcc_assert (MEM_P (slot));
38909 ptr = copy_addr_to_reg (slot);
38912 if (!register_operand (ptr, Pmode))
38913 ptr = ix86_zero_extend_to_Pmode (ptr);
38915 emit_insn (BNDmode == BND64mode
38916 ? gen_bnd64_ldx (reg, addr, ptr)
38917 : gen_bnd32_ldx (reg, addr, ptr));
38919 return reg;
38922 /* Expand pass uses this hook to store BOUNDS for call argument PTR
38923 passed in SLOT in case BOUNDS are not passed in a register.
38925 If SLOT is a memory, then BOUNDS are stored as for regular pointer
38926 stored in memory. PTR may be NULL in case SLOT is a memory.
38927 In such case value of PTR (if required) may be loaded from SLOT.
38929 If SLOT is NULL or a register then SLOT_NO is an integer constant
38930 holding number of the target dependent special slot which should be
38931 used to store BOUNDS. */
38933 static void
38934 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38936 rtx addr;
38938 /* Get address to be used to access Bounds Table. Special slots start
38939 at the location of return address of a called function. */
38940 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38942 /* Load pointer value from a memory if we don't have it. */
38943 if (!ptr)
38945 gcc_assert (MEM_P (slot));
38946 ptr = copy_addr_to_reg (slot);
38949 if (!register_operand (ptr, Pmode))
38950 ptr = ix86_zero_extend_to_Pmode (ptr);
38952 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
38953 if (!register_operand (bounds, BNDmode))
38954 bounds = copy_to_mode_reg (BNDmode, bounds);
38956 emit_insn (BNDmode == BND64mode
38957 ? gen_bnd64_stx (addr, ptr, bounds)
38958 : gen_bnd32_stx (addr, ptr, bounds));
38961 /* Load and return bounds returned by function in SLOT. */
38963 static rtx
38964 ix86_load_returned_bounds (rtx slot)
38966 rtx res;
38968 gcc_assert (REG_P (slot));
38969 res = gen_reg_rtx (BNDmode);
38970 emit_move_insn (res, slot);
38972 return res;
38975 /* Store BOUNDS returned by function into SLOT. */
38977 static void
38978 ix86_store_returned_bounds (rtx slot, rtx bounds)
38980 gcc_assert (REG_P (slot));
38981 emit_move_insn (slot, bounds);
38984 /* Returns a function decl for a vectorized version of the combined function
38985 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
38986 if it is not available. */
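/* For example, a floor on a V4DFmode result is mapped to
   IX86_BUILTIN_FLOORPD256 below.  */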
38988 static tree
38989 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38990 tree type_in)
38992 machine_mode in_mode, out_mode;
38993 int in_n, out_n;
38995 if (TREE_CODE (type_out) != VECTOR_TYPE
38996 || TREE_CODE (type_in) != VECTOR_TYPE)
38997 return NULL_TREE;
38999 out_mode = TYPE_MODE (TREE_TYPE (type_out));
39000 out_n = TYPE_VECTOR_SUBPARTS (type_out);
39001 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39002 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39004 switch (fn)
39006 CASE_CFN_EXP2:
39007 if (out_mode == SFmode && in_mode == SFmode)
39009 if (out_n == 16 && in_n == 16)
39010 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
39012 break;
39014 CASE_CFN_IFLOOR:
39015 CASE_CFN_LFLOOR:
39016 CASE_CFN_LLFLOOR:
39017 /* The round insn does not trap on denormals. */
39018 if (flag_trapping_math || !TARGET_ROUND)
39019 break;
39021 if (out_mode == SImode && in_mode == DFmode)
39023 if (out_n == 4 && in_n == 2)
39024 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
39025 else if (out_n == 8 && in_n == 4)
39026 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
39027 else if (out_n == 16 && in_n == 8)
39028 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
39030 if (out_mode == SImode && in_mode == SFmode)
39032 if (out_n == 4 && in_n == 4)
39033 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
39034 else if (out_n == 8 && in_n == 8)
39035 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
39036 else if (out_n == 16 && in_n == 16)
39037 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
39039 break;
39041 CASE_CFN_ICEIL:
39042 CASE_CFN_LCEIL:
39043 CASE_CFN_LLCEIL:
39044 /* The round insn does not trap on denormals. */
39045 if (flag_trapping_math || !TARGET_ROUND)
39046 break;
39048 if (out_mode == SImode && in_mode == DFmode)
39050 if (out_n == 4 && in_n == 2)
39051 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
39052 else if (out_n == 8 && in_n == 4)
39053 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
39054 else if (out_n == 16 && in_n == 8)
39055 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
39057 if (out_mode == SImode && in_mode == SFmode)
39059 if (out_n == 4 && in_n == 4)
39060 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
39061 else if (out_n == 8 && in_n == 8)
39062 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
39063 else if (out_n == 16 && in_n == 16)
39064 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
39066 break;
39068 CASE_CFN_IRINT:
39069 CASE_CFN_LRINT:
39070 CASE_CFN_LLRINT:
39071 if (out_mode == SImode && in_mode == DFmode)
39073 if (out_n == 4 && in_n == 2)
39074 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
39075 else if (out_n == 8 && in_n == 4)
39076 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
39077 else if (out_n == 16 && in_n == 8)
39078 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
39080 if (out_mode == SImode && in_mode == SFmode)
39082 if (out_n == 4 && in_n == 4)
39083 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
39084 else if (out_n == 8 && in_n == 8)
39085 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
39086 else if (out_n == 16 && in_n == 16)
39087 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
39089 break;
39091 CASE_CFN_IROUND:
39092 CASE_CFN_LROUND:
39093 CASE_CFN_LLROUND:
39094 /* The round insn does not trap on denormals. */
39095 if (flag_trapping_math || !TARGET_ROUND)
39096 break;
39098 if (out_mode == SImode && in_mode == DFmode)
39100 if (out_n == 4 && in_n == 2)
39101 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
39102 else if (out_n == 8 && in_n == 4)
39103 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
39104 else if (out_n == 16 && in_n == 8)
39105 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
39107 if (out_mode == SImode && in_mode == SFmode)
39109 if (out_n == 4 && in_n == 4)
39110 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
39111 else if (out_n == 8 && in_n == 8)
39112 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
39113 else if (out_n == 16 && in_n == 16)
39114 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
39116 break;
39118 CASE_CFN_FLOOR:
39119 /* The round insn does not trap on denormals. */
39120 if (flag_trapping_math || !TARGET_ROUND)
39121 break;
39123 if (out_mode == DFmode && in_mode == DFmode)
39125 if (out_n == 2 && in_n == 2)
39126 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
39127 else if (out_n == 4 && in_n == 4)
39128 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
39129 else if (out_n == 8 && in_n == 8)
39130 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
39132 if (out_mode == SFmode && in_mode == SFmode)
39134 if (out_n == 4 && in_n == 4)
39135 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
39136 else if (out_n == 8 && in_n == 8)
39137 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
39138 else if (out_n == 16 && in_n == 16)
39139 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
39141 break;
39143 CASE_CFN_CEIL:
39144 /* The round insn does not trap on denormals. */
39145 if (flag_trapping_math || !TARGET_ROUND)
39146 break;
39148 if (out_mode == DFmode && in_mode == DFmode)
39150 if (out_n == 2 && in_n == 2)
39151 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
39152 else if (out_n == 4 && in_n == 4)
39153 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
39154 else if (out_n == 8 && in_n == 8)
39155 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
39157 if (out_mode == SFmode && in_mode == SFmode)
39159 if (out_n == 4 && in_n == 4)
39160 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
39161 else if (out_n == 8 && in_n == 8)
39162 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
39163 else if (out_n == 16 && in_n == 16)
39164 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
39166 break;
39168 CASE_CFN_TRUNC:
39169 /* The round insn does not trap on denormals. */
39170 if (flag_trapping_math || !TARGET_ROUND)
39171 break;
39173 if (out_mode == DFmode && in_mode == DFmode)
39175 if (out_n == 2 && in_n == 2)
39176 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
39177 else if (out_n == 4 && in_n == 4)
39178 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
39179 else if (out_n == 8 && in_n == 8)
39180 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
39182 if (out_mode == SFmode && in_mode == SFmode)
39184 if (out_n == 4 && in_n == 4)
39185 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
39186 else if (out_n == 8 && in_n == 8)
39187 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
39188 else if (out_n == 16 && in_n == 16)
39189 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
39191 break;
39193 CASE_CFN_RINT:
39194 /* The round insn does not trap on denormals. */
39195 if (flag_trapping_math || !TARGET_ROUND)
39196 break;
39198 if (out_mode == DFmode && in_mode == DFmode)
39200 if (out_n == 2 && in_n == 2)
39201 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
39202 else if (out_n == 4 && in_n == 4)
39203 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
39205 if (out_mode == SFmode && in_mode == SFmode)
39207 if (out_n == 4 && in_n == 4)
39208 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
39209 else if (out_n == 8 && in_n == 8)
39210 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
39212 break;
39214 CASE_CFN_FMA:
39215 if (out_mode == DFmode && in_mode == DFmode)
39217 if (out_n == 2 && in_n == 2)
39218 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
39219 if (out_n == 4 && in_n == 4)
39220 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
39222 if (out_mode == SFmode && in_mode == SFmode)
39224 if (out_n == 4 && in_n == 4)
39225 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
39226 if (out_n == 8 && in_n == 8)
39227 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
39229 break;
39231 default:
39232 break;
39235 /* Dispatch to a handler for a vectorization library. */
39236 if (ix86_veclib_handler)
39237 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
39239 return NULL_TREE;
39242 /* Handler for an SVML-style interface to
39243 a library with vectorized intrinsics. */
39245 static tree
39246 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
39248 char name[20];
39249 tree fntype, new_fndecl, args;
39250 unsigned arity;
39251 const char *bname;
39252 machine_mode el_mode, in_mode;
39253 int n, in_n;
39255 /* The SVML is suitable for unsafe math only. */
39256 if (!flag_unsafe_math_optimizations)
39257 return NULL_TREE;
39259 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39260 n = TYPE_VECTOR_SUBPARTS (type_out);
39261 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39262 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39263 if (el_mode != in_mode
39264 || n != in_n)
39265 return NULL_TREE;
39267 switch (fn)
39269 CASE_CFN_EXP:
39270 CASE_CFN_LOG:
39271 CASE_CFN_LOG10:
39272 CASE_CFN_POW:
39273 CASE_CFN_TANH:
39274 CASE_CFN_TAN:
39275 CASE_CFN_ATAN:
39276 CASE_CFN_ATAN2:
39277 CASE_CFN_ATANH:
39278 CASE_CFN_CBRT:
39279 CASE_CFN_SINH:
39280 CASE_CFN_SIN:
39281 CASE_CFN_ASINH:
39282 CASE_CFN_ASIN:
39283 CASE_CFN_COSH:
39284 CASE_CFN_COS:
39285 CASE_CFN_ACOSH:
39286 CASE_CFN_ACOS:
39287 if ((el_mode != DFmode || n != 2)
39288 && (el_mode != SFmode || n != 4))
39289 return NULL_TREE;
39290 break;
39292 default:
39293 return NULL_TREE;
39296 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39297 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39299 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
39300 strcpy (name, "vmlsLn4");
39301 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
39302 strcpy (name, "vmldLn2");
39303 else if (n == 4)
39305 sprintf (name, "vmls%s", bname+10);
39306 name[strlen (name)-1] = '4';
39308 else
39309 sprintf (name, "vmld%s2", bname+10);
39311 /* Convert to uppercase. */
39312 name[4] &= ~0x20;
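/* For example, sinf on V4SFmode becomes "vmlsSin4" and sin on V2DFmode
   becomes "vmldSin2".  */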
39314 arity = 0;
39315 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39316 arity++;
39318 if (arity == 1)
39319 fntype = build_function_type_list (type_out, type_in, NULL);
39320 else
39321 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39323 /* Build a function declaration for the vectorized function. */
39324 new_fndecl = build_decl (BUILTINS_LOCATION,
39325 FUNCTION_DECL, get_identifier (name), fntype);
39326 TREE_PUBLIC (new_fndecl) = 1;
39327 DECL_EXTERNAL (new_fndecl) = 1;
39328 DECL_IS_NOVOPS (new_fndecl) = 1;
39329 TREE_READONLY (new_fndecl) = 1;
39331 return new_fndecl;
39334 /* Handler for an ACML-style interface to
39335 a library with vectorized intrinsics. */
39337 static tree
39338 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
39340 char name[20] = "__vr.._";
39341 tree fntype, new_fndecl, args;
39342 unsigned arity;
39343 const char *bname;
39344 machine_mode el_mode, in_mode;
39345 int n, in_n;
39347 /* The ACML is 64-bit only and suitable for unsafe math only, as
39348 it does not correctly support parts of IEEE with the required
39349 precision, such as denormals.  */
39350 if (!TARGET_64BIT
39351 || !flag_unsafe_math_optimizations)
39352 return NULL_TREE;
39354 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39355 n = TYPE_VECTOR_SUBPARTS (type_out);
39356 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39357 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39358 if (el_mode != in_mode
39359 || n != in_n)
39360 return NULL_TREE;
39362 switch (fn)
39364 CASE_CFN_SIN:
39365 CASE_CFN_COS:
39366 CASE_CFN_EXP:
39367 CASE_CFN_LOG:
39368 CASE_CFN_LOG2:
39369 CASE_CFN_LOG10:
39370 if (el_mode == DFmode && n == 2)
39372 name[4] = 'd';
39373 name[5] = '2';
39375 else if (el_mode == SFmode && n == 4)
39377 name[4] = 's';
39378 name[5] = '4';
39380 else
39381 return NULL_TREE;
39382 break;
39384 default:
39385 return NULL_TREE;
39388 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39389 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39390 sprintf (name + 7, "%s", bname+10);
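/* For instance, CFN_SIN maps to "__vrd2_sin" for a 2-element DFmode
   vector and to "__vrs4_sinf" for a 4-element SFmode vector.  */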
39392 arity = 0;
39393 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39394 arity++;
39396 if (arity == 1)
39397 fntype = build_function_type_list (type_out, type_in, NULL);
39398 else
39399 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39401 /* Build a function declaration for the vectorized function. */
39402 new_fndecl = build_decl (BUILTINS_LOCATION,
39403 FUNCTION_DECL, get_identifier (name), fntype);
39404 TREE_PUBLIC (new_fndecl) = 1;
39405 DECL_EXTERNAL (new_fndecl) = 1;
39406 DECL_IS_NOVOPS (new_fndecl) = 1;
39407 TREE_READONLY (new_fndecl) = 1;
39409 return new_fndecl;
39412 /* Returns a decl of a function that implements gather load with
39413 memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
39414 Return NULL_TREE if it is not available. */
39416 static tree
39417 ix86_vectorize_builtin_gather (const_tree mem_vectype,
39418 const_tree index_type, int scale)
39420 bool si;
39421 enum ix86_builtins code;
39423 if (! TARGET_AVX2)
39424 return NULL_TREE;
39426 if ((TREE_CODE (index_type) != INTEGER_TYPE
39427 && !POINTER_TYPE_P (index_type))
39428 || (TYPE_MODE (index_type) != SImode
39429 && TYPE_MODE (index_type) != DImode))
39430 return NULL_TREE;
39432 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39433 return NULL_TREE;
39435 /* v*gather* insn sign extends index to pointer mode. */
39436 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39437 && TYPE_UNSIGNED (index_type))
39438 return NULL_TREE;
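/* Scale can be 1, 2, 4 or 8; the power-of-two test below rejects
   anything else (e.g. a scale of 3 fails since 3 & 2 is non-zero).  */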
39440 if (scale <= 0
39441 || scale > 8
39442 || (scale & (scale - 1)) != 0)
39443 return NULL_TREE;
39445 si = TYPE_MODE (index_type) == SImode;
39446 switch (TYPE_MODE (mem_vectype))
39448 case V2DFmode:
39449 if (TARGET_AVX512VL)
39450 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
39451 else
39452 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
39453 break;
39454 case V4DFmode:
39455 if (TARGET_AVX512VL)
39456 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
39457 else
39458 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
39459 break;
39460 case V2DImode:
39461 if (TARGET_AVX512VL)
39462 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
39463 else
39464 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
39465 break;
39466 case V4DImode:
39467 if (TARGET_AVX512VL)
39468 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
39469 else
39470 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
39471 break;
39472 case V4SFmode:
39473 if (TARGET_AVX512VL)
39474 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
39475 else
39476 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
39477 break;
39478 case V8SFmode:
39479 if (TARGET_AVX512VL)
39480 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
39481 else
39482 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
39483 break;
39484 case V4SImode:
39485 if (TARGET_AVX512VL)
39486 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
39487 else
39488 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
39489 break;
39490 case V8SImode:
39491 if (TARGET_AVX512VL)
39492 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
39493 else
39494 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
39495 break;
39496 case V8DFmode:
39497 if (TARGET_AVX512F)
39498 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
39499 else
39500 return NULL_TREE;
39501 break;
39502 case V8DImode:
39503 if (TARGET_AVX512F)
39504 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
39505 else
39506 return NULL_TREE;
39507 break;
39508 case V16SFmode:
39509 if (TARGET_AVX512F)
39510 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
39511 else
39512 return NULL_TREE;
39513 break;
39514 case V16SImode:
39515 if (TARGET_AVX512F)
39516 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
39517 else
39518 return NULL_TREE;
39519 break;
39520 default:
39521 return NULL_TREE;
39524 return ix86_get_builtin (code);
39527 /* Returns a decl of a function that implements scatter store with
39528 register type VECTYPE and index type INDEX_TYPE and SCALE.
39529 Return NULL_TREE if it is not available. */
39531 static tree
39532 ix86_vectorize_builtin_scatter (const_tree vectype,
39533 const_tree index_type, int scale)
39535 bool si;
39536 enum ix86_builtins code;
39538 if (!TARGET_AVX512F)
39539 return NULL_TREE;
39541 if ((TREE_CODE (index_type) != INTEGER_TYPE
39542 && !POINTER_TYPE_P (index_type))
39543 || (TYPE_MODE (index_type) != SImode
39544 && TYPE_MODE (index_type) != DImode))
39545 return NULL_TREE;
39547 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39548 return NULL_TREE;
39550 /* v*scatter* insn sign extends index to pointer mode. */
39551 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39552 && TYPE_UNSIGNED (index_type))
39553 return NULL_TREE;
39555 /* Scale can be 1, 2, 4 or 8. */
39556 if (scale <= 0
39557 || scale > 8
39558 || (scale & (scale - 1)) != 0)
39559 return NULL_TREE;
39561 si = TYPE_MODE (index_type) == SImode;
39562 switch (TYPE_MODE (vectype))
39564 case V8DFmode:
39565 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
39566 break;
39567 case V8DImode:
39568 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
39569 break;
39570 case V16SFmode:
39571 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
39572 break;
39573 case V16SImode:
39574 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
39575 break;
39576 default:
39577 return NULL_TREE;
39580 return ix86_builtins[code];
39583 /* Return true if it is safe to use the rsqrt optabs to optimize
39584 1.0/sqrt. */
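/* (Sketch of the transform this enables, not the exact expansion: the
   approximation r = rsqrtss (x) is refined by one Newton-Raphson step,
   roughly r * (1.5f - 0.5f * x * r * r), which is why finite,
   non-trapping and unsafe math are all required.)  */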
39586 static bool
39587 use_rsqrt_p ()
39589 return (TARGET_SSE_MATH
39590 && flag_finite_math_only
39591 && !flag_trapping_math
39592 && flag_unsafe_math_optimizations);
39595 /* Returns a decl of a target-specific builtin that implements
39596 the reciprocal of the function, or NULL_TREE if not available. */
39598 static tree
39599 ix86_builtin_reciprocal (tree fndecl)
39601 switch (DECL_FUNCTION_CODE (fndecl))
39603 /* Vectorized version of sqrt to rsqrt conversion. */
39604 case IX86_BUILTIN_SQRTPS_NR:
39605 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
39607 case IX86_BUILTIN_SQRTPS_NR256:
39608 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
39610 default:
39611 return NULL_TREE;
39615 /* Helper for avx_vpermilps256_operand et al. This is also used by
39616 the expansion functions to turn the parallel back into a mask.
39617 The return value is 0 for no match and the imm8+1 for a match. */
int
39620 avx_vpermilp_parallel (rtx par, machine_mode mode)
39622 unsigned i, nelt = GET_MODE_NUNITS (mode);
39623 unsigned mask = 0;
39624 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
39626 if (XVECLEN (par, 0) != (int) nelt)
39627 return 0;
39629 /* Validate that all of the elements are constants, and not totally
39630 out of range. Copy the data into an integral array to make the
39631 subsequent checks easier. */
39632 for (i = 0; i < nelt; ++i)
39634 rtx er = XVECEXP (par, 0, i);
39635 unsigned HOST_WIDE_INT ei;
39637 if (!CONST_INT_P (er))
39638 return 0;
39639 ei = INTVAL (er);
39640 if (ei >= nelt)
39641 return 0;
39642 ipar[i] = ei;
39645 switch (mode)
39647 case V8DFmode:
39648 /* In the 512-bit DFmode case, we can only move elements within
39649 a 128-bit lane. First fill the second part of the mask,
39650 then fallthru. */
39651 for (i = 4; i < 6; ++i)
39653 if (ipar[i] < 4 || ipar[i] >= 6)
39654 return 0;
39655 mask |= (ipar[i] - 4) << i;
39657 for (i = 6; i < 8; ++i)
39659 if (ipar[i] < 6)
39660 return 0;
39661 mask |= (ipar[i] - 6) << i;
39663 /* FALLTHRU */
39665 case V4DFmode:
39666 /* In the 256-bit DFmode case, we can only move elements within
39667 a 128-bit lane. */
39668 for (i = 0; i < 2; ++i)
39670 if (ipar[i] >= 2)
39671 return 0;
39672 mask |= ipar[i] << i;
39674 for (i = 2; i < 4; ++i)
39676 if (ipar[i] < 2)
39677 return 0;
39678 mask |= (ipar[i] - 2) << i;
39680 break;
39682 case V16SFmode:
39683 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
39684 must mirror the permutation in the lower 256 bits. */
39685 for (i = 0; i < 8; ++i)
39686 if (ipar[i] + 8 != ipar[i + 8])
39687 return 0;
39688 /* FALLTHRU */
39690 case V8SFmode:
39691 /* In the 256-bit SFmode case, we have full freedom of
39692 movement within the low 128-bit lane, but the high 128-bit
39693 lane must mirror the exact same pattern. */
39694 for (i = 0; i < 4; ++i)
39695 if (ipar[i] + 4 != ipar[i + 4])
39696 return 0;
39697 nelt = 4;
39698 /* FALLTHRU */
39700 case V2DFmode:
39701 case V4SFmode:
39702 /* In the 128-bit case, we have full freedom in the placement of
39703 the elements from the source operand. */
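/* For instance, the V4SF selector (1 0 3 2) packs into imm8 0xb1
   (1<<0 | 0<<2 | 3<<4 | 2<<6), so 0xb2 is returned below.  */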
39704 for (i = 0; i < nelt; ++i)
39705 mask |= ipar[i] << (i * (nelt / 2));
39706 break;
39708 default:
39709 gcc_unreachable ();
39712 /* Make sure success has a non-zero value by adding one. */
39713 return mask + 1;
39716 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
39717 the expansion functions to turn the parallel back into a mask.
39718 The return value is 0 for no match and the imm8+1 for a match. */
int
39721 avx_vperm2f128_parallel (rtx par, machine_mode mode)
39723 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
39724 unsigned mask = 0;
39725 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
39727 if (XVECLEN (par, 0) != (int) nelt)
39728 return 0;
39730 /* Validate that all of the elements are constants, and not totally
39731 out of range. Copy the data into an integral array to make the
39732 subsequent checks easier. */
39733 for (i = 0; i < nelt; ++i)
39735 rtx er = XVECEXP (par, 0, i);
39736 unsigned HOST_WIDE_INT ei;
39738 if (!CONST_INT_P (er))
39739 return 0;
39740 ei = INTVAL (er);
39741 if (ei >= 2 * nelt)
39742 return 0;
39743 ipar[i] = ei;
39746 /* Validate that each half of the permute selects consecutive elements. */
39747 for (i = 0; i < nelt2 - 1; ++i)
39748 if (ipar[i] + 1 != ipar[i + 1])
39749 return 0;
39750 for (i = nelt2; i < nelt - 1; ++i)
39751 if (ipar[i] + 1 != ipar[i + 1])
39752 return 0;
39754 /* Reconstruct the mask. */
39755 for (i = 0; i < 2; ++i)
39757 unsigned e = ipar[i * nelt2];
39758 if (e % nelt2)
39759 return 0;
39760 e /= nelt2;
39761 mask |= e << (i * 4);
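/* For instance, the V4DF selector (2 3 4 5) reconstructs to imm8 0x21:
   2/2 = 1 goes into bits 0-3 and 4/2 = 2 into bits 4-7, and 0x22 is
   returned below.  */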
39764 /* Make sure success has a non-zero value by adding one. */
39765 return mask + 1;
39768 /* Return a register priority for hard reg REGNO. */
39769 static int
39770 ix86_register_priority (int hard_regno)
39772 /* ebp and r13 as the base always want a displacement, and r12 as the
39773 base always wants an index. So discourage their usage in an
39774 address. */
39775 if (hard_regno == R12_REG || hard_regno == R13_REG)
39776 return 0;
39777 if (hard_regno == BP_REG)
39778 return 1;
39779 /* New x86-64 int registers result in bigger code size. Discourage
39780 them. */
39781 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
39782 return 2;
39783 /* New x86-64 SSE registers result in bigger code size. Discourage
39784 them. */
39785 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
39786 return 2;
39787 /* Usage of the AX register results in smaller code. Prefer it. */
39788 if (hard_regno == AX_REG)
39789 return 4;
39790 return 3;
39793 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
39795 Put float CONST_DOUBLE in the constant pool instead of fp regs.
39796 QImode must go into class Q_REGS.
39797 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
39798 movdf to do mem-to-mem moves through integer regs. */
39800 static reg_class_t
39801 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
39803 machine_mode mode = GET_MODE (x);
39805 /* We're only allowed to return a subclass of CLASS. Many of the
39806 following checks fail for NO_REGS, so eliminate that early. */
39807 if (regclass == NO_REGS)
39808 return NO_REGS;
39810 /* All classes can load zeros. */
39811 if (x == CONST0_RTX (mode))
39812 return regclass;
39814 /* Force constants into memory if we are loading a (nonzero) constant into
39815 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
39816 instructions to load from a constant. */
39817 if (CONSTANT_P (x)
39818 && (MAYBE_MMX_CLASS_P (regclass)
39819 || MAYBE_SSE_CLASS_P (regclass)
39820 || MAYBE_MASK_CLASS_P (regclass)))
39821 return NO_REGS;
39823 /* Floating-point constants need more complex checks. */
39824 if (CONST_DOUBLE_P (x))
39826 /* General regs can load everything. */
39827 if (INTEGER_CLASS_P (regclass))
39828 return regclass;
39830 /* Floats can load 0 and 1 plus some others. Note that we eliminated
39831 zero above. We only want to wind up preferring 80387 registers if
39832 we plan on doing computation with them. */
39833 if (IS_STACK_MODE (mode)
39834 && standard_80387_constant_p (x) > 0)
39836 /* Limit class to FP regs. */
39837 if (FLOAT_CLASS_P (regclass))
39838 return FLOAT_REGS;
39839 else if (regclass == FP_TOP_SSE_REGS)
39840 return FP_TOP_REG;
39841 else if (regclass == FP_SECOND_SSE_REGS)
39842 return FP_SECOND_REG;
39845 return NO_REGS;
39848 /* Prefer SSE regs only, if we can use them for math. */
39849 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39850 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
39852 /* Generally when we see PLUS here, it's the function invariant
39853 (plus soft-fp const_int), which can only be computed into general
39854 regs. */
39855 if (GET_CODE (x) == PLUS)
39856 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
39858 /* QImode constants are easy to load, but non-constant QImode data
39859 must go into Q_REGS. */
39860 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39862 if (Q_CLASS_P (regclass))
39863 return regclass;
39864 else if (reg_class_subset_p (Q_REGS, regclass))
39865 return Q_REGS;
39866 else
39867 return NO_REGS;
39870 return regclass;
39873 /* Discourage putting floating-point values in SSE registers unless
39874 SSE math is being used, and likewise for the 387 registers. */
39875 static reg_class_t
39876 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39878 machine_mode mode = GET_MODE (x);
39880 /* Restrict the output reload class to the register bank that we are doing
39881 math on. If we would like not to return a subset of CLASS, reject this
39882 alternative: if reload cannot do this, it will still use its choice. */
39883 mode = GET_MODE (x);
39884 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39885 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39887 if (IS_STACK_MODE (mode))
39889 if (regclass == FP_TOP_SSE_REGS)
39890 return FP_TOP_REG;
39891 else if (regclass == FP_SECOND_SSE_REGS)
39892 return FP_SECOND_REG;
39893 else
39894 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39897 return regclass;
39900 static reg_class_t
39901 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39902 machine_mode mode, secondary_reload_info *sri)
39904 /* Double-word spills from general registers to non-offsettable memory
39905 references (zero-extended addresses) require special handling. */
39906 if (TARGET_64BIT
39907 && MEM_P (x)
39908 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39909 && INTEGER_CLASS_P (rclass)
39910 && !offsettable_memref_p (x))
39912 sri->icode = (in_p
39913 ? CODE_FOR_reload_noff_load
39914 : CODE_FOR_reload_noff_store);
39915 /* Add the cost of moving address to a temporary. */
39916 sri->extra_cost = 1;
39918 return NO_REGS;
39921 /* QImode spills from non-QI registers require
39922 an intermediate register on 32-bit targets. */
39923 if (mode == QImode
39924 && ((!TARGET_64BIT && !in_p
39925 && INTEGER_CLASS_P (rclass)
39926 && MAYBE_NON_Q_CLASS_P (rclass))
39927 || (!TARGET_AVX512DQ
39928 && MAYBE_MASK_CLASS_P (rclass))))
39930 int regno = true_regnum (x);
39932 /* Return Q_REGS if the operand is in memory. */
39933 if (regno == -1)
39934 return Q_REGS;
39936 return NO_REGS;
39939 /* This condition handles the corner case where an expression involving
39940 pointers gets vectorized. We're trying to use the address of a
39941 stack slot as a vector initializer.
39943 (set (reg:V2DI 74 [ vect_cst_.2 ])
39944 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39946 Eventually frame gets turned into sp+offset like this:
39948 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39949 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39950 (const_int 392 [0x188]))))
39952 That later gets turned into:
39954 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39955 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39956 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39958 We'll have the following reload recorded:
39960 Reload 0: reload_in (DI) =
39961 (plus:DI (reg/f:DI 7 sp)
39962 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39963 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39964 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39965 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39966 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39967 reload_reg_rtx: (reg:V2DI 22 xmm1)
39969 Which isn't going to work since SSE instructions can't handle scalar
39970 additions. Returning GENERAL_REGS forces the addition into an integer
39971 register, and reload can handle subsequent reloads without problems. */
39973 if (in_p && GET_CODE (x) == PLUS
39974 && SSE_CLASS_P (rclass)
39975 && SCALAR_INT_MODE_P (mode))
39976 return GENERAL_REGS;
39978 return NO_REGS;
39981 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
39983 static bool
39984 ix86_class_likely_spilled_p (reg_class_t rclass)
39986 switch (rclass)
39988 case AREG:
39989 case DREG:
39990 case CREG:
39991 case BREG:
39992 case AD_REGS:
39993 case SIREG:
39994 case DIREG:
39995 case SSE_FIRST_REG:
39996 case FP_TOP_REG:
39997 case FP_SECOND_REG:
39998 case BND_REGS:
39999 return true;
40001 default:
40002 break;
40005 return false;
40008 /* If we are copying between registers from different register sets
40009 (e.g. FP and integer), we may need a memory location.
40011 The function can't work reliably when one of the CLASSES is a class
40012 containing registers from multiple sets. We avoid this by never combining
40013 different sets in a single alternative in the machine description.
40014 Ensure that this constraint holds to avoid unexpected surprises.
40016 When STRICT is false, we are being called from REGISTER_MOVE_COST,
40017 so do not enforce these sanity checks.
40019 To optimize register_move_cost performance, define inline variant. */
40021 static inline bool
40022 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40023 machine_mode mode, int strict)
40025 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
40026 return false;
40028 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
40029 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
40030 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
40031 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
40032 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
40033 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
40034 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
40035 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
40037 gcc_assert (!strict || lra_in_progress);
40038 return true;
40041 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
40042 return true;
40044 /* Between mask and general, we have moves no larger than word size. */
40045 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
40046 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
40047 return true;
40049 /* ??? This is a lie. We do have moves between mmx/general, and for
40050 mmx/sse2. But by saying we need secondary memory we discourage the
40051 register allocator from using the mmx registers unless needed. */
40052 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
40053 return true;
40055 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40057 /* SSE1 doesn't have any direct moves from other classes. */
40058 if (!TARGET_SSE2)
40059 return true;
40061 /* If the target says that inter-unit moves are more expensive
40062 than moving through memory, then don't generate them. */
40063 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
40064 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
40065 return true;
40067 /* Between SSE and general, we have moves no larger than word size. */
40068 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40069 return true;
40072 return false;
40075 bool
40076 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40077 machine_mode mode, int strict)
40079 return inline_secondary_memory_needed (class1, class2, mode, strict);
40082 /* Implement the TARGET_CLASS_MAX_NREGS hook.
40084 On the 80386, this is the size of MODE in words,
40085 except in the FP regs, where a single reg is always enough. */
40087 static unsigned char
40088 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
40090 if (MAYBE_INTEGER_CLASS_P (rclass))
40092 if (mode == XFmode)
40093 return (TARGET_64BIT ? 2 : 3);
40094 else if (mode == XCmode)
40095 return (TARGET_64BIT ? 4 : 6);
40096 else
40097 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40099 else
40101 if (COMPLEX_MODE_P (mode))
40102 return 2;
40103 else
40104 return 1;
40108 /* Return true if the registers in CLASS cannot represent the change from
40109 modes FROM to TO. */
40111 bool
40112 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
40113 enum reg_class regclass)
40115 if (from == to)
40116 return false;
40118 /* x87 registers can't do subreg at all, as all values are reformatted
40119 to extended precision. */
40120 if (MAYBE_FLOAT_CLASS_P (regclass))
40121 return true;
40123 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
40125 /* Vector registers do not support QI or HImode loads. If we don't
40126 disallow a change to these modes, reload will assume it's ok to
40127 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
40128 the vec_dupv4hi pattern. */
40129 if (GET_MODE_SIZE (from) < 4)
40130 return true;
40133 return false;
40136 /* Return the cost of moving data of mode M between a
40137 register and memory. A value of 2 is the default; this cost is
40138 relative to those in `REGISTER_MOVE_COST'.
40140 This function is used extensively by register_move_cost that is used to
40141 build tables at startup. Make it inline in this case.
40142 When IN is 2, return the maximum of the in and out move costs.
40144 If moving between registers and memory is more expensive than
40145 between two registers, you should define this macro to express the
40146 relative cost.
40148 Also model the increased cost of moving QImode registers in
40149 non-Q_REGS classes.
40151 static inline int
40152 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
40153 int in)
40155 int cost;
40156 if (FLOAT_CLASS_P (regclass))
40158 int index;
40159 switch (mode)
40161 case SFmode:
40162 index = 0;
40163 break;
40164 case DFmode:
40165 index = 1;
40166 break;
40167 case XFmode:
40168 index = 2;
40169 break;
40170 default:
40171 return 100;
40173 if (in == 2)
40174 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
40175 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
40177 if (SSE_CLASS_P (regclass))
40179 int index;
40180 switch (GET_MODE_SIZE (mode))
40182 case 4:
40183 index = 0;
40184 break;
40185 case 8:
40186 index = 1;
40187 break;
40188 case 16:
40189 index = 2;
40190 break;
40191 default:
40192 return 100;
40194 if (in == 2)
40195 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
40196 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
40198 if (MMX_CLASS_P (regclass))
40200 int index;
40201 switch (GET_MODE_SIZE (mode))
40203 case 4:
40204 index = 0;
40205 break;
40206 case 8:
40207 index = 1;
40208 break;
40209 default:
40210 return 100;
40212 if (in == 2)
40213 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
40214 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
40216 switch (GET_MODE_SIZE (mode))
40218 case 1:
40219 if (Q_CLASS_P (regclass) || TARGET_64BIT)
40221 if (!in)
40222 return ix86_cost->int_store[0];
40223 if (TARGET_PARTIAL_REG_DEPENDENCY
40224 && optimize_function_for_speed_p (cfun))
40225 cost = ix86_cost->movzbl_load;
40226 else
40227 cost = ix86_cost->int_load[0];
40228 if (in == 2)
40229 return MAX (cost, ix86_cost->int_store[0]);
40230 return cost;
40232 else
40234 if (in == 2)
40235 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
40236 if (in)
40237 return ix86_cost->movzbl_load;
40238 else
40239 return ix86_cost->int_store[0] + 4;
40241 break;
40242 case 2:
40243 if (in == 2)
40244 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
40245 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
40246 default:
40247 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
40248 if (mode == TFmode)
40249 mode = XFmode;
40250 if (in == 2)
40251 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
40252 else if (in)
40253 cost = ix86_cost->int_load[2];
40254 else
40255 cost = ix86_cost->int_store[2];
40256 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
40260 static int
40261 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
40262 bool in)
40264 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
40268 /* Return the cost of moving data from a register in class CLASS1 to
40269 one in class CLASS2.
40271 It is not required that the cost always equal 2 when CLASS1 is the same as CLASS2;
40272 on some machines it is expensive to move between registers if they are not
40273 general registers. */
40275 static int
40276 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
40277 reg_class_t class2_i)
40279 enum reg_class class1 = (enum reg_class) class1_i;
40280 enum reg_class class2 = (enum reg_class) class2_i;
40282 /* In case we require secondary memory, compute cost of the store followed
40283 by load. In order to avoid bad register allocation choices, we need
40284 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
40286 if (inline_secondary_memory_needed (class1, class2, mode, 0))
40288 int cost = 1;
40290 cost += inline_memory_move_cost (mode, class1, 2);
40291 cost += inline_memory_move_cost (mode, class2, 2);
40293 /* When copying from a general purpose register we may emit multiple
40294 stores followed by a single load, causing a memory size mismatch stall.
40295 Count this as an arbitrarily high cost of 20. */
40296 if (targetm.class_max_nregs (class1, mode)
40297 > targetm.class_max_nregs (class2, mode))
40298 cost += 20;
40300 /* In the case of FP/MMX moves, the registers actually overlap, and we
40301 have to switch modes in order to treat them differently. */
40302 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
40303 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
40304 cost += 20;
40306 return cost;
40309 /* Moves between SSE/MMX and integer unit are expensive. */
40310 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
40311 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40313 /* ??? By keeping the returned value relatively high, we limit the number
40314 of moves between integer and MMX/SSE registers for all targets.
40315 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
40316 where integer modes in MMX/SSE registers are not tieable
40317 because of missing QImode and HImode moves to, from or between
40318 MMX/SSE registers. */
40319 return MAX (8, ix86_cost->mmxsse_to_integer);
40321 if (MAYBE_FLOAT_CLASS_P (class1))
40322 return ix86_cost->fp_move;
40323 if (MAYBE_SSE_CLASS_P (class1))
40324 return ix86_cost->sse_move;
40325 if (MAYBE_MMX_CLASS_P (class1))
40326 return ix86_cost->mmx_move;
40327 return 2;
40330 /* Return TRUE if hard register REGNO can hold a value of machine-mode
40331 MODE. */
40333 bool
40334 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
40336 /* Only the flags register can hold CCmode values, and it can hold nothing else. */
40337 if (CC_REGNO_P (regno))
40338 return GET_MODE_CLASS (mode) == MODE_CC;
40339 if (GET_MODE_CLASS (mode) == MODE_CC
40340 || GET_MODE_CLASS (mode) == MODE_RANDOM
40341 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
40342 return false;
40343 if (STACK_REGNO_P (regno))
40344 return VALID_FP_MODE_P (mode);
40345 if (MASK_REGNO_P (regno))
40346 return (VALID_MASK_REG_MODE (mode)
40347 || (TARGET_AVX512BW
40348 && VALID_MASK_AVX512BW_MODE (mode)));
40349 if (BND_REGNO_P (regno))
40350 return VALID_BND_REG_MODE (mode);
40351 if (SSE_REGNO_P (regno))
40353 /* We implement the move patterns for all vector modes into and
40354 out of SSE registers, even when no operation instructions
40355 are available. */
40357 /* For AVX-512 we allow, regardless of regno:
40358 - XI mode
40359 - any 512-bit wide vector mode
40360 - any scalar mode. */
40361 if (TARGET_AVX512F
40362 && (mode == XImode
40363 || VALID_AVX512F_REG_MODE (mode)
40364 || VALID_AVX512F_SCALAR_MODE (mode)))
40365 return true;
40367 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
40368 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40369 && MOD4_SSE_REGNO_P (regno)
40370 && mode == V64SFmode)
40371 return true;
40373 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
40374 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40375 && MOD4_SSE_REGNO_P (regno)
40376 && mode == V64SImode)
40377 return true;
40379 /* TODO check for QI/HI scalars. */
40380 /* AVX512VL allows SSE regs 16+ for 128/256-bit modes. */
40381 if (TARGET_AVX512VL
40382 && (mode == OImode
40383 || mode == TImode
40384 || VALID_AVX256_REG_MODE (mode)
40385 || VALID_AVX512VL_128_REG_MODE (mode)))
40386 return true;
40388 /* xmm16-xmm31 are only available for AVX-512. */
40389 if (EXT_REX_SSE_REGNO_P (regno))
40390 return false;
40392 /* OImode and AVX modes are available only when AVX is enabled. */
40393 return ((TARGET_AVX
40394 && VALID_AVX256_REG_OR_OI_MODE (mode))
40395 || VALID_SSE_REG_MODE (mode)
40396 || VALID_SSE2_REG_MODE (mode)
40397 || VALID_MMX_REG_MODE (mode)
40398 || VALID_MMX_REG_MODE_3DNOW (mode));
40400 if (MMX_REGNO_P (regno))
40402 /* We implement the move patterns for 3DNOW modes even in MMX mode,
40403 so if the register is available at all, then we can move data of
40404 the given mode into or out of it. */
40405 return (VALID_MMX_REG_MODE (mode)
40406 || VALID_MMX_REG_MODE_3DNOW (mode));
40409 if (mode == QImode)
40411 /* Take care for QImode values - they can be in non-QI regs,
40412 but then they do cause partial register stalls. */
40413 if (ANY_QI_REGNO_P (regno))
40414 return true;
40415 if (!TARGET_PARTIAL_REG_STALL)
40416 return true;
40417 /* LRA checks if the hard register is OK for the given mode.
40418 QImode values can live in non-QI regs, so we allow all
40419 registers here. */
40420 if (lra_in_progress)
40421 return true;
40422 return !can_create_pseudo_p ();
40424 /* We handle both integer and floats in the general purpose registers. */
40425 else if (VALID_INT_MODE_P (mode))
40426 return true;
40427 else if (VALID_FP_MODE_P (mode))
40428 return true;
40429 else if (VALID_DFP_MODE_P (mode))
40430 return true;
40431 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
40432 on to use that value in smaller contexts, this can easily force a
40433 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
40434 supporting DImode, allow it. */
40435 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
40436 return true;
40438 return false;
40441 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
40442 tieable integer mode. */
40444 static bool
40445 ix86_tieable_integer_mode_p (machine_mode mode)
40447 switch (mode)
40449 case HImode:
40450 case SImode:
40451 return true;
40453 case QImode:
40454 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
40456 case DImode:
40457 return TARGET_64BIT;
40459 default:
40460 return false;
40464 /* Return true if MODE1 is accessible in a register that can hold MODE2
40465 without copying. That is, all register classes that can hold MODE2
40466 can also hold MODE1. */
40468 bool
40469 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
40471 if (mode1 == mode2)
40472 return true;
40474 if (ix86_tieable_integer_mode_p (mode1)
40475 && ix86_tieable_integer_mode_p (mode2))
40476 return true;
40478 /* MODE2 being XFmode implies fp stack or general regs, which means we
40479 can tie any smaller floating point modes to it. Note that we do not
40480 tie this with TFmode. */
40481 if (mode2 == XFmode)
40482 return mode1 == SFmode || mode1 == DFmode;
40484 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
40485 that we can tie it with SFmode. */
40486 if (mode2 == DFmode)
40487 return mode1 == SFmode;
40489 /* If MODE2 is only appropriate for an SSE register, then tie with
40490 any other mode acceptable to SSE registers. */
40491 if (GET_MODE_SIZE (mode2) == 32
40492 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40493 return (GET_MODE_SIZE (mode1) == 32
40494 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40495 if (GET_MODE_SIZE (mode2) == 16
40496 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40497 return (GET_MODE_SIZE (mode1) == 16
40498 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40500 /* If MODE2 is appropriate for an MMX register, then tie
40501 with any other mode acceptable to MMX registers. */
40502 if (GET_MODE_SIZE (mode2) == 8
40503 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
40504 return (GET_MODE_SIZE (mode1) == 8
40505 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
40507 return false;
40510 /* Return the cost of moving between two registers of mode MODE. */
40512 static int
40513 ix86_set_reg_reg_cost (machine_mode mode)
40515 unsigned int units = UNITS_PER_WORD;
40517 switch (GET_MODE_CLASS (mode))
40519 default:
40520 break;
40522 case MODE_CC:
40523 units = GET_MODE_SIZE (CCmode);
40524 break;
40526 case MODE_FLOAT:
40527 if ((TARGET_SSE && mode == TFmode)
40528 || (TARGET_80387 && mode == XFmode)
40529 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
40530 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
40531 units = GET_MODE_SIZE (mode);
40532 break;
40534 case MODE_COMPLEX_FLOAT:
40535 if ((TARGET_SSE && mode == TCmode)
40536 || (TARGET_80387 && mode == XCmode)
40537 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
40538 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
40539 units = GET_MODE_SIZE (mode);
40540 break;
40542 case MODE_VECTOR_INT:
40543 case MODE_VECTOR_FLOAT:
40544 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
40545 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
40546 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
40547 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
40548 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
40549 units = GET_MODE_SIZE (mode);
40552 /* Return the cost of moving between two registers of mode MODE,
40553 assuming that the move will be in pieces of at most UNITS bytes. */
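/* For example, a V8SFmode move costs COSTS_N_INSNS (1) when AVX is
   enabled (units == 32) but COSTS_N_INSNS (4) on a 64-bit target
   without AVX, where it is moved in word-sized pieces.  */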
40554 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
40557 /* Compute a (partial) cost for rtx X. Return true if the complete
40558 cost has been computed, and false if subexpressions should be
40559 scanned. In either case, *TOTAL contains the cost result. */
40561 static bool
40562 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
40563 int *total, bool speed)
40565 rtx mask;
40566 enum rtx_code code = GET_CODE (x);
40567 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
40568 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
40569 int src_cost;
40571 switch (code)
40573 case SET:
40574 if (register_operand (SET_DEST (x), VOIDmode)
40575 && reg_or_0_operand (SET_SRC (x), VOIDmode))
40577 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
40578 return true;
40581 if (register_operand (SET_SRC (x), VOIDmode))
40582 /* Avoid potentially incorrect high cost from rtx_costs
40583 for non-tieable SUBREGs. */
40584 src_cost = 0;
40585 else
40587 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
40589 if (CONSTANT_P (SET_SRC (x)))
40590 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
40591 a small value, possibly zero for cheap constants. */
40592 src_cost += COSTS_N_INSNS (1);
40595 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
40596 return true;
40598 case CONST_INT:
40599 case CONST:
40600 case LABEL_REF:
40601 case SYMBOL_REF:
40602 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
40603 *total = 3;
40604 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
40605 *total = 2;
40606 else if (flag_pic && SYMBOLIC_CONST (x)
40607 && !(TARGET_64BIT
40608 && (GET_CODE (x) == LABEL_REF
40609 || (GET_CODE (x) == SYMBOL_REF
40610 && SYMBOL_REF_LOCAL_P (x))))
40611 /* Use 0 cost for CONST to improve its propagation. */
40612 && (TARGET_64BIT || GET_CODE (x) != CONST))
40613 *total = 1;
40614 else
40615 *total = 0;
40616 return true;
40618 case CONST_DOUBLE:
40619 if (IS_STACK_MODE (mode))
40620 switch (standard_80387_constant_p (x))
40622 case -1:
40623 case 0:
40624 break;
40625 case 1: /* 0.0 */
40626 *total = 1;
40627 return true;
40628 default: /* Other constants */
40629 *total = 2;
40630 return true;
40632 /* FALLTHRU */
40634 case CONST_VECTOR:
40635 switch (standard_sse_constant_p (x, mode))
40637 case 0:
40638 break;
40639 case 1: /* 0: xor eliminates false dependency */
40640 *total = 0;
40641 return true;
40642 default: /* -1: cmp contains false dependency */
40643 *total = 1;
40644 return true;
40646 /* FALLTHRU */
40648 case CONST_WIDE_INT:
40649 /* Fall back to (MEM (SYMBOL_REF)), since that's where
40650 it'll probably end up. Add a penalty for size. */
40651 *total = (COSTS_N_INSNS (1)
40652 + (!TARGET_64BIT && flag_pic)
40653 + (GET_MODE_SIZE (mode) <= 4
40654 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
40655 return true;
40657 case ZERO_EXTEND:
40658 /* Zero extension is often completely free on x86_64, so make
40659 it as cheap as possible. */
40660 if (TARGET_64BIT && mode == DImode
40661 && GET_MODE (XEXP (x, 0)) == SImode)
40662 *total = 1;
40663 else if (TARGET_ZERO_EXTEND_WITH_AND)
40664 *total = cost->add;
40665 else
40666 *total = cost->movzx;
40667 return false;
40669 case SIGN_EXTEND:
40670 *total = cost->movsx;
40671 return false;
40673 case ASHIFT:
40674 if (SCALAR_INT_MODE_P (mode)
40675 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
40676 && CONST_INT_P (XEXP (x, 1)))
40678 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40679 if (value == 1)
40681 *total = cost->add;
40682 return false;
40684 if ((value == 2 || value == 3)
40685 && cost->lea <= cost->shift_const)
40687 *total = cost->lea;
40688 return false;
40691 /* FALLTHRU */
40693 case ROTATE:
40694 case ASHIFTRT:
40695 case LSHIFTRT:
40696 case ROTATERT:
40697 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40699 /* ??? Should be SSE vector operation cost. */
40700 /* At least for published AMD latencies, this really is the same
40701 as the latency for a simple fpu operation like fabs. */
40702 /* V*QImode is emulated with 1-11 insns. */
40703 if (mode == V16QImode || mode == V32QImode)
40705 int count = 11;
40706 if (TARGET_XOP && mode == V16QImode)
40708 /* For XOP we use vpshab, which requires a broadcast of the
40709 value to the variable shift insn. For constants this
40710 means a V16QI const in mem; even when we can perform the
40711 shift with one insn, set the cost to prefer paddb. */
40712 if (CONSTANT_P (XEXP (x, 1)))
40714 *total = (cost->fabs
40715 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
40716 + (speed ? 2 : COSTS_N_BYTES (16)));
40717 return true;
40719 count = 3;
40721 else if (TARGET_SSSE3)
40722 count = 7;
40723 *total = cost->fabs * count;
40725 else
40726 *total = cost->fabs;
40728 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40730 if (CONST_INT_P (XEXP (x, 1)))
40732 if (INTVAL (XEXP (x, 1)) > 32)
40733 *total = cost->shift_const + COSTS_N_INSNS (2);
40734 else
40735 *total = cost->shift_const * 2;
40737 else
40739 if (GET_CODE (XEXP (x, 1)) == AND)
40740 *total = cost->shift_var * 2;
40741 else
40742 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
40745 else
40747 if (CONST_INT_P (XEXP (x, 1)))
40748 *total = cost->shift_const;
40749 else if (SUBREG_P (XEXP (x, 1))
40750 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
40752 /* Return the cost after shift-and truncation. */
40753 *total = cost->shift_var;
40754 return true;
40756 else
40757 *total = cost->shift_var;
40759 return false;
40761 case FMA:
40763 rtx sub;
40765 gcc_assert (FLOAT_MODE_P (mode));
40766 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
40768 /* ??? SSE scalar/vector cost should be used here. */
40769 /* ??? Bald assumption that fma has the same cost as fmul. */
40770 *total = cost->fmul;
40771 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
40773 /* Negation of op0 or op2 is free: FMS, FNMA, FNMS. */
40774 sub = XEXP (x, 0);
40775 if (GET_CODE (sub) == NEG)
40776 sub = XEXP (sub, 0);
40777 *total += rtx_cost (sub, mode, FMA, 0, speed);
40779 sub = XEXP (x, 2);
40780 if (GET_CODE (sub) == NEG)
40781 sub = XEXP (sub, 0);
40782 *total += rtx_cost (sub, mode, FMA, 2, speed);
40783 return true;
40786 case MULT:
40787 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40789 /* ??? SSE scalar cost should be used here. */
40790 *total = cost->fmul;
40791 return false;
40793 else if (X87_FLOAT_MODE_P (mode))
40795 *total = cost->fmul;
40796 return false;
40798 else if (FLOAT_MODE_P (mode))
40800 /* ??? SSE vector cost should be used here. */
40801 *total = cost->fmul;
40802 return false;
40804 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40806 /* V*QImode is emulated with 7-13 insns. */
40807 if (mode == V16QImode || mode == V32QImode)
40809 int extra = 11;
40810 if (TARGET_XOP && mode == V16QImode)
40811 extra = 5;
40812 else if (TARGET_SSSE3)
40813 extra = 6;
40814 *total = cost->fmul * 2 + cost->fabs * extra;
40816 /* V*DImode is emulated with 5-8 insns. */
40817 else if (mode == V2DImode || mode == V4DImode)
40819 if (TARGET_XOP && mode == V2DImode)
40820 *total = cost->fmul * 2 + cost->fabs * 3;
40821 else
40822 *total = cost->fmul * 3 + cost->fabs * 5;
40824 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
40825 insns, including two PMULUDQ. */
40826 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
40827 *total = cost->fmul * 2 + cost->fabs * 5;
40828 else
40829 *total = cost->fmul;
40830 return false;
40832 else
40834 rtx op0 = XEXP (x, 0);
40835 rtx op1 = XEXP (x, 1);
40836 int nbits;
40837 if (CONST_INT_P (XEXP (x, 1)))
40839 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40840 for (nbits = 0; value != 0; value &= value - 1)
40841 nbits++;
40843 else
40844 /* This is arbitrary. */
40845 nbits = 7;
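/* nbits now holds the number of set bits in the constant multiplier
   (e.g. 2 for a multiply by 12, binary 1100), or the arbitrary 7 for a
   non-constant multiplier.  */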
40847 /* Compute costs correctly for widening multiplication. */
40848 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
40849 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
40850 == GET_MODE_SIZE (mode))
40852 int is_mulwiden = 0;
40853 machine_mode inner_mode = GET_MODE (op0);
40855 if (GET_CODE (op0) == GET_CODE (op1))
40856 is_mulwiden = 1, op1 = XEXP (op1, 0);
40857 else if (CONST_INT_P (op1))
40859 if (GET_CODE (op0) == SIGN_EXTEND)
40860 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
40861 == INTVAL (op1);
40862 else
40863 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
40866 if (is_mulwiden)
40867 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
40870 *total = (cost->mult_init[MODE_INDEX (mode)]
40871 + nbits * cost->mult_bit
40872 + rtx_cost (op0, mode, outer_code, opno, speed)
40873 + rtx_cost (op1, mode, outer_code, opno, speed));
40875 return true;
40878 case DIV:
40879 case UDIV:
40880 case MOD:
40881 case UMOD:
40882 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40883 /* ??? SSE cost should be used here. */
40884 *total = cost->fdiv;
40885 else if (X87_FLOAT_MODE_P (mode))
40886 *total = cost->fdiv;
40887 else if (FLOAT_MODE_P (mode))
40888 /* ??? SSE vector cost should be used here. */
40889 *total = cost->fdiv;
40890 else
40891 *total = cost->divide[MODE_INDEX (mode)];
40892 return false;
40894 case PLUS:
40895 if (GET_MODE_CLASS (mode) == MODE_INT
40896 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
40898 if (GET_CODE (XEXP (x, 0)) == PLUS
40899 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
40900 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
40901 && CONSTANT_P (XEXP (x, 1)))
40903 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
40904 if (val == 2 || val == 4 || val == 8)
40906 *total = cost->lea;
40907 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40908 outer_code, opno, speed);
40909 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
40910 outer_code, opno, speed);
40911 *total += rtx_cost (XEXP (x, 1), mode,
40912 outer_code, opno, speed);
40913 return true;
40916 else if (GET_CODE (XEXP (x, 0)) == MULT
40917 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
40919 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
40920 if (val == 2 || val == 4 || val == 8)
40922 *total = cost->lea;
40923 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40924 outer_code, opno, speed);
40925 *total += rtx_cost (XEXP (x, 1), mode,
40926 outer_code, opno, speed);
40927 return true;
40930 else if (GET_CODE (XEXP (x, 0)) == PLUS)
40932 *total = cost->lea;
40933 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40934 outer_code, opno, speed);
40935 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40936 outer_code, opno, speed);
40937 *total += rtx_cost (XEXP (x, 1), mode,
40938 outer_code, opno, speed);
40939 return true;
40942 /* FALLTHRU */
40944 case MINUS:
40945 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40947 /* ??? SSE cost should be used here. */
40948 *total = cost->fadd;
40949 return false;
40951 else if (X87_FLOAT_MODE_P (mode))
40953 *total = cost->fadd;
40954 return false;
40956 else if (FLOAT_MODE_P (mode))
40958 /* ??? SSE vector cost should be used here. */
40959 *total = cost->fadd;
40960 return false;
40962 /* FALLTHRU */
40964 case AND:
40965 case IOR:
40966 case XOR:
40967 if (GET_MODE_CLASS (mode) == MODE_INT
40968 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40970 *total = (cost->add * 2
40971 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
40972 << (GET_MODE (XEXP (x, 0)) != DImode))
40973 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
40974 << (GET_MODE (XEXP (x, 1)) != DImode)));
40975 return true;
40977 /* FALLTHRU */
40979 case NEG:
40980 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40982 /* ??? SSE cost should be used here. */
40983 *total = cost->fchs;
40984 return false;
40986 else if (X87_FLOAT_MODE_P (mode))
40988 *total = cost->fchs;
40989 return false;
40991 else if (FLOAT_MODE_P (mode))
40993 /* ??? SSE vector cost should be used here. */
40994 *total = cost->fchs;
40995 return false;
40997 /* FALLTHRU */
40999 case NOT:
41000 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41002 /* ??? Should be SSE vector operation cost. */
41003 /* At least for published AMD latencies, this really is the same
41004 as the latency for a simple fpu operation like fabs. */
41005 *total = cost->fabs;
41007 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41008 *total = cost->add * 2;
41009 else
41010 *total = cost->add;
41011 return false;
41013 case COMPARE:
41014 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41015 && XEXP (XEXP (x, 0), 1) == const1_rtx
41016 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41017 && XEXP (x, 1) == const0_rtx)
41019 /* This kind of construct is implemented using test[bwl].
41020 Treat it as if we had an AND. */
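/* E.g. (compare (zero_extract:SI (reg) (const_int 1) (const_int 3))
   (const_int 0)) typically becomes a testb with immediate 0x8.  */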
41021 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41022 *total = (cost->add
41023 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41024 opno, speed)
41025 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41026 return true;
41029 /* The embedded comparison operand is completely free. */
41030 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
41031 && XEXP (x, 1) == const0_rtx)
41032 *total = 0;
41034 return false;
41036 case FLOAT_EXTEND:
41037 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41038 *total = 0;
41039 return false;
41041 case ABS:
41042 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41043 /* ??? SSE cost should be used here. */
41044 *total = cost->fabs;
41045 else if (X87_FLOAT_MODE_P (mode))
41046 *total = cost->fabs;
41047 else if (FLOAT_MODE_P (mode))
41048 /* ??? SSE vector cost should be used here. */
41049 *total = cost->fabs;
41050 return false;
41052 case SQRT:
41053 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41054 /* ??? SSE cost should be used here. */
41055 *total = cost->fsqrt;
41056 else if (X87_FLOAT_MODE_P (mode))
41057 *total = cost->fsqrt;
41058 else if (FLOAT_MODE_P (mode))
41059 /* ??? SSE vector cost should be used here. */
41060 *total = cost->fsqrt;
41061 return false;
41063 case UNSPEC:
41064 if (XINT (x, 1) == UNSPEC_TP)
41065 *total = 0;
41066 return false;
41068 case VEC_SELECT:
41069 case VEC_CONCAT:
41070 case VEC_DUPLICATE:
41071 /* ??? Assume all of these vector manipulation patterns are
41072 recognizable, in which case they all pretty much have the
41073 same cost. */
41074 *total = cost->fabs;
41075 return true;
41076 case VEC_MERGE:
41077 mask = XEXP (x, 2);
41078 /* This is a masked instruction; assume the same cost
41079 as the nonmasked variant. */
41080 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
41081 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
41082 else
41083 *total = cost->fabs;
41084 return true;
41086 default:
41087 return false;
41091 #if TARGET_MACHO
41093 static int current_machopic_label_num;
41095 /* Given a symbol name and its associated stub, write out the
41096 definition of the stub. */
41098 void
41099 machopic_output_stub (FILE *file, const char *symb, const char *stub)
41101 unsigned int length;
41102 char *binder_name, *symbol_name, lazy_ptr_name[32];
41103 int label = ++current_machopic_label_num;
41105 /* For 64-bit we shouldn't get here. */
41106 gcc_assert (!TARGET_64BIT);
41108 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
41109 symb = targetm.strip_name_encoding (symb);
41111 length = strlen (stub);
41112 binder_name = XALLOCAVEC (char, length + 32);
41113 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
41115 length = strlen (symb);
41116 symbol_name = XALLOCAVEC (char, length + 32);
41117 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
41119 sprintf (lazy_ptr_name, "L%d$lz", label);
41121 if (MACHOPIC_ATT_STUB)
41122 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
41123 else if (MACHOPIC_PURE)
41124 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
41125 else
41126 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
41128 fprintf (file, "%s:\n", stub);
41129 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41131 if (MACHOPIC_ATT_STUB)
41133 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
41135 else if (MACHOPIC_PURE)
41137 /* PIC stub. */
41138 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41139 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
41140 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
41141 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
41142 label, lazy_ptr_name, label);
41143 fprintf (file, "\tjmp\t*%%ecx\n");
41145 else
41146 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
41148 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
41149 it needs no stub-binding-helper. */
41150 if (MACHOPIC_ATT_STUB)
41151 return;
41153 fprintf (file, "%s:\n", binder_name);
41155 if (MACHOPIC_PURE)
41157 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
41158 fprintf (file, "\tpushl\t%%ecx\n");
41160 else
41161 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
41163 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
41165 /* N.B. Keep the correspondence of these
41166 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
41167 old-pic/new-pic/non-pic stubs; altering this will break
41168 compatibility with existing dylibs. */
41169 if (MACHOPIC_PURE)
41171 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41172 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
41174 else
41175 /* 16-byte -mdynamic-no-pic stub. */
41176 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
41178 fprintf (file, "%s:\n", lazy_ptr_name);
41179 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41180 fprintf (file, ASM_LONG "%s\n", binder_name);
41182 #endif /* TARGET_MACHO */
41184 /* Order the registers for the register allocator. */
41186 void
41187 x86_order_regs_for_local_alloc (void)
41189 int pos = 0;
41190 int i;
41192 /* First allocate the local general purpose registers. */
41193 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41194 if (GENERAL_REGNO_P (i) && call_used_regs[i])
41195 reg_alloc_order [pos++] = i;
41197 /* Global general purpose registers. */
41198 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41199 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
41200 reg_alloc_order [pos++] = i;
41202 /* x87 registers come first in case we are doing FP math
41203 using them. */
41204 if (!TARGET_SSE_MATH)
41205 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41206 reg_alloc_order [pos++] = i;
41208 /* SSE registers. */
41209 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
41210 reg_alloc_order [pos++] = i;
41211 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
41212 reg_alloc_order [pos++] = i;
41214 /* Extended REX SSE registers. */
41215 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
41216 reg_alloc_order [pos++] = i;
41218 /* Mask registers. */
41219 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
41220 reg_alloc_order [pos++] = i;
41222 /* MPX bound registers. */
41223 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
41224 reg_alloc_order [pos++] = i;
41226 /* x87 registers. */
41227 if (TARGET_SSE_MATH)
41228 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41229 reg_alloc_order [pos++] = i;
41231 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
41232 reg_alloc_order [pos++] = i;
41234 /* Initialize the rest of the array, as we do not allocate some registers
41235 at all. */
41236 while (pos < FIRST_PSEUDO_REGISTER)
41237 reg_alloc_order [pos++] = 0;
41240 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
41241 in struct attribute_spec.handler. */
41242 static tree
41243 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
41244 tree args,
41245 int,
41246 bool *no_add_attrs)
41248 if (TREE_CODE (*node) != FUNCTION_TYPE
41249 && TREE_CODE (*node) != METHOD_TYPE
41250 && TREE_CODE (*node) != FIELD_DECL
41251 && TREE_CODE (*node) != TYPE_DECL)
41253 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41254 name);
41255 *no_add_attrs = true;
41256 return NULL_TREE;
41258 if (TARGET_64BIT)
41260 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
41261 name);
41262 *no_add_attrs = true;
41263 return NULL_TREE;
41265 if (is_attribute_p ("callee_pop_aggregate_return", name))
41267 tree cst;
41269 cst = TREE_VALUE (args);
41270 if (TREE_CODE (cst) != INTEGER_CST)
41272 warning (OPT_Wattributes,
41273 "%qE attribute requires an integer constant argument",
41274 name);
41275 *no_add_attrs = true;
41277 else if (compare_tree_int (cst, 0) != 0
41278 && compare_tree_int (cst, 1) != 0)
41280 warning (OPT_Wattributes,
41281 "argument to %qE attribute is neither zero, nor one",
41282 name);
41283 *no_add_attrs = true;
41286 return NULL_TREE;
41289 return NULL_TREE;
41292 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
41293 struct attribute_spec.handler. */
41294 static tree
41295 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
41296 bool *no_add_attrs)
41298 if (TREE_CODE (*node) != FUNCTION_TYPE
41299 && TREE_CODE (*node) != METHOD_TYPE
41300 && TREE_CODE (*node) != FIELD_DECL
41301 && TREE_CODE (*node) != TYPE_DECL)
41303 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41304 name);
41305 *no_add_attrs = true;
41306 return NULL_TREE;
41309 /* Can combine regparm with all attributes but fastcall. */
41310 if (is_attribute_p ("ms_abi", name))
41312 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
41314 error ("ms_abi and sysv_abi attributes are not compatible");
41317 return NULL_TREE;
41319 else if (is_attribute_p ("sysv_abi", name))
41321 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
41323 error ("ms_abi and sysv_abi attributes are not compatible");
41326 return NULL_TREE;
41329 return NULL_TREE;
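/* For illustration, these attributes select the calling convention of an
   individual function, e.g. something like (names are illustrative):

       int msabi_fn (int, int) __attribute__ ((ms_abi));
       int sysv_fn (int, int) __attribute__ ((sysv_abi));

   As enforced above, the two attributes are mutually exclusive on the same
   function type.  */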
41332 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
41333 struct attribute_spec.handler. */
41334 static tree
41335 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
41336 bool *no_add_attrs)
41338 tree *type = NULL;
41339 if (DECL_P (*node))
41341 if (TREE_CODE (*node) == TYPE_DECL)
41342 type = &TREE_TYPE (*node);
41344 else
41345 type = node;
41347 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
41349 warning (OPT_Wattributes, "%qE attribute ignored",
41350 name);
41351 *no_add_attrs = true;
41354 else if ((is_attribute_p ("ms_struct", name)
41355 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
41356 || ((is_attribute_p ("gcc_struct", name)
41357 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
41359 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
41360 name);
41361 *no_add_attrs = true;
41364 return NULL_TREE;
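/* For illustration, these attributes pick the record layout rules for a
   particular aggregate, e.g. something like (names are illustrative):

       struct __attribute__ ((ms_struct)) s1 { char c; int i; };
       struct __attribute__ ((gcc_struct)) s2 { char c; int i; };

   As checked above, they only make sense on struct or union types and may
   not be combined on the same type.  */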
41367 static tree
41368 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
41369 bool *no_add_attrs)
41371 if (TREE_CODE (*node) != FUNCTION_DECL)
41373 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41374 name);
41375 *no_add_attrs = true;
41377 return NULL_TREE;
41380 static tree
41381 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
41382 int, bool *)
41384 return NULL_TREE;
41387 static tree
41388 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
41390 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
41391 but the function type contains args and return type data. */
41392 tree func_type = *node;
41393 tree return_type = TREE_TYPE (func_type);
41395 int nargs = 0;
41396 tree current_arg_type = TYPE_ARG_TYPES (func_type);
41397 while (current_arg_type
41398 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
41400 if (nargs == 0)
41402 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
41403 error ("interrupt service routine should have a pointer "
41404 "as the first argument");
41406 else if (nargs == 1)
41408 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
41409 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
41410 error ("interrupt service routine should have unsigned %s"
41411 "int as the second argument",
41412 TARGET_64BIT
41413 ? (TARGET_X32 ? "long long " : "long ")
41414 : "");
41416 nargs++;
41417 current_arg_type = TREE_CHAIN (current_arg_type);
41419 if (!nargs || nargs > 2)
41420 error ("interrupt service routine can only have a pointer argument "
41421 "and an optional integer argument");
41422 if (! VOID_TYPE_P (return_type))
41423 error ("interrupt service routine can't have non-void return value");
41425 return NULL_TREE;
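/* For illustration, the signatures accepted by the checks above look
   roughly like this (the handler and frame names are illustrative, and
   uword_t stands for a word-sized unsigned integer type):

       struct interrupt_frame;

       __attribute__ ((interrupt))
       void isr (struct interrupt_frame *frame);

       __attribute__ ((interrupt))
       void exc (struct interrupt_frame *frame, uword_t error_code);

   The optional second argument must be a word-mode unsigned integer,
   typically the CPU-pushed error code.  */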
41428 static bool
41429 ix86_ms_bitfield_layout_p (const_tree record_type)
41431 return ((TARGET_MS_BITFIELD_LAYOUT
41432 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
41433 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
41436 /* Returns an expression indicating where the this parameter is
41437 located on entry to the FUNCTION. */
41439 static rtx
41440 x86_this_parameter (tree function)
41442 tree type = TREE_TYPE (function);
41443 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
41444 int nregs;
41446 if (TARGET_64BIT)
41448 const int *parm_regs;
41450 if (ix86_function_type_abi (type) == MS_ABI)
41451 parm_regs = x86_64_ms_abi_int_parameter_registers;
41452 else
41453 parm_regs = x86_64_int_parameter_registers;
41454 return gen_rtx_REG (Pmode, parm_regs[aggr]);
41457 nregs = ix86_function_regparm (type, function);
41459 if (nregs > 0 && !stdarg_p (type))
41461 int regno;
41462 unsigned int ccvt = ix86_get_callcvt (type);
41464 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41465 regno = aggr ? DX_REG : CX_REG;
41466 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41468 regno = CX_REG;
41469 if (aggr)
41470 return gen_rtx_MEM (SImode,
41471 plus_constant (Pmode, stack_pointer_rtx, 4));
41473 else
41475 regno = AX_REG;
41476 if (aggr)
41478 regno = DX_REG;
41479 if (nregs == 1)
41480 return gen_rtx_MEM (SImode,
41481 plus_constant (Pmode,
41482 stack_pointer_rtx, 4));
41485 return gen_rtx_REG (SImode, regno);
41488 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
41489 aggr ? 8 : 4));
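/* For illustration: under the 64-bit SysV ABI the result above is %rdi
   (%rsi when a hidden aggregate-return pointer occupies %rdi), under the
   64-bit MS ABI it is %rcx (%rdx respectively), 32-bit fastcall uses %ecx
   (%edx for the aggregate case), thiscall normally uses %ecx, and the plain
   32-bit stack convention yields 4(%esp), or 8(%esp) when a hidden
   aggregate-return pointer precedes it.  */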
41492 /* Determine whether x86_output_mi_thunk can succeed. */
41494 static bool
41495 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
41496 const_tree function)
41498 /* 64-bit can handle anything. */
41499 if (TARGET_64BIT)
41500 return true;
41502 /* For 32-bit, everything's fine if we have one free register. */
41503 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
41504 return true;
41506 /* Need a free register for vcall_offset. */
41507 if (vcall_offset)
41508 return false;
41510 /* Need a free register for GOT references. */
41511 if (flag_pic && !targetm.binds_local_p (function))
41512 return false;
41514 /* Otherwise ok. */
41515 return true;
41518 /* Output the assembler code for a thunk function. THUNK_DECL is the
41519 declaration for the thunk function itself, FUNCTION is the decl for
41520 the target function. DELTA is an immediate constant offset to be
41521 added to THIS. If VCALL_OFFSET is nonzero, the word at
41522 *(*this + vcall_offset) should be added to THIS. */
41524 static void
41525 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
41526 HOST_WIDE_INT vcall_offset, tree function)
41528 rtx this_param = x86_this_parameter (function);
41529 rtx this_reg, tmp, fnaddr;
41530 unsigned int tmp_regno;
41531 rtx_insn *insn;
41533 if (TARGET_64BIT)
41534 tmp_regno = R10_REG;
41535 else
41537 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
41538 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41539 tmp_regno = AX_REG;
41540 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41541 tmp_regno = DX_REG;
41542 else
41543 tmp_regno = CX_REG;
41546 emit_note (NOTE_INSN_PROLOGUE_END);
41548 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
41549 pull it in now and let DELTA benefit. */
41550 if (REG_P (this_param))
41551 this_reg = this_param;
41552 else if (vcall_offset)
41554 /* Put the this parameter into %eax. */
41555 this_reg = gen_rtx_REG (Pmode, AX_REG);
41556 emit_move_insn (this_reg, this_param);
41558 else
41559 this_reg = NULL_RTX;
41561 /* Adjust the this parameter by a fixed constant. */
41562 if (delta)
41564 rtx delta_rtx = GEN_INT (delta);
41565 rtx delta_dst = this_reg ? this_reg : this_param;
41567 if (TARGET_64BIT)
41569 if (!x86_64_general_operand (delta_rtx, Pmode))
41571 tmp = gen_rtx_REG (Pmode, tmp_regno);
41572 emit_move_insn (tmp, delta_rtx);
41573 delta_rtx = tmp;
41577 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
41580 /* Adjust the this parameter by a value stored in the vtable. */
41581 if (vcall_offset)
41583 rtx vcall_addr, vcall_mem, this_mem;
41585 tmp = gen_rtx_REG (Pmode, tmp_regno);
41587 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
41588 if (Pmode != ptr_mode)
41589 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
41590 emit_move_insn (tmp, this_mem);
41592 /* Adjust the this parameter. */
41593 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
41594 if (TARGET_64BIT
41595 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
41597 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
41598 emit_move_insn (tmp2, GEN_INT (vcall_offset));
41599 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
41602 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
41603 if (Pmode != ptr_mode)
41604 emit_insn (gen_addsi_1_zext (this_reg,
41605 gen_rtx_REG (ptr_mode,
41606 REGNO (this_reg)),
41607 vcall_mem));
41608 else
41609 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
41612 /* If necessary, drop THIS back to its stack slot. */
41613 if (this_reg && this_reg != this_param)
41614 emit_move_insn (this_param, this_reg);
41616 fnaddr = XEXP (DECL_RTL (function), 0);
41617 if (TARGET_64BIT)
41619 if (!flag_pic || targetm.binds_local_p (function)
41620 || TARGET_PECOFF)
41622 else
41624 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
41625 tmp = gen_rtx_CONST (Pmode, tmp);
41626 fnaddr = gen_const_mem (Pmode, tmp);
41629 else
41631 if (!flag_pic || targetm.binds_local_p (function))
41633 #if TARGET_MACHO
41634 else if (TARGET_MACHO)
41636 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
41637 fnaddr = XEXP (fnaddr, 0);
41639 #endif /* TARGET_MACHO */
41640 else
41642 tmp = gen_rtx_REG (Pmode, CX_REG);
41643 output_set_got (tmp, NULL_RTX);
41645 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
41646 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
41647 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
41648 fnaddr = gen_const_mem (Pmode, fnaddr);
41652 /* Our sibling call patterns do not allow memories, because we have no
41653 predicate that can distinguish between frame and non-frame memory.
41654 For our purposes here, we can get away with (ab)using a jump pattern,
41655 because we're going to do no optimization. */
41656 if (MEM_P (fnaddr))
41658 if (sibcall_insn_operand (fnaddr, word_mode))
41660 fnaddr = XEXP (DECL_RTL (function), 0);
41661 tmp = gen_rtx_MEM (QImode, fnaddr);
41662 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41663 tmp = emit_call_insn (tmp);
41664 SIBLING_CALL_P (tmp) = 1;
41666 else
41667 emit_jump_insn (gen_indirect_jump (fnaddr));
41669 else
41671 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
41673 // CM_LARGE_PIC always uses a pseudo PIC register, which is
41674 // uninitialized here. Since FUNCTION is local and calling it
41675 // doesn't go through the PLT, we use scratch register %r11 as
41676 // the PIC register and initialize it here.
41677 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
41678 ix86_init_large_pic_reg (tmp_regno);
41679 fnaddr = legitimize_pic_address (fnaddr,
41680 gen_rtx_REG (Pmode, tmp_regno));
41683 if (!sibcall_insn_operand (fnaddr, word_mode))
41685 tmp = gen_rtx_REG (word_mode, tmp_regno);
41686 if (GET_MODE (fnaddr) != word_mode)
41687 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
41688 emit_move_insn (tmp, fnaddr);
41689 fnaddr = tmp;
41692 tmp = gen_rtx_MEM (QImode, fnaddr);
41693 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41694 tmp = emit_call_insn (tmp);
41695 SIBLING_CALL_P (tmp) = 1;
41697 emit_barrier ();
41699 /* Emit just enough of rest_of_compilation to get the insns emitted.
41700 Note that use_thunk calls assemble_start_function et al. */
41701 insn = get_insns ();
41702 shorten_branches (insn);
41703 final_start_function (insn, file, 1);
41704 final (insn, file, 1);
41705 final_end_function ();
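/* For illustration, a simple delta-only thunk on 64-bit (this pointer in
   %rdi, FUNCTION bound locally) comes out roughly as

       addq   $16, %rdi        # apply DELTA
       jmp    target_function  # tail-call the real method

   where the delta value and symbol name are made up for the example.  A
   nonzero VCALL_OFFSET additionally loads *this, adds the word at
   *(*this + VCALL_OFFSET) to the this pointer, and only then jumps.  */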
41708 static void
41709 x86_file_start (void)
41711 default_file_start ();
41712 if (TARGET_16BIT)
41713 fputs ("\t.code16gcc\n", asm_out_file);
41714 #if TARGET_MACHO
41715 darwin_file_start ();
41716 #endif
41717 if (X86_FILE_START_VERSION_DIRECTIVE)
41718 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
41719 if (X86_FILE_START_FLTUSED)
41720 fputs ("\t.global\t__fltused\n", asm_out_file);
41721 if (ix86_asm_dialect == ASM_INTEL)
41722 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
41725 int
41726 x86_field_alignment (tree type, int computed)
41728 machine_mode mode;
41730 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
41731 return computed;
41732 if (TARGET_IAMCU)
41733 return iamcu_alignment (type, computed);
41734 mode = TYPE_MODE (strip_array_types (type));
41735 if (mode == DFmode || mode == DCmode
41736 || GET_MODE_CLASS (mode) == MODE_INT
41737 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
41738 return MIN (32, computed);
41739 return computed;
41742 /* Print call to TARGET to FILE. */
41744 static void
41745 x86_print_call_or_nop (FILE *file, const char *target)
41747 if (flag_nop_mcount)
41748 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
41749 else
41750 fprintf (file, "1:\tcall\t%s\n", target);
41753 /* Output assembler code to FILE to increment profiler label # LABELNO
41754 for profiling a function entry. */
41755 void
41756 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
41758 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
41759 : MCOUNT_NAME);
41760 if (TARGET_64BIT)
41762 #ifndef NO_PROFILE_COUNTERS
41763 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
41764 #endif
41766 if (!TARGET_PECOFF && flag_pic)
41767 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
41768 else
41769 x86_print_call_or_nop (file, mcount_name);
41771 else if (flag_pic)
41773 #ifndef NO_PROFILE_COUNTERS
41774 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
41775 LPREFIX, labelno);
41776 #endif
41777 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
41779 else
41781 #ifndef NO_PROFILE_COUNTERS
41782 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
41783 LPREFIX, labelno);
41784 #endif
41785 x86_print_call_or_nop (file, mcount_name);
41788 if (flag_record_mcount)
41790 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
41791 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
41792 fprintf (file, "\t.previous\n");
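/* For illustration, with -pg -fPIC on a 64-bit target the body above emits
   roughly

       1:  call  *mcount@GOTPCREL(%rip)

   and -mrecord-mcount then records the address of local label 1 in the
   __mcount_loc section, which is how tools such as the Linux kernel's
   ftrace locate every profiling call site for later patching.  */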
41796 /* We don't have exact information about the insn sizes, but we may assume
41797 quite safely that we are informed about all 1-byte insns and memory
41798 address sizes. This is enough to eliminate unnecessary padding in
41799 99% of cases. */
41801 static int
41802 min_insn_size (rtx_insn *insn)
41804 int l = 0, len;
41806 if (!INSN_P (insn) || !active_insn_p (insn))
41807 return 0;
41809 /* Discard alignments we've emitted and jump instructions. */
41810 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
41811 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
41812 return 0;
41814 /* Important case: calls are always 5 bytes.
41815 It is common to have many calls in a row. */
41816 if (CALL_P (insn)
41817 && symbolic_reference_mentioned_p (PATTERN (insn))
41818 && !SIBLING_CALL_P (insn))
41819 return 5;
41820 len = get_attr_length (insn);
41821 if (len <= 1)
41822 return 1;
41824 /* For normal instructions we rely on get_attr_length being exact,
41825 with a few exceptions. */
41826 if (!JUMP_P (insn))
41828 enum attr_type type = get_attr_type (insn);
41830 switch (type)
41832 case TYPE_MULTI:
41833 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
41834 || asm_noperands (PATTERN (insn)) >= 0)
41835 return 0;
41836 break;
41837 case TYPE_OTHER:
41838 case TYPE_FCMP:
41839 break;
41840 default:
41841 /* Otherwise trust get_attr_length. */
41842 return len;
41845 l = get_attr_length_address (insn);
41846 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
41847 l = 4;
41849 if (l)
41850 return 1+l;
41851 else
41852 return 2;
41855 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41857 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
41858 16-byte window. */
41860 static void
41861 ix86_avoid_jump_mispredicts (void)
41863 rtx_insn *insn, *start = get_insns ();
41864 int nbytes = 0, njumps = 0;
41865 bool isjump = false;
41867 /* Look for all minimal intervals of instructions containing 4 jumps.
41868 The intervals are bounded by START and INSN. NBYTES is the total
41869 size of instructions in the interval including INSN and not including
41870 START. When NBYTES is smaller than 16 bytes, it is possible
41871 that the end of START and INSN end up in the same 16-byte page.
41873 The smallest offset at which INSN can start in that page is the case where
41874 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
41875 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
41877 Don't consider an asm goto as a jump; while it can contain a jump, it doesn't
41878 have to, since control transfer to its label(s) can be performed through other
41879 means, and we also estimate the minimum length of all asm stmts as 0. */
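/* Worked example of the computation below: if the interval containing the
   fourth jump measures NBYTES = 12 bytes and INSN itself is 2 bytes, the
   pad emitted before INSN is given a maximum skip of
   15 - NBYTES + sizeof (INSN) = 15 - 12 + 2 = 5 bytes, which in the
   worst-case placement is what is needed to keep the fourth jump from
   sharing a 16-byte window with the previous three.  */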
41880 for (insn = start; insn; insn = NEXT_INSN (insn))
41882 int min_size;
41884 if (LABEL_P (insn))
41886 int align = label_to_alignment (insn);
41887 int max_skip = label_to_max_skip (insn);
41889 if (max_skip > 15)
41890 max_skip = 15;
41891 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
41892 already in the current 16 byte page, because otherwise
41893 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
41894 bytes to reach 16 byte boundary. */
41895 if (align <= 0
41896 || (align <= 3 && max_skip != (1 << align) - 1))
41897 max_skip = 0;
41898 if (dump_file)
41899 fprintf (dump_file, "Label %i with max_skip %i\n",
41900 INSN_UID (insn), max_skip);
41901 if (max_skip)
41903 while (nbytes + max_skip >= 16)
41905 start = NEXT_INSN (start);
41906 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41907 || CALL_P (start))
41908 njumps--, isjump = true;
41909 else
41910 isjump = false;
41911 nbytes -= min_insn_size (start);
41914 continue;
41917 min_size = min_insn_size (insn);
41918 nbytes += min_size;
41919 if (dump_file)
41920 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
41921 INSN_UID (insn), min_size);
41922 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
41923 || CALL_P (insn))
41924 njumps++;
41925 else
41926 continue;
41928 while (njumps > 3)
41930 start = NEXT_INSN (start);
41931 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41932 || CALL_P (start))
41933 njumps--, isjump = true;
41934 else
41935 isjump = false;
41936 nbytes -= min_insn_size (start);
41938 gcc_assert (njumps >= 0);
41939 if (dump_file)
41940 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
41941 INSN_UID (start), INSN_UID (insn), nbytes);
41943 if (njumps == 3 && isjump && nbytes < 16)
41945 int padsize = 15 - nbytes + min_insn_size (insn);
41947 if (dump_file)
41948 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
41949 INSN_UID (insn), padsize);
41950 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
41954 #endif
41956 /* AMD Athlon works faster
41957 when RET is not the destination of a conditional jump or directly preceded
41958 by another jump instruction. We avoid the penalty by replacing the
41959 affected RET with the longer return form in such cases. */
41960 static void
41961 ix86_pad_returns (void)
41963 edge e;
41964 edge_iterator ei;
41966 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41968 basic_block bb = e->src;
41969 rtx_insn *ret = BB_END (bb);
41970 rtx_insn *prev;
41971 bool replace = false;
41973 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
41974 || optimize_bb_for_size_p (bb))
41975 continue;
41976 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
41977 if (active_insn_p (prev) || LABEL_P (prev))
41978 break;
41979 if (prev && LABEL_P (prev))
41981 edge e;
41982 edge_iterator ei;
41984 FOR_EACH_EDGE (e, ei, bb->preds)
41985 if (EDGE_FREQUENCY (e) && e->src->index >= 0
41986 && !(e->flags & EDGE_FALLTHRU))
41988 replace = true;
41989 break;
41992 if (!replace)
41994 prev = prev_active_insn (ret);
41995 if (prev
41996 && ((JUMP_P (prev) && any_condjump_p (prev))
41997 || CALL_P (prev)))
41998 replace = true;
41999 /* Empty functions get a branch mispredict even when
42000 the jump destination is not visible to us. */
42001 if (!prev && !optimize_function_for_size_p (cfun))
42002 replace = true;
42004 if (replace)
42006 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42007 delete_insn (ret);
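/* For illustration, on the affected CPUs a sequence such as

       jne  .L2
       ret

   is rewritten so that the return uses the two-byte `rep ret' form emitted
   by simple_return_internal_long, which sidesteps the predictor penalty for
   a RET that directly follows a jump or is itself a jump target.  */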
42012 /* Count the minimum number of instructions in BB. Return 4 if the
42013 number of instructions >= 4. */
42015 static int
42016 ix86_count_insn_bb (basic_block bb)
42018 rtx_insn *insn;
42019 int insn_count = 0;
42021 /* Count number of instructions in this block. Return 4 if the number
42022 of instructions >= 4. */
42023 FOR_BB_INSNS (bb, insn)
42025 /* Return jumps only happen in exit blocks. */
42026 if (JUMP_P (insn)
42027 && ANY_RETURN_P (PATTERN (insn)))
42028 break;
42030 if (NONDEBUG_INSN_P (insn)
42031 && GET_CODE (PATTERN (insn)) != USE
42032 && GET_CODE (PATTERN (insn)) != CLOBBER)
42034 insn_count++;
42035 if (insn_count >= 4)
42036 return insn_count;
42040 return insn_count;
42044 /* Count the minimum number of instructions in code path in BB.
42045 Return 4 if the number of instructions >= 4. */
42047 static int
42048 ix86_count_insn (basic_block bb)
42050 edge e;
42051 edge_iterator ei;
42052 int min_prev_count;
42054 /* Only bother counting instructions along paths with no
42055 more than 2 basic blocks between entry and exit. Given
42056 that BB has an edge to exit, determine if a predecessor
42057 of BB has an edge from entry. If so, compute the number
42058 of instructions in the predecessor block. If there
42059 happen to be multiple such blocks, compute the minimum. */
42060 min_prev_count = 4;
42061 FOR_EACH_EDGE (e, ei, bb->preds)
42063 edge prev_e;
42064 edge_iterator prev_ei;
42066 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42068 min_prev_count = 0;
42069 break;
42071 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
42073 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42075 int count = ix86_count_insn_bb (e->src);
42076 if (count < min_prev_count)
42077 min_prev_count = count;
42078 break;
42083 if (min_prev_count < 4)
42084 min_prev_count += ix86_count_insn_bb (bb);
42086 return min_prev_count;
42089 /* Pad short function to 4 instructions. */
42091 static void
42092 ix86_pad_short_function (void)
42094 edge e;
42095 edge_iterator ei;
42097 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42099 rtx_insn *ret = BB_END (e->src);
42100 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
42102 int insn_count = ix86_count_insn (e->src);
42104 /* Pad short function. */
42105 if (insn_count < 4)
42107 rtx_insn *insn = ret;
42109 /* Find epilogue. */
42110 while (insn
42111 && (!NOTE_P (insn)
42112 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
42113 insn = PREV_INSN (insn);
42115 if (!insn)
42116 insn = ret;
42118 /* Two NOPs count as one instruction. */
42119 insn_count = 2 * (4 - insn_count);
42120 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
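/* Worked example of the padding above: a function whose shortest path
   through the body has only 1 instruction gets 2 * (4 - 1) = 6 NOPs
   emitted just before the epilogue; since a NOP pair is counted as one
   instruction, the padded function reaches the required minimum of 4
   instructions.  */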
42126 /* Fix up a Windows system unwinder issue. If an EH region falls through into
42127 the epilogue, the Windows system unwinder will apply epilogue logic and
42128 produce incorrect offsets. This can be avoided by adding a nop between
42129 the last insn that can throw and the first insn of the epilogue. */
42131 static void
42132 ix86_seh_fixup_eh_fallthru (void)
42134 edge e;
42135 edge_iterator ei;
42137 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42139 rtx_insn *insn, *next;
42141 /* Find the beginning of the epilogue. */
42142 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42143 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42144 break;
42145 if (insn == NULL)
42146 continue;
42148 /* We only care about preceding insns that can throw. */
42149 insn = prev_active_insn (insn);
42150 if (insn == NULL || !can_throw_internal (insn))
42151 continue;
42153 /* Do not separate calls from their debug information. */
42154 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42155 if (NOTE_P (next)
42156 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
42157 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
42158 insn = next;
42159 else
42160 break;
42162 emit_insn_after (gen_nops (const1_rtx), insn);
42166 /* Given a register number BASE, the lowest of a group of registers, update
42167 regsets IN and OUT with the registers that should be avoided in input
42168 and output operands respectively when trying to avoid generating a modr/m
42169 byte for -fmitigate-rop. */
42171 static void
42172 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
42174 SET_HARD_REG_BIT (out, base);
42175 SET_HARD_REG_BIT (out, base + 1);
42176 SET_HARD_REG_BIT (in, base + 2);
42177 SET_HARD_REG_BIT (in, base + 3);
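/* Worked example of why these particular registers are risky: a
   register-register ModR/M byte is 0xC0 + reg * 8 + rm, and in the common
   encoding the output operand lands in the reg field and the input in the
   r/m field. The combinations reg = 0 or 1 (e.g. %eax/%ecx or %xmm0/%xmm1)
   with rm = 2 or 3 (e.g. %edx/%ebx or %xmm2/%xmm3) therefore produce 0xC2,
   0xC3, 0xCA or 0xCB, which are exactly the near and far RET opcodes a ROP
   gadget scanner looks for.  */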
42180 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
42181 that certain encodings of modr/m bytes do not occur. */
42182 static void
42183 ix86_mitigate_rop (void)
42185 HARD_REG_SET input_risky;
42186 HARD_REG_SET output_risky;
42187 HARD_REG_SET inout_risky;
42189 CLEAR_HARD_REG_SET (output_risky);
42190 CLEAR_HARD_REG_SET (input_risky);
42191 SET_HARD_REG_BIT (output_risky, AX_REG);
42192 SET_HARD_REG_BIT (output_risky, CX_REG);
42193 SET_HARD_REG_BIT (input_risky, BX_REG);
42194 SET_HARD_REG_BIT (input_risky, DX_REG);
42195 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
42196 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
42197 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
42198 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
42199 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
42200 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
42201 COPY_HARD_REG_SET (inout_risky, input_risky);
42202 IOR_HARD_REG_SET (inout_risky, output_risky);
42204 df_note_add_problem ();
42205 /* Fix up what stack-regs did. */
42206 df_insn_rescan_all ();
42207 df_analyze ();
42209 regrename_init (true);
42210 regrename_analyze (NULL);
42212 auto_vec<du_head_p> cands;
42214 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
42216 if (!NONDEBUG_INSN_P (insn))
42217 continue;
42219 if (GET_CODE (PATTERN (insn)) == USE
42220 || GET_CODE (PATTERN (insn)) == CLOBBER)
42221 continue;
42223 extract_insn (insn);
42225 int opno0, opno1;
42226 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42227 recog_data.n_operands, &opno0,
42228 &opno1);
42230 if (!ix86_rop_should_change_byte_p (modrm))
42231 continue;
42233 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
42235 /* This happens when regrename has to fail a block. */
42236 if (!info->op_info)
42237 continue;
42239 if (info->op_info[opno0].n_chains != 0)
42241 gcc_assert (info->op_info[opno0].n_chains == 1);
42242 du_head_p op0c;
42243 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
42244 if (op0c->target_data_1 + op0c->target_data_2 == 0
42245 && !op0c->cannot_rename)
42246 cands.safe_push (op0c);
42248 op0c->target_data_1++;
42250 if (info->op_info[opno1].n_chains != 0)
42252 gcc_assert (info->op_info[opno1].n_chains == 1);
42253 du_head_p op1c;
42254 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
42255 if (op1c->target_data_1 + op1c->target_data_2 == 0
42256 && !op1c->cannot_rename)
42257 cands.safe_push (op1c);
42259 op1c->target_data_2++;
42263 int i;
42264 du_head_p head;
42265 FOR_EACH_VEC_ELT (cands, i, head)
42267 int old_reg, best_reg;
42268 HARD_REG_SET unavailable;
42270 CLEAR_HARD_REG_SET (unavailable);
42271 if (head->target_data_1)
42272 IOR_HARD_REG_SET (unavailable, output_risky);
42273 if (head->target_data_2)
42274 IOR_HARD_REG_SET (unavailable, input_risky);
42276 int n_uses;
42277 reg_class superclass = regrename_find_superclass (head, &n_uses,
42278 &unavailable);
42279 old_reg = head->regno;
42280 best_reg = find_rename_reg (head, superclass, &unavailable,
42281 old_reg, false);
42282 bool ok = regrename_do_replace (head, best_reg);
42283 gcc_assert (ok);
42284 if (dump_file)
42285 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
42286 reg_names[best_reg], reg_class_names[superclass]);
42290 regrename_finish ();
42292 df_analyze ();
42294 basic_block bb;
42295 regset_head live;
42297 INIT_REG_SET (&live);
42299 FOR_EACH_BB_FN (bb, cfun)
42301 rtx_insn *insn;
42303 COPY_REG_SET (&live, DF_LR_OUT (bb));
42304 df_simulate_initialize_backwards (bb, &live);
42306 FOR_BB_INSNS_REVERSE (bb, insn)
42308 if (!NONDEBUG_INSN_P (insn))
42309 continue;
42311 df_simulate_one_insn_backwards (bb, insn, &live);
42313 if (GET_CODE (PATTERN (insn)) == USE
42314 || GET_CODE (PATTERN (insn)) == CLOBBER)
42315 continue;
42317 extract_insn (insn);
42318 constrain_operands_cached (insn, reload_completed);
42319 int opno0, opno1;
42320 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42321 recog_data.n_operands, &opno0,
42322 &opno1);
42323 if (modrm < 0
42324 || !ix86_rop_should_change_byte_p (modrm)
42325 || opno0 == opno1)
42326 continue;
42328 rtx oldreg = recog_data.operand[opno1];
42329 preprocess_constraints (insn);
42330 const operand_alternative *alt = which_op_alt ();
42332 int i;
42333 for (i = 0; i < recog_data.n_operands; i++)
42334 if (i != opno1
42335 && alt[i].earlyclobber
42336 && reg_overlap_mentioned_p (recog_data.operand[i],
42337 oldreg))
42338 break;
42340 if (i < recog_data.n_operands)
42341 continue;
42343 if (dump_file)
42344 fprintf (dump_file,
42345 "attempting to fix modrm byte in insn %d:"
42346 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
42347 reg_class_names[alt[opno1].cl]);
42349 HARD_REG_SET unavailable;
42350 REG_SET_TO_HARD_REG_SET (unavailable, &live);
42351 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
42352 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
42353 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
42354 IOR_HARD_REG_SET (unavailable, output_risky);
42355 IOR_COMPL_HARD_REG_SET (unavailable,
42356 reg_class_contents[alt[opno1].cl]);
42358 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42359 if (!TEST_HARD_REG_BIT (unavailable, i))
42360 break;
42361 if (i == FIRST_PSEUDO_REGISTER)
42363 if (dump_file)
42364 fprintf (dump_file, ", none available\n");
42365 continue;
42367 if (dump_file)
42368 fprintf (dump_file, " -> %d\n", i);
42369 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
42370 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
42371 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
42376 /* Implement machine specific optimizations. We implement padding of returns
42377 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
42378 static void
42379 ix86_reorg (void)
42381 /* We are freeing block_for_insn in the toplev to keep compatibility
42382 with old MDEP_REORGS that are not CFG based. Recompute it now. */
42383 compute_bb_for_insn ();
42385 if (flag_mitigate_rop)
42386 ix86_mitigate_rop ();
42388 if (TARGET_SEH && current_function_has_exception_handlers ())
42389 ix86_seh_fixup_eh_fallthru ();
42391 if (optimize && optimize_function_for_speed_p (cfun))
42393 if (TARGET_PAD_SHORT_FUNCTION)
42394 ix86_pad_short_function ();
42395 else if (TARGET_PAD_RETURNS)
42396 ix86_pad_returns ();
42397 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42398 if (TARGET_FOUR_JUMP_LIMIT)
42399 ix86_avoid_jump_mispredicts ();
42400 #endif
42404 /* Return nonzero when a QImode register that must be represented via a REX
42405 prefix is used. */
42406 bool
42407 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
42409 int i;
42410 extract_insn_cached (insn);
42411 for (i = 0; i < recog_data.n_operands; i++)
42412 if (GENERAL_REG_P (recog_data.operand[i])
42413 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
42414 return true;
42415 return false;
42418 /* Return true when INSN mentions a register that must be encoded using a
42419 REX prefix. */
42420 bool
42421 x86_extended_reg_mentioned_p (rtx insn)
42423 subrtx_iterator::array_type array;
42424 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
42426 const_rtx x = *iter;
42427 if (REG_P (x)
42428 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
42429 return true;
42431 return false;
42434 /* If profitable, negate (without causing overflow) integer constant
42435 of mode MODE at location LOC. Return true in this case. */
42436 bool
42437 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
42439 HOST_WIDE_INT val;
42441 if (!CONST_INT_P (*loc))
42442 return false;
42444 switch (mode)
42446 case DImode:
42447 /* DImode x86_64 constants must fit in 32 bits. */
42448 gcc_assert (x86_64_immediate_operand (*loc, mode));
42450 mode = SImode;
42451 break;
42453 case SImode:
42454 case HImode:
42455 case QImode:
42456 break;
42458 default:
42459 gcc_unreachable ();
42462 /* Avoid overflows. */
42463 if (mode_signbit_p (mode, *loc))
42464 return false;
42466 val = INTVAL (*loc);
42468 /* Make things pretty: emit `subl $4,%eax' rather than `addl $-4,%eax'.
42469 Exception: -128 encodes in fewer bytes than 128, so keep -128 but negate 128. */
42470 if ((val < 0 && val != -128)
42471 || val == 128)
42473 *loc = GEN_INT (-val);
42474 return true;
42477 return false;
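/* Worked examples of the transformation above: `addl $-4, %eax' becomes
   `subl $4, %eax', and `addl $128, %eax' becomes `subl $-128, %eax' because
   -128 still fits in a sign-extended 8-bit immediate while +128 does not;
   a constant of -128 itself is left alone for the same reason.  */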
42480 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
42481 optabs would emit if we didn't have TFmode patterns. */
42483 void
42484 x86_emit_floatuns (rtx operands[2])
42486 rtx_code_label *neglab, *donelab;
42487 rtx i0, i1, f0, in, out;
42488 machine_mode mode, inmode;
42490 inmode = GET_MODE (operands[1]);
42491 gcc_assert (inmode == SImode || inmode == DImode);
42493 out = operands[0];
42494 in = force_reg (inmode, operands[1]);
42495 mode = GET_MODE (out);
42496 neglab = gen_label_rtx ();
42497 donelab = gen_label_rtx ();
42498 f0 = gen_reg_rtx (mode);
42500 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
42502 expand_float (out, in, 0);
42504 emit_jump_insn (gen_jump (donelab));
42505 emit_barrier ();
42507 emit_label (neglab);
42509 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
42510 1, OPTAB_DIRECT);
42511 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
42512 1, OPTAB_DIRECT);
42513 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
42515 expand_float (f0, i0, 0);
42517 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
42519 emit_label (donelab);
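/* For reference, the negative-input path above computes the conversion as

       out = 2.0 * (FP) ((u >> 1) | (u & 1))

   ORing the shifted-out low bit back in keeps round-to-nearest behaviour
   identical to a direct unsigned conversion, and doubling the result
   restores the halved magnitude.  */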
42522 static bool canonicalize_perm (struct expand_vec_perm_d *d);
42523 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
42524 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
42525 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
42527 /* Get a vector mode of the same size as the original but with elements
42528 twice as wide. This is only guaranteed to apply to integral vectors. */
42530 static inline machine_mode
42531 get_mode_wider_vector (machine_mode o)
42533 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
42534 machine_mode n = GET_MODE_WIDER_MODE (o);
42535 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
42536 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
42537 return n;
42540 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
42541 fill target with val via vec_duplicate. */
42543 static bool
42544 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
42546 bool ok;
42547 rtx_insn *insn;
42548 rtx dup;
42550 /* First attempt to recognize VAL as-is. */
42551 dup = gen_rtx_VEC_DUPLICATE (mode, val);
42552 insn = emit_insn (gen_rtx_SET (target, dup));
42553 if (recog_memoized (insn) < 0)
42555 rtx_insn *seq;
42556 /* If that fails, force VAL into a register. */
42558 start_sequence ();
42559 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
42560 seq = get_insns ();
42561 end_sequence ();
42562 if (seq)
42563 emit_insn_before (seq, insn);
42565 ok = recog_memoized (insn) >= 0;
42566 gcc_assert (ok);
42568 return true;
42571 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42572 with all elements equal to VAR. Return true if successful. */
42574 static bool
42575 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
42576 rtx target, rtx val)
42578 bool ok;
42580 switch (mode)
42582 case V2SImode:
42583 case V2SFmode:
42584 if (!mmx_ok)
42585 return false;
42586 /* FALLTHRU */
42588 case V4DFmode:
42589 case V4DImode:
42590 case V8SFmode:
42591 case V8SImode:
42592 case V2DFmode:
42593 case V2DImode:
42594 case V4SFmode:
42595 case V4SImode:
42596 case V16SImode:
42597 case V8DImode:
42598 case V16SFmode:
42599 case V8DFmode:
42600 return ix86_vector_duplicate_value (mode, target, val);
42602 case V4HImode:
42603 if (!mmx_ok)
42604 return false;
42605 if (TARGET_SSE || TARGET_3DNOW_A)
42607 rtx x;
42609 val = gen_lowpart (SImode, val);
42610 x = gen_rtx_TRUNCATE (HImode, val);
42611 x = gen_rtx_VEC_DUPLICATE (mode, x);
42612 emit_insn (gen_rtx_SET (target, x));
42613 return true;
42615 goto widen;
42617 case V8QImode:
42618 if (!mmx_ok)
42619 return false;
42620 goto widen;
42622 case V8HImode:
42623 if (TARGET_AVX2)
42624 return ix86_vector_duplicate_value (mode, target, val);
42626 if (TARGET_SSE2)
42628 struct expand_vec_perm_d dperm;
42629 rtx tmp1, tmp2;
42631 permute:
42632 memset (&dperm, 0, sizeof (dperm));
42633 dperm.target = target;
42634 dperm.vmode = mode;
42635 dperm.nelt = GET_MODE_NUNITS (mode);
42636 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
42637 dperm.one_operand_p = true;
42639 /* Extend to SImode using a paradoxical SUBREG. */
42640 tmp1 = gen_reg_rtx (SImode);
42641 emit_move_insn (tmp1, gen_lowpart (SImode, val));
42643 /* Insert the SImode value as low element of a V4SImode vector. */
42644 tmp2 = gen_reg_rtx (V4SImode);
42645 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
42646 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
42648 ok = (expand_vec_perm_1 (&dperm)
42649 || expand_vec_perm_broadcast_1 (&dperm));
42650 gcc_assert (ok);
42651 return ok;
42653 goto widen;
42655 case V16QImode:
42656 if (TARGET_AVX2)
42657 return ix86_vector_duplicate_value (mode, target, val);
42659 if (TARGET_SSE2)
42660 goto permute;
42661 goto widen;
42663 widen:
42664 /* Replicate the value once into the next wider mode and recurse. */
42666 machine_mode smode, wsmode, wvmode;
42667 rtx x;
42669 smode = GET_MODE_INNER (mode);
42670 wvmode = get_mode_wider_vector (mode);
42671 wsmode = GET_MODE_INNER (wvmode);
42673 val = convert_modes (wsmode, smode, val, true);
42674 x = expand_simple_binop (wsmode, ASHIFT, val,
42675 GEN_INT (GET_MODE_BITSIZE (smode)),
42676 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42677 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
42679 x = gen_reg_rtx (wvmode);
42680 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
42681 gcc_assert (ok);
42682 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
42683 return ok;
42686 case V16HImode:
42687 case V32QImode:
42688 if (TARGET_AVX2)
42689 return ix86_vector_duplicate_value (mode, target, val);
42690 else
42692 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
42693 rtx x = gen_reg_rtx (hvmode);
42695 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42696 gcc_assert (ok);
42698 x = gen_rtx_VEC_CONCAT (mode, x, x);
42699 emit_insn (gen_rtx_SET (target, x));
42701 return true;
42703 case V64QImode:
42704 case V32HImode:
42705 if (TARGET_AVX512BW)
42706 return ix86_vector_duplicate_value (mode, target, val);
42707 else
42709 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
42710 rtx x = gen_reg_rtx (hvmode);
42712 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42713 gcc_assert (ok);
42715 x = gen_rtx_VEC_CONCAT (mode, x, x);
42716 emit_insn (gen_rtx_SET (target, x));
42718 return true;
42720 default:
42721 return false;
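/* Worked example of the "widen" strategy above: when no direct broadcast or
   permute is available for the mode, a QImode value such as 0x12 is first
   replicated into the next wider scalar,
   0x12 | (0x12 << 8) = 0x1212 in HImode, and the routine then recurses on
   the vector mode with elements twice as wide, repeating the trick until a
   mode it can handle directly is reached.  */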
42725 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42726 whose ONE_VAR element is VAR, and other elements are zero. Return true
42727 if successful. */
42729 static bool
42730 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
42731 rtx target, rtx var, int one_var)
42733 machine_mode vsimode;
42734 rtx new_target;
42735 rtx x, tmp;
42736 bool use_vector_set = false;
42738 switch (mode)
42740 case V2DImode:
42741 /* For SSE4.1, we normally use vector set. But if the second
42742 element is zero and inter-unit moves are OK, we use movq
42743 instead. */
42744 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
42745 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
42746 && one_var == 0));
42747 break;
42748 case V16QImode:
42749 case V4SImode:
42750 case V4SFmode:
42751 use_vector_set = TARGET_SSE4_1;
42752 break;
42753 case V8HImode:
42754 use_vector_set = TARGET_SSE2;
42755 break;
42756 case V4HImode:
42757 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
42758 break;
42759 case V32QImode:
42760 case V16HImode:
42761 case V8SImode:
42762 case V8SFmode:
42763 case V4DFmode:
42764 use_vector_set = TARGET_AVX;
42765 break;
42766 case V4DImode:
42767 /* Use ix86_expand_vector_set in 64bit mode only. */
42768 use_vector_set = TARGET_AVX && TARGET_64BIT;
42769 break;
42770 default:
42771 break;
42774 if (use_vector_set)
42776 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
42777 var = force_reg (GET_MODE_INNER (mode), var);
42778 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42779 return true;
42782 switch (mode)
42784 case V2SFmode:
42785 case V2SImode:
42786 if (!mmx_ok)
42787 return false;
42788 /* FALLTHRU */
42790 case V2DFmode:
42791 case V2DImode:
42792 if (one_var != 0)
42793 return false;
42794 var = force_reg (GET_MODE_INNER (mode), var);
42795 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
42796 emit_insn (gen_rtx_SET (target, x));
42797 return true;
42799 case V4SFmode:
42800 case V4SImode:
42801 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
42802 new_target = gen_reg_rtx (mode);
42803 else
42804 new_target = target;
42805 var = force_reg (GET_MODE_INNER (mode), var);
42806 x = gen_rtx_VEC_DUPLICATE (mode, var);
42807 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
42808 emit_insn (gen_rtx_SET (new_target, x));
42809 if (one_var != 0)
42811 /* We need to shuffle the value to the correct position, so
42812 create a new pseudo to store the intermediate result. */
42814 /* With SSE2, we can use the integer shuffle insns. */
42815 if (mode != V4SFmode && TARGET_SSE2)
42817 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
42818 const1_rtx,
42819 GEN_INT (one_var == 1 ? 0 : 1),
42820 GEN_INT (one_var == 2 ? 0 : 1),
42821 GEN_INT (one_var == 3 ? 0 : 1)));
42822 if (target != new_target)
42823 emit_move_insn (target, new_target);
42824 return true;
42827 /* Otherwise convert the intermediate result to V4SFmode and
42828 use the SSE1 shuffle instructions. */
42829 if (mode != V4SFmode)
42831 tmp = gen_reg_rtx (V4SFmode);
42832 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
42834 else
42835 tmp = new_target;
42837 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
42838 const1_rtx,
42839 GEN_INT (one_var == 1 ? 0 : 1),
42840 GEN_INT (one_var == 2 ? 0+4 : 1+4),
42841 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
42843 if (mode != V4SFmode)
42844 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
42845 else if (tmp != target)
42846 emit_move_insn (target, tmp);
42848 else if (target != new_target)
42849 emit_move_insn (target, new_target);
42850 return true;
42852 case V8HImode:
42853 case V16QImode:
42854 vsimode = V4SImode;
42855 goto widen;
42856 case V4HImode:
42857 case V8QImode:
42858 if (!mmx_ok)
42859 return false;
42860 vsimode = V2SImode;
42861 goto widen;
42862 widen:
42863 if (one_var != 0)
42864 return false;
42866 /* Zero extend the variable element to SImode and recurse. */
42867 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
42869 x = gen_reg_rtx (vsimode);
42870 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
42871 var, one_var))
42872 gcc_unreachable ();
42874 emit_move_insn (target, gen_lowpart (mode, x));
42875 return true;
42877 default:
42878 return false;
42882 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42883 consisting of the values in VALS. It is known that all elements
42884 except ONE_VAR are constants. Return true if successful. */
42886 static bool
42887 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
42888 rtx target, rtx vals, int one_var)
42890 rtx var = XVECEXP (vals, 0, one_var);
42891 machine_mode wmode;
42892 rtx const_vec, x;
42894 const_vec = copy_rtx (vals);
42895 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
42896 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
42898 switch (mode)
42900 case V2DFmode:
42901 case V2DImode:
42902 case V2SFmode:
42903 case V2SImode:
42904 /* For the two element vectors, it's just as easy to use
42905 the general case. */
42906 return false;
42908 case V4DImode:
42909 /* Use ix86_expand_vector_set in 64bit mode only. */
42910 if (!TARGET_64BIT)
42911 return false;
42912 /* FALLTHRU */
42913 case V4DFmode:
42914 case V8SFmode:
42915 case V8SImode:
42916 case V16HImode:
42917 case V32QImode:
42918 case V4SFmode:
42919 case V4SImode:
42920 case V8HImode:
42921 case V4HImode:
42922 break;
42924 case V16QImode:
42925 if (TARGET_SSE4_1)
42926 break;
42927 wmode = V8HImode;
42928 goto widen;
42929 case V8QImode:
42930 wmode = V4HImode;
42931 goto widen;
42932 widen:
42933 /* There's no way to set one QImode entry easily. Combine
42934 the variable value with its adjacent constant value, and
42935 promote to an HImode set. */
42936 x = XVECEXP (vals, 0, one_var ^ 1);
42937 if (one_var & 1)
42939 var = convert_modes (HImode, QImode, var, true);
42940 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
42941 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42942 x = GEN_INT (INTVAL (x) & 0xff);
42944 else
42946 var = convert_modes (HImode, QImode, var, true);
42947 x = gen_int_mode (INTVAL (x) << 8, HImode);
42949 if (x != const0_rtx)
42950 var = expand_simple_binop (HImode, IOR, var, x, var,
42951 1, OPTAB_LIB_WIDEN);
42953 x = gen_reg_rtx (wmode);
42954 emit_move_insn (x, gen_lowpart (wmode, const_vec));
42955 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
42957 emit_move_insn (target, gen_lowpart (mode, x));
42958 return true;
42960 default:
42961 return false;
42964 emit_move_insn (target, const_vec);
42965 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42966 return true;
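/* Worked example of the QImode widening above: for a V8QImode vector whose
   only variable element is at index 3, that element is paired with the
   constant at index 2; the variable byte is shifted into the high half of an
   HImode word, the constant byte is masked into the low half, and the
   combined HImode value is then inserted at index 1 of the corresponding
   V4HImode vector before casting back.  */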
42969 /* A subroutine of ix86_expand_vector_init_general. Use vector
42970 concatenate to handle the most general case: all values variable,
42971 and none identical. */
42973 static void
42974 ix86_expand_vector_init_concat (machine_mode mode,
42975 rtx target, rtx *ops, int n)
42977 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
42978 rtx first[16], second[8], third[4];
42979 rtvec v;
42980 int i, j;
42982 switch (n)
42984 case 2:
42985 switch (mode)
42987 case V16SImode:
42988 cmode = V8SImode;
42989 break;
42990 case V16SFmode:
42991 cmode = V8SFmode;
42992 break;
42993 case V8DImode:
42994 cmode = V4DImode;
42995 break;
42996 case V8DFmode:
42997 cmode = V4DFmode;
42998 break;
42999 case V8SImode:
43000 cmode = V4SImode;
43001 break;
43002 case V8SFmode:
43003 cmode = V4SFmode;
43004 break;
43005 case V4DImode:
43006 cmode = V2DImode;
43007 break;
43008 case V4DFmode:
43009 cmode = V2DFmode;
43010 break;
43011 case V4SImode:
43012 cmode = V2SImode;
43013 break;
43014 case V4SFmode:
43015 cmode = V2SFmode;
43016 break;
43017 case V2DImode:
43018 cmode = DImode;
43019 break;
43020 case V2SImode:
43021 cmode = SImode;
43022 break;
43023 case V2DFmode:
43024 cmode = DFmode;
43025 break;
43026 case V2SFmode:
43027 cmode = SFmode;
43028 break;
43029 default:
43030 gcc_unreachable ();
43033 if (!register_operand (ops[1], cmode))
43034 ops[1] = force_reg (cmode, ops[1]);
43035 if (!register_operand (ops[0], cmode))
43036 ops[0] = force_reg (cmode, ops[0]);
43037 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
43038 ops[1])));
43039 break;
43041 case 4:
43042 switch (mode)
43044 case V4DImode:
43045 cmode = V2DImode;
43046 break;
43047 case V4DFmode:
43048 cmode = V2DFmode;
43049 break;
43050 case V4SImode:
43051 cmode = V2SImode;
43052 break;
43053 case V4SFmode:
43054 cmode = V2SFmode;
43055 break;
43056 default:
43057 gcc_unreachable ();
43059 goto half;
43061 case 8:
43062 switch (mode)
43064 case V8DImode:
43065 cmode = V2DImode;
43066 hmode = V4DImode;
43067 break;
43068 case V8DFmode:
43069 cmode = V2DFmode;
43070 hmode = V4DFmode;
43071 break;
43072 case V8SImode:
43073 cmode = V2SImode;
43074 hmode = V4SImode;
43075 break;
43076 case V8SFmode:
43077 cmode = V2SFmode;
43078 hmode = V4SFmode;
43079 break;
43080 default:
43081 gcc_unreachable ();
43083 goto half;
43085 case 16:
43086 switch (mode)
43088 case V16SImode:
43089 cmode = V2SImode;
43090 hmode = V4SImode;
43091 gmode = V8SImode;
43092 break;
43093 case V16SFmode:
43094 cmode = V2SFmode;
43095 hmode = V4SFmode;
43096 gmode = V8SFmode;
43097 break;
43098 default:
43099 gcc_unreachable ();
43101 goto half;
43103 half:
43104 /* FIXME: We process inputs backward to help RA. PR 36222. */
43105 i = n - 1;
43106 j = (n >> 1) - 1;
43107 for (; i > 0; i -= 2, j--)
43109 first[j] = gen_reg_rtx (cmode);
43110 v = gen_rtvec (2, ops[i - 1], ops[i]);
43111 ix86_expand_vector_init (false, first[j],
43112 gen_rtx_PARALLEL (cmode, v));
43115 n >>= 1;
43116 if (n > 4)
43118 gcc_assert (hmode != VOIDmode);
43119 gcc_assert (gmode != VOIDmode);
43120 for (i = j = 0; i < n; i += 2, j++)
43122 second[j] = gen_reg_rtx (hmode);
43123 ix86_expand_vector_init_concat (hmode, second [j],
43124 &first [i], 2);
43126 n >>= 1;
43127 for (i = j = 0; i < n; i += 2, j++)
43129 third[j] = gen_reg_rtx (gmode);
43130 ix86_expand_vector_init_concat (gmode, third[j],
43131 &second[i], 2);
43133 n >>= 1;
43134 ix86_expand_vector_init_concat (mode, target, third, n);
43136 else if (n > 2)
43138 gcc_assert (hmode != VOIDmode);
43139 for (i = j = 0; i < n; i += 2, j++)
43141 second[j] = gen_reg_rtx (hmode);
43142 ix86_expand_vector_init_concat (hmode, second [j],
43143 &first [i], 2);
43145 n >>= 1;
43146 ix86_expand_vector_init_concat (mode, target, second, n);
43148 else
43149 ix86_expand_vector_init_concat (mode, target, first, n);
43150 break;
43152 default:
43153 gcc_unreachable ();
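/* For illustration, building a V8SFmode vector from 8 scalar operands
   proceeds bottom-up through the concatenation tree above: the scalars are
   paired (in reverse order, to help the register allocator) into four
   V2SFmode registers, those are concatenated pairwise into two V4SFmode
   registers, and a final concat of the two halves yields the V8SFmode
   result.  */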
43157 /* A subroutine of ix86_expand_vector_init_general. Use vector
43158 interleave to handle the most general case: all values variable,
43159 and none identical. */
43161 static void
43162 ix86_expand_vector_init_interleave (machine_mode mode,
43163 rtx target, rtx *ops, int n)
43165 machine_mode first_imode, second_imode, third_imode, inner_mode;
43166 int i, j;
43167 rtx op0, op1;
43168 rtx (*gen_load_even) (rtx, rtx, rtx);
43169 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
43170 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
43172 switch (mode)
43174 case V8HImode:
43175 gen_load_even = gen_vec_setv8hi;
43176 gen_interleave_first_low = gen_vec_interleave_lowv4si;
43177 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43178 inner_mode = HImode;
43179 first_imode = V4SImode;
43180 second_imode = V2DImode;
43181 third_imode = VOIDmode;
43182 break;
43183 case V16QImode:
43184 gen_load_even = gen_vec_setv16qi;
43185 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
43186 gen_interleave_second_low = gen_vec_interleave_lowv4si;
43187 inner_mode = QImode;
43188 first_imode = V8HImode;
43189 second_imode = V4SImode;
43190 third_imode = V2DImode;
43191 break;
43192 default:
43193 gcc_unreachable ();
43196 for (i = 0; i < n; i++)
43198 /* Extend the odd element to SImode using a paradoxical SUBREG. */
43199 op0 = gen_reg_rtx (SImode);
43200 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
43202 /* Insert the SImode value as low element of V4SImode vector. */
43203 op1 = gen_reg_rtx (V4SImode);
43204 op0 = gen_rtx_VEC_MERGE (V4SImode,
43205 gen_rtx_VEC_DUPLICATE (V4SImode,
43206 op0),
43207 CONST0_RTX (V4SImode),
43208 const1_rtx);
43209 emit_insn (gen_rtx_SET (op1, op0));
43211 /* Cast the V4SImode vector back to a vector in the original mode. */
43212 op0 = gen_reg_rtx (mode);
43213 emit_move_insn (op0, gen_lowpart (mode, op1));
43215 /* Load even elements into the second position. */
43216 emit_insn (gen_load_even (op0,
43217 force_reg (inner_mode,
43218 ops [i + i + 1]),
43219 const1_rtx));
43221 /* Cast vector to FIRST_IMODE vector. */
43222 ops[i] = gen_reg_rtx (first_imode);
43223 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
43226 /* Interleave low FIRST_IMODE vectors. */
43227 for (i = j = 0; i < n; i += 2, j++)
43229 op0 = gen_reg_rtx (first_imode);
43230 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
43232 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
43233 ops[j] = gen_reg_rtx (second_imode);
43234 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
43237 /* Interleave low SECOND_IMODE vectors. */
43238 switch (second_imode)
43240 case V4SImode:
43241 for (i = j = 0; i < n / 2; i += 2, j++)
43243 op0 = gen_reg_rtx (second_imode);
43244 emit_insn (gen_interleave_second_low (op0, ops[i],
43245 ops[i + 1]));
43247 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
43248 vector. */
43249 ops[j] = gen_reg_rtx (third_imode);
43250 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
43252 second_imode = V2DImode;
43253 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43254 /* FALLTHRU */
43256 case V2DImode:
43257 op0 = gen_reg_rtx (second_imode);
43258 emit_insn (gen_interleave_second_low (op0, ops[0],
43259 ops[1]));
43261 /* Cast the SECOND_IMODE vector back to a vector in the original
43262 mode. */
43263 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
43264 break;
43266 default:
43267 gcc_unreachable ();
43271 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
43272 all values variable, and none identical. */
43274 static void
43275 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
43276 rtx target, rtx vals)
43278 rtx ops[64], op0, op1, op2, op3, op4, op5;
43279 machine_mode half_mode = VOIDmode;
43280 machine_mode quarter_mode = VOIDmode;
43281 int n, i;
43283 switch (mode)
43285 case V2SFmode:
43286 case V2SImode:
43287 if (!mmx_ok && !TARGET_SSE)
43288 break;
43289 /* FALLTHRU */
43291 case V16SImode:
43292 case V16SFmode:
43293 case V8DFmode:
43294 case V8DImode:
43295 case V8SFmode:
43296 case V8SImode:
43297 case V4DFmode:
43298 case V4DImode:
43299 case V4SFmode:
43300 case V4SImode:
43301 case V2DFmode:
43302 case V2DImode:
43303 n = GET_MODE_NUNITS (mode);
43304 for (i = 0; i < n; i++)
43305 ops[i] = XVECEXP (vals, 0, i);
43306 ix86_expand_vector_init_concat (mode, target, ops, n);
43307 return;
43309 case V32QImode:
43310 half_mode = V16QImode;
43311 goto half;
43313 case V16HImode:
43314 half_mode = V8HImode;
43315 goto half;
43317 half:
43318 n = GET_MODE_NUNITS (mode);
43319 for (i = 0; i < n; i++)
43320 ops[i] = XVECEXP (vals, 0, i);
43321 op0 = gen_reg_rtx (half_mode);
43322 op1 = gen_reg_rtx (half_mode);
43323 ix86_expand_vector_init_interleave (half_mode, op0, ops,
43324 n >> 2);
43325 ix86_expand_vector_init_interleave (half_mode, op1,
43326 &ops [n >> 1], n >> 2);
43327 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
43328 return;
43330 case V64QImode:
43331 quarter_mode = V16QImode;
43332 half_mode = V32QImode;
43333 goto quarter;
43335 case V32HImode:
43336 quarter_mode = V8HImode;
43337 half_mode = V16HImode;
43338 goto quarter;
43340 quarter:
43341 n = GET_MODE_NUNITS (mode);
43342 for (i = 0; i < n; i++)
43343 ops[i] = XVECEXP (vals, 0, i);
43344 op0 = gen_reg_rtx (quarter_mode);
43345 op1 = gen_reg_rtx (quarter_mode);
43346 op2 = gen_reg_rtx (quarter_mode);
43347 op3 = gen_reg_rtx (quarter_mode);
43348 op4 = gen_reg_rtx (half_mode);
43349 op5 = gen_reg_rtx (half_mode);
43350 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
43351 n >> 3);
43352 ix86_expand_vector_init_interleave (quarter_mode, op1,
43353 &ops [n >> 2], n >> 3);
43354 ix86_expand_vector_init_interleave (quarter_mode, op2,
43355 &ops [n >> 1], n >> 3);
43356 ix86_expand_vector_init_interleave (quarter_mode, op3,
43357 &ops [(n >> 1) | (n >> 2)], n >> 3);
43358 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
43359 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
43360 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
43361 return;
43363 case V16QImode:
43364 if (!TARGET_SSE4_1)
43365 break;
43366 /* FALLTHRU */
43368 case V8HImode:
43369 if (!TARGET_SSE2)
43370 break;
43372 /* Don't use ix86_expand_vector_init_interleave if we can't
43373 move from GPR to SSE register directly. */
43374 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
43375 break;
43377 n = GET_MODE_NUNITS (mode);
43378 for (i = 0; i < n; i++)
43379 ops[i] = XVECEXP (vals, 0, i);
43380 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
43381 return;
43383 case V4HImode:
43384 case V8QImode:
43385 break;
43387 default:
43388 gcc_unreachable ();
43392 int i, j, n_elts, n_words, n_elt_per_word;
43393 machine_mode inner_mode;
43394 rtx words[4], shift;
43396 inner_mode = GET_MODE_INNER (mode);
43397 n_elts = GET_MODE_NUNITS (mode);
43398 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
43399 n_elt_per_word = n_elts / n_words;
43400 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
43402 for (i = 0; i < n_words; ++i)
43404 rtx word = NULL_RTX;
43406 for (j = 0; j < n_elt_per_word; ++j)
43408 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
43409 elt = convert_modes (word_mode, inner_mode, elt, true);
43411 if (j == 0)
43412 word = elt;
43413 else
43415 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
43416 word, 1, OPTAB_LIB_WIDEN);
43417 word = expand_simple_binop (word_mode, IOR, word, elt,
43418 word, 1, OPTAB_LIB_WIDEN);
43422 words[i] = word;
43425 if (n_words == 1)
43426 emit_move_insn (target, gen_lowpart (mode, words[0]));
43427 else if (n_words == 2)
43429 rtx tmp = gen_reg_rtx (mode);
43430 emit_clobber (tmp);
43431 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
43432 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
43433 emit_move_insn (target, tmp);
43435 else if (n_words == 4)
43437 rtx tmp = gen_reg_rtx (V4SImode);
43438 gcc_assert (word_mode == SImode);
43439 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
43440 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
43441 emit_move_insn (target, gen_lowpart (mode, tmp));
43443 else
43444 gcc_unreachable ();
43448 /* Initialize vector TARGET via VALS. Suppress the use of MMX
43449 instructions unless MMX_OK is true. */
43451 void
43452 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
43454 machine_mode mode = GET_MODE (target);
43455 machine_mode inner_mode = GET_MODE_INNER (mode);
43456 int n_elts = GET_MODE_NUNITS (mode);
43457 int n_var = 0, one_var = -1;
43458 bool all_same = true, all_const_zero = true;
43459 int i;
43460 rtx x;
43462 for (i = 0; i < n_elts; ++i)
43464 x = XVECEXP (vals, 0, i);
43465 if (!(CONST_SCALAR_INT_P (x)
43466 || CONST_DOUBLE_P (x)
43467 || CONST_FIXED_P (x)))
43468 n_var++, one_var = i;
43469 else if (x != CONST0_RTX (inner_mode))
43470 all_const_zero = false;
43471 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
43472 all_same = false;
43475 /* Constants are best loaded from the constant pool. */
43476 if (n_var == 0)
43478 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
43479 return;
43482 /* If all values are identical, broadcast the value. */
43483 if (all_same
43484 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
43485 XVECEXP (vals, 0, 0)))
43486 return;
43488 /* Values where only one field is non-constant are best loaded from
43489 the pool and overwritten via move later. */
43490 if (n_var == 1)
43492 if (all_const_zero
43493 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
43494 XVECEXP (vals, 0, one_var),
43495 one_var))
43496 return;
43498 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
43499 return;
43502 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
43505 void
43506 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
43508 machine_mode mode = GET_MODE (target);
43509 machine_mode inner_mode = GET_MODE_INNER (mode);
43510 machine_mode half_mode;
43511 bool use_vec_merge = false;
43512 rtx tmp;
43513 static rtx (*gen_extract[6][2]) (rtx, rtx)
43515 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
43516 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
43517 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
43518 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
43519 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
43520 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
43522 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
43524 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
43525 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
43526 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
43527 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
43528 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
43529 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
43531 int i, j, n;
43532 machine_mode mmode = VOIDmode;
43533 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
43535 switch (mode)
43537 case V2SFmode:
43538 case V2SImode:
43539 if (mmx_ok)
43541 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43542 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
43543 if (elt == 0)
43544 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43545 else
43546 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43547 emit_insn (gen_rtx_SET (target, tmp));
43548 return;
43550 break;
43552 case V2DImode:
43553 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
43554 if (use_vec_merge)
43555 break;
43557 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43558 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
43559 if (elt == 0)
43560 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43561 else
43562 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43563 emit_insn (gen_rtx_SET (target, tmp));
43564 return;
43566 case V2DFmode:
43568 rtx op0, op1;
43570 /* For the two element vectors, we implement a VEC_CONCAT with
43571 the extraction of the other element. */
43573 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
43574 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
43576 if (elt == 0)
43577 op0 = val, op1 = tmp;
43578 else
43579 op0 = tmp, op1 = val;
43581 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
43582 emit_insn (gen_rtx_SET (target, tmp));
43584 return;
43586 case V4SFmode:
43587 use_vec_merge = TARGET_SSE4_1;
43588 if (use_vec_merge)
43589 break;
43591 switch (elt)
43593 case 0:
43594 use_vec_merge = true;
43595 break;
43597 case 1:
43598 /* tmp = target = A B C D */
43599 tmp = copy_to_reg (target);
43600 /* target = A A B B */
43601 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
43602 /* target = X A B B */
43603 ix86_expand_vector_set (false, target, val, 0);
43604 /* target = A X C D */
43605 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43606 const1_rtx, const0_rtx,
43607 GEN_INT (2+4), GEN_INT (3+4)));
43608 return;
43610 case 2:
43611 /* tmp = target = A B C D */
43612 tmp = copy_to_reg (target);
43613 /* tmp = X B C D */
43614 ix86_expand_vector_set (false, tmp, val, 0);
43615 /* target = A B X D */
43616 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43617 const0_rtx, const1_rtx,
43618 GEN_INT (0+4), GEN_INT (3+4)));
43619 return;
43621 case 3:
43622 /* tmp = target = A B C D */
43623 tmp = copy_to_reg (target);
43624 /* tmp = X B C D */
43625 ix86_expand_vector_set (false, tmp, val, 0);
43626 /* target = A B C X */
43627 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43628 const0_rtx, const1_rtx,
43629 GEN_INT (2+4), GEN_INT (0+4)));
43630 return;
43632 default:
43633 gcc_unreachable ();
43635 break;
43637 case V4SImode:
43638 use_vec_merge = TARGET_SSE4_1;
43639 if (use_vec_merge)
43640 break;
43642 /* Element 0 handled by vec_merge below. */
43643 if (elt == 0)
43645 use_vec_merge = true;
43646 break;
43649 if (TARGET_SSE2)
43651 /* With SSE2, use integer shuffles to swap element 0 and ELT,
43652 store into element 0, then shuffle them back. */
43654 rtx order[4];
43656 order[0] = GEN_INT (elt);
43657 order[1] = const1_rtx;
43658 order[2] = const2_rtx;
43659 order[3] = GEN_INT (3);
43660 order[elt] = const0_rtx;
43662 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43663 order[1], order[2], order[3]));
43665 ix86_expand_vector_set (false, target, val, 0);
43667 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43668 order[1], order[2], order[3]));
43670 else
43672 /* For SSE1, we have to reuse the V4SF code. */
43673 rtx t = gen_reg_rtx (V4SFmode);
43674 emit_move_insn (t, gen_lowpart (V4SFmode, target));
43675 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
43676 emit_move_insn (target, gen_lowpart (mode, t));
43678 return;
43680 case V8HImode:
43681 use_vec_merge = TARGET_SSE2;
43682 break;
43683 case V4HImode:
43684 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43685 break;
43687 case V16QImode:
43688 use_vec_merge = TARGET_SSE4_1;
43689 break;
43691 case V8QImode:
43692 break;
43694 case V32QImode:
43695 half_mode = V16QImode;
43696 j = 0;
43697 n = 16;
43698 goto half;
43700 case V16HImode:
43701 half_mode = V8HImode;
43702 j = 1;
43703 n = 8;
43704 goto half;
43706 case V8SImode:
43707 half_mode = V4SImode;
43708 j = 2;
43709 n = 4;
43710 goto half;
43712 case V4DImode:
43713 half_mode = V2DImode;
43714 j = 3;
43715 n = 2;
43716 goto half;
43718 case V8SFmode:
43719 half_mode = V4SFmode;
43720 j = 4;
43721 n = 4;
43722 goto half;
43724 case V4DFmode:
43725 half_mode = V2DFmode;
43726 j = 5;
43727 n = 2;
43728 goto half;
43730 half:
43731 /* Compute offset. */
43732 i = elt / n;
43733 elt %= n;
43735 gcc_assert (i <= 1);
43737 /* Extract the half. */
43738 tmp = gen_reg_rtx (half_mode);
43739 emit_insn (gen_extract[j][i] (tmp, target));
43741 /* Put val in tmp at elt. */
43742 ix86_expand_vector_set (false, tmp, val, elt);
43744 /* Put it back. */
43745 emit_insn (gen_insert[j][i] (target, target, tmp));
43746 return;
43748 case V8DFmode:
43749 if (TARGET_AVX512F)
43751 mmode = QImode;
43752 gen_blendm = gen_avx512f_blendmv8df;
43754 break;
43756 case V8DImode:
43757 if (TARGET_AVX512F)
43759 mmode = QImode;
43760 gen_blendm = gen_avx512f_blendmv8di;
43762 break;
43764 case V16SFmode:
43765 if (TARGET_AVX512F)
43767 mmode = HImode;
43768 gen_blendm = gen_avx512f_blendmv16sf;
43770 break;
43772 case V16SImode:
43773 if (TARGET_AVX512F)
43775 mmode = HImode;
43776 gen_blendm = gen_avx512f_blendmv16si;
43778 break;
43780 case V32HImode:
43781 if (TARGET_AVX512F && TARGET_AVX512BW)
43783 mmode = SImode;
43784 gen_blendm = gen_avx512bw_blendmv32hi;
43786 break;
43788 case V64QImode:
43789 if (TARGET_AVX512F && TARGET_AVX512BW)
43791 mmode = DImode;
43792 gen_blendm = gen_avx512bw_blendmv64qi;
43794 break;
43796 default:
43797 break;
43800 if (mmode != VOIDmode)
43802 tmp = gen_reg_rtx (mode);
43803 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
43804 /* The avx512*_blendm<mode> expanders have different operand order
43805 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
43806 elements where the mask is set and the second input operand otherwise;
43807 in {sse,avx}*_*blend* the first input operand is used for elements
43808 where the mask is clear and second input operand otherwise. */
43809 emit_insn (gen_blendm (target, target, tmp,
43810 force_reg (mmode,
43811 gen_int_mode (1 << elt, mmode))));
43813 else if (use_vec_merge)
43815 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
43816 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
43817 emit_insn (gen_rtx_SET (target, tmp));
43819 else
43821 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43823 emit_move_insn (mem, target);
43825 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43826 emit_move_insn (tmp, val);
43828 emit_move_insn (target, mem);
43832 void
43833 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
43835 machine_mode mode = GET_MODE (vec);
43836 machine_mode inner_mode = GET_MODE_INNER (mode);
43837 bool use_vec_extr = false;
43838 rtx tmp;
43840 switch (mode)
43842 case V2SImode:
43843 case V2SFmode:
43844 if (!mmx_ok)
43845 break;
43846 /* FALLTHRU */
43848 case V2DFmode:
43849 case V2DImode:
43850 use_vec_extr = true;
43851 break;
43853 case V4SFmode:
43854 use_vec_extr = TARGET_SSE4_1;
43855 if (use_vec_extr)
43856 break;
43858 switch (elt)
43860 case 0:
43861 tmp = vec;
43862 break;
43864 case 1:
43865 case 3:
43866 tmp = gen_reg_rtx (mode);
43867 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
43868 GEN_INT (elt), GEN_INT (elt),
43869 GEN_INT (elt+4), GEN_INT (elt+4)));
43870 break;
43872 case 2:
43873 tmp = gen_reg_rtx (mode);
43874 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
43875 break;
43877 default:
43878 gcc_unreachable ();
43880 vec = tmp;
43881 use_vec_extr = true;
43882 elt = 0;
43883 break;
43885 case V4SImode:
43886 use_vec_extr = TARGET_SSE4_1;
43887 if (use_vec_extr)
43888 break;
43890 if (TARGET_SSE2)
43892 switch (elt)
43894 case 0:
43895 tmp = vec;
43896 break;
43898 case 1:
43899 case 3:
43900 tmp = gen_reg_rtx (mode);
43901 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
43902 GEN_INT (elt), GEN_INT (elt),
43903 GEN_INT (elt), GEN_INT (elt)));
43904 break;
43906 case 2:
43907 tmp = gen_reg_rtx (mode);
43908 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
43909 break;
43911 default:
43912 gcc_unreachable ();
43914 vec = tmp;
43915 use_vec_extr = true;
43916 elt = 0;
43918 else
43920 /* For SSE1, we have to reuse the V4SF code. */
43921 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
43922 gen_lowpart (V4SFmode, vec), elt);
43923 return;
43925 break;
43927 case V8HImode:
43928 use_vec_extr = TARGET_SSE2;
43929 break;
43930 case V4HImode:
43931 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43932 break;
43934 case V16QImode:
43935 use_vec_extr = TARGET_SSE4_1;
43936 break;
43938 case V8SFmode:
43939 if (TARGET_AVX)
43941 tmp = gen_reg_rtx (V4SFmode);
43942 if (elt < 4)
43943 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
43944 else
43945 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
43946 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43947 return;
43949 break;
43951 case V4DFmode:
43952 if (TARGET_AVX)
43954 tmp = gen_reg_rtx (V2DFmode);
43955 if (elt < 2)
43956 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
43957 else
43958 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
43959 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43960 return;
43962 break;
43964 case V32QImode:
43965 if (TARGET_AVX)
43967 tmp = gen_reg_rtx (V16QImode);
43968 if (elt < 16)
43969 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
43970 else
43971 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
43972 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43973 return;
43975 break;
43977 case V16HImode:
43978 if (TARGET_AVX)
43980 tmp = gen_reg_rtx (V8HImode);
43981 if (elt < 8)
43982 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
43983 else
43984 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43985 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43986 return;
43988 break;
43990 case V8SImode:
43991 if (TARGET_AVX)
43993 tmp = gen_reg_rtx (V4SImode);
43994 if (elt < 4)
43995 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43996 else
43997 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43998 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43999 return;
44001 break;
44003 case V4DImode:
44004 if (TARGET_AVX)
44006 tmp = gen_reg_rtx (V2DImode);
44007 if (elt < 2)
44008 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
44009 else
44010 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
44011 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44012 return;
44014 break;
44016 case V32HImode:
44017 if (TARGET_AVX512BW)
44019 tmp = gen_reg_rtx (V16HImode);
44020 if (elt < 16)
44021 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
44022 else
44023 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
44024 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44025 return;
44027 break;
44029 case V64QImode:
44030 if (TARGET_AVX512BW)
44032 tmp = gen_reg_rtx (V32QImode);
44033 if (elt < 32)
44034 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
44035 else
44036 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
44037 ix86_expand_vector_extract (false, target, tmp, elt & 31);
44038 return;
44040 break;
44042 case V16SFmode:
44043 tmp = gen_reg_rtx (V8SFmode);
44044 if (elt < 8)
44045 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
44046 else
44047 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
44048 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44049 return;
44051 case V8DFmode:
44052 tmp = gen_reg_rtx (V4DFmode);
44053 if (elt < 4)
44054 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44055 else
44056 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44057 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44058 return;
44060 case V16SImode:
44061 tmp = gen_reg_rtx (V8SImode);
44062 if (elt < 8)
44063 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44064 else
44065 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44066 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44067 return;
44069 case V8DImode:
44070 tmp = gen_reg_rtx (V4DImode);
44071 if (elt < 4)
44072 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44073 else
44074 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44075 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44076 return;
44078 case V8QImode:
44079 /* ??? Could extract the appropriate HImode element and shift. */
44080 default:
44081 break;
44084 if (use_vec_extr)
44086 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44087 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44089 /* Let the rtl optimizers know about the zero extension performed. */
44090 if (inner_mode == QImode || inner_mode == HImode)
44092 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44093 target = gen_lowpart (SImode, target);
44096 emit_insn (gen_rtx_SET (target, tmp));
44098 else
44100 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44102 emit_move_insn (mem, vec);
44104 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44105 emit_move_insn (target, tmp);
44109 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
44110 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
44111 The upper bits of DEST are undefined, though they shouldn't cause
44112 exceptions (some bits from src or all zeros are ok). */
44114 static void
44115 emit_reduc_half (rtx dest, rtx src, int i)
44117 rtx tem, d = dest;
44118 switch (GET_MODE (src))
44120 case V4SFmode:
44121 if (i == 128)
44122 tem = gen_sse_movhlps (dest, src, src);
44123 else
44124 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
44125 GEN_INT (1 + 4), GEN_INT (1 + 4));
44126 break;
44127 case V2DFmode:
44128 tem = gen_vec_interleave_highv2df (dest, src, src);
44129 break;
44130 case V16QImode:
44131 case V8HImode:
44132 case V4SImode:
44133 case V2DImode:
44134 d = gen_reg_rtx (V1TImode);
44135 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
44136 GEN_INT (i / 2));
44137 break;
44138 case V8SFmode:
44139 if (i == 256)
44140 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
44141 else
44142 tem = gen_avx_shufps256 (dest, src, src,
44143 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
44144 break;
44145 case V4DFmode:
44146 if (i == 256)
44147 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
44148 else
44149 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
44150 break;
44151 case V32QImode:
44152 case V16HImode:
44153 case V8SImode:
44154 case V4DImode:
44155 if (i == 256)
44157 if (GET_MODE (dest) != V4DImode)
44158 d = gen_reg_rtx (V4DImode);
44159 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
44160 gen_lowpart (V4DImode, src),
44161 const1_rtx);
44163 else
44165 d = gen_reg_rtx (V2TImode);
44166 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
44167 GEN_INT (i / 2));
44169 break;
44170 case V64QImode:
44171 case V32HImode:
44172 case V16SImode:
44173 case V16SFmode:
44174 case V8DImode:
44175 case V8DFmode:
44176 if (i > 128)
44177 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
44178 gen_lowpart (V16SImode, src),
44179 gen_lowpart (V16SImode, src),
44180 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
44181 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
44182 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
44183 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
44184 GEN_INT (0xC), GEN_INT (0xD),
44185 GEN_INT (0xE), GEN_INT (0xF),
44186 GEN_INT (0x10), GEN_INT (0x11),
44187 GEN_INT (0x12), GEN_INT (0x13),
44188 GEN_INT (0x14), GEN_INT (0x15),
44189 GEN_INT (0x16), GEN_INT (0x17));
44190 else
44191 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
44192 gen_lowpart (V16SImode, src),
44193 GEN_INT (i == 128 ? 0x2 : 0x1),
44194 GEN_INT (0x3),
44195 GEN_INT (0x3),
44196 GEN_INT (0x3),
44197 GEN_INT (i == 128 ? 0x6 : 0x5),
44198 GEN_INT (0x7),
44199 GEN_INT (0x7),
44200 GEN_INT (0x7),
44201 GEN_INT (i == 128 ? 0xA : 0x9),
44202 GEN_INT (0xB),
44203 GEN_INT (0xB),
44204 GEN_INT (0xB),
44205 GEN_INT (i == 128 ? 0xE : 0xD),
44206 GEN_INT (0xF),
44207 GEN_INT (0xF),
44208 GEN_INT (0xF));
44209 break;
44210 default:
44211 gcc_unreachable ();
44213 emit_insn (tem);
44214 if (d != dest)
44215 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
44218 /* Expand a vector reduction. FN is the binary pattern to reduce;
44219 DEST is the destination; IN is the input vector. */
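/* The loop below repeatedly folds the upper half of the live data onto the
   lower half and applies FN: e.g. for a V8HImode reduction emit_reduc_half
   is called with i = 128, 64 and 32, so after log2 (nelts) steps the
   reduction value is in element 0 of DEST (the remaining elements are not
   meaningful).  */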
44221 void
44222 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
44224 rtx half, dst, vec = in;
44225 machine_mode mode = GET_MODE (in);
44226 int i;
44228 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
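/* phminposuw computes both the minimum of the eight unsigned words and its
   position in one instruction; only the value, placed in the low word of
   DEST, matters here.  */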
44229 if (TARGET_SSE4_1
44230 && mode == V8HImode
44231 && fn == gen_uminv8hi3)
44233 emit_insn (gen_sse4_1_phminposuw (dest, in));
44234 return;
44237 for (i = GET_MODE_BITSIZE (mode);
44238 i > GET_MODE_UNIT_BITSIZE (mode);
44239 i >>= 1)
44241 half = gen_reg_rtx (mode);
44242 emit_reduc_half (half, vec, i);
44243 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
44244 dst = dest;
44245 else
44246 dst = gen_reg_rtx (mode);
44247 emit_insn (fn (dst, half, vec));
44248 vec = dst;
44252 /* Target hook for scalar_mode_supported_p. */
44253 static bool
44254 ix86_scalar_mode_supported_p (machine_mode mode)
44256 if (DECIMAL_FLOAT_MODE_P (mode))
44257 return default_decimal_float_supported_p ();
44258 else if (mode == TFmode)
44259 return true;
44260 else
44261 return default_scalar_mode_supported_p (mode);
44264 /* Implements target hook vector_mode_supported_p. */
44265 static bool
44266 ix86_vector_mode_supported_p (machine_mode mode)
44268 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
44269 return true;
44270 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
44271 return true;
44272 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
44273 return true;
44274 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
44275 return true;
44276 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
44277 return true;
44278 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
44279 return true;
44280 return false;
44283 /* Target hook for c_mode_for_suffix. */
44284 static machine_mode
44285 ix86_c_mode_for_suffix (char suffix)
44287 if (suffix == 'q')
44288 return TFmode;
44289 if (suffix == 'w')
44290 return XFmode;
44292 return VOIDmode;
44295 /* Worker function for TARGET_MD_ASM_ADJUST.
44297 We implement asm flag outputs, and maintain source compatibility
44298 with the old cc0-based compiler. */
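/* As a (hypothetical) user-level example of what is handled below:

     bool c;
     asm ("addl %2, %1" : "=@ccc" (c), "+r" (lo) : "r" (addend));

   The "=@ccc" constraint requests the carry flag; each such output is
   rewritten into a read of the flags register in the CC mode chosen by the
   switch below (CCCmode with EQ for "c"), and the user's variable is set
   from that condition.  */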
44300 static rtx_insn *
44301 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
44302 vec<const char *> &constraints,
44303 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
44305 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
44306 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
44308 bool saw_asm_flag = false;
44310 start_sequence ();
44311 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
44313 const char *con = constraints[i];
44314 if (strncmp (con, "=@cc", 4) != 0)
44315 continue;
44316 con += 4;
44317 if (strchr (con, ',') != NULL)
44319 error ("alternatives not allowed in asm flag output");
44320 continue;
44323 bool invert = false;
44324 if (con[0] == 'n')
44325 invert = true, con++;
44327 machine_mode mode = CCmode;
44328 rtx_code code = UNKNOWN;
44330 switch (con[0])
44332 case 'a':
44333 if (con[1] == 0)
44334 mode = CCAmode, code = EQ;
44335 else if (con[1] == 'e' && con[2] == 0)
44336 mode = CCCmode, code = NE;
44337 break;
44338 case 'b':
44339 if (con[1] == 0)
44340 mode = CCCmode, code = EQ;
44341 else if (con[1] == 'e' && con[2] == 0)
44342 mode = CCAmode, code = NE;
44343 break;
44344 case 'c':
44345 if (con[1] == 0)
44346 mode = CCCmode, code = EQ;
44347 break;
44348 case 'e':
44349 if (con[1] == 0)
44350 mode = CCZmode, code = EQ;
44351 break;
44352 case 'g':
44353 if (con[1] == 0)
44354 mode = CCGCmode, code = GT;
44355 else if (con[1] == 'e' && con[2] == 0)
44356 mode = CCGCmode, code = GE;
44357 break;
44358 case 'l':
44359 if (con[1] == 0)
44360 mode = CCGCmode, code = LT;
44361 else if (con[1] == 'e' && con[2] == 0)
44362 mode = CCGCmode, code = LE;
44363 break;
44364 case 'o':
44365 if (con[1] == 0)
44366 mode = CCOmode, code = EQ;
44367 break;
44368 case 'p':
44369 if (con[1] == 0)
44370 mode = CCPmode, code = EQ;
44371 break;
44372 case 's':
44373 if (con[1] == 0)
44374 mode = CCSmode, code = EQ;
44375 break;
44376 case 'z':
44377 if (con[1] == 0)
44378 mode = CCZmode, code = EQ;
44379 break;
44381 if (code == UNKNOWN)
44383 error ("unknown asm flag output %qs", constraints[i]);
44384 continue;
44386 if (invert)
44387 code = reverse_condition (code);
44389 rtx dest = outputs[i];
44390 if (!saw_asm_flag)
44392 /* This is the first asm flag output. Here we put the flags
44393 register in as the real output and adjust the condition to
44394 allow it. */
44395 constraints[i] = "=Bf";
44396 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
44397 saw_asm_flag = true;
44399 else
44401 /* We don't need the flags register as output twice. */
44402 constraints[i] = "=X";
44403 outputs[i] = gen_rtx_SCRATCH (SImode);
44406 rtx x = gen_rtx_REG (mode, FLAGS_REG);
44407 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
44409 machine_mode dest_mode = GET_MODE (dest);
44410 if (!SCALAR_INT_MODE_P (dest_mode))
44412 error ("invalid type for asm flag output");
44413 continue;
44416 if (dest_mode == DImode && !TARGET_64BIT)
44417 dest_mode = SImode;
44419 if (dest_mode != QImode)
44421 rtx destqi = gen_reg_rtx (QImode);
44422 emit_insn (gen_rtx_SET (destqi, x));
44424 if (TARGET_ZERO_EXTEND_WITH_AND
44425 && optimize_function_for_speed_p (cfun))
44427 x = force_reg (dest_mode, const0_rtx);
44429 emit_insn (gen_movstrictqi
44430 (gen_lowpart (QImode, x), destqi));
44432 else
44433 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
44436 if (dest_mode != GET_MODE (dest))
44438 rtx tmp = gen_reg_rtx (SImode);
44440 emit_insn (gen_rtx_SET (tmp, x));
44441 emit_insn (gen_zero_extendsidi2 (dest, tmp));
44443 else
44444 emit_insn (gen_rtx_SET (dest, x));
44446 rtx_insn *seq = get_insns ();
44447 end_sequence ();
44449 if (saw_asm_flag)
44450 return seq;
44451 else
44453 /* If we had no asm flag outputs, clobber the flags. */
44454 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
44455 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
44456 return NULL;
44460 /* Implements target vector targetm.asm.encode_section_info. */
44462 static void ATTRIBUTE_UNUSED
44463 ix86_encode_section_info (tree decl, rtx rtl, int first)
44465 default_encode_section_info (decl, rtl, first);
44467 if (ix86_in_large_data_p (decl))
44468 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
44471 /* Worker function for REVERSE_CONDITION. */
44473 enum rtx_code
44474 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
44476 return (mode != CCFPmode && mode != CCFPUmode
44477 ? reverse_condition (code)
44478 : reverse_condition_maybe_unordered (code));
44481 /* Output code to perform an x87 FP register move, from OPERANDS[1]
44482 to OPERANDS[0]. */
44484 const char *
44485 output_387_reg_move (rtx_insn *insn, rtx *operands)
44487 if (REG_P (operands[0]))
44489 if (REG_P (operands[1])
44490 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44492 if (REGNO (operands[0]) == FIRST_STACK_REG)
44493 return output_387_ffreep (operands, 0);
44494 return "fstp\t%y0";
44496 if (STACK_TOP_P (operands[0]))
44497 return "fld%Z1\t%y1";
44498 return "fst\t%y0";
44500 else if (MEM_P (operands[0]))
44502 gcc_assert (REG_P (operands[1]));
44503 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44504 return "fstp%Z0\t%y0";
44505 else
44507 /* There is no non-popping store to memory for XFmode.
44508 So if we need one, follow the store with a load. */
44509 if (GET_MODE (operands[0]) == XFmode)
44510 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
44511 else
44512 return "fst%Z0\t%y0";
44515 else
44516 gcc_unreachable();
44519 /* Output code to perform a conditional jump to LABEL if the C2 flag in
44520 the FP status register is set. */
44522 void
44523 ix86_emit_fp_unordered_jump (rtx label)
44525 rtx reg = gen_reg_rtx (HImode);
44526 rtx temp;
44528 emit_insn (gen_x86_fnstsw_1 (reg));
44530 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
44532 emit_insn (gen_x86_sahf_1 (reg));
44534 temp = gen_rtx_REG (CCmode, FLAGS_REG);
44535 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
44537 else
44539 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
44541 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
44542 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
44545 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
44546 gen_rtx_LABEL_REF (VOIDmode, label),
44547 pc_rtx);
44548 temp = gen_rtx_SET (pc_rtx, temp);
44550 emit_jump_insn (temp);
44551 predict_jump (REG_BR_PROB_BASE * 10 / 100);
44554 /* Output code to perform a log1p XFmode calculation. */
44556 void ix86_emit_i387_log1p (rtx op0, rtx op1)
44558 rtx_code_label *label1 = gen_label_rtx ();
44559 rtx_code_label *label2 = gen_label_rtx ();
44561 rtx tmp = gen_reg_rtx (XFmode);
44562 rtx tmp2 = gen_reg_rtx (XFmode);
44563 rtx test;
44565 emit_insn (gen_absxf2 (tmp, op1));
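/* fyl2xp1 is only specified for arguments with |x| < 1 - sqrt(2)/2
   (about 0.2928932); the comparison below sends larger inputs to the
   generic log2 (1 + op1) * ln2 path at label1.  */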
44566 test = gen_rtx_GE (VOIDmode, tmp,
44567 const_double_from_real_value (
44568 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
44569 XFmode));
44570 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
44572 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44573 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
44574 emit_jump (label2);
44576 emit_label (label1);
44577 emit_move_insn (tmp, CONST1_RTX (XFmode));
44578 emit_insn (gen_addxf3 (tmp, op1, tmp));
44579 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44580 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
44582 emit_label (label2);
44585 /* Output code to compute round (OP1), storing the result into OP0, using i387 instructions. */
44586 void ix86_emit_i387_round (rtx op0, rtx op1)
44588 machine_mode inmode = GET_MODE (op1);
44589 machine_mode outmode = GET_MODE (op0);
44590 rtx e1, e2, res, tmp, tmp1, half;
44591 rtx scratch = gen_reg_rtx (HImode);
44592 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
44593 rtx_code_label *jump_label = gen_label_rtx ();
44594 rtx insn;
44595 rtx (*gen_abs) (rtx, rtx);
44596 rtx (*gen_neg) (rtx, rtx);
44598 switch (inmode)
44600 case SFmode:
44601 gen_abs = gen_abssf2;
44602 break;
44603 case DFmode:
44604 gen_abs = gen_absdf2;
44605 break;
44606 case XFmode:
44607 gen_abs = gen_absxf2;
44608 break;
44609 default:
44610 gcc_unreachable ();
44613 switch (outmode)
44615 case SFmode:
44616 gen_neg = gen_negsf2;
44617 break;
44618 case DFmode:
44619 gen_neg = gen_negdf2;
44620 break;
44621 case XFmode:
44622 gen_neg = gen_negxf2;
44623 break;
44624 case HImode:
44625 gen_neg = gen_neghi2;
44626 break;
44627 case SImode:
44628 gen_neg = gen_negsi2;
44629 break;
44630 case DImode:
44631 gen_neg = gen_negdi2;
44632 break;
44633 default:
44634 gcc_unreachable ();
44637 e1 = gen_reg_rtx (inmode);
44638 e2 = gen_reg_rtx (inmode);
44639 res = gen_reg_rtx (outmode);
44641 half = const_double_from_real_value (dconsthalf, inmode);
44643 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
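/* For example, round (-2.3): fabs gives 2.3, adding 0.5 gives 2.8, floor
   gives 2.0, and restoring the sign yields -2.0.  Halfway cases such as
   2.5 become floor (3.0) = 3.0, i.e. ties round away from zero.  */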
44645 /* scratch = fxam(op1) */
44646 emit_insn (gen_rtx_SET (scratch,
44647 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
44648 UNSPEC_FXAM)));
44649 /* e1 = fabs(op1) */
44650 emit_insn (gen_abs (e1, op1));
44652 /* e2 = e1 + 0.5 */
44653 half = force_reg (inmode, half);
44654 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
44656 /* res = floor(e2) */
44657 if (inmode != XFmode)
44659 tmp1 = gen_reg_rtx (XFmode);
44661 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
44663 else
44664 tmp1 = e2;
44666 switch (outmode)
44668 case SFmode:
44669 case DFmode:
44671 rtx tmp0 = gen_reg_rtx (XFmode);
44673 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
44675 emit_insn (gen_rtx_SET (res,
44676 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
44677 UNSPEC_TRUNC_NOOP)));
44679 break;
44680 case XFmode:
44681 emit_insn (gen_frndintxf2_floor (res, tmp1));
44682 break;
44683 case HImode:
44684 emit_insn (gen_lfloorxfhi2 (res, tmp1));
44685 break;
44686 case SImode:
44687 emit_insn (gen_lfloorxfsi2 (res, tmp1));
44688 break;
44689 case DImode:
44690 emit_insn (gen_lfloorxfdi2 (res, tmp1));
44691 break;
44692 default:
44693 gcc_unreachable ();
44696 /* flags = signbit(a) */
44697 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
44699 /* if (flags) then res = -res */
44700 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
44701 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
44702 gen_rtx_LABEL_REF (VOIDmode, jump_label),
44703 pc_rtx);
44704 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44705 predict_jump (REG_BR_PROB_BASE * 50 / 100);
44706 JUMP_LABEL (insn) = jump_label;
44708 emit_insn (gen_neg (res, res));
44710 emit_label (jump_label);
44711 LABEL_NUSES (jump_label) = 1;
44713 emit_move_insn (op0, res);
44716 /* Output code to perform a Newton-Raphson approximation of a single precision
44717 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
44719 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
44721 rtx x0, x1, e0, e1;
44723 x0 = gen_reg_rtx (mode);
44724 e0 = gen_reg_rtx (mode);
44725 e1 = gen_reg_rtx (mode);
44726 x1 = gen_reg_rtx (mode);
44728 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
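/* This is one Newton-Raphson refinement x1 = x0 * (2 - b * x0) of the
   hardware reciprocal estimate x0 = rcp (b), rewritten as
   (x0 + x0) - (b * x0 * x0) to match the instructions emitted below; each
   refinement step roughly doubles the number of correct bits.  */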
44730 b = force_reg (mode, b);
44732 /* x0 = rcp(b) estimate */
44733 if (mode == V16SFmode || mode == V8DFmode)
44735 if (TARGET_AVX512ER)
44737 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44738 UNSPEC_RCP28)));
44739 /* res = a * x0 */
44740 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
44741 return;
44743 else
44744 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44745 UNSPEC_RCP14)));
44747 else
44748 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44749 UNSPEC_RCP)));
44751 /* e0 = x0 * b */
44752 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
44754 /* e0 = x0 * e0 */
44755 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
44757 /* e1 = x0 + x0 */
44758 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
44760 /* x1 = e1 - e0 */
44761 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
44763 /* res = a * x1 */
44764 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
44767 /* Output code to perform a Newton-Raphson approximation of a
44768 single precision floating point [reciprocal] square root. */
44770 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
44772 rtx x0, e0, e1, e2, e3, mthree, mhalf;
44773 REAL_VALUE_TYPE r;
44774 int unspec;
44776 x0 = gen_reg_rtx (mode);
44777 e0 = gen_reg_rtx (mode);
44778 e1 = gen_reg_rtx (mode);
44779 e2 = gen_reg_rtx (mode);
44780 e3 = gen_reg_rtx (mode);
44782 if (TARGET_AVX512ER && mode == V16SFmode)
44784 if (recip)
44785 /* res = rsqrt28(a) estimate */
44786 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44787 UNSPEC_RSQRT28)));
44788 else
44790 /* x0 = rsqrt28(a) estimate */
44791 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44792 UNSPEC_RSQRT28)));
44793 /* res = rcp28(x0) estimate */
44794 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
44795 UNSPEC_RCP28)));
44797 return;
44800 real_from_integer (&r, VOIDmode, -3, SIGNED);
44801 mthree = const_double_from_real_value (r, SFmode);
44803 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
44804 mhalf = const_double_from_real_value (r, SFmode);
44805 unspec = UNSPEC_RSQRT;
44807 if (VECTOR_MODE_P (mode))
44809 mthree = ix86_build_const_vector (mode, true, mthree);
44810 mhalf = ix86_build_const_vector (mode, true, mhalf);
44811 /* There is no 512-bit rsqrt. There is, however, rsqrt14. */
44812 if (GET_MODE_SIZE (mode) == 64)
44813 unspec = UNSPEC_RSQRT14;
44816 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
44817 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
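/* These are one Newton-Raphson step for 1/sqrt(a),
   x1 = x0 * (3 - a * x0 * x0) / 2, written as -0.5 * x0 * (a*x0*x0 - 3.0)
   to match the multiplies and adds emitted below; the sqrt variant simply
   multiplies the refined estimate by a, since sqrt(a) = a * rsqrt(a).  */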
44819 a = force_reg (mode, a);
44821 /* x0 = rsqrt(a) estimate */
44822 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44823 unspec)));
44825 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
44826 if (!recip)
44828 rtx zero = force_reg (mode, CONST0_RTX(mode));
44829 rtx mask;
44831 /* Handle masked compare. */
44832 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
44834 mask = gen_reg_rtx (HImode);
44835 /* Imm value 0x4 corresponds to not-equal comparison. */
44836 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
44837 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
44839 else
44841 mask = gen_reg_rtx (mode);
44842 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
44843 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
44847 /* e0 = x0 * a */
44848 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
44849 /* e1 = e0 * x0 */
44850 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
44852 /* e2 = e1 - 3. */
44853 mthree = force_reg (mode, mthree);
44854 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
44856 mhalf = force_reg (mode, mhalf);
44857 if (recip)
44858 /* e3 = -.5 * x0 */
44859 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
44860 else
44861 /* e3 = -.5 * e0 */
44862 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
44863 /* ret = e2 * e3 */
44864 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
44867 #ifdef TARGET_SOLARIS
44868 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
44870 static void
44871 i386_solaris_elf_named_section (const char *name, unsigned int flags,
44872 tree decl)
44874 /* With Binutils 2.15, the "@unwind" marker must be specified on
44875 every occurrence of the ".eh_frame" section, not just the first
44876 one. */
44877 if (TARGET_64BIT
44878 && strcmp (name, ".eh_frame") == 0)
44880 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
44881 flags & SECTION_WRITE ? "aw" : "a");
44882 return;
44885 #ifndef USE_GAS
44886 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
44888 solaris_elf_asm_comdat_section (name, flags, decl);
44889 return;
44891 #endif
44893 default_elf_asm_named_section (name, flags, decl);
44895 #endif /* TARGET_SOLARIS */
44897 /* Return the mangling of TYPE if it is an extended fundamental type. */
44899 static const char *
44900 ix86_mangle_type (const_tree type)
44902 type = TYPE_MAIN_VARIANT (type);
44904 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
44905 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
44906 return NULL;
44908 switch (TYPE_MODE (type))
44910 case TFmode:
44911 /* __float128 is "g". */
44912 return "g";
44913 case XFmode:
44914 /* "long double" or __float80 is "e". */
44915 return "e";
44916 default:
44917 return NULL;
44921 #ifdef TARGET_THREAD_SSP_OFFSET
44922 /* If using TLS guards, don't waste time creating and expanding
44923 __stack_chk_guard decl and MEM as we are going to ignore it. */
44924 static tree
44925 ix86_stack_protect_guard (void)
44927 if (TARGET_SSP_TLS_GUARD)
44928 return NULL_TREE;
44929 return default_stack_protect_guard ();
44931 #endif
44933 /* For 32-bit code we can save PIC register setup by using the
44934 __stack_chk_fail_local hidden function instead of calling
44935 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
44936 register, so it is better to call __stack_chk_fail directly. */
44938 static tree ATTRIBUTE_UNUSED
44939 ix86_stack_protect_fail (void)
44941 return TARGET_64BIT
44942 ? default_external_stack_protect_fail ()
44943 : default_hidden_stack_protect_fail ();
44946 /* Select a format to encode pointers in exception handling data. CODE
44947 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44948 true if the symbol may be affected by dynamic relocations.
44950 ??? All x86 object file formats are capable of representing this.
44951 After all, the relocation needed is the same as for the call insn.
44952 Whether or not a particular assembler allows us to enter such, I
44953 guess we'll have to see. */
44954 int
44955 asm_preferred_eh_data_format (int code, int global)
44957 if (flag_pic)
44959 int type = DW_EH_PE_sdata8;
44960 if (!TARGET_64BIT
44961 || ix86_cmodel == CM_SMALL_PIC
44962 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44963 type = DW_EH_PE_sdata4;
44964 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44966 if (ix86_cmodel == CM_SMALL
44967 || (ix86_cmodel == CM_MEDIUM && code))
44968 return DW_EH_PE_udata4;
44969 return DW_EH_PE_absptr;
44972 /* Expand copysign from SIGN to the positive value ABS_VALUE,
44973 storing the result in RESULT. If MASK is non-null, it shall be a mask
44974 that masks out the sign bit. */
44975 static void
44976 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44978 machine_mode mode = GET_MODE (sign);
44979 rtx sgn = gen_reg_rtx (mode);
44980 if (mask == NULL_RTX)
44982 machine_mode vmode;
44984 if (mode == SFmode)
44985 vmode = V4SFmode;
44986 else if (mode == DFmode)
44987 vmode = V2DFmode;
44988 else
44989 vmode = mode;
44991 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44992 if (!VECTOR_MODE_P (mode))
44994 /* We need to generate a scalar mode mask in this case. */
44995 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44996 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44997 mask = gen_reg_rtx (mode);
44998 emit_insn (gen_rtx_SET (mask, tmp));
45001 else
45002 mask = gen_rtx_NOT (mode, mask);
45003 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
45004 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
45007 /* Expand fabs (OP0) and return a new rtx that holds the result. The
45008 mask for masking out the sign-bit is stored in *SMASK, if that is
45009 non-null. */
45010 static rtx
45011 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45013 machine_mode vmode, mode = GET_MODE (op0);
45014 rtx xa, mask;
45016 xa = gen_reg_rtx (mode);
45017 if (mode == SFmode)
45018 vmode = V4SFmode;
45019 else if (mode == DFmode)
45020 vmode = V2DFmode;
45021 else
45022 vmode = mode;
45023 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45024 if (!VECTOR_MODE_P (mode))
45026 /* We need to generate a scalar mode mask in this case. */
45027 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45028 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45029 mask = gen_reg_rtx (mode);
45030 emit_insn (gen_rtx_SET (mask, tmp));
45032 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45034 if (smask)
45035 *smask = mask;
45037 return xa;
45040 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45041 swapping the operands if SWAP_OPERANDS is true. The expanded
45042 code is a forward jump to a newly created label in case the
45043 comparison is true. The generated label rtx is returned. */
45044 static rtx_code_label *
45045 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45046 bool swap_operands)
45048 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
45049 rtx_code_label *label;
45050 rtx tmp;
45052 if (swap_operands)
45053 std::swap (op0, op1);
45055 label = gen_label_rtx ();
45056 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
45057 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
45058 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
45059 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45060 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45061 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45062 JUMP_LABEL (tmp) = label;
45064 return label;
45067 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
45068 using comparison code CODE. Operands are swapped for the comparison if
45069 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
45070 static rtx
45071 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45072 bool swap_operands)
45074 rtx (*insn)(rtx, rtx, rtx, rtx);
45075 machine_mode mode = GET_MODE (op0);
45076 rtx mask = gen_reg_rtx (mode);
45078 if (swap_operands)
45079 std::swap (op0, op1);
45081 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45083 emit_insn (insn (mask, op0, op1,
45084 gen_rtx_fmt_ee (code, mode, op0, op1)));
45085 return mask;
45088 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45089 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
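/* Adding and then subtracting this constant is the usual trick for rounding
   to an integer without an int<->float conversion: for |x| < 2**52 (DFmode,
   2**23 for SFmode) the sum x + TWO52 has no bits left for a fraction, so
   the addition itself rounds x to an integer in the current rounding mode,
   and the subsequent subtraction of TWO52 is exact.  */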
45090 static rtx
45091 ix86_gen_TWO52 (machine_mode mode)
45093 REAL_VALUE_TYPE TWO52r;
45094 rtx TWO52;
45096 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45097 TWO52 = const_double_from_real_value (TWO52r, mode);
45098 TWO52 = force_reg (mode, TWO52);
45100 return TWO52;
45103 /* Expand SSE sequence for computing lround from OP1 storing
45104 into OP0. */
45105 void
45106 ix86_expand_lround (rtx op0, rtx op1)
45108 /* C code for the stuff we're doing below:
45109 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
45110 return (long)tmp;
45112 machine_mode mode = GET_MODE (op1);
45113 const struct real_format *fmt;
45114 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45115 rtx adj;
45117 /* load nextafter (0.5, 0.0) */
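/* The addend is the largest representable value strictly below 0.5: adding
   exactly 0.5 can round up to the next integer for inputs just below 0.5
   (e.g. the double immediately below 0.5 plus 0.5 rounds to 1.0), which
   would make lround return a result that is too large.  */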
45118 fmt = REAL_MODE_FORMAT (mode);
45119 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45120 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45122 /* adj = copysign (0.5, op1) */
45123 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
45124 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
45126 /* adj = op1 + adj */
45127 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
45129 /* op0 = (imode)adj */
45130 expand_fix (op0, adj, 0);
45133 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
45134 into OPERAND0. */
45135 void
45136 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
45138 /* C code for the stuff we're doing below (for do_floor):
45139 xi = (long)op1;
45140 xi -= (double)xi > op1 ? 1 : 0;
45141 return xi;
45143 machine_mode fmode = GET_MODE (op1);
45144 machine_mode imode = GET_MODE (op0);
45145 rtx ireg, freg, tmp;
45146 rtx_code_label *label;
45148 /* reg = (long)op1 */
45149 ireg = gen_reg_rtx (imode);
45150 expand_fix (ireg, op1, 0);
45152 /* freg = (double)reg */
45153 freg = gen_reg_rtx (fmode);
45154 expand_float (freg, ireg, 0);
45156 /* ireg = (freg > op1) ? ireg - 1 : ireg */
45157 label = ix86_expand_sse_compare_and_jump (UNLE,
45158 freg, op1, !do_floor);
45159 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
45160 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
45161 emit_move_insn (ireg, tmp);
45163 emit_label (label);
45164 LABEL_NUSES (label) = 1;
45166 emit_move_insn (op0, ireg);
45169 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
45170 result in OPERAND0. */
45171 void
45172 ix86_expand_rint (rtx operand0, rtx operand1)
45174 /* C code for the stuff we're doing below:
45175 xa = fabs (operand1);
45176 if (!isless (xa, 2**52))
45177 return operand1;
45178 xa = xa + 2**52 - 2**52;
45179 return copysign (xa, operand1);
45181 machine_mode mode = GET_MODE (operand0);
45182 rtx res, xa, TWO52, mask;
45183 rtx_code_label *label;
45185 res = gen_reg_rtx (mode);
45186 emit_move_insn (res, operand1);
45188 /* xa = abs (operand1) */
45189 xa = ix86_expand_sse_fabs (res, &mask);
45191 /* if (!isless (xa, TWO52)) goto label; */
45192 TWO52 = ix86_gen_TWO52 (mode);
45193 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45195 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45196 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45198 ix86_sse_copysign_to_positive (res, xa, res, mask);
45200 emit_label (label);
45201 LABEL_NUSES (label) = 1;
45203 emit_move_insn (operand0, res);
45206 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45207 into OPERAND0. */
45208 void
45209 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
45211 /* C code for the stuff we expand below.
45212 double xa = fabs (x), x2;
45213 if (!isless (xa, TWO52))
45214 return x;
45215 xa = xa + TWO52 - TWO52;
45216 x2 = copysign (xa, x);
45217 Compensate. Floor:
45218 if (x2 > x)
45219 x2 -= 1;
45220 Compensate. Ceil:
45221 if (x2 < x)
45222 x2 -= -1;
45223 return x2;
45225 machine_mode mode = GET_MODE (operand0);
45226 rtx xa, TWO52, tmp, one, res, mask;
45227 rtx_code_label *label;
45229 TWO52 = ix86_gen_TWO52 (mode);
45231 /* Temporary for holding the result, initialized to the input
45232 operand to ease control flow. */
45233 res = gen_reg_rtx (mode);
45234 emit_move_insn (res, operand1);
45236 /* xa = abs (operand1) */
45237 xa = ix86_expand_sse_fabs (res, &mask);
45239 /* if (!isless (xa, TWO52)) goto label; */
45240 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45242 /* xa = xa + TWO52 - TWO52; */
45243 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45244 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45246 /* xa = copysign (xa, operand1) */
45247 ix86_sse_copysign_to_positive (xa, xa, res, mask);
45249 /* generate 1.0 or -1.0 */
45250 one = force_reg (mode,
45251 const_double_from_real_value (do_floor
45252 ? dconst1 : dconstm1, mode));
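/* The SSE compare below produces an all-ones (or all-zeros) bit pattern per
   element; ANDing it with the 1.0 or -1.0 generated above turns that mask
   into the corresponding value or 0.0, which is then subtracted
   branchlessly.  */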
45254 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45255 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45256 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45257 /* We always need to subtract here to preserve signed zero. */
45258 tmp = expand_simple_binop (mode, MINUS,
45259 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45260 emit_move_insn (res, tmp);
45262 emit_label (label);
45263 LABEL_NUSES (label) = 1;
45265 emit_move_insn (operand0, res);
45268 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45269 into OPERAND0. */
45270 void
45271 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
45273 /* C code for the stuff we expand below.
45274 double xa = fabs (x), x2;
45275 if (!isless (xa, TWO52))
45276 return x;
45277 x2 = (double)(long)x;
45278 Compensate. Floor:
45279 if (x2 > x)
45280 x2 -= 1;
45281 Compensate. Ceil:
45282 if (x2 < x)
45283 x2 += 1;
45284 if (HONOR_SIGNED_ZEROS (mode))
45285 return copysign (x2, x);
45286 return x2;
45288 machine_mode mode = GET_MODE (operand0);
45289 rtx xa, xi, TWO52, tmp, one, res, mask;
45290 rtx_code_label *label;
45292 TWO52 = ix86_gen_TWO52 (mode);
45294 /* Temporary for holding the result, initialized to the input
45295 operand to ease control flow. */
45296 res = gen_reg_rtx (mode);
45297 emit_move_insn (res, operand1);
45299 /* xa = abs (operand1) */
45300 xa = ix86_expand_sse_fabs (res, &mask);
45302 /* if (!isless (xa, TWO52)) goto label; */
45303 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45305 /* xa = (double)(long)x */
45306 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45307 expand_fix (xi, res, 0);
45308 expand_float (xa, xi, 0);
45310 /* generate 1.0 */
45311 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45313 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45314 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45315 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45316 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
45317 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45318 emit_move_insn (res, tmp);
45320 if (HONOR_SIGNED_ZEROS (mode))
45321 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45323 emit_label (label);
45324 LABEL_NUSES (label) = 1;
45326 emit_move_insn (operand0, res);
45329 /* Expand SSE sequence for computing round from OPERAND1 storing
45330 into OPERAND0. Sequence that works without relying on DImode truncation
45331 via cvttsd2siq, which is only available on 64-bit targets. */
45332 void
45333 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
45335 /* C code for the stuff we expand below.
45336 double xa = fabs (x), xa2, x2;
45337 if (!isless (xa, TWO52))
45338 return x;
45339 Using the absolute value and copying back sign makes
45340 -0.0 -> -0.0 correct.
45341 xa2 = xa + TWO52 - TWO52;
45342 Compensate.
45343 dxa = xa2 - xa;
45344 if (dxa <= -0.5)
45345 xa2 += 1;
45346 else if (dxa > 0.5)
45347 xa2 -= 1;
45348 x2 = copysign (xa2, x);
45349 return x2;
45351 machine_mode mode = GET_MODE (operand0);
45352 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
45353 rtx_code_label *label;
45355 TWO52 = ix86_gen_TWO52 (mode);
45357 /* Temporary for holding the result, initialized to the input
45358 operand to ease control flow. */
45359 res = gen_reg_rtx (mode);
45360 emit_move_insn (res, operand1);
45362 /* xa = abs (operand1) */
45363 xa = ix86_expand_sse_fabs (res, &mask);
45365 /* if (!isless (xa, TWO52)) goto label; */
45366 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45368 /* xa2 = xa + TWO52 - TWO52; */
45369 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45370 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
45372 /* dxa = xa2 - xa; */
45373 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
45375 /* generate 0.5, 1.0 and -0.5 */
45376 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
45377 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
45378 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
45379 0, OPTAB_DIRECT);
45381 /* Compensate. */
45382 tmp = gen_reg_rtx (mode);
45383 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
45384 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
45385 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45386 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45387 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
45388 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
45389 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45390 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45392 /* res = copysign (xa2, operand1) */
45393 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
45395 emit_label (label);
45396 LABEL_NUSES (label) = 1;
45398 emit_move_insn (operand0, res);
45401 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45402 into OPERAND0. */
45403 void
45404 ix86_expand_trunc (rtx operand0, rtx operand1)
45406 /* C code for SSE variant we expand below.
45407 double xa = fabs (x), x2;
45408 if (!isless (xa, TWO52))
45409 return x;
45410 x2 = (double)(long)x;
45411 if (HONOR_SIGNED_ZEROS (mode))
45412 return copysign (x2, x);
45413 return x2;
45415 machine_mode mode = GET_MODE (operand0);
45416 rtx xa, xi, TWO52, res, mask;
45417 rtx_code_label *label;
45419 TWO52 = ix86_gen_TWO52 (mode);
45421 /* Temporary for holding the result, initialized to the input
45422 operand to ease control flow. */
45423 res = gen_reg_rtx (mode);
45424 emit_move_insn (res, operand1);
45426 /* xa = abs (operand1) */
45427 xa = ix86_expand_sse_fabs (res, &mask);
45429 /* if (!isless (xa, TWO52)) goto label; */
45430 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45432 /* x = (double)(long)x */
45433 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45434 expand_fix (xi, res, 0);
45435 expand_float (res, xi, 0);
45437 if (HONOR_SIGNED_ZEROS (mode))
45438 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45440 emit_label (label);
45441 LABEL_NUSES (label) = 1;
45443 emit_move_insn (operand0, res);
45446 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45447 into OPERAND0, without relying on DImode truncation (usable on 32-bit targets). */
45448 void
45449 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
45451 machine_mode mode = GET_MODE (operand0);
45452 rtx xa, mask, TWO52, one, res, smask, tmp;
45453 rtx_code_label *label;
45455 /* C code for SSE variant we expand below.
45456 double xa = fabs (x), xa2, x2;
45457 if (!isless (xa, TWO52))
45458 return x;
45459 xa2 = xa + TWO52 - TWO52;
45460 Compensate:
45461 if (xa2 > xa)
45462 xa2 -= 1.0;
45463 x2 = copysign (xa2, x);
45464 return x2;
45467 TWO52 = ix86_gen_TWO52 (mode);
45469 /* Temporary for holding the result, initialized to the input
45470 operand to ease control flow. */
45471 res = gen_reg_rtx (mode);
45472 emit_move_insn (res, operand1);
45474 /* xa = abs (operand1) */
45475 xa = ix86_expand_sse_fabs (res, &smask);
45477 /* if (!isless (xa, TWO52)) goto label; */
45478 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45480 /* res = xa + TWO52 - TWO52; */
45481 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45482 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
45483 emit_move_insn (res, tmp);
45485 /* generate 1.0 */
45486 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45488 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
45489 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
45490 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
45491 tmp = expand_simple_binop (mode, MINUS,
45492 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
45493 emit_move_insn (res, tmp);
45495 /* res = copysign (res, operand1) */
45496 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
45498 emit_label (label);
45499 LABEL_NUSES (label) = 1;
45501 emit_move_insn (operand0, res);
45504 /* Expand SSE sequence for computing round from OPERAND1 storing
45505 into OPERAND0. */
45506 void
45507 ix86_expand_round (rtx operand0, rtx operand1)
45509 /* C code for the stuff we're doing below:
45510 double xa = fabs (x);
45511 if (!isless (xa, TWO52))
45512 return x;
45513 xa = (double)(long)(xa + nextafter (0.5, 0.0));
45514 return copysign (xa, x);
45516 machine_mode mode = GET_MODE (operand0);
45517 rtx res, TWO52, xa, xi, half, mask;
45518 rtx_code_label *label;
45519 const struct real_format *fmt;
45520 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45522 /* Temporary for holding the result, initialized to the input
45523 operand to ease control flow. */
45524 res = gen_reg_rtx (mode);
45525 emit_move_insn (res, operand1);
45527 TWO52 = ix86_gen_TWO52 (mode);
45528 xa = ix86_expand_sse_fabs (res, &mask);
45529 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45531 /* load nextafter (0.5, 0.0) */
45532 fmt = REAL_MODE_FORMAT (mode);
45533 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45534 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
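/* Using the largest representable value below 0.5 instead of 0.5 itself
   avoids the case where XA + 0.5 rounds up to the next integer for
   inputs just under 0.5 (e.g. 0.49999999999999994 + 0.5 == 1.0 in
   double precision), which would make round() overshoot by one.  */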
45536 /* xa = xa + 0.5 */
45537 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
45538 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
45540 /* xa = (double)(int64_t)xa */
45541 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45542 expand_fix (xi, xa, 0);
45543 expand_float (xa, xi, 0);
45545 /* res = copysign (xa, operand1) */
45546 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
45548 emit_label (label);
45549 LABEL_NUSES (label) = 1;
45551 emit_move_insn (operand0, res);
45554 /* Expand SSE sequence for computing round
45555 from OP1 storing into OP0 using sse4 round insn. */
45556 void
45557 ix86_expand_round_sse4 (rtx op0, rtx op1)
45559 machine_mode mode = GET_MODE (op0);
45560 rtx e1, e2, res, half;
45561 const struct real_format *fmt;
45562 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45563 rtx (*gen_copysign) (rtx, rtx, rtx);
45564 rtx (*gen_round) (rtx, rtx, rtx);
45566 switch (mode)
45568 case SFmode:
45569 gen_copysign = gen_copysignsf3;
45570 gen_round = gen_sse4_1_roundsf2;
45571 break;
45572 case DFmode:
45573 gen_copysign = gen_copysigndf3;
45574 gen_round = gen_sse4_1_rounddf2;
45575 break;
45576 default:
45577 gcc_unreachable ();
45580 /* round (a) = trunc (a + copysign (0.5, a)) */
45582 /* load nextafter (0.5, 0.0) */
45583 fmt = REAL_MODE_FORMAT (mode);
45584 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45585 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45586 half = const_double_from_real_value (pred_half, mode);
45588 /* e1 = copysign (0.5, op1) */
45589 e1 = gen_reg_rtx (mode);
45590 emit_insn (gen_copysign (e1, half, op1));
45592 /* e2 = op1 + e1 */
45593 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
45595 /* res = trunc (e2) */
45596 res = gen_reg_rtx (mode);
45597 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
45599 emit_move_insn (op0, res);
45603 /* Table of valid machine attributes. */
45604 static const struct attribute_spec ix86_attribute_table[] =
45606 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
45607 affects_type_identity } */
45608 /* Stdcall attribute says callee is responsible for popping arguments
45609 if they are not variable. */
45610 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45611 true },
45612 /* Fastcall attribute says callee is responsible for popping arguments
45613 if they are not variable. */
45614 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45615 true },
45616 /* Thiscall attribute says callee is responsible for popping arguments
45617 if they are not variable. */
45618 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45619 true },
45620 /* Cdecl attribute says the callee is a normal C declaration */
45621 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45622 true },
45623 /* Regparm attribute specifies how many integer arguments are to be
45624 passed in registers. */
45625 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
45626 true },
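/* For example (a sketch of typical ia32 usage):
   int f (int a, int b) __attribute__ ((regparm (2)));
   passes A in EAX and B in EDX instead of on the stack.  */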
45627 /* Sseregparm attribute says we are using x86_64 calling conventions
45628 for FP arguments. */
45629 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
45630 true },
45631 /* The transactional memory builtins are implicitly regparm or fastcall
45632 depending on the ABI. Override the generic do-nothing attribute that
45633 these builtins were declared with. */
45634 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
45635 true },
45636 /* force_align_arg_pointer says this function realigns the stack at entry. */
45637 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
45638 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
45639 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
45640 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
45641 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
45642 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
45643 false },
45644 #endif
45645 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
45646 false },
45647 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
45648 false },
45649 #ifdef SUBTARGET_ATTRIBUTE_TABLE
45650 SUBTARGET_ATTRIBUTE_TABLE,
45651 #endif
45652 /* ms_abi and sysv_abi calling convention function attributes. */
45653 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
45654 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
45655 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
45656 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
45657 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
45658 false },
45659 { "callee_pop_aggregate_return", 1, 1, false, true, true,
45660 ix86_handle_callee_pop_aggregate_return, true },
45661 { "interrupt", 0, 0, false, true, true,
45662 ix86_handle_interrupt_attribute, false },
45663 { "no_caller_saved_registers", 0, 0, false, true, true,
45664 ix86_handle_no_caller_saved_registers_attribute, false },
45666 /* End element. */
45667 { NULL, 0, 0, false, false, false, NULL, false }
45670 /* Implement targetm.vectorize.builtin_vectorization_cost. */
45671 static int
45672 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
45673 tree vectype, int)
45675 switch (type_of_cost)
45677 case scalar_stmt:
45678 return ix86_cost->scalar_stmt_cost;
45680 case scalar_load:
45681 return ix86_cost->scalar_load_cost;
45683 case scalar_store:
45684 return ix86_cost->scalar_store_cost;
45686 case vector_stmt:
45687 return ix86_cost->vec_stmt_cost;
45689 case vector_load:
45690 return ix86_cost->vec_align_load_cost;
45692 case vector_store:
45693 return ix86_cost->vec_store_cost;
45695 case vec_to_scalar:
45696 return ix86_cost->vec_to_scalar_cost;
45698 case scalar_to_vec:
45699 return ix86_cost->scalar_to_vec_cost;
45701 case unaligned_load:
45702 case unaligned_store:
45703 return ix86_cost->vec_unalign_load_cost;
45705 case cond_branch_taken:
45706 return ix86_cost->cond_taken_branch_cost;
45708 case cond_branch_not_taken:
45709 return ix86_cost->cond_not_taken_branch_cost;
45711 case vec_perm:
45712 case vec_promote_demote:
45713 return ix86_cost->vec_stmt_cost;
45715 case vec_construct:
45716 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
45718 default:
45719 gcc_unreachable ();
45723 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
45724 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
45725 insn every time. */
45727 static GTY(()) rtx_insn *vselect_insn;
45729 /* Initialize vselect_insn. */
45731 static void
45732 init_vselect_insn (void)
45734 unsigned i;
45735 rtx x;
45737 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
45738 for (i = 0; i < MAX_VECT_LEN; ++i)
45739 XVECEXP (x, 0, i) = const0_rtx;
45740 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
45741 const0_rtx), x);
45742 x = gen_rtx_SET (const0_rtx, x);
45743 start_sequence ();
45744 vselect_insn = emit_insn (x);
45745 end_sequence ();
45748 /* Construct (set target (vec_select op0 (parallel perm))) and
45749 return true if that's a valid instruction in the active ISA. */
45751 static bool
45752 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
45753 unsigned nelt, bool testing_p)
45755 unsigned int i;
45756 rtx x, save_vconcat;
45757 int icode;
45759 if (vselect_insn == NULL_RTX)
45760 init_vselect_insn ();
45762 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
45763 PUT_NUM_ELEM (XVEC (x, 0), nelt);
45764 for (i = 0; i < nelt; ++i)
45765 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
45766 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45767 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
45768 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
45769 SET_DEST (PATTERN (vselect_insn)) = target;
45770 icode = recog_memoized (vselect_insn);
45772 if (icode >= 0 && !testing_p)
45773 emit_insn (copy_rtx (PATTERN (vselect_insn)));
45775 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
45776 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
45777 INSN_CODE (vselect_insn) = -1;
45779 return icode >= 0;
45782 /* Similar, but generate a vec_concat from op0 and op1 as well. */
45784 static bool
45785 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
45786 const unsigned char *perm, unsigned nelt,
45787 bool testing_p)
45789 machine_mode v2mode;
45790 rtx x;
45791 bool ok;
45793 if (vselect_insn == NULL_RTX)
45794 init_vselect_insn ();
45796 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
45797 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45798 PUT_MODE (x, v2mode);
45799 XEXP (x, 0) = op0;
45800 XEXP (x, 1) = op1;
45801 ok = expand_vselect (target, x, perm, nelt, testing_p);
45802 XEXP (x, 0) = const0_rtx;
45803 XEXP (x, 1) = const0_rtx;
45804 return ok;
45807 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45808 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
45810 static bool
45811 expand_vec_perm_blend (struct expand_vec_perm_d *d)
45813 machine_mode mmode, vmode = d->vmode;
45814 unsigned i, mask, nelt = d->nelt;
45815 rtx target, op0, op1, maskop, x;
45816 rtx rperm[32], vperm;
45818 if (d->one_operand_p)
45819 return false;
45820 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
45821 && (TARGET_AVX512BW
45822 || GET_MODE_UNIT_SIZE (vmode) >= 4))
45824 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45826 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45828 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45830 else
45831 return false;
45833 /* This is a blend, not a permute. Elements must stay in their
45834 respective lanes. */
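/* For example, with nelt == 4 the permutation {0, 5, 2, 7} is a valid
   blend (element I always comes from position I of one operand or the
   other), whereas {0, 5, 2, 6} is not.  */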
45835 for (i = 0; i < nelt; ++i)
45837 unsigned e = d->perm[i];
45838 if (!(e == i || e == i + nelt))
45839 return false;
45842 if (d->testing_p)
45843 return true;
45845 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
45846 decision should be extracted elsewhere, so that we only try that
45847 sequence once all budget==3 options have been tried. */
45848 target = d->target;
45849 op0 = d->op0;
45850 op1 = d->op1;
45851 mask = 0;
45853 switch (vmode)
45855 case V8DFmode:
45856 case V16SFmode:
45857 case V4DFmode:
45858 case V8SFmode:
45859 case V2DFmode:
45860 case V4SFmode:
45861 case V8HImode:
45862 case V8SImode:
45863 case V32HImode:
45864 case V64QImode:
45865 case V16SImode:
45866 case V8DImode:
45867 for (i = 0; i < nelt; ++i)
45868 mask |= (d->perm[i] >= nelt) << i;
45869 break;
45871 case V2DImode:
45872 for (i = 0; i < 2; ++i)
45873 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
45874 vmode = V8HImode;
45875 goto do_subreg;
45877 case V4SImode:
45878 for (i = 0; i < 4; ++i)
45879 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45880 vmode = V8HImode;
45881 goto do_subreg;
45883 case V16QImode:
45884 /* See if bytes move in pairs so we can use pblendw with
45885 an immediate argument, rather than pblendvb with a vector
45886 argument. */
45887 for (i = 0; i < 16; i += 2)
45888 if (d->perm[i] + 1 != d->perm[i + 1])
45890 use_pblendvb:
45891 for (i = 0; i < nelt; ++i)
45892 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
45894 finish_pblendvb:
45895 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45896 vperm = force_reg (vmode, vperm);
45898 if (GET_MODE_SIZE (vmode) == 16)
45899 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45900 else
45901 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45902 if (target != d->target)
45903 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45904 return true;
45907 for (i = 0; i < 8; ++i)
45908 mask |= (d->perm[i * 2] >= 16) << i;
45909 vmode = V8HImode;
45910 /* FALLTHRU */
45912 do_subreg:
45913 target = gen_reg_rtx (vmode);
45914 op0 = gen_lowpart (vmode, op0);
45915 op1 = gen_lowpart (vmode, op1);
45916 break;
45918 case V32QImode:
45919 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45920 for (i = 0; i < 32; i += 2)
45921 if (d->perm[i] + 1 != d->perm[i + 1])
45922 goto use_pblendvb;
45923 /* See if bytes move in quadruplets. If yes, vpblendd
45924 with immediate can be used. */
45925 for (i = 0; i < 32; i += 4)
45926 if (d->perm[i] + 2 != d->perm[i + 2])
45927 break;
45928 if (i < 32)
45930 /* See if bytes move the same in both lanes. If yes,
45931 vpblendw with immediate can be used. */
45932 for (i = 0; i < 16; i += 2)
45933 if (d->perm[i] + 16 != d->perm[i + 16])
45934 goto use_pblendvb;
45936 /* Use vpblendw. */
45937 for (i = 0; i < 16; ++i)
45938 mask |= (d->perm[i * 2] >= 32) << i;
45939 vmode = V16HImode;
45940 goto do_subreg;
45943 /* Use vpblendd. */
45944 for (i = 0; i < 8; ++i)
45945 mask |= (d->perm[i * 4] >= 32) << i;
45946 vmode = V8SImode;
45947 goto do_subreg;
45949 case V16HImode:
45950 /* See if words move in pairs. If yes, vpblendd can be used. */
45951 for (i = 0; i < 16; i += 2)
45952 if (d->perm[i] + 1 != d->perm[i + 1])
45953 break;
45954 if (i < 16)
45956 /* See if words move the same in both lanes. If not,
45957 vpblendvb must be used. */
45958 for (i = 0; i < 8; i++)
45959 if (d->perm[i] + 8 != d->perm[i + 8])
45961 /* Use vpblendvb. */
45962 for (i = 0; i < 32; ++i)
45963 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
45965 vmode = V32QImode;
45966 nelt = 32;
45967 target = gen_reg_rtx (vmode);
45968 op0 = gen_lowpart (vmode, op0);
45969 op1 = gen_lowpart (vmode, op1);
45970 goto finish_pblendvb;
45973 /* Use vpblendw. */
45974 for (i = 0; i < 16; ++i)
45975 mask |= (d->perm[i] >= 16) << i;
45976 break;
45979 /* Use vpblendd. */
45980 for (i = 0; i < 8; ++i)
45981 mask |= (d->perm[i * 2] >= 16) << i;
45982 vmode = V8SImode;
45983 goto do_subreg;
45985 case V4DImode:
45986 /* Use vpblendd. */
45987 for (i = 0; i < 4; ++i)
45988 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45989 vmode = V8SImode;
45990 goto do_subreg;
45992 default:
45993 gcc_unreachable ();
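/* For the 512-bit modes the blend mask must live in a mask register
   with one bit per element (QImode for 8 elements up to DImode for 64);
   for the other modes it is encoded as an immediate.  */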
45996 switch (vmode)
45998 case V8DFmode:
45999 case V8DImode:
46000 mmode = QImode;
46001 break;
46002 case V16SFmode:
46003 case V16SImode:
46004 mmode = HImode;
46005 break;
46006 case V32HImode:
46007 mmode = SImode;
46008 break;
46009 case V64QImode:
46010 mmode = DImode;
46011 break;
46012 default:
46013 mmode = VOIDmode;
46016 if (mmode != VOIDmode)
46017 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46018 else
46019 maskop = GEN_INT (mask);
46021 /* This matches five different patterns with the different modes. */
46022 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46023 x = gen_rtx_SET (target, x);
46024 emit_insn (x);
46025 if (target != d->target)
46026 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46028 return true;
46031 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46032 in terms of the variable form of vpermilps.
46034 Note that we will have already failed the immediate input vpermilps,
46035 which requires that the high and low part shuffle be identical; the
46036 variable form doesn't require that. */
46038 static bool
46039 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46041 rtx rperm[8], vperm;
46042 unsigned i;
46044 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46045 return false;
46047 /* We can only permute within the 128-bit lane. */
46048 for (i = 0; i < 8; ++i)
46050 unsigned e = d->perm[i];
46051 if (i < 4 ? e >= 4 : e < 4)
46052 return false;
46055 if (d->testing_p)
46056 return true;
46058 for (i = 0; i < 8; ++i)
46060 unsigned e = d->perm[i];
46062 /* Within each 128-bit lane, the elements of op0 are numbered
46063 from 0 and the elements of op1 are numbered from 4. */
46064 if (e >= 8 + 4)
46065 e -= 8;
46066 else if (e >= 4)
46067 e -= 4;
46069 rperm[i] = GEN_INT (e);
46072 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46073 vperm = force_reg (V8SImode, vperm);
46074 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46076 return true;
46079 /* Return true if permutation D can be performed as VMODE permutation
46080 instead. */
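/* For example, a V16QImode permutation that only moves aligned groups
   of four consecutive bytes can equally be performed as a V4SImode
   permutation.  */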
46082 static bool
46083 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46085 unsigned int i, j, chunk;
46087 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46088 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46089 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46090 return false;
46092 if (GET_MODE_NUNITS (vmode) >= d->nelt)
46093 return true;
46095 chunk = d->nelt / GET_MODE_NUNITS (vmode);
46096 for (i = 0; i < d->nelt; i += chunk)
46097 if (d->perm[i] & (chunk - 1))
46098 return false;
46099 else
46100 for (j = 1; j < chunk; ++j)
46101 if (d->perm[i] + j != d->perm[i + j])
46102 return false;
46104 return true;
46107 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46108 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
46110 static bool
46111 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
46113 unsigned i, nelt, eltsz, mask;
46114 unsigned char perm[64];
46115 machine_mode vmode = V16QImode;
46116 rtx rperm[64], vperm, target, op0, op1;
46118 nelt = d->nelt;
46120 if (!d->one_operand_p)
46122 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
46124 if (TARGET_AVX2
46125 && valid_perm_using_mode_p (V2TImode, d))
46127 if (d->testing_p)
46128 return true;
46130 /* Use vperm2i128 insn. The pattern uses
46131 V4DImode instead of V2TImode. */
46132 target = d->target;
46133 if (d->vmode != V4DImode)
46134 target = gen_reg_rtx (V4DImode);
46135 op0 = gen_lowpart (V4DImode, d->op0);
46136 op1 = gen_lowpart (V4DImode, d->op1);
46137 rperm[0]
46138 = GEN_INT ((d->perm[0] / (nelt / 2))
46139 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
46140 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
46141 if (target != d->target)
46142 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46143 return true;
46145 return false;
46148 else
46150 if (GET_MODE_SIZE (d->vmode) == 16)
46152 if (!TARGET_SSSE3)
46153 return false;
46155 else if (GET_MODE_SIZE (d->vmode) == 32)
46157 if (!TARGET_AVX2)
46158 return false;
46162 /* V4DImode should already have been handled through
46161 expand_vselect by the vpermq instruction. */
46162 gcc_assert (d->vmode != V4DImode);
46164 vmode = V32QImode;
46165 if (d->vmode == V8SImode
46166 || d->vmode == V16HImode
46167 || d->vmode == V32QImode)
46169 /* First see if vpermq can be used for
46170 V8SImode/V16HImode/V32QImode. */
46171 if (valid_perm_using_mode_p (V4DImode, d))
46173 for (i = 0; i < 4; i++)
46174 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
46175 if (d->testing_p)
46176 return true;
46177 target = gen_reg_rtx (V4DImode);
46178 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
46179 perm, 4, false))
46181 emit_move_insn (d->target,
46182 gen_lowpart (d->vmode, target));
46183 return true;
46185 return false;
46188 /* Next see if vpermd can be used. */
46189 if (valid_perm_using_mode_p (V8SImode, d))
46190 vmode = V8SImode;
46192 /* Or if vpermps can be used. */
46193 else if (d->vmode == V8SFmode)
46194 vmode = V8SImode;
46196 if (vmode == V32QImode)
46198 /* vpshufb only works within a 128-bit lane; it is not
46199 possible to shuffle bytes between the lanes. */
46200 for (i = 0; i < nelt; ++i)
46201 if ((d->perm[i] ^ i) & (nelt / 2))
46202 return false;
46205 else if (GET_MODE_SIZE (d->vmode) == 64)
46207 if (!TARGET_AVX512BW)
46208 return false;
46210 /* If vpermq didn't work, vpshufb won't work either. */
46211 if (d->vmode == V8DFmode || d->vmode == V8DImode)
46212 return false;
46214 vmode = V64QImode;
46215 if (d->vmode == V16SImode
46216 || d->vmode == V32HImode
46217 || d->vmode == V64QImode)
46219 /* First see if vpermq can be used for
46220 V16SImode/V32HImode/V64QImode. */
46221 if (valid_perm_using_mode_p (V8DImode, d))
46223 for (i = 0; i < 8; i++)
46224 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
46225 if (d->testing_p)
46226 return true;
46227 target = gen_reg_rtx (V8DImode);
46228 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
46229 perm, 8, false))
46231 emit_move_insn (d->target,
46232 gen_lowpart (d->vmode, target));
46233 return true;
46235 return false;
46238 /* Next see if vpermd can be used. */
46239 if (valid_perm_using_mode_p (V16SImode, d))
46240 vmode = V16SImode;
46242 /* Or if vpermps can be used. */
46243 else if (d->vmode == V16SFmode)
46244 vmode = V16SImode;
46245 if (vmode == V64QImode)
46247 /* vpshufb only works within a 128-bit lane; it is not
46248 possible to shuffle bytes between the lanes. */
46249 for (i = 0; i < nelt; ++i)
46250 if ((d->perm[i] ^ i) & (nelt / 4))
46251 return false;
46254 else
46255 return false;
46258 if (d->testing_p)
46259 return true;
46261 if (vmode == V8SImode)
46262 for (i = 0; i < 8; ++i)
46263 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
46264 else if (vmode == V16SImode)
46265 for (i = 0; i < 16; ++i)
46266 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
46267 else
46269 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46270 if (!d->one_operand_p)
46271 mask = 2 * nelt - 1;
46272 else if (vmode == V16QImode)
46273 mask = nelt - 1;
46274 else if (vmode == V64QImode)
46275 mask = nelt / 4 - 1;
46276 else
46277 mask = nelt / 2 - 1;
46279 for (i = 0; i < nelt; ++i)
46281 unsigned j, e = d->perm[i] & mask;
46282 for (j = 0; j < eltsz; ++j)
46283 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
46287 vperm = gen_rtx_CONST_VECTOR (vmode,
46288 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
46289 vperm = force_reg (vmode, vperm);
46291 target = d->target;
46292 if (d->vmode != vmode)
46293 target = gen_reg_rtx (vmode);
46294 op0 = gen_lowpart (vmode, d->op0);
46295 if (d->one_operand_p)
46297 if (vmode == V16QImode)
46298 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
46299 else if (vmode == V32QImode)
46300 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
46301 else if (vmode == V64QImode)
46302 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
46303 else if (vmode == V8SFmode)
46304 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
46305 else if (vmode == V8SImode)
46306 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
46307 else if (vmode == V16SFmode)
46308 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
46309 else if (vmode == V16SImode)
46310 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
46311 else
46312 gcc_unreachable ();
46314 else
46316 op1 = gen_lowpart (vmode, d->op1);
46317 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
46319 if (target != d->target)
46320 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46322 return true;
46325 /* For V*[QHS]Imode permutations, check whether the same permutation
46326 can be performed in a 2x, 4x or 8x wider inner mode; if so, describe it in *ND. */
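/* For example, the V8HImode permutation {2, 3, 6, 7, 0, 1, 4, 5} keeps
   adjacent element pairs together, so it can instead be carried out as
   the V4SImode permutation {1, 3, 0, 2}.  */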
46328 static bool
46329 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
46330 struct expand_vec_perm_d *nd)
46332 int i;
46333 enum machine_mode mode = VOIDmode;
46335 switch (d->vmode)
46337 case V16QImode: mode = V8HImode; break;
46338 case V32QImode: mode = V16HImode; break;
46339 case V64QImode: mode = V32HImode; break;
46340 case V8HImode: mode = V4SImode; break;
46341 case V16HImode: mode = V8SImode; break;
46342 case V32HImode: mode = V16SImode; break;
46343 case V4SImode: mode = V2DImode; break;
46344 case V8SImode: mode = V4DImode; break;
46345 case V16SImode: mode = V8DImode; break;
46346 default: return false;
46348 for (i = 0; i < d->nelt; i += 2)
46349 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
46350 return false;
46351 nd->vmode = mode;
46352 nd->nelt = d->nelt / 2;
46353 for (i = 0; i < nd->nelt; i++)
46354 nd->perm[i] = d->perm[2 * i] / 2;
46355 if (GET_MODE_INNER (mode) != DImode)
46356 canonicalize_vector_int_perm (nd, nd);
46357 if (nd != d)
46359 nd->one_operand_p = d->one_operand_p;
46360 nd->testing_p = d->testing_p;
46361 if (d->op0 == d->op1)
46362 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
46363 else
46365 nd->op0 = gen_lowpart (nd->vmode, d->op0);
46366 nd->op1 = gen_lowpart (nd->vmode, d->op1);
46368 if (d->testing_p)
46369 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
46370 else
46371 nd->target = gen_reg_rtx (nd->vmode);
46373 return true;
46376 /* Try to expand one-operand permutation with constant mask. */
46378 static bool
46379 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
46381 machine_mode mode = GET_MODE (d->op0);
46382 machine_mode maskmode = mode;
46383 rtx (*gen) (rtx, rtx, rtx) = NULL;
46384 rtx target, op0, mask;
46385 rtx vec[64];
46387 if (!rtx_equal_p (d->op0, d->op1))
46388 return false;
46390 if (!TARGET_AVX512F)
46391 return false;
46393 switch (mode)
46395 case V16SImode:
46396 gen = gen_avx512f_permvarv16si;
46397 break;
46398 case V16SFmode:
46399 gen = gen_avx512f_permvarv16sf;
46400 maskmode = V16SImode;
46401 break;
46402 case V8DImode:
46403 gen = gen_avx512f_permvarv8di;
46404 break;
46405 case V8DFmode:
46406 gen = gen_avx512f_permvarv8df;
46407 maskmode = V8DImode;
46408 break;
46409 default:
46410 return false;
46413 target = d->target;
46414 op0 = d->op0;
46415 for (int i = 0; i < d->nelt; ++i)
46416 vec[i] = GEN_INT (d->perm[i]);
46417 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
46418 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
46419 return true;
46422 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
46423 in a single instruction. */
46425 static bool
46426 expand_vec_perm_1 (struct expand_vec_perm_d *d)
46428 unsigned i, nelt = d->nelt;
46429 struct expand_vec_perm_d nd;
46431 /* Check plain VEC_SELECT first, because AVX has instructions that could
46432 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
46433 input where SEL+CONCAT may not. */
46434 if (d->one_operand_p)
46436 int mask = nelt - 1;
46437 bool identity_perm = true;
46438 bool broadcast_perm = true;
46440 for (i = 0; i < nelt; i++)
46442 nd.perm[i] = d->perm[i] & mask;
46443 if (nd.perm[i] != i)
46444 identity_perm = false;
46445 if (nd.perm[i])
46446 broadcast_perm = false;
46449 if (identity_perm)
46451 if (!d->testing_p)
46452 emit_move_insn (d->target, d->op0);
46453 return true;
46455 else if (broadcast_perm && TARGET_AVX2)
46457 /* Use vpbroadcast{b,w,d}. */
46458 rtx (*gen) (rtx, rtx) = NULL;
46459 switch (d->vmode)
46461 case V64QImode:
46462 if (TARGET_AVX512BW)
46463 gen = gen_avx512bw_vec_dupv64qi_1;
46464 break;
46465 case V32QImode:
46466 gen = gen_avx2_pbroadcastv32qi_1;
46467 break;
46468 case V32HImode:
46469 if (TARGET_AVX512BW)
46470 gen = gen_avx512bw_vec_dupv32hi_1;
46471 break;
46472 case V16HImode:
46473 gen = gen_avx2_pbroadcastv16hi_1;
46474 break;
46475 case V16SImode:
46476 if (TARGET_AVX512F)
46477 gen = gen_avx512f_vec_dupv16si_1;
46478 break;
46479 case V8SImode:
46480 gen = gen_avx2_pbroadcastv8si_1;
46481 break;
46482 case V16QImode:
46483 gen = gen_avx2_pbroadcastv16qi;
46484 break;
46485 case V8HImode:
46486 gen = gen_avx2_pbroadcastv8hi;
46487 break;
46488 case V16SFmode:
46489 if (TARGET_AVX512F)
46490 gen = gen_avx512f_vec_dupv16sf_1;
46491 break;
46492 case V8SFmode:
46493 gen = gen_avx2_vec_dupv8sf_1;
46494 break;
46495 case V8DFmode:
46496 if (TARGET_AVX512F)
46497 gen = gen_avx512f_vec_dupv8df_1;
46498 break;
46499 case V8DImode:
46500 if (TARGET_AVX512F)
46501 gen = gen_avx512f_vec_dupv8di_1;
46502 break;
46503 /* For other modes prefer other shuffles this function creates. */
46504 default: break;
46506 if (gen != NULL)
46508 if (!d->testing_p)
46509 emit_insn (gen (d->target, d->op0));
46510 return true;
46514 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
46515 return true;
46517 /* There are plenty of patterns in sse.md that are written for
46518 SEL+CONCAT and are not replicated for a single op. Perhaps
46519 that should be changed, to avoid the nastiness here. */
46521 /* Recognize interleave style patterns, which means incrementing
46522 every other permutation operand. */
46523 for (i = 0; i < nelt; i += 2)
46525 nd.perm[i] = d->perm[i] & mask;
46526 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
46528 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46529 d->testing_p))
46530 return true;
46532 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
46533 if (nelt >= 4)
46535 for (i = 0; i < nelt; i += 4)
46537 nd.perm[i + 0] = d->perm[i + 0] & mask;
46538 nd.perm[i + 1] = d->perm[i + 1] & mask;
46539 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
46540 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
46543 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46544 d->testing_p))
46545 return true;
46549 /* Finally, try the fully general two operand permute. */
46550 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
46551 d->testing_p))
46552 return true;
46554 /* Recognize interleave style patterns with reversed operands. */
46555 if (!d->one_operand_p)
46557 for (i = 0; i < nelt; ++i)
46559 unsigned e = d->perm[i];
46560 if (e >= nelt)
46561 e -= nelt;
46562 else
46563 e += nelt;
46564 nd.perm[i] = e;
46567 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
46568 d->testing_p))
46569 return true;
46572 /* Try the SSE4.1 blend variable merge instructions. */
46573 if (expand_vec_perm_blend (d))
46574 return true;
46576 /* Try one of the AVX vpermil variable permutations. */
46577 if (expand_vec_perm_vpermil (d))
46578 return true;
46580 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
46581 vpshufb, vpermd, vpermps or vpermq variable permutation. */
46582 if (expand_vec_perm_pshufb (d))
46583 return true;
46585 /* Try the AVX2 vpalignr instruction. */
46586 if (expand_vec_perm_palignr (d, true))
46587 return true;
46589 /* Try the AVX512F vperm{s,d} instructions. */
46590 if (ix86_expand_vec_one_operand_perm_avx512 (d))
46591 return true;
46593 /* Try the AVX512F vpermi2 instructions. */
46594 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
46595 return true;
46597 /* See if we can get the same permutation in different vector integer
46598 mode. */
46599 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
46601 if (!d->testing_p)
46602 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
46603 return true;
46605 return false;
46608 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46609 in terms of a pair of pshuflw + pshufhw instructions. */
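/* This applies to V8HImode permutations such as {2, 0, 3, 1, 7, 6, 5, 4},
   where the first four selectors stay below 4 and the last four stay at
   or above 4.  */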
46611 static bool
46612 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
46614 unsigned char perm2[MAX_VECT_LEN];
46615 unsigned i;
46616 bool ok;
46618 if (d->vmode != V8HImode || !d->one_operand_p)
46619 return false;
46621 /* The two permutations only operate in 64-bit lanes. */
46622 for (i = 0; i < 4; ++i)
46623 if (d->perm[i] >= 4)
46624 return false;
46625 for (i = 4; i < 8; ++i)
46626 if (d->perm[i] < 4)
46627 return false;
46629 if (d->testing_p)
46630 return true;
46632 /* Emit the pshuflw. */
46633 memcpy (perm2, d->perm, 4);
46634 for (i = 4; i < 8; ++i)
46635 perm2[i] = i;
46636 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
46637 gcc_assert (ok);
46639 /* Emit the pshufhw. */
46640 memcpy (perm2 + 4, d->perm + 4, 4);
46641 for (i = 0; i < 4; ++i)
46642 perm2[i] = i;
46643 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
46644 gcc_assert (ok);
46646 return true;
46649 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46650 the permutation using the SSSE3 palignr instruction. This succeeds
46651 when all of the elements in PERM fit within one vector and we merely
46652 need to shift them down so that a single vector permutation has a
46653 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
46654 the vpalignr instruction itself can perform the requested permutation. */
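/* For example, the two-operand V8HImode permutation {3, 4, 5, 6, 7, 8,
   9, 10} selects a contiguous window of the concatenated operands, so a
   single palignr shifting by three elements implements it directly.  */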
46656 static bool
46657 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
46659 unsigned i, nelt = d->nelt;
46660 unsigned min, max, minswap, maxswap;
46661 bool in_order, ok, swap = false;
46662 rtx shift, target;
46663 struct expand_vec_perm_d dcopy;
46665 /* Even with AVX, palignr only operates on 128-bit vectors;
46666 with AVX2, palignr operates on both 128-bit lanes. */
46667 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46668 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
46669 return false;
46671 min = 2 * nelt;
46672 max = 0;
46673 minswap = 2 * nelt;
46674 maxswap = 0;
46675 for (i = 0; i < nelt; ++i)
46677 unsigned e = d->perm[i];
46678 unsigned eswap = d->perm[i] ^ nelt;
46679 if (GET_MODE_SIZE (d->vmode) == 32)
46681 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
46682 eswap = e ^ (nelt / 2);
46684 if (e < min)
46685 min = e;
46686 if (e > max)
46687 max = e;
46688 if (eswap < minswap)
46689 minswap = eswap;
46690 if (eswap > maxswap)
46691 maxswap = eswap;
46693 if (min == 0
46694 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
46696 if (d->one_operand_p
46697 || minswap == 0
46698 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
46699 ? nelt / 2 : nelt))
46700 return false;
46701 swap = true;
46702 min = minswap;
46703 max = maxswap;
46706 /* Given that we have SSSE3, we know we'll be able to implement the
46707 single operand permutation after the palignr with pshufb for
46708 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
46709 first. */
46710 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
46711 return true;
46713 dcopy = *d;
46714 if (swap)
46716 dcopy.op0 = d->op1;
46717 dcopy.op1 = d->op0;
46718 for (i = 0; i < nelt; ++i)
46719 dcopy.perm[i] ^= nelt;
46722 in_order = true;
46723 for (i = 0; i < nelt; ++i)
46725 unsigned e = dcopy.perm[i];
46726 if (GET_MODE_SIZE (d->vmode) == 32
46727 && e >= nelt
46728 && (e & (nelt / 2 - 1)) < min)
46729 e = e - min - (nelt / 2);
46730 else
46731 e = e - min;
46732 if (e != i)
46733 in_order = false;
46734 dcopy.perm[i] = e;
46736 dcopy.one_operand_p = true;
46738 if (single_insn_only_p && !in_order)
46739 return false;
46741 /* For AVX2, test whether we can permute the result in one instruction. */
46742 if (d->testing_p)
46744 if (in_order)
46745 return true;
46746 dcopy.op1 = dcopy.op0;
46747 return expand_vec_perm_1 (&dcopy);
46750 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
46751 if (GET_MODE_SIZE (d->vmode) == 16)
46753 target = gen_reg_rtx (TImode);
46754 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
46755 gen_lowpart (TImode, dcopy.op0), shift));
46757 else
46759 target = gen_reg_rtx (V2TImode);
46760 emit_insn (gen_avx2_palignrv2ti (target,
46761 gen_lowpart (V2TImode, dcopy.op1),
46762 gen_lowpart (V2TImode, dcopy.op0),
46763 shift));
46766 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
46768 /* Test for the degenerate case where the alignment by itself
46769 produces the desired permutation. */
46770 if (in_order)
46772 emit_move_insn (d->target, dcopy.op0);
46773 return true;
46776 ok = expand_vec_perm_1 (&dcopy);
46777 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
46779 return ok;
46782 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46783 the permutation using the SSE4_1 pblendv instruction. Potentially
46784 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and a pblendv. */
46786 static bool
46787 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
46789 unsigned i, which, nelt = d->nelt;
46790 struct expand_vec_perm_d dcopy, dcopy1;
46791 machine_mode vmode = d->vmode;
46792 bool ok;
46794 /* Use the same checks as in expand_vec_perm_blend. */
46795 if (d->one_operand_p)
46796 return false;
46797 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46799 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46801 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46803 else
46804 return false;
46806 /* Figure out which permutation elements do not stay in their
46807 respective lanes. */
46808 for (i = 0, which = 0; i < nelt; ++i)
46810 unsigned e = d->perm[i];
46811 if (e != i)
46812 which |= (e < nelt ? 1 : 2);
46814 /* We can pblend the part whose elements do not stay in their
46815 respective lanes only when these elements all come from one
46816 half of the permutation.
46817 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not in their respective
46818 lanes but both are >= 8;
46819 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not in their
46820 respective lanes and 8 >= 8 but 2 is not. */
46821 if (which != 1 && which != 2)
46822 return false;
46823 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
46824 return true;
46826 /* First we apply a one-operand permutation to the elements
46827 that do not stay in their respective lanes. */
46828 dcopy = *d;
46829 if (which == 2)
46830 dcopy.op0 = dcopy.op1 = d->op1;
46831 else
46832 dcopy.op0 = dcopy.op1 = d->op0;
46833 if (!d->testing_p)
46834 dcopy.target = gen_reg_rtx (vmode);
46835 dcopy.one_operand_p = true;
46837 for (i = 0; i < nelt; ++i)
46838 dcopy.perm[i] = d->perm[i] & (nelt - 1);
46840 ok = expand_vec_perm_1 (&dcopy);
46841 if (GET_MODE_SIZE (vmode) != 16 && !ok)
46842 return false;
46843 else
46844 gcc_assert (ok);
46845 if (d->testing_p)
46846 return true;
46848 /* Next we put the permuted elements into their final positions. */
46849 dcopy1 = *d;
46850 if (which == 2)
46851 dcopy1.op1 = dcopy.target;
46852 else
46853 dcopy1.op0 = dcopy.target;
46855 for (i = 0; i < nelt; ++i)
46856 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
46858 ok = expand_vec_perm_blend (&dcopy1);
46859 gcc_assert (ok);
46861 return true;
46864 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
46866 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46867 a two vector permutation into a single vector permutation by using
46868 an interleave operation to merge the vectors. */
46870 static bool
46871 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
46873 struct expand_vec_perm_d dremap, dfinal;
46874 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
46875 unsigned HOST_WIDE_INT contents;
46876 unsigned char remap[2 * MAX_VECT_LEN];
46877 rtx_insn *seq;
46878 bool ok, same_halves = false;
46880 if (GET_MODE_SIZE (d->vmode) == 16)
46882 if (d->one_operand_p)
46883 return false;
46885 else if (GET_MODE_SIZE (d->vmode) == 32)
46887 if (!TARGET_AVX)
46888 return false;
46889 /* For 32-byte modes allow even d->one_operand_p.
46890 The lack of cross-lane shuffling in some instructions
46891 might prevent a single insn shuffle. */
46892 dfinal = *d;
46893 dfinal.testing_p = true;
46894 /* If expand_vec_perm_interleave3 can expand this into
46895 a 3-insn sequence, give up and let it be expanded
46896 that way. While that is one insn longer, it doesn't
46897 need a memory operand, and in the common case where
46898 both the interleave-low and interleave-high permutations
46899 with the same operands are adjacent, the two together
46900 need only 4 insns after CSE. */
46901 if (expand_vec_perm_interleave3 (&dfinal))
46902 return false;
46904 else
46905 return false;
46907 /* Examine from whence the elements come. */
46908 contents = 0;
46909 for (i = 0; i < nelt; ++i)
46910 contents |= HOST_WIDE_INT_1U << d->perm[i];
46912 memset (remap, 0xff, sizeof (remap));
46913 dremap = *d;
46915 if (GET_MODE_SIZE (d->vmode) == 16)
46917 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46919 /* Split the two input vectors into 4 halves. */
46920 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46921 h2 = h1 << nelt2;
46922 h3 = h2 << nelt2;
46923 h4 = h3 << nelt2;
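/* H1 and H2 are bitmasks of the element indices in the low and high
   halves of the first operand, H3 and H4 of the low and high halves of
   the second operand.  */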
46925 /* If all the elements come from the low halves, use interleave low;
46926 similarly for interleave high. If the elements come from mismatched
46927 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
46928 if ((contents & (h1 | h3)) == contents)
46930 /* punpckl* */
46931 for (i = 0; i < nelt2; ++i)
46933 remap[i] = i * 2;
46934 remap[i + nelt] = i * 2 + 1;
46935 dremap.perm[i * 2] = i;
46936 dremap.perm[i * 2 + 1] = i + nelt;
46938 if (!TARGET_SSE2 && d->vmode == V4SImode)
46939 dremap.vmode = V4SFmode;
46941 else if ((contents & (h2 | h4)) == contents)
46943 /* punpckh* */
46944 for (i = 0; i < nelt2; ++i)
46946 remap[i + nelt2] = i * 2;
46947 remap[i + nelt + nelt2] = i * 2 + 1;
46948 dremap.perm[i * 2] = i + nelt2;
46949 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
46951 if (!TARGET_SSE2 && d->vmode == V4SImode)
46952 dremap.vmode = V4SFmode;
46954 else if ((contents & (h1 | h4)) == contents)
46956 /* shufps */
46957 for (i = 0; i < nelt2; ++i)
46959 remap[i] = i;
46960 remap[i + nelt + nelt2] = i + nelt2;
46961 dremap.perm[i] = i;
46962 dremap.perm[i + nelt2] = i + nelt + nelt2;
46964 if (nelt != 4)
46966 /* shufpd */
46967 dremap.vmode = V2DImode;
46968 dremap.nelt = 2;
46969 dremap.perm[0] = 0;
46970 dremap.perm[1] = 3;
46973 else if ((contents & (h2 | h3)) == contents)
46975 /* shufps */
46976 for (i = 0; i < nelt2; ++i)
46978 remap[i + nelt2] = i;
46979 remap[i + nelt] = i + nelt2;
46980 dremap.perm[i] = i + nelt2;
46981 dremap.perm[i + nelt2] = i + nelt;
46983 if (nelt != 4)
46985 /* shufpd */
46986 dremap.vmode = V2DImode;
46987 dremap.nelt = 2;
46988 dremap.perm[0] = 1;
46989 dremap.perm[1] = 2;
46992 else
46993 return false;
46995 else
46997 unsigned int nelt4 = nelt / 4, nzcnt = 0;
46998 unsigned HOST_WIDE_INT q[8];
46999 unsigned int nonzero_halves[4];
47001 /* Split the two input vectors into 8 quarters. */
47002 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47003 for (i = 1; i < 8; ++i)
47004 q[i] = q[0] << (nelt4 * i);
47005 for (i = 0; i < 4; ++i)
47006 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47008 nonzero_halves[nzcnt] = i;
47009 ++nzcnt;
47012 if (nzcnt == 1)
47014 gcc_assert (d->one_operand_p);
47015 nonzero_halves[1] = nonzero_halves[0];
47016 same_halves = true;
47018 else if (d->one_operand_p)
47020 gcc_assert (nonzero_halves[0] == 0);
47021 gcc_assert (nonzero_halves[1] == 1);
47024 if (nzcnt <= 2)
47026 if (d->perm[0] / nelt2 == nonzero_halves[1])
47028 /* Attempt to increase the likelihood that dfinal
47029 shuffle will be intra-lane. */
47030 std::swap (nonzero_halves[0], nonzero_halves[1]);
47033 /* vperm2f128 or vperm2i128. */
47034 for (i = 0; i < nelt2; ++i)
47036 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47037 remap[i + nonzero_halves[0] * nelt2] = i;
47038 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47039 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47042 if (d->vmode != V8SFmode
47043 && d->vmode != V4DFmode
47044 && d->vmode != V8SImode)
47046 dremap.vmode = V8SImode;
47047 dremap.nelt = 8;
47048 for (i = 0; i < 4; ++i)
47050 dremap.perm[i] = i + nonzero_halves[0] * 4;
47051 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47055 else if (d->one_operand_p)
47056 return false;
47057 else if (TARGET_AVX2
47058 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47060 /* vpunpckl* */
47061 for (i = 0; i < nelt4; ++i)
47063 remap[i] = i * 2;
47064 remap[i + nelt] = i * 2 + 1;
47065 remap[i + nelt2] = i * 2 + nelt2;
47066 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47067 dremap.perm[i * 2] = i;
47068 dremap.perm[i * 2 + 1] = i + nelt;
47069 dremap.perm[i * 2 + nelt2] = i + nelt2;
47070 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47073 else if (TARGET_AVX2
47074 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47076 /* vpunpckh* */
47077 for (i = 0; i < nelt4; ++i)
47079 remap[i + nelt4] = i * 2;
47080 remap[i + nelt + nelt4] = i * 2 + 1;
47081 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47082 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47083 dremap.perm[i * 2] = i + nelt4;
47084 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47085 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47086 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47089 else
47090 return false;
47093 /* Use the remapping array set up above to move the elements from their
47094 swizzled locations into their final destinations. */
47095 dfinal = *d;
47096 for (i = 0; i < nelt; ++i)
47098 unsigned e = remap[d->perm[i]];
47099 gcc_assert (e < nelt);
47100 /* If same_halves is true, both halves of the remapped vector are the
47101 same. Avoid cross-lane accesses if possible. */
47102 if (same_halves && i >= nelt2)
47104 gcc_assert (e < nelt2);
47105 dfinal.perm[i] = e + nelt2;
47107 else
47108 dfinal.perm[i] = e;
47110 if (!d->testing_p)
47112 dremap.target = gen_reg_rtx (dremap.vmode);
47113 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47115 dfinal.op1 = dfinal.op0;
47116 dfinal.one_operand_p = true;
47118 /* Test if the final remap can be done with a single insn. For V4SFmode or
47119 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
47120 start_sequence ();
47121 ok = expand_vec_perm_1 (&dfinal);
47122 seq = get_insns ();
47123 end_sequence ();
47125 if (!ok)
47126 return false;
47128 if (d->testing_p)
47129 return true;
47131 if (dremap.vmode != dfinal.vmode)
47133 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
47134 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
47137 ok = expand_vec_perm_1 (&dremap);
47138 gcc_assert (ok);
47140 emit_insn (seq);
47141 return true;
47144 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47145 a single vector cross-lane permutation into vpermq followed
47146 by any of the single insn permutations. */
47148 static bool
47149 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
47151 struct expand_vec_perm_d dremap, dfinal;
47152 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
47153 unsigned contents[2];
47154 bool ok;
47156 if (!(TARGET_AVX2
47157 && (d->vmode == V32QImode || d->vmode == V16HImode)
47158 && d->one_operand_p))
47159 return false;
47161 contents[0] = 0;
47162 contents[1] = 0;
47163 for (i = 0; i < nelt2; ++i)
47165 contents[0] |= 1u << (d->perm[i] / nelt4);
47166 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
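/* Each half of the result may draw from at most two of the four 64-bit
   quarters of the input: the intermediate vpermq places two quarters
   into each 128-bit lane, and the final shuffle cannot cross lanes.  */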
47169 for (i = 0; i < 2; ++i)
47171 unsigned int cnt = 0;
47172 for (j = 0; j < 4; ++j)
47173 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
47174 return false;
47177 if (d->testing_p)
47178 return true;
47180 dremap = *d;
47181 dremap.vmode = V4DImode;
47182 dremap.nelt = 4;
47183 dremap.target = gen_reg_rtx (V4DImode);
47184 dremap.op0 = gen_lowpart (V4DImode, d->op0);
47185 dremap.op1 = dremap.op0;
47186 dremap.one_operand_p = true;
47187 for (i = 0; i < 2; ++i)
47189 unsigned int cnt = 0;
47190 for (j = 0; j < 4; ++j)
47191 if ((contents[i] & (1u << j)) != 0)
47192 dremap.perm[2 * i + cnt++] = j;
47193 for (; cnt < 2; ++cnt)
47194 dremap.perm[2 * i + cnt] = 0;
47197 dfinal = *d;
47198 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47199 dfinal.op1 = dfinal.op0;
47200 dfinal.one_operand_p = true;
47201 for (i = 0, j = 0; i < nelt; ++i)
47203 if (i == nelt2)
47204 j = 2;
47205 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
47206 if ((d->perm[i] / nelt4) == dremap.perm[j])
47208 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
47209 dfinal.perm[i] |= nelt4;
47210 else
47211 gcc_unreachable ();
47214 ok = expand_vec_perm_1 (&dremap);
47215 gcc_assert (ok);
47217 ok = expand_vec_perm_1 (&dfinal);
47218 gcc_assert (ok);
47220 return true;
47223 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
47224 a vector permutation using two instructions: vperm2f128 (or
47225 vperm2i128) followed by any single in-lane permutation. */
47227 static bool
47228 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
47230 struct expand_vec_perm_d dfirst, dsecond;
47231 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
47232 bool ok;
47234 if (!TARGET_AVX
47235 || GET_MODE_SIZE (d->vmode) != 32
47236 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
47237 return false;
47239 dsecond = *d;
47240 dsecond.one_operand_p = false;
47241 dsecond.testing_p = true;
47243 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
47244 immediate. For perm < 16 the second permutation uses
47245 d->op0 as first operand, for perm >= 16 it uses d->op1
47246 as first operand. The second operand is the result of
47247 vperm2[fi]128. */
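/* In the vperm2[fi]128 immediate, bits 0-1 select which of the four
   input lanes (0/1 from the first operand, 2/3 from the second) becomes
   the low 128-bit lane of the result, and bits 4-5 select the high
   lane.  */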
47248 for (perm = 0; perm < 32; perm++)
47250 /* Ignore permutations which do not move anything cross-lane. */
47251 if (perm < 16)
47253 /* The second shuffle for e.g. V4DFmode has
47254 0123 and ABCD operands.
47255 Ignore AB23, as 23 is already in the second lane
47256 of the first operand. */
47257 if ((perm & 0xc) == (1 << 2)) continue;
47258 /* And 01CD, as 01 is in the first lane of the first
47259 operand. */
47260 if ((perm & 3) == 0) continue;
47261 /* And 4567, as then the vperm2[fi]128 doesn't change
47262 anything on the original 4567 second operand. */
47263 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
47265 else
47267 /* The second shuffle for e.g. V4DFmode has
47268 4567 and ABCD operands.
47269 Ignore AB67, as 67 is already in the second lane
47270 of the first operand. */
47271 if ((perm & 0xc) == (3 << 2)) continue;
47272 /* And 45CD, as 45 is in the first lane of the first
47273 operand. */
47274 if ((perm & 3) == 2) continue;
47275 /* And 0123, as then the vperm2[fi]128 doesn't change
47276 anything on the original 0123 first operand. */
47277 if ((perm & 0xf) == (1 << 2)) continue;
47280 for (i = 0; i < nelt; i++)
47282 j = d->perm[i] / nelt2;
47283 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
47284 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
47285 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
47286 dsecond.perm[i] = d->perm[i] & (nelt - 1);
47287 else
47288 break;
47291 if (i == nelt)
47293 start_sequence ();
47294 ok = expand_vec_perm_1 (&dsecond);
47295 end_sequence ();
47297 else
47298 ok = false;
47300 if (ok)
47302 if (d->testing_p)
47303 return true;
47305 /* Found a usable second shuffle. dfirst will be
47306 vperm2f128 on d->op0 and d->op1. */
47307 dsecond.testing_p = false;
47308 dfirst = *d;
47309 dfirst.target = gen_reg_rtx (d->vmode);
47310 for (i = 0; i < nelt; i++)
47311 dfirst.perm[i] = (i & (nelt2 - 1))
47312 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
47314 canonicalize_perm (&dfirst);
47315 ok = expand_vec_perm_1 (&dfirst);
47316 gcc_assert (ok);
47318 /* And dsecond is some single insn shuffle, taking
47319 d->op0 and result of vperm2f128 (if perm < 16) or
47320 d->op1 and result of vperm2f128 (otherwise). */
47321 if (perm >= 16)
47322 dsecond.op0 = dsecond.op1;
47323 dsecond.op1 = dfirst.target;
47325 ok = expand_vec_perm_1 (&dsecond);
47326 gcc_assert (ok);
47328 return true;
47331 /* For one operand, the only useful vperm2f128 permutation is 0x01,
47332 i.e. swapping the two 128-bit lanes. */
47333 if (d->one_operand_p)
47334 return false;
47337 return false;
47340 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47341 a two vector permutation using 2 intra-lane interleave insns
47342 and cross-lane shuffle for 32-byte vectors. */
47344 static bool
47345 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
47347 unsigned i, nelt;
47348 rtx (*gen) (rtx, rtx, rtx);
47350 if (d->one_operand_p)
47351 return false;
47352 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
47354 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
47356 else
47357 return false;
47359 nelt = d->nelt;
47360 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
47361 return false;
47362 for (i = 0; i < nelt; i += 2)
47363 if (d->perm[i] != d->perm[0] + i / 2
47364 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
47365 return false;
47367 if (d->testing_p)
47368 return true;
47370 switch (d->vmode)
47372 case V32QImode:
47373 if (d->perm[0])
47374 gen = gen_vec_interleave_highv32qi;
47375 else
47376 gen = gen_vec_interleave_lowv32qi;
47377 break;
47378 case V16HImode:
47379 if (d->perm[0])
47380 gen = gen_vec_interleave_highv16hi;
47381 else
47382 gen = gen_vec_interleave_lowv16hi;
47383 break;
47384 case V8SImode:
47385 if (d->perm[0])
47386 gen = gen_vec_interleave_highv8si;
47387 else
47388 gen = gen_vec_interleave_lowv8si;
47389 break;
47390 case V4DImode:
47391 if (d->perm[0])
47392 gen = gen_vec_interleave_highv4di;
47393 else
47394 gen = gen_vec_interleave_lowv4di;
47395 break;
47396 case V8SFmode:
47397 if (d->perm[0])
47398 gen = gen_vec_interleave_highv8sf;
47399 else
47400 gen = gen_vec_interleave_lowv8sf;
47401 break;
47402 case V4DFmode:
47403 if (d->perm[0])
47404 gen = gen_vec_interleave_highv4df;
47405 else
47406 gen = gen_vec_interleave_lowv4df;
47407 break;
47408 default:
47409 gcc_unreachable ();
47412 emit_insn (gen (d->target, d->op0, d->op1));
47413 return true;
47416 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
47417 a single vector permutation using a single intra-lane vector
47418 permutation, vperm2f128 swapping the lanes and vblend* insn blending
47419 the non-swapped and swapped vectors together. */
47421 static bool
47422 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
47424 struct expand_vec_perm_d dfirst, dsecond;
47425 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
47426 rtx_insn *seq;
47427 bool ok;
47428 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
47430 if (!TARGET_AVX
47431 || TARGET_AVX2
47432 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
47433 || !d->one_operand_p)
47434 return false;
47436 dfirst = *d;
47437 for (i = 0; i < nelt; i++)
47438 dfirst.perm[i] = 0xff;
47439 for (i = 0, msk = 0; i < nelt; i++)
47441 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
47442 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
47443 return false;
47444 dfirst.perm[j] = d->perm[i];
47445 if (j != i)
47446 msk |= (1 << i);
47448 for (i = 0; i < nelt; i++)
47449 if (dfirst.perm[i] == 0xff)
47450 dfirst.perm[i] = i;
47452 if (!d->testing_p)
47453 dfirst.target = gen_reg_rtx (dfirst.vmode);
47455 start_sequence ();
47456 ok = expand_vec_perm_1 (&dfirst);
47457 seq = get_insns ();
47458 end_sequence ();
47460 if (!ok)
47461 return false;
47463 if (d->testing_p)
47464 return true;
47466 emit_insn (seq);
47468 dsecond = *d;
47469 dsecond.op0 = dfirst.target;
47470 dsecond.op1 = dfirst.target;
47471 dsecond.one_operand_p = true;
47472 dsecond.target = gen_reg_rtx (dsecond.vmode);
47473 for (i = 0; i < nelt; i++)
47474 dsecond.perm[i] = i ^ nelt2;
47476 ok = expand_vec_perm_1 (&dsecond);
47477 gcc_assert (ok);
47479 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
47480 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
47481 return true;
47484 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
47485 permutation using two vperm2f128, followed by a vshufpd insn blending
47486 the two vectors together. */
47488 static bool
47489 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
47491 struct expand_vec_perm_d dfirst, dsecond, dthird;
47492 bool ok;
47494 if (!TARGET_AVX || (d->vmode != V4DFmode))
47495 return false;
47497 if (d->testing_p)
47498 return true;
47500 dfirst = *d;
47501 dsecond = *d;
47502 dthird = *d;
47504 dfirst.perm[0] = (d->perm[0] & ~1);
47505 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
47506 dfirst.perm[2] = (d->perm[2] & ~1);
47507 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
47508 dsecond.perm[0] = (d->perm[1] & ~1);
47509 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
47510 dsecond.perm[2] = (d->perm[3] & ~1);
47511 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
47512 dthird.perm[0] = (d->perm[0] % 2);
47513 dthird.perm[1] = (d->perm[1] % 2) + 4;
47514 dthird.perm[2] = (d->perm[2] % 2) + 2;
47515 dthird.perm[3] = (d->perm[3] % 2) + 6;
47517 dfirst.target = gen_reg_rtx (dfirst.vmode);
47518 dsecond.target = gen_reg_rtx (dsecond.vmode);
47519 dthird.op0 = dfirst.target;
47520 dthird.op1 = dsecond.target;
47521 dthird.one_operand_p = false;
47523 canonicalize_perm (&dfirst);
47524 canonicalize_perm (&dsecond);
47526 ok = expand_vec_perm_1 (&dfirst)
47527 && expand_vec_perm_1 (&dsecond)
47528 && expand_vec_perm_1 (&dthird);
47530 gcc_assert (ok);
47532 return true;
47535 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
47536 permutation with two pshufb insns and an ior. We should have already
47537 failed all two instruction sequences. */
47539 static bool
47540 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
47542 rtx rperm[2][16], vperm, l, h, op, m128;
47543 unsigned int i, nelt, eltsz;
47545 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47546 return false;
47547 gcc_assert (!d->one_operand_p);
47549 if (d->testing_p)
47550 return true;
47552 nelt = d->nelt;
47553 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47555 /* Generate two permutation masks. If the required element is within
47556 the given vector it is shuffled into the proper lane. If the required
47557 element is in the other vector, force a zero into the lane by setting
47558 bit 7 in the permutation mask. */
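/* For instance, if d->perm[3] == 20 (byte 4 of op1), the mask used on op1
   gets the value 4 at byte 3 while the mask used on op0 gets -128 there,
   so after the two pshufbs byte 3 of one result is op1's byte 4 and byte 3
   of the other result is zero; the final ior keeps the wanted byte.  */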
47559 m128 = GEN_INT (-128);
47560 for (i = 0; i < nelt; ++i)
47562 unsigned j, e = d->perm[i];
47563 unsigned which = (e >= nelt);
47564 if (e >= nelt)
47565 e -= nelt;
47567 for (j = 0; j < eltsz; ++j)
47569 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
47570 rperm[1-which][i*eltsz + j] = m128;
47574 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
47575 vperm = force_reg (V16QImode, vperm);
47577 l = gen_reg_rtx (V16QImode);
47578 op = gen_lowpart (V16QImode, d->op0);
47579 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
47581 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
47582 vperm = force_reg (V16QImode, vperm);
47584 h = gen_reg_rtx (V16QImode);
47585 op = gen_lowpart (V16QImode, d->op1);
47586 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
47588 op = d->target;
47589 if (d->vmode != V16QImode)
47590 op = gen_reg_rtx (V16QImode);
47591 emit_insn (gen_iorv16qi3 (op, l, h));
47592 if (op != d->target)
47593 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47595 return true;
47598 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
47599 with two vpshufb insns, vpermq and vpor. We should have already failed
47600 all two or three instruction sequences. */
47602 static bool
47603 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
47605 rtx rperm[2][32], vperm, l, h, hp, op, m128;
47606 unsigned int i, nelt, eltsz;
47608 if (!TARGET_AVX2
47609 || !d->one_operand_p
47610 || (d->vmode != V32QImode && d->vmode != V16HImode))
47611 return false;
47613 if (d->testing_p)
47614 return true;
47616 nelt = d->nelt;
47617 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47619 /* Generate two permutation masks. If the required element is within
47620 the same lane, it is shuffled in. If the required element is from the
47621 other lane, force a zero by setting bit 7 in the permutation mask.
47622 The other mask has a non-negative element whenever the element is
47623 requested from the other lane; it is also moved to the other lane,
47624 so that the result of vpshufb can have the two V2TImode halves
47625 swapped. */
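/* That is, the mask for h places each cross-lane byte at the mirrored
   position (position XOR 16), which lies in the lane the source byte
   actually occupies, while the mask for l zeroes the requested position;
   the vpermq below then swaps the 128-bit halves of h, so those bytes
   arrive at their requested positions and the final vpor merges them with
   the in-lane bytes gathered in l.  */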
47626 m128 = GEN_INT (-128);
47627 for (i = 0; i < nelt; ++i)
47629 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47630 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47632 for (j = 0; j < eltsz; ++j)
47634 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
47635 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
47639 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
47640 vperm = force_reg (V32QImode, vperm);
47642 h = gen_reg_rtx (V32QImode);
47643 op = gen_lowpart (V32QImode, d->op0);
47644 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
47646 /* Swap the 128-bit lanes of h into hp. */
47647 hp = gen_reg_rtx (V4DImode);
47648 op = gen_lowpart (V4DImode, h);
47649 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
47650 const1_rtx));
47652 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
47653 vperm = force_reg (V32QImode, vperm);
47655 l = gen_reg_rtx (V32QImode);
47656 op = gen_lowpart (V32QImode, d->op0);
47657 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
47659 op = d->target;
47660 if (d->vmode != V32QImode)
47661 op = gen_reg_rtx (V32QImode);
47662 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
47663 if (op != d->target)
47664 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47666 return true;
47669 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47670 and extract-odd permutations of two V32QImode or V16HImode operands
47671 with two vpshufb insns, vpor and vpermq. We should have already
47672 failed all two or three instruction sequences. */
47674 static bool
47675 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
47677 rtx rperm[2][32], vperm, l, h, ior, op, m128;
47678 unsigned int i, nelt, eltsz;
47680 if (!TARGET_AVX2
47681 || d->one_operand_p
47682 || (d->vmode != V32QImode && d->vmode != V16HImode))
47683 return false;
47685 for (i = 0; i < d->nelt; ++i)
47686 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
47687 return false;
47689 if (d->testing_p)
47690 return true;
47692 nelt = d->nelt;
47693 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47695 /* Generate two permutation masks. In the first permutation mask
47696 the first quarter will contain indexes for the first half
47697 of the op0, the second quarter will contain bit 7 set, third quarter
47698 will contain indexes for the second half of the op0 and the
47699 last quarter bit 7 set. In the second permutation mask
47700 the first quarter will contain bit 7 set, the second quarter
47701 indexes for the first half of the op1, the third quarter bit 7 set
47702 and last quarter indexes for the second half of the op1.
47703 I.e. the first mask e.g. for V32QImode extract even will be:
47704 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
47705 (all values masked with 0xf except for -128) and second mask
47706 for extract even will be
47707 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
47708 m128 = GEN_INT (-128);
47709 for (i = 0; i < nelt; ++i)
47711 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47712 unsigned which = d->perm[i] >= nelt;
47713 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
47715 for (j = 0; j < eltsz; ++j)
47717 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
47718 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
47722 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
47723 vperm = force_reg (V32QImode, vperm);
47725 l = gen_reg_rtx (V32QImode);
47726 op = gen_lowpart (V32QImode, d->op0);
47727 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
47729 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
47730 vperm = force_reg (V32QImode, vperm);
47732 h = gen_reg_rtx (V32QImode);
47733 op = gen_lowpart (V32QImode, d->op1);
47734 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
47736 ior = gen_reg_rtx (V32QImode);
47737 emit_insn (gen_iorv32qi3 (ior, l, h));
47739 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
47740 op = gen_reg_rtx (V4DImode);
47741 ior = gen_lowpart (V4DImode, ior);
47742 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
47743 const1_rtx, GEN_INT (3)));
47744 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47746 return true;
47749 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47750 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
47751 with two "and" and "pack" or two "shift" and "pack" insns. We should
47752 have already failed all two instruction sequences. */
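/* The idea: for an even extraction, masking every 2N-bit word of both
   operands with (1 << N) - 1 leaves values that fit in N bits, so the
   unsigned-saturating pack never saturates and simply concatenates the even
   elements of the two inputs (per 128-bit lane); for an odd extraction a
   logical shift right by N moves the odd element into the low half first.
   On the 256-bit modes the pack works lane by lane, hence the trailing
   vpermq fix-up below.  */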
47754 static bool
47755 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
47757 rtx op, dop0, dop1, t, rperm[16];
47758 unsigned i, odd, c, s, nelt = d->nelt;
47759 bool end_perm = false;
47760 machine_mode half_mode;
47761 rtx (*gen_and) (rtx, rtx, rtx);
47762 rtx (*gen_pack) (rtx, rtx, rtx);
47763 rtx (*gen_shift) (rtx, rtx, rtx);
47765 if (d->one_operand_p)
47766 return false;
47768 switch (d->vmode)
47770 case V8HImode:
47771 /* Required for "pack". */
47772 if (!TARGET_SSE4_1)
47773 return false;
47774 c = 0xffff;
47775 s = 16;
47776 half_mode = V4SImode;
47777 gen_and = gen_andv4si3;
47778 gen_pack = gen_sse4_1_packusdw;
47779 gen_shift = gen_lshrv4si3;
47780 break;
47781 case V16QImode:
47782 /* No check as all instructions are SSE2. */
47783 c = 0xff;
47784 s = 8;
47785 half_mode = V8HImode;
47786 gen_and = gen_andv8hi3;
47787 gen_pack = gen_sse2_packuswb;
47788 gen_shift = gen_lshrv8hi3;
47789 break;
47790 case V16HImode:
47791 if (!TARGET_AVX2)
47792 return false;
47793 c = 0xffff;
47794 s = 16;
47795 half_mode = V8SImode;
47796 gen_and = gen_andv8si3;
47797 gen_pack = gen_avx2_packusdw;
47798 gen_shift = gen_lshrv8si3;
47799 end_perm = true;
47800 break;
47801 case V32QImode:
47802 if (!TARGET_AVX2)
47803 return false;
47804 c = 0xff;
47805 s = 8;
47806 half_mode = V16HImode;
47807 gen_and = gen_andv16hi3;
47808 gen_pack = gen_avx2_packuswb;
47809 gen_shift = gen_lshrv16hi3;
47810 end_perm = true;
47811 break;
47812 default:
47813 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
47814 general shuffles. */
47815 return false;
47818 /* Check that permutation is even or odd. */
47819 odd = d->perm[0];
47820 if (odd > 1)
47821 return false;
47823 for (i = 1; i < nelt; ++i)
47824 if (d->perm[i] != 2 * i + odd)
47825 return false;
47827 if (d->testing_p)
47828 return true;
47830 dop0 = gen_reg_rtx (half_mode);
47831 dop1 = gen_reg_rtx (half_mode);
47832 if (odd == 0)
47834 for (i = 0; i < nelt / 2; i++)
47835 rperm[i] = GEN_INT (c);
47836 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
47837 t = force_reg (half_mode, t);
47838 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
47839 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
47841 else
47843 emit_insn (gen_shift (dop0,
47844 gen_lowpart (half_mode, d->op0),
47845 GEN_INT (s)));
47846 emit_insn (gen_shift (dop1,
47847 gen_lowpart (half_mode, d->op1),
47848 GEN_INT (s)));
47850 /* In the AVX2 256-bit case we need to permute the pack result. */
47851 if (TARGET_AVX2 && end_perm)
47853 op = gen_reg_rtx (d->vmode);
47854 t = gen_reg_rtx (V4DImode);
47855 emit_insn (gen_pack (op, dop0, dop1));
47856 emit_insn (gen_avx2_permv4di_1 (t,
47857 gen_lowpart (V4DImode, op),
47858 const0_rtx,
47859 const2_rtx,
47860 const1_rtx,
47861 GEN_INT (3)));
47862 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
47864 else
47865 emit_insn (gen_pack (d->target, dop0, dop1));
47867 return true;
47870 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47871 and extract-odd permutations of two V64QI operands
47872 with two "shifts", two "truncs" and one "concat" insns for "odd"
47873 and two "truncs" and one concat insn for "even."
47874 Have already failed all two instruction sequences. */
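/* This works because the even bytes of a V64QI vector are exactly the low
   bytes of its V32HI words, which vpmovwb (truncatev32hiv32qi) extracts
   directly; for the odd bytes each word is first shifted right by 8.  */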
47876 static bool
47877 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
47879 rtx t1, t2, t3, t4;
47880 unsigned i, odd, nelt = d->nelt;
47882 if (!TARGET_AVX512BW
47883 || d->one_operand_p
47884 || d->vmode != V64QImode)
47885 return false;
47887 /* Check that permutation is even or odd. */
47888 odd = d->perm[0];
47889 if (odd > 1)
47890 return false;
47892 for (i = 1; i < nelt; ++i)
47893 if (d->perm[i] != 2 * i + odd)
47894 return false;
47896 if (d->testing_p)
47897 return true;
47900 if (odd)
47902 t1 = gen_reg_rtx (V32HImode);
47903 t2 = gen_reg_rtx (V32HImode);
47904 emit_insn (gen_lshrv32hi3 (t1,
47905 gen_lowpart (V32HImode, d->op0),
47906 GEN_INT (8)));
47907 emit_insn (gen_lshrv32hi3 (t2,
47908 gen_lowpart (V32HImode, d->op1),
47909 GEN_INT (8)));
47911 else
47913 t1 = gen_lowpart (V32HImode, d->op0);
47914 t2 = gen_lowpart (V32HImode, d->op1);
47917 t3 = gen_reg_rtx (V32QImode);
47918 t4 = gen_reg_rtx (V32QImode);
47919 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47920 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47921 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47923 return true;
47926 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
47927 and extract-odd permutations. */
47929 static bool
47930 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47932 rtx t1, t2, t3, t4, t5;
47934 switch (d->vmode)
47936 case V4DFmode:
47937 if (d->testing_p)
47938 break;
47939 t1 = gen_reg_rtx (V4DFmode);
47940 t2 = gen_reg_rtx (V4DFmode);
47942 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47943 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
47944 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
47946 /* Now an unpck[lh]pd will produce the result required. */
47947 if (odd)
47948 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
47949 else
47950 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
47951 emit_insn (t3);
47952 break;
47954 case V8SFmode:
47956 int mask = odd ? 0xdd : 0x88;
47958 if (d->testing_p)
47959 break;
47960 t1 = gen_reg_rtx (V8SFmode);
47961 t2 = gen_reg_rtx (V8SFmode);
47962 t3 = gen_reg_rtx (V8SFmode);
47964 /* Shuffle within the 128-bit lanes to produce:
47965 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
47966 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
47967 GEN_INT (mask)));
47969 /* Shuffle the lanes around to produce:
47970 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
47971 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
47972 GEN_INT (0x3)));
47974 /* Shuffle within the 128-bit lanes to produce:
47975 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
47976 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
47978 /* Shuffle within the 128-bit lanes to produce:
47979 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
47980 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
47982 /* Shuffle the lanes around to produce:
47983 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
47984 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
47985 GEN_INT (0x20)));
47987 break;
47989 case V2DFmode:
47990 case V4SFmode:
47991 case V2DImode:
47992 case V4SImode:
47993 /* These are always directly implementable by expand_vec_perm_1. */
47994 gcc_unreachable ();
47996 case V8HImode:
47997 if (TARGET_SSE4_1)
47998 return expand_vec_perm_even_odd_pack (d);
47999 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48000 return expand_vec_perm_pshufb2 (d);
48001 else
48003 if (d->testing_p)
48004 break;
48005 /* We need 2*log2(N)-1 operations to achieve odd/even
48006 with interleave. */
48007 t1 = gen_reg_rtx (V8HImode);
48008 t2 = gen_reg_rtx (V8HImode);
48009 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48010 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48011 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48012 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
48013 if (odd)
48014 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48015 else
48016 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48017 emit_insn (t3);
48019 break;
48021 case V16QImode:
48022 return expand_vec_perm_even_odd_pack (d);
48024 case V16HImode:
48025 case V32QImode:
48026 return expand_vec_perm_even_odd_pack (d);
48028 case V64QImode:
48029 return expand_vec_perm_even_odd_trunc (d);
48031 case V4DImode:
48032 if (!TARGET_AVX2)
48034 struct expand_vec_perm_d d_copy = *d;
48035 d_copy.vmode = V4DFmode;
48036 if (d->testing_p)
48037 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48038 else
48039 d_copy.target = gen_reg_rtx (V4DFmode);
48040 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48041 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48042 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48044 if (!d->testing_p)
48045 emit_move_insn (d->target,
48046 gen_lowpart (V4DImode, d_copy.target));
48047 return true;
48049 return false;
48052 if (d->testing_p)
48053 break;
48055 t1 = gen_reg_rtx (V4DImode);
48056 t2 = gen_reg_rtx (V4DImode);
48058 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48059 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48060 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48062 /* Now a vpunpck[lh]qdq will produce the result required. */
48063 if (odd)
48064 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48065 else
48066 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48067 emit_insn (t3);
48068 break;
48070 case V8SImode:
48071 if (!TARGET_AVX2)
48073 struct expand_vec_perm_d d_copy = *d;
48074 d_copy.vmode = V8SFmode;
48075 if (d->testing_p)
48076 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48077 else
48078 d_copy.target = gen_reg_rtx (V8SFmode);
48079 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48080 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48081 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48083 if (!d->testing_p)
48084 emit_move_insn (d->target,
48085 gen_lowpart (V8SImode, d_copy.target));
48086 return true;
48088 return false;
48091 if (d->testing_p)
48092 break;
48094 t1 = gen_reg_rtx (V8SImode);
48095 t2 = gen_reg_rtx (V8SImode);
48096 t3 = gen_reg_rtx (V4DImode);
48097 t4 = gen_reg_rtx (V4DImode);
48098 t5 = gen_reg_rtx (V4DImode);
48100 /* Shuffle the lanes around into
48101 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
48102 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
48103 gen_lowpart (V4DImode, d->op1),
48104 GEN_INT (0x20)));
48105 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
48106 gen_lowpart (V4DImode, d->op1),
48107 GEN_INT (0x31)));
48109 /* Swap the 2nd and 3rd position in each lane into
48110 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
48111 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
48112 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48113 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
48114 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48116 /* Now a vpunpck[lh]qdq will produce
48117 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
48118 if (odd)
48119 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
48120 gen_lowpart (V4DImode, t2));
48121 else
48122 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
48123 gen_lowpart (V4DImode, t2));
48124 emit_insn (t3);
48125 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
48126 break;
48128 default:
48129 gcc_unreachable ();
48132 return true;
48135 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48136 extract-even and extract-odd permutations. */
48138 static bool
48139 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
48141 unsigned i, odd, nelt = d->nelt;
48143 odd = d->perm[0];
48144 if (odd != 0 && odd != 1)
48145 return false;
48147 for (i = 1; i < nelt; ++i)
48148 if (d->perm[i] != 2 * i + odd)
48149 return false;
48151 return expand_vec_perm_even_odd_1 (d, odd);
48154 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
48155 permutations. We assume that expand_vec_perm_1 has already failed. */
48157 static bool
48158 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
48160 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
48161 machine_mode vmode = d->vmode;
48162 unsigned char perm2[4];
48163 rtx op0 = d->op0, dest;
48164 bool ok;
48166 switch (vmode)
48168 case V4DFmode:
48169 case V8SFmode:
48170 /* These are special-cased in sse.md so that we can optionally
48171 use the vbroadcast instruction. They expand to two insns
48172 if the input happens to be in a register. */
48173 gcc_unreachable ();
48175 case V2DFmode:
48176 case V2DImode:
48177 case V4SFmode:
48178 case V4SImode:
48179 /* These are always implementable using standard shuffle patterns. */
48180 gcc_unreachable ();
48182 case V8HImode:
48183 case V16QImode:
48184 /* These can be implemented via interleave. We save one insn by
48185 stopping once we have promoted to V4SImode and then use pshufd. */
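/* E.g. to broadcast byte 5 of a V16QImode vector: an interleave-low doubles
   every byte, leaving the wanted value in word 5 of the V8HImode view;
   since 5 >= 4, the next step is an interleave-high, after which the value
   fills dword 1 of the V4SImode view (elt is reduced to 1); a pshufd
   replicating dword 1 finishes the broadcast.  */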
48186 if (d->testing_p)
48187 return true;
48190 rtx dest;
48191 rtx (*gen) (rtx, rtx, rtx)
48192 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
48193 : gen_vec_interleave_lowv8hi;
48195 if (elt >= nelt2)
48197 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
48198 : gen_vec_interleave_highv8hi;
48199 elt -= nelt2;
48201 nelt2 /= 2;
48203 dest = gen_reg_rtx (vmode);
48204 emit_insn (gen (dest, op0, op0));
48205 vmode = get_mode_wider_vector (vmode);
48206 op0 = gen_lowpart (vmode, dest);
48208 while (vmode != V4SImode);
48210 memset (perm2, elt, 4);
48211 dest = gen_reg_rtx (V4SImode);
48212 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
48213 gcc_assert (ok);
48214 if (!d->testing_p)
48215 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
48216 return true;
48218 case V64QImode:
48219 case V32QImode:
48220 case V16HImode:
48221 case V8SImode:
48222 case V4DImode:
48223 /* For AVX2 broadcasts of the first element vpbroadcast* or
48224 vpermq should be used by expand_vec_perm_1. */
48225 gcc_assert (!TARGET_AVX2 || d->perm[0]);
48226 return false;
48228 default:
48229 gcc_unreachable ();
48233 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48234 broadcast permutations. */
48236 static bool
48237 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
48239 unsigned i, elt, nelt = d->nelt;
48241 if (!d->one_operand_p)
48242 return false;
48244 elt = d->perm[0];
48245 for (i = 1; i < nelt; ++i)
48246 if (d->perm[i] != elt)
48247 return false;
48249 return expand_vec_perm_broadcast_1 (d);
48252 /* Implement arbitrary permutations of two V64QImode operands
48253 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
48254 static bool
48255 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
48257 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
48258 return false;
48260 if (d->testing_p)
48261 return true;
48263 struct expand_vec_perm_d ds[2];
48264 rtx rperm[128], vperm, target0, target1;
48265 unsigned int i, nelt;
48266 machine_mode vmode;
48268 nelt = d->nelt;
48269 vmode = V64QImode;
48271 for (i = 0; i < 2; i++)
48273 ds[i] = *d;
48274 ds[i].vmode = V32HImode;
48275 ds[i].nelt = 32;
48276 ds[i].target = gen_reg_rtx (V32HImode);
48277 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
48278 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
48281 /* Prepare permutations such that the first one takes care of
48282 putting the even bytes into the right positions or one position
48283 higher (ds[0]) and the second one takes care of
48284 putting the odd bytes into the right positions or one position
48285 lower (ds[1]). */
48287 for (i = 0; i < nelt; i++)
48289 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
48290 if (i & 1)
48292 rperm[i] = constm1_rtx;
48293 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48295 else
48297 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48298 rperm[i + 64] = constm1_rtx;
48302 bool ok = expand_vec_perm_1 (&ds[0]);
48303 gcc_assert (ok);
48304 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
48306 ok = expand_vec_perm_1 (&ds[1]);
48307 gcc_assert (ok);
48308 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
48310 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
48311 vperm = force_reg (vmode, vperm);
48312 target0 = gen_reg_rtx (V64QImode);
48313 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
48315 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
48316 vperm = force_reg (vmode, vperm);
48317 target1 = gen_reg_rtx (V64QImode);
48318 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
48320 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
48321 return true;
48324 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
48325 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
48326 all the shorter instruction sequences. */
48328 static bool
48329 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
48331 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
48332 unsigned int i, nelt, eltsz;
48333 bool used[4];
48335 if (!TARGET_AVX2
48336 || d->one_operand_p
48337 || (d->vmode != V32QImode && d->vmode != V16HImode))
48338 return false;
48340 if (d->testing_p)
48341 return true;
48343 nelt = d->nelt;
48344 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48346 /* Generate 4 permutation masks. If the required element is within
48347 the same lane, it is shuffled in. If the required element is from the
48348 other lane, force a zero by setting bit 7 in the permutation mask.
48349 The other mask has a non-negative element whenever the element is
48350 requested from the other lane; it is also moved to the other lane,
48351 so that the result of vpshufb can have the two V2TImode halves
48352 swapped. */
48353 m128 = GEN_INT (-128);
48354 for (i = 0; i < 32; ++i)
48356 rperm[0][i] = m128;
48357 rperm[1][i] = m128;
48358 rperm[2][i] = m128;
48359 rperm[3][i] = m128;
48361 used[0] = false;
48362 used[1] = false;
48363 used[2] = false;
48364 used[3] = false;
48365 for (i = 0; i < nelt; ++i)
48367 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48368 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48369 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
48371 for (j = 0; j < eltsz; ++j)
48372 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
48373 used[which] = true;
48376 for (i = 0; i < 2; ++i)
48378 if (!used[2 * i + 1])
48380 h[i] = NULL_RTX;
48381 continue;
48383 vperm = gen_rtx_CONST_VECTOR (V32QImode,
48384 gen_rtvec_v (32, rperm[2 * i + 1]));
48385 vperm = force_reg (V32QImode, vperm);
48386 h[i] = gen_reg_rtx (V32QImode);
48387 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48388 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
48391 /* Swap the 128-bit lanes of h[X]. */
48392 for (i = 0; i < 2; ++i)
48394 if (h[i] == NULL_RTX)
48395 continue;
48396 op = gen_reg_rtx (V4DImode);
48397 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
48398 const2_rtx, GEN_INT (3), const0_rtx,
48399 const1_rtx));
48400 h[i] = gen_lowpart (V32QImode, op);
48403 for (i = 0; i < 2; ++i)
48405 if (!used[2 * i])
48407 l[i] = NULL_RTX;
48408 continue;
48410 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
48411 vperm = force_reg (V32QImode, vperm);
48412 l[i] = gen_reg_rtx (V32QImode);
48413 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48414 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
48417 for (i = 0; i < 2; ++i)
48419 if (h[i] && l[i])
48421 op = gen_reg_rtx (V32QImode);
48422 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
48423 l[i] = op;
48425 else if (h[i])
48426 l[i] = h[i];
48429 gcc_assert (l[0] && l[1]);
48430 op = d->target;
48431 if (d->vmode != V32QImode)
48432 op = gen_reg_rtx (V32QImode);
48433 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
48434 if (op != d->target)
48435 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48436 return true;
48439 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
48440 With all of the interface bits taken care of, perform the expansion
48441 in D and return true on success. */
48443 static bool
48444 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
48446 /* Try a single instruction expansion. */
48447 if (expand_vec_perm_1 (d))
48448 return true;
48450 /* Try sequences of two instructions. */
48452 if (expand_vec_perm_pshuflw_pshufhw (d))
48453 return true;
48455 if (expand_vec_perm_palignr (d, false))
48456 return true;
48458 if (expand_vec_perm_interleave2 (d))
48459 return true;
48461 if (expand_vec_perm_broadcast (d))
48462 return true;
48464 if (expand_vec_perm_vpermq_perm_1 (d))
48465 return true;
48467 if (expand_vec_perm_vperm2f128 (d))
48468 return true;
48470 if (expand_vec_perm_pblendv (d))
48471 return true;
48473 /* Try sequences of three instructions. */
48475 if (expand_vec_perm_even_odd_pack (d))
48476 return true;
48478 if (expand_vec_perm_2vperm2f128_vshuf (d))
48479 return true;
48481 if (expand_vec_perm_pshufb2 (d))
48482 return true;
48484 if (expand_vec_perm_interleave3 (d))
48485 return true;
48487 if (expand_vec_perm_vperm2f128_vblend (d))
48488 return true;
48490 /* Try sequences of four instructions. */
48492 if (expand_vec_perm_even_odd_trunc (d))
48493 return true;
48494 if (expand_vec_perm_vpshufb2_vpermq (d))
48495 return true;
48497 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
48498 return true;
48500 if (expand_vec_perm_vpermi2_vpshub2 (d))
48501 return true;
48503 /* ??? Look for narrow permutations whose element orderings would
48504 allow the promotion to a wider mode. */
48506 /* ??? Look for sequences of interleave or a wider permute that place
48507 the data into the correct lanes for a half-vector shuffle like
48508 pshuf[lh]w or vpermilps. */
48510 /* ??? Look for sequences of interleave that produce the desired results.
48511 The combinatorics of punpck[lh] get pretty ugly... */
48513 if (expand_vec_perm_even_odd (d))
48514 return true;
48516 /* Even longer sequences. */
48517 if (expand_vec_perm_vpshufb4_vpermq2 (d))
48518 return true;
48520 /* See if we can get the same permutation in a different vector integer
48521 mode. */
48522 struct expand_vec_perm_d nd;
48523 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
48525 if (!d->testing_p)
48526 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
48527 return true;
48530 return false;
48533 /* If a permutation only uses one operand, make it clear. Returns true
48534 if the permutation references both operands. */
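/* For example, with nelt == 4 and perm { 5, 6, 4, 7 } only the second
   operand is referenced, so the indices are reduced to { 1, 2, 0, 3 },
   op0 is replaced by op1 and false is returned.  */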
48536 static bool
48537 canonicalize_perm (struct expand_vec_perm_d *d)
48539 int i, which, nelt = d->nelt;
48541 for (i = which = 0; i < nelt; ++i)
48542 which |= (d->perm[i] < nelt ? 1 : 2);
48544 d->one_operand_p = true;
48545 switch (which)
48547 default:
48548 gcc_unreachable();
48550 case 3:
48551 if (!rtx_equal_p (d->op0, d->op1))
48553 d->one_operand_p = false;
48554 break;
48556 /* The elements of PERM do not suggest that only the first operand
48557 is used, but both operands are identical. Allow easier matching
48558 of the permutation by folding the permutation into the single
48559 input vector. */
48560 /* FALLTHRU */
48562 case 2:
48563 for (i = 0; i < nelt; ++i)
48564 d->perm[i] &= nelt - 1;
48565 d->op0 = d->op1;
48566 break;
48568 case 1:
48569 d->op1 = d->op0;
48570 break;
48573 return (which == 3);
48576 bool
48577 ix86_expand_vec_perm_const (rtx operands[4])
48579 struct expand_vec_perm_d d;
48580 unsigned char perm[MAX_VECT_LEN];
48581 int i, nelt;
48582 bool two_args;
48583 rtx sel;
48585 d.target = operands[0];
48586 d.op0 = operands[1];
48587 d.op1 = operands[2];
48588 sel = operands[3];
48590 d.vmode = GET_MODE (d.target);
48591 gcc_assert (VECTOR_MODE_P (d.vmode));
48592 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48593 d.testing_p = false;
48595 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
48596 gcc_assert (XVECLEN (sel, 0) == nelt);
48597 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
48599 for (i = 0; i < nelt; ++i)
48601 rtx e = XVECEXP (sel, 0, i);
48602 int ei = INTVAL (e) & (2 * nelt - 1);
48603 d.perm[i] = ei;
48604 perm[i] = ei;
48607 two_args = canonicalize_perm (&d);
48609 if (ix86_expand_vec_perm_const_1 (&d))
48610 return true;
48612 /* If the selector says both arguments are needed, but the operands are the
48613 same, the above tried to expand with one_operand_p and flattened selector.
48614 If that didn't work, retry without one_operand_p; we succeeded with that
48615 during testing. */
48616 if (two_args && d.one_operand_p)
48618 d.one_operand_p = false;
48619 memcpy (d.perm, perm, sizeof (perm));
48620 return ix86_expand_vec_perm_const_1 (&d);
48623 return false;
48626 /* Implement targetm.vectorize.vec_perm_const_ok. */
48628 static bool
48629 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
48630 const unsigned char *sel)
48632 struct expand_vec_perm_d d;
48633 unsigned int i, nelt, which;
48634 bool ret;
48636 d.vmode = vmode;
48637 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48638 d.testing_p = true;
48640 /* Given sufficient ISA support we can just return true here
48641 for selected vector modes. */
48642 switch (d.vmode)
48644 case V16SFmode:
48645 case V16SImode:
48646 case V8DImode:
48647 case V8DFmode:
48648 if (TARGET_AVX512F)
48649 /* All implementable with a single vpermi2 insn. */
48650 return true;
48651 break;
48652 case V32HImode:
48653 if (TARGET_AVX512BW)
48654 /* All implementable with a single vpermi2 insn. */
48655 return true;
48656 break;
48657 case V64QImode:
48658 if (TARGET_AVX512BW)
48659 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
48660 return true;
48661 break;
48662 case V8SImode:
48663 case V8SFmode:
48664 case V4DFmode:
48665 case V4DImode:
48666 if (TARGET_AVX512VL)
48667 /* All implementable with a single vpermi2 insn. */
48668 return true;
48669 break;
48670 case V16HImode:
48671 if (TARGET_AVX2)
48672 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48673 return true;
48674 break;
48675 case V32QImode:
48676 if (TARGET_AVX2)
48677 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48678 return true;
48679 break;
48680 case V4SImode:
48681 case V4SFmode:
48682 case V8HImode:
48683 case V16QImode:
48684 /* All implementable with a single vpperm insn. */
48685 if (TARGET_XOP)
48686 return true;
48687 /* All implementable with 2 pshufb + 1 ior. */
48688 if (TARGET_SSSE3)
48689 return true;
48690 break;
48691 case V2DImode:
48692 case V2DFmode:
48693 /* All implementable with shufpd or unpck[lh]pd. */
48694 return true;
48695 default:
48696 return false;
48699 /* Extract the values from the vector CST into the permutation
48700 array in D. */
48701 memcpy (d.perm, sel, nelt);
48702 for (i = which = 0; i < nelt; ++i)
48704 unsigned char e = d.perm[i];
48705 gcc_assert (e < 2 * nelt);
48706 which |= (e < nelt ? 1 : 2);
48709 /* For all elements from the second vector, fold the elements to the first. */
48710 if (which == 2)
48711 for (i = 0; i < nelt; ++i)
48712 d.perm[i] -= nelt;
48714 /* Check whether the mask can be applied to the vector type. */
48715 d.one_operand_p = (which != 3);
48717 /* Implementable with shufps or pshufd. */
48718 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
48719 return true;
48721 /* Otherwise we have to go through the motions and see if we can
48722 figure out how to generate the requested permutation. */
48723 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
48724 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
48725 if (!d.one_operand_p)
48726 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
48728 start_sequence ();
48729 ret = ix86_expand_vec_perm_const_1 (&d);
48730 end_sequence ();
48732 return ret;
48735 void
48736 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
48738 struct expand_vec_perm_d d;
48739 unsigned i, nelt;
48741 d.target = targ;
48742 d.op0 = op0;
48743 d.op1 = op1;
48744 d.vmode = GET_MODE (targ);
48745 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48746 d.one_operand_p = false;
48747 d.testing_p = false;
48749 for (i = 0; i < nelt; ++i)
48750 d.perm[i] = i * 2 + odd;
48752 /* We'll either be able to implement the permutation directly... */
48753 if (expand_vec_perm_1 (&d))
48754 return;
48756 /* ... or we use the special-case patterns. */
48757 expand_vec_perm_even_odd_1 (&d, odd);
48760 static void
48761 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
48763 struct expand_vec_perm_d d;
48764 unsigned i, nelt, base;
48765 bool ok;
48767 d.target = targ;
48768 d.op0 = op0;
48769 d.op1 = op1;
48770 d.vmode = GET_MODE (targ);
48771 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48772 d.one_operand_p = false;
48773 d.testing_p = false;
48775 base = high_p ? nelt / 2 : 0;
48776 for (i = 0; i < nelt / 2; ++i)
48778 d.perm[i * 2] = i + base;
48779 d.perm[i * 2 + 1] = i + base + nelt;
48782 /* Note that for AVX this isn't one instruction. */
48783 ok = ix86_expand_vec_perm_const_1 (&d);
48784 gcc_assert (ok);
48788 /* Expand a vector operation CODE for a V*QImode in terms of the
48789 same operation on V*HImode. */
48791 void
48792 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
48794 machine_mode qimode = GET_MODE (dest);
48795 machine_mode himode;
48796 rtx (*gen_il) (rtx, rtx, rtx);
48797 rtx (*gen_ih) (rtx, rtx, rtx);
48798 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
48799 struct expand_vec_perm_d d;
48800 bool ok, full_interleave;
48801 bool uns_p = false;
48802 int i;
48804 switch (qimode)
48806 case V16QImode:
48807 himode = V8HImode;
48808 gen_il = gen_vec_interleave_lowv16qi;
48809 gen_ih = gen_vec_interleave_highv16qi;
48810 break;
48811 case V32QImode:
48812 himode = V16HImode;
48813 gen_il = gen_avx2_interleave_lowv32qi;
48814 gen_ih = gen_avx2_interleave_highv32qi;
48815 break;
48816 case V64QImode:
48817 himode = V32HImode;
48818 gen_il = gen_avx512bw_interleave_lowv64qi;
48819 gen_ih = gen_avx512bw_interleave_highv64qi;
48820 break;
48821 default:
48822 gcc_unreachable ();
48825 op2_l = op2_h = op2;
48826 switch (code)
48828 case MULT:
48829 /* Unpack data such that we've got a source byte in each low byte of
48830 each word. We don't care what goes into the high byte of each word.
48831 Rather than trying to get zero in there, most convenient is to let
48832 it be a copy of the low byte. */
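/* With both halves of each 16-bit word holding the same byte B, the word
   value is B * 0x101; the low byte of the product of two such words modulo
   2^16 is exactly (B1 * B2) & 0xff, which is all the QImode multiply
   needs.  */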
48833 op2_l = gen_reg_rtx (qimode);
48834 op2_h = gen_reg_rtx (qimode);
48835 emit_insn (gen_il (op2_l, op2, op2));
48836 emit_insn (gen_ih (op2_h, op2, op2));
48837 /* FALLTHRU */
48839 op1_l = gen_reg_rtx (qimode);
48840 op1_h = gen_reg_rtx (qimode);
48841 emit_insn (gen_il (op1_l, op1, op1));
48842 emit_insn (gen_ih (op1_h, op1, op1));
48843 full_interleave = qimode == V16QImode;
48844 break;
48846 case ASHIFT:
48847 case LSHIFTRT:
48848 uns_p = true;
48849 /* FALLTHRU */
48850 case ASHIFTRT:
48851 op1_l = gen_reg_rtx (himode);
48852 op1_h = gen_reg_rtx (himode);
48853 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
48854 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
48855 full_interleave = true;
48856 break;
48857 default:
48858 gcc_unreachable ();
48861 /* Perform the operation. */
48862 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
48863 1, OPTAB_DIRECT);
48864 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
48865 1, OPTAB_DIRECT);
48866 gcc_assert (res_l && res_h);
48868 /* Merge the data back into the right place. */
48869 d.target = dest;
48870 d.op0 = gen_lowpart (qimode, res_l);
48871 d.op1 = gen_lowpart (qimode, res_h);
48872 d.vmode = qimode;
48873 d.nelt = GET_MODE_NUNITS (qimode);
48874 d.one_operand_p = false;
48875 d.testing_p = false;
48877 if (full_interleave)
48879 /* For SSE2, we used a full interleave, so the desired
48880 results are in the even elements. */
48881 for (i = 0; i < d.nelt; ++i)
48882 d.perm[i] = i * 2;
48884 else
48886 /* For AVX, the interleave used above was not cross-lane. So the
48887 extraction is evens but with the second and third quarters swapped.
48888 Happily, that is even one insn shorter than even extraction.
48889 For AVX512BW we have 4 lanes. We extract evens from within a lane,
48890 always first from the first and then from the second source operand,
48891 the index bits above the low 4 bits remain the same.
48892 Thus, for d.nelt == 32 we want permutation
48893 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
48894 and for d.nelt == 64 we want permutation
48895 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48896 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
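/* As a sanity check of the formula below for d.nelt == 32: i == 0 gives 0,
   i == 7 gives 14, i == 8 gives 32, i == 15 gives 46, i == 16 gives 16 and
   i == 24 gives 48, matching the permutation listed above.  */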
48897 for (i = 0; i < d.nelt; ++i)
48898 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
48901 ok = ix86_expand_vec_perm_const_1 (&d);
48902 gcc_assert (ok);
48904 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48905 gen_rtx_fmt_ee (code, qimode, op1, op2));
48908 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48909 if op is CONST_VECTOR with all odd elements equal to their
48910 preceding element. */
48912 static bool
48913 const_vector_equal_evenodd_p (rtx op)
48915 machine_mode mode = GET_MODE (op);
48916 int i, nunits = GET_MODE_NUNITS (mode);
48917 if (GET_CODE (op) != CONST_VECTOR
48918 || nunits != CONST_VECTOR_NUNITS (op))
48919 return false;
48920 for (i = 0; i < nunits; i += 2)
48921 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48922 return false;
48923 return true;
48926 void
48927 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48928 bool uns_p, bool odd_p)
48930 machine_mode mode = GET_MODE (op1);
48931 machine_mode wmode = GET_MODE (dest);
48932 rtx x;
48933 rtx orig_op1 = op1, orig_op2 = op2;
48935 if (!nonimmediate_operand (op1, mode))
48936 op1 = force_reg (mode, op1);
48937 if (!nonimmediate_operand (op2, mode))
48938 op2 = force_reg (mode, op2);
48940 /* We only play even/odd games with vectors of SImode. */
48941 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
48943 /* If we're looking for the odd results, shift those members down to
48944 the even slots. For some cpus this is faster than a PSHUFD. */
48945 if (odd_p)
48947 /* For XOP use vpmacsdqh, but only for smult, as it is only
48948 signed. */
48949 if (TARGET_XOP && mode == V4SImode && !uns_p)
48951 x = force_reg (wmode, CONST0_RTX (wmode));
48952 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
48953 return;
48956 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
48957 if (!const_vector_equal_evenodd_p (orig_op1))
48958 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
48959 x, NULL, 1, OPTAB_DIRECT);
48960 if (!const_vector_equal_evenodd_p (orig_op2))
48961 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
48962 x, NULL, 1, OPTAB_DIRECT);
48963 op1 = gen_lowpart (mode, op1);
48964 op2 = gen_lowpart (mode, op2);
48967 if (mode == V16SImode)
48969 if (uns_p)
48970 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
48971 else
48972 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
48974 else if (mode == V8SImode)
48976 if (uns_p)
48977 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
48978 else
48979 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
48981 else if (uns_p)
48982 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
48983 else if (TARGET_SSE4_1)
48984 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
48985 else
48987 rtx s1, s2, t0, t1, t2;
48989 /* The easiest way to implement this without PMULDQ is to go through
48990 the motions as if we are performing a full 64-bit multiply, with
48991 the exception that we need to do less shuffling of the elements. */
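/* The identity used here, with u_a and u_b the unsigned images of the
   32-bit elements a and b:
	a * b == u_a * u_b - (((a < 0 ? u_b : 0) + (b < 0 ? u_a : 0)) << 32)
   modulo 2^64.  The GT comparisons against zero build all-ones masks for
   the negative elements; the widening unsigned multiply of such a mask by
   u_b produces a value whose low 32 bits are -u_b, and only those low bits
   survive the final shift left by 32, so the subtraction becomes the
   addition performed below.  */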
48993 /* Compute the sign-extension, aka highparts, of the two operands. */
48994 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48995 op1, pc_rtx, pc_rtx);
48996 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48997 op2, pc_rtx, pc_rtx);
48999 /* Multiply LO(A) * HI(B), and vice-versa. */
49000 t1 = gen_reg_rtx (wmode);
49001 t2 = gen_reg_rtx (wmode);
49002 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49003 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49005 /* Multiply LO(A) * LO(B). */
49006 t0 = gen_reg_rtx (wmode);
49007 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49009 /* Combine and shift the highparts into place. */
49010 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49011 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49012 1, OPTAB_DIRECT);
49014 /* Combine high and low parts. */
49015 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49016 return;
49018 emit_insn (x);
49021 void
49022 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49023 bool uns_p, bool high_p)
49025 machine_mode wmode = GET_MODE (dest);
49026 machine_mode mode = GET_MODE (op1);
49027 rtx t1, t2, t3, t4, mask;
49029 switch (mode)
49031 case V4SImode:
49032 t1 = gen_reg_rtx (mode);
49033 t2 = gen_reg_rtx (mode);
49034 if (TARGET_XOP && !uns_p)
49036 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
49037 shuffle the elements once so that all elements are in the right
49038 place for immediate use: { A C B D }. */
49039 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49040 const1_rtx, GEN_INT (3)));
49041 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49042 const1_rtx, GEN_INT (3)));
49044 else
49046 /* Put the elements into place for the multiply. */
49047 ix86_expand_vec_interleave (t1, op1, op1, high_p);
49048 ix86_expand_vec_interleave (t2, op2, op2, high_p);
49049 high_p = false;
49051 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49052 break;
49054 case V8SImode:
49055 /* Shuffle the elements between the lanes. After this we
49056 have { A B E F | C D G H } for each operand. */
49057 t1 = gen_reg_rtx (V4DImode);
49058 t2 = gen_reg_rtx (V4DImode);
49059 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49060 const0_rtx, const2_rtx,
49061 const1_rtx, GEN_INT (3)));
49062 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49063 const0_rtx, const2_rtx,
49064 const1_rtx, GEN_INT (3)));
49066 /* Shuffle the elements within the lanes. After this we
49067 have { A A B B | C C D D } or { E E F F | G G H H }. */
49068 t3 = gen_reg_rtx (V8SImode);
49069 t4 = gen_reg_rtx (V8SImode);
49070 mask = GEN_INT (high_p
49071 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49072 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
49073 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49074 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49076 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49077 break;
49079 case V8HImode:
49080 case V16HImode:
49081 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49082 uns_p, OPTAB_DIRECT);
49083 t2 = expand_binop (mode,
49084 uns_p ? umul_highpart_optab : smul_highpart_optab,
49085 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49086 gcc_assert (t1 && t2);
49088 t3 = gen_reg_rtx (mode);
49089 ix86_expand_vec_interleave (t3, t1, t2, high_p);
49090 emit_move_insn (dest, gen_lowpart (wmode, t3));
49091 break;
49093 case V16QImode:
49094 case V32QImode:
49095 case V32HImode:
49096 case V16SImode:
49097 case V64QImode:
49098 t1 = gen_reg_rtx (wmode);
49099 t2 = gen_reg_rtx (wmode);
49100 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
49101 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
49103 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
49104 break;
49106 default:
49107 gcc_unreachable ();
49111 void
49112 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
49114 rtx res_1, res_2, res_3, res_4;
49116 res_1 = gen_reg_rtx (V4SImode);
49117 res_2 = gen_reg_rtx (V4SImode);
49118 res_3 = gen_reg_rtx (V2DImode);
49119 res_4 = gen_reg_rtx (V2DImode);
49120 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
49121 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
49123 /* Move the results in element 2 down to element 1; we don't care
49124 what goes in elements 2 and 3. Then we can merge the parts
49125 back together with an interleave.
49127 Note that two other sequences were tried:
49128 (1) Use interleaves at the start instead of psrldq, which allows
49129 us to use a single shufps to merge things back at the end.
49130 (2) Use shufps here to combine the two vectors, then pshufd to
49131 put the elements in the correct order.
49132 In both cases the cost of the reformatting stall was too high
49133 and the overall sequence slower. */
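/* At this point res_3 holds { A0*B0, A2*B2 } and res_4 holds { A1*B1, A3*B3 }
   as V2DImode values.  The pshufds below gather the low 32 bits of those
   products into elements 0 and 1 of res_1 and res_2 respectively, and the
   final interleave-low produces { A0*B0, A1*B1, A2*B2, A3*B3 }.  */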
49135 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
49136 const0_rtx, const2_rtx,
49137 const0_rtx, const0_rtx));
49138 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
49139 const0_rtx, const2_rtx,
49140 const0_rtx, const0_rtx));
49141 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
49143 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
49146 void
49147 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
49149 machine_mode mode = GET_MODE (op0);
49150 rtx t1, t2, t3, t4, t5, t6;
49152 if (TARGET_AVX512DQ && mode == V8DImode)
49153 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
49154 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
49155 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
49156 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
49157 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
49158 else if (TARGET_XOP && mode == V2DImode)
49160 /* op1: A,B,C,D, op2: E,F,G,H */
49161 op1 = gen_lowpart (V4SImode, op1);
49162 op2 = gen_lowpart (V4SImode, op2);
49164 t1 = gen_reg_rtx (V4SImode);
49165 t2 = gen_reg_rtx (V4SImode);
49166 t3 = gen_reg_rtx (V2DImode);
49167 t4 = gen_reg_rtx (V2DImode);
49169 /* t1: B,A,D,C */
49170 emit_insn (gen_sse2_pshufd_1 (t1, op1,
49171 GEN_INT (1),
49172 GEN_INT (0),
49173 GEN_INT (3),
49174 GEN_INT (2)));
49176 /* t2: (B*E),(A*F),(D*G),(C*H) */
49177 emit_insn (gen_mulv4si3 (t2, t1, op2));
49179 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
49180 emit_insn (gen_xop_phadddq (t3, t2));
49182 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
49183 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
49185 /* Multiply lower parts and add all */
49186 t5 = gen_reg_rtx (V2DImode);
49187 emit_insn (gen_vec_widen_umult_even_v4si (t5,
49188 gen_lowpart (V4SImode, op1),
49189 gen_lowpart (V4SImode, op2)));
49190 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
49193 else
49195 machine_mode nmode;
49196 rtx (*umul) (rtx, rtx, rtx);
49198 if (mode == V2DImode)
49200 umul = gen_vec_widen_umult_even_v4si;
49201 nmode = V4SImode;
49203 else if (mode == V4DImode)
49205 umul = gen_vec_widen_umult_even_v8si;
49206 nmode = V8SImode;
49208 else if (mode == V8DImode)
49210 umul = gen_vec_widen_umult_even_v16si;
49211 nmode = V16SImode;
49213 else
49214 gcc_unreachable ();
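/* Generic path: writing each 64-bit element as 2^32 * H + L, we have,
   modulo 2^64,
	OP1 * OP2 == L1 * L2 + ((H1 * L2 + H2 * L1) << 32),
   which is what the widening even multiplies and shifts below compute.  */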
49217 /* Multiply low parts. */
49218 t1 = gen_reg_rtx (mode);
49219 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
49221 /* Shift input vectors right 32 bits so we can multiply high parts. */
49222 t6 = GEN_INT (32);
49223 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
49224 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
49226 /* Multiply high parts by low parts. */
49227 t4 = gen_reg_rtx (mode);
49228 t5 = gen_reg_rtx (mode);
49229 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
49230 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
49232 /* Combine and shift the highparts back. */
49233 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
49234 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
49236 /* Combine high and low parts. */
49237 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
49240 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49241 gen_rtx_MULT (mode, op1, op2));
49244 /* Return 1 if control transfer instruction INSN
49245 should be encoded with bnd prefix.
49246 If insn is NULL then return 1 when control
49247 transfer instructions should be prefixed with
49248 bnd by default for the current function. */
49250 bool
49251 ix86_bnd_prefixed_insn_p (rtx insn)
49253 /* For call insns check special flag. */
49254 if (insn && CALL_P (insn))
49256 rtx call = get_call_rtx_from (insn);
49257 if (call)
49258 return CALL_EXPR_WITH_BOUNDS_P (call);
49261 /* All other insns are prefixed only if function is instrumented. */
49262 return chkp_function_instrumented_p (current_function_decl);
49265 /* Calculate integer abs() using only SSE2 instructions. */
49267 void
49268 ix86_expand_sse2_abs (rtx target, rtx input)
49270 machine_mode mode = GET_MODE (target);
49271 rtx tmp0, tmp1, x;
49273 switch (mode)
49275 /* For 32-bit signed integer X, the best way to calculate the absolute
49276 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
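/* Scalar sketch of the same identity for a 32-bit int x:
	m = x >> 31;	 0 if x >= 0, -1 otherwise
	(x ^ m) - m	 equals x when m == 0, and ~x + 1 == -x when m == -1  */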
49277 case V4SImode:
49278 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
49279 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
49280 NULL, 0, OPTAB_DIRECT);
49281 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
49282 NULL, 0, OPTAB_DIRECT);
49283 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
49284 target, 0, OPTAB_DIRECT);
49285 break;
49287 /* For 16-bit signed integer X, the best way to calculate the absolute
49288 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
49289 case V8HImode:
49290 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49292 x = expand_simple_binop (mode, SMAX, tmp0, input,
49293 target, 0, OPTAB_DIRECT);
49294 break;
49296 /* For 8-bit signed integer X, the best way to calculate the absolute
49297 value of X is min ((unsigned char) X, (unsigned char) (-X)),
49298 as SSE2 provides the PMINUB insn. */
49299 case V16QImode:
49300 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49302 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
49303 target, 0, OPTAB_DIRECT);
49304 break;
49306 default:
49307 gcc_unreachable ();
49310 if (x != target)
49311 emit_move_insn (target, x);
49314 /* Expand an extract from a vector register through pextr insn.
49315 Return true if successful. */
49317 bool
49318 ix86_expand_pextr (rtx *operands)
49320 rtx dst = operands[0];
49321 rtx src = operands[1];
49323 unsigned int size = INTVAL (operands[2]);
49324 unsigned int pos = INTVAL (operands[3]);
49326 if (SUBREG_P (dst))
49328 /* Reject non-lowpart subregs. */
49329 if (SUBREG_BYTE (dst) > 0)
49330 return false;
49331 dst = SUBREG_REG (dst);
49334 if (SUBREG_P (src))
49336 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
49337 src = SUBREG_REG (src);
49340 switch (GET_MODE (src))
49342 case V16QImode:
49343 case V8HImode:
49344 case V4SImode:
49345 case V2DImode:
49346 case V1TImode:
49347 case TImode:
49349 machine_mode srcmode, dstmode;
49350 rtx d, pat;
49352 dstmode = mode_for_size (size, MODE_INT, 0);
49354 switch (dstmode)
49356 case QImode:
49357 if (!TARGET_SSE4_1)
49358 return false;
49359 srcmode = V16QImode;
49360 break;
49362 case HImode:
49363 if (!TARGET_SSE2)
49364 return false;
49365 srcmode = V8HImode;
49366 break;
49368 case SImode:
49369 if (!TARGET_SSE4_1)
49370 return false;
49371 srcmode = V4SImode;
49372 break;
49374 case DImode:
49375 gcc_assert (TARGET_64BIT);
49376 if (!TARGET_SSE4_1)
49377 return false;
49378 srcmode = V2DImode;
49379 break;
49381 default:
49382 return false;
49385 /* Reject extractions from misaligned positions. */
49386 if (pos & (size-1))
49387 return false;
49389 if (GET_MODE (dst) == dstmode)
49390 d = dst;
49391 else
49392 d = gen_reg_rtx (dstmode);
49394 /* Construct insn pattern. */
49395 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
49396 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
49398 /* Let the rtl optimizers know about the zero extension performed. */
49399 if (dstmode == QImode || dstmode == HImode)
49401 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
49402 d = gen_lowpart (SImode, d);
49405 emit_insn (gen_rtx_SET (d, pat));
49407 if (d != dst)
49408 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49409 return true;
49412 default:
49413 return false;
49417 /* Expand an insert into a vector register through pinsr insn.
49418 Return true if successful. */
49420 bool
49421 ix86_expand_pinsr (rtx *operands)
49423 rtx dst = operands[0];
49424 rtx src = operands[3];
49426 unsigned int size = INTVAL (operands[1]);
49427 unsigned int pos = INTVAL (operands[2]);
49429 if (SUBREG_P (dst))
49431 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
49432 dst = SUBREG_REG (dst);
49435 switch (GET_MODE (dst))
49437 case V16QImode:
49438 case V8HImode:
49439 case V4SImode:
49440 case V2DImode:
49441 case V1TImode:
49442 case TImode:
49444 machine_mode srcmode, dstmode;
49445 rtx (*pinsr)(rtx, rtx, rtx, rtx);
49446 rtx d;
49448 srcmode = mode_for_size (size, MODE_INT, 0);
49450 switch (srcmode)
49452 case QImode:
49453 if (!TARGET_SSE4_1)
49454 return false;
49455 dstmode = V16QImode;
49456 pinsr = gen_sse4_1_pinsrb;
49457 break;
49459 case HImode:
49460 if (!TARGET_SSE2)
49461 return false;
49462 dstmode = V8HImode;
49463 pinsr = gen_sse2_pinsrw;
49464 break;
49466 case SImode:
49467 if (!TARGET_SSE4_1)
49468 return false;
49469 dstmode = V4SImode;
49470 pinsr = gen_sse4_1_pinsrd;
49471 break;
49473 case DImode:
49474 gcc_assert (TARGET_64BIT);
49475 if (!TARGET_SSE4_1)
49476 return false;
49477 dstmode = V2DImode;
49478 pinsr = gen_sse4_1_pinsrq;
49479 break;
49481 default:
49482 return false;
49485 /* Reject insertions to misaligned positions. */
49486 if (pos & (size-1))
49487 return false;
49489 if (SUBREG_P (src))
49491 unsigned int srcpos = SUBREG_BYTE (src);
49493 if (srcpos > 0)
49495 rtx extr_ops[4];
49497 extr_ops[0] = gen_reg_rtx (srcmode);
49498 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
49499 extr_ops[2] = GEN_INT (size);
49500 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
49502 if (!ix86_expand_pextr (extr_ops))
49503 return false;
49505 src = extr_ops[0];
49507 else
49508 src = gen_lowpart (srcmode, SUBREG_REG (src));
49511 if (GET_MODE (dst) == dstmode)
49512 d = dst;
49513 else
49514 d = gen_reg_rtx (dstmode);
49516 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
49517 gen_lowpart (srcmode, src),
49518 GEN_INT (1 << (pos / size))));
49519 if (d != dst)
49520 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49521 return true;
49524 default:
49525 return false;
49529 /* This function returns the calling ABI specific va_list type node.

49530 It returns the FNDECL specific va_list type. */
49532 static tree
49533 ix86_fn_abi_va_list (tree fndecl)
49535 if (!TARGET_64BIT)
49536 return va_list_type_node;
49537 gcc_assert (fndecl != NULL_TREE);
49539 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
49540 return ms_va_list_type_node;
49541 else
49542 return sysv_va_list_type_node;
49545 /* Returns the canonical va_list type specified by TYPE. If there
49546    is no valid TYPE provided, it returns NULL_TREE. */
49548 static tree
49549 ix86_canonical_va_list_type (tree type)
49551 if (TARGET_64BIT)
49553 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
49554 return ms_va_list_type_node;
49556 if ((TREE_CODE (type) == ARRAY_TYPE
49557 && integer_zerop (array_type_nelts (type)))
49558 || POINTER_TYPE_P (type))
49560 tree elem_type = TREE_TYPE (type);
49561 if (TREE_CODE (elem_type) == RECORD_TYPE
49562 && lookup_attribute ("sysv_abi va_list",
49563 TYPE_ATTRIBUTES (elem_type)))
49564 return sysv_va_list_type_node;
49567 return NULL_TREE;
49570 return std_canonical_va_list_type (type);
49573 /* Iterate through the target-specific builtin types for va_list.
49574 IDX denotes the iterator, *PTREE is set to the result type of
49575 the va_list builtin, and *PNAME to its internal type.
49576 Returns zero if there is no element for this index, otherwise
49577 IDX should be increased upon the next call.
49578 Note, do not iterate a base builtin's name like __builtin_va_list.
49579 Used from c_common_nodes_and_builtins. */
49581 static int
49582 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
49584 if (TARGET_64BIT)
49586 switch (idx)
49588 default:
49589 break;
49591 case 0:
49592 *ptree = ms_va_list_type_node;
49593 *pname = "__builtin_ms_va_list";
49594 return 1;
49596 case 1:
49597 *ptree = sysv_va_list_type_node;
49598 *pname = "__builtin_sysv_va_list";
49599 return 1;
49603 return 0;
49606 #undef TARGET_SCHED_DISPATCH
49607 #define TARGET_SCHED_DISPATCH has_dispatch
49608 #undef TARGET_SCHED_DISPATCH_DO
49609 #define TARGET_SCHED_DISPATCH_DO do_dispatch
49610 #undef TARGET_SCHED_REASSOCIATION_WIDTH
49611 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
49612 #undef TARGET_SCHED_REORDER
49613 #define TARGET_SCHED_REORDER ix86_sched_reorder
49614 #undef TARGET_SCHED_ADJUST_PRIORITY
49615 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
49616 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
49617 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
49618 ix86_dependencies_evaluation_hook
49620 /* The size of the dispatch window is the total number of bytes of
49621 object code allowed in a window. */
49622 #define DISPATCH_WINDOW_SIZE 16
49624 /* Number of dispatch windows considered for scheduling. */
49625 #define MAX_DISPATCH_WINDOWS 3
49627 /* Maximum number of instructions in a window. */
49628 #define MAX_INSN 4
49630 /* Maximum number of immediate operands in a window. */
49631 #define MAX_IMM 4
49633 /* Maximum number of immediate bits allowed in a window. */
49634 #define MAX_IMM_SIZE 128
49636 /* Maximum number of 32 bit immediates allowed in a window. */
49637 #define MAX_IMM_32 4
49639 /* Maximum number of 64 bit immediates allowed in a window. */
49640 #define MAX_IMM_64 2
49642 /* Maximum total of loads or prefetches allowed in a window. */
49643 #define MAX_LOAD 2
49645 /* Maximum total of stores allowed in a window. */
49646 #define MAX_STORE 1
49648 #undef BIG
49649 #define BIG 100
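/* Taken together, the limits above say that one window may hold at most
   MAX_INSN (4) instructions, MAX_LOAD (2) loads or prefetches, MAX_STORE (1)
   store, and MAX_IMM (4) immediate operands whose combined size may not
   exceed MAX_IMM_SIZE (128); they are enforced below by
   count_num_restricted, fits_dispatch_window and add_to_dispatch_window.  */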
49652 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
49653 enum dispatch_group {
49654 disp_no_group = 0,
49655 disp_load,
49656 disp_store,
49657 disp_load_store,
49658 disp_prefetch,
49659 disp_imm,
49660 disp_imm_32,
49661 disp_imm_64,
49662 disp_branch,
49663 disp_cmp,
49664 disp_jcc,
49665 disp_last
49668 /* Number of allowable groups in a dispatch window. It is an array
49669 indexed by dispatch_group enum. 100 is used as a big number,
49670    because the number of these kinds of operations does not have any
49671    effect on the dispatch window, but we need them for other reasons in
49672    the table. */
49673 static unsigned int num_allowable_groups[disp_last] = {
49674 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
49677 char group_name[disp_last + 1][16] = {
49678 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
49679 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
49680 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
49683 /* Instruction path. */
49684 enum insn_path {
49685 no_path = 0,
49686 path_single, /* Single micro op. */
49687 path_double, /* Double micro op. */
49688   path_multi, /* Instructions with more than 2 micro ops. */
49689 last_path
49692 /* sched_insn_info records one scheduled instruction of the basic
49693    block within a dispatch window: the insn itself, its dispatch
49694    group and decode path, and its byte and immediate sizes.
49696 Windows are allocated for each basic block and are linked
49697 together. */
49698 typedef struct sched_insn_info_s {
49699 rtx insn;
49700 enum dispatch_group group;
49701 enum insn_path path;
49702 int byte_len;
49703 int imm_bytes;
49704 } sched_insn_info;
49706 /* Linked list of dispatch windows. This is a two way list of
49707 dispatch windows of a basic block. It contains information about
49708 the number of uops in the window and the total number of
49709 instructions and of bytes in the object code for this dispatch
49710 window. */
49711 typedef struct dispatch_windows_s {
49712 int num_insn; /* Number of insn in the window. */
49713 int num_uops; /* Number of uops in the window. */
49714 int window_size; /* Number of bytes in the window. */
49715   int window_num; /* Window number, either 0 or 1. */
49716   int num_imm; /* Number of immediate operands in the window. */
49717   int num_imm_32; /* Number of 32 bit immediates in the window. */
49718   int num_imm_64; /* Number of 64 bit immediates in the window. */
49719   int imm_size; /* Total size of immediates in the window. */
49720 int num_loads; /* Total memory loads in the window. */
49721 int num_stores; /* Total memory stores in the window. */
49722 int violation; /* Violation exists in window. */
49723 sched_insn_info *window; /* Pointer to the window. */
49724 struct dispatch_windows_s *next;
49725 struct dispatch_windows_s *prev;
49726 } dispatch_windows;
49728 /* Immediate values used in an insn. */
49729 typedef struct imm_info_s
49731 int imm;
49732 int imm32;
49733 int imm64;
49734 } imm_info;
49736 static dispatch_windows *dispatch_window_list;
49737 static dispatch_windows *dispatch_window_list1;
49739 /* Get dispatch group of insn. */
49741 static enum dispatch_group
49742 get_mem_group (rtx_insn *insn)
49744 enum attr_memory memory;
49746 if (INSN_CODE (insn) < 0)
49747 return disp_no_group;
49748 memory = get_attr_memory (insn);
49749 if (memory == MEMORY_STORE)
49750 return disp_store;
49752 if (memory == MEMORY_LOAD)
49753 return disp_load;
49755 if (memory == MEMORY_BOTH)
49756 return disp_load_store;
49758 return disp_no_group;
49761 /* Return true if insn is a compare instruction. */
49763 static bool
49764 is_cmp (rtx_insn *insn)
49766 enum attr_type type;
49768 type = get_attr_type (insn);
49769 return (type == TYPE_TEST
49770 || type == TYPE_ICMP
49771 || type == TYPE_FCMP
49772 || GET_CODE (PATTERN (insn)) == COMPARE);
49775 /* Return true if a dispatch violation encountered. */
49777 static bool
49778 dispatch_violation (void)
49780 if (dispatch_window_list->next)
49781 return dispatch_window_list->next->violation;
49782 return dispatch_window_list->violation;
49785 /* Return true if insn is a branch instruction. */
49787 static bool
49788 is_branch (rtx_insn *insn)
49790 return (CALL_P (insn) || JUMP_P (insn));
49793 /* Return true if insn is a prefetch instruction. */
49795 static bool
49796 is_prefetch (rtx_insn *insn)
49798 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
49801 /* This function initializes a dispatch window and the list container holding a
49802 pointer to the window. */
49804 static void
49805 init_window (int window_num)
49807 int i;
49808 dispatch_windows *new_list;
49810 if (window_num == 0)
49811 new_list = dispatch_window_list;
49812 else
49813 new_list = dispatch_window_list1;
49815 new_list->num_insn = 0;
49816 new_list->num_uops = 0;
49817 new_list->window_size = 0;
49818 new_list->next = NULL;
49819 new_list->prev = NULL;
49820 new_list->window_num = window_num;
49821 new_list->num_imm = 0;
49822 new_list->num_imm_32 = 0;
49823 new_list->num_imm_64 = 0;
49824 new_list->imm_size = 0;
49825 new_list->num_loads = 0;
49826 new_list->num_stores = 0;
49827 new_list->violation = false;
49829 for (i = 0; i < MAX_INSN; i++)
49831 new_list->window[i].insn = NULL;
49832 new_list->window[i].group = disp_no_group;
49833 new_list->window[i].path = no_path;
49834 new_list->window[i].byte_len = 0;
49835 new_list->window[i].imm_bytes = 0;
49837 return;
49840 /* This function allocates and initializes a dispatch window and the
49841 list container holding a pointer to the window. */
49843 static dispatch_windows *
49844 allocate_window (void)
49846 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
49847 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
49849 return new_list;
49852 /* This routine initializes the dispatch scheduling information. It
49853 initiates building dispatch scheduler tables and constructs the
49854 first dispatch window. */
49856 static void
49857 init_dispatch_sched (void)
49859 /* Allocate a dispatch list and a window. */
49860 dispatch_window_list = allocate_window ();
49861 dispatch_window_list1 = allocate_window ();
49862 init_window (0);
49863 init_window (1);
49866 /* This function returns true if a branch is detected. End of a basic block
49867 does not have to be a branch, but here we assume only branches end a
49868 window. */
49870 static bool
49871 is_end_basic_block (enum dispatch_group group)
49873 return group == disp_branch;
49876 /* This function is called when the end of a window processing is reached. */
49878 static void
49879 process_end_window (void)
49881 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
49882 if (dispatch_window_list->next)
49884 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
49885 gcc_assert (dispatch_window_list->window_size
49886 + dispatch_window_list1->window_size <= 48);
49887 init_window (1);
49889 init_window (0);
49892 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
49893 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
49894    for 48 bytes of instructions. Note that these windows are not dispatch
49895    windows whose sizes are DISPATCH_WINDOW_SIZE. */
49897 static dispatch_windows *
49898 allocate_next_window (int window_num)
49900 if (window_num == 0)
49902 if (dispatch_window_list->next)
49903 init_window (1);
49904 init_window (0);
49905 return dispatch_window_list;
49908 dispatch_window_list->next = dispatch_window_list1;
49909 dispatch_window_list1->prev = dispatch_window_list;
49911 return dispatch_window_list1;
49914 /* Compute number of immediate operands of an instruction. */
49916 static void
49917 find_constant (rtx in_rtx, imm_info *imm_values)
49919 if (INSN_P (in_rtx))
49920 in_rtx = PATTERN (in_rtx);
49921 subrtx_iterator::array_type array;
49922 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
49923 if (const_rtx x = *iter)
49924 switch (GET_CODE (x))
49926 case CONST:
49927 case SYMBOL_REF:
49928 case CONST_INT:
49929 (imm_values->imm)++;
49930 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
49931 (imm_values->imm32)++;
49932 else
49933 (imm_values->imm64)++;
49934 break;
49936 case CONST_DOUBLE:
49937 case CONST_WIDE_INT:
49938 (imm_values->imm)++;
49939 (imm_values->imm64)++;
49940 break;
49942 case CODE_LABEL:
49943 if (LABEL_KIND (x) == LABEL_NORMAL)
49945 (imm_values->imm)++;
49946 (imm_values->imm32)++;
49948 break;
49950 default:
49951 break;
49955 /* Return total size of immediate operands of an instruction along with number
49956    of corresponding immediate operands. It initializes its parameters to zero
49957    before calling FIND_CONSTANT.
49958 INSN is the input instruction. IMM is the total of immediates.
49959 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
49960 bit immediates. */
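/* For example, an insn with one 32-bit and one 64-bit immediate sets *IMM
   to 2, *IMM32 to 1 and *IMM64 to 1, and returns 1 * 4 + 1 * 8 = 12.  */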
49962 static int
49963 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
49965 imm_info imm_values = {0, 0, 0};
49967 find_constant (insn, &imm_values);
49968 *imm = imm_values.imm;
49969 *imm32 = imm_values.imm32;
49970 *imm64 = imm_values.imm64;
49971 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
49974 /* This function indicates whether an instruction has any immediate
49975    operands. */
49977 static bool
49978 has_immediate (rtx_insn *insn)
49980 int num_imm_operand;
49981 int num_imm32_operand;
49982 int num_imm64_operand;
49984 if (insn)
49985 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49986 &num_imm64_operand);
49987 return false;
49990 /* Return single or double path for instructions. */
49992 static enum insn_path
49993 get_insn_path (rtx_insn *insn)
49995 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
49997 if ((int)path == 0)
49998 return path_single;
50000 if ((int)path == 1)
50001 return path_double;
50003 return path_multi;
50006 /* Return insn dispatch group. */
50008 static enum dispatch_group
50009 get_insn_group (rtx_insn *insn)
50011 enum dispatch_group group = get_mem_group (insn);
50012 if (group)
50013 return group;
50015 if (is_branch (insn))
50016 return disp_branch;
50018 if (is_cmp (insn))
50019 return disp_cmp;
50021 if (has_immediate (insn))
50022 return disp_imm;
50024 if (is_prefetch (insn))
50025 return disp_prefetch;
50027 return disp_no_group;
50030 /* Count number of GROUP restricted instructions in a dispatch
50031 window WINDOW_LIST. */
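/* The return value is 0 for a disp_no_group insn, BIG when adding INSN
   would exceed the per-window immediate, load or store limits, and 1
   otherwise.  For example, an insn carrying a 64-bit immediate counts as
   BIG once the window already holds MAX_IMM_64 (2) 64-bit immediates.  */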
50033 static int
50034 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
50036 enum dispatch_group group = get_insn_group (insn);
50037 int imm_size;
50038 int num_imm_operand;
50039 int num_imm32_operand;
50040 int num_imm64_operand;
50042 if (group == disp_no_group)
50043 return 0;
50045 if (group == disp_imm)
50047 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50048 &num_imm64_operand);
50049 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
50050 || num_imm_operand + window_list->num_imm > MAX_IMM
50051 || (num_imm32_operand > 0
50052 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
50053 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
50054 || (num_imm64_operand > 0
50055 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
50056 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
50057 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
50058 && num_imm64_operand > 0
50059 && ((window_list->num_imm_64 > 0
50060 && window_list->num_insn >= 2)
50061 || window_list->num_insn >= 3)))
50062 return BIG;
50064 return 1;
50067 if ((group == disp_load_store
50068 && (window_list->num_loads >= MAX_LOAD
50069 || window_list->num_stores >= MAX_STORE))
50070 || ((group == disp_load
50071 || group == disp_prefetch)
50072 && window_list->num_loads >= MAX_LOAD)
50073 || (group == disp_store
50074 && window_list->num_stores >= MAX_STORE))
50075 return BIG;
50077 return 1;
50080 /* This function returns true if insn satisfies dispatch rules on the
50081 last window scheduled. */
50083 static bool
50084 fits_dispatch_window (rtx_insn *insn)
50086 dispatch_windows *window_list = dispatch_window_list;
50087 dispatch_windows *window_list_next = dispatch_window_list->next;
50088 unsigned int num_restrict;
50089 enum dispatch_group group = get_insn_group (insn);
50090 enum insn_path path = get_insn_path (insn);
50091 int sum;
50093 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
50094 instructions should be given the lowest priority in the
50095 scheduling process in Haifa scheduler to make sure they will be
50096 scheduled in the same dispatch window as the reference to them. */
50097 if (group == disp_jcc || group == disp_cmp)
50098 return false;
50100 /* Check nonrestricted. */
50101 if (group == disp_no_group || group == disp_branch)
50102 return true;
50104 /* Get last dispatch window. */
50105 if (window_list_next)
50106 window_list = window_list_next;
50108 if (window_list->window_num == 1)
50110 sum = window_list->prev->window_size + window_list->window_size;
50112 if (sum == 32
50113 || (min_insn_size (insn) + sum) >= 48)
50114 /* Window 1 is full. Go for next window. */
50115 return true;
50118 num_restrict = count_num_restricted (insn, window_list);
50120 if (num_restrict > num_allowable_groups[group])
50121 return false;
50123 /* See if it fits in the first window. */
50124 if (window_list->window_num == 0)
50126       /* The first window should have only single and double path
50127 uops. */
50128 if (path == path_double
50129 && (window_list->num_uops + 2) > MAX_INSN)
50130 return false;
50131 else if (path != path_single)
50132 return false;
50134 return true;
50137 /* Add an instruction INSN with NUM_UOPS micro-operations to the
50138 dispatch window WINDOW_LIST. */
50140 static void
50141 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
50143 int byte_len = min_insn_size (insn);
50144 int num_insn = window_list->num_insn;
50145 int imm_size;
50146 sched_insn_info *window = window_list->window;
50147 enum dispatch_group group = get_insn_group (insn);
50148 enum insn_path path = get_insn_path (insn);
50149 int num_imm_operand;
50150 int num_imm32_operand;
50151 int num_imm64_operand;
50153 if (!window_list->violation && group != disp_cmp
50154 && !fits_dispatch_window (insn))
50155 window_list->violation = true;
50157 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50158 &num_imm64_operand);
50160 /* Initialize window with new instruction. */
50161 window[num_insn].insn = insn;
50162 window[num_insn].byte_len = byte_len;
50163 window[num_insn].group = group;
50164 window[num_insn].path = path;
50165 window[num_insn].imm_bytes = imm_size;
50167 window_list->window_size += byte_len;
50168 window_list->num_insn = num_insn + 1;
50169 window_list->num_uops = window_list->num_uops + num_uops;
50170 window_list->imm_size += imm_size;
50171 window_list->num_imm += num_imm_operand;
50172 window_list->num_imm_32 += num_imm32_operand;
50173 window_list->num_imm_64 += num_imm64_operand;
50175 if (group == disp_store)
50176 window_list->num_stores += 1;
50177 else if (group == disp_load
50178 || group == disp_prefetch)
50179 window_list->num_loads += 1;
50180 else if (group == disp_load_store)
50182 window_list->num_stores += 1;
50183 window_list->num_loads += 1;
50187 /* Adds a scheduled instruction, INSN, to the current dispatch window.
50188 If the total bytes of instructions or the number of instructions in
50189 the window exceed allowable, it allocates a new window. */
50191 static void
50192 add_to_dispatch_window (rtx_insn *insn)
50194 int byte_len;
50195 dispatch_windows *window_list;
50196 dispatch_windows *next_list;
50197 dispatch_windows *window0_list;
50198 enum insn_path path;
50199 enum dispatch_group insn_group;
50200 bool insn_fits;
50201 int num_insn;
50202 int num_uops;
50203 int window_num;
50204 int insn_num_uops;
50205 int sum;
50207 if (INSN_CODE (insn) < 0)
50208 return;
50210 byte_len = min_insn_size (insn);
50211 window_list = dispatch_window_list;
50212 next_list = window_list->next;
50213 path = get_insn_path (insn);
50214 insn_group = get_insn_group (insn);
50216 /* Get the last dispatch window. */
50217 if (next_list)
50218 window_list = dispatch_window_list->next;
50220 if (path == path_single)
50221 insn_num_uops = 1;
50222 else if (path == path_double)
50223 insn_num_uops = 2;
50224 else
50225 insn_num_uops = (int) path;
50227 /* If current window is full, get a new window.
50228      Window number zero is full if MAX_INSN uops are scheduled in it.
50229      Window number one is full if window zero's bytes plus window
50230      one's bytes equal 32, or if adding the new instruction's bytes
50231      to that total makes it 48 or more, or if it already has MAX_INSN
50232      instructions in it. */
50233 num_insn = window_list->num_insn;
50234 num_uops = window_list->num_uops;
50235 window_num = window_list->window_num;
50236 insn_fits = fits_dispatch_window (insn);
50238 if (num_insn >= MAX_INSN
50239 || num_uops + insn_num_uops > MAX_INSN
50240 || !(insn_fits))
50242 window_num = ~window_num & 1;
50243 window_list = allocate_next_window (window_num);
50246 if (window_num == 0)
50248 add_insn_window (insn, window_list, insn_num_uops);
50249 if (window_list->num_insn >= MAX_INSN
50250 && insn_group == disp_branch)
50252 process_end_window ();
50253 return;
50256 else if (window_num == 1)
50258 window0_list = window_list->prev;
50259 sum = window0_list->window_size + window_list->window_size;
50260 if (sum == 32
50261 || (byte_len + sum) >= 48)
50263 process_end_window ();
50264 window_list = dispatch_window_list;
50267 add_insn_window (insn, window_list, insn_num_uops);
50269 else
50270 gcc_unreachable ();
50272 if (is_end_basic_block (insn_group))
50274       /* End of basic block is reached; do end-basic-block processing. */
50275 process_end_window ();
50276 return;
50280 /* Print the dispatch window, WINDOW_NUM, to FILE. */
50282 DEBUG_FUNCTION static void
50283 debug_dispatch_window_file (FILE *file, int window_num)
50285 dispatch_windows *list;
50286 int i;
50288 if (window_num == 0)
50289 list = dispatch_window_list;
50290 else
50291 list = dispatch_window_list1;
50293 fprintf (file, "Window #%d:\n", list->window_num);
50294 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
50295 list->num_insn, list->num_uops, list->window_size);
50296 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
50297 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
50299 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
50300 list->num_stores);
50301 fprintf (file, " insn info:\n");
50303 for (i = 0; i < MAX_INSN; i++)
50305 if (!list->window[i].insn)
50306 break;
50307 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
50308 i, group_name[list->window[i].group],
50309 i, (void *)list->window[i].insn,
50310 i, list->window[i].path,
50311 i, list->window[i].byte_len,
50312 i, list->window[i].imm_bytes);
50316 /* Print to stdout a dispatch window. */
50318 DEBUG_FUNCTION void
50319 debug_dispatch_window (int window_num)
50321 debug_dispatch_window_file (stdout, window_num);
50324 /* Print INSN dispatch information to FILE. */
50326 DEBUG_FUNCTION static void
50327 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
50329 int byte_len;
50330 enum insn_path path;
50331 enum dispatch_group group;
50332 int imm_size;
50333 int num_imm_operand;
50334 int num_imm32_operand;
50335 int num_imm64_operand;
50337 if (INSN_CODE (insn) < 0)
50338 return;
50340 byte_len = min_insn_size (insn);
50341 path = get_insn_path (insn);
50342 group = get_insn_group (insn);
50343 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50344 &num_imm64_operand);
50346 fprintf (file, " insn info:\n");
50347 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
50348 group_name[group], path, byte_len);
50349 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
50350 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
50353 /* Print to STDERR the status of the ready list with respect to
50354 dispatch windows. */
50356 DEBUG_FUNCTION void
50357 debug_ready_dispatch (void)
50359 int i;
50360 int no_ready = number_in_ready ();
50362 fprintf (stdout, "Number of ready: %d\n", no_ready);
50364 for (i = 0; i < no_ready; i++)
50365 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
50368 /* This routine is the driver of the dispatch scheduler. */
50370 static void
50371 do_dispatch (rtx_insn *insn, int mode)
50373 if (mode == DISPATCH_INIT)
50374 init_dispatch_sched ();
50375 else if (mode == ADD_TO_DISPATCH_WINDOW)
50376 add_to_dispatch_window (insn);
50379 /* Return TRUE if Dispatch Scheduling is supported. */
50381 static bool
50382 has_dispatch (rtx_insn *insn, int action)
50384 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
50385 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
50386 switch (action)
50388 default:
50389 return false;
50391 case IS_DISPATCH_ON:
50392 return true;
50394 case IS_CMP:
50395 return is_cmp (insn);
50397 case DISPATCH_VIOLATION:
50398 return dispatch_violation ();
50400 case FITS_DISPATCH_WINDOW:
50401 return fits_dispatch_window (insn);
50404 return false;
50407 /* Implementation of reassociation_width target hook used by
50408 reassoc phase to identify parallelism level in reassociated
50409 tree. Statements tree_code is passed in OPC. Arguments type
50410 is passed in MODE.
50412 Currently parallel reassociation is enabled for Atom
50413 processors only and we set reassociation width to be 2
50414 because Atom may issue up to 2 instructions per cycle.
50416 Return value should be fixed if parallel reassociation is
50417 enabled for other processors. */
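/* For example, double-precision FP reassociation on 64-bit Haswell is
   given width 4, while other processors with TARGET_REASSOC_FP_TO_PARALLEL
   get width 2.  */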
50419 static int
50420 ix86_reassociation_width (unsigned int, machine_mode mode)
50422 /* Vector part. */
50423 if (VECTOR_MODE_P (mode))
50425 if (TARGET_VECTOR_PARALLEL_EXECUTION)
50426 return 2;
50427 else
50428 return 1;
50431 /* Scalar part. */
50432 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
50433 return 2;
50434 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
50435 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
50436 else
50437 return 1;
50440 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
50441 place emms and femms instructions. */
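/* For example, SImode data is vectorized in V16SImode with AVX-512F, in
   V8SImode with AVX unless 128-bit vectors are preferred, and in V4SImode
   otherwise.  */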
50443 static machine_mode
50444 ix86_preferred_simd_mode (machine_mode mode)
50446 if (!TARGET_SSE)
50447 return word_mode;
50449 switch (mode)
50451 case QImode:
50452 return TARGET_AVX512BW ? V64QImode :
50453 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
50454 case HImode:
50455 return TARGET_AVX512BW ? V32HImode :
50456 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
50457 case SImode:
50458 return TARGET_AVX512F ? V16SImode :
50459 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
50460 case DImode:
50461 return TARGET_AVX512F ? V8DImode :
50462 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
50464 case SFmode:
50465 if (TARGET_AVX512F)
50466 return V16SFmode;
50467 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50468 return V8SFmode;
50469 else
50470 return V4SFmode;
50472 case DFmode:
50473 if (TARGET_AVX512F)
50474 return V8DFmode;
50475 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50476 return V4DFmode;
50477 else if (TARGET_SSE2)
50478 return V2DFmode;
50479 /* FALLTHRU */
50481 default:
50482 return word_mode;
50486 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
50487 vectors. If AVX512F is enabled then try vectorizing with 512bit,
50488 256bit and 128bit vectors. */
50490 static unsigned int
50491 ix86_autovectorize_vector_sizes (void)
50493 return TARGET_AVX512F ? 64 | 32 | 16 :
50494 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
50497 /* Implementation of targetm.vectorize.get_mask_mode. */
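/* For example, a 64-byte V16SImode vector under AVX-512F gets a scalar
   HImode mask (one bit per element), while a 16-byte V4SImode vector
   without AVX-512VL gets a V4SImode vector mask.  */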
50499 static machine_mode
50500 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
50502 unsigned elem_size = vector_size / nunits;
50504 /* Scalar mask case. */
50505 if ((TARGET_AVX512F && vector_size == 64)
50506 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
50508 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
50509 return smallest_mode_for_size (nunits, MODE_INT);
50512 machine_mode elem_mode
50513 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
50515 gcc_assert (elem_size * nunits == vector_size);
50517 return mode_for_vector (elem_mode, nunits);
50522 /* Return class of registers which could be used for pseudo of MODE
50523 and of class RCLASS for spilling instead of memory. Return NO_REGS
50524    if it is not possible or not profitable. */
50526 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50528 static reg_class_t
50529 ix86_spill_class (reg_class_t rclass, machine_mode mode)
50531 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
50532 && TARGET_SSE2
50533 && TARGET_INTER_UNIT_MOVES_TO_VEC
50534 && TARGET_INTER_UNIT_MOVES_FROM_VEC
50535 && (mode == SImode || (TARGET_64BIT && mode == DImode))
50536 && INTEGER_CLASS_P (rclass))
50537 return ALL_SSE_REGS;
50538 return NO_REGS;
50541 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
50542 but returns a lower bound. */
50544 static unsigned int
50545 ix86_max_noce_ifcvt_seq_cost (edge e)
50547 bool predictable_p = predictable_edge_p (e);
50549 enum compiler_param param
50550 = (predictable_p
50551 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
50552 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
50554 /* If we have a parameter set, use that, otherwise take a guess using
50555 BRANCH_COST. */
50556 if (global_options_set.x_param_values[param])
50557 return PARAM_VALUE (param);
50558 else
50559 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
50563 /* Implement targetm.vectorize.init_cost. */
50565 static void *
50566 ix86_init_cost (struct loop *)
50568 unsigned *cost = XNEWVEC (unsigned, 3);
50569 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
50570 return cost;
50573 /* Implement targetm.vectorize.add_stmt_cost. */
50575 static unsigned
50576 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
50577 struct _stmt_vec_info *stmt_info, int misalign,
50578 enum vect_cost_model_location where)
50580 unsigned *cost = (unsigned *) data;
50581 unsigned retval = 0;
50583 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
50584 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50586 /* Penalize DFmode vector operations for Bonnell. */
50587 if (TARGET_BONNELL && kind == vector_stmt
50588 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
50589 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
50591 /* Statements in an inner loop relative to the loop being
50592 vectorized are weighted more heavily. The value here is
50593 arbitrary and could potentially be improved with analysis. */
50594 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
50595 count *= 50; /* FIXME. */
50597 retval = (unsigned) (count * stmt_cost);
50599 /* We need to multiply all vector stmt cost by 1.7 (estimated cost)
50600      for Silvermont, as it has an out-of-order integer pipeline and can execute
50601      2 scalar instructions per tick, but has an in-order SIMD pipeline. */
50602 if ((TARGET_SILVERMONT || TARGET_INTEL)
50603 && stmt_info && stmt_info->stmt)
50605 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
50606 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
50607 retval = (retval * 17) / 10;
50610 cost[where] += retval;
50612 return retval;
50615 /* Implement targetm.vectorize.finish_cost. */
50617 static void
50618 ix86_finish_cost (void *data, unsigned *prologue_cost,
50619 unsigned *body_cost, unsigned *epilogue_cost)
50621 unsigned *cost = (unsigned *) data;
50622 *prologue_cost = cost[vect_prologue];
50623 *body_cost = cost[vect_body];
50624 *epilogue_cost = cost[vect_epilogue];
50627 /* Implement targetm.vectorize.destroy_cost_data. */
50629 static void
50630 ix86_destroy_cost_data (void *data)
50632 free (data);
50635 /* Validate target specific memory model bits in VAL. */
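/* For example, MEMMODEL_RELEASE | IX86_HLE_RELEASE is accepted unchanged,
   whereas IX86_HLE_ACQUIRE combined with a relaxed model triggers a warning
   and falls back to MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE.  */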
50637 static unsigned HOST_WIDE_INT
50638 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
50640 enum memmodel model = memmodel_from_int (val);
50641 bool strong;
50643 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
50644 |MEMMODEL_MASK)
50645 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
50647 warning (OPT_Winvalid_memory_model,
50648 "Unknown architecture specific memory model");
50649 return MEMMODEL_SEQ_CST;
50651 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
50652 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
50654 warning (OPT_Winvalid_memory_model,
50655 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
50656 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
50658 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
50660 warning (OPT_Winvalid_memory_model,
50661 "HLE_RELEASE not used with RELEASE or stronger memory model");
50662 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
50664 return val;
50667 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
50668 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
50669 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
50670 or number of vecsize_mangle variants that should be emitted. */
50672 static int
50673 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
50674 struct cgraph_simd_clone *clonei,
50675 tree base_type, int num)
50677 int ret = 1;
50679 if (clonei->simdlen
50680 && (clonei->simdlen < 2
50681 || clonei->simdlen > 1024
50682 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
50684 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50685 "unsupported simdlen %d", clonei->simdlen);
50686 return 0;
50689 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
50690 if (TREE_CODE (ret_type) != VOID_TYPE)
50691 switch (TYPE_MODE (ret_type))
50693 case QImode:
50694 case HImode:
50695 case SImode:
50696 case DImode:
50697 case SFmode:
50698 case DFmode:
50699 /* case SCmode: */
50700 /* case DCmode: */
50701 break;
50702 default:
50703 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50704 "unsupported return type %qT for simd\n", ret_type);
50705 return 0;
50708 tree t;
50709 int i;
50711 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
50712 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
50713 switch (TYPE_MODE (TREE_TYPE (t)))
50715 case QImode:
50716 case HImode:
50717 case SImode:
50718 case DImode:
50719 case SFmode:
50720 case DFmode:
50721 /* case SCmode: */
50722 /* case DCmode: */
50723 break;
50724 default:
50725 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50726 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
50727 return 0;
50730 if (clonei->cilk_elemental)
50732       /* Parse the processor clause here. If not present, default to 'b'. */
50733 clonei->vecsize_mangle = 'b';
50735 else if (!TREE_PUBLIC (node->decl))
50737 /* If the function isn't exported, we can pick up just one ISA
50738 for the clones. */
50739 if (TARGET_AVX512F)
50740 clonei->vecsize_mangle = 'e';
50741 else if (TARGET_AVX2)
50742 clonei->vecsize_mangle = 'd';
50743 else if (TARGET_AVX)
50744 clonei->vecsize_mangle = 'c';
50745 else
50746 clonei->vecsize_mangle = 'b';
50747 ret = 1;
50749 else
50751 clonei->vecsize_mangle = "bcde"[num];
50752 ret = 4;
50754 clonei->mask_mode = VOIDmode;
50755 switch (clonei->vecsize_mangle)
50757 case 'b':
50758 clonei->vecsize_int = 128;
50759 clonei->vecsize_float = 128;
50760 break;
50761 case 'c':
50762 clonei->vecsize_int = 128;
50763 clonei->vecsize_float = 256;
50764 break;
50765 case 'd':
50766 clonei->vecsize_int = 256;
50767 clonei->vecsize_float = 256;
50768 break;
50769 case 'e':
50770 clonei->vecsize_int = 512;
50771 clonei->vecsize_float = 512;
50772 if (TYPE_MODE (base_type) == QImode)
50773 clonei->mask_mode = DImode;
50774 else
50775 clonei->mask_mode = SImode;
50776 break;
50778 if (clonei->simdlen == 0)
50780 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
50781 clonei->simdlen = clonei->vecsize_int;
50782 else
50783 clonei->simdlen = clonei->vecsize_float;
50784 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
50786 else if (clonei->simdlen > 16)
50788 /* For compatibility with ICC, use the same upper bounds
50789 for simdlen. In particular, for CTYPE below, use the return type,
50790          unless the function returns void, in which case use the characteristic
50791 type. If it is possible for given SIMDLEN to pass CTYPE value
50792 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
50793 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
50794 emit corresponding clone. */
50795 tree ctype = ret_type;
50796 if (TREE_CODE (ret_type) == VOID_TYPE)
50797 ctype = base_type;
50798 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
50799 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
50800 cnt /= clonei->vecsize_int;
50801 else
50802 cnt /= clonei->vecsize_float;
50803 if (cnt > (TARGET_64BIT ? 16 : 8))
50805 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50806 "unsupported simdlen %d", clonei->simdlen);
50807 return 0;
50810 return ret;
50813 /* Add target attribute to SIMD clone NODE if needed. */
50815 static void
50816 ix86_simd_clone_adjust (struct cgraph_node *node)
50818 const char *str = NULL;
50819 gcc_assert (node->decl == cfun->decl);
50820 switch (node->simdclone->vecsize_mangle)
50822 case 'b':
50823 if (!TARGET_SSE2)
50824 str = "sse2";
50825 break;
50826 case 'c':
50827 if (!TARGET_AVX)
50828 str = "avx";
50829 break;
50830 case 'd':
50831 if (!TARGET_AVX2)
50832 str = "avx2";
50833 break;
50834 case 'e':
50835 if (!TARGET_AVX512F)
50836 str = "avx512f";
50837 break;
50838 default:
50839 gcc_unreachable ();
50841 if (str == NULL)
50842 return;
50843 push_cfun (NULL);
50844 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
50845 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
50846 gcc_assert (ok);
50847 pop_cfun ();
50848 ix86_reset_previous_fndecl ();
50849 ix86_set_current_function (node->decl);
50852 /* If SIMD clone NODE can't be used in a vectorized loop
50853 in current function, return -1, otherwise return a badness of using it
50854 (0 if it is most desirable from vecsize_mangle point of view, 1
50855 slightly less desirable, etc.). */
50857 static int
50858 ix86_simd_clone_usable (struct cgraph_node *node)
50860 switch (node->simdclone->vecsize_mangle)
50862 case 'b':
50863 if (!TARGET_SSE2)
50864 return -1;
50865 if (!TARGET_AVX)
50866 return 0;
50867 return TARGET_AVX2 ? 2 : 1;
50868 case 'c':
50869 if (!TARGET_AVX)
50870 return -1;
50871 return TARGET_AVX2 ? 1 : 0;
50872 case 'd':
50873 if (!TARGET_AVX2)
50874 return -1;
50875 return 0;
50876 case 'e':
50877 if (!TARGET_AVX512F)
50878 return -1;
50879 return 0;
50880 default:
50881 gcc_unreachable ();
50885 /* This function adjusts the unroll factor based on
50886    the hardware capabilities. For example, bdver3 has
50887 a loop buffer which makes unrolling of smaller
50888 loops less important. This function decides the
50889    unroll factor using the number of memory references
50890 (value 32 is used) as a heuristic. */
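/* For example, with TARGET_ADJUST_UNROLL a loop body containing 8
   word-sized memory references yields an unroll factor of 32 / 8 = 4
   from this hook.  */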
50892 static unsigned
50893 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
50895 basic_block *bbs;
50896 rtx_insn *insn;
50897 unsigned i;
50898 unsigned mem_count = 0;
50900 if (!TARGET_ADJUST_UNROLL)
50901 return nunroll;
50903 /* Count the number of memory references within the loop body.
50904 This value determines the unrolling factor for bdver3 and bdver4
50905 architectures. */
50906 subrtx_iterator::array_type array;
50907 bbs = get_loop_body (loop);
50908 for (i = 0; i < loop->num_nodes; i++)
50909 FOR_BB_INSNS (bbs[i], insn)
50910 if (NONDEBUG_INSN_P (insn))
50911 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
50912 if (const_rtx x = *iter)
50913 if (MEM_P (x))
50915 machine_mode mode = GET_MODE (x);
50916 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50917 if (n_words > 4)
50918 mem_count += 2;
50919 else
50920 mem_count += 1;
50922 free (bbs);
50924   if (mem_count && mem_count <= 32)
50925     return 32 / mem_count;
50927 return nunroll;
50931 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50933 static bool
50934 ix86_float_exceptions_rounding_supported_p (void)
50936 /* For x87 floating point with standard excess precision handling,
50937 there is no adddf3 pattern (since x87 floating point only has
50938 XFmode operations) so the default hook implementation gets this
50939 wrong. */
50940 return TARGET_80387 || TARGET_SSE_MATH;
50943 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
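/* In outline: *HOLD saves the x87 environment and/or MXCSR and then masks
   and clears the exception flags, *CLEAR clears the sticky exception flags,
   and *UPDATE reads the raised exceptions, restores the saved state and
   re-raises them through __atomic_feraiseexcept.  */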
50945 static void
50946 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50948 if (!TARGET_80387 && !TARGET_SSE_MATH)
50949 return;
50950 tree exceptions_var = create_tmp_var_raw (integer_type_node);
50951 if (TARGET_80387)
50953 tree fenv_index_type = build_index_type (size_int (6));
50954 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
50955 tree fenv_var = create_tmp_var_raw (fenv_type);
50956 TREE_ADDRESSABLE (fenv_var) = 1;
50957 tree fenv_ptr = build_pointer_type (fenv_type);
50958 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50959 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50960 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50961 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50962 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50963 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50964 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50965 tree hold_fnclex = build_call_expr (fnclex, 0);
50966 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50967 NULL_TREE, NULL_TREE);
50968 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50969 hold_fnclex);
50970 *clear = build_call_expr (fnclex, 0);
50971 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50972 tree fnstsw_call = build_call_expr (fnstsw, 0);
50973 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50974 sw_var, fnstsw_call);
50975 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50976 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50977 exceptions_var, exceptions_x87);
50978 *update = build2 (COMPOUND_EXPR, integer_type_node,
50979 sw_mod, update_mod);
50980 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50981 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50983 if (TARGET_SSE_MATH)
50985 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50986 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50987 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50988 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50989 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50990 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50991 mxcsr_orig_var, stmxcsr_hold_call);
50992 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50993 mxcsr_orig_var,
50994 build_int_cst (unsigned_type_node, 0x1f80));
50995 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50996 build_int_cst (unsigned_type_node, 0xffffffc0));
50997 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50998 mxcsr_mod_var, hold_mod_val);
50999 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
51000 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
51001 hold_assign_orig, hold_assign_mod);
51002 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
51003 ldmxcsr_hold_call);
51004 if (*hold)
51005 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
51006 else
51007 *hold = hold_all;
51008 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
51009 if (*clear)
51010 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
51011 ldmxcsr_clear_call);
51012 else
51013 *clear = ldmxcsr_clear_call;
51014 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
51015 tree exceptions_sse = fold_convert (integer_type_node,
51016 stxmcsr_update_call);
51017 if (*update)
51019 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
51020 exceptions_var, exceptions_sse);
51021 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
51022 exceptions_var, exceptions_mod);
51023 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
51024 exceptions_assign);
51026 else
51027 *update = build2 (MODIFY_EXPR, integer_type_node,
51028 exceptions_var, exceptions_sse);
51029 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
51030 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51031 ldmxcsr_update_call);
51033 tree atomic_feraiseexcept
51034 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
51035 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
51036 1, exceptions_var);
51037 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51038 atomic_feraiseexcept_call);
51041 /* Return mode to be used for bounds or VOIDmode
51042 if bounds are not supported. */
51044 static enum machine_mode
51045 ix86_mpx_bound_mode ()
51047 /* Do not support pointer checker if MPX
51048 is not enabled. */
51049 if (!TARGET_MPX)
51051 if (flag_check_pointer_bounds)
51052 warning (0, "Pointer Checker requires MPX support on this target."
51053 " Use -mmpx options to enable MPX.");
51054 return VOIDmode;
51057 return BNDmode;
51060 /* Return constant used to statically initialize constant bounds.
51062 This function is used to create special bound values. For now
51063 only INIT bounds and NONE bounds are expected. More special
51064 values may be added later. */
51066 static tree
51067 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
51069 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
51070 : build_zero_cst (pointer_sized_int_node);
51071 tree high = ub ? build_zero_cst (pointer_sized_int_node)
51072 : build_minus_one_cst (pointer_sized_int_node);
51074 /* This function is supposed to be used to create INIT and
51075 NONE bounds only. */
51076 gcc_assert ((lb == 0 && ub == -1)
51077 || (lb == -1 && ub == 0));
51079 return build_complex (NULL, low, high);
51082 /* Generate a list of statements STMTS to initialize pointer bounds
51083 variable VAR with bounds LB and UB. Return the number of generated
51084 statements. */
51086 static int
51087 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
51089 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
51090 tree lhs, modify, var_p;
51092 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
51093 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
51095 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
51096 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
51097 append_to_statement_list (modify, stmts);
51099 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
51100 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
51101 TYPE_SIZE_UNIT (pointer_sized_int_node)));
51102 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
51103 append_to_statement_list (modify, stmts);
51105 return 2;
51108 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
51109 /* For i386, a common symbol is local only for non-PIE binaries. For
51110    x86-64, a common symbol is local only for non-PIE binaries or when
51111    the linker supports copy relocations in PIE binaries. */
51113 static bool
51114 ix86_binds_local_p (const_tree exp)
51116 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
51117 (!flag_pic
51118 || (TARGET_64BIT
51119 && HAVE_LD_PIE_COPYRELOC != 0)));
51121 #endif
51123 /* If MEM is in the form of [base+offset], extract the two parts
51124    of the address into BASE and OFFSET, otherwise return false. */
51126 static bool
51127 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
51129 rtx addr;
51131 gcc_assert (MEM_P (mem));
51133 addr = XEXP (mem, 0);
51135 if (GET_CODE (addr) == CONST)
51136 addr = XEXP (addr, 0);
51138 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
51140 *base = addr;
51141 *offset = const0_rtx;
51142 return true;
51145 if (GET_CODE (addr) == PLUS
51146 && (REG_P (XEXP (addr, 0))
51147 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
51148 && CONST_INT_P (XEXP (addr, 1)))
51150 *base = XEXP (addr, 0);
51151 *offset = XEXP (addr, 1);
51152 return true;
51155 return false;
51158 /* Given OPERANDS of consecutive load/store, check if we can merge
51159 them into move multiple. LOAD is true if they are load instructions.
51160 MODE is the mode of memory operands. */
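/* For example, with DImode operands the two addresses must have the form
   [base + n] and [base + n + 8], with the first mem at the lower address
   and both register operands naming the same register.  */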
51162 bool
51163 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
51164 enum machine_mode mode)
51166 HOST_WIDE_INT offval_1, offval_2, msize;
51167 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
51169 if (load)
51171 mem_1 = operands[1];
51172 mem_2 = operands[3];
51173 reg_1 = operands[0];
51174 reg_2 = operands[2];
51176 else
51178 mem_1 = operands[0];
51179 mem_2 = operands[2];
51180 reg_1 = operands[1];
51181 reg_2 = operands[3];
51184 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
51186 if (REGNO (reg_1) != REGNO (reg_2))
51187 return false;
51189 /* Check if the addresses are in the form of [base+offset]. */
51190 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
51191 return false;
51192 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
51193 return false;
51195 /* Check if the bases are the same. */
51196 if (!rtx_equal_p (base_1, base_2))
51197 return false;
51199 offval_1 = INTVAL (offset_1);
51200 offval_2 = INTVAL (offset_2);
51201 msize = GET_MODE_SIZE (mode);
51202 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
51203 if (offval_1 + msize != offval_2)
51204 return false;
51206 return true;
51209 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
51211 static bool
51212 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
51213 optimization_type opt_type)
51215 switch (op)
51217 case asin_optab:
51218 case acos_optab:
51219 case log1p_optab:
51220 case exp_optab:
51221 case exp10_optab:
51222 case exp2_optab:
51223 case expm1_optab:
51224 case ldexp_optab:
51225 case scalb_optab:
51226 case round_optab:
51227 return opt_type == OPTIMIZE_FOR_SPEED;
51229 case rint_optab:
51230 if (SSE_FLOAT_MODE_P (mode1)
51231 && TARGET_SSE_MATH
51232 && !flag_trapping_math
51233 && !TARGET_ROUND)
51234 return opt_type == OPTIMIZE_FOR_SPEED;
51235 return true;
51237 case floor_optab:
51238 case ceil_optab:
51239 case btrunc_optab:
51240 if (SSE_FLOAT_MODE_P (mode1)
51241 && TARGET_SSE_MATH
51242 && !flag_trapping_math
51243 && TARGET_ROUND)
51244 return true;
51245 return opt_type == OPTIMIZE_FOR_SPEED;
51247 case rsqrt_optab:
51248 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
51250 default:
51251 return true;
51255 /* Address space support.
51257 This is not "far pointers" in the 16-bit sense, but an easy way
51258 to use %fs and %gs segment prefixes. Therefore:
51260 (a) All address spaces have the same modes,
51261    (b) All address spaces have the same address forms,
51262 (c) While %fs and %gs are technically subsets of the generic
51263 address space, they are probably not subsets of each other.
51264 (d) Since we have no access to the segment base register values
51265 without resorting to a system call, we cannot convert a
51266 non-default address space to a default address space.
51267 Therefore we do not claim %fs or %gs are subsets of generic.
51269 Therefore we can (mostly) use the default hooks. */
51271 /* All use of segmentation is assumed to make address 0 valid. */
51273 static bool
51274 ix86_addr_space_zero_address_valid (addr_space_t as)
51276 return as != ADDR_SPACE_GENERIC;
51279 static void
51280 ix86_init_libfuncs (void)
51282 if (TARGET_64BIT)
51284 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
51285 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
51287 else
51289 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
51290 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
51293 #if TARGET_MACHO
51294 darwin_rename_builtins ();
51295 #endif
51298 /* Generate call to __divmoddi4. */
51300 static void
51301 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
51302 rtx op0, rtx op1,
51303 rtx *quot_p, rtx *rem_p)
51305 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
51307 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
51308 mode, 3,
51309 op0, GET_MODE (op0),
51310 op1, GET_MODE (op1),
51311 XEXP (rem, 0), Pmode);
51312 *quot_p = quot;
51313 *rem_p = rem;
51316 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
51317 FPU, assume that the fpcw is set to extended precision; when using
51318 only SSE, rounding is correct; when using both SSE and the FPU,
51319 the rounding precision is indeterminate, since either may be chosen
51320 apparently at random. */
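/* For example, pure x87 math (TARGET_80387 without SSE math) yields
   FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE for both the standard and implicit
   cases, while SSE2-only math yields FLT_EVAL_METHOD_PROMOTE_TO_FLOAT.  */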
51322 static enum flt_eval_method
51323 ix86_excess_precision (enum excess_precision_type type)
51325 switch (type)
51327 case EXCESS_PRECISION_TYPE_FAST:
51328 /* The fastest type to promote to will always be the native type,
51329 whether that occurs with implicit excess precision or
51330 otherwise. */
51331 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51332 case EXCESS_PRECISION_TYPE_STANDARD:
51333 case EXCESS_PRECISION_TYPE_IMPLICIT:
51334 /* Otherwise, the excess precision we want when we are
51335 in a standards compliant mode, and the implicit precision we
51336 provide would be identical were it not for the unpredictable
51337 cases. */
51338 if (!TARGET_80387)
51339 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51340 else if (!TARGET_MIX_SSE_I387)
51342 if (!TARGET_SSE_MATH)
51343 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
51344 else if (TARGET_SSE2)
51345 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51348 /* If we are in standards compliant mode, but we know we will
51349 calculate in unpredictable precision, return
51350 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
51351 excess precision if the target can't guarantee it will honor
51352 it. */
51353 return (type == EXCESS_PRECISION_TYPE_STANDARD
51354 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
51355 : FLT_EVAL_METHOD_UNPREDICTABLE);
51356 default:
51357 gcc_unreachable ();
51360 return FLT_EVAL_METHOD_UNPREDICTABLE;
51363 /* Target-specific selftests. */
51365 #if CHECKING_P
51367 namespace selftest {
51369 /* Verify that hard regs are dumped as expected (in compact mode). */
51371 static void
51372 ix86_test_dumping_hard_regs ()
51374 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
51375 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
51378 /* Test dumping an insn with repeated references to the same SCRATCH,
51379 to verify the rtx_reuse code. */
51381 static void
51382 ix86_test_dumping_memory_blockage ()
51384 set_new_first_and_last_insn (NULL, NULL);
51386 rtx pat = gen_memory_blockage ();
51387 rtx_reuse_manager r;
51388 r.preprocess (pat);
51390   /* Verify that the repeated references to the SCRATCH use
51391      reuse IDs. The first should be prefixed with a reuse ID,
51392 and the second should be dumped as a "reuse_rtx" of that ID.
51393 The expected string assumes Pmode == DImode. */
51394 if (Pmode == DImode)
51395 ASSERT_RTL_DUMP_EQ_WITH_REUSE
51396 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
51397 " (unspec:BLK [\n"
51398 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
51399 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
51402 /* Verify loading an RTL dump; specifically a dump of copying
51403 a param on x86_64 from a hard reg into the frame.
51404 This test is target-specific since the dump contains target-specific
51405 hard reg names. */
51407 static void
51408 ix86_test_loading_dump_fragment_1 ()
51410 rtl_dump_test t (SELFTEST_LOCATION,
51411 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
51413 rtx_insn *insn = get_insn_by_uid (1);
51415 /* The block structure and indentation here are purely for
51416 readability; it mirrors the structure of the rtx. */
51417 tree mem_expr;
51419 rtx pat = PATTERN (insn);
51420 ASSERT_EQ (SET, GET_CODE (pat));
51422 rtx dest = SET_DEST (pat);
51423 ASSERT_EQ (MEM, GET_CODE (dest));
51424 /* Verify the "/c" was parsed. */
51425 ASSERT_TRUE (RTX_FLAG (dest, call));
51426 ASSERT_EQ (SImode, GET_MODE (dest));
51428 rtx addr = XEXP (dest, 0);
51429 ASSERT_EQ (PLUS, GET_CODE (addr));
51430 ASSERT_EQ (DImode, GET_MODE (addr));
51432 rtx lhs = XEXP (addr, 0);
51433 /* Verify that the "frame" REG was consolidated. */
51434 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
51437 rtx rhs = XEXP (addr, 1);
51438 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
51439 ASSERT_EQ (-4, INTVAL (rhs));
51442 /* Verify the "[1 i+0 S4 A32]" was parsed. */
51443 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
51444 /* "i" should have been handled by synthesizing a global int
51445 variable named "i". */
51446 mem_expr = MEM_EXPR (dest);
51447 ASSERT_NE (mem_expr, NULL);
51448 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
51449 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
51450 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
51451 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
51452 /* "+0". */
51453 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
51454 ASSERT_EQ (0, MEM_OFFSET (dest));
51455 /* "S4". */
51456 ASSERT_EQ (4, MEM_SIZE (dest));
51457 /* "A32". */
51458 ASSERT_EQ (32, MEM_ALIGN (dest));
51461 rtx src = SET_SRC (pat);
51462 ASSERT_EQ (REG, GET_CODE (src));
51463 ASSERT_EQ (SImode, GET_MODE (src));
51464 ASSERT_EQ (5, REGNO (src));
51465 tree reg_expr = REG_EXPR (src);
51466 /* "i" here should point to the same var as for the MEM_EXPR. */
51467 ASSERT_EQ (reg_expr, mem_expr);
51472 /* Verify that the RTL loader copes with a call_insn dump.
51473 This test is target-specific since the dump contains a target-specific
51474 hard reg name. */
51476 static void
51477 ix86_test_loading_call_insn ()
51479 /* The test dump includes register "xmm0", which requires TARGET_SSE
51480 to exist. */
51481 if (!TARGET_SSE)
51482 return;
51484 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
51486 rtx_insn *insn = get_insns ();
51487 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
51489 /* "/j". */
51490 ASSERT_TRUE (RTX_FLAG (insn, jump));
51492 rtx pat = PATTERN (insn);
51493 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
51495 /* Verify REG_NOTES. */
51497 /* "(expr_list:REG_CALL_DECL". */
51498 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
51499 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
51500 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
51502 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
51503 rtx_expr_list *note1 = note0->next ();
51504 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
51506 ASSERT_EQ (NULL, note1->next ());
51509 /* Verify CALL_INSN_FUNCTION_USAGE. */
51511 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
51512 rtx_expr_list *usage
51513 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
51514 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
51515 ASSERT_EQ (DFmode, GET_MODE (usage));
51516 ASSERT_EQ (USE, GET_CODE (usage->element ()));
51517 ASSERT_EQ (NULL, usage->next ());
51521 /* Verify that the RTL loader copes with a dump from print_rtx_function.
51522 This test is target-specific since the dump contains target-specific
51523 hard reg names. */
51525 static void
51526 ix86_test_loading_full_dump ()
51528 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
51530 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51532 rtx_insn *insn_1 = get_insn_by_uid (1);
51533 ASSERT_EQ (NOTE, GET_CODE (insn_1));
51535 rtx_insn *insn_7 = get_insn_by_uid (7);
51536 ASSERT_EQ (INSN, GET_CODE (insn_7));
51537 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
51539 rtx_insn *insn_15 = get_insn_by_uid (15);
51540 ASSERT_EQ (INSN, GET_CODE (insn_15));
51541 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
51543 /* Verify crtl->return_rtx. */
51544 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
51545 ASSERT_EQ (0, REGNO (crtl->return_rtx));
51546 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
51549 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
51550 In particular, verify that it correctly loads the 2nd operand.
51551 This test is target-specific since these are machine-specific
51552 operands (and enums). */
51554 static void
51555 ix86_test_loading_unspec ()
51557 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
51559 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51561 ASSERT_TRUE (cfun);
51563 /* Test of an UNSPEC. */
51564 rtx_insn *insn = get_insns ();
51565 ASSERT_EQ (INSN, GET_CODE (insn));
51566 rtx set = single_set (insn);
51567 ASSERT_NE (NULL, set);
51568 rtx dst = SET_DEST (set);
51569 ASSERT_EQ (MEM, GET_CODE (dst));
51570 rtx src = SET_SRC (set);
51571 ASSERT_EQ (UNSPEC, GET_CODE (src));
51572 ASSERT_EQ (BLKmode, GET_MODE (src));
51573 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
51575 rtx v0 = XVECEXP (src, 0, 0);
51577 /* Verify that the two uses of the first SCRATCH have pointer
51578 equality. */
51579 rtx scratch_a = XEXP (dst, 0);
51580 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
51582 rtx scratch_b = XEXP (v0, 0);
51583 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
51585 ASSERT_EQ (scratch_a, scratch_b);
51587 /* Verify that the two mems are thus treated as equal. */
51588 ASSERT_TRUE (rtx_equal_p (dst, v0));
51590 /* Verify that the insn is recognized. */
51591 ASSERT_NE (-1, recog_memoized (insn));
51593 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
51594 insn = NEXT_INSN (insn);
51595 ASSERT_EQ (INSN, GET_CODE (insn));
51597 set = single_set (insn);
51598 ASSERT_NE (NULL, set);
51600 src = SET_SRC (set);
51601 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
51602 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
51605 /* Run all target-specific selftests. */
51607 static void
51608 ix86_run_selftests (void)
51610 ix86_test_dumping_hard_regs ();
51611 ix86_test_dumping_memory_blockage ();
51613 /* Various tests of loading RTL dumps, here because they contain
51614 ix86-isms (e.g. names of hard regs). */
51615 ix86_test_loading_dump_fragment_1 ();
51616 ix86_test_loading_call_insn ();
51617 ix86_test_loading_full_dump ();
51618 ix86_test_loading_unspec ();
51621 } // namespace selftest
51623 #endif /* CHECKING_P */
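/* For illustration only (an editorial note, not part of the GCC sources):
   with checking enabled these selftests are reached through the
   TARGET_RUN_TARGET_SELFTESTS hook registered below, and the build
   normally exercises them with an invocation along the lines of

     cc1 ... -fself-test=<srcdir>/testsuite/selftests

   during "make selftest"; the exact command line is a build-system
   detail.  */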
51625 /* Initialize the GCC target structure. */
51626 #undef TARGET_RETURN_IN_MEMORY
51627 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
51629 #undef TARGET_LEGITIMIZE_ADDRESS
51630 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
51632 #undef TARGET_ATTRIBUTE_TABLE
51633 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
51634 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
51635 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
51636 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51637 # undef TARGET_MERGE_DECL_ATTRIBUTES
51638 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
51639 #endif
51641 #undef TARGET_COMP_TYPE_ATTRIBUTES
51642 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
51644 #undef TARGET_INIT_BUILTINS
51645 #define TARGET_INIT_BUILTINS ix86_init_builtins
51646 #undef TARGET_BUILTIN_DECL
51647 #define TARGET_BUILTIN_DECL ix86_builtin_decl
51648 #undef TARGET_EXPAND_BUILTIN
51649 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
51651 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
51652 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
51653 ix86_builtin_vectorized_function
51655 #undef TARGET_VECTORIZE_BUILTIN_GATHER
51656 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
51658 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
51659 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
51661 #undef TARGET_BUILTIN_RECIPROCAL
51662 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
51664 #undef TARGET_ASM_FUNCTION_EPILOGUE
51665 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
51667 #undef TARGET_ENCODE_SECTION_INFO
51668 #ifndef SUBTARGET_ENCODE_SECTION_INFO
51669 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
51670 #else
51671 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
51672 #endif
51674 #undef TARGET_ASM_OPEN_PAREN
51675 #define TARGET_ASM_OPEN_PAREN ""
51676 #undef TARGET_ASM_CLOSE_PAREN
51677 #define TARGET_ASM_CLOSE_PAREN ""
51679 #undef TARGET_ASM_BYTE_OP
51680 #define TARGET_ASM_BYTE_OP ASM_BYTE
51682 #undef TARGET_ASM_ALIGNED_HI_OP
51683 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
51684 #undef TARGET_ASM_ALIGNED_SI_OP
51685 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
51686 #ifdef ASM_QUAD
51687 #undef TARGET_ASM_ALIGNED_DI_OP
51688 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
51689 #endif
51691 #undef TARGET_PROFILE_BEFORE_PROLOGUE
51692 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
51694 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
51695 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
51697 #undef TARGET_ASM_UNALIGNED_HI_OP
51698 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
51699 #undef TARGET_ASM_UNALIGNED_SI_OP
51700 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
51701 #undef TARGET_ASM_UNALIGNED_DI_OP
51702 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
51704 #undef TARGET_PRINT_OPERAND
51705 #define TARGET_PRINT_OPERAND ix86_print_operand
51706 #undef TARGET_PRINT_OPERAND_ADDRESS
51707 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
51708 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
51709 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
51710 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
51711 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
51713 #undef TARGET_SCHED_INIT_GLOBAL
51714 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
51715 #undef TARGET_SCHED_ADJUST_COST
51716 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
51717 #undef TARGET_SCHED_ISSUE_RATE
51718 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
51719 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
51720 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
51721 ia32_multipass_dfa_lookahead
51722 #undef TARGET_SCHED_MACRO_FUSION_P
51723 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
51724 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
51725 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
51727 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
51728 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
51730 #undef TARGET_MEMMODEL_CHECK
51731 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
51733 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
51734 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
51736 #ifdef HAVE_AS_TLS
51737 #undef TARGET_HAVE_TLS
51738 #define TARGET_HAVE_TLS true
51739 #endif
51740 #undef TARGET_CANNOT_FORCE_CONST_MEM
51741 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
51742 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
51743 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
51745 #undef TARGET_DELEGITIMIZE_ADDRESS
51746 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
51748 #undef TARGET_MS_BITFIELD_LAYOUT_P
51749 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
51751 #if TARGET_MACHO
51752 #undef TARGET_BINDS_LOCAL_P
51753 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
51754 #else
51755 #undef TARGET_BINDS_LOCAL_P
51756 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
51757 #endif
51758 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51759 #undef TARGET_BINDS_LOCAL_P
51760 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
51761 #endif
51763 #undef TARGET_ASM_OUTPUT_MI_THUNK
51764 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
51765 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
51766 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
51768 #undef TARGET_ASM_FILE_START
51769 #define TARGET_ASM_FILE_START x86_file_start
51771 #undef TARGET_OPTION_OVERRIDE
51772 #define TARGET_OPTION_OVERRIDE ix86_option_override
51774 #undef TARGET_REGISTER_MOVE_COST
51775 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
51776 #undef TARGET_MEMORY_MOVE_COST
51777 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
51778 #undef TARGET_RTX_COSTS
51779 #define TARGET_RTX_COSTS ix86_rtx_costs
51780 #undef TARGET_ADDRESS_COST
51781 #define TARGET_ADDRESS_COST ix86_address_cost
51783 #undef TARGET_FIXED_CONDITION_CODE_REGS
51784 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
51785 #undef TARGET_CC_MODES_COMPATIBLE
51786 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
51788 #undef TARGET_MACHINE_DEPENDENT_REORG
51789 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
51791 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
51792 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
51794 #undef TARGET_BUILD_BUILTIN_VA_LIST
51795 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
51797 #undef TARGET_FOLD_BUILTIN
51798 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
51800 #undef TARGET_GIMPLE_FOLD_BUILTIN
51801 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
51803 #undef TARGET_COMPARE_VERSION_PRIORITY
51804 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
51806 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
51807 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
51808 ix86_generate_version_dispatcher_body
51810 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
51811 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
51812 ix86_get_function_versions_dispatcher
51814 #undef TARGET_ENUM_VA_LIST_P
51815 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
51817 #undef TARGET_FN_ABI_VA_LIST
51818 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
51820 #undef TARGET_CANONICAL_VA_LIST_TYPE
51821 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
51823 #undef TARGET_EXPAND_BUILTIN_VA_START
51824 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
51826 #undef TARGET_MD_ASM_ADJUST
51827 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
51829 #undef TARGET_C_EXCESS_PRECISION
51830 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
51831 #undef TARGET_PROMOTE_PROTOTYPES
51832 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
51833 #undef TARGET_SETUP_INCOMING_VARARGS
51834 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
51835 #undef TARGET_MUST_PASS_IN_STACK
51836 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
51837 #undef TARGET_FUNCTION_ARG_ADVANCE
51838 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
51839 #undef TARGET_FUNCTION_ARG
51840 #define TARGET_FUNCTION_ARG ix86_function_arg
51841 #undef TARGET_INIT_PIC_REG
51842 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
51843 #undef TARGET_USE_PSEUDO_PIC_REG
51844 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
51845 #undef TARGET_FUNCTION_ARG_BOUNDARY
51846 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
51847 #undef TARGET_PASS_BY_REFERENCE
51848 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
51849 #undef TARGET_INTERNAL_ARG_POINTER
51850 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
51851 #undef TARGET_UPDATE_STACK_BOUNDARY
51852 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
51853 #undef TARGET_GET_DRAP_RTX
51854 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
51855 #undef TARGET_STRICT_ARGUMENT_NAMING
51856 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
51857 #undef TARGET_STATIC_CHAIN
51858 #define TARGET_STATIC_CHAIN ix86_static_chain
51859 #undef TARGET_TRAMPOLINE_INIT
51860 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
51861 #undef TARGET_RETURN_POPS_ARGS
51862 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
51864 #undef TARGET_LEGITIMATE_COMBINED_INSN
51865 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
51867 #undef TARGET_ASAN_SHADOW_OFFSET
51868 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
51870 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
51871 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
51873 #undef TARGET_SCALAR_MODE_SUPPORTED_P
51874 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
51876 #undef TARGET_VECTOR_MODE_SUPPORTED_P
51877 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
51879 #undef TARGET_C_MODE_FOR_SUFFIX
51880 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
51882 #ifdef HAVE_AS_TLS
51883 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
51884 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
51885 #endif
51887 #ifdef SUBTARGET_INSERT_ATTRIBUTES
51888 #undef TARGET_INSERT_ATTRIBUTES
51889 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
51890 #endif
51892 #undef TARGET_MANGLE_TYPE
51893 #define TARGET_MANGLE_TYPE ix86_mangle_type
51895 #ifdef TARGET_THREAD_SSP_OFFSET
51896 #undef TARGET_STACK_PROTECT_GUARD
51897 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
51898 #endif
51900 #if !TARGET_MACHO
51901 #undef TARGET_STACK_PROTECT_FAIL
51902 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
51903 #endif
51905 #undef TARGET_FUNCTION_VALUE
51906 #define TARGET_FUNCTION_VALUE ix86_function_value
51908 #undef TARGET_FUNCTION_VALUE_REGNO_P
51909 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
51911 #undef TARGET_PROMOTE_FUNCTION_MODE
51912 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
51914 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
51915 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
51917 #undef TARGET_MEMBER_TYPE_FORCES_BLK
51918 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
51920 #undef TARGET_INSTANTIATE_DECLS
51921 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
51923 #undef TARGET_SECONDARY_RELOAD
51924 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
51926 #undef TARGET_CLASS_MAX_NREGS
51927 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
51929 #undef TARGET_PREFERRED_RELOAD_CLASS
51930 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
51931 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
51932 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
51933 #undef TARGET_CLASS_LIKELY_SPILLED_P
51934 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
51936 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
51937 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
51938 ix86_builtin_vectorization_cost
51939 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
51940 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
51941 ix86_vectorize_vec_perm_const_ok
51942 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
51943 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
51944 ix86_preferred_simd_mode
51945 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
51946 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
51947 ix86_autovectorize_vector_sizes
51948 #undef TARGET_VECTORIZE_GET_MASK_MODE
51949 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
51950 #undef TARGET_VECTORIZE_INIT_COST
51951 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
51952 #undef TARGET_VECTORIZE_ADD_STMT_COST
51953 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
51954 #undef TARGET_VECTORIZE_FINISH_COST
51955 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
51956 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
51957 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
51959 #undef TARGET_SET_CURRENT_FUNCTION
51960 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
51962 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
51963 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
51965 #undef TARGET_OPTION_SAVE
51966 #define TARGET_OPTION_SAVE ix86_function_specific_save
51968 #undef TARGET_OPTION_RESTORE
51969 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
51971 #undef TARGET_OPTION_POST_STREAM_IN
51972 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
51974 #undef TARGET_OPTION_PRINT
51975 #define TARGET_OPTION_PRINT ix86_function_specific_print
51977 #undef TARGET_OPTION_FUNCTION_VERSIONS
51978 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
51980 #undef TARGET_CAN_INLINE_P
51981 #define TARGET_CAN_INLINE_P ix86_can_inline_p
51983 #undef TARGET_LEGITIMATE_ADDRESS_P
51984 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
51986 #undef TARGET_REGISTER_PRIORITY
51987 #define TARGET_REGISTER_PRIORITY ix86_register_priority
51989 #undef TARGET_REGISTER_USAGE_LEVELING_P
51990 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
51992 #undef TARGET_LEGITIMATE_CONSTANT_P
51993 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
51995 #undef TARGET_FRAME_POINTER_REQUIRED
51996 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
51998 #undef TARGET_CAN_ELIMINATE
51999 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
52001 #undef TARGET_EXTRA_LIVE_ON_ENTRY
52002 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
52004 #undef TARGET_ASM_CODE_END
52005 #define TARGET_ASM_CODE_END ix86_code_end
52007 #undef TARGET_CONDITIONAL_REGISTER_USAGE
52008 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
52010 #undef TARGET_LOOP_UNROLL_ADJUST
52011 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
52013 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
52014 #undef TARGET_SPILL_CLASS
52015 #define TARGET_SPILL_CLASS ix86_spill_class
52017 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
52018 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
52019 ix86_simd_clone_compute_vecsize_and_simdlen
52021 #undef TARGET_SIMD_CLONE_ADJUST
52022 #define TARGET_SIMD_CLONE_ADJUST \
52023 ix86_simd_clone_adjust
52025 #undef TARGET_SIMD_CLONE_USABLE
52026 #define TARGET_SIMD_CLONE_USABLE \
52027 ix86_simd_clone_usable
52029 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
52030 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
52031 ix86_float_exceptions_rounding_supported_p
52033 #undef TARGET_MODE_EMIT
52034 #define TARGET_MODE_EMIT ix86_emit_mode_set
52036 #undef TARGET_MODE_NEEDED
52037 #define TARGET_MODE_NEEDED ix86_mode_needed
52039 #undef TARGET_MODE_AFTER
52040 #define TARGET_MODE_AFTER ix86_mode_after
52042 #undef TARGET_MODE_ENTRY
52043 #define TARGET_MODE_ENTRY ix86_mode_entry
52045 #undef TARGET_MODE_EXIT
52046 #define TARGET_MODE_EXIT ix86_mode_exit
52048 #undef TARGET_MODE_PRIORITY
52049 #define TARGET_MODE_PRIORITY ix86_mode_priority
52051 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
52052 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
52054 #undef TARGET_LOAD_BOUNDS_FOR_ARG
52055 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
52057 #undef TARGET_STORE_BOUNDS_FOR_ARG
52058 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
52060 #undef TARGET_LOAD_RETURNED_BOUNDS
52061 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
52063 #undef TARGET_STORE_RETURNED_BOUNDS
52064 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
52066 #undef TARGET_CHKP_BOUND_MODE
52067 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
52069 #undef TARGET_BUILTIN_CHKP_FUNCTION
52070 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
52072 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
52073 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
52075 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
52076 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
52078 #undef TARGET_CHKP_INITIALIZE_BOUNDS
52079 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
52081 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
52082 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
52084 #undef TARGET_OFFLOAD_OPTIONS
52085 #define TARGET_OFFLOAD_OPTIONS \
52086 ix86_offload_options
52088 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
52089 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
52091 #undef TARGET_OPTAB_SUPPORTED_P
52092 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
52094 #undef TARGET_HARD_REGNO_SCRATCH_OK
52095 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
52097 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
52098 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
52100 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
52101 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
52103 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
52104 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
52106 #undef TARGET_INIT_LIBFUNCS
52107 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
52109 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
52110 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
52112 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
52113 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
52114 #if CHECKING_P
52115 #undef TARGET_RUN_TARGET_SELFTESTS
52116 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
52117 #endif /* #if CHECKING_P */
52119 struct gcc_target targetm = TARGET_INITIALIZER;
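/* For illustration only (an editorial note, not part of the GCC sources):
   the rest of the compiler reaches the i386 back end through the targetm
   vector initialized above.  For instance, code along the lines of

     if (targetm.calls.return_in_memory (type, fntype))
       ...

   ends up in ix86_return_in_memory via the TARGET_RETURN_IN_MEMORY
   definition earlier in this file.  */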
52121 #include "gt-i386.h"